@@ -31,33 +31,47 @@ def foreach_all_gather(
         dist.all_gather(global_input_tensor_numels, input_tensor_numels_tensor, group=group)
     else:
         global_input_tensor_numels = [
-            torch.tensor([reduce(mul, shape) for shape in param_shapes], dtype=torch.int64, device="cpu")
+            torch.tensor([reduce(mul, shape, 1) for shape in param_shapes], dtype=torch.int64, device="cpu")
             for param_shapes in params_shapes_across_group  # each param_shapes holds the shapes of all params on one rank
         ]
 
-    flatten_copyin_tensor = torch.empty((sum(input_tensor_numels),), dtype=param0.dtype, device=param0.device)
-    splits_copyin_tensor = torch.split(flatten_copyin_tensor, input_tensor_numels)
-    torch._foreach_copy_(splits_copyin_tensor, [p.flatten() for p in params])
-
-    copyout_size = int(sum(sum(i) for i in global_input_tensor_numels))
-    flatten_copyout_tensor = torch.empty((copyout_size,), dtype=param0.dtype, device=param0.device)
-
-    dist.all_gather_into_tensor(flatten_copyout_tensor, flatten_copyin_tensor, group=group)
-    copyout_split_size: list[int] = sum([i.tolist() for i in global_input_tensor_numels], [])
-    splits_copyout_tensor = torch.split(flatten_copyout_tensor, copyout_split_size)
-
-    _global_input_tensor_shapes: list[None] | list[list[tuple]] = [None for _ in range(dist.get_world_size(group))]
-    dist.all_gather_object(_global_input_tensor_shapes, input_tensor_shapes, group=group)
-    _global_input_tensor_shapes = cast(list[list[tuple]], _global_input_tensor_shapes)
-    global_input_tensor_shapes: list[tuple] = sum(_global_input_tensor_shapes, [])
-
-    gathered_params: list[list[torch.Tensor]] = []
-    for i in range(len(params)):
-        single_gathered_params: list[torch.Tensor] = []
-        for rank in range(dist.get_world_size(group)):
-            offset = len(params) * rank
-            origin_shape: tuple = global_input_tensor_shapes[offset + i]
-            single_gathered_params.append(splits_copyout_tensor[offset + i].view(origin_shape))
-        gathered_params.append(single_gathered_params)
+    if len(params) == 1:
+        param0_shape_except_dim0 = list(param0.shape)[1:]
+        param0_numel_except_dim0 = param0[0].numel()
+        # Compute the dim-0 size of the gathered tensor; this also covers uneven splits across ranks
+        split_dim0_sizes = [t.tolist()[0] // param0_numel_except_dim0 for t in global_input_tensor_numels]
+        gathered_tensor_dim0_size = sum(split_dim0_sizes)
+
+        # all_gather_into_tensor concatenates each rank's data along dimension 0
+        gathered_tensor = torch.empty(
+            (gathered_tensor_dim0_size, *param0_shape_except_dim0), dtype=param0.dtype, device=param0.device
+        )
+        dist.all_gather_into_tensor(gathered_tensor, param0, group=group)
+        return [gathered_tensor.split(split_dim0_sizes, dim=0)]
+    else:
+        flatten_copyin_tensor = torch.empty((sum(input_tensor_numels),), dtype=param0.dtype, device=param0.device)
+        splits_copyin_tensor = torch.split(flatten_copyin_tensor, input_tensor_numels)
+        torch._foreach_copy_(splits_copyin_tensor, [p.flatten() for p in params])
+
+        copyout_size = int(sum(sum(i) for i in global_input_tensor_numels))
+        flatten_copyout_tensor = torch.empty((copyout_size,), dtype=param0.dtype, device=param0.device)
+
+        dist.all_gather_into_tensor(flatten_copyout_tensor, flatten_copyin_tensor, group=group)
+        copyout_split_size: list[int] = sum([i.tolist() for i in global_input_tensor_numels], [])
+        splits_copyout_tensor = torch.split(flatten_copyout_tensor, copyout_split_size)
+
+        _global_input_tensor_shapes: list[None] | list[list[tuple]] = [None for _ in range(dist.get_world_size(group))]
+        dist.all_gather_object(_global_input_tensor_shapes, input_tensor_shapes, group=group)
+        _global_input_tensor_shapes = cast(list[list[tuple]], _global_input_tensor_shapes)
+        global_input_tensor_shapes: list[tuple] = sum(_global_input_tensor_shapes, [])
+
+        gathered_params: list[list[torch.Tensor]] = []
+        for i in range(len(params)):
+            single_gathered_params: list[torch.Tensor] = []
+            for rank in range(dist.get_world_size(group)):
+                offset = len(params) * rank
+                origin_shape: tuple = global_input_tensor_shapes[offset + i]
+                single_gathered_params.append(splits_copyout_tensor[offset + i].view(origin_shape))
+            gathered_params.append(single_gathered_params)
 
     return gathered_params
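
For context, a minimal self-contained sketch of what the new single-param fast path does: gather one tensor's dim-0 shards with `all_gather_into_tensor`, then split the result back into per-rank views. It assumes a 2-process gloo group with equal dim-0 shards per rank (the PR additionally derives `split_dim0_sizes` from the gathered numels to support uneven shards); the script and its names are illustrative, not part of the PR.

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank: int, world_size: int) -> None:
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # Each rank holds a dim-0 shard of a (4, 3) parameter (equal shards here for simplicity).
    shard = torch.full((2, 3), float(rank))
    split_dim0_sizes = [2] * world_size  # per-rank dim-0 sizes, known up front

    # all_gather_into_tensor concatenates the shards along dimension 0 ...
    gathered = torch.empty((sum(split_dim0_sizes), 3))
    dist.all_gather_into_tensor(gathered, shard)

    # ... and split() recovers one view per rank, matching the fast path's return value.
    per_rank_views = gathered.split(split_dim0_sizes, dim=0)
    assert per_rank_views[rank].eq(rank).all()
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)
```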
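The multi-param branch can likewise be followed in a single-process sketch that simulates the copy-out buffer of a 2-rank group with 2 params each; all names here are illustrative. It shows the `_foreach_copy_` pack into one flat buffer, the flat split, and the `offset = len(params) * rank` indexing used to rebuild per-param, per-rank views.

```python
import torch

# Two params per "rank", with different shapes per rank (the uneven case).
params_rank0 = [torch.ones(2, 3), torch.full((4,), 2.0)]
params_rank1 = [torch.full((1, 3), 3.0), torch.full((2,), 4.0)]
all_params = params_rank0 + params_rank1  # rank-major order, as all-gather produces

# Pack: split one flat buffer into per-param views and fill them with _foreach_copy_.
numels = [p.numel() for p in all_params]
flat = torch.empty(sum(numels))
torch._foreach_copy_(torch.split(flat, numels), [p.flatten() for p in all_params])

# Unpack: split the flat buffer and view each piece with its original shape,
# indexing with offset = len(params) * rank exactly as in the diff.
shapes = [tuple(p.shape) for p in all_params]
pieces = torch.split(flat, numels)
num_params, world_size = 2, 2
gathered = [
    [pieces[num_params * rank + i].view(shapes[num_params * rank + i]) for rank in range(world_size)]
    for i in range(num_params)
]
assert torch.equal(gathered[0][1], params_rank1[0])
```

Note the pack and unpack steps are pure metadata plus one fused copy, which is what lets the real code issue a single `all_gather_into_tensor` for the whole parameter group.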