
Commit 62ae9fc

[Refactor] refactor packing in RL train controller and train worker
1 parent 7c8f82c commit 62ae9fc

File tree

3 files changed: +544 −202 lines changed


xtuner/v1/rl/base/controller.py

Lines changed: 234 additions & 80 deletions
@@ -1,14 +1,16 @@
 import math
-from typing import Literal, TypedDict
+from typing import Literal, TypedDict, cast
 
+import numpy as np
 import ray
 import torch
 from ray.actor import ActorProxy
 
 from xtuner.v1.data_proto.sequence_context import SequenceContext
 from xtuner.v1.model.compose.base import BaseComposeConfig
+from xtuner.v1.rl.utils import get_seqlen_balanced_partitions
 from xtuner.v1.train.trainer import LoadCheckpointConfig
-from xtuner.v1.utils import ray_method
+from xtuner.v1.utils import get_logger, ray_method
 
 from .worker import TrainingWorker
 
@@ -23,6 +25,12 @@ class ColateItem(TypedDict):
 class RawTrainingController:
     def __init__(self, workers: list[TrainingWorker]) -> None:
         self.workers = workers
+        refs = [
+            self.workers[0].get_model_cfg.remote(),
+            self.workers[0].get_worker_cfg.remote(),
+            self.workers[0].get_data_replicate_size.remote(),
+        ]
+        self.model_cfg, self.worker_cfg, self.data_replicate_size = ray.get(refs)
 
     # TODO(hha): This logic is not generic enough; it should reuse the sft function to support expand soft pack
     def _get_pack_infos(self, dataset, num_tokens, target, random=None):
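Note: the __init__ hunk above fetches model_cfg, worker_cfg, and data_replicate_size once at construction time, issuing the three remote getter calls together and resolving them with a single ray.get over the list of ObjectRefs, rather than re-fetching configs inside fit on every rollout. A minimal sketch of that Ray pattern follows, with a hypothetical ConfigHolder actor standing in for xtuner's TrainingWorker (the actor and its return values are illustrative assumptions, not the real worker API):

import ray

ray.init(ignore_reinit_error=True)


@ray.remote
class ConfigHolder:
    """Hypothetical stand-in for a worker exposing remote config getters."""

    def get_model_cfg(self):
        return {"n_routed_experts": 64}

    def get_worker_cfg(self):
        return {"optimizer_steps": 4}

    def get_data_replicate_size(self):
        return 2


worker = ConfigHolder.remote()
# A single ray.get over a list of ObjectRefs resolves all three results in one wait
# instead of three sequential round trips.
refs = [
    worker.get_model_cfg.remote(),
    worker.get_worker_cfg.remote(),
    worker.get_data_replicate_size.remote(),
]
model_cfg, worker_cfg, data_replicate_size = ray.get(refs)
print(model_cfg, worker_cfg, data_replicate_size)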
@@ -164,95 +172,241 @@ def _grouped_by_max_length(self, packed_data_batches):
         # After sorting, this pack is placed first, so the first step on rank0 often consumes fewer valid tokens than the other ranks; this is expected.
         return sorted(packed_data_batches, key=lambda x: x["seq_ctx"].max_length_q, reverse=True)
 
-    @ray_method
-    def fit(self, data_batches: list[ColateItem], pack_max_length: int, rollout_idx: int):
-        has_rollout_routed_experts = False
-        language_cfg = None
-        if data_batches[0]["seq_ctx"].rollout_routed_experts is not None:
-            model_cfg = ray.get(self.workers[0].get_model_cfg.remote())  # type: ignore[attr-defined]
-            has_rollout_routed_experts = True
-            language_cfg = model_cfg
-            if isinstance(model_cfg, BaseComposeConfig):
-                language_cfg = model_cfg.text_config
-
-        packed_data_batches = self._packing(data_batches, pack_max_length, language_cfg)
-        # packed_data_batches = self._grouped_by_max_length(packed_data_batches)
-
-        # TODO(hha): This logic is not generic enough; it is tied to a specific model
-        is_qwen3_vl = False
-        if len(packed_data_batches[0]["seq_ctx"].position_ids.shape) == 3:
-            is_qwen3_vl = True
+    def _balance_split_batch(self, data_batches, partition_size):
+        """Reorder the data on single controller such that each dp rank gets
+        similar total tokens."""
+        global_seqlen_lst = [data["seq_ctx"].input_ids.numel() for data in data_batches]
+        global_partition_lst = get_seqlen_balanced_partitions(
+            global_seqlen_lst, k_partitions=partition_size, equal_size=True
+        )
+        balanced_batches = []
+        tokens_in_partition = []
+        for partition in global_partition_lst:
+            partition_batch = [data_batches[i] for i in partition]
+            tokens_in_partition.append(sum(data["seq_ctx"].input_ids.numel() for data in partition_batch))
+            balanced_batches.append(partition_batch)
+        get_logger().info(f"Balanced split into {partition_size} partitions with tokens: {tokens_in_partition}")
+        return balanced_batches
+
+    def _create_padding_sample(
+        self,
+        pad_len: int,
+        pack_max_length: int,
+        is_qwen3_vl: bool = False,
+        has_rollout_routed_experts: bool = False,
+        has_rollout_logprobs: bool = True,
+        n_routed_experts: int | None = None,
+        split_size: int = 1024,
+    ):
+        # padding input_ids
+        pad_tokens = tuple(
+            torch.zeros(1, split_size, dtype=torch.long, device="cpu") for _ in range(pad_len // split_size)
+        )
+        if pad_len % split_size > 0:
+            pad_tokens = pad_tokens + (torch.zeros(1, pad_len % split_size, dtype=torch.long, device="cpu"),)
+        pad_tokens = cast(tuple[torch.LongTensor, ...], pad_tokens)
+        pad_seq_ctx = SequenceContext.from_input_ids(pad_tokens, device="cpu")
+        pad_seq_ctx.num_padding = pad_len
+
+        # padding mm position_ids
+        if is_qwen3_vl:
+            _position_ids_list = []
+            for pad_token in pad_tokens:
+                _position_ids = torch.arange(pad_token.size(-1)).view(1, 1, -1).expand(3, 1, -1)
+                _position_ids_list.append(_position_ids)
+            position_ids = torch.cat(_position_ids_list, dim=-1)
+            position_ids = cast(torch.LongTensor, position_ids)
+            pad_seq_ctx.position_ids = position_ids
+
+        # padding rollout routed experts
+        if has_rollout_routed_experts:
+            assert n_routed_experts, "n_routed_experts must be provided when has_rollout_routed_experts is True"
+            if pad_len == pack_max_length:
+                pad_rand_index = torch.randint(
+                    low=0, high=1, size=(1, 1, 1)
+                )  # add dummy data, true data will be initialized in train worker.fit
+            else:
+                pad_rand_index = torch.randint(low=0, high=n_routed_experts, size=(pad_len, 1, 1))
+            pad_seq_ctx.rollout_routed_experts = pad_rand_index
+
+        pad_labels = torch.full((1, pad_len), -100, dtype=torch.long, device="cpu")
+        pad_advantage_length = pack_max_length if pad_len == pack_max_length else math.ceil(pad_len / 1024)
+        pad_advantage = torch.full(
+            (1, pad_advantage_length),
+            -100,
+            dtype=torch.float32,
+            device="cpu",
+        )
+        pad_rollout_logprobs = (
+            torch.zeros(1, pad_len, dtype=torch.float32, device="cpu") if has_rollout_logprobs else None
+        )
 
-        # todo: support round up
-        num_packed_data_batches = len(packed_data_batches)
-        data_replicate_size = ray.get(self.workers[0].get_data_replicate_size.remote())  # type: ignore[attr-defined]
-        dp_size = len(self.workers) // data_replicate_size
-        pad_num = math.ceil(num_packed_data_batches / dp_size) * dp_size - num_packed_data_batches
-        if pad_num > 0:
-            # Reduce the attn calculation time by using multiple short sequence packs
-            assert data_batches[0]["seq_ctx"].input_ids is not None
-            pad_tokens = tuple(
-                torch.zeros(1, 1024, dtype=data_batches[0]["seq_ctx"].input_ids.dtype, device="cpu")
-                for _ in range(pack_max_length // 1024)
-            )
-            if pack_max_length % 1024 > 0:
-                assert data_batches[0]["seq_ctx"].input_ids is not None
-                pad_tokens = pad_tokens + (
-                    torch.zeros(
-                        1, pack_max_length % 1024, dtype=data_batches[0]["seq_ctx"].input_ids.dtype, device="cpu"
-                    ),
-                )
-            pad_seq_ctx = SequenceContext.from_input_ids(pad_tokens, device="cpu")  # type: ignore
-            pad_seq_ctx.num_padding = pack_max_length
-            if is_qwen3_vl:
-                _position_ids_list = []
-                for pad_token in pad_tokens:
-                    _position_ids = torch.arange(pad_token.size(-1)).view(1, 1, -1).expand(3, 1, -1)
-                    _position_ids_list.append(_position_ids)
-                pad_seq_ctx.position_ids = torch.cat(_position_ids_list, dim=-1)  # type: ignore
-
-            pad_shifted_labels = torch.full(
-                (1, pack_max_length),
-                -100,
-                dtype=packed_data_batches[0]["shifted_labels"].dtype,
-                device="cpu",
-            )
-            pad_advantages = torch.full(
-                (1, pack_max_length),
-                -100,
-                dtype=packed_data_batches[0]["advantages"].dtype,
-                device="cpu",
+        return {
+            "seq_ctx": pad_seq_ctx,
+            "shifted_labels": pad_labels,
+            "advantages": pad_advantage,
+            "rollout_logprobs": pad_rollout_logprobs,
+        }
+
+    def _pack(self, mini_batch, pack_max_length):
+        assert len(mini_batch) > 0, "mini_batch should not be empty"
+        seqlen_list = []
+        for data in mini_batch:
+            assert data["seq_ctx"].input_ids.numel() <= pack_max_length, (
+                f"Single sample seq len {data['seq_ctx'].input_ids.numel()} exceeds pack_max_length {pack_max_length}"
             )
+            seqlen_list.append(data["seq_ctx"].input_ids.numel())
+        total_length = sum(seqlen_list)
 
-            if has_rollout_routed_experts:
-                pad_rand_index = torch.randint(
-                    low=0,
-                    high=1,
-                    size=(1, 1, 1),  # add dummy data, true data will be initialized in train worker.fit
-                )
-                pad_seq_ctx.rollout_routed_experts = pad_rand_index
+        if total_length <= pack_max_length:
+            return [mini_batch]  # No packing needed
 
-            pad_rollout_logprobs = None
-            if "rollout_logprobs" in packed_data_batches[0] and packed_data_batches[0]["rollout_logprobs"] is not None:
-                pad_rollout_logprobs = torch.zeros(
-                    1, pack_max_length, dtype=packed_data_batches[0]["rollout_logprobs"].dtype, device="cpu"
-                )
-            pad_data = {
-                "seq_ctx": pad_seq_ctx,
-                "shifted_labels": pad_shifted_labels,
-                "advantages": pad_advantages,
-                "rollout_logprobs": pad_rollout_logprobs,
+        num_packs = math.ceil(total_length / pack_max_length)
+        partitions_indices = get_seqlen_balanced_partitions(
+            seqlen_list=seqlen_list, k_partitions=num_packs, equal_size=False
+        )
+
+        packed_mini_batches = []
+        for partition in partitions_indices:
+            packed_batch = [mini_batch[i] for i in partition]
+            packed_mini_batches.append(packed_batch)
+        return packed_mini_batches
+
+    def _get_data_batches_properties(self, data_batches: list[ColateItem]):
+        """Extract properties from the first element of data_batches."""
+        if not data_batches:
+            return {
+                "is_qwen3_vl": False,
+                "has_rollout_routed_experts": False,
+                "has_rollout_logprobs": False,
+                "n_routed_experts": None,
             }
-        pad_data_samples = [pad_data for _ in range(pad_num)]
-        packed_data_batches = packed_data_batches + pad_data_samples
 
-        print(f"len(packed_data_batches): {len(packed_data_batches)}")
+        first_item = data_batches[0]
+        seq_ctx = first_item["seq_ctx"]
+
+        is_qwen3_vl = seq_ctx.position_ids is not None and len(seq_ctx.position_ids.shape) == 3
+        has_rollout_logprobs = "rollout_logprobs" in first_item and first_item["rollout_logprobs"] is not None
+        has_rollout_routed_experts = seq_ctx.rollout_routed_experts is not None
+
+        language_cfg = None
+        if has_rollout_routed_experts:
+            language_cfg = self.model_cfg
+            if isinstance(self.model_cfg, BaseComposeConfig):
+                language_cfg = self.model_cfg.text_config
+
+        return {
+            "is_qwen3_vl": is_qwen3_vl,
+            "has_rollout_routed_experts": has_rollout_routed_experts,
+            "has_rollout_logprobs": has_rollout_logprobs,
+            "n_routed_experts": language_cfg.n_routed_experts if language_cfg is not None else None,
+        }
+
+    @ray_method
+    def fit(
+        self, data_batches: list[ColateItem], pack_max_length: int, rollout_idx: int, enable_dp_balance: bool = True
+    ):
+        batch_props = self._get_data_batches_properties(data_batches)
+        is_qwen3_vl = batch_props["is_qwen3_vl"]
+        has_rollout_routed_experts = batch_props["has_rollout_routed_experts"]
+        has_rollout_logprobs = batch_props["has_rollout_logprobs"]
+        n_routed_experts = batch_props["n_routed_experts"]
+
+        world_size = len(self.workers)
+        dp_size = world_size // self.data_replicate_size
+        assert world_size % self.data_replicate_size == 0, "world_size must be divisible by data_replicate_size"
+        optimizer_steps = self.worker_cfg.optimizer_steps
+
+        if enable_dp_balance:
+            # Re-distribute the data across dp_size partitions so each dp rank gets roughly the same number of tokens
+            batches_per_dp_group = self._balance_split_batch(data_batches, dp_size)
+        else:
+            batches_per_dp_group = np.array_split(data_batches, dp_size)
+            tokens_in_partition = []
+            for batch in batches_per_dp_group:
+                tokens_in_partition.append(sum(data["seq_ctx"].input_ids.numel() for data in batch))
+            get_logger().info(f"default split into {dp_size} partitions with tokens: {tokens_in_partition}")
+
+        packed_data_batches: list[list[list[dict]]] = [[[] for _ in range(optimizer_steps)] for _ in range(dp_size)]
+        max_packs_per_card = [0] * optimizer_steps
+
+        for dp_rank, dp_worker_data_batches in enumerate(batches_per_dp_group):
+            # Within each worker, split the tokens evenly across optimizer_steps
+            mini_batch_for_steps = self._balance_split_batch(dp_worker_data_batches, optimizer_steps)
+
+            for step_idx, step_mini_batch in enumerate(mini_batch_for_steps):
+                # pack
+                pack_mini_batch = self._pack(step_mini_batch, pack_max_length)
+                if len(pack_mini_batch) > max_packs_per_card[step_idx]:
+                    max_packs_per_card[step_idx] = len(pack_mini_batch)
+
+                for pack in pack_mini_batch:
+                    seq_ctx_list = [item["seq_ctx"] for item in pack]
+                    label_list = [item["shifted_labels"] for item in pack]
+                    advantage_list = [torch.tensor([item["advantage"]]).float().unsqueeze(0) for item in pack]
+                    rollout_logprobs_list = [
+                        item["rollout_logprobs"] if has_rollout_logprobs else None for item in pack
+                    ]
+                    padding_len = pack_max_length - sum([item["seq_ctx"].input_ids.numel() for item in pack])
+                    if padding_len > 0:
+                        padding_sample = self._create_padding_sample(
+                            padding_len,
+                            pack_max_length,
+                            is_qwen3_vl=is_qwen3_vl,
+                            has_rollout_routed_experts=has_rollout_routed_experts,
+                            has_rollout_logprobs=has_rollout_logprobs,
+                            n_routed_experts=n_routed_experts,
+                        )
+                        seq_ctx_list.append(padding_sample["seq_ctx"])
+                        label_list.append(padding_sample["shifted_labels"])
+                        advantage_list.append(padding_sample["advantages"])
+                        rollout_logprobs_list.append(padding_sample["rollout_logprobs"])
+
+                    packed_seq_ctx = SequenceContext.pack(seq_ctx_list)
+                    packed_shifted_labels = torch.cat(label_list, dim=1)
+                    cu_seq_lens_q = packed_seq_ctx.cu_seq_lens_q
+                    packed_num_tokens = cu_seq_lens_q[1:] - cu_seq_lens_q[:-1]
+                    packed_advantages = torch.cat(advantage_list, dim=1)
+                    packed_advantages = torch.repeat_interleave(packed_advantages, packed_num_tokens, dim=1)
+                    if has_rollout_logprobs:
+                        cast_rollout_logprobs_list = [cast(torch.Tensor, item) for item in rollout_logprobs_list]
+                        packed_rollout_logprobs = torch.cat(cast_rollout_logprobs_list, dim=1)
+                    else:
+                        packed_rollout_logprobs = None
+                    packed_data_batches[dp_rank][step_idx].append(
+                        {
+                            "seq_ctx": packed_seq_ctx,
+                            "shifted_labels": packed_shifted_labels,
+                            "advantages": packed_advantages,
+                            "rollout_logprobs": packed_rollout_logprobs,
+                        }
+                    )
+
+        get_logger().info(f"Gradient accumulation steps: {max_packs_per_card}")
+        # padding for each worker to have same number of packs
+        for dp_rank in range(dp_size):
+            for step_idx in range(optimizer_steps):
+                max_packs = max_packs_per_card[step_idx]
+                num_current_packs = len(packed_data_batches[dp_rank][step_idx])
+                num_padding_packs = max_packs - num_current_packs
+
+                if num_padding_packs > 0:
+                    padding_sample = self._create_padding_sample(
+                        pack_max_length,
+                        pack_max_length,
+                        is_qwen3_vl=is_qwen3_vl,
+                        has_rollout_routed_experts=has_rollout_routed_experts,
+                        has_rollout_logprobs=has_rollout_logprobs,
+                        n_routed_experts=n_routed_experts,
+                    )
+                    padding_samples = [padding_sample for _ in range(num_padding_packs)]
+                    packed_data_batches[dp_rank][step_idx].extend(padding_samples)
 
         handles = []
         for worker_idx, worker in enumerate(self.workers):
             handles.append(
                 worker.fit.remote(  # type: ignore[attr-defined]
-                    data_batches=packed_data_batches[(worker_idx // data_replicate_size) :: dp_size],
+                    data_batches=packed_data_batches[worker_idx // self.data_replicate_size],
                     rollout_idx=rollout_idx,
                 )
             )
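
Note: the refactored fit above replaces the old flow (pack everything once, pad to a multiple of dp_size, then stride the packs across ranks) with a two-level, token-balanced split: samples are partitioned across dp ranks, then across optimizer_steps within each rank, each step's mini-batch is packed into packs of at most pack_max_length tokens, and every (dp_rank, step) bucket is finally padded to the same number of packs so gradient-accumulation steps stay aligned across ranks. The self-contained sketch below reproduces that bookkeeping with a naive greedy longest-first partitioner standing in for xtuner's get_seqlen_balanced_partitions; the function names and the heuristic are illustrative assumptions, not the library's implementation:

import math


def greedy_balanced_partitions(seqlens: list[int], k: int) -> list[list[int]]:
    """Greedy longest-first assignment: each sample goes to the currently lightest bin."""
    bins: list[list[int]] = [[] for _ in range(k)]
    loads = [0] * k
    for idx in sorted(range(len(seqlens)), key=lambda i: seqlens[i], reverse=True):
        target = loads.index(min(loads))
        bins[target].append(idx)
        loads[target] += seqlens[idx]
    return bins


def plan_packs(seqlens: list[int], dp_size: int, optimizer_steps: int, pack_max_length: int):
    """Return, per dp rank and per optimizer step, lists of sample indices forming one pack each."""
    plan = [[[] for _ in range(optimizer_steps)] for _ in range(dp_size)]
    max_packs_per_step = [0] * optimizer_steps
    for dp_rank, rank_bin in enumerate(greedy_balanced_partitions(seqlens, dp_size)):
        rank_seqlens = [seqlens[i] for i in rank_bin]
        for step, step_bin in enumerate(greedy_balanced_partitions(rank_seqlens, optimizer_steps)):
            step_indices = [rank_bin[j] for j in step_bin]
            total = sum(seqlens[i] for i in step_indices)
            num_packs = max(1, math.ceil(total / pack_max_length))
            packs = greedy_balanced_partitions([seqlens[i] for i in step_indices], num_packs)
            plan[dp_rank][step] = [[step_indices[j] for j in pack] for pack in packs]
            max_packs_per_step[step] = max(max_packs_per_step[step], len(packs))
    # Pad every (dp_rank, step) bucket to the same pack count; None marks a full-length dummy pack.
    for dp_rank in range(dp_size):
        for step in range(optimizer_steps):
            plan[dp_rank][step] += [None] * (max_packs_per_step[step] - len(plan[dp_rank][step]))
    return plan, max_packs_per_step


if __name__ == "__main__":
    seqlens = [900, 300, 1200, 250, 700, 450, 1100, 600]
    plan, max_packs = plan_packs(seqlens, dp_size=2, optimizer_steps=2, pack_max_length=2048)
    print("packs per (optimizer) step:", max_packs)
    for dp_rank, steps in enumerate(plan):
        print(f"dp_rank {dp_rank}:", steps)

Running it prints the per-step pack counts and, for each rank, the sample indices that would end up in each pack, mirroring how the controller pads shorter buckets with full-length dummy packs.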
