Commit 611b216

clean

1 parent 5dfcf8b commit 611b216

File tree: 8 files changed, +128 −58 lines changed


lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py

Lines changed: 1 addition & 2 deletions
@@ -13,7 +13,7 @@
 from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num, get_env_start_args
 from lightllm.utils.dist_utils import get_global_world_size, get_global_rank
 from lightllm.utils.log_utils import init_logger
-from lightllm.common.basemodel.routing_manager import g_routing_capture_manager, get_next_moe_layer_index
+from lightllm.common.basemodel.routing_manager import get_next_moe_layer_index

 logger = init_logger(__name__)

@@ -105,7 +105,6 @@ def _init_parallel_params(self):
             f"redundancy_expertids: {self.redundancy_expert_ids}"
         )
         self.local_n_routed_experts = self.n_routed_experts // self.global_world_size + self.redundancy_expert_num
-        self.split_inter_size = self.moe_intermediate_size
         n_experts_per_rank = self.n_routed_experts // self.global_world_size
         start_expert_id = self.global_rank_ * n_experts_per_rank
         self.local_expert_ids = (

lightllm/common/basemodel/routing_manager.py

Lines changed: 65 additions & 19 deletions
@@ -3,9 +3,52 @@
 from typing import Optional
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.dist_utils import get_current_rank_in_dp
+from lightllm.server.router.dynamic_prompt.shared_arr import SharedArray
+from lightllm.utils.envs_utils import get_unique_server_name

 logger = init_logger(__name__)

+
+class SharedRoutingConfig:
+    """Shared MoE routing configuration across processes."""
+
+    def __init__(self):
+        service_name = get_unique_server_name()
+        # Shape: [num_moe_layers, topk]
+        self._shm = SharedArray(f"{service_name}_routing_config", shape=(2,), dtype=np.int32)
+
+    @property
+    def num_moe_layers(self) -> int:
+        return int(self._shm.arr[0])
+
+    @num_moe_layers.setter
+    def num_moe_layers(self, value: int):
+        self._shm.arr[0] = value
+
+    @property
+    def topk(self) -> int:
+        return int(self._shm.arr[1])
+
+    @topk.setter
+    def topk(self, value: int):
+        self._shm.arr[1] = value
+
+    def is_initialized(self) -> bool:
+        return self.num_moe_layers > 0 and self.topk > 0
+
+
+# Global shared routing config (lazy initialized)
+_shared_routing_config: Optional[SharedRoutingConfig] = None
+
+
+def get_shared_routing_config() -> SharedRoutingConfig:
+    """Get or create the shared routing config."""
+    global _shared_routing_config
+    if _shared_routing_config is None:
+        _shared_routing_config = SharedRoutingConfig()
+    return _shared_routing_config
+
+
 # MoE layer counter for auto-incrementing moe_layer_index
 _moe_layer_counter: int = 0

@@ -75,12 +118,8 @@ def __init__(
         )

     def capture(self, moe_layer_index: int, topk_ids: torch.Tensor, microbatch_index: int = 0) -> None:
-        assert (
-            0 <= moe_layer_index < self.num_moe_layers
-        ), f"moe_layer_index {moe_layer_index} out of range [0, {self.num_moe_layers})"
-        slot = microbatch_index % self.num_slots
         num_tokens = topk_ids.shape[0]
-        self.gpu_buffer[slot, moe_layer_index, :num_tokens, :] = topk_ids.to(self.dtype)
+        self.gpu_buffer[microbatch_index, moe_layer_index, :num_tokens, :] = topk_ids.to(self.dtype)

     def flush_to_cpu_async(self, mem_indexes: torch.Tensor, microbatch_index: int) -> None:
         num_tokens = mem_indexes.shape[0]
@@ -98,9 +137,20 @@ def flush_to_cpu_async(self, mem_indexes: torch.Tensor, microbatch_index: int) -> None:
         self.cpu_buffer[:, cpu_indexes, :] = self.gpu_buffer[slot, :, :num_tokens, :].cpu()
         event.record()

-    def extract_for_request(self, mem_indexes: torch.Tensor) -> np.ndarray:
+    def sync_events(self) -> None:
+        """Synchronize all flush events. Call once before batch extraction."""
         for event in self.flush_events:
             event.synchronize()
+
+    def extract_for_request(self, mem_indexes: torch.Tensor) -> np.ndarray:
+        self.sync_events()
+        return self.cpu_buffer[:, mem_indexes, :].numpy()
+
+    def extract_for_request_no_sync(self, mem_indexes: torch.Tensor) -> np.ndarray:
+        """Extract routing data without synchronizing events.
+
+        Call sync_events() once before using this method in a batch.
+        """
         return self.cpu_buffer[:, mem_indexes, :].numpy()


@@ -132,8 +182,6 @@ def init_routing_capture(model) -> None:
         return

     # Only create routing capture manager on rank 0
-    # Routing decisions are identical across all TP ranks, so we only need to capture on rank 0
-    # which is the rank that communicates results back to the Router/HTTP server
     if get_current_rank_in_dp() != 0:
         logger.info("Skipping routing capture initialization on non-zero rank")
         return
@@ -145,16 +193,9 @@ def init_routing_capture(model) -> None:
         )
         return

-    n_routed_experts = model.config.get("n_routed_experts", model.config.get("num_experts", 0))
-    if n_routed_experts == 0:
-        logger.warning(
-            "enable_return_routed_experts is set but n_routed_experts=0. " "Routing capture will not be enabled."
-        )
-        return
-
-    topk = model.config.get("num_experts_per_tok", 1)
-    num_experts = n_routed_experts
-
+    num_experts = model.config.get("n_routed_experts", model.config.get("num_experts", 0))
+    topk = model.config.get("num_experts_per_tok", 0)
+    assert num_experts > 0 and topk > 0
     enable_overlap = getattr(model.args, "enable_decode_microbatch_overlap", False)

     logger.info(
@@ -167,11 +208,16 @@ def init_routing_capture(model) -> None:
         topk=topk,
         num_experts=num_experts,
         batch_max_tokens=model.max_total_token_num,
-        # Add 1 to handle potential edge case where mem_index == size
         kv_cache_size=model.mem_manager.size + 1,
         enable_overlap=enable_overlap,
     )

+    # Set shared routing config for cross-process access
+    shared_config = get_shared_routing_config()
+    shared_config.num_moe_layers = num_moe_layers
+    shared_config.topk = topk
+    logger.info(f"Shared routing config set: num_moe_layers={num_moe_layers}, topk={topk}")
+

 def flush_routing_capture(mem_indexes: torch.Tensor, microbatch_index: int = 0) -> None:
     if g_routing_capture_manager is not None:
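
The SharedRoutingConfig added above publishes num_moe_layers and topk through a tiny shared-memory array, so the HTTP server process can size and read each request's routing buffer without the per-request counters this commit removes from Req. Below is a minimal, self-contained sketch of the same publish/attach pattern using the standard library's multiprocessing.shared_memory in place of LightLLM's SharedArray; the class name, segment name, and example values are illustrative only.

# Sketch of the shared-config pattern, assuming only the standard library.
import numpy as np
from multiprocessing import shared_memory


class TinySharedRoutingConfig:
    """Two int32 slots shared across processes: [num_moe_layers, topk]."""

    def __init__(self, name: str = "demo_routing_config"):
        nbytes = 2 * np.dtype(np.int32).itemsize
        try:
            # Writer side: create the segment and zero it.
            self._shm = shared_memory.SharedMemory(name=name, create=True, size=nbytes)
            self.arr = np.ndarray((2,), dtype=np.int32, buffer=self._shm.buf)
            self.arr[:] = 0
        except FileExistsError:
            # Reader side: attach to the segment another process created.
            self._shm = shared_memory.SharedMemory(name=name, create=False)
            self.arr = np.ndarray((2,), dtype=np.int32, buffer=self._shm.buf)

    @property
    def num_moe_layers(self) -> int:
        return int(self.arr[0])

    @property
    def topk(self) -> int:
        return int(self.arr[1])

    def is_initialized(self) -> bool:
        return self.num_moe_layers > 0 and self.topk > 0


if __name__ == "__main__":
    writer = TinySharedRoutingConfig()
    writer.arr[0], writer.arr[1] = 58, 8    # model process publishes its config
    reader = TinySharedRoutingConfig()      # e.g. the HTTP server process attaches
    print(reader.num_moe_layers, reader.topk, reader.is_initialized())
    # Release the demo segment.
    del writer.arr, reader.arr
    reader._shm.close()
    writer._shm.close()
    writer._shm.unlink()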

lightllm/common/quantization/w8a8.py

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ def quantize(self, weight: torch.Tensor, output: WeightPack) -> None:
         weight = weight.float().cuda(self.device_id_)
         scale = weight.abs().max(dim=-1)[0] / 127
         weight = weight / scale.reshape(-1, 1)
-        weight = torch.round(weight.clamp(min=-128, max=127)).to(dtype=torch.int8)
+        weight = torch.round(weight.clamp(min=-127, max=127)).to(dtype=torch.int8)
         output.weight.copy_(weight)
         output.weight_scale.copy_(scale)
         return
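
The clamp change above keeps the quantized weights in the symmetric int8 range [-127, 127]. Since the per-row scale is max(|w|) / 127, the scaled values already fall in [-127, 127], and excluding -128 guarantees the dequantized magnitude never exceeds the row's original maximum. A rough CPU-only sketch of this per-row symmetric scheme follows; the helper name and shapes are illustrative, not the repo's API.

# Per-row symmetric int8 quantization with the [-127, 127] clamp, as a sketch.
import torch


def quantize_per_row_int8(weight: torch.Tensor):
    weight = weight.float()
    scale = weight.abs().max(dim=-1)[0] / 127           # one scale per output row
    q = weight / scale.reshape(-1, 1)
    q = torch.round(q.clamp(min=-127, max=127)).to(torch.int8)
    return q, scale


if __name__ == "__main__":
    w = torch.randn(4, 16)
    q, scale = quantize_per_row_int8(w)
    w_hat = q.float() * scale.reshape(-1, 1)             # dequantize
    print((w - w_hat).abs().max())                        # small round-off error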

lightllm/server/core/objs/req.py

Lines changed: 29 additions & 16 deletions
@@ -1,6 +1,7 @@
 import os
 import math
 import ctypes
+import base64
 import numpy as np
 import time
 from .sampling_params import SamplingParams
@@ -122,9 +123,6 @@ class Req(ctypes.Structure):
         ("cpu_cache_match_page_indexes", CpuCachePageList),
         # Block size for chunked hashing
         ("cpu_cache_token_page_size", ctypes.c_int),
-        ("routing_data_num_moe_layers", ctypes.c_int),
-        ("routing_data_num_tokens", ctypes.c_int),
-        ("routing_data_topk", ctypes.c_int),
     ]

     def get_str(self):
@@ -183,10 +181,6 @@ def init(
         self.stop_str_matched = False
         self.stop_str_matched_token_index = -1

-        self.routing_data_num_moe_layers = 0
-        self.routing_data_num_tokens = 0
-        self.routing_data_topk = 0
-
         self.post_init()

         self.cpu_cache_token_page_size = get_env_start_args().cpu_cache_token_page_size
@@ -240,25 +234,21 @@ def create_routing_data_shm_array(self, num_moe_layers: int, num_tokens: int, topk: int):
         shape = (num_moe_layers, num_tokens, topk)
         self.shm_routing_data = ShmArray(name, shape, dtype=np.int32)
         self.shm_routing_data.create_shm()
-        self.routing_data_num_moe_layers = num_moe_layers
-        self.routing_data_num_tokens = num_tokens
-        self.routing_data_topk = topk
         return

-    def link_routing_data_shm_array(self):
-        if self.routing_data_num_moe_layers == 0:
+    def link_routing_data_shm_array(self, num_moe_layers: int, topk: int):
+        if num_moe_layers == 0:
             return
         service_uni_name = get_unique_server_name()
         name = f"{service_uni_name}_shm_routing_{self.index_in_shm_mem}"
-        shape = (self.routing_data_num_moe_layers, self.routing_data_num_tokens, self.routing_data_topk)
+        # num_tokens equals shm_cur_kv_len at the time of creation
+        shape = (num_moe_layers, self.shm_cur_kv_len, topk)
         self.shm_routing_data = ShmArray(name, shape, dtype=np.int32)
         self.shm_routing_data.link_shm()
         return

     def get_routing_data(self):
-        if self.routing_data_num_moe_layers == 0 or not hasattr(self, "shm_routing_data"):
-            return None
-        if self.shm_routing_data is None:
+        if not hasattr(self, "shm_routing_data") or self.shm_routing_data is None:
             return None
         return self.shm_routing_data.arr

@@ -268,6 +258,29 @@ def close_routing_data_shm_array(self):
         self.shm_routing_data = None
         return

+    def get_routing_metadata(self, num_moe_layers: int, topk: int):
+        """Safely extract routing data and format for API response.
+
+        Returns a dict with shape, dtype, and base64-encoded data, or None if unavailable.
+        """
+        if num_moe_layers == 0 or topk == 0:
+            return None
+        try:
+            self.link_routing_data_shm_array(num_moe_layers, topk)
+            routing_data = self.get_routing_data()
+            if routing_data is None:
+                return None
+            return {
+                "shape": list(routing_data.shape),
+                "dtype": str(routing_data.dtype),
+                "data": base64.b64encode(routing_data.tobytes()).decode("ascii"),
+            }
+        except Exception as e:
+            logger.warning(f"Failed to read routing data for req {self.request_id}: {e}")
+            return None
+        finally:
+            self.close_routing_data_shm_array()
+
     def get_prompt_ids(self):
         return self.shm_prompt_ids.arr[: self.input_len].tolist()

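
On the consumer side, the routed_experts dict returned by get_routing_metadata() can be decoded back into an array of shape (num_moe_layers, num_tokens, topk) by reversing the base64 encoding. A small round-trip sketch; the payload below is fabricated for illustration rather than taken from a real response.

# Decode a "routed_experts" payload of the form produced above.
import base64
import numpy as np

# Stand-in for metadata["routed_experts"] from an API response.
example = np.arange(2 * 3 * 2, dtype=np.int32).reshape(2, 3, 2)
payload = {
    "shape": list(example.shape),
    "dtype": str(example.dtype),
    "data": base64.b64encode(example.tobytes()).decode("ascii"),
}

routed = np.frombuffer(
    base64.b64decode(payload["data"]), dtype=payload["dtype"]
).reshape(payload["shape"])

assert np.array_equal(routed, example)
print(routed.shape)  # (num_moe_layers, num_tokens, topk)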

lightllm/server/core/objs/sampling_params.py

Lines changed: 0 additions & 1 deletion
@@ -497,7 +497,6 @@ def to_dict(self):
             "add_spaces_between_special_tokens": self.add_spaces_between_special_tokens,
             "print_eos_token": self.print_eos_token,
             "disable_prompt_cache": self.disable_prompt_cache,
-            "return_routed_experts": self.return_routed_experts,
         }

     def to_origin_dict(self):

lightllm/server/core/objs/start_args_type.py

Lines changed: 2 additions & 0 deletions
@@ -159,3 +159,5 @@ class StartArgs:
     # multi_modal
     enable_multimodal: bool = field(default=False)
     enable_multimodal_audio: bool = field(default=False)
+
+    enable_return_routed_experts: bool = field(default=False)

lightllm/server/httpserver/manager.py

Lines changed: 10 additions & 14 deletions
@@ -10,7 +10,6 @@
 import hashlib
 import datetime
 import pickle
-import base64
 from frozendict import frozendict

 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
@@ -30,6 +29,7 @@
 from lightllm.server.core.objs.shm_req_manager import ShmReqManager
 from lightllm.server.core.objs.atomic_array_lock import AtomicShmArrayLock, AsyncLock, AtomicLockItem
 from lightllm.server.router.dynamic_prompt.shared_arr import SharedInt
+from lightllm.common.basemodel.routing_manager import get_shared_routing_config
 from lightllm.utils.log_utils import init_logger
 from lightllm.server.metrics.manager import MetricClient
 from lightllm.utils.statics_utils import MovingAverage
@@ -115,6 +115,9 @@ def __init__(
         # If the timemark is not updated for a pre-set time, a prob request will be sent to the backend.
         self.latest_success_infer_time_mark = SharedInt(f"{get_unique_server_name()}_latest_success_infer_time_mark")
         self.latest_success_infer_time_mark.set_value(int(time.time()))
+
+        # Cache routing config for MoE expert routing data extraction
+        self._routing_config = get_shared_routing_config() if args.enable_return_routed_experts else None
         return

     async def _alloc_resource(self, items, md5sums, token_nums, datas):
@@ -779,19 +782,12 @@ async def handle_loop(self):
                 else:
                     finish_status = FinishStatus(req.finish_status.status)

-                if req.sample_params.return_routed_experts and req.routing_data_num_moe_layers > 0:
-                    try:
-                        req.link_routing_data_shm_array()
-                        routing_data = req.get_routing_data()
-                        if routing_data is not None:
-                            metadata["routed_experts"] = {
-                                "shape": list(routing_data.shape),
-                                "dtype": str(routing_data.dtype),
-                                "data": base64.b64encode(routing_data.tobytes()).decode("ascii"),
-                            }
-                        req.close_routing_data_shm_array()
-                    except Exception as e:
-                        logger.warning(f"Failed to read routing data for req {req_id}: {e}")
+                if self._routing_config is not None and self._routing_config.is_initialized():
+                    routing_meta = req.get_routing_metadata(
+                        self._routing_config.num_moe_layers, self._routing_config.topk
+                    )
+                    if routing_meta is not None:
+                        metadata["routed_experts"] = routing_meta

                 token_list.append((req_id, text, metadata, finish_status))
             else:

lightllm/server/router/model_infer/infer_batch.py

Lines changed: 20 additions & 5 deletions
@@ -114,16 +114,25 @@ def add_reqs(self, requests: List[Tuple[int, int, Any, int]], init_prefix_cache:

         return req_objs

-    def _extract_routing_data(self, req: "InferReq"):
+    def _extract_routing_data(self, req: "InferReq", sync: bool = True):
+        """Extract MoE routing data for a completed request.
+
+        Args:
+            req: The inference request to extract routing data for.
+            sync: If True, synchronize CUDA events before extraction. Set to False
+                when processing multiple requests in batch after calling
+                g_routing_capture_manager.sync_events() once.
+        """
         mem_indexes = self.req_manager.req_to_token_indexs[req.req_idx][0 : req.cur_kv_len]
         num_moe_layers = g_routing_capture_manager.num_moe_layers
         topk = g_routing_capture_manager.topk
         num_tokens = req.cur_kv_len
-        logger.debug(f"R3: Extracting routing for req {req.req_id}: {num_moe_layers}x{num_tokens}x{topk}")
-        routing_data = g_routing_capture_manager.extract_for_request(mem_indexes.cpu())
+        if sync:
+            routing_data = g_routing_capture_manager.extract_for_request(mem_indexes.cpu())
+        else:
+            routing_data = g_routing_capture_manager.extract_for_request_no_sync(mem_indexes.cpu())
         req.shm_req.create_routing_data_shm_array(num_moe_layers, num_tokens, topk)
         req.shm_req.shm_routing_data.arr[:] = routing_data
-        logger.debug(f"R3: Successfully extracted routing data for req {req.req_id}")

     def free_a_req_mem(self, free_token_index: List, req: "InferReq"):
         if self.radix_cache is None:
@@ -161,14 +170,20 @@ def _filter(self, finished_request_ids: List[int]):
         if len(finished_request_ids) == 0:
             return

+        # Optimization: sync CUDA events once for batch routing data extraction
+        need_routing_data = g_routing_capture_manager is not None
+        if need_routing_data:
+            g_routing_capture_manager.sync_events()
+
         free_req_index = []
         free_token_index = []
         for request_id in finished_request_ids:
             req: InferReq = self.requests_mapping.pop(request_id)
             if self.args.diverse_mode:
                 req.clear_master_slave_state()

-            self._extract_routing_data(req)
+            if need_routing_data:
+                self._extract_routing_data(req, sync=False)

             self.free_a_req_mem(free_token_index, req)
