Merged

Commits (26):
557db0a  wip: not compiles yet  (nekorobov, Feb 5, 2026)
45cdb86  fix: compiles, but hangs in autotuning  (nekorobov, Feb 5, 2026)
d8c15b4  banned splitK and tileN 256, unit test works  (nekorobov, Feb 5, 2026)
8a7a269  Merge remote-tracking branch 'origin/main' into nkorobov/mxfp8-trtllm…  (IwakuraRein, Feb 5, 2026)
77c49a7  upd  (IwakuraRein, Feb 5, 2026)
3e1a29f  add mxfp8 bench  (IwakuraRein, Feb 5, 2026)
b12c461  fix test  (IwakuraRein, Feb 6, 2026)
46eddfa  upd comments  (IwakuraRein, Feb 6, 2026)
b046320  drop tile==8 and use unroll loop 2x  (IwakuraRein, Feb 6, 2026)
acf0c39  fix test  (IwakuraRein, Feb 6, 2026)
2702ee2  WAR: drop all UnrollLoop2xForMma kernels  (IwakuraRein, Feb 6, 2026)
1dc688d  Merge remote-tracking branch 'origin/main' into siyuanf/mxfp8-trtllm-…  (IwakuraRein, Feb 7, 2026)
4e83b82  address comment  (IwakuraRein, Feb 9, 2026)
aae1719  fix unit test  (IwakuraRein, Feb 9, 2026)
73d7594  fix hang and segfault  (nekorobov, Feb 10, 2026)
4354ec4  use permute cache in unit test (WIP)  (IwakuraRein, Feb 10, 2026)
0944312  use permute cache in unit test (WIP)  (IwakuraRein, Feb 10, 2026)
aa85e94  Revert "use permute cache in unit test (WIP)"  (IwakuraRein, Feb 11, 2026)
a7ebf1e  Merge remote-tracking branch 'origin/main' into siyuanf/mxfp8-trtllm-…  (IwakuraRein, Feb 12, 2026)
4815a0c  address comments  (IwakuraRein, Feb 13, 2026)
e18d73c  intermediate_size_factor  (IwakuraRein, Feb 13, 2026)
b9f198d  Merge remote-tracking branch 'origin/main' into siyuanf/mxfp8-trtllm-…  (IwakuraRein, Feb 13, 2026)
c310276  address comments  (IwakuraRein, Feb 13, 2026)
33acaa2  quick fix  (IwakuraRein, Feb 13, 2026)
03cac02  fix intermediate_size_factor initialization  (IwakuraRein, Feb 14, 2026)
19417d1  allow split k  (IwakuraRein, Feb 14, 2026)
425 changes: 294 additions & 131 deletions benchmarks/bench_trtllm_gen_fused_moe_autotuner.py

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions csrc/trtllm_batched_gemm_runner.cu
@@ -112,6 +112,11 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(
continue;
}

if (options.mDtypeA == tg::Dtype::MxE4m3 && options.mDtypeB == tg::Dtype::MxE4m3 &&
options.mNumSlicesForSplitK > 1) {
continue;
}

if (mOptions.transposeMmaOutput && options.mEpilogueTileM == mOptions.epilogueTileM) {
mPassingConfigIndices.push_back(i);
}
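The added guard excludes split-K kernel configurations when both GEMM operands are MxE4m3 (MxFP8): any candidate with mNumSlicesForSplitK > 1 is skipped before the passing-config list is built. Below is a minimal Python sketch of that filtering rule; GemmConfig and passes_mxfp8_splitk_guard are hypothetical stand-ins for the C++ options checked above, not part of this PR.

from dataclasses import dataclass

# Hypothetical, simplified stand-in for the C++ batched GEMM config options.
@dataclass
class GemmConfig:
    dtype_a: str
    dtype_b: str
    num_slices_for_split_k: int

def passes_mxfp8_splitk_guard(cfg: GemmConfig) -> bool:
    # Mirrors the new C++ guard: an MxE4m3 x MxE4m3 batched GEMM config is
    # rejected whenever it uses split-K (more than one K slice).
    if (cfg.dtype_a == "MxE4m3" and cfg.dtype_b == "MxE4m3"
            and cfg.num_slices_for_split_k > 1):
        return False
    return True

candidates = [
    GemmConfig("MxE4m3", "MxE4m3", 1),  # kept
    GemmConfig("MxE4m3", "MxE4m3", 2),  # skipped by the new guard
    GemmConfig("E4m3", "E4m3", 2),      # unaffected
]
passing = [c for c in candidates if passes_mxfp8_splitk_guard(c)]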
199 changes: 134 additions & 65 deletions csrc/trtllm_fused_moe_kernel_launcher.cu

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions flashinfer/artifacts.py
@@ -89,7 +89,7 @@ class ArtifactPath:

TRTLLM_GEN_FMHA: str = "75d477a640f268ea9ad117cc596eb39245713b9e/fmha/trtllm-gen/"
TRTLLM_GEN_BMM: str = (
"e1e11bbfe0743743620ef997a6d5e8e2dbdf01cf/batched_gemm-2a674db-79e4d37"
"456b1ae890d436c794b17e4435b41b849d3e5950/batched_gemm-2a674db-3a84a12"
)
TRTLLM_GEN_GEMM: str = (
"1fddc48b7b48af33914d040051b3e2ee9ba4701e/gemm-145d1b1-9b113e3"
@@ -110,7 +110,7 @@ class CheckSumHash:
"e014d7a54c396733ef012b223603c1be2861019f88faa5dcc882ed1ecfe5c2d9"
)
TRTLLM_GEN_BMM: str = (
"03b1a419b594b7a4613ea8437c172dc2627d56bd360be25aa604859dc12a05fb"
"b9121fed5dd7700b7c2a0dcbcf2ef022483855cf585263324275b0072cca6bb7"
)
DEEPGEMM: str = "1a2a166839042dbd2a57f48051c82cd1ad032815927c753db269a4ed10d0ffbf"
TRTLLM_GEN_GEMM: str = (
1 change: 1 addition & 0 deletions flashinfer/fused_moe/__init__.py
@@ -16,6 +16,7 @@

from .core import (
ActivationType,
Fp8QuantizationType,
RoutingMethodType,
WeightLayout,
convert_to_block_layout,
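Because __init__.py now re-exports the new enum, callers can pick the FP8 quantization scheme from the package namespace. A small sketch, assuming the package is importable as flashinfer; the values mirror the IntEnum added in flashinfer/fused_moe/core.py below.

from flashinfer.fused_moe import Fp8QuantizationType

# IntEnum members compare equal to their integer values.
assert Fp8QuantizationType.NoneFp8 == 0
assert Fp8QuantizationType.DeepSeekFp8 == 1
assert Fp8QuantizationType.MxFp8 == 2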
72 changes: 57 additions & 15 deletions flashinfer/fused_moe/core.py
@@ -173,6 +173,26 @@ class WeightLayout(IntEnum):
BlockMajorK = 2


# The type of gated activation function
# Please keep this in sync with the counterpart defined in include/flashinfer/trtllm/fused_moe/runner.h
class GatedActType(IntEnum):
# SwiGlu
SwiGlu = 0
# GeGlu
GeGlu = 1


# The type of FP8 quantization
# Please keep this in sync with the counterpart defined in trtllm_fused_moe_kernel_launcher.cu
class Fp8QuantizationType(IntEnum):
# No FP8 quantization
NoneFp8 = 0
# DeepSeek FP8
DeepSeekFp8 = 1
# MxFp8 x MxFp8
MxFp8 = 2


@functools.cache
def is_trtllm_moe_supported(
dtype_weights: DtypeTrtllmGen,
@@ -986,7 +1006,7 @@ def __init__(
num_local_experts: int,
dtype_act: DtypeTrtllmGen,
dtype_weights: DtypeTrtllmGen,
use_deepseek_fp8: bool,
fp8_quantization_type: Fp8QuantizationType,
hidden_size: int,
intermediate_size: int,
activation_type: int = ActivationType.Swiglu,
Expand All @@ -998,7 +1018,7 @@ def __init__(
self.top_k = top_k
self.dtype_act = dtype_act
self.dtype_weights = dtype_weights
self.use_deepseek_fp8 = use_deepseek_fp8
self.fp8_quantization_type = fp8_quantization_type
self.top_k = top_k
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
@@ -1025,7 +1045,7 @@ def get_valid_tactics(
instance_key = (
self.dtype_act,
self.dtype_weights,
self.use_deepseek_fp8,
self.fp8_quantization_type,
self.top_k,
self.hidden_size,
self.intermediate_size,
@@ -1114,16 +1134,28 @@ def forward(
and self.dtype_weights == DtypeTrtllmGen.E4m3
):
# FP8 operations
if self.use_deepseek_fp8:
if (
self.fp8_quantization_type == Fp8QuantizationType.DeepSeekFp8
or self.fp8_quantization_type == Fp8QuantizationType.MxFp8
):
# FP8 block scale
current_num_tokens = hidden_states.shape[0]
current_hidden_size = hidden_states.shape[1]
current_hidden_states_scale = torch.full(
(current_hidden_size // 128, current_num_tokens),
2.0,
dtype=torch.float,
device=hidden_states.device,
)
if self.fp8_quantization_type == Fp8QuantizationType.DeepSeekFp8:
current_hidden_states_scale = torch.full(
(current_hidden_size // 128, current_num_tokens),
2.0,
dtype=torch.float,
device=hidden_states.device,
)
elif self.fp8_quantization_type == Fp8QuantizationType.MxFp8:
current_hidden_states_scale = extra_inputs[0]

else:
raise ValueError(
f"Unsupported FP8 quantization type: {self.fp8_quantization_type}"
)

moe_op.trtllm_fp8_block_scale_moe(
routing_logits,
topk_ids,
Expand All @@ -1149,6 +1181,7 @@ def forward(
kwargs["weight_layout"],
kwargs["enable_pdl"],
[-1, -1] if tactic == -1 else tactic,
self.fp8_quantization_type,
)
else:
# FP8 per tensor scale
@@ -1319,7 +1352,7 @@ def trtllm_bf16_moe_op(
num_local_experts=local_num_experts,
dtype_act=dtype_act,
dtype_weights=dtype_weights,
use_deepseek_fp8=False,
fp8_quantization_type=Fp8QuantizationType.NoneFp8,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
weight_layout=weight_layout,
@@ -1452,7 +1485,7 @@ def trtllm_fp8_per_tensor_scale_moe_op(
num_local_experts=local_num_experts,
dtype_act=dtype_act,
dtype_weights=dtype_weights,
use_deepseek_fp8=False, # per_tensor mode
fp8_quantization_type=Fp8QuantizationType.NoneFp8, # per_tensor mode
hidden_size=hidden_size,
intermediate_size=intermediate_size,
weight_layout=WeightLayout.MajorK,
@@ -1569,6 +1602,7 @@ def trtllm_fp8_block_scale_moe_op(
weight_layout: int = 0,
enable_pdl: Optional[bool] = None,
tune_max_num_tokens: int = 8192,
fp8_quantization_type: Fp8QuantizationType = Fp8QuantizationType.DeepSeekFp8,
) -> torch.Tensor:
# Determine routing mode: compute from logits or use pre-computed
if routing_logits is None:
@@ -1619,7 +1653,7 @@ def trtllm_fp8_block_scale_moe_op(
num_local_experts=local_num_experts,
dtype_act=dtype_act,
dtype_weights=dtype_weights,
use_deepseek_fp8=True, # block_scale mode
fp8_quantization_type=fp8_quantization_type, # block_scale mode
hidden_size=hidden_size,
intermediate_size=intermediate_size,
weight_layout=weight_layout,
@@ -1682,6 +1716,7 @@ def trtllm_fp8_block_scale_moe_op(
weight_layout,
enable_pdl,
[-1, -1] if tactic == -1 else tactic,
fp8_quantization_type,
)

return result
@@ -1712,6 +1747,7 @@ def _fake_trtllm_fp8_block_scale_moe(
weight_layout: int = 0,
enable_pdl: Optional[bool] = None,
tune_max_num_tokens: int = 8192,
fp8_quantization_type: Fp8QuantizationType = Fp8QuantizationType.DeepSeekFp8,
):
seq_len = hidden_states.shape[0]
hidden_size = hidden_states.shape[1]
@@ -1809,7 +1845,7 @@ def trtllm_fp4_block_scale_moe_op(
num_local_experts=num_local_experts,
dtype_act=dtype_act,
dtype_weights=dtype_weights,
use_deepseek_fp8=False,
fp8_quantization_type=Fp8QuantizationType.NoneFp8,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
activation_type=activation_type,
@@ -2007,7 +2043,7 @@ def trtllm_mxint4_block_scale_moe_op(
num_local_experts=num_local_experts,
dtype_act=dtype_act,
dtype_weights=dtype_weights,
use_deepseek_fp8=False,
fp8_quantization_type=Fp8QuantizationType.NoneFp8,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
activation_type=ActivationType.Swiglu,
@@ -2303,6 +2339,7 @@ def trtllm_fp8_block_scale_moe(
weight_layout: int = 0,
enable_pdl: Optional[bool] = None,
tune_max_num_tokens: int = 8192,
fp8_quantization_type: Fp8QuantizationType = Fp8QuantizationType.DeepSeekFp8,
) -> torch.Tensor:
"""FP8 block scale MoE operation.

@@ -2326,6 +2363,7 @@ def trtllm_fp8_block_scale_moe(
routing_method_type: Type of routing method to use (default: 0)
enable_pdl: Whether to enable Programmatic Dependent Launch (PDL). Auto-enabled for >= sm90.
tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192)
fp8_quantization_type: Type of FP8 quantization to use (default: DeepSeekFp8)
Returns:
torch.Tensor: Output tensor of shape [seq_len, hidden_size]
"""
@@ -2357,6 +2395,7 @@ def trtllm_fp8_block_scale_moe(
weight_layout,
enable_pdl,
tune_max_num_tokens,
fp8_quantization_type,
)


@@ -2384,6 +2423,7 @@ def trtllm_fp8_block_scale_routed_moe(
enable_pdl: Optional[bool] = None,
output: Optional[torch.Tensor] = None,
tune_max_num_tokens: int = 8192,
fp8_quantization_type: Fp8QuantizationType = Fp8QuantizationType.DeepSeekFp8,
) -> torch.Tensor:
"""FP8 block scale MoE operation with pre-computed routing (packed format).

@@ -2418,6 +2458,7 @@ def trtllm_fp8_block_scale_routed_moe(
output (Optional[torch.Tensor]): shape [seq_len, hidden_size]
Optional inplace output tensor.
tune_max_num_tokens(int): Maximum number of tokens for tuning. (default: 8192)
fp8_quantization_type: Type of FP8 quantization to use (default: DeepSeekFp8)
Returns:
torch.Tensor: Output tensor of shape [seq_len, hidden_size]
"""
@@ -2446,6 +2487,7 @@ def trtllm_fp8_block_scale_routed_moe(
weight_layout,
enable_pdl,
tune_max_num_tokens,
fp8_quantization_type,
)


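Taken together, the core.py changes thread a new fp8_quantization_type keyword (default DeepSeekFp8) through trtllm_fp8_block_scale_moe and trtllm_fp8_block_scale_routed_moe, so MxFP8 block scaling reuses the same entry points as DeepSeek FP8. A hedged usage sketch follows: the unchanged tensor arguments (routing logits, hidden states, block scales, expert weights, and so on) are represented by a placeholder dict because their full signature is not visible in this diff, and importing the MoE entry point from flashinfer.fused_moe is an assumption.

import torch
from flashinfer.fused_moe import Fp8QuantizationType, trtllm_fp8_block_scale_moe

def run_block_scale_moe_mxfp8(moe_args: dict) -> torch.Tensor:
    # moe_args stands in for the existing positional/keyword arguments of
    # trtllm_fp8_block_scale_moe (routing logits, hidden states, block scales,
    # expert weights, ...), which this PR does not change.
    return trtllm_fp8_block_scale_moe(
        **moe_args,
        fp8_quantization_type=Fp8QuantizationType.MxFp8,  # new keyword; defaults to DeepSeekFp8
    )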