Commit c52127d

nil0x9 authored and HAOCHENYE committed
[CI] Add internal_metrics UT
1 parent 04e0f18 commit c52127d

1 file changed: 135 additions & 0 deletions
import os
from typing import cast
import torch
from torch import nn
from torch.distributed.tensor import DTensor
from transformers import AutoTokenizer

from xtuner._testing import DeterministicDDPTestCase
from xtuner.v1.config import FSDPConfig
from xtuner.v1.model.base import ModelItem
from xtuner.v1.model.moe.moe import SequenceContext
from xtuner.v1.model.moe.qwen3 import Qwen3MoEConfig
from xtuner.v1.module.attention import MHAConfig
from xtuner.v1.module.router.greedy import GreedyRouterConfig
from xtuner.v1.module import RMSNorm
from xtuner.v1.module.decoder_layer.moe_decoder_layer import MoEGate
from xtuner.v1.module.grouped_linear.moe_group_linear import GroupedLinear
from xtuner.v1.float8.float8_gmm_tile_wise import TileWiseFloat8GroupedLinear
from xtuner.v1.utils import internal_metrics
from xtuner.v1.utils.internal_metrics import InternalMetricsConfig, InternalMetricsRecorder
from xtuner.v1.utils.device import get_device
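
# Device under test; QWEN3_MOE_PATH must point at a local Qwen3-MoE checkpoint
# so the tokenizer can be loaded.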
DEVICE = get_device()
QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
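
# A deliberately small, single-layer Qwen3-MoE config so the test runs quickly.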
def _get_model_config() -> Qwen3MoEConfig:
    return Qwen3MoEConfig(
        vocab_size=151936,
        max_position_embeddings=4096,
        pad_token_id=0,
        bos_token_id=151643,
        eos_token_id=151645,
        num_hidden_layers=1,
        hidden_size=2048,
        intermediate_size=6144,
        rms_norm_eps=1e-6,
        rope_theta=1000000.0,
        hidden_act="silu",
        attention=MHAConfig(
            num_attention_heads=16,
            num_key_value_heads=4,
            head_dim=128,
        ),
        tie_word_embeddings=False,
        # MoE layout: 16 routed experts, no shared experts, top-1 routing,
        # and no leading dense layers (first_k_dense_replace=0).
        n_routed_experts=16,
        n_shared_experts=0,
        num_experts_per_tok=1,
        first_k_dense_replace=0,
        hidden_factor=1.0,
        moe_intermediate_size=768,
        router=GreedyRouterConfig(
            scoring_func="softmax",
            norm_topk_prob=True,
            router_scaling_factor=1.0,
        ),
    )
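
# End-to-end smoke test: shard the model with FSDP, push one batch through
# the recorder, and sanity-check every metric family it reports.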
class TestInternalMetricsRecorder(DeterministicDDPTestCase):
    def test_internal_metrics_run(self):
        self.create_pg("cuda")
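
        # Build on the meta device, then shard with FSDP and materialize weights.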
        config = _get_model_config()
        with torch.device("meta"):
            model = config.build()

        fsdp_config = FSDPConfig()
        model.fully_shard(fsdp_config=fsdp_config)
        model.init_weights()
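
        # Enable every monitor so each metric family is exercised below.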
        internal_metrics_interval = 1

        internal_metrics_cfg = InternalMetricsConfig(
            internal_metrics_interval=internal_metrics_interval,
            monitor_weights_rms_norm=True,
            monitor_attn_logits_stats=True,
            monitor_moe_router_logits_stats=True,
            monitor_moe_load_balance_stats=True,
        )

        metrics_recorder = InternalMetricsRecorder(internal_metrics_cfg, model)
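
        # Tokenize a single prompt and wrap it in the batch structure the
        # recorder expects.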
        hf_model_path = QWEN3_MOE_PATH
        tokenizer = AutoTokenizer.from_pretrained(hf_model_path)

        text_list = [
            # "A good researcher should first scrutinize their own claims and
            # sincerely try to test them with experiments."
            "一个好的研究者应自己先审视自己的 claim, 并真心地尝试用实验检验它们",
        ]

        data_batches = []

        for text in text_list:
            input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
            seq_ctx = SequenceContext.from_input_ids(input_ids=(input_ids,))
            data_batches.append(ModelItem(seq_ctx=seq_ctx, loss_ctx=None))  # type: ignore[arg-type]
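
        # pop_metrics consumes the batch and returns a nested mapping:
        # {metric family -> {module or layer key -> float}}.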
        metrics = metrics_recorder.pop_metrics(data_batches)

        # Check that all expected top-level keys exist
        assert "weight_rms" in metrics
        assert "router_logits_max" in metrics
        assert "router_logits_mean" in metrics
        assert "maxvio" in metrics
        assert "drop_ratio" in metrics

        # Attention-logit stats are skipped on NPU.
        if DEVICE != "npu":
            assert "attn_max_lse" in metrics or "attn_max_logits" in metrics

        # Check that all values are valid floats (not NaN or Inf)
        for metric_name, metric_dict in metrics.items():
            assert isinstance(metric_dict, dict), f"{metric_name} should be a dict"
            for key, value in metric_dict.items():
                assert isinstance(value, float), f"{metric_name}[{key}] should be float"
                assert not torch.isnan(torch.tensor(value)), f"{metric_name}[{key}] is NaN"
                assert not torch.isinf(torch.tensor(value)), f"{metric_name}[{key}] is Inf"

        # Weight-RMS entries are keyed by module path; router and load-balance
        # entries are keyed per layer.
        for key in ["embed_tokens", "lm_head"] + [f"layers.{i}" for i in range(model.config.num_hidden_layers)]:
            assert key in metrics["weight_rms"], f"key: {key}, weight_rms: {metrics['weight_rms']}"

        for key in [f"layer{i}" for i in range(model.config.num_hidden_layers)]:
            assert key in metrics["maxvio"], f"key: {key}, maxvio: {metrics['maxvio']}"
            assert key in metrics["drop_ratio"], f"key: {key}, drop_ratio: {metrics['drop_ratio']}"
            assert key in metrics["router_logits_max"], f"key: {key}, router_logits_max: {metrics['router_logits_max']}"
            assert key in metrics["router_logits_mean"], f"key: {key}, router_logits_mean: {metrics['router_logits_mean']}"

        if DEVICE != "npu":
            for layer in range(model.config.num_hidden_layers):
                assert (
                    f"layers.{layer}.self_attn" in metrics["attn_max_lse"] or  # type: ignore[attr-defined]
                    f"layers.{layer}.self_attn" in metrics["attn_max_logits"]  # type: ignore[attr-defined]
                )

        assert "total" in metrics["maxvio"]
        assert "total" in metrics["drop_ratio"]
