Commit 105b14c

more cases
1 parent 5e88142 commit 105b14c

4 files changed (+187, -3 lines)

4 files changed

+187
-3
lines changed

.github/workflows/e2e_test_npu.yaml

Lines changed: 0 additions & 3 deletions
@@ -35,9 +35,6 @@ jobs:
         #ref: ${{github.event.inputs.repo_ref || 'main'}}
       - name: run-test
         run: |
-          id
-          which python
-          conda env list
           source activate npuci
           unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
           export DEVICE=npu && pytest autotest/test_all.py -m all -n 1 -vv --run_id ${{ github.run_id }}

autotest/config-npu.yaml

Lines changed: 77 additions & 0 deletions
@@ -104,3 +104,80 @@ case:
           runtime_info/tgs: 0.05
           runtime_info/text_tokens: 0
       timeout: 10800
+
+  npu-qwen3-sft-tp2:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/npu_qwen3_moe_30BA3_tp2.py
+        output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
+      resource:
+        envs:
+          - QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
+          - ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
+          - XTUNER_DETERMINISTIC=true
+          - XTUNER_USE_FA3=1
+          - TORCH_NPU_USE_HCCL=1
+          - PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
+          - PIP_TRUSTED_HOST=pkg.pjlab.org.cn
+      assert_info:
+        base_metric: npu-qwen3-sft-tp2/812c1021/tracker.jsonl
+        check_metrics:
+          grad_norm: 0.000001
+          loss/reduced_llm_loss: 0.000001
+          lr: 0
+          memory/max_memory_GB: 0.2
+          runtime_info/tgs: 0.05
+          runtime_info/text_tokens: 0
+      timeout: 10800
+
+  npu-qwen3-sft-recompute:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/npu_qwen3_recompute.py
+        output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
+      resource:
+        num_nodes: 2
+        cpus_per_task: 256
+        envs:
+          - QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
+          - ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
+          - XTUNER_DETERMINISTIC=true
+          - XTUNER_ACTIVATION_OFFLOAD=1
+          - TORCH_NPU_USE_HCCL=1
+          - PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
+          - PIP_TRUSTED_HOST=pkg.pjlab.org.cn
+      assert_info:
+        base_metric: npu-qwen3-sft-recompute/812c1021/tracker.jsonl
+        check_metrics:
+          grad_norm: 0.000001
+          loss/reduced_llm_loss: 0.000001
+          lr: 0
+          memory/max_memory_GB: 0.2
+          runtime_info/tgs: 0.05
+          runtime_info/text_tokens: 0
+      timeout: 10800
+
+  npu-qwen3-sft-16nums:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/npu_qwen3_16nums.py
+        output_path: /mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output
+      resource:
+        num_nodes: 2
+        envs:
+          - QWEN3_MOE_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/qa_test_models/Qwen3-30B-A3B
+          - ALPACA_PATH=/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/xtuner_resource/datasets/alpaca
+          - XTUNER_DETERMINISTIC=true
+          - TORCH_NPU_USE_HCCL=1
+          - PIP_INDEX_URL=http://pkg.pjlab.org.cn/repository/pypi-tsinghua/simple
+          - PIP_TRUSTED_HOST=pkg.pjlab.org.cn
+      assert_info:
+        base_metric: npu-qwen3-sft/812c1021/tracker.jsonl
+        check_metrics:
+          grad_norm: 0.000001
+          loss/reduced_llm_loss: 0.000001
+          lr: 0
+      timeout: 10800
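Each new case pins its expected behaviour to a baseline run via base_metric (a tracker.jsonl from a reference commit, 812c1021) plus per-metric tolerances in check_metrics: the zero entries (lr, runtime_info/text_tokens) presumably require an exact match, while memory and throughput are allowed to drift. The actual comparison lives in autotest/test_all.py, which is not part of this commit; the sketch below is only a hypothetical illustration of such a relative-tolerance check, and the helper names (load_tracker, check_against_baseline) as well as the tracker.jsonl layout are assumptions.

import json
import math


def load_tracker(path):
    """Merge every JSON record in a tracker.jsonl file into one dict.

    Later records overwrite earlier ones, so the result holds the last
    logged value for each metric key (an assumed file layout).
    """
    metrics = {}
    with open(path) as f:
        for line in f:
            metrics.update(json.loads(line))
    return metrics


def check_against_baseline(current, baseline, tolerances):
    """Return metrics whose relative deviation from the baseline exceeds the tolerance."""
    failures = {}
    for name, tol in tolerances.items():
        cur, base = current[name], baseline[name]
        ok = cur == base if tol == 0 else math.isclose(cur, base, rel_tol=tol)
        if not ok:
            failures[name] = (cur, base, tol)
    return failures


# Tolerances copied from the npu-qwen3-sft-tp2 case above.
tolerances = {
    "grad_norm": 0.000001,
    "loss/reduced_llm_loss": 0.000001,
    "lr": 0,
    "memory/max_memory_GB": 0.2,
    "runtime_info/tgs": 0.05,
    "runtime_info/text_tokens": 0,
}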
autotest/config/npu_qwen3_16nums.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+import os
+
+from xtuner.v1.config import (
+    AdamWConfig,
+    FSDPConfig,
+    LRConfig,
+)
+from xtuner.v1.datasets import FTDPTokenizeFnConfig
+from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
+from xtuner.v1.train import TrainerConfig
+
+
+QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
+ALPACA_PATH = os.environ["ALPACA_PATH"]
+
+
+moe_cfg = Qwen3MoE30BA3Config()
+optim_cfg = AdamWConfig(lr=6e-05)
+lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
+fsdp_cfg = FSDPConfig(
+    torch_compile=False,
+    cpu_offload=False,
+    ep_size=moe_cfg.ep_size,
+)
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
+        "tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
+    },
+]
+
+dataloader_config = DataloaderConfig(pack_max_length=16384)
+
+loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)  # CELossConfig()
+
+
+trainer = TrainerConfig(
+    load_from=QWEN3_MOE_PATH,
+    model_cfg=moe_cfg,
+    optim_cfg=optim_cfg,
+    fsdp_cfg=fsdp_cfg,
+    dataset_cfg=dataset_config,
+    dataloader_cfg=dataloader_config,
+    lr_cfg=lr_cfg,
+    loss_cfg=loss_cfg,
+    tokenizer_path=QWEN3_MOE_PATH,
+    global_batch_size=64,
+    total_epoch=1,
+    work_dir=f"/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output/{os.environ['GITHUB_RUN_ID']}/npu-qwen3-sft-16nums/sft",
+    seed=0,
+    dist_backend="npu:hccl",
+)
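This config module exposes a module-level TrainerConfig named trainer and reads QWEN3_MOE_PATH, ALPACA_PATH and GITHUB_RUN_ID from the environment at import time; how the xtuner runner actually consumes it is not shown in this commit. As a purely illustrative sketch (standard library only, nothing below is xtuner API beyond the trainer attribute defined above), a harness could load it like this:

import importlib.util
import os


def load_trainer_config(path):
    """Import a config module by file path and return its module-level `trainer` object."""
    spec = importlib.util.spec_from_file_location("npu_case_config", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.trainer


# The config reads these variables at import time, so set them first
# (placeholder values, not the real cluster paths).
os.environ.setdefault("QWEN3_MOE_PATH", "/path/to/Qwen3-30B-A3B")
os.environ.setdefault("ALPACA_PATH", "/path/to/alpaca")
os.environ.setdefault("GITHUB_RUN_ID", "local-debug")

cfg = load_trainer_config("autotest/config/npu_qwen3_16nums.py")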
autotest/config/npu_qwen3_moe_30BA3_tp2.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+import os
+
+from xtuner.v1.config import (
+    AdamWConfig,
+    FSDPConfig,
+    LRConfig,
+)
+from xtuner.v1.datasets import FTDPTokenizeFnConfig
+from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
+from xtuner.v1.train import TrainerConfig
+
+
+QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
+ALPACA_PATH = os.environ["ALPACA_PATH"]
+
+
+moe_cfg = Qwen3MoE30BA3Config()
+optim_cfg = AdamWConfig(lr=6e-05)
+lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
+fsdp_cfg = FSDPConfig(
+    torch_compile=True,
+    cpu_offload=False,
+    ep_size=moe_cfg.ep_size,
+    tp_size=2,
+)
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
+        "tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
+    },
+]
+
+dataloader_config = DataloaderConfig(pack_max_length=16384)
+
+loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)
+
+
+trainer = TrainerConfig(
+    load_from=QWEN3_MOE_PATH,
+    model_cfg=moe_cfg,
+    optim_cfg=optim_cfg,
+    fsdp_cfg=fsdp_cfg,
+    dataset_cfg=dataset_config,
+    dataloader_cfg=dataloader_config,
+    lr_cfg=lr_cfg,
+    loss_cfg=loss_cfg,
+    tokenizer_path=QWEN3_MOE_PATH,
+    global_batch_size=16,
+    total_epoch=1,
+    work_dir=f"/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output/{os.environ['GITHUB_RUN_ID']}/npu-qwen3-sft-tp2/sft",
+    seed=0,
+)
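Compared with the npu-qwen3-sft-16nums config above, this tp2 variant enables torch_compile and tensor parallelism (tp_size=2), uses a smaller global_batch_size of 16 instead of 64, and omits the explicit dist_backend="npu:hccl" override, presumably falling back to the default backend; dataset, pack length, optimizer, LR schedule and loss settings are otherwise identical.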
