|
"""SFT training config: Qwen3-MoE 30B-A3 on Alpaca (NPU CI job).

Compares chunked CE-loss training on NPU against the GPU baseline.
Model/tokenizer and dataset paths are injected by the CI pipeline
through environment variables; missing variables fail fast with a
KeyError at import time.
"""

import os

from xtuner.v1.config import (
    AdamWConfig,
    FSDPConfig,
    LRConfig,
)
from xtuner.v1.datasets import FTDPTokenizeFnConfig
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
from xtuner.v1.loss.ce_loss import CELossConfig
from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
from xtuner.v1.train import TrainerConfig


# Required environment variables -- a KeyError here is deliberate so the
# CI job aborts immediately instead of training against a bad path.
QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
ALPACA_PATH = os.environ["ALPACA_PATH"]

# Single source of truth for the sequence length, shared by the tokenize
# function (max_length) and the dataloader (pack_max_length) so the two
# settings cannot silently drift apart.
MAX_LENGTH = 16384


moe_cfg = Qwen3MoE30BA3Config()
optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(
    torch_compile=False,
    cpu_offload=False,
    # Expert-parallel size must match the model's own ep_size setting.
    ep_size=moe_cfg.ep_size,
)

dataset_config = [
    {
        "dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
        "tokenize_fn": FTDPTokenizeFnConfig(max_length=MAX_LENGTH),
    },
]

dataloader_config = DataloaderConfig(pack_max_length=MAX_LENGTH)

# Chunked cross-entropy: computes the loss in 1024-token chunks,
# trading a little throughput for a lower peak-memory footprint.
loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)


trainer = TrainerConfig(
    load_from=QWEN3_MOE_PATH,
    model_cfg=moe_cfg,
    optim_cfg=optim_cfg,
    fsdp_cfg=fsdp_cfg,
    dataset_cfg=dataset_config,
    dataloader_cfg=dataloader_config,
    lr_cfg=lr_cfg,
    loss_cfg=loss_cfg,
    tokenizer_path=QWEN3_MOE_PATH,
    global_batch_size=16,
    total_epoch=1,
    # Per-run output directory keyed by the GitHub Actions run id so
    # concurrent CI runs never clobber each other's checkpoints.
    work_dir=f"/mnt/hwfile/vc-intern-delivery/qa-llm-cicd/test_output/{os.environ['GITHUB_RUN_ID']}/npu-qwen3-sft-celoss-vs-gpu/sft",
    seed=0,
    dist_backend="npu:hccl",
)
0 commit comments