diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index e6d5e15875..92403f6fc8 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -514,7 +514,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_a100.sh + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt glm45_exit_code=$? if [[ "$glm45_exit_code" != "0" ]]; then echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" @@ -523,6 +523,106 @@ jobs: echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m" fi ' + + - name: GLM4.5 sft + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft + glm45_exit_code=$? + if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m" + fi + ' + + - name: GLM4.5 lora + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora + glm45_exit_code=$? + if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m" + fi + ' + + - name: GLM4.5 dpo + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo + glm45_exit_code=$? 
+ if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m" + fi + ' + + - name: GLM4.5 dpo_lora + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo_lora + glm45_exit_code=$? + if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo lora.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo lora.\033[0m" + fi + ' + + + - name: GLM4.5 pre-train (Grouped GEMM) + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh grouped_gemm + glm45_exit_code=$? 
+ if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 Grouped GEMM.\033[0m" + fi + ' + + - name: Qwen pre-train + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh pt + ' + + - name: Qwen sft + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh sft + ' + + - name: Qwen lora + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh lora + ' - name: Terminate and delete the container if: ${{ always() }} diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh new file mode 100644 index 0000000000..ff4d23ac14 --- /dev/null +++ b/tests/integration_test/glm45_a100.sh @@ -0,0 +1,184 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -exo pipefail +export root_dir=$(pwd) + +if [ -f 'PaddleFleet/.venv/bin/activate' ]; then + source PaddleFleet/.venv/bin/activate +fi + +if [ ! -d "$root_dir/glm45_fleet" ]; then + wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/glm45/glm45_fleet.12-18.tar --no-check-certificate + tar -xf glm45_fleet.12-18.tar +fi + +cd $root_dir/glm45_fleet +export cur_dir=$(pwd) + +step=$1 +if [[ ${step} == "pt" ]]; then + echo "Run GLM4.5 pretrain test" + config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt + yq eval '.expert_model_parallel_size = 1 + | .num_hidden_layers = 2 + | .per_device_train_batch_size = 1 + | .use_expert_parallel = false + | .stage1_overlap = false + | .train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(cur_dir) + "/GLM-4.5-Air" + | .logging_dir = strenv(cur_dir) + "/vdl_log" + | .output_dir = strenv(cur_dir) + "/checkpoints/pretrain"' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +elif [[ ${step} == "sft" ]]; then + echo "Run GLM4.5 sft test" + config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_sft.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft + yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(cur_dir) + "/checkpoints/pretrain" + | .logging_dir = strenv(cur_dir) + "/glm_full_pp_vdl_log" + | .num_empty_layers_add_in_head = 0 + | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +elif [[ ${step} == "lora" ]]; then + echo "Run GLM4.5 multi lora test" + config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft + yq 
'.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" + | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log" + | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps" + | .num_empty_layers_add_in_tail = 0 + | del(.moe_token_dispatcher_type)' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +elif [[ ${step} == "dpo" ]]; then + echo "Run GLM4.5 dpo test" + config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo + yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" + | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log" + | .num_empty_layers_add_in_tail = 0 + | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +elif [[ ${step} == "grouped_gemm" ]]; then + echo "Run GLM4.5 grouped_gemm test" + export config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt_grouped_gemm.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt + yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(CACHE_DIR) + "/glm45/GLM-4.5-Air" + | .per_device_train_batch_size = 1 + | .num_hidden_layers = 2 + | .stage1_overlap = false + | .logging_dir = strenv(data_dir) + "/vdl_log" + | .output_dir = strenv(data_dir) + "/checkpoints/grouped_gemm"' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +elif [[ ${step} == "dpo_lora" ]]; then + echo "Run GLM4.5 dpo_lora test" + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo + 
config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml + config_json=$CACHE_DIR/glm45/GLM-4.5-Air/config.json + yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(CACHE_DIR) + "/zai-org/GLM-4.5-Air-Base" + | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_lora_vdl_log" + | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_lora_ckpts"' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +fi + +rm -rf vdl_log/ +master=$(hostname -i) +port=36677 + +unset http_proxy https_proxy + +export FLAGS_embedding_deterministic=1 +export FLAGS_cudnn_deterministic=1 +export FLAGS_use_stride_compute_kernel=False + +log_file=glm45_${step}_a100.txt +gt_loss_file=glm45_${step}_multi_card_a100_gt_loss.txt + +set +e +FLAGS_use_stride_compute_kernel=False NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 coverage run $(which paddleformers-cli) train $config_yaml 2>&1 | tee ./${log_file} + + +exit_code=$? + +if [ $exit_code -ne 0 ]; then + echo "Test failed with exit code $exit_code, check the log: ./${log_file}" + python $root_dir/PaddleFormers/tests/check_log_for_exitcode.py ./${log_file} "***** train metrics *****" + check_log_exit_code=$? + if [ $check_log_exit_code -ne 0 ]; then + echo "Failed to find 'Training completed' in log file." + exit 1 + else + echo "Log check passed" + fi +else + echo "Test passed." +fi + +export repo_name=PaddleFleet +export REPO_NAME=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}') +# if [[ "${PP}" == "rel" ]]; then +# export pppatch="_PPrel" +# fi +# if [[ "${PF}" == rel* ]]; then +# export pfpatch="rel" +# fi +wget --no-proxy --no-check-certificate https://xly-devops.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}_latest/${gt_loss_file} +if [ $? -ne 0 ]; then + echo "To request precision checks for new models, please contact swgu98." 
+ exit 1 +fi + +log_loss_file=${log_file%.*}_loss.${log_file##*.} +python $root_dir/PaddleFormers/tests/integration_test/check_loss.py \ + --compare_step 10 \ + --log_file ./${log_file} \ + --log_loss_file ./${log_loss_file} \ + --gt_file ./${gt_loss_file} + +if [ $? -ne 0 ]; then + pushd $root_dir/PaddleFormers + source /root/proxy + bash $root_dir/PaddleFormers/tests/integration_test/check_precision_approval.sh + if [ $? -ne 0 ]; then + echo -e "\033[31mThe precision has been changed and requires approvals.\033[0m" + exit 1 + fi + popd + rm ${gt_loss_file} && mv ${log_loss_file} ${gt_loss_file} + if [ ! -f precision_list.txt ]; then + wget --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID}/precision_list.txt + if [ $? -ne 0 ]; then + wget --no-proxy --no-check-certificate https://xly-devops.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}_latest/precision_list.txt + python $root_dir/bos/BosClient.py precision_list.txt paddle-github-action/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID} + fi + fi + python $root_dir/bos/BosClient.py ${gt_loss_file} paddle-github-action/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID} +fi diff --git a/tests/integration_test/glm45_dpo_lora.sh b/tests/integration_test/glm45_dpo_lora.sh index 531e41ea55..b6109d6ed1 100644 --- a/tests/integration_test/glm45_dpo_lora.sh +++ b/tests/integration_test/glm45_dpo_lora.sh @@ -22,16 +22,12 @@ fi cd $root_dir/glm45_fleet export cur_dir=$(pwd) -# prepare dpo data -wget https://paddle-qa.bj.bcebos.com/fleet/fleet_dpo.tar -tar -xf fleet_dpo.tar - +export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml - config_json=$CACHE_DIR/glm45/GLM-4.5-Air/config.json -yq '.train_dataset_path = strenv(cur_dir) + "/dpo_data/dpo_train.jsonl" - | .eval_dataset_path = strenv(cur_dir) + 
"/dpo_data/dpo_eval.jsonl" +yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" | .model_name_or_path = strenv(CACHE_DIR) + "/zai-org/GLM-4.5-Air-Base" | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_lora_vdl_log" | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_lora_ckpts"' \ diff --git a/tests/integration_test/glm45_pt_a100.sh b/tests/integration_test/qwen3_a100.sh similarity index 55% rename from tests/integration_test/glm45_pt_a100.sh rename to tests/integration_test/qwen3_a100.sh index 37a460f2cc..8cb8173faf 100644 --- a/tests/integration_test/glm45_pt_a100.sh +++ b/tests/integration_test/qwen3_a100.sh @@ -1,11 +1,11 @@ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,65 +15,83 @@ set -exo pipefail export root_dir=$(pwd) +step=$1 + +if [[ ! 
-d $CACHE_DIR/Qwen3-30B-A3B ]]; then + pushd $CACHE_DIR + wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/Qwen/Qwen3-30B-A3B.tar.gz --no-check-certificate + tar xf Qwen3-30B-A3B.tar.gz + popd +fi + if [ -f 'PaddleFleet/.venv/bin/activate' ]; then source PaddleFleet/.venv/bin/activate fi -wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/glm45/glm45_fleet.12-18.tar --no-check-certificate -tar -xf glm45_fleet.12-18.tar # glm45_fleet -cd $root_dir/glm45_fleet -export cur_dir=$(pwd) - -config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt.yaml -export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt +if [[ "$step" == "pt" ]]; then + export config_yaml=$root_dir/PaddleFormers/tests/config/ci/qwen3_multicard_pt.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt + export model_name_or_path=$CACHE_DIR/Qwen3-30B-A3B + export output_dir=$root_dir/checkpoints/qwen-pt + yq eval '.moe_grouped_gemm = false' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +elif [[ "$step" == "sft" ]]; then + export config_yaml=$root_dir/PaddleFormers/tests/config/ci/qwen3_multicard_sft.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft + export model_name_or_path=$root_dir/checkpoints/qwen-pt + export output_dir=$root_dir/checkpoints/qwen-sft + yq eval '.moe_grouped_gemm = false' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +else + export config_yaml=$root_dir/PaddleFormers/tests/config/ci/qwen3_multicard_lora.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft + export model_name_or_path=$root_dir/checkpoints/qwen-sft + export output_dir=$root_dir/checkpoints/qwen-lora +fi -yq eval '.expert_model_parallel_size = 1 - | .num_hidden_layers = 2 - | .per_device_train_batch_size = 1 - | .use_expert_parallel = false - | .stage1_overlap = false - | .train_dataset_path = strenv(data_dir) + "/train.jsonl" +yq eval 
'.train_dataset_path = strenv(data_dir) + "/train.jsonl" | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" - | .model_name_or_path = strenv(cur_dir) + "/GLM-4.5-Air" - | .logging_dir = strenv(cur_dir) + "/vdl_log" - | .output_dir = strenv(cur_dir) + "/checkpoints"' \ - $config_yaml > ${config_yaml}.tmp + | .model_name_or_path = strenv(model_name_or_path) + | .output_dir = strenv(output_dir)' \ + $config_yaml > ${config_yaml}.tmp mv ${config_yaml}.tmp $config_yaml -rm -rf checkpoints/ -rm -rf vdl_log/ +rm -rf ./outputs +rm -rf paddleformers_dist_log master=$(hostname -i) port=36677 -unset http_proxy https_proxy - export FLAGS_embedding_deterministic=1 export FLAGS_cudnn_deterministic=1 +export FLAGS_use_stride_compute_kernel=False +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -log_file=glm45_pt_a100.txt -gt_loss_file=glm45_pt_multi_card_a100_gt_loss.txt +unset http_proxy https_proxy -set +e -FLAGS_use_stride_compute_kernel=False NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 coverage run $(which paddleformers-cli) train $config_yaml 2>&1 | tee ./${log_file} +log_file=qwen_${step}_a100.txt +gt_loss_file=qwen_${step}_a100_multi_card_gt_loss.txt +set +e +NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port coverage run $(which paddleformers-cli) train $config_yaml 2>&1 | tee ./${log_file} exit_code=$? - if [ $exit_code -ne 0 ]; then - echo "Test failed with exit code $exit_code, check the log: ./${log_file}" - python $root_dir/PaddleFormers/tests/check_log_for_exitcode.py ./${log_file} "***** train metrics *****" - check_log_exit_code=$? - if [ $check_log_exit_code -ne 0 ]; then - echo "Failed to find 'Training completed' in log file." - exit 1 - else - echo "Log check passed" - fi + echo "qwen multi-cards training failed, try to check the log file" + python $root_dir/PaddleFormers/tests/check_log_for_exitcode.py ./${log_file} "***** train metrics *****" + check_exit_code=$? 
+ if [ $check_exit_code -ne 0 ]; then + echo "Failed to find 'Training completed' in log file." + exit 1 + else + echo "Log check passed." + fi else echo "Test passed." fi -# export repo_name=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}') + export repo_name=PaddleFleet export REPO_NAME=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}') # if [[ "${PP}" == "rel" ]]; then @@ -113,4 +131,4 @@ if [ $? -ne 0 ]; then fi fi python $root_dir/bos/BosClient.py ${gt_loss_file} paddle-github-action/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID} -fi +fi \ No newline at end of file