PaddlePaddle · tianlef · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026
diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
@@ -493,7 +493,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_a100.sh
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt
           glm45_exit_code=$?
           if [[ "$glm45_exit_code" != "0" ]]; then
             echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
@@ -502,6 +502,105 @@ jobs:
             echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m"
           fi
           '
+
+      # Run the GLM4.5 SFT integration test inside the CI container and report the
+      # outcome. The step runs even when an earlier step failed, provided the
+      # formers_install step succeeded. The inner shell is started with -e, so the
+      # test status is captured with "||" to keep the failure branch reachable.
+      - name: GLM4.5 sft
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          glm45_exit_code=0
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft || glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m"
+          fi
+          '
+
+      # - name: GLM4.5 lora
+      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+      #   run: |
+      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
+      #     source /root/proxy
+      #     timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
+      #     glm45_exit_code=$?
+      #     if [[ "$glm45_exit_code" != "0" ]]; then
+      #       echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
+      #       exit 1
+      #     else
+      #       echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
+      #     fi
+      #     '
+
+      # Run the GLM4.5 DPO integration test inside the CI container and report the
+      # outcome. The step runs even when an earlier step failed, provided the
+      # formers_install step succeeded. The inner shell is started with -e, so the
+      # test status is captured with "||" to keep the failure branch reachable.
+      - name: GLM4.5 dpo
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          glm45_exit_code=0
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo || glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m"
+          fi
+          '
+
+      # - name: GLM4.5 pre-train (FP8)
+      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+      #   run: |
+      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
+      #     source /root/proxy
+      #     timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh fp8
+      #     glm45_exit_code=$?
+      #     if [[ "$glm45_exit_code" != "0" ]]; then
+      #       echo -e "::error:: \033[31mIntegration test failed: GLM4.5 FP8.\033[0m"
+      #       exit 1
+      #     else
+      #       echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m"
+      #     fi
+      #     '
+
+      # Run the GLM4.5 grouped-GEMM pre-train integration test inside the CI
+      # container and report the outcome. The step runs even when an earlier step
+      # failed, provided the formers_install step succeeded. The inner shell is
+      # started with -e, so the test status is captured with "||" to keep the
+      # failure branch reachable.
+      - name: GLM4.5 pre-train (Grouped GEMM)
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          glm45_exit_code=0
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh grouped_gemm || glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 Grouped GEMM.\033[0m"
+          fi
+          '
+
+      # NOTE(review): this step is labelled "Qwen pre-train" but invokes the GLM4.5
+      # script with the "pt" argument, i.e. the same command as the GLM4.5
+      # pre-train step -- presumably a placeholder; confirm the intended script.
+      # There is no explicit result message here; the inner shell runs with -e,
+      # so a non-zero exit from the script aborts it.
+      - name: Qwen pre-train
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt
+          '
+
+      # NOTE(review): this step is labelled "Qwen sft" but invokes the GLM4.5
+      # script with the "sft" argument, i.e. the same command as the "GLM4.5 sft"
+      # step -- presumably a placeholder; confirm the intended script.
+      # There is no explicit result message here; the inner shell runs with -e,
+      # so a non-zero exit from the script aborts it.
+      - name: Qwen sft
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft
+          '
+
+      # - name: Qwen lora
+      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+      #   run: |
+      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
+      #     source /root/proxy
+      #     timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
+      #     '
 
       - name: Terminate and delete the container
         if: ${{ always() }}

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
@@ -0,0 +1,171 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Integration-test driver for GLM4.5 on A100.
+# Usage: bash glm45_a100.sh <step>   (step: pt | sft | lora | dpo | grouped_gemm)
+# Must be started from the directory that contains PaddleFormers/.
+
+# -e: abort on the first failing command, -x: trace every command,
+# pipefail: a pipeline fails when any of its stages fails.
+set -exo pipefail
+
+# Remember the launch directory; the paths used below are derived from it.
+# Assignment and export are separate so a failing command substitution is not
+# hidden behind the exit status of `export`.
+root_dir=$(pwd)
+export root_dir
+
+# Use the PaddleFleet virtualenv when one is present.
+if [ -f 'PaddleFleet/.venv/bin/activate' ]; then
+   source PaddleFleet/.venv/bin/activate
+fi
+
+# Download and unpack the glm45_fleet tarball only when the directory is missing.
+if [ ! -d "$root_dir/glm45_fleet" ]; then
+  wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/glm45/glm45_fleet.12-18.tar --no-check-certificate
+  tar -xf glm45_fleet.12-18.tar
+fi
+
+# The rest of the script runs inside glm45_fleet; cur_dir is exported because the
+# yq expressions further down read it through strenv().
+cd "$root_dir/glm45_fleet"
+cur_dir=$(pwd)
+export cur_dir
+
+# Select the training stage from the first command-line argument and patch the
+# matching CI config: yq writes the patched YAML to a temporary file, which then
+# replaces the original. data_dir is exported in every branch because the yq
+# expressions read it (and cur_dir) through strenv().
+step=$1
+if [[ ${step} == "pt" ]]; then
+  echo "Run GLM4.5 pretrain test"
+  config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt.yaml
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt
+  yq eval '.expert_model_parallel_size = 1
+    | .num_hidden_layers = 2
+    | .per_device_train_batch_size = 1
+    | .use_expert_parallel = false
+    | .stage1_overlap = false
+    | .train_dataset_path = strenv(data_dir) + "/train.jsonl"
+    | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+    | .model_name_or_path = strenv(cur_dir) + "/GLM-4.5-Air"
+    | .logging_dir = strenv(cur_dir) + "/vdl_log"
+    | .output_dir = strenv(cur_dir) + "/checkpoints/pretrain"' \
+  "$config_yaml" > "${config_yaml}.tmp"
+  mv "${config_yaml}.tmp" "$config_yaml"
+elif [[ ${step} == "sft" ]]; then
+  echo "Run GLM4.5 sft test"
+  config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_sft.yaml
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft
+  # The model is loaded from checkpoints/pretrain, i.e. the output_dir of the "pt" step.
+  yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+    | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+    | .model_name_or_path = strenv(cur_dir) + "/checkpoints/pretrain"
+    | .logging_dir = strenv(cur_dir) + "/glm_full_pp_vdl_log"
+    | .num_empty_layers_add_in_head = 0
+    | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"' \
+   "$config_yaml" > "${config_yaml}.tmp"
+  mv "${config_yaml}.tmp" "$config_yaml"
+elif [[ ${step} == "lora" ]]; then
+  echo "Run GLM4.5 multi lora test"
+  config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft
+  # The model is loaded from checkpoints/glm_full_pp_ckpts, i.e. the output_dir of the "sft" step.
+  yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+      | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+      | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
+      | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log"
+      | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"
+      | del(.moe_token_dispatcher_type)' \
+    "$config_yaml" > "${config_yaml}.tmp"
+  mv "${config_yaml}.tmp" "$config_yaml"
+elif [[ ${step} == "dpo" ]]; then
+  echo "Run GLM4.5 dpo test"
+  config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo
+  # The model is loaded from checkpoints/glm_full_pp_ckpts, i.e. the output_dir of the "sft" step.
+  yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+      | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+      | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
+      | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log"
+      | .num_empty_layers_add_in_tail = 0
+      | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \
+    "$config_yaml" > "${config_yaml}.tmp"
+  mv "${config_yaml}.tmp" "$config_yaml"
+elif [[ ${step} == "grouped_gemm" ]]; then
+  echo "Run GLM4.5 grouped_gemm test"
+  # The model path below is built from CACHE_DIR, which this script never sets.
+  # Stop with a clear message rather than risk patching in a bogus model path.
+  : "${CACHE_DIR:?CACHE_DIR must be set for the grouped_gemm step}"
+  export config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt_grouped_gemm.yaml
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt
+  # NOTE(review): this step is named grouped_gemm yet sets .moe_grouped_gemm = false,
+  # and it writes logs/checkpoints under the fixtures directory (data_dir) while the
+  # other steps write under cur_dir -- confirm both are intended.
+  yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+      | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+      | .model_name_or_path = strenv(CACHE_DIR) + "/glm45/GLM-4.5-Air"
+      | .per_device_train_batch_size = 1
+      | .num_hidden_layers = 2
+      | .stage1_overlap = false
+      | .moe_grouped_gemm = false
+      | .logging_dir = strenv(data_dir) + "/vdl_log"
+      | .output_dir = strenv(data_dir) + "/checkpoints"' \
+    "$config_yaml" > "${config_yaml}.tmp"
+  mv "${config_yaml}.tmp" "$config_yaml"
+else
+  # Without a recognised step no config is selected; refuse to start training.
+  echo "Unknown or missing step '${step}'. Expected one of: pt, sft, lora, dpo, grouped_gemm." >&2
+  exit 1
+fi
+
+# Launch training for the selected step on 8 GPUs under coverage, tee the output
+# to a per-step log file, then decide whether the run counts as successful.
+rm -rf vdl_log/
+master=$(hostname -i)
+port=36677
+
+# Training itself runs without the HTTP(S) proxy variables.
+unset http_proxy https_proxy
+
+export FLAGS_embedding_deterministic=1
+export FLAGS_cudnn_deterministic=1
+
+log_file=glm45_${step}_a100.txt
+gt_loss_file=glm45_${step}_multi_card_a100_gt_loss.txt
+
+# From here on failures are inspected explicitly instead of aborting the script.
+set +e
+FLAGS_use_stride_compute_kernel=False NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 coverage run "$(which paddleformers-cli)" train "$config_yaml" 2>&1 | tee "./${log_file}"
+
+# pipefail is still in effect, so $? is non-zero when the trainer fails even
+# though tee itself succeeds.
+exit_code=$?
+
+if [ $exit_code -ne 0 ]; then
+    echo "Test failed with exit code $exit_code, check the log: ./${log_file}"
+    # A non-zero exit is tolerated when the log check below passes.
+    # NOTE(review): check_log_for_exitcode.py is not visible here -- presumably it
+    # looks for the marker string given as its second argument; confirm.
+    python "$root_dir/PaddleFormers/tests/check_log_for_exitcode.py" "./${log_file}" "***** train metrics *****"
+    check_log_exit_code=$?
+    if [ $check_log_exit_code -ne 0 ]; then
+        # Name the marker that was actually passed to the checker.
+        echo "Failed to find '***** train metrics *****' in log file."
+        exit 1
+    else
+        echo "Log check passed"
+    fi
+else
+    echo "Test passed."
+fi
+
+# Fetch the reference ("ground truth") loss file for this step. The repository
+# name is pinned; the lines that used to derive it, and the optional pfpatch /
+# pppatch suffixes, are kept below as comments. Nothing in this script assigns
+# pfpatch or pppatch, so unless the environment provides them they expand to
+# nothing in the URL.
+# export repo_name=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}')
+export repo_name=PaddleFleet
+# if [[ "${PP}" == "rel" ]]; then
+#   export pppatch="_PPrel"
+# fi
+# if [[ "${PF}" == rel* ]]; then
+#   export pfpatch="rel"
+# fi
+baseline_url=https://xly-devops.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}_latest/${gt_loss_file}
+# No baseline means the precision check cannot run; stop here.
+if ! wget --no-proxy --no-check-certificate ${baseline_url}; then
+  echo "To request precision checks for new models, please contact swgu98."
+  exit 1
+fi
+
+# Derive the loss-file name by inserting "_loss" before the log file's extension
+# (for example glm45_pt_a100.txt -> glm45_pt_a100_loss.txt).
+log_loss_file=${log_file%.*}_loss.${log_file##*.}
+
+# NOTE(review): check_loss.py is not visible here -- presumably it compares the
+# losses in the log with the baseline and exits non-zero on a mismatch; confirm.
+if ! python $root_dir/PaddleFormers/tests/integration_test/check_loss.py \
+   --compare_step 10 \
+   --log_file ./${log_file} \
+   --log_loss_file ./${log_loss_file} \
+   --gt_file ./${gt_loss_file}; then
+  # The loss check failed: continue only if the approval script succeeds.
+  pushd $root_dir/PaddleFormers
+  source /root/proxy
+  if ! bash $root_dir/PaddleFormers/tests/integration_test/check_precision_approval.sh; then
+    echo -e "\033[31mThe precision has been changed and requires approvals.\033[0m"
+    exit 1
+  fi
+  popd
+  # Replace the downloaded baseline with the losses from this run.
+  rm ${gt_loss_file} && mv ${log_loss_file} ${gt_loss_file}
+  if [ ! -f precision_list.txt ]; then
+    # Try the per-PR precision list first; if that download fails, fall back to
+    # the "_latest" copy and upload it to the per-PR location.
+    # NOTE(review): the fallback download is not checked before the upload.
+    if ! wget --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID}/precision_list.txt; then
+      wget --no-proxy --no-check-certificate https://xly-devops.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}_latest/precision_list.txt
+      python $root_dir/bos/BosClient.py precision_list.txt paddle-github-action/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID}
+    fi
+  fi
+  # Upload the new baseline to the per-PR location.
+  python $root_dir/bos/BosClient.py ${gt_loss_file} paddle-github-action/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID}
+fi