From 3a02f157b636ccccfe114d2a7c1d749ec63048da Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Mon, 26 Jan 2026 16:19:51 +0800
Subject: [PATCH 01/21] CI: comment out H20 jobs and run multi-card steps on A100

---
 .github/workflows/fleet-model-test.yml  | 585 ++++++++++++++----------
 tests/integration_test/glm45_pt_a100.sh |   2 +-
 2 files changed, 343 insertions(+), 244 deletions(-)

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index 2b905a757cb..6c0267ab4b9 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -68,159 +68,357 @@ jobs:
           fi
           echo "is_md_only: $(cat $GITHUB_OUTPUT | grep is_md_only || echo '未找到')"
     
-  integration-test-H20-single-card:
-    needs: check_documents_type
-    if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }}
-    name: Integration test (H20, single card)
-    runs-on:
-      group: Fleet-H-single-card
-    env:
-      PIP_CACHE_DIR: /home/.cache/pip
-      CACHE_DIR: /home/.cache
-      TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-single-card
-    steps:
-      - name: Determine the runner
-        run: |
-          gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 ))
-          echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV
+  # integration-test-H20-single-card:
+  #   needs: check_documents_type
+  #   if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }}
+  #   name: Integration test (H20, single card)
+  #   runs-on:
+  #     group: Fleet-H-single-card
+  #   env:
+  #     PIP_CACHE_DIR: /home/.cache/pip
+  #     CACHE_DIR: /home/.cache
+  #     TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-single-card
+  #   steps:
+  #     - name: Determine the runner
+  #       run: |
+  #         gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 ))
+  #         echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV
 
-      - name: Check docker image and run container
-        env:
-          GPU_DEVICES: ${{ env.GPU_DEVICES }}
-        run: |
-          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
-          echo "container_name=${container_name}" >> ${{ github.env }}
-          docker pull $docker_image
-          set -x
-          docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \
-            -v "/dev/shm:/dev/shm"  \
-            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
-            -v ${{ github.workspace }}/../../..:/root \
-            -v /ssd1/paddle-1/action_cache:/home/.cache \
-            -v ${{ github.workspace }}:/workspace \
-            -e BRANCH \
-            -e PR_ID \
-            -e COMMIT_ID \
-            -e PADDLE_ROOT \
-            -e ci_scripts \
-            -e CACHE_DIR \
-            -e no_proxy \
-            -e CI_name \
-            -e PIP_CACHE_DIR \
-            -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
-            -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
-            -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
-            -e GITHUB_REPO_NAME="${{ github.repository }}" \
-            -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
-            -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
-            -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
-            -e GITHUB_RUN_ID="${{ github.run_id }}" \
-            -e PR_USER="${{ github.event.pull_request.user.login }}" \
-            -w /workspace --network host ${docker_image}
+  #     - name: Check docker image and run container
+  #       env:
+  #         GPU_DEVICES: ${{ env.GPU_DEVICES }}
+  #       run: |
+  #         container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
+  #         echo "container_name=${container_name}" >> ${{ github.env }}
+  #         docker pull $docker_image
+  #         set -x
+  #         docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \
+  #           -v "/dev/shm:/dev/shm"  \
+  #           -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
+  #           -v ${{ github.workspace }}/../../..:/root \
+  #           -v /ssd1/paddle-1/action_cache:/home/.cache \
+  #           -v ${{ github.workspace }}:/workspace \
+  #           -e BRANCH \
+  #           -e PR_ID \
+  #           -e COMMIT_ID \
+  #           -e PADDLE_ROOT \
+  #           -e ci_scripts \
+  #           -e CACHE_DIR \
+  #           -e no_proxy \
+  #           -e CI_name \
+  #           -e PIP_CACHE_DIR \
+  #           -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
+  #           -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
+  #           -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
+  #           -e GITHUB_REPO_NAME="${{ github.repository }}" \
+  #           -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
+  #           -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
+  #           -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
+  #           -e GITHUB_RUN_ID="${{ github.run_id }}" \
+  #           -e PR_USER="${{ github.event.pull_request.user.login }}" \
+  #           -w /workspace --network host ${docker_image}
 
-      - name: Install PaddleFormers
-        id: formers_install
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          rm -rf * .[^.]*
-          echo $PR_USER
-          source /root/proxy
-          mkdir -p /home/.cache/pip
-          pip cache dir
-          pip install --upgrade pip
-          git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH}
-          cd PaddleFormers
-          git status
-          git config --global --add safe.directory /workspace/PaddleFormers
-          git config user.name "PaddleCI"
-          git config user.email "paddle_ci@example.com"
-          git config pull.rebase false
-          git pull --no-edit origin pull/${PR_ID}/head
-          export UV_SKIP_WHEEL_FILENAME_CHECK=1
-          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py
-          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py
-          pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
-          wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
-          pip uninstall paddlefleet -y
-          pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
-          # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir
-          echo "paddle commit:"
-          python -c "import paddle; print(paddle.version.commit)"
-          echo "paddlefleet commit:"
-          python -c "import paddlefleet; print(paddlefleet.version.commit)"
-          cd /workspace
-          wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
-          mkdir bos
-          tar xf bos_new.tar.gz -C bos
-          pip install bce-python-sdk==0.8.74
-          pip install coverage==7.6.1
-          pip install librosa==0.11.0
-          '
+  #     - name: Install PaddleFormers
+  #       id: formers_install
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         rm -rf * .[^.]*
+  #         echo $PR_USER
+  #         source /root/proxy
+  #         mkdir -p /home/.cache/pip
+  #         pip cache dir
+  #         pip install --upgrade pip
+  #         git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH}
+  #         cd PaddleFormers
+  #         git status
+  #         git config --global --add safe.directory /workspace/PaddleFormers
+  #         git config user.name "PaddleCI"
+  #         git config user.email "paddle_ci@example.com"
+  #         git config pull.rebase false
+  #         git pull --no-edit origin pull/${PR_ID}/head
+  #         export UV_SKIP_WHEEL_FILENAME_CHECK=1
+  #         sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py
+  #         sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py
+  #         pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
+  #         wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
+  #         pip uninstall paddlefleet -y
+  #         pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
+  #         # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir
+  #         echo "paddle commit:"
+  #         python -c "import paddle; print(paddle.version.commit)"
+  #         echo "paddlefleet commit:"
+  #         python -c "import paddlefleet; print(paddlefleet.version.commit)"
+  #         cd /workspace
+  #         wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
+  #         mkdir bos
+  #         tar xf bos_new.tar.gz -C bos
+  #         pip install bce-python-sdk==0.8.74
+  #         pip install coverage==7.6.1
+  #         pip install librosa==0.11.0
+  #         '
 
-      - name: Proprocess for integration test
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/preprocess.sh
-          preprocess_exit_code=$?
-          if [[ "$preprocess_exit_code" != "0" ]]; then
-            echo -e "::error:: \033[31mPreprocess failed.\033[0m"
-            exit 1
-          else
-            echo -e "\033[32mPreprocess succeeded.\033[0m"
-          fi
-          '
+  #     - name: Proprocess for integration test
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/preprocess.sh
+  #         preprocess_exit_code=$?
+  #         if [[ "$preprocess_exit_code" != "0" ]]; then
+  #           echo -e "::error:: \033[31mPreprocess failed.\033[0m"
+  #           exit 1
+  #         else
+  #           echo -e "\033[32mPreprocess succeeded.\033[0m"
+  #         fi
+  #         '
 
-      - name: Integration test (GLM4.5 single-card)
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh
-          glm45_single_card_exit_code=$?
-          if [[ "$glm45_single_card_exit_code" != "0" ]]; then
-            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 single-card.\033[0m"
-            exit 1
-          else
-            echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m"
-          fi
-          '
+  #     - name: Integration test (GLM4.5 single-card)
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh
+  #         glm45_single_card_exit_code=$?
+  #         if [[ "$glm45_single_card_exit_code" != "0" ]]; then
+  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 single-card.\033[0m"
+  #           exit 1
+  #         else
+  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m"
+  #         fi
+  #         '
 
 
-      - name: Integration test (Qwen3-30B-A3B single-card)
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh
-          qwen3_single_card_exit_code=$?
-          if [[ "$qwen3_single_card_exit_code" != "0" ]]; then
-            echo -e "::error:: \033[31mIntegration test failed: Qwen3-30B-A3B single-card.\033[0m"
-            exit 1
-          else
-            echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m"
-          fi
-          '
+  #     - name: Integration test (Qwen3-30B-A3B single-card)
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh
+  #         qwen3_single_card_exit_code=$?
+  #         if [[ "$qwen3_single_card_exit_code" != "0" ]]; then
+  #           echo -e "::error:: \033[31mIntegration test failed: Qwen3-30B-A3B single-card.\033[0m"
+  #           exit 1
+  #         else
+  #           echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m"
+  #         fi
+  #         '
 
-      - name: Terminate and delete the container
-        if: ${{ always() }}
-        run: |
-          set +e
-          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
-          docker rm -f ${{ env.container_name }}
+  #     - name: Terminate and delete the container
+  #       if: ${{ always() }}
+  #       run: |
+  #         set +e
+  #         docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
+  #         docker rm -f ${{ env.container_name }}
+
+
+  # integration-test-H20-multi-card:
+  #   needs: check_documents_type
+  #   if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }}
+  #   name: Integration test (H20, multi-card)
+  #   runs-on:
+  #     group: Fleet-H-multi-card
+  #   env:
+  #     PIP_CACHE_DIR: /home/.cache/pip
+  #     CACHE_DIR: /home/.cache
+  #     TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-multi-card
+  #   steps:
+  #     - name: Check docker image and run container
+  #       run: |
+  #         container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
+  #         echo "container_name=${container_name}" >> ${{ github.env }}
+  #         docker pull $docker_image
+  #         docker run -d -t --name ${container_name} --gpus all --shm-size=32G \
+  #           -v "/dev/shm:/dev/shm"  \
+  #           -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
+  #           -v ${{ github.workspace }}/../../..:/root \
+  #           -v /ssd1/paddle-1/action_cache:/home/.cache \
+  #           -v ${{ github.workspace }}:/workspace \
+  #           -e BRANCH \
+  #           -e PR_ID \
+  #           -e COMMIT_ID \
+  #           -e PADDLE_ROOT \
+  #           -e ci_scripts \
+  #           -e CACHE_DIR \
+  #           -e no_proxy \
+  #           -e CI_name \
+  #           -e PIP_CACHE_DIR \
+  #           -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
+  #           -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
+  #           -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
+  #           -e GITHUB_REPO_NAME="${{ github.repository }}" \
+  #           -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
+  #           -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
+  #           -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
+  #           -e GITHUB_RUN_ID="${{ github.run_id }}" \
+  #           -e PR_USER="${{ github.event.pull_request.user.login }}" \
+  #           -w /workspace --network host ${docker_image}
+
+  #     - name: Install PaddleFormers
+  #       id: formers_install
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         rm -rf * .[^.]*
+  #         source /root/proxy
+  #         mkdir -p /home/.cache/pip
+  #         pip cache dir
+  #         pip install --upgrade pip
+  #         git clone https://github.com/PaddlePaddle/PaddleFormers.git  -b ${BRANCH}
+  #         cd PaddleFormers
+  #         git status
+  #         git config --global --add safe.directory /workspace/PaddleFormers
+  #         git config user.name "PaddleCI"
+  #         git config user.email "paddle_ci@example.com"
+  #         git config pull.rebase false
+  #         git pull --no-edit origin pull/${PR_ID}/head
+  #         export UV_SKIP_WHEEL_FILENAME_CHECK=1
+  #         pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
+  #         wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
+  #         pip uninstall paddlefleet -y
+  #         pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
+  #         # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir
+  #         echo "paddle commit:"
+  #         python -c "import paddle; print(paddle.version.commit)"
+  #         echo "paddlefleet commit:"
+  #         python -c "import paddlefleet; print(paddlefleet.version.commit)"
+  #         cd /workspace
+  #         wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
+  #         mkdir bos
+  #         tar xf bos_new.tar.gz -C bos
+  #         pip install bce-python-sdk==0.8.74
+  #         pip install coverage==7.6.1
+  #         pip install librosa==0.11.0
+  #         '
+      
+  #     - name: GLM4.5 pre-train
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt.sh
+  #         glm45_exit_code=$?
+  #         if [[ "$glm45_exit_code" != "0" ]]; then
+  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
+  #           exit 1
+  #         else
+  #           echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m"
+  #         fi
+  #         '
+  #     - name: GLM4.5 sft
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_sft.sh
+  #         glm45_exit_code=$?
+  #         if [[ "$glm45_exit_code" != "0" ]]; then
+  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m"
+  #           exit 1
+  #         else
+  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m"
+  #         fi
+  #         '
+      
+  #     - name: GLM4.5 lora
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_lora.sh
+  #         glm45_exit_code=$?
+  #         if [[ "$glm45_exit_code" != "0" ]]; then
+  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
+  #           exit 1
+  #         else
+  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
+  #         fi
+  #         '
+      
+  #     - name: GLM4.5 dpo
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh
+  #         glm45_exit_code=$?
+  #         if [[ "$glm45_exit_code" != "0" ]]; then
+  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m"
+  #           exit 1
+  #         else
+  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m"
+  #         fi
+  #         '
+
+  #     - name: GLM4.5 pre-train (FP8)
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh
+  #         glm45_exit_code=$?
+  #         if [[ "$glm45_exit_code" != "0" ]]; then
+  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
+  #           exit 1
+  #         else
+  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m"
+  #         fi
+  #         '
+
+  #     - name: GLM4.5 pre-train (Grouped GEMM)
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh
+  #         glm45_exit_code=$?
+  #         if [[ "$glm45_exit_code" != "0" ]]; then
+  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m"
+  #           exit 1
+  #         else
+  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 Grouped GEMM.\033[0m"
+  #         fi
+  #         '
+
+  #     - name: Qwen pre-train
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt
+  #         '
+        
+  #     - name: Qwen sft
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft
+  #         '
 
+  #     - name: Qwen lora
+  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+  #       run: |
+  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
+  #         source /root/proxy
+  #         timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora
+  #         '
 
-  integration-test-H20-multi-card:
+  #     - name: Terminate and delete the container
+  #       if: ${{ always() }}
+  #       run: |
+  #         set +e
+  #         docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
+  #         docker rm -f ${{ env.container_name }}
+
+  integration-test-a100:
     needs: check_documents_type
     if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }}
-    name: Integration test (H20, multi-card)
+    name: Integration test (A100)
     runs-on:
-      group: Fleet-H-multi-card
+      group: Distribute
     env:
       PIP_CACHE_DIR: /home/.cache/pip
       CACHE_DIR: /home/.cache
-      TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-multi-card
+      TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-A100
     steps:
       - name: Check docker image and run container
         run: |
@@ -262,7 +460,7 @@ jobs:
           mkdir -p /home/.cache/pip
           pip cache dir
           pip install --upgrade pip
-          git clone https://github.com/PaddlePaddle/PaddleFormers.git  -b ${BRANCH}
+          git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH}
           cd PaddleFormers
           git status
           git config --global --add safe.directory /workspace/PaddleFormers
@@ -271,6 +469,8 @@ jobs:
           git config pull.rebase false
           git pull --no-edit origin pull/${PR_ID}/head
           export UV_SKIP_WHEEL_FILENAME_CHECK=1
+          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py
+          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py
           pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
           wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
           pip uninstall paddlefleet -y
@@ -288,13 +488,12 @@ jobs:
           pip install coverage==7.6.1
           pip install librosa==0.11.0
           '
-      
+
       - name: GLM4.5 pre-train
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt.sh
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_a100.sh
           glm45_exit_code=$?
           if [[ "$glm45_exit_code" != "0" ]]; then
             echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
@@ -303,6 +502,7 @@ jobs:
             echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m"
           fi
           '
+        
       - name: GLM4.5 sft
         if: (success() || failure()) && steps.formers_install.conclusion == 'success'
         run: |
@@ -408,104 +608,3 @@ jobs:
           set +e
           docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
           docker rm -f ${{ env.container_name }}
-
-  integration-test-a100:
-    needs: check_documents_type
-    if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }}
-    name: Integration test (A100)
-    runs-on:
-      group: Distribute
-    env:
-      PIP_CACHE_DIR: /home/.cache/pip
-      CACHE_DIR: /home/.cache
-      TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-A100
-    steps:
-      - name: Check docker image and run container
-        run: |
-          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
-          echo "container_name=${container_name}" >> ${{ github.env }}
-          docker pull $docker_image
-          docker run -d -t --name ${container_name} --gpus all --shm-size=32G \
-            -v "/dev/shm:/dev/shm"  \
-            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
-            -v ${{ github.workspace }}/../../..:/root \
-            -v /ssd1/paddle-1/action_cache:/home/.cache \
-            -v ${{ github.workspace }}:/workspace \
-            -e BRANCH \
-            -e PR_ID \
-            -e COMMIT_ID \
-            -e PADDLE_ROOT \
-            -e ci_scripts \
-            -e CACHE_DIR \
-            -e no_proxy \
-            -e CI_name \
-            -e PIP_CACHE_DIR \
-            -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
-            -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
-            -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
-            -e GITHUB_REPO_NAME="${{ github.repository }}" \
-            -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
-            -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
-            -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
-            -e GITHUB_RUN_ID="${{ github.run_id }}" \
-            -e PR_USER="${{ github.event.pull_request.user.login }}" \
-            -w /workspace --network host ${docker_image}
-
-      - name: Install PaddleFormers
-        id: formers_install
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          rm -rf * .[^.]*
-          source /root/proxy
-          mkdir -p /home/.cache/pip
-          pip cache dir
-          pip install --upgrade pip
-          git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH}
-          cd PaddleFormers
-          git status
-          git config --global --add safe.directory /workspace/PaddleFormers
-          git config user.name "PaddleCI"
-          git config user.email "paddle_ci@example.com"
-          git config pull.rebase false
-          git pull --no-edit origin pull/${PR_ID}/head
-          export UV_SKIP_WHEEL_FILENAME_CHECK=1
-          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py
-          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py
-          pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
-          wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
-          pip uninstall paddlefleet -y
-          pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
-          # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir
-          echo "paddle commit:"
-          python -c "import paddle; print(paddle.version.commit)"
-          echo "paddlefleet commit:"
-          python -c "import paddlefleet; print(paddlefleet.version.commit)"
-          cd /workspace
-          wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
-          mkdir bos
-          tar xf bos_new.tar.gz -C bos
-          pip install bce-python-sdk==0.8.74
-          pip install coverage==7.6.1
-          pip install librosa==0.11.0
-          '
-
-      - name: GLM4.5 pre-train
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_a100.sh
-          glm45_exit_code=$?
-          if [[ "$glm45_exit_code" != "0" ]]; then
-            echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
-            exit 1
-          else
-            echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m"
-          fi
-          '
-
-      - name: Terminate and delete the container
-        if: ${{ always() }}
-        run: |
-          set +e
-          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
-          docker rm -f ${{ env.container_name }}
diff --git a/tests/integration_test/glm45_pt_a100.sh b/tests/integration_test/glm45_pt_a100.sh
index 701504b837a..8e2e1a0dacc 100644
--- a/tests/integration_test/glm45_pt_a100.sh
+++ b/tests/integration_test/glm45_pt_a100.sh
@@ -36,7 +36,7 @@ yq eval '.expert_model_parallel_size = 1
     | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
     | .model_name_or_path = strenv(cur_dir) + "/GLM-4.5-Air"
     | .logging_dir = strenv(cur_dir) + "/vdl_log"
-    | .output_dir = strenv(cur_dir) + "/checkpoints"' \
+    | .output_dir = strenv(cur_dir) + "/checkpoints/pretrain"' \
   $config_yaml > ${config_yaml}.tmp
 mv ${config_yaml}.tmp $config_yaml
 

From 2e5ee892db3463ac06f622e3028983927e9d33c8 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Mon, 26 Jan 2026 17:25:41 +0800
Subject: [PATCH 02/21] Consolidate GLM4.5 A100 integration tests into glm45_a100.sh

---
 .github/workflows/fleet-model-test.yml        | 80 +++++++++----------
 .../{glm45_pt_a100.sh => glm45_a100.sh}       | 68 ++++++++++++++--
 2 files changed, 101 insertions(+), 47 deletions(-)
 rename tests/integration_test/{glm45_pt_a100.sh => glm45_a100.sh} (56%)

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index 6c0267ab4b9..5618a266f3b 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -493,7 +493,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_a100.sh
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt
           glm45_exit_code=$?
           if [[ "$glm45_exit_code" != "0" ]]; then
             echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
@@ -508,7 +508,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_sft.sh
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft
           glm45_exit_code=$?
           if [[ "$glm45_exit_code" != "0" ]]; then
             echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m"
@@ -523,7 +523,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_lora.sh
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
           glm45_exit_code=$?
           if [[ "$glm45_exit_code" != "0" ]]; then
             echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
@@ -538,7 +538,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo
           glm45_exit_code=$?
           if [[ "$glm45_exit_code" != "0" ]]; then
             echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m"
@@ -548,27 +548,27 @@ jobs:
           fi
           '
 
-      - name: GLM4.5 pre-train (FP8)
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh
-          glm45_exit_code=$?
-          if [[ "$glm45_exit_code" != "0" ]]; then
-            echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
-            exit 1
-          else
-            echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m"
-          fi
-          '
+      # - name: GLM4.5 pre-train (FP8)
+      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+      #   run: |
+      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
+      #     source /root/proxy
+      #     timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh fp8
+      #     glm45_exit_code=$?
+      #     if [[ "$glm45_exit_code" != "0" ]]; then
+      #       echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
+      #       exit 1
+      #     else
+      #       echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m"
+      #     fi
+      #     '
 
       - name: GLM4.5 pre-train (Grouped GEMM)
         if: (success() || failure()) && steps.formers_install.conclusion == 'success'
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh grouped_gemm
           glm45_exit_code=$?
           if [[ "$glm45_exit_code" != "0" ]]; then
             echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m"
@@ -578,29 +578,29 @@ jobs:
           fi
           '
 
-      - name: Qwen pre-train
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt
-          '
+      # - name: Qwen pre-train
+      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+      #   run: |
+      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
+      #     source /root/proxy
+      #     timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt
+      #     '
         
-      - name: Qwen sft
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft
-          '
+      # - name: Qwen sft
+      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+      #   run: |
+      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
+      #     source /root/proxy
+      #     timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft
+      #     '
 
-      - name: Qwen lora
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora
-          '
+      # - name: Qwen lora
+      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+      #   run: |
+      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
+      #     source /root/proxy
+      #     timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora
+      #     '
 
       - name: Terminate and delete the container
         if: ${{ always() }}
diff --git a/tests/integration_test/glm45_pt_a100.sh b/tests/integration_test/glm45_a100.sh
similarity index 56%
rename from tests/integration_test/glm45_pt_a100.sh
rename to tests/integration_test/glm45_a100.sh
index 8e2e1a0dacc..ae0bedbebd0 100644
--- a/tests/integration_test/glm45_pt_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -24,10 +24,12 @@ tar -xf glm45_fleet.12-18.tar # glm45_fleet
 cd $root_dir/glm45_fleet
 export cur_dir=$(pwd)
 
-config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt.yaml
-export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt
-
-yq eval '.expert_model_parallel_size = 1
+step=$1
+if [[ ${step} == "pt" ]]; then
+  echo "Run GLM4.5 pretrain test"
+  config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt.yaml
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt
+  yq eval '.expert_model_parallel_size = 1
     | .num_hidden_layers = 2
     | .per_device_train_batch_size = 1
     | .use_expert_parallel = false
@@ -38,7 +40,59 @@ yq eval '.expert_model_parallel_size = 1
     | .logging_dir = strenv(cur_dir) + "/vdl_log"
     | .output_dir = strenv(cur_dir) + "/checkpoints/pretrain"' \
   $config_yaml > ${config_yaml}.tmp
-mv ${config_yaml}.tmp $config_yaml
+  mv ${config_yaml}.tmp $config_yaml
+elif [[ ${step} == "sft" ]]; then
+  echo "Run GLM4.5 sft test"
+  config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_sft.yaml
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft
+  yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+    | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+    | .model_name_or_path = strenv(cur_dir) + "/checkpoints/pretrain"
+    | .logging_dir = strenv(cur_dir) + "/glm_full_pp_vdl_log"
+    | .num_empty_layers_add_in_head = 0
+    | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"' \
+   $config_yaml > ${config_yaml}.tmp
+  mv ${config_yaml}.tmp $config_yaml
+elif [[ ${step} == "lora" ]]; then
+  echo "Run GLM4.5 multi lora test"
+  config_lora_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml
+
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft
+
+  yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+      | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+      | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
+      | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log"
+      | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"' \
+    $config_lora_yaml > ${config_lora_yaml}.tmp
+  mv ${config_lora_yaml}.tmp $config_lora_yaml
+elif [[ ${step} == "dpo" ]]; then
+  echo "Run GLM4.5 dpo test"
+  config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo
+  yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+      | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+      | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
+      | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log"
+      | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \
+    $config_dpo_yaml > ${config_dpo_yaml}.tmp
+  mv ${config_dpo_yaml}.tmp $config_dpo_yaml
+elif [[ ${step} == "grouped_gemm" ]]; then
+  echo "Run GLM4.5 grouped_gemm test"
+  export config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt_grouped_gemm.yaml
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt
+  yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+      | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+      | .model_name_or_path = strenv(CACHE_DIR) + "/glm45/GLM-4.5-Air"
+      | .per_device_train_batch_size = 1
+      | .num_hidden_layers = 2
+      | .use_expert_parallel = false
+      | .stage1_overlap = false
+      | .logging_dir = strenv(data_dir) + "/vdl_log"
+      | .output_dir = strenv(data_dir) + "/checkpoints"' \
+    $config_yaml > ${config_yaml}.tmp
+  mv ${config_yaml}.tmp $config_yaml
+fi
 
 rm -rf checkpoints/
 rm -rf vdl_log/
@@ -50,8 +104,8 @@ unset http_proxy https_proxy
 export FLAGS_embedding_deterministic=1
 export FLAGS_cudnn_deterministic=1
 
-log_file=glm45_pt_a100.txt
-gt_loss_file=glm45_pt_multi_card_a100_gt_loss.txt
+log_file=glm45_${step}_a100.txt
+gt_loss_file=glm45_${step}_multi_card_a100_gt_loss.txt
 
 set +e
 FLAGS_use_stride_compute_kernel=False NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 coverage run $(which paddleformers-cli) train $config_yaml 2>&1 | tee ./${log_file}

From 058e84beb6ddb04e3bfe02bbe6f664fa062e7706 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Mon, 26 Jan 2026 17:38:26 +0800
Subject: [PATCH 03/21] glm45_a100.sh: download fixtures only when glm45_fleet is missing

---
 tests/integration_test/glm45_a100.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index ae0bedbebd0..d569e22f1bf 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -19,8 +19,11 @@ if [ -f 'PaddleFleet/.venv/bin/activate' ]; then
    source PaddleFleet/.venv/bin/activate
 fi
 
-wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/glm45/glm45_fleet.12-18.tar --no-check-certificate
-tar -xf glm45_fleet.12-18.tar # glm45_fleet
+if [ ! -d "$root_dir/glm45_fleet" ]; then
+  wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/glm45/glm45_fleet.12-18.tar --no-check-certificate
+  ar -xf glm45_fleet.12-18.tar
+fi
+
 cd $root_dir/glm45_fleet
 export cur_dir=$(pwd)
 
@@ -86,7 +89,6 @@ elif [[ ${step} == "grouped_gemm" ]]; then
       | .model_name_or_path = strenv(CACHE_DIR) + "/glm45/GLM-4.5-Air"
       | .per_device_train_batch_size = 1
       | .num_hidden_layers = 2
-      | .use_expert_parallel = false
       | .stage1_overlap = false
       | .logging_dir = strenv(data_dir) + "/vdl_log"
       | .output_dir = strenv(data_dir) + "/checkpoints"' \

From ddcdd5cb2af5ba02bad9576a29c9747375f0c4cd Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Mon, 26 Jan 2026 19:01:37 +0800
Subject: [PATCH 04/21] Add qwen3_a100.sh, re-enable Qwen CI steps, fix tar typo

---
 .github/workflows/fleet-model-test.yml |  42 ++++----
 tests/integration_test/glm45_a100.sh   |   2 +-
 tests/integration_test/qwen3_a100.sh   | 133 +++++++++++++++++++++++++
 3 files changed, 155 insertions(+), 22 deletions(-)
 create mode 100644 tests/integration_test/qwen3_a100.sh

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index 5618a266f3b..71cc9bb055f 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -578,29 +578,29 @@ jobs:
           fi
           '
 
-      # - name: Qwen pre-train
-      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-      #   run: |
-      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
-      #     source /root/proxy
-      #     timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt
-      #     '
+      - name: Qwen pre-train
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt
+          '
         
-      # - name: Qwen sft
-      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-      #   run: |
-      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
-      #     source /root/proxy
-      #     timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft
-      #     '
+      - name: Qwen sft
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft
+          '
 
-      # - name: Qwen lora
-      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-      #   run: |
-      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
-      #     source /root/proxy
-      #     timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora
-      #     '
+      - name: Qwen lora
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora
+          '
 
       - name: Terminate and delete the container
         if: ${{ always() }}
diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index d569e22f1bf..72796dbc20f 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -21,7 +21,7 @@ fi
 
 if [ ! -d "$root_dir/glm45_fleet" ]; then
   wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/glm45/glm45_fleet.12-18.tar --no-check-certificate
-  ar -xf glm45_fleet.12-18.tar
+  tar -xf glm45_fleet.12-18.tar
 fi
 
 cd $root_dir/glm45_fleet
diff --git a/tests/integration_test/qwen3_a100.sh b/tests/integration_test/qwen3_a100.sh
new file mode 100644
index 00000000000..36b9b069fe8
--- /dev/null
+++ b/tests/integration_test/qwen3_a100.sh
@@ -0,0 +1,133 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -exo pipefail  # -e: stop on the first failing command; -x: trace commands; pipefail: a pipeline fails if any stage fails
+export root_dir=$(pwd)  # directory the script is started from; PaddleFormers/ and bos/ paths below are resolved against it
+
+step=$1  # stage to run: "pt" or "sft"; any other value (including empty) takes the lora branch below
+
+if [[ ! -d $CACHE_DIR/Qwen3-30B-A3B ]]; then  # download and unpack the model only when it is not already cached
+    pushd $CACHE_DIR
+    wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/Qwen/Qwen3-30B-A3B.tar.gz --no-check-certificate
+    tar xf Qwen3-30B-A3B.tar.gz  # NOTE(review): assumes the archive unpacks to ./Qwen3-30B-A3B - confirm
+    popd
+fi
+
+if [ -f 'PaddleFleet/.venv/bin/activate' ]; then  # activate the PaddleFleet virtualenv when one is present
+   source PaddleFleet/.venv/bin/activate
+fi
+
+if [[ "$step" == "pt" ]]; then  # pre-training: starts from the cached base model
+    export config_yaml=$root_dir/PaddleFormers/tests/config/ci/qwen3_multicard_pt.yaml
+    export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt  # exported so yq can read it through strenv()
+    export model_name_or_path=$CACHE_DIR/Qwen3-30B-A3B
+    export output_dir=$root_dir/checkpoints/qwen-pt
+    yq eval '.moe_grouped_gemm = false' \
+    $config_yaml > ${config_yaml}.tmp
+    mv ${config_yaml}.tmp $config_yaml  # yq wrote to a temp file; move it over the original config
+elif [[ "$step" == "sft" ]]; then  # SFT: starts from the directory the pt stage uses as output_dir
+    export config_yaml=$root_dir/PaddleFormers/tests/config/ci/qwen3_multicard_sft.yaml
+    export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft
+    export model_name_or_path=$root_dir/checkpoints/qwen-pt
+    export output_dir=$root_dir/checkpoints/qwen-sft
+    yq eval '.moe_grouped_gemm = false' \
+    $config_yaml > ${config_yaml}.tmp
+    mv ${config_yaml}.tmp $config_yaml
+else  # every other step value: LoRA on top of the sft output; moe_grouped_gemm is left as the config file has it
+    export config_yaml=$root_dir/PaddleFormers/tests/config/ci/qwen3_multicard_lora.yaml
+    export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft
+    export model_name_or_path=$root_dir/checkpoints/qwen-sft
+    export output_dir=$root_dir/checkpoints/qwen-lora
+fi
+
+yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+    | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+    | .model_name_or_path = strenv(model_name_or_path)
+    | .output_dir = strenv(output_dir)' \
+   $config_yaml > ${config_yaml}.tmp
+mv ${config_yaml}.tmp $config_yaml  # common overrides for all steps: dataset paths, model path and output dir from the exports above
+
+rm -rf ./outputs  # remove ./outputs and the distributed log directory before the run
+rm -rf paddleformers_dist_log
+master=$(hostname -i)  # this host's address; passed as MASTER_ADDR to the training command
+port=36677  # fixed port; passed as MASTER_PORT to the training command
+
+export FLAGS_embedding_deterministic=1  # NOTE(review): presumably forces deterministic kernels so losses match the stored reference - confirm
+export FLAGS_cudnn_deterministic=1
+export FLAGS_use_stride_compute_kernel=False
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7  # expose GPU indices 0-7 to the training processes
+
+unset http_proxy https_proxy  # drop proxy settings before training
+
+log_file=qwen_$step.txt  # training output is tee'd into this file
+gt_loss_file=qwen_${step}_multi_card_gt_loss.txt  # name of the reference loss file downloaded for comparison
+
+set +e  # from here on exit codes are inspected explicitly instead of aborting the script
+NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port coverage run $(which paddleformers-cli) train $config_yaml 2>&1 | tee ./${log_file}
+
+exit_code=$?  # pipeline status; pipefail (set at the top of the script) keeps tee from masking a training failure
+if [ $exit_code -ne 0 ]; then
+   echo "qwen multi-cards training failed, try to check the log file"
+   python $root_dir/PaddleFormers/tests/check_log_for_exitcode.py ./${log_file} "***** train metrics *****"  # NOTE(review): presumably exits 0 when the log contains this marker - confirm
+   check_exit_code=$?
+   if [ $check_exit_code -ne 0 ]; then
+     echo "Failed to find '***** train metrics *****' in log file."
+     exit 1
+   else
+     echo "Log check passed."
+   fi
+else
+    echo "Test passed."
+fi
+
+# export repo_name=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}')
+export repo_name=PaddleFleet  # hard-coded; the derivation from GITHUB_REPO_NAME above is disabled
+# if [[ "${PP}" == "rel" ]]; then
+#   export pppatch="_PPrel"
+# fi
+# if [[ "${PF}" == rel* ]]; then
+#   export pfpatch="rel"
+# fi
+wget --no-proxy --no-check-certificate https://xly-devops.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}_latest/${gt_loss_file}  # pfpatch/pppatch are not set by this script (see disabled code above), so they are empty unless inherited
+if [ $? -ne 0 ]; then  # no reference loss file published for this step
+  echo "To request precision checks for new models, please contact swgu98."
+  exit 1
+fi
+
+log_loss_file=${log_file%.*}_loss.${log_file##*.}  # inserts "_loss" before the extension, e.g. qwen_pt.txt -> qwen_pt_loss.txt
+python $root_dir/PaddleFormers/tests/integration_test/check_loss.py \
+   --compare_step 10 \
+   --log_file ./${log_file} \
+   --log_loss_file ./${log_loss_file} \
+   --gt_file ./${gt_loss_file}
+
+if [ $? -ne 0 ]; then  # check_loss.py returned non-zero: run the approval check before accepting the new losses
+  pushd $root_dir/PaddleFormers
+  source /root/proxy
+  bash $root_dir/PaddleFormers/tests/integration_test/check_precision_approval.sh
+  if [ $? -ne 0 ]; then
+    echo -e "\033[31mThe precision has been changed and requires approvals.\033[0m"
+    exit 1
+  fi
+  popd
+  rm ${gt_loss_file} && mv ${log_loss_file} ${gt_loss_file}  # approval check passed: this run's losses replace the downloaded reference
+  if [ ! -f precision_list.txt ]; then
+    wget --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID}/precision_list.txt  # try the per-PR list first; PR_ID must come from the environment
+    if [ $? -ne 0 ]; then  # per-PR list missing: fall back to the _latest list and pass it to BosClient.py with the per-PR path
+      wget --no-proxy --no-check-certificate https://xly-devops.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}_latest/precision_list.txt
+      python $root_dir/bos/BosClient.py precision_list.txt paddle-github-action/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID}  # NOTE(review): BosClient.py presumably uploads the file to this path - confirm
+    fi
+  fi
+  python $root_dir/bos/BosClient.py ${gt_loss_file} paddle-github-action/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID}
+fi
\ No newline at end of file

From ae63fc26d2103b56b54f772739bd97442ea5428d Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Mon, 26 Jan 2026 19:22:56 +0800
Subject: [PATCH 05/21] Install torchcodec in the fleet-model-test CI setup step

---
 .github/workflows/fleet-model-test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index 71cc9bb055f..a3259ede997 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -487,6 +487,7 @@ jobs:
           pip install bce-python-sdk==0.8.74
           pip install coverage==7.6.1
           pip install librosa==0.11.0
+          pip install torchcodec
           '
 
       - name: GLM4.5 pre-train

From 7b0f02d3ebfa0ee76eafd98f026815448bad2f42 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Tue, 27 Jan 2026 15:55:08 +0800
Subject: [PATCH 06/21] Remove torchcodec install from the fleet-model-test CI setup step

---
 .github/workflows/fleet-model-test.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index a3259ede997..71cc9bb055f 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -487,7 +487,6 @@ jobs:
           pip install bce-python-sdk==0.8.74
           pip install coverage==7.6.1
           pip install librosa==0.11.0
-          pip install torchcodec
           '
 
       - name: GLM4.5 pre-train

From 15f4e8ea2ee22b15e0a6578bc8b7959a60d9a66e Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Tue, 27 Jan 2026 16:15:28 +0800
Subject: [PATCH 07/21] fix

---
 .github/workflows/fleet-model-test.yml |  6 +++---
 tests/integration_test/glm45_a100.sh   | 14 +++++++-------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index 71cc9bb055f..dcbcf926bc9 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -583,7 +583,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh pt
           '
         
       - name: Qwen sft
@@ -591,7 +591,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh sft
           '
 
       - name: Qwen lora
@@ -599,7 +599,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh lora
           '
 
       - name: Terminate and delete the container
diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index 72796dbc20f..cb164bdb0ce 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -58,7 +58,7 @@ elif [[ ${step} == "sft" ]]; then
   mv ${config_yaml}.tmp $config_yaml
 elif [[ ${step} == "lora" ]]; then
   echo "Run GLM4.5 multi lora test"
-  config_lora_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml
+  config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml
 
   export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft
 
@@ -67,19 +67,19 @@ elif [[ ${step} == "lora" ]]; then
       | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
       | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log"
       | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"' \
-    $config_lora_yaml > ${config_lora_yaml}.tmp
-  mv ${config_lora_yaml}.tmp $config_lora_yaml
+    $config_yaml > ${config_yaml}.tmp
+  mv ${config_yaml}.tmp $config_yaml
 elif [[ ${step} == "dpo" ]]; then
   echo "Run GLM4.5 dpo test"
-  config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml
+  config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml
   export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo
   yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
       | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
       | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
       | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log"
       | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \
-    $config_dpo_yaml > ${config_dpo_yaml}.tmp
-  mv ${config_dpo_yaml}.tmp $config_dpo_yaml
+    $config_yaml > ${config_yaml}.tmp
+  mv ${config_yaml}.tmp $config_yaml
 elif [[ ${step} == "grouped_gemm" ]]; then
   echo "Run GLM4.5 grouped_gemm test"
   export config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt_grouped_gemm.yaml
@@ -90,13 +90,13 @@ elif [[ ${step} == "grouped_gemm" ]]; then
       | .per_device_train_batch_size = 1
       | .num_hidden_layers = 2
       | .stage1_overlap = false
+      | .moe_grouped_gemm = false
       | .logging_dir = strenv(data_dir) + "/vdl_log"
       | .output_dir = strenv(data_dir) + "/checkpoints"' \
     $config_yaml > ${config_yaml}.tmp
   mv ${config_yaml}.tmp $config_yaml
 fi
 
-rm -rf checkpoints/
 rm -rf vdl_log/
 master=$(hostname -i)
 port=36677

From 8f3b8afd799dbdb4435ebaea483cec7ad2a974f7 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Tue, 27 Jan 2026 17:00:03 +0800
Subject: [PATCH 08/21] fix deepep

---
 tests/integration_test/glm45_a100.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index cb164bdb0ce..4825aade05f 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -66,7 +66,8 @@ elif [[ ${step} == "lora" ]]; then
       | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
       | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
       | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log"
-      | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"' \
+      | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"
+      | del(.moe_token_dispatcher_type)' \
     $config_yaml > ${config_yaml}.tmp
   mv ${config_yaml}.tmp $config_yaml
 elif [[ ${step} == "dpo" ]]; then
@@ -77,6 +78,7 @@ elif [[ ${step} == "dpo" ]]; then
       | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
       | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
       | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log"
+      | .moe_token_dispatcher_type: "deepep"
       | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \
     $config_yaml > ${config_yaml}.tmp
   mv ${config_yaml}.tmp $config_yaml

From 52b75b17f524aab10fdd9689140df6ca0f8ad830 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Tue, 27 Jan 2026 19:17:24 +0800
Subject: [PATCH 09/21] fix dpo

---
 tests/integration_test/glm45_a100.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index 4825aade05f..86600436954 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -59,9 +59,7 @@ elif [[ ${step} == "sft" ]]; then
 elif [[ ${step} == "lora" ]]; then
   echo "Run GLM4.5 multi lora test"
   config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml
-
   export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft
-
   yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
       | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
       | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
@@ -78,7 +76,7 @@ elif [[ ${step} == "dpo" ]]; then
       | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
       | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
       | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log"
-      | .moe_token_dispatcher_type: "deepep"
+      | .num_empty_layers_add_in_tail = 0
       | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \
     $config_yaml > ${config_yaml}.tmp
   mv ${config_yaml}.tmp $config_yaml

From 3f6cdd4d2cb127b4d1f8b83ed48fd57133c5ea7e Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Wed, 28 Jan 2026 11:49:04 +0800
Subject: [PATCH 10/21] add

---
 .github/workflows/fleet-model-test.yml | 684 ++++++++++++-------------
 1 file changed, 342 insertions(+), 342 deletions(-)

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index dcbcf926bc9..8cb4fdf2d93 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -68,346 +68,346 @@ jobs:
           fi
           echo "is_md_only: $(cat $GITHUB_OUTPUT | grep is_md_only || echo '未找到')"
     
-  # integration-test-H20-single-card:
-  #   needs: check_documents_type
-  #   if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }}
-  #   name: Integration test (H20, single card)
-  #   runs-on:
-  #     group: Fleet-H-single-card
-  #   env:
-  #     PIP_CACHE_DIR: /home/.cache/pip
-  #     CACHE_DIR: /home/.cache
-  #     TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-single-card
-  #   steps:
-  #     - name: Determine the runner
-  #       run: |
-  #         gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 ))
-  #         echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV
+  integration-test-H20-single-card:
+    needs: check_documents_type
+    if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }}
+    name: Integration test (H20, single card)
+    runs-on:
+      group: Fleet-H-single-card
+    env:
+      PIP_CACHE_DIR: /home/.cache/pip
+      CACHE_DIR: /home/.cache
+      TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-single-card
+    steps:
+      - name: Determine the runner
+        run: |
+          gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 ))
+          echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV
 
-  #     - name: Check docker image and run container
-  #       env:
-  #         GPU_DEVICES: ${{ env.GPU_DEVICES }}
-  #       run: |
-  #         container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
-  #         echo "container_name=${container_name}" >> ${{ github.env }}
-  #         docker pull $docker_image
-  #         set -x
-  #         docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \
-  #           -v "/dev/shm:/dev/shm"  \
-  #           -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
-  #           -v ${{ github.workspace }}/../../..:/root \
-  #           -v /ssd1/paddle-1/action_cache:/home/.cache \
-  #           -v ${{ github.workspace }}:/workspace \
-  #           -e BRANCH \
-  #           -e PR_ID \
-  #           -e COMMIT_ID \
-  #           -e PADDLE_ROOT \
-  #           -e ci_scripts \
-  #           -e CACHE_DIR \
-  #           -e no_proxy \
-  #           -e CI_name \
-  #           -e PIP_CACHE_DIR \
-  #           -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
-  #           -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
-  #           -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
-  #           -e GITHUB_REPO_NAME="${{ github.repository }}" \
-  #           -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
-  #           -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
-  #           -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
-  #           -e GITHUB_RUN_ID="${{ github.run_id }}" \
-  #           -e PR_USER="${{ github.event.pull_request.user.login }}" \
-  #           -w /workspace --network host ${docker_image}
+      - name: Check docker image and run container
+        env:
+          GPU_DEVICES: ${{ env.GPU_DEVICES }}
+        run: |
+          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
+          echo "container_name=${container_name}" >> ${{ github.env }}
+          docker pull $docker_image
+          set -x
+          docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \
+            -v "/dev/shm:/dev/shm"  \
+            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
+            -v ${{ github.workspace }}/../../..:/root \
+            -v /ssd1/paddle-1/action_cache:/home/.cache \
+            -v ${{ github.workspace }}:/workspace \
+            -e BRANCH \
+            -e PR_ID \
+            -e COMMIT_ID \
+            -e PADDLE_ROOT \
+            -e ci_scripts \
+            -e CACHE_DIR \
+            -e no_proxy \
+            -e CI_name \
+            -e PIP_CACHE_DIR \
+            -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
+            -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
+            -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
+            -e GITHUB_REPO_NAME="${{ github.repository }}" \
+            -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
+            -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
+            -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
+            -e GITHUB_RUN_ID="${{ github.run_id }}" \
+            -e PR_USER="${{ github.event.pull_request.user.login }}" \
+            -w /workspace --network host ${docker_image}
 
-  #     - name: Install PaddleFormers
-  #       id: formers_install
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         rm -rf * .[^.]*
-  #         echo $PR_USER
-  #         source /root/proxy
-  #         mkdir -p /home/.cache/pip
-  #         pip cache dir
-  #         pip install --upgrade pip
-  #         git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH}
-  #         cd PaddleFormers
-  #         git status
-  #         git config --global --add safe.directory /workspace/PaddleFormers
-  #         git config user.name "PaddleCI"
-  #         git config user.email "paddle_ci@example.com"
-  #         git config pull.rebase false
-  #         git pull --no-edit origin pull/${PR_ID}/head
-  #         export UV_SKIP_WHEEL_FILENAME_CHECK=1
-  #         sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py
-  #         sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py
-  #         pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
-  #         wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
-  #         pip uninstall paddlefleet -y
-  #         pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
-  #         # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir
-  #         echo "paddle commit:"
-  #         python -c "import paddle; print(paddle.version.commit)"
-  #         echo "paddlefleet commit:"
-  #         python -c "import paddlefleet; print(paddlefleet.version.commit)"
-  #         cd /workspace
-  #         wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
-  #         mkdir bos
-  #         tar xf bos_new.tar.gz -C bos
-  #         pip install bce-python-sdk==0.8.74
-  #         pip install coverage==7.6.1
-  #         pip install librosa==0.11.0
-  #         '
+      - name: Install PaddleFormers
+        id: formers_install
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          rm -rf * .[^.]*
+          echo $PR_USER
+          source /root/proxy
+          mkdir -p /home/.cache/pip
+          pip cache dir
+          pip install --upgrade pip
+          git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH}
+          cd PaddleFormers
+          git status
+          git config --global --add safe.directory /workspace/PaddleFormers
+          git config user.name "PaddleCI"
+          git config user.email "paddle_ci@example.com"
+          git config pull.rebase false
+          git pull --no-edit origin pull/${PR_ID}/head
+          export UV_SKIP_WHEEL_FILENAME_CHECK=1
+          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py
+          sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py
+          pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
+          wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
+          pip uninstall paddlefleet -y
+          pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
+          # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir
+          echo "paddle commit:"
+          python -c "import paddle; print(paddle.version.commit)"
+          echo "paddlefleet commit:"
+          python -c "import paddlefleet; print(paddlefleet.version.commit)"
+          cd /workspace
+          wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
+          mkdir bos
+          tar xf bos_new.tar.gz -C bos
+          pip install bce-python-sdk==0.8.74
+          pip install coverage==7.6.1
+          pip install librosa==0.11.0
+          '
 
-  #     - name: Proprocess for integration test
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/preprocess.sh
-  #         preprocess_exit_code=$?
-  #         if [[ "$preprocess_exit_code" != "0" ]]; then
-  #           echo -e "::error:: \033[31mPreprocess failed.\033[0m"
-  #           exit 1
-  #         else
-  #           echo -e "\033[32mPreprocess succeeded.\033[0m"
-  #         fi
-  #         '
+      - name: Proprocess for integration test
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/preprocess.sh
+          preprocess_exit_code=$?
+          if [[ "$preprocess_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mPreprocess failed.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mPreprocess succeeded.\033[0m"
+          fi
+          '
 
-  #     - name: Integration test (GLM4.5 single-card)
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh
-  #         glm45_single_card_exit_code=$?
-  #         if [[ "$glm45_single_card_exit_code" != "0" ]]; then
-  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 single-card.\033[0m"
-  #           exit 1
-  #         else
-  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m"
-  #         fi
-  #         '
+      - name: Integration test (GLM4.5 single-card)
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh
+          glm45_single_card_exit_code=$?
+          if [[ "$glm45_single_card_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 single-card.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m"
+          fi
+          '
 
 
-  #     - name: Integration test (Qwen3-30B-A3B single-card)
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh
-  #         qwen3_single_card_exit_code=$?
-  #         if [[ "$qwen3_single_card_exit_code" != "0" ]]; then
-  #           echo -e "::error:: \033[31mIntegration test failed: Qwen3-30B-A3B single-card.\033[0m"
-  #           exit 1
-  #         else
-  #           echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m"
-  #         fi
-  #         '
+      - name: Integration test (Qwen3-30B-A3B single-card)
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh
+          qwen3_single_card_exit_code=$?
+          if [[ "$qwen3_single_card_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: Qwen3-30B-A3B single-card.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m"
+          fi
+          '
 
-  #     - name: Terminate and delete the container
-  #       if: ${{ always() }}
-  #       run: |
-  #         set +e
-  #         docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
-  #         docker rm -f ${{ env.container_name }}
+      - name: Terminate and delete the container
+        if: ${{ always() }}
+        run: |
+          set +e
+          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
+          docker rm -f ${{ env.container_name }}
 
 
-  # integration-test-H20-multi-card:
-  #   needs: check_documents_type
-  #   if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }}
-  #   name: Integration test (H20, multi-card)
-  #   runs-on:
-  #     group: Fleet-H-multi-card
-  #   env:
-  #     PIP_CACHE_DIR: /home/.cache/pip
-  #     CACHE_DIR: /home/.cache
-  #     TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-multi-card
-  #   steps:
-  #     - name: Check docker image and run container
-  #       run: |
-  #         container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
-  #         echo "container_name=${container_name}" >> ${{ github.env }}
-  #         docker pull $docker_image
-  #         docker run -d -t --name ${container_name} --gpus all --shm-size=32G \
-  #           -v "/dev/shm:/dev/shm"  \
-  #           -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
-  #           -v ${{ github.workspace }}/../../..:/root \
-  #           -v /ssd1/paddle-1/action_cache:/home/.cache \
-  #           -v ${{ github.workspace }}:/workspace \
-  #           -e BRANCH \
-  #           -e PR_ID \
-  #           -e COMMIT_ID \
-  #           -e PADDLE_ROOT \
-  #           -e ci_scripts \
-  #           -e CACHE_DIR \
-  #           -e no_proxy \
-  #           -e CI_name \
-  #           -e PIP_CACHE_DIR \
-  #           -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
-  #           -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
-  #           -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
-  #           -e GITHUB_REPO_NAME="${{ github.repository }}" \
-  #           -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
-  #           -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
-  #           -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
-  #           -e GITHUB_RUN_ID="${{ github.run_id }}" \
-  #           -e PR_USER="${{ github.event.pull_request.user.login }}" \
-  #           -w /workspace --network host ${docker_image}
+  integration-test-H20-multi-card:
+    needs: check_documents_type
+    if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }}
+    name: Integration test (H20, multi-card)
+    runs-on:
+      group: Fleet-H-multi-card
+    env:
+      PIP_CACHE_DIR: /home/.cache/pip
+      CACHE_DIR: /home/.cache
+      TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-multi-card
+    steps:
+      - name: Check docker image and run container
+        run: |
+          container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
+          echo "container_name=${container_name}" >> ${{ github.env }}
+          docker pull $docker_image
+          docker run -d -t --name ${container_name} --gpus all --shm-size=32G \
+            -v "/dev/shm:/dev/shm"  \
+            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
+            -v ${{ github.workspace }}/../../..:/root \
+            -v /ssd1/paddle-1/action_cache:/home/.cache \
+            -v ${{ github.workspace }}:/workspace \
+            -e BRANCH \
+            -e PR_ID \
+            -e COMMIT_ID \
+            -e PADDLE_ROOT \
+            -e ci_scripts \
+            -e CACHE_DIR \
+            -e no_proxy \
+            -e CI_name \
+            -e PIP_CACHE_DIR \
+            -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \
+            -e GITHUB_HEAD_REF="${{ github.head_ref }}" \
+            -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \
+            -e GITHUB_REPO_NAME="${{ github.repository }}" \
+            -e GITHUB_EVENT_NAME="${{ github.event_name }}" \
+            -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \
+            -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \
+            -e GITHUB_RUN_ID="${{ github.run_id }}" \
+            -e PR_USER="${{ github.event.pull_request.user.login }}" \
+            -w /workspace --network host ${docker_image}
 
-  #     - name: Install PaddleFormers
-  #       id: formers_install
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         rm -rf * .[^.]*
-  #         source /root/proxy
-  #         mkdir -p /home/.cache/pip
-  #         pip cache dir
-  #         pip install --upgrade pip
-  #         git clone https://github.com/PaddlePaddle/PaddleFormers.git  -b ${BRANCH}
-  #         cd PaddleFormers
-  #         git status
-  #         git config --global --add safe.directory /workspace/PaddleFormers
-  #         git config user.name "PaddleCI"
-  #         git config user.email "paddle_ci@example.com"
-  #         git config pull.rebase false
-  #         git pull --no-edit origin pull/${PR_ID}/head
-  #         export UV_SKIP_WHEEL_FILENAME_CHECK=1
-  #         pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
-  #         wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
-  #         pip uninstall paddlefleet -y
-  #         pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
-  #         # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir
-  #         echo "paddle commit:"
-  #         python -c "import paddle; print(paddle.version.commit)"
-  #         echo "paddlefleet commit:"
-  #         python -c "import paddlefleet; print(paddlefleet.version.commit)"
-  #         cd /workspace
-  #         wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
-  #         mkdir bos
-  #         tar xf bos_new.tar.gz -C bos
-  #         pip install bce-python-sdk==0.8.74
-  #         pip install coverage==7.6.1
-  #         pip install librosa==0.11.0
-  #         '
+      - name: Install PaddleFormers
+        id: formers_install
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          rm -rf * .[^.]*
+          source /root/proxy
+          mkdir -p /home/.cache/pip
+          pip cache dir
+          pip install --upgrade pip
+          git clone https://github.com/PaddlePaddle/PaddleFormers.git  -b ${BRANCH}
+          cd PaddleFormers
+          git status
+          git config --global --add safe.directory /workspace/PaddleFormers
+          git config user.name "PaddleCI"
+          git config user.email "paddle_ci@example.com"
+          git config pull.rebase false
+          git pull --no-edit origin pull/${PR_ID}/head
+          export UV_SKIP_WHEEL_FILENAME_CHECK=1
+          pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/
+          wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
+          pip uninstall paddlefleet -y
+          pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl
+          # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir
+          echo "paddle commit:"
+          python -c "import paddle; print(paddle.version.commit)"
+          echo "paddlefleet commit:"
+          python -c "import paddlefleet; print(paddlefleet.version.commit)"
+          cd /workspace
+          wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate
+          mkdir bos
+          tar xf bos_new.tar.gz -C bos
+          pip install bce-python-sdk==0.8.74
+          pip install coverage==7.6.1
+          pip install librosa==0.11.0
+          '
       
-  #     - name: GLM4.5 pre-train
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt.sh
-  #         glm45_exit_code=$?
-  #         if [[ "$glm45_exit_code" != "0" ]]; then
-  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
-  #           exit 1
-  #         else
-  #           echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m"
-  #         fi
-  #         '
-  #     - name: GLM4.5 sft
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_sft.sh
-  #         glm45_exit_code=$?
-  #         if [[ "$glm45_exit_code" != "0" ]]; then
-  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m"
-  #           exit 1
-  #         else
-  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m"
-  #         fi
-  #         '
+      - name: GLM4.5 pre-train
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt.sh
+          glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m"
+          fi
+          '
+      - name: GLM4.5 sft
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_sft.sh
+          glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m"
+          fi
+          '
       
-  #     - name: GLM4.5 lora
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_lora.sh
-  #         glm45_exit_code=$?
-  #         if [[ "$glm45_exit_code" != "0" ]]; then
-  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
-  #           exit 1
-  #         else
-  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
-  #         fi
-  #         '
+      - name: GLM4.5 lora
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_lora.sh
+          glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
+          fi
+          '
       
-  #     - name: GLM4.5 dpo
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh
-  #         glm45_exit_code=$?
-  #         if [[ "$glm45_exit_code" != "0" ]]; then
-  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m"
-  #           exit 1
-  #         else
-  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m"
-  #         fi
-  #         '
+      - name: GLM4.5 dpo
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh
+          glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m"
+          fi
+          '
 
-  #     - name: GLM4.5 pre-train (FP8)
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh
-  #         glm45_exit_code=$?
-  #         if [[ "$glm45_exit_code" != "0" ]]; then
-  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
-  #           exit 1
-  #         else
-  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m"
-  #         fi
-  #         '
+      - name: GLM4.5 pre-train (FP8)
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh
+          glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m"
+          fi
+          '
 
-  #     - name: GLM4.5 pre-train (Grouped GEMM)
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh
-  #         glm45_exit_code=$?
-  #         if [[ "$glm45_exit_code" != "0" ]]; then
-  #           echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m"
-  #           exit 1
-  #         else
-  #           echo -e "\033[32mIntegration test succeeded: GLM4.5 Grouped GEMM.\033[0m"
-  #         fi
-  #         '
+      - name: GLM4.5 pre-train (Grouped GEMM)
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh
+          glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 Grouped GEMM.\033[0m"
+          fi
+          '
 
-  #     - name: Qwen pre-train
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt
-  #         '
+      - name: Qwen pre-train
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt
+          '
         
-  #     - name: Qwen sft
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft
-  #         '
+      - name: Qwen sft
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft
+          '
 
-  #     - name: Qwen lora
-  #       if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-  #       run: |
-  #         docker exec -t ${{ env.container_name }} /bin/bash -ce '
-  #         source /root/proxy
-  #         timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora
-  #         '
+      - name: Qwen lora
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora
+          '
 
-  #     - name: Terminate and delete the container
-  #       if: ${{ always() }}
-  #       run: |
-  #         set +e
-  #         docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
-  #         docker rm -f ${{ env.container_name }}
+      - name: Terminate and delete the container
+        if: ${{ always() }}
+        run: |
+          set +e
+          docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*'
+          docker rm -f ${{ env.container_name }}
 
   integration-test-a100:
     needs: check_documents_type
@@ -518,20 +518,20 @@ jobs:
           fi
           '
       
-      - name: GLM4.5 lora
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
-          glm45_exit_code=$?
-          if [[ "$glm45_exit_code" != "0" ]]; then
-            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
-            exit 1
-          else
-            echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
-          fi
-          '
+      # - name: GLM4.5 lora
+      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+      #   run: |
+      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
+      #     source /root/proxy
+      #     timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
+      #     glm45_exit_code=$?
+      #     if [[ "$glm45_exit_code" != "0" ]]; then
+      #       echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
+      #       exit 1
+      #     else
+      #       echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
+      #     fi
+      #     '
       
       - name: GLM4.5 dpo
         if: (success() || failure()) && steps.formers_install.conclusion == 'success'
@@ -594,13 +594,13 @@ jobs:
           timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft
           '
 
-      - name: Qwen lora
-        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-        run: |
-          docker exec -t ${{ env.container_name }} /bin/bash -ce '
-          source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
-          '
+      # - name: Qwen lora
+      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+      #   run: |
+      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
+      #     source /root/proxy
+      #     timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
+      #     '
 
       - name: Terminate and delete the container
         if: ${{ always() }}

From 7a3659b25da157aacf0d76dd98e15368acb1d82c Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Wed, 28 Jan 2026 15:56:41 +0800
Subject: [PATCH 11/21] fix

---
 .github/workflows/fleet-model-test.yml | 56 ++++++++++----------------
 tests/integration_test/glm45_dpo.sh    |  2 +-
 2 files changed, 22 insertions(+), 36 deletions(-)

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index 8cb4fdf2d93..af719cf6366 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -518,20 +518,20 @@ jobs:
           fi
           '
       
-      # - name: GLM4.5 lora
-      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-      #   run: |
-      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
-      #     source /root/proxy
-      #     timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
-      #     glm45_exit_code=$?
-      #     if [[ "$glm45_exit_code" != "0" ]]; then
-      #       echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
-      #       exit 1
-      #     else
-      #       echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
-      #     fi
-      #     '
+      - name: GLM4.5 lora
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
+          glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m"
+          fi
+          '
       
       - name: GLM4.5 dpo
         if: (success() || failure()) && steps.formers_install.conclusion == 'success'
@@ -548,20 +548,6 @@ jobs:
           fi
           '
 
-      # - name: GLM4.5 pre-train (FP8)
-      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-      #   run: |
-      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
-      #     source /root/proxy
-      #     timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh fp8
-      #     glm45_exit_code=$?
-      #     if [[ "$glm45_exit_code" != "0" ]]; then
-      #       echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m"
-      #       exit 1
-      #     else
-      #       echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m"
-      #     fi
-      #     '
 
       - name: GLM4.5 pre-train (Grouped GEMM)
         if: (success() || failure()) && steps.formers_install.conclusion == 'success'
@@ -594,13 +580,13 @@ jobs:
           timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft
           '
 
-      # - name: Qwen lora
-      #   if: (success() || failure()) && steps.formers_install.conclusion == 'success'
-      #   run: |
-      #     docker exec -t ${{ env.container_name }} /bin/bash -ce '
-      #     source /root/proxy
-      #     timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
-      #     '
+      - name: Qwen lora
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
+          '
 
       - name: Terminate and delete the container
         if: ${{ always() }}
diff --git a/tests/integration_test/glm45_dpo.sh b/tests/integration_test/glm45_dpo.sh
index d636828bb2f..23d8bc742fa 100644
--- a/tests/integration_test/glm45_dpo.sh
+++ b/tests/integration_test/glm45_dpo.sh
@@ -27,7 +27,7 @@ config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml
 export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo
 yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
     | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
-    | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
+    | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"
     | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log"
     | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \
    $config_dpo_yaml > ${config_dpo_yaml}.tmp

From 74982f72794bad15213a6131817f451e56c51ed1 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Wed, 28 Jan 2026 16:11:40 +0800
Subject: [PATCH 12/21] fix lora

---
 tests/integration_test/glm45_a100.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index 86600436954..5b143fac611 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -65,6 +65,7 @@ elif [[ ${step} == "lora" ]]; then
       | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
       | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log"
       | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"
+      | .num_empty_layers_add_in_tail = 0
       | del(.moe_token_dispatcher_type)' \
     $config_yaml > ${config_yaml}.tmp
   mv ${config_yaml}.tmp $config_yaml

From 7d1ee49fb9714c23d561a8b80313ec18dfc11044 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Wed, 28 Jan 2026 16:28:23 +0800
Subject: [PATCH 13/21] fix

---
 .github/workflows/fleet-model-test.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index af719cf6366..1659f8169e5 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -569,7 +569,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh pt
           '
         
       - name: Qwen sft
@@ -577,7 +577,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh sft
           '
 
       - name: Qwen lora
@@ -585,7 +585,7 @@ jobs:
         run: |
           docker exec -t ${{ env.container_name }} /bin/bash -ce '
           source /root/proxy
-          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora
+          timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh lora
           '
 
       - name: Terminate and delete the container

From b863be96b84d4bead7943054ebc09ad7cdae66e2 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Wed, 28 Jan 2026 16:32:11 +0800
Subject: [PATCH 14/21] fix

---
 tests/integration_test/glm45_a100.sh | 2 +-
 tests/integration_test/qwen3_a100.sh | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index 5b143fac611..bbcbbc154ea 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -130,8 +130,8 @@ else
     echo "Test passed."
 fi
 
-# export repo_name=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}')
 export repo_name=PaddleFleet
+export REPO_NAME=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}')
 # if [[ "${PP}" == "rel" ]]; then
 #   export pppatch="_PPrel"
 # fi
diff --git a/tests/integration_test/qwen3_a100.sh b/tests/integration_test/qwen3_a100.sh
index 36b9b069fe8..d57f8f7694b 100644
--- a/tests/integration_test/qwen3_a100.sh
+++ b/tests/integration_test/qwen3_a100.sh
@@ -91,8 +91,9 @@ else
     echo "Test passed."
 fi
 
-# export repo_name=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}')
+
 export repo_name=PaddleFleet
+export REPO_NAME=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}')
 # if [[ "${PP}" == "rel" ]]; then
 #   export pppatch="_PPrel"
 # fi

From d7421979fff3c6f95a383833e64a607bd167b5dd Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Wed, 28 Jan 2026 19:42:37 +0800
Subject: [PATCH 15/21] fix

---
 tests/integration_test/glm45_dpo.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_test/glm45_dpo.sh b/tests/integration_test/glm45_dpo.sh
index 23d8bc742fa..d636828bb2f 100644
--- a/tests/integration_test/glm45_dpo.sh
+++ b/tests/integration_test/glm45_dpo.sh
@@ -27,7 +27,7 @@ config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml
 export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo
 yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
     | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
-    | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"
+    | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"
     | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log"
     | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \
    $config_dpo_yaml > ${config_dpo_yaml}.tmp

From 0b77de812fbecf74c6b363fc3c93c6918ca94962 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Thu, 29 Jan 2026 10:40:39 +0800
Subject: [PATCH 16/21] fix

---
 .github/workflows/fleet-model-test.yml   | 15 +++++++++++++++
 tests/integration_test/glm45_a100.sh     | 15 +++++++++++++++
 tests/integration_test/glm45_dpo_lora.sh | 10 +++-------
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml
index 93e47ea2bc0..28cbe50e190 100644
--- a/.github/workflows/fleet-model-test.yml
+++ b/.github/workflows/fleet-model-test.yml
@@ -571,6 +571,21 @@ jobs:
             echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m"
           fi
           '
+      
+      - name: GLM4.5 dpo_lora
+        if: (success() || failure()) && steps.formers_install.conclusion == 'success'
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -ce '
+          source /root/proxy
+          timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo_lora
+          glm45_exit_code=$?
+          if [[ "$glm45_exit_code" != "0" ]]; then
+            echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo lora.\033[0m"
+            exit 1
+          else
+            echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo lora.\033[0m"
+          fi
+          '
 
 
       - name: GLM4.5 pre-train (Grouped GEMM)
diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index bbcbbc154ea..88e68581ab0 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -96,6 +96,21 @@ elif [[ ${step} == "grouped_gemm" ]]; then
       | .output_dir = strenv(data_dir) + "/checkpoints"' \
     $config_yaml > ${config_yaml}.tmp
   mv ${config_yaml}.tmp $config_yaml
+elif [[ ${step} == "dpo_lora" ]]; then
+  echo "Run GLM4.5 dpo_lora test"
+  export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo
+  config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml
+  config_json=$CACHE_DIR/glm45/GLM-4.5-Air/config.json
+  yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+      | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
+      | .model_name_or_path = strenv(CACHE_DIR) + "/zai-org/GLM-4.5-Air-Base"
+      | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_lora_vdl_log"
+      | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_lora_ckpts"' \
+    $config_dpo_yaml > ${config_dpo_yaml}.tmp
+  mv ${config_dpo_yaml}.tmp $config_dpo_yaml
+else
+  echo "Unknown step: ${step}, please choose from [pt, sft, lora, dpo, grouped_gemm, flash_attention]"
+  exit 1
 fi
 
 rm -rf vdl_log/
diff --git a/tests/integration_test/glm45_dpo_lora.sh b/tests/integration_test/glm45_dpo_lora.sh
index 531e41ea55d..b6109d6ed16 100644
--- a/tests/integration_test/glm45_dpo_lora.sh
+++ b/tests/integration_test/glm45_dpo_lora.sh
@@ -22,16 +22,12 @@ fi
 cd $root_dir/glm45_fleet
 export cur_dir=$(pwd)
 
-# prepare dpo data
-wget https://paddle-qa.bj.bcebos.com/fleet/fleet_dpo.tar
-tar -xf fleet_dpo.tar
-
+export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo
 config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml
-
 config_json=$CACHE_DIR/glm45/GLM-4.5-Air/config.json
 
-yq '.train_dataset_path = strenv(cur_dir) + "/dpo_data/dpo_train.jsonl"
-    | .eval_dataset_path = strenv(cur_dir) + "/dpo_data/dpo_eval.jsonl"
+yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
+    | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
     | .model_name_or_path = strenv(CACHE_DIR) + "/zai-org/GLM-4.5-Air-Base"
     | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_lora_vdl_log"
     | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_lora_ckpts"' \

From ad41af0ea1418163b72617040fe0bdbf699d4429 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Thu, 29 Jan 2026 10:51:30 +0800
Subject: [PATCH 17/21] fix

---
 tests/integration_test/glm45_a100.sh | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index 88e68581ab0..74566234c61 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -99,18 +99,15 @@ elif [[ ${step} == "grouped_gemm" ]]; then
 elif [[ ${step} == "dpo_lora" ]]; then
   echo "Run GLM4.5 dpo_lora test"
   export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo
-  config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml
+  config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml
   config_json=$CACHE_DIR/glm45/GLM-4.5-Air/config.json
   yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl"
       | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl"
       | .model_name_or_path = strenv(CACHE_DIR) + "/zai-org/GLM-4.5-Air-Base"
       | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_lora_vdl_log"
       | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_lora_ckpts"' \
-    $config_dpo_yaml > ${config_dpo_yaml}.tmp
-  mv ${config_dpo_yaml}.tmp $config_dpo_yaml
-else
-  echo "Unknown step: ${step}, please choose from [pt, sft, lora, dpo, grouped_gemm, flash_attention]"
-  exit 1
+    $config_yaml > ${config_yaml}.tmp
+  mv ${config_yaml}.tmp $config_yaml
 fi
 
 rm -rf vdl_log/

From 3fe9869e347ca6f8de1f4dcb0d51c2d16f1bae42 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Thu, 29 Jan 2026 11:51:21 +0800
Subject: [PATCH 18/21] fix

---
 tests/integration_test/qwen3_a100.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration_test/qwen3_a100.sh b/tests/integration_test/qwen3_a100.sh
index d57f8f7694b..8cb8173faf7 100644
--- a/tests/integration_test/qwen3_a100.sh
+++ b/tests/integration_test/qwen3_a100.sh
@@ -70,8 +70,8 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 
 unset http_proxy https_proxy
 
-log_file=qwen_$step.txt
-gt_loss_file=qwen_${step}_multi_card_gt_loss.txt
+log_file=qwen_${step}_a100.txt  # braces required: bare $step_a100 would expand the unset variable "step_a100"
+gt_loss_file=qwen_${step}_a100_multi_card_gt_loss.txt
 
 set +e
 NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port coverage run $(which paddleformers-cli) train $config_yaml 2>&1 | tee ./${log_file}

From 65139721d11c238e03da162a8d59c428036fc502 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Thu, 29 Jan 2026 13:02:28 +0800
Subject: [PATCH 19/21] fix

---
 tests/integration_test/glm45_a100.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index 74566234c61..ca2758d354b 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -118,6 +118,7 @@ unset http_proxy https_proxy
 
 export FLAGS_embedding_deterministic=1
 export FLAGS_cudnn_deterministic=1
+export FLAGS_use_stride_compute_kernel=False
 
 log_file=glm45_${step}_a100.txt
 gt_loss_file=glm45_${step}_multi_card_a100_gt_loss.txt

From 7ccc6d60f5dbbca2de16b86c88d6d890c6b017ff Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Thu, 29 Jan 2026 13:08:45 +0800
Subject: [PATCH 20/21] fix

---
 tests/integration_test/glm45_a100.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index ca2758d354b..21ee54f0c34 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -93,7 +93,7 @@ elif [[ ${step} == "grouped_gemm" ]]; then
       | .stage1_overlap = false
       | .moe_grouped_gemm = false
       | .logging_dir = strenv(data_dir) + "/vdl_log"
-      | .output_dir = strenv(data_dir) + "/checkpoints"' \
+      | .output_dir = strenv(data_dir) + "/checkpoints/grouped_gemm"' \
     $config_yaml > ${config_yaml}.tmp
   mv ${config_yaml}.tmp $config_yaml
 elif [[ ${step} == "dpo_lora" ]]; then

From 22e9408bd8779bfee09203f0d58e5dd4abb55c87 Mon Sep 17 00:00:00 2001
From: tianlef <1095012807@qq.com>
Date: Thu, 29 Jan 2026 15:08:54 +0800
Subject: [PATCH 21/21] fix

---
 tests/integration_test/glm45_a100.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh
index 21ee54f0c34..ff4d23ac14f 100644
--- a/tests/integration_test/glm45_a100.sh
+++ b/tests/integration_test/glm45_a100.sh
@@ -91,7 +91,6 @@ elif [[ ${step} == "grouped_gemm" ]]; then
       | .per_device_train_batch_size = 1
       | .num_hidden_layers = 2
       | .stage1_overlap = false
-      | .moe_grouped_gemm = false
       | .logging_dir = strenv(data_dir) + "/vdl_log"
       | .output_dir = strenv(data_dir) + "/checkpoints/grouped_gemm"' \
     $config_yaml > ${config_yaml}.tmp