From 3a02f157b636ccccfe114d2a7c1d749ec63048da Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Mon, 26 Jan 2026 16:19:51 +0800 Subject: [PATCH 01/21] add a100 --- .github/workflows/fleet-model-test.yml | 585 ++++++++++++++---------- tests/integration_test/glm45_pt_a100.sh | 2 +- 2 files changed, 343 insertions(+), 244 deletions(-) diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index 2b905a757cb..6c0267ab4b9 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -68,159 +68,357 @@ jobs: fi echo "is_md_only: $(cat $GITHUB_OUTPUT | grep is_md_only || echo '未找到')" - integration-test-H20-single-card: - needs: check_documents_type - if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }} - name: Integration test (H20, single card) - runs-on: - group: Fleet-H-single-card - env: - PIP_CACHE_DIR: /home/.cache/pip - CACHE_DIR: /home/.cache - TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-single-card - steps: - - name: Determine the runner - run: | - gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 )) - echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV + # integration-test-H20-single-card: + # needs: check_documents_type + # if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }} + # name: Integration test (H20, single card) + # runs-on: + # group: Fleet-H-single-card + # env: + # PIP_CACHE_DIR: /home/.cache/pip + # CACHE_DIR: /home/.cache + # TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-single-card + # steps: + # - name: Determine the runner + # run: | + # gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 )) + # echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV - - name: Check docker image and run container - env: - GPU_DEVICES: ${{ env.GPU_DEVICES }} - run: | - container_name=${TASK}-$(date +%Y%m%d-%H%M%S) - echo 
"container_name=${container_name}" >> ${{ github.env }} - docker pull $docker_image - set -x - docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \ - -v "/dev/shm:/dev/shm" \ - -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ - -v ${{ github.workspace }}/../../..:/root \ - -v /ssd1/paddle-1/action_cache:/home/.cache \ - -v ${{ github.workspace }}:/workspace \ - -e BRANCH \ - -e PR_ID \ - -e COMMIT_ID \ - -e PADDLE_ROOT \ - -e ci_scripts \ - -e CACHE_DIR \ - -e no_proxy \ - -e CI_name \ - -e PIP_CACHE_DIR \ - -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ - -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ - -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \ - -e GITHUB_REPO_NAME="${{ github.repository }}" \ - -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ - -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \ - -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ - -e GITHUB_RUN_ID="${{ github.run_id }}" \ - -e PR_USER="${{ github.event.pull_request.user.login }}" \ - -w /workspace --network host ${docker_image} + # - name: Check docker image and run container + # env: + # GPU_DEVICES: ${{ env.GPU_DEVICES }} + # run: | + # container_name=${TASK}-$(date +%Y%m%d-%H%M%S) + # echo "container_name=${container_name}" >> ${{ github.env }} + # docker pull $docker_image + # set -x + # docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \ + # -v "/dev/shm:/dev/shm" \ + # -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. 
\ + # -v ${{ github.workspace }}/../../..:/root \ + # -v /ssd1/paddle-1/action_cache:/home/.cache \ + # -v ${{ github.workspace }}:/workspace \ + # -e BRANCH \ + # -e PR_ID \ + # -e COMMIT_ID \ + # -e PADDLE_ROOT \ + # -e ci_scripts \ + # -e CACHE_DIR \ + # -e no_proxy \ + # -e CI_name \ + # -e PIP_CACHE_DIR \ + # -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ + # -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ + # -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \ + # -e GITHUB_REPO_NAME="${{ github.repository }}" \ + # -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ + # -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \ + # -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ + # -e GITHUB_RUN_ID="${{ github.run_id }}" \ + # -e PR_USER="${{ github.event.pull_request.user.login }}" \ + # -w /workspace --network host ${docker_image} - - name: Install PaddleFormers - id: formers_install - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - rm -rf * .[^.]* - echo $PR_USER - source /root/proxy - mkdir -p /home/.cache/pip - pip cache dir - pip install --upgrade pip - git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} - cd PaddleFormers - git status - git config --global --add safe.directory /workspace/PaddleFormers - git config user.name "PaddleCI" - git config user.email "paddle_ci@example.com" - git config pull.rebase false - git pull --no-edit origin pull/${PR_ID}/head - export UV_SKIP_WHEEL_FILENAME_CHECK=1 - sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py - sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py - pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ - wget 
https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl - pip uninstall paddlefleet -y - pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl - # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir - echo "paddle commit:" - python -c "import paddle; print(paddle.version.commit)" - echo "paddlefleet commit:" - python -c "import paddlefleet; print(paddlefleet.version.commit)" - cd /workspace - wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate - mkdir bos - tar xf bos_new.tar.gz -C bos - pip install bce-python-sdk==0.8.74 - pip install coverage==7.6.1 - pip install librosa==0.11.0 - ' + # - name: Install PaddleFormers + # id: formers_install + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # rm -rf * .[^.]* + # echo $PR_USER + # source /root/proxy + # mkdir -p /home/.cache/pip + # pip cache dir + # pip install --upgrade pip + # git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} + # cd PaddleFormers + # git status + # git config --global --add safe.directory /workspace/PaddleFormers + # git config user.name "PaddleCI" + # git config user.email "paddle_ci@example.com" + # git config pull.rebase false + # git pull --no-edit origin pull/${PR_ID}/head + # export UV_SKIP_WHEEL_FILENAME_CHECK=1 + # sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py + # sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py + # pip 
install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ + # wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl + # pip uninstall paddlefleet -y + # pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl + # # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir + # echo "paddle commit:" + # python -c "import paddle; print(paddle.version.commit)" + # echo "paddlefleet commit:" + # python -c "import paddlefleet; print(paddlefleet.version.commit)" + # cd /workspace + # wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate + # mkdir bos + # tar xf bos_new.tar.gz -C bos + # pip install bce-python-sdk==0.8.74 + # pip install coverage==7.6.1 + # pip install librosa==0.11.0 + # ' - - name: Proprocess for integration test - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/preprocess.sh - preprocess_exit_code=$? - if [[ "$preprocess_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mPreprocess failed.\033[0m" - exit 1 - else - echo -e "\033[32mPreprocess succeeded.\033[0m" - fi - ' + # - name: Proprocess for integration test + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/preprocess.sh + # preprocess_exit_code=$? 
+ # if [[ "$preprocess_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mPreprocess failed.\033[0m" + # exit 1 + # else + # echo -e "\033[32mPreprocess succeeded.\033[0m" + # fi + # ' - - name: Integration test (GLM4.5 single-card) - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh - glm45_single_card_exit_code=$? - if [[ "$glm45_single_card_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mIntegration test failed: GLM4.5 single-card.\033[0m" - exit 1 - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m" - fi - ' + # - name: Integration test (GLM4.5 single-card) + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh + # glm45_single_card_exit_code=$? + # if [[ "$glm45_single_card_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 single-card.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m" + # fi + # ' - - name: Integration test (Qwen3-30B-A3B single-card) - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh - qwen3_single_card_exit_code=$? 
- if [[ "$qwen3_single_card_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mIntegration test failed: Qwen3-30B-A3B single-card.\033[0m" - exit 1 - else - echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m" - fi - ' + # - name: Integration test (Qwen3-30B-A3B single-card) + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh + # qwen3_single_card_exit_code=$? + # if [[ "$qwen3_single_card_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: Qwen3-30B-A3B single-card.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m" + # fi + # ' - - name: Terminate and delete the container - if: ${{ always() }} - run: | - set +e - docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*' - docker rm -f ${{ env.container_name }} + # - name: Terminate and delete the container + # if: ${{ always() }} + # run: | + # set +e + # docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*' + # docker rm -f ${{ env.container_name }} + + + # integration-test-H20-multi-card: + # needs: check_documents_type + # if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }} + # name: Integration test (H20, multi-card) + # runs-on: + # group: Fleet-H-multi-card + # env: + # PIP_CACHE_DIR: /home/.cache/pip + # CACHE_DIR: /home/.cache + # TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-multi-card + # steps: + # - name: Check docker image and run container + # run: | + # container_name=${TASK}-$(date +%Y%m%d-%H%M%S) + # echo "container_name=${container_name}" >> ${{ github.env }} + # docker pull $docker_image + # docker run -d -t --name ${container_name} --gpus all --shm-size=32G \ + # -v "/dev/shm:/dev/shm" \ + 
# -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \ + # -v ${{ github.workspace }}/../../..:/root \ + # -v /ssd1/paddle-1/action_cache:/home/.cache \ + # -v ${{ github.workspace }}:/workspace \ + # -e BRANCH \ + # -e PR_ID \ + # -e COMMIT_ID \ + # -e PADDLE_ROOT \ + # -e ci_scripts \ + # -e CACHE_DIR \ + # -e no_proxy \ + # -e CI_name \ + # -e PIP_CACHE_DIR \ + # -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ + # -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ + # -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \ + # -e GITHUB_REPO_NAME="${{ github.repository }}" \ + # -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ + # -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \ + # -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ + # -e GITHUB_RUN_ID="${{ github.run_id }}" \ + # -e PR_USER="${{ github.event.pull_request.user.login }}" \ + # -w /workspace --network host ${docker_image} + + # - name: Install PaddleFormers + # id: formers_install + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # rm -rf * .[^.]* + # source /root/proxy + # mkdir -p /home/.cache/pip + # pip cache dir + # pip install --upgrade pip + # git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} + # cd PaddleFormers + # git status + # git config --global --add safe.directory /workspace/PaddleFormers + # git config user.name "PaddleCI" + # git config user.email "paddle_ci@example.com" + # git config pull.rebase false + # git pull --no-edit origin pull/${PR_ID}/head + # export UV_SKIP_WHEEL_FILENAME_CHECK=1 + # pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ + # wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl + # pip uninstall paddlefleet -y + # pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl + # # pip install 
https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir + # echo "paddle commit:" + # python -c "import paddle; print(paddle.version.commit)" + # echo "paddlefleet commit:" + # python -c "import paddlefleet; print(paddlefleet.version.commit)" + # cd /workspace + # wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate + # mkdir bos + # tar xf bos_new.tar.gz -C bos + # pip install bce-python-sdk==0.8.74 + # pip install coverage==7.6.1 + # pip install librosa==0.11.0 + # ' + + # - name: GLM4.5 pre-train + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt.sh + # glm45_exit_code=$? + # if [[ "$glm45_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m" + # fi + # ' + # - name: GLM4.5 sft + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_sft.sh + # glm45_exit_code=$? 
+ # if [[ "$glm45_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m" + # fi + # ' + + # - name: GLM4.5 lora + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_lora.sh + # glm45_exit_code=$? + # if [[ "$glm45_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m" + # fi + # ' + + # - name: GLM4.5 dpo + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh + # glm45_exit_code=$? + # if [[ "$glm45_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m" + # fi + # ' + + # - name: GLM4.5 pre-train (FP8) + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh + # glm45_exit_code=$? 
+ # if [[ "$glm45_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m" + # fi + # ' + + # - name: GLM4.5 pre-train (Grouped GEMM) + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh + # glm45_exit_code=$? + # if [[ "$glm45_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: GLM4.5 Grouped GEMM.\033[0m" + # fi + # ' + + # - name: Qwen pre-train + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt + # ' + + # - name: Qwen sft + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft + # ' + # - name: Qwen lora + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora + # ' - integration-test-H20-multi-card: + # - name: Terminate and delete the container + # if: ${{ always() }} + # run: | + # set +e + # docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*' + # docker rm -f ${{ env.container_name }} + + integration-test-a100: needs: check_documents_type if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }} 
- name: Integration test (H20, multi-card) + name: Integration test (A100) runs-on: - group: Fleet-H-multi-card + group: Distribute env: PIP_CACHE_DIR: /home/.cache/pip CACHE_DIR: /home/.cache - TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-multi-card + TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-A100 steps: - name: Check docker image and run container run: | @@ -262,7 +460,7 @@ jobs: mkdir -p /home/.cache/pip pip cache dir pip install --upgrade pip - git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} + git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} cd PaddleFormers git status git config --global --add safe.directory /workspace/PaddleFormers @@ -271,6 +469,8 @@ jobs: git config pull.rebase false git pull --no-edit origin pull/${PR_ID}/head export UV_SKIP_WHEEL_FILENAME_CHECK=1 + sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py + sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl pip uninstall paddlefleet -y @@ -288,13 +488,12 @@ jobs: pip install coverage==7.6.1 pip install librosa==0.11.0 ' - + - name: GLM4.5 pre-train - if: (success() || failure()) && steps.formers_install.conclusion == 'success' run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt.sh + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_a100.sh glm45_exit_code=$? 
if [[ "$glm45_exit_code" != "0" ]]; then echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" @@ -303,6 +502,7 @@ jobs: echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m" fi ' + - name: GLM4.5 sft if: (success() || failure()) && steps.formers_install.conclusion == 'success' run: | @@ -408,104 +608,3 @@ jobs: set +e docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*' docker rm -f ${{ env.container_name }} - - integration-test-a100: - needs: check_documents_type - if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }} - name: Integration test (A100) - runs-on: - group: Distribute - env: - PIP_CACHE_DIR: /home/.cache/pip - CACHE_DIR: /home/.cache - TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-A100 - steps: - - name: Check docker image and run container - run: | - container_name=${TASK}-$(date +%Y%m%d-%H%M%S) - echo "container_name=${container_name}" >> ${{ github.env }} - docker pull $docker_image - docker run -d -t --name ${container_name} --gpus all --shm-size=32G \ - -v "/dev/shm:/dev/shm" \ - -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. 
\ - -v ${{ github.workspace }}/../../..:/root \ - -v /ssd1/paddle-1/action_cache:/home/.cache \ - -v ${{ github.workspace }}:/workspace \ - -e BRANCH \ - -e PR_ID \ - -e COMMIT_ID \ - -e PADDLE_ROOT \ - -e ci_scripts \ - -e CACHE_DIR \ - -e no_proxy \ - -e CI_name \ - -e PIP_CACHE_DIR \ - -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ - -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ - -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \ - -e GITHUB_REPO_NAME="${{ github.repository }}" \ - -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ - -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \ - -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ - -e GITHUB_RUN_ID="${{ github.run_id }}" \ - -e PR_USER="${{ github.event.pull_request.user.login }}" \ - -w /workspace --network host ${docker_image} - - - name: Install PaddleFormers - id: formers_install - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - rm -rf * .[^.]* - source /root/proxy - mkdir -p /home/.cache/pip - pip cache dir - pip install --upgrade pip - git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} - cd PaddleFormers - git status - git config --global --add safe.directory /workspace/PaddleFormers - git config user.name "PaddleCI" - git config user.email "paddle_ci@example.com" - git config pull.rebase false - git pull --no-edit origin pull/${PR_ID}/head - export UV_SKIP_WHEEL_FILENAME_CHECK=1 - sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py - sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py - pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ - wget 
https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl - pip uninstall paddlefleet -y - pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl - # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir - echo "paddle commit:" - python -c "import paddle; print(paddle.version.commit)" - echo "paddlefleet commit:" - python -c "import paddlefleet; print(paddlefleet.version.commit)" - cd /workspace - wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate - mkdir bos - tar xf bos_new.tar.gz -C bos - pip install bce-python-sdk==0.8.74 - pip install coverage==7.6.1 - pip install librosa==0.11.0 - ' - - - name: GLM4.5 pre-train - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_a100.sh - glm45_exit_code=$? 
- if [[ "$glm45_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" - exit 1 - else - echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m" - fi - ' - - - name: Terminate and delete the container - if: ${{ always() }} - run: | - set +e - docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*' - docker rm -f ${{ env.container_name }} diff --git a/tests/integration_test/glm45_pt_a100.sh b/tests/integration_test/glm45_pt_a100.sh index 701504b837a..8e2e1a0dacc 100644 --- a/tests/integration_test/glm45_pt_a100.sh +++ b/tests/integration_test/glm45_pt_a100.sh @@ -36,7 +36,7 @@ yq eval '.expert_model_parallel_size = 1 | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" | .model_name_or_path = strenv(cur_dir) + "/GLM-4.5-Air" | .logging_dir = strenv(cur_dir) + "/vdl_log" - | .output_dir = strenv(cur_dir) + "/checkpoints"' \ + | .output_dir = strenv(cur_dir) + "/checkpoints/pretrain"' \ $config_yaml > ${config_yaml}.tmp mv ${config_yaml}.tmp $config_yaml From 2e5ee892db3463ac06f622e3028983927e9d33c8 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Mon, 26 Jan 2026 17:25:41 +0800 Subject: [PATCH 02/21] add a100 --- .github/workflows/fleet-model-test.yml | 80 +++++++++---------- .../{glm45_pt_a100.sh => glm45_a100.sh} | 68 ++++++++++++++-- 2 files changed, 101 insertions(+), 47 deletions(-) rename tests/integration_test/{glm45_pt_a100.sh => glm45_a100.sh} (56%) diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index 6c0267ab4b9..5618a266f3b 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -493,7 +493,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_a100.sh + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt glm45_exit_code=$? 
if [[ "$glm45_exit_code" != "0" ]]; then echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" @@ -508,7 +508,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_sft.sh + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft glm45_exit_code=$? if [[ "$glm45_exit_code" != "0" ]]; then echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m" @@ -523,7 +523,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_lora.sh + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora glm45_exit_code=$? if [[ "$glm45_exit_code" != "0" ]]; then echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m" @@ -538,7 +538,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo glm45_exit_code=$? if [[ "$glm45_exit_code" != "0" ]]; then echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m" @@ -548,27 +548,27 @@ jobs: fi ' - - name: GLM4.5 pre-train (FP8) - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh - glm45_exit_code=$? 
- if [[ "$glm45_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" - exit 1 - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m" - fi - ' + # - name: GLM4.5 pre-train (FP8) + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh fp8 + # glm45_exit_code=$? + # if [[ "$glm45_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m" + # fi + # ' - name: GLM4.5 pre-train (Grouped GEMM) if: (success() || failure()) && steps.formers_install.conclusion == 'success' run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh grouped_gemm glm45_exit_code=$? 
if [[ "$glm45_exit_code" != "0" ]]; then echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m" @@ -578,29 +578,29 @@ jobs: fi ' - - name: Qwen pre-train - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt - ' + # - name: Qwen pre-train + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt + # ' - - name: Qwen sft - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft - ' + # - name: Qwen sft + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft + # ' - - name: Qwen lora - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora - ' + # - name: Qwen lora + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora + # ' - name: Terminate and delete the container if: ${{ always() }} diff --git a/tests/integration_test/glm45_pt_a100.sh b/tests/integration_test/glm45_a100.sh similarity index 56% rename from 
tests/integration_test/glm45_pt_a100.sh rename to tests/integration_test/glm45_a100.sh index 8e2e1a0dacc..ae0bedbebd0 100644 --- a/tests/integration_test/glm45_pt_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -24,10 +24,12 @@ tar -xf glm45_fleet.12-18.tar # glm45_fleet cd $root_dir/glm45_fleet export cur_dir=$(pwd) -config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt.yaml -export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt - -yq eval '.expert_model_parallel_size = 1 +step=$1 +if [[ ${step} == "pt" ]]; then + echo "Run GLM4.5 pretrain test" + config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt + yq eval '.expert_model_parallel_size = 1 | .num_hidden_layers = 2 | .per_device_train_batch_size = 1 | .use_expert_parallel = false @@ -38,7 +40,59 @@ yq eval '.expert_model_parallel_size = 1 | .logging_dir = strenv(cur_dir) + "/vdl_log" | .output_dir = strenv(cur_dir) + "/checkpoints/pretrain"' \ $config_yaml > ${config_yaml}.tmp -mv ${config_yaml}.tmp $config_yaml + mv ${config_yaml}.tmp $config_yaml +elif [[ ${step} == "sft" ]]; then + echo "Run GLM4.5 sft test" + config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_sft.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft + yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(cur_dir) + "/checkpoints/pretrain" + | .logging_dir = strenv(cur_dir) + "/glm_full_pp_vdl_log" + | .num_empty_layers_add_in_head = 0 + | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts"' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +elif [[ ${step} == "lora" ]]; then + echo "Run GLM4.5 multi lora test" + config_lora_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml + + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft + + yq '.train_dataset_path = 
strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" + | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log" + | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"' \ + $config_lora_yaml > ${config_lora_yaml}.tmp + mv ${config_lora_yaml}.tmp $config_lora_yaml +elif [[ ${step} == "dpo" ]]; then + echo "Run GLM4.5 dpo test" + config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo + yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" + | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log" + | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \ + $config_dpo_yaml > ${config_dpo_yaml}.tmp + mv ${config_dpo_yaml}.tmp $config_dpo_yaml +elif [[ ${step} == "grouped_gemm" ]]; then + echo "Run GLM4.5 grouped_gemm test" + export config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt_grouped_gemm.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt + yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(CACHE_DIR) + "/glm45/GLM-4.5-Air" + | .per_device_train_batch_size = 1 + | .num_hidden_layers = 2 + | .use_expert_parallel = false + | .stage1_overlap = false + | .logging_dir = strenv(data_dir) + "/vdl_log" + | .output_dir = strenv(data_dir) + "/checkpoints"' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +fi rm -rf checkpoints/ rm -rf vdl_log/ @@ -50,8 +104,8 @@ unset http_proxy https_proxy export FLAGS_embedding_deterministic=1 export FLAGS_cudnn_deterministic=1 -log_file=glm45_pt_a100.txt -gt_loss_file=glm45_pt_multi_card_a100_gt_loss.txt 
+log_file=glm45_${step}_a100.txt +gt_loss_file=glm45_${step}_multi_card_a100_gt_loss.txt set +e FLAGS_use_stride_compute_kernel=False NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 coverage run $(which paddleformers-cli) train $config_yaml 2>&1 | tee ./${log_file} From 058e84beb6ddb04e3bfe02bbe6f664fa062e7706 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Mon, 26 Jan 2026 17:38:26 +0800 Subject: [PATCH 03/21] fix --- tests/integration_test/glm45_a100.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index ae0bedbebd0..d569e22f1bf 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -19,8 +19,11 @@ if [ -f 'PaddleFleet/.venv/bin/activate' ]; then source PaddleFleet/.venv/bin/activate fi -wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/glm45/glm45_fleet.12-18.tar --no-check-certificate -tar -xf glm45_fleet.12-18.tar # glm45_fleet +if [ ! 
-d "$root_dir/glm45_fleet" ]; then + wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/glm45/glm45_fleet.12-18.tar --no-check-certificate + ar -xf glm45_fleet.12-18.tar +fi + cd $root_dir/glm45_fleet export cur_dir=$(pwd) @@ -86,7 +89,6 @@ elif [[ ${step} == "grouped_gemm" ]]; then | .model_name_or_path = strenv(CACHE_DIR) + "/glm45/GLM-4.5-Air" | .per_device_train_batch_size = 1 | .num_hidden_layers = 2 - | .use_expert_parallel = false | .stage1_overlap = false | .logging_dir = strenv(data_dir) + "/vdl_log" | .output_dir = strenv(data_dir) + "/checkpoints"' \ From ddcdd5cb2af5ba02bad9576a29c9747375f0c4cd Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Mon, 26 Jan 2026 19:01:37 +0800 Subject: [PATCH 04/21] add a100 --- .github/workflows/fleet-model-test.yml | 42 ++++---- tests/integration_test/glm45_a100.sh | 2 +- tests/integration_test/qwen3_a100.sh | 133 +++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 22 deletions(-) create mode 100644 tests/integration_test/qwen3_a100.sh diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index 5618a266f3b..71cc9bb055f 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -578,29 +578,29 @@ jobs: fi ' - # - name: Qwen pre-train - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt - # ' + - name: Qwen pre-train + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt + ' - # - name: Qwen sft - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ 
env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft - # ' + - name: Qwen sft + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft + ' - # - name: Qwen lora - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora - # ' + - name: Qwen lora + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora + ' - name: Terminate and delete the container if: ${{ always() }} diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index d569e22f1bf..72796dbc20f 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -21,7 +21,7 @@ fi if [ ! -d "$root_dir/glm45_fleet" ]; then wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/glm45/glm45_fleet.12-18.tar --no-check-certificate - ar -xf glm45_fleet.12-18.tar + tar -xf glm45_fleet.12-18.tar fi cd $root_dir/glm45_fleet diff --git a/tests/integration_test/qwen3_a100.sh b/tests/integration_test/qwen3_a100.sh new file mode 100644 index 00000000000..36b9b069fe8 --- /dev/null +++ b/tests/integration_test/qwen3_a100.sh @@ -0,0 +1,133 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -exo pipefail +export root_dir=$(pwd) + +step=$1 + +if [[ ! -d $CACHE_DIR/Qwen3-30B-A3B ]]; then + pushd $CACHE_DIR + wget -q --tries=5 --no-proxy https://xly-devops.cdn.bcebos.com/PaddleFleet/Qwen/Qwen3-30B-A3B.tar.gz --no-check-certificate + tar xf Qwen3-30B-A3B.tar.gz + popd +fi + +if [ -f 'PaddleFleet/.venv/bin/activate' ]; then + source PaddleFleet/.venv/bin/activate +fi + +if [[ "$step" == "pt" ]]; then + export config_yaml=$root_dir/PaddleFormers/tests/config/ci/qwen3_multicard_pt.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/pt + export model_name_or_path=$CACHE_DIR/Qwen3-30B-A3B + export output_dir=$root_dir/checkpoints/qwen-pt + yq eval '.moe_grouped_gemm = false' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +elif [[ "$step" == "sft" ]]; then + export config_yaml=$root_dir/PaddleFormers/tests/config/ci/qwen3_multicard_sft.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft + export model_name_or_path=$root_dir/checkpoints/qwen-pt + export output_dir=$root_dir/checkpoints/qwen-sft + yq eval '.moe_grouped_gemm = false' \ + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml +else + export config_yaml=$root_dir/PaddleFormers/tests/config/ci/qwen3_multicard_lora.yaml + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft + export model_name_or_path=$root_dir/checkpoints/qwen-sft + export output_dir=$root_dir/checkpoints/qwen-lora +fi + +yq eval '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + 
"/eval.jsonl" + | .model_name_or_path = strenv(model_name_or_path) + | .output_dir = strenv(output_dir)' \ + $config_yaml > ${config_yaml}.tmp +mv ${config_yaml}.tmp $config_yaml + +rm -rf ./outputs +rm -rf paddleformers_dist_log +master=$(hostname -i) +port=36677 + +export FLAGS_embedding_deterministic=1 +export FLAGS_cudnn_deterministic=1 +export FLAGS_use_stride_compute_kernel=False +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +unset http_proxy https_proxy + +log_file=qwen_$step.txt +gt_loss_file=qwen_${step}_multi_card_gt_loss.txt + +set +e +NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port coverage run $(which paddleformers-cli) train $config_yaml 2>&1 | tee ./${log_file} + +exit_code=$? +if [ $exit_code -ne 0 ]; then + echo "qwen multi-cards training failed, try to check the log file" + python $root_dir/PaddleFormers/tests/check_log_for_exitcode.py ./${log_file} "***** train metrics *****" + check_exit_code=$? + if [ $check_exit_code -ne 0 ]; then + echo "Failed to find 'Training completed' in log file." + exit 1 + else + echo "Log check passed." + fi +else + echo "Test passed." +fi + +# export repo_name=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}') +export repo_name=PaddleFleet +# if [[ "${PP}" == "rel" ]]; then +# export pppatch="_PPrel" +# fi +# if [[ "${PF}" == rel* ]]; then +# export pfpatch="rel" +# fi +wget --no-proxy --no-check-certificate https://xly-devops.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}_latest/${gt_loss_file} +if [ $? -ne 0 ]; then + echo "To request precision checks for new models, please contact swgu98." + exit 1 +fi + +log_loss_file=${log_file%.*}_loss.${log_file##*.} +python $root_dir/PaddleFormers/tests/integration_test/check_loss.py \ + --compare_step 10 \ + --log_file ./${log_file} \ + --log_loss_file ./${log_loss_file} \ + --gt_file ./${gt_loss_file} + +if [ $? 
-ne 0 ]; then + pushd $root_dir/PaddleFormers + source /root/proxy + bash $root_dir/PaddleFormers/tests/integration_test/check_precision_approval.sh + if [ $? -ne 0 ]; then + echo -e "\033[31mThe precision has been changed and requires approvals.\033[0m" + exit 1 + fi + popd + rm ${gt_loss_file} && mv ${log_loss_file} ${gt_loss_file} + if [ ! -f precision_list.txt ]; then + wget --no-proxy --no-check-certificate https://paddle-github-action.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID}/precision_list.txt + if [ $? -ne 0 ]; then + wget --no-proxy --no-check-certificate https://xly-devops.cdn.bcebos.com/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}_latest/precision_list.txt + python $root_dir/bos/BosClient.py precision_list.txt paddle-github-action/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID} + fi + fi + python $root_dir/bos/BosClient.py ${gt_loss_file} paddle-github-action/PaddleFleet/precision/${repo_name}${pfpatch}${pppatch}/${PR_ID} +fi \ No newline at end of file From ae63fc26d2103b56b54f772739bd97442ea5428d Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Mon, 26 Jan 2026 19:22:56 +0800 Subject: [PATCH 05/21] fix --- .github/workflows/fleet-model-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index 71cc9bb055f..a3259ede997 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -487,6 +487,7 @@ jobs: pip install bce-python-sdk==0.8.74 pip install coverage==7.6.1 pip install librosa==0.11.0 + pip install torchcodec ' - name: GLM4.5 pre-train From 7b0f02d3ebfa0ee76eafd98f026815448bad2f42 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Tue, 27 Jan 2026 15:55:08 +0800 Subject: [PATCH 06/21] fix --- .github/workflows/fleet-model-test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/fleet-model-test.yml 
b/.github/workflows/fleet-model-test.yml index a3259ede997..71cc9bb055f 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -487,7 +487,6 @@ jobs: pip install bce-python-sdk==0.8.74 pip install coverage==7.6.1 pip install librosa==0.11.0 - pip install torchcodec ' - name: GLM4.5 pre-train From 15f4e8ea2ee22b15e0a6578bc8b7959a60d9a66e Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Tue, 27 Jan 2026 16:15:28 +0800 Subject: [PATCH 07/21] fix --- .github/workflows/fleet-model-test.yml | 6 +++--- tests/integration_test/glm45_a100.sh | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index 71cc9bb055f..dcbcf926bc9 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -583,7 +583,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt ' - name: Qwen sft @@ -591,7 +591,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft ' - name: Qwen lora @@ -599,7 +599,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora ' - name: Terminate and delete the container diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index 72796dbc20f..cb164bdb0ce 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -58,7 +58,7 @@ elif [[ ${step} == "sft" ]]; 
then mv ${config_yaml}.tmp $config_yaml elif [[ ${step} == "lora" ]]; then echo "Run GLM4.5 multi lora test" - config_lora_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml + config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft @@ -67,19 +67,19 @@ elif [[ ${step} == "lora" ]]; then | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log" | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"' \ - $config_lora_yaml > ${config_lora_yaml}.tmp - mv ${config_lora_yaml}.tmp $config_lora_yaml + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml elif [[ ${step} == "dpo" ]]; then echo "Run GLM4.5 dpo test" - config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml + config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log" | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \ - $config_dpo_yaml > ${config_dpo_yaml}.tmp - mv ${config_dpo_yaml}.tmp $config_dpo_yaml + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml elif [[ ${step} == "grouped_gemm" ]]; then echo "Run GLM4.5 grouped_gemm test" export config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_pt_grouped_gemm.yaml @@ -90,13 +90,13 @@ elif [[ ${step} == "grouped_gemm" ]]; then | .per_device_train_batch_size = 1 | .num_hidden_layers = 2 | .stage1_overlap = false + | .moe_grouped_gemm = false | .logging_dir = strenv(data_dir) + "/vdl_log" | .output_dir = strenv(data_dir) + "/checkpoints"' \ $config_yaml > ${config_yaml}.tmp mv 
${config_yaml}.tmp $config_yaml fi -rm -rf checkpoints/ rm -rf vdl_log/ master=$(hostname -i) port=36677 From 8f3b8afd799dbdb4435ebaea483cec7ad2a974f7 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Tue, 27 Jan 2026 17:00:03 +0800 Subject: [PATCH 08/21] fix deepep --- tests/integration_test/glm45_a100.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index cb164bdb0ce..4825aade05f 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -66,7 +66,8 @@ elif [[ ${step} == "lora" ]]; then | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log" - | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps"' \ + | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps" + | del(.moe_token_dispatcher_type)' \ $config_yaml > ${config_yaml}.tmp mv ${config_yaml}.tmp $config_yaml elif [[ ${step} == "dpo" ]]; then @@ -77,6 +78,7 @@ elif [[ ${step} == "dpo" ]]; then | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log" + | .moe_token_dispatcher_type: "deepep" | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \ $config_yaml > ${config_yaml}.tmp mv ${config_yaml}.tmp $config_yaml From 52b75b17f524aab10fdd9689140df6ca0f8ad830 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Tue, 27 Jan 2026 19:17:24 +0800 Subject: [PATCH 09/21] fix dpo --- tests/integration_test/glm45_a100.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index 4825aade05f..86600436954 100644 --- a/tests/integration_test/glm45_a100.sh +++ 
b/tests/integration_test/glm45_a100.sh @@ -59,9 +59,7 @@ elif [[ ${step} == "sft" ]]; then elif [[ ${step} == "lora" ]]; then echo "Run GLM4.5 multi lora test" config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_lora.yaml - export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/sft - yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" @@ -78,7 +76,7 @@ elif [[ ${step} == "dpo" ]]; then | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log" - | .moe_token_dispatcher_type: "deepep" + | .num_empty_layers_add_in_tail = 0 | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \ $config_yaml > ${config_yaml}.tmp mv ${config_yaml}.tmp $config_yaml From 3f6cdd4d2cb127b4d1f8b83ed48fd57133c5ea7e Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Wed, 28 Jan 2026 11:49:04 +0800 Subject: [PATCH 10/21] add --- .github/workflows/fleet-model-test.yml | 684 ++++++++++++------------- 1 file changed, 342 insertions(+), 342 deletions(-) diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index dcbcf926bc9..8cb4fdf2d93 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -68,346 +68,346 @@ jobs: fi echo "is_md_only: $(cat $GITHUB_OUTPUT | grep is_md_only || echo '未找到')" - # integration-test-H20-single-card: - # needs: check_documents_type - # if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }} - # name: Integration test (H20, single card) - # runs-on: - # group: Fleet-H-single-card - # env: - # PIP_CACHE_DIR: /home/.cache/pip - # CACHE_DIR: /home/.cache - # TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-single-card - # steps: - # - name: 
Determine the runner - # run: | - # gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 )) - # echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV + integration-test-H20-single-card: + needs: check_documents_type + if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }} + name: Integration test (H20, single card) + runs-on: + group: Fleet-H-single-card + env: + PIP_CACHE_DIR: /home/.cache/pip + CACHE_DIR: /home/.cache + TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-single-card + steps: + - name: Determine the runner + run: | + gpu_id=$(( $(echo $PWD | awk -F'/' '{print $3}' | awk -F'-' '{print $2}') + 3 )) + echo GPU_DEVICES="$gpu_id" >> $GITHUB_ENV - # - name: Check docker image and run container - # env: - # GPU_DEVICES: ${{ env.GPU_DEVICES }} - # run: | - # container_name=${TASK}-$(date +%Y%m%d-%H%M%S) - # echo "container_name=${container_name}" >> ${{ github.env }} - # docker pull $docker_image - # set -x - # docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \ - # -v "/dev/shm:/dev/shm" \ - # -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. 
\ - # -v ${{ github.workspace }}/../../..:/root \ - # -v /ssd1/paddle-1/action_cache:/home/.cache \ - # -v ${{ github.workspace }}:/workspace \ - # -e BRANCH \ - # -e PR_ID \ - # -e COMMIT_ID \ - # -e PADDLE_ROOT \ - # -e ci_scripts \ - # -e CACHE_DIR \ - # -e no_proxy \ - # -e CI_name \ - # -e PIP_CACHE_DIR \ - # -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ - # -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ - # -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \ - # -e GITHUB_REPO_NAME="${{ github.repository }}" \ - # -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ - # -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \ - # -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ - # -e GITHUB_RUN_ID="${{ github.run_id }}" \ - # -e PR_USER="${{ github.event.pull_request.user.login }}" \ - # -w /workspace --network host ${docker_image} + - name: Check docker image and run container + env: + GPU_DEVICES: ${{ env.GPU_DEVICES }} + run: | + container_name=${TASK}-$(date +%Y%m%d-%H%M%S) + echo "container_name=${container_name}" >> ${{ github.env }} + docker pull $docker_image + set -x + docker run -d -t --name ${container_name} --gpus "\"device=${GPU_DEVICES}\"" --shm-size=32G \ + -v "/dev/shm:/dev/shm" \ + -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. 
\ + -v ${{ github.workspace }}/../../..:/root \ + -v /ssd1/paddle-1/action_cache:/home/.cache \ + -v ${{ github.workspace }}:/workspace \ + -e BRANCH \ + -e PR_ID \ + -e COMMIT_ID \ + -e PADDLE_ROOT \ + -e ci_scripts \ + -e CACHE_DIR \ + -e no_proxy \ + -e CI_name \ + -e PIP_CACHE_DIR \ + -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ + -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ + -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \ + -e GITHUB_REPO_NAME="${{ github.repository }}" \ + -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ + -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \ + -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ + -e GITHUB_RUN_ID="${{ github.run_id }}" \ + -e PR_USER="${{ github.event.pull_request.user.login }}" \ + -w /workspace --network host ${docker_image} - # - name: Install PaddleFormers - # id: formers_install - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # rm -rf * .[^.]* - # echo $PR_USER - # source /root/proxy - # mkdir -p /home/.cache/pip - # pip cache dir - # pip install --upgrade pip - # git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} - # cd PaddleFormers - # git status - # git config --global --add safe.directory /workspace/PaddleFormers - # git config user.name "PaddleCI" - # git config user.email "paddle_ci@example.com" - # git config pull.rebase false - # git pull --no-edit origin pull/${PR_ID}/head - # export UV_SKIP_WHEEL_FILENAME_CHECK=1 - # sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py - # sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py - # pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ - # wget 
https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl - # pip uninstall paddlefleet -y - # pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl - # # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir - # echo "paddle commit:" - # python -c "import paddle; print(paddle.version.commit)" - # echo "paddlefleet commit:" - # python -c "import paddlefleet; print(paddlefleet.version.commit)" - # cd /workspace - # wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate - # mkdir bos - # tar xf bos_new.tar.gz -C bos - # pip install bce-python-sdk==0.8.74 - # pip install coverage==7.6.1 - # pip install librosa==0.11.0 - # ' + - name: Install PaddleFormers + id: formers_install + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + rm -rf * .[^.]* + echo $PR_USER + source /root/proxy + mkdir -p /home/.cache/pip + pip cache dir + pip install --upgrade pip + git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} + cd PaddleFormers + git status + git config --global --add safe.directory /workspace/PaddleFormers + git config user.name "PaddleCI" + git config user.email "paddle_ci@example.com" + git config pull.rebase false + git pull --no-edit origin pull/${PR_ID}/head + export UV_SKIP_WHEEL_FILENAME_CHECK=1 + sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/glm45_provider.py + sed -i "s/from gpt_provider import GPTModelProvider/from paddleformers.transformers.gpt_provider import GPTModelProvider/g" examples/experiments/paddlefleet/qwen_provider.py + pip install -e 
".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ + wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl + pip uninstall paddlefleet -y + pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl + # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir + echo "paddle commit:" + python -c "import paddle; print(paddle.version.commit)" + echo "paddlefleet commit:" + python -c "import paddlefleet; print(paddlefleet.version.commit)" + cd /workspace + wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate + mkdir bos + tar xf bos_new.tar.gz -C bos + pip install bce-python-sdk==0.8.74 + pip install coverage==7.6.1 + pip install librosa==0.11.0 + ' - # - name: Proprocess for integration test - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/preprocess.sh - # preprocess_exit_code=$? - # if [[ "$preprocess_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mPreprocess failed.\033[0m" - # exit 1 - # else - # echo -e "\033[32mPreprocess succeeded.\033[0m" - # fi - # ' + - name: Proprocess for integration test + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/preprocess.sh + preprocess_exit_code=$? 
+ if [[ "$preprocess_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mPreprocess failed.\033[0m" + exit 1 + else + echo -e "\033[32mPreprocess succeeded.\033[0m" + fi + ' - # - name: Integration test (GLM4.5 single-card) - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh - # glm45_single_card_exit_code=$? - # if [[ "$glm45_single_card_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 single-card.\033[0m" - # exit 1 - # else - # echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m" - # fi - # ' + - name: Integration test (GLM4.5 single-card) + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_single_card.sh + glm45_single_card_exit_code=$? + if [[ "$glm45_single_card_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 single-card.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 single-card.\033[0m" + fi + ' - # - name: Integration test (Qwen3-30B-A3B single-card) - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh - # qwen3_single_card_exit_code=$? 
- # if [[ "$qwen3_single_card_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mIntegration test failed: Qwen3-30B-A3B single-card.\033[0m" - # exit 1 - # else - # echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m" - # fi - # ' + - name: Integration test (Qwen3-30B-A3B single-card) + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_single_card.sh + qwen3_single_card_exit_code=$? + if [[ "$qwen3_single_card_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: Qwen3-30B-A3B single-card.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: Qwen3-30B-A3B single-card.\033[0m" + fi + ' - # - name: Terminate and delete the container - # if: ${{ always() }} - # run: | - # set +e - # docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*' - # docker rm -f ${{ env.container_name }} + - name: Terminate and delete the container + if: ${{ always() }} + run: | + set +e + docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*' + docker rm -f ${{ env.container_name }} - # integration-test-H20-multi-card: - # needs: check_documents_type - # if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }} - # name: Integration test (H20, multi-card) - # runs-on: - # group: Fleet-H-multi-card - # env: - # PIP_CACHE_DIR: /home/.cache/pip - # CACHE_DIR: /home/.cache - # TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-multi-card - # steps: - # - name: Check docker image and run container - # run: | - # container_name=${TASK}-$(date +%Y%m%d-%H%M%S) - # echo "container_name=${container_name}" >> ${{ github.env }} - # docker pull $docker_image - # docker run -d -t --name ${container_name} --gpus all --shm-size=32G \ - # -v "/dev/shm:/dev/shm" \ - # -v ${{ 
github.workspace }}/../../..:${{ github.workspace }}/../../.. \ - # -v ${{ github.workspace }}/../../..:/root \ - # -v /ssd1/paddle-1/action_cache:/home/.cache \ - # -v ${{ github.workspace }}:/workspace \ - # -e BRANCH \ - # -e PR_ID \ - # -e COMMIT_ID \ - # -e PADDLE_ROOT \ - # -e ci_scripts \ - # -e CACHE_DIR \ - # -e no_proxy \ - # -e CI_name \ - # -e PIP_CACHE_DIR \ - # -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ - # -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ - # -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \ - # -e GITHUB_REPO_NAME="${{ github.repository }}" \ - # -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ - # -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \ - # -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ - # -e GITHUB_RUN_ID="${{ github.run_id }}" \ - # -e PR_USER="${{ github.event.pull_request.user.login }}" \ - # -w /workspace --network host ${docker_image} + integration-test-H20-multi-card: + needs: check_documents_type + if: ${{ needs.check_documents_type.outputs.is_md_only == 'false' }} + name: Integration test (H20, multi-card) + runs-on: + group: Fleet-H-multi-card + env: + PIP_CACHE_DIR: /home/.cache/pip + CACHE_DIR: /home/.cache + TASK: formers-fleet-CI-${{ github.event.pull_request.number }}-integration-test-multi-card + steps: + - name: Check docker image and run container + run: | + container_name=${TASK}-$(date +%Y%m%d-%H%M%S) + echo "container_name=${container_name}" >> ${{ github.env }} + docker pull $docker_image + docker run -d -t --name ${container_name} --gpus all --shm-size=32G \ + -v "/dev/shm:/dev/shm" \ + -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. 
\ + -v ${{ github.workspace }}/../../..:/root \ + -v /ssd1/paddle-1/action_cache:/home/.cache \ + -v ${{ github.workspace }}:/workspace \ + -e BRANCH \ + -e PR_ID \ + -e COMMIT_ID \ + -e PADDLE_ROOT \ + -e ci_scripts \ + -e CACHE_DIR \ + -e no_proxy \ + -e CI_name \ + -e PIP_CACHE_DIR \ + -e GITHUB_SHA="${{ github.event.pull_request.head.sha }}" \ + -e GITHUB_HEAD_REF="${{ github.head_ref }}" \ + -e GITHUB_BASE_SHA="${{ github.event.pull_request.base.sha }}" \ + -e GITHUB_REPO_NAME="${{ github.repository }}" \ + -e GITHUB_EVENT_NAME="${{ github.event_name }}" \ + -e GITHUB_EVENT_PULL_REQUEST_NUMBER="${{ github.event.pull_request.number }}" \ + -e GITHUB_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ + -e GITHUB_RUN_ID="${{ github.run_id }}" \ + -e PR_USER="${{ github.event.pull_request.user.login }}" \ + -w /workspace --network host ${docker_image} - # - name: Install PaddleFormers - # id: formers_install - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # rm -rf * .[^.]* - # source /root/proxy - # mkdir -p /home/.cache/pip - # pip cache dir - # pip install --upgrade pip - # git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} - # cd PaddleFormers - # git status - # git config --global --add safe.directory /workspace/PaddleFormers - # git config user.name "PaddleCI" - # git config user.email "paddle_ci@example.com" - # git config pull.rebase false - # git pull --no-edit origin pull/${PR_ID}/head - # export UV_SKIP_WHEEL_FILENAME_CHECK=1 - # pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ - # wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl - # pip uninstall paddlefleet -y - # pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl - # # pip install 
https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir - # echo "paddle commit:" - # python -c "import paddle; print(paddle.version.commit)" - # echo "paddlefleet commit:" - # python -c "import paddlefleet; print(paddlefleet.version.commit)" - # cd /workspace - # wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate - # mkdir bos - # tar xf bos_new.tar.gz -C bos - # pip install bce-python-sdk==0.8.74 - # pip install coverage==7.6.1 - # pip install librosa==0.11.0 - # ' + - name: Install PaddleFormers + id: formers_install + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + rm -rf * .[^.]* + source /root/proxy + mkdir -p /home/.cache/pip + pip cache dir + pip install --upgrade pip + git clone https://github.com/PaddlePaddle/PaddleFormers.git -b ${BRANCH} + cd PaddleFormers + git status + git config --global --add safe.directory /workspace/PaddleFormers + git config user.name "PaddleCI" + git config user.email "paddle_ci@example.com" + git config pull.rebase false + git pull --no-edit origin pull/${PR_ID}/head + export UV_SKIP_WHEEL_FILENAME_CHECK=1 + pip install -e ".[paddlefleet]" --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu129/ + wget https://paddle-github-action.cdn.bcebos.com/PaddleFleet/release/0.1.0/latest/cu129/paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl + pip uninstall paddlefleet -y + pip install paddlefleet-0.0.0-cp310-cp310-linux_x86_64.whl + # pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-GpuAll-LinuxCentos-Gcc11-Cuda129-Cudnn99-Trt105-Py310-Compile/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/ --force-reinstall --no-cache-dir + 
echo "paddle commit:" + python -c "import paddle; print(paddle.version.commit)" + echo "paddlefleet commit:" + python -c "import paddlefleet; print(paddlefleet.version.commit)" + cd /workspace + wget -q --no-proxy -O bos_new.tar.gz https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz --no-check-certificate + mkdir bos + tar xf bos_new.tar.gz -C bos + pip install bce-python-sdk==0.8.74 + pip install coverage==7.6.1 + pip install librosa==0.11.0 + ' - # - name: GLM4.5 pre-train - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt.sh - # glm45_exit_code=$? - # if [[ "$glm45_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" - # exit 1 - # else - # echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m" - # fi - # ' - # - name: GLM4.5 sft - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_sft.sh - # glm45_exit_code=$? - # if [[ "$glm45_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m" - # exit 1 - # else - # echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m" - # fi - # ' + - name: GLM4.5 pre-train + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt.sh + glm45_exit_code=$? 
+ if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5.\033[0m" + fi + ' + - name: GLM4.5 sft + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_sft.sh + glm45_exit_code=$? + if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 sft.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 sft.\033[0m" + fi + ' - # - name: GLM4.5 lora - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_lora.sh - # glm45_exit_code=$? - # if [[ "$glm45_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m" - # exit 1 - # else - # echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m" - # fi - # ' + - name: GLM4.5 lora + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_lora.sh + glm45_exit_code=$? 
+ if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m" + fi + ' - # - name: GLM4.5 dpo - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh - # glm45_exit_code=$? - # if [[ "$glm45_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m" - # exit 1 - # else - # echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m" - # fi - # ' + - name: GLM4.5 dpo + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_dpo.sh + glm45_exit_code=$? + if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo.\033[0m" + fi + ' - # - name: GLM4.5 pre-train (FP8) - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh - # glm45_exit_code=$? 
- # if [[ "$glm45_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" - # exit 1 - # else - # echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m" - # fi - # ' + - name: GLM4.5 pre-train (FP8) + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_fp8.sh + glm45_exit_code=$? + if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 FP8.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m" + fi + ' - # - name: GLM4.5 pre-train (Grouped GEMM) - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh - # glm45_exit_code=$? + - name: GLM4.5 pre-train (Grouped GEMM) + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_pt_grouped_gemm.sh + glm45_exit_code=$?
+ if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 Grouped GEMM.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 Grouped GEMM.\033[0m" + fi + ' - # - name: Qwen pre-train - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt - # ' + - name: Qwen pre-train + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh pt + ' - # - name: Qwen sft - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft - # ' + - name: Qwen sft + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh sft + ' - # - name: Qwen lora - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora - # ' + - name: Qwen lora + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen.sh lora + ' - # - name: Terminate and delete the container - # if: ${{ always() }} - # run: | - # set +e - # docker exec -t ${{ env.container_name }} 
/bin/bash -c 'rm -rf * .[^.]*' - # docker rm -f ${{ env.container_name }} + - name: Terminate and delete the container + if: ${{ always() }} + run: | + set +e + docker exec -t ${{ env.container_name }} /bin/bash -c 'rm -rf * .[^.]*' + docker rm -f ${{ env.container_name }} integration-test-a100: needs: check_documents_type @@ -518,20 +518,20 @@ jobs: fi ' - - name: GLM4.5 lora - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora - glm45_exit_code=$? - if [[ "$glm45_exit_code" != "0" ]]; then - echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m" - exit 1 - else - echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m" - fi - ' + # - name: GLM4.5 lora + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora + # glm45_exit_code=$? 
+ # if [[ "$glm45_exit_code" != "0" ]]; then + # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m" + # exit 1 + # else + # echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m" + # fi + # ' - name: GLM4.5 dpo if: (success() || failure()) && steps.formers_install.conclusion == 'success' @@ -594,13 +594,13 @@ jobs: timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft ' - - name: Qwen lora - if: (success() || failure()) && steps.formers_install.conclusion == 'success' - run: | - docker exec -t ${{ env.container_name }} /bin/bash -ce ' - source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora - ' + # - name: Qwen lora + # if: (success() || failure()) && steps.formers_install.conclusion == 'success' + # run: | + # docker exec -t ${{ env.container_name }} /bin/bash -ce ' + # source /root/proxy + # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora + # ' - name: Terminate and delete the container if: ${{ always() }} From 7a3659b25da157aacf0d76dd98e15368acb1d82c Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Wed, 28 Jan 2026 15:56:41 +0800 Subject: [PATCH 11/21] fix --- .github/workflows/fleet-model-test.yml | 56 ++++++++++---------------- tests/integration_test/glm45_dpo.sh | 2 +- 2 files changed, 22 insertions(+), 36 deletions(-) diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index 8cb4fdf2d93..af719cf6366 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -518,20 +518,20 @@ jobs: fi ' - # - name: GLM4.5 lora - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora - # glm45_exit_code=$? 
- # if [[ "$glm45_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m" - # exit 1 - # else - # echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m" - # fi - # ' + - name: GLM4.5 lora + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora + glm45_exit_code=$? + if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 lora.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 lora.\033[0m" + fi + ' - name: GLM4.5 dpo if: (success() || failure()) && steps.formers_install.conclusion == 'success' @@ -548,20 +548,6 @@ jobs: fi ' - # - name: GLM4.5 pre-train (FP8) - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh fp8 - # glm45_exit_code=$? 
- # if [[ "$glm45_exit_code" != "0" ]]; then - # echo -e "::error:: \033[31mIntegration test failed: GLM4.5.\033[0m" - # exit 1 - # else - # echo -e "\033[32mIntegration test succeeded: GLM4.5 FP8.\033[0m" - # fi - # ' - name: GLM4.5 pre-train (Grouped GEMM) if: (success() || failure()) && steps.formers_install.conclusion == 'success' @@ -594,13 +580,13 @@ jobs: timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft ' - # - name: Qwen lora - # if: (success() || failure()) && steps.formers_install.conclusion == 'success' - # run: | - # docker exec -t ${{ env.container_name }} /bin/bash -ce ' - # source /root/proxy - # timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora - # ' + - name: Qwen lora + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora + ' - name: Terminate and delete the container if: ${{ always() }} diff --git a/tests/integration_test/glm45_dpo.sh b/tests/integration_test/glm45_dpo.sh index d636828bb2f..23d8bc742fa 100644 --- a/tests/integration_test/glm45_dpo.sh +++ b/tests/integration_test/glm45_dpo.sh @@ -27,7 +27,7 @@ config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" - | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" + | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps" | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log" | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \ $config_dpo_yaml > ${config_dpo_yaml}.tmp From 74982f72794bad15213a6131817f451e56c51ed1 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Wed, 28 Jan 2026 
16:11:40 +0800 Subject: [PATCH 12/21] fix lora --- tests/integration_test/glm45_a100.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index 86600436954..5b143fac611 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -65,6 +65,7 @@ elif [[ ${step} == "lora" ]]; then | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" | .logging_dir = strenv(cur_dir) + "/glm_full_single_lora_log" | .output_dir = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps" + | .num_empty_layers_add_in_tail = 0 | del(.moe_token_dispatcher_type)' \ $config_yaml > ${config_yaml}.tmp mv ${config_yaml}.tmp $config_yaml From 7d1ee49fb9714c23d561a8b80313ec18dfc11044 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Wed, 28 Jan 2026 16:28:23 +0800 Subject: [PATCH 13/21] fix --- .github/workflows/fleet-model-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index af719cf6366..1659f8169e5 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -569,7 +569,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh pt + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh pt ' - name: Qwen sft @@ -577,7 +577,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh sft + timeout 5m bash -x PaddleFormers/tests/integration_test/qwen3_a100.sh sft ' - name: Qwen lora @@ -585,7 +585,7 @@ jobs: run: | docker exec -t ${{ env.container_name }} /bin/bash -ce ' source /root/proxy - timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh lora + timeout 5m bash 
-x PaddleFormers/tests/integration_test/qwen3_a100.sh lora ' - name: Terminate and delete the container From b863be96b84d4bead7943054ebc09ad7cdae66e2 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Wed, 28 Jan 2026 16:32:11 +0800 Subject: [PATCH 14/21] fix --- tests/integration_test/glm45_a100.sh | 2 +- tests/integration_test/qwen3_a100.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index 5b143fac611..bbcbbc154ea 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -130,8 +130,8 @@ else echo "Test passed." fi -# export repo_name=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}') export repo_name=PaddleFleet +export REPO_NAME=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}') # if [[ "${PP}" == "rel" ]]; then # export pppatch="_PPrel" # fi diff --git a/tests/integration_test/qwen3_a100.sh b/tests/integration_test/qwen3_a100.sh index 36b9b069fe8..d57f8f7694b 100644 --- a/tests/integration_test/qwen3_a100.sh +++ b/tests/integration_test/qwen3_a100.sh @@ -91,8 +91,9 @@ else echo "Test passed." 
fi -# export repo_name=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}') + export repo_name=PaddleFleet +export REPO_NAME=$(echo $GITHUB_REPO_NAME | awk -F'/' '{print $2}') # if [[ "${PP}" == "rel" ]]; then # export pppatch="_PPrel" # fi From d7421979fff3c6f95a383833e64a607bd167b5dd Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Wed, 28 Jan 2026 19:42:37 +0800 Subject: [PATCH 15/21] fix --- tests/integration_test/glm45_dpo.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_test/glm45_dpo.sh b/tests/integration_test/glm45_dpo.sh index 23d8bc742fa..d636828bb2f 100644 --- a/tests/integration_test/glm45_dpo.sh +++ b/tests/integration_test/glm45_dpo.sh @@ -27,7 +27,7 @@ config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo.yaml export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" - | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_single_lora_ckps" + | .model_name_or_path = strenv(cur_dir) + "/checkpoints/glm_full_pp_ckpts" | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_vdl_log" | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_ckpts"' \ $config_dpo_yaml > ${config_dpo_yaml}.tmp From 0b77de812fbecf74c6b363fc3c93c6918ca94962 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Thu, 29 Jan 2026 10:40:39 +0800 Subject: [PATCH 16/21] fix --- .github/workflows/fleet-model-test.yml | 15 +++++++++++++++ tests/integration_test/glm45_a100.sh | 15 +++++++++++++++ tests/integration_test/glm45_dpo_lora.sh | 10 +++------- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/.github/workflows/fleet-model-test.yml b/.github/workflows/fleet-model-test.yml index 93e47ea2bc0..28cbe50e190 100644 --- a/.github/workflows/fleet-model-test.yml +++ b/.github/workflows/fleet-model-test.yml @@ -571,6 +571,21 @@ jobs: echo -e "\033[32mIntegration test 
succeeded: GLM4.5 dpo.\033[0m" fi ' + + - name: GLM4.5 dpo_lora + if: (success() || failure()) && steps.formers_install.conclusion == 'success' + run: | + docker exec -t ${{ env.container_name }} /bin/bash -ce ' + source /root/proxy + timeout 5m bash -x PaddleFormers/tests/integration_test/glm45_a100.sh dpo_lora + glm45_exit_code=$? + if [[ "$glm45_exit_code" != "0" ]]; then + echo -e "::error:: \033[31mIntegration test failed: GLM4.5 dpo lora.\033[0m" + exit 1 + else + echo -e "\033[32mIntegration test succeeded: GLM4.5 dpo lora.\033[0m" + fi + ' - name: GLM4.5 pre-train (Grouped GEMM) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index bbcbbc154ea..88e68581ab0 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -96,6 +96,21 @@ elif [[ ${step} == "grouped_gemm" ]]; then | .output_dir = strenv(data_dir) + "/checkpoints"' \ $config_yaml > ${config_yaml}.tmp mv ${config_yaml}.tmp $config_yaml +elif [[ ${step} == "dpo_lora" ]]; then + echo "Run GLM4.5 dpo_lora test" + export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo + config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml + config_json=$CACHE_DIR/glm45/GLM-4.5-Air/config.json + yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" + | .model_name_or_path = strenv(CACHE_DIR) + "/zai-org/GLM-4.5-Air-Base" + | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_lora_vdl_log" + | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_lora_ckpts"' \ + $config_dpo_yaml > ${config_dpo_yaml}.tmp + mv ${config_dpo_yaml}.tmp $config_dpo_yaml +else + echo "Unknown step: ${step}, please choose from [pt, sft, lora, dpo, grouped_gemm, flash_attention]" + exit 1 fi rm -rf vdl_log/ diff --git a/tests/integration_test/glm45_dpo_lora.sh b/tests/integration_test/glm45_dpo_lora.sh index 531e41ea55d..b6109d6ed16 100644 --- 
a/tests/integration_test/glm45_dpo_lora.sh +++ b/tests/integration_test/glm45_dpo_lora.sh @@ -22,16 +22,12 @@ fi cd $root_dir/glm45_fleet export cur_dir=$(pwd) -# prepare dpo data -wget https://paddle-qa.bj.bcebos.com/fleet/fleet_dpo.tar -tar -xf fleet_dpo.tar - +export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml - config_json=$CACHE_DIR/glm45/GLM-4.5-Air/config.json -yq '.train_dataset_path = strenv(cur_dir) + "/dpo_data/dpo_train.jsonl" - | .eval_dataset_path = strenv(cur_dir) + "/dpo_data/dpo_eval.jsonl" +yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" + | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" | .model_name_or_path = strenv(CACHE_DIR) + "/zai-org/GLM-4.5-Air-Base" | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_lora_vdl_log" | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_lora_ckpts"' \ From ad41af0ea1418163b72617040fe0bdbf699d4429 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Thu, 29 Jan 2026 10:51:30 +0800 Subject: [PATCH 17/21] fix --- tests/integration_test/glm45_a100.sh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index 88e68581ab0..74566234c61 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -99,18 +99,15 @@ elif [[ ${step} == "grouped_gemm" ]]; then elif [[ ${step} == "dpo_lora" ]]; then echo "Run GLM4.5 dpo_lora test" export data_dir=$root_dir/PaddleFormers/tests/fixtures/dummy/dpo - config_dpo_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml + config_yaml=$root_dir/PaddleFormers/tests/config/ci/glm45_dpo_lora.yaml config_json=$CACHE_DIR/glm45/GLM-4.5-Air/config.json yq '.train_dataset_path = strenv(data_dir) + "/train.jsonl" | .eval_dataset_path = strenv(data_dir) + "/eval.jsonl" | .model_name_or_path = strenv(CACHE_DIR) + 
"/zai-org/GLM-4.5-Air-Base" | .logging_dir = strenv(cur_dir) + "/glm_full_dpo_lora_vdl_log" | .output_dir = strenv(cur_dir) + "/checkpoints/glm_full_dpo_lora_ckpts"' \ - $config_dpo_yaml > ${config_dpo_yaml}.tmp - mv ${config_dpo_yaml}.tmp $config_dpo_yaml -else - echo "Unknown step: ${step}, please choose from [pt, sft, lora, dpo, grouped_gemm, flash_attention]" - exit 1 + $config_yaml > ${config_yaml}.tmp + mv ${config_yaml}.tmp $config_yaml fi rm -rf vdl_log/ From 3fe9869e347ca6f8de1f4dcb0d51c2d16f1bae42 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Thu, 29 Jan 2026 11:51:21 +0800 Subject: [PATCH 18/21] fix --- tests/integration_test/qwen3_a100.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration_test/qwen3_a100.sh b/tests/integration_test/qwen3_a100.sh index d57f8f7694b..8cb8173faf7 100644 --- a/tests/integration_test/qwen3_a100.sh +++ b/tests/integration_test/qwen3_a100.sh @@ -70,8 +70,8 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 unset http_proxy https_proxy -log_file=qwen_$step.txt -gt_loss_file=qwen_${step}_multi_card_gt_loss.txt +log_file=qwen_${step}_a100.txt +gt_loss_file=qwen_${step}_a100_multi_card_gt_loss.txt set +e NNODES=1 MASTER_ADDR=$master MASTER_PORT=$port coverage run $(which paddleformers-cli) train $config_yaml 2>&1 | tee ./${log_file} From 65139721d11c238e03da162a8d59c428036fc502 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Thu, 29 Jan 2026 13:02:28 +0800 Subject: [PATCH 19/21] fix --- tests/integration_test/glm45_a100.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index 74566234c61..ca2758d354b 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -118,6 +118,7 @@ unset http_proxy https_proxy export FLAGS_embedding_deterministic=1 export FLAGS_cudnn_deterministic=1 +export FLAGS_use_stride_compute_kernel=False log_file=glm45_${step}_a100.txt
gt_loss_file=glm45_${step}_multi_card_a100_gt_loss.txt From 7ccc6d60f5dbbca2de16b86c88d6d890c6b017ff Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Thu, 29 Jan 2026 13:08:45 +0800 Subject: [PATCH 20/21] fix --- tests/integration_test/glm45_a100.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index ca2758d354b..21ee54f0c34 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -93,7 +93,7 @@ elif [[ ${step} == "grouped_gemm" ]]; then | .stage1_overlap = false | .moe_grouped_gemm = false | .logging_dir = strenv(data_dir) + "/vdl_log" - | .output_dir = strenv(data_dir) + "/checkpoints"' \ + | .output_dir = strenv(data_dir) + "/checkpoints/grouped_gemm"' \ $config_yaml > ${config_yaml}.tmp mv ${config_yaml}.tmp $config_yaml elif [[ ${step} == "dpo_lora" ]]; then From 22e9408bd8779bfee09203f0d58e5dd4abb55c87 Mon Sep 17 00:00:00 2001 From: tianlef <1095012807@qq.com> Date: Thu, 29 Jan 2026 15:08:54 +0800 Subject: [PATCH 21/21] fix --- tests/integration_test/glm45_a100.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration_test/glm45_a100.sh b/tests/integration_test/glm45_a100.sh index 21ee54f0c34..ff4d23ac14f 100644 --- a/tests/integration_test/glm45_a100.sh +++ b/tests/integration_test/glm45_a100.sh @@ -91,7 +91,6 @@ elif [[ ${step} == "grouped_gemm" ]]; then | .per_device_train_batch_size = 1 | .num_hidden_layers = 2 | .stage1_overlap = false - | .moe_grouped_gemm = false | .logging_dir = strenv(data_dir) + "/vdl_log" | .output_dir = strenv(data_dir) + "/checkpoints/grouped_gemm"' \ $config_yaml > ${config_yaml}.tmp