Nightly Training #110
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Nightly training workflow: runs once a day on a schedule and can also be
# started manually against a chosen branch.
name: Nightly Training

on:
  # push:
  # Manual trigger. The required `branch` input defaults to main and is the
  # ref the jobs check out.
  workflow_dispatch:
    inputs:
      branch:
        description: "Branch to test"
        required: true
        default: "main"
  schedule:
    # 8 AM UTC = midnight PST
    - cron: "0 8 * * *"

# The workflow token only gets read access to repository contents.
permissions:
  contents: read
jobs:
  # Runs one training job per experiment / GPU mode / simulator combination
  # inside the holosoma container image.
  training:
    # Runner label is unique per workflow run and attempt.
    runs-on: codebuild-holosoma-gpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    # Deliberately no job-level `continue-on-error: true`. `fail-fast: false`
    # already lets the remaining matrix jobs keep running after one fails, and
    # `evaluate-results` uses `if: always()` so it runs regardless. With
    # `continue-on-error` a failed job is expected to be reported to dependants
    # as successful, which would make the `needs.training.result` gate below
    # pass unconditionally (NOTE(review): confirm against the Actions docs).
    strategy:
      fail-fast: false  # continue on one experiment failing
      matrix:
        experiment:
          - g1-29dof
          - t1-29dof
          - g1-29dof-fast-sac
          - t1-29dof-fast-sac
        # Strings, not booleans: the value is compared with 'True' in the job
        # name and passed verbatim to --training.multigpu.
        multigpu: ['True', 'False']
        simulator: [isaacgym, isaacsim]
        extra_args: [""]
        # Extra combinations outside the base matrix: the "wbt" experiments
        # run on isaacsim, single-GPU only, with a motion file passed through
        # extra_args.
        include:
          - experiment: g1-29dof-wbt
            simulator: isaacsim
            multigpu: 'False'
            extra_args: '--command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz'
          - experiment: g1-29dof-wbt-fast-sac
            simulator: isaacsim
            multigpu: 'False'
            # Folded scalar: the two lines are joined with a single space.
            extra_args: >-
              --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
              --algo.config.buffer_size=384
    name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }}
    timeout-minutes: 1000  # ~16 hours
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          # Scheduled runs have no inputs, so fall back to main.
          ref: ${{ github.event.inputs.branch || 'main' }}
      - name: Run training
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
        shell: bash
        timeout-minutes: 1000  # ~16 hours
        run: |
          # Kill all old docker containers.
          # `docker ps -q` prints one ID per line; load the IDs into an array
          # so each one reaches docker as its own argument. Passing the whole
          # list as a single quoted string fails as soon as more than one
          # container exists.
          OLD_CONTAINERS=$(docker ps -a -q)
          if [[ -n "$OLD_CONTAINERS" ]]; then
            mapfile -t OLD_CONTAINER_IDS <<< "$OLD_CONTAINERS"
            docker stop "${OLD_CONTAINER_IDS[@]}" || true
            docker rm --force "${OLD_CONTAINER_IDS[@]}" || true
          fi
          # Export the AWS credentials as environment variables so the
          # value-less `--env NAME` flags below forward them to the container.
          eval "$(aws configure export-credentials --format env)"
          docker run --rm --gpus all --runtime=nvidia --shm-size=12g \
            -v "$GITHUB_WORKSPACE:/workspace/holosoma" \
            --env AWS_ACCESS_KEY_ID --env AWS_SECRET_ACCESS_KEY \
            --env AWS_SESSION_TOKEN --env WANDB_API_KEY \
            --env GITHUB_RUN_ID --env HOLOSOMA_SIM=${{ matrix.simulator }} \
            982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest \
            bash -c "
              source scripts/source_${{ matrix.simulator }}_setup.sh
              python -m wandb login
              python tests/nightly/nightly.py exp:${{ matrix.experiment }} \
                logger:wandb --logger.video.enabled=False \
                simulator:${{ matrix.simulator }} \
                --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }}
            "

  # Publishes the wandb report, posts a Slack summary, and fails the run if
  # the training job did not succeed.
  evaluate-results:
    needs: [training]
    runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    # Run even when training failed or was cancelled, so a report is still
    # attempted.
    if: always()
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || 'main' }}
      - name: Install uv
        uses: astral-sh/setup-uv@v7
      - name: Generate wandb report
        env:
          WANDB_ENTITY: "amazon-far"
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
        # Folded scalar: the command is a single line at runtime.
        run: >-
          uv run --script ./tests/nightly/generate_report.py
          --publish --github-run-id=${{ github.run_id }}
      - name: Post to Slack
        env:
          WANDB_ENTITY: "amazon-far"
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
          # Quoted because a leading '#' would otherwise start a YAML comment.
          SLACK_CHANNEL: '#far-holosoma-nightly-tests'
        run: uv run --script ./tests/nightly/run_summary.py --slack
      - name: Check results
        env:
          # Aggregate result of the training job, handed to the script as an
          # environment variable.
          TRAINING_RESULT: ${{ needs.training.result }}
        run: |
          # Anything other than "success" fails the workflow run.
          if [[ "$TRAINING_RESULT" != "success" ]]; then
            echo "Some training tests failed."
            exit 1
          fi
          echo "All nightly training tests passed!"