# Nightly Training #100
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Nightly training workflow: runs on a daily schedule and on manual dispatch.
name: Nightly Training

on:
  # push:
  schedule:
    # 08:00 UTC every day, i.e. midnight PST (01:00 while PDT is in effect).
    - cron: '0 8 * * *'
  # Manual trigger; the caller chooses which branch gets checked out.
  workflow_dispatch:
    inputs:
      branch:
        description: 'Branch to test'
        required: true
        default: 'main'

# Limit the workflow token to read-only access to repository contents.
permissions:
  contents: read
jobs:
  # Runs tests/nightly/nightly.py once per matrix combination, inside the
  # holosoma container image, on a GPU runner.
  training:
    runs-on: codebuild-holosoma-gpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    strategy:
      # NOTE(review): fail-fast is left at its default (true), so the first
      # failing combination cancels the ones still in progress — confirm that
      # is intended for a nightly run whose results are summarized afterwards.
      matrix:
        experiment:
          - g1-29dof
          - t1-29dof
          - g1-29dof-fast-sac
          - t1-29dof-fast-sac
        multigpu: ['False', 'True']
        simulator: [isaacgym, isaacsim]
        extra_args: [""]
        # Extra combinations outside the cross product above: the "wbt"
        # experiments run on isaacsim, single-GPU only, and need an explicit
        # motion file passed on the command line.
        include:
          - experiment: g1-29dof-wbt
            simulator: isaacsim
            multigpu: 'False'
            extra_args: >-
              --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
          - experiment: g1-29dof-wbt-fast-sac
            simulator: isaacsim
            multigpu: 'False'
            extra_args: >-
              --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
              --algo.config.buffer_size=384
    name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }}
    timeout-minutes: 1000  # ~16 hours
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          # Manual runs use the requested branch; runs without that input
          # (e.g. the schedule) fall back to main.
          ref: ${{ github.event.inputs.branch || 'main' }}

      - name: Run training
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
        shell: bash
        timeout-minutes: 1000  # ~16 hours
        run: |
          # Stop and remove every container left on this host by earlier runs.
          OLD_CONTAINERS=$(docker ps -a -q)
          if [[ -n "$OLD_CONTAINERS" ]]; then
            # `docker ps -q` prints one ID per line. Split the IDs into an
            # array so each one is passed as its own argument; quoting the
            # whole string would hand docker a single newline-joined name,
            # and the `|| true` below would hide the resulting failure.
            mapfile -t old_container_ids <<< "$OLD_CONTAINERS"
            docker stop "${old_container_ids[@]}" || true
            docker rm --force "${old_container_ids[@]}" || true
          fi

          # Export the current AWS credentials as environment variables so
          # that `--env AWS_*` below can forward them into the container.
          eval "$(aws configure export-credentials --format env)"

          # The checkout is mounted at /workspace/holosoma. The matrix values
          # are substituted into the inner script by Actions before bash runs.
          docker run --rm --gpus all --runtime=nvidia --shm-size=12g \
            -v "$GITHUB_WORKSPACE:/workspace/holosoma" \
            --env AWS_ACCESS_KEY_ID --env AWS_SECRET_ACCESS_KEY \
            --env AWS_SESSION_TOKEN --env WANDB_API_KEY \
            --env GITHUB_RUN_ID \
            982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest \
            bash -c "
              source scripts/source_${{ matrix.simulator }}_setup.sh
              python -m wandb login
              python tests/nightly/nightly.py exp:${{ matrix.experiment }} \
                logger:wandb --logger.video.enabled=False \
                simulator:${{ matrix.simulator }} \
                --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }}
            "

  # Posts the run summary and turns the aggregate training result into this
  # job's pass/fail status.
  evaluate-results:
    needs: [training]
    runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    # Run even when training failed so the summary is still posted.
    if: always()
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || 'main' }}

      - name: Install uv
        uses: astral-sh/setup-uv@v7

      - name: Post to Slack
        env:
          WANDB_ENTITY: "amazon-far"
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
          SLACK_CHANNEL: '#far-holosoma-nightly-tests'
        run: uv run --script ./tests/nightly/run_summary.py --slack

      - name: Check results
        # needs.training.result is the combined result of the whole matrix;
        # anything other than "success" fails this job.
        run: |
          if [[ "${{ needs.training.result }}" == "success" ]]; then
            echo "All nightly training tests passed!"
            exit 0
          else
            echo "Some training tests failed."
            exit 1
          fi