diff --git a/.github/workflows/nightly-training.yaml b/.github/workflows/nightly-training.yaml
new file mode 100644
index 00000000..b06f6d9d
--- /dev/null
+++ b/.github/workflows/nightly-training.yaml
@@ -0,0 +1,101 @@
+# Nightly training runs: tests/nightly/nightly.py is launched once for every
+# experiment x simulator x single/multi-GPU combination in the matrix below,
+# either on the daily schedule or on demand via workflow_dispatch.
+name: Nightly Training
+on:
+  # push:
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: 'Branch to test'
+        required: true
+        default: 'main'
+  schedule:
+    - cron: '0 8 * * *'  # 8 AM UTC = midnight PST
+
+permissions:
+  contents: read
+
+jobs:
+  training:
+    runs-on: codebuild-holosoma-gpu-build-${{ github.run_id }}-${{ github.run_attempt }}
+    strategy:
+      # Let every combination run to completion. With the default
+      # (fail-fast: true) the first failing job cancels all other in-progress
+      # and queued matrix jobs, which hides their individual results.
+      fail-fast: false
+      matrix:
+        experiment:
+          - g1-29dof
+          - t1-29dof
+          - g1-29dof-fast-sac
+          - t1-29dof-fast-sac
+        # Kept as quoted strings: the value is pasted verbatim into the
+        # --training.multigpu=... argument and compared to 'True' in the job name.
+        multigpu: ['False', 'True']
+        simulator: [isaacgym, isaacsim]
+        extra_args: [""]
+        # TODO: fix AWS perms for pulling s3 assets
+        # include:
+        #   - experiment: g1-29dof-wbt
+        #     simulator: isaacsim
+        #     multigpu: 'False'
+        #     extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
+        #   - experiment: g1-29dof-wbt-fast-sac
+        #     simulator: isaacsim
+        #     multigpu: 'False'
+        #     extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384
+
+    name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }}
+    timeout-minutes: 720  # 12 hours
+    container:
+      image: 982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest
+      options: "--gpus all --runtime=nvidia --shm-size=12g"
+      volumes:
+        - "precommit-cache:/github/home/.cache/pre-commit"
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          # Scheduled runs carry no workflow_dispatch inputs, so fall back to main.
+          ref: ${{ github.event.inputs.branch || 'main' }}
+      - name: Run training
+        env:
+          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
+        shell: bash
+        timeout-minutes: 720  # 12 hours
+        run: |
+          # -f/-n replace an existing link instead of failing or nesting a new
+          # link inside its target, so the step can be re-run in a reused container.
+          ln -sfn /root/.holosoma_deps "$HOME/.holosoma_deps"
+          if [[ ! -L /workspace/holosoma ]]; then
+            rm -rf /workspace/holosoma
+            mkdir -p /workspace
+            ln -sfn "$GITHUB_WORKSPACE" /workspace/holosoma
+          fi
+          source scripts/source_${{ matrix.simulator }}_setup.sh
+          nvidia-smi
+          python -m wandb login
+          python tests/nightly/nightly.py exp:${{ matrix.experiment }} \
+            logger:wandb --logger.video.enabled=False \
+            simulator:${{ matrix.simulator }} \
+            --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }}
+
+  evaluate-results:
+    needs: [training]
+    runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }}
+    # Run even when training failed or was cancelled, so the workflow always
+    # ends with one aggregate status for the whole matrix.
+    if: always()
+    steps:
+      - name: Check results
+        # The script uses [[ ]], which needs bash rather than the sh fallback.
+        shell: bash
+        run: |
+          if [[ "${{ needs.training.result }}" == "success" ]]; then
+            echo "All nightly training tests passed!"
+            exit 0
+          else
+            echo "Some training tests failed."
+            exit 1
+          fi