# Source: amazon-far/holosoma, .github/workflows/nightly-training.yaml at commit 2cb7f6cecf9f8fd347f972f5f712f0434b17aced
# Workflow identity, triggers, and token permissions for the nightly
# training run.
name: Nightly Training
on:
  # Disabled trigger, left commented out:
  # push:
  schedule:
    # Every day at 08:00 UTC, i.e. midnight PST (01:00 while PDT is in effect).
    - cron: "0 8 * * *"
  # Manual runs take the branch to check out; it falls back to main.
  workflow_dispatch:
    inputs:
      branch:
        description: Branch to test
        default: main
        required: true

# The workflow token only gets read access to repository contents.
permissions:
  contents: read

jobs:
  # Runs tests/nightly/nightly.py once per experiment / multigpu / simulator
  # combination, inside the holosoma container on a CodeBuild GPU runner.
  training:
    runs-on: codebuild-holosoma-gpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    strategy:
      # Let every combination run to completion. With the default
      # (fail-fast: true) a single failing combination cancels all the
      # others, which hides their results for that night.
      fail-fast: false
      matrix:
        experiment:
          - g1-29dof
          - t1-29dof
          - g1-29dof-fast-sac
          - t1-29dof-fast-sac
        # Kept as strings: the value is pasted verbatim into
        # --training.multigpu=... and compared against 'True' in the job name.
        multigpu: ['False', 'True']
        simulator: [isaacgym, isaacsim]
        extra_args: [""]
        # TODO: fix AWS perms for pulling s3 assets
        # include:
        # - experiment: g1-29dof-wbt
        #   simulator: isaacsim
        #   multigpu: 'False'
        #   extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
        # - experiment: g1-29dof-wbt-fast-sac
        #   simulator: isaacsim
        #   multigpu: 'False'
        #   extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384

    name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }}
    timeout-minutes: 720 # 12 hours
    container:
      image: 982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest
      options: "--gpus all --runtime=nvidia --shm-size=12g"
      # NOTE(review): nothing in the step below invokes pre-commit directly;
      # confirm whether this cache volume is still needed by this job.
      volumes:
        - "precommit-cache:/github/home/.cache/pre-commit"
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          # Manual runs pass a branch input; scheduled runs have none and use main.
          ref: ${{ github.event.inputs.branch || 'main' }}
      - name: Run training
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
        shell: bash
        timeout-minutes: 720 # 12 hours
        run: |
          # Make /root/.holosoma_deps reachable as $HOME/.holosoma_deps. Skip
          # when that path already resolves (HOME=/root, or a reused home
          # volume): a bare `ln -s` would then fail or nest a link inside the
          # existing directory. -f also replaces a dangling link.
          if [[ ! -e "$HOME/.holosoma_deps" ]]; then
            ln -sfn /root/.holosoma_deps "$HOME/.holosoma_deps"
          fi
          # Point /workspace/holosoma at the checkout unless it is already a
          # symlink; anything else found at that path is removed first.
          if [[ ! -L /workspace/holosoma ]]; then
            rm -rf /workspace/holosoma
            ln -sfn "$GITHUB_WORKSPACE" /workspace/holosoma
          fi
          source scripts/source_${{ matrix.simulator }}_setup.sh
          # Record the GPU state in the job log before training starts.
          nvidia-smi
          # WANDB_API_KEY is supplied through this step's env block.
          python -m wandb login
          python tests/nightly/nightly.py exp:${{ matrix.experiment }} \
            logger:wandb --logger.video.enabled=False \
            simulator:${{ matrix.simulator }} \
            --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }}

  # Summary gate: turns the combined outcome of the `training` job into a
  # single pass/fail status.
  evaluate-results:
    needs: [training]
    runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    # always() keeps this job from being skipped when `training` did not succeed.
    if: always()
    steps:
      - name: Check results
        run: |
          # Anything other than "success" is reported as a failure.
          if [[ "${{ needs.training.result }}" != "success" ]]; then
            echo "Some training tests failed."
            exit 1
          fi
          echo "All nightly training tests passed!"