generated from amazon-archives/__template_Apache-2.0
-
Notifications
You must be signed in to change notification settings - Fork 144
84 lines (80 loc) · 2.96 KB
/
nightly-training.yaml
File metadata and controls
84 lines (80 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Runs the training matrix every night and on demand.
name: Nightly Training

on:
  # push:
  # Scheduled trigger: every day at 08:00 UTC (midnight PST).
  schedule:
    - cron: "0 8 * * *"
  # Manual trigger: the caller picks the branch that the jobs check out.
  workflow_dispatch:
    inputs:
      branch:
        description: Branch to test
        required: true
        default: main

# The workflow token only needs to read repository contents.
permissions:
  contents: read
jobs:
  # GPU training sweep: one job per (experiment, multigpu, simulator)
  # combination from the matrix below.
  training:
    name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }}
    # Runner label is unique per workflow run and per attempt.
    runs-on: codebuild-holosoma-gpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    timeout-minutes: 720  # 12 hours
    strategy:
      # Let every combination run to completion: with the default
      # (fail-fast: true) a single failing job cancels all the others, which
      # hides which configurations are actually broken in the nightly report.
      fail-fast: false
      matrix:
        experiment:
          - g1-29dof
          - t1-29dof
          - g1-29dof-fast-sac
          - t1-29dof-fast-sac
        # Kept as strings: the value is pasted verbatim into
        # --training.multigpu=... and compared against 'True' in the job name.
        multigpu: ['False', 'True']
        simulator: [isaacgym, isaacsim]
        # Extra CLI arguments appended to the training command (none by default).
        extra_args: ['']
        # TODO: fix AWS perms for pulling s3 assets
        # include:
        #   - experiment: g1-29dof-wbt
        #     simulator: isaacsim
        #     multigpu: 'False'
        #     extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
        #   - experiment: g1-29dof-wbt-fast-sac
        #     simulator: isaacsim
        #     multigpu: 'False'
        #     extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384
    container:
      image: 982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest
      # All GPUs through the NVIDIA runtime, with 12 GiB of shared memory.
      options: '--gpus all --runtime=nvidia --shm-size=12g'
      volumes:
        - 'precommit-cache:/github/home/.cache/pre-commit'
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          # Scheduled runs carry no inputs, so fall back to main.
          ref: ${{ github.event.inputs.branch || 'main' }}
      - name: Run training
        shell: bash
        timeout-minutes: 720  # 12 hours
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
        run: |
          # Make /root/.holosoma_deps reachable from $HOME. Skip when something
          # already sits at that path (re-used home directory, or HOME=/root):
          # a plain `ln -s` would then either abort the step or create a link
          # nested inside the existing directory.
          if [[ ! -e "$HOME/.holosoma_deps" && ! -L "$HOME/.holosoma_deps" ]]; then
            ln -s /root/.holosoma_deps "$HOME/.holosoma_deps"
          fi
          # Point /workspace/holosoma at the checked-out sources unless it is
          # already a symlink. -n replaces the link itself instead of
          # descending into it (GNU ln's -F is unrelated to symlinks).
          if [[ ! -L /workspace/holosoma ]]; then
            rm -rf /workspace/holosoma
            mkdir -p /workspace
            ln -sfn "$GITHUB_WORKSPACE" /workspace/holosoma
          fi
          source scripts/source_${{ matrix.simulator }}_setup.sh
          nvidia-smi
          python -m wandb login
          python tests/nightly/nightly.py exp:${{ matrix.experiment }} \
            logger:wandb --logger.video.enabled=False \
            simulator:${{ matrix.simulator }} \
            --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }}

  # Gate job: collapses the whole training matrix into one pass/fail status.
  evaluate-results:
    needs: [training]
    runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    # Run even when training failed or was cancelled so a status is always reported.
    if: always()
    steps:
      - name: Check results
        # [[ ... ]] is a bashism, so request bash explicitly.
        shell: bash
        env:
          # Pass the aggregated result through the environment rather than
          # splicing the expression into the script text.
          TRAINING_RESULT: ${{ needs.training.result }}
        run: |
          # The result may also be "cancelled" or "skipped", not only "failure".
          echo "Training matrix result: ${TRAINING_RESULT}"
          if [[ "${TRAINING_RESULT}" == "success" ]]; then
            echo "All nightly training tests passed!"
            exit 0
          else
            echo "Some training tests failed."
            exit 1
          fi