# Nightly Training
#
# Nightly Training #121

# Workflow header: display name, triggers, and token permissions.
name: Nightly Training

on:
  # push:
  # Manual trigger. The `branch` input is consumed by the checkout steps
  # (`ref: github.event.inputs.branch || 'main'`).
  workflow_dispatch:
    inputs:
      branch:
        description: 'Branch to test'
        required: true
        default: 'main'
  # Scheduled trigger.
  schedule:
    - cron: '0 8 * * *'  # 8 AM UTC = midnight PST

# The workflow token only gets read access to repository contents.
permissions:
  contents: read
jobs:
  # Runs one training job per matrix combination on a GPU runner.
  #
  # The base matrix is 4 experiments x 2 multigpu settings x 2 simulators;
  # the two `include` entries add whole-body-tracking (wbt) experiments that
  # run single-GPU on isaacsim only, each with its own extra CLI arguments.
  #
  # Job-level `continue-on-error` is deliberately not set: it would stop failed
  # legs from being visible through `needs.training.result`, which the final
  # "Check results" step relies on. `fail-fast: false` plus `if: always()` on
  # `evaluate-results` already keep the rest of the pipeline running after a
  # failure.
  training:
    runs-on: codebuild-holosoma-gpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    strategy:
      fail-fast: false  # continue on one experiment failing
      matrix:
        experiment:
          - g1-29dof
          - t1-29dof
          - g1-29dof-fast-sac
          - t1-29dof-fast-sac
        # Kept as strings: the value is compared against 'True' in the job
        # name and passed verbatim to --training.multigpu.
        multigpu: ['True', 'False']
        simulator: [isaacgym, isaacsim]
        extra_args: [""]
        include:
          - experiment: g1-29dof-wbt
            simulator: isaacsim
            multigpu: 'False'
            extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
          - experiment: g1-29dof-wbt-fast-sac
            simulator: isaacsim
            multigpu: 'False'
            extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384
    name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }}
    timeout-minutes: 1000  # ~16 hours
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          # Manual runs use the requested branch; scheduled runs have no
          # inputs and fall back to main.
          ref: ${{ github.event.inputs.branch || 'main' }}
      - name: Run training
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
        shell: bash
        timeout-minutes: 1000  # ~16 hours
        run: |
          # kill all old docker containers
          OLD_CONTAINERS=$(docker ps -a -q)
          if [[ -n "$OLD_CONTAINERS" ]]; then
            # Unquoted on purpose: `docker ps -q` prints one ID per line and
            # each ID must become its own argument. Quoting would pass a single
            # newline-joined argument whenever more than one container exists.
            # shellcheck disable=SC2086
            docker stop $OLD_CONTAINERS || true
            # shellcheck disable=SC2086
            docker rm --force $OLD_CONTAINERS || true
          fi
          # Capture first so a failing `aws` call aborts the step;
          # `eval "$(...)"` alone would discard its exit status.
          AWS_CREDENTIALS_ENV="$(aws configure export-credentials --format env)"
          eval "$AWS_CREDENTIALS_ENV"
          # The `--env NAME` flags (no value) forward the variables exported
          # above and by the runner into the container.
          docker run --rm --gpus all --runtime=nvidia --shm-size=12g \
            -v "$GITHUB_WORKSPACE:/workspace/holosoma" \
            --env AWS_ACCESS_KEY_ID --env AWS_SECRET_ACCESS_KEY \
            --env AWS_SESSION_TOKEN --env WANDB_API_KEY \
            --env GITHUB_RUN_ID --env HOLOSOMA_SIM=${{ matrix.simulator }} \
            982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest \
            bash -c "
              source scripts/source_${{ matrix.simulator }}_setup.sh
              python -m wandb login
              python tests/nightly/nightly.py exp:${{ matrix.experiment }} \
                logger:wandb --logger.video.enabled=False \
                simulator:${{ matrix.simulator }} \
                --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }}
            "

  # Runs after every training leg has finished, whether or not they passed
  # (`if: always()`): generates the wandb report, posts a summary to Slack,
  # then fails the workflow if training did not succeed.
  evaluate-results:
    needs: [training]
    runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }}
    if: always()
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
        with:
          ref: ${{ github.event.inputs.branch || 'main' }}
      - name: Install uv
        uses: astral-sh/setup-uv@v7
      - name: Generate wandb report
        env:
          WANDB_ENTITY: "amazon-far"
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
        run: uv run --script ./tests/nightly/generate_report.py --publish --github-run-id=${{ github.run_id }}
      - name: Post to Slack
        env:
          WANDB_ENTITY: "amazon-far"
          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
          SLACK_CHANNEL: '#far-holosoma-nightly-tests'
        run: uv run --script ./tests/nightly/run_summary.py --slack
      - name: Check results
        # `[[ ... ]]` is a bash construct, so the shell is named explicitly
        # (matching the training step).
        shell: bash
        run: |
          if [[ "${{ needs.training.result }}" == "success" ]]; then
            echo "All nightly training tests passed!"
            exit 0
          else
            echo "Some training tests failed."
            exit 1
          fi