Nightly Training #121

Workflow file for this run

.github/workflows/nightly-training.yaml at 0e6d021

	# Nightly training workflow. Runs once a day on a cron schedule and can also
	# be started manually against a chosen branch.
	name: Nightly Training
	on:
	  # push:
	  workflow_dispatch:
	    inputs:
	      # Branch to check out for a manually started run.
	      branch:
	        description: 'Branch to test'
	        required: true
	        default: 'main'
	  schedule:
	    - cron: '0 8 * * *'  # 8 AM UTC = midnight PST

	# Restrict the workflow token to read-only access to repository contents.
	permissions:
	  contents: read

	jobs:
	  # Runs one training job per matrix combination inside the holosoma GPU
	  # container image.
	  training:
	    runs-on: codebuild-holosoma-gpu-build-${{ github.run_id }}-${{ github.run_attempt }}
	    # NOTE(review): job-level continue-on-error may cause
	    # needs.training.result to read 'success' even when a matrix leg fails,
	    # which would make the final "Check results" step always pass. Confirm,
	    # and drop this key if failures should turn the run red (fail-fast: false
	    # already keeps the remaining legs running).
	    continue-on-error: true
	    strategy:
	      fail-fast: false  # continue on one experiment failing
	      matrix:
	        experiment:
	          - g1-29dof
	          - t1-29dof
	          - g1-29dof-fast-sac
	          - t1-29dof-fast-sac
	        # Quoted so the value reaches the command line as the literal text
	        # 'True' / 'False' rather than a YAML boolean.
	        multigpu: ['True', 'False']
	        simulator: [isaacgym, isaacsim]
	        extra_args: [""]
	        # Extra combinations added on top of the cross product above; each one
	        # supplies its own extra_args.
	        include:
	          - experiment: g1-29dof-wbt
	            simulator: isaacsim
	            multigpu: 'False'
	            extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
	          - experiment: g1-29dof-wbt-fast-sac
	            simulator: isaacsim
	            multigpu: 'False'
	            extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384

	    name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }}
	    timeout-minutes: 1000  # ~16 hours

	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v6
	        with:
	          # Falls back to 'main' when no branch input is present.
	          ref: ${{ github.event.inputs.branch || 'main' }}
	      - name: Run training
	        env:
	          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
	        shell: bash
	        timeout-minutes: 1000  # ~16 hours

	        run: |
	          # kill all old docker containers
	          OLD_CONTAINERS=$(docker ps -a -q)
	          if [[ -n "$OLD_CONTAINERS" ]]; then
	            # `docker ps -q` prints one ID per line. Split the list into an
	            # array so each container is passed as its own argument; quoting
	            # the whole list would hand docker a single multi-line name.
	            mapfile -t OLD_CONTAINER_IDS <<< "$OLD_CONTAINERS"
	            docker stop "${OLD_CONTAINER_IDS[@]}" || true
	            docker rm --force "${OLD_CONTAINER_IDS[@]}" || true
	          fi
	          # Evaluate the credential assignments printed by the AWS CLI so the
	          # value-less --env flags below can forward them into the container.
	          eval "$(aws configure export-credentials --format env)"
	          docker run --rm --gpus all --runtime=nvidia --shm-size=12g \
	            -v "$GITHUB_WORKSPACE:/workspace/holosoma" \
	            --env AWS_ACCESS_KEY_ID --env AWS_SECRET_ACCESS_KEY \
	            --env AWS_SESSION_TOKEN --env WANDB_API_KEY \
	            --env GITHUB_RUN_ID --env HOLOSOMA_SIM=${{ matrix.simulator }} \
	            982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest \
	            bash -c """
	            source scripts/source_${{ matrix.simulator }}_setup.sh
	            python -m wandb login
	            python tests/nightly/nightly.py exp:${{ matrix.experiment }} \
	              logger:wandb --logger.video.enabled=False \
	              simulator:${{ matrix.simulator }} \
	              --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }}
	            """


	  # Runs after the training matrix finishes: generates the wandb report,
	  # posts the Slack summary and checks the training result.
	  evaluate-results:
	    needs: [training]
	    runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }}
	    # Run regardless of how the training job ended.
	    if: always()
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v6
	        with:
	          ref: ${{ github.event.inputs.branch || 'main' }}
	      - name: Install uv
	        uses: astral-sh/setup-uv@v7
	      - name: Generate wandb report
	        env:
	          WANDB_ENTITY: "amazon-far"
	          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
	        run: uv run --script ./tests/nightly/generate_report.py --publish --github-run-id=${{ github.run_id }}
	      - name: Post to Slack
	        env:
	          WANDB_ENTITY: "amazon-far"
	          WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
	          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
	          SLACK_CHANNEL: '#far-holosoma-nightly-tests'
	        run: uv run --script ./tests/nightly/run_summary.py --slack
	      # Fails this job unless the training job's aggregated result is
	      # 'success'.
	      - name: Check results
	        run: |
	          if [[ "${{ needs.training.result }}" == "success" ]]; then
	            echo "All nightly training tests passed!"
	            exit 0
	          else
	            echo "Some training tests failed."
	            exit 1
	          fi

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Nightly Training #121

Workflow file

Nightly Training #121

Uh oh!

Workflow file for this run