Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 35 additions & 37 deletions .github/workflows/nightly-training.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,19 @@ jobs:
multigpu: ['False', 'True']
simulator: [isaacgym, isaacsim]
extra_args: [""]
#TODO: fix AWS perms for pulling s3 assets
# include:
# - experiment: g1-29dof-wbt
# simulator: isaacsim
# multigpu: 'False'
# extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
# - experiment: g1-29dof-wbt-fast-sac
# simulator: isaacsim
# multigpu: 'False'
# extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384
include:
- experiment: g1-29dof-wbt
simulator: isaacsim
multigpu: 'False'
extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz
- experiment: g1-29dof-wbt-fast-sac
simulator: isaacsim
multigpu: 'False'
extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384

name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }}
timeout-minutes: 720 # 12 hours
container:
image: 982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest
options: "--gpus all --runtime=nvidia --shm-size=12g"
volumes:
- "precommit-cache:/github/home/.cache/pre-commit"

steps:
- name: Checkout code
uses: actions/checkout@v6
Expand All @@ -53,35 +48,29 @@ jobs:
env:
WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
shell: bash
timeout-minutes: 720 # 12 hours
timeout-minutes: 750 # 12.5 hours
run: |
ln -s /root/.holosoma_deps "$HOME/.holosoma_deps"
if [[ ! -L /workspace/holosoma ]]; then
rm -rf /workspace/holosoma
ln -sfF "$GITHUB_WORKSPACE" /workspace/holosoma
fi
source scripts/source_${{ matrix.simulator }}_setup.sh
nvidia-smi
python -m wandb login
python tests/nightly/nightly.py exp:${{ matrix.experiment }} \
logger:wandb --logger.video.enabled=False \
simulator:${{ matrix.simulator }} \
--training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }}
eval "$(aws configure export-credentials --format env)"
docker run --gpus all --runtime=nvidia --shm-size=12g \
-v "$GITHUB_WORKSPACE:/workspace/holosoma" \
--env AWS_ACCESS_KEY_ID --env AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN --env WANDB_API_KEY \
982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest \
bash -c """
source scripts/source_${{ matrix.simulator }}_setup.sh
python -m wandb login
python tests/nightly/nightly.py exp:${{ matrix.experiment }} \
logger:wandb --logger.video.enabled=False \
simulator:${{ matrix.simulator }} \
--training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }}
"""


evaluate-results:
needs: [training]
runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }}
if: always()
steps:
- name: Check results
run: |
if [[ "${{ needs.training.result }}" == "success" ]]; then
echo "All nightly training tests passed!"
exit 0
else
echo "Some training tests failed."
exit 1
fi
- name: Checkout code
uses: actions/checkout@v6
with:
Expand All @@ -95,3 +84,12 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
SLACK_CHANNEL: '#far-holosoma-nightly-tests'
run: uv run --script ./tests/nightly/run_summary.py --slack
- name: Check results
run: |
if [[ "${{ needs.training.result }}" == "success" ]]; then
echo "All nightly training tests passed!"
exit 0
else
echo "Some training tests failed."
exit 1
fi
6 changes: 6 additions & 0 deletions tests/nightly/nightly.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,18 @@ def main():

config = config.get_nightly_config()

run_tags = []

if os.getenv("GITHUB_RUN_ID"):
run_tags.append(f"gha-run-id-{os.getenv('GITHUB_RUN_ID')}")

config = dataclasses.replace(
config,
logger=dataclasses.replace(
config.logger,
project=f"nightly-{sanitized_exp}{multigpu_suffix}",
name=f"nightly-{sanitized_exp}{multigpu_suffix}-{now_timestamp()}",
tags=run_tags,
),
)

Expand Down
7 changes: 6 additions & 1 deletion tests/nightly/run_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def get_last_nightly_urls() -> list[str]:
api = wandb.Api(timeout=60)

nightly_urls = []
filter_tags = []

# Fetch all projects for the FAR entity
all_projects = list(api.projects(WANDB_ENTITY))
Expand All @@ -110,14 +111,18 @@ def get_last_nightly_urls() -> list[str]:
since_time = datetime.now(timezone.utc) - timedelta(hours=16)
since_iso = since_time.isoformat()

# When running under GitHub Actions, filter by this workflow run's ID tag
if os.getenv("GITHUB_RUN_ID"):
filter_tags.append(f"gha-run-id-{os.getenv('GITHUB_RUN_ID')}")

# Use parallel processing to speed up API calls
# Default to a reasonable number of workers based on CPU count
max_workers = min(32, (os.cpu_count() or 1) + 4)

with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all tasks
future2project_name = {
executor.submit(_fetch_project_runs, api, p.name, since_iso): p.name for p in nightly_projects
executor.submit(_fetch_project_runs, api, p.name, since_iso, filter_tags): p.name for p in nightly_projects
}

# Process completed tasks as they finish
Expand Down
Loading