From a7ff2e076c79acbfbb9893804f4bd87991035a01 Mon Sep 17 00:00:00 2001 From: Clay Rosenthal Date: Wed, 28 Jan 2026 12:44:20 -0800 Subject: [PATCH] Updating nightly gha to always post to slack --- .github/workflows/nightly-training.yaml | 72 ++++++++++++------------- tests/nightly/nightly.py | 6 +++ tests/nightly/run_summary.py | 7 ++- 3 files changed, 47 insertions(+), 38 deletions(-) diff --git a/.github/workflows/nightly-training.yaml b/.github/workflows/nightly-training.yaml index 0e46c9e7..62ae24c5 100644 --- a/.github/workflows/nightly-training.yaml +++ b/.github/workflows/nightly-training.yaml @@ -26,24 +26,19 @@ jobs: multigpu: ['False', 'True'] simulator: [isaacgym, isaacsim] extra_args: [""] - #TODO: fix AWS perms for pulling s3 assets - # include: - # - experiment: g1-29dof-wbt - # simulator: isaacsim - # multigpu: 'False' - # extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz - # - experiment: g1-29dof-wbt-fast-sac - # simulator: isaacsim - # multigpu: 'False' - # extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384 + include: + - experiment: g1-29dof-wbt + simulator: isaacsim + multigpu: 'False' + extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz + - experiment: g1-29dof-wbt-fast-sac + simulator: isaacsim + multigpu: 'False' + extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384 name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }} timeout-minutes: 720 # 
12 hours - container: - image: 982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest - options: "--gpus all --runtime=nvidia --shm-size=12g" - volumes: - - "precommit-cache:/github/home/.cache/pre-commit" + steps: - name: Checkout code uses: actions/checkout@v6 @@ -53,35 +48,29 @@ jobs: env: WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }} shell: bash - timeout-minutes: 720 # 12 hours + timeout-minutes: 750 # 12.5 hours run: | - ln -s /root/.holosoma_deps "$HOME/.holosoma_deps" - if [[ ! -L /workspace/holosoma ]]; then - rm -rf /workspace/holosoma - ln -sfF "$GITHUB_WORKSPACE" /workspace/holosoma - fi - source scripts/source_${{ matrix.simulator }}_setup.sh - nvidia-smi - python -m wandb login - python tests/nightly/nightly.py exp:${{ matrix.experiment }} \ - logger:wandb --logger.video.enabled=False \ - simulator:${{ matrix.simulator }} \ - --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }} + eval "$(aws configure export-credentials --format env)" + docker run --gpus all --runtime=nvidia --shm-size=12g \ + -v "$GITHUB_WORKSPACE:/workspace/holosoma" \ + --env AWS_ACCESS_KEY_ID --env AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN --env WANDB_API_KEY \ + 982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest \ + bash -c " + source scripts/source_${{ matrix.simulator }}_setup.sh + python -m wandb login + python tests/nightly/nightly.py exp:${{ matrix.experiment }} \ + logger:wandb --logger.video.enabled=False \ + simulator:${{ matrix.simulator }} \ + --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }} + " + evaluate-results: needs: [training] runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }} if: always() steps: - - name: Check results - run: | - if [[ "${{ needs.training.result }}" == "success" ]]; then - echo "All nightly training tests passed!" - exit 0 - else - echo "Some training tests failed." 
- exit 1 - fi - name: Checkout code uses: actions/checkout@v6 with: @@ -95,3 +84,12 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} SLACK_CHANNEL: '#far-holosoma-nightly-tests' run: uv run --script ./tests/nightly/run_summary.py --slack + - name: Check results + run: | + if [[ "${{ needs.training.result }}" == "success" ]]; then + echo "All nightly training tests passed!" + exit 0 + else + echo "Some training tests failed." + exit 1 + fi diff --git a/tests/nightly/nightly.py b/tests/nightly/nightly.py index 84c31d7b..235d789d 100644 --- a/tests/nightly/nightly.py +++ b/tests/nightly/nightly.py @@ -89,12 +89,18 @@ def main(): config = config.get_nightly_config() + run_tags = [] + + if os.getenv("GITHUB_RUN_ID"): + run_tags.append(f"gha-run-id-{os.getenv('GITHUB_RUN_ID')}") + config = dataclasses.replace( config, logger=dataclasses.replace( config.logger, project=f"nightly-{sanitized_exp}{multigpu_suffix}", name=f"nightly-{sanitized_exp}{multigpu_suffix}-{now_timestamp()}", + tags=run_tags, ), ) diff --git a/tests/nightly/run_summary.py b/tests/nightly/run_summary.py index c9c1b44e..277dcd95 100755 --- a/tests/nightly/run_summary.py +++ b/tests/nightly/run_summary.py @@ -101,6 +101,7 @@ def get_last_nightly_urls() -> list[str]: api = wandb.Api(timeout=60) nightly_urls = [] + filter_tags = [] # Fetch all projects for the FAR entity all_projects = list(api.projects(WANDB_ENTITY)) @@ -110,6 +111,10 @@ def get_last_nightly_urls() -> list[str]: since_time = datetime.now(timezone.utc) - timedelta(hours=16) since_iso = since_time.isoformat() + # GHA run ids filter + if os.getenv("GITHUB_RUN_ID"): + filter_tags.append(f"gha-run-id-{os.getenv('GITHUB_RUN_ID')}") + # Use parallel processing to speed up API calls # Default to a reasonable number of workers based on CPU count max_workers = min(32, (os.cpu_count() or 1) + 4) @@ -117,7 +122,7 @@ def get_last_nightly_urls() -> list[str]: with ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all tasks 
future2project_name = { - executor.submit(_fetch_project_runs, api, p.name, since_iso): p.name for p in nightly_projects + executor.submit(_fetch_project_runs, api, p.name, since_iso, filter_tags): p.name for p in nightly_projects } # Process completed tasks as they finish