From a7ff2e076c79acbfbb9893804f4bd87991035a01 Mon Sep 17 00:00:00 2001 From: Clay Rosenthal Date: Wed, 28 Jan 2026 12:44:20 -0800 Subject: [PATCH] Updating nightly gha to always post to slack --- .github/workflows/nightly-training.yaml | 72 ++++++++++++------------- tests/nightly/nightly.py | 6 +++ tests/nightly/run_summary.py | 7 ++- 3 files changed, 47 insertions(+), 38 deletions(-) diff --git a/.github/workflows/nightly-training.yaml b/.github/workflows/nightly-training.yaml index 0e46c9e7..62ae24c5 100644 --- a/.github/workflows/nightly-training.yaml +++ b/.github/workflows/nightly-training.yaml @@ -26,24 +26,19 @@ jobs: multigpu: ['False', 'True'] simulator: [isaacgym, isaacsim] extra_args: [""] - #TODO: fix AWS perms for pulling s3 assets - # include: - # - experiment: g1-29dof-wbt - # simulator: isaacsim - # multigpu: 'False' - # extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz - # - experiment: g1-29dof-wbt-fast-sac - # simulator: isaacsim - # multigpu: 'False' - # extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384 + include: + - experiment: g1-29dof-wbt + simulator: isaacsim + multigpu: 'False' + extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz + - experiment: g1-29dof-wbt-fast-sac + simulator: isaacsim + multigpu: 'False' + extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384 name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }} timeout-minutes: 720 # 
12 hours - container: - image: 982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest - options: "--gpus all --runtime=nvidia --shm-size=12g" - volumes: - - "precommit-cache:/github/home/.cache/pre-commit" + steps: - name: Checkout code uses: actions/checkout@v6 @@ -53,35 +48,29 @@ jobs: env: WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }} shell: bash - timeout-minutes: 720 # 12 hours + timeout-minutes: 750 # 12.5 hours run: | - ln -s /root/.holosoma_deps "$HOME/.holosoma_deps" - if [[ ! -L /workspace/holosoma ]]; then - rm -rf /workspace/holosoma - ln -sfF "$GITHUB_WORKSPACE" /workspace/holosoma - fi - source scripts/source_${{ matrix.simulator }}_setup.sh - nvidia-smi - python -m wandb login - python tests/nightly/nightly.py exp:${{ matrix.experiment }} \ - logger:wandb --logger.video.enabled=False \ - simulator:${{ matrix.simulator }} \ - --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }} + eval "$(aws configure export-credentials --format env)" + docker run --gpus all --runtime=nvidia --shm-size=12g \ + -v "$GITHUB_WORKSPACE:/workspace/holosoma" \ + --env AWS_ACCESS_KEY_ID --env AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN --env WANDB_API_KEY \ + 982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest \ + bash -c " + source scripts/source_${{ matrix.simulator }}_setup.sh + python -m wandb login + python tests/nightly/nightly.py exp:${{ matrix.experiment }} \ + logger:wandb --logger.video.enabled=False \ + simulator:${{ matrix.simulator }} \ + --training.multigpu=${{ matrix.multigpu }} ${{ matrix.extra_args }} + " + evaluate-results: needs: [training] runs-on: codebuild-holosoma-cpu-build-${{ github.run_id }}-${{ github.run_attempt }} if: always() steps: - - name: Check results - run: | - if [[ "${{ needs.training.result }}" == "success" ]]; then - echo "All nightly training tests passed!" - exit 0 - else - echo "Some training tests failed." 
- exit 1 - fi - name: Checkout code uses: actions/checkout@v6 with: @@ -95,3 +84,12 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} SLACK_CHANNEL: '#far-holosoma-nightly-tests' run: uv run --script ./tests/nightly/run_summary.py --slack + - name: Check results + run: | + if [[ "${{ needs.training.result }}" == "success" ]]; then + echo "All nightly training tests passed!" + exit 0 + else + echo "Some training tests failed." + exit 1 + fi diff --git a/tests/nightly/nightly.py b/tests/nightly/nightly.py index 84c31d7b..235d789d 100644 --- a/tests/nightly/nightly.py +++ b/tests/nightly/nightly.py @@ -89,12 +89,18 @@ def main(): config = config.get_nightly_config() + run_tags = [] + + if os.getenv("GITHUB_RUN_ID"): + run_tags.append(f"gha-run-id-{os.getenv('GITHUB_RUN_ID')}") + config = dataclasses.replace( config, logger=dataclasses.replace( config.logger, project=f"nightly-{sanitized_exp}{multigpu_suffix}", name=f"nightly-{sanitized_exp}{multigpu_suffix}-{now_timestamp()}", + tags=run_tags, ), ) diff --git a/tests/nightly/run_summary.py b/tests/nightly/run_summary.py index c9c1b44e..277dcd95 100755 --- a/tests/nightly/run_summary.py +++ b/tests/nightly/run_summary.py @@ -101,6 +101,7 @@ def get_last_nightly_urls() -> list[str]: api = wandb.Api(timeout=60) nightly_urls = [] + filter_tags = [] # Fetch all projects for the FAR entity all_projects = list(api.projects(WANDB_ENTITY)) @@ -110,6 +111,10 @@ def get_last_nightly_urls() -> list[str]: since_time = datetime.now(timezone.utc) - timedelta(hours=16) since_iso = since_time.isoformat() + # GHA run ids filter + if os.getenv("GITHUB_RUN_ID"): + filter_tags.append(f"gha-run-id-{os.getenv('GITHUB_RUN_ID')}") + # Use parallel processing to speed up API calls # Default to a reasonable number of workers based on CPU count max_workers = min(32, (os.cpu_count() or 1) + 4) @@ -117,7 +122,7 @@ def get_last_nightly_urls() -> list[str]: with ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all tasks 
future2project_name = { - executor.submit(_fetch_project_runs, api, p.name, since_iso): p.name for p in nightly_projects + executor.submit(_fetch_project_runs, api, p.name, since_iso, filter_tags): p.name for p in nightly_projects } # Process completed tasks as they finish