diff --git a/.github/workflows/nightly-training.yaml b/.github/workflows/nightly-training.yaml
index 62ae24c5..46bca38e 100644
--- a/.github/workflows/nightly-training.yaml
+++ b/.github/workflows/nightly-training.yaml
@@ -37,7 +37,7 @@ jobs:
             extra_args: --command.setup_terms.motion_command.params.motion_config.motion_file=s3://far-holosoma-assets/data/motions/g1_29dof/whole_body_tracking/motion_dance_v3.npz --algo.config.buffer_size=384
 
     name: Train ${{ matrix.experiment }} on ${{ matrix.multigpu == 'True' && 'multi-gpu' || 'single-gpu' }} using ${{ matrix.simulator }}
-    timeout-minutes: 720 # 12 hours
+    timeout-minutes: 1000 # ~16 hours
 
     steps:
       - name: Checkout code
@@ -48,13 +48,21 @@ jobs:
         env:
           WANDB_API_KEY: ${{ secrets.WANDB_PUB_API_KEY }}
         shell: bash
-        timeout-minutes: 750 # 12.5 hours
+        timeout-minutes: 1000 # ~16 hours
+
         run: |
+          # kill all old docker containers (array expansion keeps each ID a separate argument)
+          mapfile -t OLD_CONTAINERS < <(docker ps -a -q)
+          if (( ${#OLD_CONTAINERS[@]} > 0 )); then
+            docker stop "${OLD_CONTAINERS[@]}" || true
+            docker rm --force "${OLD_CONTAINERS[@]}" || true
+          fi
           eval "$(aws configure export-credentials --format env)"
-          docker run --gpus all --runtime=nvidia --shm-size=12g \
+          docker run --rm --gpus all --runtime=nvidia --shm-size=12g \
             -v "$GITHUB_WORKSPACE:/workspace/holosoma" \
             --env AWS_ACCESS_KEY_ID --env AWS_SECRET_ACCESS_KEY \
             --env AWS_SESSION_TOKEN --env WANDB_API_KEY \
+            --env GITHUB_RUN_ID \
             982423663241.dkr.ecr.us-west-2.amazonaws.com/holosoma:latest \
             bash -c """
             source scripts/source_${{ matrix.simulator }}_setup.sh
diff --git a/src/holosoma/holosoma/config_types/experiment.py b/src/holosoma/holosoma/config_types/experiment.py
index c395b4b0..ef05016e 100644
--- a/src/holosoma/holosoma/config_types/experiment.py
+++ b/src/holosoma/holosoma/config_types/experiment.py
@@ -78,6 +78,9 @@ class TrainingConfig:
     name: str = "run"
     """Run name for logging. `logger.name` takes precedence if set."""
 
+    tags: tuple[str, ...] = ()
+    """Optional tags to attach to the run for logging."""
+
     # Evaluation settings
     max_eval_steps: int | None = None
     """Maximum number of evaluation steps (None for unlimited)."""
diff --git a/tests/nightly/nightly.py b/tests/nightly/nightly.py
index 235d789d..a4ef20d2 100644
--- a/tests/nightly/nightly.py
+++ b/tests/nightly/nightly.py
@@ -100,7 +100,7 @@ def main():
             config.logger,
             project=f"nightly-{sanitized_exp}{multigpu_suffix}",
             name=f"nightly-{sanitized_exp}{multigpu_suffix}-{now_timestamp()}",
-            tags=run_tags,
+            tags=tuple(run_tags),
         ),
     )
 