Rebase main onto upstream v0.6.4 #4

Workflow file for this run

.github/workflows/pr-test.yml at ae3511d

	# CI workflow using AWS self-hosted runners.
	# Runs AOT build tests and GPU unit tests on push/PR to main.
	# Uses ci/bash.sh for Docker execution (same as Jenkins).
	#
	# Permission Control:
	# - Push to main: Always runs
	# - PR from org members (ci-users team): Runs automatically
	# - PR from external contributors: Requires 'run-ci' label
	# (added via @flashinfer-bot run command from authorized user)
	#
	# Rerun Strategy:
	# - Spot jobs run with fail-fast: true
	# - Background monitor checks AWS metadata for spot termination notice
	# - If termination detected, writes marker to log (captured by GitHub)
	# - Analyze job checks the logs for that marker to decide whether to rerun
	# - Spot termination: rerun all failed/cancelled jobs on on-demand
	# - Real failure: no rerun, workflow fails fast

	name: PR Test

	# Triggers: pushes and pull requests targeting main, plus a manual dispatch
	# that can opt out of the AOT and/or GPU test groups.
	on:
	  push:
	    branches:
	      - main
	  pull_request:
	    branches:
	      - main
	    # `labeled` is listed so that adding a label re-runs the workflow
	    # (the gate job looks for the 'run-ci' label).
	    types:
	      - opened
	      - synchronize
	      - reopened
	      - labeled
	  workflow_dispatch:
	    inputs:
	      skip_aot:
	        description: 'Skip AOT build tests'
	        type: boolean
	        default: false
	      skip_gpu:
	        description: 'Skip GPU tests'
	        type: boolean
	        default: false

	# One concurrency group per ref; a newer run cancels the one in progress.
	concurrency:
	  group: pr-test-${{ github.ref }}
	  cancel-in-progress: true

	# Token scopes granted to every job in this workflow.
	permissions:
	  contents: read
	  pull-requests: write
	  actions: read

	# Workflow-wide environment, visible to every step.
	env:
	  EXECUTOR_NUMBER: "0"

	jobs:
	# ---------------------------------------------------------------------------
	# Gate - Check if PR is authorized to run CI
	# ---------------------------------------------------------------------------
	gate:
	  # Decides whether this run may use the self-hosted runners and exposes the
	  # verdict as the `authorized` output ('true' / 'false').
	  #
	  # Order of checks:
	  #   1. Non-PR events (push, workflow_dispatch) are always authorized.
	  #   2. A PR carrying the 'run-ci' label is authorized.
	  #   3. Without FLASHINFER_GITHUB_TOKEN, fall back to author_association.
	  #   4. Otherwise the PR author must be listed in the org's ci-users team.
	  # The step always exits 0; a denied PR yields authorized=false rather than
	  # a failed job.
	  name: Permission Check
	  runs-on: ubuntu-latest
	  outputs:
	    authorized: ${{ steps.check.outputs.authorized }}
	  steps:
	    - name: Check authorization
	      id: check
	      env:
	        GH_TOKEN: ${{ secrets.FLASHINFER_GITHUB_TOKEN }}
	        # Event data is handed to the script through the environment instead
	        # of being spliced into the script text with ${{ }}, so its content
	        # can never be interpreted as shell syntax.
	        EVENT_NAME: ${{ github.event_name }}
	        HAS_RUN_CI_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'run-ci') }}
	        AUTHOR: ${{ github.event.pull_request.user.login }}
	        ASSOC: ${{ github.event.pull_request.author_association }}
	        ORG: ${{ github.repository_owner }}
	        TEAM: ci-users
	      run: |
	        # Always allow push to main and workflow_dispatch
	        if [[ "$EVENT_NAME" != "pull_request" ]]; then
	          echo "authorized=true" >> "$GITHUB_OUTPUT"
	          echo "Not a PR, authorized"
	          exit 0
	        fi

	        # Check if PR has run-ci label
	        if [[ "$HAS_RUN_CI_LABEL" == "true" ]]; then
	          echo "authorized=true" >> "$GITHUB_OUTPUT"
	          echo "PR has run-ci label, authorized"
	          exit 0
	        fi

	        # Check if PR author is a member of ci-users team
	        echo "Checking if $AUTHOR is a member of $ORG/$TEAM..."

	        if [[ -z "$GH_TOKEN" ]]; then
	          echo "::warning::FLASHINFER_GITHUB_TOKEN not set, falling back to association check"
	          # Fallback: check if author has write access
	          if [[ "$ASSOC" =~ ^(OWNER|MEMBER|COLLABORATOR)$ ]]; then
	            echo "authorized=true" >> "$GITHUB_OUTPUT"
	            echo "PR author has $ASSOC access, authorized"
	          else
	            echo "authorized=false" >> "$GITHUB_OUTPUT"
	            echo "PR author is $ASSOC, not authorized"
	          fi
	          exit 0
	        fi

	        # Check team membership. On API failure, MEMBERS holds the error text
	        # (stderr is merged in) and the PR is treated as not authorized.
	        MEMBERS=$(gh api \
	          -H "Accept: application/vnd.github+json" \
	          -H "X-GitHub-Api-Version: 2022-11-28" \
	          --paginate \
	          "/orgs/${ORG}/teams/${TEAM}/members" \
	          --jq '.[].login' 2>&1) || {
	          echo "::warning::Failed to get team members: $MEMBERS"
	          echo "authorized=false" >> "$GITHUB_OUTPUT"
	          exit 0
	        }

	        # -F: compare the login as a fixed string, not a regex (logins such
	        # as "name[bot]" contain regex metacharacters); -x: whole line only.
	        if grep -Fqx -- "$AUTHOR" <<< "$MEMBERS"; then
	          echo "authorized=true" >> "$GITHUB_OUTPUT"
	          echo "$AUTHOR is a member of $TEAM, authorized"
	        else
	          echo "authorized=false" >> "$GITHUB_OUTPUT"
	          echo "$AUTHOR is not a member of $TEAM, not authorized"
	        fi

	# ---------------------------------------------------------------------------
	# Setup - Read docker tag and check if build should be skipped
	# ---------------------------------------------------------------------------
	setup:
	  # Runs only for authorized events. Publishes two outputs:
	  #   docker_tag - tag read from ci/docker-tags.yml (cu129 image entry)
	  #   skip_build - 'true' when a PR touches only files matching SKIP_PATTERNS
	  name: Setup
	  needs: gate
	  if: needs.gate.outputs.authorized == 'true'
	  runs-on: ubuntu-latest
	  outputs:
	    docker_tag: ${{ steps.get-tag.outputs.tag }}
	    skip_build: ${{ steps.check.outputs.skip }}
	  steps:
	    # Full history so the base/head SHAs used in the diff below are present.
	    - uses: actions/checkout@v4
	      with:
	        fetch-depth: 0

	    - name: Get Docker Tag
	      id: get-tag
	      run: |
	        # Take the value after the first ':' on the cu129 line, spaces removed.
	        DOCKER_TAG=$(grep 'flashinfer/flashinfer-ci-cu129:' ci/docker-tags.yml | cut -d':' -f2 | tr -d ' ')
	        if [ -z "$DOCKER_TAG" ]; then
	          echo "::error::Failed to extract Docker tag from ci/docker-tags.yml"
	          exit 1
	        fi
	        echo "tag=$DOCKER_TAG" >> $GITHUB_OUTPUT
	        echo "Docker tag: $DOCKER_TAG"

	    - name: Check Skip Conditions
	      id: check
	      run: |
	        # Only pull requests are eligible for skipping.
	        if [ "${{ github.event_name }}" != "pull_request" ]; then
	          echo "skip=false" >> $GITHUB_OUTPUT
	          exit 0
	        fi

	        # Use PR event SHAs for reliable diff (avoids issues with origin refs)
	        BASE_SHA="${{ github.event.pull_request.base.sha }}"
	        HEAD_SHA="${{ github.event.pull_request.head.sha }}"
	        CHANGED_FILES=$(git diff --name-only "$BASE_SHA...$HEAD_SHA")
	        # TODO (yongwww): Add back ^\.github/ before merging to main
	        SKIP_PATTERNS="\.md$|\.txt$|^docs/|^docker/|^licenses/|^LICENSE$|^NOTICE$|^benchmarks/"

	        # The build is needed as soon as one non-empty path fails to match
	        # the skip patterns; an empty diff therefore leaves SKIP=true.
	        SKIP=true
	        if grep -vE "$SKIP_PATTERNS" <<< "$CHANGED_FILES" | grep -q .; then
	          SKIP=false
	        fi

	        echo "skip=$SKIP" >> $GITHUB_OUTPUT
	        if [ "$SKIP" == "true" ]; then
	          echo "::notice::Skipping build - only docs/config files changed"
	        fi

	# ---------------------------------------------------------------------------
	# AOT Build Import Tests (Spot + On-Demand Rerun)
	# ---------------------------------------------------------------------------
	aot-build-import:
	  # AOT build/import test, one job per (arch, cuda) pair, scheduled on
	  # self-hosted CPU runners labelled `spot`. fail-fast stops the remaining
	  # matrix jobs as soon as one fails.
	  name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }})
	  needs: [gate, setup]
	  if: |
	    needs.gate.outputs.authorized == 'true' &&
	    needs.setup.outputs.skip_build != 'true' &&
	    github.event.inputs.skip_aot != 'true'
	  runs-on:
	    - self-hosted
	    - linux
	    - ${{ matrix.arch }}
	    - cpu
	    - spot
	  timeout-minutes: 360
	  strategy:
	    fail-fast: true
	    matrix:
	      arch:
	        - x64
	        - arm64
	      cuda:
	        - cu126
	        - cu128
	        - cu129
	        - cu130
	  env:
	    DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }}
	  steps:
	    # Best-effort reset of the runner: every command tolerates failure.
	    - name: Cleanup
	      run: |
	        # Stop and remove containers left over from earlier jobs
	        docker stop $(docker ps -q) 2>/dev/null || true
	        docker rm $(docker ps -aq) 2>/dev/null || true
	        # Wipe the workspace (including dotfiles) and the JIT cache
	        sudo rm -rf ${{ github.workspace }}/* || true
	        sudo rm -rf ${{ github.workspace }}/.[!.]* || true
	        rm -rf ~/.cache/flashinfer_jit || true
	        # Reclaim Docker disk space
	        docker image prune -f || true
	        docker builder prune -f --filter "until=24h" || true

	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive

	    # Launched in the background (&) so the following steps run alongside it.
	    - name: Start spot termination monitor
	      run: ./scripts/task_monitor_spot.sh &

	    - name: Show Node Info
	      run: ./scripts/task_show_node_info.sh
	      env:
	        NODE_NAME: ${{ runner.name }}
	        WORKSPACE: ${{ github.workspace }}
	        BUILD_NUMBER: ${{ github.run_number }}

	    # ci/bash.sh runs the test script inside DOCKER_IMAGE without GPU access.
	    - name: Run Test
	      run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh

	analyze-aot-failure:
	  # Runs only when the spot AOT matrix ended in failure or cancellation.
	  # Outputs:
	  #   is_spot_termination - verdict produced by scripts/task_analyze_spot.sh
	  #   rerun_matrix        - full (arch, cuda) matrix as JSON; set only when
	  #                         the verdict is 'true'
	  name: Analyze AOT Failure
	  needs: [setup, aot-build-import]
	  if: "!cancelled() && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')"
	  runs-on: ubuntu-latest
	  outputs:
	    is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
	    rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
	  steps:
	    # Only the scripts/ directory is needed here.
	    - name: Checkout scripts
	      uses: actions/checkout@v4
	      with:
	        sparse-checkout: scripts
	        sparse-checkout-cone-mode: false

	    # Arguments: job-name filter, repository, run id.
	    - name: Analyze failure from job logs
	      id: analyze
	      env:
	        GH_TOKEN: ${{ github.token }}
	      run: ./scripts/task_analyze_spot.sh 'startswith("AOT")' '${{ github.repository }}' '${{ github.run_id }}'

	    - name: Build rerun matrix
	      id: matrix
	      if: steps.analyze.outputs.is_spot_termination == 'true'
	      run: |
	        # Collect one JSON object per (arch, cuda) pair, then join with commas.
	        ENTRIES=()
	        for arch in x64 arm64; do
	          for cuda in cu126 cu128 cu129 cu130; do
	            ENTRIES+=("{\"arch\":\"$arch\",\"cuda\":\"$cuda\"}")
	          done
	        done
	        JOINED=$(IFS=,; echo "${ENTRIES[*]}")
	        echo "rerun_matrix={\"include\":[$JOINED]}" >> $GITHUB_OUTPUT

	aot-build-import-rerun:
	  # Repeats the AOT build/import matrix on runners labelled `on-demand`.
	  # Runs only when the analyze job reported a spot termination and supplied
	  # a non-empty rerun matrix. No spot monitor is started here.
	  name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }})
	  needs: [setup, analyze-aot-failure]
	  if: |
	    !cancelled() &&
	    needs.analyze-aot-failure.outputs.is_spot_termination == 'true' &&
	    needs.analyze-aot-failure.outputs.rerun_matrix != ''
	  runs-on:
	    - self-hosted
	    - linux
	    - ${{ matrix.arch }}
	    - cpu
	    - on-demand
	  timeout-minutes: 360
	  strategy:
	    fail-fast: true
	    # Matrix comes from the analyze job's JSON output.
	    matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }}
	  env:
	    DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }}
	  steps:
	    # Best-effort reset of the runner: every command tolerates failure.
	    - name: Cleanup
	      run: |
	        # Stop and remove containers left over from earlier jobs
	        docker stop $(docker ps -q) 2>/dev/null || true
	        docker rm $(docker ps -aq) 2>/dev/null || true
	        # Wipe the workspace (including dotfiles) and the JIT cache
	        sudo rm -rf ${{ github.workspace }}/* || true
	        sudo rm -rf ${{ github.workspace }}/.[!.]* || true
	        rm -rf ~/.cache/flashinfer_jit || true
	        # Reclaim Docker disk space
	        docker image prune -f || true
	        docker builder prune -f --filter "until=24h" || true

	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive

	    - name: Show Node Info
	      run: ./scripts/task_show_node_info.sh
	      env:
	        NODE_NAME: ${{ runner.name }}
	        WORKSPACE: ${{ github.workspace }}
	        BUILD_NUMBER: ${{ github.run_number }}

	    # ci/bash.sh runs the test script inside DOCKER_IMAGE without GPU access.
	    - name: Run Test
	      run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh

	# ---------------------------------------------------------------------------
	# GPU JIT Tests - SM86 (A10G) - Spot + On-Demand Rerun
	# ---------------------------------------------------------------------------
	gpu-tests-a10g:
	  # JIT unit tests split into five shards, each on a self-hosted sm86 GPU
	  # runner labelled `spot`. fail-fast stops the other shards on first failure.
	  name: JIT Unittest ${{ matrix.shard }} (A10G)
	  needs: [gate, setup]
	  if: |
	    needs.gate.outputs.authorized == 'true' &&
	    needs.setup.outputs.skip_build != 'true' &&
	    github.event.inputs.skip_gpu != 'true'
	  runs-on:
	    - self-hosted
	    - linux
	    - x64
	    - gpu
	    - sm86
	    - spot
	  timeout-minutes: 360
	  strategy:
	    fail-fast: true
	    matrix:
	      shard:
	        - 1
	        - 2
	        - 3
	        - 4
	        - 5
	  env:
	    DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
	  steps:
	    # Best-effort reset of the runner: every command tolerates failure.
	    - name: Cleanup
	      run: |
	        # Stop and remove containers left over from earlier jobs
	        docker stop $(docker ps -q) 2>/dev/null || true
	        docker rm $(docker ps -aq) 2>/dev/null || true
	        # Wipe the workspace (including dotfiles) and the JIT cache
	        sudo rm -rf ${{ github.workspace }}/* || true
	        sudo rm -rf ${{ github.workspace }}/.[!.]* || true
	        rm -rf ~/.cache/flashinfer_jit || true
	        # Reclaim Docker disk space
	        docker image prune -f || true
	        docker builder prune -f --filter "until=24h" || true
	        # Log the GPU state for debugging
	        nvidia-smi || true

	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive

	    # Launched in the background (&) so the following steps run alongside it.
	    - name: Start spot termination monitor
	      run: ./scripts/task_monitor_spot.sh &

	    - name: Show Node Info
	      run: ./scripts/task_show_node_info.sh
	      env:
	        NODE_NAME: ${{ runner.name }}
	        WORKSPACE: ${{ github.workspace }}
	        BUILD_NUMBER: ${{ github.run_number }}

	    # The shard number selects scripts/task_jit_run_tests_part<N>.sh.
	    - name: Run JIT Unittest Part ${{ matrix.shard }}
	      run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh

	analyze-gpu-a10g-failure:
	  # Runs only when the spot A10G shards ended in failure or cancellation.
	  # Outputs:
	  #   is_spot_termination - verdict produced by scripts/task_analyze_spot.sh
	  #   rerun_matrix        - all five shards as JSON; set only when the
	  #                         verdict is 'true'
	  name: Analyze GPU A10G Failure
	  needs: [setup, gpu-tests-a10g]
	  if: "!cancelled() && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')"
	  runs-on: ubuntu-latest
	  outputs:
	    is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
	    rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
	  steps:
	    # Only the scripts/ directory is needed here.
	    - name: Checkout scripts
	      uses: actions/checkout@v4
	      with:
	        sparse-checkout: scripts
	        sparse-checkout-cone-mode: false

	    # Arguments: job-name filter, repository, run id.
	    - name: Analyze failure from job logs
	      id: analyze
	      env:
	        GH_TOKEN: ${{ github.token }}
	      run: ./scripts/task_analyze_spot.sh 'contains("A10G")' '${{ github.repository }}' '${{ github.run_id }}'

	    - name: Build rerun matrix
	      id: matrix
	      if: steps.analyze.outputs.is_spot_termination == 'true'
	      run: |
	        # Every shard is rerun, not just the ones that failed.
	        ALL_SHARDS='{"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}'
	        echo "rerun_matrix=$ALL_SHARDS" >> $GITHUB_OUTPUT

	gpu-tests-a10g-rerun:
	  # Repeats the A10G shards on sm86 runners labelled `on-demand`. Runs only
	  # when the analyze job reported a spot termination and supplied a
	  # non-empty rerun matrix. No spot monitor is started here.
	  name: JIT Rerun ${{ matrix.shard }} (A10G)
	  needs: [setup, analyze-gpu-a10g-failure]
	  if: |
	    !cancelled() &&
	    needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' &&
	    needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != ''
	  runs-on:
	    - self-hosted
	    - linux
	    - x64
	    - gpu
	    - sm86
	    - on-demand
	  timeout-minutes: 360
	  strategy:
	    fail-fast: true
	    # Matrix comes from the analyze job's JSON output.
	    matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }}
	  env:
	    DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
	  steps:
	    # Best-effort reset of the runner: every command tolerates failure.
	    - name: Cleanup
	      run: |
	        # Stop and remove containers left over from earlier jobs
	        docker stop $(docker ps -q) 2>/dev/null || true
	        docker rm $(docker ps -aq) 2>/dev/null || true
	        # Wipe the workspace (including dotfiles) and the JIT cache
	        sudo rm -rf ${{ github.workspace }}/* || true
	        sudo rm -rf ${{ github.workspace }}/.[!.]* || true
	        rm -rf ~/.cache/flashinfer_jit || true
	        # Reclaim Docker disk space
	        docker image prune -f || true
	        docker builder prune -f --filter "until=24h" || true
	        # Log the GPU state for debugging
	        nvidia-smi || true

	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive

	    - name: Show Node Info
	      run: ./scripts/task_show_node_info.sh
	      env:
	        NODE_NAME: ${{ runner.name }}
	        WORKSPACE: ${{ github.workspace }}
	        BUILD_NUMBER: ${{ github.run_number }}

	    # The shard number selects scripts/task_jit_run_tests_part<N>.sh.
	    - name: Run JIT Unittest Part ${{ matrix.shard }}
	      run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh

	# ---------------------------------------------------------------------------
	# GPU JIT Tests - SM75 (T4) - Spot + On-Demand Rerun
	# ---------------------------------------------------------------------------
	gpu-tests-t4:
	  # Single (non-matrix) JIT test job on a self-hosted sm75 GPU runner
	  # labelled `spot`; it runs only part 3 of the JIT test scripts.
	  name: JIT Unittest (T4)
	  needs: [gate, setup]
	  if: |
	    needs.gate.outputs.authorized == 'true' &&
	    needs.setup.outputs.skip_build != 'true' &&
	    github.event.inputs.skip_gpu != 'true'
	  runs-on:
	    - self-hosted
	    - linux
	    - x64
	    - gpu
	    - sm75
	    - spot
	  timeout-minutes: 360
	  env:
	    DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
	  steps:
	    # Best-effort reset of the runner: every command tolerates failure.
	    - name: Cleanup
	      run: |
	        # Stop and remove containers left over from earlier jobs
	        docker stop $(docker ps -q) 2>/dev/null || true
	        docker rm $(docker ps -aq) 2>/dev/null || true
	        # Wipe the workspace (including dotfiles) and the JIT cache
	        sudo rm -rf ${{ github.workspace }}/* || true
	        sudo rm -rf ${{ github.workspace }}/.[!.]* || true
	        rm -rf ~/.cache/flashinfer_jit || true
	        # Reclaim Docker disk space
	        docker image prune -f || true
	        docker builder prune -f --filter "until=24h" || true
	        # Log the GPU state for debugging
	        nvidia-smi || true

	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive

	    # Launched in the background (&) so the following steps run alongside it.
	    - name: Start spot termination monitor
	      run: ./scripts/task_monitor_spot.sh &

	    - name: Show Node Info
	      run: ./scripts/task_show_node_info.sh
	      env:
	        NODE_NAME: ${{ runner.name }}
	        WORKSPACE: ${{ github.workspace }}
	        BUILD_NUMBER: ${{ github.run_number }}

	    - name: Run JIT Unittest Part 3 (T4)
	      run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh

	analyze-gpu-t4-failure:
	  # Runs only when the spot T4 job ended in failure or cancellation.
	  # Output: is_spot_termination - verdict from scripts/task_analyze_spot.sh.
	  # No rerun matrix is produced because the T4 job is not a matrix job.
	  name: Analyze GPU T4 Failure
	  needs: [setup, gpu-tests-t4]
	  if: "!cancelled() && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')"
	  runs-on: ubuntu-latest
	  outputs:
	    is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
	  steps:
	    # Only the scripts/ directory is needed here.
	    - name: Checkout scripts
	      uses: actions/checkout@v4
	      with:
	        sparse-checkout: scripts
	        sparse-checkout-cone-mode: false

	    # Arguments: job-name filter, repository, run id.
	    - name: Analyze failure from job logs
	      id: analyze
	      env:
	        GH_TOKEN: ${{ github.token }}
	      run: ./scripts/task_analyze_spot.sh 'contains("T4")' '${{ github.repository }}' '${{ github.run_id }}'

	gpu-tests-t4-rerun:
	  # Repeats the T4 job on an sm75 runner labelled `on-demand`. Runs only
	  # when the analyze job reported a spot termination. No spot monitor is
	  # started here.
	  name: JIT Rerun (T4)
	  needs: [setup, analyze-gpu-t4-failure]
	  if: |
	    !cancelled() &&
	    needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true'
	  runs-on:
	    - self-hosted
	    - linux
	    - x64
	    - gpu
	    - sm75
	    - on-demand
	  timeout-minutes: 360
	  env:
	    DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
	  steps:
	    # Best-effort reset of the runner: every command tolerates failure.
	    - name: Cleanup
	      run: |
	        # Stop and remove containers left over from earlier jobs
	        docker stop $(docker ps -q) 2>/dev/null || true
	        docker rm $(docker ps -aq) 2>/dev/null || true
	        # Wipe the workspace (including dotfiles) and the JIT cache
	        sudo rm -rf ${{ github.workspace }}/* || true
	        sudo rm -rf ${{ github.workspace }}/.[!.]* || true
	        rm -rf ~/.cache/flashinfer_jit || true
	        # Reclaim Docker disk space
	        docker image prune -f || true
	        docker builder prune -f --filter "until=24h" || true
	        # Log the GPU state for debugging
	        nvidia-smi || true

	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive

	    - name: Show Node Info
	      run: ./scripts/task_show_node_info.sh
	      env:
	        NODE_NAME: ${{ runner.name }}
	        WORKSPACE: ${{ github.workspace }}
	        BUILD_NUMBER: ${{ github.run_number }}

	    - name: Run JIT Unittest Part 3 (T4)
	      run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh

	# ---------------------------------------------------------------------------
	# GPU JIT Tests - H100 (Hopper) - Capacity Block
	# Requires manually purchased CB via AWS Console
	# ---------------------------------------------------------------------------
	gpu-tests-h100:
	  # Unit tests on a self-hosted runner labelled `h100` / `1gpu`. This job
	  # has no spot monitor and no rerun counterpart.
	  name: JIT Unittest (H100)
	  needs: [gate, setup]
	  if: |
	    needs.gate.outputs.authorized == 'true' &&
	    needs.setup.outputs.skip_build != 'true' &&
	    github.event.inputs.skip_gpu != 'true'
	  runs-on:
	    - self-hosted
	    - linux
	    - x64
	    - gpu
	    - h100
	    - 1gpu
	  timeout-minutes: 360
	  env:
	    DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
	  steps:
	    # Best-effort reset of the runner: every command tolerates failure.
	    - name: Cleanup
	      run: |
	        # Stop and remove leftover containers so they release GPU memory
	        docker stop $(docker ps -q) 2>/dev/null || true
	        docker rm $(docker ps -aq) 2>/dev/null || true
	        # Wipe the workspace (including dotfiles) and the JIT cache
	        sudo rm -rf ${{ github.workspace }}/* || true
	        sudo rm -rf ${{ github.workspace }}/.[!.]* || true
	        rm -rf ~/.cache/flashinfer_jit || true
	        # Reclaim Docker disk space
	        docker image prune -f || true
	        docker builder prune -f --filter "until=24h" || true
	        # Show GPU info (should show 1 GPU due to CUDA_VISIBLE_DEVICES)
	        echo "=== GPU Info ==="
	        nvidia-smi || true
	        echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"

	    - uses: actions/checkout@v4
	      with:
	        submodules: recursive

	    - name: Show Node Info
	      run: ./scripts/task_show_node_info.sh
	      env:
	        NODE_NAME: ${{ runner.name }}
	        WORKSPACE: ${{ github.workspace }}
	        BUILD_NUMBER: ${{ github.run_number }}

	    - name: Run H100 Kernel Tests
	      run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_run_unit_tests.sh

	# ---------------------------------------------------------------------------
	# Test Results Summary
	# ---------------------------------------------------------------------------
	test-results-summary:
	  # Final verdict for the run. Executes whenever the workflow was not
	  # cancelled, writes a report to the step summary, and exits 1 if any test
	  # group failed.
	  #
	  # A spot-backed group passes when the spot job succeeded, or when the
	  # analyze job reported a spot termination and the on-demand rerun
	  # succeeded. H100 has no rerun: it passes on 'success' or 'skipped'.
	  name: Test Results Summary
	  if: "!cancelled()"
	  needs:
	    - gate
	    - setup
	    - aot-build-import
	    - analyze-aot-failure
	    - aot-build-import-rerun
	    - gpu-tests-a10g
	    - analyze-gpu-a10g-failure
	    - gpu-tests-a10g-rerun
	    - gpu-tests-t4
	    - analyze-gpu-t4-failure
	    - gpu-tests-t4-rerun
	    - gpu-tests-h100
	  runs-on: ubuntu-latest
	  steps:
	    - name: Check Results
	      run: |
	        # The title is written exactly once, before any early exit.
	        echo "## Test Results Summary" >> "$GITHUB_STEP_SUMMARY"
	        echo "" >> "$GITHUB_STEP_SUMMARY"

	        # Check if CI was skipped due to permissions
	        if [ "${{ needs.gate.outputs.authorized }}" != "true" ]; then
	          echo "CI skipped (pending authorization)" >> "$GITHUB_STEP_SUMMARY"
	          echo "A contributor in @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> "$GITHUB_STEP_SUMMARY"
	          exit 0
	        fi

	        # Helper function to check job status.
	        #   $1 name      - heading printed for the group
	        #   $2 skip      - 'true' when the group was skipped via dispatch input
	        #   $3 spot      - result of the spot job
	        #   $4 spot_term - 'true' when the failure was a spot termination
	        #   $5 rerun     - result of the on-demand rerun job
	        # Returns 1 when the group counts as failed, 0 otherwise.
	        check_status() {
	          local name=$1 skip=$2 spot=$3 spot_term=$4 rerun=$5
	          echo "$name" >> "$GITHUB_STEP_SUMMARY"
	          if [ "$skip" == "true" ]; then
	            echo "- Status: Skipped" >> "$GITHUB_STEP_SUMMARY"
	          elif [ "$spot" == "success" ]; then
	            echo "- Status: Passed (spot)" >> "$GITHUB_STEP_SUMMARY"
	          elif [ "$spot_term" == "true" ] && [ "$rerun" == "success" ]; then
	            echo "- Status: Passed (on-demand rerun)" >> "$GITHUB_STEP_SUMMARY"
	          else
	            echo "- Status: Failed" >> "$GITHUB_STEP_SUMMARY"
	            return 1
	          fi
	          return 0
	        }

	        if [ "${{ needs.setup.outputs.skip_build }}" == "true" ]; then
	          echo "Build skipped (docs/config only changes)" >> "$GITHUB_STEP_SUMMARY"
	          exit 0
	        fi

	        FAILED=false

	        check_status "AOT Build Import Tests" \
	          "${{ github.event.inputs.skip_aot }}" \
	          "${{ needs.aot-build-import.result }}" \
	          "${{ needs.analyze-aot-failure.outputs.is_spot_termination }}" \
	          "${{ needs.aot-build-import-rerun.result }}" || FAILED=true

	        echo "" >> "$GITHUB_STEP_SUMMARY"
	        check_status "GPU Tests (A10G)" \
	          "${{ github.event.inputs.skip_gpu }}" \
	          "${{ needs.gpu-tests-a10g.result }}" \
	          "${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}" \
	          "${{ needs.gpu-tests-a10g-rerun.result }}" || FAILED=true

	        echo "" >> "$GITHUB_STEP_SUMMARY"
	        check_status "GPU Tests (T4)" \
	          "${{ github.event.inputs.skip_gpu }}" \
	          "${{ needs.gpu-tests-t4.result }}" \
	          "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \
	          "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true

	        # H100 tests (no rerun logic yet - CB instances don't get spot terminated)
	        echo "" >> "$GITHUB_STEP_SUMMARY"
	        H100="${{ needs.gpu-tests-h100.result }}"
	        echo "GPU Tests (H100): $H100" >> "$GITHUB_STEP_SUMMARY"
	        if [ "$H100" != "success" ] && [ "$H100" != "skipped" ] && [ "${{ github.event.inputs.skip_gpu }}" != "true" ]; then
	          FAILED=true
	        fi

	        echo "" >> "$GITHUB_STEP_SUMMARY"
	        if [ "$FAILED" == "true" ]; then
	          echo "Result: Tests Failed" >> "$GITHUB_STEP_SUMMARY"
	          exit 1
	        fi
	        echo "Result: Tests Passed" >> "$GITHUB_STEP_SUMMARY"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Rebase main onto upstream v0.6.4 #4

Workflow file

Rebase main onto upstream v0.6.4 #4

Uh oh!

Workflow file for this run