Skip to content

Commit 28506b9

Browse files
committed
Enhance CI test retry logic with comprehensive failure detection
This commit significantly improves the CI workflow by implementing a robust retry mechanism for failed tests. It introduces multiple methods to identify failed tests, including standard failure patterns, panic detection, and API/auth errors. The logic now deduplicates and formats failed test names for reruns, ensuring a more reliable and informative testing process. These enhancements aim to increase the resilience of the CI pipeline and provide clearer diagnostics for test failures.
1 parent 5edf0f3 commit 28506b9

File tree

1 file changed

+175
-38
lines changed

1 file changed

+175
-38
lines changed

.github/workflows/ci-tests.yml

Lines changed: 175 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -360,53 +360,190 @@ jobs:
360360
set -e # Re-enable exit on error
361361
echo "First run exit code: $FIRST_RUN_EXIT_CODE"
362362
363-
# Check for failed tests (including panics) and rerun them once
364-
if [ "$FIRST_RUN_EXIT_CODE" -ne 0 ]; then
365-
echo "Some tests failed or panicked, checking failure type..."
363+
# ============================================================
364+
# RETRY LOGIC - reruns only the tests identified as failed in the first run
365+
# ============================================================
366+
# Retries: Up to 2 retries (3 total attempts)
367+
# Detects: FAIL, panic, API errors, auth failures, timeouts
368+
# ============================================================
369+
370+
#######################################
# Print the name of the test that was running at a given log line.
# Arguments: $1 - path to a `go test -v` log
#            $2 - 1-based line number inside that log
# Outputs:   the Test* name taken from the last "=== RUN" line at or before
#            line $2 (stdout); nothing if no "=== RUN" line precedes it
# Returns:   always 0 (a missing match is not an error)
#######################################
_test_running_at_line() {
  head -n "$2" "$1" | grep -E "^=== RUN" | tail -1 \
    | grep -oE "Test[A-Za-z0-9_]+" | head -1 || true
}

#######################################
# Work out which tests to rerun from a `go test -v` log.
#
# Detection methods, applied in order:
#   1. "--- FAIL: TestName" lines at column 0
#   2. "panic:" lines, attributed to the last "=== RUN" test before them
#   3. API/auth error messages (first 5 hits), attributed the same way
#   4. Fallback when 1-3 found nothing: the last "=== RUN" test in the log
#
# Arguments: $1 - path to the log file
# Outputs:   stdout - exactly one line: the unique test names joined with
#                     "|" (empty line when nothing was found)
#            stderr - human-readable progress messages
# Returns:   always 0
#######################################
extract_failed_tests() {
  local LOG_FILE="$1"
  local FAILED_TESTS=""
  local LINE_NO TEST_NAME

  # All progress messages go to stderr. Callers capture stdout with $(...)
  # and feed it to `go test -run`, so stdout must carry the pattern only.
  echo "=== Analyzing log file for failures ===" >&2

  if [ ! -r "$LOG_FILE" ]; then
    echo " Log file not readable: $LOG_FILE" >&2
    echo ""
    return 0
  fi

  # Method 1: Standard "--- FAIL: TestName" pattern
  local STANDARD_FAILS
  STANDARD_FAILS=$(grep -E "^--- FAIL:" "$LOG_FILE" 2>/dev/null \
    | grep -oE "Test[A-Za-z0-9_]+" | sort -u | tr '\n' ' ' || true)
  if [ -n "$STANDARD_FAILS" ]; then
    echo " [Method 1] Found via --- FAIL: $STANDARD_FAILS" >&2
    FAILED_TESTS="$STANDARD_FAILS"
  fi

  # Method 2: Find tests that panicked (look for === RUN before each panic)
  if grep -q "^panic:" "$LOG_FILE" 2>/dev/null; then
    echo " [Method 2] Panic detected, finding affected tests..." >&2
    local PANIC_LINES
    PANIC_LINES=$(grep -n "^panic:" "$LOG_FILE" | cut -d: -f1 || true)
    for LINE_NO in $PANIC_LINES; do
      TEST_NAME=$(_test_running_at_line "$LOG_FILE" "$LINE_NO")
      if [ -n "$TEST_NAME" ]; then
        echo " Panic in: $TEST_NAME" >&2
        FAILED_TESTS="$FAILED_TESTS $TEST_NAME"
      fi
    done
  fi

  # Method 3: Find tests with error messages (API errors, auth failures, etc.)
  local ERROR_PATTERNS="Authorization failed|Failed showing|Failed creating|Failed getting|error getting|API error|status code: 5[0-9][0-9]"
  if grep -qE "$ERROR_PATTERNS" "$LOG_FILE" 2>/dev/null; then
    echo " [Method 3] API/Auth errors detected, finding affected tests..." >&2
    local ERROR_LINES
    # Only the first 5 hits are inspected. `|| true` absorbs the SIGPIPE
    # status grep can report under pipefail once head has exited.
    ERROR_LINES=$(grep -nE "$ERROR_PATTERNS" "$LOG_FILE" | cut -d: -f1 | head -5 || true)
    for LINE_NO in $ERROR_LINES; do
      TEST_NAME=$(_test_running_at_line "$LOG_FILE" "$LINE_NO")
      if [ -n "$TEST_NAME" ]; then
        echo " Error in: $TEST_NAME" >&2
        FAILED_TESTS="$FAILED_TESTS $TEST_NAME"
      fi
    done
  fi

  # Method 4: Last resort - get the last running test before FAIL
  if [ -z "$FAILED_TESTS" ]; then
    echo " [Method 4] Using last running test as fallback..." >&2
    local LAST_TEST
    LAST_TEST=$(grep -E "^=== RUN" "$LOG_FILE" | tail -1 \
      | grep -oE "Test[A-Za-z0-9_]+" | head -1 || true)
    if [ -n "$LAST_TEST" ]; then
      echo " Last running: $LAST_TEST" >&2
      FAILED_TESTS="$LAST_TEST"
    fi
  fi

  # Clean up: deduplicate and format as pipe-separated for -run flag
  if [ -n "$FAILED_TESTS" ]; then
    local ALL_TESTS="" TEST PARENT
    for TEST in $FAILED_TESTS; do
      ALL_TESTS="$ALL_TESTS $TEST"
      # Also add the name with its last "_suffix" removed
      # (TestFoo_Bar -> TestFoo). Names never contain "/" here because the
      # extraction regex stops at it, so this is not subtest handling.
      # NOTE(review): kept for backward compatibility; confirm it is wanted.
      PARENT=${TEST%_*}
      if [ -n "$PARENT" ] && [ "$PARENT" != "$TEST" ]; then
        ALL_TESTS="$ALL_TESTS $PARENT"
      fi
    done
    FAILED_TESTS=$(tr -s ' ' '\n' <<<"$ALL_TESTS" | grep -E "^Test" | sort -u | paste -sd '|' -)
  fi

  echo "$FAILED_TESTS"
}
439+
440+
#######################################
# Run the integration test package once, restricted to a -run pattern.
#
# Arguments: $1 - regex passed to `go test -run`
#            $2 - attempt number (used for the banner and the back-off)
#            $3 - maximum number of attempts (banner only)
#            $4 - suffix used in the coverage profile and log file names
# Outputs:   go test output on stdout, also written via tee to
#            test_output_<group>_<suffix>.log
# Returns:   the exit status of `go test` (not of tee)
# Note:      leaves `set -e` enabled on return.
#######################################
run_tests_with_retry() {
  local PATTERN="$1"
  local ATTEMPT="$2"
  local MAX_ATTEMPTS="$3"
  local LOG_SUFFIX="$4"
  local RULE="=========================================="

  echo ""
  echo "$RULE"
  echo " RETRY ATTEMPT $ATTEMPT of $MAX_ATTEMPTS"
  echo " Pattern: $PATTERN"
  echo "$RULE"
  echo ""

  # Back off before every attempt after the first: attempt N waits N*15s.
  if [ "$ATTEMPT" -gt 1 ]; then
    local PAUSE_SECS=$((ATTEMPT * 15))
    echo "Waiting ${PAUSE_SECS}s before retry..."
    sleep $PAUSE_SECS
  fi

  local GO_TEST_ARGS=(
    -tags integration
    -v
    -p 1
    -timeout 60m
    -run "$PATTERN"
    -coverpkg github.com/checkmarx/ast-cli/internal/commands,github.com/checkmarx/ast-cli/internal/services,github.com/checkmarx/ast-cli/internal/wrappers
    -coverprofile cover-${{ matrix.group }}-${LOG_SUFFIX}.out
    github.com/checkmarx/ast-cli/test/integration
  )

  # errexit is switched off around the pipeline so a failing test run does
  # not abort the script before its status has been read.
  set +e
  go test "${GO_TEST_ARGS[@]}" 2>&1 | tee test_output_${{ matrix.group }}_${LOG_SUFFIX}.log
  # PIPESTATUS[0] is go test's status; $? would be tee's.
  local GO_STATUS=${PIPESTATUS[0]}
  set -e

  return $GO_STATUS
}
475+
476+
# Retry driver: when the first run failed, work out which tests failed and
# rerun only those, up to MAX_RETRIES times. The step fails if the failure is
# an infrastructure problem, if no failed test can be identified, or if every
# retry fails.
if [ "$FIRST_RUN_EXIT_CODE" -ne 0 ]; then
  echo ""
  echo "============================================"
  echo " FIRST RUN FAILED - Starting retry logic"
  echo "============================================"

  # Check for hard infrastructure failures that shouldn't be retried
  if grep -qE "Could not reach provided Checkmarx server|connection refused|no such host" test_output_${{ matrix.group }}.log; then
    echo "::error::Infrastructure failure detected - Checkmarx server unreachable"
    echo "This is a server connectivity issue, not a test failure."
    exit 1
  fi

  # Extract failed tests. The pattern is the final stdout line of
  # extract_failed_tests; keep only that line so no other output on stdout
  # can end up inside the -run regex.
  FAILED_TESTS=$(extract_failed_tests "test_output_${{ matrix.group }}.log" | tail -n 1)

  if [ -z "$FAILED_TESTS" ]; then
    echo "::error::Could not identify which tests failed"
    echo "Check the log file for details"
    exit 1
  fi

  echo ""
  echo "Tests to retry: $FAILED_TESTS"

  # Retry loop - up to 2 more attempts
  MAX_RETRIES=2
  CURRENT_RETRY=1
  RETRY_SUCCESS=false

  while [ "$CURRENT_RETRY" -le "$MAX_RETRIES" ]; do
    # The call must sit on the left of `||`. errexit is active here, so a
    # bare call returning non-zero would terminate the script before the
    # status could be read and the next retry would never run.
    RETRY_EXIT_CODE=0
    run_tests_with_retry "^($FAILED_TESTS)$" "$CURRENT_RETRY" "$MAX_RETRIES" "retry${CURRENT_RETRY}" || RETRY_EXIT_CODE=$?

    if [ "$RETRY_EXIT_CODE" -eq 0 ]; then
      echo ""
      echo "=========================================="
      echo " ✅ TESTS PASSED ON RETRY $CURRENT_RETRY"
      echo "=========================================="
      RETRY_SUCCESS=true
      break
    else
      echo ""
      echo "Retry $CURRENT_RETRY failed with exit code: $RETRY_EXIT_CODE"

      # Check if we should continue retrying
      if [ "$CURRENT_RETRY" -lt "$MAX_RETRIES" ]; then
        # Narrow the next retry to whatever still failed in this one; keep
        # the previous list if nothing can be extracted from the retry log.
        NEW_FAILURES=$(extract_failed_tests "test_output_${{ matrix.group }}_retry${CURRENT_RETRY}.log" | tail -n 1)
        if [ -n "$NEW_FAILURES" ]; then
          FAILED_TESTS="$NEW_FAILURES"
          echo "Updated failed tests for next retry: $FAILED_TESTS"
        fi
      fi
    fi

    CURRENT_RETRY=$((CURRENT_RETRY + 1))
  done

  if [ "$RETRY_SUCCESS" = false ]; then
    echo ""
    echo "=========================================="
    echo " ❌ TESTS FAILED AFTER $MAX_RETRIES RETRIES"
    echo "=========================================="
    exit 1
  fi
else
  echo ""
  echo "=========================================="
  echo " ✅ ALL TESTS PASSED ON FIRST RUN"
  echo "=========================================="
fi
411548
412549
- name: Skip notification (no uncovered tests)

0 commit comments

Comments
 (0)