Commit 98a9d97

refactor: extract vLLM cleanup to shared function
- Extract duplicated cleanup code to cleanup_vllm_backend() in conftest.py
- Add NCCL process group cleanup to suppress warnings
- Add Mistral tokenizer mode to suppress FutureWarning
- Simplify comments for conciseness
1 parent cd5a632 commit 98a9d97

File tree

3 files changed (+58, -76 lines)

test/backends/test_vllm.py

Lines changed: 5 additions & 38 deletions

@@ -29,12 +29,6 @@
 @pytest.fixture(scope="module")
 def backend():
     """Shared vllm backend for all tests in this module."""
-    # Import cleanup dependencies at top to avoid scoping issues
-    import gc
-    import time
-
-    import torch
-
     if os.environ.get("VLLM_USE_V1", -1) != "0":
         pytest.skip("skipping vllm tests; tests require `export VLLM_USE_V1=0`")

@@ -50,38 +44,11 @@ def backend():
         },
     )
     yield backend
-    # Cleanup: shutdown vLLM engine and release GPU memory
-
-    # Shutdown the background loop
-    backend._underlying_model.shutdown_background_loop()
-
-    # Delete the model and clear references
-    del backend._underlying_model
-    del backend
-
-    # Force garbage collection and clear CUDA cache
-    gc.collect()
-
-    if torch.cuda.is_available():
-        # Synchronize to ensure all CUDA operations complete
-        torch.cuda.synchronize()
-        # Empty the cache multiple times to ensure memory is released
-        torch.cuda.empty_cache()
-
-        # Set environment variable to help with memory fragmentation
-        # This tells PyTorch to use expandable memory segments
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
-        # Reset peak memory stats
-        torch.cuda.reset_peak_memory_stats()
-        # Reset accumulated memory stats
-        torch.cuda.reset_accumulated_memory_stats()
-
-        # Multiple cleanup passes with delays to allow CUDA runtime to release memory
-        for _ in range(3):
-            gc.collect()
-            torch.cuda.empty_cache()
-            time.sleep(1)
+
+    # Cleanup using shared function (best-effort within module)
+    from test.conftest import cleanup_vllm_backend
+
+    cleanup_vllm_backend(backend)


 @pytest.fixture(scope="function")
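
After this change, the module-scoped fixture reads roughly as sketched below. This is a sketch assembled only from the hunks above: the backend construction between the skip check and the yield is outside the diff context, so it is elided, and the os/pytest imports are assumed to sit at module level as usual.

import os

import pytest


@pytest.fixture(scope="module")
def backend():
    """Shared vllm backend for all tests in this module."""
    if os.environ.get("VLLM_USE_V1", -1) != "0":
        pytest.skip("skipping vllm tests; tests require `export VLLM_USE_V1=0`")

    backend = ...  # backend construction not shown in this diff
    yield backend

    # Cleanup using shared function (best-effort within module)
    from test.conftest import cleanup_vllm_backend

    cleanup_vllm_backend(backend)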

test/backends/test_vllm_tools.py

Lines changed: 7 additions & 38 deletions

@@ -26,12 +26,6 @@
 @pytest.fixture(scope="module")
 def backend():
     """Shared vllm backend for all tests in this module."""
-    # Import cleanup dependencies at top to avoid scoping issues
-    import gc
-    import time
-
-    import torch
-
     if os.environ.get("VLLM_USE_V1", -1) != "0":
         pytest.skip("skipping vllm tests; tests require `export VLLM_USE_V1=0`")

@@ -43,41 +37,16 @@ def backend():
             "gpu_memory_utilization": 0.8,
             "max_model_len": 8192,
             "max_num_seqs": 8,
+            # Suppress Mistral tokenizer warning
+            "tokenizer_mode": "mistral",
         },
     )
     yield backend
-    # Cleanup: shutdown vLLM engine and release GPU memory
-
-    # Shutdown the background loop
-    backend._underlying_model.shutdown_background_loop()
-
-    # Delete the model and clear references
-    del backend._underlying_model
-    del backend
-
-    # Force garbage collection and clear CUDA cache
-    gc.collect()
-
-    if torch.cuda.is_available():
-        # Synchronize to ensure all CUDA operations complete
-        torch.cuda.synchronize()
-        # Empty the cache multiple times to ensure memory is released
-        torch.cuda.empty_cache()
-
-        # Set environment variable to help with memory fragmentation
-        # This tells PyTorch to use expandable memory segments
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
-        # Reset peak memory stats
-        torch.cuda.reset_peak_memory_stats()
-        # Reset accumulated memory stats
-        torch.cuda.reset_accumulated_memory_stats()
-
-        # Multiple cleanup passes with delays to allow CUDA runtime to release memory
-        for _ in range(3):
-            gc.collect()
-            torch.cuda.empty_cache()
-            time.sleep(1)
+
+    # Cleanup using shared function (best-effort within module)
+    from test.conftest import cleanup_vllm_backend
+
+    cleanup_vllm_backend(backend)


 @pytest.fixture(scope="function")
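
The new "tokenizer_mode": "mistral" entry presumably gets forwarded, along with the other model_options, to vLLM's engine arguments. As a rough point of reference only (not this repository's code, and with an illustrative model id), the equivalent when driving vLLM directly would look something like:

from vllm import LLM

# Illustrative sketch: the model id is a placeholder, not the one used by these tests.
llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    tokenizer_mode="mistral",  # use Mistral's own tokenizer format, silencing the FutureWarning
    gpu_memory_utilization=0.8,
    max_model_len=8192,
    max_num_seqs=8,
)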

test/conftest.py

Lines changed: 46 additions & 0 deletions

@@ -267,6 +267,52 @@ def _run_heavy_modules_isolated(session, heavy_modules: list[str]) -> int:
     return 0 if all_passed else 1


+# ============================================================================
+# vLLM Backend Cleanup Helper
+# ============================================================================
+
+
+def cleanup_vllm_backend(backend):
+    """Best-effort cleanup of vLLM backend GPU memory.
+
+    Note: CUDA driver holds GPU memory at process level. Only process exit
+    reliably releases it. Cross-module isolation uses separate subprocesses
+    (see pytest_collection_finish hook).
+
+    Args:
+        backend: The vLLM backend instance to cleanup
+    """
+    import gc
+    import time
+
+    import torch
+
+    backend._underlying_model.shutdown_background_loop()
+    del backend._underlying_model
+    del backend
+    gc.collect()
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+        torch.cuda.reset_peak_memory_stats()
+        torch.cuda.reset_accumulated_memory_stats()
+
+        # Cleanup NCCL process groups to suppress warnings
+        if torch.distributed.is_initialized():
+            try:
+                torch.distributed.destroy_process_group()
+            except Exception:
+                # Ignore if already destroyed
+                pass
+
+        for _ in range(3):
+            gc.collect()
+            torch.cuda.empty_cache()
+            time.sleep(1)
+
+
 def pytest_collection_finish(session):
     """After collection, check if we need heavy GPU test process isolation.

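
The docstring above defers real memory release to process exit and points at a pytest_collection_finish hook (its first lines are visible at the end of the hunk) that runs heavy GPU modules in separate subprocesses. That hook is not part of this diff; conceptually, per-module isolation boils down to something like the hypothetical sketch below (the function name and invocation are illustrative, not the repository's implementation).

# Hypothetical illustration of per-module process isolation; not this repo's actual hook.
import subprocess
import sys


def run_module_isolated(module_path: str) -> int:
    """Run one heavy GPU test module in its own pytest process.

    A fresh process guarantees the CUDA driver releases its GPU memory when
    the module's tests finish, which in-process cleanup cannot ensure.
    """
    result = subprocess.run([sys.executable, "-m", "pytest", module_path])
    return result.returncode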
