Commit 98a9d97

refactor: extract vLLM cleanup to shared function
- Extract duplicated cleanup code to cleanup_vllm_backend() in conftest.py
- Add NCCL process group cleanup to suppress warnings
- Add Mistral tokenizer mode to suppress FutureWarning
- Simplify comments for conciseness
1 parent cd5a632 commit 98a9d97

File tree

3 files changed (+58, -76 lines)

test/backends/test_vllm.py

Lines changed: 5 additions & 38 deletions

@@ -29,12 +29,6 @@
 @pytest.fixture(scope="module")
 def backend():
     """Shared vllm backend for all tests in this module."""
-    # Import cleanup dependencies at top to avoid scoping issues
-    import gc
-    import time
-
-    import torch
-
     if os.environ.get("VLLM_USE_V1", -1) != "0":
         pytest.skip("skipping vllm tests; tests require `export VLLM_USE_V1=0`")

@@ -50,38 +44,11 @@ def backend():
         },
     )
     yield backend
-    # Cleanup: shutdown vLLM engine and release GPU memory
-
-    # Shutdown the background loop
-    backend._underlying_model.shutdown_background_loop()
-
-    # Delete the model and clear references
-    del backend._underlying_model
-    del backend
-
-    # Force garbage collection and clear CUDA cache
-    gc.collect()
-
-    if torch.cuda.is_available():
-        # Synchronize to ensure all CUDA operations complete
-        torch.cuda.synchronize()
-        # Empty the cache multiple times to ensure memory is released
-        torch.cuda.empty_cache()
-
-        # Set environment variable to help with memory fragmentation
-        # This tells PyTorch to use expandable memory segments
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
-        # Reset peak memory stats
-        torch.cuda.reset_peak_memory_stats()
-        # Reset accumulated memory stats
-        torch.cuda.reset_accumulated_memory_stats()
-
-        # Multiple cleanup passes with delays to allow CUDA runtime to release memory
-        for _ in range(3):
-            gc.collect()
-            torch.cuda.empty_cache()
-            time.sleep(1)
+
+    # Cleanup using shared function (best-effort within module)
+    from test.conftest import cleanup_vllm_backend
+
+    cleanup_vllm_backend(backend)


 @pytest.fixture(scope="function")
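
After this change, the module-scoped fixture reads roughly as sketched below. This is a sketch assembled only from the hunks above: the backend construction between the skip check and the yield is outside the diff context, so it is elided, and the os/pytest imports are assumed to sit at module level as usual.

import os

import pytest


@pytest.fixture(scope="module")
def backend():
    """Shared vllm backend for all tests in this module."""
    if os.environ.get("VLLM_USE_V1", -1) != "0":
        pytest.skip("skipping vllm tests; tests require `export VLLM_USE_V1=0`")

    backend = ...  # backend construction not shown in this diff
    yield backend

    # Cleanup using shared function (best-effort within module)
    from test.conftest import cleanup_vllm_backend

    cleanup_vllm_backend(backend)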

test/backends/test_vllm_tools.py

Lines changed: 7 additions & 38 deletions

@@ -26,12 +26,6 @@
 @pytest.fixture(scope="module")
 def backend():
     """Shared vllm backend for all tests in this module."""
-    # Import cleanup dependencies at top to avoid scoping issues
-    import gc
-    import time
-
-    import torch
-
     if os.environ.get("VLLM_USE_V1", -1) != "0":
         pytest.skip("skipping vllm tests; tests require `export VLLM_USE_V1=0`")

@@ -43,41 +37,16 @@ def backend():
             "gpu_memory_utilization": 0.8,
             "max_model_len": 8192,
             "max_num_seqs": 8,
+            # Suppress Mistral tokenizer warning
+            "tokenizer_mode": "mistral",
         },
     )
     yield backend
-    # Cleanup: shutdown vLLM engine and release GPU memory
-
-    # Shutdown the background loop
-    backend._underlying_model.shutdown_background_loop()
-
-    # Delete the model and clear references
-    del backend._underlying_model
-    del backend
-
-    # Force garbage collection and clear CUDA cache
-    gc.collect()
-
-    if torch.cuda.is_available():
-        # Synchronize to ensure all CUDA operations complete
-        torch.cuda.synchronize()
-        # Empty the cache multiple times to ensure memory is released
-        torch.cuda.empty_cache()
-
-        # Set environment variable to help with memory fragmentation
-        # This tells PyTorch to use expandable memory segments
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
-        # Reset peak memory stats
-        torch.cuda.reset_peak_memory_stats()
-        # Reset accumulated memory stats
-        torch.cuda.reset_accumulated_memory_stats()
-
-        # Multiple cleanup passes with delays to allow CUDA runtime to release memory
-        for _ in range(3):
-            gc.collect()
-            torch.cuda.empty_cache()
-            time.sleep(1)
+
+    # Cleanup using shared function (best-effort within module)
+    from test.conftest import cleanup_vllm_backend
+
+    cleanup_vllm_backend(backend)


 @pytest.fixture(scope="function")
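
The new "tokenizer_mode": "mistral" entry presumably gets forwarded, along with the other model_options, to vLLM's engine arguments. As a rough point of reference only (not this repository's code, and with an illustrative model id), the equivalent when driving vLLM directly would look something like:

from vllm import LLM

# Illustrative sketch: the model id is a placeholder, not the one used by these tests.
llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    tokenizer_mode="mistral",  # use Mistral's own tokenizer format, silencing the FutureWarning
    gpu_memory_utilization=0.8,
    max_model_len=8192,
    max_num_seqs=8,
)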

test/conftest.py

Lines changed: 46 additions & 0 deletions

@@ -267,6 +267,52 @@ def _run_heavy_modules_isolated(session, heavy_modules: list[str]) -> int:
     return 0 if all_passed else 1


+# ============================================================================
+# vLLM Backend Cleanup Helper
+# ============================================================================
+
+
+def cleanup_vllm_backend(backend):
+    """Best-effort cleanup of vLLM backend GPU memory.
+
+    Note: CUDA driver holds GPU memory at process level. Only process exit
+    reliably releases it. Cross-module isolation uses separate subprocesses
+    (see pytest_collection_finish hook).
+
+    Args:
+        backend: The vLLM backend instance to cleanup
+    """
+    import gc
+    import time
+
+    import torch
+
+    backend._underlying_model.shutdown_background_loop()
+    del backend._underlying_model
+    del backend
+    gc.collect()
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+        torch.cuda.reset_peak_memory_stats()
+        torch.cuda.reset_accumulated_memory_stats()
+
+        # Cleanup NCCL process groups to suppress warnings
+        if torch.distributed.is_initialized():
+            try:
+                torch.distributed.destroy_process_group()
+            except Exception:
+                # Ignore if already destroyed
+                pass
+
+        for _ in range(3):
+            gc.collect()
+            torch.cuda.empty_cache()
+            time.sleep(1)
+
+
 def pytest_collection_finish(session):
     """After collection, check if we need heavy GPU test process isolation.

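
The docstring above defers real memory release to process exit and points at a pytest_collection_finish hook (its first lines are visible at the end of the hunk) that runs heavy GPU modules in separate subprocesses. That hook is not part of this diff; conceptually, per-module isolation boils down to something like the hypothetical sketch below (the function name and invocation are illustrative, not the repository's implementation).

# Hypothetical illustration of per-module process isolation; not this repo's actual hook.
import subprocess
import sys


def run_module_isolated(module_path: str) -> int:
    """Run one heavy GPU test module in its own pytest process.

    A fresh process guarantees the CUDA driver releases its GPU memory when
    the module's tests finish, which in-process cleanup cannot ensure.
    """
    result = subprocess.run([sys.executable, "-m", "pytest", module_path])
    return result.returncode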
