Skip to content

Commit 2568f43

Browse files
Wh1isperpaintress-yw
authored and committed
Improve batch fetch with parallel chunks and better code quality
- Parallelize _batch_fetch_dates using asyncio.gather for better performance - Improve variable naming (t0/t1/t2 -> descriptive names) - Restore helpful comments for code readability - Fix type handling in _batch_fetch_headers (accept both bytes and str) - Add comprehensive tests for batch methods Co-Authored-By: Paintress <paintress@arcoer.com>
1 parent dc6be80 commit 2568f43

File tree

2 files changed

+169
-32
lines changed

2 files changed

+169
-32
lines changed

mcp_email_server/emails/classic.py

Lines changed: 61 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
import email.utils
23
import mimetypes
34
import re
@@ -268,52 +269,74 @@ def _parse_headers(self, email_id: str, raw_headers: bytes) -> dict[str, Any] |
268269
logger.error(f"Error parsing email headers: {e!s}")
269270
return None
270271

272+
async def _fetch_dates_chunk(
273+
self,
274+
imap: aioimaplib.IMAP4_SSL | aioimaplib.IMAP4,
275+
chunk: list[bytes],
276+
chunk_num: int,
277+
total_chunks: int,
278+
) -> dict[str, datetime]:
279+
"""Fetch INTERNALDATE for a single chunk of UIDs."""
280+
uid_list = ",".join(uid.decode() for uid in chunk)
281+
chunk_start = time.perf_counter()
282+
_, data = await imap.uid("fetch", uid_list, "(INTERNALDATE)")
283+
chunk_elapsed = time.perf_counter() - chunk_start
284+
285+
chunk_dates: dict[str, datetime] = {}
286+
for item in data:
287+
if not isinstance(item, bytes) or b"INTERNALDATE" not in item:
288+
continue
289+
uid_match = re.search(rb"UID (\d+)", item)
290+
date_match = re.search(rb'INTERNALDATE "([^"]+)"', item)
291+
if uid_match and date_match:
292+
uid = uid_match.group(1).decode()
293+
date_str = date_match.group(1).decode().strip()
294+
chunk_dates[uid] = datetime.strptime(date_str, "%d-%b-%Y %H:%M:%S %z")
295+
296+
if total_chunks > 1:
297+
logger.info(f"Fetched dates chunk {chunk_num}/{total_chunks}: {len(chunk)} UIDs in {chunk_elapsed:.2f}s")
298+
299+
return chunk_dates
300+
271301
async def _batch_fetch_dates(
272302
self,
273303
imap: aioimaplib.IMAP4_SSL | aioimaplib.IMAP4,
274304
email_ids: list[bytes],
275305
chunk_size: int = 5000,
276306
) -> dict[str, datetime]:
277-
"""Batch fetch INTERNALDATE for all UIDs in chunks."""
307+
"""Batch fetch INTERNALDATE for all UIDs in parallel chunks."""
278308
if not email_ids:
279309
return {}
280310

311+
# Split into chunks
312+
chunks = [email_ids[i : i + chunk_size] for i in range(0, len(email_ids), chunk_size)]
313+
total_chunks = len(chunks)
314+
315+
# Fetch all chunks in parallel
316+
tasks = [
317+
self._fetch_dates_chunk(imap, chunk, chunk_num, total_chunks) for chunk_num, chunk in enumerate(chunks, 1)
318+
]
319+
results = await asyncio.gather(*tasks)
320+
321+
# Merge results
281322
uid_dates: dict[str, datetime] = {}
282-
total_chunks = (len(email_ids) + chunk_size - 1) // chunk_size
283-
for chunk_num, i in enumerate(range(0, len(email_ids), chunk_size), 1):
284-
chunk = email_ids[i : i + chunk_size]
285-
uid_list = ",".join(uid.decode() for uid in chunk)
286-
chunk_start = time.perf_counter()
287-
_, data = await imap.uid("fetch", uid_list, "(INTERNALDATE)")
288-
chunk_elapsed = time.perf_counter() - chunk_start
289-
290-
for item in data:
291-
if not isinstance(item, bytes) or b"INTERNALDATE" not in item:
292-
continue
293-
uid_match = re.search(rb"UID (\d+)", item)
294-
date_match = re.search(rb'INTERNALDATE "([^"]+)"', item)
295-
if uid_match and date_match:
296-
uid = uid_match.group(1).decode()
297-
date_str = date_match.group(1).decode().strip()
298-
uid_dates[uid] = datetime.strptime(date_str, "%d-%b-%Y %H:%M:%S %z")
299-
300-
if total_chunks > 1:
301-
logger.info(
302-
f"Fetched dates chunk {chunk_num}/{total_chunks}: {len(chunk)} UIDs in {chunk_elapsed:.2f}s"
303-
)
323+
for chunk_dates in results:
324+
uid_dates.update(chunk_dates)
304325

305326
return uid_dates
306327

307328
async def _batch_fetch_headers(
308329
self,
309330
imap: aioimaplib.IMAP4_SSL | aioimaplib.IMAP4,
310-
email_ids: list[str],
331+
email_ids: list[bytes] | list[str],
311332
) -> dict[str, dict[str, Any]]:
312333
"""Batch fetch headers for a list of UIDs."""
313334
if not email_ids:
314335
return {}
315336

316-
uid_list = ",".join(email_ids)
337+
# Normalize to list of strings
338+
str_ids = [uid.decode() if isinstance(uid, bytes) else uid for uid in email_ids]
339+
uid_list = ",".join(str_ids)
317340
_, data = await imap.uid("fetch", uid_list, "BODY.PEEK[HEADER]")
318341

319342
results: dict[str, dict[str, Any]] = {}
@@ -390,8 +413,11 @@ async def get_emails_metadata_stream(
390413
) -> AsyncGenerator[dict[str, Any], None]:
391414
imap = self.imap_class(self.email_server.host, self.email_server.port)
392415
try:
416+
# Wait for the connection to be established
393417
await imap._client_task
394418
await imap.wait_hello_from_server()
419+
420+
# Login and select mailbox
395421
await imap.login(self.email_server.user_name, self.email_server.password)
396422
await _send_imap_id(imap)
397423
await imap.select(_quote_mailbox(mailbox))
@@ -408,19 +434,21 @@ async def get_emails_metadata_stream(
408434
)
409435
logger.info(f"Get metadata: Search criteria: {search_criteria}")
410436

437+
# Search for messages - use UID SEARCH for better compatibility
411438
_, messages = await imap.uid_search(*search_criteria)
412439

440+
# Handle empty or None responses
413441
if not messages or not messages[0]:
414442
logger.warning("No messages returned from search")
415443
return
416444

417445
email_ids = messages[0].split()
418446
logger.info(f"Found {len(email_ids)} email IDs")
419447

420-
# Phase 1: Batch fetch INTERNALDATE for sorting
421-
t0 = time.perf_counter()
448+
# Phase 1: Batch fetch INTERNALDATE for sorting (parallel chunks)
449+
fetch_dates_start = time.perf_counter()
422450
uid_dates = await self._batch_fetch_dates(imap, email_ids)
423-
t1 = time.perf_counter()
451+
fetch_dates_elapsed = time.perf_counter() - fetch_dates_start
424452

425453
# Sort by INTERNALDATE
426454
sorted_uids = sorted(uid_dates.items(), key=lambda x: x[1], reverse=(order == "desc"))
@@ -430,16 +458,17 @@ async def get_emails_metadata_stream(
430458
page_uids = [uid for uid, _ in sorted_uids[start : start + page_size]]
431459

432460
if not page_uids:
433-
logger.info(f"Phase 1 (dates): {len(uid_dates)} UIDs in {t1 - t0:.2f}s, page {page} empty")
461+
logger.info(f"Phase 1 (dates): {len(uid_dates)} UIDs in {fetch_dates_elapsed:.2f}s, page {page} empty")
434462
return
435463

436464
# Phase 2: Batch fetch headers for requested page only
465+
fetch_headers_start = time.perf_counter()
437466
metadata_by_uid = await self._batch_fetch_headers(imap, page_uids)
438-
t2 = time.perf_counter()
467+
fetch_headers_elapsed = time.perf_counter() - fetch_headers_start
439468

440469
logger.info(
441-
f"Fetched page {page}: {t1 - t0:.2f}s dates ({len(uid_dates)} UIDs), "
442-
f"{t2 - t1:.2f}s headers ({len(page_uids)} UIDs)"
470+
f"Fetched page {page}: {fetch_dates_elapsed:.2f}s dates ({len(uid_dates)} UIDs), "
471+
f"{fetch_headers_elapsed:.2f}s headers ({len(page_uids)} UIDs)"
443472
)
444473

445474
# Yield in sorted order

tests/test_classic_handler.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,3 +364,111 @@ async def test_get_emails_content_includes_message_id(self, classic_handler):
364364

365365
# Verify the client method was called correctly
366366
mock_get_body.assert_called_once_with("123", "INBOX")
367+
368+
369+
class TestEmailClientBatchMethods:
    """Test batch fetch methods for performance optimization."""

    @pytest.fixture
    def email_client(self, email_settings):
        # Fresh client per test, built from the shared settings fixture.
        return EmailClient(email_settings.incoming)

    def test_parse_headers(self, email_client):
        """Test _parse_headers method parses email headers correctly."""
        # Header lines start at column 0 so the email parser does not treat
        # them as folded continuation lines.
        raw_headers = b"""From: sender@example.com
To: recipient@example.com
Cc: cc@example.com
Subject: Test Subject
Date: Mon, 20 Jan 2025 10:30:00 +0000

"""
        result = email_client._parse_headers("123", raw_headers)

        assert result is not None
        assert result["email_id"] == "123"
        assert result["subject"] == "Test Subject"
        assert result["from"] == "sender@example.com"
        # Both To: and Cc: recipients are folded into the "to" field.
        assert "recipient@example.com" in result["to"]
        assert "cc@example.com" in result["to"]
        assert result["attachments"] == []

    def test_parse_headers_with_invalid_data(self, email_client):
        """Test _parse_headers handles malformed headers gracefully."""
        # Completely broken data that can't be parsed
        raw_headers = b"\xff\xfe\x00\x00"
        result = email_client._parse_headers("123", raw_headers)

        # Should return None or a valid dict with fallback values
        # The implementation catches exceptions and returns None
        assert result is None or isinstance(result, dict)

    def test_parse_headers_missing_date(self, email_client):
        """Test _parse_headers handles missing date with fallback."""
        raw_headers = b"""From: sender@example.com
To: recipient@example.com
Subject: No Date Email

"""
        result = email_client._parse_headers("123", raw_headers)

        assert result is not None
        assert result["email_id"] == "123"
        assert result["date"] is not None  # Should have fallback to now()

    @pytest.mark.asyncio
    async def test_batch_fetch_dates_empty_list(self, email_client):
        """Test _batch_fetch_dates with empty list returns empty dict."""
        mock_imap = AsyncMock()
        result = await email_client._batch_fetch_dates(mock_imap, [])

        assert result == {}
        # Empty input must short-circuit before any network round-trip.
        mock_imap.uid.assert_not_called()

    @pytest.mark.asyncio
    async def test_batch_fetch_headers_empty_list(self, email_client):
        """Test _batch_fetch_headers with empty list returns empty dict."""
        mock_imap = AsyncMock()
        result = await email_client._batch_fetch_headers(mock_imap, [])

        assert result == {}
        mock_imap.uid.assert_not_called()

    @pytest.mark.asyncio
    async def test_batch_fetch_dates_parses_response(self, email_client):
        """Test _batch_fetch_dates correctly parses IMAP INTERNALDATE response."""
        mock_imap = AsyncMock()
        # Simulate IMAP response format for INTERNALDATE
        mock_imap.uid.return_value = (
            "OK",
            [
                b'1 FETCH (UID 100 INTERNALDATE "20-Jan-2025 10:30:00 +0000")',
                b'2 FETCH (UID 101 INTERNALDATE "21-Jan-2025 11:00:00 +0000")',
            ],
        )

        result = await email_client._batch_fetch_dates(mock_imap, [b"100", b"101"])

        assert "100" in result
        assert "101" in result
        assert result["100"].day == 20
        assert result["101"].day == 21

    @pytest.mark.asyncio
    async def test_batch_fetch_headers_parses_response(self, email_client):
        """Test _batch_fetch_headers correctly parses IMAP BODY[HEADER] response."""
        mock_imap = AsyncMock()
        # Simulate IMAP response format for BODY[HEADER]
        # bytearray mirrors how aioimaplib delivers literal payloads.
        mock_imap.uid.return_value = (
            "OK",
            [
                b"1 FETCH (UID 100 BODY[HEADER] {100}",
                bytearray(b"From: sender@example.com\r\nTo: recipient@example.com\r\nSubject: Test\r\n\r\n"),
                b")",
            ],
        )

        result = await email_client._batch_fetch_headers(mock_imap, ["100"])

        assert "100" in result
        assert result["100"]["subject"] == "Test"
        assert result["100"]["from"] == "sender@example.com"

0 commit comments

Comments
 (0)