Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,43 @@ await send_email(

The `in_reply_to` parameter sets the `In-Reply-To` header, and `references` sets the `References` header. Both are used by email clients to thread conversations properly.

### Extracting Clean Text from HTML Emails

When fetching email content, you can control how the body is formatted using the `content_format` parameter:

```python
# Get raw content (default) - returns text/plain if available, otherwise HTML
emails = await get_emails_content(account_name="work", email_ids=["123"])

# Get HTML content as-is
emails = await get_emails_content(account_name="work", email_ids=["123"], content_format="html")

# Strip HTML tags and get clean plain text
emails = await get_emails_content(account_name="work", email_ids=["123"], content_format="text")

# Convert HTML to markdown format
emails = await get_emails_content(account_name="work", email_ids=["123"], content_format="markdown")
```

Available formats:

- `raw` (default): Returns text/plain content if available, falls back to HTML
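- `html`: Returns HTML content as-is; falls back to the plain-text body when the email has no HTML part
- `text`: Strips all HTML tags and returns clean plain text; emails without an HTML part return their plain-text body unchanged
- `markdown`: Converts HTML to markdown, preserving links and basic formatting; emails without an HTML part return their plain-text body unchanged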

To set a default format for all email content retrieval, configure `default_content_format` in your config file:

```toml
default_content_format = "markdown"
```

Or via environment variable:

```bash
export MCP_EMAIL_SERVER_DEFAULT_CONTENT_FORMAT=markdown
```

## Development

This project is managed using [uv](https://github.com/ai-zerolab/uv).
Expand Down
18 changes: 17 additions & 1 deletion mcp_email_server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,25 @@ async def get_emails_content(
),
],
mailbox: Annotated[str, Field(default="INBOX", description="The mailbox to retrieve emails from.")] = "INBOX",
content_format: Annotated[
str | None,
Field(
default=None,
description=(
"How to format the email body content: "
"'raw' returns text/plain if available or HTML otherwise; "
"'html' returns HTML content; "
"'text' strips HTML tags to return clean plain text; "
"'markdown' converts HTML to markdown format. "
"Defaults to the server's default_content_format setting (usually 'raw')."
),
),
] = None,
) -> EmailContentBatchResponse:
settings = get_settings()
effective_format = content_format if content_format is not None else settings.default_content_format
handler = dispatch_handler(account_name)
return await handler.get_emails_content(email_ids, mailbox)
return await handler.get_emails_content(email_ids, mailbox, effective_format)


@mcp.tool(
Expand Down
16 changes: 16 additions & 0 deletions mcp_email_server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,15 @@ def _parse_bool_env(value: str | None, default: bool = False) -> bool:
return value.lower() in ("true", "1", "yes", "on")


VALID_CONTENT_FORMATS = {"raw", "html", "text", "markdown"}


class Settings(BaseSettings):
emails: list[EmailSettings] = []
providers: list[ProviderSettings] = []
db_location: str = CONFIG_PATH.with_name("db.sqlite3").as_posix()
enable_attachment_download: bool = False
default_content_format: str = "raw"

model_config = SettingsConfigDict(toml_file=CONFIG_PATH, validate_assignment=True, revalidate_instances="always")

Expand All @@ -236,6 +240,18 @@ def __init__(self, **data: Any) -> None:
self.enable_attachment_download = _parse_bool_env(env_enable_attachment, False)
logger.info(f"Set enable_attachment_download={self.enable_attachment_download} from environment variable")

# Check for default_content_format from environment variable
env_content_format = os.getenv("MCP_EMAIL_SERVER_DEFAULT_CONTENT_FORMAT")
if env_content_format is not None:
if env_content_format in VALID_CONTENT_FORMATS:
self.default_content_format = env_content_format
logger.info(f"Set default_content_format={self.default_content_format} from environment variable")
else:
logger.warning(
f"Invalid MCP_EMAIL_SERVER_DEFAULT_CONTENT_FORMAT '{env_content_format}', "
f"must be one of {VALID_CONTENT_FORMATS}. Using default 'raw'."
)

# Check for email configuration from environment variables
env_email = EmailSettings.from_env()
if env_email:
Expand Down
16 changes: 15 additions & 1 deletion mcp_email_server/emails/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,23 @@ async def get_emails_metadata(
"""

@abc.abstractmethod
async def get_emails_content(self, email_ids: list[str], mailbox: str = "INBOX") -> "EmailContentBatchResponse":
async def get_emails_content(
self,
email_ids: list[str],
mailbox: str = "INBOX",
content_format: str = "raw",
) -> "EmailContentBatchResponse":
"""
Get full content (including body) of multiple emails by their email IDs (IMAP UIDs)

Args:
email_ids: List of email UIDs to retrieve.
mailbox: The mailbox to search in (default: "INBOX").
content_format: How to format the body content:
- "raw": Return original content (text/plain preferred, falls back to HTML)
- "html": Return HTML content as-is
- "text": Strip HTML tags and return clean plain text
- "markdown": Convert HTML to markdown format
"""

@abc.abstractmethod
Expand Down
94 changes: 86 additions & 8 deletions mcp_email_server/emails/classic.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import aioimaplib
import aiosmtplib
import justhtml

from mcp_email_server.config import EmailServer, EmailSettings
from mcp_email_server.emails import EmailHandler
Expand Down Expand Up @@ -72,6 +73,42 @@ async def _send_imap_id(imap: aioimaplib.IMAP4 | aioimaplib.IMAP4_SSL) -> None:
logger.warning(f"IMAP ID command failed: {e!s}")


def _format_body_content(body: str, html_body: str, content_format: str) -> str:
"""Format email body content based on the requested format.

Args:
body: Plain text body content.
html_body: HTML body content.
content_format: One of "raw", "html", "text", "markdown".

Returns:
Formatted body content.
"""
if content_format == "raw":
# Return plain text if available, else HTML
return body if body else html_body

if content_format == "html":
# Return HTML content, fall back to plain text if no HTML
return html_body if html_body else body

if content_format == "text":
# Convert HTML to clean text, or return plain text
if html_body:
return justhtml.JustHTML(html_body).to_text()
return body

if content_format == "markdown":
# Convert HTML to markdown
if html_body:
return justhtml.JustHTML(html_body).to_markdown()
return body

# Unknown format, return raw
logger.warning(f"Unknown content_format: {content_format}, returning raw content")
return body if body else html_body


class EmailClient:
def __init__(self, email_server: EmailServer, sender: str | None = None):
self.email_server = email_server
Expand Down Expand Up @@ -118,8 +155,9 @@ def _parse_email_data(self, raw_email: bytes, email_id: str | None = None) -> di
except Exception:
date = datetime.now(timezone.utc)

# Get body content
# Get body content - extract both plain text and HTML
body = ""
html_body = ""
attachments = []

if email_message.is_multipart():
Expand All @@ -141,25 +179,44 @@ def _parse_email_data(self, raw_email: bytes, email_id: str | None = None) -> di
body += body_part.decode(charset)
except UnicodeDecodeError:
body += body_part.decode("utf-8", errors="replace")
elif content_type == "text/html":
html_part = part.get_payload(decode=True)
if html_part:
charset = part.get_content_charset("utf-8")
try:
html_body += html_part.decode(charset)
except UnicodeDecodeError:
html_body += html_part.decode("utf-8", errors="replace")
else:
# Handle plain text emails
# Handle single-part emails
payload = email_message.get_payload(decode=True)
if payload:
charset = email_message.get_content_charset("utf-8")
content_type = email_message.get_content_type()
try:
body = payload.decode(charset)
decoded = payload.decode(charset)
except UnicodeDecodeError:
body = payload.decode("utf-8", errors="replace")
# TODO: Allow retrieving full email body
decoded = payload.decode("utf-8", errors="replace")

if content_type == "text/html":
html_body = decoded
else:
body = decoded

# Truncate if too long
if body and len(body) > 20000:
body = body[:20000] + "...[TRUNCATED]"
if html_body and len(html_body) > 20000:
html_body = html_body[:20000] + "...[TRUNCATED]"

return {
"email_id": email_id or "",
"message_id": message_id,
"subject": subject,
"from": sender,
"to": to_addresses,
"body": body,
"html_body": html_body,
"date": date,
"attachments": attachments,
}
Expand Down Expand Up @@ -837,15 +894,36 @@ async def get_emails_metadata(
total=total,
)

async def get_emails_content(self, email_ids: list[str], mailbox: str = "INBOX") -> EmailContentBatchResponse:
"""Batch retrieve email body content"""
async def get_emails_content(
self,
email_ids: list[str],
mailbox: str = "INBOX",
content_format: str = "raw",
) -> EmailContentBatchResponse:
"""Batch retrieve email body content.

Args:
email_ids: List of email UIDs to retrieve.
mailbox: The mailbox to search in (default: "INBOX").
content_format: How to format the body content:
- "raw": Return original content (text/plain preferred, falls back to HTML)
- "html": Return HTML content as-is
- "text": Strip HTML tags and return clean plain text
- "markdown": Convert HTML to markdown format
"""
emails = []
failed_ids = []

for email_id in email_ids:
try:
email_data = await self.incoming_client.get_email_body_by_id(email_id, mailbox)
if email_data:
# Apply content format conversion
formatted_body = _format_body_content(
email_data.get("body", ""),
email_data.get("html_body", ""),
content_format,
)
emails.append(
EmailBodyResponse(
email_id=email_data["email_id"],
Expand All @@ -854,7 +932,7 @@ async def get_emails_content(self, email_ids: list[str], mailbox: str = "INBOX")
sender=email_data["from"],
recipients=email_data["to"],
date=email_data["date"],
body=email_data["body"],
body=formatted_body,
attachments=email_data["attachments"],
)
)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies = [
"aiosmtplib>=4.0.0",
"gradio>=6.0.1",
"jinja2>=3.1.5",
"justhtml>=0.35.0",
"loguru>=0.7.3",
"mcp[cli]>=1.3.0",
"pydantic>=2.11.0",
Expand Down
Loading
Loading