feat: Add HTML body fallback when text/plain is not available (#114)

peteWT · Peter Tittmann · claude · web-flow · commit c442e33166c8 · 2026-02-14T22:00:33.000+08:00
* feat: Add HTML body fallback when text/plain is not available Many email clients (Outlook, Gmail, etc.) send HTML-only emails without a text/plain alternative. This causes get_emails_content to return empty body fields. This change: - Collects text/html content as a fallback when parsing multipart emails - Strips HTML tags to convert to readable text if no text/plain is found - Handles single-part HTML emails the same way Fixes #113 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * style: Use ternary operator per ruff SIM108 --------- Co-authored-by: Peter Tittmann <ptittmann@gmail.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/mcp_email_server/emails/classic.py b/mcp_email_server/emails/classic.py
@@ -150,8 +150,28 @@ def _parse_email_data(self, raw_email: bytes, email_id: str | None = None) -> di
 
         # Get body content
         body = ""
+        html_body = ""  # Fallback if no text/plain
         attachments = []
 
+        def _strip_html(html: str) -> str:
+            """Convert an HTML string to plain text using regex substitutions (no HTML parser is involved)."""
+            import re
+
+            # Drop <script>/<style> elements together with their contents (DOTALL lets ".*?" span line breaks)
+            text = re.sub(r"<(script|style)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
+            # Turn opening br/p/div/tr/li tags into newlines; no word boundary, so e.g. <pre> matches too
+            text = re.sub(r"<(br|p|div|tr|li)[^>]*/?>", "\n", text, flags=re.IGNORECASE)
+            # Remove all remaining HTML tags (this is where closing tags such as </p> are dropped)
+            text = re.sub(r"<[^>]+>", "", text)
+            # Decode a fixed set of entities. NOTE(review): "&amp;" is decoded first, so "&amp;lt;" ends up as "<"
+            text = text.replace("&nbsp;", " ").replace("&amp;", "&")
+            text = text.replace("&lt;", "<").replace("&gt;", ">")
+            text = text.replace("&quot;", '"').replace("&#39;", "'")
+            # Collapse whitespace-only gaps between newlines into one blank line, then squeeze runs of spaces
+            text = re.sub(r"\n\s*\n", "\n\n", text)
+            text = re.sub(r" +", " ", text)
+            return text.strip()
+
         if email_message.is_multipart():
             for part in email_message.walk():
                 content_type = part.get_content_type()
@@ -162,7 +182,7 @@ def _parse_email_data(self, raw_email: bytes, email_id: str | None = None) -> di
                     filename = part.get_filename()
                     if filename:
                         attachments.append(filename)
-                # Handle text parts
+                # Handle text parts - prefer text/plain
                 elif content_type == "text/plain":
                     body_part = part.get_payload(decode=True)
                     if body_part:
@@ -171,15 +191,31 @@ def _parse_email_data(self, raw_email: bytes, email_id: str | None = None) -> di
                             body += body_part.decode(charset)
                         except UnicodeDecodeError:
                             body += body_part.decode("utf-8", errors="replace")
+                # Collect HTML as fallback
+                elif content_type == "text/html" and not body:
+                    html_part = part.get_payload(decode=True)
+                    if html_part:
+                        charset = part.get_content_charset("utf-8")
+                        try:
+                            html_body += html_part.decode(charset)
+                        except UnicodeDecodeError:
+                            html_body += html_part.decode("utf-8", errors="replace")
+
+            # Fall back to HTML if no plain text found
+            if not body and html_body:
+                body = _strip_html(html_body)
         else:
-            # Handle plain text emails
+            # Handle single-part emails
+            content_type = email_message.get_content_type()
             payload = email_message.get_payload(decode=True)
             if payload:
                 charset = email_message.get_content_charset("utf-8")
                 try:
-                    body = payload.decode(charset)
+                    text = payload.decode(charset)
                 except UnicodeDecodeError:
-                    body = payload.decode("utf-8", errors="replace")
+                    text = payload.decode("utf-8", errors="replace")
+
+                body = _strip_html(text) if content_type == "text/html" else text
         # TODO: Allow retrieving full email body
         if body and len(body) > MAX_BODY_LENGTH:
             body = body[:MAX_BODY_LENGTH] + "...[TRUNCATED]"