Unstructured-IO · bittoby · Jan 28, 2026 · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## 0.18.36
+
+### Enhancements
+- Add character-level deduplication for PDF text extraction to handle fake-bold rendering
+
+### Fixes
+- **Fix duplicate characters in PDF bold text extraction**: Some PDFs render bold text by drawing each character twice at slightly offset positions, causing text like "BOLD" to be extracted as "BBOOLLDD". Added character-level deduplication based on position proximity. Configurable via `PDF_CHAR_DUPLICATE_THRESHOLD` environment variable (default: 3.0 pixels, set to 0 to disable) (fixes #3864).
+
 ## 0.18.35-dev0
 
 ### Enhancements

diff --git a/example-docs/pdf/fake-bold-sample.pdf b/example-docs/pdf/fake-bold-sample.pdf
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -6,6 +6,7 @@
 import os
 import tempfile
 from dataclasses import dataclass
+from importlib import reload
 from pathlib import Path
 from tempfile import SpooledTemporaryFile
 from unittest import mock
@@ -38,6 +39,7 @@
 from unstructured.partition import pdf, strategies
 from unstructured.partition.pdf_image import ocr, pdfminer_processing
 from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots
+from unstructured.partition.utils import config as partition_config
 from unstructured.partition.utils.constants import (
     OCR_AGENT_PADDLE,
     OCR_AGENT_TESSERACT,
@@ -439,6 +441,41 @@ def test_partition_pdf_with_fast_strategy_and_page_breaks(caplog):
         assert element.metadata.filename == "layout-parser-paper-fast.pdf"
 
 
+def test_partition_pdf_with_fast_strategy_deduplicates_fake_bold(monkeypatch):
+    """Fast strategy should collapse fake-bold (double-drawn) characters in extracted text.
+
+    Some PDFs fake bold by drawing each character twice at slightly offset positions. The same
+    sample is partitioned with PDF_CHAR_DUPLICATE_THRESHOLD set to "0" and then "3.0" to compare.
+    """
+    filename = example_doc_path("pdf/fake-bold-sample.pdf")
+
+    # Run with threshold "0" (deduplication disabled); doubled characters are expected
+    monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0")
+    reload(partition_config)  # reload after setenv, presumably so config sees the new value
+    elements_no_dedup = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
+    text_no_dedup = " ".join([el.text for el in elements_no_dedup])
+
+    # Run again with threshold "3.0" (deduplication enabled); clean text is expected
+    monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "3.0")
+    reload(partition_config)  # reload again for the second threshold value
+    elements_with_dedup = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
+    text_with_dedup = " ".join([el.text for el in elements_with_dedup])
+
+    # Without deduplication the fake-bold word is expected to come out doubled
+    assert (
+        "BBOOLLDD" in text_no_dedup
+    ), "Without deduplication, fake-bold text should show doubled chars like 'BBOOLLDD'"
+
+    # With deduplication the same word is expected to come out clean
+    assert "BOLD" in text_with_dedup, "With deduplication, text should contain clean 'BOLD'"
+
+    # Dropping the duplicates should make the joined text strictly shorter
+    assert len(text_with_dedup) < len(text_no_dedup), (
+        f"Deduplicated text ({len(text_with_dedup)} chars) should be shorter "
+        f"than non-deduplicated text ({len(text_no_dedup)} chars)"
+    )
+
+
 def test_partition_pdf_raises_with_bad_strategy():
     filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
     with pytest.raises(ValueError):
@@ -1345,12 +1382,12 @@ def expected_element_ids_for_fast_strategy():
         "a90a54baba0093296a013d26b7acbc17",
         "9be424e2d151dac4b5f36a85e9bbfe65",
         "4631da875fb4996c63b2d80cea6b588e",
-        "6264f4eda97a049f4710f9bea0c01cbd",
+        "8cdb940788d2ed43523a5327292477a0",
         "abded7b2ff3a5542c88b4a831755ec24",
         "b781ea5123cb31e0571391b7b42cac75",
         "033f27d2618ba4cda9068b267b5a731e",
         "8982a12fcced30dd12ccbf61d14f30bf",
-        "41af2fd5df0cf47aa7e8ecca200d3ac6",
+        "0dfcc8870cf2aa54a0e780cb301b9c91",
     ]
 
 
@@ -1361,12 +1398,12 @@ def expected_element_ids_for_hi_res_strategy():
         "a90a54baba0093296a013d26b7acbc17",
         "9be424e2d151dac4b5f36a85e9bbfe65",
         "4631da875fb4996c63b2d80cea6b588e",
-        "6264f4eda97a049f4710f9bea0c01cbd",
+        "8cdb940788d2ed43523a5327292477a0",
         "abded7b2ff3a5542c88b4a831755ec24",
         "b781ea5123cb31e0571391b7b42cac75",
         "033f27d2618ba4cda9068b267b5a731e",
         "8982a12fcced30dd12ccbf61d14f30bf",
-        "41af2fd5df0cf47aa7e8ecca200d3ac6",
+        "0dfcc8870cf2aa54a0e780cb301b9c91",
     ]
 
 

diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -18,6 +18,7 @@
 from test_unstructured.unit_utils import example_doc_path
 from unstructured.partition.auto import partition
 from unstructured.partition.pdf_image.pdfminer_processing import (
+    _deduplicate_ltchars,
     _validate_bbox,
     aggregate_embedded_text_by_block,
     bboxes1_is_almost_subregion_of_bboxes2,
@@ -362,3 +363,93 @@ def test_text_is_embedded():
 
     assert text_is_embedded(container, threshold=0.5)
     assert not text_is_embedded(container, threshold=0.3)
+
+
+# -- Tests for _deduplicate_ltchars (fake bold fix) --
+
+
+def _create_positioned_ltchar(text: str, x0: float, y0: float) -> LTChar:
+    """Build a test LTChar for `text` translated to (x0, y0), using Mock font/ncs/state."""
+    graphicstate = Mock()
+    # Matrix is (a, b, c, d, e, f): identity a-d, with e=x0 and f=y0 as the translation
+    matrix = (1, 0, 0, 1, x0, y0)
+
+    char = LTChar(
+        matrix=matrix,
+        font=Mock(),
+        fontsize=12,
+        scaling=1,
+        rise=0,
+        text=text,
+        textwidth=10,
+        textdisp=(0, 1),
+        ncs=Mock(),
+        graphicstate=graphicstate,
+    )
+    return char
+
+
+class TestDeduplicateLtchars:
+    """Unit tests for the `_deduplicate_ltchars` fake-bold character filter."""
+
+    def test_empty_list_returns_empty(self):
+        """An empty input list should come back as an empty list."""
+        result = _deduplicate_ltchars([], threshold=3.0)
+        assert result == []
+
+    def test_threshold_zero_disables_deduplication(self):
+        """With threshold=0, both near-coincident characters should be kept."""
+        chars = [
+            _create_positioned_ltchar("A", 10.0, 20.0),
+            _create_positioned_ltchar("A", 10.5, 20.0),  # Would be a duplicate at threshold 3.0
+        ]
+        result = _deduplicate_ltchars(chars, threshold=0)
+        assert len(result) == 2
+
+    def test_fake_bold_duplicates_removed(self):
+        """Each same-character pair 0.5 apart in x should collapse to one character."""
+        # Simulate "AB" rendered as "AABB" with fake bold
+        chars = [
+            _create_positioned_ltchar("A", 10.0, 20.0),
+            _create_positioned_ltchar("A", 10.5, 20.0),  # Duplicate - 0.5 away in x
+            _create_positioned_ltchar("B", 25.0, 20.0),
+            _create_positioned_ltchar("B", 25.5, 20.0),  # Duplicate - 0.5 away in x
+        ]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 2
+        assert result[0].get_text() == "A"
+        assert result[1].get_text() == "B"
+
+    def test_legitimate_repeated_chars_preserved(self):
+        """The same character repeated 15.0 apart in x should be kept twice."""
+        # "AA" where both A's are at legitimately different positions
+        chars = [
+            _create_positioned_ltchar("A", 10.0, 20.0),
+            _create_positioned_ltchar("A", 25.0, 20.0),  # 15.0 away in x, beyond the threshold
+        ]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 2
+
+    def test_single_char_returns_single(self):
+        """A one-character input should be returned unchanged in length and text."""
+        chars = [_create_positioned_ltchar("X", 10.0, 20.0)]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 1
+        assert result[0].get_text() == "X"
+
+    def test_mixed_duplicates_and_normal(self):
+        """Only the doubled H and first L should be collapsed; the result should spell HELLO."""
+        # "HELLO" where only H and L are fake-bold
+        chars = [
+            _create_positioned_ltchar("H", 10.0, 20.0),
+            _create_positioned_ltchar("H", 10.5, 20.0),  # Duplicate
+            _create_positioned_ltchar("E", 20.0, 20.0),  # Normal
+            _create_positioned_ltchar("L", 30.0, 20.0),
+            _create_positioned_ltchar("L", 30.5, 20.0),  # Duplicate
+            _create_positioned_ltchar("L", 40.0, 20.0),  # Second L (normal, different position)
+            _create_positioned_ltchar("O", 50.0, 20.0),  # Normal
+        ]
+        result = _deduplicate_ltchars(chars, threshold=3.0)
+        assert len(result) == 5  # H, E, L, L, O
+        text = "".join(c.get_text() for c in result)
+        assert text == "HELLO"