Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
## 0.18.36

### Enhancements
- Add character-level deduplication for PDF text extraction to handle fake-bold rendering

### Fixes
- **Fix duplicate characters in PDF bold text extraction**: Some PDFs render bold text by drawing each character twice at slightly offset positions, causing text like "BOLD" to be extracted as "BBOOLLDD". Added character-level deduplication based on position proximity. Configurable via `PDF_CHAR_DUPLICATE_THRESHOLD` environment variable (default: 3.0 pixels; set to 0 to disable) (fixes #3864).

## 0.18.35-dev0

### Enhancements
Expand Down
Binary file added example-docs/pdf/fake-bold-sample.pdf
Binary file not shown.
45 changes: 41 additions & 4 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import tempfile
from dataclasses import dataclass
from importlib import reload
from pathlib import Path
from tempfile import SpooledTemporaryFile
from unittest import mock
Expand Down Expand Up @@ -38,6 +39,7 @@
from unstructured.partition import pdf, strategies
from unstructured.partition.pdf_image import ocr, pdfminer_processing
from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots
from unstructured.partition.utils import config as partition_config
from unstructured.partition.utils.constants import (
OCR_AGENT_PADDLE,
OCR_AGENT_TESSERACT,
Expand Down Expand Up @@ -439,6 +441,41 @@ def test_partition_pdf_with_fast_strategy_and_page_breaks(caplog):
assert element.metadata.filename == "layout-parser-paper-fast.pdf"


def test_partition_pdf_with_fast_strategy_deduplicates_fake_bold(monkeypatch):
    """Check that the fast strategy collapses fake-bold character doubling.

    Fake-bold PDFs draw every glyph twice with a small offset, so the extracted
    text contains doubled characters. The sample document is partitioned once with
    deduplication disabled and once with it enabled, and the outputs are compared.
    """
    sample_path = example_doc_path("pdf/fake-bold-sample.pdf")

    def extract_text(threshold: str) -> str:
        # The threshold is supplied through the environment, and the config module
        # is reloaded after each change before the document is partitioned.
        monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", threshold)
        reload(partition_config)
        elements = pdf.partition_pdf(filename=sample_path, strategy=PartitionStrategy.FAST)
        return " ".join(el.text for el in elements)

    # threshold=0 switches deduplication off; threshold=3.0 switches it on.
    text_no_dedup = extract_text("0")
    text_with_dedup = extract_text("3.0")

    # Raw extraction still carries the doubled glyphs.
    assert (
        "BBOOLLDD" in text_no_dedup
    ), "Without deduplication, fake-bold text should show doubled chars like 'BBOOLLDD'"

    # Deduplicated extraction contains the clean word.
    assert "BOLD" in text_with_dedup, "With deduplication, text should contain clean 'BOLD'"

    # Dropping duplicates must shrink the overall text.
    assert len(text_with_dedup) < len(text_no_dedup), (
        f"Deduplicated text ({len(text_with_dedup)} chars) should be shorter "
        f"than non-deduplicated text ({len(text_no_dedup)} chars)"
    )


def test_partition_pdf_raises_with_bad_strategy():
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
with pytest.raises(ValueError):
Expand Down Expand Up @@ -1345,12 +1382,12 @@ def expected_element_ids_for_fast_strategy():
"a90a54baba0093296a013d26b7acbc17",
"9be424e2d151dac4b5f36a85e9bbfe65",
"4631da875fb4996c63b2d80cea6b588e",
"6264f4eda97a049f4710f9bea0c01cbd",
"8cdb940788d2ed43523a5327292477a0",
"abded7b2ff3a5542c88b4a831755ec24",
"b781ea5123cb31e0571391b7b42cac75",
"033f27d2618ba4cda9068b267b5a731e",
"8982a12fcced30dd12ccbf61d14f30bf",
"41af2fd5df0cf47aa7e8ecca200d3ac6",
"0dfcc8870cf2aa54a0e780cb301b9c91",
]


Expand All @@ -1361,12 +1398,12 @@ def expected_element_ids_for_hi_res_strategy():
"a90a54baba0093296a013d26b7acbc17",
"9be424e2d151dac4b5f36a85e9bbfe65",
"4631da875fb4996c63b2d80cea6b588e",
"6264f4eda97a049f4710f9bea0c01cbd",
"8cdb940788d2ed43523a5327292477a0",
"abded7b2ff3a5542c88b4a831755ec24",
"b781ea5123cb31e0571391b7b42cac75",
"033f27d2618ba4cda9068b267b5a731e",
"8982a12fcced30dd12ccbf61d14f30bf",
"41af2fd5df0cf47aa7e8ecca200d3ac6",
"0dfcc8870cf2aa54a0e780cb301b9c91",
]


Expand Down
91 changes: 91 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdfminer_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from test_unstructured.unit_utils import example_doc_path
from unstructured.partition.auto import partition
from unstructured.partition.pdf_image.pdfminer_processing import (
_deduplicate_ltchars,
_validate_bbox,
aggregate_embedded_text_by_block,
bboxes1_is_almost_subregion_of_bboxes2,
Expand Down Expand Up @@ -362,3 +363,93 @@ def test_text_is_embedded():

assert text_is_embedded(container, threshold=0.5)
assert not text_is_embedded(container, threshold=0.3)


# -- Tests for _deduplicate_ltchars (fake bold fix) --


def _create_positioned_ltchar(text: str, x0: float, y0: float) -> LTChar:
    """Build an LTChar whose matrix translation is (x0, y0), for deduplication tests."""
    # Matrix layout is (a, b, c, d, e, f): e and f carry the x/y translation, and the
    # leading (1, 0, 0, 1) leaves scale and rotation untouched.
    translation_matrix = (1, 0, 0, 1, x0, y0)
    return LTChar(
        matrix=translation_matrix,
        font=Mock(),
        fontsize=12,
        scaling=1,
        rise=0,
        text=text,
        textwidth=10,
        textdisp=(0, 1),
        ncs=Mock(),
        graphicstate=Mock(),
    )


class TestDeduplicateLtchars:
    """Behavioral tests for `_deduplicate_ltchars`."""

    def test_empty_list_returns_empty(self):
        """No characters in means no characters out."""
        assert _deduplicate_ltchars([], threshold=3.0) == []

    def test_threshold_zero_disables_deduplication(self):
        """With a threshold of 0, a near-coincident pair is left untouched."""
        original = _create_positioned_ltchar("A", 10.0, 20.0)
        shadow = _create_positioned_ltchar("A", 10.5, 20.0)  # would otherwise be a duplicate
        deduped = _deduplicate_ltchars([original, shadow], threshold=0)
        assert len(deduped) == 2

    def test_fake_bold_duplicates_removed(self):
        """Double-rendered (fake bold) characters collapse to a single character each."""
        # "AB" drawn as "AABB": the second glyph of each pair sits 0.5 to the right.
        placements = (("A", 10.0), ("A", 10.5), ("B", 25.0), ("B", 25.5))
        glyphs = [_create_positioned_ltchar(letter, x, 20.0) for letter, x in placements]
        deduped = _deduplicate_ltchars(glyphs, threshold=3.0)
        assert len(deduped) == 2
        first, second = deduped
        assert first.get_text() == "A"
        assert second.get_text() == "B"

    def test_legitimate_repeated_chars_preserved(self):
        """The same letter at clearly separate positions is not a duplicate."""
        # "AA" with the second A 15 units to the right — beyond the threshold.
        glyphs = [_create_positioned_ltchar("A", x, 20.0) for x in (10.0, 25.0)]
        deduped = _deduplicate_ltchars(glyphs, threshold=3.0)
        assert len(deduped) == 2

    def test_single_char_returns_single(self):
        """A lone character passes through unchanged."""
        only = _create_positioned_ltchar("X", 10.0, 20.0)
        deduped = _deduplicate_ltchars([only], threshold=3.0)
        assert len(deduped) == 1
        assert deduped[0].get_text() == "X"

    def test_mixed_duplicates_and_normal(self):
        """A line mixing fake-bold and normal characters keeps one of each real character."""
        # "HELLO" where only H and the first L are fake-bold.
        placements = (
            ("H", 10.0),
            ("H", 10.5),  # duplicate of H
            ("E", 20.0),  # normal
            ("L", 30.0),
            ("L", 30.5),  # duplicate of first L
            ("L", 40.0),  # second L: normal, different position
            ("O", 50.0),  # normal
        )
        glyphs = [_create_positioned_ltchar(letter, x, 20.0) for letter, x in placements]
        deduped = _deduplicate_ltchars(glyphs, threshold=3.0)
        assert len(deduped) == 5
        joined = "".join(glyph.get_text() for glyph in deduped)
        assert joined == "HELLO"
Loading
Loading