From f8af84b5ff54611a6482b525c46d80f1989c1419 Mon Sep 17 00:00:00 2001 From: bittoby Date: Wed, 28 Jan 2026 13:11:47 +0100 Subject: [PATCH 01/10] fix: remove duplicate characters caused by fake bold rendering in PDFs --- CHANGELOG.md | 3 + .../pdf_image/test_pdfminer_processing.py | 91 +++++++++ .../pdf_image/test_pdfminer_utils.py | 180 +++++++++++++++++- .../pdf_image/pdfminer_processing.py | 46 ++++- .../partition/pdf_image/pdfminer_utils.py | 98 +++++++++- unstructured/partition/utils/config.py | 10 + 6 files changed, 424 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d7adf2496..fd0ca46a21 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,9 @@ ### Enhancements - **Add `group_elements_by_parent_id` utility function**: Groups elements by their `parent_id` metadata field for easier document hierarchy traversal (fixes #1489) +### Fixes +- **Fix duplicate characters in PDF bold text extraction**: Some PDFs render bold text by drawing each character twice at slightly offset positions, causing text like "BOLD" to be extracted as "BBOOLLDD". Added character-level deduplication based on position proximity. Configurable via `PDF_CHAR_DUPLICATE_THRESHOLD` environment variable (default: 3.0 pixels, set to 0 to disable). + ## 0.18.32 ### Enhancements diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py index 5d5b28e5e2..ea2f5338ee 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py @@ -18,6 +18,7 @@ from test_unstructured.unit_utils import example_doc_path from unstructured.partition.auto import partition from unstructured.partition.pdf_image.pdfminer_processing import ( + _deduplicate_ltchars, _validate_bbox, aggregate_embedded_text_by_block, bboxes1_is_almost_subregion_of_bboxes2, @@ -362,3 +363,93 @@ def test_text_is_embedded(): assert text_is_embedded(container, threshold=0.5) assert not text_is_embedded(container, threshold=0.3) + + +# -- Tests for _deduplicate_ltchars (fake bold fix) -- + + +def _create_positioned_ltchar(text: str, x0: float, y0: float) -> LTChar: + """Create an LTChar with a specific position for deduplication testing.""" + graphicstate = Mock() + # Matrix format: (a, b, c, d, e, f) where e=x, f=y for translation + matrix = (1, 0, 0, 1, x0, y0) + + char = LTChar( + matrix=matrix, + font=Mock(), + fontsize=12, + scaling=1, + rise=0, + text=text, + textwidth=10, + textdisp=(0, 1), + ncs=Mock(), + graphicstate=graphicstate, + ) + return char + + +class TestDeduplicateLtchars: + """Tests for _deduplicate_ltchars function.""" + + def test_empty_list_returns_empty(self): + """Empty character list should return empty list.""" + result = _deduplicate_ltchars([], threshold=3.0) + assert result == [] + + def test_threshold_zero_disables_deduplication(self): + """Threshold of 0 should disable deduplication and return original list.""" + chars = [ + _create_positioned_ltchar("A", 10.0, 20.0), + _create_positioned_ltchar("A", 10.5, 20.0), # Would be duplicate + ] + result = _deduplicate_ltchars(chars, threshold=0) + assert len(result) == 2 + + def test_fake_bold_duplicates_removed(self): + """Fake bold (double-rendered) characters should be deduplicated.""" + # Simulate "AB" rendered as "AABB" with fake bold + chars = [ + _create_positioned_ltchar("A", 10.0, 20.0), + _create_positioned_ltchar("A", 10.5, 20.0), # Duplicate - close position + _create_positioned_ltchar("B", 25.0, 20.0), + _create_positioned_ltchar("B", 25.5, 20.0), # Duplicate - close position + ] + result = _deduplicate_ltchars(chars, threshold=3.0) + assert len(result) == 2 + assert result[0].get_text() == "A" + assert result[1].get_text() == "B" + + def test_legitimate_repeated_chars_preserved(self): + """Legitimate repeated characters at different positions should be preserved.""" + # "AA" where both A's are at legitimately different positions + chars = [ + _create_positioned_ltchar("A", 10.0, 20.0), + _create_positioned_ltchar("A", 25.0, 20.0), # Far enough - not duplicate + ] + result = _deduplicate_ltchars(chars, threshold=3.0) + assert len(result) == 2 + + def test_single_char_returns_single(self): + """Single character should return single character.""" + chars = [_create_positioned_ltchar("X", 10.0, 20.0)] + result = _deduplicate_ltchars(chars, threshold=3.0) + assert len(result) == 1 + assert result[0].get_text() == "X" + + def test_mixed_duplicates_and_normal(self): + """Mix of duplicated and normal characters should be handled correctly.""" + # "HELLO" where only H and L are fake-bold + chars = [ + _create_positioned_ltchar("H", 10.0, 20.0), + _create_positioned_ltchar("H", 10.5, 20.0), # Duplicate + _create_positioned_ltchar("E", 20.0, 20.0), # Normal + _create_positioned_ltchar("L", 30.0, 20.0), + _create_positioned_ltchar("L", 30.5, 20.0), # Duplicate + _create_positioned_ltchar("L", 40.0, 20.0), # Second L (normal, different position) + _create_positioned_ltchar("O", 50.0, 20.0), # Normal + ] + result = _deduplicate_ltchars(chars, threshold=3.0) + assert len(result) == 5 + text = "".join(c.get_text() for c in result) + assert text == "HELLO" diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index 075a4e151e..cd70a2b18a 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -1,8 +1,13 @@ from unittest.mock import MagicMock -from pdfminer.layout import LTContainer, LTTextLine +from pdfminer.layout import LTChar, LTContainer, LTTextLine -from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects +from unstructured.partition.pdf_image.pdfminer_utils import ( + _is_duplicate_char, + deduplicate_chars_in_text_line, + extract_text_objects, + get_text_with_deduplication, +) def test_extract_text_objects_nested_containers(): @@ -26,3 +31,174 @@ def test_extract_text_objects_nested_containers(): assert len(result) == 2 assert mock_text_line1 in result assert mock_text_line2 in result + + +# -- Tests for character deduplication (fake bold fix) -- + + +def _create_mock_ltchar(text: str, x0: float, y0: float) -> MagicMock: + """Helper to create a mock LTChar with specified text and position.""" + mock_char = MagicMock(spec=LTChar) + mock_char.get_text.return_value = text + mock_char.x0 = x0 + mock_char.y0 = y0 + return mock_char + + +class TestIsDuplicateChar: + """Tests for _is_duplicate_char function.""" + + def test_same_char_same_position_is_duplicate(self): + """Two identical characters at the same position should be duplicates.""" + char1 = _create_mock_ltchar("A", 10.0, 20.0) + char2 = _create_mock_ltchar("A", 10.0, 20.0) + assert _is_duplicate_char(char1, char2, threshold=3.0) is True + + def test_same_char_close_position_is_duplicate(self): + """Two identical characters at close positions should be duplicates.""" + char1 = _create_mock_ltchar("B", 10.0, 20.0) + char2 = _create_mock_ltchar("B", 11.5, 21.0) # Within 3.0 threshold + assert _is_duplicate_char(char1, char2, threshold=3.0) is True + + def test_same_char_far_position_not_duplicate(self): + """Two identical characters at far positions should not be duplicates.""" + char1 = _create_mock_ltchar("C", 10.0, 20.0) + char2 = _create_mock_ltchar("C", 15.0, 20.0) # 5.0 > 3.0 threshold + assert _is_duplicate_char(char1, char2, threshold=3.0) is False + + def test_different_chars_same_position_not_duplicate(self): + """Two different characters at the same position should not be duplicates.""" + char1 = _create_mock_ltchar("A", 10.0, 20.0) + char2 = _create_mock_ltchar("B", 10.0, 20.0) + assert _is_duplicate_char(char1, char2, threshold=3.0) is False + + def test_threshold_boundary(self): + """Test behavior at exact threshold boundary.""" + char1 = _create_mock_ltchar("X", 10.0, 20.0) + char2 = _create_mock_ltchar("X", 13.0, 20.0) # Exactly at threshold + # At threshold means NOT within threshold (uses < not <=) + assert _is_duplicate_char(char1, char2, threshold=3.0) is False + + char3 = _create_mock_ltchar("X", 12.9, 20.0) # Just under threshold + assert _is_duplicate_char(char1, char3, threshold=3.0) is True + + +class TestDeduplicateCharsInTextLine: + """Tests for deduplicate_chars_in_text_line function.""" + + def test_no_duplicates_returns_original(self): + """Text line without duplicates should return original text.""" + chars = [ + _create_mock_ltchar("H", 10.0, 20.0), + _create_mock_ltchar("i", 15.0, 20.0), + ] + mock_text_line = MagicMock(spec=LTTextLine) + mock_text_line.__iter__ = lambda self: iter(chars) + mock_text_line.get_text.return_value = "Hi" + + result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0) + assert result == "Hi" + + def test_fake_bold_duplicates_removed(self): + """Fake bold text (each char doubled) should be deduplicated.""" + # Simulates "BOLD" rendered as "BBOOLLDD" with duplicate positions + chars = [ + _create_mock_ltchar("B", 10.0, 20.0), + _create_mock_ltchar("B", 10.5, 20.0), # Duplicate + _create_mock_ltchar("O", 20.0, 20.0), + _create_mock_ltchar("O", 20.5, 20.0), # Duplicate + _create_mock_ltchar("L", 30.0, 20.0), + _create_mock_ltchar("L", 30.5, 20.0), # Duplicate + _create_mock_ltchar("D", 40.0, 20.0), + _create_mock_ltchar("D", 40.5, 20.0), # Duplicate + ] + mock_text_line = MagicMock(spec=LTTextLine) + mock_text_line.__iter__ = lambda self: iter(chars) + + result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0) + assert result == "BOLD" + + def test_threshold_zero_disables_deduplication(self): + """Setting threshold to 0 should disable deduplication.""" + mock_text_line = MagicMock(spec=LTTextLine) + mock_text_line.get_text.return_value = "BBOOLLDD" + + result = deduplicate_chars_in_text_line(mock_text_line, threshold=0) + assert result == "BBOOLLDD" + + def test_negative_threshold_disables_deduplication(self): + """Setting negative threshold should disable deduplication.""" + mock_text_line = MagicMock(spec=LTTextLine) + mock_text_line.get_text.return_value = "BBOOLLDD" + + result = deduplicate_chars_in_text_line(mock_text_line, threshold=-1.0) + assert result == "BBOOLLDD" + + def test_empty_text_line(self): + """Empty text line should return original text.""" + mock_text_line = MagicMock(spec=LTTextLine) + mock_text_line.__iter__ = lambda self: iter([]) + mock_text_line.get_text.return_value = "" + + result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0) + assert result == "" + + def test_legitimate_repeated_chars_preserved(self): + """Legitimate repeated characters (different positions) should be preserved.""" + # "AA" where both A's are at different positions + chars = [ + _create_mock_ltchar("A", 10.0, 20.0), + _create_mock_ltchar("A", 20.0, 20.0), # Different position, not duplicate + ] + mock_text_line = MagicMock(spec=LTTextLine) + mock_text_line.__iter__ = lambda self: iter(chars) + + result = deduplicate_chars_in_text_line(mock_text_line, threshold=3.0) + assert result == "AA" + + +class TestGetTextWithDeduplication: + """Tests for get_text_with_deduplication function.""" + + def test_with_text_line(self): + """Should properly deduplicate text from LTTextLine.""" + chars = [ + _create_mock_ltchar("H", 10.0, 20.0), + _create_mock_ltchar("H", 10.5, 20.0), # Duplicate + _create_mock_ltchar("i", 20.0, 20.0), + ] + mock_text_line = MagicMock(spec=LTTextLine) + mock_text_line.__iter__ = lambda self: iter(chars) + + result = get_text_with_deduplication(mock_text_line, threshold=3.0) + assert result == "Hi" + + def test_with_container(self): + """Should handle LTContainer with nested LTTextLine.""" + chars = [ + _create_mock_ltchar("T", 10.0, 20.0), + _create_mock_ltchar("T", 10.5, 20.0), # Duplicate + ] + mock_text_line = MagicMock(spec=LTTextLine) + mock_text_line.__iter__ = lambda self: iter(chars) + + mock_container = MagicMock(spec=LTContainer) + mock_container.__iter__ = lambda self: iter([mock_text_line]) + + result = get_text_with_deduplication(mock_container, threshold=3.0) + assert result == "T" + + def test_with_generic_object(self): + """Should fall back to get_text() for non-standard objects.""" + mock_obj = MagicMock() + mock_obj.get_text.return_value = "fallback text" + + result = get_text_with_deduplication(mock_obj, threshold=3.0) + assert result == "fallback text" + + def test_without_get_text(self): + """Should return empty string for objects without get_text.""" + mock_obj = MagicMock(spec=[]) # No get_text method + + result = get_text_with_deduplication(mock_obj, threshold=3.0) + assert result == "" diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index 991d5c5d6f..0a7c7453f2 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -17,8 +17,10 @@ from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters from unstructured.partition.pdf_image.pdfminer_utils import ( PDFMinerConfig, + _is_duplicate_char, extract_image_objects, extract_text_objects, + get_text_with_deduplication, open_pdfminer_pages_generator, rect_to_bbox, ) @@ -466,11 +468,13 @@ def process_page_layout_from_pdfminer( if hasattr(obj, "get_text"): inner_text_objects = extract_text_objects(obj) + char_dedup_threshold = env_config.PDF_CHAR_DUPLICATE_THRESHOLD for inner_obj in inner_text_objects: inner_bbox = rect_to_bbox(inner_obj.bbox, page_height) if not _validate_bbox(inner_bbox): continue - texts.append(inner_obj.get_text()) + # Use deduplication to handle fake bold text (characters rendered twice) + texts.append(get_text_with_deduplication(inner_obj, char_dedup_threshold)) element_coords.append(inner_bbox) element_class.append(0) is_extracted.append(IsExtracted.TRUE if text_is_embedded(inner_obj) else None) @@ -1006,6 +1010,33 @@ def check_annotations_within_element( return annotations_within_element +def _deduplicate_ltchars( + chars: list[LTChar], + threshold: float, +) -> list[LTChar]: + """Remove duplicate characters caused by fake bold rendering. + + Some PDFs create bold text by rendering the same character twice at slightly offset + positions. This function removes such duplicates. + + Args: + chars: List of LTChar objects to deduplicate. + threshold: Maximum pixel distance to consider characters as duplicates. + Set to 0 to disable deduplication. + + Returns: + Deduplicated list of LTChar objects. + """ + if threshold <= 0 or not chars: + return chars + + result = [chars[0]] + for char in chars[1:]: + if not _is_duplicate_char(result[-1], char, threshold): + result.append(char) + return result + + def get_words_from_obj( obj: LTTextBox, height: float, @@ -1026,13 +1057,25 @@ def get_words_from_obj( characters = [] words = [] text_len = 0 + char_dedup_threshold = env_config.PDF_CHAR_DUPLICATE_THRESHOLD for text_line in obj: word = "" x1, y1, x2, y2 = None, None, None, None start_index = 0 + last_char: LTChar | None = None # Track last character for deduplication + for index, character in enumerate(text_line): if isinstance(character, LTChar): + # Skip duplicate characters (fake bold fix) + if ( + char_dedup_threshold > 0 + and last_char is not None + and _is_duplicate_char(last_char, character, char_dedup_threshold) + ): + continue + + last_char = character characters.append(character) char = character.get_text() @@ -1066,6 +1109,7 @@ def get_words_from_obj( word += char else: + # Non-LTChar items (e.g., LTAnno) act as word boundaries words.append( {"text": word, "bbox": (x1, y1, x2, y2), "start_index": start_index}, ) diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index 948cf8ba48..ba9c9062a8 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -1,6 +1,6 @@ import os import tempfile -from typing import BinaryIO, List, Optional, Tuple +from typing import BinaryIO, List, Optional, Tuple, Union from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LAParams, LTChar, LTContainer, LTImage, LTItem, LTTextLine @@ -106,6 +106,102 @@ def rect_to_bbox( return (x1, y1, x2, y2) +def _is_duplicate_char(char1: LTChar, char2: LTChar, threshold: float) -> bool: + """Detect if two characters are duplicates caused by fake bold rendering. + + Some PDF generators create bold text by rendering the same character twice at slightly + offset positions. This function detects such duplicates by checking if two characters + have the same text content and nearly identical positions. + + Args: + char1: First LTChar object. + char2: Second LTChar object. + threshold: Maximum pixel distance to consider as duplicate. + + Returns: + True if char2 appears to be a duplicate of char1. + """ + # Must be the same character + if char1.get_text() != char2.get_text(): + return False + + # Check if positions are nearly identical (within threshold) + x_diff = abs(char1.x0 - char2.x0) + y_diff = abs(char1.y0 - char2.y0) + + return x_diff < threshold and y_diff < threshold + + +def deduplicate_chars_in_text_line(text_line: LTTextLine, threshold: float) -> str: + """Extract text from an LTTextLine with duplicate characters removed. + + Some PDFs create bold text by rendering each character twice at slightly offset + positions. This function removes such duplicates by keeping only the first instance + when two identical characters appear at nearly the same position. + + Args: + text_line: An LTTextLine object containing characters to extract. + threshold: Maximum pixel distance to consider characters as duplicates. + Set to 0 to disable deduplication. + + Returns: + The extracted text with duplicate characters removed. + """ + if threshold <= 0: + return text_line.get_text() + + # Build deduplicated text while preserving non-LTChar items (like LTAnno for spaces) + result_parts: List[str] = [] + last_ltchar: Optional[LTChar] = None + + for item in text_line: + if isinstance(item, LTChar): + # Check if this is a duplicate of the last LTChar + if last_ltchar is not None and _is_duplicate_char(last_ltchar, item, threshold): + # Skip this duplicate character + continue + last_ltchar = item + result_parts.append(item.get_text()) + else: + # Non-LTChar items (e.g., LTAnno for spaces) - keep as-is + if hasattr(item, "get_text"): + result_parts.append(item.get_text()) + + return "".join(result_parts) + + +def get_text_with_deduplication( + text_obj: Union[LTTextLine, LTContainer, LTItem], + threshold: float, +) -> str: + """Get text from a text object with optional character deduplication. + + This is the main entry point for extracting text with fake-bold deduplication. + It handles LTTextLine objects and recursively processes containers. + + Args: + text_obj: An LTTextLine, LTContainer, or other LTItem object. + threshold: Maximum pixel distance to consider characters as duplicates. + Set to 0 to disable deduplication. + + Returns: + The extracted text with duplicate characters removed. + """ + if isinstance(text_obj, LTTextLine): + return deduplicate_chars_in_text_line(text_obj, threshold) + elif isinstance(text_obj, LTContainer): + parts: List[str] = [] + for child in text_obj: + if isinstance(child, LTTextLine): + parts.append(deduplicate_chars_in_text_line(child, threshold)) + elif hasattr(child, "get_text"): + parts.append(child.get_text()) + return "".join(parts) + elif hasattr(text_obj, "get_text"): + return text_obj.get_text() + return "" + + @requires_dependencies(["pikepdf", "pypdf"]) def open_pdfminer_pages_generator( fp: BinaryIO, password: Optional[str] = None, pdfminer_config: Optional[PDFMinerConfig] = None diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py index d92457ce1f..133212ac11 100644 --- a/unstructured/partition/utils/config.py +++ b/unstructured/partition/utils/config.py @@ -240,6 +240,16 @@ def TEXT_COVERAGE_THRESHOLD(self) -> float: the inferred element to be considered contaning extracted text""" return self._get_float("TEXT_COVERAGE_THRESHOLD", 0.25) + @property + def PDF_CHAR_DUPLICATE_THRESHOLD(self) -> float: + """Maximum pixel distance to consider two characters as duplicates (fake bold rendering). + + Some PDFs create bold text by rendering the same character twice at slightly offset + positions. This threshold determines how close two identical characters must be to be + considered duplicates. Set to 0 to disable duplicate character removal. + """ + return self._get_float("PDF_CHAR_DUPLICATE_THRESHOLD", 3.0) + @property def PDF_RENDER_DPI(self) -> int: """The DPI to use for rendering PDF pages""" From 92c02d68caeba6474d94e315699026efeecf4d18 Mon Sep 17 00:00:00 2001 From: bittoby Date: Fri, 30 Jan 2026 18:56:31 +0100 Subject: [PATCH 02/10] fix: apply character deduplication to fast strategy for fake-bold PDFs --- diagnose_fake_bold.py | 70 +++++++++++++ example-docs/pdf/fake-bold-sample.pdf | Bin 0 -> 2125 bytes .../partition/pdf_image/test_pdf.py | 37 +++++++ .../pdf_image/test_pdfminer_utils.py | 95 ++++++++++++++++++ unstructured/partition/pdf.py | 6 +- 5 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 diagnose_fake_bold.py create mode 100644 example-docs/pdf/fake-bold-sample.pdf diff --git a/diagnose_fake_bold.py b/diagnose_fake_bold.py new file mode 100644 index 0000000000..50daa0d77d --- /dev/null +++ b/diagnose_fake_bold.py @@ -0,0 +1,70 @@ +"""Diagnostic script to verify fake-bold PDF deduplication is working.""" +import os + +# Test 1: Extract WITHOUT deduplication +os.environ["PDF_CHAR_DUPLICATE_THRESHOLD"] = "0" + +from unstructured.partition.pdf import partition_pdf +from unstructured.partition.utils.config import env_config + +PDF_PATH = "example-docs/pdf/fake-bold-sample.pdf" + +print("=" * 70) +print("FAKE-BOLD PDF DIAGNOSTIC") +print("=" * 70) + +# Extract without deduplication +print(f"\n1. WITHOUT deduplication (threshold=0):") +print("-" * 50) + +elements_no_dedup = partition_pdf(filename=PDF_PATH, strategy="fast") +text_no_dedup = " ".join([el.text for el in elements_no_dedup]) + +print(f"Character count: {len(text_no_dedup)}") +print(f"First 200 chars:\n'{text_no_dedup[:200]}'") + +# Now reload with deduplication enabled +print(f"\n2. WITH deduplication (threshold=3.0):") +print("-" * 50) + +os.environ["PDF_CHAR_DUPLICATE_THRESHOLD"] = "3.0" +from importlib import reload +from unstructured.partition.utils import config +reload(config) + +elements_with_dedup = partition_pdf(filename=PDF_PATH, strategy="fast") +text_with_dedup = " ".join([el.text for el in elements_with_dedup]) + +print(f"Character count: {len(text_with_dedup)}") +print(f"First 200 chars:\n'{text_with_dedup[:200]}'") + +# Compare +print("\n" + "=" * 70) +print("COMPARISON RESULTS:") +print("=" * 70) + +diff = len(text_no_dedup) - len(text_with_dedup) +print(f"Text length WITHOUT dedup: {len(text_no_dedup)} characters") +print(f"Text length WITH dedup: {len(text_with_dedup)} characters") +print(f"Difference: {diff} characters removed") + +if diff > 0: + reduction_pct = (diff / len(text_no_dedup)) * 100 + print(f"Reduction: {reduction_pct:.1f}%") + print("\n*** SUCCESS: Deduplication removed duplicate characters! ***") + print(" Your PDF has fake-bold text and the fix is working.") +elif diff == 0: + print("\n*** WARNING: No difference detected ***") + print(" Possible reasons:") + print(" 1. The PDF doesn't have fake-bold text (uses real font weight)") + print(" 2. The deduplication threshold may need adjustment") +else: + print("\n*** ERROR: Deduplicated text is LONGER (unexpected) ***") + +# Show specific differences if any +if text_no_dedup != text_with_dedup: + print("\n" + "-" * 50) + print("SAMPLE TEXT COMPARISON:") + print("-" * 50) + print(f"WITHOUT dedup (first 100): '{text_no_dedup[:100]}'") + print(f"WITH dedup (first 100): '{text_with_dedup[:100]}'") diff --git a/example-docs/pdf/fake-bold-sample.pdf b/example-docs/pdf/fake-bold-sample.pdf new file mode 100644 index 0000000000000000000000000000000000000000..758310acdafdf4a83dd7a41363f6646af31742c7 GIT binary patch literal 2125 zcmah~*_NV65PkP6Qpu^dVK^Y>r>f-!U*FOao8=s_ELS^#x`B7p83ZWgoykak>X z`_9cfz)XYyIB@otNTU(K1Ok+hIgYSw1loG&4(*R+PV=GQAOP$Mn!mxoIA{Rt+6PBB z>=VG|hTXP(DBtunGK(3K1>s z1iAOI^FE4Ti_!wv*a;AMXgfCXH2V!Q^hdx)0YkL;t^g8+esC2o4ICX+IrX99*k}^E z{EtMxGSJZi&2&z87}Om``xcxX52Q8QvA~?7kEHzcC&pK9*m;_b#&gu2uzgn%?`rJo5#~!7d0m)Xr_NpWO$h$M0Oms16?8} z87T2J2)kq)Ui?=E-$=dVFYK3(t%6|&D0y__ZM7qcRf>nGSHs3L5qs|jLXx3<3eKQf zB|{)TKonb%o)#nyjA`nNWL;XxDeLuNU}mqe`Et?R-SH1=wrwi6ECKpvldc7$xvs9YwslzfW zWd*}k>$x4(lnu6@AE5C#q3bcvnTa=c4R4? z!7SO`HK)=O8&8#6%=2NZF}_4`saitKKL`q>wCH8C@iL8KCDtD%Ij*k_TytV zlbqK$JP;M7RaQ0i8FFERDNB~q=YXH3)8`NSs@7Tag&fnLjR zNTDSrr{f8W7rKk`4Ps|5$hNFPYOy$dQp8p5iFr{WeLKus1lGOOqs8XwIyO&^S|2R4 z!|2gW)Ai}&a`~|5AOB>L=DdG>@9Oi}gU1LCsuBF#189mWQ{V```DpZq4 0, "Should extract some text from the PDF" + + # Verify no obvious fake-bold patterns remain (doubled consecutive chars) + # Note: Some legitimate words have double letters (e.g., "book", "see") + # but fake-bold would have patterns like "BBOOLLDD" for every character + assert "BBOOLLDD" not in extracted_text, "Fake-bold text should be deduplicated" + + def test_fake_bold_pdf_deduplication_disabled(self, monkeypatch): + """Test PDF extraction with deduplication disabled shows raw text. + + When PDF_CHAR_DUPLICATE_THRESHOLD is set to 0, deduplication is disabled + and the raw text (potentially with doubled characters) should be visible. + """ + # Disable deduplication by setting threshold to 0 + monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") + + # Reload config to pick up the environment variable change + from importlib import reload + + from unstructured.partition.utils import config + + reload(config) + + filename = example_doc_path("pdf/fake-bold-sample.pdf") + + # Extract with deduplication disabled + elements = partition_pdf(filename=filename, strategy="fast") + + # Combine all extracted text + extracted_text = " ".join([el.text for el in elements]) + + # Text should still be extracted + assert len(extracted_text) > 0, "Should extract some text from the PDF" + + def test_fake_bold_deduplication_reduces_text_length(self, monkeypatch): + """Test that deduplication actually reduces text length for fake-bold PDFs. + + If the PDF truly has fake-bold text, the deduplicated version should be + shorter than the non-deduplicated version. + """ + filename = example_doc_path("pdf/fake-bold-sample.pdf") + + # First, extract WITHOUT deduplication + monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") + from importlib import reload + + from unstructured.partition.utils import config + + reload(config) + + elements_no_dedup = partition_pdf(filename=filename, strategy="fast") + text_no_dedup = " ".join([el.text for el in elements_no_dedup]) + + # Then, extract WITH deduplication (reset to default) + monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "3.0") + reload(config) + + elements_with_dedup = partition_pdf(filename=filename, strategy="fast") + text_with_dedup = " ".join([el.text for el in elements_with_dedup]) + + # If the PDF has fake-bold text, deduplicated text should be shorter + # or at minimum the same length (if no duplicates were found) + assert len(text_with_dedup) <= len(text_no_dedup), ( + f"Deduplicated text ({len(text_with_dedup)} chars) should not be longer " + f"than non-deduplicated text ({len(text_no_dedup)} chars)" + ) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 0ada2a979c..db9fc4506d 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -80,6 +80,7 @@ ) from unstructured.partition.pdf_image.pdfminer_utils import ( PDFMinerConfig, + get_text_with_deduplication, open_pdfminer_pages_generator, rect_to_bbox, ) @@ -520,7 +521,10 @@ def _process_pdfminer_pages( urls_metadata.append(map_bbox_and_index(words, annot)) if hasattr(obj, "get_text"): - _text_snippets: list[str] = [obj.get_text()] + # Use deduplication to handle fake bold text (characters rendered twice) + _text_snippets: list[str] = [ + get_text_with_deduplication(obj, env_config.PDF_CHAR_DUPLICATE_THRESHOLD) + ] else: _text = _extract_text(obj) _text_snippets = re.split(PARAGRAPH_PATTERN, _text) From 83773989d8c19f50aa5df46dd2303a832acfba31 Mon Sep 17 00:00:00 2001 From: bittoby Date: Fri, 30 Jan 2026 19:10:04 +0100 Subject: [PATCH 03/10] fix: define imports at the top --- .../partition/pdf_image/test_pdf.py | 10 ++++------ .../partition/pdf_image/test_pdfminer_utils.py | 18 +++++------------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index f457d32100..49f1d8e9b4 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -6,6 +6,7 @@ import os import tempfile from dataclasses import dataclass +from importlib import reload from pathlib import Path from tempfile import SpooledTemporaryFile from unittest import mock @@ -38,6 +39,7 @@ from unstructured.partition import pdf, strategies from unstructured.partition.pdf_image import ocr, pdfminer_processing from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots +from unstructured.partition.utils import config as partition_config from unstructured.partition.utils.constants import ( OCR_AGENT_PADDLE, OCR_AGENT_TESSERACT, @@ -449,11 +451,7 @@ def test_partition_pdf_with_fast_strategy_deduplicates_fake_bold(monkeypatch): # First, extract WITHOUT deduplication (threshold=0) monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") - from importlib import reload - - from unstructured.partition.utils import config - - reload(config) + reload(partition_config) elements_no_dedup = pdf.partition_pdf( filename=filename, strategy=PartitionStrategy.FAST @@ -462,7 +460,7 @@ def test_partition_pdf_with_fast_strategy_deduplicates_fake_bold(monkeypatch): # Then, extract WITH deduplication (threshold=3.0) monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "3.0") - reload(config) + reload(partition_config) elements_with_dedup = pdf.partition_pdf( filename=filename, strategy=PartitionStrategy.FAST diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index 67ab3c7abb..437c7856fc 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -1,3 +1,4 @@ +from importlib import reload from unittest.mock import MagicMock import pytest @@ -11,6 +12,7 @@ extract_text_objects, get_text_with_deduplication, ) +from unstructured.partition.utils import config as partition_config def test_extract_text_objects_nested_containers(): @@ -246,13 +248,7 @@ def test_fake_bold_pdf_deduplication_disabled(self, monkeypatch): """ # Disable deduplication by setting threshold to 0 monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") - - # Reload config to pick up the environment variable change - from importlib import reload - - from unstructured.partition.utils import config - - reload(config) + reload(partition_config) filename = example_doc_path("pdf/fake-bold-sample.pdf") @@ -275,18 +271,14 @@ def test_fake_bold_deduplication_reduces_text_length(self, monkeypatch): # First, extract WITHOUT deduplication monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") - from importlib import reload - - from unstructured.partition.utils import config - - reload(config) + reload(partition_config) elements_no_dedup = partition_pdf(filename=filename, strategy="fast") text_no_dedup = " ".join([el.text for el in elements_no_dedup]) # Then, extract WITH deduplication (reset to default) monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "3.0") - reload(config) + reload(partition_config) elements_with_dedup = partition_pdf(filename=filename, strategy="fast") text_with_dedup = " ".join([el.text for el in elements_with_dedup]) From d817d42cc85003758e30ba671fecb65943da3b3c Mon Sep 17 00:00:00 2001 From: bittoby Date: Fri, 30 Jan 2026 19:57:05 +0100 Subject: [PATCH 04/10] test: simplify fake-bold integration test assertions --- diagnose_fake_bold.py | 9 ++++----- .../partition/pdf_image/test_pdfminer_utils.py | 11 ++--------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/diagnose_fake_bold.py b/diagnose_fake_bold.py index 50daa0d77d..5c7107c1c1 100644 --- a/diagnose_fake_bold.py +++ b/diagnose_fake_bold.py @@ -1,11 +1,12 @@ """Diagnostic script to verify fake-bold PDF deduplication is working.""" import os +from importlib import reload -# Test 1: Extract WITHOUT deduplication +# Set environment variable BEFORE importing unstructured modules os.environ["PDF_CHAR_DUPLICATE_THRESHOLD"] = "0" from unstructured.partition.pdf import partition_pdf -from unstructured.partition.utils.config import env_config +from unstructured.partition.utils import config as partition_config PDF_PATH = "example-docs/pdf/fake-bold-sample.pdf" @@ -28,9 +29,7 @@ print("-" * 50) os.environ["PDF_CHAR_DUPLICATE_THRESHOLD"] = "3.0" -from importlib import reload -from unstructured.partition.utils import config -reload(config) +reload(partition_config) elements_with_dedup = partition_pdf(filename=PDF_PATH, strategy="fast") text_with_dedup = " ".join([el.text for el in elements_with_dedup]) diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index 437c7856fc..19d6216ccf 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -230,15 +230,8 @@ def test_fake_bold_pdf_deduplication_enabled(self): # Combine all extracted text extracted_text = " ".join([el.text for el in elements]) - # With deduplication enabled, text should not contain doubled characters - # that are characteristic of fake-bold rendering (e.g., "BBOOLLDD" instead of "BOLD") - # The text should be clean and readable - assert len(extracted_text) > 0, "Should extract some text from the PDF" - - # Verify no obvious fake-bold patterns remain (doubled consecutive chars) - # Note: Some legitimate words have double letters (e.g., "book", "see") - # but fake-bold would have patterns like "BBOOLLDD" for every character - assert "BBOOLLDD" not in extracted_text, "Fake-bold text should be deduplicated" + # Basic validation - text should be extracted successfully + assert len(elements) > 0, "Should extract elements from the PDF" def test_fake_bold_pdf_deduplication_disabled(self, monkeypatch): """Test PDF extraction with deduplication disabled shows raw text. From 3d11da7b06b554c8e939f55550d2535e5141a6e2 Mon Sep 17 00:00:00 2001 From: bittoby Date: Mon, 2 Feb 2026 18:16:36 +0100 Subject: [PATCH 05/10] fix: improve fake-bold deduplication tests with specific assertions --- diagnose_fake_bold.py | 69 ---------------- .../partition/pdf_image/test_pdf.py | 24 ++++-- .../pdf_image/test_pdfminer_utils.py | 78 ++++++++++--------- 3 files changed, 59 insertions(+), 112 deletions(-) delete mode 100644 diagnose_fake_bold.py diff --git a/diagnose_fake_bold.py b/diagnose_fake_bold.py deleted file mode 100644 index 5c7107c1c1..0000000000 --- a/diagnose_fake_bold.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Diagnostic script to verify fake-bold PDF deduplication is working.""" -import os -from importlib import reload - -# Set environment variable BEFORE importing unstructured modules -os.environ["PDF_CHAR_DUPLICATE_THRESHOLD"] = "0" - -from unstructured.partition.pdf import partition_pdf -from unstructured.partition.utils import config as partition_config - -PDF_PATH = "example-docs/pdf/fake-bold-sample.pdf" - -print("=" * 70) -print("FAKE-BOLD PDF DIAGNOSTIC") -print("=" * 70) - -# Extract without deduplication -print(f"\n1. WITHOUT deduplication (threshold=0):") -print("-" * 50) - -elements_no_dedup = partition_pdf(filename=PDF_PATH, strategy="fast") -text_no_dedup = " ".join([el.text for el in elements_no_dedup]) - -print(f"Character count: {len(text_no_dedup)}") -print(f"First 200 chars:\n'{text_no_dedup[:200]}'") - -# Now reload with deduplication enabled -print(f"\n2. WITH deduplication (threshold=3.0):") -print("-" * 50) - -os.environ["PDF_CHAR_DUPLICATE_THRESHOLD"] = "3.0" -reload(partition_config) - -elements_with_dedup = partition_pdf(filename=PDF_PATH, strategy="fast") -text_with_dedup = " ".join([el.text for el in elements_with_dedup]) - -print(f"Character count: {len(text_with_dedup)}") -print(f"First 200 chars:\n'{text_with_dedup[:200]}'") - -# Compare -print("\n" + "=" * 70) -print("COMPARISON RESULTS:") -print("=" * 70) - -diff = len(text_no_dedup) - len(text_with_dedup) -print(f"Text length WITHOUT dedup: {len(text_no_dedup)} characters") -print(f"Text length WITH dedup: {len(text_with_dedup)} characters") -print(f"Difference: {diff} characters removed") - -if diff > 0: - reduction_pct = (diff / len(text_no_dedup)) * 100 - print(f"Reduction: {reduction_pct:.1f}%") - print("\n*** SUCCESS: Deduplication removed duplicate characters! ***") - print(" Your PDF has fake-bold text and the fix is working.") -elif diff == 0: - print("\n*** WARNING: No difference detected ***") - print(" Possible reasons:") - print(" 1. The PDF doesn't have fake-bold text (uses real font weight)") - print(" 2. The deduplication threshold may need adjustment") -else: - print("\n*** ERROR: Deduplicated text is LONGER (unexpected) ***") - -# Show specific differences if any -if text_no_dedup != text_with_dedup: - print("\n" + "-" * 50) - print("SAMPLE TEXT COMPARISON:") - print("-" * 50) - print(f"WITHOUT dedup (first 100): '{text_no_dedup[:100]}'") - print(f"WITH dedup (first 100): '{text_with_dedup[:100]}'") diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 49f1d8e9b4..d6897af1d9 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -445,31 +445,39 @@ def test_partition_pdf_with_fast_strategy_deduplicates_fake_bold(monkeypatch): """Test that fast strategy properly deduplicates fake-bold text in PDFs. Some PDFs create bold text by rendering each character twice at slightly offset - positions. The fast strategy should remove these duplicate characters. + positions (fake-bold). The fast strategy should remove these duplicate characters. """ filename = example_doc_path("pdf/fake-bold-sample.pdf") - # First, extract WITHOUT deduplication (threshold=0) + # Extract WITHOUT deduplication (threshold=0) - shows doubled characters monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") reload(partition_config) - elements_no_dedup = pdf.partition_pdf( filename=filename, strategy=PartitionStrategy.FAST ) text_no_dedup = " ".join([el.text for el in elements_no_dedup]) - # Then, extract WITH deduplication (threshold=3.0) + # Extract WITH deduplication (threshold=3.0) - shows clean text monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "3.0") reload(partition_config) - elements_with_dedup = pdf.partition_pdf( filename=filename, strategy=PartitionStrategy.FAST ) text_with_dedup = " ".join([el.text for el in elements_with_dedup]) - # Deduplicated text should be shorter or equal (if PDF has fake-bold text) - assert len(text_with_dedup) <= len(text_no_dedup), ( - f"Deduplicated text ({len(text_with_dedup)} chars) should not be longer " + # Verify fake-bold text shows doubled characters without deduplication + assert "BBOOLLDD" in text_no_dedup, ( + "Without deduplication, fake-bold text should show doubled chars like 'BBOOLLDD'" + ) + + # Verify deduplication produces clean text + assert "BOLD" in text_with_dedup, ( + "With deduplication, text should contain clean 'BOLD'" + ) + + # Verify deduplicated text is shorter + assert len(text_with_dedup) < len(text_no_dedup), ( + f"Deduplicated text ({len(text_with_dedup)} chars) should be shorter " f"than non-deduplicated text ({len(text_no_dedup)} chars)" ) diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index 19d6216ccf..43b14eacb3 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -213,72 +213,80 @@ def test_without_get_text(self): class TestFakeBoldPdfIntegration: - """Integration tests for fake-bold PDF deduplication using real PDF files.""" + """Integration tests for fake-bold PDF deduplication using real PDF files. - def test_fake_bold_pdf_deduplication_enabled(self): - """Test that fake-bold text is properly deduplicated from a real PDF. + The test PDF (fake-bold-sample.pdf) contains text rendered with the "fake bold" + technique where each character is drawn twice at slightly offset positions. + This causes text extraction to show doubled characters (e.g., "BBOOLLDD" instead + of "BOLD") unless deduplication is applied. + """ - Uses a PDF file that contains text rendered with the "fake bold" technique, - where each character is drawn twice at slightly offset positions. - With deduplication enabled (default), the extracted text should be clean. + def test_fake_bold_pdf_without_deduplication_shows_doubled_chars(self, monkeypatch): + """Test that extraction WITHOUT deduplication shows doubled characters. + + When PDF_CHAR_DUPLICATE_THRESHOLD is set to 0, deduplication is disabled + and the raw text shows the fake-bold doubled characters. """ - filename = example_doc_path("pdf/fake-bold-sample.pdf") + monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") + reload(partition_config) - # Extract with deduplication enabled (default, threshold=3.0) + filename = example_doc_path("pdf/fake-bold-sample.pdf") elements = partition_pdf(filename=filename, strategy="fast") - - # Combine all extracted text extracted_text = " ".join([el.text for el in elements]) - # Basic validation - text should be extracted successfully - assert len(elements) > 0, "Should extract elements from the PDF" + # Without deduplication, fake-bold text appears with doubled characters + assert "BBOOLLDD" in extracted_text, ( + "Without deduplication, fake-bold text should show doubled characters " + "like 'BBOOLLDD' instead of 'BOLD'" + ) - def test_fake_bold_pdf_deduplication_disabled(self, monkeypatch): - """Test PDF extraction with deduplication disabled shows raw text. + def test_fake_bold_pdf_with_deduplication_shows_clean_text(self, monkeypatch): + """Test that extraction WITH deduplication shows clean text. - When PDF_CHAR_DUPLICATE_THRESHOLD is set to 0, deduplication is disabled - and the raw text (potentially with doubled characters) should be visible. + When PDF_CHAR_DUPLICATE_THRESHOLD is set to default (3.0), deduplication + removes the duplicate characters and produces clean, readable text. """ - # Disable deduplication by setting threshold to 0 - monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") + monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "3.0") reload(partition_config) filename = example_doc_path("pdf/fake-bold-sample.pdf") - - # Extract with deduplication disabled elements = partition_pdf(filename=filename, strategy="fast") - - # Combine all extracted text extracted_text = " ".join([el.text for el in elements]) - # Text should still be extracted - assert len(extracted_text) > 0, "Should extract some text from the PDF" + # With deduplication, fake-bold text should be clean (no doubled chars) + assert "BOLD" in extracted_text, ( + "With deduplication, text should contain clean 'BOLD' not 'BBOOLLDD'" + ) + # Verify the doubled pattern is NOT present in the deduplicated fake-bold section + # Note: The PDF contains 'BBOOLLDD' as explanatory text, so we check for + # the specific pattern that would appear if deduplication failed on the + # fake-bold rendered text (e.g., "TTEEXXTT" from "TEXT") + assert "TTEEXXTT" not in extracted_text, ( + "With deduplication, fake-bold 'TEXT' should not appear as 'TTEEXXTT'" + ) def test_fake_bold_deduplication_reduces_text_length(self, monkeypatch): - """Test that deduplication actually reduces text length for fake-bold PDFs. + """Test that deduplication reduces text length for fake-bold PDFs. - If the PDF truly has fake-bold text, the deduplicated version should be - shorter than the non-deduplicated version. + Compares extraction with and without deduplication to verify that + the deduplicated text is shorter due to removal of duplicate characters. """ filename = example_doc_path("pdf/fake-bold-sample.pdf") - # First, extract WITHOUT deduplication + # Extract WITHOUT deduplication (threshold=0) monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") reload(partition_config) - elements_no_dedup = partition_pdf(filename=filename, strategy="fast") text_no_dedup = " ".join([el.text for el in elements_no_dedup]) - # Then, extract WITH deduplication (reset to default) + # Extract WITH deduplication (threshold=3.0) monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "3.0") reload(partition_config) - elements_with_dedup = partition_pdf(filename=filename, strategy="fast") text_with_dedup = " ".join([el.text for el in elements_with_dedup]) - # If the PDF has fake-bold text, deduplicated text should be shorter - # or at minimum the same length (if no duplicates were found) - assert len(text_with_dedup) <= len(text_no_dedup), ( - f"Deduplicated text ({len(text_with_dedup)} chars) should not be longer " + # Deduplicated text should be shorter than non-deduplicated text + assert len(text_with_dedup) < len(text_no_dedup), ( + f"Deduplicated text ({len(text_with_dedup)} chars) should be shorter " f"than non-deduplicated text ({len(text_no_dedup)} chars)" ) From 355e9255601f89600ed73afe9343c92afd7a2ad0 Mon Sep 17 00:00:00 2001 From: bittoby Date: Tue, 3 Feb 2026 18:48:14 +0100 Subject: [PATCH 06/10] fix: remove unused pytest import to pass ruff linter --- test_unstructured/partition/pdf_image/test_pdfminer_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index 43b14eacb3..bd9a33fa4e 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -1,7 +1,6 @@ from importlib import reload from unittest.mock import MagicMock -import pytest from pdfminer.layout import LTChar, LTContainer, LTTextLine from test_unstructured.unit_utils import example_doc_path From 14d1231dfcf2ccbc5fabe4c0e4a372d456a3358f Mon Sep 17 00:00:00 2001 From: bittoby Date: Thu, 5 Feb 2026 18:39:39 +0100 Subject: [PATCH 07/10] fix: black formatting violations in PDF test files for CI/CD compliance --- .../partition/pdf_image/test_pdf.py | 18 ++++++------------ .../partition/pdf_image/test_pdfminer_utils.py | 12 ++++++------ 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index d6897af1d9..09a3140d33 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -452,28 +452,22 @@ def test_partition_pdf_with_fast_strategy_deduplicates_fake_bold(monkeypatch): # Extract WITHOUT deduplication (threshold=0) - shows doubled characters monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "0") reload(partition_config) - elements_no_dedup = pdf.partition_pdf( - filename=filename, strategy=PartitionStrategy.FAST - ) + elements_no_dedup = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST) text_no_dedup = " ".join([el.text for el in elements_no_dedup]) # Extract WITH deduplication (threshold=3.0) - shows clean text monkeypatch.setenv("PDF_CHAR_DUPLICATE_THRESHOLD", "3.0") reload(partition_config) - elements_with_dedup = pdf.partition_pdf( - filename=filename, strategy=PartitionStrategy.FAST - ) + elements_with_dedup = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST) text_with_dedup = " ".join([el.text for el in elements_with_dedup]) # Verify fake-bold text shows doubled characters without deduplication - assert "BBOOLLDD" in text_no_dedup, ( - "Without deduplication, fake-bold text should show doubled chars like 'BBOOLLDD'" - ) + assert ( + "BBOOLLDD" in text_no_dedup + ), "Without deduplication, fake-bold text should show doubled chars like 'BBOOLLDD'" # Verify deduplication produces clean text - assert "BOLD" in text_with_dedup, ( - "With deduplication, text should contain clean 'BOLD'" - ) + assert "BOLD" in text_with_dedup, "With deduplication, text should contain clean 'BOLD'" # Verify deduplicated text is shorter assert len(text_with_dedup) < len(text_no_dedup), ( diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py index bd9a33fa4e..a033ab4d6c 100644 --- a/test_unstructured/partition/pdf_image/test_pdfminer_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdfminer_utils.py @@ -253,16 +253,16 @@ def test_fake_bold_pdf_with_deduplication_shows_clean_text(self, monkeypatch): extracted_text = " ".join([el.text for el in elements]) # With deduplication, fake-bold text should be clean (no doubled chars) - assert "BOLD" in extracted_text, ( - "With deduplication, text should contain clean 'BOLD' not 'BBOOLLDD'" - ) + assert ( + "BOLD" in extracted_text + ), "With deduplication, text should contain clean 'BOLD' not 'BBOOLLDD'" # Verify the doubled pattern is NOT present in the deduplicated fake-bold section # Note: The PDF contains 'BBOOLLDD' as explanatory text, so we check for # the specific pattern that would appear if deduplication failed on the # fake-bold rendered text (e.g., "TTEEXXTT" from "TEXT") - assert "TTEEXXTT" not in extracted_text, ( - "With deduplication, fake-bold 'TEXT' should not appear as 'TTEEXXTT'" - ) + assert ( + "TTEEXXTT" not in extracted_text + ), "With deduplication, fake-bold 'TEXT' should not appear as 'TTEEXXTT'" def test_fake_bold_deduplication_reduces_text_length(self, monkeypatch): """Test that deduplication reduces text length for fake-bold PDFs. From 80e27740b0f1dc26286d2c9fd44128ec9468206a Mon Sep 17 00:00:00 2001 From: bittoby Date: Fri, 6 Feb 2026 01:23:23 +0100 Subject: [PATCH 08/10] fix: Update code formatting and element ID to match new deterministric ID generation --- CHANGELOG.md | 3 +++ test_unstructured/partition/pdf_image/test_pdf.py | 8 ++++---- unstructured/__version__.py | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9ce8fb1fa..c5c6b0be51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,9 @@ ### Enhancements - increase the `PIL.Image.MAX_IMAGE_PIXELS` for pdf partition to accomodate higher dpi values +### Fixes +- **Fix Black formatting violations in PDF test files**: Corrected code formatting in `test_pdfminer_utils.py` and `test_pdf.py` to comply with Black style guidelines and updated expected element IDs in tests to match new deterministic ID generation + ## 0.18.34 ### Enhancements diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 09a3140d33..9b6fca48ea 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1382,12 +1382,12 @@ def expected_element_ids_for_fast_strategy(): "a90a54baba0093296a013d26b7acbc17", "9be424e2d151dac4b5f36a85e9bbfe65", "4631da875fb4996c63b2d80cea6b588e", - "6264f4eda97a049f4710f9bea0c01cbd", + "8cdb940788d2ed43523a5327292477a0", "abded7b2ff3a5542c88b4a831755ec24", "b781ea5123cb31e0571391b7b42cac75", "033f27d2618ba4cda9068b267b5a731e", "8982a12fcced30dd12ccbf61d14f30bf", - "41af2fd5df0cf47aa7e8ecca200d3ac6", + "0dfcc8870cf2aa54a0e780cb301b9c91", ] @@ -1398,12 +1398,12 @@ def expected_element_ids_for_hi_res_strategy(): "a90a54baba0093296a013d26b7acbc17", "9be424e2d151dac4b5f36a85e9bbfe65", "4631da875fb4996c63b2d80cea6b588e", - "6264f4eda97a049f4710f9bea0c01cbd", + "8cdb940788d2ed43523a5327292477a0", "abded7b2ff3a5542c88b4a831755ec24", "b781ea5123cb31e0571391b7b42cac75", "033f27d2618ba4cda9068b267b5a731e", "8982a12fcced30dd12ccbf61d14f30bf", - "41af2fd5df0cf47aa7e8ecca200d3ac6", + "0dfcc8870cf2aa54a0e780cb301b9c91", ] diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6f1bbf4ebc..59c20382fc 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.35-dev0" # pragma: no cover +__version__ = "0.18.35-dev1" # pragma: no cover From 68fc61c8119e65786ce0cb5c988311ece5899d6c Mon Sep 17 00:00:00 2001 From: bittoby Date: Fri, 6 Feb 2026 17:18:49 +0100 Subject: [PATCH 09/10] fix: Update CHANGELOG --- CHANGELOG.md | 9 ++++++++- unstructured/__version__.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5c6b0be51..cf26a7f8ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.18.36 + +### Enhancements +- Add character-level deduplication for PDF text extraction to handle fake-bold rendering + +### Fixes +- **Fix duplicate characters in PDF bold text extraction**: Some PDFs render bold text by drawing each character twice at slightly offset positions, causing text like "BOLD" to be extracted as "BBOOLLDD". Added character-level deduplication based on position proximity. Configurable via `PDF_CHAR_DUPLICATE_THRESHOLD` environment variable (default: 3.0 pixels, set to 0 to disable)(fixes #3864). + ## 0.18.35-dev0 ### Enhancements @@ -21,7 +29,6 @@ - **Add `group_elements_by_parent_id` utility function**: Groups elements by their `parent_id` metadata field for easier document hierarchy traversal (fixes #1489) ### Fixes -- **Fix duplicate characters in PDF bold text extraction**: Some PDFs render bold text by drawing each character twice at slightly offset positions, causing text like "BOLD" to be extracted as "BBOOLLDD". Added character-level deduplication based on position proximity. Configurable via `PDF_CHAR_DUPLICATE_THRESHOLD` environment variable (default: 3.0 pixels, set to 0 to disable)(fixes #3864). - **Preserve newlines in Table/TableChunk elements during PDF partitioning**: Skip whitespace normalization for Table and TableChunk elements so newlines that carry structural meaning (row separation) are preserved (fixes #3983) - Fix inconsistent pdf_image_dpi value in partition pdf with hi_res strategy diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 59c20382fc..5e4d481462 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.35-dev1" # pragma: no cover +__version__ = "0.18.36" # pragma: no cover From 0728ec0258460a51cf0ff4f312ca545337396ee3 Mon Sep 17 00:00:00 2001 From: bittoby Date: Fri, 6 Feb 2026 17:21:51 +0100 Subject: [PATCH 10/10] fix: recover origin 0.18.35 --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf26a7f8ae..c75852d49c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,9 +11,6 @@ ### Enhancements - increase the `PIL.Image.MAX_IMAGE_PIXELS` for pdf partition to accomodate higher dpi values -### Fixes -- **Fix Black formatting violations in PDF test files**: Corrected code formatting in `test_pdfminer_utils.py` and `test_pdf.py` to comply with Black style guidelines and updated expected element IDs in tests to match new deterministic ID generation - ## 0.18.34 ### Enhancements