Skip to content

Commit f7ce64d

Browse files
reiffd7 and claude authored
Add collimate algorithm and Otsu improvements to OCR stitch block (#1936)
* Add collimate algorithm and Otsu improvements to OCR stitch block
  - Add collimate algorithm for skewed/curved text using greedy parent-child traversal
  - Fix vertical orientation bug and word spacing in collimate implementation
  - Add algorithm-specific parameters (collimate_tolerance, otsu_threshold_multiplier)
  - Update LONG_DESCRIPTION to document all three algorithms
  - Add comprehensive unit tests for all algorithms
  - Add integration tests for workflow execution
* Change stitch_ocr_detections v2 to use @v2 type identifier
  - Update v2.py to use "roboflow_core/stitch_ocr_detections@v2" type
  - Load both v1 and v2 blocks in loader.py so @v1 workflows still work
  - Update v2 unit tests and integration tests to use @v2 identifier
  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
* removed ocr stitch v1 from loader
* Update existing OCR stitch tests to use v2 parameter structure
  Add stitching_algorithm field to test_workflow_with_easy_ocr.py and test_workflow_with_ocr_detections_stitching.py to work with the v2 block.
  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
* added default for the stitching algorithm
* load stitch v1, right version
---------
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent e436ae8 commit f7ce64d

File tree

8 files changed

+1863
-3
lines changed

8 files changed

+1863
-3
lines changed

inference/core/version.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.64.4"
1+
__version__ = "0.64.5"
22

33

44
if __name__ == "__main__":

inference/core/workflows/core_steps/loader.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -433,6 +433,9 @@
433433
from inference.core.workflows.core_steps.transformations.stitch_ocr_detections.v1 import (
434434
StitchOCRDetectionsBlockV1,
435435
)
436+
from inference.core.workflows.core_steps.transformations.stitch_ocr_detections.v2 import (
437+
StitchOCRDetectionsBlockV2,
438+
)
436439

437440
# Visualizers
438441
from inference.core.workflows.core_steps.visualizations.background_color.v1 import (
@@ -777,6 +780,7 @@ def load_blocks() -> List[Type[WorkflowBlock]]:
777780
StabilityAIOutpaintingBlockV1,
778781
StabilizeTrackedDetectionsBlockV1,
779782
StitchImagesBlockV1,
783+
StitchOCRDetectionsBlockV2,
780784
StitchOCRDetectionsBlockV1,
781785
TemplateMatchingBlockV1,
782786
TimeInZoneBlockV1,

inference/core/workflows/core_steps/transformations/stitch_ocr_detections/v2.py

Lines changed: 885 additions & 0 deletions
Large diffs are not rendered by default.

tests/workflows/integration_tests/execution/test_workflow_with_easy_ocr.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,9 +24,10 @@
2424
"character_set": "English",
2525
},
2626
{
27-
"type": "roboflow_core/stitch_ocr_detections@v1",
27+
"type": "roboflow_core/stitch_ocr_detections@v2",
2828
"name": "detections_stitch",
2929
"predictions": "$steps.easy_ocr.predictions",
30+
"stitching_algorithm": "tolerance",
3031
"reading_direction": "left_to_right",
3132
"tolerance": "$inputs.tolerance",
3233
},

tests/workflows/integration_tests/execution/test_workflow_with_ocr_detections_stitching.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,9 +29,10 @@
2929
"confidence": "$inputs.confidence",
3030
},
3131
{
32-
"type": "roboflow_core/stitch_ocr_detections@v1",
32+
"type": "roboflow_core/stitch_ocr_detections@v2",
3333
"name": "detections_stitch",
3434
"predictions": "$steps.ocr_detection.predictions",
35+
"stitching_algorithm": "tolerance",
3536
"reading_direction": "left_to_right",
3637
"tolerance": "$inputs.tolerance",
3738
},
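(File path header missing from this extraction; the hunk below is a newly added integration test module for `roboflow_core/stitch_ocr_detections@v2` — exact path to be confirmed against the commit.)

Lines changed: 331 additions & 0 deletions
Original file line number | Diff line number | Diff line change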
@@ -0,0 +1,331 @@
1+
import numpy as np
2+
import pytest
3+
4+
from inference.core.env import WORKFLOWS_MAX_CONCURRENT_STEPS
5+
from inference.core.managers.base import ModelManager
6+
from inference.core.workflows.core_steps.common.entities import StepExecutionMode
7+
from inference.core.workflows.execution_engine.core import ExecutionEngine
8+
from tests.workflows.integration_tests.execution.workflows_gallery_collector.decorators import (
9+
add_to_workflows_gallery,
10+
)
11+
12+
WORKFLOW_STITCHING_OCR_DETECTIONS_TOLERANCE = {
13+
"version": "1.0",
14+
"inputs": [
15+
{"type": "WorkflowImage", "name": "image"},
16+
{
17+
"type": "WorkflowParameter",
18+
"name": "model_id",
19+
"default_value": "ocr-oy9a7/1",
20+
},
21+
{"type": "WorkflowParameter", "name": "tolerance", "default_value": 10},
22+
{"type": "WorkflowParameter", "name": "confidence", "default_value": 0.4},
23+
],
24+
"steps": [
25+
{
26+
"type": "roboflow_core/roboflow_object_detection_model@v2",
27+
"name": "ocr_detection",
28+
"image": "$inputs.image",
29+
"model_id": "$inputs.model_id",
30+
"confidence": "$inputs.confidence",
31+
},
32+
{
33+
"type": "roboflow_core/stitch_ocr_detections@v2",
34+
"name": "detections_stitch",
35+
"predictions": "$steps.ocr_detection.predictions",
36+
"stitching_algorithm": "tolerance",
37+
"reading_direction": "left_to_right",
38+
"tolerance": "$inputs.tolerance",
39+
},
40+
],
41+
"outputs": [
42+
{
43+
"type": "JsonField",
44+
"name": "ocr_text",
45+
"selector": "$steps.detections_stitch.ocr_text",
46+
},
47+
],
48+
}
49+
50+
WORKFLOW_STITCHING_OCR_DETECTIONS_OTSU = {
51+
"version": "1.0",
52+
"inputs": [
53+
{"type": "WorkflowImage", "name": "image"},
54+
{
55+
"type": "WorkflowParameter",
56+
"name": "model_id",
57+
"default_value": "ocr-oy9a7/1",
58+
},
59+
{"type": "WorkflowParameter", "name": "confidence", "default_value": 0.4},
60+
{
61+
"type": "WorkflowParameter",
62+
"name": "threshold_multiplier",
63+
"default_value": 1.0,
64+
},
65+
],
66+
"steps": [
67+
{
68+
"type": "roboflow_core/roboflow_object_detection_model@v2",
69+
"name": "ocr_detection",
70+
"image": "$inputs.image",
71+
"model_id": "$inputs.model_id",
72+
"confidence": "$inputs.confidence",
73+
},
74+
{
75+
"type": "roboflow_core/stitch_ocr_detections@v2",
76+
"name": "detections_stitch",
77+
"predictions": "$steps.ocr_detection.predictions",
78+
"stitching_algorithm": "otsu",
79+
"reading_direction": "left_to_right",
80+
"otsu_threshold_multiplier": "$inputs.threshold_multiplier",
81+
},
82+
],
83+
"outputs": [
84+
{
85+
"type": "JsonField",
86+
"name": "ocr_text",
87+
"selector": "$steps.detections_stitch.ocr_text",
88+
},
89+
],
90+
}
91+
92+
WORKFLOW_STITCHING_OCR_DETECTIONS_COLLIMATE = {
93+
"version": "1.0",
94+
"inputs": [
95+
{"type": "WorkflowImage", "name": "image"},
96+
{
97+
"type": "WorkflowParameter",
98+
"name": "model_id",
99+
"default_value": "ocr-oy9a7/1",
100+
},
101+
{"type": "WorkflowParameter", "name": "confidence", "default_value": 0.4},
102+
{
103+
"type": "WorkflowParameter",
104+
"name": "collimate_tolerance",
105+
"default_value": 10,
106+
},
107+
],
108+
"steps": [
109+
{
110+
"type": "roboflow_core/roboflow_object_detection_model@v2",
111+
"name": "ocr_detection",
112+
"image": "$inputs.image",
113+
"model_id": "$inputs.model_id",
114+
"confidence": "$inputs.confidence",
115+
},
116+
{
117+
"type": "roboflow_core/stitch_ocr_detections@v2",
118+
"name": "detections_stitch",
119+
"predictions": "$steps.ocr_detection.predictions",
120+
"stitching_algorithm": "collimate",
121+
"reading_direction": "left_to_right",
122+
"collimate_tolerance": "$inputs.collimate_tolerance",
123+
},
124+
],
125+
"outputs": [
126+
{
127+
"type": "JsonField",
128+
"name": "ocr_text",
129+
"selector": "$steps.detections_stitch.ocr_text",
130+
},
131+
],
132+
}
133+
134+
135+
@add_to_workflows_gallery(
136+
category="Workflows for OCR",
137+
use_case_title="Workflow with model detecting individual characters and text stitching (tolerance algorithm)",
138+
use_case_description="""
139+
This workflow extracts and organizes text from an image using OCR with the tolerance-based stitching algorithm.
140+
It detects individual characters or words and their positions, then groups nearby text into lines based on a
141+
specified pixel `tolerance` for spacing and arranges them in reading order (`left-to-right`).
142+
143+
The tolerance algorithm is best for consistent font sizes and well-aligned horizontal/vertical text.
144+
""",
145+
workflow_definition=WORKFLOW_STITCHING_OCR_DETECTIONS_TOLERANCE,
146+
workflow_name_in_app="ocr-detections-stitch-v2-tolerance",
147+
)
148+
def test_ocr_stitching_v2_tolerance_algorithm(
149+
model_manager: ModelManager,
150+
multi_line_text_image: np.ndarray,
151+
roboflow_api_key: str,
152+
) -> None:
153+
# given
154+
workflow_init_parameters = {
155+
"workflows_core.model_manager": model_manager,
156+
"workflows_core.api_key": roboflow_api_key,
157+
"workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
158+
}
159+
execution_engine = ExecutionEngine.init(
160+
workflow_definition=WORKFLOW_STITCHING_OCR_DETECTIONS_TOLERANCE,
161+
init_parameters=workflow_init_parameters,
162+
max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
163+
)
164+
165+
# when
166+
result = execution_engine.run(
167+
runtime_parameters={
168+
"image": multi_line_text_image,
169+
"tolerance": 20,
170+
"confidence": 0.6,
171+
}
172+
)
173+
174+
assert isinstance(result, list), "Expected list to be delivered"
175+
assert len(result) == 1, "Expected 1 element in the output for one input image"
176+
assert set(result[0].keys()) == {
177+
"ocr_text",
178+
}, "Expected all declared outputs to be delivered"
179+
assert result[0]["ocr_text"] == "MAKE\nTHISDAY\nGREAT"
180+
181+
182+
@add_to_workflows_gallery(
183+
category="Workflows for OCR",
184+
use_case_title="Workflow with model detecting individual characters and text stitching (Otsu algorithm)",
185+
use_case_description="""
186+
This workflow extracts and organizes text from an image using OCR with the Otsu thresholding algorithm.
187+
It detects individual characters and uses Otsu's method on normalized gap distances to automatically find
188+
the optimal threshold separating character gaps from word gaps.
189+
190+
The Otsu algorithm is resolution-invariant and works well with variable font sizes and automatic word
191+
boundary detection. It detects bimodal distributions to distinguish single words from multi-word text.
192+
""",
193+
workflow_definition=WORKFLOW_STITCHING_OCR_DETECTIONS_OTSU,
194+
workflow_name_in_app="ocr-detections-stitch-v2-otsu",
195+
)
196+
def test_ocr_stitching_v2_otsu_algorithm(
197+
model_manager: ModelManager,
198+
multi_line_text_image: np.ndarray,
199+
roboflow_api_key: str,
200+
) -> None:
201+
# given
202+
workflow_init_parameters = {
203+
"workflows_core.model_manager": model_manager,
204+
"workflows_core.api_key": roboflow_api_key,
205+
"workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
206+
}
207+
execution_engine = ExecutionEngine.init(
208+
workflow_definition=WORKFLOW_STITCHING_OCR_DETECTIONS_OTSU,
209+
init_parameters=workflow_init_parameters,
210+
max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
211+
)
212+
213+
# when
214+
result = execution_engine.run(
215+
runtime_parameters={
216+
"image": multi_line_text_image,
217+
"confidence": 0.6,
218+
"threshold_multiplier": 1.0,
219+
}
220+
)
221+
222+
assert isinstance(result, list), "Expected list to be delivered"
223+
assert len(result) == 1, "Expected 1 element in the output for one input image"
224+
assert set(result[0].keys()) == {
225+
"ocr_text",
226+
}, "Expected all declared outputs to be delivered"
227+
# Otsu may insert spaces between words if it detects bimodal distribution
228+
assert isinstance(result[0]["ocr_text"], str)
229+
assert len(result[0]["ocr_text"]) > 0
230+
231+
232+
@add_to_workflows_gallery(
233+
category="Workflows for OCR",
234+
use_case_title="Workflow with model detecting individual characters and text stitching (collimate algorithm)",
235+
use_case_description="""
236+
This workflow extracts and organizes text from an image using OCR with the collimate algorithm.
237+
It detects individual characters and uses greedy parent-child traversal to follow text flow,
238+
building lines through traversal rather than bucketing.
239+
240+
The collimate algorithm is best for skewed, curved, or non-axis-aligned text where traditional
241+
bucket-based line grouping may fail.
242+
""",
243+
workflow_definition=WORKFLOW_STITCHING_OCR_DETECTIONS_COLLIMATE,
244+
workflow_name_in_app="ocr-detections-stitch-v2-collimate",
245+
)
246+
def test_ocr_stitching_v2_collimate_algorithm(
247+
model_manager: ModelManager,
248+
multi_line_text_image: np.ndarray,
249+
roboflow_api_key: str,
250+
) -> None:
251+
# given
252+
workflow_init_parameters = {
253+
"workflows_core.model_manager": model_manager,
254+
"workflows_core.api_key": roboflow_api_key,
255+
"workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
256+
}
257+
execution_engine = ExecutionEngine.init(
258+
workflow_definition=WORKFLOW_STITCHING_OCR_DETECTIONS_COLLIMATE,
259+
init_parameters=workflow_init_parameters,
260+
max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
261+
)
262+
263+
# when
264+
result = execution_engine.run(
265+
runtime_parameters={
266+
"image": multi_line_text_image,
267+
"confidence": 0.6,
268+
"collimate_tolerance": 15,
269+
}
270+
)
271+
272+
assert isinstance(result, list), "Expected list to be delivered"
273+
assert len(result) == 1, "Expected 1 element in the output for one input image"
274+
assert set(result[0].keys()) == {
275+
"ocr_text",
276+
}, "Expected all declared outputs to be delivered"
277+
assert isinstance(result[0]["ocr_text"], str)
278+
assert len(result[0]["ocr_text"]) > 0
279+
280+
281+
@pytest.mark.parametrize(
282+
"algorithm,workflow_definition",
283+
[
284+
("tolerance", WORKFLOW_STITCHING_OCR_DETECTIONS_TOLERANCE),
285+
("otsu", WORKFLOW_STITCHING_OCR_DETECTIONS_OTSU),
286+
("collimate", WORKFLOW_STITCHING_OCR_DETECTIONS_COLLIMATE),
287+
],
288+
)
289+
def test_ocr_stitching_v2_all_algorithms_produce_output(
290+
model_manager: ModelManager,
291+
multi_line_text_image: np.ndarray,
292+
roboflow_api_key: str,
293+
algorithm: str,
294+
workflow_definition: dict,
295+
) -> None:
296+
"""Test that all stitching algorithms produce valid output."""
297+
# given
298+
workflow_init_parameters = {
299+
"workflows_core.model_manager": model_manager,
300+
"workflows_core.api_key": roboflow_api_key,
301+
"workflows_core.step_execution_mode": StepExecutionMode.LOCAL,
302+
}
303+
execution_engine = ExecutionEngine.init(
304+
workflow_definition=workflow_definition,
305+
init_parameters=workflow_init_parameters,
306+
max_concurrent_steps=WORKFLOWS_MAX_CONCURRENT_STEPS,
307+
)
308+
309+
# when
310+
runtime_params = {
311+
"image": multi_line_text_image,
312+
"confidence": 0.6,
313+
}
314+
if algorithm == "tolerance":
315+
runtime_params["tolerance"] = 20
316+
elif algorithm == "otsu":
317+
runtime_params["threshold_multiplier"] = 1.0
318+
elif algorithm == "collimate":
319+
runtime_params["collimate_tolerance"] = 15
320+
321+
result = execution_engine.run(runtime_parameters=runtime_params)
322+
323+
# then
324+
assert isinstance(result, list), "Expected list to be delivered"
325+
assert len(result) == 1, "Expected 1 element in the output for one input image"
326+
assert "ocr_text" in result[0], "Expected ocr_text in output"
327+
assert isinstance(result[0]["ocr_text"], str), "Expected string output"
328+
# All algorithms should detect some text
329+
assert (
330+
len(result[0]["ocr_text"]) > 0
331+
), f"Algorithm {algorithm} produced empty output"

tests/workflows/unit_tests/core_steps/transformations/test_stitch_ocr_detections.py renamed to tests/workflows/unit_tests/core_steps/transformations/test_stitch_ocr_detections_v1.py

File renamed without changes.

0 commit comments

Comments
 (0)