Skip to content

Commit 751f631

Browse files
committed
feat: add processor plugin system
Adds support for third-party processor plugins via plugin discovery: - PluginType.PROCESSOR for external processor plugins - ProcessorRegistry discovers and loads processor plugins - processor_types.py with plugin-injected type union - PluginRegistry uses RLock for nested imports Demo processors: - RegexFilterProcessor (preprocess stage) - SemanticDedupProcessor (postprocess stage)
1 parent ec54289 commit 751f631

File tree

27 files changed

+898
-12
lines changed

27 files changed

+898
-12
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Data Designer Demo Processors
2+
3+
Example processor plugins for Data Designer that demonstrate the PRE_GENERATION and POST_GENERATION stages.
4+
5+
## Installation
6+
7+
```bash
8+
uv pip install -e demo/data_designer_demo_processors
9+
```
10+
11+
## Processors
12+
13+
### RegexFilterProcessor (PRE_GENERATION)
14+
15+
Filters seed data rows based on regex pattern matching.
16+
17+
```python
18+
from data_designer.config.config_builder import DataDesignerConfigBuilder
19+
from data_designer_demo_processors.regex_filter import RegexFilterProcessorConfig
20+
21+
builder = DataDesignerConfigBuilder(model_configs=[...])
22+
builder.add_processor(RegexFilterProcessorConfig(
23+
name="filter_emails",
24+
column="email",
25+
pattern=r"@company\.com$",
26+
invert=False, # Keep only matching rows
27+
))
28+
```
29+
30+
### SemanticDedupProcessor (POST_GENERATION)
31+
32+
Removes semantically similar rows using sentence embeddings.
33+
34+
```python
35+
from data_designer_demo_processors.semantic_dedup import SemanticDedupProcessorConfig
36+
37+
builder.add_processor(SemanticDedupProcessorConfig(
38+
name="dedup_responses",
39+
column="response",
40+
similarity_threshold=0.9, # Remove rows with >90% similarity
41+
model_name="all-MiniLM-L6-v2",
42+
))
43+
```
44+
45+
## Pre-downloading the Embedding Model
46+
47+
The semantic dedup processor downloads the embedding model on first use. To pre-download:
48+
49+
```bash
50+
download-semantic-dedup-model
51+
```
52+
53+
## Entry Points
54+
55+
The package registers plugins via entry points:
56+
57+
```toml
58+
[project.entry-points."data_designer.plugins"]
59+
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
60+
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"
61+
```
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Demo: Processor Plugins with PRE_GENERATION and POST_GENERATION stages.
5+
6+
This notebook demonstrates:
7+
1. RegexFilterProcessor (PRE_GENERATION) - filters seed data before generation
8+
2. SemanticDedupProcessor (POST_GENERATION) - deduplicates final dataset
9+
10+
Run the cells delimited by `# %%` markers in VS Code or PyCharm.
11+
"""
12+
13+
# %% Imports
14+
import tempfile
15+
from pathlib import Path
16+
17+
import pandas as pd
18+
from data_designer_demo_processors.regex_filter import RegexFilterProcessorConfig
19+
from data_designer_demo_processors.semantic_dedup import SemanticDedupProcessorConfig
20+
21+
import data_designer.config as dd
22+
from data_designer.interface import DataDesigner
23+
24+
# %% Create seed data with some rows we want to filter out
# One (topic, difficulty) record per seed row; the two "SPAM:" topics are the
# rows the regex filter configured later in this script is meant to drop.
seed_records = [
    ("Python programming", "beginner"),
    ("Machine learning", "advanced"),
    ("SPAM: Buy now!", "N/A"),  # Will be filtered by regex
    ("Data science", "intermediate"),
    ("SPAM: Click here", "N/A"),  # Will be filtered by regex
    ("Natural language processing", "advanced"),
    ("Computer vision", "advanced"),
]
seed_data = pd.DataFrame(seed_records, columns=["topic", "difficulty"])

print("Seed data before PRE_GENERATION filtering:")
print(seed_data)
print(f"Total rows: {len(seed_data)}")
43+
44+
# %% Setup temporary directory and save seed data
# NOTE: tempfile.mkdtemp() leaves cleanup to the caller, and nothing in the visible
# script removes this directory; delete it manually once the demo is finished.
output_dir = Path(tempfile.mkdtemp())
seed_path = output_dir / "seed.parquet"
# index=False keeps the DataFrame index out of the written parquet file.
seed_data.to_parquet(seed_path, index=False)
48+
49+
# %% Build the Data Designer configuration (uses default openai-text model)
config_builder = dd.DataDesignerConfigBuilder()

# Seed dataset: the parquet file written in the previous cell
seed_source = dd.LocalFileSeedSource(path=str(seed_path))
config_builder.with_seed_dataset(seed_source)

# LLM column that generates an explanation from the seed columns
explanation_prompt = """Write a brief one-sentence explanation of the topic: {{ topic }}
Difficulty level: {{ difficulty }}

Keep it concise and educational."""
explanation_column = dd.LLMTextColumnConfig(
    name="explanation",
    prompt=explanation_prompt,
    model_alias="openai-text",
)
config_builder.add_column(explanation_column)

# PRE_GENERATION processor: drop the spam rows from the seed data
spam_filter = RegexFilterProcessorConfig(
    name="filter_spam",
    column="topic",
    pattern=r"^SPAM:",
    invert=True,  # Keep rows that do NOT match (i.e., filter out spam)
)
config_builder.add_processor(spam_filter)

# POST_GENERATION processor: deduplicate similar explanations
explanation_dedup = SemanticDedupProcessorConfig(
    name="dedup_explanations",
    column="explanation",
    similarity_threshold=0.85,
)
config_builder.add_processor(explanation_dedup)

print("Configuration created successfully!")
processor_configs = config_builder.get_processor_configs()
print(f"Processors configured: {[p.name for p in processor_configs]}")
89+
90+
# %% Run preview to test with a few records
data_designer = DataDesigner()

print("\nRunning preview (3 records)...")
preview = data_designer.preview(config_builder, num_records=3)

# Header and dataset are emitted on separate lines, as two print calls would do
print("\nPreview dataset:", preview.dataset, sep="\n")

# %% Run full generation
print("\nRunning full generation...")
generation_options = {"num_records": 5, "dataset_name": "processor-demo"}
results = data_designer.create(config_builder, **generation_options)

# Load the final dataset
final_dataset = results.load_dataset()

print("\nFinal dataset after all processors:", final_dataset, sep="\n")
final_row_count = len(final_dataset)
print(f"\nTotal rows in final dataset: {final_row_count}")
113+
114+
# %% Summary
# Recap of the row counts at each stage of the demo
divider = "=" * 60
print(f"\n{divider}")
print("DEMO SUMMARY")
print(divider)

seed_row_count = len(seed_data)
print(f"Original seed rows: {seed_row_count}")
print("After PRE_GENERATION (regex filter): Expected ~5 rows (SPAM removed)")
deduped_row_count = len(final_dataset)
print(f"After POST_GENERATION (semantic dedup): {deduped_row_count} rows")
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
[project]
2+
name = "data-designer-demo-processors"
3+
version = "0.1.0"
4+
description = "Demo processor plugins for Data Designer showing PRE_GENERATION and POST_GENERATION stages"
5+
readme = "README.md"
6+
requires-python = ">=3.11"
7+
dependencies = [
8+
"data-designer-config",
9+
"data-designer-engine",
10+
"sentence-transformers>=2.2.0",
11+
]
12+
13+
[project.entry-points."data_designer.plugins"]
14+
regex-filter = "data_designer_demo_processors.regex_filter.plugin:regex_filter_plugin"
15+
semantic-dedup = "data_designer_demo_processors.semantic_dedup.plugin:semantic_dedup_plugin"
16+
17+
[project.scripts]
18+
download-semantic-dedup-model = "data_designer_demo_processors.download_model:main"
19+
20+
[build-system]
21+
requires = ["hatchling"]
22+
build-backend = "hatchling.build"
23+
24+
[tool.hatch.build.targets.wheel]
25+
packages = ["src/data_designer_demo_processors"]
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Demo processor plugins for Data Designer."""
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Pre-download the semantic dedup embedding model."""
5+
6+
DEFAULT_MODEL = "all-MiniLM-L6-v2"


def main(model_name: str = DEFAULT_MODEL) -> None:
    """Download an embedding model to the local cache.

    Args:
        model_name: Name of the sentence-transformers model to fetch. Defaults to
            ``DEFAULT_MODEL``, so calling ``main()`` with no arguments (as the
            ``__main__`` guard below does) behaves exactly as before.
    """
    # Imported lazily so that importing this module does not load
    # sentence-transformers until a download is actually requested.
    from sentence_transformers import SentenceTransformer

    print(f"Downloading model: {model_name}")
    # Constructing the model is what fetches it; the instance itself is not needed.
    SentenceTransformer(model_name)
    print("Model downloaded successfully!")


if __name__ == "__main__":
    main()
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig
5+
from data_designer_demo_processors.regex_filter.impl import RegexFilterProcessor
6+
7+
__all__ = ["RegexFilterProcessorConfig", "RegexFilterProcessor"]
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from typing import Literal
5+
6+
from pydantic import Field
7+
8+
from data_designer.config.processors import ProcessorConfig
9+
10+
11+
class RegexFilterProcessorConfig(ProcessorConfig):
    """Configuration for the regex-based row filter.

    Rows are kept or dropped depending on whether the value in ``column``
    matches ``pattern``. The processor filters seed data during the
    preprocess stage.
    """

    # Discriminator value identifying this processor type
    processor_type: Literal["regex-filter"] = "regex-filter"
    # Name of the column whose values are tested against the pattern
    column: str = Field(description="Column to apply regex filter on")
    # Regular expression applied to each value of the column
    pattern: str = Field(description="Regex pattern to match")
    # False keeps matching rows; True keeps the rows that do not match
    invert: bool = Field(False, description="If True, keep rows that do NOT match")
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from __future__ import annotations
5+
6+
import logging
7+
import re
8+
from typing import TYPE_CHECKING
9+
10+
from data_designer.engine.processing.processors.base import Processor
11+
from data_designer_demo_processors.regex_filter.config import RegexFilterProcessorConfig
12+
13+
if TYPE_CHECKING:
14+
import pandas as pd
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
class RegexFilterProcessor(Processor[RegexFilterProcessorConfig]):
    """Filters rows based on regex matching on a specified column.

    Runs during preprocess to filter seed data before generation.
    """

    def preprocess(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return ``data`` restricted to the rows selected by the configured regex.

        Args:
            data: Dataset to filter. Values of the configured column are converted
                with ``astype(str)`` before matching.

        Returns:
            A DataFrame with a fresh 0-based index holding the rows whose value
            matches the pattern (or, when ``invert`` is set, the rows that do not).
            If the configured column is missing, ``data`` is returned unchanged.

        Raises:
            re.error: If the configured pattern is not a valid regular expression.
        """
        column = self.config.column
        pattern = self.config.pattern
        invert = self.config.invert

        if column not in data.columns:
            logger.warning(f"⚠️ Column '{column}' not found in dataset. Skipping regex filter.")
            return data

        compiled = re.compile(pattern)
        # search() matches anywhere in the value; anchor the pattern for stricter matching.
        matches = data[column].astype(str).apply(lambda value: compiled.search(value) is not None)
        # Force a boolean dtype: on an empty dataset the element-wise apply does not
        # produce a bool Series, and a non-boolean key passed to data[...] would be
        # treated as column labels instead of a row mask.
        keep = matches.astype(bool)
        if invert:
            keep = ~keep

        original_count = len(data)
        filtered = data[keep].reset_index(drop=True)
        removed_count = original_count - len(filtered)

        # Report the removed count and state explicitly which kind of rows survived,
        # so the number cannot be misread as the count of kept rows.
        kept_kind = "non-matching" if invert else "matching"
        logger.info(
            f"🔍 Regex filter: removed {removed_count} of {original_count} rows, "
            f"keeping {kept_kind} rows (pattern: {pattern!r} on column '{column}')"
        )

        return filtered
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from data_designer.plugins.plugin import Plugin, PluginType
5+
6+
# Plugin descriptor for the regex filter processor; the package's entry-point
# configuration refers to this object by name.
_PACKAGE = "data_designer_demo_processors.regex_filter"

regex_filter_plugin = Plugin(
    plugin_type=PluginType.PROCESSOR,
    config_qualified_name=f"{_PACKAGE}.config.RegexFilterProcessorConfig",
    impl_qualified_name=f"{_PACKAGE}.impl.RegexFilterProcessor",
)
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from data_designer_demo_processors.semantic_dedup.config import SemanticDedupProcessorConfig
5+
from data_designer_demo_processors.semantic_dedup.impl import SemanticDedupProcessor
6+
7+
__all__ = ["SemanticDedupProcessorConfig", "SemanticDedupProcessor"]

0 commit comments

Comments
 (0)