Skip to content

Commit fcb6e6f

Browse files
fc
Signed-off-by: Praateek <praateekm@gmail.com>
1 parent f8d9d04 commit fcb6e6f

File tree

2 files changed

+298
-2
lines changed

2 files changed

+298
-2
lines changed

benchmarking/nightly-benchmark.yaml

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ datasets:
5959
formats:
6060
- type: "bin"
6161
path: "{model_weights_path}/fasttext/lid.176.bin"
62+
- name: "gretel_symptoms"
63+
formats:
64+
- type: "jsonl"
65+
path: "{datasets_path}/gretel_symptoms"
6266

6367
default_timeout_s: 7200
6468

@@ -470,7 +474,7 @@ entries:
470474
exact_value: 3800
471475
- metric: throughput_images_per_sec
472476
min_value: 3.0
473-
477+
474478
- name: audio_fleurs
475479
enabled: true
476480
script: audio_fleurs_benchmark.py
@@ -561,7 +565,7 @@ entries:
561565
exact_value: 1400
562566
- metric: throughput_clips_per_sec
563567
min_value: 4.0
564-
568+
565569
- name: video_transcoding
566570
enabled: true
567571
script: video_pipeline_benchmark.py
@@ -634,3 +638,27 @@ entries:
634638
exact_value: 113
635639
- metric: throughput_clips_per_sec
636640
min_value: 0.25
641+
642+
- name: ndd_nvidia_nim
643+
enabled: true
644+
script: ndd_benchmark.py
645+
args: >-
646+
--benchmark-results-path={session_entry_dir}
647+
--input-path={dataset:gretel_symptoms,jsonl}
648+
--output-path={session_entry_dir}/scratch/output
649+
--model-type=nvidia-nim
650+
--model-id=openai/gpt-oss-20b
651+
--executor=ray_data
652+
timeout_s: 3600
653+
ray:
654+
num_cpus: 8
655+
num_gpus: 0
656+
enable_object_spilling: false
657+
sink_data:
658+
- name: slack
659+
additional_metrics:
660+
- input_row_count
661+
- output_row_count
662+
- input_total_chars
663+
- output_total_chars
664+
- throughput_rows_per_sec
Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# ruff: noqa: PLR0913
16+
17+
"""NeMo Data Designer (NDD) benchmarking script.
18+
19+
Benchmarks synthetic data generation via NDD using the NVIDIA NIM cloud API.
20+
21+
Usage from the benchmarking orchestrator (run.py) -- see ndd.yaml for the
22+
full configuration. Can also be run standalone:
23+
24+
python ndd_benchmark.py \
25+
--benchmark-results-path /tmp/results \
26+
--input-path ./data/ndd \
27+
--output-path /tmp/ndd_output \
28+
--model-type nvidia-nim \
29+
--model-id openai/gpt-oss-20b \
30+
--executor ray_data
31+
"""
32+
33+
import argparse
34+
import os
35+
import time
36+
from pathlib import Path
37+
from typing import Any
38+
39+
import data_designer.config as dd
40+
from loguru import logger
41+
from utils import setup_executor, write_benchmark_results
42+
43+
from nemo_curator.pipeline import Pipeline
44+
from nemo_curator.stages.synthetic.nemo_data_designer.data_designer import DataDesignerStage
45+
from nemo_curator.stages.text.io.reader.jsonl import JsonlReader
46+
from nemo_curator.stages.text.io.writer.jsonl import JsonlWriter
47+
from nemo_curator.tasks.utils import TaskPerfUtils
48+
from nemo_curator.utils.file_utils import get_all_file_paths_under
49+
50+
# ---------------------------------------------------------------------------
51+
# Data Designer config builder
52+
# ---------------------------------------------------------------------------
53+
54+
55+
def _build_config(model_id: str) -> dd.DataDesignerConfigBuilder:
    """Assemble the DataDesigner configuration for the medical-notes task.

    Seeds fake patient/doctor identities, derives visit dates from a sampled
    symptom-onset date, and finishes with an LLM column that writes the
    physician's free-text notes using *model_id* via the NVIDIA provider.
    """
    alias = model_id

    builder = dd.DataDesignerConfigBuilder(
        model_configs=[
            dd.ModelConfig(
                alias=alias,
                model=model_id,
                provider="nvidia",
                skip_health_check=False,
                inference_parameters=dd.ChatCompletionInferenceParams(
                    temperature=1.0,
                    top_p=1.0,
                    max_tokens=2048,
                ),
            ),
        ]
    )

    # -- Sampler columns ------------------------------------------------
    # Fake person identities for the patient and the attending doctor.
    for person_column in ("patient_sampler", "doctor_sampler"):
        builder.add_column(
            dd.SamplerColumnConfig(
                name=person_column,
                sampler_type=dd.SamplerType.PERSON_FROM_FAKER,
                params=dd.PersonFromFakerSamplerParams(),
            )
        )
    builder.add_column(
        dd.SamplerColumnConfig(
            name="patient_id",
            sampler_type=dd.SamplerType.UUID,
            params=dd.UUIDSamplerParams(prefix="PT-", short_form=True, uppercase=True),
        )
    )

    # -- Expression columns ---------------------------------------------
    # Scalar fields projected out of the patient sampler.
    for column_name, template in (
        ("first_name", "{{ patient_sampler.first_name}}"),
        ("last_name", "{{ patient_sampler.last_name }}"),
        ("dob", "{{ patient_sampler.birth_date }}"),
    ):
        builder.add_column(dd.ExpressionColumnConfig(name=column_name, expr=template))
    builder.add_column(
        dd.SamplerColumnConfig(
            name="symptom_onset_date",
            sampler_type=dd.SamplerType.DATETIME,
            params=dd.DatetimeSamplerParams(start="2024-01-01", end="2024-12-31"),
        )
    )
    # Visit happens 1-30 days after symptom onset.
    builder.add_column(
        dd.SamplerColumnConfig(
            name="date_of_visit",
            sampler_type=dd.SamplerType.TIMEDELTA,
            params=dd.TimeDeltaSamplerParams(dt_min=1, dt_max=30, reference_column_name="symptom_onset_date"),
        )
    )
    builder.add_column(dd.ExpressionColumnConfig(name="physician", expr="Dr. {{ doctor_sampler.last_name }}"))

    # -- LLM column -----------------------------------------------------
    # The generated notes; {{ output_text }} / {{ input_text }} come from the
    # seed JSONL rows read upstream of the DataDesigner stage.
    builder.add_column(
        dd.LLMTextColumnConfig(
            name="physician_notes",
            prompt="""\
You are a primary-care physician who just had an appointment with {{ first_name }} {{ last_name }},
who has been struggling with symptoms from {{ output_text }} since {{ symptom_onset_date }}.
The date of today's visit is {{ date_of_visit }}.

{{ input_text }}

Write careful notes about your visit with {{ first_name }},
as Dr. {{ doctor_sampler.first_name }} {{ doctor_sampler.last_name }}.

Format the notes as a busy doctor might.
Respond with only the notes, no other text.
""",
            model_alias=alias,
        )
    )

    return builder
140+
141+
142+
# ---------------------------------------------------------------------------
143+
# Benchmark runner
144+
# ---------------------------------------------------------------------------
145+
146+
147+
def run_ndd_benchmark(
    model_type: str,
    model_id: str,
    input_path: str,
    output_path: str,
    executor: str,
    num_files: int | None,
    **kwargs,  # noqa: ARG001
) -> dict[str, Any]:
    """Run the NDD benchmark and collect metrics.

    Args:
        model_type: Model serving backend (currently only ``"nvidia-nim"``).
        model_id: Model identifier handed to the DataDesigner config.
        input_path: Directory containing the JSONL seed files.
        output_path: Directory the generated JSONL output is written to
            (created if missing).
        executor: Executor name understood by ``setup_executor``.
        num_files: Optional cap on the number of input files; ``None`` or a
            non-positive value means all files.
        **kwargs: Ignored; absorbs extra CLI args forwarded by the orchestrator.

    Returns:
        Dict with a ``metrics`` sub-dict (row/char counts, throughput, timing)
        and the raw ``tasks`` list returned by the pipeline.

    Raises:
        OSError: If ``NVIDIA_API_KEY`` is not set for the nvidia-nim backend.
    """
    # Fail fast on missing credentials before creating directories or
    # scanning the input tree, so a misconfigured run has no side effects.
    if model_type == "nvidia-nim" and not os.environ.get("NVIDIA_API_KEY"):
        msg = "NVIDIA_API_KEY must be set for nvidia-nim model type"
        raise OSError(msg)

    # Keep the str parameters intact; bind Path views to separate names so
    # the annotations stay truthful.
    input_dir = Path(input_path)
    output_dir = Path(output_path).absolute()
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"Model type: {model_type}")
    logger.info(f"Model ID: {model_id}")
    logger.info(f"Input path: {input_dir}")
    logger.info(f"Output path: {output_dir}")
    logger.info(f"Executor: {executor}")

    # Resolve input files using Curator utility
    input_files = get_all_file_paths_under(str(input_dir), keep_extensions="jsonl")
    if num_files is not None and num_files > 0:
        logger.info(f"Using {num_files} of {len(input_files)} input files")
        input_files = input_files[:num_files]

    # -- Build config and run pipeline ----------------------------------
    config_builder = _build_config(model_id)
    executor_obj = setup_executor(executor)

    pipeline = Pipeline(
        name="ndd_benchmark_pipeline",
        stages=[
            JsonlReader(file_paths=input_files, fields=["output_text", "input_text"]),
            DataDesignerStage(config_builder=config_builder),
            JsonlWriter(path=str(output_dir)),
        ],
    )

    logger.info("Starting NDD pipeline...")
    run_start_time = time.perf_counter()
    output_tasks = pipeline.run(executor_obj)
    run_time_taken = time.perf_counter() - run_start_time

    # -- Post-run: extract metrics from _stage_perf ----------------------
    # Aggregates the "custom" per-task counters emitted by the NDD stage.
    ndd_metrics = TaskPerfUtils.aggregate_task_metrics(output_tasks, prefix="custom")
    input_row_count = int(ndd_metrics["num_input_records"])
    input_total_chars = int(ndd_metrics["input_total_chars"])
    # TODO: add this to data_designer.py
    output_row_count = int(ndd_metrics["num_output_records"])
    output_total_chars = int(ndd_metrics["output_total_chars"])
    # Guard against a zero-duration run to avoid ZeroDivisionError.
    throughput_rows_per_sec = output_row_count / run_time_taken if run_time_taken > 0 else 0

    logger.success(f"NDD benchmark completed in {run_time_taken:.2f}s")
    logger.success(f"Input: {input_row_count} rows, {input_total_chars:,} chars")
    logger.success(f"Output: {output_row_count} rows, {output_total_chars:,} chars")
    logger.success(f"Throughput: {throughput_rows_per_sec:.2f} rows/sec")

    return {
        "metrics": {
            "is_success": True,
            "time_taken_s": run_time_taken,
            "model_type": model_type,
            "model_id": model_id,
            "input_row_count": input_row_count,
            "input_total_chars": input_total_chars,
            "output_row_count": output_row_count,
            "output_total_chars": output_total_chars,
            "throughput_rows_per_sec": throughput_rows_per_sec,
            "num_files": num_files or "all",
        },
        "tasks": output_tasks,
    }
226+
227+
228+
# ---------------------------------------------------------------------------
229+
# CLI
230+
# ---------------------------------------------------------------------------
231+
232+
233+
def main() -> int:
    """CLI entry point: parse args, run the benchmark, persist results.

    Returns:
        0 on success, 1 on failure. The results dict (including a failure
        record with ``is_success: False``) is always written to
        ``--benchmark-results-path``, even when the run raises.
    """
    parser = argparse.ArgumentParser(description="NeMo Data Designer (NDD) benchmark")
    parser.add_argument("--benchmark-results-path", required=True, help="Path to write benchmark results")
    parser.add_argument("--input-path", required=True, help="Path to input JSONL seed data")
    parser.add_argument("--output-path", required=True, help="Path to write generated output")
    parser.add_argument(
        "--model-type",
        required=True,
        choices=["nvidia-nim"],
        help="Model serving backend",
    )
    parser.add_argument("--model-id", default="openai/gpt-oss-20b", help="Model identifier")
    parser.add_argument("--executor", default="ray_data", choices=["ray_data", "xenna"], help="Pipeline executor")
    parser.add_argument("--num-files", type=int, default=None, help="Limit number of input files (default: all)")

    args = parser.parse_args()

    logger.info("=== NDD Benchmark Starting ===")
    logger.info(f"Arguments: {vars(args)}")

    success_code = 1
    result_dict: dict[str, Any] = {
        "params": vars(args),
        "metrics": {"is_success": False},
        "tasks": [],
    }
    try:
        result_dict.update(run_ndd_benchmark(**vars(args)))
        success_code = 0 if result_dict["metrics"]["is_success"] else 1
    except Exception:  # noqa: BLE001 - top-level boundary: record failure instead of crashing with a traceback
        logger.exception("NDD benchmark failed")
        success_code = 1
    finally:
        # Always persist results so the orchestrator sees the failure record.
        write_benchmark_results(result_dict, args.benchmark_results_path)
    return success_code


if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)