Merge remote-tracking branch 'origin/main' into huvu/nemo_data_designer

Huy Vu2 · Huy Vu2 · commit 656bbaffbafc · 2026-02-12T11:05:33.000-08:00
diff --git a/nemo_curator/stages/audio/common.py b/nemo_curator/stages/audio/common.py
@@ -32,7 +32,11 @@ class LegacySpeechStage(ProcessingStage[Task, Task]):
     def process(self, task: AudioBatch) -> list[Task]:
         result = []
         for entry in task.data:
-            result.extend(self.process_dataset_entry(entry))
+            entries = self.process_dataset_entry(entry)
+            for r in entries:
+                if r is not task and not r._stage_perf:
+                    r._stage_perf = list(task._stage_perf)
+            result.extend(entries)
         return result
 
     @abstractmethod
@@ -54,6 +58,7 @@ class GetAudioDurationStage(LegacySpeechStage):
         All the same fields as in the input manifest plus duration_key
     """
 
+    name = "GetAudioDurationStage"
     audio_filepath_key: str
     duration_key: str
 
@@ -80,14 +85,14 @@ class PreserveByValueStage(LegacySpeechStage):
 
     """
 
+    name = "PreserveByValueStage"
+
     def __init__(
         self,
         input_value_key: str,
         target_value: int | str,
         operator: str = "eq",
-        **kwargs,
     ):
-        super().__init__(**kwargs)
         self.input_value_key = input_value_key
         self.target_value = target_value
         if operator == "lt":
diff --git a/nemo_curator/stages/audio/inference/asr_nemo.py b/nemo_curator/stages/audio/inference/asr_nemo.py
@@ -148,4 +148,5 @@ def process(self, task: FileGroupTask | DocumentBatch | AudioBatch) -> AudioBatc
             dataset_name=f"{self.model_name}_inference",
             filepath_key=self.filepath_key,
             data=audio_items,
+            _stage_perf=task._stage_perf,
         )
diff --git a/nemo_curator/stages/audio/io/convert.py b/nemo_curator/stages/audio/io/convert.py
@@ -24,11 +24,14 @@ class AudioToDocumentStage(ProcessingStage[AudioBatch, DocumentBatch]):
 
     """
 
+    name = "AudioToDocumentStage"
+
     def process(self, task: AudioBatch) -> list[DocumentBatch]:
         return [
             DocumentBatch(
                 data=pd.DataFrame(task.data),
                 task_id=task.task_id,
                 dataset_name=task.dataset_name,
+                _stage_perf=task._stage_perf,
             )
         ]
diff --git a/nemo_curator/stages/audio/metrics/get_wer.py b/nemo_curator/stages/audio/metrics/get_wer.py
@@ -62,6 +62,7 @@ class GetPairwiseWerStage(LegacySpeechStage):
          The same data as in the input manifest with wer_key and corresponding values.
     """
 
+    name = "GetPairwiseWerStage"
     text_key: str = "text"
     pred_text_key: str = "pred_text"
     wer_key: str = "wer"

Original file line number	Diff line number	Diff line change
`@@ -148,4 +148,5 @@ def process(self, task: FileGroupTask \| DocumentBatch \| AudioBatch) -> AudioBatc`
`148`	`148`	`dataset_name=f"{self.model_name}_inference",`
`149`	`149`	`filepath_key=self.filepath_key,`
`150`	`150`	`data=audio_items,`
	`151`	`+ _stage_perf=task._stage_perf,`
`151`	`152`	`)`
Original file line number	Diff line number	Diff line change
`@@ -24,11 +24,14 @@ class AudioToDocumentStage(ProcessingStage[AudioBatch, DocumentBatch]):`
`24`	`24`
`25`	`25`	`"""`
`26`	`26`
	`27`	`+ name = "AudioToDocumentStage"`
	`28`	`+`
`27`	`29`	`def process(self, task: AudioBatch) -> list[DocumentBatch]:`
`28`	`30`	`return [`
`29`	`31`	`DocumentBatch(`
`30`	`32`	`data=pd.DataFrame(task.data),`
`31`	`33`	`task_id=task.task_id,`
`32`	`34`	`dataset_name=task.dataset_name,`
	`35`	`+ _stage_perf=task._stage_perf,`
`33`	`36`	`)`
`34`	`37`	`]`