1 change: 0 additions & 1 deletion demo/toxicity-alti-hb/ETOX/etox.py
@@ -446,7 +446,6 @@ def etox_paired_file_wrapper(
oldcolumns=True,
filetype=None,
):

"""
file loading/writing wrapper for the paired language toxicity evaluation function.

8 changes: 5 additions & 3 deletions demo/toxicity-alti-hb/analysis/00c_plot_toxicity_per_lang.py
@@ -43,9 +43,11 @@ def plot_toxicity_per_lang():
}
sorted_axes = sorted(list(axis_colors.keys()))
axis_display_names = {
axis: "Race and ethnicity"
if axis == "race_ethnicity"
else axis[0].upper() + axis[1:].replace("_", " ")
axis: (
"Race and ethnicity"
if axis == "race_ethnicity"
else axis[0].upper() + axis[1:].replace("_", " ")
)
for axis in sorted_axes
}
sorted_axis_names = sorted(list(axis_colors.keys()))
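This hunk is a pure reformatting change: newer Black releases wrap a multi-line conditional expression used as a comprehension value in its own parentheses instead of leaning on implicit line continuation. A standalone sketch of the same pattern, with invented axis names:

# Hypothetical axis names, purely to illustrate the formatting pattern in the hunk above.
sorted_axes = ["gender", "race_ethnicity", "religion"]
axis_display_names = {
    axis: (
        "Race and ethnicity"
        if axis == "race_ethnicity"
        else axis[0].upper() + axis[1:].replace("_", " ")
    )
    for axis in sorted_axes
}
print(axis_display_names)
# {'gender': 'Gender', 'race_ethnicity': 'Race and ethnicity', 'religion': 'Religion'}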
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -79,15 +79,15 @@ classifiers=[
"numba",
"transformers",
"openai-whisper==20230314",
"fairseq2==0.2.*",
"fairseq2>=0.5.*",
"sonar-space==0.2.*",
]
vocal_style_sim = [
"s3prl",
]
sonar_mining = [
"sonar-space==0.2.*",
"fairseq2==0.2.*",
"fairseq2>=0.5.*",
]
dev = [
# Test
1 change: 1 addition & 0 deletions stopes/core/jobs_registry/registry.py
@@ -14,6 +14,7 @@

logger = logging.getLogger("stopes.jobs")


################################################################################
# Registry Exceptions
################################################################################
4 changes: 3 additions & 1 deletion stopes/core/jobs_registry/submitit_slurm_job.py
@@ -284,7 +284,9 @@ def _convert_slurm_status_into_registry_job_status(

return job_status

except KeyError: # Entering this except block means slurm_status doesn't exist in submitit_state_to_registry_state_dict
except (
KeyError
): # Entering this except block means slurm_status doesn't exist in submitit_state_to_registry_state_dict
logger.warning(
f"Job with id: {job_id} has unrecognized slurm status: {slurm_status}. Please inspect and if suitable, add this status to the slurm_state_to_registry_state_map converter."
)
2 changes: 1 addition & 1 deletion stopes/core/tests/test_utils.py
@@ -188,7 +188,7 @@ async def test_semaphore():

# make sure that the semaphore blocks execution
ends.sort()
for (end1, end2) in zip(ends, ends[1:]):
for end1, end2 in zip(ends, ends[1:]):
t_diff = end2 - end1
assert (
t_diff.total_seconds() >= sleep_time
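The only change in this hunk is dropping the redundant parentheses around the unpacked loop target. The underlying idiom, pairing each timestamp with its successor via zip, is what the assertion relies on; a tiny self-contained illustration:

# zip(xs, xs[1:]) yields consecutive pairs, handy for checking gaps between neighbouring timestamps.
ends = [1.0, 2.5, 4.0, 7.5]
gaps = [later - earlier for earlier, later in zip(ends, ends[1:])]
print(gaps)  # [1.5, 1.5, 3.5]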
5 changes: 3 additions & 2 deletions stopes/eval/alti/alignment/align.py
@@ -6,7 +6,7 @@

# This code was adapted from the repository https://github.com/mt-upc/transformer-contributions-nmt by Javier Ferrando.

""" Various utilities for computing word attributions and word alignment quality metrics."""
"""Various utilities for computing word attributions and word alignment quality metrics."""

import itertools
import typing as tp
@@ -109,7 +109,8 @@ def compute_alignment_metrics(
sure: tp.List[tp.Set], possible: tp.List[tp.Set], hypothesis: tp.List[tp.Set]
) -> tp.Tuple[float, float, float]:
"""Compute average alignment rate, precision and recall for alignment.
Inputs are lists of alignments. All alignments are presented as sets of (tgt, src) pairs."""
Inputs are lists of alignments. All alignments are presented as sets of (tgt, src) pairs.
"""
sum_a_intersect_p, sum_a_intersect_s, sum_s, sum_a = 0, 0, 0, 0

for s, p, a in itertools.zip_longest(sure, possible, hypothesis):
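The accumulator names in this hunk (sum_a_intersect_p, sum_a_intersect_s, sum_s, sum_a) suggest the standard Och & Ney word-alignment metrics. A hedged sketch of those formulas for a single sentence pair, assuming the usual definitions (A = hypothesis, S = sure, P = possible, with S a subset of P); the repository aggregates over a corpus, which this sketch does not:

def alignment_scores(sure, possible, hypothesis):
    # Precision counts hypothesis links confirmed by P; recall counts sure links recovered by A.
    a_and_s = len(hypothesis & sure)
    a_and_p = len(hypothesis & possible)
    precision = a_and_p / len(hypothesis) if hypothesis else 1.0
    recall = a_and_s / len(sure) if sure else 1.0
    denom = len(hypothesis) + len(sure)
    aer = 1 - (a_and_s + a_and_p) / denom if denom else 0.0
    return precision, recall, aer

# Alignments are sets of (tgt, src) index pairs, as in the docstring above.
print(alignment_scores({(0, 0), (1, 1)}, {(0, 0), (1, 1), (2, 1)}, {(0, 0), (2, 1)}))
# (1.0, 0.5, 0.25)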
7 changes: 5 additions & 2 deletions stopes/eval/alti/alti_metrics/nllb_alti_detector.py
@@ -104,7 +104,8 @@ def load_bilingual_model(
@dataclasses.dataclass
class ALTIMetricsConfig:
"""The config indicating how to load sentence pairs, load the model,
compute the ALTI metrics with it, and save results. - to use with the `compute_nllb_alti` function."""
compute the ALTI metrics with it, and save results. - to use with the `compute_nllb_alti` function.
"""

# the model used to compute ALTI
is_multilingual: bool
@@ -118,7 +119,9 @@ class ALTIMetricsConfig:
Path
] # a .jsonl file with token-level contributions
# format and location of the source data
input_filename: Path # the source file with sources and translations; assumed to be .tsv
input_filename: (
Path # the source file with sources and translations; assumed to be .tsv
)
src_lang: str
tgt_lang: str
src_col: tp.Union[str, int] = "src"
3 changes: 2 additions & 1 deletion stopes/eval/alti/wrappers/utils.py
@@ -18,7 +18,8 @@

def spearmanr(x, y):
"""Compute Spearman rank's correlation bertween two attribution vectors.
https://github.com/samiraabnar/attention_flow/blob/master/compute_corel_distilbert_sst.py"""
https://github.com/samiraabnar/attention_flow/blob/master/compute_corel_distilbert_sst.py
"""

x = pd.Series(x)
y = pd.Series(y)
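For context on the helper above: Spearman's rank correlation is just Pearson correlation computed on ranks, which pandas makes a two-liner. A minimal sketch, not the repository's exact implementation:

import pandas as pd

def spearman_sketch(x, y):
    # Rank both vectors, then take the ordinary (Pearson) correlation of the ranks.
    return pd.Series(x).rank().corr(pd.Series(y).rank(), method="pearson")

print(spearman_sketch([1, 2, 3, 4], [10, 20, 40, 30]))  # ~0.8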
14 changes: 7 additions & 7 deletions stopes/eval/local_prosody/compare_utterances.py
@@ -124,7 +124,7 @@ def align_pauses(

duration_scores = []
alignment_scores = []
for (pauses, p2a) in [(pp_src, p2a_src), (pp_tgt, p2a_tgt)]:
for pauses, p2a in [(pp_src, p2a_src), (pp_tgt, p2a_tgt)]:
for prev_word_id, duration in pauses:
if prev_word_id not in p2a:
duration_scores.append(0.0)
@@ -353,12 +353,12 @@ def aggregate_pause_alignment_statistics(df: pd.DataFrame):
"mean_duration_score": df.duration_score.mean(),
"mean_alignment_score": df.alignment_score.mean(),
"mean_joint_score": joint_score.mean(),
"wmean_duration_score": (df.duration_score * w).sum() / w.sum()
if non_empty
else 1,
"wmean_alignment_score": (df.alignment_score * w).sum() / w.sum()
if non_empty
else 1,
"wmean_duration_score": (
(df.duration_score * w).sum() / w.sum() if non_empty else 1
),
"wmean_alignment_score": (
(df.alignment_score * w).sum() / w.sum() if non_empty else 1
),
"wmean_joint_score": (joint_score * w).sum() / w.sum() if non_empty else 1,
"total_weight": w.sum(),
"n_items": df.shape[0],
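The values above are plain weighted means with a fallback to 1 when there is nothing to weight. A tiny standalone version of the same computation, with invented scores and weights (the module derives its weight column elsewhere, above this hunk):

import pandas as pd

df = pd.DataFrame({"duration_score": [0.9, 0.5], "weight": [3.0, 1.0]})
w = df["weight"]
non_empty = w.sum() > 0

# Weighted mean of the per-item scores, defaulting to 1 when the total weight is zero.
wmean_duration_score = (df["duration_score"] * w).sum() / w.sum() if non_empty else 1
print(wmean_duration_score)  # ~0.8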
2 changes: 1 addition & 1 deletion stopes/eval/local_prosody/utterance.py
@@ -124,7 +124,7 @@ def get_text_with_markup(self, min_pause_duration=0.1, min_emph_score=0.5) -> st
parts = []
pause_durations = self.get_pauses_after_words(min_duration=min_pause_duration)
emphasis_scores = self.emphasis_scores or [0] * len(self.words)
for (word, pause, emph_score) in zip(
for word, pause, emph_score in zip(
self.words, pause_durations, emphasis_scores
):
if emph_score > min_emph_score:
16 changes: 10 additions & 6 deletions stopes/modules/bitext/indexing/merge_faiss_indexes.py
@@ -118,12 +118,16 @@ def checkpoint(
return submitit.helpers.DelayedSubmission(
MergeFAISSIndexesModule(
config=self.config,
checkpoint_part=self.partial_merge_file
if self.config.enable_checkpointing
else None,
checkpoint_file_idx=self.checkpoint_file_idx
if self.config.enable_checkpointing
else None,
checkpoint_part=(
self.partial_merge_file
if self.config.enable_checkpointing
else None
),
checkpoint_file_idx=(
self.checkpoint_file_idx
if self.config.enable_checkpointing
else None
),
),
*args,
**kwargs,
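The rewrap above is cosmetic, but the surrounding method is the interesting part: submitit calls a module's checkpoint() on preemption or timeout, and whatever DelayedSubmission it returns is requeued in its place, which is how the partial-merge state survives. A minimal sketch of that protocol with invented names (not stopes code):

import submitit

class Accumulate:
    """Toy checkpointable job: sums a list of numbers, remembering how far it got."""

    def __init__(self, values, done=0, partial=0.0):
        self.values, self.done, self.partial = values, done, partial

    def __call__(self):
        for i in range(self.done, len(self.values)):
            self.partial += self.values[i]
            self.done = i + 1
        return self.partial

    def checkpoint(self, *args, **kwargs):
        # Requeue a fresh copy that resumes from the saved position instead of restarting.
        resumed = Accumulate(self.values, done=self.done, partial=self.partial)
        return submitit.helpers.DelayedSubmission(resumed, *args, **kwargs)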
8 changes: 5 additions & 3 deletions stopes/modules/bitext/indexing/populate_faiss_index.py
@@ -202,9 +202,11 @@ def checkpoint(
return submitit.helpers.DelayedSubmission(
PopulateFAISSIndexModule(
config=self.config,
checkpoint_summary=self.checkpoint_summary
if self.config.enable_checkpointing
else None,
checkpoint_summary=(
self.checkpoint_summary
if self.config.enable_checkpointing
else None
),
),
*args,
**kwargs,
3 changes: 2 additions & 1 deletion stopes/modules/bitext/mining/merge_shards.py
@@ -33,7 +33,8 @@ class MergeShardsConfig:

class ShardForMerge:
"""Represent an input shard opened for merge.
Both input and output shards are in decreasing order of match scores, and this object helps manage that."""
Both input and output shards are in decreasing order of match scores, and this object helps manage that.
"""

def __init__(self, text_path: Path, meta_path: tp.Optional[Path]):
self.text_path = text_path
7 changes: 4 additions & 3 deletions stopes/modules/evaluation/generate_multi_bleu_detok_module.py
@@ -234,9 +234,10 @@ def process_output_ref_hyp_file(
return_file = Path(f"{fairseq_generate_output_file}.{file_type}")
desired_line_prefix = "T" if file_type == "ref" else "H"
desired_col_number = 1 if file_type == "ref" else 2
with open(fairseq_generate_output_file, "r", encoding="utf-8") as read_file, open(
return_file, "w", encoding="utf-8"
) as write_file:
with (
open(fairseq_generate_output_file, "r", encoding="utf-8") as read_file,
open(return_file, "w", encoding="utf-8") as write_file,
):
for line in read_file:
if line.startswith(desired_line_prefix):
line = line.rstrip("\n")
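This hunk switches to the parenthesized multi-manager with statement, officially part of the grammar since Python 3.10 and emitted by recent Black when the configured target versions allow it. A small runnable sketch of the same pattern, with invented file names and contents:

from pathlib import Path

Path("generate-out.txt").write_text("H-0\t-0.1\tsome hypothesis\nS-0\tsource\n", encoding="utf-8")

# One context manager per line, no backslashes and no nested with blocks.
with (
    open("generate-out.txt", encoding="utf-8") as read_file,
    open("hyp.txt", "w", encoding="utf-8") as write_file,
):
    for line in read_file:
        if line.startswith("H"):
            write_file.write(line)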
2 changes: 1 addition & 1 deletion stopes/modules/nmt_bitext_eval_utils.py
@@ -205,7 +205,7 @@ def concat_public_bitext(self, output: Path, tgt: bool) -> int:
"""
lines = 0
with utils.open(output, "wb") as o:
for (src_file, tgt_file) in self.public_bitext:
for src_file, tgt_file in self.public_bitext:
with utils.open(tgt_file if tgt else src_file, "rb") as f:
for line in f:
lines += 1
1 change: 0 additions & 1 deletion stopes/modules/preprocess/bitext_processor.py
@@ -49,7 +49,6 @@ def __init__(
def process_lines(
self, dataset_reader: tp.Generator[DatasetLine, None, None]
) -> None:

"""
process a batch of lines from two files and writes them to two output_files the way you want.
The input are two iterators of lines with their line number in the input file
2 changes: 1 addition & 1 deletion stopes/modules/preprocess/fairseq_binarizer_encoder.py
@@ -87,7 +87,7 @@ def __enter__(self):

def process_lines(self, lines_with_number: tp.Iterator[tp.Tuple[int, str]]) -> None:
summary = BinarizeSummary()
for (_, s) in lines_with_number:
for _, s in lines_with_number:
self.dataset_builder.add_item(self.binarizer.binarize_line(s, summary))
self.summary.merge(summary)
log.info(self.summary)
6 changes: 3 additions & 3 deletions stopes/modules/preprocess/laser_sentence_encoder.py
@@ -306,9 +306,9 @@ def combine_bidir(outs):
return {
"sentemb": sentemb,
"encoder_out": (x, final_hiddens, final_cells),
"encoder_padding_mask": encoder_padding_mask
if encoder_padding_mask.any()
else None,
"encoder_padding_mask": (
encoder_padding_mask if encoder_padding_mask.any() else None
),
}


Original file line number Diff line number Diff line change
@@ -111,7 +111,7 @@ def __enter__(self):

def process_lines(self, lines_with_number: tp.Iterator[tp.Tuple[int, str]]) -> None:
summary = BinarizeSummary()
for (_, s) in lines_with_number:
for _, s in lines_with_number:
self.dataset_builder.add_item(self.binarizer.binarize_line(s, summary))
self.summary.merge(summary)
logger.info(self.summary)
6 changes: 4 additions & 2 deletions stopes/modules/preprocess/sonar_text_embedding.py
@@ -14,7 +14,7 @@
import pandas as pd
import pyarrow as pa
import torch
from fairseq2.assets.error import AssetError
from fairseq2.error import OperationalError
from retrying import retry
from sonar.inference_pipelines.text import (
EmbeddingToTextModelPipeline,
@@ -37,7 +37,9 @@
from stopes.utils.sharding.parquet_shards import ParquetOutputConfig

fairse2_asset_loading_retry = retry(
retry_on_exception=lambda exception: isinstance(exception, (AssetError, IOError)),
retry_on_exception=lambda exception: isinstance(
exception, (OperationalError, IOError)
),
stop_max_attempt_number=20,
wait_random_min=1000,
wait_random_max=30_000,
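Besides moving from fairseq2.assets.error.AssetError to fairseq2.error.OperationalError, in step with the fairseq2 upgrade elsewhere in this PR, the hunk shows how the retrying library is parameterized: retry(...) called with only keyword arguments returns a reusable decorator. A hedged sketch of applying such a wrapper to a flaky loader; the function name and exception choice are illustrative, not stopes code:

from retrying import retry

def _is_transient(exc):
    # Retry only on errors that are plausibly transient (busy asset store, network hiccup).
    return isinstance(exc, (IOError,))

flaky_load_retry = retry(
    retry_on_exception=_is_transient,
    stop_max_attempt_number=5,  # give up after five attempts
    wait_random_min=1_000,      # sleep a random 1-5 seconds between attempts (milliseconds)
    wait_random_max=5_000,
)

@flaky_load_retry
def load_asset(name: str):
    ...  # e.g. fetch a model card or checkpoint from shared storage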
5 changes: 4 additions & 1 deletion stopes/modules/preprocess/uromanize_cli_module.py
@@ -73,7 +73,10 @@ def run_uroman_cli_standalone(input_file: Path, output_file: Path, lang: str) ->
def uromanize(text: tp.List[str]) -> tp.List[str]:
if text is None or len(text) == 0:
return []
with tempfile.NamedTemporaryFile() as input_file, tempfile.NamedTemporaryFile() as output_file:
with (
tempfile.NamedTemporaryFile() as input_file,
tempfile.NamedTemporaryFile() as output_file,
):
with open(input_file.name, "w") as f:
for sentence in text:
f.write(f"{sentence}\n")
3 changes: 2 additions & 1 deletion stopes/modules/preprocess/wav2vec_laser_speech_encoder.py
@@ -54,7 +54,8 @@ def __init__(
@property
def fbank_features(self) -> int:
"""Number of fbank features to feed to the encoder (0 if instead of fbank it expects a raw waveform).
This parameter is defined based on the architecture of the underlying encoder."""
This parameter is defined based on the architecture of the underlying encoder.
"""
return self.encoder_cfg.task.get("fbank_features", 0)

def _encode_batch(self, source, padding_mask):
8 changes: 5 additions & 3 deletions stopes/modules/speech/utils.py
@@ -261,9 +261,11 @@ def auto_parse_line(line: str, sampling_factor: tp.Optional[int] = None) -> Line
columns = line.split("\t")
return LineResult(
columns=[
auto_parse(column, sampling_factor)
if i == 0
else parse_audio_or_text(column, sampling_factor)
(
auto_parse(column, sampling_factor)
if i == 0
else parse_audio_or_text(column, sampling_factor)
)
for (i, column) in enumerate(columns)
]
)
5 changes: 3 additions & 2 deletions stopes/modules/speech/video_alignement/video_segmentor.py
@@ -115,7 +115,6 @@ class WhisperSegmentorConfig:


class WhisperSegmentorModule(StopesModule):

"""Extract utterances from an audio and embed them
This module is multi-lingual : different languages can be processed with the same pipeline

@@ -382,7 +381,9 @@ def compute_whisper_segmentation(
logger.info(
f"Starting Whisper segmentation on wav of length {round(len(wav) / self.SR / 60, 3)} minuntes in lang = {lang}"
)
with tempfile.TemporaryDirectory() as data_gym_cache: # attempt to fixe loading issue
with (
tempfile.TemporaryDirectory() as data_gym_cache
): # attempt to fixe loading issue
os.environ["DATA_GYM_CACHE_DIR"] = str(data_gym_cache)
whisper_model = self._load_whisper(self.config.whisper_model).cuda()
wav = wav.cpu()
14 changes: 8 additions & 6 deletions stopes/modules/tests/test_split_merge_langs.py
@@ -42,9 +42,10 @@ async def test_split_with_meta(tmp_path: Path):
)
input_shards.append(text_name)
input_metas.append(meta_name)
with gzip.open(text_name, mode="wt") as f_text, gzip.open(
meta_name, mode="wt"
) as f_meta:
with (
gzip.open(text_name, mode="wt") as f_text,
gzip.open(meta_name, mode="wt") as f_meta,
):
for line_id in range(shard_size):
print("text", i, line_id, file=f_text)
print("meta", i, line_id, file=f_meta)
@@ -164,9 +165,10 @@ async def test_merge_bitext(tmp_path: Path):
tmp_path / f"bimeta_{j}.tsv.gz",
)
inputs.append((text_name, meta_name))
with utils.open(text_name, mode="wt") as f_text, utils.open(
meta_name, mode="wt"
) as f_meta:
with (
utils.open(text_name, mode="wt") as f_text,
utils.open(meta_name, mode="wt") as f_meta,
):
unique_texts = [
(f"unique_text_en_{j}_{i}", f"unique_text_fr_{j}_{i}")
for i in range(50)