1 change: 0 additions & 1 deletion demo/toxicity-alti-hb/ETOX/etox.py
@@ -446,7 +446,6 @@ def etox_paired_file_wrapper(
oldcolumns=True,
filetype=None,
):

"""
file loading/writing wrapper for the paired language toxicity evaluation function.

8 changes: 5 additions & 3 deletions demo/toxicity-alti-hb/analysis/00c_plot_toxicity_per_lang.py
@@ -43,9 +43,11 @@ def plot_toxicity_per_lang():
}
sorted_axes = sorted(list(axis_colors.keys()))
axis_display_names = {
axis: "Race and ethnicity"
if axis == "race_ethnicity"
else axis[0].upper() + axis[1:].replace("_", " ")
axis: (
"Race and ethnicity"
if axis == "race_ethnicity"
else axis[0].upper() + axis[1:].replace("_", " ")
)
for axis in sorted_axes
}
sorted_axis_names = sorted(list(axis_colors.keys()))
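This hunk is a pure reformatting change: newer Black releases wrap a multi-line conditional expression used as a comprehension value in its own parentheses instead of leaning on implicit line continuation. A standalone sketch of the same pattern, with invented axis names:

# Hypothetical axis names, purely to illustrate the formatting pattern in the hunk above.
sorted_axes = ["gender", "race_ethnicity", "religion"]
axis_display_names = {
    axis: (
        "Race and ethnicity"
        if axis == "race_ethnicity"
        else axis[0].upper() + axis[1:].replace("_", " ")
    )
    for axis in sorted_axes
}
print(axis_display_names)
# {'gender': 'Gender', 'race_ethnicity': 'Race and ethnicity', 'religion': 'Religion'}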
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -79,15 +79,15 @@ classifiers=[
"numba",
"transformers",
"openai-whisper==20230314",
"fairseq2==0.2.*",
"fairseq2>=0.5.*",
"sonar-space==0.2.*",
]
vocal_style_sim = [
"s3prl",
]
sonar_mining = [
"sonar-space==0.2.*",
"fairseq2==0.2.*",
"fairseq2>=0.5.*",
]
dev = [
# Test
1 change: 1 addition & 0 deletions stopes/core/jobs_registry/registry.py
@@ -14,6 +14,7 @@

logger = logging.getLogger("stopes.jobs")


################################################################################
# Registry Exceptions
################################################################################
4 changes: 3 additions & 1 deletion stopes/core/jobs_registry/submitit_slurm_job.py
@@ -284,7 +284,9 @@ def _convert_slurm_status_into_registry_job_status(

return job_status

except KeyError: # Entering this except block means slurm_status doesn't exist in submitit_state_to_registry_state_dict
except (
KeyError
): # Entering this except block means slurm_status doesn't exist in submitit_state_to_registry_state_dict
logger.warning(
f"Job with id: {job_id} has unrecognized slurm status: {slurm_status}. Please inspect and if suitable, add this status to the slurm_state_to_registry_state_map converter."
)
2 changes: 1 addition & 1 deletion stopes/core/tests/test_utils.py
@@ -188,7 +188,7 @@ async def test_semaphore():

# make sure that the semaphore blocks execution
ends.sort()
for (end1, end2) in zip(ends, ends[1:]):
for end1, end2 in zip(ends, ends[1:]):
t_diff = end2 - end1
assert (
t_diff.total_seconds() >= sleep_time
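The only change in this hunk is dropping the redundant parentheses around the unpacked loop target. The underlying idiom, pairing each timestamp with its successor via zip, is what the assertion relies on; a tiny self-contained illustration:

# zip(xs, xs[1:]) yields consecutive pairs, handy for checking gaps between neighbouring timestamps.
ends = [1.0, 2.5, 4.0, 7.5]
gaps = [later - earlier for earlier, later in zip(ends, ends[1:])]
print(gaps)  # [1.5, 1.5, 3.5]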
5 changes: 3 additions & 2 deletions stopes/eval/alti/alignment/align.py
@@ -6,7 +6,7 @@

# This code was adapted from the repository https://github.com/mt-upc/transformer-contributions-nmt by Javier Ferrando.

""" Various utilities for computing word attributions and word alignment quality metrics."""
"""Various utilities for computing word attributions and word alignment quality metrics."""

import itertools
import typing as tp
@@ -109,7 +109,8 @@ def compute_alignment_metrics(
sure: tp.List[tp.Set], possible: tp.List[tp.Set], hypothesis: tp.List[tp.Set]
) -> tp.Tuple[float, float, float]:
"""Compute average alignment rate, precision and recall for alignment.
Inputs are lists of alignments. All alignments are presented as sets of (tgt, src) pairs."""
Inputs are lists of alignments. All alignments are presented as sets of (tgt, src) pairs.
"""
sum_a_intersect_p, sum_a_intersect_s, sum_s, sum_a = 0, 0, 0, 0

for s, p, a in itertools.zip_longest(sure, possible, hypothesis):
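The accumulator names in this hunk (sum_a_intersect_p, sum_a_intersect_s, sum_s, sum_a) suggest the standard Och & Ney word-alignment metrics. A hedged sketch of those formulas for a single sentence pair, assuming the usual definitions (A = hypothesis, S = sure, P = possible, with S a subset of P); the repository aggregates over a corpus, which this sketch does not:

def alignment_scores(sure, possible, hypothesis):
    # Precision counts hypothesis links confirmed by P; recall counts sure links recovered by A.
    a_and_s = len(hypothesis & sure)
    a_and_p = len(hypothesis & possible)
    precision = a_and_p / len(hypothesis) if hypothesis else 1.0
    recall = a_and_s / len(sure) if sure else 1.0
    denom = len(hypothesis) + len(sure)
    aer = 1 - (a_and_s + a_and_p) / denom if denom else 0.0
    return precision, recall, aer

# Alignments are sets of (tgt, src) index pairs, as in the docstring above.
print(alignment_scores({(0, 0), (1, 1)}, {(0, 0), (1, 1), (2, 1)}, {(0, 0), (2, 1)}))
# (1.0, 0.5, 0.25)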
7 changes: 5 additions & 2 deletions stopes/eval/alti/alti_metrics/nllb_alti_detector.py
@@ -104,7 +104,8 @@ def load_bilingual_model(
@dataclasses.dataclass
class ALTIMetricsConfig:
"""The config indicating how to load sentence pairs, load the model,
compute the ALTI metrics with it, and save results. - to use with the `compute_nllb_alti` function."""
compute the ALTI metrics with it, and save results. - to use with the `compute_nllb_alti` function.
"""

# the model used to compute ALTI
is_multilingual: bool
@@ -118,7 +119,9 @@ class ALTIMetricsConfig:
Path
] # a .jsonl file with token-level contributions
# format and location of the source data
input_filename: Path # the source file with sources and translations; assumed to be .tsv
input_filename: (
Path # the source file with sources and translations; assumed to be .tsv
)
src_lang: str
tgt_lang: str
src_col: tp.Union[str, int] = "src"
3 changes: 2 additions & 1 deletion stopes/eval/alti/wrappers/utils.py
@@ -18,7 +18,8 @@

def spearmanr(x, y):
"""Compute Spearman rank's correlation bertween two attribution vectors.
https://github.com/samiraabnar/attention_flow/blob/master/compute_corel_distilbert_sst.py"""
https://github.com/samiraabnar/attention_flow/blob/master/compute_corel_distilbert_sst.py
"""

x = pd.Series(x)
y = pd.Series(y)
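For context on the helper above: Spearman's rank correlation is just Pearson correlation computed on ranks, which pandas makes a two-liner. A minimal sketch, not the repository's exact implementation:

import pandas as pd

def spearman_sketch(x, y):
    # Rank both vectors, then take the ordinary (Pearson) correlation of the ranks.
    return pd.Series(x).rank().corr(pd.Series(y).rank(), method="pearson")

print(spearman_sketch([1, 2, 3, 4], [10, 20, 40, 30]))  # ~0.8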
14 changes: 7 additions & 7 deletions stopes/eval/local_prosody/compare_utterances.py
@@ -124,7 +124,7 @@ def align_pauses(

duration_scores = []
alignment_scores = []
for (pauses, p2a) in [(pp_src, p2a_src), (pp_tgt, p2a_tgt)]:
for pauses, p2a in [(pp_src, p2a_src), (pp_tgt, p2a_tgt)]:
for prev_word_id, duration in pauses:
if prev_word_id not in p2a:
duration_scores.append(0.0)
@@ -353,12 +353,12 @@ def aggregate_pause_alignment_statistics(df: pd.DataFrame):
"mean_duration_score": df.duration_score.mean(),
"mean_alignment_score": df.alignment_score.mean(),
"mean_joint_score": joint_score.mean(),
"wmean_duration_score": (df.duration_score * w).sum() / w.sum()
if non_empty
else 1,
"wmean_alignment_score": (df.alignment_score * w).sum() / w.sum()
if non_empty
else 1,
"wmean_duration_score": (
(df.duration_score * w).sum() / w.sum() if non_empty else 1
),
"wmean_alignment_score": (
(df.alignment_score * w).sum() / w.sum() if non_empty else 1
),
"wmean_joint_score": (joint_score * w).sum() / w.sum() if non_empty else 1,
"total_weight": w.sum(),
"n_items": df.shape[0],
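The values above are plain weighted means with a fallback to 1 when there is nothing to weight. A tiny standalone version of the same computation, with invented scores and weights (the module derives its weight column elsewhere, above this hunk):

import pandas as pd

df = pd.DataFrame({"duration_score": [0.9, 0.5], "weight": [3.0, 1.0]})
w = df["weight"]
non_empty = w.sum() > 0

# Weighted mean of the per-item scores, defaulting to 1 when the total weight is zero.
wmean_duration_score = (df["duration_score"] * w).sum() / w.sum() if non_empty else 1
print(wmean_duration_score)  # ~0.8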
2 changes: 1 addition & 1 deletion stopes/eval/local_prosody/utterance.py
@@ -124,7 +124,7 @@ def get_text_with_markup(self, min_pause_duration=0.1, min_emph_score=0.5) -> st
parts = []
pause_durations = self.get_pauses_after_words(min_duration=min_pause_duration)
emphasis_scores = self.emphasis_scores or [0] * len(self.words)
for (word, pause, emph_score) in zip(
for word, pause, emph_score in zip(
self.words, pause_durations, emphasis_scores
):
if emph_score > min_emph_score:
16 changes: 10 additions & 6 deletions stopes/modules/bitext/indexing/merge_faiss_indexes.py
@@ -118,12 +118,16 @@ def checkpoint(
return submitit.helpers.DelayedSubmission(
MergeFAISSIndexesModule(
config=self.config,
checkpoint_part=self.partial_merge_file
if self.config.enable_checkpointing
else None,
checkpoint_file_idx=self.checkpoint_file_idx
if self.config.enable_checkpointing
else None,
checkpoint_part=(
self.partial_merge_file
if self.config.enable_checkpointing
else None
),
checkpoint_file_idx=(
self.checkpoint_file_idx
if self.config.enable_checkpointing
else None
),
),
*args,
**kwargs,
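The rewrap above is cosmetic, but the surrounding method is the interesting part: submitit calls a module's checkpoint() on preemption or timeout, and whatever DelayedSubmission it returns is requeued in its place, which is how the partial-merge state survives. A minimal sketch of that protocol with invented names (not stopes code):

import submitit

class Accumulate:
    """Toy checkpointable job: sums a list of numbers, remembering how far it got."""

    def __init__(self, values, done=0, partial=0.0):
        self.values, self.done, self.partial = values, done, partial

    def __call__(self):
        for i in range(self.done, len(self.values)):
            self.partial += self.values[i]
            self.done = i + 1
        return self.partial

    def checkpoint(self, *args, **kwargs):
        # Requeue a fresh copy that resumes from the saved position instead of restarting.
        resumed = Accumulate(self.values, done=self.done, partial=self.partial)
        return submitit.helpers.DelayedSubmission(resumed, *args, **kwargs)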
8 changes: 5 additions & 3 deletions stopes/modules/bitext/indexing/populate_faiss_index.py
@@ -202,9 +202,11 @@ def checkpoint(
return submitit.helpers.DelayedSubmission(
PopulateFAISSIndexModule(
config=self.config,
checkpoint_summary=self.checkpoint_summary
if self.config.enable_checkpointing
else None,
checkpoint_summary=(
self.checkpoint_summary
if self.config.enable_checkpointing
else None
),
),
*args,
**kwargs,
3 changes: 2 additions & 1 deletion stopes/modules/bitext/mining/merge_shards.py
@@ -33,7 +33,8 @@ class MergeShardsConfig:

class ShardForMerge:
"""Represent an input shard opened for merge.
Both input and output shards are in decreasing order of match scores, and this object helps manage that."""
Both input and output shards are in decreasing order of match scores, and this object helps manage that.
"""

def __init__(self, text_path: Path, meta_path: tp.Optional[Path]):
self.text_path = text_path
7 changes: 4 additions & 3 deletions stopes/modules/evaluation/generate_multi_bleu_detok_module.py
@@ -234,9 +234,10 @@ def process_output_ref_hyp_file(
return_file = Path(f"{fairseq_generate_output_file}.{file_type}")
desired_line_prefix = "T" if file_type == "ref" else "H"
desired_col_number = 1 if file_type == "ref" else 2
with open(fairseq_generate_output_file, "r", encoding="utf-8") as read_file, open(
return_file, "w", encoding="utf-8"
) as write_file:
with (
open(fairseq_generate_output_file, "r", encoding="utf-8") as read_file,
open(return_file, "w", encoding="utf-8") as write_file,
):
for line in read_file:
if line.startswith(desired_line_prefix):
line = line.rstrip("\n")
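This hunk switches to the parenthesized multi-manager with statement, officially part of the grammar since Python 3.10 and emitted by recent Black when the configured target versions allow it. A small runnable sketch of the same pattern, with invented file names and contents:

from pathlib import Path

Path("generate-out.txt").write_text("H-0\t-0.1\tsome hypothesis\nS-0\tsource\n", encoding="utf-8")

# One context manager per line, no backslashes and no nested with blocks.
with (
    open("generate-out.txt", encoding="utf-8") as read_file,
    open("hyp.txt", "w", encoding="utf-8") as write_file,
):
    for line in read_file:
        if line.startswith("H"):
            write_file.write(line)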
2 changes: 1 addition & 1 deletion stopes/modules/nmt_bitext_eval_utils.py
@@ -205,7 +205,7 @@ def concat_public_bitext(self, output: Path, tgt: bool) -> int:
"""
lines = 0
with utils.open(output, "wb") as o:
for (src_file, tgt_file) in self.public_bitext:
for src_file, tgt_file in self.public_bitext:
with utils.open(tgt_file if tgt else src_file, "rb") as f:
for line in f:
lines += 1
1 change: 0 additions & 1 deletion stopes/modules/preprocess/bitext_processor.py
@@ -49,7 +49,6 @@ def __init__(
def process_lines(
self, dataset_reader: tp.Generator[DatasetLine, None, None]
) -> None:

"""
process a batch of lines from two files and writes them to two output_files the way you want.
The input are two iterators of lines with their line number in the input file
2 changes: 1 addition & 1 deletion stopes/modules/preprocess/fairseq_binarizer_encoder.py
@@ -87,7 +87,7 @@ def __enter__(self):

def process_lines(self, lines_with_number: tp.Iterator[tp.Tuple[int, str]]) -> None:
summary = BinarizeSummary()
for (_, s) in lines_with_number:
for _, s in lines_with_number:
self.dataset_builder.add_item(self.binarizer.binarize_line(s, summary))
self.summary.merge(summary)
log.info(self.summary)
6 changes: 3 additions & 3 deletions stopes/modules/preprocess/laser_sentence_encoder.py
@@ -306,9 +306,9 @@ def combine_bidir(outs):
return {
"sentemb": sentemb,
"encoder_out": (x, final_hiddens, final_cells),
"encoder_padding_mask": encoder_padding_mask
if encoder_padding_mask.any()
else None,
"encoder_padding_mask": (
encoder_padding_mask if encoder_padding_mask.any() else None
),
}


Original file line number Diff line number Diff line change
@@ -111,7 +111,7 @@ def __enter__(self):

def process_lines(self, lines_with_number: tp.Iterator[tp.Tuple[int, str]]) -> None:
summary = BinarizeSummary()
for (_, s) in lines_with_number:
for _, s in lines_with_number:
self.dataset_builder.add_item(self.binarizer.binarize_line(s, summary))
self.summary.merge(summary)
logger.info(self.summary)
6 changes: 4 additions & 2 deletions stopes/modules/preprocess/sonar_text_embedding.py
@@ -14,7 +14,7 @@
import pandas as pd
import pyarrow as pa
import torch
from fairseq2.assets.error import AssetError
from fairseq2.error import OperationalError
from retrying import retry
from sonar.inference_pipelines.text import (
EmbeddingToTextModelPipeline,
@@ -37,7 +37,9 @@
from stopes.utils.sharding.parquet_shards import ParquetOutputConfig

fairse2_asset_loading_retry = retry(
retry_on_exception=lambda exception: isinstance(exception, (AssetError, IOError)),
retry_on_exception=lambda exception: isinstance(
exception, (OperationalError, IOError)
),
stop_max_attempt_number=20,
wait_random_min=1000,
wait_random_max=30_000,
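Besides moving from fairseq2.assets.error.AssetError to fairseq2.error.OperationalError, in step with the fairseq2 upgrade elsewhere in this PR, the hunk shows how the retrying library is parameterized: retry(...) called with only keyword arguments returns a reusable decorator. A hedged sketch of applying such a wrapper to a flaky loader; the function name and exception choice are illustrative, not stopes code:

from retrying import retry

def _is_transient(exc):
    # Retry only on errors that are plausibly transient (busy asset store, network hiccup).
    return isinstance(exc, (IOError,))

flaky_load_retry = retry(
    retry_on_exception=_is_transient,
    stop_max_attempt_number=5,  # give up after five attempts
    wait_random_min=1_000,      # sleep a random 1-5 seconds between attempts (milliseconds)
    wait_random_max=5_000,
)

@flaky_load_retry
def load_asset(name: str):
    ...  # e.g. fetch a model card or checkpoint from shared storage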
5 changes: 4 additions & 1 deletion stopes/modules/preprocess/uromanize_cli_module.py
@@ -73,7 +73,10 @@ def run_uroman_cli_standalone(input_file: Path, output_file: Path, lang: str) ->
def uromanize(text: tp.List[str]) -> tp.List[str]:
if text is None or len(text) == 0:
return []
with tempfile.NamedTemporaryFile() as input_file, tempfile.NamedTemporaryFile() as output_file:
with (
tempfile.NamedTemporaryFile() as input_file,
tempfile.NamedTemporaryFile() as output_file,
):
with open(input_file.name, "w") as f:
for sentence in text:
f.write(f"{sentence}\n")
3 changes: 2 additions & 1 deletion stopes/modules/preprocess/wav2vec_laser_speech_encoder.py
@@ -54,7 +54,8 @@ def __init__(
@property
def fbank_features(self) -> int:
"""Number of fbank features to feed to the encoder (0 if instead of fbank it expects a raw waveform).
This parameter is defined based on the architecture of the underlying encoder."""
This parameter is defined based on the architecture of the underlying encoder.
"""
return self.encoder_cfg.task.get("fbank_features", 0)

def _encode_batch(self, source, padding_mask):
8 changes: 5 additions & 3 deletions stopes/modules/speech/utils.py
@@ -261,9 +261,11 @@ def auto_parse_line(line: str, sampling_factor: tp.Optional[int] = None) -> Line
columns = line.split("\t")
return LineResult(
columns=[
auto_parse(column, sampling_factor)
if i == 0
else parse_audio_or_text(column, sampling_factor)
(
auto_parse(column, sampling_factor)
if i == 0
else parse_audio_or_text(column, sampling_factor)
)
for (i, column) in enumerate(columns)
]
)
5 changes: 3 additions & 2 deletions stopes/modules/speech/video_alignement/video_segmentor.py
@@ -115,7 +115,6 @@ class WhisperSegmentorConfig:


class WhisperSegmentorModule(StopesModule):

"""Extract utterances from an audio and embed them
This module is multi-lingual : different languages can be processed with the same pipeline

@@ -382,7 +381,9 @@ def compute_whisper_segmentation(
logger.info(
f"Starting Whisper segmentation on wav of length {round(len(wav) / self.SR / 60, 3)} minuntes in lang = {lang}"
)
with tempfile.TemporaryDirectory() as data_gym_cache: # attempt to fixe loading issue
with (
tempfile.TemporaryDirectory() as data_gym_cache
): # attempt to fixe loading issue
os.environ["DATA_GYM_CACHE_DIR"] = str(data_gym_cache)
whisper_model = self._load_whisper(self.config.whisper_model).cuda()
wav = wav.cpu()
14 changes: 8 additions & 6 deletions stopes/modules/tests/test_split_merge_langs.py
@@ -42,9 +42,10 @@ async def test_split_with_meta(tmp_path: Path):
)
input_shards.append(text_name)
input_metas.append(meta_name)
with gzip.open(text_name, mode="wt") as f_text, gzip.open(
meta_name, mode="wt"
) as f_meta:
with (
gzip.open(text_name, mode="wt") as f_text,
gzip.open(meta_name, mode="wt") as f_meta,
):
for line_id in range(shard_size):
print("text", i, line_id, file=f_text)
print("meta", i, line_id, file=f_meta)
@@ -164,9 +165,10 @@ async def test_merge_bitext(tmp_path: Path):
tmp_path / f"bimeta_{j}.tsv.gz",
)
inputs.append((text_name, meta_name))
with utils.open(text_name, mode="wt") as f_text, utils.open(
meta_name, mode="wt"
) as f_meta:
with (
utils.open(text_name, mode="wt") as f_text,
utils.open(meta_name, mode="wt") as f_meta,
):
unique_texts = [
(f"unique_text_en_{j}_{i}", f"unique_text_fr_{j}_{i}")
for i in range(50)