Skip to content

Commit defc997

Browse files
authored
Merge pull request #53 from openvpi/stretch-v2
Support time stretching and velocity control
2 parents f63bc47 + a017147 commit defc997

File tree

15 files changed

+322
-73
lines changed

15 files changed

+322
-73
lines changed

augmentation/pitch_shift.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

augmentation/spec_stretch.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from copy import deepcopy
2+
3+
import numpy as np
4+
import torch
5+
6+
from basics.base_augmentation import BaseAugmentation
7+
from data_gen.data_gen_utils import get_pitch_parselmouth
8+
from modules.fastspeech.tts_modules import LengthRegulator
9+
from src.vocoders.base_vocoder import VOCODERS
10+
from utils.hparams import hparams
11+
from utils.pitch_utils import f0_to_coarse
12+
13+
14+
class SpectrogramStretchAugmentation(BaseAugmentation):
    """
    This class contains methods for frequency-domain and time-domain stretching augmentation.
    """
    def __init__(self, data_dirs: list, augmentation_args: dict):
        super().__init__(data_dirs, augmentation_args)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.lr = LengthRegulator().to(self.device)

    def process_item(self, item: dict, key_shift=0., speed=1., replace_spk_id=None) -> dict:
        """
        Produce an augmented copy of a dataset item by re-running the vocoder's
        wav2spec with a key shift (frequency stretch) and/or a speed factor
        (time stretch), then recomputing the dependent fields.

        :param item: source item dict; never modified (a deep copy is taken)
        :param key_shift: pitch shift in semitones (0. = no shift)
        :param speed: time-stretch factor (1. = original speed)
        :param replace_spk_id: if not None, overrides the item's speaker id
        :return: the augmented item dict
        """
        aug_item = deepcopy(item)
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(
                aug_item['wav_fn'], keyshift=key_shift, speed=speed
            )
        else:
            # Vocoder registered under a dotted path; look up by its last component.
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(
                aug_item['wav_fn'], keyshift=key_shift, speed=speed
            )

        aug_item['mel'] = mel

        if speed != 1. or hparams.get('use_speed_embed', False):
            aug_item['len'] = len(mel)
            # The effective hop size must be an integer number of samples, so the
            # real speed can differ slightly from the requested one.
            aug_item['speed'] = int(np.round(hparams['hop_size'] * speed)) / hparams['hop_size']  # real speed
            aug_item['sec'] /= aug_item['speed']
            aug_item['ph_durs'] /= aug_item['speed']
            aug_item['mel2ph'] = self.get_mel2ph(aug_item['ph_durs'], aug_item['len'])
            aug_item['f0'], aug_item['pitch'] = get_pitch_parselmouth(wav, mel, hparams, speed=speed)

        if key_shift != 0. or hparams.get('use_key_shift_embed', False):
            aug_item['key_shift'] = key_shift
            # Shift f0 by key_shift semitones and re-quantize the coarse pitch.
            aug_item['f0'] *= 2 ** (key_shift / 12)
            aug_item['pitch'] = f0_to_coarse(aug_item['f0'])

        if replace_spk_id is not None:
            aug_item['spk_id'] = replace_spk_id

        return aug_item

    @torch.no_grad()
    def get_mel2ph(self, durs, length):
        """
        Convert phoneme durations (in seconds) into a frame-level mel2ph
        alignment of exactly `length` frames.

        :param durs: 1-D array-like of phoneme durations in seconds
        :param length: target number of mel frames
        :return: 1-D numpy array of length `length` mapping frames to phonemes
        """
        # Accumulate durations, convert to frame indices, then diff back to
        # per-phoneme frame counts so rounding errors do not accumulate.
        ph_acc = np.around(
            np.add.accumulate(durs) * hparams['audio_sample_rate'] / hparams['hop_size'] + 0.5
        ).astype('int')
        ph_dur = np.diff(ph_acc, prepend=0)
        ph_dur = torch.LongTensor(ph_dur)[None].to(self.device)
        mel2ph = self.lr(ph_dur).cpu().numpy()[0]
        num_frames = len(mel2ph)
        if num_frames < length:
            # BUG FIX: np.full takes (shape, fill_value) as two arguments; the
            # original passed a single tuple, raising TypeError whenever
            # padding was required. Pad with the last phoneme index instead.
            mel2ph = np.concatenate((mel2ph, np.full(length - num_frames, mel2ph[-1])), axis=0)
        elif num_frames > length:
            mel2ph = mel2ph[:length]
        return mel2ph

basics/base_binarizer.py

Lines changed: 69 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import copy
12
import shutil
23
import os
34
os.environ["OMP_NUM_THREADS"] = "1"
@@ -218,38 +219,43 @@ def arrange_data_augmentation(self, prefix):
218219
Code for all types of data augmentation should be added here.
219220
"""
220221
aug_map = {}
222+
aug_list = []
221223
all_item_names = [item_name for item_name, _ in self.meta_data_iterator(prefix)]
224+
total_scale = 0
222225
if self.augmentation_args.get('random_pitch_shifting') is not None:
223-
from augmentation.pitch_shift import PitchShiftAugmentation
226+
from augmentation.spec_stretch import SpectrogramStretchAugmentation
224227
aug_args = self.augmentation_args['random_pitch_shifting']
225228
key_shift_min, key_shift_max = aug_args['range']
226229
assert hparams.get('use_key_shift_embed', False), \
227230
'Random pitch shifting augmentation requires use_key_shift_embed == True.'
228231
assert key_shift_min < 0 < key_shift_max, \
229232
'Random pitch shifting augmentation must have a range where min < 0 < max.'
230233

231-
aug_ins = PitchShiftAugmentation(self.raw_data_dirs, aug_args)
234+
aug_ins = SpectrogramStretchAugmentation(self.raw_data_dirs, aug_args)
232235
scale = aug_args['scale']
233-
aug_item_names = all_item_names * int(scale) \
234-
+ random.sample(all_item_names, int(len(all_item_names) * (scale - int(scale))))
236+
aug_item_names = random.choices(all_item_names, k=int(scale * len(all_item_names)))
235237

236238
for aug_item_name in aug_item_names:
237-
rand = random.random() * 2 - 1
239+
rand = random.uniform(-1, 1)
238240
if rand < 0:
239241
key_shift = key_shift_min * abs(rand)
240242
else:
241243
key_shift = key_shift_max * rand
242244
aug_task = {
245+
'name': aug_item_name,
243246
'func': aug_ins.process_item,
244247
'kwargs': {'key_shift': key_shift}
245248
}
246249
if aug_item_name in aug_map:
247250
aug_map[aug_item_name].append(aug_task)
248251
else:
249252
aug_map[aug_item_name] = [aug_task]
253+
aug_list.append(aug_task)
254+
255+
total_scale += scale
250256

251257
if self.augmentation_args.get('fixed_pitch_shifting') is not None:
252-
from augmentation.pitch_shift import PitchShiftAugmentation
258+
from augmentation.spec_stretch import SpectrogramStretchAugmentation
253259
aug_args = self.augmentation_args['fixed_pitch_shifting']
254260
targets = aug_args['targets']
255261
scale = aug_args['scale']
@@ -262,19 +268,74 @@ def arrange_data_augmentation(self, prefix):
262268
'Fixed pitch shifting augmentation requires num_spk >= (1 + len(targets)) * len(speakers).'
263269
assert scale < 1, 'Fixed pitch shifting augmentation requires scale < 1.'
264270

265-
aug_ins = PitchShiftAugmentation(self.raw_data_dirs, aug_args)
271+
aug_ins = SpectrogramStretchAugmentation(self.raw_data_dirs, aug_args)
266272
for i, target in enumerate(targets):
267-
aug_item_names = random.sample(all_item_names, int(len(all_item_names) * scale))
273+
aug_item_names = random.choices(all_item_names, k=int(scale * len(all_item_names)))
268274
for aug_item_name in aug_item_names:
269275
replace_spk_id = int(aug_item_name.split(':', maxsplit=1)[0]) + (i + 1) * len(self.spk_map)
270276
aug_task = {
277+
'name': aug_item_name,
271278
'func': aug_ins.process_item,
272279
'kwargs': {'key_shift': target, 'replace_spk_id': replace_spk_id}
273280
}
274281
if aug_item_name in aug_map:
275282
aug_map[aug_item_name].append(aug_task)
276283
else:
277284
aug_map[aug_item_name] = [aug_task]
285+
aug_list.append(aug_task)
286+
287+
total_scale += scale * len(targets)
288+
289+
if self.augmentation_args.get('random_time_stretching') is not None:
290+
from augmentation.spec_stretch import SpectrogramStretchAugmentation
291+
aug_args = self.augmentation_args['random_time_stretching']
292+
speed_min, speed_max = aug_args['range']
293+
domain = aug_args['domain']
294+
assert hparams.get('use_speed_embed', False), \
295+
'Random time stretching augmentation requires use_speed_embed == True.'
296+
assert 0 < speed_min < 1 < speed_max, \
297+
'Random time stretching augmentation must have a range where 0 < min < 1 < max.'
298+
assert domain in ['log', 'linear'], 'domain must be \'log\' or \'linear\'.'
299+
300+
aug_ins = SpectrogramStretchAugmentation(self.raw_data_dirs, aug_args)
301+
scale = aug_args['scale']
302+
k_from_raw = int(scale / (1 + total_scale) * len(all_item_names))
303+
k_from_aug = int(total_scale * scale / (1 + total_scale) * len(all_item_names))
304+
k_mutate = int(total_scale * scale / (1 + scale) * len(all_item_names))
305+
aug_types = [0] * k_from_raw + [1] * k_from_aug + [2] * k_mutate
306+
aug_items = random.choices(all_item_names, k=k_from_raw) + random.choices(aug_list, k=k_from_aug + k_mutate)
307+
308+
for aug_type, aug_item in zip(aug_types, aug_items):
309+
if domain == 'log':
310+
# Uniform distribution in log domain
311+
speed = speed_min * (speed_max / speed_min) ** random.random()
312+
else:
313+
# Uniform distribution in linear domain
314+
rand = random.uniform(-1, 1)
315+
speed = 1 + (speed_max - 1) * rand if rand >= 0 else 1 + (1 - speed_min) * rand
316+
if aug_type == 0:
317+
aug_task = {
318+
'name': aug_item,
319+
'func': aug_ins.process_item,
320+
'kwargs': {'speed': speed}
321+
}
322+
if aug_item in aug_map:
323+
aug_map[aug_item].append(aug_task)
324+
else:
325+
aug_map[aug_item] = [aug_task]
326+
aug_list.append(aug_task)
327+
elif aug_type == 1:
328+
aug_task = copy.deepcopy(aug_item)
329+
aug_item['kwargs']['speed'] = speed
330+
if aug_item['name'] in aug_map:
331+
aug_map[aug_item['name']].append(aug_task)
332+
else:
333+
aug_map[aug_item['name']] = [aug_task]
334+
aug_list.append(aug_task)
335+
elif aug_type == 2:
336+
aug_item['kwargs']['speed'] = speed
337+
338+
total_scale += scale
278339

279340
return aug_map
280341

configs/acoustic/nomidi.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ binarization_args:
4040
# fixed_pitch_shifting:
4141
# targets: [-5., 5.]
4242
# scale: 0.75
43+
# random_time_stretching:
44+
# range: [0.5, 2.]
45+
# domain: log # or linear
46+
# scale: 2.0
4347

4448
raw_data_dir: 'data/opencpop/raw'
4549
processed_data_dir: ''
@@ -66,7 +70,8 @@ use_uv: false
6670
use_midi: false
6771
use_spk_embed: false
6872
use_spk_id: false
69-
#use_key_shift_embed: true
73+
use_key_shift_embed: false
74+
use_speed_embed: false
7075
use_gt_f0: false # for midi exp
7176
use_gt_dur: false # for further midi exp
7277
f0_embed_type: continuous

data_gen/data_gen_utils.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,22 +147,24 @@ def process_utterance(wav_path,
147147
return wav, mel, spc
148148

149149

150-
def get_pitch_parselmouth(wav_data, mel, hparams):
150+
def get_pitch_parselmouth(wav_data, mel, hparams, speed=1):
    """
    Extract an f0 curve aligned to the mel spectrogram using Parselmouth.

    :param wav_data: [T]
    :param mel: [T, mel_bins]
    :param hparams: hyper-parameter dict (uses hop_size, audio_sample_rate)
    :param speed: time-stretch factor; scales the effective hop size
    :return: (f0, coarse pitch), both padded to len(mel)
    """
    sample_rate = hparams['audio_sample_rate']
    # Effective hop size in samples after time stretching (rounded to int).
    hop_size = int(np.round(hparams['hop_size'] * speed))

    time_step = hop_size / sample_rate * 1000
    f0_min, f0_max = 65, 800

    sound = parselmouth.Sound(wav_data, sample_rate)
    f0 = sound.to_pitch_ac(
        time_step=time_step / 1000,
        voicing_threshold=0.6,
        pitch_floor=f0_min,
        pitch_ceiling=f0_max,
    ).selected_array['frequency']

    # Center the extracted curve, then pad both sides to match the mel length.
    pad_size = (int(len(wav_data) // hop_size) - len(f0) + 1) // 2
    f0 = np.pad(f0, [[pad_size, len(mel) - len(f0) - pad_size]], mode='constant')
    pitch_coarse = f0_to_coarse(f0)
    return f0, pitch_coarse

inference/ds_cascade.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ def preprocess_phoneme_level_input(self, inp):
4444
gender = np.array(inp['gender'].split(), 'float')
4545
else:
4646
gender = float(inp['gender'])
47+
velocity_timestep = None
48+
velocity = None
49+
if inp.get('velocity') is not None:
50+
velocity_timestep = float(inp['velocity_timestep'])
51+
velocity = np.array(inp['velocity'].split(), 'float')
4752
ph_seq_lst = ph_seq.split()
4853
if inp['ph_dur'] is not None:
4954
ph_dur = np.array(inp['ph_dur'].split(), 'float')
@@ -58,7 +63,8 @@ def preprocess_phoneme_level_input(self, inp):
5863
f'{len(note_lst)} {len(ph_seq.split())} {len(midi_dur_lst)}')
5964
print(f'Processed {len(ph_seq_lst)} tokens: {" ".join(ph_seq_lst)}')
6065

61-
return ph_seq, note_lst, midi_dur_lst, is_slur, ph_dur, f0_timestep, f0_seq, gender_timestep, gender
66+
return ph_seq, note_lst, midi_dur_lst, is_slur, ph_dur, \
67+
f0_timestep, f0_seq, gender_timestep, gender, velocity_timestep, velocity
6268

6369
def preprocess_input(self, inp, input_type='word'):
6470
"""
@@ -90,9 +96,10 @@ def preprocess_input(self, inp, input_type='word'):
9096
# get ph seq, note lst, midi dur lst, is slur lst.
9197
if input_type == 'word':
9298
ph_seq, note_lst, midi_dur_lst, is_slur = self.preprocess_word_level_input(inp)
93-
ph_dur = f0_timestep = f0_seq = gender_timestep = gender = None
99+
ph_dur = f0_timestep = f0_seq = gender_timestep = gender = velocity_timestep = velocity = None
94100
elif input_type == 'phoneme': # like transcriptions.txt in Opencpop dataset.
95-
ph_seq, note_lst, midi_dur_lst, is_slur, ph_dur, f0_timestep, f0_seq, gender_timestep, gender = \
101+
ph_seq, note_lst, midi_dur_lst, is_slur, ph_dur, \
102+
f0_timestep, f0_seq, gender_timestep, gender, velocity_timestep, velocity = \
96103
self.preprocess_phoneme_level_input(inp)
97104
else:
98105
raise ValueError('Invalid input type. Must be \'word\' or \'phoneme\'.')
@@ -118,6 +125,8 @@ def preprocess_input(self, inp, input_type='word'):
118125
item['f0_seq'] = f0_seq
119126
item['gender_timestep'] = gender_timestep
120127
item['gender'] = gender
128+
item['velocity_timestep'] = velocity_timestep
129+
item['velocity'] = velocity
121130
item['spk_mix_timestep'] = inp.get('spk_mix_timestep')
122131
return item
123132

@@ -210,6 +219,23 @@ def input_to_batch(self, item):
210219
else:
211220
key_shift = None
212221

222+
if hparams.get('use_speed_embed', False):
223+
if item['velocity'] is None:
224+
print('Using default velocity curve')
225+
speed = torch.FloatTensor([1.]).to(self.device)
226+
else:
227+
print('Using manual velocity curve')
228+
velocity_timestep = item['velocity_timestep']
229+
velocity_seq = item['velocity']
230+
speed_min, speed_max = hparams['augmentation_args']['random_time_stretching']['range']
231+
speed_seq = np.clip(velocity_seq, a_min=speed_min, a_max=speed_max)
232+
t_max = (len(speed_seq) - 1) * velocity_timestep
233+
dt = hparams['hop_size'] / hparams['audio_sample_rate']
234+
speed_interp = np.interp(np.arange(0, t_max, dt), velocity_timestep * np.arange(len(speed_seq)), speed_seq)
235+
speed = torch.FloatTensor(speed_interp)[None, :].to(self.device)
236+
else:
237+
speed = None
238+
213239
batch = {
214240
'item_name': item_names,
215241
'text': text,
@@ -222,7 +248,8 @@ def input_to_batch(self, item):
222248
'is_slur': is_slur,
223249
'mel2ph': mel2ph,
224250
'log2f0': log2f0,
225-
'key_shift': key_shift
251+
'key_shift': key_shift,
252+
'speed': speed
226253
}
227254
return batch
228255

@@ -240,8 +267,8 @@ def forward_model(self, inp, return_mel=False):
240267
output = self.model(txt_tokens, spk_mix_embed=spk_mix_embed, ref_mels=None, infer=True,
241268
pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
242269
is_slur=sample['is_slur'], mel2ph=sample['mel2ph'], f0=sample['log2f0'],
243-
key_shift=sample['key_shift'])
244-
mel_out = output['mel_out'] # [B, T,80]
270+
key_shift=sample['key_shift'], speed=sample['speed'])
271+
mel_out = output['mel_out'] # [B, T, M]
245272
f0_pred = output['f0_denorm']
246273
if return_mel:
247274
return mel_out.cpu(), f0_pred.cpu()

modules/fastspeech/tts_modules.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,33 @@ def forward(self, dur, dur_padding=None, alpha=1.0):
189189
return mel2ph
190190

191191

192+
class StretchRegulator(torch.nn.Module):
    def forward(self, dur, mel2ph):
        """
        Compute each frame's fractional position within its phoneme
        (0 at the phoneme onset, rising toward 1 at its end).

        Worked example (batch dimension omitted):
            1. dur            = [2, 4, 3]
            2. mel2ph         = [1, 1, 2, 2, 2, 2, 3, 3, 3]
            3. mel2dur        = [2, 2, 4, 4, 4, 4, 3, 3, 3]
            4. bound_mask     = [0, 1, 0, 0, 0, 1, 0, 0, 1]
            5. 1 - bound_mask * mel2dur = [1,-1,1,1,1,-3,1,1,-2]
               => shift right => [0, 1,-1, 1, 1, 1,-3, 1, 1]
            6. stretch_denorm = [0, 1, 0, 1, 2, 3, 0, 1, 2]

        :param dur: Batch of durations of each frame (B, T_txt)
        :param mel2ph: Batch of mel2ph (B, T_speech)
        :return:
            stretch (B, T_speech)
        """
        # Prepend a dummy duration of 1 so phoneme index 0 (padding) is safe
        # to gather and never divides by zero.
        padded_dur = F.pad(dur, [1, 0], value=1)
        frame_dur = torch.gather(padded_dur, 1, mel2ph)
        # True on the last frame of each phoneme segment.
        is_boundary = torch.gt(mel2ph[:, 1:], mel2ph[:, :-1])
        is_boundary = F.pad(is_boundary, [0, 1], mode='constant', value=True)
        # Each frame adds 1; at a segment boundary subtract the segment length,
        # so after shifting right the running sum restarts at 0 per phoneme.
        deltas = 1 - is_boundary * frame_dur
        deltas = F.pad(deltas, [1, -1], mode='constant', value=0)
        position_in_ph = torch.cumsum(deltas, dim=1)
        stretch = position_in_ph / frame_dur
        # Zero out padding frames (mel2ph == 0).
        return stretch * (mel2ph > 0)
217+
218+
192219
class PitchPredictor(torch.nn.Module):
193220
def __init__(self, idim, n_layers=5, n_chans=384, odim=2, kernel_size=5,
194221
dropout_rate=0.1, padding='SAME'):

0 commit comments

Comments
 (0)