Some changes to make it easier to install the dependencies

2026-03-20 03:55:09 +08:00 · 2023-06-02 17:22:38 +08:00
parent b78d0d2a26
commit 9f1dbeeecc
13 changed files with 93 additions and 46 deletions
--- a/models/encoder/audio.py
+++ b/models/encoder/audio.py
@@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
    
    # Resample the wav if needed
    if source_sr is not None and source_sr != sampling_rate:
-        wav = librosa.resample(wav, source_sr, sampling_rate)
+        wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)
        
    # Apply the preprocessing: normalize volume and shorten long silences 
    if normalize:
@@ -99,7 +99,7 @@ def trim_long_silences(wav):
        return ret[width - 1:] / width
    
    audio_mask = moving_average(voice_flags, vad_moving_average_width)
-    audio_mask = np.round(audio_mask).astype(np.bool)
+    audio_mask = np.round(audio_mask).astype(bool)
    
    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
--- a/models/encoder/visualizations.py
+++ b/models/encoder/visualizations.py
@@ -21,7 +21,7 @@ colormap = np.array([
    [33, 0, 127],
    [0, 0, 0],
    [183, 183, 183],
-], dtype=np.float) / 255 
+], dtype=float) / 255 


 class Visualizations:
--- a/models/ppg_extractor/log_mel.py
+++ b/models/ppg_extractor/log_mel.py
@@ -31,14 +31,13 @@ class LogMel(torch.nn.Module):
        fs: int = 16000,
        n_fft: int = 512,
        n_mels: int = 80,
-        fmin: float = None,
+        fmin: float = 0,
        fmax: float = None,
        htk: bool = False,
        norm=1,
    ):
        super().__init__()

-        fmin = 0 if fmin is None else fmin
        fmax = fs / 2 if fmax is None else fmax
        _mel_options = dict(
            sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
--- a/models/synthesizer/audio.py
+++ b/models/synthesizer/audio.py
@@ -107,7 +107,7 @@ def _griffin_lim(S, hparams):
    Based on https://github.com/librosa/librosa/issues/434
    """
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-    S_complex = np.abs(S).astype(np.complex)
+    S_complex = np.abs(S).astype(complex)
    y = _istft(S_complex * angles, hparams)
    for i in range(hparams.griffin_lim_iters):
        angles = np.exp(1j * np.angle(_stft(y, hparams)))
--- a/models/synthesizer/preprocess.py
+++ b/models/synthesizer/preprocess.py
@@ -78,12 +78,12 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
    
    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing, 
                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
-    job = Pool(n_processes).imap(func, speaker_dirs)
+    job = Pool(n_processes).imap_unordered(func, speaker_dirs)
    
    for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
        if speaker_metadata is not None:
            for metadatum in speaker_metadata:
-                metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
+                metadata_file.write("|".join(map(str,metadatum)) + "\n")
    metadata_file.close()

    # Verify the contents of the metadata file
@@ -134,7 +134,7 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
    # Embed the utterances in separate threads
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    job = Pool(n_processes).imap(func, fpaths)
-    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
+    tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))

 def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
    wav_dir = synthesizer_root.joinpath("audio")
@@ -152,4 +152,4 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
    # Embed the utterances in separate threads
    func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
    job = Pool(n_processes).imap(func, fpaths)
-    list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
+    tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
--- a/models/synthesizer/preprocess_audio.py
+++ b/models/synthesizer/preprocess_audio.py
@@ -104,29 +104,58 @@ def _split_on_silences(wav_fpath, words, hparams):
        wav = logmmse.denoise(wav, profile, eta=0)

    resp = pinyin(words, style=Style.TONE3)
-    res = [v[0] for v in resp if v[0].strip()]
+    res = filter(lambda v : not v.isspace(),map(lambda v: v[0],resp)) 
    res = " ".join(res)

    return wav, res

 def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
    metadata = []
-    extensions = ["*.wav", "*.flac", "*.mp3"]
-    for extension in extensions:
-        wav_fpath_list = speaker_dir.glob(extension)
-        # Iterate over each wav
-        for wav_fpath in wav_fpath_list:
-            words = dict_info.get(wav_fpath.name.split(".")[0])
-            words = dict_info.get(wav_fpath.name) if not words else words # try with extension 
-            if not words:
-                print("no wordS")
-                continue
-            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-            wav, text = _split_on_silences(wav_fpath, words, hparams)
-            result = _process_utterance(wav, text, out_dir, sub_basename, 
-                                                skip_existing, hparams, encoder_model_fpath)
-            if result is None:
-                continue
-            wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-            metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
-    return [m for m in metadata if m is not None]
+    extensions = ("*.wav", "*.flac", "*.mp3")
+    if skip_existing:
+        for extension in extensions:
+            wav_fpath_list = speaker_dir.glob(extension)
+            # Iterate over each wav
+            for wav_fpath in wav_fpath_list:
+                words = dict_info.get(wav_fpath.name.split(".")[0])
+                if not words:
+                    words = dict_info.get(wav_fpath.name) # try with extension 
+                    if not words:
+                        print("no wordS")
+                        continue
+                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+                
+                mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
+                wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
+                
+                if mel_fpath.exists() and wav_fpath_.exists():
+                    continue
+
+                wav, text = _split_on_silences(wav_fpath, words, hparams)
+                result = _process_utterance(wav, text, out_dir, sub_basename, 
+                                                    False, hparams, encoder_model_fpath) # outputs already checked above; pass skip_existing=False to avoid re-checking
+                if result is None:
+                    continue
+                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+                metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
+    else:
+        for extension in extensions:
+            wav_fpath_list = speaker_dir.glob(extension)
+            # Iterate over each wav
+            for wav_fpath in wav_fpath_list:
+                words = dict_info.get(wav_fpath.name.split(".")[0])
+                if not words:
+                    words = dict_info.get(wav_fpath.name) # try with extension 
+                    if not words:
+                        print("no wordS")
+                        continue
+                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+
+                wav, text = _split_on_silences(wav_fpath, words, hparams)
+                result = _process_utterance(wav, text, out_dir, sub_basename, 
+                                                    False, hparams, encoder_model_fpath)
+                if result is None:
+                    continue
+                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+                metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
+    return metadata
--- a/models/vocoder/wavernn/audio.py
+++ b/models/vocoder/wavernn/audio.py
@@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):


 def build_mel_basis():
-    return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
+    return librosa.filters.mel(sr = hp.sample_rate, n_fft = hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)


 def normalize(S):