Mirror of https://github.com/babysor/Realtime-Voice-Clone-Chinese.git
Add quick path to preprocess audio, denoise audio when loading in toolbox
.vscode/launch.json | 10 (vendored)
@@ -23,6 +23,16 @@
            "args": [
                "dev", "..\\..\\chs1"
            ],
        },
        {
            "name": "Python: demo box",
            "type": "python",
            "request": "launch",
            "program": "demo_toolbox.py",
            "console": "integratedTerminal",
            "args": [
                "-d", "..\\..\\chs"
            ],
        }
    ]
}
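The new launch entry simply runs `demo_toolbox.py` with `-d ..\..\chs`. As a rough equivalent outside VS Code, a minimal sketch is shown below; the helper name `run_demo_toolbox` is hypothetical, and it assumes you run it from the repository root with the preprocessed data in `..\..\chs`:

```python
# Hypothetical helper: launch the toolbox the same way the "Python: demo box" config does.
# Assumes demo_toolbox.py sits in the current working directory (the repository root).
import subprocess
import sys

def run_demo_toolbox(datasets_root: str = "..\\..\\chs") -> int:
    cmd = [sys.executable, "demo_toolbox.py", "-d", datasets_root]
    return subprocess.call(cmd)

if __name__ == "__main__":
    raise SystemExit(run_demo_toolbox())
```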
@@ -31,16 +31,13 @@

### 2. Train the synthesizer with your dataset
* Download a dataset and unzip it: make sure you can access all audio files (e.g. .wav) in the *train* folder
* Preprocess the audio and mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
* Preprocess the audio and mel spectrograms (new quick path):
`python pre.py <datasets_root>`
The `--dataset {dataset}` argument can be passed; aidatatang_200zh, magicdata and aishell3 are supported

> If the downloaded `aidatatang_200zh` files are on drive D and the `train` folder path is `D:\data\aidatatang_200zh\corpus\train`, then your `datasets_root` is `D:\data\`

> If the error `页面文件太小,无法完成操作` ("the page file is too small to complete the operation") occurs, refer to this [article](https://blog.csdn.net/qq_17755303/article/details/112564030) and increase the virtual memory to 100G (102400); for example, if the files are on drive D, change drive D's virtual memory

* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`

* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`

@@ -32,13 +32,11 @@

### 2. Train synthesizer with your dataset
* Download aidatatang_200zh or another dataset and unzip it: make sure you can access all the .wav files in the *train* folder
* Preprocess the audios and the mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
`python pre.py <datasets_root>` (new quick path)
The `--dataset {dataset}` parameter can be passed to select among aidatatang_200zh, magicdata and aishell3

> If `the page file is too small to complete the operation` occurs, refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and increase the virtual memory to 100G (102400); for example, if the files are on the D drive, change the D drive's virtual memory.

* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`

* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`

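The three commands above form one pipeline: audio/mel preprocessing, embedding preprocessing, then synthesizer training. Below is a minimal sketch that chains them from Python; it assumes the repository root as the working directory and uses `D:\data` only as the example `datasets_root` from the README.

```python
# Minimal sketch: run the preprocessing/training steps described above in sequence.
# Assumes the repository root as the working directory; adjust datasets_root to your setup.
import subprocess
import sys
from pathlib import Path

datasets_root = Path(r"D:\data")                     # example path from the README
synth_root = datasets_root / "SV2TTS" / "synthesizer"

steps = [
    [sys.executable, "pre.py", str(datasets_root), "--dataset", "aidatatang_200zh"],
    # note: pre.py (added in this commit) already calls create_embeddings,
    # so this separate embeds step may be redundant when pre.py is used
    [sys.executable, "synthesizer_preprocess_embeds.py", str(synth_root)],
    [sys.executable, "synthesizer_train.py", "mandarin", str(synth_root)],
]

for cmd in steps:
    print("Running:", " ".join(cmd))
    subprocess.run(cmd, check=True)                  # stop the pipeline if a step fails
```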
pre.py | 72 (new file)
@@ -0,0 +1,72 @@
from synthesizer.preprocess import create_embeddings, preprocess_dataset
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
import argparse


recognized_datasets = [
    "aidatatang_200zh",
    "magicdata",
    "aishell3"
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
                    "and writes them to the disk. Audio files are also saved, to be used by the "
                    "vocoder for training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("datasets_root", type=Path, help=\
        "Path to the directory containing your datasets.")
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\
        "Path to the output directory that will contain the mel spectrograms, the audios and the "
        "embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
    parser.add_argument("-n", "--n_processes", type=int, default=1, help=\
        "Number of processes in parallel. An encoder is created for each, so you may need to lower "
        "this value on GPUs with low memory. Set it to 1 if CUDA is unhappy.")
    parser.add_argument("-s", "--skip_existing", action="store_true", help=\
        "Whether to skip existing files with the same name. Useful if the preprocessing was "
        "interrupted.")
    parser.add_argument("--hparams", type=str, default="", help=\
        "Hyperparameter overrides as a comma-separated list of name-value pairs.")
    parser.add_argument("--no_trim", action="store_true", help=\
        "Preprocess audio without trimming silences (not recommended).")
    parser.add_argument("--no_alignments", action="store_true", help=\
        "Use this option when the dataset does not include alignments "
        "(these are used to split long audio files into sub-utterances).")
    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
        "Name of the dataset to process; allowed values: magicdata, aidatatang_200zh, aishell3.")
    parser.add_argument("-e", "--encoder_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt", help=\
        "Path to your trained encoder model.")
    args = parser.parse_args()

    # Process the arguments
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")
    assert args.dataset in recognized_datasets, \
        args.dataset + " is not supported, please vote for it in https://github.com/babysor/MockingBird/issues/10"

    # Create directories
    assert args.datasets_root.exists()
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # Verify webrtcvad is available
    if not args.no_trim:
        try:
            import webrtcvad
        except ImportError:
            raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables "
                "noise removal and is recommended. Please install and try again. If installation "
                "fails, use --no_trim to disable this error message.")
    encoder_model_fpath = args.encoder_model_fpath
    del args.no_trim, args.encoder_model_fpath

    args.hparams = hparams.parse(args.hparams)
    preprocess_dataset(**vars(args))

    create_embeddings(synthesizer_root=args.out_dir, n_processes=args.n_processes,
                      encoder_model_fpath=encoder_model_fpath)
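pre.py defaults `--out_dir` to `argparse.SUPPRESS`, so the attribute only appears on the parsed namespace when the user actually passes the flag; the `hasattr` check then derives `<datasets_root>/SV2TTS/synthesizer` from another argument, which a plain `default=` could not do. A small self-contained sketch of that pattern (the argument values here are illustrative only):

```python
# Illustrative sketch of the default=argparse.SUPPRESS + hasattr pattern used in pre.py.
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("datasets_root", type=Path)
# With SUPPRESS, "out_dir" is absent from the namespace unless -o/--out_dir is given.
parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS)

args = parser.parse_args(["D:/data"])            # example invocation without -o
if not hasattr(args, "out_dir"):
    # Derive the default from another argument.
    args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

print(args.out_dir)                              # e.g. D:/data/SV2TTS/synthesizer (separator depends on platform)
```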
@@ -9,6 +9,7 @@ from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
from pypinyin import lazy_pinyin, Style

class Synthesizer:
@@ -90,8 +91,10 @@ class Synthesizer:
        simple_table([("Tacotron", str(tts_k) + "k"),
                      ("r", self._model.r)])
        texts = [" ".join(lazy_pinyin(v, style=Style.TONE3)) for v in texts]

        print("Read " + str(texts))
        texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
        print("Synthesizing " + str(texts))
        # Preprocess text inputs
        inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
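The updated `lazy_pinyin` call above converts each input text to tone-numbered pinyin before synthesis, with `neutral_tone_with_five=True` so neutral-tone syllables get an explicit `5`. A minimal sketch of what that conversion produces; the exact output depends on your pypinyin version and dictionaries, and the sample sentence is illustrative:

```python
# Minimal sketch: tone-numbered pinyin conversion as used above.
from pypinyin import lazy_pinyin, Style

texts = ["你好吗"]
converted = [" ".join(lazy_pinyin(t, style=Style.TONE3, neutral_tone_with_five=True))
             for t in texts]
print(converted)   # expected along the lines of ['ni3 hao3 ma5']
```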
@@ -143,6 +146,12 @@ class Synthesizer:
        wav = librosa.load(str(fpath), hparams.sample_rate)[0]
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        # denoise: if the clip is long enough (> 0.4 s), build a noise profile from
        # the first and last 0.15 s and run logmmse over the whole waveform
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([wav[:int(hparams.sample_rate * 0.15)],
                                        wav[-int(hparams.sample_rate * 0.15):]])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile)
        return wav

    @staticmethod
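The same edge-based denoising can be exercised on its own: take the first and last 150 ms of a clip as a noise estimate, then clean the full waveform with the repository's `utils.logmmse` helpers (`profile_noise` / `denoise`, as used above). A minimal sketch, assuming a 16 kHz mono wav named `example.wav` (a hypothetical input) and that it runs from the repository root so `utils.logmmse` is importable:

```python
# Minimal sketch of the edge-noise profiling applied when the toolbox loads a wav.
# Assumes utils/logmmse.py from this repository is on the import path.
import numpy as np
import librosa
from utils import logmmse

sample_rate = 16000                                     # assumed synthesizer sample rate
wav, _ = librosa.load("example.wav", sr=sample_rate)    # hypothetical input file

# Only denoise clips longer than 0.4 s, as in the code above.
if len(wav) > sample_rate * (0.3 + 0.1):
    edge = int(sample_rate * 0.15)                      # 150 ms from each end, presumed non-speech
    noise_wav = np.concatenate([wav[:edge], wav[-edge:]])
    profile = logmmse.profile_noise(noise_wav, sample_rate)
    wav = logmmse.denoise(wav, profile)
```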