Mirror of https://github.com/babysor/Realtime-Voice-Clone-Chinese.git
Add quick path to preprocess audio, denoise audio when loading in toolbox
.vscode/launch.json | 10 (vendored)
@@ -23,6 +23,16 @@
            "args": [
                "dev", "..\\..\\chs1"
            ],
        },
        {
            "name": "Python: demo box",
            "type": "python",
            "request": "launch",
            "program": "demo_toolbox.py",
            "console": "integratedTerminal",
            "args": [
                "-d", "..\\..\\chs"
            ],
        }
    ]
}
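The new launch entry simply runs `demo_toolbox.py` with `-d ..\..\chs`. As a rough equivalent outside VS Code, a minimal sketch is shown below; the helper name `run_demo_toolbox` is hypothetical, and it assumes you run it from the repository root with the preprocessed data in `..\..\chs`:

```python
# Hypothetical helper: launch the toolbox the same way the "Python: demo box" config does.
# Assumes demo_toolbox.py sits in the current working directory (the repository root).
import subprocess
import sys

def run_demo_toolbox(datasets_root: str = "..\\..\\chs") -> int:
    cmd = [sys.executable, "demo_toolbox.py", "-d", datasets_root]
    return subprocess.call(cmd)

if __name__ == "__main__":
    raise SystemExit(run_demo_toolbox())
```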
@@ -31,16 +31,13 @@

### 2. Train the synthesizer with your dataset
* Download a dataset and unzip it: make sure you can access all audio files (e.g. .wav) in the *train* folder
* Preprocess the audio and mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
* Preprocess the audio and mel spectrograms (new quick path):
`python pre.py <datasets_root>`
The `--dataset {dataset}` argument can be passed; aidatatang_200zh, magicdata and aishell3 are supported

> If the downloaded `aidatatang_200zh` files are on drive D and the `train` folder path is `D:\data\aidatatang_200zh\corpus\train`, then your `datasets_root` is `D:\data\`

> If the error `页面文件太小,无法完成操作` ("the page file is too small to complete the operation") occurs, refer to this [article](https://blog.csdn.net/qq_17755303/article/details/112564030) and increase the virtual memory to 100G (102400); for example, if the files are on drive D, change drive D's virtual memory

* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`

* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`

@@ -32,13 +32,11 @@

### 2. Train synthesizer with your dataset
* Download aidatatang_200zh or another dataset and unzip it: make sure you can access all the .wav files in the *train* folder
* Preprocess the audios and the mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
`python pre.py <datasets_root>` (new quick path)
The `--dataset {dataset}` parameter can be passed to select among aidatatang_200zh, magicdata and aishell3

> If `the page file is too small to complete the operation` occurs, refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and increase the virtual memory to 100G (102400); for example, if the files are on the D drive, change the D drive's virtual memory.

* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`

* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`

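The three commands above form one pipeline: audio/mel preprocessing, embedding preprocessing, then synthesizer training. Below is a minimal sketch that chains them from Python; it assumes the repository root as the working directory and uses `D:\data` only as the example `datasets_root` from the README.

```python
# Minimal sketch: run the preprocessing/training steps described above in sequence.
# Assumes the repository root as the working directory; adjust datasets_root to your setup.
import subprocess
import sys
from pathlib import Path

datasets_root = Path(r"D:\data")                     # example path from the README
synth_root = datasets_root / "SV2TTS" / "synthesizer"

steps = [
    [sys.executable, "pre.py", str(datasets_root), "--dataset", "aidatatang_200zh"],
    # note: pre.py (added in this commit) already calls create_embeddings,
    # so this separate embeds step may be redundant when pre.py is used
    [sys.executable, "synthesizer_preprocess_embeds.py", str(synth_root)],
    [sys.executable, "synthesizer_train.py", "mandarin", str(synth_root)],
]

for cmd in steps:
    print("Running:", " ".join(cmd))
    subprocess.run(cmd, check=True)                  # stop the pipeline if a step fails
```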
pre.py | 72 (new file)
@@ -0,0 +1,72 @@
from synthesizer.preprocess import create_embeddings, preprocess_dataset
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
import argparse


recognized_datasets = [
    "aidatatang_200zh",
    "magicdata",
    "aishell3"
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
                    "and writes them to the disk. Audio files are also saved, to be used by the "
                    "vocoder for training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("datasets_root", type=Path, help=\
        "Path to the directory containing your datasets.")
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\
        "Path to the output directory that will contain the mel spectrograms, the audios and the "
        "embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
    parser.add_argument("-n", "--n_processes", type=int, default=1, help=\
        "Number of processes in parallel. An encoder is created for each, so you may need to lower "
        "this value on GPUs with low memory. Set it to 1 if CUDA is unhappy.")
    parser.add_argument("-s", "--skip_existing", action="store_true", help=\
        "Whether to skip existing files with the same name. Useful if the preprocessing was "
        "interrupted.")
    parser.add_argument("--hparams", type=str, default="", help=\
        "Hyperparameter overrides as a comma-separated list of name-value pairs.")
    parser.add_argument("--no_trim", action="store_true", help=\
        "Preprocess audio without trimming silences (not recommended).")
    parser.add_argument("--no_alignments", action="store_true", help=\
        "Use this option when the dataset does not include alignments "
        "(these are used to split long audio files into sub-utterances).")
    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
        "Name of the dataset to process; allowed values: magicdata, aidatatang_200zh, aishell3.")
    parser.add_argument("-e", "--encoder_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt", help=\
        "Path to your trained encoder model.")
    args = parser.parse_args()

    # Process the arguments
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")
    assert args.dataset in recognized_datasets, \
        args.dataset + " is not supported, please vote for it in https://github.com/babysor/MockingBird/issues/10"

    # Create directories
    assert args.datasets_root.exists()
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # Verify webrtcvad is available
    if not args.no_trim:
        try:
            import webrtcvad
        except ImportError:
            raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables "
                "noise removal and is recommended. Please install and try again. If installation "
                "fails, use --no_trim to disable this error message.")
    encoder_model_fpath = args.encoder_model_fpath
    del args.no_trim, args.encoder_model_fpath

    args.hparams = hparams.parse(args.hparams)
    preprocess_dataset(**vars(args))

    create_embeddings(synthesizer_root=args.out_dir, n_processes=args.n_processes,
                      encoder_model_fpath=encoder_model_fpath)
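pre.py defaults `--out_dir` to `argparse.SUPPRESS`, so the attribute only appears on the parsed namespace when the user actually passes the flag; the `hasattr` check then derives `<datasets_root>/SV2TTS/synthesizer` from another argument, which a plain `default=` could not do. A small self-contained sketch of that pattern (the argument values here are illustrative only):

```python
# Illustrative sketch of the default=argparse.SUPPRESS + hasattr pattern used in pre.py.
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("datasets_root", type=Path)
# With SUPPRESS, "out_dir" is absent from the namespace unless -o/--out_dir is given.
parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS)

args = parser.parse_args(["D:/data"])            # example invocation without -o
if not hasattr(args, "out_dir"):
    # Derive the default from another argument.
    args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

print(args.out_dir)                              # e.g. D:/data/SV2TTS/synthesizer (separator depends on platform)
```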
@@ -9,6 +9,7 @@ from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
from pypinyin import lazy_pinyin, Style

class Synthesizer:
@@ -90,8 +91,10 @@ class Synthesizer:
        simple_table([("Tacotron", str(tts_k) + "k"),
                      ("r", self._model.r)])
        texts = [" ".join(lazy_pinyin(v, style=Style.TONE3)) for v in texts]

        print("Read " + str(texts))
        texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
        print("Synthesizing " + str(texts))
        # Preprocess text inputs
        inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
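The updated `lazy_pinyin` call above converts each input text to tone-numbered pinyin before synthesis, with `neutral_tone_with_five=True` so neutral-tone syllables get an explicit `5`. A minimal sketch of what that conversion produces; the exact output depends on your pypinyin version and dictionaries, and the sample sentence is illustrative:

```python
# Minimal sketch: tone-numbered pinyin conversion as used above.
from pypinyin import lazy_pinyin, Style

texts = ["你好吗"]
converted = [" ".join(lazy_pinyin(t, style=Style.TONE3, neutral_tone_with_five=True))
             for t in texts]
print(converted)   # expected along the lines of ['ni3 hao3 ma5']
```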
@@ -143,6 +146,12 @@ class Synthesizer:
        wav = librosa.load(str(fpath), hparams.sample_rate)[0]
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        # denoise: if the clip is long enough (> 0.4 s), build a noise profile from
        # the first and last 0.15 s and run logmmse over the whole waveform
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([wav[:int(hparams.sample_rate * 0.15)],
                                        wav[-int(hparams.sample_rate * 0.15):]])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile)
        return wav

    @staticmethod
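The same edge-based denoising can be exercised on its own: take the first and last 150 ms of a clip as a noise estimate, then clean the full waveform with the repository's `utils.logmmse` helpers (`profile_noise` / `denoise`, as used above). A minimal sketch, assuming a 16 kHz mono wav named `example.wav` (a hypothetical input) and that it runs from the repository root so `utils.logmmse` is importable:

```python
# Minimal sketch of the edge-noise profiling applied when the toolbox loads a wav.
# Assumes utils/logmmse.py from this repository is on the import path.
import numpy as np
import librosa
from utils import logmmse

sample_rate = 16000                                     # assumed synthesizer sample rate
wav, _ = librosa.load("example.wav", sr=sample_rate)    # hypothetical input file

# Only denoise clips longer than 0.4 s, as in the code above.
if len(wav) > sample_rate * (0.3 + 0.1):
    edge = int(sample_rate * 0.15)                      # 150 ms from each end, presumed non-speech
    noise_wav = np.concatenate([wav[:edge], wav[-edge:]])
    profile = logmmse.profile_noise(noise_wav, sample_rate)
    wav = logmmse.denoise(wav, profile)
```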