Mirror of https://github.com/babysor/Realtime-Voice-Clone-Chinese.git (synced 2026-02-09 13:26:37 +08:00)

Compare commits: restruct-p ... dev-fix (32 commits)
| SHA1 |
|---|
| 156723e37c |
| 1862d2145b |
| 72a22d448b |
| 98d38d84c3 |
| 7ab86c6f4c |
| ab79881480 |
| fd93b40398 |
| dbf01347fc |
| 28f9173dfa |
| d073e1f349 |
| baa8b5005d |
| d54f4fb631 |
| 7353888d35 |
| e9ce943f6c |
| 77c145328c |
| 3bce6bbbe7 |
| 9dd8ea11e5 |
| 5a0d77e699 |
| 4914fc0776 |
| 8f95faa0d3 |
| facc97e236 |
| 3d1e3dc542 |
| 536ae8899c |
| 9f1dbeeecc |
| d3bdf816db |
| b78d0d2a26 |
| 5c17fc8bb0 |
| 3ce874ab46 |
| beec0b93ed |
| 9d67b757f0 |
| d1ba355c5f |
| 83d95c6c81 |
.gitignore (vendored, 8 lines changed)

```diff
@@ -14,11 +14,13 @@
 *.bcf
 *.toc
 *.sh
-data/ckpt
-!data/ckpt/vocoder/pretrained/**
+data/ckpt/*/*
 !data/ckpt/encoder/pretrained.pt
+!data/ckpt/vocoder/pretrained/
 wavs
 log
 !/docker-entrypoint.sh
 !/datasets_download/*.sh
 /datasets
+monotonic_align/build
+monotonic_align/monotonic_align
```
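The reworked rules rely on gitignore ordering: `data/ckpt/*/*` ignores everything inside the checkpoint subdirectories, the later `!…` patterns re-include the bundled pretrained files, and the two new `monotonic_align` entries presumably keep that extension's build output out of version control. A quick way to see which rule wins for a given path is `git check-ignore`; the paths below are hypothetical examples, not files guaranteed to exist in the repository:

```shell
# -v prints the .gitignore pattern (and its line number) that decides each path's fate
git check-ignore -v data/ckpt/synthesizer/some_model.pt   # expected to match data/ckpt/*/*
git check-ignore -v data/ckpt/encoder/pretrained.pt       # expected to match the !data/ckpt/encoder/pretrained.pt re-include
git check-ignore -v monotonic_align/build/some_artifact   # expected to match monotonic_align/build
```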
.vscode/launch.json (vendored, 12 lines changed)

```diff
@@ -53,7 +53,7 @@
 "request": "launch",
 "program": "train.py",
 "console": "integratedTerminal",
-"args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
+"args": ["--type", "vits"]
 },
 {
 "name": "Python: PPG Convert",
@@ -64,6 +64,14 @@
 "args": ["-c", ".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2.yaml",
 "-m", ".\\ppg2mel\\saved_models\\best_loss_step_304000.pth", "--wav_dir", ".\\wavs\\input", "--ref_wav_path", ".\\wavs\\pkq.mp3", "-o", ".\\wavs\\output\\"
 ]
-}
+},
+{
+"name": "Python: Vits Train",
+"type": "python",
+"request": "launch",
+"program": "train.py",
+"console": "integratedTerminal",
+"args": ["--type", "vits"]
+},
 ]
 }
```
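The added launch configuration only wraps the new training entry point; outside VS Code the same thing can be started from a terminal (a sketch, assuming it is run from the repository root where the configuration's `program` field places `train.py`):

```shell
# Equivalent of the "Python: Vits Train" launch configuration
python train.py --type vits
```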
README-CN.md (92 lines changed)

````diff
@@ -18,22 +18,10 @@
 
 🌍 **Webserver Ready** 可伺服你的训练结果,供远程调用
 
-### 进行中的工作
-* GUI/客户端大升级与合并
-
-- [x] 初始化框架 `./mkgui` (基于streamlit + fastapi)和 [技术设计](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
-
-- [x] 增加 Voice Cloning and Conversion的演示页面
-
-- [x] 增加Voice Conversion的预处理preprocessing 和训练 training 页面
-
-- [ ] 增加其他的的预处理preprocessing 和训练 training 页面
-
-* 模型后端基于ESPnet2升级
-
 
 ## 开始
 ### 1. 安装要求
+#### 1.1 通用配置
 > 按照原始存储库测试您是否已准备好所有环境。
 运行工具箱(demo_toolbox.py)需要 **Python 3.7 或更高版本** 。
 
@@ -43,6 +31,67 @@
 * 运行`pip install -r requirements.txt` 来安装剩余的必要包。
 * 安装 webrtcvad `pip install webrtcvad-wheels`。
 
+或者
+- 用`conda` 或者 `mamba` 安装依赖
+
+```conda env create -n env_name -f env.yml```
+
+```mamba env create -n env_name -f env.yml```
+
+会创建新环境安装必须的依赖. 之后用 `conda activate env_name` 切换环境就完成了.
+> env.yml只包含了运行时必要的依赖,暂时不包括monotonic-align,如果想要装GPU版本的pytorch可以查看官网教程。
+
+#### 1.2 M1芯片Mac环境配置(Inference Time)
+> 以下环境按x86-64搭建,使用原生的`demo_toolbox.py`,可作为在不改代码情况下快速使用的workaround。
+>
+> 如需使用M1芯片训练,因`demo_toolbox.py`依赖的`PyQt5`不支持M1,则应按需修改代码,或者尝试使用`web.py`。
+
+* 安装`PyQt5`,参考[这个链接](https://stackoverflow.com/a/68038451/20455983)
+* 用Rosetta打开Terminal,参考[这个链接](https://dev.to/courier/tips-and-tricks-to-setup-your-apple-m1-for-development-547g)
+* 用系统Python创建项目虚拟环境
+```
+/usr/bin/python3 -m venv /PathToMockingBird/venv
+source /PathToMockingBird/venv/bin/activate
+```
+* 升级pip并安装`PyQt5`
+```
+pip install --upgrade pip
+pip install pyqt5
+```
+* 安装`pyworld`和`ctc-segmentation`
+> 这里两个文件直接`pip install`的时候找不到wheel,尝试从c里build时找不到`Python.h`报错
+* 安装`pyworld`
+* `brew install python` 通过brew安装python时会自动安装`Python.h`
+* `export CPLUS_INCLUDE_PATH=/opt/homebrew/Frameworks/Python.framework/Headers` 对于M1,brew安装`Python.h`到上述路径。把路径添加到环境变量里
+* `pip install pyworld`
+
+* 安装`ctc-segmentation`
+> 因上述方法没有成功,选择从[github](https://github.com/lumaku/ctc-segmentation) clone源码手动编译
+* `git clone https://github.com/lumaku/ctc-segmentation.git` 克隆到任意位置
+* `cd ctc-segmentation`
+* `source /PathToMockingBird/venv/bin/activate` 假设一开始未开启,打开MockingBird项目的虚拟环境
+* `cythonize -3 ctc_segmentation/ctc_segmentation_dyn.pyx`
+* `/usr/bin/arch -x86_64 python setup.py build` 要注意明确用x86-64架构编译
+* `/usr/bin/arch -x86_64 python setup.py install --optimize=1 --skip-build`用x86-64架构安装
+
+* 安装其他依赖
+* `/usr/bin/arch -x86_64 pip install torch torchvision torchaudio` 这里用pip安装`PyTorch`,明确架构是x86
+* `pip install ffmpeg` 安装ffmpeg
+* `pip install -r requirements.txt`
+
+* 运行
+> 参考[这个链接](https://youtrack.jetbrains.com/issue/PY-46290/Allow-running-Python-under-Rosetta-2-in-PyCharm-for-Apple-Silicon)
+,让项目跑在x86架构环境上
+* `vim /PathToMockingBird/venv/bin/pythonM1`
+* 写入以下代码
+```
+#!/usr/bin/env zsh
+mydir=${0:a:h}
+/usr/bin/arch -x86_64 $mydir/python "$@"
+```
+* `chmod +x pythonM1` 设为可执行文件
+* 如果使用PyCharm,则把Interpreter指向`pythonM1`,否则也可命令行运行`/PathToMockingBird/venv/bin/pythonM1 demo_toolbox.py`
+
 ### 2. 准备预训练模型
 考虑训练您自己专属的模型或者下载社区他人训练好的模型:
 > 近期创建了[知乎专题](https://www.zhihu.com/column/c_1425605280340504576) 将不定期更新炼丹小技巧or心得,也欢迎提问
@@ -64,7 +113,7 @@
 > 假如你下载的 `aidatatang_200zh`文件放在D盘,`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
 
 * 训练合成器:
-`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
+`python ./control/cli/synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
 
 * 当您在训练文件夹 *synthesizer/saved_models/* 中看到注意线显示和损失满足您的需要时,请转到`启动程序`一步。
 
@@ -75,7 +124,7 @@
 | --- | ----------- | ----- | ----- |
 | 作者 | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [百度盘链接](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps 用3个开源数据集混合训练
 | 作者 | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [百度盘链接](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) 提取码:om7f | | 25k steps 用3个开源数据集混合训练, 切换到tag v0.0.1使用
-|@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing [百度盘链接](https://pan.baidu.com/s/1vSYXO4wsLyjnF3Unl-Xoxg) 提取码:1024 | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps 台湾口音需切换到tag v0.0.1使用
+|@FawenYo | https://yisiou-my.sharepoint.com/:u:/g/personal/lawrence_cheng_fawenyo_onmicrosoft_com/EWFWDHzee-NNg9TWdKckCc4BC7bK2j9cCbOWn0-_tK0nOg?e=n0gGgC | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps 台湾口音需切换到tag v0.0.1使用
 |@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ 提取码:2021 | https://www.bilibili.com/video/BV1uh411B7AD/ | 150k steps 注意:根据[issue](https://github.com/babysor/MockingBird/issues/37)修复 并切换到tag v0.0.1使用
 
 #### 2.4训练声码器 (可选)
@@ -86,14 +135,14 @@
 
 
 * 训练wavernn声码器:
-`python vocoder_train.py <trainid> <datasets_root>`
+`python ./control/cli/vocoder_train.py <trainid> <datasets_root>`
 > `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
 
 * 训练hifigan声码器:
-`python vocoder_train.py <trainid> <datasets_root> hifigan`
+`python ./control/cli/vocoder_train.py <trainid> <datasets_root> hifigan`
 > `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
 * 训练fregan声码器:
-`python vocoder_train.py <trainid> <datasets_root> --config config.json fregan`
+`python ./control/cli/vocoder_train.py <trainid> <datasets_root> --config config.json fregan`
 > `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
 * 将GAN声码器的训练切换为多GPU模式:修改GAN文件夹下.json文件中的"num_gpus"参数
 ### 3. 启动程序或工具箱
@@ -114,7 +163,7 @@
 想像柯南拿着变声器然后发出毛利小五郎的声音吗?本项目现基于PPG-VC,引入额外两个模块(PPG extractor + PPG2Mel), 可以实现变声功能。(文档不全,尤其是训练部分,正在努力补充中)
 #### 4.0 准备环境
 * 确保项目以上环境已经安装ok,运行`pip install espnet` 来安装剩余的必要包。
 * 下载以下模型 链接:https://pan.baidu.com/s/1bl_x_DHJSAUyN2fma-Q_Wg
 提取码:gh41
 * 24K采样率专用的vocoder(hifigan)到 *vocoder\saved_models\xxx*
 * 预训练的ppg特征encoder(ppg_extractor)到 *ppg_extractor\saved_models\xxx*
@@ -124,14 +173,14 @@
 
 * 下载aidatatang_200zh数据集并解压:确保您可以访问 *train* 文件夹中的所有音频文件(如.wav)
 * 进行音频和梅尔频谱图预处理:
-`python pre4ppg.py <datasets_root> -d {dataset} -n {number}`
+`python ./control/cli/pre4ppg.py <datasets_root> -d {dataset} -n {number}`
 可传入参数:
 * `-d {dataset}` 指定数据集,支持 aidatatang_200zh, 不传默认为aidatatang_200zh
 * `-n {number}` 指定并行数,CPU 11700k在8的情况下,需要运行12到18小时!待优化
 > 假如你下载的 `aidatatang_200zh`文件放在D盘,`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
 
 * 训练合成器, 注意在上一步先下载好`ppg2mel.yaml`, 修改里面的地址指向预训练好的文件夹:
-`python ppg2mel_train.py --config .\ppg2mel\saved_models\ppg2mel.yaml --oneshotvc `
+`python ./control/cli/ppg2mel_train.py --config .\ppg2mel\saved_models\ppg2mel.yaml --oneshotvc `
 * 如果想要继续上一次的训练,可以通过`--load .\ppg2mel\saved_models\<old_pt_file>` 参数指定一个预训练模型文件。
 
 #### 4.2 启动工具箱VC模式
@@ -232,4 +281,3 @@ voc_pad =2
 
 
 
-
````
README-LINUX-CN.md (new file, 223 lines)

````markdown
## 实时语音克隆 - 中文/普通话


[](http://choosealicense.com/licenses/mit/)

### [English](README.md) | 中文

### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/) | [Wiki教程](https://github.com/babysor/MockingBird/wiki/Quick-Start-(Newbie)) | [训练教程](https://vaj2fgg8yn.feishu.cn/docs/doccn7kAbr3SJz0KM0SIDJ0Xnhd)

## 特性
🌍 **中文** 支持普通话并使用多种中文数据集进行测试:aidatatang_200zh, magicdata, aishell3, biaobei, MozillaCommonVoice, data_aishell 等

🤩 **Easy & Awesome** 仅需下载或新训练合成器(synthesizer)就有良好效果,复用预训练的编码器/声码器,或实时的HiFi-GAN作为vocoder

🌍 **Webserver Ready** 可伺服你的训练结果,供远程调用。

🤩 **感谢各位小伙伴的支持,本项目将开启新一轮的更新**

## 1.快速开始
### 1.1 建议环境
- Ubuntu 18.04
- Cuda 11.7 && CuDNN 8.5.0
- Python 3.8 或 3.9
- Pytorch 2.0.1 <post cuda-11.7>
### 1.2 环境配置
```shell
# 下载前建议更换国内镜像源

conda create -n sound python=3.9

conda activate sound

git clone https://github.com/babysor/MockingBird.git

cd MockingBird

pip install -r requirements.txt

pip install webrtcvad-wheels

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
```

### 1.3 模型准备
> 当实在没有设备或者不想慢慢调试,可以使用社区贡献的模型(欢迎持续分享):

| 作者 | 下载链接 | 效果预览 | 信息 |
| --- | ----------- | ----- | ----- |
| 作者 | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [百度盘链接](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps 用3个开源数据集混合训练
| 作者 | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [百度盘链接](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) 提取码:om7f | | 25k steps 用3个开源数据集混合训练, 切换到tag v0.0.1使用
|@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing [百度盘链接](https://pan.baidu.com/s/1vSYXO4wsLyjnF3Unl-Xoxg) 提取码:1024 | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps 台湾口音需切换到tag v0.0.1使用
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ 提取码:2021 | https://www.bilibili.com/video/BV1uh411B7AD/ | 150k steps 注意:根据[issue](https://github.com/babysor/MockingBird/issues/37)修复 并切换到tag v0.0.1使用

### 1.4 文件结构准备
文件结构准备如下所示,算法将自动遍历synthesizer下的.pt模型文件。
```
# 以第一个 pretrained-11-7-21_75k.pt 为例

└── data
└── ckpt
└── synthesizer
└── pretrained-11-7-21_75k.pt
```
### 1.5 运行
```
python web.py
```

## 2.模型训练
### 2.1 数据准备
#### 2.1.1 数据下载
``` shell
# aidatatang_200zh

wget https://openslr.elda.org/resources/62/aidatatang_200zh.tgz
```
``` shell
# MAGICDATA

wget https://openslr.magicdatatech.com/resources/68/train_set.tar.gz

wget https://openslr.magicdatatech.com/resources/68/dev_set.tar.gz

wget https://openslr.magicdatatech.com/resources/68/test_set.tar.gz
```
``` shell
# AISHELL-3

wget https://openslr.elda.org/resources/93/data_aishell3.tgz
```
```shell
# Aishell

wget https://openslr.elda.org/resources/33/data_aishell.tgz
```
#### 2.1.2 数据批量解压
```shell
# 该指令为解压当前目录下的所有压缩文件

for gz in *.gz; do tar -zxvf $gz; done
```
### 2.2 encoder模型训练
#### 2.2.1 数据预处理:
需要先在`pre.py `头部加入:
```python
import torch
torch.multiprocessing.set_start_method('spawn', force=True)
```
使用以下指令对数据预处理:
```shell
python pre.py <datasets_root> \
-d <datasets_name>
```
其中`<datasets_root>`为原数据集路径,`<datasets_name>` 为数据集名称。

支持 `librispeech_other`,`voxceleb1`,`aidatatang_200zh`,使用逗号分割处理多数据集。

### 2.2.2 encoder模型训练:
超参数文件路径:`models/encoder/hparams.py`
```shell
python encoder_train.py <name> \
<datasets_root>/SV2TTS/encoder
```
其中 `<name>` 是训练产生文件的名称,可自行修改。

其中 `<datasets_root>` 是经过 `Step 2.1.1` 处理过后的数据集路径。
#### 2.2.3 开启encoder模型训练数据可视化(可选)
```shell
visdom
```

### 2.3 synthesizer模型训练
#### 2.3.1 数据预处理:
```shell
python pre.py <datasets_root> \
-d <datasets_name> \
-o <datasets_path> \
-n <number>
```
`<datasets_root>` 为原数据集路径,当你的`aidatatang_200zh`路径为`/data/aidatatang_200zh/corpus/train`时,`<datasets_root>` 为 `/data/`。

`<datasets_name>` 为数据集名称。

`<datasets_path>` 为数据集处理后的保存路径。

`<number>` 为数据集处理时进程数,根据CPU情况调整大小。

#### 2.3.2 新增数据预处理:
```shell
python pre.py <datasets_root> \
-d <datasets_name> \
-o <datasets_path> \
-n <number> \
-s
```
当新增数据集时,应加 `-s` 选择数据拼接,不加则为覆盖。
#### 2.3.3 synthesizer模型训练:
超参数文件路径:`models/synthesizer/hparams.py`,需将`MockingBird/control/cli/synthesizer_train.py`移成`MockingBird/synthesizer_train.py`结构。
```shell
python synthesizer_train.py <name> <datasets_path> \
-m <out_dir>
```
其中 `<name>` 是训练产生文件的名称,可自行修改。

其中 `<datasets_path>` 是经过 `Step 2.2.1` 处理过后的数据集路径。

其中 `<out_dir> `为训练时所有数据的保存路径。

### 2.4 vocoder模型训练
vocoder模型对生成效果影响不大,已预置3款。
#### 2.4.1 数据预处理
```shell
python vocoder_preprocess.py <datasets_root> \
-m <synthesizer_model_path>
```

其中`<datasets_root>`为你数据集路径。

其中 `<synthesizer_model_path>`为synthesizer模型地址。

#### 2.4.2 wavernn声码器训练:
```
python vocoder_train.py <name> <datasets_root>
```
#### 2.4.3 hifigan声码器训练:
```
python vocoder_train.py <name> <datasets_root> hifigan
```
#### 2.4.4 fregan声码器训练:
```
python vocoder_train.py <name> <datasets_root> \
--config config.json fregan
```
将GAN声码器的训练切换为多GPU模式:修改`GAN`文件夹下`.json`文件中的`num_gpus`参数。


## 3.致谢
### 3.1 项目致谢
该库一开始从仅支持英语的[Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) 分叉出来的,鸣谢作者。
### 3.2 论文致谢
| URL | Designation | 标题 | 实现源码 |
| --- | ----------- | ----- | --------------------- |
| [1803.09017](https://arxiv.org/abs/1803.09017) | GlobalStyleToken (synthesizer)| Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis | 本代码库 |
| [2010.05646](https://arxiv.org/abs/2010.05646) | HiFi-GAN (vocoder)| Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | 本代码库 |
| [2106.02297](https://arxiv.org/abs/2106.02297) | Fre-GAN (vocoder)| Fre-GAN: Adversarial Frequency-consistent Audio Synthesis | 本代码库 |
|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | SV2TTS | Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis | 本代码库 |
|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | 本代码库 |

### 3.3 开发者致谢

作为AI领域的从业者,我们不仅乐于开发一些具有里程碑意义的算法项目,同时也乐于分享项目以及开发过程中收获的喜悦。

因此,你们的使用是对我们项目的最大认可。同时当你们在项目使用中遇到一些问题时,欢迎你们随时在issue上留言。你们的指正这对于项目的后续优化具有十分重大的的意义。

为了表示感谢,我们将在本项目中留下各位开发者信息以及相对应的贡献。

- ------------------------------------------------ 开 发 者 贡 献 内 容 ---------------------------------------------------------------------------------
````
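Section 1.1 of the new Linux guide pins a fairly specific stack (Ubuntu 18.04, CUDA 11.7, Python 3.8/3.9, PyTorch 2.0.1). A small sanity check to run after the `pip install torch ... --index-url .../cu117` step, offered as a sketch rather than project code:

```python
# Verify that the installed PyTorch build matches the recommended CUDA 11.7 setup
import torch

print(torch.__version__)          # e.g. '2.0.1+cu117'
print(torch.version.cuda)         # expected '11.7'
print(torch.cuda.is_available())  # True when the driver and the CUDA build line up
```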
README.md (83 lines changed)

````diff
@@ -3,7 +3,7 @@
 
 [](http://choosealicense.com/licenses/mit/)
 
-> English | [中文](README-CN.md)
+> English | [中文](README-CN.md)| [中文Linux](README-LINUX-CN.md)
 
 ## Features
 🌍 **Chinese** supported mandarin and tested with multiple datasets: aidatatang_200zh, magicdata, aishell3, data_aishell, and etc.
@@ -18,17 +18,10 @@
 
 ### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/)
 
-### Ongoing Works(Helps Needed)
-* Major upgrade on GUI/Client and unifying web and toolbox
-[X] Init framework `./mkgui` and [tech design](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
-[X] Add demo part of Voice Cloning and Conversion
-[X] Add preprocessing and training for Voice Conversion
-[ ] Add preprocessing and training for Encoder/Synthesizer/Vocoder
-* Major upgrade on model backend based on ESPnet2(not yet started)
-
 ## Quick Start
 
 ### 1. Install Requirements
+#### 1.1 General Setup
 > Follow the original repo to test if you got all environment ready.
 **Python 3.7 or higher ** is needed to run the toolbox.
 
@@ -37,8 +30,74 @@
 * Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
 * Run `pip install -r requirements.txt` to install the remaining necessary packages.
 * Install webrtcvad `pip install webrtcvad-wheels`(If you need)
-> Note that we are using the pretrained encoder/vocoder but synthesizer since the original model is incompatible with the Chinese symbols. It means the demo_cli is not working at this moment.
+or
+- install dependencies with `conda` or `mamba`
+
+```conda env create -n env_name -f env.yml```
+
+```mamba env create -n env_name -f env.yml```
+
+will create a virtual environment where necessary dependencies are installed. Switch to the new environment by `conda activate env_name` and enjoy it.
+> env.yml only includes the necessary dependencies to run the project,temporarily without monotonic-align. You can check the official website to install the GPU version of pytorch.
+
+#### 1.2 Setup with a M1 Mac
+> The following steps are a workaround to directly use the original `demo_toolbox.py`without the changing of codes.
+>
+> Since the major issue comes with the PyQt5 packages used in `demo_toolbox.py` not compatible with M1 chips, were one to attempt on training models with the M1 chip, either that person can forgo `demo_toolbox.py`, or one can try the `web.py` in the project.
+
+##### 1.2.1 Install `PyQt5`, with [ref](https://stackoverflow.com/a/68038451/20455983) here.
+* Create and open a Rosetta Terminal, with [ref](https://dev.to/courier/tips-and-tricks-to-setup-your-apple-m1-for-development-547g) here.
+* Use system Python to create a virtual environment for the project
+```
+/usr/bin/python3 -m venv /PathToMockingBird/venv
+source /PathToMockingBird/venv/bin/activate
+```
+* Upgrade pip and install `PyQt5`
+```
+pip install --upgrade pip
+pip install pyqt5
+```
+##### 1.2.2 Install `pyworld` and `ctc-segmentation`
+
+> Both packages seem to be unique to this project and are not seen in the original [Real-Time Voice Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) project. When installing with `pip install`, both packages lack wheels so the program tries to directly compile from c code and could not find `Python.h`.
+
+* Install `pyworld`
+* `brew install python` `Python.h` can come with Python installed by brew
+* `export CPLUS_INCLUDE_PATH=/opt/homebrew/Frameworks/Python.framework/Headers` The filepath of brew-installed `Python.h` is unique to M1 MacOS and listed above. One needs to manually add the path to the environment variables.
+* `pip install pyworld` that should do.
+
+
+* Install`ctc-segmentation`
+> Same method does not apply to `ctc-segmentation`, and one needs to compile it from the source code on [github](https://github.com/lumaku/ctc-segmentation).
+* `git clone https://github.com/lumaku/ctc-segmentation.git`
+* `cd ctc-segmentation`
+* `source /PathToMockingBird/venv/bin/activate` If the virtual environment hasn't been deployed, activate it.
+* `cythonize -3 ctc_segmentation/ctc_segmentation_dyn.pyx`
+* `/usr/bin/arch -x86_64 python setup.py build` Build with x86 architecture.
+* `/usr/bin/arch -x86_64 python setup.py install --optimize=1 --skip-build`Install with x86 architecture.
+
+##### 1.2.3 Other dependencies
+* `/usr/bin/arch -x86_64 pip install torch torchvision torchaudio` Pip installing `PyTorch` as an example, articulate that it's installed with x86 architecture
+* `pip install ffmpeg` Install ffmpeg
+* `pip install -r requirements.txt` Install other requirements.
+
+##### 1.2.4 Run the Inference Time (with Toolbox)
+> To run the project on x86 architecture. [ref](https://youtrack.jetbrains.com/issue/PY-46290/Allow-running-Python-under-Rosetta-2-in-PyCharm-for-Apple-Silicon).
+* `vim /PathToMockingBird/venv/bin/pythonM1` Create an executable file `pythonM1` to condition python interpreter at `/PathToMockingBird/venv/bin`.
+* Write in the following content:
+```
+#!/usr/bin/env zsh
+mydir=${0:a:h}
+/usr/bin/arch -x86_64 $mydir/python "$@"
+```
+* `chmod +x pythonM1` Set the file as executable.
+* If using PyCharm IDE, configure project interpreter to `pythonM1`([steps here](https://www.jetbrains.com/help/pycharm/configuring-python-interpreter.html#add-existing-interpreter)), if using command line python, run `/PathToMockingBird/venv/bin/pythonM1 demo_toolbox.py`
+
+
 ### 2. Prepare your models
+> Note that we are using the pretrained encoder/vocoder but not synthesizer, since the original model is incompatible with the Chinese symbols. It means the demo_cli is not working at this moment, so additional synthesizer models are required.
+
 You can either train your models or use existing ones:
 
 #### 2.1 Train encoder with your dataset (Optional)
@@ -56,7 +115,7 @@ You can either train your models or use existing ones:
 Allowing parameter `--dataset {dataset}` to support aidatatang_200zh, magicdata, aishell3, data_aishell, etc.If this parameter is not passed, the default dataset will be aidatatang_200zh.
 
 * Train the synthesizer:
-`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
+`python train.py --type=synth mandarin <datasets_root>/SV2TTS/synthesizer`
 
 * Go to next step when you see attention line show and loss meet your need in training folder *synthesizer/saved_models/*.
 
@@ -67,7 +126,7 @@ Allowing parameter `--dataset {dataset}` to support aidatatang_200zh, magicdata,
 | --- | ----------- | ----- |----- |
 | @author | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [Baidu](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps trained by multiple datasets
 | @author | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [Baidu](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) code:om7f | | 25k steps trained by multiple datasets, only works under version 0.0.1
-|@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing https://u.teknik.io/AYxWf.pt | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps with local accent of Taiwan, only works under version 0.0.1
+|@FawenYo | https://yisiou-my.sharepoint.com/:u:/g/personal/lawrence_cheng_fawenyo_onmicrosoft_com/EWFWDHzee-NNg9TWdKckCc4BC7bK2j9cCbOWn0-_tK0nOg?e=n0gGgC | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps with local accent of Taiwan, only works under version 0.0.1
 |@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 https://www.aliyundrive.com/s/AwPsbo8mcSP code: z2m0 | https://www.bilibili.com/video/BV1uh411B7AD/ | only works under version 0.0.1
 
 #### 2.4 Train vocoder (Optional)
````
Unnamed file (path not shown in this export); the hunk touches the `__main__` block that selects the vocoder type:

```diff
@@ -78,7 +78,7 @@ if __name__ == "__main__":
 else:
 train_hifigan(0, args, h)
 elif args.vocoder_type == "fregan":
-with open('vocoder/fregan/config.json') as f:
+with Path('vocoder/fregan/config.json').open() as f:
 json_config = json.load(f)
 h = AttrDict(json_config)
 if h.num_gpus > 1:
```
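The single change in this hunk reads the Fre-GAN config through `pathlib.Path.open()` instead of the `open()` builtin. A standalone sketch of the same pattern; the `AttrDict` below is a minimal stand-in for the project's helper, included only to make the snippet self-contained:

```python
import json
from pathlib import Path

class AttrDict(dict):
    """Minimal stand-in: expose the loaded JSON keys as attributes (h.num_gpus, h.batch_size, ...)."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__dict__ = self

config_path = Path("vocoder/fregan/config.json")  # path taken from the diff above
with config_path.open() as f:
    h = AttrDict(json.load(f))

print(h.num_gpus)
```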
Unnamed file (path not shown in this export); the hunks touch a colormap array and `UI.set_loading` in the toolbox UI:

```diff
@@ -33,7 +33,7 @@ colormap = np.array([
 [0, 0, 0],
 [183, 183, 183],
 [76, 255, 0],
-], dtype=np.float) / 255
+], dtype=float) / 255
 
 default_text = \
 "欢迎使用工具箱, 现已支持中文输入!"
@@ -402,8 +402,8 @@ class UI(QDialog):
 self.app.processEvents()
 
 def set_loading(self, value, maximum=1):
-self.loading_bar.setValue(value * 100)
-self.loading_bar.setMaximum(maximum * 100)
+self.loading_bar.setValue(int(value * 100))
+self.loading_bar.setMaximum(int(maximum * 100))
 self.loading_bar.setTextVisible(value != 0)
 self.app.processEvents()
 
```
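Both hunks are compatibility fixes rather than behaviour changes: NumPy deprecated the `np.float` alias in 1.20 and removed it in 1.24 (the builtin `float` replaces it), and newer PyQt releases reject float arguments to integer-typed slots such as `QProgressBar.setValue`/`setMaximum`, hence the `int(...)` casts. A small illustration of the NumPy side:

```python
import numpy as np

# dtype=float is the portable spelling; np.float no longer exists in NumPy >= 1.24
colormap = np.array([
    [0, 0, 0],
    [183, 183, 183],
    [76, 255, 0],
], dtype=float) / 255

assert colormap.dtype == np.float64  # the builtin float maps to 64-bit floating point here
```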
BIN  data/ckpt/encoder/pretrained.pt (new file; binary file not shown)
data/ckpt/vocoder/pretrained/config_16k.json (new file, 31 lines)

```json
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,

"upsample_rates": [5,5,4,2],
"upsample_kernel_sizes": [10,10,8,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],

"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 200,
"win_size": 800,

"sampling_rate": 16000,

"fmin": 0,
"fmax": 7600,
"fmax_for_loss": null,

"num_workers": 4
}
```
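A useful internal consistency property of HiFi-GAN-style vocoder configs is that the product of `upsample_rates` must equal `hop_size`, so the generator emits exactly one waveform sample per hop of the input mel spectrogram; here 5 × 5 × 4 × 2 = 200. A quick check, offered as a sketch (the path assumes the file lands where the diff puts it):

```python
import json
from math import prod
from pathlib import Path

cfg = json.loads(Path("data/ckpt/vocoder/pretrained/config_16k.json").read_text())

assert prod(cfg["upsample_rates"]) == cfg["hop_size"]          # 5*5*4*2 == 200
print(cfg["sampling_rate"] / cfg["hop_size"], "mel frames/s")  # 16000 / 200 = 80.0
```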
BIN  data/ckpt/vocoder/pretrained/g_hifigan.pt (new file; binary file not shown)

BIN  data/ckpt/vocoder/pretrained/pretrained.pt (new file; binary file not shown)
Unnamed file (path not shown in this export); the hunks touch the audio preprocessing helpers `preprocess_wav` and `trim_long_silences`:

```diff
@@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
 
 # Resample the wav if needed
 if source_sr is not None and source_sr != sampling_rate:
-wav = librosa.resample(wav, source_sr, sampling_rate)
+wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)
 
 # Apply the preprocessing: normalize volume and shorten long silences
 if normalize:
@@ -99,7 +99,7 @@ def trim_long_silences(wav):
 return ret[width - 1:] / width
 
 audio_mask = moving_average(voice_flags, vad_moving_average_width)
-audio_mask = np.round(audio_mask).astype(np.bool)
+audio_mask = np.round(audio_mask).astype(bool)
 
 # Dilate the voiced regions
 audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
```
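The `librosa.resample` edit tracks an upstream API break: from librosa 0.10 the sample-rate arguments are keyword-only, so the old positional call raises a TypeError. A minimal sketch of the new form:

```python
import numpy as np
import librosa

wav = np.zeros(32000, dtype=np.float32)  # stand-in signal: 2 s of silence at 16 kHz

# librosa >= 0.10: orig_sr / target_sr must be passed by keyword
wav_8k = librosa.resample(wav, orig_sr=16000, target_sr=8000)
print(wav_8k.shape)  # (16000,)
```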
Unnamed file (path not shown in this export); the hunk touches a colormap array next to `class Visualizations`:

```diff
@@ -21,7 +21,7 @@ colormap = np.array([
 [33, 0, 127],
 [0, 0, 0],
 [183, 183, 183],
-], dtype=np.float) / 255
+], dtype=float) / 255
 
 
 class Visualizations:
```
Unnamed file (path not shown in this export); the hunk touches `class LogMel`:

```diff
@@ -31,14 +31,13 @@ class LogMel(torch.nn.Module):
 fs: int = 16000,
 n_fft: int = 512,
 n_mels: int = 80,
-fmin: float = None,
+fmin: float = 0,
 fmax: float = None,
 htk: bool = False,
 norm=1,
 ):
 super().__init__()
 
-fmin = 0 if fmin is None else fmin
 fmax = fs / 2 if fmax is None else fmax
 _mel_options = dict(
 sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
```
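The `LogMel` change folds the old `fmin = 0 if fmin is None else fmin` fallback into the parameter default itself; the mel options built from these arguments are unchanged. Those options line up with the `librosa.filters.mel` signature, so presumably that is the filter bank being constructed. For orientation, a sketch of that call with the class defaults (this is not the module itself):

```python
import librosa

# 80 mel filters over a 512-point FFT at 16 kHz, fmin=0 and fmax defaulting to fs/2
mel_basis = librosa.filters.mel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000, htk=False, norm=1)
print(mel_basis.shape)  # (80, 257) == (n_mels, 1 + n_fft // 2)
```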
Unnamed file (path not shown in this export); the hunk touches `_griffin_lim`:

```diff
@@ -107,7 +107,7 @@ def _griffin_lim(S, hparams):
 Based on https://github.com/librosa/librosa/issues/434
 """
 angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-S_complex = np.abs(S).astype(np.complex)
+S_complex = np.abs(S).astype(complex)
 y = _istft(S_complex * angles, hparams)
 for i in range(hparams.griffin_lim_iters):
 angles = np.exp(1j * np.angle(_stft(y, hparams)))
```
Unnamed file (path not shown in this export); the hunks touch the synthesizer `HParams` definition:

```diff
@@ -3,10 +3,10 @@ from utils.hparams import HParams
 hparams = HParams(
 ### Signal Processing (used in both synthesizer and vocoder)
 sample_rate = 16000,
-n_fft = 800,
+n_fft = 1024, # filter_length
 num_mels = 80,
-hop_size = 200, # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
-win_size = 800, # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
+hop_size = 256, # Tacotron uses 12.5 ms frame shift (set to sample_rate * 0.0125)
+win_size = 1024, # Tacotron uses 50 ms frame length (set to sample_rate * 0.050)
 fmin = 55,
 min_level_db = -100,
 ref_level_db = 20,
@@ -67,7 +67,7 @@ hparams = HParams(
 use_lws = False, # "Fast spectrogram phase recovery using local weighted sums"
 symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,
 # and [0, max_abs_value] if False
-trim_silence = True, # Use with sample_rate of 16000 for best results
+trim_silence = False, # Use with sample_rate of 16000 for best results
 
 ### SV2TTS
 speaker_embedding_size = 256, # Dimension for the speaker embedding
```
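The hop and window changes can be checked against the inline comments: at 16 kHz the old `hop_size=200` / `win_size=800` are exactly the 12.5 ms shift and 50 ms length the comments describe, while the new `hop_size=256` / `win_size=1024` work out to 16 ms and 64 ms (with `n_fft` now equal to `win_size`), so those comments now describe the old values. The arithmetic, as a sketch:

```python
sample_rate = 16000
for hop, win in [(200, 800), (256, 1024)]:
    print(f"hop={hop}: {hop / sample_rate * 1000:g} ms shift, win={win}: {win / sample_rate * 1000:g} ms length")
# hop=200: 12.5 ms shift, win=800: 50 ms length
# hop=256: 16 ms shift, win=1024: 64 ms length
```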
Unnamed file (path not shown in this export); the hunks touch the `Vits` synthesizer model (imports, class definition, `forward` and `infer`):

```diff
@@ -2,12 +2,12 @@ import math
 import torch
 from torch import nn
 from torch.nn import functional as F
+from loguru import logger
 
 from .sublayer.vits_modules import *
 import monotonic_align
 
-from .base import Base
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from utils.util import init_weights, get_padding, sequence_mask, rand_slice_segments, generate_path
 
@@ -386,7 +386,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
 
 return y_d_rs, y_d_gs, fmap_rs, fmap_gs
 
-class Vits(Base):
+class Vits(nn.Module):
 """
 Synthesizer of Vits
 """
@@ -408,13 +408,12 @@ class Vits(Base):
 upsample_rates,
 upsample_initial_channel,
 upsample_kernel_sizes,
-stop_threshold,
 n_speakers=0,
 gin_channels=0,
 use_sdp=True,
 **kwargs):
 
-super().__init__(stop_threshold)
+super().__init__()
 self.n_vocab = n_vocab
 self.spec_channels = spec_channels
 self.inter_channels = inter_channels
@@ -457,7 +456,7 @@ class Vits(Base):
 self.emb_g = nn.Embedding(n_speakers, gin_channels)
 
 def forward(self, x, x_lengths, y, y_lengths, sid=None, emo=None):
+# logger.info(f'====> Forward: 1.1.0')
 x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emo)
 if self.n_speakers > 0:
 g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
@@ -466,7 +465,7 @@ class Vits(Base):
 
 z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
 z_p = self.flow(z, y_mask, g=g)
+# logger.info(f'====> Forward: 1.1.1')
 with torch.no_grad():
 # negative cross-entropy
 s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
@@ -475,10 +474,11 @@ class Vits(Base):
 neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
 neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
 neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
+#logger.info(f'====> Forward: 1.1.1.1')
 attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
 attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
 
+# logger.info(f'====> Forward: 1.1.2')
 w = attn.sum(2)
 if self.use_sdp:
 l_length = self.dp(x, x_mask, w, g=g)
@@ -487,7 +487,6 @@ class Vits(Base):
 logw_ = torch.log(w + 1e-6) * x_mask
 logw = self.dp(x, x_mask, g=g)
 l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
-
 # expand prior
 m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
 logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
@@ -497,7 +496,9 @@ class Vits(Base):
 return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
 
 def infer(self, x, x_lengths, sid=None, emo=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
+# logger.info(f'====> Infer: 1.1.0')
 x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths,emo)
+# logger.info(f'====> Infer: 1.1.1')
 if self.n_speakers > 0:
 g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
 else:
@@ -514,11 +515,14 @@ class Vits(Base):
 attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
 attn = generate_path(w_ceil, attn_mask)
 
+# logger.info(f'====> Infer: 1.1.2')
 m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
 logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
 
 z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
 z = self.flow(z_p, y_mask, g=g, reverse=True)
 o = self.dec((z * y_mask)[:,:,:max_len], g=g)
 
+# logger.info(f'====> Infer: 1.1.3')
 return o, attn, y_mask, (z, z_p, m_p, logs_p)
 
```
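The model file now imports `loguru` and carries a set of commented-out `logger.info` trace points along the forward and inference paths. If those traces are ever re-enabled, loguru works with no configuration; a minimal usage sketch:

```python
from loguru import logger

logger.info("====> Forward: 1.1.0")          # timestamped record to stderr by default
logger.add("vits_debug.log", level="DEBUG")  # optionally also append records to a file
```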
Unnamed file (path not shown in this export); the hunks touch `preprocess_dataset`, `embed_utterance`/`create_embeddings` and the new `create_emo` in the synthesizer preprocessing code:

```diff
@@ -6,7 +6,7 @@ from pathlib import Path
 from tqdm import tqdm
 import numpy as np
 from models.encoder import inference as encoder
-from models.synthesizer.preprocess_audio import preprocess_general
+from models.synthesizer.preprocess_audio import preprocess_general, extract_emo
 from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata
 
 data_info = {
@@ -39,9 +39,12 @@ data_info = {
 }
 }
 
+def should_skip(fpath: Path, skip_existing: bool) -> bool:
+return skip_existing and fpath.exists()
+
 def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
 skip_existing: bool, hparams, no_alignments: bool,
-dataset: str, emotion_extract = False):
+dataset: str, emotion_extract = False, encoder_model_fpath=None):
 dataset_info = data_info[dataset]
 # Gather the input directories
 dataset_root = datasets_root.joinpath(dataset)
@@ -77,13 +80,13 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
 speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
 
 func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
-hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, emotion_extract=emotion_extract)
-job = Pool(n_processes).imap(func, speaker_dirs)
+hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
+job = Pool(n_processes).imap_unordered(func, speaker_dirs)
 
 for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
 if speaker_metadata is not None:
 for metadatum in speaker_metadata:
-metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
+metadata_file.write("|".join(map(str,metadatum)) + "\n")
 metadata_file.close()
 
 # Verify the contents of the metadata file
@@ -99,7 +102,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
 print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
 print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
 
-def embed_utterance(fpaths, encoder_model_fpath):
+def _embed_utterance(fpaths: str, encoder_model_fpath: str):
 if not encoder.is_loaded():
 encoder.load_model(encoder_model_fpath)
 
@@ -110,8 +113,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
 embed = encoder.embed_utterance(wav)
 np.save(embed_fpath, embed, allow_pickle=False)
 
+def _emo_extract_from_utterance(fpaths, hparams):
+wav_fpath, emo_fpath = fpaths
+wav = np.load(wav_fpath)
+emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
+np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
+
-def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
+def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):
 wav_dir = synthesizer_root.joinpath("audio")
 metadata_fpath = synthesizer_root.joinpath("train.txt")
 assert wav_dir.exists() and metadata_fpath.exists()
@@ -121,10 +129,28 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
 # Gather the input wave filepath and the target output embed filepath
 with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
 metadata = [line.split("|") for line in metadata_file]
-fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
+fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata if not should_skip(embed_dir.joinpath(m[2]), skip_existing)]
 
+# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
+# Embed the utterances in separate threads
+func = partial(_embed_utterance, encoder_model_fpath=encoder_model_fpath)
+job = Pool(n_processes).imap(func, fpaths)
+tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
+
+def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
+wav_dir = synthesizer_root.joinpath("audio")
+metadata_fpath = synthesizer_root.joinpath("train.txt")
+assert wav_dir.exists() and metadata_fpath.exists()
+emo_dir = synthesizer_root.joinpath("emo")
+emo_dir.mkdir(exist_ok=True)
+
+# Gather the input wave filepath and the target output embed filepath
+with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
+metadata = [line.split("|") for line in metadata_file]
+fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata if not should_skip(emo_dir.joinpath(m[0].replace("audio-", "emo-")), skip_existing)]
+
 # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
 # Embed the utterances in separate threads
-func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
+func = partial(_emo_extract_from_utterance, hparams=hparams)
 job = Pool(n_processes).imap(func, fpaths)
-list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
+tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
```
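Two of the changes above deserve a note: `should_skip` centralises the skip-existing-output test that was previously inlined at the call sites, and `Pool.imap` becomes `Pool.imap_unordered`, which yields results as workers finish rather than in submission order, a good fit when the results only feed a progress bar, as they do here. A standalone sketch of the ordering difference with a toy worker:

```python
from multiprocessing import Pool
import random
import time

def slow_square(n):
    time.sleep(random.random() * 0.1)  # simulate uneven per-item work
    return n * n

if __name__ == "__main__":
    with Pool(4) as pool:
        print(list(pool.imap(slow_square, range(8))))            # results arrive in input order
        print(list(pool.imap_unordered(slow_square, range(8))))  # results arrive in completion order
```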
@@ -13,15 +13,17 @@ import torch
|
|||||||
from transformers import Wav2Vec2Processor
|
from transformers import Wav2Vec2Processor
|
||||||
from .models.wav2emo import EmotionExtractorModel
|
from .models.wav2emo import EmotionExtractorModel
|
||||||
|
|
||||||
SAMPLE_RATE = 16000
|
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
|
||||||
|
pass
|
||||||
|
|
||||||
|
pinyin = Pinyin(PinyinConverter()).pinyin
|
||||||
|
|
||||||
|
|
||||||
# load model from hub
|
# load model from hub
|
||||||
device = 'cuda' if torch.cuda.is_available() else "cpu"
|
device = 'cuda' if torch.cuda.is_available() else "cpu"
|
||||||
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
|
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
|
||||||
processor = Wav2Vec2Processor.from_pretrained(model_name)
|
processor = Wav2Vec2Processor.from_pretrained(model_name)
|
||||||
model = EmotionExtractorModel.from_pretrained(model_name).to(device)
|
model = EmotionExtractorModel.from_pretrained(model_name).to(device)
|
||||||
embs = []
|
|
||||||
wavnames = []
|
|
||||||
|
|
||||||
def extract_emo(
|
def extract_emo(
|
||||||
x: np.ndarray,
|
x: np.ndarray,
|
||||||
@@ -42,16 +44,8 @@ def extract_emo(
|
|||||||
|
|
||||||
return y
|
return y
|
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
pass
pinyin = Pinyin(PinyinConverter()).pinyin


def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams, emotion_extract: bool):
mel_fpath: str, wav_fpath: str, hparams, encoder_model_fpath):
## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer.
@@ -64,16 +58,10 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
# without extra padding. This means that you won't have an exact relation between the length
# of the wav and of the mel spectrogram. See the vocoder data loader.

# Skip existing utterances if needed
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % basename)
skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
if skip_existing and mel_fpath.exists() and wav_fpath.exists() and skip_emo_extract:
return None

# Trim silence
if hparams.trim_silence:
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

# Skip utterances that are too short
@@ -91,18 +79,14 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
np.save(wav_fpath, wav, allow_pickle=False)

if not skip_emo_extract:
emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
np.save(emo_fpath, emo, allow_pickle=False)

# Return a tuple describing this training example
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, wav, mel_frames, text


def _split_on_silences(wav_fpath, words, hparams):
# Load the audio waveform
wav, _ = librosa.load(wav_fpath, sr= hparams.sample_rate)
wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=1024)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# denoise, we may not need it here.
@@ -113,25 +97,36 @@ def _split_on_silences(wav_fpath, words, hparams):
wav = logmmse.denoise(wav, profile, eta=0)

resp = pinyin(words, style=Style.TONE3)
res = [v[0] for v in resp if v[0].strip()]
res = filter(lambda v : not v.isspace(),map(lambda v: v[0],resp))
res = " ".join(res)

return wav, res

def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, emotion_extract: bool):
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = []
extensions = ["*.wav", "*.flac", "*.mp3"]
extensions = ("*.wav", "*.flac", "*.mp3")
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
words = dict_info.get(wav_fpath.name) if not words else words # try with extension
if not words:
print("no wordS")
words = dict_info.get(wav_fpath.name) # try with extension
continue
if not words:
print(f"No word found in dict_info for {wav_fpath.name}, skip it")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
wav_fpath = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")

if skip_existing and mel_fpath.exists() and wav_fpath.exists():
continue
wav, text = _split_on_silences(wav_fpath, words, hparams)
metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
result = _process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams, emotion_extract))
False, hparams, encoder_model_fpath) # accelarate
return [m for m in metadata if m is not None]
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))

return metadata
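
Each tuple appended to metadata above ultimately becomes one row of train.txt, which the embedding and emotion steps parse with line.split("|"). A hypothetical example of such a row, assuming the fields are written pipe-delimited in the same order as the tuple (the basename is invented for illustration):

    # audio-<basename>.npy|mel-<basename>.npy|embed-<basename>.npy|<len(wav)>|<mel_frames>|<pinyin text>
    audio-foo.wav_00.npy|mel-foo.wav_00.npy|embed-foo.wav_00.npy|102400|400|ni3 hao3 shi4 jie4
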
@@ -39,7 +39,7 @@ def new_train():
parser.add_argument("--syn_dir", type=str, default="../audiodata/SV2TTS/synthesizer", help= \
"Path to the synthesizer directory that contains the ground truth mel spectrograms, "
"the wavs, the emos and the embeds.")
parser.add_argument("-m", "--model_dir", type=str, default="data/ckpt/synthesizer/vits", help=\
parser.add_argument("-m", "--model_dir", type=str, default="data/ckpt/synthesizer/vits2", help=\
"Path to the output directory that will contain the saved model weights and the logs.")
parser.add_argument('--ckptG', type=str, required=False,
help='original VITS G checkpoint path')
@@ -65,7 +65,7 @@ def new_train():
run(0, 1, hparams)


def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False):
def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False, epochs=10000):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
iteration = checkpoint_dict['iteration']
@@ -89,8 +89,12 @@ def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False):
try:
new_state_dict[k] = saved_state_dict[k]
except:
logger.info("%s is not in the checkpoint" % k)
if k == 'step':
new_state_dict[k] = v
new_state_dict[k] = iteration * epochs
else:
logger.info("%s is not in the checkpoint" % k)
new_state_dict[k] = v

if hasattr(model, 'module'):
model.module.load_state_dict(new_state_dict, strict=False)
else:
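
load_checkpoint reads 'iteration' from the checkpoint dict and copies the saved parameters key by key, substituting iteration * epochs when a 'step' entry is missing. As a rough sketch, the checkpoint files it expects look something like the following (an assumption based only on the keys this hunk reads; the repository's actual save routine is not shown here):

    checkpoint_dict = {
        "iteration": 1234,                  # epoch counter read above
        "model": net_g.state_dict(),        # per-key tensors copied into new_state_dict
        "optimizer": optim_g.state_dict(),  # restored when an optimizer is passed in
        "learning_rate": 2e-4,
    }
    torch.save(checkpoint_dict, "G_1234.pth")  # matches the "G_*.pth" glob used below
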
@@ -173,13 +177,13 @@ def run(rank, n_gpus, hps):
print("加载原版VITS模型G记录点成功")
else:
_, _, _, epoch_str = load_checkpoint(latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g,
optim_g)
optim_g, epochs=hps.train.epochs)
if ckptD is not None:
_, _, _, epoch_str = load_checkpoint(ckptG, net_g, optim_g, is_old=True)
print("加载原版VITS模型D记录点成功")
else:
_, _, _, epoch_str = load_checkpoint(latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d,
optim_d)
optim_d, epochs=hps.train.epochs)
global_step = (epoch_str - 1) * len(train_loader)
except:
epoch_str = 1
@@ -216,17 +220,17 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
net_g.train()
net_d.train()
for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers, emo) in enumerate(train_loader):
logger.info(f'====> Step: 1 {batch_idx}')
# logger.info(f'====> Step: 1 {batch_idx}')
x, x_lengths = x.cuda(rank, non_blocking=True), x_lengths.cuda(rank, non_blocking=True)
x, x_lengths = x.cuda(rank), x_lengths.cuda(rank)
spec, spec_lengths = spec.cuda(rank, non_blocking=True), spec_lengths.cuda(rank, non_blocking=True)
spec, spec_lengths = spec.cuda(rank), spec_lengths.cuda(rank)
y, y_lengths = y.cuda(rank, non_blocking=True), y_lengths.cuda(rank, non_blocking=True)
y, y_lengths = y.cuda(rank), y_lengths.cuda(rank)
speakers = speakers.cuda(rank, non_blocking=True)
speakers = speakers.cuda(rank)
emo = emo.cuda(rank, non_blocking=True)
emo = emo.cuda(rank)
# logger.info(f'====> Step: 1.0 {batch_idx}')
with autocast(enabled=hps.train.fp16_run):
y_hat, l_length, attn, ids_slice, x_mask, z_mask, \
(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers, emo)
# logger.info(f'====> Step: 1.1 {batch_idx}')
mel = spec_to_mel(
spec,
hps.data.filter_length,
@@ -247,7 +251,7 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
)

y = slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
# logger.info(f'====> Step: 1.3 {batch_idx}')
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
with autocast(enabled=False):
@@ -258,7 +262,6 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
scaler.unscale_(optim_d)
grad_norm_d = clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
logger.info(f'====> Step: 2 {batch_idx}')

with autocast(enabled=hps.train.fp16_run):
# Generator
@@ -277,7 +280,6 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
grad_norm_g = clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
# logger.info(f'====> Step: 3 {batch_idx}')
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]['lr']
@@ -339,6 +341,8 @@ def evaluate(hps, generator, eval_loader, writer_eval):
emo = emo[:1]
break
y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, emo, max_len=1000)
# y_hat, attn, mask, *_ = generator.infer(x, x_lengths, speakers, emo, max_len=1000) # for non DistributedDataParallel object

y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length

mel = spec_to_mel(
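
evaluate() calls generator.module.infer(...), which only works when the generator is wrapped in DistributedDataParallel; the commented-out line notes the plain-module variant. A small sketch that handles both cases (an assumption about usage, not code from the diff):

    # Unwrap DDP if present, then run inference the same way in either case.
    net = generator.module if hasattr(generator, "module") else generator
    y_hat, attn, mask, *_ = net.infer(x, x_lengths, speakers, emo, max_len=1000)
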
@@ -1,10 +1,11 @@
import os
import random
import numpy as np
import torch.nn.functional as F
import torch
import torch.utils.data

from utils.audio_utils import spectrogram, load_wav
from utils.audio_utils import load_wav_to_torch, spectrogram
from utils.util import intersperse
from models.synthesizer.utils.text import text_to_sequence

@@ -51,19 +52,10 @@ class VitsDataset(torch.utils.data.Dataset):
lengths = []

# for audiopath, sid, text in self.audio_metadata:
sid = 0
for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid in self.audio_metadata:
spk_to_sid = {}
for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text in self.audio_metadata:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
# TODO: for magic data only
audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid])
speaker_name = wav_fpath.split("_")[1]
if speaker_name not in spk_to_sid:
sid += 1
spk_to_sid[speaker_name] = sid
audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spk_to_sid[speaker_name]])
lengths.append(os.path.getsize(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}') // (2 * self.hop_length))
print("found sid:%d", sid)
self.audio_metadata = audio_metadata_new
self.lengths = lengths

@@ -71,42 +63,32 @@ class VitsDataset(torch.utils.data.Dataset):
# separate filename, speaker_id and text
wav_fpath, text, sid = audio_metadata[0], audio_metadata[5], audio_metadata[6]
text = self.get_text(text)

spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
sid = self.get_sid(sid)
emo = torch.FloatTensor(np.load(f'{self.datasets_root}{os.sep}emo{os.sep}{wav_fpath.replace("audio", "emo")}'))
return (text, spec, wav, sid, emo)

def get_audio(self, filename):
# audio, sampling_rate = load_wav(filename)
# Load preprocessed wav npy instead of reading from wav file
audio = torch.FloatTensor(np.load(filename))
audio_norm = audio.unsqueeze(0)
# if sampling_rate != self.sampling_rate:
spec_filename = filename.replace(".wav", ".spec")
# raise ValueError("{} {} SR doesn't match target {} SR".format(
if os.path.exists(spec_filename):
# sampling_rate, self.sampling_rate))
spec = torch.load(spec_filename)
# audio = torch.load(filename)
else:
audio = torch.FloatTensor(np.load(filename).astype(np.float32))
spec = spectrogram(audio_norm, self.filter_length,self.hop_length, self.win_length,
audio = audio.unsqueeze(0)
center=False)
# audio_norm = audio / self.max_wav_value
torch.save(spec, spec_filename)
# audio_norm = audio_norm.unsqueeze(0)
# spec_filename = filename.replace(".wav", ".spec.pt")
# if os.path.exists(spec_filename):
# spec = torch.load(spec_filename)
# else:
# spec = spectrogram(audio, self.filter_length,
# self.sampling_rate, self.hop_length, self.win_length,
# center=False)
# spec = torch.squeeze(spec, 0)
# torch.save(spec, spec_filename)
spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
center=False)
spec = torch.squeeze(spec, 0)
return spec, audio
return spec, audio_norm

def get_text(self, text):
if self.cleaned_text:
text_norm = text_to_sequence(text, self.text_cleaners)
if self.add_blank:
text_norm = intersperse(text_norm, 0)
text_norm = intersperse(text_norm, 0) # 在所有文本数值序列中的元素前后都补充一个0 - 不适用于中文
text_norm = torch.LongTensor(text_norm)
return text_norm
|
|||||||
emo[i, :] = row[4]
|
emo[i, :] = row[4]
|
||||||
|
|
||||||
if self.return_ids:
|
if self.return_ids:
|
||||||
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
|
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing, emo
|
||||||
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, emo
|
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, emo
|
||||||
|
|
||||||
|
|
||||||
|
@@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):

def build_mel_basis():
return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
return librosa.filters.mel(sr = hp.sample_rate, n_fft = hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)


def normalize(S):
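
The build_mel_basis change switches to keyword arguments because newer librosa versions (0.10+) no longer accept sr and n_fft positionally. A quick check of the new call style (illustrative parameter values, not taken from the repository's hparams):

    import librosa

    mel_basis = librosa.filters.mel(sr=16000, n_fft=800, n_mels=80, fmin=55)
    print(mel_basis.shape)  # (80, 401) — n_mels x (n_fft // 2 + 1)
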
19
monotonic_align/__init__.py
Normal file
@@ -0,0 +1,19 @@
import numpy as np
import torch
from .monotonic_align.core import maximum_path_c


def maximum_path(neg_cent, mask):
    """ Cython optimized version.
    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]
    """
    device = neg_cent.device
    dtype = neg_cent.dtype
    neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
    path = np.zeros(neg_cent.shape, dtype=np.int32)

    t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
    t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
    maximum_path_c(path, neg_cent, t_t_max, t_s_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)
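
A minimal usage sketch for maximum_path, assuming the Cython extension has been built: it takes a batch of scores and a validity mask of shape [b, t_t, t_s] and returns a binary alignment path of the same shape (the sizes below are illustrative only):

    import torch
    from monotonic_align import maximum_path

    b, t_t, t_s = 2, 100, 40              # batch and the two sequence lengths from the docstring
    neg_cent = torch.randn(b, t_t, t_s)   # alignment scores
    mask = torch.ones(b, t_t, t_s)        # 1 inside the valid region of each example
    path = maximum_path(neg_cent, mask)   # 0/1 tensor encoding the monotonic alignment
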
21446
monotonic_align/core.c
Normal file
File diff suppressed because it is too large
42
monotonic_align/core.pyx
Normal file
@@ -0,0 +1,42 @@
cimport cython
from cython.parallel import prange


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef float tmp
    cdef int index = t_x - 1

    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[y-1, x]
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[y-1, x-1]
            value[y, x] += max(v_prev, v_cur)

    for y in range(t_y - 1, -1, -1):
        path[y, index] = 1
        if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
            index = index - 1


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
    cdef int b = paths.shape[0]
    cdef int i
    for i in prange(b, nogil=True):
        maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
9
monotonic_align/setup.py
Normal file
@@ -0,0 +1,9 @@
from distutils.core import setup
from Cython.Build import cythonize
import numpy

setup(
    name = 'monotonic_align',
    ext_modules = cythonize("core.pyx"),
    include_dirs=[numpy.get_include()]
)
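
Since setup.py refers to core.pyx by a relative path, the extension is typically built from inside the monotonic_align directory, e.g. with something like `python setup.py build_ext --inplace`, so that the compiled module can be imported as .monotonic_align.core by the __init__.py above. The exact build step is not part of this diff, so treat that command as an assumption.
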
8
pre.py
@@ -1,4 +1,4 @@
from models.synthesizer.preprocess import create_embeddings, preprocess_dataset
from models.synthesizer.preprocess import create_embeddings, preprocess_dataset, create_emo
from models.synthesizer.hparams import hparams
from pathlib import Path
import argparse
@@ -64,12 +64,14 @@ if __name__ == "__main__":
"noise removal and is recommended. Please install and try again. If installation fails, "
"use --no_trim to disable this error message.")
encoder_model_fpath = args.encoder_model_fpath
del args.no_trim, args.encoder_model_fpath
del args.no_trim

args.hparams = hparams.parse(args.hparams)
n_processes_embed = args.n_processes_embed
del args.n_processes_embed
preprocess_dataset(**vars(args))

create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath)
create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath, skip_existing=args.skip_existing)

if args.emotion_extract:
create_emo(synthesizer_root=args.out_dir, n_processes=n_processes_embed, skip_existing=args.skip_existing, hparams=args.hparams)
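
If you want to run the new emotion step outside of pre.py, the call it makes can be reproduced directly (a usage sketch with placeholder values; the synthesizer root is whatever out_dir your preprocessing produced):

    from pathlib import Path
    from models.synthesizer.preprocess import create_emo
    from models.synthesizer.hparams import hparams

    create_emo(synthesizer_root=Path("../audiodata/SV2TTS/synthesizer"),
               n_processes=4, skip_existing=True, hparams=hparams)
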
@@ -1,9 +1,8 @@
umap-learn
visdom
librosa==0.8.1
librosa
matplotlib>=3.3.0
numpy==1.19.3; platform_system == "Windows"
numpy
numpy==1.19.4; platform_system != "Windows"
scipy>=1.0.0
tqdm
sounddevice
@@ -13,22 +12,22 @@ inflect
PyQt5
multiprocess
numba
webrtcvad; platform_system != "Windows"
webrtcvad
pypinyin
flask
flask_wtf
flask_cors==3.0.10
flask_cors
gevent==21.8.0
gevent
flask_restx
tensorboard==1.15
tensorboard
streamlit==1.8.0
streamlit
PyYAML==5.4.1
PyYAML
torch_complex
espnet
PyWavelets
monotonic-align==0.0.3
transformers==4.26.0
transformers
fastapi
loguru
typer[all]
click==8.0.4
click
@@ -17,13 +17,12 @@ def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path)
return torch.FloatTensor(data.astype(np.float32)), sampling_rate


def spectrogram(y, n_fft, hop_size, win_size, center=False):
if torch.min(y) < -1.:
print('min value is ', torch.min(y))
if torch.max(y) > 1.:
print('max value is ', torch.max(y))

global hann_window
dtype_device = str(y.dtype) + '_' + str(y.device)
wnsize_dtype_device = str(win_size) + '_' + dtype_device
@@ -34,7 +33,7 @@ def spectrogram(y, n_fft, hop_size, win_size, center=False):
y = y.squeeze(1)

spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
center=center, pad_mode='reflect', normalized=False, onesided=True)
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)

spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
return spec
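
Passing return_complex=False keeps the old real-valued (..., 2) STFT layout that the spec.pow(2).sum(-1) line expects, but newer PyTorch releases deprecate that path. An equivalent formulation with the complex output (a sketch of the alternative, not what the commit does):

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size,
                      window=hann_window[wnsize_dtype_device], center=center,
                      pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)               # back to the (..., 2) real/imaginary layout
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
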
@@ -68,20 +67,12 @@ def mel_spectrogram(
if torch.max(y) > 1.:
print('max value is ', torch.max(y))

# global mel_basis, hann_window
# if fmax not in mel_basis:
# mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
# mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
# hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
global mel_basis, hann_window
dtype_device = str(y.dtype) + '_' + str(y.device)
if fmax not in mel_basis:
fmax_dtype_device = str(fmax) + '_' + dtype_device
wnsize_dtype_device = str(win_size) + '_' + dtype_device
if fmax_dtype_device not in mel_basis:
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
if wnsize_dtype_device not in hann_window:
hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
y = y.squeeze(1)
550
vits.ipynb
vendored
File diff suppressed because one or more lines are too long