48 Commits

Author SHA1 Message Date
Vega
156723e37c Skip embedding (#950)
* Skip embedding

* Skip earlier

* Remove unused parameter

* Pass param
2023-09-05 23:15:04 +08:00
Vega
1862d2145b Merge pull request #953 from babysor/babysor-patch-3
Update README.md
2023-08-31 11:42:15 +08:00
Vega
72a22d448b Update README.md 2023-08-31 11:42:05 +08:00
Vega
98d38d84c3 Merge pull request #952 from SeaTidesPro/main
add readme-linux-zh
2023-08-31 11:41:10 +08:00
Tide
7ab86c6f4c Update README-LINUX-CN.md 2023-08-30 14:41:45 +08:00
Tide
ab79881480 Update README-LINUX-CN.md 2023-08-30 14:40:30 +08:00
Tide
fd93b40398 Update README-LINUX-CN.md 2023-08-30 14:35:34 +08:00
Tide
dbf01347fc Update README-LINUX-CN.md 2023-08-30 14:35:12 +08:00
Tide
28f9173dfa Update README-LINUX-CN.md 2023-08-30 14:34:20 +08:00
Tide
d073e1f349 Update README-LINUX-CN.md 2023-08-30 14:24:05 +08:00
Tide
baa8b5005d Update README-LINUX-CN.md 2023-08-30 14:05:40 +08:00
Tide
d54f4fb631 Update README-LINUX-CN.md 2023-08-30 13:18:33 +08:00
Tide
7353888d35 Create README-LINUX-CN.md 2023-08-30 12:20:29 +08:00
Vega
e9ce943f6c Merge pull request #947 from FawenYo/doc/update_link
📝 Update model download link
2023-08-11 22:02:41 +08:00
FawenYo
77c145328c 📝 Update model download link 2023-08-11 14:31:39 +08:00
Vega
3bce6bbbe7 Merge pull request #945 from babysor/babysor-patch-1
Update README.md
2023-08-10 15:54:23 +08:00
Vega
9dd8ea11e5 Merge pull request #944 from babysor/babysor-patch-2
Update README-CN.md
2023-08-10 15:53:52 +08:00
Vega
5a0d77e699 Update README-CN.md 2023-08-10 15:53:42 +08:00
Vega
4914fc0776 Update README.md 2023-08-10 15:50:21 +08:00
Vega
8f95faa0d3 Merge pull request #914 from cloudxu/readme_update
Removing ongoing work session in README
2023-06-15 17:39:13 +08:00
Cloud Xu
facc97e236 removing ongoing work 2023-06-15 17:00:05 +08:00
Vega
3d1e3dc542 Merge pull request #892 from FawenYo/main
Update FawenYo's shared model
2023-06-10 10:25:04 +08:00
Vega
536ae8899c Merge pull request #908 from 0warning0error/main
Some changes to make it easier to install the dependencies
2023-06-10 10:24:39 +08:00
0warning0error
9f1dbeeecc Some changes to make it easier to install the dependencies 2023-06-03 00:07:36 +08:00
FawenYo
d3bdf816db Update FawenYo's shared model 2023-05-06 14:37:44 +08:00
Vega
b78d0d2a26 Merge pull request #782 from Nier-Y/main
Update README.md and README-CN.md
2023-03-07 16:41:48 +08:00
babysor00
5c17fc8bb0 add pretrained 2023-02-18 09:31:05 +08:00
babysor00
3ce874ab46 Fix issue for training and preprocessing 2023-02-10 20:34:01 +08:00
babysor00
beec0b93ed Fix issues 2023-02-04 17:00:49 +08:00
Vega
9d67b757f0 Merge pull request #822 from babysor/restruct-project
Restruct project
2023-02-04 14:37:48 +08:00
Vega
26331fe019 Merge branch 'main' into restruct-project 2023-02-04 14:22:37 +08:00
babysor00
712a53f557 Add vits 2023-02-04 14:13:38 +08:00
babysor00
24cb262c3f remove used files 2023-02-01 20:16:06 +08:00
babysor00
e469bd06ae init 2023-02-01 19:59:15 +08:00
神楽坂·喵
cd20d21f3d add docker support (#802)
* add docker support

* 修复训练集解压问题

* 修复web.py启动问题

* 混合数据集训练参数
2022-12-16 11:16:25 +08:00
babysor00
74a3fc97d0 Refactor Project to 3 parts: Models, Control, Data
Need readme
2022-12-03 16:54:06 +08:00
李子
b402f9dbdf 修复web界面vc模式不能用问题。改拼写错误、给字符串前加f标 (#790) 2022-11-30 15:32:52 +08:00
yan
d1ba355c5f Adding English version M1 Mac Setup 2022-11-18 00:36:42 +08:00
yan
83d95c6c81 新增M1 Mac的环境配置 2022-11-17 23:48:52 +08:00
wei-z
85a53c9e05 update aidatatang_200zh folders (#743)
Co-authored-by: wei-z-git <wei-z>
2022-10-15 11:45:51 +08:00
xxxxx
028b131570 Update streamlit_ui.py (#748) 2022-10-15 11:44:44 +08:00
Dong
2a1890f9e1 Update README-CN.md (#751)
修正文档复选框格式
2022-10-15 11:44:24 +08:00
wei-z
c91bc3208e 添加 -d 指定数据集时错误提示 (#741)
* 添加 -d 指定数据集时错误提示

Warning: you do not have any of the recognized datasets in G:\AI\Dataset\aidatatang_200zh\aidatatang_200zh 
Please note use 'E:\datasets' as root path instead of 'E:\datasetsidatatang_200zh\corpus/test' as a example .
The recognized datasets are:

* Update ui.py

* Update ui.py
2022-09-14 21:37:54 +08:00
XCwosjw
dd1ea3e714 更正错误的CPU型号 (#715)
11770k😂😂😂
2022-09-10 23:56:01 +08:00
Vega
e7313c514f Refactor (#705)
* Refactor model

* Add description for

* update launch json

* Fix #657

* Avoid recursive calls of web ui for M1
2022-08-12 23:13:57 +08:00
Xu Meng
f57d1a69b6 Translate update README-CN.md (#698)
Fix: Traditional Chinese to Simplified Chinese
2022-08-06 23:51:34 +08:00
Vega
ab7d692619 Refactor (#663)
* Refactor model

* Add description for

* update launch json

* Fix #657
2022-07-19 23:43:51 +08:00
Vega
f17e3b04e1 Refactor (#650)
* Refactor model

* Add description for

* update launch json
2022-07-17 14:27:45 +08:00
214 changed files with 25673 additions and 28546 deletions

.dockerignore (new file, 4 lines)

@@ -0,0 +1,4 @@
*/saved_models
!vocoder/saved_models/pretrained/**
!encoder/saved_models/pretrained.pt
/datasets

.gitignore (vendored, 13 changed lines)

@@ -14,8 +14,13 @@
*.bcf *.bcf
*.toc *.toc
*.sh *.sh
*/saved_models data/ckpt/*/*
!vocoder/saved_models/pretrained/** !data/ckpt/encoder/pretrained.pt
!encoder/saved_models/pretrained.pt !data/ckpt/vocoder/pretrained/
wavs wavs
log log
!/docker-entrypoint.sh
!/datasets_download/*.sh
/datasets
monotonic_align/build
monotonic_align/monotonic_align

.vscode/launch.json (vendored, 18 changed lines)

@@ -15,7 +15,8 @@
"name": "Python: Vocoder Preprocess", "name": "Python: Vocoder Preprocess",
"type": "python", "type": "python",
"request": "launch", "request": "launch",
"program": "vocoder_preprocess.py", "program": "control\\cli\\vocoder_preprocess.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal", "console": "integratedTerminal",
"args": ["..\\audiodata"] "args": ["..\\audiodata"]
}, },
@@ -23,7 +24,8 @@
"name": "Python: Vocoder Train", "name": "Python: Vocoder Train",
"type": "python", "type": "python",
"request": "launch", "request": "launch",
"program": "vocoder_train.py", "program": "control\\cli\\vocoder_train.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal", "console": "integratedTerminal",
"args": ["dev", "..\\audiodata"] "args": ["dev", "..\\audiodata"]
}, },
@@ -32,6 +34,7 @@
"type": "python", "type": "python",
"request": "launch", "request": "launch",
"program": "demo_toolbox.py", "program": "demo_toolbox.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal", "console": "integratedTerminal",
"args": ["-d","..\\audiodata"] "args": ["-d","..\\audiodata"]
}, },
@@ -40,6 +43,7 @@
"type": "python", "type": "python",
"request": "launch", "request": "launch",
"program": "demo_toolbox.py", "program": "demo_toolbox.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal", "console": "integratedTerminal",
"args": ["-d","..\\audiodata","-vc"] "args": ["-d","..\\audiodata","-vc"]
}, },
@@ -47,9 +51,9 @@
"name": "Python: Synth Train", "name": "Python: Synth Train",
"type": "python", "type": "python",
"request": "launch", "request": "launch",
"program": "synthesizer_train.py", "program": "train.py",
"console": "integratedTerminal", "console": "integratedTerminal",
"args": ["my_run", "..\\"] "args": ["--type", "vits"]
}, },
{ {
"name": "Python: PPG Convert", "name": "Python: PPG Convert",
@@ -62,12 +66,12 @@
] ]
}, },
{ {
"name": "GUI", "name": "Python: Vits Train",
"type": "python", "type": "python",
"request": "launch", "request": "launch",
"program": "mkgui\\base\\_cli.py", "program": "train.py",
"console": "integratedTerminal", "console": "integratedTerminal",
"args": [] "args": ["--type", "vits"]
}, },
] ]
} }

Dockerfile (new file, 17 lines)

@@ -0,0 +1,17 @@
FROM pytorch/pytorch:latest
RUN apt-get update && apt-get install -y build-essential ffmpeg parallel aria2 && apt-get clean
COPY ./requirements.txt /workspace/requirements.txt
RUN pip install -r requirements.txt && pip install webrtcvad-wheels
COPY . /workspace
VOLUME [ "/datasets", "/workspace/synthesizer/saved_models/" ]
ENV DATASET_MIRROR=default FORCE_RETRAIN=false TRAIN_DATASETS=aidatatang_200zh\ magicdata\ aishell3\ data_aishell TRAIN_SKIP_EXISTING=true
EXPOSE 8080
ENTRYPOINT [ "/workspace/docker-entrypoint.sh" ]

View File

@@ -18,17 +18,10 @@
🌍 **Webserver Ready** 可伺服你的训练结果,供远程调用 🌍 **Webserver Ready** 可伺服你的训练结果,供远程调用
### 进行中的工作
* GUI/客户端大升级与合并
[X] 初始化框架 `./mkgui` 基于streamlit + fastapi和 [技术设计](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
[X] 增加 Voice Cloning and Conversion的演示页面
[X] 增加Voice Conversion的预处理preprocessing 和训练 training 页面
[ ] 增加其他的的预处理preprocessing 和训练 training 页面
* 模型后端基于ESPnet2升级
## 开始 ## 开始
### 1. 安装要求 ### 1. 安装要求
#### 1.1 通用配置
> 按照原始存储库测试您是否已准备好所有环境。 > 按照原始存储库测试您是否已准备好所有环境。
运行工具箱(demo_toolbox.py)需要 **Python 3.7 或更高版本** 运行工具箱(demo_toolbox.py)需要 **Python 3.7 或更高版本**
@@ -38,6 +31,67 @@
* 运行`pip install -r requirements.txt` 来安装剩余的必要包。 * 运行`pip install -r requirements.txt` 来安装剩余的必要包。
* 安装 webrtcvad `pip install webrtcvad-wheels` * 安装 webrtcvad `pip install webrtcvad-wheels`
或者
-`conda` 或者 `mamba` 安装依赖
```conda env create -n env_name -f env.yml```
```mamba env create -n env_name -f env.yml```
会创建新环境安装必须的依赖. 之后用 `conda activate env_name` 切换环境就完成了.
> env.yml只包含了运行时必要的依赖暂时不包括monotonic-align如果想要装GPU版本的pytorch可以查看官网教程。
#### 1.2 M1芯片Mac环境配置(Inference Time)
> 以下环境按x86-64搭建使用原生的`demo_toolbox.py`可作为在不改代码情况下快速使用的workaround。
>
> 如需使用M1芯片训练因`demo_toolbox.py`依赖的`PyQt5`不支持M1则应按需修改代码或者尝试使用`web.py`。
* 安装`PyQt5`,参考[这个链接](https://stackoverflow.com/a/68038451/20455983)
* 用Rosetta打开Terminal参考[这个链接](https://dev.to/courier/tips-and-tricks-to-setup-your-apple-m1-for-development-547g)
* 用系统Python创建项目虚拟环境
```
/usr/bin/python3 -m venv /PathToMockingBird/venv
source /PathToMockingBird/venv/bin/activate
```
* 升级pip并安装`PyQt5`
```
pip install --upgrade pip
pip install pyqt5
```
* 安装`pyworld`和`ctc-segmentation`
> 这里两个文件直接`pip install`的时候找不到wheel尝试从c里build时找不到`Python.h`报错
* 安装`pyworld`
* `brew install python` 通过brew安装python时会自动安装`Python.h`
* `export CPLUS_INCLUDE_PATH=/opt/homebrew/Frameworks/Python.framework/Headers` 对于M1brew安装`Python.h`到上述路径。把路径添加到环境变量里
* `pip install pyworld`
* 安装`ctc-segmentation`
> 因上述方法没有成功,选择从[github](https://github.com/lumaku/ctc-segmentation) clone源码手动编译
* `git clone https://github.com/lumaku/ctc-segmentation.git` 克隆到任意位置
* `cd ctc-segmentation`
* `source /PathToMockingBird/venv/bin/activate` 假设一开始未开启打开MockingBird项目的虚拟环境
* `cythonize -3 ctc_segmentation/ctc_segmentation_dyn.pyx`
* `/usr/bin/arch -x86_64 python setup.py build` 要注意明确用x86-64架构编译
* `/usr/bin/arch -x86_64 python setup.py install --optimize=1 --skip-build`用x86-64架构安装
* 安装其他依赖
* `/usr/bin/arch -x86_64 pip install torch torchvision torchaudio` 这里用pip安装`PyTorch`明确架构是x86
* `pip install ffmpeg` 安装ffmpeg
* `pip install -r requirements.txt`
* 运行
> 参考[这个链接](https://youtrack.jetbrains.com/issue/PY-46290/Allow-running-Python-under-Rosetta-2-in-PyCharm-for-Apple-Silicon)
让项目跑在x86架构环境上
* `vim /PathToMockingBird/venv/bin/pythonM1`
* 写入以下代码
```
#!/usr/bin/env zsh
mydir=${0:a:h}
/usr/bin/arch -x86_64 $mydir/python "$@"
```
* `chmod +x pythonM1` 设为可执行文件
* 如果使用PyCharm则把Interpreter指向`pythonM1`,否则也可命令行运行`/PathToMockingBird/venv/bin/pythonM1 demo_toolbox.py`
### 2. 准备预训练模型 ### 2. 准备预训练模型
考虑训练您自己专属的模型或者下载社区他人训练好的模型: 考虑训练您自己专属的模型或者下载社区他人训练好的模型:
> 近期创建了[知乎专题](https://www.zhihu.com/column/c_1425605280340504576) 将不定期更新炼丹小技巧or心得也欢迎提问 > 近期创建了[知乎专题](https://www.zhihu.com/column/c_1425605280340504576) 将不定期更新炼丹小技巧or心得也欢迎提问
@@ -59,7 +113,7 @@
> 假如你下载的 `aidatatang_200zh`文件放在D盘`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\` > 假如你下载的 `aidatatang_200zh`文件放在D盘`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
* 训练合成器: * 训练合成器:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer` `python ./control/cli/synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
* 当您在训练文件夹 *synthesizer/saved_models/* 中看到注意线显示和损失满足您的需要时,请转到`启动程序`一步。 * 当您在训练文件夹 *synthesizer/saved_models/* 中看到注意线显示和损失满足您的需要时,请转到`启动程序`一步。
@@ -70,7 +124,7 @@
| --- | ----------- | ----- | ----- | | --- | ----------- | ----- | ----- |
| 作者 | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [百度盘链接](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps 用3个开源数据集混合训练 | 作者 | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [百度盘链接](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps 用3个开源数据集混合训练
| 作者 | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [百度盘链接](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) 提取码om7f | | 25k steps 用3个开源数据集混合训练, 切换到tag v0.0.1使用 | 作者 | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [百度盘链接](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) 提取码om7f | | 25k steps 用3个开源数据集混合训练, 切换到tag v0.0.1使用
|@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing [百度盘链接](https://pan.baidu.com/s/1vSYXO4wsLyjnF3Unl-Xoxg) 提取码1024 | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps 台湾口音需切换到tag v0.0.1使用 |@FawenYo | https://yisiou-my.sharepoint.com/:u:/g/personal/lawrence_cheng_fawenyo_onmicrosoft_com/EWFWDHzee-NNg9TWdKckCc4BC7bK2j9cCbOWn0-_tK0nOg?e=n0gGgC | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps 台湾口音需切换到tag v0.0.1使用
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ 提取码2021 | https://www.bilibili.com/video/BV1uh411B7AD/ | 150k steps 注意:根据[issue](https://github.com/babysor/MockingBird/issues/37)修复 并切换到tag v0.0.1使用 |@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ 提取码2021 | https://www.bilibili.com/video/BV1uh411B7AD/ | 150k steps 注意:根据[issue](https://github.com/babysor/MockingBird/issues/37)修复 并切换到tag v0.0.1使用
#### 2.4训练声码器 (可选) #### 2.4训练声码器 (可选)
@@ -81,14 +135,14 @@
* 训练wavernn声码器: * 训练wavernn声码器:
`python vocoder_train.py <trainid> <datasets_root>` `python ./control/cli/vocoder_train.py <trainid> <datasets_root>`
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型 > `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
* 训练hifigan声码器: * 训练hifigan声码器:
`python vocoder_train.py <trainid> <datasets_root> hifigan` `python ./control/cli/vocoder_train.py <trainid> <datasets_root> hifigan`
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型 > `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
* 训练fregan声码器: * 训练fregan声码器:
`python vocoder_train.py <trainid> <datasets_root> --config config.json fregan` `python ./control/cli/vocoder_train.py <trainid> <datasets_root> --config config.json fregan`
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型 > `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
* 将GAN声码器的训练切换为多GPU模式修改GAN文件夹下.json文件中的"num_gpus"参数 * 将GAN声码器的训练切换为多GPU模式修改GAN文件夹下.json文件中的"num_gpus"参数
### 3. 启动程序或工具箱 ### 3. 启动程序或工具箱
@@ -109,7 +163,7 @@
想像柯南拿着变声器然后发出毛利小五郎的声音吗本项目现基于PPG-VC引入额外两个模块PPG extractor + PPG2Mel, 可以实现变声功能。(文档不全,尤其是训练部分,正在努力补充中) 想像柯南拿着变声器然后发出毛利小五郎的声音吗本项目现基于PPG-VC引入额外两个模块PPG extractor + PPG2Mel, 可以实现变声功能。(文档不全,尤其是训练部分,正在努力补充中)
#### 4.0 准备环境 #### 4.0 准备环境
* 确保项目以上环境已经安装ok运行`pip install espnet` 来安装剩余的必要包。 * 确保项目以上环境已经安装ok运行`pip install espnet` 来安装剩余的必要包。
* 下载以下模型 链接https://pan.baidu.com/s/1bl_x_DHJSAUyN2fma-Q_Wg * 下载以下模型 链接https://pan.baidu.com/s/1bl_x_DHJSAUyN2fma-Q_Wg
提取码gh41 提取码gh41
* 24K采样率专用的vocoderhifigan*vocoder\saved_models\xxx* * 24K采样率专用的vocoderhifigan*vocoder\saved_models\xxx*
* 预训练的ppg特征encoder(ppg_extractor)到 *ppg_extractor\saved_models\xxx* * 预训练的ppg特征encoder(ppg_extractor)到 *ppg_extractor\saved_models\xxx*
@@ -119,14 +173,14 @@
* 下载aidatatang_200zh数据集并解压确保您可以访问 *train* 文件夹中的所有音频文件(如.wav * 下载aidatatang_200zh数据集并解压确保您可以访问 *train* 文件夹中的所有音频文件(如.wav
* 进行音频和梅尔频谱图预处理: * 进行音频和梅尔频谱图预处理:
`python pre4ppg.py <datasets_root> -d {dataset} -n {number}` `python ./control/cli/pre4ppg.py <datasets_root> -d {dataset} -n {number}`
可传入参数: 可传入参数:
* `-d {dataset}` 指定数据集,支持 aidatatang_200zh, 不传默认为aidatatang_200zh * `-d {dataset}` 指定数据集,支持 aidatatang_200zh, 不传默认为aidatatang_200zh
* `-n {number}` 指定并行数CPU 11770k在8的情况下需要运行12到18小时待优化 * `-n {number}` 指定并行数CPU 11700k在8的情况下需要运行12到18小时待优化
> 假如你下载的 `aidatatang_200zh`文件放在D盘`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\` > 假如你下载的 `aidatatang_200zh`文件放在D盘`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
* 训练合成器, 注意在上一步先下载好`ppg2mel.yaml`, 修改里面的地址指向预训练好的文件夹: * 训练合成器, 注意在上一步先下载好`ppg2mel.yaml`, 修改里面的地址指向预训练好的文件夹:
`python ppg2mel_train.py --config .\ppg2mel\saved_models\ppg2mel.yaml --oneshotvc ` `python ./control/cli/ppg2mel_train.py --config .\ppg2mel\saved_models\ppg2mel.yaml --oneshotvc `
* 如果想要继续上一次的训练,可以通过`--load .\ppg2mel\saved_models\<old_pt_file>` 参数指定一个预训练模型文件。 * 如果想要继续上一次的训练,可以通过`--load .\ppg2mel\saved_models\<old_pt_file>` 参数指定一个预训练模型文件。
#### 4.2 启动工具箱VC模式 #### 4.2 启动工具箱VC模式
@@ -148,30 +202,30 @@
|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | 本代码库 | |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | 本代码库 |
## 常見問題(FQ&A) ## 常见问题(FQ&A)
#### 1.數據集哪裡下載? #### 1.数据集在哪里下载?
| 数据集 | OpenSLR地址 | 其他源 (Google Drive, Baidu网盘等) | | 数据集 | OpenSLR地址 | 其他源 (Google Drive, Baidu网盘等) |
| --- | ----------- | ---------------| | --- | ----------- | ---------------|
| aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) | | aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) |
| magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) | | magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) |
| aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) | | aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) |
| data_aishell | [OpenSLR](https://www.openslr.org/33/) | | | data_aishell | [OpenSLR](https://www.openslr.org/33/) | |
> aidatatang_200zh 後,還需將 `aidatatang_200zh\corpus\train`下的檔案全選解壓縮 > aidatatang_200zh 后,还需将 `aidatatang_200zh\corpus\train`下的文件全选解压缩
#### 2.`<datasets_root>`是什麼意思? #### 2.`<datasets_root>`是什麼意思?
假如數據集路徑為 `D:\data\aidatatang_200zh`,那 `<datasets_root>`就是 `D:\data` 假如数据集路径为 `D:\data\aidatatang_200zh`,那 `<datasets_root>`就是 `D:\data`
#### 3.訓練模型顯存不足 #### 3.训练模型显存不足
訓練合成器時:將 `synthesizer/hparams.py`中的batch_size參數調小 训练合成器时:将 `synthesizer/hparams.py`中的batch_size参数调小
``` ```
//調整前 //调整前
tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
(2, 5e-4, 40_000, 12), # (r, lr, step, batch_size) (2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
(2, 2e-4, 80_000, 12), # (2, 2e-4, 80_000, 12), #
(2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames (2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
(2, 3e-5, 320_000, 12), # synthesized for each decoder iteration) (2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
(2, 1e-5, 640_000, 12)], # lr = learning rate (2, 1e-5, 640_000, 12)], # lr = learning rate
//調整後 //调整后
tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
(2, 5e-4, 40_000, 8), # (r, lr, step, batch_size) (2, 5e-4, 40_000, 8), # (r, lr, step, batch_size)
(2, 2e-4, 80_000, 8), # (2, 2e-4, 80_000, 8), #
@@ -180,15 +234,15 @@ tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
(2, 1e-5, 640_000, 8)], # lr = learning rate (2, 1e-5, 640_000, 8)], # lr = learning rate
``` ```
聲碼器-預處理數據集時:將 `synthesizer/hparams.py`中的batch_size參數調小 声码器-预处理数据集时:将 `synthesizer/hparams.py`中的batch_size参数调小
``` ```
//調整前 //调整前
### Data Preprocessing ### Data Preprocessing
max_mel_frames = 900, max_mel_frames = 900,
rescale = True, rescale = True,
rescaling_max = 0.9, rescaling_max = 0.9,
synthesis_batch_size = 16, # For vocoder preprocessing and inference. synthesis_batch_size = 16, # For vocoder preprocessing and inference.
//調整後 //调整后
### Data Preprocessing ### Data Preprocessing
max_mel_frames = 900, max_mel_frames = 900,
rescale = True, rescale = True,
@@ -196,16 +250,16 @@ tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
synthesis_batch_size = 8, # For vocoder preprocessing and inference. synthesis_batch_size = 8, # For vocoder preprocessing and inference.
``` ```
聲碼器-訓練聲碼器時:將 `vocoder/wavernn/hparams.py`中的batch_size參數調小 声码器-训练声码器时:将 `vocoder/wavernn/hparams.py`中的batch_size参数调小
``` ```
//調整前 //调整前
# Training # Training
voc_batch_size = 100 voc_batch_size = 100
voc_lr = 1e-4 voc_lr = 1e-4
voc_gen_at_checkpoint = 5 voc_gen_at_checkpoint = 5
voc_pad = 2 voc_pad = 2
//調整後 //调整后
# Training # Training
voc_batch_size = 6 voc_batch_size = 6
voc_lr = 1e-4 voc_lr = 1e-4
@@ -214,17 +268,16 @@ voc_pad =2
``` ```
#### 4.碰到`RuntimeError: Error(s) in loading state_dict for Tacotron: size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([70, 512]) from checkpoint, the shape in current model is torch.Size([75, 512]).` #### 4.碰到`RuntimeError: Error(s) in loading state_dict for Tacotron: size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([70, 512]) from checkpoint, the shape in current model is torch.Size([75, 512]).`
請參照 issue [#37](https://github.com/babysor/MockingBird/issues/37) 请参照 issue [#37](https://github.com/babysor/MockingBird/issues/37)
#### 5.如何改善CPU、GPU佔用率? #### 5.如何改善CPU、GPU占用率?
適情況調整batch_size參數來改善 视情况调整batch_size参数来改善
#### 6.發生 `頁面文件太小,無法完成操作` #### 6.发生 `页面文件太小,无法完成操作`
請參考這篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030)將虛擬內存更改為100G(102400),例如:档案放置D盘就更改D盘的虚拟内存 请参考这篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030)将虚拟内存更改为100G(102400),例如:文件放置D盘就更改D盘的虚拟内存
#### 7.什么时候算训练完成? #### 7.什么时候算训练完成?
首先一定要出现注意力模型其次是loss足够低取决于硬件设备和数据集。拿本人的供参考我的注意力是在 18k 步之后出现的,并且在 50k 步之后损失变得低于 0.4 首先一定要出现注意力模型其次是loss足够低取决于硬件设备和数据集。拿本人的供参考我的注意力是在 18k 步之后出现的,并且在 50k 步之后损失变得低于 0.4
![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png) ![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png)
![step-135500-mel-spectrogram_sample_1](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png) ![step-135500-mel-spectrogram_sample_1](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png)

README-LINUX-CN.md (new file, 223 lines)

@@ -0,0 +1,223 @@
## 实时语音克隆 - 中文/普通话
![mockingbird](https://user-images.githubusercontent.com/12797292/131216767-6eb251d6-14fc-4951-8324-2722f0cd4c63.jpg)
[![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](http://choosealicense.com/licenses/mit/)
### [English](README.md) | 中文
### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/) | [Wiki教程](https://github.com/babysor/MockingBird/wiki/Quick-Start-(Newbie)) [训练教程](https://vaj2fgg8yn.feishu.cn/docs/doccn7kAbr3SJz0KM0SIDJ0Xnhd)
## 特性
🌍 **中文** 支持普通话并使用多种中文数据集进行测试aidatatang_200zh, magicdata, aishell3, biaobei, MozillaCommonVoice, data_aishell 等
🤩 **Easy & Awesome** 仅需下载或新训练合成器synthesizer就有良好效果复用预训练的编码器/声码器或实时的HiFi-GAN作为vocoder
🌍 **Webserver Ready** 可伺服你的训练结果,供远程调用。
🤩 **感谢各位小伙伴的支持,本项目将开启新一轮的更新**
## 1.快速开始
### 1.1 建议环境
- Ubuntu 18.04
- Cuda 11.7 && CuDNN 8.5.0
- Python 3.8 或 3.9
- Pytorch 2.0.1 <post cuda-11.7>
### 1.2 环境配置
```shell
# 下载前建议更换国内镜像源
conda create -n sound python=3.9
conda activate sound
git clone https://github.com/babysor/MockingBird.git
cd MockingBird
pip install -r requirements.txt
pip install webrtcvad-wheels
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
```
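
After these steps, a quick sanity check (a sketch; the exact version printed will differ per installation) can confirm that the CUDA build of PyTorch is active:

```shell
# Print the installed torch version and whether a CUDA device is visible
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```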
### 1.3 模型准备
> 当实在没有设备或者不想慢慢调试,可以使用社区贡献的模型(欢迎持续分享):
| 作者 | 下载链接 | 效果预览 | 信息 |
| --- | ----------- | ----- | ----- |
| 作者 | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [百度盘链接](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps 用3个开源数据集混合训练
| 作者 | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [百度盘链接](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) 提取码om7f | | 25k steps 用3个开源数据集混合训练, 切换到tag v0.0.1使用
|@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing [百度盘链接](https://pan.baidu.com/s/1vSYXO4wsLyjnF3Unl-Xoxg) 提取码1024 | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps 台湾口音需切换到tag v0.0.1使用
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ 提取码2021 | https://www.bilibili.com/video/BV1uh411B7AD/ | 150k steps 注意:根据[issue](https://github.com/babysor/MockingBird/issues/37)修复 并切换到tag v0.0.1使用
### 1.4 文件结构准备
文件结构准备如下所示算法将自动遍历synthesizer下的.pt模型文件。
```
# 以第一个 pretrained-11-7-21_75k.pt 为例
└── data
└── ckpt
└── synthesizer
└── pretrained-11-7-21_75k.pt
```
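
A minimal sketch of laying out that structure for a downloaded checkpoint (the download location `~/Downloads` is an assumption):

```shell
# Create the expected checkpoint folder and move the downloaded synthesizer model into it
mkdir -p data/ckpt/synthesizer
mv ~/Downloads/pretrained-11-7-21_75k.pt data/ckpt/synthesizer/
```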
### 1.5 运行
```
python web.py
```
## 2.模型训练
### 2.1 数据准备
#### 2.1.1 数据下载
``` shell
# aidatatang_200zh
wget https://openslr.elda.org/resources/62/aidatatang_200zh.tgz
```
``` shell
# MAGICDATA
wget https://openslr.magicdatatech.com/resources/68/train_set.tar.gz
wget https://openslr.magicdatatech.com/resources/68/dev_set.tar.gz
wget https://openslr.magicdatatech.com/resources/68/test_set.tar.gz
```
``` shell
# AISHELL-3
wget https://openslr.elda.org/resources/93/data_aishell3.tgz
```
```shell
# Aishell
wget https://openslr.elda.org/resources/33/data_aishell.tgz
```
#### 2.1.2 数据批量解压
```shell
# 该指令为解压当前目录下的所有压缩文件
for gz in *.gz; do tar -zxvf $gz; done
```
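
Note that the archives fetched above end in both `.tar.gz` and `.tgz`, and a `*.gz` glob does not match names ending in `.tgz`; a variant of the loop covering both patterns (a sketch, not part of the original instructions) would be:

```shell
# Extract every gzip-compressed tarball in the current directory, whether named *.tgz or *.tar.gz
for archive in *.tgz *.tar.gz; do
  tar -zxvf "$archive"
done
```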
### 2.2 encoder模型训练
#### 2.2.1 数据预处理:
需要先在`pre.py `头部加入:
```python
import torch
torch.multiprocessing.set_start_method('spawn', force=True)
```
使用以下指令对数据预处理:
```shell
python pre.py <datasets_root> \
-d <datasets_name>
```
其中`<datasets_root>`为原数据集路径,`<datasets_name>` 为数据集名称。
支持 `librispeech_other`、`voxceleb1`、`aidatatang_200zh`,使用逗号分割处理多数据集。
#### 2.2.2 encoder模型训练
超参数文件路径:`models/encoder/hparams.py`
```shell
python encoder_train.py <name> \
<datasets_root>/SV2TTS/encoder
```
其中 `<name>` 是训练产生文件的名称,可自行修改。
其中 `<datasets_root>` 是经过 `Step 2.2.1` 处理过后的数据集路径。
#### 2.2.3 开启encoder模型训练数据可视化可选
```shell
visdom
```
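
visdom serves its dashboard over HTTP on port 8097 by default; a typical way to keep it running alongside training (a sketch) is:

```shell
# Start the visdom server in the background; open http://localhost:8097 in a browser
# while encoder_train.py is running to watch the training curves.
visdom &
```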
### 2.3 synthesizer模型训练
#### 2.3.1 数据预处理:
```shell
python pre.py <datasets_root> \
-d <datasets_name> \
-o <datasets_path> \
-n <number>
```
`<datasets_root>` 为原数据集路径,当你的`aidatatang_200zh`路径为`/data/aidatatang_200zh/corpus/train`时,`<datasets_root>` 为 `/data/`。
`<datasets_name>` 为数据集名称。
`<datasets_path>` 为数据集处理后的保存路径。
`<number>` 为数据集处理时进程数根据CPU情况调整大小。
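
A concrete invocation under the layout described above (all paths and the process count are illustrative):

```shell
# Preprocess aidatatang_200zh located at /data/aidatatang_200zh/corpus/train,
# writing the synthesizer training data to /data/SV2TTS with 8 worker processes
python pre.py /data -d aidatatang_200zh -o /data/SV2TTS -n 8
```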
#### 2.3.2 新增数据预处理:
```shell
python pre.py <datasets_root> \
-d <datasets_name> \
-o <datasets_path> \
-n <number> \
-s
```
当新增数据集时,应加 `-s` 选择数据拼接,不加则为覆盖。
#### 2.3.3 synthesizer模型训练
超参数文件路径:`models/synthesizer/hparams.py`,需将`MockingBird/control/cli/synthesizer_train.py`移成`MockingBird/synthesizer_train.py`结构。
```shell
python synthesizer_train.py <name> <datasets_path> \
-m <out_dir>
```
其中 `<name>` 是训练产生文件的名称,可自行修改。
其中 `<datasets_path>` 是经过 `Step 2.3.1` 处理过后的数据集路径。
其中 `<out_dir> `为训练时所有数据的保存路径。
### 2.4 vocoder模型训练
vocoder模型对生成效果影响不大已预置3款。
#### 2.4.1 数据预处理
```shell
python vocoder_preprocess.py <datasets_root> \
-m <synthesizer_model_path>
```
其中`<datasets_root>`为你数据集路径。
其中 `<synthesizer_model_path>`为synthesizer模型地址。
#### 2.4.2 wavernn声码器训练:
```
python vocoder_train.py <name> <datasets_root>
```
#### 2.4.3 hifigan声码器训练:
```
python vocoder_train.py <name> <datasets_root> hifigan
```
#### 2.4.4 fregan声码器训练:
```
python vocoder_train.py <name> <datasets_root> \
--config config.json fregan
```
将GAN声码器的训练切换为多GPU模式修改`GAN`文件夹下`.json`文件中的`num_gpus`参数。
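
As a sketch of that edit (the exact config path depends on which GAN vocoder you train; `vocoder/fregan/config.json` is the file referenced elsewhere in this diff):

```shell
# Set num_gpus to 2 in the FreGAN config before launching vocoder_train.py
sed -i 's/"num_gpus": *[0-9]*/"num_gpus": 2/' vocoder/fregan/config.json
```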
## 3.致谢
### 3.1 项目致谢
该库一开始从仅支持英语的[Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) 分叉出来的,鸣谢作者。
### 3.2 论文致谢
| URL | Designation | 标题 | 实现源码 |
| --- | ----------- | ----- | --------------------- |
| [1803.09017](https://arxiv.org/abs/1803.09017) | GlobalStyleToken (synthesizer)| Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis | 本代码库 |
| [2010.05646](https://arxiv.org/abs/2010.05646) | HiFi-GAN (vocoder)| Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | 本代码库 |
| [2106.02297](https://arxiv.org/abs/2106.02297) | Fre-GAN (vocoder)| Fre-GAN: Adversarial Frequency-consistent Audio Synthesis | 本代码库 |
|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | SV2TTS | Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis | 本代码库 |
|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | 本代码库 |
### 3.3 开发者致谢
作为AI领域的从业者我们不仅乐于开发一些具有里程碑意义的算法项目同时也乐于分享项目以及开发过程中收获的喜悦。
因此你们的使用是对我们项目的最大认可。同时当你们在项目使用中遇到一些问题时,欢迎你们随时在issue上留言。你们的指正,这对于项目的后续优化具有十分重大的意义。
为了表示感谢,我们将在本项目中留下各位开发者信息以及相对应的贡献。
- ------------------------------------------------ 开 发 者 贡 献 内 容 ---------------------------------------------------------------------------------

View File

@@ -3,7 +3,7 @@
[![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](http://choosealicense.com/licenses/mit/) [![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](http://choosealicense.com/licenses/mit/)
> English | [中文](README-CN.md) > English | [中文](README-CN.md)| [中文Linux](README-LINUX-CN.md)
## Features ## Features
🌍 **Chinese** supported mandarin and tested with multiple datasets: aidatatang_200zh, magicdata, aishell3, data_aishell, and etc. 🌍 **Chinese** supported mandarin and tested with multiple datasets: aidatatang_200zh, magicdata, aishell3, data_aishell, and etc.
@@ -18,17 +18,10 @@
### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/) ### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/)
### Ongoing Works(Helps Needed)
* Major upgrade on GUI/Client and unifying web and toolbox
[X] Init framework `./mkgui` and [tech design](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
[X] Add demo part of Voice Cloning and Conversion
[X] Add preprocessing and training for Voice Conversion
[ ] Add preprocessing and training for Encoder/Synthesizer/Vocoder
* Major upgrade on model backend based on ESPnet2(not yet started)
## Quick Start ## Quick Start
### 1. Install Requirements ### 1. Install Requirements
#### 1.1 General Setup
> Follow the original repo to test if you got all environment ready. > Follow the original repo to test if you got all environment ready.
**Python 3.7 or higher ** is needed to run the toolbox. **Python 3.7 or higher ** is needed to run the toolbox.
@@ -37,8 +30,74 @@
* Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). * Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
* Run `pip install -r requirements.txt` to install the remaining necessary packages. * Run `pip install -r requirements.txt` to install the remaining necessary packages.
* Install webrtcvad `pip install webrtcvad-wheels`(If you need) * Install webrtcvad `pip install webrtcvad-wheels`(If you need)
> Note that we are using the pretrained encoder/vocoder but synthesizer since the original model is incompatible with the Chinese symbols. It means the demo_cli is not working at this moment.
or
- install dependencies with `conda` or `mamba`
```conda env create -n env_name -f env.yml```
```mamba env create -n env_name -f env.yml```
will create a virtual environment where necessary dependencies are installed. Switch to the new environment by `conda activate env_name` and enjoy it.
> env.yml only includes the necessary dependencies to run the project, temporarily without monotonic-align. You can check the official website to install the GPU version of pytorch.
#### 1.2 Setup with a M1 Mac
> The following steps are a workaround to directly use the original `demo_toolbox.py`without the changing of codes.
>
> Since the major issue comes with the PyQt5 packages used in `demo_toolbox.py` not compatible with M1 chips, were one to attempt on training models with the M1 chip, either that person can forgo `demo_toolbox.py`, or one can try the `web.py` in the project.
##### 1.2.1 Install `PyQt5`, with [ref](https://stackoverflow.com/a/68038451/20455983) here.
* Create and open a Rosetta Terminal, with [ref](https://dev.to/courier/tips-and-tricks-to-setup-your-apple-m1-for-development-547g) here.
* Use system Python to create a virtual environment for the project
```
/usr/bin/python3 -m venv /PathToMockingBird/venv
source /PathToMockingBird/venv/bin/activate
```
* Upgrade pip and install `PyQt5`
```
pip install --upgrade pip
pip install pyqt5
```
##### 1.2.2 Install `pyworld` and `ctc-segmentation`
> Both packages seem to be unique to this project and are not seen in the original [Real-Time Voice Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) project. When installing with `pip install`, both packages lack wheels so the program tries to directly compile from c code and could not find `Python.h`.
* Install `pyworld`
* `brew install python` `Python.h` can come with Python installed by brew
* `export CPLUS_INCLUDE_PATH=/opt/homebrew/Frameworks/Python.framework/Headers` The filepath of brew-installed `Python.h` is unique to M1 MacOS and listed above. One needs to manually add the path to the environment variables.
* `pip install pyworld` that should do.
* Install`ctc-segmentation`
> Same method does not apply to `ctc-segmentation`, and one needs to compile it from the source code on [github](https://github.com/lumaku/ctc-segmentation).
* `git clone https://github.com/lumaku/ctc-segmentation.git`
* `cd ctc-segmentation`
* `source /PathToMockingBird/venv/bin/activate` If the virtual environment hasn't been deployed, activate it.
* `cythonize -3 ctc_segmentation/ctc_segmentation_dyn.pyx`
* `/usr/bin/arch -x86_64 python setup.py build` Build with x86 architecture.
* `/usr/bin/arch -x86_64 python setup.py install --optimize=1 --skip-build`Install with x86 architecture.
##### 1.2.3 Other dependencies
* `/usr/bin/arch -x86_64 pip install torch torchvision torchaudio` Pip installing `PyTorch` as an example, articulate that it's installed with x86 architecture
* `pip install ffmpeg` Install ffmpeg
* `pip install -r requirements.txt` Install other requirements.
##### 1.2.4 Run the Inference Time (with Toolbox)
> To run the project on x86 architecture. [ref](https://youtrack.jetbrains.com/issue/PY-46290/Allow-running-Python-under-Rosetta-2-in-PyCharm-for-Apple-Silicon).
* `vim /PathToMockingBird/venv/bin/pythonM1` Create an executable file `pythonM1` to condition python interpreter at `/PathToMockingBird/venv/bin`.
* Write in the following content:
```
#!/usr/bin/env zsh
mydir=${0:a:h}
/usr/bin/arch -x86_64 $mydir/python "$@"
```
* `chmod +x pythonM1` Set the file as executable.
* If using PyCharm IDE, configure project interpreter to `pythonM1`([steps here](https://www.jetbrains.com/help/pycharm/configuring-python-interpreter.html#add-existing-interpreter)), if using command line python, run `/PathToMockingBird/venv/bin/pythonM1 demo_toolbox.py`
### 2. Prepare your models ### 2. Prepare your models
> Note that we are using the pretrained encoder/vocoder but not synthesizer, since the original model is incompatible with the Chinese symbols. It means the demo_cli is not working at this moment, so additional synthesizer models are required.
You can either train your models or use existing ones: You can either train your models or use existing ones:
#### 2.1 Train encoder with your dataset (Optional) #### 2.1 Train encoder with your dataset (Optional)
@@ -56,7 +115,7 @@ You can either train your models or use existing ones:
Allowing parameter `--dataset {dataset}` to support aidatatang_200zh, magicdata, aishell3, data_aishell, etc.If this parameter is not passed, the default dataset will be aidatatang_200zh. Allowing parameter `--dataset {dataset}` to support aidatatang_200zh, magicdata, aishell3, data_aishell, etc.If this parameter is not passed, the default dataset will be aidatatang_200zh.
* Train the synthesizer: * Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer` `python train.py --type=synth mandarin <datasets_root>/SV2TTS/synthesizer`
* Go to next step when you see attention line show and loss meet your need in training folder *synthesizer/saved_models/*. * Go to next step when you see attention line show and loss meet your need in training folder *synthesizer/saved_models/*.
@@ -67,7 +126,7 @@ Allowing parameter `--dataset {dataset}` to support aidatatang_200zh, magicdata,
| --- | ----------- | ----- |----- | | --- | ----------- | ----- |----- |
| @author | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [Baidu](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps trained by multiple datasets | @author | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [Baidu](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps trained by multiple datasets
| @author | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [Baidu](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) codeom7f | | 25k steps trained by multiple datasets, only works under version 0.0.1 | @author | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [Baidu](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) codeom7f | | 25k steps trained by multiple datasets, only works under version 0.0.1
|@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing https://u.teknik.io/AYxWf.pt | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps with local accent of Taiwan, only works under version 0.0.1 |@FawenYo | https://yisiou-my.sharepoint.com/:u:/g/personal/lawrence_cheng_fawenyo_onmicrosoft_com/EWFWDHzee-NNg9TWdKckCc4BC7bK2j9cCbOWn0-_tK0nOg?e=n0gGgC | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps with local accent of Taiwan, only works under version 0.0.1
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 https://www.aliyundrive.com/s/AwPsbo8mcSP code: z2m0 | https://www.bilibili.com/video/BV1uh411B7AD/ | only works under version 0.0.1 |@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 https://www.aliyundrive.com/s/AwPsbo8mcSP code: z2m0 | https://www.bilibili.com/video/BV1uh411B7AD/ | only works under version 0.0.1
#### 2.4 Train vocoder (Optional) #### 2.4 Train vocoder (Optional)

View File

@@ -1,9 +1,9 @@
from encoder.params_model import model_embedding_size as speaker_embedding_size from models.encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args from utils.argutils import print_args
from utils.modelutils import check_model_paths from utils.modelutils import check_model_paths
from synthesizer.inference import Synthesizer from models.synthesizer.inference import Synthesizer
from encoder import inference as encoder from models.encoder import inference as encoder
from vocoder import inference as vocoder from models.vocoder import inference as vocoder
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import soundfile as sf import soundfile as sf

View File

@@ -1,7 +1,10 @@
from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aidatatang_200zh
from utils.argutils import print_args
from pathlib import Path
import argparse import argparse
from pathlib import Path
from models.encoder.preprocess import (preprocess_aidatatang_200zh,
preprocess_librispeech, preprocess_voxceleb1,
preprocess_voxceleb2)
from utils.argutils import print_args
if __name__ == "__main__": if __name__ == "__main__":
class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):

View File

@@ -1,5 +1,5 @@
from utils.argutils import print_args from utils.argutils import print_args
from encoder.train import train from models.encoder.train import train
from pathlib import Path from pathlib import Path
import argparse import argparse

View File

@@ -2,8 +2,8 @@ import sys
import torch import torch
import argparse import argparse
import numpy as np import numpy as np
from utils.load_yaml import HpsYaml from utils.hparams import HpsYaml
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
# For reproducibility, comment these may speed up training # For reproducibility, comment these may speed up training
torch.backends.cudnn.deterministic = True torch.backends.cudnn.deterministic = True

View File

@@ -1,7 +1,7 @@
from pathlib import Path from pathlib import Path
import argparse import argparse
from ppg2mel.preprocess import preprocess_dataset from models.ppg2mel.preprocess import preprocess_dataset
from pathlib import Path from pathlib import Path
import argparse import argparse

View File

@@ -1,10 +1,9 @@
from synthesizer.hparams import hparams from models.synthesizer.hparams import hparams
from synthesizer.train import train from models.synthesizer.train import train
from utils.argutils import print_args from utils.argutils import print_args
import argparse import argparse
def new_train():
if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("run_id", type=str, help= \ parser.add_argument("run_id", type=str, help= \
"Name for this model instance. If a model state from the same run ID was previously " "Name for this model instance. If a model state from the same run ID was previously "
@@ -13,7 +12,7 @@ if __name__ == "__main__":
parser.add_argument("syn_dir", type=str, default=argparse.SUPPRESS, help= \ parser.add_argument("syn_dir", type=str, default=argparse.SUPPRESS, help= \
"Path to the synthesizer directory that contains the ground truth mel spectrograms, " "Path to the synthesizer directory that contains the ground truth mel spectrograms, "
"the wavs and the embeds.") "the wavs and the embeds.")
parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\ parser.add_argument("-m", "--models_dir", type=str, default=f"data/ckpt/synthesizer/", help=\
"Path to the output directory that will contain the saved model weights and the logs.") "Path to the output directory that will contain the saved model weights and the logs.")
parser.add_argument("-s", "--save_every", type=int, default=1000, help= \ parser.add_argument("-s", "--save_every", type=int, default=1000, help= \
"Number of steps between updates of the model on the disk. Set to 0 to never save the " "Number of steps between updates of the model on the disk. Set to 0 to never save the "
@@ -28,10 +27,14 @@ if __name__ == "__main__":
parser.add_argument("--hparams", default="", parser.add_argument("--hparams", default="",
help="Hyperparameter overrides as a comma-separated list of name=value " help="Hyperparameter overrides as a comma-separated list of name=value "
"pairs") "pairs")
args = parser.parse_args() args, _ = parser.parse_known_args()
print_args(args, parser) print_args(args, parser)
args.hparams = hparams.parse(args.hparams) args.hparams = hparams.parse(args.hparams)
# Run the training # Run the training
train(**vars(args)) train(**vars(args))
if __name__ == "__main__":
new_train()

View File

@@ -0,0 +1,66 @@
import sys
import torch
import argparse
import numpy as np
from utils.hparams import HpsYaml
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
# For reproducibility, comment these may speed up training
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
def main():
# Arguments
parser = argparse.ArgumentParser(description=
'Training PPG2Mel VC model.')
parser.add_argument('--config', type=str,
help='Path to experiment config, e.g., config/vc.yaml')
parser.add_argument('--name', default=None, type=str, help='Name for logging.')
parser.add_argument('--logdir', default='log/', type=str,
help='Logging path.', required=False)
parser.add_argument('--ckpdir', default='ppg2mel/saved_models/', type=str,
help='Checkpoint path.', required=False)
parser.add_argument('--outdir', default='result/', type=str,
help='Decode output path.', required=False)
parser.add_argument('--load', default=None, type=str,
help='Load pre-trained model (for training only)', required=False)
parser.add_argument('--warm_start', action='store_true',
help='Load model weights only, ignore specified layers.')
parser.add_argument('--seed', default=0, type=int,
help='Random seed for reproducable results.', required=False)
parser.add_argument('--njobs', default=8, type=int,
help='Number of threads for dataloader/decoding.', required=False)
parser.add_argument('--cpu', action='store_true', help='Disable GPU training.')
parser.add_argument('--no-pin', action='store_true',
help='Disable pin-memory for dataloader')
parser.add_argument('--test', action='store_true', help='Test the model.')
parser.add_argument('--no-msg', action='store_true', help='Hide all messages.')
parser.add_argument('--finetune', action='store_true', help='Finetune model')
parser.add_argument('--oneshotvc', action='store_true', help='Oneshot VC model')
parser.add_argument('--bilstm', action='store_true', help='BiLSTM VC model')
parser.add_argument('--lsa', action='store_true', help='Use location-sensitive attention (LSA)')
###
paras = parser.parse_args()
setattr(paras, 'gpu', not paras.cpu)
setattr(paras, 'pin_memory', not paras.no_pin)
setattr(paras, 'verbose', not paras.no_msg)
# Make the config dict dot visitable
config = HpsYaml(paras.config)
np.random.seed(paras.seed)
torch.manual_seed(paras.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(paras.seed)
print(">>> OneShot VC training ...")
mode = "train"
solver = Solver(config, paras, mode)
solver.load_data()
solver.set_model()
solver.exec()
print(">>> Oneshot VC train finished!")
sys.exit(0)
if __name__ == "__main__":
main()

View File

@@ -1,5 +1,5 @@
from synthesizer.synthesize import run_synthesis from models.synthesizer.synthesize import run_synthesis
from synthesizer.hparams import hparams from models.synthesizer.hparams import hparams
from utils.argutils import print_args from utils.argutils import print_args
import argparse import argparse
import os import os

View File

@@ -1,7 +1,7 @@
from utils.argutils import print_args from utils.argutils import print_args
from vocoder.wavernn.train import train from models.vocoder.wavernn.train import train
from vocoder.hifigan.train import train as train_hifigan from models.vocoder.hifigan.train import train as train_hifigan
from vocoder.fregan.train import train as train_fregan from models.vocoder.fregan.train import train as train_fregan
from utils.util import AttrDict from utils.util import AttrDict
from pathlib import Path from pathlib import Path
import argparse import argparse
@@ -78,7 +78,7 @@ if __name__ == "__main__":
else: else:
train_hifigan(0, args, h) train_hifigan(0, args, h)
elif args.vocoder_type == "fregan": elif args.vocoder_type == "fregan":
with open('vocoder/fregan/config.json') as f: with Path('vocoder/fregan/config.json').open() as f:
json_config = json.load(f) json_config = json.load(f)
h = AttrDict(json_config) h = AttrDict(json_config)
if h.num_gpus > 1: if h.num_gpus > 1:

View File

@@ -2,24 +2,26 @@ from pydantic import BaseModel, Field
import os import os
from pathlib import Path from pathlib import Path
from enum import Enum from enum import Enum
from encoder import inference as encoder from models.encoder import inference as encoder
import librosa import librosa
from scipy.io.wavfile import write from scipy.io.wavfile import write
import re import re
import numpy as np import numpy as np
from mkgui.base.components.types import FileContent from control.mkgui.base.components.types import FileContent
from vocoder.hifigan import inference as gan_vocoder from models.vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer from models.synthesizer.inference import Synthesizer
from typing import Any, Tuple from typing import Any, Tuple
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
# Constants # Constants
AUDIO_SAMPLES_DIR = f"samples{os.sep}" AUDIO_SAMPLES_DIR = f"data{os.sep}samples{os.sep}"
SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models" SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models" ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
VOC_MODELS_DIRT = f"vocoder{os.sep}saved_models" VOC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}vocoder"
TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav" TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav" TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"
if not os.path.isdir("wavs"):
os.makedirs("wavs")
# Load local sample audio as options TODO: load dataset # Load local sample audio as options TODO: load dataset
if os.path.isdir(AUDIO_SAMPLES_DIR): if os.path.isdir(AUDIO_SAMPLES_DIR):
@@ -29,7 +31,7 @@ if os.path.isdir(SYN_MODELS_DIRT):
synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt"))) synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
print("Loaded synthesizer models: " + str(len(synthesizers))) print("Loaded synthesizer models: " + str(len(synthesizers)))
else: else:
raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.") raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist. 请将模型文件位置移动到上述位置中进行重试!")
if os.path.isdir(ENC_MODELS_DIRT): if os.path.isdir(ENC_MODELS_DIRT):
encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt"))) encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
@@ -49,9 +51,11 @@ class Input(BaseModel):
..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容" ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
) )
local_audio_file: audio_input_selection = Field( local_audio_file: audio_input_selection = Field(
..., alias="输入语音本地wav", ..., alias="选择语音本地wav",
description="选择本地语音文件." description="选择本地语音文件."
) )
record_audio_file: FileContent = Field(default=None, alias="录制语音",
description="录音.", is_recorder=True, mime_type="audio/wav")
upload_audio_file: FileContent = Field(default=None, alias="或上传语音", upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
description="拖拽或点击上传.", mime_type="audio/wav") description="拖拽或点击上传.", mime_type="audio/wav")
encoder: encoders = Field( encoder: encoders = Field(
@@ -101,7 +105,12 @@ def synthesize(input: Input) -> Output:
gan_vocoder.load_model(Path(input.vocoder.value)) gan_vocoder.load_model(Path(input.vocoder.value))
# load file # load file
if input.upload_audio_file != None: if input.record_audio_file != None:
with open(TEMP_SOURCE_AUDIO, "w+b") as f:
f.write(input.record_audio_file.as_bytes())
f.seek(0)
wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
elif input.upload_audio_file != None:
with open(TEMP_SOURCE_AUDIO, "w+b") as f: with open(TEMP_SOURCE_AUDIO, "w+b") as f:
f.write(input.upload_audio_file.as_bytes()) f.write(input.upload_audio_file.as_bytes())
f.seek(0) f.seek(0)

View File

@@ -1,27 +1,26 @@
from synthesizer.inference import Synthesizer
from pydantic import BaseModel, Field
from encoder import inference as speacker_encoder
import torch
import os import os
from pathlib import Path
from enum import Enum from enum import Enum
import ppg_extractor as Extractor from pathlib import Path
import ppg2mel as Convertor
import librosa
from scipy.io.wavfile import write
import re
import numpy as np
from mkgui.base.components.types import FileContent
from vocoder.hifigan import inference as gan_vocoder
from typing import Any, Tuple from typing import Any, Tuple
import matplotlib.pyplot as plt
import librosa
import matplotlib.pyplot as plt
import torch
from pydantic import BaseModel, Field
from scipy.io.wavfile import write
import models.ppg2mel as Convertor
import models.ppg_extractor as Extractor
from control.mkgui.base.components.types import FileContent
from models.encoder import inference as speacker_encoder
from models.synthesizer.inference import Synthesizer
from models.vocoder.hifigan import inference as gan_vocoder
# Constants # Constants
AUDIO_SAMPLES_DIR = f'sample{os.sep}' AUDIO_SAMPLES_DIR = f'data{os.sep}samples{os.sep}'
EXT_MODELS_DIRT = f'ppg_extractor{os.sep}saved_models' EXT_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg_extractor'
CONV_MODELS_DIRT = f'ppg2mel{os.sep}saved_models' CONV_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg2mel'
VOC_MODELS_DIRT = f'vocoder{os.sep}saved_models' VOC_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}vocoder'
TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav' TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav' TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav' TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'
@@ -132,9 +131,10 @@ def convert(input: Input) -> Output:
ppg = extractor.extract_from_wav(src_wav) ppg = extractor.extract_from_wav(src_wav)
# Import necessary dependency of Voice Conversion # Import necessary dependency of Voice Conversion
from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv from utils.f0_utils import (compute_f0, compute_mean_std, f02lf0,
get_converted_lf0uv)
ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav))) ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
speacker_encoder.load_model(Path("encoder{os.sep}saved_models{os.sep}pretrained_bak_5805000.pt")) speacker_encoder.load_model(Path(f"data{os.sep}ckpt{os.sep}encoder{os.sep}pretrained_bak_5805000.pt"))
embed = speacker_encoder.embed_utterance(ref_wav) embed = speacker_encoder.embed_utterance(ref_wav)
lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True) lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
min_len = min(ppg.shape[1], len(lf0_uv)) min_len = min(ppg.shape[1], len(lf0_uv))

View File

@@ -37,6 +37,12 @@ def is_single_file_property(property: Dict) -> bool:
# TODO: binary? # TODO: binary?
return property.get("format") == "byte" return property.get("format") == "byte"
def is_single_autio_property(property: Dict) -> bool:
if property.get("type") != "string":
return False
# TODO: binary?
return property.get("format") == "bytes"
def is_single_directory_property(property: Dict) -> bool: def is_single_directory_property(property: Dict) -> bool:
if property.get("type") != "string": if property.get("type") != "string":

View File

@@ -2,7 +2,7 @@ import datetime
import inspect import inspect
import mimetypes import mimetypes
import sys import sys
from os import getcwd, unlink from os import getcwd, unlink, path
from platform import system from platform import system
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
from typing import Any, Callable, Dict, List, Type from typing import Any, Callable, Dict, List, Type
@@ -14,14 +14,13 @@ from fastapi.encoders import jsonable_encoder
from loguru import logger
from pydantic import BaseModel, ValidationError, parse_obj_as
-from mkgui.base import Opyrator
-from mkgui.base.core import name_to_title
-from mkgui.base.ui import schema_utils
-from mkgui.base.ui.streamlit_utils import CUSTOM_STREAMLIT_CSS
+from control.mkgui.base import Opyrator
+from control.mkgui.base.core import name_to_title
+from . import schema_utils
+from .streamlit_utils import CUSTOM_STREAMLIT_CSS
STREAMLIT_RUNNER_SNIPPET = """
-from mkgui.base.ui import render_streamlit_ui
-from mkgui.base import Opyrator
+from control.mkgui.base.ui import render_streamlit_ui
import streamlit as st
@@ -243,7 +242,14 @@ class InputUI:
        file_extension = None
        if "mime_type" in property:
            file_extension = mimetypes.guess_extension(property["mime_type"])
+        if "is_recorder" in property:
+            from audio_recorder_streamlit import audio_recorder
+            audio_bytes = audio_recorder()
+            if audio_bytes:
+                streamlit_app.audio(audio_bytes, format="audio/wav")
+            return audio_bytes
        uploaded_file = streamlit_app.file_uploader(
            **streamlit_kwargs, accept_multiple_files=False, type=file_extension
        )
@@ -263,6 +269,39 @@ class InputUI:
                streamlit_app.video(bytes, format=property.get("mime_type"))
        return bytes
+    def _render_single_audio_input(
+        self, streamlit_app: st, key: str, property: Dict
+    ) -> Any:
+        # streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
+        from audio_recorder_streamlit import audio_recorder
+        audio_bytes = audio_recorder()
+        if audio_bytes:
+            streamlit_app.audio(audio_bytes, format="audio/wav")
+        return audio_bytes
+        # file_extension = None
+        # if "mime_type" in property:
+        #     file_extension = mimetypes.guess_extension(property["mime_type"])
+        # uploaded_file = streamlit_app.file_uploader(
+        #     **streamlit_kwargs, accept_multiple_files=False, type=file_extension
+        # )
+        # if uploaded_file is None:
+        #     return None
+        # bytes = uploaded_file.getvalue()
+        # if property.get("mime_type"):
+        #     if is_compatible_audio(property["mime_type"]):
+        #         # Show audio
+        #         streamlit_app.audio(bytes, format=property.get("mime_type"))
+        #     if is_compatible_image(property["mime_type"]):
+        #         # Show image
+        #         streamlit_app.image(bytes)
+        #     if is_compatible_video(property["mime_type"]):
+        #         # Show video
+        #         streamlit_app.video(bytes, format=property.get("mime_type"))
+        # return bytes
    def _render_single_string_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
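
The recorder path above returns raw WAV bytes captured in the browser instead of going through st.file_uploader. A standalone sketch of the same pattern, assuming the third-party audio_recorder_streamlit package is installed (save as a script and launch with `streamlit run`):

import streamlit as st
from audio_recorder_streamlit import audio_recorder

st.title("Recorder demo")

# audio_recorder() renders a record button and returns WAV bytes, or None until something is recorded
audio_bytes = audio_recorder()
if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")
    st.write(f"Captured {len(audio_bytes)} bytes")
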
@@ -807,21 +846,20 @@ class OutputUI:
def getOpyrator(mode: str) -> Opyrator:
    if mode == None or mode.startswith('VC'):
-        from mkgui.app_vc import convert
+        from control.mkgui.app_vc import convert
        return Opyrator(convert)
    if mode == None or mode.startswith('预处理'):
-        from mkgui.preprocess import preprocess
+        from control.mkgui.preprocess import preprocess
        return Opyrator(preprocess)
    if mode == None or mode.startswith('模型训练'):
-        from mkgui.train import train
+        from control.mkgui.train import train
        return Opyrator(train)
    if mode == None or mode.startswith('模型训练(VC)'):
-        from mkgui.train_vc import train_vc
+        from control.mkgui.train_vc import train_vc
        return Opyrator(train_vc)
-    from mkgui.app import synthesize
+    from control.mkgui.app import synthesize
    return Opyrator(synthesize)
def render_streamlit_ui() -> None:
    # init
    session_state = st.session_state
@@ -845,7 +883,7 @@ def render_streamlit_ui() -> None:
    col2.title(title)
    col2.markdown("欢迎使用MockingBird Web 2")
-    image = Image.open('.\\mkgui\\static\\mb.png')
+    image = Image.open(path.join('control','mkgui', 'static', 'mb.png'))
    col1.image(image)
    st.markdown("---")
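
Swapping the hard-coded '.\\mkgui\\static\\mb.png' literal for path.join keeps the logo loadable on Linux and macOS as well as Windows. A minimal illustration, assuming the file exists at the new control/mkgui/static location:

from os import path
from PIL import Image

# portable: os.path.join inserts the right separator for the platform
logo = Image.open(path.join("control", "mkgui", "static", "mb.png"))

# non-portable: a backslash-separated literal only resolves on Windows
# logo = Image.open(".\\mkgui\\static\\mb.png")
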
@@ -853,6 +891,13 @@ def render_streamlit_ui() -> None:
    with left:
        st.header("Control 控制")
+        # if session_state.mode in ["AI拟音", "VC拟音"] :
+        #     from audiorecorder import audiorecorder
+        #     audio = audiorecorder("Click to record", "Recording...")
+        #     if len(audio) > 0:
+        #         # To play audio in frontend:
+        #         st.audio(audio.tobytes())
        InputUI(session_state=session_state, input_class=opyrator.input_type).render_ui(st)
        execute_selected = st.button(opyrator.action)
        if execute_selected:


@@ -6,8 +6,8 @@ from typing import Any, Tuple
# Constants
-EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
if os.path.isdir(EXT_MODELS_DIRT):
@@ -83,7 +83,7 @@ def preprocess(input: Input) -> Output:
"""Preprocess(预处理)""" """Preprocess(预处理)"""
finished = 0 finished = 0
if input.model == Model.VC_PPG2MEL: if input.model == Model.VC_PPG2MEL:
from ppg2mel.preprocess import preprocess_dataset from models.ppg2mel.preprocess import preprocess_dataset
finished = preprocess_dataset( finished = preprocess_dataset(
datasets_root=Path(input.datasets_root), datasets_root=Path(input.datasets_root),
dataset=input.dataset, dataset=input.dataset,


(binary image file diff: 5.6 KiB before, 5.6 KiB after)


@@ -3,17 +3,17 @@ import os
from pathlib import Path
from enum import Enum
from typing import Any
-from synthesizer.hparams import hparams
-from synthesizer.train import train as synt_train
+from models.synthesizer.hparams import hparams
+from models.synthesizer.train import train as synt_train
# Constants
-SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
-# EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
-# CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
-# ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
+# EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
+# CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
+# ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
# Pre-Load models
if os.path.isdir(SYN_MODELS_DIRT):
@@ -96,7 +96,7 @@ def train(input: Input) -> Output:
    synt_train(
        input.run_id,
        input.input_root,
-        f"synthesizer{os.sep}saved_models",
+        f"data{os.sep}ckpt{os.sep}synthesizer",
        input.save_every,
        input.backup_every,
        input.log_every,


@@ -4,14 +4,14 @@ from pathlib import Path
from enum import Enum
from typing import Any, Tuple
import numpy as np
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
from utils.util import AttrDict
import torch
# Constants
-EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
-CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
+CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
if os.path.isdir(EXT_MODELS_DIRT):
@@ -144,7 +144,7 @@ def train_vc(input: Input) -> Output:
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(input.seed)
    mode = "train"
-    from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+    from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
    solver = Solver(config, params, mode)
    solver.load_data()
    solver.set_model()


@@ -1,12 +1,12 @@
-from toolbox.ui import UI
-from encoder import inference as encoder
-from synthesizer.inference import Synthesizer
-from vocoder.wavernn import inference as rnn_vocoder
-from vocoder.hifigan import inference as gan_vocoder
-from vocoder.fregan import inference as fgan_vocoder
+from control.toolbox.ui import UI
+from models.encoder import inference as encoder
+from models.synthesizer.inference import Synthesizer
+from models.vocoder.wavernn import inference as rnn_vocoder
+from models.vocoder.hifigan import inference as gan_vocoder
+from models.vocoder.fregan import inference as fgan_vocoder
from pathlib import Path
from time import perf_counter as timer
-from toolbox.utterance import Utterance
+from control.toolbox.utterance import Utterance
import numpy as np
import traceback
import sys
@@ -38,8 +38,8 @@ recognized_datasets = [
"VoxCeleb2/dev/aac", "VoxCeleb2/dev/aac",
"VoxCeleb2/test/aac", "VoxCeleb2/test/aac",
"VCTK-Corpus/wav48", "VCTK-Corpus/wav48",
"aidatatang_200zh/corpus/dev",
"aidatatang_200zh/corpus/test", "aidatatang_200zh/corpus/test",
"aidatatang_200zh/corpus/train",
"aishell3/test/wav", "aishell3/test/wav",
"magicdata/train", "magicdata/train",
] ]
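
recognized_datasets holds paths relative to the datasets root passed to the toolbox. A hedged sketch of how such entries can be filtered down to the folders that actually exist; the existing_datasets helper is illustrative, not code from the repository:

from pathlib import Path

recognized_datasets = [
    "aidatatang_200zh/corpus/test",
    "aidatatang_200zh/corpus/train",
    "aishell3/test/wav",
    "magicdata/train",
]

def existing_datasets(datasets_root: Path) -> list:
    """Return the recognized dataset folders present under datasets_root."""
    return [d for d in recognized_datasets if (datasets_root / d).is_dir()]

# existing_datasets(Path("E:/datasets")) -> whichever of the entries are on disk
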
@@ -397,7 +397,7 @@ class Toolbox:
        self.ui.log("Loading the extractor %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
-        import ppg_extractor as extractor
+        import models.ppg_extractor as extractor
        self.extractor = extractor.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)
@@ -409,7 +409,7 @@ class Toolbox:
        self.ui.log("Loading the convertor %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
-        import ppg2mel as convertor
+        import models.ppg2mel as convertor
        self.convertor = convertor.load_model( model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)


(binary image file diff: 5.6 KiB before, 5.6 KiB after)


@@ -3,9 +3,8 @@ from PyQt5 import QtGui
from PyQt5.QtWidgets import *
import matplotlib.pyplot as plt
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
-from matplotlib.figure import Figure
-from encoder.inference import plot_embedding_as_heatmap
-from toolbox.utterance import Utterance
+from models.encoder.inference import plot_embedding_as_heatmap
+from control.toolbox.utterance import Utterance
from pathlib import Path
from typing import List, Set
import sounddevice as sd
@@ -34,7 +33,7 @@ colormap = np.array([
    [0, 0, 0],
    [183, 183, 183],
    [76, 255, 0],
-], dtype=np.float) / 255
+], dtype=float) / 255
default_text = \
    "欢迎使用工具箱, 现已支持中文输入!"
@@ -274,7 +273,9 @@ class UI(QDialog):
        if datasets_root is None or len(datasets) == 0:
            msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \
                if datasets_root is None else "o not have any of the recognized datasets" \
-                " in %s" % datasets_root)
+                " in %s \n" \
+                "Please note use 'E:\datasets' as root path " \
+                "instead of 'E:\datasets\aidatatang_200zh\corpus\test' as an example " % datasets_root)
            self.log(msg)
            msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \
                "can still use the toolbox by recording samples yourself." % \
@@ -401,8 +402,8 @@ class UI(QDialog):
        self.app.processEvents()
    def set_loading(self, value, maximum=1):
-        self.loading_bar.setValue(value * 100)
-        self.loading_bar.setMaximum(maximum * 100)
+        self.loading_bar.setValue(int(value * 100))
+        self.loading_bar.setMaximum(int(maximum * 100))
        self.loading_bar.setTextVisible(value != 0)
        self.app.processEvents()
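
QProgressBar.setValue and setMaximum take integers, and set_loading is called with fractional values (e.g. value=0.5), so value * 100 used to hand PyQt5 a float, which newer PyQt5/Python combinations reject with a TypeError; hence the int() casts. A minimal sketch of the conversion, independent of the toolbox UI:

def progress_args(value: float, maximum: float = 1.0) -> tuple:
    """Convert fractional progress into the integer range QProgressBar expects."""
    return int(value * 100), int(maximum * 100)

assert progress_args(0.5) == (50, 100)
assert progress_args(1, maximum=3) == (100, 300)
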


@@ -0,0 +1,31 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,
"upsample_rates": [5,5,4,2],
"upsample_kernel_sizes": [10,10,8,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 200,
"win_size": 800,
"sampling_rate": 16000,
"fmin": 0,
"fmax": 7600,
"fmax_for_loss": null,
"num_workers": 4
}
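
This new JSON is a HiFi-GAN-style hyper-parameter file for a 16 kHz vocoder (hop size 200, FFT size 1024). A hedged sketch of loading it into an attribute-access object; the file name below is a placeholder for wherever the repository stores this config:

import json
from types import SimpleNamespace

def load_vocoder_config(config_path: str) -> SimpleNamespace:
    """Read a HiFi-GAN style JSON config into an attribute-access namespace."""
    with open(config_path, "r", encoding="utf-8") as f:
        return SimpleNamespace(**json.load(f))

h = load_vocoder_config("hifigan_config.json")        # placeholder path
print(h.sampling_rate, h.hop_size, h.upsample_rates)  # 16000 200 [5, 5, 4, 2]
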

datasets_download/CN.txt (new file, 8 lines)

@@ -0,0 +1,8 @@
https://openslr.magicdatatech.com/resources/62/aidatatang_200zh.tgz
out=download/aidatatang_200zh.tgz
https://openslr.magicdatatech.com/resources/68/train_set.tar.gz
out=download/magicdata.tgz
https://openslr.magicdatatech.com/resources/93/data_aishell3.tgz
out=download/aishell3.tgz
https://openslr.magicdatatech.com/resources/33/data_aishell.tgz
out=download/data_aishell.tgz

datasets_download/EU.txt (new file, 8 lines)

@@ -0,0 +1,8 @@
https://openslr.elda.org/resources/62/aidatatang_200zh.tgz
out=download/aidatatang_200zh.tgz
https://openslr.elda.org/resources/68/train_set.tar.gz
out=download/magicdata.tgz
https://openslr.elda.org/resources/93/data_aishell3.tgz
out=download/aishell3.tgz
https://openslr.elda.org/resources/33/data_aishell.tgz
out=download/data_aishell.tgz

datasets_download/US.txt (new file, 8 lines)

@@ -0,0 +1,8 @@
https://us.openslr.org/resources/62/aidatatang_200zh.tgz
out=download/aidatatang_200zh.tgz
https://us.openslr.org/resources/68/train_set.tar.gz
out=download/magicdata.tgz
https://us.openslr.org/resources/93/data_aishell3.tgz
out=download/aishell3.tgz
https://us.openslr.org/resources/33/data_aishell.tgz
out=download/data_aishell.tgz


@@ -0,0 +1,4 @@
0c0ace77fe8ee77db8d7542d6eb0b7ddf09b1bfb880eb93a7fbdbf4611e9984b /datasets/download/aidatatang_200zh.tgz
be2507d431ad59419ec871e60674caedb2b585f84ffa01fe359784686db0e0cc /datasets/download/aishell3.tgz
a4a0313cde0a933e0e01a451f77de0a23d6c942f4694af5bb7f40b9dc38143fe /datasets/download/data_aishell.tgz
1d2647c614b74048cfe16492570cc5146d800afdc07483a43b31809772632143 /datasets/download/magicdata.tgz


@@ -0,0 +1,8 @@
https://www.openslr.org/resources/62/aidatatang_200zh.tgz
out=download/aidatatang_200zh.tgz
https://www.openslr.org/resources/68/train_set.tar.gz
out=download/magicdata.tgz
https://www.openslr.org/resources/93/data_aishell3.tgz
out=download/aishell3.tgz
https://www.openslr.org/resources/33/data_aishell.tgz
out=download/data_aishell.tgz

datasets_download/download.sh (new executable file, 8 lines)

@@ -0,0 +1,8 @@
#!/usr/bin/env bash
set -Eeuo pipefail
aria2c -x 10 --disable-ipv6 --input-file /workspace/datasets_download/${DATASET_MIRROR}.txt --dir /datasets --continue
echo "Verifying sha256sum..."
parallel --will-cite -a /workspace/datasets_download/datasets.sha256sum "echo -n {} | sha256sum -c"
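
download.sh feeds the chosen mirror list to aria2c and then checks the archives against datasets.sha256sum with GNU parallel. The verification step can be sketched in Python as well; the paths follow the container layout used above and the helper names are invented:

import hashlib
from pathlib import Path

def sha256_of(path: Path) -> str:
    """Stream a file through hashlib so multi-gigabyte archives do not need to fit in memory."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

def verify_checksums(checksum_file: Path) -> bool:
    """Check '<sha256>  <path>' lines, mirroring what `sha256sum -c` does."""
    all_ok = True
    for line in checksum_file.read_text().splitlines():
        if not line.strip():
            continue
        expected, _, target = line.partition(" ")
        ok = sha256_of(Path(target.strip())) == expected
        print(f"{target.strip()}: {'OK' if ok else 'FAILED'}")
        all_ok = all_ok and ok
    return all_ok

# verify_checksums(Path("/workspace/datasets_download/datasets.sha256sum"))
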

datasets_download/extract.sh (new executable file, 29 lines)

@@ -0,0 +1,29 @@
#!/usr/bin/env bash
set -Eeuo pipefail
mkdir -p /datasets/aidatatang_200zh
if [ -z "$(ls -A /datasets/aidatatang_200zh)" ] ; then
tar xvz --directory /datasets/ -f /datasets/download/aidatatang_200zh.tgz --exclude 'aidatatang_200zh/corpus/dev/*' --exclude 'aidatatang_200zh/corpus/test/*'
cd /datasets/aidatatang_200zh/corpus/train/
cat *.tar.gz | tar zxvf - -i
rm -f *.tar.gz
fi
mkdir -p /datasets/magicdata
if [ -z "$(ls -A /datasets/magicdata)" ] ; then
tar xvz --directory /datasets/magicdata -f /datasets/download/magicdata.tgz train/
fi
mkdir -p /datasets/aishell3
if [ -z "$(ls -A /datasets/aishell3)" ] ; then
tar xvz --directory /datasets/aishell3 -f /datasets/download/aishell3.tgz train/
fi
mkdir -p /datasets/data_aishell
if [ -z "$(ls -A /datasets/data_aishell)" ] ; then
tar xvz --directory /datasets/ -f /datasets/download/data_aishell.tgz
cd /datasets/data_aishell/wav/
cat *.tar.gz | tar zxvf - -i --exclude 'dev/*' --exclude 'test/*'
rm -f *.tar.gz
fi


@@ -1,5 +1,5 @@
from pathlib import Path
-from toolbox import Toolbox
+from control.toolbox import Toolbox
from utils.argutils import print_args
from utils.modelutils import check_model_paths
import argparse
@@ -17,15 +17,15 @@ if __name__ == '__main__':
"supported datasets.", default=None) "supported datasets.", default=None)
parser.add_argument("-vc", "--vc_mode", action="store_true", parser.add_argument("-vc", "--vc_mode", action="store_true",
help="Voice Conversion Mode(PPG based)") help="Voice Conversion Mode(PPG based)")
parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models", parser.add_argument("-e", "--enc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}encoder",
help="Directory containing saved encoder models") help="Directory containing saved encoder models")
parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models", parser.add_argument("-s", "--syn_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}synthesizer",
help="Directory containing saved synthesizer models") help="Directory containing saved synthesizer models")
parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models", parser.add_argument("-v", "--voc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}vocoder",
help="Directory containing saved vocoder models") help="Directory containing saved vocoder models")
parser.add_argument("-ex", "--extractor_models_dir", type=Path, default="ppg_extractor/saved_models", parser.add_argument("-ex", "--extractor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg_extractor",
help="Directory containing saved extrator models") help="Directory containing saved extrator models")
parser.add_argument("-cv", "--convertor_models_dir", type=Path, default="ppg2mel/saved_models", parser.add_argument("-cv", "--convertor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg2mel",
help="Directory containing saved convert models") help="Directory containing saved convert models")
parser.add_argument("--cpu", action="store_true", help=\ parser.add_argument("--cpu", action="store_true", help=\
"If True, processing is done on CPU, even when a GPU is available.") "If True, processing is done on CPU, even when a GPU is available.")

docker-compose.yml (new file, 23 lines)

@@ -0,0 +1,23 @@
version: '3.8'
services:
  server:
    image: mockingbird:latest
    build: .
    volumes:
      - ./datasets:/datasets
      - ./synthesizer/saved_models:/workspace/synthesizer/saved_models
    environment:
      - DATASET_MIRROR=US
      - FORCE_RETRAIN=false
      - TRAIN_DATASETS=aidatatang_200zh magicdata aishell3 data_aishell
      - TRAIN_SKIP_EXISTING=true
    ports:
      - 8080:8080
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: [ '0' ]
              capabilities: [ gpu ]

docker-entrypoint.sh (new executable file, 17 lines)

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
if [ -z "$(ls -A /workspace/synthesizer/saved_models)" ] || [ "$FORCE_RETRAIN" = true ] ; then
/workspace/datasets_download/download.sh
/workspace/datasets_download/extract.sh
for DATASET in ${TRAIN_DATASETS}
do
if [ "$TRAIN_SKIP_EXISTING" = true ] ; then
python pre.py /datasets -d ${DATASET} -n $(nproc) --skip_existing
else
python pre.py /datasets -d ${DATASET} -n $(nproc)
fi
done
python synthesizer_train.py mandarin /datasets/SV2TTS/synthesizer
fi
python web.py


@@ -1,2 +0,0 @@
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader

env.yml (binary file; content not shown)


@@ -1,23 +1,15 @@
-from encoder.params_model import model_embedding_size as speaker_embedding_size
-from utils.argutils import print_args
-from utils.modelutils import check_model_paths
-from synthesizer.inference import Synthesizer
-from encoder import inference as encoder
-from vocoder.wavernn import inference as rnn_vocoder
-from vocoder.hifigan import inference as gan_vocoder
+from models.synthesizer.inference import Synthesizer
+from models.encoder import inference as encoder
+from models.vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
-import librosa
-import argparse
import torch
import sys
import os
import re
import cn2an
-import glob
-from audioread.exceptions import NoBackendError
vocoder = gan_vocoder
def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):


@@ -1,5 +1,5 @@
from scipy.ndimage.morphology import binary_dilation
-from encoder.params_data import *
+from models.encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn
@@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
    # Resample the wav if needed
    if source_sr is not None and source_sr != sampling_rate:
-        wav = librosa.resample(wav, source_sr, sampling_rate)
+        wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)
    # Apply the preprocessing: normalize volume and shorten long silences
    if normalize:
@@ -99,7 +99,7 @@ def trim_long_silences(wav):
        return ret[width - 1:] / width
    audio_mask = moving_average(voice_flags, vad_moving_average_width)
-    audio_mask = np.round(audio_mask).astype(np.bool)
+    audio_mask = np.round(audio_mask).astype(bool)
    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
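
These two hunks track API changes in newer dependencies: librosa 0.10 makes the resample rates keyword-only arguments, and NumPy 1.24 removes the deprecated np.bool alias in favour of the builtin bool. A small compatibility sketch using a synthetic one-second test tone (invented for the example):

import librosa
import numpy as np

source_sr, sampling_rate = 44_100, 16_000
wav = np.sin(2 * np.pi * 440 * np.arange(source_sr) / source_sr)  # 1 s sine wave

# keyword arguments are required by librosa >= 0.10
wav_16k = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

# the builtin bool replaces the removed np.bool alias
audio_mask = np.round(np.random.rand(len(wav_16k))).astype(bool)
print(wav_16k.shape, audio_mask.dtype)
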


@@ -0,0 +1,2 @@
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader


@@ -1,5 +1,5 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.utterance import Utterance
+from models.encoder.data_objects.random_cycler import RandomCycler
+from models.encoder.data_objects.utterance import Utterance
from pathlib import Path
# Contains the set of utterances of a single speaker


@@ -1,6 +1,6 @@
import numpy as np
from typing import List
-from encoder.data_objects.speaker import Speaker
+from models.encoder.data_objects.speaker import Speaker
class SpeakerBatch:
    def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):


@@ -1,7 +1,7 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.speaker_batch import SpeakerBatch
-from encoder.data_objects.speaker import Speaker
-from encoder.params_data import partials_n_frames
+from models.encoder.data_objects.random_cycler import RandomCycler
+from models.encoder.data_objects.speaker_batch import SpeakerBatch
+from models.encoder.data_objects.speaker import Speaker
+from models.encoder.params_data import partials_n_frames
from torch.utils.data import Dataset, DataLoader
from pathlib import Path


@@ -1,8 +1,8 @@
-from encoder.params_data import *
-from encoder.model import SpeakerEncoder
-from encoder.audio import preprocess_wav # We want to expose this function from here
+from models.encoder.params_data import *
+from models.encoder.model import SpeakerEncoder
+from models.encoder.audio import preprocess_wav # We want to expose this function from here
from matplotlib import cm
-from encoder import audio
+from models.encoder import audio
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np


@@ -1,5 +1,5 @@
-from encoder.params_model import *
-from encoder.params_data import *
+from models.encoder.params_model import *
+from models.encoder.params_data import *
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from torch.nn.utils import clip_grad_norm_


@@ -1,8 +1,8 @@
from multiprocess.pool import ThreadPool
-from encoder.params_data import *
-from encoder.config import librispeech_datasets, anglophone_nationalites
+from models.encoder.params_data import *
+from models.encoder.config import librispeech_datasets, anglophone_nationalites
from datetime import datetime
-from encoder import audio
+from models.encoder import audio
from pathlib import Path
from tqdm import tqdm
import numpy as np
@@ -22,7 +22,7 @@ class DatasetLog:
        self._log_params()
    def _log_params(self):
-        from encoder import params_data
+        from models.encoder import params_data
        self.write_line("Parameter values:")
        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
            value = getattr(params_data, param_name)


@@ -1,7 +1,7 @@
-from encoder.visualizations import Visualizations
-from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
-from encoder.params_model import *
-from encoder.model import SpeakerEncoder
+from models.encoder.visualizations import Visualizations
+from models.encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
+from models.encoder.params_model import *
+from models.encoder.model import SpeakerEncoder
from utils.profiler import Profiler
from pathlib import Path
import torch


@@ -1,4 +1,4 @@
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from datetime import datetime
from time import perf_counter as timer
import matplotlib.pyplot as plt
@@ -21,7 +21,7 @@ colormap = np.array([
    [33, 0, 127],
    [0, 0, 0],
    [183, 183, 183],
-], dtype=np.float) / 255
+], dtype=float) / 255
class Visualizations:
@@ -65,8 +65,8 @@ class Visualizations:
    def log_params(self):
        if self.disabled:
            return
-        from encoder import params_data
-        from encoder import params_model
+        from models.encoder import params_data
+        from models.encoder import params_model
        param_string = "<b>Model parameters</b>:<br>"
        for param_name in (p for p in dir(params_model) if not p.startswith("__")):
            value = getattr(params_model, param_name)


@@ -15,7 +15,7 @@ from .rnn_decoder_mol import Decoder
from .utils.cnn_postnet import Postnet
from .utils.vc_utils import get_mask_from_lengths
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
class MelDecoderMOLv2(AbsMelDecoder):
    """Use an encoder to preprocess ppg."""


@@ -7,10 +7,10 @@ from pathlib import Path
import soundfile
import resampy
-from ppg_extractor import load_model
+from models.ppg_extractor import load_model
import encoder.inference as Encoder
-from encoder.audio import preprocess_wav
-from encoder import audio
+from models.encoder.audio import preprocess_wav
+from models.encoder import audio
from utils.f0_utils import compute_f0
from torch.multiprocessing import Pool, cpu_count


@@ -2,8 +2,8 @@ import sys
import torch
import argparse
import numpy as np
-from utils.load_yaml import HpsYaml
-from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+from utils.hparams import HpsYaml
+from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
# For reproducibility, comment these may speed up training
torch.backends.cudnn.deterministic = True
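
Several files now import HpsYaml from utils.hparams instead of utils.load_yaml; only the import path changes in this diff. As a rough, assumption-laden sketch, a loader of this kind typically wraps yaml.safe_load in a dict that allows attribute access; the class below is illustrative and not the repository's implementation:

import yaml

class HparamsSketch(dict):
    """Minimal stand-in for an HpsYaml-style config: a dict with attribute access."""
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as err:
            raise AttributeError(key) from err

def load_yaml_config(path: str) -> "HparamsSketch":
    with open(path, "r", encoding="utf-8") as f:
        return HparamsSketch(yaml.safe_load(f) or {})

# cfg = load_yaml_config("train_config.yaml")  # hypothetical file name
# cfg.model, cfg.data, ...                     # attribute-style access
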


@@ -8,7 +8,6 @@ from torch.utils.tensorboard import SummaryWriter
from .option import default_hparas
from utils.util import human_format, Timer
-from utils.load_yaml import HpsYaml
class BaseSolver():


@@ -14,7 +14,7 @@ from utils.data_load import OneshotVcDataset, MultiSpkVcCollate
from .loss import MaskedMSELoss
from .optim import Optimizer
from utils.util import human_format
-from ppg2mel import MelDecoderMOLv2
+from models.ppg2mel import MelDecoderMOLv2
class Solver(BaseSolver):


Some files were not shown because too many files have changed in this diff.