5 Commits

Author SHA1 Message Date
babysor00
f9ee4d7890 Avoid recursive calls of web ui for M1 2022-08-12 23:13:35 +08:00
babysor00
8d0d22bc00 Fix #657 2022-07-19 23:43:19 +08:00
babysor00
87f4859874 update launch json 2022-07-17 14:27:26 +08:00
babysor00
c3590bffb2 Add description for 2022-07-17 11:56:13 +08:00
babysor00
efbdb21b70 Refactor model 2022-07-17 11:55:53 +08:00
214 changed files with 28550 additions and 25447 deletions

View File

@@ -1,4 +0,0 @@
*/saved_models
!vocoder/saved_models/pretrained/**
!encoder/saved_models/pretrained.pt
/datasets

13
.gitignore vendored
View File

@@ -14,13 +14,8 @@
*.bcf
*.toc
*.sh
data/ckpt/*/*
!data/ckpt/encoder/pretrained.pt
!data/ckpt/vocoder/pretrained/
*/saved_models
!vocoder/saved_models/pretrained/**
!encoder/saved_models/pretrained.pt
wavs
log
!/docker-entrypoint.sh
!/datasets_download/*.sh
/datasets
monotonic_align/build
monotonic_align/monotonic_align
log

22
.vscode/launch.json vendored
View File

@@ -15,8 +15,7 @@
"name": "Python: Vocoder Preprocess",
"type": "python",
"request": "launch",
"program": "control\\cli\\vocoder_preprocess.py",
"cwd": "${workspaceFolder}",
"program": "vocoder_preprocess.py",
"console": "integratedTerminal",
"args": ["..\\audiodata"]
},
@@ -24,8 +23,7 @@
"name": "Python: Vocoder Train",
"type": "python",
"request": "launch",
"program": "control\\cli\\vocoder_train.py",
"cwd": "${workspaceFolder}",
"program": "vocoder_train.py",
"console": "integratedTerminal",
"args": ["dev", "..\\audiodata"]
},
@@ -34,7 +32,6 @@
"type": "python",
"request": "launch",
"program": "demo_toolbox.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"args": ["-d","..\\audiodata"]
},
@@ -43,7 +40,6 @@
"type": "python",
"request": "launch",
"program": "demo_toolbox.py",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"args": ["-d","..\\audiodata","-vc"]
},
@@ -51,9 +47,9 @@
"name": "Python: Synth Train",
"type": "python",
"request": "launch",
"program": "train.py",
"program": "synthesizer_train.py",
"console": "integratedTerminal",
"args": ["--type", "vits"]
"args": ["my_run", "..\\"]
},
{
"name": "Python: PPG Convert",
@@ -64,14 +60,6 @@
"args": ["-c", ".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2.yaml",
"-m", ".\\ppg2mel\\saved_models\\best_loss_step_304000.pth", "--wav_dir", ".\\wavs\\input", "--ref_wav_path", ".\\wavs\\pkq.mp3", "-o", ".\\wavs\\output\\"
]
},
{
"name": "Python: Vits Train",
"type": "python",
"request": "launch",
"program": "train.py",
"console": "integratedTerminal",
"args": ["--type", "vits"]
},
}
]
}

View File

@@ -1,17 +0,0 @@
FROM pytorch/pytorch:latest
RUN apt-get update && apt-get install -y build-essential ffmpeg parallel aria2 && apt-get clean
COPY ./requirements.txt /workspace/requirements.txt
RUN pip install -r requirements.txt && pip install webrtcvad-wheels
COPY . /workspace
VOLUME [ "/datasets", "/workspace/synthesizer/saved_models/" ]
ENV DATASET_MIRROR=default FORCE_RETRAIN=false TRAIN_DATASETS=aidatatang_200zh\ magicdata\ aishell3\ data_aishell TRAIN_SKIP_EXISTING=true
EXPOSE 8080
ENTRYPOINT [ "/workspace/docker-entrypoint.sh" ]

View File

@@ -18,10 +18,17 @@
🌍 **Webserver Ready** 可伺服你的训练结果,供远程调用
### 进行中的工作
* GUI/客户端大升级与合并
[X] 初始化框架 `./mkgui` 基于streamlit + fastapi和 [技术设计](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
[X] 增加 Voice Cloning and Conversion的演示页面
[X] 增加Voice Conversion的预处理preprocessing 和训练 training 页面
[ ] 增加其他的的预处理preprocessing 和训练 training 页面
* 模型后端基于ESPnet2升级
## 开始
### 1. 安装要求
#### 1.1 通用配置
> 按照原始存储库测试您是否已准备好所有环境。
运行工具箱(demo_toolbox.py)需要 **Python 3.7 或更高版本**
@@ -31,67 +38,6 @@
* 运行`pip install -r requirements.txt` 来安装剩余的必要包。
* 安装 webrtcvad `pip install webrtcvad-wheels`
或者
-`conda` 或者 `mamba` 安装依赖
```conda env create -n env_name -f env.yml```
```mamba env create -n env_name -f env.yml```
会创建新环境安装必须的依赖. 之后用 `conda activate env_name` 切换环境就完成了.
> env.yml只包含了运行时必要的依赖暂时不包括monotonic-align如果想要装GPU版本的pytorch可以查看官网教程。
#### 1.2 M1芯片Mac环境配置Inference Time)
> 以下环境按x86-64搭建使用原生的`demo_toolbox.py`可作为在不改代码情况下快速使用的workaround。
>
> 如需使用M1芯片训练因`demo_toolbox.py`依赖的`PyQt5`不支持M1则应按需修改代码或者尝试使用`web.py`。
* 安装`PyQt5`,参考[这个链接](https://stackoverflow.com/a/68038451/20455983)
* 用Rosetta打开Terminal参考[这个链接](https://dev.to/courier/tips-and-tricks-to-setup-your-apple-m1-for-development-547g)
* 用系统Python创建项目虚拟环境
```
/usr/bin/python3 -m venv /PathToMockingBird/venv
source /PathToMockingBird/venv/bin/activate
```
* 升级pip并安装`PyQt5`
```
pip install --upgrade pip
pip install pyqt5
```
* 安装`pyworld`和`ctc-segmentation`
> 这里两个文件直接`pip install`的时候找不到wheel尝试从c里build时找不到`Python.h`报错
* 安装`pyworld`
* `brew install python` 通过brew安装python时会自动安装`Python.h`
* `export CPLUS_INCLUDE_PATH=/opt/homebrew/Frameworks/Python.framework/Headers` 对于M1brew安装`Python.h`到上述路径。把路径添加到环境变量里
* `pip install pyworld`
* 安装`ctc-segmentation`
> 因上述方法没有成功,选择从[github](https://github.com/lumaku/ctc-segmentation) clone源码手动编译
* `git clone https://github.com/lumaku/ctc-segmentation.git` 克隆到任意位置
* `cd ctc-segmentation`
* `source /PathToMockingBird/venv/bin/activate` 假设一开始未开启打开MockingBird项目的虚拟环境
* `cythonize -3 ctc_segmentation/ctc_segmentation_dyn.pyx`
* `/usr/bin/arch -x86_64 python setup.py build` 要注意明确用x86-64架构编译
* `/usr/bin/arch -x86_64 python setup.py install --optimize=1 --skip-build`用x86-64架构安装
* 安装其他依赖
* `/usr/bin/arch -x86_64 pip install torch torchvision torchaudio` 这里用pip安装`PyTorch`明确架构是x86
* `pip install ffmpeg` 安装ffmpeg
* `pip install -r requirements.txt`
* 运行
> 参考[这个链接](https://youtrack.jetbrains.com/issue/PY-46290/Allow-running-Python-under-Rosetta-2-in-PyCharm-for-Apple-Silicon)
让项目跑在x86架构环境上
* `vim /PathToMockingBird/venv/bin/pythonM1`
* 写入以下代码
```
#!/usr/bin/env zsh
mydir=${0:a:h}
/usr/bin/arch -x86_64 $mydir/python "$@"
```
* `chmod +x pythonM1` 设为可执行文件
* 如果使用PyCharm则把Interpreter指向`pythonM1`,否则也可命令行运行`/PathToMockingBird/venv/bin/pythonM1 demo_toolbox.py`
### 2. 准备预训练模型
考虑训练您自己专属的模型或者下载社区他人训练好的模型:
> 近期创建了[知乎专题](https://www.zhihu.com/column/c_1425605280340504576) 将不定期更新炼丹小技巧or心得也欢迎提问
@@ -113,7 +59,7 @@
> 假如你下载的 `aidatatang_200zh`文件放在D盘`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
* 训练合成器:
`python ./control/cli/synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
* 当您在训练文件夹 *synthesizer/saved_models/* 中看到注意线显示和损失满足您的需要时,请转到`启动程序`一步。
@@ -135,14 +81,14 @@
* 训练wavernn声码器:
`python ./control/cli/vocoder_train.py <trainid> <datasets_root>`
`python vocoder_train.py <trainid> <datasets_root>`
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
* 训练hifigan声码器:
`python ./control/cli/vocoder_train.py <trainid> <datasets_root> hifigan`
`python vocoder_train.py <trainid> <datasets_root> hifigan`
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
* 训练fregan声码器:
`python ./control/cli/vocoder_train.py <trainid> <datasets_root> --config config.json fregan`
`python vocoder_train.py <trainid> <datasets_root> --config config.json fregan`
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
* 将GAN声码器的训练切换为多GPU模式修改GAN文件夹下.json文件中的"num_gpus"参数
### 3. 启动程序或工具箱
@@ -163,7 +109,7 @@
想像柯南拿着变声器然后发出毛利小五郎的声音吗本项目现基于PPG-VC引入额外两个模块PPG extractor + PPG2Mel, 可以实现变声功能。(文档不全,尤其是训练部分,正在努力补充中)
#### 4.0 准备环境
* 确保项目以上环境已经安装ok运行`pip install espnet` 来安装剩余的必要包。
* 下载以下模型 链接https://pan.baidu.com/s/1bl_x_DHJSAUyN2fma-Q_Wg
* 下载以下模型 链接https://pan.baidu.com/s/1bl_x_DHJSAUyN2fma-Q_Wg
提取码gh41
* 24K采样率专用的vocoderhifigan*vocoder\saved_models\xxx*
* 预训练的ppg特征encoder(ppg_extractor)到 *ppg_extractor\saved_models\xxx*
@@ -173,14 +119,14 @@
* 下载aidatatang_200zh数据集并解压确保您可以访问 *train* 文件夹中的所有音频文件(如.wav
* 进行音频和梅尔频谱图预处理:
`python ./control/cli/pre4ppg.py <datasets_root> -d {dataset} -n {number}`
`python pre4ppg.py <datasets_root> -d {dataset} -n {number}`
可传入参数:
* `-d {dataset}` 指定数据集,支持 aidatatang_200zh, 不传默认为aidatatang_200zh
* `-n {number}` 指定并行数CPU 11700k在8的情况下需要运行12到18小时待优化
* `-n {number}` 指定并行数CPU 11770k在8的情况下需要运行12到18小时待优化
> 假如你下载的 `aidatatang_200zh`文件放在D盘`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
* 训练合成器, 注意在上一步先下载好`ppg2mel.yaml`, 修改里面的地址指向预训练好的文件夹:
`python ./control/cli/ppg2mel_train.py --config .\ppg2mel\saved_models\ppg2mel.yaml --oneshotvc `
`python ppg2mel_train.py --config .\ppg2mel\saved_models\ppg2mel.yaml --oneshotvc `
* 如果想要继续上一次的训练,可以通过`--load .\ppg2mel\saved_models\<old_pt_file>` 参数指定一个预训练模型文件。
#### 4.2 启动工具箱VC模式
@@ -202,30 +148,30 @@
|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | 本代码库 |
## 常见问题(FQ&A)
#### 1.数据集在哪里下载?
## 常見問題(FQ&A)
#### 1.數據集哪裡下載?
| 数据集 | OpenSLR地址 | 其他源 (Google Drive, Baidu网盘等) |
| --- | ----------- | ---------------|
| aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) |
| magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) |
| aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) |
| data_aishell | [OpenSLR](https://www.openslr.org/33/) | |
> aidatatang_200zh 后,还需将 `aidatatang_200zh\corpus\train`下的文件全选解压缩
> aidatatang_200zh 後,還需將 `aidatatang_200zh\corpus\train`下的檔案全選解壓縮
#### 2.`<datasets_root>`是什麼意思?
假如数据集路径为 `D:\data\aidatatang_200zh`,那 `<datasets_root>`就是 `D:\data`
假如數據集路徑為 `D:\data\aidatatang_200zh`,那 `<datasets_root>`就是 `D:\data`
#### 3.训练模型存不足
训练合成器时:将 `synthesizer/hparams.py`中的batch_size参数调
#### 3.訓練模型存不足
訓練合成器時:將 `synthesizer/hparams.py`中的batch_size參數調
```
//整前
//調整前
tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
(2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
(2, 2e-4, 80_000, 12), #
(2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
(2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
(2, 1e-5, 640_000, 12)], # lr = learning rate
//调整后
//調整後
tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
(2, 5e-4, 40_000, 8), # (r, lr, step, batch_size)
(2, 2e-4, 80_000, 8), #
@@ -234,15 +180,15 @@ tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
(2, 1e-5, 640_000, 8)], # lr = learning rate
```
声码器-预处理数据集时:将 `synthesizer/hparams.py`中的batch_size参数调
聲碼器-預處理數據集時:將 `synthesizer/hparams.py`中的batch_size參數調
```
//整前
//調整前
### Data Preprocessing
max_mel_frames = 900,
rescale = True,
rescaling_max = 0.9,
synthesis_batch_size = 16, # For vocoder preprocessing and inference.
//调整后
//調整後
### Data Preprocessing
max_mel_frames = 900,
rescale = True,
@@ -250,16 +196,16 @@ tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
synthesis_batch_size = 8, # For vocoder preprocessing and inference.
```
声码器-训练声码器时:将 `vocoder/wavernn/hparams.py`中的batch_size参数调
聲碼器-訓練聲碼器時:將 `vocoder/wavernn/hparams.py`中的batch_size參數調
```
//整前
//調整前
# Training
voc_batch_size = 100
voc_lr = 1e-4
voc_gen_at_checkpoint = 5
voc_pad = 2
//调整后
//調整後
# Training
voc_batch_size = 6
voc_lr = 1e-4
@@ -268,16 +214,17 @@ voc_pad =2
```
#### 4.碰到`RuntimeError: Error(s) in loading state_dict for Tacotron: size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([70, 512]) from checkpoint, the shape in current model is torch.Size([75, 512]).`
请参照 issue [#37](https://github.com/babysor/MockingBird/issues/37)
請參照 issue [#37](https://github.com/babysor/MockingBird/issues/37)
#### 5.如何改善CPU、GPU用率?
视情况调整batch_size参数来改善
#### 5.如何改善CPU、GPU用率?
適情況調整batch_size參數來改善
#### 6.生 `面文件太小,法完成操作`
请参考这篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030)将虚拟内存更改100G(102400),例如:文件放置D就更改D的虚拟内存
#### 6.生 `面文件太小,法完成操作`
請參考這篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030)將虛擬內存更改100G(102400),例如:档案放置D就更改D的虚拟内存
#### 7.什么时候算训练完成?
首先一定要出现注意力模型其次是loss足够低取决于硬件设备和数据集。拿本人的供参考我的注意力是在 18k 步之后出现的,并且在 50k 步之后损失变得低于 0.4
![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png)
![step-135500-mel-spectrogram_sample_1](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png)

View File

@@ -18,10 +18,17 @@
### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/)
### Ongoing Works(Helps Needed)
* Major upgrade on GUI/Client and unifying web and toolbox
[X] Init framework `./mkgui` and [tech design](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
[X] Add demo part of Voice Cloning and Conversion
[X] Add preprocessing and training for Voice Conversion
[ ] Add preprocessing and training for Encoder/Synthesizer/Vocoder
* Major upgrade on model backend based on ESPnet2(not yet started)
## Quick Start
### 1. Install Requirements
#### 1.1 General Setup
> Follow the original repo to test if you got all environment ready.
**Python 3.7 or higher ** is needed to run the toolbox.
@@ -30,74 +37,8 @@
* Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
* Run `pip install -r requirements.txt` to install the remaining necessary packages.
* Install webrtcvad `pip install webrtcvad-wheels`(If you need)
or
- install dependencies with `conda` or `mamba`
```conda env create -n env_name -f env.yml```
```mamba env create -n env_name -f env.yml```
will create a virtual environment where necessary dependencies are installed. Switch to the new environment by `conda activate env_name` and enjoy it.
> env.yml only includes the necessary dependencies to run the projecttemporarily without monotonic-align. You can check the official website to install the GPU version of pytorch.
#### 1.2 Setup with a M1 Mac
> The following steps are a workaround to directly use the original `demo_toolbox.py`without the changing of codes.
>
> Since the major issue comes with the PyQt5 packages used in `demo_toolbox.py` not compatible with M1 chips, were one to attempt on training models with the M1 chip, either that person can forgo `demo_toolbox.py`, or one can try the `web.py` in the project.
##### 1.2.1 Install `PyQt5`, with [ref](https://stackoverflow.com/a/68038451/20455983) here.
* Create and open a Rosetta Terminal, with [ref](https://dev.to/courier/tips-and-tricks-to-setup-your-apple-m1-for-development-547g) here.
* Use system Python to create a virtual environment for the project
```
/usr/bin/python3 -m venv /PathToMockingBird/venv
source /PathToMockingBird/venv/bin/activate
```
* Upgrade pip and install `PyQt5`
```
pip install --upgrade pip
pip install pyqt5
```
##### 1.2.2 Install `pyworld` and `ctc-segmentation`
> Both packages seem to be unique to this project and are not seen in the original [Real-Time Voice Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) project. When installing with `pip install`, both packages lack wheels so the program tries to directly compile from c code and could not find `Python.h`.
* Install `pyworld`
* `brew install python` `Python.h` can come with Python installed by brew
* `export CPLUS_INCLUDE_PATH=/opt/homebrew/Frameworks/Python.framework/Headers` The filepath of brew-installed `Python.h` is unique to M1 MacOS and listed above. One needs to manually add the path to the environment variables.
* `pip install pyworld` that should do.
* Install`ctc-segmentation`
> Same method does not apply to `ctc-segmentation`, and one needs to compile it from the source code on [github](https://github.com/lumaku/ctc-segmentation).
* `git clone https://github.com/lumaku/ctc-segmentation.git`
* `cd ctc-segmentation`
* `source /PathToMockingBird/venv/bin/activate` If the virtual environment hasn't been deployed, activate it.
* `cythonize -3 ctc_segmentation/ctc_segmentation_dyn.pyx`
* `/usr/bin/arch -x86_64 python setup.py build` Build with x86 architecture.
* `/usr/bin/arch -x86_64 python setup.py install --optimize=1 --skip-build`Install with x86 architecture.
##### 1.2.3 Other dependencies
* `/usr/bin/arch -x86_64 pip install torch torchvision torchaudio` Pip installing `PyTorch` as an example, articulate that it's installed with x86 architecture
* `pip install ffmpeg` Install ffmpeg
* `pip install -r requirements.txt` Install other requirements.
##### 1.2.4 Run the Inference Time (with Toolbox)
> To run the project on x86 architecture. [ref](https://youtrack.jetbrains.com/issue/PY-46290/Allow-running-Python-under-Rosetta-2-in-PyCharm-for-Apple-Silicon).
* `vim /PathToMockingBird/venv/bin/pythonM1` Create an executable file `pythonM1` to condition python interpreter at `/PathToMockingBird/venv/bin`.
* Write in the following content:
```
#!/usr/bin/env zsh
mydir=${0:a:h}
/usr/bin/arch -x86_64 $mydir/python "$@"
```
* `chmod +x pythonM1` Set the file as executable.
* If using PyCharm IDE, configure project interpreter to `pythonM1`([steps here](https://www.jetbrains.com/help/pycharm/configuring-python-interpreter.html#add-existing-interpreter)), if using command line python, run `/PathToMockingBird/venv/bin/pythonM1 demo_toolbox.py`
> Note that we are using the pretrained encoder/vocoder but synthesizer since the original model is incompatible with the Chinese symbols. It means the demo_cli is not working at this moment.
### 2. Prepare your models
> Note that we are using the pretrained encoder/vocoder but not synthesizer, since the original model is incompatible with the Chinese symbols. It means the demo_cli is not working at this moment, so additional synthesizer models are required.
You can either train your models or use existing ones:
#### 2.1 Train encoder with your dataset (Optional)
@@ -126,7 +67,7 @@ Allowing parameter `--dataset {dataset}` to support aidatatang_200zh, magicdata,
| --- | ----------- | ----- |----- |
| @author | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [Baidu](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps trained by multiple datasets
| @author | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [Baidu](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) codeom7f | | 25k steps trained by multiple datasets, only works under version 0.0.1
|@FawenYo | https://yisiou-my.sharepoint.com/:u:/g/personal/lawrence_cheng_yisiou_onmicrosoft_com/EWFWDHzee-NNg9TWdKckCc4BC7bK2j9cCbOWn0-_tK0nOg?e=Cc4EFA https://u.teknik.io/AYxWf.pt | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps with local accent of Taiwan, only works under version 0.0.1
|@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing https://u.teknik.io/AYxWf.pt | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps with local accent of Taiwan, only works under version 0.0.1
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 https://www.aliyundrive.com/s/AwPsbo8mcSP code: z2m0 | https://www.bilibili.com/video/BV1uh411B7AD/ | only works under version 0.0.1
#### 2.4 Train vocoder (Optional)

View File

@@ -1,9 +1,9 @@
from models.encoder.params_model import model_embedding_size as speaker_embedding_size
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from utils.modelutils import check_model_paths
from models.synthesizer.inference import Synthesizer
from models.encoder import inference as encoder
from models.vocoder import inference as vocoder
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import soundfile as sf

View File

@@ -1,66 +0,0 @@
import sys
import torch
import argparse
import numpy as np
from utils.hparams import HpsYaml
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
# For reproducibility, comment these may speed up training
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
def main():
# Arguments
parser = argparse.ArgumentParser(description=
'Training PPG2Mel VC model.')
parser.add_argument('--config', type=str,
help='Path to experiment config, e.g., config/vc.yaml')
parser.add_argument('--name', default=None, type=str, help='Name for logging.')
parser.add_argument('--logdir', default='log/', type=str,
help='Logging path.', required=False)
parser.add_argument('--ckpdir', default='ppg2mel/saved_models/', type=str,
help='Checkpoint path.', required=False)
parser.add_argument('--outdir', default='result/', type=str,
help='Decode output path.', required=False)
parser.add_argument('--load', default=None, type=str,
help='Load pre-trained model (for training only)', required=False)
parser.add_argument('--warm_start', action='store_true',
help='Load model weights only, ignore specified layers.')
parser.add_argument('--seed', default=0, type=int,
help='Random seed for reproducable results.', required=False)
parser.add_argument('--njobs', default=8, type=int,
help='Number of threads for dataloader/decoding.', required=False)
parser.add_argument('--cpu', action='store_true', help='Disable GPU training.')
parser.add_argument('--no-pin', action='store_true',
help='Disable pin-memory for dataloader')
parser.add_argument('--test', action='store_true', help='Test the model.')
parser.add_argument('--no-msg', action='store_true', help='Hide all messages.')
parser.add_argument('--finetune', action='store_true', help='Finetune model')
parser.add_argument('--oneshotvc', action='store_true', help='Oneshot VC model')
parser.add_argument('--bilstm', action='store_true', help='BiLSTM VC model')
parser.add_argument('--lsa', action='store_true', help='Use location-sensitive attention (LSA)')
###
paras = parser.parse_args()
setattr(paras, 'gpu', not paras.cpu)
setattr(paras, 'pin_memory', not paras.no_pin)
setattr(paras, 'verbose', not paras.no_msg)
# Make the config dict dot visitable
config = HpsYaml(paras.config)
np.random.seed(paras.seed)
torch.manual_seed(paras.seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(paras.seed)
print(">>> OneShot VC training ...")
mode = "train"
solver = Solver(config, paras, mode)
solver.load_data()
solver.set_model()
solver.exec()
print(">>> Oneshot VC train finished!")
sys.exit(0)
if __name__ == "__main__":
main()

View File

@@ -1,31 +0,0 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,
"upsample_rates": [5,5,4,2],
"upsample_kernel_sizes": [10,10,8,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 200,
"win_size": 800,
"sampling_rate": 16000,
"fmin": 0,
"fmax": 7600,
"fmax_for_loss": null,
"num_workers": 4
}

View File

@@ -1,8 +0,0 @@
https://openslr.magicdatatech.com/resources/62/aidatatang_200zh.tgz
out=download/aidatatang_200zh.tgz
https://openslr.magicdatatech.com/resources/68/train_set.tar.gz
out=download/magicdata.tgz
https://openslr.magicdatatech.com/resources/93/data_aishell3.tgz
out=download/aishell3.tgz
https://openslr.magicdatatech.com/resources/33/data_aishell.tgz
out=download/data_aishell.tgz

View File

@@ -1,8 +0,0 @@
https://openslr.elda.org/resources/62/aidatatang_200zh.tgz
out=download/aidatatang_200zh.tgz
https://openslr.elda.org/resources/68/train_set.tar.gz
out=download/magicdata.tgz
https://openslr.elda.org/resources/93/data_aishell3.tgz
out=download/aishell3.tgz
https://openslr.elda.org/resources/33/data_aishell.tgz
out=download/data_aishell.tgz

View File

@@ -1,8 +0,0 @@
https://us.openslr.org/resources/62/aidatatang_200zh.tgz
out=download/aidatatang_200zh.tgz
https://us.openslr.org/resources/68/train_set.tar.gz
out=download/magicdata.tgz
https://us.openslr.org/resources/93/data_aishell3.tgz
out=download/aishell3.tgz
https://us.openslr.org/resources/33/data_aishell.tgz
out=download/data_aishell.tgz

View File

@@ -1,4 +0,0 @@
0c0ace77fe8ee77db8d7542d6eb0b7ddf09b1bfb880eb93a7fbdbf4611e9984b /datasets/download/aidatatang_200zh.tgz
be2507d431ad59419ec871e60674caedb2b585f84ffa01fe359784686db0e0cc /datasets/download/aishell3.tgz
a4a0313cde0a933e0e01a451f77de0a23d6c942f4694af5bb7f40b9dc38143fe /datasets/download/data_aishell.tgz
1d2647c614b74048cfe16492570cc5146d800afdc07483a43b31809772632143 /datasets/download/magicdata.tgz

View File

@@ -1,8 +0,0 @@
https://www.openslr.org/resources/62/aidatatang_200zh.tgz
out=download/aidatatang_200zh.tgz
https://www.openslr.org/resources/68/train_set.tar.gz
out=download/magicdata.tgz
https://www.openslr.org/resources/93/data_aishell3.tgz
out=download/aishell3.tgz
https://www.openslr.org/resources/33/data_aishell.tgz
out=download/data_aishell.tgz

View File

@@ -1,8 +0,0 @@
#!/usr/bin/env bash
set -Eeuo pipefail
aria2c -x 10 --disable-ipv6 --input-file /workspace/datasets_download/${DATASET_MIRROR}.txt --dir /datasets --continue
echo "Verifying sha256sum..."
parallel --will-cite -a /workspace/datasets_download/datasets.sha256sum "echo -n {} | sha256sum -c"

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env bash
set -Eeuo pipefail
mkdir -p /datasets/aidatatang_200zh
if [ -z "$(ls -A /datasets/aidatatang_200zh)" ] ; then
tar xvz --directory /datasets/ -f /datasets/download/aidatatang_200zh.tgz --exclude 'aidatatang_200zh/corpus/dev/*' --exclude 'aidatatang_200zh/corpus/test/*'
cd /datasets/aidatatang_200zh/corpus/train/
cat *.tar.gz | tar zxvf - -i
rm -f *.tar.gz
fi
mkdir -p /datasets/magicdata
if [ -z "$(ls -A /datasets/magicdata)" ] ; then
tar xvz --directory /datasets/magicdata -f /datasets/download/magicdata.tgz train/
fi
mkdir -p /datasets/aishell3
if [ -z "$(ls -A /datasets/aishell3)" ] ; then
tar xvz --directory /datasets/aishell3 -f /datasets/download/aishell3.tgz train/
fi
mkdir -p /datasets/data_aishell
if [ -z "$(ls -A /datasets/data_aishell)" ] ; then
tar xvz --directory /datasets/ -f /datasets/download/data_aishell.tgz
cd /datasets/data_aishell/wav/
cat *.tar.gz | tar zxvf - -i --exclude 'dev/*' --exclude 'test/*'
rm -f *.tar.gz
fi

View File

@@ -1,5 +1,5 @@
from pathlib import Path
from control.toolbox import Toolbox
from toolbox import Toolbox
from utils.argutils import print_args
from utils.modelutils import check_model_paths
import argparse
@@ -17,15 +17,15 @@ if __name__ == '__main__':
"supported datasets.", default=None)
parser.add_argument("-vc", "--vc_mode", action="store_true",
help="Voice Conversion Mode(PPG based)")
parser.add_argument("-e", "--enc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}encoder",
parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
help="Directory containing saved encoder models")
parser.add_argument("-s", "--syn_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}synthesizer",
parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
help="Directory containing saved synthesizer models")
parser.add_argument("-v", "--voc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}vocoder",
parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
help="Directory containing saved vocoder models")
parser.add_argument("-ex", "--extractor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg_extractor",
parser.add_argument("-ex", "--extractor_models_dir", type=Path, default="ppg_extractor/saved_models",
help="Directory containing saved extrator models")
parser.add_argument("-cv", "--convertor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg2mel",
parser.add_argument("-cv", "--convertor_models_dir", type=Path, default="ppg2mel/saved_models",
help="Directory containing saved convert models")
parser.add_argument("--cpu", action="store_true", help=\
"If True, processing is done on CPU, even when a GPU is available.")

View File

@@ -1,23 +0,0 @@
version: '3.8'
services:
server:
image: mockingbird:latest
build: .
volumes:
- ./datasets:/datasets
- ./synthesizer/saved_models:/workspace/synthesizer/saved_models
environment:
- DATASET_MIRROR=US
- FORCE_RETRAIN=false
- TRAIN_DATASETS=aidatatang_200zh magicdata aishell3 data_aishell
- TRAIN_SKIP_EXISTING=true
ports:
- 8080:8080
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: [ '0' ]
capabilities: [ gpu ]

View File

@@ -1,17 +0,0 @@
#!/usr/bin/env bash
if [ -z "$(ls -A /workspace/synthesizer/saved_models)" ] || [ "$FORCE_RETRAIN" = true ] ; then
/workspace/datasets_download/download.sh
/workspace/datasets_download/extract.sh
for DATASET in ${TRAIN_DATASETS}
do
if [ "$TRAIN_SKIP_EXISTING" = true ] ; then
python pre.py /datasets -d ${DATASET} -n $(nproc) --skip_existing
else
python pre.py /datasets -d ${DATASET} -n $(nproc)
fi
done
python synthesizer_train.py mandarin /datasets/SV2TTS/synthesizer
fi
python web.py

View File

@@ -1,5 +1,5 @@
from scipy.ndimage.morphology import binary_dilation
from models.encoder.params_data import *
from encoder.params_data import *
from pathlib import Path
from typing import Optional, Union
from warnings import warn
@@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
# Resample the wav if needed
if source_sr is not None and source_sr != sampling_rate:
wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)
wav = librosa.resample(wav, source_sr, sampling_rate)
# Apply the preprocessing: normalize volume and shorten long silences
if normalize:
@@ -99,7 +99,7 @@ def trim_long_silences(wav):
return ret[width - 1:] / width
audio_mask = moving_average(voice_flags, vad_moving_average_width)
audio_mask = np.round(audio_mask).astype(bool)
audio_mask = np.round(audio_mask).astype(np.bool)
# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))

View File

@@ -0,0 +1,2 @@
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader

View File

@@ -1,5 +1,5 @@
from models.encoder.data_objects.random_cycler import RandomCycler
from models.encoder.data_objects.utterance import Utterance
from encoder.data_objects.random_cycler import RandomCycler
from encoder.data_objects.utterance import Utterance
from pathlib import Path
# Contains the set of utterances of a single speaker

View File

@@ -1,6 +1,6 @@
import numpy as np
from typing import List
from models.encoder.data_objects.speaker import Speaker
from encoder.data_objects.speaker import Speaker
class SpeakerBatch:
def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):

View File

@@ -1,7 +1,7 @@
from models.encoder.data_objects.random_cycler import RandomCycler
from models.encoder.data_objects.speaker_batch import SpeakerBatch
from models.encoder.data_objects.speaker import Speaker
from models.encoder.params_data import partials_n_frames
from encoder.data_objects.random_cycler import RandomCycler
from encoder.data_objects.speaker_batch import SpeakerBatch
from encoder.data_objects.speaker import Speaker
from encoder.params_data import partials_n_frames
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

View File

@@ -1,8 +1,8 @@
from models.encoder.params_data import *
from models.encoder.model import SpeakerEncoder
from models.encoder.audio import preprocess_wav # We want to expose this function from here
from encoder.params_data import *
from encoder.model import SpeakerEncoder
from encoder.audio import preprocess_wav # We want to expose this function from here
from matplotlib import cm
from models.encoder import audio
from encoder import audio
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

View File

@@ -1,5 +1,5 @@
from models.encoder.params_model import *
from models.encoder.params_data import *
from encoder.params_model import *
from encoder.params_data import *
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from torch.nn.utils import clip_grad_norm_

View File

@@ -1,8 +1,8 @@
from multiprocess.pool import ThreadPool
from models.encoder.params_data import *
from models.encoder.config import librispeech_datasets, anglophone_nationalites
from encoder.params_data import *
from encoder.config import librispeech_datasets, anglophone_nationalites
from datetime import datetime
from models.encoder import audio
from encoder import audio
from pathlib import Path
from tqdm import tqdm
import numpy as np
@@ -22,7 +22,7 @@ class DatasetLog:
self._log_params()
def _log_params(self):
from models.encoder import params_data
from encoder import params_data
self.write_line("Parameter values:")
for param_name in (p for p in dir(params_data) if not p.startswith("__")):
value = getattr(params_data, param_name)

View File

@@ -1,7 +1,7 @@
from models.encoder.visualizations import Visualizations
from models.encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
from models.encoder.params_model import *
from models.encoder.model import SpeakerEncoder
from encoder.visualizations import Visualizations
from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
from encoder.params_model import *
from encoder.model import SpeakerEncoder
from utils.profiler import Profiler
from pathlib import Path
import torch

View File

@@ -1,4 +1,4 @@
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from datetime import datetime
from time import perf_counter as timer
import matplotlib.pyplot as plt
@@ -21,7 +21,7 @@ colormap = np.array([
[33, 0, 127],
[0, 0, 0],
[183, 183, 183],
], dtype=float) / 255
], dtype=np.float) / 255
class Visualizations:
@@ -65,8 +65,8 @@ class Visualizations:
def log_params(self):
if self.disabled:
return
from models.encoder import params_data
from models.encoder import params_model
from encoder import params_data
from encoder import params_model
param_string = "<b>Model parameters</b>:<br>"
for param_name in (p for p in dir(params_model) if not p.startswith("__")):
value = getattr(params_model, param_name)

View File

@@ -1,10 +1,7 @@
import argparse
from pathlib import Path
from models.encoder.preprocess import (preprocess_aidatatang_200zh,
preprocess_librispeech, preprocess_voxceleb1,
preprocess_voxceleb2)
from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aidatatang_200zh
from utils.argutils import print_args
from pathlib import Path
import argparse
if __name__ == "__main__":
class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):

View File

@@ -1,5 +1,5 @@
from utils.argutils import print_args
from models.encoder.train import train
from encoder.train import train
from pathlib import Path
import argparse

BIN
env.yml

Binary file not shown.

View File

@@ -1,15 +1,23 @@
from models.synthesizer.inference import Synthesizer
from models.encoder import inference as encoder
from models.vocoder.hifigan import inference as gan_vocoder
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from utils.modelutils import check_model_paths
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder.wavernn import inference as rnn_vocoder
from vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import librosa
import argparse
import torch
import sys
import os
import re
import cn2an
import glob
from audioread.exceptions import NoBackendError
vocoder = gan_vocoder
def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):

View File

@@ -2,22 +2,22 @@ from pydantic import BaseModel, Field
import os
from pathlib import Path
from enum import Enum
from models.encoder import inference as encoder
from encoder import inference as encoder
import librosa
from scipy.io.wavfile import write
import re
import numpy as np
from control.mkgui.base.components.types import FileContent
from models.vocoder.hifigan import inference as gan_vocoder
from models.synthesizer.inference import Synthesizer
from mkgui.base.components.types import FileContent
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer
from typing import Any, Tuple
import matplotlib.pyplot as plt
# Constants
AUDIO_SAMPLES_DIR = f"data{os.sep}samples{os.sep}"
SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
VOC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}vocoder"
AUDIO_SAMPLES_DIR = f"samples{os.sep}"
SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
VOC_MODELS_DIRT = f"vocoder{os.sep}saved_models"
TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"
if not os.path.isdir("wavs"):
@@ -31,7 +31,7 @@ if os.path.isdir(SYN_MODELS_DIRT):
synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
print("Loaded synthesizer models: " + str(len(synthesizers)))
else:
raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist. 请将模型文件位置移动到上述位置中进行重试!")
raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")
if os.path.isdir(ENC_MODELS_DIRT):
encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
@@ -46,16 +46,15 @@ else:
raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")
class Input(BaseModel):
message: str = Field(
..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
)
local_audio_file: audio_input_selection = Field(
..., alias="选择语音本地wav",
..., alias="输入语音本地wav",
description="选择本地语音文件."
)
record_audio_file: FileContent = Field(default=None, alias="录制语音",
description="录音.", is_recorder=True, mime_type="audio/wav")
upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
description="拖拽或点击上传.", mime_type="audio/wav")
encoder: encoders = Field(
@@ -105,12 +104,7 @@ def synthesize(input: Input) -> Output:
gan_vocoder.load_model(Path(input.vocoder.value))
# load file
if input.record_audio_file != None:
with open(TEMP_SOURCE_AUDIO, "w+b") as f:
f.write(input.record_audio_file.as_bytes())
f.seek(0)
wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
elif input.upload_audio_file != None:
if input.upload_audio_file != None:
with open(TEMP_SOURCE_AUDIO, "w+b") as f:
f.write(input.upload_audio_file.as_bytes())
f.seek(0)

View File

@@ -1,26 +1,27 @@
import os
from enum import Enum
from pathlib import Path
from typing import Any, Tuple
import librosa
import matplotlib.pyplot as plt
import torch
from synthesizer.inference import Synthesizer
from pydantic import BaseModel, Field
from encoder import inference as speacker_encoder
import torch
import os
from pathlib import Path
from enum import Enum
import ppg_extractor as Extractor
import ppg2mel as Convertor
import librosa
from scipy.io.wavfile import write
import re
import numpy as np
from mkgui.base.components.types import FileContent
from vocoder.hifigan import inference as gan_vocoder
from typing import Any, Tuple
import matplotlib.pyplot as plt
import models.ppg2mel as Convertor
import models.ppg_extractor as Extractor
from control.mkgui.base.components.types import FileContent
from models.encoder import inference as speacker_encoder
from models.synthesizer.inference import Synthesizer
from models.vocoder.hifigan import inference as gan_vocoder
# Constants
AUDIO_SAMPLES_DIR = f'data{os.sep}samples{os.sep}'
EXT_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg_extractor'
CONV_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg2mel'
VOC_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}vocoder'
AUDIO_SAMPLES_DIR = f'sample{os.sep}'
EXT_MODELS_DIRT = f'ppg_extractor{os.sep}saved_models'
CONV_MODELS_DIRT = f'ppg2mel{os.sep}saved_models'
VOC_MODELS_DIRT = f'vocoder{os.sep}saved_models'
TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'
@@ -131,10 +132,9 @@ def convert(input: Input) -> Output:
ppg = extractor.extract_from_wav(src_wav)
# Import necessary dependency of Voice Conversion
from utils.f0_utils import (compute_f0, compute_mean_std, f02lf0,
get_converted_lf0uv)
from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
speacker_encoder.load_model(Path(f"data{os.sep}ckpt{os.sep}encoder{os.sep}pretrained_bak_5805000.pt"))
speacker_encoder.load_model(Path("encoder{os.sep}saved_models{os.sep}pretrained_bak_5805000.pt"))
embed = speacker_encoder.embed_utterance(ref_wav)
lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
min_len = min(ppg.shape[1], len(lf0_uv))

View File

@@ -37,12 +37,6 @@ def is_single_file_property(property: Dict) -> bool:
# TODO: binary?
return property.get("format") == "byte"
def is_single_autio_property(property: Dict) -> bool:
if property.get("type") != "string":
return False
# TODO: binary?
return property.get("format") == "bytes"
def is_single_directory_property(property: Dict) -> bool:
if property.get("type") != "string":

View File

@@ -2,7 +2,7 @@ import datetime
import inspect
import mimetypes
import sys
from os import getcwd, unlink, path
from os import getcwd, unlink
from platform import system
from tempfile import NamedTemporaryFile
from typing import Any, Callable, Dict, List, Type
@@ -14,13 +14,14 @@ from fastapi.encoders import jsonable_encoder
from loguru import logger
from pydantic import BaseModel, ValidationError, parse_obj_as
from control.mkgui.base import Opyrator
from control.mkgui.base.core import name_to_title
from . import schema_utils
from .streamlit_utils import CUSTOM_STREAMLIT_CSS
from mkgui.base import Opyrator
from mkgui.base.core import name_to_title
from mkgui.base.ui import schema_utils
from mkgui.base.ui.streamlit_utils import CUSTOM_STREAMLIT_CSS
STREAMLIT_RUNNER_SNIPPET = """
from control.mkgui.base.ui import render_streamlit_ui
from mkgui.base.ui import render_streamlit_ui
from mkgui.base import Opyrator
import streamlit as st
@@ -242,14 +243,7 @@ class InputUI:
file_extension = None
if "mime_type" in property:
file_extension = mimetypes.guess_extension(property["mime_type"])
if "is_recorder" in property:
from audio_recorder_streamlit import audio_recorder
audio_bytes = audio_recorder()
if audio_bytes:
streamlit_app.audio(audio_bytes, format="audio/wav")
return audio_bytes
uploaded_file = streamlit_app.file_uploader(
**streamlit_kwargs, accept_multiple_files=False, type=file_extension
)
@@ -269,39 +263,6 @@ class InputUI:
streamlit_app.video(bytes, format=property.get("mime_type"))
return bytes
def _render_single_audio_input(
self, streamlit_app: st, key: str, property: Dict
) -> Any:
# streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
from audio_recorder_streamlit import audio_recorder
audio_bytes = audio_recorder()
if audio_bytes:
streamlit_app.audio(audio_bytes, format="audio/wav")
return audio_bytes
# file_extension = None
# if "mime_type" in property:
# file_extension = mimetypes.guess_extension(property["mime_type"])
# uploaded_file = streamlit_app.file_uploader(
# **streamlit_kwargs, accept_multiple_files=False, type=file_extension
# )
# if uploaded_file is None:
# return None
# bytes = uploaded_file.getvalue()
# if property.get("mime_type"):
# if is_compatible_audio(property["mime_type"]):
# # Show audio
# streamlit_app.audio(bytes, format=property.get("mime_type"))
# if is_compatible_image(property["mime_type"]):
# # Show image
# streamlit_app.image(bytes)
# if is_compatible_video(property["mime_type"]):
# # Show video
# streamlit_app.video(bytes, format=property.get("mime_type"))
# return bytes
def _render_single_string_input(
self, streamlit_app: st, key: str, property: Dict
) -> Any:
@@ -846,20 +807,21 @@ class OutputUI:
def getOpyrator(mode: str) -> Opyrator:
if mode == None or mode.startswith('VC'):
from control.mkgui.app_vc import convert
from mkgui.app_vc import convert
return Opyrator(convert)
if mode == None or mode.startswith('预处理'):
from control.mkgui.preprocess import preprocess
from mkgui.preprocess import preprocess
return Opyrator(preprocess)
if mode == None or mode.startswith('模型训练'):
from control.mkgui.train import train
from mkgui.train import train
return Opyrator(train)
if mode == None or mode.startswith('模型训练(VC)'):
from control.mkgui.train_vc import train_vc
from mkgui.train_vc import train_vc
return Opyrator(train_vc)
from control.mkgui.app import synthesize
from mkgui.app import synthesize
return Opyrator(synthesize)
def render_streamlit_ui() -> None:
# init
session_state = st.session_state
@@ -883,7 +845,7 @@ def render_streamlit_ui() -> None:
col2.title(title)
col2.markdown("欢迎使用MockingBird Web 2")
image = Image.open(path.join('control','mkgui', 'static', 'mb.png'))
image = Image.open('.\\mkgui\\static\\mb.png')
col1.image(image)
st.markdown("---")
@@ -891,13 +853,6 @@ def render_streamlit_ui() -> None:
with left:
st.header("Control 控制")
# if session_state.mode in ["AI拟音", "VC拟音"] :
# from audiorecorder import audiorecorder
# audio = audiorecorder("Click to record", "Recording...")
# if len(audio) > 0:
# # To play audio in frontend:
# st.audio(audio.tobytes())
InputUI(session_state=session_state, input_class=opyrator.input_type).render_ui(st)
execute_selected = st.button(opyrator.action)
if execute_selected:

View File

@@ -6,8 +6,8 @@ from typing import Any, Tuple
# Constants
EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
if os.path.isdir(EXT_MODELS_DIRT):
@@ -83,7 +83,7 @@ def preprocess(input: Input) -> Output:
"""Preprocess(预处理)"""
finished = 0
if input.model == Model.VC_PPG2MEL:
from models.ppg2mel.preprocess import preprocess_dataset
from ppg2mel.preprocess import preprocess_dataset
finished = preprocess_dataset(
datasets_root=Path(input.datasets_root),
dataset=input.dataset,

View File

Before

Width:  |  Height:  |  Size: 5.6 KiB

After

Width:  |  Height:  |  Size: 5.6 KiB

View File

@@ -3,17 +3,17 @@ import os
from pathlib import Path
from enum import Enum
from typing import Any
from models.synthesizer.hparams import hparams
from models.synthesizer.train import train as synt_train
from synthesizer.hparams import hparams
from synthesizer.train import train as synt_train
# Constants
SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
# EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
# CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
# ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
# EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
# CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
# ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
# Pre-Load models
if os.path.isdir(SYN_MODELS_DIRT):
@@ -96,7 +96,7 @@ def train(input: Input) -> Output:
synt_train(
input.run_id,
input.input_root,
f"data{os.sep}ckpt{os.sep}synthesizer",
f"synthesizer{os.sep}saved_models",
input.save_every,
input.backup_every,
input.log_every,

View File

@@ -4,14 +4,14 @@ from pathlib import Path
from enum import Enum
from typing import Any, Tuple
import numpy as np
from utils.hparams import HpsYaml
from utils.load_yaml import HpsYaml
from utils.util import AttrDict
import torch
# Constants
EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
if os.path.isdir(EXT_MODELS_DIRT):
@@ -144,7 +144,7 @@ def train_vc(input: Input) -> Output:
if torch.cuda.is_available():
torch.cuda.manual_seed_all(input.seed)
mode = "train"
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
solver = Solver(config, params, mode)
solver.load_data()
solver.set_model()

View File

@@ -1,2 +0,0 @@
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader

View File

@@ -1,193 +0,0 @@
import torch
from torch.nn import functional as F
import numpy as np
DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
def piecewise_rational_quadratic_transform(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails=None,
tail_bound=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
if tails is None:
spline_fn = rational_quadratic_spline
spline_kwargs = {}
else:
spline_fn = unconstrained_rational_quadratic_spline
spline_kwargs = {
'tails': tails,
'tail_bound': tail_bound
}
outputs, logabsdet = spline_fn(
inputs=inputs,
unnormalized_widths=unnormalized_widths,
unnormalized_heights=unnormalized_heights,
unnormalized_derivatives=unnormalized_derivatives,
inverse=inverse,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
**spline_kwargs
)
return outputs, logabsdet
def searchsorted(bin_locations, inputs, eps=1e-6):
bin_locations[..., -1] += eps
return torch.sum(
inputs[..., None] >= bin_locations,
dim=-1
) - 1
def unconstrained_rational_quadratic_spline(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails='linear',
tail_bound=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
outside_interval_mask = ~inside_interval_mask
outputs = torch.zeros_like(inputs)
logabsdet = torch.zeros_like(inputs)
if tails == 'linear':
unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
constant = np.log(np.exp(1 - min_derivative) - 1)
unnormalized_derivatives[..., 0] = constant
unnormalized_derivatives[..., -1] = constant
outputs[outside_interval_mask] = inputs[outside_interval_mask]
logabsdet[outside_interval_mask] = 0
else:
raise RuntimeError('{} tails are not implemented.'.format(tails))
outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
inputs=inputs[inside_interval_mask],
unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
inverse=inverse,
left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative
)
return outputs, logabsdet
def rational_quadratic_spline(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
left=0., right=1., bottom=0., top=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
if torch.min(inputs) < left or torch.max(inputs) > right:
raise ValueError('Input to a transform is not within its domain')
num_bins = unnormalized_widths.shape[-1]
if min_bin_width * num_bins > 1.0:
raise ValueError('Minimal bin width too large for the number of bins')
if min_bin_height * num_bins > 1.0:
raise ValueError('Minimal bin height too large for the number of bins')
widths = F.softmax(unnormalized_widths, dim=-1)
widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
cumwidths = torch.cumsum(widths, dim=-1)
cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
cumwidths = (right - left) * cumwidths + left
cumwidths[..., 0] = left
cumwidths[..., -1] = right
widths = cumwidths[..., 1:] - cumwidths[..., :-1]
derivatives = min_derivative + F.softplus(unnormalized_derivatives)
heights = F.softmax(unnormalized_heights, dim=-1)
heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
cumheights = torch.cumsum(heights, dim=-1)
cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
cumheights = (top - bottom) * cumheights + bottom
cumheights[..., 0] = bottom
cumheights[..., -1] = top
heights = cumheights[..., 1:] - cumheights[..., :-1]
if inverse:
bin_idx = searchsorted(cumheights, inputs)[..., None]
else:
bin_idx = searchsorted(cumwidths, inputs)[..., None]
input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
delta = heights / widths
input_delta = delta.gather(-1, bin_idx)[..., 0]
input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
input_heights = heights.gather(-1, bin_idx)[..., 0]
if inverse:
a = (((inputs - input_cumheights) * (input_derivatives
+ input_derivatives_plus_one
- 2 * input_delta)
+ input_heights * (input_delta - input_derivatives)))
b = (input_heights * input_derivatives
- (inputs - input_cumheights) * (input_derivatives
+ input_derivatives_plus_one
- 2 * input_delta))
c = - input_delta * (inputs - input_cumheights)
discriminant = b.pow(2) - 4 * a * c
assert (discriminant >= 0).all()
root = (2 * c) / (-b - torch.sqrt(discriminant))
outputs = root * input_bin_widths + input_cumwidths
theta_one_minus_theta = root * (1 - root)
denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta)
derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - root).pow(2))
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, -logabsdet
else:
theta = (inputs - input_cumwidths) / input_bin_widths
theta_one_minus_theta = theta * (1 - theta)
numerator = input_heights * (input_delta * theta.pow(2)
+ input_derivatives * theta_one_minus_theta)
denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta)
outputs = input_cumheights + numerator / denominator
derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - theta).pow(2))
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, logabsdet

View File

@@ -1,675 +0,0 @@
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import Conv1d
from torch.nn.utils import weight_norm, remove_weight_norm
from utils.util import init_weights, get_padding, convert_pad_shape, convert_pad_shape, subsequent_mask, fused_add_tanh_sigmoid_multiply
from .common.transforms import piecewise_rational_quadratic_transform
LRELU_SLOPE = 0.1
class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5):
super().__init__()
self.channels = channels
self.eps = eps
self.gamma = nn.Parameter(torch.ones(channels))
self.beta = nn.Parameter(torch.zeros(channels))
def forward(self, x):
x = x.transpose(1, -1)
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
return x.transpose(1, -1)
class ConvReluNorm(nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
super().__init__()
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.p_dropout = p_dropout
assert n_layers > 1, "Number of layers should be larger than 0."
self.conv_layers = nn.ModuleList()
self.norm_layers = nn.ModuleList()
self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
self.norm_layers.append(LayerNorm(hidden_channels))
self.relu_drop = nn.Sequential(
nn.ReLU(),
nn.Dropout(p_dropout))
for _ in range(n_layers-1):
self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
self.norm_layers.append(LayerNorm(hidden_channels))
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(self, x, x_mask):
x_org = x
for i in range(self.n_layers):
x = self.conv_layers[i](x * x_mask)
x = self.norm_layers[i](x)
x = self.relu_drop(x)
x = x_org + self.proj(x)
return x * x_mask
class DDSConv(nn.Module):
"""
Dilated and Depth-Separable Convolution
"""
def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
super().__init__()
self.channels = channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.p_dropout = p_dropout
self.drop = nn.Dropout(p_dropout)
self.convs_sep = nn.ModuleList()
self.convs_1x1 = nn.ModuleList()
self.norms_1 = nn.ModuleList()
self.norms_2 = nn.ModuleList()
for i in range(n_layers):
dilation = kernel_size ** i
padding = (kernel_size * dilation - dilation) // 2
self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
groups=channels, dilation=dilation, padding=padding
))
self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
self.norms_1.append(LayerNorm(channels))
self.norms_2.append(LayerNorm(channels))
def forward(self, x, x_mask, g=None):
if g is not None:
x = x + g
for i in range(self.n_layers):
y = self.convs_sep[i](x * x_mask)
y = self.norms_1[i](y)
y = F.gelu(y)
y = self.convs_1x1[i](y)
y = self.norms_2[i](y)
y = F.gelu(y)
y = self.drop(y)
x = x + y
return x * x_mask
class WN(torch.nn.Module):
def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
super(WN, self).__init__()
assert(kernel_size % 2 == 1)
self.hidden_channels =hidden_channels
self.kernel_size = kernel_size,
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.p_dropout = p_dropout
self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList()
self.drop = nn.Dropout(p_dropout)
if gin_channels != 0:
cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
for i in range(n_layers):
dilation = dilation_rate ** i
padding = int((kernel_size * dilation - dilation) / 2)
in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
dilation=dilation, padding=padding)
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
self.in_layers.append(in_layer)
# last one is not necessary
if i < n_layers - 1:
res_skip_channels = 2 * hidden_channels
else:
res_skip_channels = hidden_channels
res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
self.res_skip_layers.append(res_skip_layer)
def forward(self, x, x_mask, g=None, **kwargs):
output = torch.zeros_like(x)
n_channels_tensor = torch.IntTensor([self.hidden_channels])
if g is not None:
g = self.cond_layer(g)
for i in range(self.n_layers):
x_in = self.in_layers[i](x)
if g is not None:
cond_offset = i * 2 * self.hidden_channels
g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
else:
g_l = torch.zeros_like(x_in)
acts = fused_add_tanh_sigmoid_multiply(
x_in,
g_l,
n_channels_tensor)
acts = self.drop(acts)
res_skip_acts = self.res_skip_layers[i](acts)
if i < self.n_layers - 1:
res_acts = res_skip_acts[:,:self.hidden_channels,:]
x = (x + res_acts) * x_mask
output = output + res_skip_acts[:,self.hidden_channels:,:]
else:
output = output + res_skip_acts
return output * x_mask
def remove_weight_norm(self):
if self.gin_channels != 0:
torch.nn.utils.remove_weight_norm(self.cond_layer)
for l in self.in_layers:
torch.nn.utils.remove_weight_norm(l)
for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l)
class ResBlock1(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
super(ResBlock1, self).__init__()
self.convs1 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
padding=get_padding(kernel_size, dilation[2])))
])
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1)))
])
self.convs2.apply(init_weights)
def forward(self, x, x_mask=None):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, LRELU_SLOPE)
if x_mask is not None:
xt = xt * x_mask
xt = c1(xt)
xt = F.leaky_relu(xt, LRELU_SLOPE)
if x_mask is not None:
xt = xt * x_mask
xt = c2(xt)
x = xt + x
if x_mask is not None:
x = x * x_mask
return x
def remove_weight_norm(self):
for l in self.convs1:
remove_weight_norm(l)
for l in self.convs2:
remove_weight_norm(l)
class ResBlock2(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
super(ResBlock2, self).__init__()
self.convs = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1])))
])
self.convs.apply(init_weights)
def forward(self, x, x_mask=None):
for c in self.convs:
xt = F.leaky_relu(x, LRELU_SLOPE)
if x_mask is not None:
xt = xt * x_mask
xt = c(xt)
x = xt + x
if x_mask is not None:
x = x * x_mask
return x
def remove_weight_norm(self):
for l in self.convs:
remove_weight_norm(l)
class Log(nn.Module):
def forward(self, x, x_mask, reverse=False, **kwargs):
if not reverse:
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
logdet = torch.sum(-y, [1, 2])
return y, logdet
else:
x = torch.exp(x) * x_mask
return x
class Flip(nn.Module):
def forward(self, x, *args, reverse=False, **kwargs):
x = torch.flip(x, [1])
if not reverse:
logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
return x, logdet
else:
return x
class ElementwiseAffine(nn.Module):
def __init__(self, channels):
super().__init__()
self.channels = channels
self.m = nn.Parameter(torch.zeros(channels,1))
self.logs = nn.Parameter(torch.zeros(channels,1))
def forward(self, x, x_mask, reverse=False, **kwargs):
if not reverse:
y = self.m + torch.exp(self.logs) * x
y = y * x_mask
logdet = torch.sum(self.logs * x_mask, [1,2])
return y, logdet
else:
x = (x - self.m) * torch.exp(-self.logs) * x_mask
return x
class ResidualCouplingLayer(nn.Module):
def __init__(self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
p_dropout=0,
gin_channels=0,
mean_only=False):
assert channels % 2 == 0, "channels should be divisible by 2"
super().__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.half_channels = channels // 2
self.mean_only = mean_only
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
self.post.weight.data.zero_()
self.post.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False):
x0, x1 = torch.split(x, [self.half_channels]*2, 1)
h = self.pre(x0) * x_mask
h = self.enc(h, x_mask, g=g)
stats = self.post(h) * x_mask
if not self.mean_only:
m, logs = torch.split(stats, [self.half_channels]*2, 1)
else:
m = stats
logs = torch.zeros_like(m)
if not reverse:
x1 = m + x1 * torch.exp(logs) * x_mask
x = torch.cat([x0, x1], 1)
logdet = torch.sum(logs, [1,2])
return x, logdet
else:
x1 = (x1 - m) * torch.exp(-logs) * x_mask
x = torch.cat([x0, x1], 1)
return x
class ConvFlow(nn.Module):
def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
super().__init__()
self.in_channels = in_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.num_bins = num_bins
self.tail_bound = tail_bound
self.half_channels = in_channels // 2
self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False):
x0, x1 = torch.split(x, [self.half_channels]*2, 1)
h = self.pre(x0)
h = self.convs(h, x_mask, g=g)
h = self.proj(h) * x_mask
b, c, t = x0.shape
h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
unnormalized_derivatives = h[..., 2 * self.num_bins:]
x1, logabsdet = piecewise_rational_quadratic_transform(x1,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=reverse,
tails='linear',
tail_bound=self.tail_bound
)
x = torch.cat([x0, x1], 1) * x_mask
logdet = torch.sum(logabsdet * x_mask, [1,2])
if not reverse:
return x, logdet
else:
return x
class Encoder(nn.Module):
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
super().__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.window_size = window_size
self.drop = nn.Dropout(p_dropout)
self.attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask):
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.attn_layers[i](x, x, attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
class Decoder(nn.Module):
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
super().__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.drop = nn.Dropout(p_dropout)
self.self_attn_layers = nn.ModuleList()
self.norm_layers_0 = nn.ModuleList()
self.encdec_attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
self.norm_layers_0.append(LayerNorm(hidden_channels))
self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask, h, h_mask):
"""
x: decoder input
h: encoder output
"""
self_attn_mask = subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.self_attn_layers[i](x, x, self_attn_mask)
y = self.drop(y)
x = self.norm_layers_0[i](x + y)
y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
class MultiHeadAttention(nn.Module):
def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
super().__init__()
assert channels % n_heads == 0
self.channels = channels
self.out_channels = out_channels
self.n_heads = n_heads
self.p_dropout = p_dropout
self.window_size = window_size
self.heads_share = heads_share
self.block_length = block_length
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.attn = None
self.k_channels = channels // n_heads
self.conv_q = nn.Conv1d(channels, channels, 1)
self.conv_k = nn.Conv1d(channels, channels, 1)
self.conv_v = nn.Conv1d(channels, channels, 1)
self.conv_o = nn.Conv1d(channels, out_channels, 1)
self.drop = nn.Dropout(p_dropout)
if window_size is not None:
n_heads_rel = 1 if heads_share else n_heads
rel_stddev = self.k_channels**-0.5
self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
nn.init.xavier_uniform_(self.conv_q.weight)
nn.init.xavier_uniform_(self.conv_k.weight)
nn.init.xavier_uniform_(self.conv_v.weight)
if proximal_init:
with torch.no_grad():
self.conv_k.weight.copy_(self.conv_q.weight)
self.conv_k.bias.copy_(self.conv_q.bias)
def forward(self, x, c, attn_mask=None):
q = self.conv_q(x)
k = self.conv_k(c)
v = self.conv_v(c)
x, self.attn = self.attention(q, k, v, mask=attn_mask)
x = self.conv_o(x)
return x
def attention(self, query, key, value, mask=None):
# reshape [b, d, t] -> [b, n_h, t, d_k]
b, d, t_s, t_t = (*key.size(), query.size(2))
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
if self.window_size is not None:
assert t_s == t_t, "Relative attention is only available for self-attention."
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
scores_local = self._relative_position_to_absolute_position(rel_logits)
scores = scores + scores_local
if self.proximal_bias:
assert t_s == t_t, "Proximal bias is only available for self-attention."
scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e4)
if self.block_length is not None:
assert t_s == t_t, "Local attention is only available for self-attention."
block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
scores = scores.masked_fill(block_mask == 0, -1e4)
p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
p_attn = self.drop(p_attn)
output = torch.matmul(p_attn, value)
if self.window_size is not None:
relative_weights = self._absolute_position_to_relative_position(p_attn)
value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
return output, p_attn
def _matmul_with_relative_values(self, x, y):
"""
x: [b, h, l, m]
y: [h or 1, m, d]
ret: [b, h, l, d]
"""
ret = torch.matmul(x, y.unsqueeze(0))
return ret
def _matmul_with_relative_keys(self, x, y):
"""
x: [b, h, l, d]
y: [h or 1, m, d]
ret: [b, h, l, m]
"""
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
return ret
def _get_relative_embeddings(self, relative_embeddings, length):
max_relative_position = 2 * self.window_size + 1
# Pad first before slice to avoid using cond ops.
pad_length = max(length - (self.window_size + 1), 0)
slice_start_position = max((self.window_size + 1) - length, 0)
slice_end_position = slice_start_position + 2 * length - 1
if pad_length > 0:
padded_relative_embeddings = F.pad(
relative_embeddings,
convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
else:
padded_relative_embeddings = relative_embeddings
used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
return used_relative_embeddings
def _relative_position_to_absolute_position(self, x):
"""
x: [b, h, l, 2*l-1]
ret: [b, h, l, l]
"""
batch, heads, length, _ = x.size()
# Concat columns of pad to shift from relative to absolute indexing.
x = F.pad(x, convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
# Concat extra elements so to add up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length])
x_flat = F.pad(x_flat, convert_pad_shape([[0,0],[0,0],[0,length-1]]))
# Reshape and slice out the padded elements.
x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
return x_final
def _absolute_position_to_relative_position(self, x):
"""
x: [b, h, l, l]
ret: [b, h, l, 2*l-1]
"""
batch, heads, length, _ = x.size()
# padd along column
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
x_flat = x.view([batch, heads, length**2 + length*(length -1)])
# add 0's in the beginning that will skew the elements after reshape
x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
return x_final
def _attention_bias_proximal(self, length):
"""Bias for self-attention to encourage attention to close positions.
Args:
length: an integer scalar.
Returns:
a Tensor with shape [1, 1, length, length]
"""
r = torch.arange(length, dtype=torch.float32)
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
class FFN(nn.Module):
def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.activation = activation
self.causal = causal
if causal:
self.padding = self._causal_padding
else:
self.padding = self._same_padding
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
self.drop = nn.Dropout(p_dropout)
def forward(self, x, x_mask):
x = self.conv_1(self.padding(x * x_mask))
if self.activation == "gelu":
x = x * torch.sigmoid(1.702 * x)
else:
x = torch.relu(x)
x = self.drop(x)
x = self.conv_2(self.padding(x * x_mask))
return x * x_mask
def _causal_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = self.kernel_size - 1
pad_r = 0
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, convert_pad_shape(padding))
return x
def _same_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = (self.kernel_size - 1) // 2
pad_r = self.kernel_size // 2
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, convert_pad_shape(padding))
return x

View File

@@ -1,528 +0,0 @@
import math
import torch
from torch import nn
from torch.nn import functional as F
from loguru import logger
from .sublayer.vits_modules import *
import monotonic_align
from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from utils.util import init_weights, get_padding, sequence_mask, rand_slice_segments, generate_path
class StochasticDurationPredictor(nn.Module):
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
super().__init__()
filter_channels = in_channels # it needs to be removed from future version.
self.in_channels = in_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.n_flows = n_flows
self.gin_channels = gin_channels
self.log_flow = Log()
self.flows = nn.ModuleList()
self.flows.append(ElementwiseAffine(2))
for i in range(n_flows):
self.flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
self.flows.append(Flip())
self.post_pre = nn.Conv1d(1, filter_channels, 1)
self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
self.post_convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
self.post_flows = nn.ModuleList()
self.post_flows.append(ElementwiseAffine(2))
for i in range(4):
self.post_flows.append(ConvFlow(2, filter_channels, kernel_size, n_layers=3))
self.post_flows.append(Flip())
self.pre = nn.Conv1d(in_channels, filter_channels, 1)
self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
self.convs = DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
x = torch.detach(x)
x = self.pre(x)
if g is not None:
g = torch.detach(g)
x = x + self.cond(g)
x = self.convs(x, x_mask)
x = self.proj(x) * x_mask
if not reverse:
flows = self.flows
assert w is not None
logdet_tot_q = 0
h_w = self.post_pre(w)
h_w = self.post_convs(h_w, x_mask)
h_w = self.post_proj(h_w) * x_mask
e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
z_q = e_q
for flow in self.post_flows:
z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
logdet_tot_q += logdet_q
z_u, z1 = torch.split(z_q, [1, 1], 1)
u = torch.sigmoid(z_u) * x_mask
z0 = (w - u) * x_mask
logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
logdet_tot = 0
z0, logdet = self.log_flow(z0, x_mask)
logdet_tot += logdet
z = torch.cat([z0, z1], 1)
for flow in flows:
z, logdet = flow(z, x_mask, g=x, reverse=reverse)
logdet_tot = logdet_tot + logdet
nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
return nll + logq # [b]
else:
flows = list(reversed(self.flows))
flows = flows[:-2] + [flows[-1]] # remove a useless vflow
z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
for flow in flows:
z = flow(z, x_mask, g=x, reverse=reverse)
z0, z1 = torch.split(z, [1, 1], 1)
logw = z0
return logw
class DurationPredictor(nn.Module):
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
super().__init__()
self.in_channels = in_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.gin_channels = gin_channels
self.drop = nn.Dropout(p_dropout)
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size//2)
self.norm_1 = LayerNorm(filter_channels)
self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size//2)
self.norm_2 = LayerNorm(filter_channels)
self.proj = nn.Conv1d(filter_channels, 1, 1)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, in_channels, 1)
def forward(self, x, x_mask, g=None):
x = torch.detach(x)
if g is not None:
g = torch.detach(g)
x = x + self.cond(g)
x = self.conv_1(x * x_mask)
x = torch.relu(x)
x = self.norm_1(x)
x = self.drop(x)
x = self.conv_2(x * x_mask)
x = torch.relu(x)
x = self.norm_2(x)
x = self.drop(x)
x = self.proj(x * x_mask)
return x * x_mask
class TextEncoder(nn.Module):
def __init__(self,
n_vocab,
out_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout):
super().__init__()
self.n_vocab = n_vocab
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb = nn.Embedding(n_vocab, hidden_channels)
self.emo_proj = nn.Linear(1024, hidden_channels)
nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
self.encoder = Encoder(
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout)
self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths, emo):
x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
x = x + self.emo_proj(emo.unsqueeze(1))
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.encoder(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return x, m, logs, x_mask
class ResidualCouplingBlock(nn.Module):
def __init__(self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
n_flows=4,
gin_channels=0):
super().__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.n_flows = n_flows
self.gin_channels = gin_channels
self.flows = nn.ModuleList()
for i in range(n_flows):
self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
self.flows.append(Flip())
def forward(self, x, x_mask, g=None, reverse=False):
if not reverse:
for flow in self.flows:
x, _ = flow(x, x_mask, g=g, reverse=reverse)
else:
for flow in reversed(self.flows):
x = flow(x, x_mask, g=g, reverse=reverse)
return x
class PosteriorEncoder(nn.Module):
def __init__(self,
in_channels,
out_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths, g=None):
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
return z, m, logs, x_mask
class Generator(torch.nn.Module):
def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
resblock = ResBlock1 if resblock == '1' else ResBlock2
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
self.ups.append(weight_norm(
ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
k, u, padding=(k-u)//2)))
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel//(2**(i+1))
for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
self.ups.apply(init_weights)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
def forward(self, x, g=None):
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
for i in range(self.num_upsamples):
x = F.leaky_relu(x, LRELU_SLOPE)
x = self.ups[i](x)
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i*self.num_kernels+j](x)
else:
xs += self.resblocks[i*self.num_kernels+j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
print('Removing weight norm...')
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
self.use_spectral_norm = use_spectral_norm
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
])
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
def forward(self, x):
fmap = []
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
])
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
def forward(self, x):
fmap = []
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(MultiPeriodDiscriminator, self).__init__()
periods = [2,3,5,7,11]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
self.discriminators = nn.ModuleList(discs)
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
y_d_gs.append(y_d_g)
fmap_rs.append(fmap_r)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class Vits(nn.Module):
"""
Synthesizer of Vits
"""
def __init__(self,
n_vocab,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_speakers=0,
gin_channels=0,
use_sdp=True,
**kwargs):
super().__init__()
self.n_vocab = n_vocab
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.n_speakers = n_speakers
self.gin_channels = gin_channels
self.use_sdp = use_sdp
self.enc_p = TextEncoder(n_vocab,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout)
self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
if use_sdp:
self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
else:
self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
if n_speakers > 1:
self.emb_g = nn.Embedding(n_speakers, gin_channels)
def forward(self, x, x_lengths, y, y_lengths, sid=None, emo=None):
# logger.info(f'====> Forward: 1.1.0')
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emo)
if self.n_speakers > 0:
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
else:
g = None
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
z_p = self.flow(z, y_mask, g=g)
# logger.info(f'====> Forward: 1.1.1')
with torch.no_grad():
# negative cross-entropy
s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s]
neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
#logger.info(f'====> Forward: 1.1.1.1')
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
# logger.info(f'====> Forward: 1.1.2')
w = attn.sum(2)
if self.use_sdp:
l_length = self.dp(x, x_mask, w, g=g)
l_length = l_length / torch.sum(x_mask)
else:
logw_ = torch.log(w + 1e-6) * x_mask
logw = self.dp(x, x_mask, g=g)
l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
# expand prior
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size)
o = self.dec(z_slice, g=g)
return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, x, x_lengths, sid=None, emo=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
# logger.info(f'====> Infer: 1.1.0')
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths,emo)
# logger.info(f'====> Infer: 1.1.1')
if self.n_speakers > 0:
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
else:
g = None
if self.use_sdp:
logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
else:
logw = self.dp(x, x_mask, g=g)
w = torch.exp(logw) * x_mask * length_scale
w_ceil = torch.ceil(w)
y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
attn = generate_path(w_ceil, attn_mask)
# logger.info(f'====> Infer: 1.1.2')
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
z = self.flow(z_p, y_mask, g=g, reverse=True)
o = self.dec((z * y_mask)[:,:,:max_len], g=g)
# logger.info(f'====> Infer: 1.1.3')
return o, attn, y_mask, (z, z_p, m_p, logs_p)

View File

@@ -1,50 +0,0 @@
import torch
import torch.nn as nn
from transformers.models.wav2vec2.modeling_wav2vec2 import (
Wav2Vec2Model,
Wav2Vec2PreTrainedModel,
)
class RegressionHead(nn.Module):
r"""Classification head."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.final_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class EmotionExtractorModel(Wav2Vec2PreTrainedModel):
r"""Speech emotion classifier."""
def __init__(self, config):
super().__init__(config)
self.config = config
self.wav2vec2 = Wav2Vec2Model(config)
self.classifier = RegressionHead(config)
self.init_weights()
def forward(
self,
input_values,
):
outputs = self.wav2vec2(input_values)
hidden_states = outputs[0]
hidden_states = torch.mean(hidden_states, dim=1)
logits = self.classifier(hidden_states)
return hidden_states, logits

View File

@@ -1,161 +0,0 @@
import librosa
import numpy as np
from models.encoder import inference as encoder
from utils import logmmse
from models.synthesizer import audio
from pathlib import Path
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin
import torch
from transformers import Wav2Vec2Processor
from .models.wav2emo import EmotionExtractorModel
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
pass
pinyin = Pinyin(PinyinConverter()).pinyin
# load model from hub
device = 'cuda' if torch.cuda.is_available() else "cpu"
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionExtractorModel.from_pretrained(model_name).to(device)
def extract_emo(
x: np.ndarray,
sampling_rate: int,
embeddings: bool = False,
) -> np.ndarray:
r"""Predict emotions or extract embeddings from raw audio signal."""
y = processor(x, sampling_rate=sampling_rate)
y = y['input_values'][0]
y = torch.from_numpy(y).to(device)
# run through model
with torch.no_grad():
y = model(y)[0 if embeddings else 1]
# convert to numpy
y = y.detach().cpu().numpy()
return y
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams, encoder_model_fpath):
## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer.
# - Both the audios and the mel spectrograms are saved as numpy arrays
# - There is no processing done to the audios that will be saved to disk beyond volume
# normalization (in split_on_silences)
# - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
# is why we re-apply it on the audio on the side of the vocoder.
# - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
# without extra padding. This means that you won't have an exact relation between the length
# of the wav and of the mel spectrogram. See the vocoder data loader.
# Skip existing utterances if needed
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
return None
# Trim silence
if hparams.trim_silence:
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
# Skip utterances that are too short
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
return None
# Compute the mel spectrogram
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
mel_frames = mel_spectrogram.shape[1]
# Skip utterances that are too long
if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
return None
# Write the spectrogram, embed and audio to disk
np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
np.save(wav_fpath, wav, allow_pickle=False)
# Return a tuple describing this training example
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, wav, mel_frames, text
def _split_on_silences(wav_fpath, words, hparams):
# Load the audio waveform
wav, _ = librosa.load(wav_fpath, sr= hparams.sample_rate)
wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=1024)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# denoise, we may not need it here.
if len(wav) > hparams.sample_rate*(0.3+0.1):
noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
wav[-int(hparams.sample_rate*0.15):]])
profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile, eta=0)
resp = pinyin(words, style=Style.TONE3)
res = filter(lambda v : not v.isspace(),map(lambda v: v[0],resp))
res = " ".join(res)
return wav, res
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = []
extensions = ("*.wav", "*.flac", "*.mp3")
if skip_existing:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
if mel_fpath.exists() and wav_fpath_.exists():
continue
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath) # accelarate
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
else:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
return metadata

View File

@@ -1,393 +0,0 @@
import os
from loguru import logger
import torch
import glob
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast, GradScaler
from utils.audio_utils import mel_spectrogram, spec_to_mel
from utils.loss import feature_loss, generator_loss, discriminator_loss, kl_loss
from utils.util import slice_segments, clip_grad_value_
from models.synthesizer.vits_dataset import (
VitsDataset,
VitsDatasetCollate,
DistributedBucketSampler
)
from models.synthesizer.models.vits import (
Vits,
MultiPeriodDiscriminator,
)
from models.synthesizer.utils.symbols import symbols
from models.synthesizer.utils.plot import plot_spectrogram_to_numpy, plot_alignment_to_numpy
from pathlib import Path
from utils.hparams import HParams
import torch.multiprocessing as mp
import argparse
# torch.backends.cudnn.benchmark = True
global_step = 0
def new_train():
"""Assume Single Node Multi GPUs Training Only"""
assert torch.cuda.is_available(), "CPU training is not allowed."
parser = argparse.ArgumentParser()
parser.add_argument("--syn_dir", type=str, default="../audiodata/SV2TTS/synthesizer", help= \
"Path to the synthesizer directory that contains the ground truth mel spectrograms, "
"the wavs, the emos and the embeds.")
parser.add_argument("-m", "--model_dir", type=str, default="data/ckpt/synthesizer/vits2", help=\
"Path to the output directory that will contain the saved model weights and the logs.")
parser.add_argument('--ckptG', type=str, required=False,
help='original VITS G checkpoint path')
parser.add_argument('--ckptD', type=str, required=False,
help='original VITS D checkpoint path')
args, _ = parser.parse_known_args()
datasets_root = Path(args.syn_dir)
hparams= HParams(
model_dir = args.model_dir,
)
hparams.loadJson(Path(hparams.model_dir).joinpath("config.json"))
hparams.data["training_files"] = str(datasets_root.joinpath("train.txt"))
hparams.data["validation_files"] = str(datasets_root.joinpath("train.txt"))
hparams.data["datasets_root"] = str(datasets_root)
hparams.ckptG = args.ckptG
hparams.ckptD = args.ckptD
n_gpus = torch.cuda.device_count()
# for spawn
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '8899'
# mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hparams))
run(0, 1, hparams)
def load_checkpoint(checkpoint_path, model, optimizer=None, is_old=False, epochs=10000):
assert os.path.isfile(checkpoint_path)
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
iteration = checkpoint_dict['iteration']
learning_rate = checkpoint_dict['learning_rate']
if optimizer is not None:
if not is_old:
optimizer.load_state_dict(checkpoint_dict['optimizer'])
else:
new_opt_dict = optimizer.state_dict()
new_opt_dict_params = new_opt_dict['param_groups'][0]['params']
new_opt_dict['param_groups'] = checkpoint_dict['optimizer']['param_groups']
new_opt_dict['param_groups'][0]['params'] = new_opt_dict_params
optimizer.load_state_dict(new_opt_dict)
saved_state_dict = checkpoint_dict['model']
if hasattr(model, 'module'):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict= {}
for k, v in state_dict.items():
try:
new_state_dict[k] = saved_state_dict[k]
except:
if k == 'step':
new_state_dict[k] = iteration * epochs
else:
logger.info("%s is not in the checkpoint" % k)
new_state_dict[k] = v
if hasattr(model, 'module'):
model.module.load_state_dict(new_state_dict, strict=False)
else:
model.load_state_dict(new_state_dict, strict=False)
logger.info("Loaded checkpoint '{}' (iteration {})" .format(
checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
logger.info("Saving model and optimizer state at iteration {} to {}".format(
iteration, checkpoint_path))
if hasattr(model, 'module'):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
torch.save({'model': state_dict,
'iteration': iteration,
'optimizer': optimizer.state_dict(),
'learning_rate': learning_rate}, checkpoint_path)
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
f_list = glob.glob(os.path.join(dir_path, regex))
f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
x = f_list[-1]
print(x)
return x
def run(rank, n_gpus, hps):
global global_step
if rank == 0:
logger.info(hps)
writer = SummaryWriter(log_dir=hps.model_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))
dist.init_process_group(backend='gloo', init_method='env://', world_size=n_gpus, rank=rank)
torch.manual_seed(hps.train.seed)
torch.cuda.set_device(rank)
train_dataset = VitsDataset(hps.data.training_files, hps.data)
train_sampler = DistributedBucketSampler(
train_dataset,
hps.train.batch_size,
[32, 300, 400, 500, 600, 700, 800, 900, 1000],
num_replicas=n_gpus,
rank=rank,
shuffle=True)
collate_fn = VitsDatasetCollate()
train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False, pin_memory=True,
collate_fn=collate_fn, batch_sampler=train_sampler)
if rank == 0:
eval_dataset = VitsDataset(hps.data.validation_files, hps.data)
eval_loader = DataLoader(eval_dataset, num_workers=8, shuffle=False,
batch_size=hps.train.batch_size, pin_memory=True,
drop_last=False, collate_fn=collate_fn)
net_g = Vits(
len(symbols),
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model).cuda(rank)
net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)
optim_g = torch.optim.AdamW(
net_g.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps)
optim_d = torch.optim.AdamW(
net_d.parameters(),
hps.train.learning_rate,
betas=hps.train.betas,
eps=hps.train.eps)
net_g = DDP(net_g, device_ids=[rank])
net_d = DDP(net_d, device_ids=[rank])
ckptG = hps.ckptG
ckptD = hps.ckptD
try:
if ckptG is not None:
_, _, _, epoch_str = load_checkpoint(ckptG, net_g, optim_g, is_old=True)
print("加载原版VITS模型G记录点成功")
else:
_, _, _, epoch_str = load_checkpoint(latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g,
optim_g, epochs=hps.train.epochs)
if ckptD is not None:
_, _, _, epoch_str = load_checkpoint(ckptG, net_g, optim_g, is_old=True)
print("加载原版VITS模型D记录点成功")
else:
_, _, _, epoch_str = load_checkpoint(latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d,
optim_d, epochs=hps.train.epochs)
global_step = (epoch_str - 1) * len(train_loader)
except:
epoch_str = 1
global_step = 0
if ckptG is not None or ckptD is not None:
epoch_str = 1
global_step = 0
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
scaler = GradScaler(enabled=hps.train.fp16_run)
for epoch in range(epoch_str, hps.train.epochs + 1):
if rank == 0:
train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
[train_loader, eval_loader], logger, [writer, writer_eval])
else:
train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d], scaler,
[train_loader, None], None, None)
scheduler_g.step()
scheduler_d.step()
def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers):
net_g, net_d = nets
optim_g, optim_d = optims
scheduler_g, scheduler_d = schedulers
train_loader, eval_loader = loaders
if writers is not None:
writer, writer_eval = writers
train_loader.batch_sampler.set_epoch(epoch)
global global_step
net_g.train()
net_d.train()
for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers, emo) in enumerate(train_loader):
# logger.info(f'====> Step: 1 {batch_idx}')
x, x_lengths = x.cuda(rank), x_lengths.cuda(rank)
spec, spec_lengths = spec.cuda(rank), spec_lengths.cuda(rank)
y, y_lengths = y.cuda(rank), y_lengths.cuda(rank)
speakers = speakers.cuda(rank)
emo = emo.cuda(rank)
# logger.info(f'====> Step: 1.0 {batch_idx}')
with autocast(enabled=hps.train.fp16_run):
y_hat, l_length, attn, ids_slice, x_mask, z_mask, \
(z, z_p, m_p, logs_p, m_q, logs_q) = net_g(x, x_lengths, spec, spec_lengths, speakers, emo)
# logger.info(f'====> Step: 1.1 {batch_idx}')
mel = spec_to_mel(
spec,
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.mel_fmin,
hps.data.mel_fmax)
y_mel = slice_segments(mel, ids_slice, hps.train.segment_size // hps.data.hop_length)
y_hat_mel = mel_spectrogram(
y_hat.squeeze(1),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax
)
y = slice_segments(y, ids_slice * hps.data.hop_length, hps.train.segment_size) # slice
# logger.info(f'====> Step: 1.3 {batch_idx}')
# Discriminator
y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
with autocast(enabled=False):
loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
loss_disc_all = loss_disc
optim_d.zero_grad()
scaler.scale(loss_disc_all).backward()
scaler.unscale_(optim_d)
grad_norm_d = clip_grad_value_(net_d.parameters(), None)
scaler.step(optim_d)
with autocast(enabled=hps.train.fp16_run):
# Generator
y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)
with autocast(enabled=False):
loss_dur = torch.sum(l_length.float())
loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel
loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl
loss_fm = feature_loss(fmap_r, fmap_g)
loss_gen, losses_gen = generator_loss(y_d_hat_g)
loss_gen_all = loss_gen + loss_fm + loss_mel + loss_dur + loss_kl
optim_g.zero_grad()
scaler.scale(loss_gen_all.float()).backward()
scaler.unscale_(optim_g)
grad_norm_g = clip_grad_value_(net_g.parameters(), None)
scaler.step(optim_g)
scaler.update()
if rank == 0:
if global_step % hps.train.log_interval == 0:
lr = optim_g.param_groups[0]['lr']
losses = [loss_disc, loss_gen, loss_fm, loss_mel, loss_dur, loss_kl]
logger.info('Train Epoch: {} [{:.0f}%]'.format(
epoch,
100. * batch_idx / len(train_loader)))
logger.info([x.item() for x in losses] + [global_step, lr])
scalar_dict = {"loss/g/total": loss_gen_all, "loss/d/total": loss_disc_all, "learning_rate": lr,
"grad_norm_d": grad_norm_d, "grad_norm_g": grad_norm_g}
scalar_dict.update(
{"loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/dur": loss_dur, "loss/g/kl": loss_kl})
scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
image_dict = {
"slice/mel_org": plot_spectrogram_to_numpy(y_mel[0].data.cpu().numpy()),
"slice/mel_gen": plot_spectrogram_to_numpy(y_hat_mel[0].data.cpu().numpy()),
"all/mel": plot_spectrogram_to_numpy(mel[0].data.cpu().numpy()),
"all/attn": plot_alignment_to_numpy(attn[0, 0].data.cpu().numpy())
}
summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict)
if global_step % hps.train.eval_interval == 0:
evaluate(hps, net_g, eval_loader, writer_eval)
save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch,
os.path.join(hps.model_dir, "G_{}.pth".format(global_step)))
save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch,
os.path.join(hps.model_dir, "D_{}.pth".format(global_step)))
global_step += 1
if rank == 0:
logger.info('====> Epoch: {}'.format(epoch))
def evaluate(hps, generator, eval_loader, writer_eval):
generator.eval()
with torch.no_grad():
for batch_idx, (x, x_lengths, spec, spec_lengths, y, y_lengths, speakers, emo) in enumerate(eval_loader):
x, x_lengths = x.cuda(0), x_lengths.cuda(0)
spec, spec_lengths = spec.cuda(0), spec_lengths.cuda(0)
y, y_lengths = y.cuda(0), y_lengths.cuda(0)
speakers = speakers.cuda(0)
emo = emo.cuda(0)
# remove else
x = x[:1]
x_lengths = x_lengths[:1]
spec = spec[:1]
spec_lengths = spec_lengths[:1]
y = y[:1]
y_lengths = y_lengths[:1]
speakers = speakers[:1]
emo = emo[:1]
break
y_hat, attn, mask, *_ = generator.module.infer(x, x_lengths, speakers, emo, max_len=1000)
# y_hat, attn, mask, *_ = generator.infer(x, x_lengths, speakers, emo, max_len=1000) # for non DistributedDataParallel object
y_hat_lengths = mask.sum([1, 2]).long() * hps.data.hop_length
mel = spec_to_mel(
spec,
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.mel_fmin,
hps.data.mel_fmax)
y_hat_mel = mel_spectrogram(
y_hat.squeeze(1).float(),
hps.data.filter_length,
hps.data.n_mel_channels,
hps.data.sampling_rate,
hps.data.hop_length,
hps.data.win_length,
hps.data.mel_fmin,
hps.data.mel_fmax
)
image_dict = {
"gen/mel": plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy())
}
audio_dict = {
"gen/audio": y_hat[0, :, :y_hat_lengths[0]]
}
if global_step == 0:
image_dict.update({"gt/mel": plot_spectrogram_to_numpy(mel[0].cpu().numpy())})
audio_dict.update({"gt/audio": y[0, :, :y_lengths[0]]})
summarize(
writer=writer_eval,
global_step=global_step,
images=image_dict,
audios=audio_dict,
audio_sampling_rate=hps.data.sampling_rate
)
generator.train()
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
for k, v in scalars.items():
writer.add_scalar(k, v, global_step)
for k, v in histograms.items():
writer.add_histogram(k, v, global_step)
for k, v in images.items():
writer.add_image(k, v, global_step, dataformats='HWC')
for k, v in audios.items():
writer.add_audio(k, v, global_step, audio_sampling_rate)

View File

@@ -1,262 +0,0 @@
import os
import random
import numpy as np
import torch.nn.functional as F
import torch
import torch.utils.data
from utils.audio_utils import load_wav_to_torch, spectrogram
from utils.util import intersperse
from models.synthesizer.utils.text import text_to_sequence
"""Multi speaker version"""
class VitsDataset(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audio_file_path, hparams):
with open(audio_file_path, encoding='utf-8') as f:
self.audio_metadata = [line.strip().split('|') for line in f]
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.cleaned_text = getattr(hparams, "cleaned_text", False)
self.add_blank = hparams.add_blank
self.datasets_root = hparams.datasets_root
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 190)
random.seed(1234)
random.shuffle(self.audio_metadata)
self._filter()
def _filter(self):
"""
Filter text & store spec lengths
"""
# Store spectrogram lengths for Bucketing
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
# spec_length = wav_length // hop_length
audio_metadata_new = []
lengths = []
# for audiopath, sid, text in self.audio_metadata:
for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid in self.audio_metadata:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid])
lengths.append(os.path.getsize(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}') // (2 * self.hop_length))
self.audio_metadata = audio_metadata_new
self.lengths = lengths
def get_audio_text_speaker_pair(self, audio_metadata):
# separate filename, speaker_id and text
wav_fpath, text, sid = audio_metadata[0], audio_metadata[5], audio_metadata[6]
text = self.get_text(text)
spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
sid = self.get_sid(sid)
emo = torch.FloatTensor(np.load(f'{self.datasets_root}{os.sep}emo{os.sep}{wav_fpath.replace("audio", "emo")}'))
return (text, spec, wav, sid, emo)
def get_audio(self, filename):
# Load preprocessed wav npy instead of reading from wav file
audio = torch.FloatTensor(np.load(filename))
audio_norm = audio.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec")
if os.path.exists(spec_filename):
spec = torch.load(spec_filename)
else:
spec = spectrogram(audio_norm, self.filter_length,self.hop_length, self.win_length,
center=False)
torch.save(spec, spec_filename)
spec = torch.squeeze(spec, 0)
return spec, audio_norm
def get_text(self, text):
if self.cleaned_text:
text_norm = text_to_sequence(text, self.text_cleaners)
if self.add_blank:
text_norm = intersperse(text_norm, 0) # 在所有文本数值序列中的元素前后都补充一个0 - 不适用于中文
text_norm = torch.LongTensor(text_norm)
return text_norm
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
def __getitem__(self, index):
return self.get_audio_text_speaker_pair(self.audio_metadata[index])
def __len__(self):
return len(self.audio_metadata)
class VitsDatasetCollate():
""" Zero-pads model inputs and targets
"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
def __call__(self, batch):
"""Collate's training batch from normalized text, audio and speaker identities
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
max_text_len = max([len(x[0]) for x in batch])
max_spec_len = max([x[1].size(1) for x in batch])
max_wav_len = max([x[2].size(1) for x in batch])
text_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
wav_lengths = torch.LongTensor(len(batch))
sid = torch.LongTensor(len(batch))
text_padded = torch.LongTensor(len(batch), max_text_len)
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
emo = torch.FloatTensor(len(batch), 1024)
text_padded.zero_()
spec_padded.zero_()
wav_padded.zero_()
emo.zero_()
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
text = row[0]
text_padded[i, :text.size(0)] = text
text_lengths[i] = text.size(0)
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
wav_padded[i, :, :wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
sid[i] = row[3]
emo[i, :] = row[4]
if self.return_ids:
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing, emo
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, emo
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""
Maintain similar input lengths in a batch.
Length groups are specified by boundaries.
Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
It removes samples which are not included in the boundaries.
Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
"""
def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
self.lengths = dataset.lengths
self.batch_size = batch_size
self.boundaries = boundaries
self.buckets, self.num_samples_per_bucket = self._create_buckets()
self.total_size = sum(self.num_samples_per_bucket)
self.num_samples = self.total_size // self.num_replicas
def _create_buckets(self):
buckets = [[] for _ in range(len(self.boundaries) - 1)]
for i in range(len(self.lengths)):
length = self.lengths[i]
idx_bucket = self._bisect(length)
if idx_bucket != -1:
buckets[idx_bucket].append(i)
for i in range(len(buckets) - 1, 0, -1):
if len(buckets[i]) == 0:
buckets.pop(i)
self.boundaries.pop(i+1)
num_samples_per_bucket = []
for i in range(len(buckets)):
len_bucket = len(buckets[i])
total_batch_size = self.num_replicas * self.batch_size
rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
num_samples_per_bucket.append(len_bucket + rem)
return buckets, num_samples_per_bucket
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = []
if self.shuffle:
for bucket in self.buckets:
indices.append(torch.randperm(len(bucket), generator=g).tolist())
else:
for bucket in self.buckets:
indices.append(list(range(len(bucket))))
batches = []
for i in range(len(self.buckets)):
bucket = self.buckets[i]
len_bucket = len(bucket)
ids_bucket = indices[i]
num_samples_bucket = self.num_samples_per_bucket[i]
# add extra samples to make it evenly divisible
rem = num_samples_bucket - len_bucket
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
# subsample
ids_bucket = ids_bucket[self.rank::self.num_replicas]
# batching
for j in range(len(ids_bucket) // self.batch_size):
batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]]
batches.append(batch)
if self.shuffle:
batch_ids = torch.randperm(len(batches), generator=g).tolist()
batches = [batches[i] for i in batch_ids]
self.batches = batches
assert len(self.batches) * self.batch_size == self.num_samples
return iter(self.batches)
def _bisect(self, x, lo=0, hi=None):
if hi is None:
hi = len(self.boundaries) - 1
if hi > lo:
mid = (hi + lo) // 2
if self.boundaries[mid] < x and x <= self.boundaries[mid+1]:
return mid
elif x <= self.boundaries[mid]:
return self._bisect(x, lo, mid)
else:
return self._bisect(x, mid + 1, hi)
else:
return -1
def __len__(self):
return self.num_samples // self.batch_size

View File

@@ -1 +0,0 @@
#

View File

@@ -1 +0,0 @@
#

View File

@@ -1,19 +0,0 @@
import numpy as np
import torch
from .monotonic_align.core import maximum_path_c
def maximum_path(neg_cent, mask):
""" Cython optimized version.
neg_cent: [b, t_t, t_s]
mask: [b, t_t, t_s]
"""
device = neg_cent.device
dtype = neg_cent.dtype
neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
path = np.zeros(neg_cent.shape, dtype=np.int32)
t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
maximum_path_c(path, neg_cent, t_t_max, t_s_max)
return torch.from_numpy(path).to(device=device, dtype=dtype)

File diff suppressed because it is too large Load Diff

View File

@@ -1,42 +0,0 @@
cimport cython
from cython.parallel import prange
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
cdef int x
cdef int y
cdef float v_prev
cdef float v_cur
cdef float tmp
cdef int index = t_x - 1
for y in range(t_y):
for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
if x == y:
v_cur = max_neg_val
else:
v_cur = value[y-1, x]
if x == 0:
if y == 0:
v_prev = 0.
else:
v_prev = max_neg_val
else:
v_prev = value[y-1, x-1]
value[y, x] += max(v_prev, v_cur)
for y in range(t_y - 1, -1, -1):
path[y, index] = 1
if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
index = index - 1
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
cdef int b = paths.shape[0]
cdef int i
for i in prange(b, nogil=True):
maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])

View File

@@ -1,9 +0,0 @@
from distutils.core import setup
from Cython.Build import cythonize
import numpy
setup(
name = 'monotonic_align',
ext_modules = cythonize("core.pyx"),
include_dirs=[numpy.get_include()]
)

View File

@@ -15,7 +15,7 @@ from .rnn_decoder_mol import Decoder
from .utils.cnn_postnet import Postnet
from .utils.vc_utils import get_mask_from_lengths
from utils.hparams import HpsYaml
from utils.load_yaml import HpsYaml
class MelDecoderMOLv2(AbsMelDecoder):
"""Use an encoder to preprocess ppg."""

View File

@@ -7,10 +7,10 @@ from pathlib import Path
import soundfile
import resampy
from models.ppg_extractor import load_model
from ppg_extractor import load_model
import encoder.inference as Encoder
from models.encoder.audio import preprocess_wav
from models.encoder import audio
from encoder.audio import preprocess_wav
from encoder import audio
from utils.f0_utils import compute_f0
from torch.multiprocessing import Pool, cpu_count

View File

@@ -2,8 +2,8 @@ import sys
import torch
import argparse
import numpy as np
from utils.hparams import HpsYaml
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
from utils.load_yaml import HpsYaml
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
# For reproducibility, comment these may speed up training
torch.backends.cudnn.deterministic = True

View File

@@ -8,6 +8,7 @@ from torch.utils.tensorboard import SummaryWriter
from .option import default_hparas
from utils.util import human_format, Timer
from utils.load_yaml import HpsYaml
class BaseSolver():

View File

@@ -14,7 +14,7 @@ from utils.data_load import OneshotVcDataset, MultiSpkVcCollate
from .loss import MaskedMSELoss
from .optim import Optimizer
from utils.util import human_format
from models.ppg2mel import MelDecoderMOLv2
from ppg2mel import MelDecoderMOLv2
class Solver(BaseSolver):

View File

@@ -2,8 +2,8 @@ import sys
import torch
import argparse
import numpy as np
from utils.hparams import HpsYaml
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
from utils.load_yaml import HpsYaml
from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
# For reproducibility, comment these may speed up training
torch.backends.cudnn.deterministic = True

Some files were not shown because too many files have changed in this diff Show More