36 Commits

Author SHA1 Message Date
babysor00
53fe291d3c Add preprocess_transcript file 2021-09-02 09:40:12 +08:00
babysor00
bd0e47e76b Support new dataset "biaobei" BZNSYP High quality single speaker for Chinese 2021-09-01 23:44:40 +08:00
Nemo
024d88ae96 Ignore vocoder models except pretrained 2021-09-01 08:44:43 +08:00
babysor00
5c0cb50c3e Add quick path to preprocess audio, denoise audio when loading in toolbox 2021-08-30 22:22:06 +08:00
babysor00
2f1f4f70b4 Add support for aishell3 and magicdata in toolbox 2021-08-29 16:31:40 +08:00
babysor00
4132bd113f Add readme instruction to train vocoder 2021-08-29 15:48:11 +08:00
babysor00
5950eea895 Support training your own vocoder 2021-08-29 15:43:54 +08:00
babysor00
630023c7b2 format readme and add paper references 2021-08-29 10:55:59 +08:00
babysor00
17d47589c1 Fix compatibility issue of symbols 2021-08-29 00:48:07 +08:00
Nemo
0bba0a806e Update README.md 2021-08-28 19:56:19 +08:00
Nemo
331e1d4238 Update README-CN.md 2021-08-28 19:55:42 +08:00
Vega
2130908449 Add newbie friendly version 2021-08-28 09:40:44 +08:00
Nemo
95bbcf6cd8 Add readme to support aishell3 2021-08-25 23:14:30 +08:00
Nemo
a6f8c8a39a Merge branch 'main' of https://github.com/babysor/Realtime-Voice-Clone-Chinese into main 2021-08-25 23:11:34 +08:00
Nemo
0cc3f569fa [dataset]support aishell3(tested) 2021-08-25 23:11:29 +08:00
Vega
c3fb378b63 Update README-CN.md 2021-08-24 00:03:13 +08:00
Vega
9f30ca8e92 Merge pull request #38 from XiuChen-Liu/main
Update README
2021-08-23 16:48:41 +08:00
XiuChen-Liu
0ede0ad771 UPDATE README.md 2021-08-23 15:37:33 +08:00
XiuChen-Liu
16c1d2049f UPDATE README-CN.md 2021-08-23 15:37:16 +08:00
Vega
a810e6a472 Merge pull request #32 from Nthily/main
Update README
2021-08-23 13:14:38 +08:00
Nthily
4f23833a69 Update README.md 2021-08-23 03:17:32 +02:00
Nthily
e25072a4a8 Update README-CN.md 2021-08-23 03:16:35 +02:00
Nthily
289ededebc Update README-CN.md 2021-08-23 03:14:58 +02:00
Vega
3084fdeb10 Merge pull request #30 from lonelyman0108/patch-1
Update README-CN.md
2021-08-23 08:54:07 +08:00
LM
0adf29c35b Update README-CN.md
fix: resolve the "Microsoft Visual C++ 14.0 is required" error when Visual Studio is not installed
2021-08-23 00:54:26 +08:00
babysor00
3c86cd5bca [bugfix] Fix bug causing nonsense output for long texts (wrong pronunciation for multi-segment text) 2021-08-22 23:44:25 +08:00
babysor00
21dd124360 Share a Model from community [Update Readme] 2021-08-22 20:38:13 +08:00
Vega
e501ac5f76 Update to install webrtcvad 2021-08-21 23:00:20 +08:00
Vega
81cf5ff485 Merge pull request #17 from babysor/add-code-of-conduct-1
Create CODE_OF_CONDUCT.md
2021-08-18 20:31:54 +08:00
Vega
ddc0fd8bf7 Create CODE_OF_CONDUCT.md 2021-08-18 16:37:25 +08:00
babysor00
feb1c7cb88 rename slr68 to magicdata to keep consistent naming convention
(cherry picked from commit bbdad858ebc4d0ee3b720ba22ae3e0ce9732a734)
2021-08-17 21:07:13 +08:00
Weijia Chen
e66d29872f Merge pull request #13 from babysor/chineseinputsupport
Toolbox add Chinese character input support
2021-08-16 22:23:59 +08:00
57b06a29ec Built-in pretrained encoder/vocoder models (simplified setup, models pre-integrated) 2021-08-16 22:18:46 +08:00
Weijia Chen
b73dc6885c Update README-CN.md 2021-08-16 11:24:06 +08:00
Weijia Chen
4f0a21969f Upload pretrained model 2021-08-16 09:35:09 +08:00
Weijia Chen
a88e311e40 Update README-CN.md 2021-08-16 09:29:50 +08:00
20 changed files with 441 additions and 204 deletions

.gitignore (2 changed lines)

@@ -15,6 +15,6 @@
*.toc
*.wav
*.sh
encoder/saved_models/*
synthesizer/saved_models/*
vocoder/saved_models/*
!vocoder/saved_models/pretrained/*

.vscode/launch.json (new file, 48 lines)

@@ -0,0 +1,48 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Syn Preprocess",
            "type": "python",
            "request": "launch",
            "program": "pre.py",
            "console": "integratedTerminal",
            "args": [
                "D:\\ttsdata\\BZNSYP", "-d", "BZNSYP"
            ],
        },
        {
            "name": "Python: Vocoder Preprocess",
            "type": "python",
            "request": "launch",
            "program": "vocoder_preprocess.py",
            "console": "integratedTerminal",
            "args": [
                "..\\..\\chs1"
            ],
        },
        {
            "name": "Python: Vocoder Train",
            "type": "python",
            "request": "launch",
            "program": "vocoder_train.py",
            "console": "integratedTerminal",
            "args": [
                "dev", "..\\..\\chs1"
            ],
        },
        {
            "name": "Python: demo box",
            "type": "python",
            "request": "launch",
            "program": "demo_toolbox.py",
            "console": "integratedTerminal",
            "args": [
                "-d", "..\\..\\chs"
            ],
        }
    ]
}

CODE_OF_CONDUCT.md (new file, 130 lines)

@@ -0,0 +1,130 @@
# Contributor Covenant Code of Conduct
## First of all
Don't be evil, never
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
overall community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or
advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
babysor00@gmail.com.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series
of actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within
the community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.

README-CN.md

@@ -1,13 +1,14 @@
## Real-Time Voice Cloning - Chinese/Mandarin
![WechatIMG2968](https://user-images.githubusercontent.com/7423248/128490653-f55fefa8-f944-4617-96b8-5cc94f14f8f6.png)
![mockingbird](https://user-images.githubusercontent.com/12797292/131216767-6eb251d6-14fc-4951-8324-2722f0cd4c63.jpg)
[![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](http://choosealicense.com/licenses/mit/)
> This repository is forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning), which only supports English.
### [English](README.md) | 中文
### [DEMO VIDEO](https://www.bilibili.com/video/BV1sA411P7wM/)
## Features
🌍 **Chinese**: supports Mandarin, tested with multiple Chinese datasets: aidatatang_200zh, SLR68
🌍 **Chinese**: supports Mandarin, tested with multiple Chinese datasets: aidatatang_200zh, magicdata, aishell3
🤩 **PyTorch**: works with PyTorch, tested on version 1.9.0 (latest as of August 2021), with GPU Tesla T4 and GTX 2060
@@ -16,43 +17,71 @@
🤩 **Easy & Awesome**: good results with only a newly trained synthesizer, reusing the pretrained encoder/vocoder
## Quick Start
> 0) For a beginner-friendly training walkthrough, see [Quick Start (Newbie)](https://github.com/babysor/Realtime-Voice-Clone-Chinese/wiki/Quick-Start-(Newbie))
### 1. Install Requirements
> Follow the original repository to verify that your environment is ready.
**Python 3.7 or higher** is required to run the toolbox.
* Install [PyTorch](https://pytorch.org/get-started/locally/).
> If pip installation fails with `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)`, your Python version is probably too old; 3.9 installs successfully.
* Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
* Run `pip install -r requirements.txt` to install the remaining required packages.
* Install webrtcvad with `pip install webrtcvad-wheels`.
### 2. Use the pretrained encoder/vocoder
Download the following models and extract them into the root directory of this repository:
https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
### 2. Train the synthesizer with a dataset
* Download a dataset and unzip it: make sure you can access all audio files (e.g. .wav) in the *train* folder
* Preprocess the audio and mel spectrograms:
`python pre.py <datasets_root>`
### 3. Train the synthesizer with aidatatang_200zh
* Download the aidatatang_200zh dataset and unzip it: make sure you can access all .wav files in the *train* folder
* Preprocess the audio and mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
Pass `--dataset {dataset}` to select among aidatatang_200zh, SLR68
Pass `--dataset {dataset}` to select among aidatatang_200zh, magicdata, aishell3, BZNSYP
> If the downloaded `aidatatang_200zh` files are on drive D and the `train` folder path is `D:\data\aidatatang_200zh\corpus\train`, then your `datasets_root` is `D:\data\`
* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
> If you hit `the page file is too small to complete the operation`, see this [article](https://blog.csdn.net/qq_17755303/article/details/112564030) and increase the virtual memory to 100 GB (102400 MB); for example, if the files are on drive D, change drive D's virtual memory.
* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
* Go to the next step when the attention line appears and the loss meets your needs; check the training folder *synthesizer/saved_models/*.
> For reference, my attention line appeared after 18k steps and the loss dropped below 0.4 after 50k steps.
![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png)
![step-135500-mel-spectrogram_sample_1](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png)
### 2.2 Use a pretrained synthesizer
> If you lack the hardware or don't want to experiment slowly, you can use models contributed by the community (continued sharing is welcome):
### 4. Launch the Toolbox
| Author | Download link | Preview |
| --- | ----------- | ----- |
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 | https://www.bilibili.com/video/BV1uh411B7AD/ |
### 2.3 Train the vocoder (optional)
* Preprocess the data:
`python vocoder_preprocess.py <datasets_root>`
* Train the vocoder:
`python vocoder_train.py mandarin <datasets_root>`
### 3. Launch the Toolbox
Then you can try the toolbox:
`python demo_toolbox.py -d <datasets_root>`
> Good news 🤩: Chinese can be used directly
## TODO
- [X] Add demo video
- [X] Allow direct Chinese character input
- [X] Add support for more datasets
- [ ] Upload pretrained models
- [X] Upload pretrained models
- [ ] Support parallel tacotron
- [ ] Service-oriented deployment and containerization
- [ ] 🙏 More suggestions welcome
## References and Papers
> This repository was originally forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning), which only supports English; thanks to its author.
| URL | Designation | Title | Implementation source |
| --- | ----------- | ----- | --------------------- |
|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
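
The `datasets_root` convention above (e.g. `D:\data\` containing `aidatatang_200zh\corpus\train`) trips up many first runs of `python pre.py <datasets_root>`. Below is a minimal sanity-check sketch; the subfolder paths follow the README example and the `data_info` table shown later in this diff, but the helper itself is illustrative and not part of the repo.

```python
from pathlib import Path

# Expected audio subfolder per dataset, relative to <datasets_root>/<dataset>/
EXPECTED_SUBDIR = {
    "aidatatang_200zh": Path("corpus/train"),   # per the README example above
    "magicdata": Path("train"),
    "aishell3": Path("train/wav"),
    "BZNSYP": Path("Wave"),
}

def check_layout(datasets_root: str, dataset: str = "aidatatang_200zh") -> bool:
    audio_dir = Path(datasets_root) / dataset / EXPECTED_SUBDIR[dataset]
    ok = audio_dir.is_dir() and any(audio_dir.rglob("*.wav"))
    print(f"{audio_dir}: {'looks good' if ok else 'missing or contains no .wav files'}")
    return ok

# check_layout(r"D:\data")   # then run: python pre.py D:\data -d aidatatang_200zh
```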

README.md

@@ -1,16 +1,16 @@
![WechatIMG2968](https://user-images.githubusercontent.com/7423248/128490653-f55fefa8-f944-4617-96b8-5cc94f14f8f6.png)
![mockingbird](https://user-images.githubusercontent.com/12797292/131216767-6eb251d6-14fc-4951-8324-2722f0cd4c63.jpg)
[![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](http://choosealicense.com/licenses/mit/)
> This repository is forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning), which only supports English.
> English | [中文](README-CN.md)
## Features
🌍 **Chinese**: supports Mandarin, tested with multiple datasets: aidatatang_200zh, SLR68
🌍 **Chinese**: supports Mandarin, tested with multiple datasets: aidatatang_200zh, magicdata, aishell3
🤩 **PyTorch**: works with PyTorch, tested on version 1.9.0 (latest as of August 2021), with GPU Tesla T4 and GTX 2060
🌍 **Windows + Linux**: tested on both Windows and Linux after fixing nits
🤩 **Easy & Awesome**: good results with only a newly trained synthesizer, by reusing the pretrained encoder/vocoder
@@ -24,40 +24,69 @@
**Python 3.7 or higher** is needed to run the toolbox.
* Install [PyTorch](https://pytorch.org/get-started/locally/).
> If you get `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)`, it is probably due to an old Python version; try 3.9 and it should install successfully.
* Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
* Run `pip install -r requirements.txt` to install the remaining necessary packages.
### 2. Reuse the pretrained encoder/vocoder
* Download the following models and extract them to the root directory of this project (do not use its synthesizer):
https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
> Note that we need to specify the newly trained synthesizer model, since the original model is incompatible with the Chinese symbols. This means demo_cli does not work at the moment.
### 3. Train synthesizer with aidatatang_200zh
* Download the aidatatang_200zh dataset and unzip it: make sure you can access all .wav files in the *train* folder
* Install webrtcvad with `pip install webrtcvad-wheels` (if needed)
> Note that we reuse the pretrained encoder/vocoder but not the synthesizer, since the original synthesizer is incompatible with the Chinese symbols. This means demo_cli does not work at the moment.
### 2. Train synthesizer with your dataset
* Download aidatatang_200zh or another dataset and unzip it: make sure you can access all .wav files in the *train* folder
* Preprocess the audio and mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
Pass `--dataset {dataset}` to select among aidatatang_200zh, SLR68
* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
`python pre.py <datasets_root>`
Pass `--dataset {dataset}` to select among aidatatang_200zh, magicdata, aishell3, BZNSYP
> If you hit `the page file is too small to complete the operation`, refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and increase the virtual memory to 100 GB (102400 MB); for example, if the data is on drive D, change drive D's virtual memory.
* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
* Go to the next step when the attention line appears and the loss meets your needs; check the training folder *synthesizer/saved_models/*.
> FYI, my attention came after 18k steps and loss became lower than 0.4 after 50k steps.
![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png)
![step-135500-mel-spectrogram_sample_1](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png)
### 4. Launch the Toolbox
### 2.2 Use pretrained model of synthesizer
> Thanks to the community, some models are shared:
| author | Download link | Preview Video |
| --- | ----------- | ----- |
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 | https://www.bilibili.com/video/BV1uh411B7AD/ |
> A link to my early trained model: [Baidu Yun](https://pan.baidu.com/s/10t3XycWiNIg5dN5E_bMORQ)
Code: aid4
### 2.3 Train vocoder (Optional)
* Preprocess the data:
`python vocoder_preprocess.py <datasets_root>`
* Train the vocoder:
`python vocoder_train.py mandarin <datasets_root>`
### 3. Launch the Toolbox
You can then try the toolbox:
`python demo_toolbox.py -d <datasets_root>`
or
`python demo_toolbox.py`
> Good news🤩: Chinese Characters are supported
## TODO
- [x] Add demo video
- [X] Add support for more dataset
- [ ] Upload pretrained model
- [X] Upload pretrained model
- [ ] Support parallel tacotron
- [ ] Service-oriented deployment and containerization
- 🙏 Welcome to add more
## Reference
> This repository is forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning), which only supports English.
| URL | Designation | Title | Implementation source |
| --- | ----------- | ----- | --------------------- |
|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |

Binary file not shown.

pre.py (new file, 73 lines)

@@ -0,0 +1,73 @@
from synthesizer.preprocess import create_embeddings, preprocess_dataset
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
import argparse

recognized_datasets = [
    "aidatatang_200zh",
    "magicdata",
    "aishell3",
    "BZNSYP"
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
                    "and writes them to the disk. Audio files are also saved, to be used by the "
                    "vocoder for training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("datasets_root", type=Path, help=\
        "Path to the directory containing your datasets.")
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\
        "Path to the output directory that will contain the mel spectrograms, the audios and the "
        "embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
    parser.add_argument("-n", "--n_processes", type=int, default=1, help=\
        "Number of processes in parallel. An encoder is created for each, so you may need to lower "
        "this value on GPUs with low memory. Set it to 1 if CUDA is unhappy.")
    parser.add_argument("-s", "--skip_existing", action="store_true", help=\
        "Whether to skip existing files with the same name. Useful if the preprocessing was "
        "interrupted.")
    parser.add_argument("--hparams", type=str, default="", help=\
        "Hyperparameter overrides as a comma-separated list of name-value pairs")
    parser.add_argument("--no_trim", action="store_true", help=\
        "Preprocess audio without trimming silences (not recommended).")
    parser.add_argument("--no_alignments", action="store_true", help=\
        "Use this option when the dataset does not include alignments "
        "(these are used to split long audio files into sub-utterances).")
    parser.add_argument("-d", "--dataset", type=str, default="aidatatang_200zh", help=\
        "Name of the dataset to process; allowed values: magicdata, aidatatang_200zh, aishell3, BZNSYP.")
    parser.add_argument("-e", "--encoder_model_fpath", type=Path, default="encoder/saved_models/pretrained.pt", help=\
        "Path to your trained encoder model.")
    args = parser.parse_args()

    # Process the arguments
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")
    assert args.dataset in recognized_datasets, 'is not supported, please vote for it in https://github.com/babysor/MockingBird/issues/10'

    # Create directories
    assert args.datasets_root.exists()
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # Verify webrtcvad is available
    if not args.no_trim:
        try:
            import webrtcvad
        except:
            raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables "
                "noise removal and is recommended. Please install and try again. If installation fails, "
                "use --no_trim to disable this error message.")
    encoder_model_fpath = args.encoder_model_fpath
    del args.no_trim, args.encoder_model_fpath

    args.hparams = hparams.parse(args.hparams)
    preprocess_dataset(**vars(args))

    create_embeddings(synthesizer_root=args.out_dir, n_processes=args.n_processes, encoder_model_fpath=encoder_model_fpath)
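
One detail worth noting in `pre.py`: `--out_dir` uses `default=argparse.SUPPRESS`, so the attribute only exists when the flag was actually passed, and the `hasattr` check then derives the default from `datasets_root`. A minimal standalone sketch of that pattern follows; the argument names mirror the script, while the hard-coded example path is purely illustrative.

```python
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("datasets_root", type=Path)
# With SUPPRESS the attribute is absent (rather than None) when -o is not given,
# which makes "flag omitted" unambiguous even if None were a legal value.
parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS)

args = parser.parse_args(["D:/data"])          # as if invoked: python pre.py D:/data
if not hasattr(args, "out_dir"):               # flag omitted -> derive from datasets_root
    args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

print(args.out_dir)                            # <datasets_root>/SV2TTS/synthesizer
```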

synthesizer/hparams.py

@@ -87,6 +87,3 @@ hparams = HParams(
silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split
utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded
)
def hparams_debug_string():
return str(hparams)

synthesizer/inference.py

@@ -9,6 +9,7 @@ from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
from pypinyin import lazy_pinyin, Style
class Synthesizer:
@@ -90,13 +91,12 @@ class Synthesizer:
simple_table([("Tacotron", str(tts_k) + "k"),
("r", self._model.r)])
#convert chinese char to pinyin
list_of_pinyin = lazy_pinyin(texts, style=Style.TONE3)
texts = [" ".join([v for v in list_of_pinyin if v.strip()])]
print("Read " + str(texts))
texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
print("Synthesizing " + str(texts))
# Preprocess text inputs
inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]
if not isinstance(embeddings, list):
embeddings = [embeddings]
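
The replacement above converts each input text to tone-numbered pinyin (one string per text) before building the input sequence, instead of collapsing all texts into a single pinyin string as the old code did. A quick standalone check of the same pypinyin call is shown below; the sample sentence is only an illustration.

```python
from pypinyin import lazy_pinyin, Style

texts = ["欢迎使用工具箱"]
# Style.TONE3 appends the tone digit to each syllable; neutral_tone_with_five=True
# writes the neutral tone as "5" so every syllable carries an explicit tone number.
texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
print(texts)  # e.g. ['huan1 ying2 shi3 yong4 gong1 ju4 xiang1']
```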
@@ -146,6 +146,12 @@ class Synthesizer:
wav = librosa.load(str(fpath), hparams.sample_rate)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# denoise
if len(wav) > hparams.sample_rate*(0.3+0.1):
noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
wav[-int(hparams.sample_rate*0.15):]])
profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile)
return wav
@staticmethod
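
The added denoising step assumes the first and last 150 ms of a loaded clip are close to silence, builds a noise profile from them, and applies logmmse noise reduction to the whole waveform. Here is a standalone sketch of the same idea using the repo's `utils.logmmse` helper; the file path and the 16 kHz rate are placeholders.

```python
import numpy as np
import librosa
from utils import logmmse  # noise-reduction helper bundled with this repo

sample_rate = 16000
wav, _ = librosa.load("some_utterance.wav", sr=sample_rate)  # placeholder file

# Only denoise clips long enough to spare 2 x 150 ms of assumed silence at the edges.
if len(wav) > sample_rate * (0.3 + 0.1):
    noise_wav = np.concatenate([wav[:int(sample_rate * 0.15)],
                                wav[-int(sample_rate * 0.15):]])
    profile = logmmse.profile_noise(noise_wav, sample_rate)
    wav = logmmse.denoise(wav, profile)
```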

synthesizer/preprocess.py

@@ -6,7 +6,8 @@ from pathlib import Path
from tqdm import tqdm
import numpy as np
from encoder import inference as encoder
from synthesizer.preprocess_speaker import preprocess_speaker_general
from synthesizer.preprocess_speaker import preprocess_speaker_general, preprocess_speaker_bznsyp
from synthesizer.preprocess_transcript import preprocess_transcript_bznsyp
data_info = {
"aidatatang_200zh": {
@@ -14,11 +15,22 @@ data_info = {
"trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
"speak_func": preprocess_speaker_general
},
"SLR68": {
"magicdata": {
"subfolders": ["train"],
"trans_filepath": "train/TRANS.txt",
"speak_func": preprocess_speaker_general
},
"aishell3":{
"subfolders": ["train/wav"],
"trans_filepath": "train/content.txt",
"speak_func": preprocess_speaker_general
},
"BZNSYP":{
"subfolders": ["Wave"],
"trans_filepath": "ProsodyLabeling/000001-010000.txt",
"speak_func": preprocess_speaker_bznsyp,
"transcript_func": preprocess_transcript_bznsyp,
},
}
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
@@ -44,11 +56,15 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."
with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
for v in dict_transcript:
if not v:
continue
v = v.strip().replace("\n","").replace("\t"," ").split(" ")
dict_info[v[0]] = " ".join(v[1:])
# process with specific function for your dataset
if "transcript_func" in dataset_info:
dataset_info["transcript_func"](dict_info, dict_transcript)
else:
for v in dict_transcript:
if not v:
continue
v = v.strip().replace("\n","").replace("\t"," ").split(" ")
dict_info[v[0]] = " ".join(v[1:])
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
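
With this change, `data_info` is the single registration point for a corpus: where its audio subfolders live, where the transcript file is, how to walk a speaker's wav files, and (optionally) a custom transcript parser. A hedged sketch of what adding another corpus would look like follows; the `my_corpus` name and its paths are invented for illustration.

```python
# Hypothetical entry following the structure of the dict above; not a real dataset.
data_info["my_corpus"] = {
    "subfolders": ["train/audio"],               # folders scanned for speaker dirs / wav files
    "trans_filepath": "train/transcripts.txt",   # transcript file, relative to the dataset root
    "speak_func": preprocess_speaker_general,    # how to enumerate and cut a speaker's wavs
    # "transcript_func": my_transcript_parser,   # optional: custom transcript-file parser
}
```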

synthesizer/preprocess_speaker.py

@@ -81,9 +81,16 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
return wav, res
def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
metadata = []
wav_fpath_list = speaker_dir.glob("*.wav")
return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
def preprocess_speaker_bznsyp(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
wav_fpath_list = [speaker_dir]
return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
def preprocess_speaker_internal(wav_fpath_list, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
# Iterate over each wav
metadata = []
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
words = dict_info.get(wav_fpath.name) if not words else words # try with wav
@@ -94,134 +101,4 @@ def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool,
wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams))
return [m for m in metadata if m is not None]
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
metadata = []
for book_dir in speaker_dir.glob("*"):
if no_alignments:
# Gather the utterance audios and texts
# LibriTTS uses .wav but we will include extensions for compatibility with other datasets
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
wav_fpaths = book_dir.glob(extension)
for wav_fpath in wav_fpaths:
# Load the audio waveform
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# Get the corresponding text
# Check for .txt (for compatibility with other datasets)
text_fpath = wav_fpath.with_suffix(".txt")
if not text_fpath.exists():
# Check for .normalized.txt (LibriTTS)
text_fpath = wav_fpath.with_suffix(".normalized.txt")
assert text_fpath.exists()
with text_fpath.open("r") as text_file:
text = "".join([line for line in text_file])
text = text.replace("\"", "")
text = text.strip()
# Process the utterance
metadata.append(_process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
skip_existing, hparams))
else:
# Process alignment file (LibriSpeech support)
# Gather the utterance audios and texts
try:
alignments_fpath = next(book_dir.glob("*.alignment.txt"))
with alignments_fpath.open("r") as alignments_file:
alignments = [line.rstrip().split(" ") for line in alignments_file]
except StopIteration:
# A few alignment files will be missing
continue
# Iterate over each entry in the alignments file
for wav_fname, words, end_times in alignments:
wav_fpath = book_dir.joinpath(wav_fname + ".flac")
assert wav_fpath.exists()
words = words.replace("\"", "").split(",")
end_times = list(map(float, end_times.replace("\"", "").split(",")))
# Process each sub-utterance
wavs, texts = _split_on_silences(wav_fpath, words, end_times, hparams)
for i, (wav, text) in enumerate(zip(wavs, texts)):
sub_basename = "%s_%02d" % (wav_fname, i)
metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams))
return [m for m in metadata if m is not None]
# TODO: use original split func
def _split_on_silences(wav_fpath, words, end_times, hparams):
# Load the audio waveform
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
words = np.array(words)
start_times = np.array([0.0] + end_times[:-1])
end_times = np.array(end_times)
assert len(words) == len(end_times) == len(start_times)
assert words[0] == "" and words[-1] == ""
# Find pauses that are too long
mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
mask[0] = mask[-1] = True
breaks = np.where(mask)[0]
# Profile the noise from the silences and perform noise reduction on the waveform
silence_times = [[start_times[i], end_times[i]] for i in breaks]
silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
if len(noisy_wav) > hparams.sample_rate * 0.02:
profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile, eta=0)
# Re-attach segments that are too short
segments = list(zip(breaks[:-1], breaks[1:]))
segment_durations = [start_times[end] - end_times[start] for start, end in segments]
i = 0
while i < len(segments) and len(segments) > 1:
if segment_durations[i] < hparams.utterance_min_duration:
# See if the segment can be re-attached with the right or the left segment
left_duration = float("inf") if i == 0 else segment_durations[i - 1]
right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
joined_duration = segment_durations[i] + min(left_duration, right_duration)
# Do not re-attach if it causes the joined utterance to be too long
if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
i += 1
continue
# Re-attach the segment with the neighbour of shortest duration
j = i - 1 if left_duration <= right_duration else i
segments[j] = (segments[j][0], segments[j + 1][1])
segment_durations[j] = joined_duration
del segments[j + 1], segment_durations[j + 1]
else:
i += 1
# Split the utterance
segment_times = [[end_times[start], start_times[end]] for start, end in segments]
segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, end in segments]
# # DEBUG: play the audio segments (run with -n=1)
# import sounddevice as sd
# if len(wavs) > 1:
# print("This sentence was split in %d segments:" % len(wavs))
# else:
# print("There are no silences long enough for this sentence to be split:")
# for wav, text in zip(wavs, texts):
# # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
# # when playing them. You shouldn't need to do that in your parsers.
# wav = np.concatenate((wav, [0] * 16000))
# print("\t%s" % text)
# sd.play(wav, 16000, blocking=True)
# print("")
return wavs, texts
return [m for m in metadata if m is not None]

synthesizer/preprocess_transcript.py (new file, 10 lines)

@@ -0,0 +1,10 @@
def preprocess_transcript_bznsyp(dict_info, dict_transcript):
    transList = []
    for t in dict_transcript:
        transList.append(t)
    for i in range(0, len(transList), 2):
        if not transList[i]:
            continue
        key = transList[i].split("\t")[0]
        transcript = transList[i+1].strip().replace("\n", "").replace("\t", " ")
        dict_info[key] = transcript
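
BZNSYP's ProsodyLabeling transcript lists each utterance on two lines: the utterance ID plus prosody-annotated Chinese text, then a pinyin line. The parser above walks the lines in pairs and keeps the second line as the transcript. A small self-contained check follows; the two sample lines mimic that layout and are given only as an illustration.

```python
from synthesizer.preprocess_transcript import preprocess_transcript_bznsyp

# Two-line-per-utterance layout, mimicking ProsodyLabeling/000001-010000.txt.
sample_lines = [
    "000001\t卡尔普#2陪外孙#1玩滑梯#4。",
    "\tka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1",
]

dict_info = {}
preprocess_transcript_bznsyp(dict_info, sample_lines)
print(dict_info)  # {'000001': 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1'}
```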

synthesizer/synthesize.py

@@ -1,6 +1,5 @@
import torch
from torch.utils.data import DataLoader
from synthesizer.hparams import hparams_debug_string
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.text import text_to_sequence
@@ -8,13 +7,14 @@ from synthesizer.utils.symbols import symbols
import numpy as np
from pathlib import Path
from tqdm import tqdm
import sys
def run_synthesis(in_dir, out_dir, model_dir, hparams):
# This generates ground truth-aligned mels for vocoder training
synth_dir = Path(out_dir).joinpath("mels_gta")
synth_dir.mkdir(exist_ok=True)
print(hparams_debug_string(hparams))
synth_dir.mkdir(parents=True, exist_ok=True)
print(str(hparams))
# Check for GPU
if torch.cuda.is_available():
@@ -59,12 +59,12 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams):
metadata_fpath = in_dir.joinpath("train.txt")
mel_dir = in_dir.joinpath("mels")
embed_dir = in_dir.joinpath("embeds")
num_workers = 0 if sys.platform.startswith("win") else 2;
dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
data_loader = DataLoader(dataset,
collate_fn=lambda batch: collate_synthesizer(batch, r),
collate_fn=lambda batch: collate_synthesizer(batch),
batch_size=hparams.synthesis_batch_size,
num_workers=2,
num_workers=num_workers,
shuffle=False,
pin_memory=True)
@@ -78,9 +78,9 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams):
# Parallelize model onto GPUS using workaround due to python bug
if device.type == "cuda" and torch.cuda.device_count() > 1:
_, mels_out, _ = data_parallel_workaround(model, texts, mels, embeds)
_, mels_out, _ , _ = data_parallel_workaround(model, texts, mels, embeds)
else:
_, mels_out, _ = model(texts, mels, embeds)
_, mels_out, _, _ = model(texts, mels, embeds)
for j, k in enumerate(idx):
# Note: outputs mel-spectrogram files and target ones have same names, just different folders
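
The `num_workers` change above avoids DataLoader worker processes on Windows, where workers are spawned rather than forked and a lambda `collate_fn` cannot be pickled. A minimal illustration of the same guard is shown below; the tensor dataset is a stand-in, not the repo's SynthesizerDataset.

```python
import sys
import torch
from torch.utils.data import DataLoader, TensorDataset

# Windows spawns workers instead of forking, so closures/lambdas passed to the
# DataLoader cannot be pickled; fall back to in-process loading there.
num_workers = 0 if sys.platform.startswith("win") else 2

dataset = TensorDataset(torch.arange(16).float())   # stand-in dataset
loader = DataLoader(dataset, batch_size=4, num_workers=num_workers,
                    shuffle=False, pin_memory=True)
for (batch,) in loader:
    print(batch.tolist())
```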

synthesizer/train.py

@@ -67,8 +67,17 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,
# Instantiate Tacotron Model
print("\nInitialising Tacotron Model...\n")
num_chars = len(symbols)
if weights_fpath.exists():
# for compatibility purpose, change symbols accordingly:
loaded_shape = torch.load(str(weights_fpath), map_location=device)["model_state"]["encoder.embedding.weight"].shape
if num_chars != loaded_shape[0]:
print("WARNING: you are using compatibility mode due to a mismatched symbols length; please modify the variable _characters in `utils/symbols.py`")
num_chars = loaded_shape[0]
model = Tacotron(embed_dims=hparams.tts_embed_dims,
num_chars=len(symbols),
num_chars=num_chars,
encoder_dims=hparams.tts_encoder_dims,
decoder_dims=hparams.tts_decoder_dims,
n_mels=hparams.num_mels,
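
The compatibility check reads the character-embedding shape stored in the checkpoint and falls back to it when the current symbol set has a different length. The same inspection can be run on its own, as in the sketch below; the checkpoint path is a placeholder.

```python
import torch
from synthesizer.utils.symbols import symbols  # current symbol set

weights_fpath = "synthesizer/saved_models/mandarin/mandarin.pt"  # placeholder path
state = torch.load(weights_fpath, map_location="cpu")

# Embedding weight shape is (num_symbols_at_training_time, embed_dims).
ckpt_chars = state["model_state"]["encoder.embedding.weight"].shape[0]
print(f"checkpoint symbols: {ckpt_chars}, current symbols: {len(symbols)}")
if ckpt_chars != len(symbols):
    print("Mismatch: build Tacotron with num_chars=%d or adjust _characters." % ckpt_chars)
```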

synthesizer/utils/symbols.py

@@ -8,7 +8,9 @@ through Unidecode. For other data, you can modify _characters. See TRAINING_DATA
_pad = "_"
_eos = "~"
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? '
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!\'(),-.:;? '
#_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? ' # use this old one if you want to train old model
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
#_arpabet = ["@" + s for s in cmudict.valid_symbols]
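
Adding the digits 5-9 (needed once neutral tones are written as `5`) changes the number of symbols and therefore the width of the Tacotron character embedding, which is exactly what the compatibility check in the training script guards against. Assuming `symbols` is built from these constants as in the upstream repository ([_pad, _eos] plus the characters), the effect is:

```python
_pad = "_"
_eos = "~"
old_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? '
new_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!\'(),-.:;? '

# Assumed construction: symbols = [_pad, _eos] + list(_characters)
print(len([_pad, _eos] + list(old_characters)))  # 70
print(len([_pad, _eos] + list(new_characters)))  # 75 -> old checkpoints no longer line up
```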

synthesizer_preprocess_audio.py

@@ -7,7 +7,8 @@ import argparse
recognized_datasets = [
"aidatatang_200zh",
"SLR68",
"magicdata",
"aishell3"
]
if __name__ == "__main__":
@@ -35,13 +36,13 @@ if __name__ == "__main__":
"Use this option when dataset does not include alignments\
(these are used to split long audio files into sub-utterances.)")
parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
"Name of the dataset to process.")
"Name of the dataset to process, allowing values: magicdata, aidatatang_200zh.")
args = parser.parse_args()
# Process the arguments
if not hasattr(args, "out_dir"):
args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")
assert args.dataset in recognized_datasets, 'not surpport such dataset'
assert args.dataset in recognized_datasets, 'is not supported, please vote for it in https://github.com/babysor/MockingBird/issues/10'
# Create directories
assert args.datasets_root.exists()
args.out_dir.mkdir(exist_ok=True, parents=True)

toolbox/__init__.py

@@ -10,6 +10,7 @@ import traceback
import sys
import torch
import librosa
import re
from audioread.exceptions import NoBackendError
# Use this directory structure for your datasets, or modify it to fit your needs
@@ -36,6 +37,8 @@ recognized_datasets = [
"VCTK-Corpus/wav48",
"aidatatang_200zh/corpus/dev",
"aidatatang_200zh/corpus/test",
"aishell3/test/wav",
"magicdata/train",
]
#Maximum of generated wavs to keep on memory
@@ -224,6 +227,13 @@ class Toolbox:
self.init_synthesizer()
texts = self.ui.text_prompt.toPlainText().split("\n")
punctuation = '!,。、,' # punctuate and split/clean text
processed_texts = []
for text in texts:
for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
if processed_text:
processed_texts.append(processed_text.strip())
texts = processed_texts
embed = self.ui.selected_utterance.embed
embeds = [embed] * len(texts)
specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
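
The inserted block splits the prompt at Chinese and English punctuation before synthesis, so each clause is synthesized separately; this is the fix for garbled output on long multi-clause inputs. A standalone sketch of the same splitting follows, with an illustrative sample sentence.

```python
import re

punctuation = '!,。、,'   # same mixed Chinese/English punctuation set as above
texts = ["你好,欢迎使用工具箱!这是第二句,它会被单独合成"]

processed_texts = []
for text in texts:
    # Collapse any run of these punctuation marks into a newline, then split on it.
    for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
        if processed_text:
            processed_texts.append(processed_text.strip())

print(processed_texts)  # ['你好', '欢迎使用工具箱', '这是第二句', '它会被单独合成']
```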

Binary file not shown.

vocoder_preprocess.py

@@ -17,7 +17,7 @@ if __name__ == "__main__":
"Path to the directory containing your SV2TTS directory. If you specify both --in_dir and "
"--out_dir, this argument won't be used.")
parser.add_argument("--model_dir", type=str,
default="synthesizer/saved_models/pretrained/", help=\
default="synthesizer/saved_models/train3/", help=\
"Path to the pretrained model directory.")
parser.add_argument("-i", "--in_dir", type=str, default=argparse.SUPPRESS, help= \
"Path to the synthesizer directory that contains the mel spectrograms, the wavs and the "