Mirror of https://github.com/babysor/Realtime-Voice-Clone-Chinese.git (synced 2026-02-04 02:54:07 +08:00)

Compare commits: chineseinp ... biaobei (36 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 53fe291d3c | |
| | bd0e47e76b | |
| | 024d88ae96 | |
| | 5c0cb50c3e | |
| | 2f1f4f70b4 | |
| | 4132bd113f | |
| | 5950eea895 | |
| | 630023c7b2 | |
| | 17d47589c1 | |
| | 0bba0a806e | |
| | 331e1d4238 | |
| | 2130908449 | |
| | 95bbcf6cd8 | |
| | a6f8c8a39a | |
| | 0cc3f569fa | |
| | c3fb378b63 | |
| | 9f30ca8e92 | |
| | 0ede0ad771 | |
| | 16c1d2049f | |
| | a810e6a472 | |
| | 4f23833a69 | |
| | e25072a4a8 | |
| | 289ededebc | |
| | 3084fdeb10 | |
| | 0adf29c35b | |
| | 3c86cd5bca | |
| | 21dd124360 | |
| | e501ac5f76 | |
| | 81cf5ff485 | |
| | ddc0fd8bf7 | |
| | feb1c7cb88 | |
| | e66d29872f | |
| | 57b06a29ec | |
| | b73dc6885c | |
| | 4f0a21969f | |
| | a88e311e40 | |
.gitignore (vendored, 2 changes)

```
@@ -15,6 +15,6 @@
*.toc
*.wav
*.sh
encoder/saved_models/*
synthesizer/saved_models/*
vocoder/saved_models/*
!vocoder/saved_models/pretrained/*
```
.vscode/launch.json (vendored, new file, 48 lines)

```jsonc
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Syn Preprocess",
            "type": "python",
            "request": "launch",
            "program": "pre.py",
            "console": "integratedTerminal",
            "args": [
                "D:\\ttsdata\\BZNSYP", "-d", "BZNSYP"
            ],
        },
        {
            "name": "Python: Vocoder Preprocess",
            "type": "python",
            "request": "launch",
            "program": "vocoder_preprocess.py",
            "console": "integratedTerminal",
            "args": [
                "..\\..\\chs1"
            ],
        },
        {
            "name": "Python: Vocoder Train",
            "type": "python",
            "request": "launch",
            "program": "vocoder_train.py",
            "console": "integratedTerminal",
            "args": [
                "dev", "..\\..\\chs1"
            ],
        },
        {
            "name": "Python: demo box",
            "type": "python",
            "request": "launch",
            "program": "demo_toolbox.py",
            "console": "integratedTerminal",
            "args": [
                "-d", "..\\..\\chs"
            ],
        }
    ]
}
```
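With this file in place, the four configurations above appear in the VS Code Run and Debug panel. The dataset paths baked into the `args` entries (`D:\\ttsdata\\BZNSYP`, `..\\..\\chs1`, `..\\..\\chs`) are the author's local paths and need to be adjusted to your own layout before debugging.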
CODE_OF_CONDUCT.md (new file, 130 lines)

```markdown
# Contributor Covenant Code of Conduct
## First of all
Don't be evil, never

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
  overall community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or
  advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
  address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
babysor00@gmail.com.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series
of actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within
the community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.

Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.
```
README-CN.md (67 changes)

```
@@ -1,13 +1,14 @@
## Real-Time Voice Cloning - Chinese/Mandarin



[![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](http://choosealicense.com/licenses/mit/)
> This repository was forked from the English-only [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning).

### [English](README.md) | 中文

### [DEMO VIDEO](https://www.bilibili.com/video/BV1sA411P7wM/)

## Features
🌍 **Chinese** Mandarin supported, tested with multiple Chinese datasets: adatatang_200zh, SLR68
🌍 **Chinese** Mandarin supported, tested with multiple Chinese datasets: adatatang_200zh, magicdata, aishell3

🤩 **PyTorch** works with pytorch, tested with version 1.9.0 (latest as of August 2021), on GPU Tesla T4 and GTX 2060

@@ -16,43 +17,71 @@
🤩 **Easy & Awesome** good results with only a newly trained synthesizer, reusing the pretrained encoder/vocoder

## Quick Start
> A training-free, newbie-friendly guide is available at [Quick Start (Newbie)](https://github.com/babysor/Realtime-Voice-Clone-Chinese/wiki/Quick-Start-(Newbie))

### 1. Install Requirements
> Follow the original repository to check that your whole environment is ready.
**Python 3.7 or higher** is required to run the toolbox.

* Install [PyTorch](https://pytorch.org/get-started/locally/).
> If pip reports `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)`, your python version may be too old; 3.9 installs successfully.
* Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
* Run `pip install -r requirements.txt` to install the remaining required packages.
* Install webrtcvad with `pip install webrtcvad-wheels`.

### 2. Reuse the pretrained encoder/vocoder
Download the following models and extract them into the root directory of this repository:
https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
### 2. Train the synthesizer with a dataset
* Download a dataset and unzip it: make sure you can access all audio files (e.g. .wav) in the *train* folder
* Preprocess the audio and the mel spectrograms:
`python pre.py <datasets_root>`

### 3. Train the synthesizer with aidatatang_200zh
* Download the adatatang_200zh dataset and unzip it: make sure you can access all .wav files in the *train* folder
* Preprocess with the audios and the mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
The parameter --dataset `{dataset}` can be passed; supported: adatatang_200zh, SLR68
The parameter --dataset `{dataset}` can be passed; supported: adatatang_200zh, magicdata, aishell3, BZNSYP
> If the downloaded `aidatatang_200zh` files are on drive D and the `train` folder path is `D:\data\aidatatang_200zh\corpus\train`, then your `datasets_root` is `D:\data\`

* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
> If `the page file is too small to complete the operation` occurs, refer to this [article](https://blog.csdn.net/qq_17755303/article/details/112564030) and change the virtual memory to 100G (102400); for example, if the files are on drive D, change drive D's virtual memory.

* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`

* When you see the attention line appear and the loss meets your needs in the training folder *synthesizer/saved_models/*, go to the next step.
> FYI, my attention appeared after 18k steps and the loss dropped below 0.4 after 50k steps.
![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png)
![step-135500-mel-spectrogram_sample_1](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png)

### 2.2 Use a pretrained synthesizer
> If you have no hardware or don't want to tune slowly, you can use models contributed by the community (continued sharing is welcome):

### 4. Launch the Toolbox
| Author | Download link | Preview |
| --- | ----------- | ----- |
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 | https://www.bilibili.com/video/BV1uh411B7AD/ |

### 2.3 Train the vocoder (Optional)
* Preprocess the data:
`python vocoder_preprocess.py <datasets_root>`

* Train the vocoder:
`python vocoder_train.py mandarin <datasets_root>`

### 3. Launch the Toolbox
Then you can try the toolbox:
`python demo_toolbox.py -d <datasets_root>`

> Good news🤩: Chinese can be used directly

## TODO
- [X] Add demo video
- [X] Allow direct use of Chinese
- [X] Add demo video
- [X] Add support for more datasets
- [ ] Upload pretrained models
- [ ] 🙏 Welcome to add more
- [X] Upload pretrained models
- [ ] Support parallel tacotron
- [ ] Service-oriented and containerized deployment
- [ ] 🙏 Welcome to add more

## References and Papers
> This repository was originally forked from the English-only [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning); thanks to the author.

| URL | Designation | Title | Implementation source |
| --- | ----------- | ----- | --------------------- |
|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder) | Generalized End-To-End Loss for Speaker Verification | This repo |
```
README.md (73 changes)

```
@@ -1,16 +1,16 @@
![mockingbird](https://user-images.githubusercontent.com/12797292/131216767-6eb251d6-14fc-4951-8324-2722f0cd4c63.jpg)
![mockingbird](https://user-images.githubusercontent.com/12797292/131216767-6eb251d6-14fc-4951-8324-2722f0cd4c63.jpg)

[![MIT License](https://img.shields.io/badge/license-MIT-blue.svg?style=flat)](http://choosealicense.com/licenses/mit/)
> This repository is forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) which only supports English.

> English | [中文](README-CN.md)

## Features
🌍 **Chinese** Mandarin supported, tested with multiple datasets: aidatatang_200zh, SLR68
🌍 **Chinese** Mandarin supported, tested with multiple datasets: aidatatang_200zh, magicdata, aishell3

🤩 **PyTorch** works with pytorch, tested with version 1.9.0 (latest as of August 2021), on GPU Tesla T4 and GTX 2060

🌍 **Windows + Linux** tested in both Windows OS and linux OS after fixing nits

🤩 **Easy & Awesome** effect with only newly-trained synthesizer, by reusing the pretrained encoder/vocoder

@@ -24,40 +24,69 @@
**Python 3.7 or higher** is needed to run the toolbox.

* Install [PyTorch](https://pytorch.org/get-started/locally/).
> If you get `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)`, this is probably due to a low python version; try 3.9 and it will install successfully.
* Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
* Run `pip install -r requirements.txt` to install the remaining necessary packages.

### 2. Reuse the pretrained encoder/vocoder
* Download the following models and extract them to the root directory of this project. Don't use the synthesizer:
https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models
> Note that we need to specify the newly trained synthesizer model, since the original model is incompatible with the Chinese symbols. It means demo_cli is not working at this moment.
### 3. Train synthesizer with aidatatang_200zh
* Download the aidatatang_200zh dataset and unzip it: make sure you can access all .wav files in the *train* folder
* Install webrtcvad with `pip install webrtcvad-wheels` (if you need it)
> Note that we are reusing the pretrained encoder/vocoder but not the synthesizer, since the original model is incompatible with the Chinese symbols. It means demo_cli is not working at this moment.
### 2. Train synthesizer with your dataset
* Download aidatatang_200zh or another dataset and unzip it: make sure you can access all .wav files in the *train* folder
* Preprocess with the audios and the mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
The parameter `--dataset {dataset}` can be passed to support adatatang_200zh, SLR68
* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
`python pre.py <datasets_root>`

The parameter `--dataset {dataset}` can be passed to support adatatang_200zh, magicdata, aishell3, BZNSYP

> If `the page file is too small to complete the operation` happens, please refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and change the virtual memory to 100G (102400); for example, if the files are placed on the D disk, change the virtual memory of the D disk.

* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`

* Go to the next step when you see the attention line appear and the loss meets your need in the training folder *synthesizer/saved_models/*.
> FYI, my attention came after 18k steps and loss became lower than 0.4 after 50k steps.
![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png)
![step-135500-mel-spectrogram_sample_1](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png)

### 4. Launch the Toolbox
### 2.2 Use pretrained model of synthesizer
> Thanks to the community, some models will be shared:

| author | Download link | Preview Video |
| --- | ----------- | ----- |
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 | https://www.bilibili.com/video/BV1uh411B7AD/ |

> A link to my early trained model: [Baidu Yun](https://pan.baidu.com/s/10t3XycWiNIg5dN5E_bMORQ)
Code: aid4

### 2.3 Train vocoder (Optional)
* Preprocess the data:
`python vocoder_preprocess.py <datasets_root>`

* Train the vocoder:
`python vocoder_train.py mandarin <datasets_root>`

### 3. Launch the Toolbox
You can then try the toolbox:

`python demo_toolbox.py -d <datasets_root>`
or
`python demo_toolbox.py`

> Good news🤩: Chinese Characters are supported

## TODO
- [x] Add demo video
- [X] Add support for more datasets
- [ ] Upload pretrained model
- [X] Upload pretrained model
- [ ] Support parallel tacotron
- [ ] Service-oriented and dockerized deployment
- 🙏 Welcome to add more

## Reference
> This repository is forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) which only supports English.

| URL | Designation | Title | Implementation source |
| --- | ----------- | ----- | --------------------- |
|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder) | Generalized End-To-End Loss for Speaker Verification | This repo |
```
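The quick-start steps above are meant to be driven from the toolbox, but the same pipeline can be scripted. The following is a hedged sketch based on the encoder/synthesizer/vocoder interfaces this fork inherits from the upstream Real-Time-Voice-Cloning project (see its demo_cli.py); the synthesizer checkpoint path and the reference/output file names are placeholders, not files shipped with the repo.

```python
from pathlib import Path
import numpy as np
import soundfile as sf

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

# Placeholder model paths: the encoder/vocoder paths match the pretrained files added in this
# comparison; the synthesizer checkpoint is whatever you trained or downloaded.
encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(Path("synthesizer/saved_models/mandarin/mandarin.pt"))
vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

# Embed a short reference recording of the target speaker.
wav = encoder.preprocess_wav(Path("reference.wav"))
embed = encoder.embed_utterance(wav)

# Synthesize a mel spectrogram for Chinese text (pinyin conversion happens inside the synthesizer).
specs = synthesizer.synthesize_spectrograms(["欢迎使用语音克隆工具箱"], [embed])

# Vocode the spectrogram back to a waveform and save it.
generated = vocoder.infer_waveform(specs[0])
sf.write("cloned.wav", generated.astype(np.float32), synthesizer.sample_rate)
```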
encoder/saved_models/pretrained.pt (BIN, new file): binary file not shown.
pre.py (new file, 73 lines)

```python
from synthesizer.preprocess import create_embeddings
from synthesizer.preprocess import preprocess_dataset
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
import argparse

recognized_datasets = [
    "aidatatang_200zh",
    "magicdata",
    "aishell3",
    "BZNSYP"
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
                    "and writes them to the disk. Audio files are also saved, to be used by the "
                    "vocoder for training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("datasets_root", type=Path, help=\
        "Path to the directory containing your datasets.")
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\
        "Path to the output directory that will contain the mel spectrograms, the audios and the "
        "embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
    parser.add_argument("-n", "--n_processes", type=int, default=1, help=\
        "Number of processes in parallel. An encoder is created for each, so you may need to lower "
        "this value on GPUs with low memory. Set it to 1 if CUDA is unhappy.")
    parser.add_argument("-s", "--skip_existing", action="store_true", help=\
        "Whether to skip existing output files with the same name. Useful if the preprocessing was "
        "interrupted.")
    parser.add_argument("--hparams", type=str, default="", help=\
        "Hyperparameter overrides as a comma-separated list of name-value pairs")
    parser.add_argument("--no_trim", action="store_true", help=\
        "Preprocess audio without trimming silences (not recommended).")
    parser.add_argument("--no_alignments", action="store_true", help=\
        "Use this option when dataset does not include alignments "
        "(these are used to split long audio files into sub-utterances.)")
    parser.add_argument("-d", "--dataset", type=str, default="aidatatang_200zh", help=\
        "Name of the dataset to process, allowed values: magicdata, aidatatang_200zh, aishell3, BZNSYP.")
    parser.add_argument("-e", "--encoder_model_fpath", type=Path, default="encoder/saved_models/pretrained.pt", help=\
        "Path to your trained encoder model.")
    args = parser.parse_args()

    # Process the arguments
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")
    assert args.dataset in recognized_datasets, \
        args.dataset + " is not supported, please vote for it in https://github.com/babysor/MockingBird/issues/10"
    # Create directories
    assert args.datasets_root.exists()
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # Verify webrtcvad is available
    if not args.no_trim:
        try:
            import webrtcvad
        except:
            raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables "
                "noise removal and is recommended. Please install and try again. If installation fails, "
                "use --no_trim to disable this error message.")
    encoder_model_fpath = args.encoder_model_fpath
    del args.no_trim, args.encoder_model_fpath

    args.hparams = hparams.parse(args.hparams)

    preprocess_dataset(**vars(args))

    create_embeddings(synthesizer_root=args.out_dir, n_processes=args.n_processes, encoder_model_fpath=encoder_model_fpath)
```
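As a usage example (paths assumed, following the layout described in the READMEs): if aidatatang_200zh was extracted so that `D:\data\aidatatang_200zh\corpus\train` exists, then `python pre.py D:\data -d aidatatang_200zh` writes the mel spectrograms, trimmed audio and speaker embeddings under `D:\data\SV2TTS\synthesizer\`.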
synthesizer/hparams.py

```
@@ -87,6 +87,3 @@ hparams = HParams(
    silence_min_duration_split = 0.4,  # Duration in seconds of a silence for an utterance to be split
    utterance_min_duration = 1.6,      # Duration in seconds below which utterances are discarded
    )

def hparams_debug_string():
    return str(hparams)
```
synthesizer/inference.py

```
@@ -9,6 +9,7 @@ from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
from pypinyin import lazy_pinyin, Style

class Synthesizer:

@@ -90,13 +91,12 @@ class Synthesizer:

        simple_table([("Tacotron", str(tts_k) + "k"),
                      ("r", self._model.r)])

        # convert chinese char to pinyin
        list_of_pinyin = lazy_pinyin(texts, style=Style.TONE3)
        texts = [" ".join([v for v in list_of_pinyin if v.strip()])]

        print("Read " + str(texts))
        texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
        print("Synthesizing " + str(texts))
        # Preprocess text inputs
        inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
        inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

@@ -146,6 +146,12 @@ class Synthesizer:
        wav = librosa.load(str(fpath), hparams.sample_rate)[0]
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        # denoise
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([wav[:int(hparams.sample_rate * 0.15)],
                                        wav[-int(hparams.sample_rate * 0.15):]])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile)
        return wav

    @staticmethod
```
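To make the pinyin conversion above concrete, here is a small, hedged sketch of what the pypinyin call produces for a Chinese prompt; the printed output in the comment is indicative rather than exact.

```python
from pypinyin import lazy_pinyin, Style

texts = ["欢迎使用工具箱"]
texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
print(texts)
# Roughly: ['huan1 ying2 shi3 yong4 gong1 ju4 xiang1'] -- tone numbers 1-5 are appended to each
# syllable, which is why the synthesizer's symbol set must include digits (see utils/symbols.py below).
```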
synthesizer/preprocess.py

```
@@ -6,7 +6,8 @@ from pathlib import Path
from tqdm import tqdm
import numpy as np
from encoder import inference as encoder
from synthesizer.preprocess_speaker import preprocess_speaker_general
from synthesizer.preprocess_speaker import preprocess_speaker_general, preprocess_speaker_bznsyp
from synthesizer.preprocess_transcript import preprocess_transcript_bznsyp

data_info = {
    "aidatatang_200zh": {

@@ -14,11 +15,22 @@ data_info = {
        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
        "speak_func": preprocess_speaker_general
    },
    "SLR68": {
    "magicdata": {
        "subfolders": ["train"],
        "trans_filepath": "train/TRANS.txt",
        "speak_func": preprocess_speaker_general
    },
    "aishell3": {
        "subfolders": ["train/wav"],
        "trans_filepath": "train/content.txt",
        "speak_func": preprocess_speaker_general
    },
    "BZNSYP": {
        "subfolders": ["Wave"],
        "trans_filepath": "ProsodyLabeling/000001-010000.txt",
        "speak_func": preprocess_speaker_bznsyp,
        "transcript_func": preprocess_transcript_bznsyp,
    },
}

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,

@@ -44,11 +56,15 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
    transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])
    assert transcript_dirs.exists(), str(transcript_dirs) + " not exist."
    with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:
        for v in dict_transcript:
            if not v:
                continue
            v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
            dict_info[v[0]] = " ".join(v[1:])
        # process with specific function for your dataset
        if "transcript_func" in dataset_info:
            dataset_info["transcript_func"](dict_info, dict_transcript)
        else:
            for v in dict_transcript:
                if not v:
                    continue
                v = v.strip().replace("\n", "").replace("\t", " ").split(" ")
                dict_info[v[0]] = " ".join(v[1:])

    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))
    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
```
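The `data_info` dictionary is the extension point here: each entry names the subfolders to scan, the transcript file, a per-speaker handler, and optionally a custom transcript parser. A hedged sketch of registering an additional corpus follows; "my_corpus" and its file layout are hypothetical, not part of the repository.

```python
# Assumes the repository root is on PYTHONPATH.
from synthesizer.preprocess import data_info
from synthesizer.preprocess_speaker import preprocess_speaker_general

data_info["my_corpus"] = {
    "subfolders": ["train"],                   # folders under <datasets_root>/my_corpus to scan for speakers
    "trans_filepath": "train/transcript.txt",  # one "<utterance_id> <text>" per line, as the generic parser expects
    "speak_func": preprocess_speaker_general,  # reuse the generic per-speaker wav handler
    # "transcript_func": ...                   # optional custom transcript parser, as BZNSYP uses
}
```

Note that pre.py additionally gates `--dataset` through its own `recognized_datasets` list, so a new corpus would need to be added there as well.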
synthesizer/preprocess_speaker.py

```
@@ -81,9 +81,16 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
    return wav, res

def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
    metadata = []
    wav_fpath_list = speaker_dir.glob("*.wav")
    return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)

def preprocess_speaker_bznsyp(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
    wav_fpath_list = [speaker_dir]
    return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)

def preprocess_speaker_internal(wav_fpath_list, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
    # Iterate over each wav
    metadata = []
    for wav_fpath in wav_fpath_list:
        words = dict_info.get(wav_fpath.name.split(".")[0])
        words = dict_info.get(wav_fpath.name) if not words else words  # try with wav

@@ -94,134 +101,4 @@ def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool,
        wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
        metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                           skip_existing, hparams))
    return [m for m in metadata if m is not None]

def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    metadata = []
    for book_dir in speaker_dir.glob("*"):
        if no_alignments:
            # Gather the utterance audios and texts
            # LibriTTS uses .wav but we will include extensions for compatibility with other datasets
            extensions = ["*.wav", "*.flac", "*.mp3"]
            for extension in extensions:
                wav_fpaths = book_dir.glob(extension)

                for wav_fpath in wav_fpaths:
                    # Load the audio waveform
                    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
                    if hparams.rescale:
                        wav = wav / np.abs(wav).max() * hparams.rescaling_max

                    # Get the corresponding text
                    # Check for .txt (for compatibility with other datasets)
                    text_fpath = wav_fpath.with_suffix(".txt")
                    if not text_fpath.exists():
                        # Check for .normalized.txt (LibriTTS)
                        text_fpath = wav_fpath.with_suffix(".normalized.txt")
                        assert text_fpath.exists()
                    with text_fpath.open("r") as text_file:
                        text = "".join([line for line in text_file])
                        text = text.replace("\"", "")
                        text = text.strip()

                    # Process the utterance
                    metadata.append(_process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
                                                       skip_existing, hparams))
        else:
            # Process alignment file (LibriSpeech support)
            # Gather the utterance audios and texts
            try:
                alignments_fpath = next(book_dir.glob("*.alignment.txt"))
                with alignments_fpath.open("r") as alignments_file:
                    alignments = [line.rstrip().split(" ") for line in alignments_file]
            except StopIteration:
                # A few alignment files will be missing
                continue

            # Iterate over each entry in the alignments file
            for wav_fname, words, end_times in alignments:
                wav_fpath = book_dir.joinpath(wav_fname + ".flac")
                assert wav_fpath.exists()
                words = words.replace("\"", "").split(",")
                end_times = list(map(float, end_times.replace("\"", "").split(",")))

                # Process each sub-utterance
                wavs, texts = _split_on_silences(wav_fpath, words, end_times, hparams)
                for i, (wav, text) in enumerate(zip(wavs, texts)):
                    sub_basename = "%s_%02d" % (wav_fname, i)
                    metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
                                                       skip_existing, hparams))

    return [m for m in metadata if m is not None]

# TODO: use original split func
def _split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(np.int)
    noisy_wav = np.concatenate([wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [start_times[end] - end_times[start] for start, end in segments]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = float("inf") if i == 0 else segment_durations[i - 1]
            right_duration = float("inf") if i == len(segments) - 1 else segment_durations[i + 1]
            joined_duration = segment_durations[i] + min(left_duration, right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]] for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int)
    wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times]
    texts = [" ".join(words[start + 1:end]).replace("  ", " ") for start, end in segments]

    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
    #     # when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    #     print("")

    return wavs, texts
    return [m for m in metadata if m is not None]
```
synthesizer/preprocess_transcript.py (new file, 10 lines)

```python
def preprocess_transcript_bznsyp(dict_info, dict_transcript):
    transList = []
    for t in dict_transcript:
        transList.append(t)
    for i in range(0, len(transList), 2):
        if not transList[i]:
            continue
        key = transList[i].split("\t")[0]
        transcript = transList[i+1].strip().replace("\n", "").replace("\t", " ")
        dict_info[key] = transcript
```
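The BZNSYP ProsodyLabeling file pairs each utterance line with a pinyin line, and the parser above keys the second line by the ID in the first. A hedged, self-contained illustration follows; the two sample lines are abridged and illustrative, not copied from the real transcript, and the import assumes the repository root is on PYTHONPATH.

```python
from synthesizer.preprocess_transcript import preprocess_transcript_bznsyp

dict_info = {}
lines = [
    "000001\t卡尔普陪外孙玩滑梯。\n",               # ID + Chinese text (prosody marks omitted here)
    "\tka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1\n",  # corresponding pinyin line
]
preprocess_transcript_bznsyp(dict_info, lines)
print(dict_info)  # {'000001': 'ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1'}
```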
synthesizer/synthesize.py

```
@@ -1,6 +1,5 @@
import torch
from torch.utils.data import DataLoader
from synthesizer.hparams import hparams_debug_string
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.text import text_to_sequence

@@ -8,13 +7,14 @@ from synthesizer.utils.symbols import symbols
import numpy as np
from pathlib import Path
from tqdm import tqdm
import sys


def run_synthesis(in_dir, out_dir, model_dir, hparams):
    # This generates ground truth-aligned mels for vocoder training
    synth_dir = Path(out_dir).joinpath("mels_gta")
    synth_dir.mkdir(exist_ok=True)
    print(hparams_debug_string(hparams))
    synth_dir.mkdir(parents=True, exist_ok=True)
    print(str(hparams))

    # Check for GPU
    if torch.cuda.is_available():

@@ -59,12 +59,12 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams):
    metadata_fpath = in_dir.joinpath("train.txt")
    mel_dir = in_dir.joinpath("mels")
    embed_dir = in_dir.joinpath("embeds")

    num_workers = 0 if sys.platform.startswith("win") else 2;
    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
    data_loader = DataLoader(dataset,
                             collate_fn=lambda batch: collate_synthesizer(batch, r),
                             collate_fn=lambda batch: collate_synthesizer(batch),
                             batch_size=hparams.synthesis_batch_size,
                             num_workers=2,
                             num_workers=num_workers,
                             shuffle=False,
                             pin_memory=True)

@@ -78,9 +78,9 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams):

    # Parallelize model onto GPUS using workaround due to python bug
    if device.type == "cuda" and torch.cuda.device_count() > 1:
        _, mels_out, _ = data_parallel_workaround(model, texts, mels, embeds)
        _, mels_out, _, _ = data_parallel_workaround(model, texts, mels, embeds)
    else:
        _, mels_out, _ = model(texts, mels, embeds)
        _, mels_out, _, _ = model(texts, mels, embeds)

    for j, k in enumerate(idx):
        # Note: outputs mel-spectrogram files and target ones have same names, just different folders
```
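Setting `num_workers` to 0 on Windows keeps data loading in the main process, which sidesteps the spawn-based DataLoader worker problems (pickling and page-file errors) that Windows users commonly hit, while Linux keeps two workers.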
synthesizer/train.py

```
@@ -67,8 +67,17 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,

    # Instantiate Tacotron Model
    print("\nInitialising Tacotron Model...\n")
    num_chars = len(symbols)
    if weights_fpath.exists():
        # for compatibility purpose, change symbols accordingly:
        loaded_shape = torch.load(str(weights_fpath), map_location=device)["model_state"]["encoder.embedding.weight"].shape
        if num_chars != loaded_shape[0]:
            print("WARNING: you are using compatible mode due to wrong symbols length, please modify variable _characters in `utils\symbols.py`")
            num_chars = loaded_shape[0]


    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     num_chars=num_chars,
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
```
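The compatibility check above simply reads the embedding table size out of an existing checkpoint. A hedged sketch of doing the same inspection by hand follows; the checkpoint path is a placeholder, while the state-dict keys match the ones used in the hunk.

```python
import torch

# Placeholder path: point this at whichever synthesizer checkpoint you are resuming from.
state = torch.load("synthesizer/saved_models/mandarin/mandarin.pt", map_location="cpu")
emb_rows = state["model_state"]["encoder.embedding.weight"].shape[0]
print(f"checkpoint was trained with {emb_rows} symbols")  # compare with len(symbols) before training
```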
synthesizer/utils/symbols.py

```
@@ -8,7 +8,9 @@ through Unidecode. For other data, you can modify _characters. See TRAINING_DATA

_pad = "_"
_eos = "~"
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? '
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890!\'(),-.:;? '

#_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz12340!\'(),-.:;? '  # use this old one if you want to train old model
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
#_arpabet = ["@' + s for s in cmudict.valid_symbols]
```
synthesizer_preprocess_audio.py

```
@@ -7,7 +7,8 @@ import argparse

recognized_datasets = [
    "aidatatang_200zh",
    "SLR68",
    "magicdata",
    "aishell3"
]

if __name__ == "__main__":

@@ -35,13 +36,13 @@ if __name__ == "__main__":
        "Use this option when dataset does not include alignments "
        "(these are used to split long audio files into sub-utterances.)")
    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
        "Name of the dataset to process.")
        "Name of the dataset to process, allowing values: magicdata, aidatatang_200zh.")
    args = parser.parse_args()

    # Process the arguments
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")
    assert args.dataset in recognized_datasets, 'not surpport such dataset'
    assert args.dataset in recognized_datasets, 'is not supported, please vote for it in https://github.com/babysor/MockingBird/issues/10'
    # Create directories
    assert args.datasets_root.exists()
    args.out_dir.mkdir(exist_ok=True, parents=True)
```
toolbox/__init__.py

```
@@ -10,6 +10,7 @@ import traceback
import sys
import torch
import librosa
import re
from audioread.exceptions import NoBackendError

# Use this directory structure for your datasets, or modify it to fit your needs

@@ -36,6 +37,8 @@ recognized_datasets = [
    "VCTK-Corpus/wav48",
    "aidatatang_200zh/corpus/dev",
    "aidatatang_200zh/corpus/test",
    "aishell3/test/wav",
    "magicdata/train",
]

# Maximum of generated wavs to keep on memory

@@ -224,6 +227,13 @@ class Toolbox:
        self.init_synthesizer()

        texts = self.ui.text_prompt.toPlainText().split("\n")
        punctuation = '!,。、,'  # punctuate and split/clean text
        processed_texts = []
        for text in texts:
            for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
                if processed_text:
                    processed_texts.append(processed_text.strip())
        texts = processed_texts
        embed = self.ui.selected_utterance.embed
        embeds = [embed] * len(texts)
        specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
```
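The punctuation handling above splits each prompt into short clauses before synthesis. A small, self-contained illustration of that splitting follows; the sample sentence is made up.

```python
import re

punctuation = '!,。、,'  # same character set as in the toolbox
text = "大家好,欢迎使用工具箱。祝使用愉快!"
pieces = [p.strip() for p in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n') if p]
print(pieces)  # ['大家好', '欢迎使用工具箱', '祝使用愉快']
```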
vocoder/saved_models/pretrained/pretrained.pt (BIN, new file): binary file not shown.
vocoder_preprocess.py

```
@@ -17,7 +17,7 @@ if __name__ == "__main__":
        "Path to the directory containing your SV2TTS directory. If you specify both --in_dir and "
        "--out_dir, this argument won't be used.")
    parser.add_argument("--model_dir", type=str,
                        default="synthesizer/saved_models/pretrained/", help=\
                        default="synthesizer/saved_models/train3/", help=\
                        "Path to the pretrained model directory.")
    parser.add_argument("-i", "--in_dir", type=str, default=argparse.SUPPRESS, help= \
        "Path to the synthesizer directory that contains the mel spectrograms, the wavs and the "
```