Mirror of https://github.com/babysor/Realtime-Voice-Clone-Chinese.git
Synced 2026-02-04 02:54:07 +08:00

Comparing refactor...restruct-p (16 commits)
Commits:

- 26331fe019
- 712a53f557
- 24cb262c3f
- e469bd06ae
- cd20d21f3d
- 74a3fc97d0
- b402f9dbdf
- 85a53c9e05
- 028b131570
- 2a1890f9e1
- c91bc3208e
- dd1ea3e714
- e7313c514f
- f57d1a69b6
- ab7d692619
- f17e3b04e1
.dockerignore (new file)

*/saved_models
!vocoder/saved_models/pretrained/**
!encoder/saved_models/pretrained.pt
/datasets
.gitignore

@@ -14,8 +14,11 @@
 *.bcf
 *.toc
 *.sh
-*/saved_models
-!vocoder/saved_models/pretrained/**
-!encoder/saved_models/pretrained.pt
+data/ckpt
+!data/ckpt/vocoder/pretrained/**
+!data/ckpt/encoder/pretrained.pt
 wavs
 log
+!/docker-entrypoint.sh
+!/datasets_download/*.sh
+/datasets
.vscode/launch.json

@@ -15,7 +15,8 @@
             "name": "Python: Vocoder Preprocess",
             "type": "python",
             "request": "launch",
-            "program": "vocoder_preprocess.py",
+            "program": "control\\cli\\vocoder_preprocess.py",
             "cwd": "${workspaceFolder}",
             "console": "integratedTerminal",
             "args": ["..\\audiodata"]
         },
@@ -23,7 +24,8 @@
             "name": "Python: Vocoder Train",
             "type": "python",
             "request": "launch",
-            "program": "vocoder_train.py",
+            "program": "control\\cli\\vocoder_train.py",
             "cwd": "${workspaceFolder}",
             "console": "integratedTerminal",
             "args": ["dev", "..\\audiodata"]
         },
@@ -32,6 +34,7 @@
             "type": "python",
             "request": "launch",
             "program": "demo_toolbox.py",
             "cwd": "${workspaceFolder}",
             "console": "integratedTerminal",
             "args": ["-d","..\\audiodata"]
         },
@@ -40,6 +43,7 @@
             "type": "python",
             "request": "launch",
             "program": "demo_toolbox.py",
             "cwd": "${workspaceFolder}",
             "console": "integratedTerminal",
             "args": ["-d","..\\audiodata","-vc"]
         },
@@ -47,9 +51,9 @@
             "name": "Python: Synth Train",
             "type": "python",
             "request": "launch",
-            "program": "synthesizer_train.py",
+            "program": "train.py",
             "console": "integratedTerminal",
-            "args": ["my_run", "..\\"]
+            "args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
         },
         {
             "name": "Python: PPG Convert",
@@ -60,14 +64,6 @@
             "args": ["-c", ".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2.yaml",
                 "-m", ".\\ppg2mel\\saved_models\\best_loss_step_304000.pth", "--wav_dir", ".\\wavs\\input", "--ref_wav_path", ".\\wavs\\pkq.mp3", "-o", ".\\wavs\\output\\"
             ]
         },
-        {
-            "name": "GUI",
-            "type": "python",
-            "request": "launch",
-            "program": "mkgui\\base\\_cli.py",
-            "console": "integratedTerminal",
-            "args": []
-        },
     ]
 }
Dockerfile (new file)

FROM pytorch/pytorch:latest

RUN apt-get update && apt-get install -y build-essential ffmpeg parallel aria2 && apt-get clean

COPY ./requirements.txt /workspace/requirements.txt

RUN pip install -r requirements.txt && pip install webrtcvad-wheels

COPY . /workspace

VOLUME [ "/datasets", "/workspace/synthesizer/saved_models/" ]

ENV DATASET_MIRROR=default FORCE_RETRAIN=false TRAIN_DATASETS=aidatatang_200zh\ magicdata\ aishell3\ data_aishell TRAIN_SKIP_EXISTING=true

EXPOSE 8080

ENTRYPOINT [ "/workspace/docker-entrypoint.sh" ]
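The new Dockerfile exposes port 8080 and declares `/datasets` and the synthesizer model directory as volumes. A minimal sketch of building and running the image; the `mockingbird` tag, the host path, and the `--gpus all` flag are illustrative assumptions, not part of this diff:

```
# Build the image from the repository root (tag name is an example)
docker build -t mockingbird .

# Run it: mount a host folder as the /datasets volume and publish the web port
docker run --gpus all -p 8080:8080 -v "$PWD/datasets:/datasets" mockingbird
```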
README-CN.md

@@ -20,10 +20,15 @@

 ### Work in progress
 * Major GUI/client upgrade and merge
-[X] Initialize the framework `./mkgui` (based on streamlit + fastapi) and the [technical design](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
-[X] Add a demo page for Voice Cloning and Conversion
-[X] Add preprocessing and training pages for Voice Conversion
-[ ] Add preprocessing and training pages for the other models
+- [x] Initialize the framework `./mkgui` (based on streamlit + fastapi) and the [technical design](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
+- [x] Add a demo page for Voice Cloning and Conversion
+- [x] Add preprocessing and training pages for Voice Conversion
+- [ ] Add preprocessing and training pages for the other models
 * Upgrade the model backend to ESPnet2

@@ -122,7 +127,7 @@
 `python pre4ppg.py <datasets_root> -d {dataset} -n {number}`
 Supported arguments:
 * `-d {dataset}` selects the dataset; only aidatatang_200zh is supported and it is the default when omitted
-* `-n {number}` sets the number of parallel workers; on a CPU 11770k with 8 workers this takes 12 to 18 hours! Needs optimization
+* `-n {number}` sets the number of parallel workers; on a CPU 11700k with 8 workers this takes 12 to 18 hours! Needs optimization
 > If the downloaded `aidatatang_200zh` files are on drive D and the `train` folder path is `D:\data\aidatatang_200zh\corpus\train`, your `datasets_root` is `D:\data\`

 * Train the synthesizer; note that you must first download `ppg2mel.yaml` in the previous step and edit the paths inside it to point to the pretrained folders:

@@ -148,30 +153,30 @@
 |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
 |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |

 ## FAQ
 #### 1. Where can the datasets be downloaded?
 | Dataset | OpenSLR address | Other sources (Google Drive, Baidu netdisk, etc.) |
 | --- | ----------- | ---------------|
 | aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) |
 | magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) |
 | aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) |
 | data_aishell | [OpenSLR](https://www.openslr.org/33/) | |
 > After extracting aidatatang_200zh, you still need to select and extract all the archives under `aidatatang_200zh\corpus\train`

 #### 2. What does `<datasets_root>` mean?
 If a dataset path is `D:\data\aidatatang_200zh`, then `<datasets_root>` is `D:\data`

 #### 3. Running out of GPU memory while training
 When training the synthesizer: reduce the batch_size parameter in `synthesizer/hparams.py`
 ```
 // before
 tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
                 (2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
                 (2, 2e-4, 80_000, 12), #
                 (2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
                 (2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
                 (2, 1e-5, 640_000, 12)], # lr = learning rate
 // after
 tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
                 (2, 5e-4, 40_000, 8), # (r, lr, step, batch_size)
                 (2, 2e-4, 80_000, 8), #
@@ -180,15 +185,15 @@ tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
                 (2, 1e-5, 640_000, 8)], # lr = learning rate
 ```

 Vocoder, when preprocessing the dataset: reduce the batch_size parameter in `synthesizer/hparams.py`
 ```
 // before
 ### Data Preprocessing
 max_mel_frames = 900,
 rescale = True,
 rescaling_max = 0.9,
 synthesis_batch_size = 16, # For vocoder preprocessing and inference.
 // after
 ### Data Preprocessing
 max_mel_frames = 900,
 rescale = True,
@@ -196,16 +201,16 @@ tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
 synthesis_batch_size = 8, # For vocoder preprocessing and inference.
 ```

 Vocoder, when training the vocoder: reduce the batch_size parameter in `vocoder/wavernn/hparams.py`
 ```
 // before
 # Training
 voc_batch_size = 100
 voc_lr = 1e-4
 voc_gen_at_checkpoint = 5
 voc_pad = 2

 // after
 # Training
 voc_batch_size = 6
 voc_lr = 1e-4
@@ -214,13 +219,13 @@ voc_pad =2
 ```

 #### 4. Hitting `RuntimeError: Error(s) in loading state_dict for Tacotron: size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([70, 512]) from checkpoint, the shape in current model is torch.Size([75, 512]).`
 Please refer to issue [#37](https://github.com/babysor/MockingBird/issues/37)

 #### 5. How to improve CPU and GPU utilization?
 Adjust the batch_size parameter as appropriate

 #### 6. Getting "The paging file is too small for this operation to complete"
 Please refer to this [article](https://blog.csdn.net/qq_17755303/article/details/112564030) and increase the virtual memory to 100 GB (102400); for example, if the files sit on drive D, change the virtual memory of drive D

 #### 7. When is training considered done?
 First, the attention alignment must appear; second, the loss must get low enough, which depends on your hardware and dataset. For reference, my attention appeared after 18k steps, and the loss dropped below 0.4 after 50k steps
@@ -1,9 +1,9 @@
-from encoder.params_model import model_embedding_size as speaker_embedding_size
+from models.encoder.params_model import model_embedding_size as speaker_embedding_size
 from utils.argutils import print_args
 from utils.modelutils import check_model_paths
-from synthesizer.inference import Synthesizer
-from encoder import inference as encoder
-from vocoder import inference as vocoder
+from models.synthesizer.inference import Synthesizer
+from models.encoder import inference as encoder
+from models.vocoder import inference as vocoder
 from pathlib import Path
 import numpy as np
 import soundfile as sf

@@ -1,7 +1,10 @@
-from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aidatatang_200zh
-from utils.argutils import print_args
-from pathlib import Path
 import argparse
+from pathlib import Path
+
+from models.encoder.preprocess import (preprocess_aidatatang_200zh,
+                                       preprocess_librispeech, preprocess_voxceleb1,
+                                       preprocess_voxceleb2)
+from utils.argutils import print_args

 if __name__ == "__main__":
     class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):

@@ -1,5 +1,5 @@
 from utils.argutils import print_args
-from encoder.train import train
+from models.encoder.train import train
 from pathlib import Path
 import argparse

@@ -2,8 +2,8 @@ import sys
 import torch
 import argparse
 import numpy as np
-from utils.load_yaml import HpsYaml
-from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+from utils.hparams import HpsYaml
+from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver

 # For reproducibility, comment these may speed up training
 torch.backends.cudnn.deterministic = True

@@ -1,7 +1,7 @@
-from ppg2mel.preprocess import preprocess_dataset
 from pathlib import Path
 import argparse
+
+from models.ppg2mel.preprocess import preprocess_dataset

@@ -1,10 +1,9 @@
-from synthesizer.hparams import hparams
-from synthesizer.train import train
+from models.synthesizer.hparams import hparams
+from models.synthesizer.train import train
 from utils.argutils import print_args
 import argparse


-if __name__ == "__main__":
+def new_train():
     parser = argparse.ArgumentParser()
     parser.add_argument("run_id", type=str, help= \
         "Name for this model instance. If a model state from the same run ID was previously "
@@ -13,7 +12,7 @@ if __name__ == "__main__":
     parser.add_argument("syn_dir", type=str, default=argparse.SUPPRESS, help= \
         "Path to the synthesizer directory that contains the ground truth mel spectrograms, "
         "the wavs and the embeds.")
-    parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\
+    parser.add_argument("-m", "--models_dir", type=str, default=f"data/ckpt/synthesizer/", help=\
         "Path to the output directory that will contain the saved model weights and the logs.")
     parser.add_argument("-s", "--save_every", type=int, default=1000, help= \
         "Number of steps between updates of the model on the disk. Set to 0 to never save the "
@@ -28,10 +27,14 @@ if __name__ == "__main__":
     parser.add_argument("--hparams", default="",
                         help="Hyperparameter overrides as a comma-separated list of name=value "
                         "pairs")
-    args = parser.parse_args()
+    args, _ = parser.parse_known_args()
     print_args(args, parser)

     args.hparams = hparams.parse(args.hparams)

     # Run the training
     train(**vars(args))


+if __name__ == "__main__":
+    new_train()
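Since the trainer entry point moved from `synthesizer_train.py` to `train.py` (see the launch.json change above) and the parser now uses `parse_known_args()`, a hedged sketch of the new invocation, assuming the positional `run_id` and `syn_dir` arguments keep the meaning shown in this hunk; the paths and the `--type synth` flag mirror the launch.json example:

```
# run_id, then the SV2TTS synthesizer directory produced by preprocessing
python train.py --type synth my_run ../audiodata/SV2TTS/synthesizer
```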
control/cli/train_ppg2mel.py (new file)

import sys
import torch
import argparse
import numpy as np
from utils.hparams import HpsYaml
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver

# For reproducibility, comment these may speed up training
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def main():
    # Arguments
    parser = argparse.ArgumentParser(description=
        'Training PPG2Mel VC model.')
    parser.add_argument('--config', type=str,
                        help='Path to experiment config, e.g., config/vc.yaml')
    parser.add_argument('--name', default=None, type=str, help='Name for logging.')
    parser.add_argument('--logdir', default='log/', type=str,
                        help='Logging path.', required=False)
    parser.add_argument('--ckpdir', default='ppg2mel/saved_models/', type=str,
                        help='Checkpoint path.', required=False)
    parser.add_argument('--outdir', default='result/', type=str,
                        help='Decode output path.', required=False)
    parser.add_argument('--load', default=None, type=str,
                        help='Load pre-trained model (for training only)', required=False)
    parser.add_argument('--warm_start', action='store_true',
                        help='Load model weights only, ignore specified layers.')
    parser.add_argument('--seed', default=0, type=int,
                        help='Random seed for reproducable results.', required=False)
    parser.add_argument('--njobs', default=8, type=int,
                        help='Number of threads for dataloader/decoding.', required=False)
    parser.add_argument('--cpu', action='store_true', help='Disable GPU training.')
    parser.add_argument('--no-pin', action='store_true',
                        help='Disable pin-memory for dataloader')
    parser.add_argument('--test', action='store_true', help='Test the model.')
    parser.add_argument('--no-msg', action='store_true', help='Hide all messages.')
    parser.add_argument('--finetune', action='store_true', help='Finetune model')
    parser.add_argument('--oneshotvc', action='store_true', help='Oneshot VC model')
    parser.add_argument('--bilstm', action='store_true', help='BiLSTM VC model')
    parser.add_argument('--lsa', action='store_true', help='Use location-sensitive attention (LSA)')

    ###
    paras = parser.parse_args()
    setattr(paras, 'gpu', not paras.cpu)
    setattr(paras, 'pin_memory', not paras.no_pin)
    setattr(paras, 'verbose', not paras.no_msg)
    # Make the config dict dot visitable
    config = HpsYaml(paras.config)

    np.random.seed(paras.seed)
    torch.manual_seed(paras.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(paras.seed)

    print(">>> OneShot VC training ...")
    mode = "train"
    solver = Solver(config, paras, mode)
    solver.load_data()
    solver.set_model()
    solver.exec()
    print(">>> Oneshot VC train finished!")
    sys.exit(0)

if __name__ == "__main__":
    main()
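A usage sketch for the new CLI above; `config/vc.yaml` is the example path from the `--config` help text, and the flag choice is illustrative:

```
# Train the one-shot PPG2Mel voice-conversion model from an experiment config
python control/cli/train_ppg2mel.py --config config/vc.yaml --oneshotvc
```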
@@ -1,5 +1,5 @@
-from synthesizer.synthesize import run_synthesis
-from synthesizer.hparams import hparams
+from models.synthesizer.synthesize import run_synthesis
+from models.synthesizer.hparams import hparams
 from utils.argutils import print_args
 import argparse
 import os

@@ -1,7 +1,7 @@
 from utils.argutils import print_args
-from vocoder.wavernn.train import train
-from vocoder.hifigan.train import train as train_hifigan
-from vocoder.fregan.train import train as train_fregan
+from models.vocoder.wavernn.train import train
+from models.vocoder.hifigan.train import train as train_hifigan
+from models.vocoder.fregan.train import train as train_fregan
 from utils.util import AttrDict
 from pathlib import Path
 import argparse

@@ -2,24 +2,26 @@ from pydantic import BaseModel, Field
 import os
 from pathlib import Path
 from enum import Enum
-from encoder import inference as encoder
+from models.encoder import inference as encoder
 import librosa
 from scipy.io.wavfile import write
 import re
 import numpy as np
-from mkgui.base.components.types import FileContent
-from vocoder.hifigan import inference as gan_vocoder
-from synthesizer.inference import Synthesizer
+from control.mkgui.base.components.types import FileContent
+from models.vocoder.hifigan import inference as gan_vocoder
+from models.synthesizer.inference import Synthesizer
 from typing import Any, Tuple
 import matplotlib.pyplot as plt

 # Constants
-AUDIO_SAMPLES_DIR = f"samples{os.sep}"
-SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
-VOC_MODELS_DIRT = f"vocoder{os.sep}saved_models"
+AUDIO_SAMPLES_DIR = f"data{os.sep}samples{os.sep}"
+SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
+VOC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}vocoder"
 TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
 TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"
 if not os.path.isdir("wavs"):
     os.makedirs("wavs")

 # Load local sample audio as options TODO: load dataset
 if os.path.isdir(AUDIO_SAMPLES_DIR):

@@ -29,7 +31,7 @@ if os.path.isdir(SYN_MODELS_DIRT):
     synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
     print("Loaded synthesizer models: " + str(len(synthesizers)))
 else:
-    raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")
+    raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist. 请将模型文件位置移动到上述位置中进行重试!")

 if os.path.isdir(ENC_MODELS_DIRT):
     encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))

@@ -49,9 +51,11 @@ class Input(BaseModel):
         ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
     )
     local_audio_file: audio_input_selection = Field(
-        ..., alias="输入语音(本地wav)",
+        ..., alias="选择语音(本地wav)",
         description="选择本地语音文件."
     )
+    record_audio_file: FileContent = Field(default=None, alias="录制语音",
+                        description="录音.", is_recorder=True, mime_type="audio/wav")
     upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
                         description="拖拽或点击上传.", mime_type="audio/wav")
     encoder: encoders = Field(

@@ -101,7 +105,12 @@ def synthesize(input: Input) -> Output:
     gan_vocoder.load_model(Path(input.vocoder.value))

     # load file
-    if input.upload_audio_file != None:
+    if input.record_audio_file != None:
+        with open(TEMP_SOURCE_AUDIO, "w+b") as f:
+            f.write(input.record_audio_file.as_bytes())
+            f.seek(0)
+        wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
+    elif input.upload_audio_file != None:
         with open(TEMP_SOURCE_AUDIO, "w+b") as f:
             f.write(input.upload_audio_file.as_bytes())
             f.seek(0)
@@ -1,27 +1,26 @@
-from synthesizer.inference import Synthesizer
-from pydantic import BaseModel, Field
-from encoder import inference as speacker_encoder
-import torch
 import os
 from pathlib import Path
 from enum import Enum
-import ppg_extractor as Extractor
-import ppg2mel as Convertor
-import librosa
-from scipy.io.wavfile import write
 import re
 import numpy as np
-from mkgui.base.components.types import FileContent
-from vocoder.hifigan import inference as gan_vocoder
-from pathlib import Path
 from typing import Any, Tuple
-import matplotlib.pyplot as plt

+import librosa
+import matplotlib.pyplot as plt
+import torch
+from pydantic import BaseModel, Field
+from scipy.io.wavfile import write
+
+import models.ppg2mel as Convertor
+import models.ppg_extractor as Extractor
+from control.mkgui.base.components.types import FileContent
+from models.encoder import inference as speacker_encoder
+from models.synthesizer.inference import Synthesizer
+from models.vocoder.hifigan import inference as gan_vocoder

 # Constants
-AUDIO_SAMPLES_DIR = f'sample{os.sep}'
-EXT_MODELS_DIRT = f'ppg_extractor{os.sep}saved_models'
-CONV_MODELS_DIRT = f'ppg2mel{os.sep}saved_models'
-VOC_MODELS_DIRT = f'vocoder{os.sep}saved_models'
+AUDIO_SAMPLES_DIR = f'data{os.sep}samples{os.sep}'
+EXT_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg_extractor'
+CONV_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg2mel'
+VOC_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}vocoder'
 TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
 TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
 TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'

@@ -132,9 +131,10 @@ def convert(input: Input) -> Output:

     ppg = extractor.extract_from_wav(src_wav)
     # Import necessary dependency of Voice Conversion
-    from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
+    from utils.f0_utils import (compute_f0, compute_mean_std, f02lf0,
+                                get_converted_lf0uv)
     ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
-    speacker_encoder.load_model(Path("encoder{os.sep}saved_models{os.sep}pretrained_bak_5805000.pt"))
+    speacker_encoder.load_model(Path(f"data{os.sep}ckpt{os.sep}encoder{os.sep}pretrained_bak_5805000.pt"))
     embed = speacker_encoder.embed_utterance(ref_wav)
     lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
     min_len = min(ppg.shape[1], len(lf0_uv))
@@ -37,6 +37,12 @@ def is_single_file_property(property: Dict) -> bool:
     # TODO: binary?
     return property.get("format") == "byte"

+def is_single_autio_property(property: Dict) -> bool:
+    if property.get("type") != "string":
+        return False
+    # TODO: binary?
+    return property.get("format") == "bytes"
+

 def is_single_directory_property(property: Dict) -> bool:
     if property.get("type") != "string":

@@ -2,7 +2,7 @@ import datetime
 import inspect
 import mimetypes
 import sys
-from os import getcwd, unlink
+from os import getcwd, unlink, path
 from platform import system
 from tempfile import NamedTemporaryFile
 from typing import Any, Callable, Dict, List, Type
@@ -14,14 +14,13 @@ from fastapi.encoders import jsonable_encoder
 from loguru import logger
 from pydantic import BaseModel, ValidationError, parse_obj_as

-from mkgui.base import Opyrator
-from mkgui.base.core import name_to_title
-from mkgui.base.ui import schema_utils
-from mkgui.base.ui.streamlit_utils import CUSTOM_STREAMLIT_CSS
+from control.mkgui.base import Opyrator
+from control.mkgui.base.core import name_to_title
+from . import schema_utils
+from .streamlit_utils import CUSTOM_STREAMLIT_CSS

 STREAMLIT_RUNNER_SNIPPET = """
-from mkgui.base.ui import render_streamlit_ui
-from mkgui.base import Opyrator
+from control.mkgui.base.ui import render_streamlit_ui

 import streamlit as st

@@ -243,7 +242,14 @@ class InputUI:
         file_extension = None
         if "mime_type" in property:
             file_extension = mimetypes.guess_extension(property["mime_type"])

+        if "is_recorder" in property:
+            from audio_recorder_streamlit import audio_recorder
+            audio_bytes = audio_recorder()
+            if audio_bytes:
+                streamlit_app.audio(audio_bytes, format="audio/wav")
+            return audio_bytes
+
         uploaded_file = streamlit_app.file_uploader(
             **streamlit_kwargs, accept_multiple_files=False, type=file_extension
         )
@@ -263,6 +269,39 @@ class InputUI:
             streamlit_app.video(bytes, format=property.get("mime_type"))
         return bytes

+    def _render_single_audio_input(
+        self, streamlit_app: st, key: str, property: Dict
+    ) -> Any:
+        # streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
+        from audio_recorder_streamlit import audio_recorder
+        audio_bytes = audio_recorder()
+        if audio_bytes:
+            streamlit_app.audio(audio_bytes, format="audio/wav")
+        return audio_bytes
+
+        # file_extension = None
+        # if "mime_type" in property:
+        #     file_extension = mimetypes.guess_extension(property["mime_type"])
+
+        # uploaded_file = streamlit_app.file_uploader(
+        #     **streamlit_kwargs, accept_multiple_files=False, type=file_extension
+        # )
+        # if uploaded_file is None:
+        #     return None
+
+        # bytes = uploaded_file.getvalue()
+        # if property.get("mime_type"):
+        #     if is_compatible_audio(property["mime_type"]):
+        #         # Show audio
+        #         streamlit_app.audio(bytes, format=property.get("mime_type"))
+        #     if is_compatible_image(property["mime_type"]):
+        #         # Show image
+        #         streamlit_app.image(bytes)
+        #     if is_compatible_video(property["mime_type"]):
+        #         # Show video
+        #         streamlit_app.video(bytes, format=property.get("mime_type"))
+        #     return bytes
+
     def _render_single_string_input(
         self, streamlit_app: st, key: str, property: Dict
     ) -> Any:
@@ -807,21 +846,20 @@ class OutputUI:

 def getOpyrator(mode: str) -> Opyrator:
     if mode == None or mode.startswith('VC'):
-        from mkgui.app_vc import convert
+        from control.mkgui.app_vc import convert
         return Opyrator(convert)
     if mode == None or mode.startswith('预处理'):
-        from mkgui.preprocess import preprocess
+        from control.mkgui.preprocess import preprocess
         return Opyrator(preprocess)
     if mode == None or mode.startswith('模型训练'):
-        from mkgui.train import train
+        from control.mkgui.train import train
         return Opyrator(train)
     if mode == None or mode.startswith('模型训练(VC)'):
-        from mkgui.train_vc import train_vc
+        from control.mkgui.train_vc import train_vc
         return Opyrator(train_vc)
-    from mkgui.app import synthesize
+    from control.mkgui.app import synthesize
     return Opyrator(synthesize)


 def render_streamlit_ui() -> None:
     # init
     session_state = st.session_state
@@ -845,7 +883,7 @@ def render_streamlit_ui() -> None:
     col2.title(title)
     col2.markdown("欢迎使用MockingBird Web 2")

-    image = Image.open('.\\mkgui\\static\\mb.png')
+    image = Image.open(path.join('control','mkgui', 'static', 'mb.png'))
     col1.image(image)

     st.markdown("---")
@@ -853,6 +891,13 @@ def render_streamlit_ui() -> None:

     with left:
         st.header("Control 控制")
+        # if session_state.mode in ["AI拟音", "VC拟音"] :
+        #     from audiorecorder import audiorecorder
+        #     audio = audiorecorder("Click to record", "Recording...")
+        #     if len(audio) > 0:
+        #         # To play audio in frontend:
+        #         st.audio(audio.tobytes())
+
         InputUI(session_state=session_state, input_class=opyrator.input_type).render_ui(st)
         execute_selected = st.button(opyrator.action)
         if execute_selected:
@@ -6,8 +6,8 @@ from typing import Any, Tuple


 # Constants
-EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"


 if os.path.isdir(EXT_MODELS_DIRT):
@@ -83,7 +83,7 @@ def preprocess(input: Input) -> Output:
     """Preprocess(预处理)"""
     finished = 0
     if input.model == Model.VC_PPG2MEL:
-        from ppg2mel.preprocess import preprocess_dataset
+        from models.ppg2mel.preprocess import preprocess_dataset
         finished = preprocess_dataset(
             datasets_root=Path(input.datasets_root),
             dataset=input.dataset,
(Binary image changed; 5.6 KiB before and after.)
@@ -3,17 +3,17 @@ import os
 from pathlib import Path
 from enum import Enum
 from typing import Any
-from synthesizer.hparams import hparams
-from synthesizer.train import train as synt_train
+from models.synthesizer.hparams import hparams
+from models.synthesizer.train import train as synt_train

 # Constants
-SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"


-# EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
-# CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
-# ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+# EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
+# CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
+# ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"

 # Pre-Load models
 if os.path.isdir(SYN_MODELS_DIRT):
@@ -96,7 +96,7 @@ def train(input: Input) -> Output:
     synt_train(
         input.run_id,
         input.input_root,
-        f"synthesizer{os.sep}saved_models",
+        f"data{os.sep}ckpt{os.sep}synthesizer",
         input.save_every,
         input.backup_every,
         input.log_every,

@@ -4,14 +4,14 @@ from pathlib import Path
 from enum import Enum
 from typing import Any, Tuple
 import numpy as np
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
 from utils.util import AttrDict
 import torch

 # Constants
-EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
-CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
+CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"


 if os.path.isdir(EXT_MODELS_DIRT):
@@ -144,7 +144,7 @@ def train_vc(input: Input) -> Output:
     if torch.cuda.is_available():
         torch.cuda.manual_seed_all(input.seed)
     mode = "train"
-    from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+    from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
     solver = Solver(config, params, mode)
     solver.load_data()
     solver.set_model()
@@ -1,12 +1,12 @@
-from toolbox.ui import UI
-from encoder import inference as encoder
-from synthesizer.inference import Synthesizer
-from vocoder.wavernn import inference as rnn_vocoder
-from vocoder.hifigan import inference as gan_vocoder
-from vocoder.fregan import inference as fgan_vocoder
+from control.toolbox.ui import UI
+from models.encoder import inference as encoder
+from models.synthesizer.inference import Synthesizer
+from models.vocoder.wavernn import inference as rnn_vocoder
+from models.vocoder.hifigan import inference as gan_vocoder
+from models.vocoder.fregan import inference as fgan_vocoder
 from pathlib import Path
 from time import perf_counter as timer
-from toolbox.utterance import Utterance
+from control.toolbox.utterance import Utterance
 import numpy as np
 import traceback
 import sys
@@ -38,8 +38,8 @@ recognized_datasets = [
     "VoxCeleb2/dev/aac",
     "VoxCeleb2/test/aac",
     "VCTK-Corpus/wav48",
-    "aidatatang_200zh/corpus/dev",
     "aidatatang_200zh/corpus/test",
+    "aidatatang_200zh/corpus/train",
     "aishell3/test/wav",
     "magicdata/train",
 ]
@@ -397,7 +397,7 @@ class Toolbox:
         self.ui.log("Loading the extractor %s... " % model_fpath)
         self.ui.set_loading(1)
         start = timer()
-        import ppg_extractor as extractor
+        import models.ppg_extractor as extractor
         self.extractor = extractor.load_model(model_fpath)
         self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
         self.ui.set_loading(0)
@@ -409,7 +409,7 @@ class Toolbox:
         self.ui.log("Loading the convertor %s... " % model_fpath)
         self.ui.set_loading(1)
         start = timer()
-        import ppg2mel as convertor
+        import models.ppg2mel as convertor
         self.convertor = convertor.load_model( model_fpath)
         self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
         self.ui.set_loading(0)
(Binary image changed; 5.6 KiB before and after.)
@@ -3,9 +3,8 @@ from PyQt5 import QtGui
 from PyQt5.QtWidgets import *
 import matplotlib.pyplot as plt
 from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
-from matplotlib.figure import Figure
-from encoder.inference import plot_embedding_as_heatmap
-from toolbox.utterance import Utterance
+from models.encoder.inference import plot_embedding_as_heatmap
+from control.toolbox.utterance import Utterance
 from pathlib import Path
 from typing import List, Set
 import sounddevice as sd
@@ -274,7 +273,9 @@ class UI(QDialog):
         if datasets_root is None or len(datasets) == 0:
             msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \
                 if datasets_root is None else "o not have any of the recognized datasets" \
-                " in %s" % datasets_root)
+                " in %s \n" \
+                "Please note use 'E:\datasets' as root path " \
+                "instead of 'E:\datasets\aidatatang_200zh\corpus\test' as an example " % datasets_root)
             self.log(msg)
             msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \
                 "can still use the toolbox by recording samples yourself." % \
datasets_download/CN.txt (new file)

https://openslr.magicdatatech.com/resources/62/aidatatang_200zh.tgz
  out=download/aidatatang_200zh.tgz
https://openslr.magicdatatech.com/resources/68/train_set.tar.gz
  out=download/magicdata.tgz
https://openslr.magicdatatech.com/resources/93/data_aishell3.tgz
  out=download/aishell3.tgz
https://openslr.magicdatatech.com/resources/33/data_aishell.tgz
  out=download/data_aishell.tgz

datasets_download/EU.txt (new file)

https://openslr.elda.org/resources/62/aidatatang_200zh.tgz
  out=download/aidatatang_200zh.tgz
https://openslr.elda.org/resources/68/train_set.tar.gz
  out=download/magicdata.tgz
https://openslr.elda.org/resources/93/data_aishell3.tgz
  out=download/aishell3.tgz
https://openslr.elda.org/resources/33/data_aishell.tgz
  out=download/data_aishell.tgz

datasets_download/US.txt (new file)

https://us.openslr.org/resources/62/aidatatang_200zh.tgz
  out=download/aidatatang_200zh.tgz
https://us.openslr.org/resources/68/train_set.tar.gz
  out=download/magicdata.tgz
https://us.openslr.org/resources/93/data_aishell3.tgz
  out=download/aishell3.tgz
https://us.openslr.org/resources/33/data_aishell.tgz
  out=download/data_aishell.tgz

datasets_download/datasets.sha256sum (new file)

0c0ace77fe8ee77db8d7542d6eb0b7ddf09b1bfb880eb93a7fbdbf4611e9984b  /datasets/download/aidatatang_200zh.tgz
be2507d431ad59419ec871e60674caedb2b585f84ffa01fe359784686db0e0cc  /datasets/download/aishell3.tgz
a4a0313cde0a933e0e01a451f77de0a23d6c942f4694af5bb7f40b9dc38143fe  /datasets/download/data_aishell.tgz
1d2647c614b74048cfe16492570cc5146d800afdc07483a43b31809772632143  /datasets/download/magicdata.tgz

datasets_download/default.txt (new file)

https://www.openslr.org/resources/62/aidatatang_200zh.tgz
  out=download/aidatatang_200zh.tgz
https://www.openslr.org/resources/68/train_set.tar.gz
  out=download/magicdata.tgz
https://www.openslr.org/resources/93/data_aishell3.tgz
  out=download/aishell3.tgz
https://www.openslr.org/resources/33/data_aishell.tgz
  out=download/data_aishell.tgz
datasets_download/download.sh (new executable file)

#!/usr/bin/env bash

set -Eeuo pipefail

aria2c -x 10 --disable-ipv6 --input-file /workspace/datasets_download/${DATASET_MIRROR}.txt --dir /datasets --continue

echo "Verifying sha256sum..."
parallel --will-cite -a /workspace/datasets_download/datasets.sha256sum "echo -n {} | sha256sum -c"
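The `${DATASET_MIRROR}` variable selects one of the mirror lists above (CN, EU, US, or default). A sketch of invoking the script by hand, inside the container layout it assumes (`/workspace`, `/datasets`):

```
# Download from the CN mirror list; aria2c --continue resumes partial files
DATASET_MIRROR=CN /workspace/datasets_download/download.sh
```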
datasets_download/extract.sh (new executable file)

#!/usr/bin/env bash

set -Eeuo pipefail

mkdir -p /datasets/aidatatang_200zh
if [ -z "$(ls -A /datasets/aidatatang_200zh)" ] ; then
    tar xvz --directory /datasets/ -f /datasets/download/aidatatang_200zh.tgz --exclude 'aidatatang_200zh/corpus/dev/*' --exclude 'aidatatang_200zh/corpus/test/*'
    cd /datasets/aidatatang_200zh/corpus/train/
    cat *.tar.gz | tar zxvf - -i
    rm -f *.tar.gz
fi

mkdir -p /datasets/magicdata
if [ -z "$(ls -A /datasets/magicdata)" ] ; then
    tar xvz --directory /datasets/magicdata -f /datasets/download/magicdata.tgz train/
fi

mkdir -p /datasets/aishell3
if [ -z "$(ls -A /datasets/aishell3)" ] ; then
    tar xvz --directory /datasets/aishell3 -f /datasets/download/aishell3.tgz train/
fi

mkdir -p /datasets/data_aishell
if [ -z "$(ls -A /datasets/data_aishell)" ] ; then
    tar xvz --directory /datasets/ -f /datasets/download/data_aishell.tgz
    cd /datasets/data_aishell/wav/
    cat *.tar.gz | tar zxvf - -i --exclude 'dev/*' --exclude 'test/*'
    rm -f *.tar.gz
fi
@@ -1,5 +1,5 @@
 from pathlib import Path
-from toolbox import Toolbox
+from control.toolbox import Toolbox
 from utils.argutils import print_args
 from utils.modelutils import check_model_paths
 import argparse
@@ -17,15 +17,15 @@ if __name__ == '__main__':
         "supported datasets.", default=None)
     parser.add_argument("-vc", "--vc_mode", action="store_true",
                         help="Voice Conversion Mode(PPG based)")
-    parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
+    parser.add_argument("-e", "--enc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}encoder",
                         help="Directory containing saved encoder models")
-    parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
+    parser.add_argument("-s", "--syn_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}synthesizer",
                         help="Directory containing saved synthesizer models")
-    parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
+    parser.add_argument("-v", "--voc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}vocoder",
                         help="Directory containing saved vocoder models")
-    parser.add_argument("-ex", "--extractor_models_dir", type=Path, default="ppg_extractor/saved_models",
+    parser.add_argument("-ex", "--extractor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg_extractor",
                         help="Directory containing saved extrator models")
-    parser.add_argument("-cv", "--convertor_models_dir", type=Path, default="ppg2mel/saved_models",
+    parser.add_argument("-cv", "--convertor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg2mel",
                         help="Directory containing saved convert models")
     parser.add_argument("--cpu", action="store_true", help=\
         "If True, processing is done on CPU, even when a GPU is available.")
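With the model-directory defaults now pointing at `data/ckpt/...`, the toolbox can be launched without naming any model directory when checkpoints live there; a sketch matching the launch.json arguments earlier in this diff (the datasets path is an example):

```
# Open the toolbox against a datasets root; add -vc for voice-conversion mode
python demo_toolbox.py -d ../audiodata
```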
docker-compose.yml (new file)

version: '3.8'

services:
  server:
    image: mockingbird:latest
    build: .
    volumes:
      - ./datasets:/datasets
      - ./synthesizer/saved_models:/workspace/synthesizer/saved_models
    environment:
      - DATASET_MIRROR=US
      - FORCE_RETRAIN=false
      - TRAIN_DATASETS=aidatatang_200zh magicdata aishell3 data_aishell
      - TRAIN_SKIP_EXISTING=true
    ports:
      - 8080:8080
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: [ '0' ]
              capabilities: [ gpu ]
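A sketch of starting the service defined above; note the `deploy.resources` GPU reservation assumes the NVIDIA container toolkit is installed on the host:

```
# Build (if needed) and start the server service on port 8080
docker compose up --build
```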
docker-entrypoint.sh (new executable file)

#!/usr/bin/env bash

if [ -z "$(ls -A /workspace/synthesizer/saved_models)" ] || [ "$FORCE_RETRAIN" = true ] ; then
    /workspace/datasets_download/download.sh
    /workspace/datasets_download/extract.sh
    for DATASET in ${TRAIN_DATASETS}
    do
        if [ "$TRAIN_SKIP_EXISTING" = true ] ; then
            python pre.py /datasets -d ${DATASET} -n $(nproc) --skip_existing
        else
            python pre.py /datasets -d ${DATASET} -n $(nproc)
        fi
    done
    python synthesizer_train.py mandarin /datasets/SV2TTS/synthesizer
fi

python web.py
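Because the entrypoint only downloads and retrains when the model directory is empty or `FORCE_RETRAIN` is true, a retrain can be forced without rebuilding the image; a sketch using the Compose service defined above:

```
# Override the environment for one run to force preprocessing and training
docker compose run -e FORCE_RETRAIN=true server
```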
@@ -1,2 +0,0 @@
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
Binary file not shown.
gen_voice.py

@@ -1,23 +1,15 @@
-from encoder.params_model import model_embedding_size as speaker_embedding_size
-from utils.argutils import print_args
-from utils.modelutils import check_model_paths
-from synthesizer.inference import Synthesizer
-from encoder import inference as encoder
-from vocoder.wavernn import inference as rnn_vocoder
-from vocoder.hifigan import inference as gan_vocoder
+from models.synthesizer.inference import Synthesizer
+from models.encoder import inference as encoder
+from models.vocoder.hifigan import inference as gan_vocoder
 from pathlib import Path
 import numpy as np
 import soundfile as sf
 import librosa
 import argparse
 import torch
 import sys
 import os
 import re
 import cn2an
 import glob

 from audioread.exceptions import NoBackendError
 vocoder = gan_vocoder

 def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):

@@ -1,5 +1,5 @@
 from scipy.ndimage.morphology import binary_dilation
-from encoder.params_data import *
+from models.encoder.params_data import *
 from pathlib import Path
 from typing import Optional, Union
 from warnings import warn

models/encoder/data_objects/__init__.py (new file)

from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
@@ -1,5 +1,5 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.utterance import Utterance
+from models.encoder.data_objects.random_cycler import RandomCycler
+from models.encoder.data_objects.utterance import Utterance
 from pathlib import Path

 # Contains the set of utterances of a single speaker

@@ -1,6 +1,6 @@
 import numpy as np
 from typing import List
-from encoder.data_objects.speaker import Speaker
+from models.encoder.data_objects.speaker import Speaker

 class SpeakerBatch:
     def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):

@@ -1,7 +1,7 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.speaker_batch import SpeakerBatch
-from encoder.data_objects.speaker import Speaker
-from encoder.params_data import partials_n_frames
+from models.encoder.data_objects.random_cycler import RandomCycler
+from models.encoder.data_objects.speaker_batch import SpeakerBatch
+from models.encoder.data_objects.speaker import Speaker
+from models.encoder.params_data import partials_n_frames
 from torch.utils.data import Dataset, DataLoader
 from pathlib import Path

@@ -1,8 +1,8 @@
-from encoder.params_data import *
-from encoder.model import SpeakerEncoder
-from encoder.audio import preprocess_wav   # We want to expose this function from here
+from models.encoder.params_data import *
+from models.encoder.model import SpeakerEncoder
+from models.encoder.audio import preprocess_wav   # We want to expose this function from here
 from matplotlib import cm
-from encoder import audio
+from models.encoder import audio
 from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np

@@ -1,5 +1,5 @@
-from encoder.params_model import *
-from encoder.params_data import *
+from models.encoder.params_model import *
+from models.encoder.params_data import *
 from scipy.interpolate import interp1d
 from sklearn.metrics import roc_curve
 from torch.nn.utils import clip_grad_norm_
@@ -1,8 +1,8 @@
 from multiprocess.pool import ThreadPool
-from encoder.params_data import *
-from encoder.config import librispeech_datasets, anglophone_nationalites
+from models.encoder.params_data import *
+from models.encoder.config import librispeech_datasets, anglophone_nationalites
 from datetime import datetime
-from encoder import audio
+from models.encoder import audio
 from pathlib import Path
 from tqdm import tqdm
 import numpy as np
@@ -22,7 +22,7 @@ class DatasetLog:
         self._log_params()

     def _log_params(self):
-        from encoder import params_data
+        from models.encoder import params_data
         self.write_line("Parameter values:")
         for param_name in (p for p in dir(params_data) if not p.startswith("__")):
             value = getattr(params_data, param_name)

@@ -1,7 +1,7 @@
-from encoder.visualizations import Visualizations
-from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
-from encoder.params_model import *
-from encoder.model import SpeakerEncoder
+from models.encoder.visualizations import Visualizations
+from models.encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
+from models.encoder.params_model import *
+from models.encoder.model import SpeakerEncoder
 from utils.profiler import Profiler
 from pathlib import Path
 import torch

@@ -1,4 +1,4 @@
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
 from datetime import datetime
 from time import perf_counter as timer
 import matplotlib.pyplot as plt
@@ -65,8 +65,8 @@ class Visualizations:
     def log_params(self):
         if self.disabled:
             return
-        from encoder import params_data
-        from encoder import params_model
+        from models.encoder import params_data
+        from models.encoder import params_model
         param_string = "<b>Model parameters</b>:<br>"
         for param_name in (p for p in dir(params_model) if not p.startswith("__")):
             value = getattr(params_model, param_name)
@@ -15,7 +15,7 @@ from .rnn_decoder_mol import Decoder
 from .utils.cnn_postnet import Postnet
 from .utils.vc_utils import get_mask_from_lengths

-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml

 class MelDecoderMOLv2(AbsMelDecoder):
     """Use an encoder to preprocess ppg."""

@@ -7,10 +7,10 @@ from pathlib import Path
 import soundfile
 import resampy

-from ppg_extractor import load_model
+from models.ppg_extractor import load_model
 import encoder.inference as Encoder
-from encoder.audio import preprocess_wav
-from encoder import audio
+from models.encoder.audio import preprocess_wav
+from models.encoder import audio
 from utils.f0_utils import compute_f0

 from torch.multiprocessing import Pool, cpu_count

@@ -2,8 +2,8 @@ import sys
 import torch
 import argparse
 import numpy as np
-from utils.load_yaml import HpsYaml
-from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+from utils.hparams import HpsYaml
+from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver

 # For reproducibility, comment these may speed up training
 torch.backends.cudnn.deterministic = True

@@ -8,7 +8,6 @@ from torch.utils.tensorboard import SummaryWriter

 from .option import default_hparas
 from utils.util import human_format, Timer
-from utils.load_yaml import HpsYaml


 class BaseSolver():

@@ -14,7 +14,7 @@ from utils.data_load import OneshotVcDataset, MultiSpkVcCollate
 from .loss import MaskedMSELoss
 from .optim import Optimizer
 from utils.util import human_format
-from ppg2mel import MelDecoderMOLv2
+from models.ppg2mel import MelDecoderMOLv2


 class Solver(BaseSolver):
models/ppg_extractor/encoder/__init__.py (new, empty file)

Some files were not shown because too many files have changed in this diff.