Mirror of https://github.com/babysor/Realtime-Voice-Clone-Chinese.git
Synced 2026-02-03 18:43:41 +08:00

Compare commits: dev ... restruct-p (34 commits)
| Author | SHA1 | Date |
| --- | --- | --- |
|  | 26331fe019 |  |
|  | 712a53f557 |  |
|  | 24cb262c3f |  |
|  | e469bd06ae |  |
|  | cd20d21f3d |  |
|  | 74a3fc97d0 |  |
|  | b402f9dbdf |  |
|  | 85a53c9e05 |  |
|  | 028b131570 |  |
|  | 2a1890f9e1 |  |
|  | c91bc3208e |  |
|  | dd1ea3e714 |  |
|  | e7313c514f |  |
|  | f57d1a69b6 |  |
|  | ab7d692619 |  |
|  | f17e3b04e1 |  |
|  | 6abdd0ebf0 |  |
|  | 400a7207e3 |  |
|  | 6f023e313d |  |
|  | a39b6d3117 |  |
|  | 885225045d |  |
|  | ee643d7cbc |  |
|  | 6a793cea84 |  |
|  | 7317ba5ffe |  |
|  | 05f886162c |  |
|  | e726c2eb12 |  |
|  | c00474525a |  |
|  | 350b190662 |  |
|  | 0caed984e3 |  |
|  | c5d03fb3cb |  |
|  | 7f799d322f |  |
|  | a1f2e4a790 |  |
|  | b136f80f43 |  |
|  | f082a82420 |  |
4 .dockerignore (new file)

@@ -0,0 +1,4 @@
*/saved_models
!vocoder/saved_models/pretrained/**
!encoder/saved_models/pretrained.pt
/datasets
12 .gitignore (vendored)

@@ -13,10 +13,12 @@
 *.bbl
 *.bcf
 *.toc
-*.wav
 *.sh
-*/saved_models
-!vocoder/saved_models/pretrained/**
-!encoder/saved_models/pretrained.pt
+data/ckpt
+!data/ckpt/vocoder/pretrained/**
+!data/ckpt/encoder/pretrained.pt
+wavs
 log
+!/docker-entrypoint.sh
+!/datasets_download/*.sh
 /datasets
14 .vscode/launch.json (vendored)

@@ -15,7 +15,8 @@
             "name": "Python: Vocoder Preprocess",
             "type": "python",
             "request": "launch",
-            "program": "vocoder_preprocess.py",
+            "program": "control\\cli\\vocoder_preprocess.py",
             "cwd": "${workspaceFolder}",
             "console": "integratedTerminal",
+            "args": ["..\\audiodata"]
         },
@@ -23,7 +24,8 @@
             "name": "Python: Vocoder Train",
             "type": "python",
             "request": "launch",
-            "program": "vocoder_train.py",
+            "program": "control\\cli\\vocoder_train.py",
             "cwd": "${workspaceFolder}",
             "console": "integratedTerminal",
+            "args": ["dev", "..\\audiodata"]
         },
@@ -32,6 +34,7 @@
             "type": "python",
             "request": "launch",
             "program": "demo_toolbox.py",
             "cwd": "${workspaceFolder}",
             "console": "integratedTerminal",
+            "args": ["-d","..\\audiodata"]
         },
@@ -40,6 +43,7 @@
             "type": "python",
             "request": "launch",
             "program": "demo_toolbox.py",
             "cwd": "${workspaceFolder}",
             "console": "integratedTerminal",
+            "args": ["-d","..\\audiodata","-vc"]
         },
@@ -47,9 +51,9 @@
             "name": "Python: Synth Train",
             "type": "python",
             "request": "launch",
-            "program": "synthesizer_train.py",
+            "program": "train.py",
             "console": "integratedTerminal",
-            "args": ["my_run", "..\\"]
+            "args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
         },
         {
             "name": "Python: PPG Convert",
@@ -60,6 +64,6 @@
             "args": ["-c", ".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2.yaml",
                 "-m", ".\\ppg2mel\\saved_models\\best_loss_step_304000.pth", "--wav_dir", ".\\wavs\\input", "--ref_wav_path", ".\\wavs\\pkq.mp3", "-o", ".\\wavs\\output\\"
             ]
-        },
+        }
     ]
 }
17 Dockerfile (new file)

@@ -0,0 +1,17 @@
FROM pytorch/pytorch:latest

RUN apt-get update && apt-get install -y build-essential ffmpeg parallel aria2 && apt-get clean

COPY ./requirements.txt /workspace/requirements.txt

RUN pip install -r requirements.txt && pip install webrtcvad-wheels

COPY . /workspace

VOLUME [ "/datasets", "/workspace/synthesizer/saved_models/" ]

ENV DATASET_MIRROR=default FORCE_RETRAIN=false TRAIN_DATASETS=aidatatang_200zh\ magicdata\ aishell3\ data_aishell TRAIN_SKIP_EXISTING=true

EXPOSE 8080

ENTRYPOINT [ "/workspace/docker-entrypoint.sh" ]
79 README-CN.md

@@ -18,6 +18,20 @@
 🌍 **Webserver Ready** 可伺服你的训练结果,供远程调用

+### 进行中的工作
+* GUI/客户端大升级与合并
+  - [x] 初始化框架 `./mkgui` (基于streamlit + fastapi)和 [技术设计](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
+  - [x] 增加 Voice Cloning and Conversion的演示页面
+  - [x] 增加Voice Conversion的预处理preprocessing 和训练 training 页面
+  - [ ] 增加其他的预处理preprocessing 和训练 training 页面
+* 模型后端基于ESPnet2升级
+
 ## 开始
 ### 1. 安装要求
 > 按照原始存储库测试您是否已准备好所有环境。
@@ -68,7 +82,7 @@
 对效果影响不大,已经预置3款,如果希望自己训练可以参考以下命令。
 * 预处理数据:
 `python vocoder_preprocess.py <datasets_root> -m <synthesizer_model_path>`
-> `<datasets_root>`替换为你的数据集目录,`<synthesizer_model_path>`替换为一个你最好的synthesizer模型目录,例如 *sythensizer\saved_mode\xxx*
+> `<datasets_root>`替换为你的数据集目录,`<synthesizer_model_path>`替换为一个你最好的synthesizer模型目录,例如 *sythensizer\saved_models\xxx*

 * 训练wavernn声码器:
@@ -78,19 +92,17 @@
 * 训练hifigan声码器:
 `python vocoder_train.py <trainid> <datasets_root> hifigan`
 > `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型

+* 训练fregan声码器:
+`python vocoder_train.py <trainid> <datasets_root> --config config.json fregan`
+> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
+* 将GAN声码器的训练切换为多GPU模式:修改GAN文件夹下.json文件中的"num_gpus"参数
 ### 3. 启动程序或工具箱
 您可以尝试使用以下命令:

-### 3.1 启动Web程序:
+### 3.1 启动Web程序(v2):
 `python web.py`
 运行成功后在浏览器打开地址, 默认为 `http://localhost:8080`
 
 > 注:目前界面比较buggy,
 > * 第一次点击`录制`要等待几秒浏览器正常启动录音,否则会有重音
 > * 录制结束不要再点`录制`而是`停止`
 > * 仅支持手动新录音(16khz), 不支持超过4MB的录音,最佳长度在5~15秒
 > * 默认使用第一个找到的模型,有动手能力的可以看代码修改 `web\__init__.py`。

 ### 3.2 启动工具箱:
 `python demo_toolbox.py -d <datasets_root>`
@@ -101,12 +113,12 @@
 ### 4. 番外:语音转换Voice Conversion(PPG based)
 想像柯南拿着变声器然后发出毛利小五郎的声音吗?本项目现基于PPG-VC,引入额外两个模块(PPG extractor + PPG2Mel), 可以实现变声功能。(文档不全,尤其是训练部分,正在努力补充中)
 #### 4.0 准备环境
-* 确保项目以上环境已经安装ok,运行`pip install -r requirements_vc.txt` 来安装剩余的必要包。
+* 确保项目以上环境已经安装ok,运行`pip install espnet` 来安装剩余的必要包。
 * 下载以下模型 链接:https://pan.baidu.com/s/1bl_x_DHJSAUyN2fma-Q_Wg
 提取码:gh41
-  * 24K采样率专用的vocoder(hifigan)到 *vocoder\saved_mode\xxx*
-  * 预训练的ppg特征encoder(ppg_extractor)到 *ppg_extractor\saved_mode\xxx*
-  * 预训练的PPG2Mel到 *ppg2mel\saved_mode\xxx*
+  * 24K采样率专用的vocoder(hifigan)到 *vocoder\saved_models\xxx*
+  * 预训练的ppg特征encoder(ppg_extractor)到 *ppg_extractor\saved_models\xxx*
+  * 预训练的PPG2Mel到 *ppg2mel\saved_models\xxx*

 #### 4.1 使用数据集自己训练PPG2Mel模型 (可选)

@@ -115,7 +127,7 @@
 `python pre4ppg.py <datasets_root> -d {dataset} -n {number}`
 可传入参数:
 * `-d {dataset}` 指定数据集,支持 aidatatang_200zh, 不传默认为aidatatang_200zh
-* `-n {number}` 指定并行数,CPU 11770k在8的情况下,需要运行12到18小时!待优化
+* `-n {number}` 指定并行数,CPU 11700k在8的情况下,需要运行12到18小时!待优化
 > 假如你下载的 `aidatatang_200zh`文件放在D盘,`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`

 * 训练合成器, 注意在上一步先下载好`ppg2mel.yaml`, 修改里面的地址指向预训练好的文件夹:
@@ -124,7 +136,7 @@
 #### 4.2 启动工具箱VC模式
 您可以尝试使用以下命令:
-`python demo_toolbox.py vc -d <datasets_root>`
+`python demo_toolbox.py -vc -d <datasets_root>`
 > 请指定一个可用的数据集文件路径,如果有支持的数据集则会自动加载供调试,也同时会作为手动录制音频的存储目录。
 <img width="971" alt="微信图片_20220305005351" src="https://user-images.githubusercontent.com/7423248/156805733-2b093dbc-d989-4e68-8609-db11f365886a.png">
@@ -135,35 +147,36 @@
 | --- | ----------- | ----- | --------------------- |
 | [1803.09017](https://arxiv.org/abs/1803.09017) | GlobalStyleToken (synthesizer)| Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis | 本代码库 |
 | [2010.05646](https://arxiv.org/abs/2010.05646) | HiFi-GAN (vocoder)| Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | 本代码库 |
+| [2106.02297](https://arxiv.org/abs/2106.02297) | Fre-GAN (vocoder)| Fre-GAN: Adversarial Frequency-consistent Audio Synthesis | 本代码库 |
 |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | SV2TTS | Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis | 本代码库 |
 |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
 |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
 |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | 本代码库 |

-## 常見問題(FQ&A)
-#### 1.數據集哪裡下載?
+## 常见问题(FQ&A)
+#### 1.数据集在哪里下载?
 | 数据集 | OpenSLR地址 | 其他源 (Google Drive, Baidu网盘等) |
 | --- | ----------- | ---------------|
 | aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) |
 | magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) |
 | aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) |
 | data_aishell | [OpenSLR](https://www.openslr.org/33/) | |
-> 解壓 aidatatang_200zh 後,還需將 `aidatatang_200zh\corpus\train`下的檔案全選解壓縮
+> 解压 aidatatang_200zh 后,还需将 `aidatatang_200zh\corpus\train`下的文件全选解压缩

 #### 2.`<datasets_root>`是什麼意思?
-假如數據集路徑為 `D:\data\aidatatang_200zh`,那麼 `<datasets_root>`就是 `D:\data`
+假如数据集路径为 `D:\data\aidatatang_200zh`,那么 `<datasets_root>`就是 `D:\data`

-#### 3.訓練模型顯存不足
-訓練合成器時:將 `synthesizer/hparams.py`中的batch_size參數調小
+#### 3.训练模型显存不足
+训练合成器时:将 `synthesizer/hparams.py`中的batch_size参数调小
 ```
-//調整前
+//调整前
 tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
                 (2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
                 (2, 2e-4, 80_000, 12), #
                 (2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
                 (2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
                 (2, 1e-5, 640_000, 12)], # lr = learning rate
-//調整後
+//调整后
 tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
                 (2, 5e-4, 40_000, 8), # (r, lr, step, batch_size)
                 (2, 2e-4, 80_000, 8), #
@@ -172,15 +185,15 @@ tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
                 (2, 1e-5, 640_000, 8)], # lr = learning rate
 ```

-聲碼器-預處理數據集時:將 `synthesizer/hparams.py`中的batch_size參數調小
+声码器-预处理数据集时:将 `synthesizer/hparams.py`中的batch_size参数调小
 ```
-//調整前
+//调整前
 ### Data Preprocessing
 max_mel_frames = 900,
 rescale = True,
 rescaling_max = 0.9,
 synthesis_batch_size = 16, # For vocoder preprocessing and inference.
-//調整後
+//调整后
 ### Data Preprocessing
 max_mel_frames = 900,
 rescale = True,
@@ -188,16 +201,16 @@ tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
 synthesis_batch_size = 8, # For vocoder preprocessing and inference.
 ```

-聲碼器-訓練聲碼器時:將 `vocoder/wavernn/hparams.py`中的batch_size參數調小
+声码器-训练声码器时:将 `vocoder/wavernn/hparams.py`中的batch_size参数调小
 ```
-//調整前
+//调整前
 # Training
 voc_batch_size = 100
 voc_lr = 1e-4
 voc_gen_at_checkpoint = 5
 voc_pad = 2

-//調整後
+//调整后
 # Training
 voc_batch_size = 6
 voc_lr = 1e-4
@@ -206,13 +219,13 @@ voc_pad =2
 ```

 #### 4.碰到`RuntimeError: Error(s) in loading state_dict for Tacotron: size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([70, 512]) from checkpoint, the shape in current model is torch.Size([75, 512]).`
-請參照 issue [#37](https://github.com/babysor/MockingBird/issues/37)
+请参照 issue [#37](https://github.com/babysor/MockingBird/issues/37)

-#### 5.如何改善CPU、GPU佔用率?
-適情況調整batch_size參數來改善
+#### 5.如何改善CPU、GPU占用率?
+视情况调整batch_size参数来改善

-#### 6.發生 `頁面文件太小,無法完成操作`
-請參考這篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030),將虛擬內存更改為100G(102400),例如:档案放置D槽就更改D槽的虚拟内存
+#### 6.发生 `页面文件太小,无法完成操作`
+请参考这篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030),将虚拟内存更改为100G(102400),例如:文件放置D盘就更改D盘的虚拟内存

 #### 7.什么时候算训练完成?
 首先一定要出现注意力模型,其次是loss足够低,取决于硬件设备和数据集。拿本人的供参考,我的注意力是在 18k 步之后出现的,并且在 50k 步之后损失变得低于 0.4
18 README.md

@@ -18,6 +18,14 @@
 ### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/)

+### Ongoing Works(Helps Needed)
+* Major upgrade on GUI/Client and unifying web and toolbox
+  - [X] Init framework `./mkgui` and [tech design](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
+  - [X] Add demo part of Voice Cloning and Conversion
+  - [X] Add preprocessing and training for Voice Conversion
+  - [ ] Add preprocessing and training for Encoder/Synthesizer/Vocoder
+* Major upgrade on model backend based on ESPnet2(not yet started)
+
 ## Quick Start

 ### 1. Install Requirements
@@ -29,7 +37,7 @@
 * Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
 * Run `pip install -r requirements.txt` to install the remaining necessary packages.
 * Install webrtcvad `pip install webrtcvad-wheels`(If you need)
-> Note that we are using the pretrained encoder/vocoder but synthesizer, since the original model is incompatible with the Chinese sympols. It means the demo_cli is not working at this moment.
+> Note that we are using the pretrained encoder/vocoder but synthesizer since the original model is incompatible with the Chinese symbols. It means the demo_cli is not working at this moment.
 ### 2. Prepare your models
 You can either train your models or use existing ones:

@@ -60,7 +68,7 @@
 | @author | https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g [Baidu](https://pan.baidu.com/s/1iONvRxmkI-t1nHqxKytY3g) 4j5d | | 75k steps trained by multiple datasets
 | @author | https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw [Baidu](https://pan.baidu.com/s/1fMh9IlgKJlL2PIiRTYDUvw) code:om7f | | 25k steps trained by multiple datasets, only works under version 0.0.1
 |@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing https://u.teknik.io/AYxWf.pt | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps with local accent of Taiwan, only works under version 0.0.1
-|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code:2021 | https://www.bilibili.com/video/BV1uh411B7AD/ | only works under version 0.0.1
+|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 https://www.aliyundrive.com/s/AwPsbo8mcSP code: z2m0 | https://www.bilibili.com/video/BV1uh411B7AD/ | only works under version 0.0.1

 #### 2.4 Train vocoder (Optional)
 > note: vocoder has little difference in effect, so you may not need to train a new one.
@@ -82,6 +90,11 @@ You can then try to run:`python web.py` and open it in browser, default as `http
 You can then try the toolbox:
 `python demo_toolbox.py -d <datasets_root>`

+#### 3.3 Using the command line
+You can then try the command:
+`python gen_voice.py <text_file.txt> your_wav_file.wav`
+you may need to install cn2an by "pip install cn2an" for better digital number result.
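The cn2an note above concerns digit normalization: Chinese TTS front ends pronounce Chinese numerals, not Arabic digits, so text is rewritten before synthesis. A hedged, standalone sketch of what that conversion looks like (the exact call sites inside `gen_voice.py` may differ):

```python
# Minimal digit-normalization sketch using cn2an's documented "an2cn"
# transform mode; the sample sentence is illustrative.
import cn2an

text = "小王花了100块钱买了3本书"
# transform() rewrites Arabic numerals inside a sentence into Chinese
# numerals, which the pinyin front end can actually pronounce.
normalized = cn2an.transform(text, "an2cn")
print(normalized)  # -> 小王花了一百块钱买了三本书
```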
 ## Reference
 > This repository is forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) which only support English.

@@ -89,6 +102,7 @@ You can then try the toolbox:
 | --- | ----------- | ----- | --------------------- |
 | [1803.09017](https://arxiv.org/abs/1803.09017) | GlobalStyleToken (synthesizer)| Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis | This repo |
 | [2010.05646](https://arxiv.org/abs/2010.05646) | HiFi-GAN (vocoder)| Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | This repo |
+| [2106.02297](https://arxiv.org/abs/2106.02297) | Fre-GAN (vocoder)| Fre-GAN: Adversarial Frequency-consistent Audio Synthesis | This repo |
 |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
 |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
 |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
@@ -1,9 +1,9 @@
-from encoder.params_model import model_embedding_size as speaker_embedding_size
+from models.encoder.params_model import model_embedding_size as speaker_embedding_size
 from utils.argutils import print_args
 from utils.modelutils import check_model_paths
-from synthesizer.inference import Synthesizer
-from encoder import inference as encoder
-from vocoder import inference as vocoder
+from models.synthesizer.inference import Synthesizer
+from models.encoder import inference as encoder
+from models.vocoder import inference as vocoder
 from pathlib import Path
 import numpy as np
 import soundfile as sf
@@ -1,7 +1,10 @@
-from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aidatatang_200zh
-from utils.argutils import print_args
-from pathlib import Path
 import argparse
+from pathlib import Path
+
+from models.encoder.preprocess import (preprocess_aidatatang_200zh,
+                                       preprocess_librispeech, preprocess_voxceleb1,
+                                       preprocess_voxceleb2)
+from utils.argutils import print_args

 if __name__ == "__main__":
     class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
@@ -1,5 +1,5 @@
 from utils.argutils import print_args
-from encoder.train import train
+from models.encoder.train import train
 from pathlib import Path
 import argparse
@@ -2,8 +2,8 @@ import sys
 import torch
 import argparse
 import numpy as np
-from utils.load_yaml import HpsYaml
-from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+from utils.hparams import HpsYaml
+from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver

 # For reproducibility, comment these may speed up training
 torch.backends.cudnn.deterministic = True
@@ -1,7 +1,7 @@
-from ppg2mel.preprocess import preprocess_dataset
-from pathlib import Path
-import argparse
+from pathlib import Path
+import argparse
+
+from models.ppg2mel.preprocess import preprocess_dataset
@@ -1,10 +1,9 @@
-from synthesizer.hparams import hparams
-from synthesizer.train import train
+from models.synthesizer.hparams import hparams
+from models.synthesizer.train import train
 from utils.argutils import print_args
 import argparse


-if __name__ == "__main__":
+def new_train():
     parser = argparse.ArgumentParser()
     parser.add_argument("run_id", type=str, help= \
         "Name for this model instance. If a model state from the same run ID was previously "
@@ -13,7 +12,7 @@ if __name__ == "__main__":
     parser.add_argument("syn_dir", type=str, default=argparse.SUPPRESS, help= \
         "Path to the synthesizer directory that contains the ground truth mel spectrograms, "
         "the wavs and the embeds.")
-    parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\
+    parser.add_argument("-m", "--models_dir", type=str, default=f"data/ckpt/synthesizer/", help=\
         "Path to the output directory that will contain the saved model weights and the logs.")
     parser.add_argument("-s", "--save_every", type=int, default=1000, help= \
         "Number of steps between updates of the model on the disk. Set to 0 to never save the "
@@ -28,10 +27,14 @@ if __name__ == "__main__":
     parser.add_argument("--hparams", default="",
                         help="Hyperparameter overrides as a comma-separated list of name=value "
                         "pairs")
-    args = parser.parse_args()
+    args, _ = parser.parse_known_args()
    print_args(args, parser)

     args.hparams = hparams.parse(args.hparams)

     # Run the training
     train(**vars(args))
+
+
+if __name__ == "__main__":
+    new_train()
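The switch from `parse_args()` to `parse_known_args()` above is what lets the new unified `train.py` entry point forward extra flags (such as `--type synth`) without the synthesizer's parser rejecting them. A hedged, standalone illustration of the difference:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("run_id")

# parse_args() exits on anything it does not recognize:
#   parser.parse_args(["my_run", "--type", "synth"])  -> error: unrecognized arguments
# parse_known_args() returns the parsed namespace plus the leftover argv:
args, unknown = parser.parse_known_args(["my_run", "--type", "synth"])
print(args.run_id)  # my_run
print(unknown)      # ['--type', 'synth'] -- left for the caller to handle
```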
@@ -2,8 +2,8 @@ import sys
 import torch
 import argparse
 import numpy as np
-from utils.load_yaml import HpsYaml
-from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+from utils.hparams import HpsYaml
+from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver

 # For reproducibility, comment these may speed up training
 torch.backends.cudnn.deterministic = True
@@ -18,7 +18,7 @@ def main():
     parser.add_argument('--name', default=None, type=str, help='Name for logging.')
     parser.add_argument('--logdir', default='log/', type=str,
                         help='Logging path.', required=False)
-    parser.add_argument('--ckpdir', default='ckpt/', type=str,
+    parser.add_argument('--ckpdir', default='ppg2mel/saved_models/', type=str,
                         help='Checkpoint path.', required=False)
     parser.add_argument('--outdir', default='result/', type=str,
                         help='Decode output path.', required=False)
@@ -41,7 +41,6 @@ def main():
     parser.add_argument('--lsa', action='store_true', help='Use location-sensitive attention (LSA)')

     ###
-
     paras = parser.parse_args()
     setattr(paras, 'gpu', not paras.cpu)
     setattr(paras, 'pin_memory', not paras.no_pin)
@@ -1,5 +1,5 @@
-from synthesizer.synthesize import run_synthesis
-from synthesizer.hparams import hparams
+from models.synthesizer.synthesize import run_synthesis
+from models.synthesizer.hparams import hparams
 from utils.argutils import print_args
 import argparse
 import os
@@ -1,11 +1,13 @@
 from utils.argutils import print_args
-from vocoder.wavernn.train import train
-from vocoder.hifigan.train import train as train_hifigan
-from vocoder.hifigan.env import AttrDict
+from models.vocoder.wavernn.train import train
+from models.vocoder.hifigan.train import train as train_hifigan
+from models.vocoder.fregan.train import train as train_fregan
+from utils.util import AttrDict
 from pathlib import Path
 import argparse
 import json

+import torch
+import torch.multiprocessing as mp

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
@@ -61,11 +63,30 @@ if __name__ == "__main__":
     # Process the arguments
     if args.vocoder_type == "wavernn":
         # Run the training wavernn
+        delattr(args, 'vocoder_type')
+        delattr(args, 'config')
         train(**vars(args))
     elif args.vocoder_type == "hifigan":
         with open(args.config) as f:
             json_config = json.load(f)
         h = AttrDict(json_config)
-        train_hifigan(0, args, h)
+        if h.num_gpus > 1:
+            h.num_gpus = torch.cuda.device_count()
+            h.batch_size = int(h.batch_size / h.num_gpus)
+            print('Batch size per GPU :', h.batch_size)
+            mp.spawn(train_hifigan, nprocs=h.num_gpus, args=(args, h,))
+        else:
+            train_hifigan(0, args, h)
+    elif args.vocoder_type == "fregan":
+        with open('vocoder/fregan/config.json') as f:
+            json_config = json.load(f)
+        h = AttrDict(json_config)
+        if h.num_gpus > 1:
+            h.num_gpus = torch.cuda.device_count()
+            h.batch_size = int(h.batch_size / h.num_gpus)
+            print('Batch size per GPU :', h.batch_size)
+            mp.spawn(train_fregan, nprocs=h.num_gpus, args=(args, h,))
+        else:
+            train_fregan(0, args, h)
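A note on the dispatch above: `torch.multiprocessing.spawn` calls the target once per process and prepends the process index as the first positional argument, which is why the single-GPU fallback invokes `train_hifigan(0, args, h)` with an explicit rank of 0. A hedged, self-contained sketch of the same pattern (names here are illustrative, not from this repo):

```python
import torch
import torch.multiprocessing as mp

def worker(rank: int, message: str) -> None:
    # mp.spawn prepends the process index (0 .. nprocs-1) to the args tuple.
    print(f"process {rank}: {message}")

if __name__ == "__main__":
    n = max(torch.cuda.device_count(), 1)
    if n > 1:
        # One training process per GPU; each receives its rank automatically.
        mp.spawn(worker, nprocs=n, args=("hello",))
    else:
        # Single-device fallback: call the worker directly with rank 0.
        worker(0, "hello")
```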
@@ -1,24 +1,27 @@
-from asyncio.windows_events import NULL
 from pydantic import BaseModel, Field
 import os
 from pathlib import Path
 from enum import Enum
-from encoder import inference as encoder
+from models.encoder import inference as encoder
 import librosa
 from scipy.io.wavfile import write
 import re
 import numpy as np
-from opyrator.components.types import FileContent
-from vocoder.hifigan import inference as gan_vocoder
-from synthesizer.inference import Synthesizer
+from control.mkgui.base.components.types import FileContent
+from models.vocoder.hifigan import inference as gan_vocoder
+from models.synthesizer.inference import Synthesizer
 from typing import Any, Tuple
 import matplotlib.pyplot as plt

 # Constants
-AUDIO_SAMPLES_DIR = 'samples\\'
-SYN_MODELS_DIRT = "synthesizer\\saved_models"
-ENC_MODELS_DIRT = "encoder\\saved_models"
-VOC_MODELS_DIRT = "vocoder\\saved_models"
-TEMP_SOURCE_AUDIO = "wavs/temp_source.wav"
-TEMP_RESULT_AUDIO = "wavs/temp_result.wav"
+AUDIO_SAMPLES_DIR = f"data{os.sep}samples{os.sep}"
+SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
+VOC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}vocoder"
+TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
+TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"
+if not os.path.isdir("wavs"):
+    os.makedirs("wavs")

 # Load local sample audio as options TODO: load dataset
 if os.path.isdir(AUDIO_SAMPLES_DIR):
@@ -27,20 +30,33 @@ if os.path.isdir(AUDIO_SAMPLES_DIR):
 if os.path.isdir(SYN_MODELS_DIRT):
     synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
     print("Loaded synthesizer models: " + str(len(synthesizers)))
 else:
     raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist. 请将模型文件位置移动到上述位置中进行重试!")

 if os.path.isdir(ENC_MODELS_DIRT):
     encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
     print("Loaded encoders models: " + str(len(encoders)))
 else:
     raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")

 if os.path.isdir(VOC_MODELS_DIRT):
     vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
     print("Loaded vocoders models: " + str(len(synthesizers)))
 else:
     raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")


 class Input(BaseModel):
     message: str = Field(
         ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
     )
     local_audio_file: audio_input_selection = Field(
-        ..., alias="输入语音(本地wav)",
+        ..., alias="选择语音(本地wav)",
         description="选择本地语音文件."
     )
-    upload_audio_file: FileContent = Field(..., alias="或上传语音",
+    record_audio_file: FileContent = Field(default=None, alias="录制语音",
+        description="录音.", is_recorder=True, mime_type="audio/wav")
+    upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
         description="拖拽或点击上传.", mime_type="audio/wav")
     encoder: encoders = Field(
         ..., alias="编码模型",
@@ -48,37 +64,53 @@ class Input(BaseModel):
     )
     synthesizer: synthesizers = Field(
         ..., alias="合成模型",
-        description="选择语音编码模型文件."
+        description="选择语音合成模型文件."
     )
     vocoder: vocoders = Field(
-        ..., alias="语音编码模型",
-        description="选择语音编码模型文件(目前只支持HifiGan类型)."
-    )
-    message: str = Field(
-        ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="输出文本内容"
+        ..., alias="语音解码模型",
+        description="选择语音解码模型文件(目前只支持HifiGan类型)."
     )

+class AudioEntity(BaseModel):
+    content: bytes
+    mel: Any

 class Output(BaseModel):
-    result_file: FileContent = Field(
-        ...,
-        mime_type="audio/wav",
-        description="输出音频",
-    )
-    source_file: FileContent = Field(
-        ...,
-        mime_type="audio/wav",
-        description="原始音频.",
-    )
-
-def mocking_bird(input: Input) -> Output:
-    """欢迎使用MockingBird Web 2"""
+    __root__: Tuple[AudioEntity, AudioEntity]
+
+    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
+        """Custom output UI.
+        If this method is implmeneted, it will be used instead of the default Output UI renderer.
+        """
+        src, result = self.__root__
+
+        streamlit_app.subheader("Synthesized Audio")
+        streamlit_app.audio(result.content, format="audio/wav")
+
+        fig, ax = plt.subplots()
+        ax.imshow(src.mel, aspect="equal", interpolation="none")
+        ax.set_title("mel spectrogram(Source Audio)")
+        streamlit_app.pyplot(fig)
+        fig, ax = plt.subplots()
+        ax.imshow(result.mel, aspect="equal", interpolation="none")
+        ax.set_title("mel spectrogram(Result Audio)")
+        streamlit_app.pyplot(fig)
+
+
+def synthesize(input: Input) -> Output:
+    """synthesize(合成)"""
     # load models
     encoder.load_model(Path(input.encoder.value))
     current_synt = Synthesizer(Path(input.synthesizer.value))
     gan_vocoder.load_model(Path(input.vocoder.value))

     # load file
-    if input.upload_audio_file != NULL:
+    if input.record_audio_file != None:
+        with open(TEMP_SOURCE_AUDIO, "w+b") as f:
+            f.write(input.record_audio_file.as_bytes())
+            f.seek(0)
+        wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
+    elif input.upload_audio_file != None:
         with open(TEMP_SOURCE_AUDIO, "w+b") as f:
             f.write(input.upload_audio_file.as_bytes())
             f.seek(0)
@@ -87,6 +119,8 @@ def mocking_bird(input: Input) -> Output:
         wav, sample_rate = librosa.load(input.local_audio_file.value)
         write(TEMP_SOURCE_AUDIO, sample_rate, wav) #Make sure we get the correct wav

+    source_spec = Synthesizer.make_spectrogram(wav)
+
     # preprocess
     encoder_wav = encoder.preprocess_wav(wav, sample_rate)
     embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
@@ -114,4 +148,4 @@ def mocking_bird(input: Input) -> Output:
         source_file = f.read()
     with open(TEMP_RESULT_AUDIO, "rb") as f:
         result_file = f.read()
-    return Output(source_file=source_file, result_file=result_file)
+    return Output(__root__=(AudioEntity(content=source_file, mel=source_spec), AudioEntity(content=result_file, mel=spec)))
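The rewritten `Output` model above uses pydantic v1's "custom root type": the whole payload is a tuple stored under `__root__`, which the GUI layer unpacks in `render_output_ui`. A minimal, hedged sketch of the pattern with toy models (not the repo's audio types):

```python
from typing import Tuple
from pydantic import BaseModel

class Item(BaseModel):
    content: bytes

class Pair(BaseModel):
    __root__: Tuple[Item, Item]

pair = Pair(__root__=(Item(content=b"a"), Item(content=b"b")))
first, second = pair.__root__
print(first.content, second.content)  # b'a' b'b'
```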
166 control/mkgui/app_vc.py (new file)

@@ -0,0 +1,166 @@
import os
from enum import Enum
from pathlib import Path
from typing import Any, Tuple

import librosa
import matplotlib.pyplot as plt
import torch
from pydantic import BaseModel, Field
from scipy.io.wavfile import write

import models.ppg2mel as Convertor
import models.ppg_extractor as Extractor
from control.mkgui.base.components.types import FileContent
from models.encoder import inference as speacker_encoder
from models.synthesizer.inference import Synthesizer
from models.vocoder.hifigan import inference as gan_vocoder

# Constants
AUDIO_SAMPLES_DIR = f'data{os.sep}samples{os.sep}'
EXT_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg_extractor'
CONV_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg2mel'
VOC_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}vocoder'
TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'

# Load local sample audio as options TODO: load dataset
if os.path.isdir(AUDIO_SAMPLES_DIR):
    audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")))
# Pre-Load models
if os.path.isdir(EXT_MODELS_DIRT):
    extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded extractor models: " + str(len(extractors)))
else:
    raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")

if os.path.isdir(CONV_MODELS_DIRT):
    convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth")))
    print("Loaded convertor models: " + str(len(convertors)))
else:
    raise Exception(f"Model folder {CONV_MODELS_DIRT} doesn't exist.")

if os.path.isdir(VOC_MODELS_DIRT):
    vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
    print("Loaded vocoders models: " + str(len(vocoders)))
else:
    raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")

class Input(BaseModel):
    local_audio_file: audio_input_selection = Field(
        ..., alias="输入语音(本地wav)",
        description="选择本地语音文件."
    )
    upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
        description="拖拽或点击上传.", mime_type="audio/wav")
    local_audio_file_target: audio_input_selection = Field(
        ..., alias="目标语音(本地wav)",
        description="选择本地语音文件."
    )
    upload_audio_file_target: FileContent = Field(default=None, alias="或上传目标语音",
        description="拖拽或点击上传.", mime_type="audio/wav")
    extractor: extractors = Field(
        ..., alias="编码模型",
        description="选择语音编码模型文件."
    )
    convertor: convertors = Field(
        ..., alias="转换模型",
        description="选择语音转换模型文件."
    )
    vocoder: vocoders = Field(
        ..., alias="语音解码模型",
        description="选择语音解码模型文件(目前只支持HifiGan类型)."
    )

class AudioEntity(BaseModel):
    content: bytes
    mel: Any

class Output(BaseModel):
    __root__: Tuple[AudioEntity, AudioEntity, AudioEntity]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.
        If this method is implmeneted, it will be used instead of the default Output UI renderer.
        """
        src, target, result = self.__root__

        streamlit_app.subheader("Synthesized Audio")
        streamlit_app.audio(result.content, format="audio/wav")

        fig, ax = plt.subplots()
        ax.imshow(src.mel, aspect="equal", interpolation="none")
        ax.set_title("mel spectrogram(Source Audio)")
        streamlit_app.pyplot(fig)
        fig, ax = plt.subplots()
        ax.imshow(target.mel, aspect="equal", interpolation="none")
        ax.set_title("mel spectrogram(Target Audio)")
        streamlit_app.pyplot(fig)
        fig, ax = plt.subplots()
        ax.imshow(result.mel, aspect="equal", interpolation="none")
        ax.set_title("mel spectrogram(Result Audio)")
        streamlit_app.pyplot(fig)

def convert(input: Input) -> Output:
    """convert(转换)"""
    # load models
    extractor = Extractor.load_model(Path(input.extractor.value))
    convertor = Convertor.load_model(Path(input.convertor.value))
    # current_synt = Synthesizer(Path(input.synthesizer.value))
    gan_vocoder.load_model(Path(input.vocoder.value))

    # load file
    if input.upload_audio_file != None:
        with open(TEMP_SOURCE_AUDIO, "w+b") as f:
            f.write(input.upload_audio_file.as_bytes())
            f.seek(0)
        src_wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
    else:
        src_wav, sample_rate = librosa.load(input.local_audio_file.value)
        write(TEMP_SOURCE_AUDIO, sample_rate, src_wav) #Make sure we get the correct wav

    if input.upload_audio_file_target != None:
        with open(TEMP_TARGET_AUDIO, "w+b") as f:
            f.write(input.upload_audio_file_target.as_bytes())
            f.seek(0)
        ref_wav, _ = librosa.load(TEMP_TARGET_AUDIO)
    else:
        ref_wav, _ = librosa.load(input.local_audio_file_target.value)
        write(TEMP_TARGET_AUDIO, sample_rate, ref_wav) #Make sure we get the correct wav

    ppg = extractor.extract_from_wav(src_wav)
    # Import necessary dependency of Voice Conversion
    from utils.f0_utils import (compute_f0, compute_mean_std, f02lf0,
                                get_converted_lf0uv)
    ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
    speacker_encoder.load_model(Path(f"data{os.sep}ckpt{os.sep}encoder{os.sep}pretrained_bak_5805000.pt"))
    embed = speacker_encoder.embed_utterance(ref_wav)
    lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
    min_len = min(ppg.shape[1], len(lf0_uv))
    ppg = ppg[:, :min_len]
    lf0_uv = lf0_uv[:min_len]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    _, mel_pred, att_ws = convertor.inference(
        ppg,
        logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device),
        spembs=torch.from_numpy(embed).unsqueeze(0).to(device),
    )
    mel_pred = mel_pred.transpose(0, 1)
    breaks = [mel_pred.shape[1]]
    mel_pred = mel_pred.detach().cpu().numpy()

    # synthesize and vocode
    wav, sample_rate = gan_vocoder.infer_waveform(mel_pred)

    # write and output
    write(TEMP_RESULT_AUDIO, sample_rate, wav) #Make sure we get the correct wav
    with open(TEMP_SOURCE_AUDIO, "rb") as f:
        source_file = f.read()
    with open(TEMP_TARGET_AUDIO, "rb") as f:
        target_file = f.read()
    with open(TEMP_RESULT_AUDIO, "rb") as f:
        result_file = f.read()

    return Output(__root__=(AudioEntity(content=source_file, mel=Synthesizer.make_spectrogram(src_wav)), AudioEntity(content=target_file, mel=Synthesizer.make_spectrogram(ref_wav)), AudioEntity(content=result_file, mel=Synthesizer.make_spectrogram(wav))))
2 control/mkgui/base/__init__.py (new file)

@@ -0,0 +1,2 @@

from .core import Opyrator
1 control/mkgui/base/api/__init__.py (new file)

@@ -0,0 +1 @@
from .fastapi_app import create_api
102 control/mkgui/base/api/fastapi_utils.py (new file)

@@ -0,0 +1,102 @@
"""Collection of utilities for FastAPI apps."""

import inspect
from typing import Any, Type

from fastapi import FastAPI, Form
from pydantic import BaseModel


def as_form(cls: Type[BaseModel]) -> Any:
    """Adds an as_form class method to decorated models.

    The as_form class method can be used with FastAPI endpoints
    """
    new_params = [
        inspect.Parameter(
            field.alias,
            inspect.Parameter.POSITIONAL_ONLY,
            default=(Form(field.default) if not field.required else Form(...)),
        )
        for field in cls.__fields__.values()
    ]

    async def _as_form(**data):  # type: ignore
        return cls(**data)

    sig = inspect.signature(_as_form)
    sig = sig.replace(parameters=new_params)
    _as_form.__signature__ = sig  # type: ignore
    setattr(cls, "as_form", _as_form)
    return cls
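A hedged usage sketch of the decorator above: FastAPI binds `application/x-www-form-urlencoded` bodies only through `Form(...)` parameters, so `as_form` rewrites the model's constructor signature to expose each pydantic field as a form parameter. The endpoint below is illustrative, not part of this repo:

```python
from fastapi import Depends, FastAPI
from pydantic import BaseModel

app = FastAPI()

@as_form
class LoginForm(BaseModel):
    username: str
    password: str

@app.post("/login")
async def login(form: LoginForm = Depends(LoginForm.as_form)):
    # FastAPI now reads username/password from form fields
    # instead of expecting a JSON body.
    return {"user": form.username}
```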
def patch_fastapi(app: FastAPI) -> None:
    """Patch function to allow relative url resolution.

    This patch is required to make fastapi fully functional with a relative url path.
    This code snippet can be copy-pasted to any Fastapi application.
    """
    from fastapi.openapi.docs import get_redoc_html, get_swagger_ui_html
    from starlette.requests import Request
    from starlette.responses import HTMLResponse

    async def redoc_ui_html(req: Request) -> HTMLResponse:
        assert app.openapi_url is not None
        redoc_ui = get_redoc_html(
            openapi_url="./" + app.openapi_url.lstrip("/"),
            title=app.title + " - Redoc UI",
        )

        return HTMLResponse(redoc_ui.body.decode("utf-8"))

    async def swagger_ui_html(req: Request) -> HTMLResponse:
        assert app.openapi_url is not None
        swagger_ui = get_swagger_ui_html(
            openapi_url="./" + app.openapi_url.lstrip("/"),
            title=app.title + " - Swagger UI",
            oauth2_redirect_url=app.swagger_ui_oauth2_redirect_url,
        )

        # insert request interceptor to have all request run on relativ path
        request_interceptor = (
            "requestInterceptor: (e) => {"
            "\n\t\t\tvar url = window.location.origin + window.location.pathname"
            '\n\t\t\turl = url.substring( 0, url.lastIndexOf( "/" ) + 1);'
            "\n\t\t\turl = e.url.replace(/http(s)?:\/\/[^/]*\//i, url);"  # noqa: W605
            "\n\t\t\te.contextUrl = url"
            "\n\t\t\te.url = url"
            "\n\t\t\treturn e;}"
        )

        return HTMLResponse(
            swagger_ui.body.decode("utf-8").replace(
                "dom_id: '#swagger-ui',",
                "dom_id: '#swagger-ui',\n\t\t" + request_interceptor + ",",
            )
        )

    # remove old docs route and add our patched route
    routes_new = []
    for app_route in app.routes:
        if app_route.path == "/docs":  # type: ignore
            continue

        if app_route.path == "/redoc":  # type: ignore
            continue

        routes_new.append(app_route)

    app.router.routes = routes_new

    assert app.docs_url is not None
    app.add_route(app.docs_url, swagger_ui_html, include_in_schema=False)
    assert app.redoc_url is not None
    app.add_route(app.redoc_url, redoc_ui_html, include_in_schema=False)

    # Make graphql realtive
    from starlette import graphql

    graphql.GRAPHIQL = graphql.GRAPHIQL.replace(
        "({{REQUEST_PATH}}", '("." + {{REQUEST_PATH}}'
    )
43 control/mkgui/base/components/outputs.py (new file)

@@ -0,0 +1,43 @@
from typing import List

from pydantic import BaseModel


class ScoredLabel(BaseModel):
    label: str
    score: float


class ClassificationOutput(BaseModel):
    __root__: List[ScoredLabel]

    def __iter__(self):  # type: ignore
        return iter(self.__root__)

    def __getitem__(self, item):  # type: ignore
        return self.__root__[item]

    def render_output_ui(self, streamlit) -> None:  # type: ignore
        import plotly.express as px

        sorted_predictions = sorted(
            [prediction.dict() for prediction in self.__root__],
            key=lambda k: k["score"],
        )

        num_labels = len(sorted_predictions)
        if len(sorted_predictions) > 10:
            num_labels = streamlit.slider(
                "Maximum labels to show: ",
                min_value=1,
                max_value=len(sorted_predictions),
                value=len(sorted_predictions),
            )
        fig = px.bar(
            sorted_predictions[len(sorted_predictions) - num_labels :],
            x="score",
            y="label",
            orientation="h",
        )
        streamlit.plotly_chart(fig, use_container_width=True)
        # fig.show()
46 control/mkgui/base/components/types.py (new file)

@@ -0,0 +1,46 @@
import base64
from typing import Any, Dict, overload


class FileContent(str):
    def as_bytes(self) -> bytes:
        return base64.b64decode(self, validate=True)

    def as_str(self) -> str:
        return self.as_bytes().decode()

    @classmethod
    def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
        field_schema.update(format="byte")

    @classmethod
    def __get_validators__(cls) -> Any:  # type: ignore
        yield cls.validate

    @classmethod
    def validate(cls, value: Any) -> "FileContent":
        if isinstance(value, FileContent):
            return value
        elif isinstance(value, str):
            return FileContent(value)
        elif isinstance(value, (bytes, bytearray, memoryview)):
            return FileContent(base64.b64encode(value).decode())
        else:
            raise Exception("Wrong type")
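A hedged round-trip sketch of the custom type above: pydantic invokes `validate` via `__get_validators__`, so raw bytes are stored base64-encoded inside the `str` subclass and decoded back on demand (standalone illustration, toy payload):

```python
raw = b"RIFF....WAVEfmt "          # pretend this is a wav payload

fc = FileContent.validate(raw)     # bytes are base64-encoded into the str subclass
print(isinstance(fc, str))         # True -- it serializes like a plain string
print(fc.as_bytes() == raw)        # True -- decoding restores the original bytes
```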

# # 暂时无法使用,因为浏览器中没有考虑选择文件夹
# class DirectoryContent(FileContent):
#     @classmethod
#     def __modify_schema__(cls, field_schema: Dict[str, Any]) -> None:
#         field_schema.update(format="path")

#     @classmethod
#     def validate(cls, value: Any) -> "DirectoryContent":
#         if isinstance(value, DirectoryContent):
#             return value
#         elif isinstance(value, str):
#             return DirectoryContent(value)
#         elif isinstance(value, (bytes, bytearray, memoryview)):
#             return DirectoryContent(base64.b64encode(value).decode())
#         else:
#             raise Exception("Wrong type")
203 control/mkgui/base/core.py (new file)

@@ -0,0 +1,203 @@
import importlib
import inspect
import re
from typing import Any, Callable, Type, Union, get_type_hints

from pydantic import BaseModel, parse_raw_as
from pydantic.tools import parse_obj_as


def name_to_title(name: str) -> str:
    """Converts a camelCase or snake_case name to title case."""
    # If camelCase -> convert to snake case
    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()
    # Convert to title case
    return name.replace("_", " ").strip().title()
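A quick illustration of the two regex passes above: the first splits an uppercase run followed by lowercase letters, the second splits lower/digit-to-upper boundaries, and the lowered result is title-cased:

```python
print(name_to_title("mockingBird"))       # Mocking Bird
print(name_to_title("render_output_ui"))  # Render Output Ui
print(name_to_title("PPGConvert"))        # Ppg Convert
```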

def is_compatible_type(type: Type) -> bool:
    """Returns `True` if the type is opyrator-compatible."""
    try:
        if issubclass(type, BaseModel):
            return True
    except Exception:
        pass

    try:
        # valid list type
        if type.__origin__ is list and issubclass(type.__args__[0], BaseModel):
            return True
    except Exception:
        pass

    return False


def get_input_type(func: Callable) -> Type:
    """Returns the input type of a given function (callable).

    Args:
        func: The function for which to get the input type.

    Raises:
        ValueError: If the function does not have a valid input type annotation.
    """
    type_hints = get_type_hints(func)

    if "input" not in type_hints:
        raise ValueError(
            "The callable MUST have a parameter with the name `input` with typing annotation. "
            "For example: `def my_opyrator(input: InputModel) -> OutputModel:`."
        )

    input_type = type_hints["input"]

    if not is_compatible_type(input_type):
        raise ValueError(
            "The `input` parameter MUST be a subclass of the Pydantic BaseModel or a list of Pydantic models."
        )

    # TODO: return warning if more than one input parameters

    return input_type


def get_output_type(func: Callable) -> Type:
    """Returns the output type of a given function (callable).

    Args:
        func: The function for which to get the output type.

    Raises:
        ValueError: If the function does not have a valid output type annotation.
    """
    type_hints = get_type_hints(func)
    if "return" not in type_hints:
        raise ValueError(
            "The return type of the callable MUST be annotated with type hints."
            "For example: `def my_opyrator(input: InputModel) -> OutputModel:`."
        )

    output_type = type_hints["return"]

    if not is_compatible_type(output_type):
        raise ValueError(
            "The return value MUST be a subclass of the Pydantic BaseModel or a list of Pydantic models."
        )

    return output_type
def get_callable(import_string: str) -> Callable:
    """Import a callable from an string."""
    callable_seperator = ":"
    if callable_seperator not in import_string:
        # Use dot as seperator
        callable_seperator = "."

    if callable_seperator not in import_string:
        raise ValueError("The callable path MUST specify the function. ")

    mod_name, callable_name = import_string.rsplit(callable_seperator, 1)
    mod = importlib.import_module(mod_name)
    return getattr(mod, callable_name)
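A hedged example of the import-string convention above, which accepts either `module:function` or `module.function` notation. The target shown is the `synthesize` function this PR adds in `control/mkgui/app.py`; note that importing that module triggers its model-folder checks, so this sketch assumes the `data/ckpt` folders exist:

```python
# Both notations resolve to the same attribute lookup on the imported module:
fn_a = get_callable("control.mkgui.app:synthesize")
fn_b = get_callable("control.mkgui.app.synthesize")
assert fn_a is fn_b
```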

class Opyrator:
    def __init__(self, func: Union[Callable, str]) -> None:
        if isinstance(func, str):
            # Try to load the function from a string notion
            self.function = get_callable(func)
        else:
            self.function = func

        self._action = "Execute"
        self._input_type = None
        self._output_type = None

        if not callable(self.function):
            raise ValueError("The provided function parameters is not a callable.")

        if inspect.isclass(self.function):
            raise ValueError(
                "The provided callable is an uninitialized Class. This is not allowed."
            )

        if inspect.isfunction(self.function):
            # The provided callable is a function
            self._input_type = get_input_type(self.function)
            self._output_type = get_output_type(self.function)

            try:
                # Get name
                self._name = name_to_title(self.function.__name__)
            except Exception:
                pass

            try:
                # Get description from function
                doc_string = inspect.getdoc(self.function)
                if doc_string:
                    self._action = doc_string
            except Exception:
                pass
        elif hasattr(self.function, "__call__"):
            # The provided callable is a function
            self._input_type = get_input_type(self.function.__call__)  # type: ignore
            self._output_type = get_output_type(self.function.__call__)  # type: ignore

            try:
                # Get name
                self._name = name_to_title(type(self.function).__name__)
            except Exception:
                pass

            try:
                # Get action from
                doc_string = inspect.getdoc(self.function.__call__)  # type: ignore
                if doc_string:
                    self._action = doc_string

                if (
                    not self._action
                    or self._action == "Call"
                ):
                    # Get docstring from class instead of __call__ function
                    doc_string = inspect.getdoc(self.function)
                    if doc_string:
                        self._action = doc_string
            except Exception:
                pass
        else:
            raise ValueError("Unknown callable type.")

    @property
    def name(self) -> str:
        return self._name

    @property
    def action(self) -> str:
        return self._action

    @property
    def input_type(self) -> Any:
        return self._input_type

    @property
    def output_type(self) -> Any:
        return self._output_type

    def __call__(self, input: Any, **kwargs: Any) -> Any:

        input_obj = input

        if isinstance(input, str):
            # Allow json input
            input_obj = parse_raw_as(self.input_type, input)

        if isinstance(input, dict):
            # Allow dict input
            input_obj = parse_obj_as(self.input_type, input)

        return self.function(input_obj, **kwargs)
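Putting the pieces together, `Opyrator` wraps any `input`-annotated function and accepts JSON strings, dicts, or already-parsed models when called. A hedged, self-contained sketch with toy models (not the repo's audio types):

```python
from pydantic import BaseModel

class In(BaseModel):
    text: str

class Out(BaseModel):
    length: int

def count_chars(input: In) -> Out:
    """count_chars(统计字符数)"""
    return Out(length=len(input.text))

op = Opyrator(count_chars)
print(op.name)                          # Count Chars
print(op('{"text": "hello"}').length)   # 5 -- JSON string is parsed first
print(op({"text": "hi"}).length)        # 2 -- dicts work too
```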
1 control/mkgui/base/ui/__init__.py (new file)

@@ -0,0 +1 @@
from .streamlit_ui import render_streamlit_ui
135
control/mkgui/base/ui/schema_utils.py
Normal file
135
control/mkgui/base/ui/schema_utils.py
Normal file
@@ -0,0 +1,135 @@
|
||||
from typing import Dict
|
||||
|
||||
|
||||
def resolve_reference(reference: str, references: Dict) -> Dict:
|
||||
return references[reference.split("/")[-1]]
|
||||
|
||||
|
||||
def get_single_reference_item(property: Dict, references: Dict) -> Dict:
|
||||
# Ref can either be directly in the properties or the first element of allOf
|
||||
reference = property.get("$ref")
|
||||
if reference is None:
|
||||
reference = property["allOf"][0]["$ref"]
|
||||
return resolve_reference(reference, references)
|
||||
|
||||
|
||||
def is_single_string_property(property: Dict) -> bool:
|
||||
return property.get("type") == "string"
|
||||
|
||||
|
||||
def is_single_datetime_property(property: Dict) -> bool:
|
||||
if property.get("type") != "string":
|
||||
return False
|
||||
return property.get("format") in ["date-time", "time", "date"]
|
||||
|
||||
|
||||
def is_single_boolean_property(property: Dict) -> bool:
|
||||
return property.get("type") == "boolean"
|
||||
|
||||
|
||||
def is_single_number_property(property: Dict) -> bool:
|
||||
return property.get("type") in ["integer", "number"]
|
||||
|
||||
|
||||
def is_single_file_property(property: Dict) -> bool:
|
||||
if property.get("type") != "string":
|
||||
return False
|
||||
# TODO: binary?
|
||||
return property.get("format") == "byte"
|
||||
|
||||
def is_single_autio_property(property: Dict) -> bool:
|
||||
if property.get("type") != "string":
|
||||
return False
|
||||
# TODO: binary?
|
||||
return property.get("format") == "bytes"


def is_single_directory_property(property: Dict) -> bool:
    if property.get("type") != "string":
        return False
    return property.get("format") == "path"


def is_multi_enum_property(property: Dict, references: Dict) -> bool:
    if property.get("type") != "array":
        return False

    if property.get("uniqueItems") is not True:
        # Only relevant if it is a set or other datastructures with unique items
        return False

    try:
        _ = resolve_reference(property["items"]["$ref"], references)["enum"]
        return True
    except Exception:
        return False


def is_single_enum_property(property: Dict, references: Dict) -> bool:
    try:
        _ = get_single_reference_item(property, references)["enum"]
        return True
    except Exception:
        return False


def is_single_dict_property(property: Dict) -> bool:
    if property.get("type") != "object":
        return False
    return "additionalProperties" in property


def is_single_reference(property: Dict) -> bool:
    if property.get("type") is not None:
        return False

    return bool(property.get("$ref"))


def is_multi_file_property(property: Dict) -> bool:
    if property.get("type") != "array":
        return False

    if property.get("items") is None:
        return False

    try:
        # TODO: binary
        return property["items"]["format"] == "byte"
    except Exception:
        return False


def is_single_object(property: Dict, references: Dict) -> bool:
    try:
        object_reference = get_single_reference_item(property, references)
        if object_reference["type"] != "object":
            return False
        return "properties" in object_reference
    except Exception:
        return False


def is_property_list(property: Dict) -> bool:
    if property.get("type") != "array":
        return False

    if property.get("items") is None:
        return False

    try:
        return property["items"]["type"] in ["string", "number", "integer"]
    except Exception:
        return False


def is_object_list_property(property: Dict, references: Dict) -> bool:
    if property.get("type") != "array":
        return False

    try:
        object_reference = resolve_reference(property["items"]["$ref"], references)
        if object_reference["type"] != "object":
            return False
        return "properties" in object_reference
    except Exception:
        return False
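
Taken together, these predicates classify the JSON schema that pydantic v1 generates for a model's fields, so the UI layer can pick a widget per field. A minimal sketch of how they are meant to be driven against the helpers above (the `Color`/`Example` models are hypothetical, only for illustration):

from enum import Enum
from pydantic import BaseModel

class Color(str, Enum):
    RED = "red"
    BLUE = "blue"

class Example(BaseModel):
    name: str
    amount: float
    color: Color

schema = Example.schema(by_alias=True)
properties = schema.get("properties", {})
references = schema.get("definitions", {})

# "name" is a plain string, "amount" a number, and "color" resolves through
# its $ref (or allOf[0].$ref) into the Color enum definition.
print(is_single_string_property(properties["name"]))              # True
print(is_single_number_property(properties["amount"]))            # True
print(is_single_enum_property(properties["color"], references))   # True
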
933
control/mkgui/base/ui/streamlit_ui.py
Normal file
933
control/mkgui/base/ui/streamlit_ui.py
Normal file
@@ -0,0 +1,933 @@
import datetime
import inspect
import mimetypes
import sys
from os import getcwd, unlink, path
from platform import system
from tempfile import NamedTemporaryFile
from typing import Any, Callable, Dict, List, Type
from PIL import Image

import pandas as pd
import streamlit as st
from fastapi.encoders import jsonable_encoder
from loguru import logger
from pydantic import BaseModel, ValidationError, parse_obj_as

from control.mkgui.base import Opyrator
from control.mkgui.base.core import name_to_title
from . import schema_utils
from .streamlit_utils import CUSTOM_STREAMLIT_CSS

STREAMLIT_RUNNER_SNIPPET = """
from control.mkgui.base.ui import render_streamlit_ui

import streamlit as st

# TODO: Make it configurable
# Page config can only be setup once
st.set_page_config(
    page_title="MockingBird",
    page_icon="🧊",
    layout="wide")

render_streamlit_ui()
"""

# with st.spinner("Loading MockingBird GUI. Please wait..."):
#     opyrator = Opyrator("{opyrator_path}")


def launch_ui(port: int = 8501) -> None:
    with NamedTemporaryFile(
        suffix=".py", mode="w", encoding="utf-8", delete=False
    ) as f:
        f.write(STREAMLIT_RUNNER_SNIPPET)
        f.seek(0)

    import subprocess

    python_path = f'PYTHONPATH="$PYTHONPATH:{getcwd()}"'
    if system() == "Windows":
        python_path = f"set PYTHONPATH=%PYTHONPATH%;{getcwd()} &&"
        subprocess.run(
            f"""set STREAMLIT_GLOBAL_SHOW_WARNING_ON_DIRECT_EXECUTION=false""",
            shell=True,
        )

    subprocess.run(
        f"""{python_path} "{sys.executable}" -m streamlit run --server.port={port} --server.headless=True --runner.magicEnabled=False --server.maxUploadSize=50 --browser.gatherUsageStats=False {f.name}""",
        shell=True,
    )

    f.close()
    unlink(f.name)

def function_has_named_arg(func: Callable, parameter: str) -> bool:
    try:
        sig = inspect.signature(func)
        for param in sig.parameters.values():
            if param.name == parameter:
                return True
    except Exception:
        return False
    return False


def has_output_ui_renderer(data_item: BaseModel) -> bool:
    return hasattr(data_item, "render_output_ui")


def has_input_ui_renderer(input_class: Type[BaseModel]) -> bool:
    return hasattr(input_class, "render_input_ui")


def is_compatible_audio(mime_type: str) -> bool:
    return mime_type in ["audio/mpeg", "audio/ogg", "audio/wav"]


def is_compatible_image(mime_type: str) -> bool:
    return mime_type in ["image/png", "image/jpeg"]


def is_compatible_video(mime_type: str) -> bool:
    return mime_type in ["video/mp4"]


class InputUI:
    def __init__(self, session_state, input_class: Type[BaseModel]):
        self._session_state = session_state
        self._input_class = input_class

        self._schema_properties = input_class.schema(by_alias=True).get(
            "properties", {}
        )
        self._schema_references = input_class.schema(by_alias=True).get(
            "definitions", {}
        )

    def render_ui(self, streamlit_app_root) -> None:
        if has_input_ui_renderer(self._input_class):
            # The input model has a rendering function
            # The rendering also returns the current state of input data
            self._session_state.input_data = self._input_class.render_input_ui(  # type: ignore
                st, self._session_state.input_data
            )
            return

        # print(self._schema_properties)
        for property_key in self._schema_properties.keys():
            property = self._schema_properties[property_key]

            if not property.get("title"):
                # Set property key as fallback title
                property["title"] = name_to_title(property_key)

            try:
                if "input_data" in self._session_state:
                    self._store_value(
                        property_key,
                        self._render_property(streamlit_app_root, property_key, property),
                    )
            except Exception as e:
                print("Exception!", e)
                pass

    def _get_default_streamlit_input_kwargs(self, key: str, property: Dict) -> Dict:
        streamlit_kwargs = {
            "label": property.get("title"),
            "key": key,
        }

        if property.get("description"):
            streamlit_kwargs["help"] = property.get("description")
        return streamlit_kwargs

    def _store_value(self, key: str, value: Any) -> None:
        data_element = self._session_state.input_data
        key_elements = key.split(".")
        for i, key_element in enumerate(key_elements):
            if i == len(key_elements) - 1:
                # add value to this element
                data_element[key_element] = value
                return
            if key_element not in data_element:
                data_element[key_element] = {}
            data_element = data_element[key_element]

    def _get_value(self, key: str) -> Any:
        data_element = self._session_state.input_data
        key_elements = key.split(".")
        for i, key_element in enumerate(key_elements):
            if i == len(key_elements) - 1:
                # read the value from this element
                if key_element not in data_element:
                    return None
                return data_element[key_element]
            if key_element not in data_element:
                data_element[key_element] = {}
            data_element = data_element[key_element]
        return None

    def _render_single_datetime_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)

        if property.get("format") == "time":
            if property.get("default"):
                try:
                    streamlit_kwargs["value"] = datetime.time.fromisoformat(  # type: ignore
                        property.get("default")
                    )
                except Exception:
                    pass
            return streamlit_app.time_input(**streamlit_kwargs)
        elif property.get("format") == "date":
            if property.get("default"):
                try:
                    streamlit_kwargs["value"] = datetime.date.fromisoformat(  # type: ignore
                        property.get("default")
                    )
                except Exception:
                    pass
            return streamlit_app.date_input(**streamlit_kwargs)
        elif property.get("format") == "date-time":
            if property.get("default"):
                try:
                    streamlit_kwargs["value"] = datetime.datetime.fromisoformat(  # type: ignore
                        property.get("default")
                    )
                except Exception:
                    pass
            with streamlit_app.container():
                streamlit_app.subheader(streamlit_kwargs.get("label"))
                if streamlit_kwargs.get("description"):
                    streamlit_app.text(streamlit_kwargs.get("description"))
                selected_date = None
                selected_time = None
                date_col, time_col = streamlit_app.columns(2)
                with date_col:
                    date_kwargs = {"label": "Date", "key": key + "-date-input"}
                    if streamlit_kwargs.get("value"):
                        try:
                            date_kwargs["value"] = streamlit_kwargs.get(  # type: ignore
                                "value"
                            ).date()
                        except Exception:
                            pass
                    selected_date = streamlit_app.date_input(**date_kwargs)

                with time_col:
                    time_kwargs = {"label": "Time", "key": key + "-time-input"}
                    if streamlit_kwargs.get("value"):
                        try:
                            time_kwargs["value"] = streamlit_kwargs.get(  # type: ignore
                                "value"
                            ).time()
                        except Exception:
                            pass
                    selected_time = streamlit_app.time_input(**time_kwargs)
                return datetime.datetime.combine(selected_date, selected_time)
        else:
            streamlit_app.warning(
                "Date format is not supported: " + str(property.get("format"))
            )

    def _render_single_file_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
        file_extension = None
        if "mime_type" in property:
            file_extension = mimetypes.guess_extension(property["mime_type"])

        if "is_recorder" in property:
            from audio_recorder_streamlit import audio_recorder
            audio_bytes = audio_recorder()
            if audio_bytes:
                streamlit_app.audio(audio_bytes, format="audio/wav")
            return audio_bytes

        uploaded_file = streamlit_app.file_uploader(
            **streamlit_kwargs, accept_multiple_files=False, type=file_extension
        )
        if uploaded_file is None:
            return None

        bytes = uploaded_file.getvalue()
        if property.get("mime_type"):
            if is_compatible_audio(property["mime_type"]):
                # Show audio
                streamlit_app.audio(bytes, format=property.get("mime_type"))
            if is_compatible_image(property["mime_type"]):
                # Show image
                streamlit_app.image(bytes)
            if is_compatible_video(property["mime_type"]):
                # Show video
                streamlit_app.video(bytes, format=property.get("mime_type"))
        return bytes

    def _render_single_audio_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        # streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
        from audio_recorder_streamlit import audio_recorder
        audio_bytes = audio_recorder()
        if audio_bytes:
            streamlit_app.audio(audio_bytes, format="audio/wav")
        return audio_bytes

        # file_extension = None
        # if "mime_type" in property:
        #     file_extension = mimetypes.guess_extension(property["mime_type"])

        # uploaded_file = streamlit_app.file_uploader(
        #     **streamlit_kwargs, accept_multiple_files=False, type=file_extension
        # )
        # if uploaded_file is None:
        #     return None

        # bytes = uploaded_file.getvalue()
        # if property.get("mime_type"):
        #     if is_compatible_audio(property["mime_type"]):
        #         # Show audio
        #         streamlit_app.audio(bytes, format=property.get("mime_type"))
        #     if is_compatible_image(property["mime_type"]):
        #         # Show image
        #         streamlit_app.image(bytes)
        #     if is_compatible_video(property["mime_type"]):
        #         # Show video
        #         streamlit_app.video(bytes, format=property.get("mime_type"))
        # return bytes

    def _render_single_string_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)

        if property.get("default"):
            streamlit_kwargs["value"] = property.get("default")
        elif property.get("example"):
            # TODO: also use example for other property types
            # Use example as value if it is provided
            streamlit_kwargs["value"] = property.get("example")

        if property.get("maxLength") is not None:
            streamlit_kwargs["max_chars"] = property.get("maxLength")

        if (
            property.get("format")
            or (
                property.get("maxLength") is not None
                and int(property.get("maxLength")) < 140  # type: ignore
            )
            or property.get("writeOnly")
        ):
            # If any format is set, use single text input
            # If max chars is set to less than 140, use single text input
            # If write only -> password field
            if property.get("writeOnly"):
                streamlit_kwargs["type"] = "password"
            return streamlit_app.text_input(**streamlit_kwargs)
        else:
            # Otherwise use multiline text area
            return streamlit_app.text_area(**streamlit_kwargs)

    def _render_multi_enum_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
        reference_item = schema_utils.resolve_reference(
            property["items"]["$ref"], self._schema_references
        )
        # TODO: how to select defaults
        return streamlit_app.multiselect(
            **streamlit_kwargs, options=reference_item["enum"]
        )

    def _render_single_enum_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
        reference_item = schema_utils.get_single_reference_item(
            property, self._schema_references
        )

        if property.get("default") is not None:
            try:
                streamlit_kwargs["index"] = reference_item["enum"].index(
                    property.get("default")
                )
            except Exception:
                # Use default selection
                pass

        return streamlit_app.selectbox(
            **streamlit_kwargs, options=reference_item["enum"]
        )

    def _render_single_dict_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        # Add title and subheader
        streamlit_app.subheader(property.get("title"))
        if property.get("description"):
            streamlit_app.markdown(property.get("description"))

        streamlit_app.markdown("---")

        current_dict = self._get_value(key)
        if not current_dict:
            current_dict = {}

        key_col, value_col = streamlit_app.columns(2)

        with key_col:
            updated_key = streamlit_app.text_input(
                "Key", value="", key=key + "-new-key"
            )

        with value_col:
            # TODO: also add boolean?
            value_kwargs = {"label": "Value", "key": key + "-new-value"}
            if property["additionalProperties"].get("type") == "integer":
                value_kwargs["value"] = 0  # type: ignore
                updated_value = streamlit_app.number_input(**value_kwargs)
            elif property["additionalProperties"].get("type") == "number":
                value_kwargs["value"] = 0.0  # type: ignore
                value_kwargs["format"] = "%f"
                updated_value = streamlit_app.number_input(**value_kwargs)
            else:
                value_kwargs["value"] = ""
                updated_value = streamlit_app.text_input(**value_kwargs)

        streamlit_app.markdown("---")

        with streamlit_app.container():
            clear_col, add_col = streamlit_app.columns([1, 2])

            with clear_col:
                if streamlit_app.button("Clear Items", key=key + "-clear-items"):
                    current_dict = {}

            with add_col:
                if (
                    streamlit_app.button("Add Item", key=key + "-add-item")
                    and updated_key
                ):
                    current_dict[updated_key] = updated_value

        streamlit_app.write(current_dict)

        return current_dict

    def _render_single_reference(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        reference_item = schema_utils.get_single_reference_item(
            property, self._schema_references
        )
        return self._render_property(streamlit_app, key, reference_item)

    def _render_multi_file_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)

        file_extension = None
        if "mime_type" in property:
            file_extension = mimetypes.guess_extension(property["mime_type"])

        uploaded_files = streamlit_app.file_uploader(
            **streamlit_kwargs, accept_multiple_files=True, type=file_extension
        )
        uploaded_files_bytes = []
        if uploaded_files:
            for uploaded_file in uploaded_files:
                uploaded_files_bytes.append(uploaded_file.read())
        return uploaded_files_bytes

    def _render_single_boolean_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)

        if property.get("default"):
            streamlit_kwargs["value"] = property.get("default")
        return streamlit_app.checkbox(**streamlit_kwargs)

    def _render_single_number_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)

        number_transform = int
        if property.get("type") == "number":
            number_transform = float  # type: ignore
            streamlit_kwargs["format"] = "%f"

        if "multipleOf" in property:
            # Set step count based on the multipleOf parameter
            streamlit_kwargs["step"] = number_transform(property["multipleOf"])
        elif number_transform == int:
            # Set step size to 1 as default
            streamlit_kwargs["step"] = 1
        elif number_transform == float:
            # Set step size to 0.01 as default
            # TODO: adapt to default value
            streamlit_kwargs["step"] = 0.01

        if "minimum" in property:
            streamlit_kwargs["min_value"] = number_transform(property["minimum"])
        if "exclusiveMinimum" in property:
            streamlit_kwargs["min_value"] = number_transform(
                property["exclusiveMinimum"] + streamlit_kwargs["step"]
            )
        if "maximum" in property:
            streamlit_kwargs["max_value"] = number_transform(property["maximum"])

        if "exclusiveMaximum" in property:
            streamlit_kwargs["max_value"] = number_transform(
                property["exclusiveMaximum"] - streamlit_kwargs["step"]
            )

        if property.get("default") is not None:
            streamlit_kwargs["value"] = number_transform(property.get("default"))  # type: ignore
        else:
            if "min_value" in streamlit_kwargs:
                streamlit_kwargs["value"] = streamlit_kwargs["min_value"]
            elif number_transform == int:
                streamlit_kwargs["value"] = 0
            else:
                # Set default value to step
                streamlit_kwargs["value"] = number_transform(streamlit_kwargs["step"])

        if "min_value" in streamlit_kwargs and "max_value" in streamlit_kwargs:
            # TODO: Only if less than X steps
            return streamlit_app.slider(**streamlit_kwargs)
        else:
            return streamlit_app.number_input(**streamlit_kwargs)

    def _render_object_input(self, streamlit_app: st, key: str, property: Dict) -> Any:
        properties = property["properties"]
        object_inputs = {}
        for property_key in properties:
            property = properties[property_key]
            if not property.get("title"):
                # Set property key as fallback title
                property["title"] = name_to_title(property_key)
            # construct full key based on key parts -> required later to get the value
            full_key = key + "." + property_key
            object_inputs[property_key] = self._render_property(
                streamlit_app, full_key, property
            )
        return object_inputs

    def _render_single_object_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        # Add title and subheader
        title = property.get("title")
        streamlit_app.subheader(title)
        if property.get("description"):
            streamlit_app.markdown(property.get("description"))

        object_reference = schema_utils.get_single_reference_item(
            property, self._schema_references
        )
        return self._render_object_input(streamlit_app, key, object_reference)

    def _render_property_list_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        # Add title and subheader
        streamlit_app.subheader(property.get("title"))
        if property.get("description"):
            streamlit_app.markdown(property.get("description"))

        streamlit_app.markdown("---")

        current_list = self._get_value(key)
        if not current_list:
            current_list = []

        value_kwargs = {"label": "Value", "key": key + "-new-value"}
        if property["items"]["type"] == "integer":
            value_kwargs["value"] = 0  # type: ignore
            new_value = streamlit_app.number_input(**value_kwargs)
        elif property["items"]["type"] == "number":
            value_kwargs["value"] = 0.0  # type: ignore
            value_kwargs["format"] = "%f"
            new_value = streamlit_app.number_input(**value_kwargs)
        else:
            value_kwargs["value"] = ""
            new_value = streamlit_app.text_input(**value_kwargs)

        streamlit_app.markdown("---")

        with streamlit_app.container():
            clear_col, add_col = streamlit_app.columns([1, 2])

            with clear_col:
                if streamlit_app.button("Clear Items", key=key + "-clear-items"):
                    current_list = []

            with add_col:
                if (
                    streamlit_app.button("Add Item", key=key + "-add-item")
                    and new_value is not None
                ):
                    current_list.append(new_value)

        streamlit_app.write(current_list)

        return current_list

    def _render_object_list_input(
        self, streamlit_app: st, key: str, property: Dict
    ) -> Any:
        # TODO: support max_items, and min_items properties

        # Add title and subheader
        streamlit_app.subheader(property.get("title"))
        if property.get("description"):
            streamlit_app.markdown(property.get("description"))

        streamlit_app.markdown("---")

        current_list = self._get_value(key)
        if not current_list:
            current_list = []

        object_reference = schema_utils.resolve_reference(
            property["items"]["$ref"], self._schema_references
        )
        input_data = self._render_object_input(streamlit_app, key, object_reference)

        streamlit_app.markdown("---")

        with streamlit_app.container():
            clear_col, add_col = streamlit_app.columns([1, 2])

            with clear_col:
                if streamlit_app.button("Clear Items", key=key + "-clear-items"):
                    current_list = []

            with add_col:
                if (
                    streamlit_app.button("Add Item", key=key + "-add-item")
                    and input_data
                ):
                    current_list.append(input_data)

        streamlit_app.write(current_list)
        return current_list

    def _render_property(self, streamlit_app: st, key: str, property: Dict) -> Any:
        if schema_utils.is_single_enum_property(property, self._schema_references):
            return self._render_single_enum_input(streamlit_app, key, property)

        if schema_utils.is_multi_enum_property(property, self._schema_references):
            return self._render_multi_enum_input(streamlit_app, key, property)

        if schema_utils.is_single_file_property(property):
            return self._render_single_file_input(streamlit_app, key, property)

        if schema_utils.is_multi_file_property(property):
            return self._render_multi_file_input(streamlit_app, key, property)

        if schema_utils.is_single_datetime_property(property):
            return self._render_single_datetime_input(streamlit_app, key, property)

        if schema_utils.is_single_boolean_property(property):
            return self._render_single_boolean_input(streamlit_app, key, property)

        if schema_utils.is_single_dict_property(property):
            return self._render_single_dict_input(streamlit_app, key, property)

        if schema_utils.is_single_number_property(property):
            return self._render_single_number_input(streamlit_app, key, property)

        if schema_utils.is_single_string_property(property):
            return self._render_single_string_input(streamlit_app, key, property)

        if schema_utils.is_single_object(property, self._schema_references):
            return self._render_single_object_input(streamlit_app, key, property)

        if schema_utils.is_object_list_property(property, self._schema_references):
            return self._render_object_list_input(streamlit_app, key, property)

        if schema_utils.is_property_list(property):
            return self._render_property_list_input(streamlit_app, key, property)

        if schema_utils.is_single_reference(property):
            return self._render_single_reference(streamlit_app, key, property)

        streamlit_app.warning(
            "The type of the following property is currently not supported: "
            + str(property.get("title"))
        )
        raise Exception("Unsupported property")


class OutputUI:
    def __init__(self, output_data: Any, input_data: Any):
        self._output_data = output_data
        self._input_data = input_data

    def render_ui(self, streamlit_app) -> None:
        try:
            if isinstance(self._output_data, BaseModel):
                self._render_single_output(streamlit_app, self._output_data)
                return
            if isinstance(self._output_data, list):
                self._render_list_output(streamlit_app, self._output_data)
                return
        except Exception as ex:
            streamlit_app.exception(ex)
        # Fallback to JSON rendering
        streamlit_app.json(jsonable_encoder(self._output_data))

    def _render_single_text_property(
        self, streamlit: st, property_schema: Dict, value: Any
    ) -> None:
        # Add title and subheader
        streamlit.subheader(property_schema.get("title"))
        if property_schema.get("description"):
            streamlit.markdown(property_schema.get("description"))
        if value is None or value == "":
            streamlit.info("No value returned!")
        else:
            streamlit.code(str(value), language="plain")

    def _render_single_file_property(
        self, streamlit: st, property_schema: Dict, value: Any
    ) -> None:
        # Add title and subheader
        streamlit.subheader(property_schema.get("title"))
        if property_schema.get("description"):
            streamlit.markdown(property_schema.get("description"))
        if value is None or value == "":
            streamlit.info("No value returned!")
        else:
            # TODO: Detect if it is a FileContent instance
            # TODO: detect if it is base64
            file_extension = ""
            if "mime_type" in property_schema:
                mime_type = property_schema["mime_type"]
                file_extension = mimetypes.guess_extension(mime_type) or ""

                if is_compatible_audio(mime_type):
                    streamlit.audio(value.as_bytes(), format=mime_type)
                    return

                if is_compatible_image(mime_type):
                    streamlit.image(value.as_bytes())
                    return

                if is_compatible_video(mime_type):
                    streamlit.video(value.as_bytes(), format=mime_type)
                    return

            filename = (
                (property_schema["title"] + file_extension)
                .lower()
                .strip()
                .replace(" ", "-")
            )
            streamlit.markdown(
                f'<a href="data:application/octet-stream;base64,{value}" download="{filename}"><input type="button" value="Download File"></a>',
                unsafe_allow_html=True,
            )

    def _render_single_complex_property(
        self, streamlit: st, property_schema: Dict, value: Any
    ) -> None:
        # Add title and subheader
        streamlit.subheader(property_schema.get("title"))
        if property_schema.get("description"):
            streamlit.markdown(property_schema.get("description"))

        streamlit.json(jsonable_encoder(value))

    def _render_single_output(self, streamlit: st, output_data: BaseModel) -> None:
        try:
            if has_output_ui_renderer(output_data):
                if function_has_named_arg(output_data.render_output_ui, "input"):  # type: ignore
                    # render method also requests the input data
                    output_data.render_output_ui(streamlit, input=self._input_data)  # type: ignore
                else:
                    output_data.render_output_ui(streamlit)  # type: ignore
                return
        except Exception:
            # Use default auto-generation methods if the custom rendering throws an exception
            logger.exception(
                "Failed to execute custom render_output_ui function. Using auto-generation instead"
            )

        model_schema = output_data.schema(by_alias=False)
        model_properties = model_schema.get("properties")
        definitions = model_schema.get("definitions")

        if model_properties:
            for property_key in output_data.__dict__:
                property_schema = model_properties.get(property_key)
                if not property_schema.get("title"):
                    # Set property key as fallback title
                    property_schema["title"] = property_key

                output_property_value = output_data.__dict__[property_key]

                if has_output_ui_renderer(output_property_value):
                    output_property_value.render_output_ui(streamlit)  # type: ignore
                    continue

                if isinstance(output_property_value, BaseModel):
                    # Render output recursively
                    streamlit.subheader(property_schema.get("title"))
                    if property_schema.get("description"):
                        streamlit.markdown(property_schema.get("description"))
                    self._render_single_output(streamlit, output_property_value)
                    continue

                if property_schema:
                    if schema_utils.is_single_file_property(property_schema):
                        self._render_single_file_property(
                            streamlit, property_schema, output_property_value
                        )
                        continue

                    if (
                        schema_utils.is_single_string_property(property_schema)
                        or schema_utils.is_single_number_property(property_schema)
                        or schema_utils.is_single_datetime_property(property_schema)
                        or schema_utils.is_single_boolean_property(property_schema)
                    ):
                        self._render_single_text_property(
                            streamlit, property_schema, output_property_value
                        )
                        continue
                    if definitions and schema_utils.is_single_enum_property(
                        property_schema, definitions
                    ):
                        self._render_single_text_property(
                            streamlit, property_schema, output_property_value.value
                        )
                        continue

                # TODO: render dict as table

                self._render_single_complex_property(
                    streamlit, property_schema, output_property_value
                )
        return

    def _render_list_output(self, streamlit: st, output_data: List) -> None:
        try:
            data_items: List = []
            for data_item in output_data:
                if has_output_ui_renderer(data_item):
                    # Render using the render function
                    data_item.render_output_ui(streamlit)  # type: ignore
                    continue
                data_items.append(data_item.dict())
            # Try to show as dataframe
            streamlit.table(pd.DataFrame(data_items))
        except Exception:
            # Fallback to JSON rendering
            streamlit.json(jsonable_encoder(output_data))


def getOpyrator(mode: str) -> Opyrator:
    if mode is None or mode.startswith('VC'):
        from control.mkgui.app_vc import convert
        return Opyrator(convert)
    if mode is None or mode.startswith('预处理'):
        from control.mkgui.preprocess import preprocess
        return Opyrator(preprocess)
    # Check the more specific "模型训练(VC)" prefix first; otherwise the plain
    # "模型训练" branch below would always match it as well.
    if mode is None or mode.startswith('模型训练(VC)'):
        from control.mkgui.train_vc import train_vc
        return Opyrator(train_vc)
    if mode is None or mode.startswith('模型训练'):
        from control.mkgui.train import train
        return Opyrator(train)
    from control.mkgui.app import synthesize
    return Opyrator(synthesize)


def render_streamlit_ui() -> None:
    # init
    session_state = st.session_state
    session_state.input_data = {}
    # Add custom css settings
    st.markdown(f"<style>{CUSTOM_STREAMLIT_CSS}</style>", unsafe_allow_html=True)

    with st.spinner("Loading MockingBird GUI. Please wait..."):
        session_state.mode = st.sidebar.selectbox(
            '模式选择',
            ("AI拟音", "VC拟音", "预处理", "模型训练", "模型训练(VC)")
        )
        if "mode" in session_state:
            mode = session_state.mode
        else:
            mode = ""
        opyrator = getOpyrator(mode)
        title = opyrator.name + mode

        col1, col2, _ = st.columns(3)
        col2.title(title)
        col2.markdown("欢迎使用MockingBird Web 2")

        image = Image.open(path.join('control', 'mkgui', 'static', 'mb.png'))
        col1.image(image)

        st.markdown("---")
        left, right = st.columns([0.4, 0.6])

        with left:
            st.header("Control 控制")
            # if session_state.mode in ["AI拟音", "VC拟音"] :
            #     from audiorecorder import audiorecorder
            #     audio = audiorecorder("Click to record", "Recording...")
            #     if len(audio) > 0:
            #         # To play audio in frontend:
            #         st.audio(audio.tobytes())

            InputUI(session_state=session_state, input_class=opyrator.input_type).render_ui(st)
            execute_selected = st.button(opyrator.action)
            if execute_selected:
                with st.spinner("Executing operation. Please wait..."):
                    try:
                        input_data_obj = parse_obj_as(
                            opyrator.input_type, session_state.input_data
                        )
                        session_state.output_data = opyrator(input=input_data_obj)
                        session_state.latest_operation_input = input_data_obj  # should this really be saved as additional session object?
                    except ValidationError as ex:
                        st.error(ex)
                    else:
                        # st.success("Operation executed successfully.")
                        pass

        with right:
            st.header("Result 结果")
            if 'output_data' in session_state:
                OutputUI(
                    session_state.output_data, session_state.latest_operation_input
                ).render_ui(st)
                if st.button("Clear"):
                    # Clear all state
                    for key in st.session_state.keys():
                        del st.session_state[key]
                    session_state.input_data = {}
                    st.experimental_rerun()
            else:
                # placeholder
                st.caption("请使用左侧控制板进行输入并运行获得结果")
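
For orientation, the widget mapping above works against any function wrapped in an Opyrator with pydantic input/output models. A minimal sketch of which widgets InputUI would pick (the `echo` function and its models are hypothetical, purely illustrative):

from pydantic import BaseModel, Field

class EchoInput(BaseModel):
    text: str = Field(..., description="No format/maxLength -> rendered as a text area")
    repeat: int = Field(1, ge=1, le=10, description="min and max present -> rendered as a slider")
    loud: bool = Field(False, description="Boolean -> rendered as a checkbox")

class EchoOutput(BaseModel):
    __root__: str

def echo(input: EchoInput) -> EchoOutput:
    text = input.text.upper() if input.loud else input.text
    return EchoOutput(__root__=" ".join([text] * input.repeat))

# Wrapped as Opyrator(echo), the page code above would call
# InputUI(session_state=st.session_state, input_class=EchoInput).render_ui(st)
# and, once executed, OutputUI(output_data, input_data).render_ui(st).
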
13
control/mkgui/base/ui/streamlit_utils.py
Normal file
13
control/mkgui/base/ui/streamlit_utils.py
Normal file
@@ -0,0 +1,13 @@
CUSTOM_STREAMLIT_CSS = """
div[data-testid="stBlock"] button {
  width: 100% !important;
  margin-bottom: 20px !important;
  border-color: #bfbfbf !important;
}
section[data-testid="stSidebar"] div {
  max-width: 10rem;
}
pre code {
  white-space: pre-wrap;
}
"""
96
control/mkgui/preprocess.py
Normal file
96
control/mkgui/preprocess.py
Normal file
@@ -0,0 +1,96 @@
from pydantic import BaseModel, Field
import os
from pathlib import Path
from enum import Enum
from typing import Any, Tuple


# Constants
EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"


if os.path.isdir(EXT_MODELS_DIRT):
    extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded extractor models: " + str(len(extractors)))
else:
    raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")

if os.path.isdir(ENC_MODELS_DIRT):
    encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded encoder models: " + str(len(encoders)))
else:
    raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")


class Model(str, Enum):
    VC_PPG2MEL = "ppg2mel"


class Dataset(str, Enum):
    AIDATATANG_200ZH = "aidatatang_200zh"
    AIDATATANG_200ZH_S = "aidatatang_200zh_s"


class Input(BaseModel):
    # def render_input_ui(st, input) -> Dict:
    #     input["selected_dataset"] = st.selectbox(
    #         '选择数据集',
    #         ("aidatatang_200zh", "aidatatang_200zh_s")
    #     )
    #     return input
    model: Model = Field(
        Model.VC_PPG2MEL, title="目标模型",
    )
    dataset: Dataset = Field(
        Dataset.AIDATATANG_200ZH, title="数据集选择",
    )
    datasets_root: str = Field(
        ..., alias="数据集根目录", description="输入数据集根目录(相对/绝对)",
        format=True,
        example="..\\training_data\\"
    )
    output_root: str = Field(
        ..., alias="输出根目录", description="输出结果根目录(相对/绝对)",
        format=True,
        example="..\\training_data\\"
    )
    n_processes: int = Field(
        2, alias="处理线程数", description="根据CPU线程数来设置",
        le=32, ge=1
    )
    extractor: extractors = Field(
        ..., alias="特征提取模型",
        description="选择PPG特征提取模型文件."
    )
    encoder: encoders = Field(
        ..., alias="语音编码模型",
        description="选择语音编码模型文件."
    )


class AudioEntity(BaseModel):
    content: bytes
    mel: Any


class Output(BaseModel):
    __root__: Tuple[str, int]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.
        If this method is implemented, it will be used instead of the default Output UI renderer.
        """
        sr, count = self.__root__
        streamlit_app.subheader(f"Dataset {sr} done, processed a total of {count} items")


def preprocess(input: Input) -> Output:
    """Preprocess(预处理)"""
    finished = 0
    if input.model == Model.VC_PPG2MEL:
        from models.ppg2mel.preprocess import preprocess_dataset
        finished = preprocess_dataset(
            datasets_root=Path(input.datasets_root),
            dataset=input.dataset,
            out_dir=Path(input.output_root),
            n_processes=input.n_processes,
            ppg_encoder_model_fpath=Path(input.extractor.value),
            speaker_encoder_model=Path(input.encoder.value)
        )
    # TODO: pass useful return code
    return Output(__root__=(input.dataset, finished))
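
Because the GUI builds its form from this Input model, the same entry point can also be driven programmatically. A sketch, assuming pretrained checkpoints exist under data/ckpt so the extractors/encoders enums are non-empty (paths are illustrative; fields are addressed via their Chinese aliases, so parse_obj is the safest way to construct the model):

from control.mkgui.preprocess import Input, preprocess, extractors, encoders

params = Input.parse_obj({
    "数据集根目录": "../training_data/",    # datasets_root
    "输出根目录": "../training_data/",      # output_root
    "处理线程数": 2,                        # n_processes
    "特征提取模型": list(extractors)[0],    # extractor: first discovered .pt checkpoint
    "语音编码模型": list(encoders)[0],      # encoder
})
result = preprocess(params)
print(result.__root__)  # (dataset name, number of processed items)
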
106
control/mkgui/train.py
Normal file
106
control/mkgui/train.py
Normal file
@@ -0,0 +1,106 @@
from pydantic import BaseModel, Field
import os
from pathlib import Path
from enum import Enum
from typing import Any
from models.synthesizer.hparams import hparams
from models.synthesizer.train import train as synt_train

# Constants
SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"


# EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
# CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
# ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"

# Pre-load models
if os.path.isdir(SYN_MODELS_DIRT):
    synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded synthesizer models: " + str(len(synthesizers)))
else:
    raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")

if os.path.isdir(ENC_MODELS_DIRT):
    encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded encoder models: " + str(len(encoders)))
else:
    raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")


class Model(str, Enum):
    DEFAULT = "default"


class Input(BaseModel):
    model: Model = Field(
        Model.DEFAULT, title="模型类型",
    )
    # datasets_root: str = Field(
    #     ..., alias="预处理数据根目录", description="输入目录(相对/绝对),不适用于ppg2mel模型",
    #     format=True,
    #     example="..\\training_data\\"
    # )
    input_root: str = Field(
        ..., alias="输入目录", description="预处理数据根目录",
        format=True,
        example=f"..{os.sep}audiodata{os.sep}SV2TTS{os.sep}synthesizer"
    )
    run_id: str = Field(
        "", alias="新模型名/运行ID", description="使用新ID进行重新训练,否则选择下面的模型进行继续训练",
    )
    synthesizer: synthesizers = Field(
        ..., alias="已有合成模型",
        description="选择语音合成模型文件."
    )
    gpu: bool = Field(
        True, alias="GPU训练", description="选择“是”,则使用GPU训练",
    )
    verbose: bool = Field(
        True, alias="打印详情", description="选择“是”,输出更多详情",
    )
    encoder: encoders = Field(
        ..., alias="语音编码模型",
        description="选择语音编码模型文件."
    )
    save_every: int = Field(
        1000, alias="更新间隔", description="每隔n步则更新一次模型",
    )
    backup_every: int = Field(
        10000, alias="保存间隔", description="每隔n步则保存一次模型",
    )
    log_every: int = Field(
        500, alias="打印间隔", description="每隔n步则打印一次训练统计",
    )


class AudioEntity(BaseModel):
    content: bytes
    mel: Any


class Output(BaseModel):
    __root__: int

    def render_output_ui(self, streamlit_app) -> None:  # type: ignore
        """Custom output UI.
        If this method is implemented, it will be used instead of the default Output UI renderer.
        """
        streamlit_app.subheader(f"Training started with code: {self.__root__}")


def train(input: Input) -> Output:
    """Train(训练)"""
    print(">>> Start training ...")
    force_restart = len(input.run_id) > 0
    if not force_restart:
        input.run_id = Path(input.synthesizer.value).name.split('.')[0]

    synt_train(
        input.run_id,
        input.input_root,
        f"data{os.sep}ckpt{os.sep}synthesizer",
        input.save_every,
        input.backup_every,
        input.log_every,
        force_restart,
        hparams
    )
    return Output(__root__=0)
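
Note that run_id doubles as the restart switch: a non-empty value forces a fresh run under that name, while an empty one reuses the selected checkpoint's name and continues training from it. A hypothetical sketch of the continue-training path, assuming checkpoints were discovered under data/ckpt:

from control.mkgui.train import Input, train, synthesizers, encoders

params = Input.parse_obj({
    "输入目录": "../audiodata/SV2TTS/synthesizer",  # input_root: preprocessed data
    "新模型名/运行ID": "",                          # empty -> continue from the chosen synthesizer
    "已有合成模型": list(synthesizers)[0],          # synthesizer checkpoint to continue from
    "语音编码模型": list(encoders)[0],              # encoder
})
train(params)  # returns Output(__root__=0) once synt_train has run
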
155
control/mkgui/train_vc.py
Normal file
155
control/mkgui/train_vc.py
Normal file
@@ -0,0 +1,155 @@
from pydantic import BaseModel, Field
import os
from pathlib import Path
from enum import Enum
from typing import Any, Tuple
import numpy as np
from utils.hparams import HpsYaml
from utils.util import AttrDict
import torch

# Constants
EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"


if os.path.isdir(EXT_MODELS_DIRT):
    extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded extractor models: " + str(len(extractors)))
else:
    raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")

if os.path.isdir(CONV_MODELS_DIRT):
    convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth")))
    print("Loaded convertor models: " + str(len(convertors)))
else:
    raise Exception(f"Model folder {CONV_MODELS_DIRT} doesn't exist.")

if os.path.isdir(ENC_MODELS_DIRT):
    encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded encoder models: " + str(len(encoders)))
else:
    raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")


class Model(str, Enum):
    VC_PPG2MEL = "ppg2mel"


class Dataset(str, Enum):
    AIDATATANG_200ZH = "aidatatang_200zh"
    AIDATATANG_200ZH_S = "aidatatang_200zh_s"


class Input(BaseModel):
    # def render_input_ui(st, input) -> Dict:
    #     input["selected_dataset"] = st.selectbox(
    #         '选择数据集',
    #         ("aidatatang_200zh", "aidatatang_200zh_s")
    #     )
    #     return input
    model: Model = Field(
        Model.VC_PPG2MEL, title="模型类型",
    )
    # datasets_root: str = Field(
    #     ..., alias="预处理数据根目录", description="输入目录(相对/绝对),不适用于ppg2mel模型",
    #     format=True,
    #     example="..\\training_data\\"
    # )
    output_root: str = Field(
        ..., alias="输出目录(可选)", description="建议不填,保持默认",
        format=True,
        example=""
    )
    continue_mode: bool = Field(
        True, alias="继续训练模式", description="选择“是”,则从下面选择的模型中继续训练",
    )
    gpu: bool = Field(
        True, alias="GPU训练", description="选择“是”,则使用GPU训练",
    )
    verbose: bool = Field(
        True, alias="打印详情", description="选择“是”,输出更多详情",
    )
    # TODO: Move to hidden fields by default
    convertor: convertors = Field(
        ..., alias="转换模型",
        description="选择语音转换模型文件."
    )
    extractor: extractors = Field(
        ..., alias="特征提取模型",
        description="选择PPG特征提取模型文件."
    )
    encoder: encoders = Field(
        ..., alias="语音编码模型",
        description="选择语音编码模型文件."
    )
    njobs: int = Field(
        8, alias="进程数", description="适用于ppg2mel",
    )
    seed: int = Field(
        default=0, alias="初始随机数", description="适用于ppg2mel",
    )
    model_name: str = Field(
        ..., alias="新模型名", description="仅在重新训练时生效,选中继续训练时无效",
        example="test"
    )
    model_config: str = Field(
        ..., alias="新模型配置", description="仅在重新训练时生效,选中继续训练时无效",
        example=".\\ppg2mel\\saved_models\\seq2seq_mol_ppg2mel_vctk_libri_oneshotvc_r4_normMel_v2"
    )


class AudioEntity(BaseModel):
    content: bytes
    mel: Any


class Output(BaseModel):
    __root__: Tuple[str, int]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.
        If this method is implemented, it will be used instead of the default Output UI renderer.
        """
        sr, count = self.__root__
        streamlit_app.subheader(f"Dataset {sr} done, processed a total of {count} items")


def train_vc(input: Input) -> Output:
    """Train VC(训练 VC)"""

    print(">>> OneShot VC training ...")
    params = AttrDict()
    params.update({
        "gpu": input.gpu,
        "cpu": not input.gpu,
        "njobs": input.njobs,
        "seed": input.seed,
        "verbose": input.verbose,
        "load": input.convertor.value,
        "warm_start": False,
    })
    if input.continue_mode:
        # trace old model and config
        p = Path(input.convertor.value)
        params.name = p.parent.name
        # search for a config file
        model_config_fpaths = list(p.parent.rglob("*.yaml"))
        if len(model_config_fpaths) == 0:
            raise Exception("No model yaml config found for convertor")
        config = HpsYaml(model_config_fpaths[0])
        params.ckpdir = p.parent.parent
        params.config = model_config_fpaths[0]
        params.logdir = os.path.join(p.parent, "log")
    else:
        # Make the config dict dot visitable
        config = HpsYaml(input.model_config)
    np.random.seed(input.seed)
    torch.manual_seed(input.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(input.seed)
    mode = "train"
    from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
    solver = Solver(config, params, mode)
    solver.load_data()
    solver.set_model()
    solver.exec()
    print(">>> OneShot VC train finished!")

    # TODO: pass useful return code
    return Output(__root__=(input.model_name, 0))
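
Note how train_vc recovers its yaml config: in continue mode it is discovered next to the selected convertor checkpoint, otherwise it must be supplied explicitly through model_config. A small sketch of the discovery step under an assumed checkpoint layout (paths are illustrative, not part of the repository):

from pathlib import Path

# Assumed layout: data/ckpt/ppg2mel/<run_name>/<checkpoint>.pth with a yaml alongside it
ckpt = Path("data/ckpt/ppg2mel/my_run/best_loss_step_304000.pth")
yaml_candidates = list(ckpt.parent.rglob("*.yaml"))
if not yaml_candidates:
    raise Exception("No model yaml config found for convertor")
run_name = ckpt.parent.name   # becomes params.name
logdir = ckpt.parent / "log"  # becomes params.logdir
print(run_name, yaml_candidates[0], logdir)
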
@@ -1,11 +1,12 @@
-from toolbox.ui import UI
-from encoder import inference as encoder
-from synthesizer.inference import Synthesizer
-from vocoder.wavernn import inference as rnn_vocoder
-from vocoder.hifigan import inference as gan_vocoder
+from control.toolbox.ui import UI
+from models.encoder import inference as encoder
+from models.synthesizer.inference import Synthesizer
+from models.vocoder.wavernn import inference as rnn_vocoder
+from models.vocoder.hifigan import inference as gan_vocoder
+from models.vocoder.fregan import inference as fgan_vocoder
 from pathlib import Path
 from time import perf_counter as timer
-from toolbox.utterance import Utterance
+from control.toolbox.utterance import Utterance
 import numpy as np
 import traceback
 import sys
@@ -37,8 +38,8 @@ recognized_datasets = [
    "VoxCeleb2/dev/aac",
    "VoxCeleb2/test/aac",
    "VCTK-Corpus/wav48",
    "aidatatang_200zh/corpus/dev",
    "aidatatang_200zh/corpus/test",
    "aidatatang_200zh/corpus/train",
    "aishell3/test/wav",
    "magicdata/train",
]
@@ -396,7 +397,7 @@ class Toolbox:
         self.ui.log("Loading the extractor %s... " % model_fpath)
         self.ui.set_loading(1)
         start = timer()
-        import ppg_extractor as extractor
+        import models.ppg_extractor as extractor
         self.extractor = extractor.load_model(model_fpath)
         self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
         self.ui.set_loading(0)
@@ -405,16 +406,11 @@ class Toolbox:
         if self.ui.current_convertor_fpath is None:
             return
         model_fpath = self.ui.current_convertor_fpath
-        # search a config file
-        model_config_fpaths = list(model_fpath.parent.rglob("*.yaml"))
-        if self.ui.current_convertor_fpath is None:
-            return
-        model_config_fpath = model_config_fpaths[0]
         self.ui.log("Loading the convertor %s... " % model_fpath)
         self.ui.set_loading(1)
         start = timer()
-        import ppg2mel as convertor
-        self.convertor = convertor.load_model(model_config_fpath, model_fpath)
+        import models.ppg2mel as convertor
+        self.convertor = convertor.load_model(model_fpath)
         self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
         self.ui.set_loading(0)

@@ -447,7 +443,7 @@ class Toolbox:
             return
         # Select vocoder based on model name
         model_config_fpath = None
-        if model_fpath.name[0] == "g":
+        if model_fpath.name is not None and model_fpath.name.find("hifigan") > -1:
             vocoder = gan_vocoder
             self.ui.log("set hifigan as vocoder")
             # search a config file
@@ -456,6 +452,15 @@ class Toolbox:
                 return
             if len(model_config_fpaths) > 0:
                 model_config_fpath = model_config_fpaths[0]
+        elif model_fpath.name is not None and model_fpath.name.find("fregan") > -1:
+            vocoder = fgan_vocoder
+            self.ui.log("set fregan as vocoder")
+            # search a config file
+            model_config_fpaths = list(model_fpath.parent.rglob("*.json"))
+            if self.vc_mode and self.ui.current_extractor_fpath is None:
+                return
+            if len(model_config_fpaths) > 0:
+                model_config_fpath = model_config_fpaths[0]
         else:
             vocoder = rnn_vocoder
             self.ui.log("set wavernn as vocoder")
BIN
control/toolbox/assets/mb.png
Normal file
BIN
control/toolbox/assets/mb.png
Normal file
Binary file not shown.
@@ -3,9 +3,8 @@ from PyQt5 import QtGui
 from PyQt5.QtWidgets import *
-import matplotlib.pyplot as plt
 from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
 from matplotlib.figure import Figure
-from encoder.inference import plot_embedding_as_heatmap
-from toolbox.utterance import Utterance
+from models.encoder.inference import plot_embedding_as_heatmap
+from control.toolbox.utterance import Utterance
 from pathlib import Path
 from typing import List, Set
 import sounddevice as sd
@@ -274,7 +273,9 @@ class UI(QDialog):
         if datasets_root is None or len(datasets) == 0:
             msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \
                 if datasets_root is None else "o not have any of the recognized datasets" \
-                " in %s" % datasets_root)
+                " in %s \n" \
+                "Please note use 'E:\datasets' as root path " \
+                "instead of 'E:\datasets\aidatatang_200zh\corpus\test' as an example " % datasets_root)
             self.log(msg)
             msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \
                 "can still use the toolbox by recording samples yourself." % \
BIN
data/samples/T0055G0013S0005.wav
Normal file
BIN
data/samples/T0055G0013S0005.wav
Normal file
Binary file not shown.
8
datasets_download/CN.txt
Normal file
8
datasets_download/CN.txt
Normal file
@@ -0,0 +1,8 @@
https://openslr.magicdatatech.com/resources/62/aidatatang_200zh.tgz
  out=download/aidatatang_200zh.tgz
https://openslr.magicdatatech.com/resources/68/train_set.tar.gz
  out=download/magicdata.tgz
https://openslr.magicdatatech.com/resources/93/data_aishell3.tgz
  out=download/aishell3.tgz
https://openslr.magicdatatech.com/resources/33/data_aishell.tgz
  out=download/data_aishell.tgz
8
datasets_download/EU.txt
Normal file
8
datasets_download/EU.txt
Normal file
@@ -0,0 +1,8 @@
https://openslr.elda.org/resources/62/aidatatang_200zh.tgz
  out=download/aidatatang_200zh.tgz
https://openslr.elda.org/resources/68/train_set.tar.gz
  out=download/magicdata.tgz
https://openslr.elda.org/resources/93/data_aishell3.tgz
  out=download/aishell3.tgz
https://openslr.elda.org/resources/33/data_aishell.tgz
  out=download/data_aishell.tgz
8
datasets_download/US.txt
Normal file
8
datasets_download/US.txt
Normal file
@@ -0,0 +1,8 @@
https://us.openslr.org/resources/62/aidatatang_200zh.tgz
  out=download/aidatatang_200zh.tgz
https://us.openslr.org/resources/68/train_set.tar.gz
  out=download/magicdata.tgz
https://us.openslr.org/resources/93/data_aishell3.tgz
  out=download/aishell3.tgz
https://us.openslr.org/resources/33/data_aishell.tgz
  out=download/data_aishell.tgz
4
datasets_download/datasets.sha256sum
Normal file
4
datasets_download/datasets.sha256sum
Normal file
@@ -0,0 +1,4 @@
0c0ace77fe8ee77db8d7542d6eb0b7ddf09b1bfb880eb93a7fbdbf4611e9984b  /datasets/download/aidatatang_200zh.tgz
be2507d431ad59419ec871e60674caedb2b585f84ffa01fe359784686db0e0cc  /datasets/download/aishell3.tgz
a4a0313cde0a933e0e01a451f77de0a23d6c942f4694af5bb7f40b9dc38143fe  /datasets/download/data_aishell.tgz
1d2647c614b74048cfe16492570cc5146d800afdc07483a43b31809772632143  /datasets/download/magicdata.tgz
8
datasets_download/default.txt
Normal file
8
datasets_download/default.txt
Normal file
@@ -0,0 +1,8 @@
https://www.openslr.org/resources/62/aidatatang_200zh.tgz
  out=download/aidatatang_200zh.tgz
https://www.openslr.org/resources/68/train_set.tar.gz
  out=download/magicdata.tgz
https://www.openslr.org/resources/93/data_aishell3.tgz
  out=download/aishell3.tgz
https://www.openslr.org/resources/33/data_aishell.tgz
  out=download/data_aishell.tgz
8
datasets_download/download.sh
Executable file
8
datasets_download/download.sh
Executable file
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

set -Eeuo pipefail

aria2c -x 10 --disable-ipv6 --input-file /workspace/datasets_download/${DATASET_MIRROR}.txt --dir /datasets --continue

echo "Verifying sha256sum..."
parallel --will-cite -a /workspace/datasets_download/datasets.sha256sum "echo -n {} | sha256sum -c"
29  datasets_download/extract.sh  Executable file
@@ -0,0 +1,29 @@
#!/usr/bin/env bash

set -Eeuo pipefail

mkdir -p /datasets/aidatatang_200zh
if [ -z "$(ls -A /datasets/aidatatang_200zh)" ] ; then
    tar xvz --directory /datasets/ -f /datasets/download/aidatatang_200zh.tgz --exclude 'aidatatang_200zh/corpus/dev/*' --exclude 'aidatatang_200zh/corpus/test/*'
    cd /datasets/aidatatang_200zh/corpus/train/
    cat *.tar.gz | tar zxvf - -i
    rm -f *.tar.gz
fi

mkdir -p /datasets/magicdata
if [ -z "$(ls -A /datasets/magicdata)" ] ; then
    tar xvz --directory /datasets/magicdata -f /datasets/download/magicdata.tgz train/
fi

mkdir -p /datasets/aishell3
if [ -z "$(ls -A /datasets/aishell3)" ] ; then
    tar xvz --directory /datasets/aishell3 -f /datasets/download/aishell3.tgz train/
fi

mkdir -p /datasets/data_aishell
if [ -z "$(ls -A /datasets/data_aishell)" ] ; then
    tar xvz --directory /datasets/ -f /datasets/download/data_aishell.tgz
    cd /datasets/data_aishell/wav/
    cat *.tar.gz | tar zxvf - -i --exclude 'dev/*' --exclude 'test/*'
    rm -f *.tar.gz
fi
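The `ls -A` emptiness test makes extraction idempotent: a dataset is unpacked only when its target directory is empty, so the script can be re-run safely after an interrupted extraction. To force a single dataset to be re-extracted, its directory would be cleared first (a hypothetical example):

$ rm -rf /datasets/aishell3 && /workspace/datasets_download/extract.sh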
@@ -1,5 +1,5 @@
 from pathlib import Path
-from toolbox import Toolbox
+from control.toolbox import Toolbox
 from utils.argutils import print_args
 from utils.modelutils import check_model_paths
 import argparse
@@ -17,15 +17,15 @@ if __name__ == '__main__':
                         "supported datasets.", default=None)
     parser.add_argument("-vc", "--vc_mode", action="store_true",
                         help="Voice Conversion Mode (PPG based)")
-    parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
+    parser.add_argument("-e", "--enc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}encoder",
                         help="Directory containing saved encoder models")
-    parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
+    parser.add_argument("-s", "--syn_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}synthesizer",
                         help="Directory containing saved synthesizer models")
-    parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
+    parser.add_argument("-v", "--voc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}vocoder",
                         help="Directory containing saved vocoder models")
-    parser.add_argument("-ex", "--extractor_models_dir", type=Path, default="ppg_extractor/saved_models",
+    parser.add_argument("-ex", "--extractor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg_extractor",
                         help="Directory containing saved extractor models")
-    parser.add_argument("-cv", "--convertor_models_dir", type=Path, default="ppg2mel/saved_models",
+    parser.add_argument("-cv", "--convertor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg2mel",
                         help="Directory containing saved convertor models")
     parser.add_argument("--cpu", action="store_true", help=\
         "If True, processing is done on CPU, even when a GPU is available.")
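This hunk appears to come from demo_toolbox.py: the default model directories move from per-module saved_models folders to a single data/ckpt tree, built with os.sep so one default works on both Windows and POSIX. The flags still allow overriding any location, e.g. (a sketch; the directory names are illustrative):

$ python demo_toolbox.py -d ../audiodata -e data/ckpt/encoder --cpu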
23  docker-compose.yml  Normal file
@@ -0,0 +1,23 @@
version: '3.8'

services:
  server:
    image: mockingbird:latest
    build: .
    volumes:
      - ./datasets:/datasets
      - ./synthesizer/saved_models:/workspace/synthesizer/saved_models
    environment:
      - DATASET_MIRROR=US
      - FORCE_RETRAIN=false
      - TRAIN_DATASETS=aidatatang_200zh magicdata aishell3 data_aishell
      - TRAIN_SKIP_EXISTING=true
    ports:
      - 8080:8080
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: [ '0' ]
              capabilities: [ gpu ]
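With this compose file, a single command builds the image and starts the full pipeline, mounting ./datasets and the synthesizer checkpoints from the host (assuming the NVIDIA container toolkit is installed so the GPU reservation can be honored):

$ docker compose up --build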
17  docker-entrypoint.sh  Executable file
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

if [ -z "$(ls -A /workspace/synthesizer/saved_models)" ] || [ "$FORCE_RETRAIN" = true ] ; then
    /workspace/datasets_download/download.sh
    /workspace/datasets_download/extract.sh
    for DATASET in ${TRAIN_DATASETS}
    do
        if [ "$TRAIN_SKIP_EXISTING" = true ] ; then
            python pre.py /datasets -d ${DATASET} -n $(nproc) --skip_existing
        else
            python pre.py /datasets -d ${DATASET} -n $(nproc)
        fi
    done
    python synthesizer_train.py mandarin /datasets/SV2TTS/synthesizer
fi

python web.py
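The entrypoint trains only when no synthesizer checkpoint exists or FORCE_RETRAIN is set, then always starts the web UI on port 8080. The environment defaults baked into docker-compose.yml can be overridden per run, for example (a sketch):

$ docker compose run -e FORCE_RETRAIN=true -e TRAIN_DATASETS=aishell3 server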
@@ -1,2 +0,0 @@
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
Binary file not shown.
120  gen_voice.py  Normal file
@@ -0,0 +1,120 @@
from models.synthesizer.inference import Synthesizer
from models.encoder import inference as encoder
from models.vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import torch
import sys
import os
import re
import cn2an

vocoder = gan_vocoder

def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
    embeds = [embed] * len(texts)
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds, style_idx=-1, min_stop_token=4, steps=400)
    #spec = specs[0]
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    # If seed is specified, reset torch seed and reload vocoder
    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav, output_sample_rate = vocoder.infer_waveform(spec)

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [generated_wav[start:end] for start, end, in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * synthesizer.sample_rate))] * len(breaks)
    generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    ## Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    generated_wav = generated_wav / np.abs(generated_wav).max() * 0.97

    # Save it on the disk
    model = os.path.basename(in_fpath)
    filename = "%s_%d_%s.wav" % (file_name, seq, model)
    sf.write(filename, generated_wav, synthesizer.sample_rate)

    print("\nSaved output as %s\n\n" % filename)


def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath, in_fpath, input_txt, file_name):
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_fpath)
    vocoder.load_model(voc_model_fpath)

    encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    texts = input_txt.split("\n")
    seq = 0
    each_num = 1500

    punctuation = '!,。、,' # punctuate and split/clean text
    processed_texts = []
    cur_num = 0
    for text in texts:
        for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
            if processed_text:
                processed_texts.append(processed_text.strip())
                cur_num += len(processed_text.strip())
        if cur_num > each_num:
            seq = seq + 1
            gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
            processed_texts = []
            cur_num = 0

    if len(processed_texts) > 0:
        seq = seq + 1
        gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)

if (len(sys.argv) >= 3):
    my_txt = ""
    print("reading from :", sys.argv[1])
    with open(sys.argv[1], "r") as f:
        for line in f.readlines():
            #line = line.strip('\n')
            my_txt += line
    txt_file_name = sys.argv[1]
    wav_file_name = sys.argv[2]

    output = cn2an.transform(my_txt, "an2cn")
    print(output)
    generate_wav(
        Path("encoder/saved_models/pretrained.pt"),
        Path("synthesizer/saved_models/mandarin.pt"),
        Path("vocoder/saved_models/pretrained/g_hifigan.pt"), wav_file_name, output, txt_file_name
    )

else:
    print("please input the file name")
    exit(1)
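gen_voice.py takes a text file and a reference wav on the command line, normalizes Arabic numerals to Chinese with cn2an, splits the text at punctuation into chunks of roughly 1500 characters (each_num), and writes one wav per chunk. A typical invocation (file names are illustrative; the pretrained checkpoint paths hard-coded above must exist):

$ python gen_voice.py article.txt reference.wav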
0  models/encoder/__init__.py  Normal file
@@ -1,5 +1,5 @@
 from scipy.ndimage.morphology import binary_dilation
-from encoder.params_data import *
+from models.encoder.params_data import *
 from pathlib import Path
 from typing import Optional, Union
 from warnings import warn
@@ -56,8 +56,8 @@ def wav_to_mel_spectrogram(wav):
     Note: this not a log-mel spectrogram.
     """
     frames = librosa.feature.melspectrogram(
-        wav,
-        sampling_rate,
+        y=wav,
+        sr=sampling_rate,
         n_fft=int(sampling_rate * mel_window_length / 1000),
         hop_length=int(sampling_rate * mel_window_step / 1000),
         n_mels=mel_n_channels
2  models/encoder/data_objects/__init__.py  Normal file
@@ -0,0 +1,2 @@
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
@@ -1,5 +1,5 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.utterance import Utterance
+from models.encoder.data_objects.random_cycler import RandomCycler
+from models.encoder.data_objects.utterance import Utterance
 from pathlib import Path
 
 # Contains the set of utterances of a single speaker
@@ -1,6 +1,6 @@
 import numpy as np
 from typing import List
-from encoder.data_objects.speaker import Speaker
+from models.encoder.data_objects.speaker import Speaker
 
 class SpeakerBatch:
     def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
@@ -1,7 +1,7 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.speaker_batch import SpeakerBatch
-from encoder.data_objects.speaker import Speaker
-from encoder.params_data import partials_n_frames
+from models.encoder.data_objects.random_cycler import RandomCycler
+from models.encoder.data_objects.speaker_batch import SpeakerBatch
+from models.encoder.data_objects.speaker import Speaker
+from models.encoder.params_data import partials_n_frames
 from torch.utils.data import Dataset, DataLoader
 from pathlib import Path
 
@@ -1,8 +1,8 @@
-from encoder.params_data import *
-from encoder.model import SpeakerEncoder
-from encoder.audio import preprocess_wav # We want to expose this function from here
+from models.encoder.params_data import *
+from models.encoder.model import SpeakerEncoder
+from models.encoder.audio import preprocess_wav # We want to expose this function from here
 from matplotlib import cm
-from encoder import audio
+from models.encoder import audio
 from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np
@@ -1,5 +1,5 @@
-from encoder.params_model import *
-from encoder.params_data import *
+from models.encoder.params_model import *
+from models.encoder.params_data import *
 from scipy.interpolate import interp1d
 from sklearn.metrics import roc_curve
 from torch.nn.utils import clip_grad_norm_
@@ -1,8 +1,8 @@
 from multiprocess.pool import ThreadPool
-from encoder.params_data import *
-from encoder.config import librispeech_datasets, anglophone_nationalites
+from models.encoder.params_data import *
+from models.encoder.config import librispeech_datasets, anglophone_nationalites
 from datetime import datetime
-from encoder import audio
+from models.encoder import audio
 from pathlib import Path
 from tqdm import tqdm
 import numpy as np
@@ -22,7 +22,7 @@ class DatasetLog:
         self._log_params()
 
     def _log_params(self):
-        from encoder import params_data
+        from models.encoder import params_data
         self.write_line("Parameter values:")
         for param_name in (p for p in dir(params_data) if not p.startswith("__")):
             value = getattr(params_data, param_name)
@@ -1,7 +1,7 @@
-from encoder.visualizations import Visualizations
-from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
-from encoder.params_model import *
-from encoder.model import SpeakerEncoder
+from models.encoder.visualizations import Visualizations
+from models.encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
+from models.encoder.params_model import *
+from models.encoder.model import SpeakerEncoder
 from utils.profiler import Profiler
 from pathlib import Path
 import torch
@@ -1,4 +1,4 @@
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
 from datetime import datetime
 from time import perf_counter as timer
 import matplotlib.pyplot as plt
@@ -65,8 +65,8 @@ class Visualizations:
     def log_params(self):
         if self.disabled:
             return
-        from encoder import params_data
-        from encoder import params_model
+        from models.encoder import params_data
+        from models.encoder import params_model
         param_string = "<b>Model parameters</b>:<br>"
         for param_name in (p for p in dir(params_model) if not p.startswith("__")):
             value = getattr(params_model, param_name)
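The hunks above are mechanical renames of the encoder package from encoder.* to models.encoder.*. A quick smoke test that the new layout imports cleanly (assuming it is run from the repository root with dependencies installed):

$ python -c "from models.encoder import inference, audio; print('encoder package ok')"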
@@ -15,7 +15,7 @@ from .rnn_decoder_mol import Decoder
 from .utils.cnn_postnet import Postnet
 from .utils.vc_utils import get_mask_from_lengths
 
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
 
 class MelDecoderMOLv2(AbsMelDecoder):
     """Use an encoder to preprocess ppg."""
@@ -191,12 +191,15 @@
 
         return mel_outputs[0], mel_outputs_postnet[0], alignments[0]
 
-def load_model(train_config, model_file, device=None):
+
+def load_model(model_file, device=None):
+    # search a config file
+    model_config_fpaths = list(model_file.parent.rglob("*.yaml"))
+    if len(model_config_fpaths) == 0:
+        raise "No model yaml config found for convertor"
     if device is None:
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-    model_config = HpsYaml(train_config)
+    model_config = HpsYaml(model_config_fpaths[0])
     ppg2mel_model = MelDecoderMOLv2(
         **model_config["model"]
     ).to(device)
@@ -7,10 +7,10 @@ from pathlib import Path
 import soundfile
 import resampy
 
-from ppg_extractor import load_model
+from models.ppg_extractor import load_model
 import encoder.inference as Encoder
-from encoder.audio import preprocess_wav
-from encoder import audio
+from models.encoder.audio import preprocess_wav
+from models.encoder import audio
 from utils.f0_utils import compute_f0
 
 from torch.multiprocessing import Pool, cpu_count
@@ -110,3 +110,4 @@ def preprocess_dataset(datasets_root, dataset, out_dir, n_processes, ppg_encoder
     t_fid_file.close()
     d_fid_file.close()
     e_fid_file.close()
+    return len(wav_file_list)
62  models/ppg2mel/train.py  Normal file
@@ -0,0 +1,62 @@
import sys
import torch
import argparse
import numpy as np
from utils.hparams import HpsYaml
from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver

# For reproducibility, comment these may speed up training
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def main():
    # Arguments
    parser = argparse.ArgumentParser(description=
        'Training PPG2Mel VC model.')
    parser.add_argument('--config', type=str,
                        help='Path to experiment config, e.g., config/vc.yaml')
    parser.add_argument('--name', default=None, type=str, help='Name for logging.')
    parser.add_argument('--logdir', default='log/', type=str,
                        help='Logging path.', required=False)
    parser.add_argument('--ckpdir', default='ckpt/', type=str,
                        help='Checkpoint path.', required=False)
    parser.add_argument('--outdir', default='result/', type=str,
                        help='Decode output path.', required=False)
    parser.add_argument('--load', default=None, type=str,
                        help='Load pre-trained model (for training only)', required=False)
    parser.add_argument('--warm_start', action='store_true',
                        help='Load model weights only, ignore specified layers.')
    parser.add_argument('--seed', default=0, type=int,
                        help='Random seed for reproducable results.', required=False)
    parser.add_argument('--njobs', default=8, type=int,
                        help='Number of threads for dataloader/decoding.', required=False)
    parser.add_argument('--cpu', action='store_true', help='Disable GPU training.')
    # parser.add_argument('--no-pin', action='store_true',
    #                     help='Disable pin-memory for dataloader')
    parser.add_argument('--no-msg', action='store_true', help='Hide all messages.')

    ###

    paras = parser.parse_args()
    setattr(paras, 'gpu', not paras.cpu)
    setattr(paras, 'pin_memory', not paras.no_pin)
    setattr(paras, 'verbose', not paras.no_msg)
    # Make the config dict dot visitable
    config = HpsYaml(paras.config)

    np.random.seed(paras.seed)
    torch.manual_seed(paras.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(paras.seed)

    print(">>> OneShot VC training ...")
    mode = "train"
    solver = Solver(config, paras, mode)
    solver.load_data()
    solver.set_model()
    solver.exec()
    print(">>> Oneshot VC train finished!")
    sys.exit(0)

if __name__ == "__main__":
    main()
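This entry point seeds numpy and torch for reproducibility, wraps the YAML config in HpsYaml for dot access, and hands everything to the Solver. Note that setattr(paras, 'pin_memory', not paras.no_pin) still reads no_pin even though the --no-pin flag is commented out, so as committed the script would raise AttributeError unless that flag is restored. The intended launch would look roughly like this (the config path follows the example in the --config help text; the run name is illustrative):

$ python models/ppg2mel/train.py --config config/vc.yaml --name oneshot_vc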
@@ -8,7 +8,6 @@ from torch.utils.tensorboard import SummaryWriter
 
 from .option import default_hparas
 from utils.util import human_format, Timer
-from utils.load_yaml import HpsYaml
 
 
 class BaseSolver():
@@ -93,6 +92,7 @@
 
     def load_ckpt(self):
         ''' Load ckpt if --load option is specified '''
+        print(self.paras)
         if self.paras.load is not None:
             if self.paras.warm_start:
                 self.verbose(f"Warm starting model from checkpoint {self.paras.load}.")
@@ -100,7 +100,7 @@
                 self.paras.load, map_location=self.device if self.mode == 'train'
                     else 'cpu')
             model_dict = ckpt['model']
-            if len(self.config.model.ignore_layers) > 0:
+            if "ignore_layers" in self.config.model and len(self.config.model.ignore_layers) > 0:
                 model_dict = {k:v for k, v in model_dict.items()
                     if k not in self.config.model.ignore_layers}
             dummy_dict = self.model.state_dict()
@@ -14,7 +14,7 @@ from utils.data_load import OneshotVcDataset, MultiSpkVcCollate
 from .loss import MaskedMSELoss
 from .optim import Optimizer
 from utils.util import human_format
-from ppg2mel import MelDecoderMOLv2
+from models.ppg2mel import MelDecoderMOLv2
 
 
 class Solver(BaseSolver):
0  models/ppg_extractor/encoder/__init__.py  Normal file
Some files were not shown because too many files have changed in this diff.