mirror of
https://github.com/babysor/Realtime-Voice-Clone-Chinese.git
synced 2026-02-04 11:04:43 +08:00
Compare commits
34 Commits
Commit SHA1s:
178787887b, 43c86eb411, 37f11ab9ce, e2017d0314, 547ac816df, 6b4ab39601, b46e7a7866, 8a384a1191, 11154783d8, d52db0444e, 790d11a58b, cb82fcfe58, 26ecb7546d, f64914fca8, 512da52775, 9c219f05c2, 4d9e460063, 0d0b55d3e9, 4acfee2a64, 99269b2046, 28e6bce570, 5238c43799, 2dd76e1b8d, ddd478c0ad, 4178416385, 3fbe03f2ff, 222e302274, 32b9755cbe, 78fcfc4651, 45bc43bf3c, dacedfa9cc, b60b75ea89, c4a8c72b83, 8195a55a25
5  .gitignore (vendored)

@@ -17,4 +17,7 @@
*.sh
synthesizer/saved_models/*
vocoder/saved_models/*
!vocoder/saved_models/pretrained/*
encoder/saved_models/*
cp_hifigan/*
!vocoder/saved_models/pretrained/*
!encoder/saved_models/pretrained.pt
93  .vscode/launch.json (vendored)

@@ -1,48 +1,47 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Syn Preprocess",
            "type": "python",
            "request": "launch",
            "program": "pre.py",
            "console": "integratedTerminal",
            "args": [
                "D:\\ttsdata\\BZNSYP", "-d", "BZNSYP"
            ],
        },
        {
            "name": "Python: Vocoder Preprocess",
            "type": "python",
            "request": "launch",
            "program": "vocoder_preprocess.py",
            "console": "integratedTerminal",
            "args": [
                "..\\..\\chs1"
            ],
        },
        {
            "name": "Python: Vocoder Train",
            "type": "python",
            "request": "launch",
            "program": "vocoder_train.py",
            "console": "integratedTerminal",
            "args": [
                "dev", "..\\..\\chs1"
            ],
        },
        {
            "name": "Python: demo box",
            "type": "python",
            "request": "launch",
            "program": "demo_toolbox.py",
            "console": "integratedTerminal",
            "args": [
                "-d", "..\\..\\chs"
            ],
        }
    ]
}
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
    {
        "name": "Python: Web",
        "type": "python",
        "request": "launch",
        "program": "web.py",
        "console": "integratedTerminal"
    },
    {
        "name": "Python: Vocoder Preprocess",
        "type": "python",
        "request": "launch",
        "program": "vocoder_preprocess.py",
        "console": "integratedTerminal",
        "args": ["..\\audiodata"]
    },
    {
        "name": "Python: Vocoder Train",
        "type": "python",
        "request": "launch",
        "program": "vocoder_train.py",
        "console": "integratedTerminal",
        "args": ["dev", "..\\audiodata"]
    },
    {
        "name": "Python: Demo Box",
        "type": "python",
        "request": "launch",
        "program": "demo_toolbox.py",
        "console": "integratedTerminal",
        "args": ["-d", "..\\audiodata"]
    },
    {
        "name": "Python: Synth Train",
        "type": "python",
        "request": "launch",
        "program": "synthesizer_train.py",
        "console": "integratedTerminal",
        "args": ["my_run", "..\\"]
    },
]
}
3  .vscode/settings.json (vendored, new file)

@@ -0,0 +1,3 @@
{
    "python.formatting.provider": "black"
}
196  README-CN.md
@@ -5,19 +5,18 @@
|
||||
|
||||
### [English](README.md) | 中文
|
||||
|
||||
### [DEMO VIDEO](https://www.bilibili.com/video/BV1sA411P7wM/)
|
||||
### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/)
|
||||
|
||||
## 特性
|
||||
🌍 **中文** 支持普通话并使用多种中文数据集进行测试:adatatang_200zh, magicdata, aishell3
|
||||
🌍 **中文** 支持普通话并使用多种中文数据集进行测试:aidatatang_200zh, magicdata, aishell3, biaobei,MozillaCommonVoice 等
|
||||
|
||||
🤩 **PyTorch** 适用于 pytorch,已在 1.9.0 版本(最新于 2021 年 8 月)中测试,GPU Tesla T4 和 GTX 2060
|
||||
|
||||
🌍 **Windows + Linux** 在修复 nits 后在 Windows 操作系统和 linux 操作系统中进行测试
|
||||
🌍 **Windows + Linux** 可在 Windows 操作系统和 linux 操作系统中运行(苹果系统M1版也有社区成功运行案例)
|
||||
|
||||
🤩 **Easy & Awesome** 仅使用新训练的合成器(synthesizer)就有良好效果,复用预训练的编码器/声码器
|
||||
🤩 **Easy & Awesome** 仅需下载或新训练合成器(synthesizer)就有良好效果,复用预训练的编码器/声码器,或实时的HiFi-GAN作为vocoder
|
||||
|
||||
## 快速开始
|
||||
> 0训练新手友好版可以参考 [Quick Start (Newbie)](https://github.com/babysor/Realtime-Voice-Clone-Chinese/wiki/Quick-Start-(Newbie))
|
||||
🌍 **Webserver Ready** 可伺服你的训练结果,供远程调用
|
||||
|
||||
### 1. 安装要求
|
||||
> 按照原始存储库测试您是否已准备好所有环境。
|
||||
@@ -27,61 +26,182 @@
|
||||
> 如果在用 pip 方式安装的时候出现 `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)` 这个错误可能是 python 版本过低,3.9 可以安装成功
|
||||
* 安装 [ffmpeg](https://ffmpeg.org/download.html#get-packages)。
|
||||
* 运行`pip install -r requirements.txt` 来安装剩余的必要包。
|
||||
* 安装 webrtcvad 用 `pip install webrtcvad-wheels`。
|
||||
* 安装 webrtcvad `pip install webrtcvad-wheels`。
|
||||
|
||||
### 2. 使用数据集训练合成器
|
||||
### 2. 准备预训练模型
|
||||
考虑训练您自己专属的模型或者下载社区他人训练好的模型:
|
||||
> 近期创建了[知乎专题](https://www.zhihu.com/column/c_1425605280340504576) 将不定期更新炼丹小技巧or心得,也欢迎提问
|
||||
#### 2.1 使用数据集自己训练合成器模型(与2.2二选一)
|
||||
* 下载 数据集并解压:确保您可以访问 *train* 文件夹中的所有音频文件(如.wav)
|
||||
* 进行音频和梅尔频谱图预处理:
|
||||
`python pre.py <datasets_root>`
|
||||
|
||||
可以传入参数 --dataset `{dataset}` 支持 adatatang_200zh, magicdata, aishell3, BZNSYP
|
||||
可以传入参数 --dataset `{dataset}` 支持 aidatatang_200zh, magicdata, aishell3
|
||||
> 假如你下载的 `aidatatang_200zh`文件放在D盘,`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
|
||||
|
||||
>假如發生 `頁面文件太小,無法完成操作`,請參考這篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030),將虛擬內存更改為100G(102400),例如:档案放置D槽就更改D槽的虚拟内存
|
||||
|
||||
* 训练合成器:
|
||||
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`
|
||||
|
||||
* 当您在训练文件夹 *synthesizer/saved_models/* 中看到注意线显示和损失满足您的需要时,请转到下一步。
|
||||
> 仅供参考,我的注意力是在 18k 步之后出现的,并且在 50k 步之后损失变得低于 0.4
|
||||

|
||||

|
||||
* 当您在训练文件夹 *synthesizer/saved_models/* 中看到注意线显示和损失满足您的需要时,请转到`启动程序`一步。
|
||||
|
||||
### 2.2 使用预先训练好的合成器
|
||||
> 实在没有设备或者不想慢慢调试,可以使用网友贡献的模型(欢迎持续分享):
|
||||
#### 2.2使用社区预先训练好的合成器(与2.1二选一)
|
||||
> 当实在没有设备或者不想慢慢调试,可以使用社区贡献的模型(欢迎持续分享):
|
||||
|
||||
| 作者 | 下载链接 | 效果预览 |
|
||||
| --- | ----------- | ----- |
|
||||
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ 提取码:2021 | https://www.bilibili.com/video/BV1uh411B7AD/)
|
||||
| 作者 | 下载链接 | 效果预览 | 信息 |
|
||||
| --- | ----------- | ----- | ----- |
|
||||
| 作者 | https://pan.baidu.com/s/1VHSKIbxXQejtxi2at9IrpA [百度盘链接](https://pan.baidu.com/s/1VHSKIbxXQejtxi2at9IrpA ) 提取码:i183 | | 200k steps 只用aidatatang_200zh
|
||||
|@FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing [百度盘链接](https://pan.baidu.com/s/1vSYXO4wsLyjnF3Unl-Xoxg) 提取码:1024 | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps 台湾口音
|
||||
|@miven| https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ 提取码:2021 | https://www.bilibili.com/video/BV1uh411B7AD/ | 150k steps 旧版需根据[issue](https://github.com/babysor/MockingBird/issues/37)修复
|
||||
|
||||
### 2.3 训练声码器 (Optional)
|
||||
#### 2.3训练声码器 (可选)
|
||||
对效果影响不大,已经预置3款,如果希望自己训练可以参考以下命令。
|
||||
* 预处理数据:
|
||||
`python vocoder_preprocess.py <datasets_root>`
|
||||
`python vocoder_preprocess.py <datasets_root> -m <synthesizer_model_path>`
|
||||
> `<datasets_root>`替换为你的数据集目录,`<synthesizer_model_path>`替换为一个你最好的synthesizer模型目录,例如 *sythensizer\saved_mode\xxx*
|
||||
|
||||
* 训练声码器:
|
||||
`python vocoder_train.py mandarin <datasets_root>`
|
||||
|
||||
### 3. 启动工具箱
|
||||
然后您可以尝试使用工具箱:
|
||||
* 训练wavernn声码器:
|
||||
`python vocoder_train.py <trainid> <datasets_root>`
|
||||
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
|
||||
|
||||
* 训练hifigan声码器:
|
||||
`python vocoder_train.py <trainid> <datasets_root> hifigan`
|
||||
> `<trainid>`替换为你想要的标识,同一标识再次训练时会延续原模型
|
||||
|
||||
### 3. 启动程序或工具箱
|
||||
您可以尝试使用以下命令:
|
||||
|
||||
### 3.1 启动Web程序:
|
||||
`python web.py`
|
||||
运行成功后在浏览器打开地址, 默认为 `http://localhost:8080`
|
||||

|
||||
> 注:目前界面比较buggy,
|
||||
> * 第一次点击`录制`要等待几秒浏览器正常启动录音,否则会有重音
|
||||
> * 录制结束不要再点`录制`而是`停止`
|
||||
> * 仅支持手动新录音(16khz), 不支持超过4MB的录音,最佳长度在5~15秒
|
||||
> * 默认使用第一个找到的模型,有动手能力的可以看代码修改 `web\__init__.py`。
|
||||
|
||||
### 3.2 启动工具箱:
|
||||
`python demo_toolbox.py -d <datasets_root>`
|
||||
> 请指定一个可用的数据集文件路径,如果有支持的数据集则会自动加载供调试,也同时会作为手动录制音频的存储目录。
|
||||
|
||||
> Good news🤩: 可直接使用中文
|
||||
<img width="1042" alt="d48ea37adf3660e657cfb047c10edbc" src="https://user-images.githubusercontent.com/7423248/134275227-c1ddf154-f118-4b77-8949-8c4c7daf25f0.png">
|
||||
|
||||
## TODO
|
||||
- [X] 允许直接使用中文
|
||||
- [X] 添加演示视频
|
||||
- [X] 添加对更多数据集的支持
|
||||
- [X] 上传预训练模型
|
||||
- [ ] 支持parallel tacotron
|
||||
- [ ] 服务化与容器化
|
||||
- [ ] 🙏 欢迎补充
|
||||
## 文件结构(目标读者:开发者)
|
||||
```
|
||||
├─archived_untest_files 废弃文件
|
||||
├─encoder encoder模型
|
||||
│ ├─data_objects
|
||||
│ └─saved_models 预训练好的模型
|
||||
├─samples 样例语音
|
||||
├─synthesizer synthesizer模型
|
||||
│ ├─models
|
||||
│ ├─saved_models 预训练好的模型
|
||||
│ └─utils 工具类库
|
||||
├─toolbox 图形化工具箱
|
||||
├─utils 工具类库
|
||||
├─vocoder vocoder模型(目前包含hifi-gan、wavrnn)
|
||||
│ ├─hifigan
|
||||
│ ├─saved_models 预训练好的模型
|
||||
│ └─wavernn
|
||||
└─web
|
||||
├─api
|
||||
│ └─Web端接口
|
||||
├─config
|
||||
│ └─ Web端配置文件
|
||||
├─static 前端静态脚本
|
||||
│ └─js
|
||||
├─templates 前端模板
|
||||
└─__init__.py Web端入口文件
|
||||
```
|
||||
|
||||
## 引用及论文
|
||||
> 该库一开始从仅支持英语的[Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning) 分叉出来的,鸣谢作者。
|
||||
|
||||
| URL | Designation | 标题 | 实现源码 |
|
||||
| --- | ----------- | ----- | --------------------- |
|
||||
|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
|
||||
| [1803.09017](https://arxiv.org/abs/1803.09017) | GlobalStyleToken (synthesizer)| Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis | 本代码库 |
|
||||
| [2010.05646](https://arxiv.org/abs/2010.05646) | HiFi-GAN (vocoder)| Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | 本代码库 |
|
||||
|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | SV2TTS | Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis | This repo |
|
||||
|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
|
||||
|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
|
||||
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | 本代码库 |
|
||||
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | 本代码库 |
|
||||
|
||||
## 常見問題(FQ&A)
|
||||
#### 1.數據集哪裡下載?
|
||||
| 数据集 | OpenSLR地址 | 其他源 (Google Drive, Baidu网盘等) |
|
||||
| --- | ----------- | ---------------|
|
||||
| aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) |
|
||||
| magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) |
|
||||
| aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) |
|
||||
> 解壓 aidatatang_200zh 後,還需將 `aidatatang_200zh\corpus\train`下的檔案全選解壓縮
|
||||
|
||||
#### 2.`<datasets_root>`是什麼意思?
|
||||
假如數據集路徑為 `D:\data\aidatatang_200zh`,那麼 `<datasets_root>`就是 `D:\data`
|
||||
|
||||
#### 3.訓練模型顯存不足
|
||||
訓練合成器時:將 `synthesizer/hparams.py`中的batch_size參數調小
|
||||
```
|
||||
//調整前
|
||||
tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
|
||||
(2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
|
||||
(2, 2e-4, 80_000, 12), #
|
||||
(2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
|
||||
(2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
|
||||
(2, 1e-5, 640_000, 12)], # lr = learning rate
|
||||
//調整後
|
||||
tts_schedule = [(2, 1e-3, 20_000, 8), # Progressive training schedule
|
||||
(2, 5e-4, 40_000, 8), # (r, lr, step, batch_size)
|
||||
(2, 2e-4, 80_000, 8), #
|
||||
(2, 1e-4, 160_000, 8), # r = reduction factor (# of mel frames
|
||||
(2, 3e-5, 320_000, 8), # synthesized for each decoder iteration)
|
||||
(2, 1e-5, 640_000, 8)], # lr = learning rate
|
||||
```
|
||||
|
||||
聲碼器-預處理數據集時:將 `synthesizer/hparams.py`中的batch_size參數調小
|
||||
```
|
||||
//調整前
|
||||
### Data Preprocessing
|
||||
max_mel_frames = 900,
|
||||
rescale = True,
|
||||
rescaling_max = 0.9,
|
||||
synthesis_batch_size = 16, # For vocoder preprocessing and inference.
|
||||
//調整後
|
||||
### Data Preprocessing
|
||||
max_mel_frames = 900,
|
||||
rescale = True,
|
||||
rescaling_max = 0.9,
|
||||
synthesis_batch_size = 8, # For vocoder preprocessing and inference.
|
||||
```
|
||||
|
||||
聲碼器-訓練聲碼器時:將 `vocoder/wavernn/hparams.py`中的batch_size參數調小
|
||||
```
|
||||
//調整前
|
||||
# Training
|
||||
voc_batch_size = 100
|
||||
voc_lr = 1e-4
|
||||
voc_gen_at_checkpoint = 5
|
||||
voc_pad = 2
|
||||
|
||||
//調整後
|
||||
# Training
|
||||
voc_batch_size = 6
|
||||
voc_lr = 1e-4
|
||||
voc_gen_at_checkpoint = 5
|
||||
voc_pad =2
|
||||
```
|
||||
|
||||
#### 4.碰到`RuntimeError: Error(s) in loading state_dict for Tacotron: size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([70, 512]) from checkpoint, the shape in current model is torch.Size([75, 512]).`
|
||||
請參照 issue [#37](https://github.com/babysor/MockingBird/issues/37)
|
||||
|
||||
#### 5.如何改善CPU、GPU佔用率?
|
||||
適情況調整batch_size參數來改善
|
||||
|
||||
#### 6.發生 `頁面文件太小,無法完成操作`
|
||||
請參考這篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030),將虛擬內存更改為100G(102400),例如:档案放置D槽就更改D槽的虚拟内存
|
||||
|
||||
#### 7.什么时候算训练完成?
|
||||
首先一定要出现注意力模型,其次是loss足够低,取决于硬件设备和数据集。拿本人的供参考,我的注意力是在 18k 步之后出现的,并且在 50k 步之后损失变得低于 0.4
|
||||

|
||||
|
||||

|
||||
|
||||
|
||||
146  README.md
@@ -6,16 +6,17 @@
> English | [中文](README-CN.md)

## Features
🌍 **Chinese** supported mandarin and tested with multiple datasets: aidatatang_200zh, magicdata, aishell3
🌍 **Chinese** Mandarin is supported and tested with multiple datasets: aidatatang_200zh, magicdata, aishell3, etc.

🤩 **PyTorch** works with PyTorch 1.9.0 (latest as of August 2021), tested on Tesla T4 and GTX 2060 GPUs

🌍 **Windows + Linux** tested in both Windows OS and linux OS after fixing nits
🌍 **Windows + Linux** runs on both Windows and Linux (even on M1 macOS)

🤩 **Easy & Awesome** effect with only a newly-trained synthesizer, by reusing the pretrained encoder/vocoder

🌍 **Webserver Ready** to serve your results via remote calls

### [DEMO VIDEO](https://www.bilibili.com/video/BV1sA411P7wM/)
### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/)

## Quick Start
@@ -29,64 +30,135 @@
* Run `pip install -r requirements.txt` to install the remaining necessary packages.
* Install webrtcvad with `pip install webrtcvad-wheels` (if you need it)
> Note that we reuse the pretrained encoder/vocoder but not the synthesizer, since the original model is incompatible with Chinese symbols. This means demo_cli does not work at the moment.
### 2. Train synthesizer with your dataset
* Download aidatatang_200zh or other dataset and unzip: make sure you can access all .wav in *train* folder
### 2. Prepare your models
You can either train your own models or use existing ones:
#### 2.1 Train the synthesizer with your dataset
* Download a dataset and unzip it: make sure you can access all the .wav files in the folder
* Preprocess the audio and the mel spectrograms:
`python pre.py <datasets_root>`

The parameter `--dataset {dataset}` supports adatatang_200zh, magicdata, aishell3, BZNSYP

> If you hit `the page file is too small to complete the operation`, please refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and increase the virtual memory to 100 GB (102400); for example, if the data sits on drive D, increase the virtual memory of drive D.

The parameter `--dataset {dataset}` supports aidatatang_200zh, magicdata, aishell3, etc.
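For example, if `aidatatang_200zh` is extracted so that the train folder sits at `D:\data\aidatatang_200zh\corpus\train` (an illustrative path), the call would be `python pre.py D:\data -d aidatatang_200zh`.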
* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`

* Go to the next step when the attention line appears and the loss in the training folder *synthesizer/saved_models/* meets your needs.
> FYI, my attention line appeared after 18k steps and the loss dropped below 0.4 after 50k steps.
![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png)
![syn_loss](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png)
### 2.2 Use pretrained model of synthesizer
#### 2.2 Use pretrained model of synthesizer
> Thanks to the community, some models will be shared:

| author | Download link | Previow Video |
| --- | ----------- | ----- |
| author | Download link | Preview Video | Info |
| --- | ----------- | ----- | ----- |
| @myself | https://pan.baidu.com/s/1VHSKIbxXQejtxi2at9IrpA [Baidu](https://pan.baidu.com/s/1VHSKIbxXQejtxi2at9IrpA) code: i183 | | 200k steps, trained only on aidatatang_200zh |
| @FawenYo | https://drive.google.com/file/d/1H-YGOUHpmqKxJ9FRc6vAjPuqQki24UbC/view?usp=sharing [Baidu Pan](https://pan.baidu.com/s/1vSYXO4wsLyjnF3Unl-Xoxg) code: 1024 | [input](https://github.com/babysor/MockingBird/wiki/audio/self_test.mp3) [output](https://github.com/babysor/MockingBird/wiki/audio/export.wav) | 200k steps with local accent of Taiwan |
| @miven | https://pan.baidu.com/s/1PI-hM3sn5wbeChRryX-RCQ code: 2021 | https://www.bilibili.com/video/BV1uh411B7AD/ |

> A link to my early trained model: [Baidu Yun](https://pan.baidu.com/s/10t3XycWiNIg5dN5E_bMORQ)
Code: aid4
### 2.3 Train vocoder (Optional)
#### 2.3 Train the vocoder (optional)
> Note: the vocoder makes little difference to the result, so you may not need to train a new one.
* Preprocess the data:
`python vocoder_preprocess.py <datasets_root>`
`python vocoder_preprocess.py <datasets_root> -m <synthesizer_model_path>`
> Replace `<datasets_root>` with your dataset root and `<synthesizer_model_path>` with the directory of your best trained synthesizer model, e.g. *synthesizer\saved_models\xxx*

* Train the vocoder:
* Train the wavernn vocoder:
`python vocoder_train.py mandarin <datasets_root>`

### 3. Launch the Toolbox
* Train the hifigan vocoder:
`python vocoder_train.py mandarin <datasets_root> hifigan`
### 3. Launch
#### 3.1 Using the web server
Run `python web.py` and open the address in a browser, by default `http://localhost:8080`

#### 3.2 Using the Toolbox
You can then try the toolbox:

`python demo_toolbox.py -d <datasets_root>`
or
`python demo_toolbox.py`

> Good news 🤩: Chinese characters are supported
## TODO
- [x] Add demo video
- [X] Add support for more datasets
- [X] Upload pretrained models
- [ ] Support parallel tacotron
- [ ] Service-oriented deployment and dockerization
- 🙏 Welcome to add more
## Reference
> This repository is forked from [Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning), which supports only English.

| URL | Designation | Title | Implementation source |
| --- | ----------- | ----- | --------------------- |
| [1803.09017](https://arxiv.org/abs/1803.09017) | GlobalStyleToken (synthesizer) | Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis | This repo |
| [2010.05646](https://arxiv.org/abs/2010.05646) | HiFi-GAN (vocoder) | Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | This repo |
| [**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
| [1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
| [1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
| [1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder) | Generalized End-To-End Loss for Speaker Verification | This repo |
## FAQ
#### 1. Where can I download the datasets?
| Dataset | Original Source | Alternative Sources |
| --- | ----------- | --------------- |
| aidatatang_200zh | [OpenSLR](http://www.openslr.org/62/) | [Google Drive](https://drive.google.com/file/d/110A11KZoVe7vy6kXlLb6zVPLb_J91I_t/view?usp=sharing) |
| magicdata | [OpenSLR](http://www.openslr.org/68/) | [Google Drive (Dev set)](https://drive.google.com/file/d/1g5bWRUSNH68ycC6eNvtwh07nX3QhOOlo/view?usp=sharing) |
| aishell3 | [OpenSLR](https://www.openslr.org/93/) | [Google Drive](https://drive.google.com/file/d/1shYp_o4Z0X0cZSKQDtFirct2luFUwKzZ/view?usp=sharing) |
> After unzipping aidatatang_200zh, you also need to unzip all the archives under `aidatatang_200zh\corpus\train`

#### 2. What is `<datasets_root>`?
If the dataset path is `D:\data\aidatatang_200zh`, then `<datasets_root>` is `D:\data`
#### 3. Not enough VRAM
Training the synthesizer: reduce the batch_size in `synthesizer/hparams.py`
```
# Before
tts_schedule = [(2, 1e-3, 20_000, 12),   # Progressive training schedule
                (2, 5e-4, 40_000, 12),   # (r, lr, step, batch_size)
                (2, 2e-4, 80_000, 12),   #
                (2, 1e-4, 160_000, 12),  # r = reduction factor (# of mel frames
                (2, 3e-5, 320_000, 12),  #     synthesized for each decoder iteration)
                (2, 1e-5, 640_000, 12)], # lr = learning rate

# After
tts_schedule = [(2, 1e-3, 20_000, 8),    # Progressive training schedule
                (2, 5e-4, 40_000, 8),    # (r, lr, step, batch_size)
                (2, 2e-4, 80_000, 8),    #
                (2, 1e-4, 160_000, 8),   # r = reduction factor (# of mel frames
                (2, 3e-5, 320_000, 8),   #     synthesized for each decoder iteration)
                (2, 1e-5, 640_000, 8)],  # lr = learning rate
```

Vocoder preprocessing: reduce `synthesis_batch_size` in `synthesizer/hparams.py`
```
# Before
### Data Preprocessing
    max_mel_frames = 900,
    rescale = True,
    rescaling_max = 0.9,
    synthesis_batch_size = 16,  # For vocoder preprocessing and inference.

# After
### Data Preprocessing
    max_mel_frames = 900,
    rescale = True,
    rescaling_max = 0.9,
    synthesis_batch_size = 8,   # For vocoder preprocessing and inference.
```

Vocoder training: reduce `voc_batch_size` in `vocoder/wavernn/hparams.py`
```
# Before
# Training
voc_batch_size = 100
voc_lr = 1e-4
voc_gen_at_checkpoint = 5
voc_pad = 2

# After
# Training
voc_batch_size = 6
voc_lr = 1e-4
voc_gen_at_checkpoint = 5
voc_pad = 2
```
#### 4. If you hit `RuntimeError: Error(s) in loading state_dict for Tacotron: size mismatch for encoder.embedding.weight: copying a param with shape torch.Size([70, 512]) from checkpoint, the shape in current model is torch.Size([75, 512]).`
Please refer to issue [#37](https://github.com/babysor/MockingBird/issues/37)

#### 5. How do I improve CPU and GPU utilization?
Adjust the batch_size as appropriate to improve it.

#### 6. What if I hit `the page file is too small to complete the operation`?
Please refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and increase the virtual memory to 100 GB (102400); for example, if the data sits on drive D, increase the virtual memory of drive D.

#### 7. When should I stop training?
FYI, my attention line appeared after 18k steps and the loss dropped below 0.4 after 50k steps.
![attention_step_20500_sample_1](https://user-images.githubusercontent.com/7423248/128587252-f669f05a-f411-4811-8784-222156ea5e9d.png)
![syn_loss](https://user-images.githubusercontent.com/7423248/128587255-4945faa0-5517-46ea-b173-928eff999330.png)
@@ -117,6 +117,15 @@ def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir,
|
||||
logger.finalize()
|
||||
print("Done preprocessing %s.\n" % dataset_name)
|
||||
|
||||
def preprocess_aidatatang_200zh(datasets_root: Path, out_dir: Path, skip_existing=False):
|
||||
dataset_name = "aidatatang_200zh"
|
||||
dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
|
||||
if not dataset_root:
|
||||
return
|
||||
# Preprocess all speakers
|
||||
speaker_dirs = list(dataset_root.joinpath("corpus", "train").glob("*"))
|
||||
_preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
|
||||
skip_existing, logger)
|
||||
|
||||
def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
|
||||
for dataset_name in librispeech_datasets["train"]["other"]:
|
||||
|
||||
Binary file not shown.
@@ -1,4 +1,4 @@
|
||||
from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2
|
||||
from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aidatatang_200zh
|
||||
from utils.argutils import print_args
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
@@ -10,17 +10,7 @@ if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Preprocesses audio files from datasets, encodes them as mel spectrograms and "
|
||||
"writes them to the disk. This will allow you to train the encoder. The "
|
||||
"datasets required are at least one of VoxCeleb1, VoxCeleb2 and LibriSpeech. "
|
||||
"Ideally, you should have all three. You should extract them as they are "
|
||||
"after having downloaded them and put them in a same directory, e.g.:\n"
|
||||
"-[datasets_root]\n"
|
||||
" -LibriSpeech\n"
|
||||
" -train-other-500\n"
|
||||
" -VoxCeleb1\n"
|
||||
" -wav\n"
|
||||
" -vox1_meta.csv\n"
|
||||
" -VoxCeleb2\n"
|
||||
" -dev",
|
||||
"datasets required are at least one of LibriSpeech, VoxCeleb1, VoxCeleb2, aidatatang_200zh. ",
|
||||
formatter_class=MyFormatter
|
||||
)
|
||||
parser.add_argument("datasets_root", type=Path, help=\
|
||||
@@ -29,7 +19,7 @@ if __name__ == "__main__":
|
||||
"Path to the output directory that will contain the mel spectrograms. If left out, "
|
||||
"defaults to <datasets_root>/SV2TTS/encoder/")
|
||||
parser.add_argument("-d", "--datasets", type=str,
|
||||
default="librispeech_other,voxceleb1,voxceleb2", help=\
|
||||
default="librispeech_other,voxceleb1,aidatatang_200zh", help=\
|
||||
"Comma-separated list of the name of the datasets you want to preprocess. Only the train "
|
||||
"set of these datasets will be used. Possible names: librispeech_other, voxceleb1, "
|
||||
"voxceleb2.")
|
||||
@@ -63,6 +53,7 @@ if __name__ == "__main__":
|
||||
"librispeech_other": preprocess_librispeech,
|
||||
"voxceleb1": preprocess_voxceleb1,
|
||||
"voxceleb2": preprocess_voxceleb2,
|
||||
"aidatatang_200zh": preprocess_aidatatang_200zh,
|
||||
}
|
||||
args = vars(args)
|
||||
for dataset in args.pop("datasets"):
|
||||
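Assuming this parser belongs to `encoder_preprocess.py`, a typical invocation that exercises the newly added dataset entry would be `python encoder_preprocess.py <datasets_root> -d aidatatang_200zh`.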
18
pre.py
18
pre.py
@@ -12,8 +12,7 @@ import argparse
|
||||
recognized_datasets = [
|
||||
"aidatatang_200zh",
|
||||
"magicdata",
|
||||
"aishell3",
|
||||
"BZNSYP"
|
||||
"aishell3"
|
||||
]
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -29,8 +28,7 @@ if __name__ == "__main__":
|
||||
"Path to the output directory that will contain the mel spectrograms, the audios and the "
|
||||
"embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
|
||||
parser.add_argument("-n", "--n_processes", type=int, default=1, help=\
|
||||
"Number of processes in parallel.An encoder is created for each, so you may need to lower "
|
||||
"this value on GPUs with low memory. Set it to 1 if CUDA is unhappy")
|
||||
"Number of processes in parallel.")
|
||||
parser.add_argument("-s", "--skip_existing", action="store_true", help=\
|
||||
"Whether to overwrite existing files with the same name. Useful if the preprocessing was "
|
||||
"interrupted. ")
|
||||
@@ -41,10 +39,13 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--no_alignments", action="store_true", help=\
|
||||
"Use this option when dataset does not include alignments\
|
||||
(these are used to split long audio files into sub-utterances.)")
|
||||
parser.add_argument("-d","--dataset", type=str, default="aidatatang_200zh", help=\
|
||||
"Name of the dataset to process, allowing values: magicdata, aidatatang_200zh, aishell3, BZNSYP.")
|
||||
parser.add_argument("-d", "--dataset", type=str, default="aidatatang_200zh", help=\
|
||||
"Name of the dataset to process, allowing values: magicdata, aidatatang_200zh, aishell3.")
|
||||
parser.add_argument("-e", "--encoder_model_fpath", type=Path, default="encoder/saved_models/pretrained.pt", help=\
|
||||
"Path your trained encoder model.")
|
||||
parser.add_argument("-ne", "--n_processes_embed", type=int, default=1, help=\
|
||||
"Number of processes in parallel.An encoder is created for each, so you may need to lower "
|
||||
"this value on GPUs with low memory. Set it to 1 if CUDA is unhappy")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process the arguments
|
||||
@@ -67,7 +68,8 @@ if __name__ == "__main__":
|
||||
del args.no_trim, args.encoder_model_fpath
|
||||
|
||||
args.hparams = hparams.parse(args.hparams)
|
||||
|
||||
n_processes_embed = args.n_processes_embed
|
||||
del args.n_processes_embed
|
||||
preprocess_dataset(**vars(args))
|
||||
|
||||
create_embeddings(synthesizer_root=args.out_dir, n_processes=args.n_processes, encoder_model_fpath=encoder_model_fpath)
|
||||
create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath)
|
||||
|
||||
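With the reworked arguments, `-n` controls the preprocessing workers while the new `-ne` controls how many embedding workers are spawned (an encoder is created for each), and the final `create_embeddings` call now uses that separate count; for example, `python pre.py <datasets_root> -d aidatatang_200zh -n 2 -ne 1` keeps the embedding stage on a single process for low-memory GPUs.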
@@ -14,4 +14,10 @@ PyQt5
|
||||
multiprocess
|
||||
numba
|
||||
webrtcvad; platform_system != "Windows"
|
||||
pypinyin
|
||||
pypinyin
|
||||
flask
|
||||
flask_wtf
|
||||
flask_cors
|
||||
gevent==21.8.0
|
||||
flask_restx
|
||||
tensorboard
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,22 +0,0 @@
|
||||
The audio files in this folder are provided for toolbox testing and
|
||||
benchmarking purposes. These are the same reference utterances
|
||||
used by the SV2TTS authors to generate the audio samples located at:
|
||||
https://google.github.io/tacotron/publications/speaker_adaptation/index.html
|
||||
|
||||
The `p240_00000.mp3` and `p260_00000.mp3` files are compressed
|
||||
versions of audios from the VCTK corpus available at:
|
||||
https://datashare.is.ed.ac.uk/handle/10283/3443
|
||||
VCTK.txt contains the copyright notices and licensing information.
|
||||
|
||||
The `1320_00000.mp3`, `3575_00000.mp3`, `6829_00000.mp3`
|
||||
and `8230_00000.mp3` files are compressed versions of audios
|
||||
from the LibriSpeech dataset available at: https://openslr.org/12
|
||||
For these files, the following notice applies:
|
||||
```
|
||||
LibriSpeech (c) 2014 by Vassil Panayotov
|
||||
|
||||
LibriSpeech ASR corpus is licensed under a
|
||||
Creative Commons Attribution 4.0 International License.
|
||||
|
||||
See <http://creativecommons.org/licenses/by/4.0/>.
|
||||
```
|
||||
@@ -1,94 +0,0 @@
|
||||
---------------------------------------------------------------------
|
||||
CSTR VCTK Corpus
|
||||
English Multi-speaker Corpus for CSTR Voice Cloning Toolkit
|
||||
|
||||
(Version 0.92)
|
||||
RELEASE September 2019
|
||||
The Centre for Speech Technology Research
|
||||
University of Edinburgh
|
||||
Copyright (c) 2019
|
||||
|
||||
Junichi Yamagishi
|
||||
jyamagis@inf.ed.ac.uk
|
||||
---------------------------------------------------------------------
|
||||
|
||||
Overview
|
||||
|
||||
This CSTR VCTK Corpus includes speech data uttered by 110 English
|
||||
speakers with various accents. Each speaker reads out about 400
|
||||
sentences, which were selected from a newspaper, the rainbow passage
|
||||
and an elicitation paragraph used for the speech accent archive.
|
||||
|
||||
The newspaper texts were taken from Herald Glasgow, with permission
|
||||
from Herald & Times Group. Each speaker has a different set of the
|
||||
newspaper texts selected based a greedy algorithm that increases the
|
||||
contextual and phonetic coverage. The details of the text selection
|
||||
algorithms are described in the following paper:
|
||||
|
||||
C. Veaux, J. Yamagishi and S. King,
|
||||
"The voice bank corpus: Design, collection and data analysis of
|
||||
a large regional accent speech database,"
|
||||
https://doi.org/10.1109/ICSDA.2013.6709856
|
||||
|
||||
The rainbow passage and elicitation paragraph are the same for all
|
||||
speakers. The rainbow passage can be found at International Dialects
|
||||
of English Archive:
|
||||
(http://web.ku.edu/~idea/readings/rainbow.htm). The elicitation
|
||||
paragraph is identical to the one used for the speech accent archive
|
||||
(http://accent.gmu.edu). The details of the the speech accent archive
|
||||
can be found at
|
||||
http://www.ualberta.ca/~aacl2009/PDFs/WeinbergerKunath2009AACL.pdf
|
||||
|
||||
All speech data was recorded using an identical recording setup: an
|
||||
omni-directional microphone (DPA 4035) and a small diaphragm condenser
|
||||
microphone with very wide bandwidth (Sennheiser MKH 800), 96kHz
|
||||
sampling frequency at 24 bits and in a hemi-anechoic chamber of
|
||||
the University of Edinburgh. (However, two speakers, p280 and p315
|
||||
had technical issues of the audio recordings using MKH 800).
|
||||
All recordings were converted into 16 bits, were downsampled to
|
||||
48 kHz, and were manually end-pointed.
|
||||
|
||||
This corpus was originally aimed for HMM-based text-to-speech synthesis
|
||||
systems, especially for speaker-adaptive HMM-based speech synthesis
|
||||
that uses average voice models trained on multiple speakers and speaker
|
||||
adaptation technologies. This corpus is also suitable for DNN-based
|
||||
multi-speaker text-to-speech synthesis systems and waveform modeling.
|
||||
|
||||
COPYING
|
||||
|
||||
This corpus is licensed under the Creative Commons License: Attribution 4.0 International
|
||||
http://creativecommons.org/licenses/by/4.0/legalcode
|
||||
|
||||
VCTK VARIANTS
|
||||
There are several variants of the VCTK corpus:
|
||||
Speech enhancement
|
||||
- Noisy speech database for training speech enhancement algorithms and TTS models where we added various types of noises to VCTK artificially: http://dx.doi.org/10.7488/ds/2117
|
||||
- Reverberant speech database for training speech dereverberation algorithms and TTS models where we added various types of reverberantion to VCTK artificially http://dx.doi.org/10.7488/ds/1425
|
||||
- Noisy reverberant speech database for training speech enhancement algorithms and TTS models http://dx.doi.org/10.7488/ds/2139
|
||||
- Device Recorded VCTK where speech signals of the VCTK corpus were played back and re-recorded in office environments using relatively inexpensive consumer devices http://dx.doi.org/10.7488/ds/2316
|
||||
- The Microsoft Scalable Noisy Speech Dataset (MS-SNSD) https://github.com/microsoft/MS-SNSD
|
||||
|
||||
ASV and anti-spoofing
|
||||
- Spoofing and Anti-Spoofing (SAS) corpus, which is a collection of synthetic speech signals produced by nine techniques, two of which are speech synthesis, and seven are voice conversion. All of them were built using the VCTK corpus. http://dx.doi.org/10.7488/ds/252
|
||||
- Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015) Database. This database consists of synthetic speech signals produced by ten techniques and this has been used in the first Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2015) http://dx.doi.org/10.7488/ds/298
|
||||
- ASVspoof 2019: The 3rd Automatic Speaker Verification Spoofing and Countermeasures Challenge database. This database has been used in the 3rd Automatic Speaker Verification Spoofing and Countermeasures Challenge (ASVspoof 2019) https://doi.org/10.7488/ds/2555
|
||||
|
||||
|
||||
ACKNOWLEDGEMENTS
|
||||
|
||||
The CSTR VCTK Corpus was constructed by:
|
||||
|
||||
Christophe Veaux (University of Edinburgh)
|
||||
Junichi Yamagishi (University of Edinburgh)
|
||||
Kirsten MacDonald
|
||||
|
||||
The research leading to these results was partly funded from EPSRC
|
||||
grants EP/I031022/1 (NST) and EP/J002526/1 (CAF), from the RSE-NSFC
|
||||
grant (61111130120), and from the JST CREST (uDialogue).
|
||||
|
||||
Please cite this corpus as follows:
|
||||
Christophe Veaux, Junichi Yamagishi, Kirsten MacDonald,
|
||||
"CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit",
|
||||
The Centre for Speech Technology Research (CSTR),
|
||||
University of Edinburgh
|
||||
|
||||
Binary file not shown.
Binary file not shown.
13
synthesizer/gst_hyperparameters.py
Normal file
13
synthesizer/gst_hyperparameters.py
Normal file
@@ -0,0 +1,13 @@
|
||||
class GSTHyperparameters():
|
||||
E = 512
|
||||
|
||||
# reference encoder
|
||||
ref_enc_filters = [32, 32, 64, 64, 128, 128]
|
||||
|
||||
# style token layer
|
||||
token_num = 10
|
||||
# token_emb_size = 256
|
||||
num_heads = 8
|
||||
|
||||
n_mels = 256 # Number of Mel banks to generate
|
||||
|
||||
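Taken together, these constants fix the shapes used later in `synthesizer/models/global_style_token.py`: the learned style-token table is `[token_num, E // num_heads] = [10, 64]`, the reference-encoder GRU emits `E // 2 = 256` features, and the multi-head attention output is `E = 512` wide. A quick sketch of that arithmetic (illustrative only):

```python
# Illustrative check of the sizes implied by GSTHyperparameters.
E, num_heads, token_num = 512, 8, 10
print(token_num, E // num_heads)  # style-token table: 10 tokens, 64 dims each
print(E // 2)                     # reference-encoder GRU hidden size: 256
print(E)                          # attention output width: 512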
@@ -49,12 +49,15 @@ hparams = HParams(
|
||||
# frame that has all values < -3.4
|
||||
|
||||
### Tacotron Training
|
||||
tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
|
||||
(2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
|
||||
(2, 2e-4, 80_000, 12), #
|
||||
(2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
|
||||
(2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
|
||||
(2, 1e-5, 640_000, 12)], # lr = learning rate
|
||||
tts_schedule = [(2, 1e-3, 10_000, 12), # Progressive training schedule
|
||||
(2, 5e-4, 15_000, 12), # (r, lr, step, batch_size)
|
||||
(2, 2e-4, 20_000, 12), # (r, lr, step, batch_size)
|
||||
(2, 1e-4, 30_000, 12), #
|
||||
(2, 5e-5, 40_000, 12), #
|
||||
(2, 1e-5, 60_000, 12), #
|
||||
(2, 5e-6, 160_000, 12), # r = reduction factor (# of mel frames
|
||||
(2, 3e-6, 320_000, 12), # synthesized for each decoder iteration)
|
||||
(2, 1e-6, 640_000, 12)], # lr = learning rate
|
||||
|
||||
tts_clip_grad_norm = 1.0, # clips the gradient norm to prevent explosion - set to None if not needed
|
||||
tts_eval_interval = 500, # Number of steps between model evaluation (sample generation)
|
||||
|
||||
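The schedule entries are consumed as `(r, lr, max_step, batch_size)` tuples; a minimal sketch of how a training loop might pick the active session for the current step (illustrative only, the real selection lives in `synthesizer/train.py`):

```python
# Hedged sketch: pick the schedule entry that applies to the current training step.
def current_session(tts_schedule, step):
    for r, lr, max_step, batch_size in tts_schedule:
        if step < max_step:
            return r, lr, max_step, batch_size
    return tts_schedule[-1]  # past the last milestone, keep the final settings

print(current_session([(2, 1e-3, 10_000, 12), (2, 5e-4, 15_000, 12)], 12_000))
# -> (2, 0.0005, 15000, 12)
```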
@@ -70,7 +70,7 @@ class Synthesizer:
|
||||
|
||||
def synthesize_spectrograms(self, texts: List[str],
|
||||
embeddings: Union[np.ndarray, List[np.ndarray]],
|
||||
return_alignments=False):
|
||||
return_alignments=False, style_idx=0):
|
||||
"""
|
||||
Synthesizes mel spectrograms from texts and speaker embeddings.
|
||||
|
||||
@@ -125,7 +125,7 @@ class Synthesizer:
|
||||
speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)
|
||||
|
||||
# Inference
|
||||
_, mels, alignments = self._model.generate(chars, speaker_embeddings)
|
||||
_, mels, alignments = self._model.generate(chars, speaker_embeddings, style_idx=style_idx)
|
||||
mels = mels.detach().cpu().numpy()
|
||||
for m in mels:
|
||||
# Trim silence from end of each spectrogram
|
||||
|
||||
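A hedged usage sketch of the new `style_idx` argument (the checkpoint path, input text, and the random embedding below are placeholders, not taken from this change; per `Tacotron.generate`, indices 0-9 select a learned style token and a negative value leaves the speaker embedding untouched):

```python
import numpy as np
from synthesizer.inference import Synthesizer  # assumed module path for this class

synthesizer = Synthesizer("synthesizer/saved_models/mandarin/mandarin.pt")  # placeholder checkpoint
embed = np.random.rand(256).astype(np.float32)       # stands in for a real speaker embedding
specs = synthesizer.synthesize_spectrograms(
    ["欢迎使用本工具箱"], [embed], style_idx=3)       # pick style token 3
mel = specs[0]                                       # mel spectrogram for the first text
```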
135
synthesizer/models/global_style_token.py
Normal file
135
synthesizer/models/global_style_token.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.init as init
|
||||
import torch.nn.functional as tFunctional
|
||||
from synthesizer.gst_hyperparameters import GSTHyperparameters as hp
|
||||
|
||||
|
||||
class GlobalStyleToken(nn.Module):
|
||||
|
||||
def __init__(self):
|
||||
|
||||
super().__init__()
|
||||
self.encoder = ReferenceEncoder()
|
||||
self.stl = STL()
|
||||
|
||||
def forward(self, inputs):
|
||||
enc_out = self.encoder(inputs)
|
||||
style_embed = self.stl(enc_out)
|
||||
|
||||
return style_embed
|
||||
|
||||
|
||||
class ReferenceEncoder(nn.Module):
|
||||
'''
|
||||
inputs --- [N, Ty/r, n_mels*r] mels
|
||||
outputs --- [N, ref_enc_gru_size]
|
||||
'''
|
||||
|
||||
def __init__(self):
|
||||
|
||||
super().__init__()
|
||||
K = len(hp.ref_enc_filters)
|
||||
filters = [1] + hp.ref_enc_filters
|
||||
convs = [nn.Conv2d(in_channels=filters[i],
|
||||
out_channels=filters[i + 1],
|
||||
kernel_size=(3, 3),
|
||||
stride=(2, 2),
|
||||
padding=(1, 1)) for i in range(K)]
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=hp.ref_enc_filters[i]) for i in range(K)])
|
||||
|
||||
out_channels = self.calculate_channels(hp.n_mels, 3, 2, 1, K)
|
||||
self.gru = nn.GRU(input_size=hp.ref_enc_filters[-1] * out_channels,
|
||||
hidden_size=hp.E // 2,
|
||||
batch_first=True)
|
||||
|
||||
def forward(self, inputs):
|
||||
N = inputs.size(0)
|
||||
out = inputs.view(N, 1, -1, hp.n_mels) # [N, 1, Ty, n_mels]
|
||||
for conv, bn in zip(self.convs, self.bns):
|
||||
out = conv(out)
|
||||
out = bn(out)
|
||||
out = tFunctional.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
|
||||
|
||||
out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
|
||||
T = out.size(1)
|
||||
N = out.size(0)
|
||||
out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
|
||||
|
||||
self.gru.flatten_parameters()
|
||||
memory, out = self.gru(out) # out --- [1, N, E//2]
|
||||
|
||||
return out.squeeze(0)
|
||||
|
||||
def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
|
||||
for i in range(n_convs):
|
||||
L = (L - kernel_size + 2 * pad) // stride + 1
|
||||
return L
|
||||
|
||||
|
||||
class STL(nn.Module):
|
||||
'''
|
||||
inputs --- [N, E//2]
|
||||
'''
|
||||
|
||||
def __init__(self):
|
||||
|
||||
super().__init__()
|
||||
self.embed = nn.Parameter(torch.FloatTensor(hp.token_num, hp.E // hp.num_heads))
|
||||
d_q = hp.E // 2
|
||||
d_k = hp.E // hp.num_heads
|
||||
# self.attention = MultiHeadAttention(hp.num_heads, d_model, d_q, d_v)
|
||||
self.attention = MultiHeadAttention(query_dim=d_q, key_dim=d_k, num_units=hp.E, num_heads=hp.num_heads)
|
||||
|
||||
init.normal_(self.embed, mean=0, std=0.5)
|
||||
|
||||
def forward(self, inputs):
|
||||
N = inputs.size(0)
|
||||
query = inputs.unsqueeze(1) # [N, 1, E//2]
|
||||
keys = tFunctional.tanh(self.embed).unsqueeze(0).expand(N, -1, -1) # [N, token_num, E // num_heads]
|
||||
style_embed = self.attention(query, keys)
|
||||
|
||||
return style_embed
|
||||
|
||||
|
||||
class MultiHeadAttention(nn.Module):
|
||||
'''
|
||||
input:
|
||||
query --- [N, T_q, query_dim]
|
||||
key --- [N, T_k, key_dim]
|
||||
output:
|
||||
out --- [N, T_q, num_units]
|
||||
'''
|
||||
|
||||
def __init__(self, query_dim, key_dim, num_units, num_heads):
|
||||
|
||||
super().__init__()
|
||||
self.num_units = num_units
|
||||
self.num_heads = num_heads
|
||||
self.key_dim = key_dim
|
||||
|
||||
self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
|
||||
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
||||
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
||||
|
||||
def forward(self, query, key):
|
||||
querys = self.W_query(query) # [N, T_q, num_units]
|
||||
keys = self.W_key(key) # [N, T_k, num_units]
|
||||
values = self.W_value(key)
|
||||
|
||||
split_size = self.num_units // self.num_heads
|
||||
querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h]
|
||||
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
||||
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
||||
|
||||
# score = softmax(QK^T / (d_k ** 0.5))
|
||||
scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
|
||||
scores = scores / (self.key_dim ** 0.5)
|
||||
scores = tFunctional.softmax(scores, dim=3)
|
||||
|
||||
# out = score * V
|
||||
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
|
||||
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
|
||||
|
||||
return out
|
||||
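A minimal shape-check sketch for the new module: the 256-dimensional dummy batch below stands in for the speaker embeddings that `Tacotron` feeds into `self.gst`, and the sizes follow the `GSTHyperparameters` defaults (E=512, 10 tokens, 8 heads):

```python
import torch
from synthesizer.models.global_style_token import GlobalStyleToken

gst = GlobalStyleToken()
dummy_embeddings = torch.randn(4, 256)  # batch of 4 vectors; viewed as [N, 1, 1, n_mels] inside ReferenceEncoder
style = gst(dummy_embeddings)           # ReferenceEncoder -> STL multi-head attention
print(style.shape)                      # torch.Size([4, 1, 512]); expanded onto encoder_seq in Tacotron
```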
@@ -3,8 +3,7 @@ import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
from synthesizer.models.global_style_token import GlobalStyleToken
|
||||
|
||||
|
||||
class HighwayNetwork(nn.Module):
|
||||
@@ -338,6 +337,7 @@ class Tacotron(nn.Module):
|
||||
self.encoder = Encoder(embed_dims, num_chars, encoder_dims,
|
||||
encoder_K, num_highways, dropout)
|
||||
self.encoder_proj = nn.Linear(encoder_dims + speaker_embedding_size, decoder_dims, bias=False)
|
||||
self.gst = GlobalStyleToken()
|
||||
self.decoder = Decoder(n_mels, encoder_dims, decoder_dims, lstm_dims,
|
||||
dropout, speaker_embedding_size)
|
||||
self.postnet = CBHG(postnet_K, n_mels, postnet_dims,
|
||||
@@ -358,11 +358,11 @@ class Tacotron(nn.Module):
|
||||
def r(self, value):
|
||||
self.decoder.r = self.decoder.r.new_tensor(value, requires_grad=False)
|
||||
|
||||
def forward(self, x, m, speaker_embedding):
|
||||
def forward(self, texts, mels, speaker_embedding):
|
||||
device = next(self.parameters()).device # use same device as parameters
|
||||
|
||||
self.step += 1
|
||||
batch_size, _, steps = m.size()
|
||||
batch_size, _, steps = mels.size()
|
||||
|
||||
# Initialise all hidden states and pack into tuple
|
||||
attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
|
||||
@@ -383,7 +383,12 @@ class Tacotron(nn.Module):
|
||||
|
||||
# SV2TTS: Run the encoder with the speaker embedding
|
||||
# The projection avoids unnecessary matmuls in the decoder loop
|
||||
encoder_seq = self.encoder(x, speaker_embedding)
|
||||
encoder_seq = self.encoder(texts, speaker_embedding)
|
||||
# put after encoder
|
||||
if self.gst is not None:
|
||||
style_embed = self.gst(speaker_embedding)
|
||||
style_embed = style_embed.expand_as(encoder_seq)
|
||||
encoder_seq = encoder_seq + style_embed
|
||||
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
||||
|
||||
# Need a couple of lists for outputs
|
||||
@@ -391,10 +396,10 @@ class Tacotron(nn.Module):
|
||||
|
||||
# Run the decoder loop
|
||||
for t in range(0, steps, self.r):
|
||||
prenet_in = m[:, :, t - 1] if t > 0 else go_frame
|
||||
prenet_in = mels[:, :, t - 1] if t > 0 else go_frame
|
||||
mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
|
||||
self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
|
||||
hidden_states, cell_states, context_vec, t, x)
|
||||
hidden_states, cell_states, context_vec, t, texts)
|
||||
mel_outputs.append(mel_frames)
|
||||
attn_scores.append(scores)
|
||||
stop_outputs.extend([stop_tokens] * self.r)
|
||||
@@ -414,7 +419,7 @@ class Tacotron(nn.Module):
|
||||
|
||||
return mel_outputs, linear, attn_scores, stop_outputs
|
||||
|
||||
def generate(self, x, speaker_embedding=None, steps=2000):
|
||||
def generate(self, x, speaker_embedding=None, steps=200, style_idx=0):
|
||||
self.eval()
|
||||
device = next(self.parameters()).device # use same device as parameters
|
||||
|
||||
@@ -440,6 +445,18 @@ class Tacotron(nn.Module):
|
||||
# SV2TTS: Run the encoder with the speaker embedding
|
||||
# The projection avoids unnecessary matmuls in the decoder loop
|
||||
encoder_seq = self.encoder(x, speaker_embedding)
|
||||
|
||||
# put after encoder
|
||||
if self.gst is not None and style_idx >= 0 and style_idx < 10:
|
||||
gst_embed = self.gst.stl.embed.cpu().data.numpy() #[0, number_token]
|
||||
gst_embed = np.tile(gst_embed, (1, 8))
|
||||
scale = np.zeros(512)
|
||||
scale[:] = 0.3
|
||||
speaker_embedding = (gst_embed[style_idx] * scale).astype(np.float32)
|
||||
speaker_embedding = torch.from_numpy(np.tile(speaker_embedding, (x.shape[0], 1))).to(device)
|
||||
style_embed = self.gst(speaker_embedding)
|
||||
style_embed = style_embed.expand_as(encoder_seq)
|
||||
encoder_seq = encoder_seq + style_embed
|
||||
encoder_seq_proj = self.encoder_proj(encoder_seq)
|
||||
|
||||
# Need a couple of lists for outputs
|
||||
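In other words, a `style_idx` between 0 and 9 swaps the speaker embedding for a scaled copy of one learned style token: the `[10, 64]` token table is tiled eight times to 512 dimensions, multiplied by 0.3, and repeated across the batch before going through the GST module. The arithmetic in isolation (values illustrative):

```python
import numpy as np

gst_embed = np.random.randn(10, 64).astype(np.float32)  # stands in for self.gst.stl.embed
style_idx, batch_size = 3, 2
pseudo_embed = (np.tile(gst_embed, (1, 8))[style_idx] * 0.3).astype(np.float32)  # shape (512,)
batch_embed = np.tile(pseudo_embed, (batch_size, 1))                             # shape (2, 512)
print(pseudo_embed.shape, batch_embed.shape)
```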
@@ -494,7 +511,7 @@ class Tacotron(nn.Module):
|
||||
# Use device of model params as location for loaded state
|
||||
device = next(self.parameters()).device
|
||||
checkpoint = torch.load(str(path), map_location=device)
|
||||
self.load_state_dict(checkpoint["model_state"])
|
||||
self.load_state_dict(checkpoint["model_state"], strict=False)
|
||||
|
||||
if "optimizer_state" in checkpoint and optimizer is not None:
|
||||
optimizer.load_state_dict(checkpoint["optimizer_state"])
|
||||
|
||||
@@ -6,8 +6,8 @@ from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
from encoder import inference as encoder
|
||||
from synthesizer.preprocess_speaker import preprocess_speaker_general, preprocess_speaker_bznsyp
|
||||
from synthesizer.preprocess_transcript import preprocess_transcript_bznsyp
|
||||
from synthesizer.preprocess_speaker import preprocess_speaker_general
|
||||
from synthesizer.preprocess_transcript import preprocess_transcript_aishell3
|
||||
|
||||
data_info = {
|
||||
"aidatatang_200zh": {
|
||||
@@ -23,14 +23,9 @@ data_info = {
|
||||
"aishell3":{
|
||||
"subfolders": ["train/wav"],
|
||||
"trans_filepath": "train/content.txt",
|
||||
"speak_func": preprocess_speaker_general
|
||||
},
|
||||
"BZNSYP":{
|
||||
"subfolders": ["Wave"],
|
||||
"trans_filepath": "ProsodyLabeling/000001-010000.txt",
|
||||
"speak_func": preprocess_speaker_bznsyp,
|
||||
"transcript_func": preprocess_transcript_bznsyp,
|
||||
},
|
||||
"speak_func": preprocess_speaker_general,
|
||||
"transcript_func": preprocess_transcript_aishell3,
|
||||
}
|
||||
}
|
||||
|
||||
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
||||
|
||||
@@ -61,7 +61,7 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
||||
return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
|
||||
|
||||
|
||||
def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
|
||||
def _split_on_silences(wav_fpath, words, hparams):
|
||||
# Load the audio waveform
|
||||
wav, _ = librosa.load(wav_fpath, hparams.sample_rate)
|
||||
wav = librosa.effects.trim(wav, top_db= 40, frame_length=2048, hop_length=512)[0]
|
||||
@@ -81,24 +81,19 @@ def _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams):
|
||||
return wav, res
|
||||
|
||||
def preprocess_speaker_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
|
||||
wav_fpath_list = speaker_dir.glob("*.wav")
|
||||
return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
|
||||
|
||||
def preprocess_speaker_bznsyp(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
|
||||
wav_fpath_list = [speaker_dir]
|
||||
return preprocess_speaker_internal(wav_fpath_list, out_dir, skip_existing, hparams, dict_info, no_alignments)
|
||||
|
||||
def preprocess_speaker_internal(wav_fpath_list, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool):
|
||||
# Iterate over each wav
|
||||
metadata = []
|
||||
for wav_fpath in wav_fpath_list:
|
||||
words = dict_info.get(wav_fpath.name.split(".")[0])
|
||||
words = dict_info.get(wav_fpath.name) if not words else words # try with wav
|
||||
if not words:
|
||||
print("no wordS")
|
||||
continue
|
||||
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
||||
wav, text = _split_on_silences_aidatatang_200zh(wav_fpath, words, hparams)
|
||||
metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
|
||||
skip_existing, hparams))
|
||||
return [m for m in metadata if m is not None]
|
||||
extensions = ["*.wav", "*.flac", "*.mp3"]
|
||||
for extension in extensions:
|
||||
wav_fpath_list = speaker_dir.glob(extension)
|
||||
# Iterate over each wav
|
||||
for wav_fpath in wav_fpath_list:
|
||||
words = dict_info.get(wav_fpath.name.split(".")[0])
|
||||
words = dict_info.get(wav_fpath.name) if not words else words # try with wav
|
||||
if not words:
|
||||
print("no wordS")
|
||||
continue
|
||||
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
||||
wav, text = _split_on_silences(wav_fpath, words, hparams)
|
||||
metadata.append(_process_utterance(wav, text, out_dir, sub_basename,
|
||||
skip_existing, hparams))
|
||||
return [m for m in metadata if m is not None]
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
def preprocess_transcript_bznsyp(dict_info, dict_transcript):
|
||||
transList = []
|
||||
for t in dict_transcript:
|
||||
transList.append(t)
|
||||
for i in range(0, len(transList), 2):
|
||||
if not transList[i]:
|
||||
def preprocess_transcript_aishell3(dict_info, dict_transcript):
|
||||
for v in dict_transcript:
|
||||
if not v:
|
||||
continue
|
||||
key = transList[i].split("\t")[0]
|
||||
transcript = transList[i+1].strip().replace("\n","").replace("\t"," ")
|
||||
dict_info[key] = transcript
|
||||
v = v.strip().replace("\n","").replace("\t"," ").split(" ")
|
||||
transList = []
|
||||
for i in range(2, len(v), 2):
|
||||
transList.append(v[i])
|
||||
dict_info[v[0]] = " ".join(transList)
|
||||
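To make the stride-2 slice concrete: assuming an aishell3 `content.txt` line lists the wav name followed by alternating characters and pinyin (the sample line below is made up), the function keeps only the pinyin tokens:

```python
# Illustrative only: mimics preprocess_transcript_aishell3 on one hypothetical line.
dict_info = {}
line = "SSB00050001.wav\t广 guang3 州 zhou1 女 nv3 大 da4 学 xue2 生 sheng1"
v = line.strip().replace("\n", "").replace("\t", " ").split(" ")
dict_info[v[0]] = " ".join(v[i] for i in range(2, len(v), 2))
print(dict_info)  # {'SSB00050001.wav': 'guang3 zhou1 nv3 da4 xue2 sheng1'}
```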
@@ -2,11 +2,12 @@ import torch
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from synthesizer import audio
from synthesizer.models.tacotron import Tacotron
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.utils import ValueWindow, data_parallel_workaround
from synthesizer.utils.plot import plot_spectrogram
from synthesizer.utils.plot import plot_spectrogram, plot_spectrogram_and_trace
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import sequence_to_text
from vocoder.display import *
@@ -23,7 +24,7 @@ def time_string():
return datetime.now().strftime("%Y-%m-%d %H:%M")

def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,
backup_every: int, force_restart:bool, hparams):
backup_every: int, log_every:int, force_restart:bool, hparams):

syn_dir = Path(syn_dir)
models_dir = Path(models_dir)
@@ -123,6 +124,9 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,
shuffle=True,
pin_memory=True)

# tracing training step
sw = SummaryWriter(log_dir=model_dir.joinpath("logs"))

for i, session in enumerate(hparams.tts_schedule):
current_step = model.get_step()

@@ -208,9 +212,13 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,
step = model.get_step()
k = step // 1000

msg = f"| Epoch: {epoch}/{epochs} ({i}/{steps_per_epoch}) | Loss: {loss_window.average:#.4} | {1./time_window.average:#.2} steps/s | Step: {k}k | "
stream(msg)

if log_every != 0 and step % log_every == 0 :
sw.add_scalar("training/loss", loss_window.average, step)

# Backup or save model as appropriate
if backup_every != 0 and step % backup_every == 0 :
backup_fpath = Path("{}/{}_{}k.pt".format(str(weights_fpath.parent), run_id, k))
@@ -220,6 +228,7 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,
# Must save latest optimizer state to ensure that resuming training
# doesn't produce artifacts
model.save(weights_fpath, optimizer)

# Evaluate model to generate samples
epoch_eval = hparams.tts_eval_interval == -1 and i == steps_per_epoch # If epoch is done
@@ -233,7 +242,8 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,
mel_prediction = np_now(m2_hat[sample_idx]).T[:mel_length]
target_spectrogram = np_now(mels[sample_idx]).T[:mel_length]
attention_len = mel_length // model.r

# eval_loss = F.mse_loss(mel_prediction, target_spectrogram)
# sw.add_scalar("validing/loss", eval_loss.item(), step)
eval_model(attention=np_now(attention[sample_idx][:, :attention_len]),
mel_prediction=mel_prediction,
target_spectrogram=target_spectrogram,
@@ -244,7 +254,8 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,
wav_dir=wav_dir,
sample_num=sample_idx + 1,
loss=loss,
hparams=hparams)
hparams=hparams,
sw=sw)

# Break out of loop to update training schedule
if step >= max_step:
@@ -254,10 +265,11 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int,
print("")

def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step,
plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams):
plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams, sw):
# Save some results for evaluation
attention_path = str(plot_dir.joinpath("attention_step_{}_sample_{}".format(step, sample_num)))
save_attention(attention, attention_path)
# save_attention(attention, attention_path)
save_and_trace_attention(attention, attention_path, sw, step)

# save predicted mel spectrogram to disk (debug)
mel_output_fpath = mel_output_dir.joinpath("mel-prediction-step-{}_sample_{}.npy".format(step, sample_num))
@@ -271,7 +283,15 @@ def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step,
# save real and predicted mel-spectrogram plot to disk (control purposes)
spec_fpath = plot_dir.joinpath("step-{}-mel-spectrogram_sample_{}.png".format(step, sample_num))
title_str = "{}, {}, step={}, loss={:.5f}".format("Tacotron", time_string(), step, loss)
plot_spectrogram(mel_prediction, str(spec_fpath), title=title_str,
target_spectrogram=target_spectrogram,
max_len=target_spectrogram.size // hparams.num_mels)
# plot_spectrogram(mel_prediction, str(spec_fpath), title=title_str,
#                  target_spectrogram=target_spectrogram,
#                  max_len=target_spectrogram.size // hparams.num_mels)
plot_spectrogram_and_trace(
mel_prediction,
str(spec_fpath),
title=title_str,
target_spectrogram=target_spectrogram,
max_len=target_spectrogram.size // hparams.num_mels,
sw=sw,
step=step)
print("Input at step {}: {}".format(step, sequence_to_text(input_seq)))
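
The new log_every argument gates how often the running loss above is pushed to TensorBoard alongside the console stream. A minimal sketch of the same pattern, assuming a hypothetical run directory:

    from torch.utils.tensorboard import SummaryWriter

    log_every = 200   # CLI default added by synthesizer_train.py below; 0 disables scalar logging
    sw = SummaryWriter(log_dir="synthesizer/saved_models/demo_run/logs")   # hypothetical log directory

    def maybe_log(step, loss_window_average):
        # Same guard as in train(): only record a point every log_every steps.
        if log_every != 0 and step % log_every == 0:
            sw.add_scalar("training/loss", loss_window_average, step)

    maybe_log(400, 0.73)   # 400 is a multiple of log_every, so this point is written
    sw.close()
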
@@ -74,3 +74,42 @@ def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, targ
plt.tight_layout()
plt.savefig(path, format="png")
plt.close()


def plot_spectrogram_and_trace(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False, sw=None, step=0):
if max_len is not None:
target_spectrogram = target_spectrogram[:max_len]
pred_spectrogram = pred_spectrogram[:max_len]

if split_title:
title = split_title_line(title)

fig = plt.figure(figsize=(10, 8))
# Set common labels
fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16)

#target spectrogram subplot
if target_spectrogram is not None:
ax1 = fig.add_subplot(311)
ax2 = fig.add_subplot(312)

if auto_aspect:
im = ax1.imshow(np.rot90(target_spectrogram), aspect="auto", interpolation="none")
else:
im = ax1.imshow(np.rot90(target_spectrogram), interpolation="none")
ax1.set_title("Target Mel-Spectrogram")
fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1)
ax2.set_title("Predicted Mel-Spectrogram")
else:
ax2 = fig.add_subplot(211)

if auto_aspect:
im = ax2.imshow(np.rot90(pred_spectrogram), aspect="auto", interpolation="none")
else:
im = ax2.imshow(np.rot90(pred_spectrogram), interpolation="none")
fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2)

plt.tight_layout()
plt.savefig(path, format="png")
sw.add_figure("spectrogram", fig, step)
plt.close()
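
Besides saving the comparison PNG to disk, plot_spectrogram_and_trace now hands the same matplotlib figure to TensorBoard through sw.add_figure, so the target/predicted plots show up under the Images tab. A minimal sketch of that pattern with stand-in data and a hypothetical log directory:

    import matplotlib
    matplotlib.use("Agg")                       # headless backend, as in vocoder/hifigan/utils.py below
    import matplotlib.pyplot as plt
    import numpy as np
    from torch.utils.tensorboard import SummaryWriter

    sw = SummaryWriter(log_dir="logs/demo")     # hypothetical log directory
    fig = plt.figure(figsize=(10, 8))
    plt.imshow(np.rot90(np.random.rand(200, 80)), aspect="auto", interpolation="none")   # stand-in mel
    fig.savefig("step-1000-mel-spectrogram_sample_1.png", format="png")
    sw.add_figure("spectrogram", fig, 1000)     # same tag and step argument as above
    plt.close(fig)
    sw.close()
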
@@ -12,6 +12,7 @@ recognized_datasets = [
]

if __name__ == "__main__":
print("This method is deprecated and will no longer be supported, please use 'pre.py'")
parser = argparse.ArgumentParser(
description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
"and writes them to the disk. Audio files are also saved, to be used by the "
@@ -5,6 +5,7 @@ import argparse


if __name__ == "__main__":
print("This method is deprecated and will no longer be supported, please use 'pre.py'")
parser = argparse.ArgumentParser(
description="Creates embeddings for the synthesizer from the LibriSpeech utterances.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -21,6 +21,8 @@ if __name__ == "__main__":
parser.add_argument("-b", "--backup_every", type=int, default=25000, help= \
"Number of steps between backups of the model. Set to 0 to never make backups of the "
"model.")
parser.add_argument("-l", "--log_every", type=int, default=200, help= \
"Number of steps between TensorBoard summaries of the training info. Set to 0 to disable logging.")
parser.add_argument("-f", "--force_restart", action="store_true", help= \
"Do not load any saved model and restart from scratch.")
parser.add_argument("--hparams", default="",
@@ -1,7 +1,8 @@
from toolbox.ui import UI
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder
from vocoder.wavernn import inference as rnn_vocoder
from vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
from time import perf_counter as timer
from toolbox.utterance import Utterance
@@ -13,6 +14,9 @@ import librosa
import re
from audioread.exceptions import NoBackendError

# Use wavernn by default
vocoder = rnn_vocoder

# Use this directory structure for your datasets, or modify it to fit your needs
recognized_datasets = [
"LibriSpeech/dev-clean",
@@ -46,13 +50,6 @@ MAX_WAVES = 15

class Toolbox:
def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, seed, no_mp3_support):
if not no_mp3_support:
try:
librosa.load("samples/6829_00000.mp3")
except NoBackendError:
print("Librosa will be unable to open mp3 files if additional software is not installed.\n"
"Please install ffmpeg or add the '--no_mp3_support' option to proceed without support for mp3 files.")
exit(-1)
self.no_mp3_support = no_mp3_support
sys.excepthook = self.excepthook
self.datasets_root = datasets_root
@@ -74,6 +71,7 @@ class Toolbox:

# Initialize the events and the interface
self.ui = UI()
self.style_idx = 0
self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir, seed)
self.setup_events()
self.ui.start()
@@ -236,7 +234,7 @@ class Toolbox:
texts = processed_texts
embed = self.ui.selected_utterance.embed
embeds = [embed] * len(texts)
specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
specs = self.synthesizer.synthesize_spectrograms(texts, embeds, style_idx=int(self.ui.style_idx_textbox.text()))
breaks = [spec.shape[1] for spec in specs]
spec = np.concatenate(specs, axis=1)

@@ -353,10 +351,21 @@ class Toolbox:
self.ui.set_loading(0)

def init_vocoder(self):

global vocoder
model_fpath = self.ui.current_vocoder_fpath
# Case of Griffin-lim
if model_fpath is None:
return

# Select vocoder based on model name
if model_fpath.name[0] == "g":
vocoder = gan_vocoder
self.ui.log("set hifigan as vocoder")
else:
vocoder = rnn_vocoder
self.ui.log("set wavernn as vocoder")

self.ui.log("Loading the vocoder %s... " % model_fpath)
self.ui.set_loading(1)

@@ -574,10 +574,14 @@ class UI(QDialog):
self.seed_textbox = QLineEdit()
self.seed_textbox.setMaximumWidth(80)
layout_seed.addWidget(self.seed_textbox, 0, 1)
layout_seed.addWidget(QLabel("Style#:(0~9)"), 0, 2)
self.style_idx_textbox = QLineEdit("-1")
self.style_idx_textbox.setMaximumWidth(80)
layout_seed.addWidget(self.style_idx_textbox, 0, 3)
self.trim_silences_checkbox = QCheckBox("Enhance vocoder output")
self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output."
" This feature requires `webrtcvad` to be installed.")
layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2)
layout_seed.addWidget(self.trim_silences_checkbox, 0, 4, 1, 2)
gen_layout.addLayout(layout_seed)

self.loading_bar = QProgressBar()
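
init_vocoder above dispatches purely on the checkpoint filename: names starting with "g" (the HiFi-GAN generator checkpoints such as g_hifigan.pt) load the hifigan inference module, anything else falls back to WaveRNN. A standalone sketch of the same dispatch, reusing the two inference modules imported at the top of the toolbox:

    from pathlib import Path
    from vocoder.wavernn import inference as rnn_vocoder
    from vocoder.hifigan import inference as gan_vocoder

    def pick_vocoder(model_fpath: Path):
        # Checkpoints named g_*.pt are HiFi-GAN generators; everything else is treated as WaveRNN.
        return gan_vocoder if model_fpath.name[0] == "g" else rnn_vocoder

    model_fpath = Path("vocoder/saved_models/pretrained/g_hifigan.pt")
    vocoder = pick_vocoder(model_fpath)
    vocoder.load_model(model_fpath)   # both modules expose the same load_model/infer_waveform API
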
@@ -11,7 +11,6 @@ def check_model_paths(encoder_path: Path, synthesizer_path: Path, vocoder_path:

# If none of the paths exist, remind the user to download models if needed
print("********************************************************************************")
print("Error: Model files not found. Follow these instructions to get and install the models:")
print("https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models")
print("Error: Model files not found. Please download the models")
print("********************************************************************************\n")
quit(-1)
@@ -91,6 +91,14 @@ def save_attention(attn, path) :
plt.close(fig)


def save_and_trace_attention(attn, path, sw, step):
fig = plt.figure(figsize=(12, 6))
plt.imshow(attn.T, interpolation='nearest', aspect='auto')
fig.savefig(f'{path}.png', bbox_inches='tight')
sw.add_figure('attention', fig, step)
plt.close(fig)


def save_spectrogram(M, path, length=None) :
M = np.flip(M, axis=0)
if length : M = M[:, :length]
31 vocoder/hifigan/config_16k_.json Normal file
@@ -0,0 +1,31 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,

"upsample_rates": [5,5,4,2],
"upsample_kernel_sizes": [10,10,8,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],

"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 200,
"win_size": 800,

"sampling_rate": 16000,

"fmin": 0,
"fmax": 7600,
"fmax_for_loss": null,

"num_workers": 4
}
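
Most of these values are tied together: the generator upsamples one mel frame into hop_size waveform samples, so the product of upsample_rates must equal hop_size (5*5*4*2 = 200 samples, i.e. a 12.5 ms hop at 16 kHz), and segment_size should be a whole number of hops. A small sanity check over this config:

    import json
    from functools import reduce

    with open("vocoder/hifigan/config_16k_.json") as f:
        h = json.load(f)

    total_upsampling = reduce(lambda a, b: a * b, h["upsample_rates"])   # 5 * 5 * 4 * 2 = 200
    assert total_upsampling == h["hop_size"], "generator stride must match the mel hop size"
    assert h["segment_size"] % h["hop_size"] == 0    # 6400 / 200 = 32 mel frames per training segment
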
15 vocoder/hifigan/env.py Normal file
@@ -0,0 +1,15 @@
import os
import shutil


class AttrDict(dict):
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self


def build_env(config, config_name, path):
t_path = os.path.join(path, config_name)
if config != t_path:
os.makedirs(path, exist_ok=True)
shutil.copyfile(config, os.path.join(path, config_name))
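
AttrDict simply aliases the dict's __dict__ onto itself, so the JSON config keys double as attributes; that is what lets the rest of the HiFi-GAN code write h.hop_size instead of h["hop_size"]. A two-line usage sketch:

    from vocoder.hifigan.env import AttrDict

    h = AttrDict({"hop_size": 200, "sampling_rate": 16000})
    assert h.hop_size == h["hop_size"] == 200   # attribute and key access share the same storage
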
70 vocoder/hifigan/inference.py Normal file
@@ -0,0 +1,70 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import json
import torch
from scipy.io.wavfile import write
from vocoder.hifigan.env import AttrDict
from vocoder.hifigan.meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
from vocoder.hifigan.models import Generator
import soundfile as sf


generator = None  # type: Generator
_device = None


def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict


def load_model(weights_fpath, verbose=True):
global generator, _device

if verbose:
print("Building hifigan")

with open("./vocoder/hifigan/config_16k_.json") as f:
data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)
torch.manual_seed(h.seed)

if torch.cuda.is_available():
# _model = _model.cuda()
_device = torch.device('cuda')
else:
_device = torch.device('cpu')

generator = Generator(h).to(_device)
state_dict_g = load_checkpoint(
weights_fpath, _device
)
generator.load_state_dict(state_dict_g['generator'])
generator.eval()
generator.remove_weight_norm()


def is_loaded():
return generator is not None


def infer_waveform(mel, progress_callback=None):

if generator is None:
raise Exception("Please load hifi-gan in memory before using it")

mel = torch.FloatTensor(mel).to(_device)
mel = mel.unsqueeze(0)

with torch.no_grad():
y_g_hat = generator(mel)
audio = y_g_hat.squeeze()
audio = audio.cpu().numpy()

return audio
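
This inference module mirrors the WaveRNN vocoder's API (load_model, is_loaded, infer_waveform), which is what allows the toolbox and the web server to swap vocoders freely. A minimal usage sketch, assuming the pretrained checkpoint path used elsewhere in this change set, a hypothetical mel file produced by the synthesizer, and that the script is run from the repository root (load_model reads ./vocoder/hifigan/config_16k_.json via a relative path):

    import numpy as np
    import soundfile as sf
    from vocoder.hifigan import inference as gan_vocoder

    gan_vocoder.load_model("vocoder/saved_models/pretrained/g_hifigan.pt")
    mel = np.load("mel-prediction-step-1000_sample_1.npy").T   # hypothetical synthesizer output, transposed to (num_mels, frames)
    wav = gan_vocoder.infer_waveform(mel)                      # numpy float waveform at 16 kHz
    sf.write("cloned.wav", wav, 16000)
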
178
vocoder/hifigan/meldataset.py
Normal file
178
vocoder/hifigan/meldataset.py
Normal file
@@ -0,0 +1,178 @@
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import torch
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
from librosa.util import normalize
|
||||
from scipy.io.wavfile import read
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
|
||||
MAX_WAV_VALUE = 32768.0
|
||||
|
||||
|
||||
def load_wav(full_path):
|
||||
sampling_rate, data = read(full_path)
|
||||
return data, sampling_rate
|
||||
|
||||
|
||||
def dynamic_range_compression(x, C=1, clip_val=1e-5):
|
||||
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
|
||||
|
||||
|
||||
def dynamic_range_decompression(x, C=1):
|
||||
return np.exp(x) / C
|
||||
|
||||
|
||||
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
|
||||
return torch.log(torch.clamp(x, min=clip_val) * C)
|
||||
|
||||
|
||||
def dynamic_range_decompression_torch(x, C=1):
|
||||
return torch.exp(x) / C
|
||||
|
||||
|
||||
def spectral_normalize_torch(magnitudes):
|
||||
output = dynamic_range_compression_torch(magnitudes)
|
||||
return output
|
||||
|
||||
|
||||
def spectral_de_normalize_torch(magnitudes):
|
||||
output = dynamic_range_decompression_torch(magnitudes)
|
||||
return output
|
||||
|
||||
|
||||
mel_basis = {}
|
||||
hann_window = {}
|
||||
|
||||
|
||||
def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
|
||||
if torch.min(y) < -1.:
|
||||
print('min value is ', torch.min(y))
|
||||
if torch.max(y) > 1.:
|
||||
print('max value is ', torch.max(y))
|
||||
|
||||
global mel_basis, hann_window
|
||||
if fmax not in mel_basis:
|
||||
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
|
||||
mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
|
||||
hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
|
||||
|
||||
y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
|
||||
y = y.squeeze(1)
|
||||
|
||||
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
|
||||
center=center, pad_mode='reflect', normalized=False, onesided=True)
|
||||
|
||||
spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
|
||||
|
||||
spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)
|
||||
spec = spectral_normalize_torch(spec)
|
||||
|
||||
return spec
|
||||
|
||||
|
||||
def get_dataset_filelist(a):
|
||||
# with open(a.input_training_file, 'r', encoding='utf-8') as fi:
|
||||
# training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
|
||||
# for x in fi.read().split('\n') if len(x) > 0]
|
||||
|
||||
# with open(a.input_validation_file, 'r', encoding='utf-8') as fi:
|
||||
# validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
|
||||
# for x in fi.read().split('\n') if len(x) > 0]
|
||||
|
||||
files = os.listdir(a.input_wavs_dir)
|
||||
random.shuffle(files)
|
||||
files = [os.path.join(a.input_wavs_dir, f) for f in files]
|
||||
training_files = files[: -int(len(files)*0.05)]
|
||||
validation_files = files[-int(len(files)*0.05): ]
|
||||
|
||||
return training_files, validation_files
|
||||
|
||||
|
||||
class MelDataset(torch.utils.data.Dataset):
|
||||
def __init__(self, training_files, segment_size, n_fft, num_mels,
|
||||
hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
|
||||
device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
|
||||
self.audio_files = training_files
|
||||
random.seed(1234)
|
||||
if shuffle:
|
||||
random.shuffle(self.audio_files)
|
||||
self.segment_size = segment_size
|
||||
self.sampling_rate = sampling_rate
|
||||
self.split = split
|
||||
self.n_fft = n_fft
|
||||
self.num_mels = num_mels
|
||||
self.hop_size = hop_size
|
||||
self.win_size = win_size
|
||||
self.fmin = fmin
|
||||
self.fmax = fmax
|
||||
self.fmax_loss = fmax_loss
|
||||
self.cached_wav = None
|
||||
self.n_cache_reuse = n_cache_reuse
|
||||
self._cache_ref_count = 0
|
||||
self.device = device
|
||||
self.fine_tuning = fine_tuning
|
||||
self.base_mels_path = base_mels_path
|
||||
|
||||
def __getitem__(self, index):
|
||||
filename = self.audio_files[index]
|
||||
if self._cache_ref_count == 0:
|
||||
# audio, sampling_rate = load_wav(filename)
|
||||
# audio = audio / MAX_WAV_VALUE
|
||||
audio = np.load(filename)
|
||||
if not self.fine_tuning:
|
||||
audio = normalize(audio) * 0.95
|
||||
self.cached_wav = audio
|
||||
# if sampling_rate != self.sampling_rate:
|
||||
# raise ValueError("{} SR doesn't match target {} SR".format(
|
||||
# sampling_rate, self.sampling_rate))
|
||||
self._cache_ref_count = self.n_cache_reuse
|
||||
else:
|
||||
audio = self.cached_wav
|
||||
self._cache_ref_count -= 1
|
||||
|
||||
audio = torch.FloatTensor(audio)
|
||||
audio = audio.unsqueeze(0)
|
||||
|
||||
if not self.fine_tuning:
|
||||
if self.split:
|
||||
if audio.size(1) >= self.segment_size:
|
||||
max_audio_start = audio.size(1) - self.segment_size
|
||||
audio_start = random.randint(0, max_audio_start)
|
||||
audio = audio[:, audio_start:audio_start+self.segment_size]
|
||||
else:
|
||||
audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
|
||||
|
||||
mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
|
||||
self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
|
||||
center=False)
|
||||
else:
|
||||
mel_path = os.path.join(self.base_mels_path, "mel" + "-" + filename.split("/")[-1].split("-")[-1])
|
||||
mel = np.load(mel_path).T
|
||||
# mel = np.load(
|
||||
# os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy'))
|
||||
mel = torch.from_numpy(mel)
|
||||
|
||||
if len(mel.shape) < 3:
|
||||
mel = mel.unsqueeze(0)
|
||||
|
||||
if self.split:
|
||||
frames_per_seg = math.ceil(self.segment_size / self.hop_size)
|
||||
|
||||
if audio.size(1) >= self.segment_size:
|
||||
mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
|
||||
mel = mel[:, :, mel_start:mel_start + frames_per_seg]
|
||||
audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
|
||||
else:
|
||||
mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
|
||||
audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
|
||||
|
||||
mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
|
||||
self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
|
||||
center=False)
|
||||
|
||||
return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
|
||||
|
||||
def __len__(self):
|
||||
return len(self.audio_files)
|
||||
286
vocoder/hifigan/models.py
Normal file
286
vocoder/hifigan/models.py
Normal file
@@ -0,0 +1,286 @@
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn as nn
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
from vocoder.hifigan.utils import init_weights, get_padding
|
||||
|
||||
LRELU_SLOPE = 0.1
|
||||
|
||||
|
||||
class ResBlock1(torch.nn.Module):
|
||||
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
|
||||
super(ResBlock1, self).__init__()
|
||||
self.h = h
|
||||
self.convs1 = nn.ModuleList([
|
||||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
|
||||
padding=get_padding(kernel_size, dilation[0]))),
|
||||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
|
||||
padding=get_padding(kernel_size, dilation[1]))),
|
||||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
|
||||
padding=get_padding(kernel_size, dilation[2])))
|
||||
])
|
||||
self.convs1.apply(init_weights)
|
||||
|
||||
self.convs2 = nn.ModuleList([
|
||||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
|
||||
padding=get_padding(kernel_size, 1))),
|
||||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
|
||||
padding=get_padding(kernel_size, 1))),
|
||||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
|
||||
padding=get_padding(kernel_size, 1)))
|
||||
])
|
||||
self.convs2.apply(init_weights)
|
||||
|
||||
def forward(self, x):
|
||||
for c1, c2 in zip(self.convs1, self.convs2):
|
||||
xt = F.leaky_relu(x, LRELU_SLOPE)
|
||||
xt = c1(xt)
|
||||
xt = F.leaky_relu(xt, LRELU_SLOPE)
|
||||
xt = c2(xt)
|
||||
x = xt + x
|
||||
return x
|
||||
|
||||
def remove_weight_norm(self):
|
||||
for l in self.convs1:
|
||||
remove_weight_norm(l)
|
||||
for l in self.convs2:
|
||||
remove_weight_norm(l)
|
||||
|
||||
|
||||
class ResBlock2(torch.nn.Module):
|
||||
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
|
||||
super(ResBlock2, self).__init__()
|
||||
self.h = h
|
||||
self.convs = nn.ModuleList([
|
||||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
|
||||
padding=get_padding(kernel_size, dilation[0]))),
|
||||
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
|
||||
padding=get_padding(kernel_size, dilation[1])))
|
||||
])
|
||||
self.convs.apply(init_weights)
|
||||
|
||||
def forward(self, x):
|
||||
for c in self.convs:
|
||||
xt = F.leaky_relu(x, LRELU_SLOPE)
|
||||
xt = c(xt)
|
||||
x = xt + x
|
||||
return x
|
||||
|
||||
def remove_weight_norm(self):
|
||||
for l in self.convs:
|
||||
remove_weight_norm(l)
|
||||
|
||||
|
||||
class Generator(torch.nn.Module):
|
||||
def __init__(self, h):
|
||||
super(Generator, self).__init__()
|
||||
self.h = h
|
||||
self.num_kernels = len(h.resblock_kernel_sizes)
|
||||
self.num_upsamples = len(h.upsample_rates)
|
||||
self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
|
||||
resblock = ResBlock1 if h.resblock == '1' else ResBlock2
|
||||
|
||||
self.ups = nn.ModuleList()
|
||||
for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
|
||||
# self.ups.append(weight_norm(
|
||||
# ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
|
||||
# k, u, padding=(k-u)//2)))
|
||||
self.ups.append(weight_norm(ConvTranspose1d(h.upsample_initial_channel//(2**i),
|
||||
h.upsample_initial_channel//(2**(i+1)),
|
||||
k, u, padding=(u//2 + u%2), output_padding=u%2)))
|
||||
|
||||
self.resblocks = nn.ModuleList()
|
||||
for i in range(len(self.ups)):
|
||||
ch = h.upsample_initial_channel//(2**(i+1))
|
||||
for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
|
||||
self.resblocks.append(resblock(h, ch, k, d))
|
||||
|
||||
self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
|
||||
self.ups.apply(init_weights)
|
||||
self.conv_post.apply(init_weights)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv_pre(x)
|
||||
for i in range(self.num_upsamples):
|
||||
x = F.leaky_relu(x, LRELU_SLOPE)
|
||||
x = self.ups[i](x)
|
||||
xs = None
|
||||
for j in range(self.num_kernels):
|
||||
if xs is None:
|
||||
xs = self.resblocks[i*self.num_kernels+j](x)
|
||||
else:
|
||||
xs += self.resblocks[i*self.num_kernels+j](x)
|
||||
x = xs / self.num_kernels
|
||||
x = F.leaky_relu(x)
|
||||
x = self.conv_post(x)
|
||||
x = torch.tanh(x)
|
||||
|
||||
return x
|
||||
|
||||
def remove_weight_norm(self):
|
||||
print('Removing weight norm...')
|
||||
for l in self.ups:
|
||||
remove_weight_norm(l)
|
||||
for l in self.resblocks:
|
||||
l.remove_weight_norm()
|
||||
remove_weight_norm(self.conv_pre)
|
||||
remove_weight_norm(self.conv_post)
|
||||
|
||||
|
||||
class DiscriminatorP(torch.nn.Module):
|
||||
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
||||
super(DiscriminatorP, self).__init__()
|
||||
self.period = period
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
|
||||
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
|
||||
])
|
||||
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
|
||||
|
||||
def forward(self, x):
|
||||
fmap = []
|
||||
|
||||
# 1d to 2d
|
||||
b, c, t = x.shape
|
||||
if t % self.period != 0: # pad first
|
||||
n_pad = self.period - (t % self.period)
|
||||
x = F.pad(x, (0, n_pad), "reflect")
|
||||
t = t + n_pad
|
||||
x = x.view(b, c, t // self.period, self.period)
|
||||
|
||||
for l in self.convs:
|
||||
x = l(x)
|
||||
x = F.leaky_relu(x, LRELU_SLOPE)
|
||||
fmap.append(x)
|
||||
x = self.conv_post(x)
|
||||
fmap.append(x)
|
||||
x = torch.flatten(x, 1, -1)
|
||||
|
||||
return x, fmap
|
||||
|
||||
|
||||
class MultiPeriodDiscriminator(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super(MultiPeriodDiscriminator, self).__init__()
|
||||
self.discriminators = nn.ModuleList([
|
||||
DiscriminatorP(2),
|
||||
DiscriminatorP(3),
|
||||
DiscriminatorP(5),
|
||||
DiscriminatorP(7),
|
||||
DiscriminatorP(11),
|
||||
])
|
||||
|
||||
def forward(self, y, y_hat):
|
||||
y_d_rs = []
|
||||
y_d_gs = []
|
||||
fmap_rs = []
|
||||
fmap_gs = []
|
||||
for i, d in enumerate(self.discriminators):
|
||||
y_d_r, fmap_r = d(y)
|
||||
y_d_g, fmap_g = d(y_hat)
|
||||
y_d_rs.append(y_d_r)
|
||||
fmap_rs.append(fmap_r)
|
||||
y_d_gs.append(y_d_g)
|
||||
fmap_gs.append(fmap_g)
|
||||
|
||||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
class DiscriminatorS(torch.nn.Module):
|
||||
def __init__(self, use_spectral_norm=False):
|
||||
super(DiscriminatorS, self).__init__()
|
||||
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
||||
self.convs = nn.ModuleList([
|
||||
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
|
||||
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
|
||||
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
|
||||
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
|
||||
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
|
||||
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
|
||||
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
|
||||
])
|
||||
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
|
||||
|
||||
def forward(self, x):
|
||||
fmap = []
|
||||
for l in self.convs:
|
||||
x = l(x)
|
||||
x = F.leaky_relu(x, LRELU_SLOPE)
|
||||
fmap.append(x)
|
||||
x = self.conv_post(x)
|
||||
fmap.append(x)
|
||||
x = torch.flatten(x, 1, -1)
|
||||
|
||||
return x, fmap
|
||||
|
||||
|
||||
class MultiScaleDiscriminator(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super(MultiScaleDiscriminator, self).__init__()
|
||||
self.discriminators = nn.ModuleList([
|
||||
DiscriminatorS(use_spectral_norm=True),
|
||||
DiscriminatorS(),
|
||||
DiscriminatorS(),
|
||||
])
|
||||
self.meanpools = nn.ModuleList([
|
||||
AvgPool1d(4, 2, padding=2),
|
||||
AvgPool1d(4, 2, padding=2)
|
||||
])
|
||||
|
||||
def forward(self, y, y_hat):
|
||||
y_d_rs = []
|
||||
y_d_gs = []
|
||||
fmap_rs = []
|
||||
fmap_gs = []
|
||||
for i, d in enumerate(self.discriminators):
|
||||
if i != 0:
|
||||
y = self.meanpools[i-1](y)
|
||||
y_hat = self.meanpools[i-1](y_hat)
|
||||
y_d_r, fmap_r = d(y)
|
||||
y_d_g, fmap_g = d(y_hat)
|
||||
y_d_rs.append(y_d_r)
|
||||
fmap_rs.append(fmap_r)
|
||||
y_d_gs.append(y_d_g)
|
||||
fmap_gs.append(fmap_g)
|
||||
|
||||
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
def feature_loss(fmap_r, fmap_g):
|
||||
loss = 0
|
||||
for dr, dg in zip(fmap_r, fmap_g):
|
||||
for rl, gl in zip(dr, dg):
|
||||
loss += torch.mean(torch.abs(rl - gl))
|
||||
|
||||
return loss*2
|
||||
|
||||
|
||||
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
|
||||
loss = 0
|
||||
r_losses = []
|
||||
g_losses = []
|
||||
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
|
||||
r_loss = torch.mean((1-dr)**2)
|
||||
g_loss = torch.mean(dg**2)
|
||||
loss += (r_loss + g_loss)
|
||||
r_losses.append(r_loss.item())
|
||||
g_losses.append(g_loss.item())
|
||||
|
||||
return loss, r_losses, g_losses
|
||||
|
||||
|
||||
def generator_loss(disc_outputs):
|
||||
loss = 0
|
||||
gen_losses = []
|
||||
for dg in disc_outputs:
|
||||
l = torch.mean((1-dg)**2)
|
||||
gen_losses.append(l)
|
||||
loss += l
|
||||
|
||||
return loss, gen_losses
|
||||
|
||||
240
vocoder/hifigan/train.py
Normal file
240
vocoder/hifigan/train.py
Normal file
@@ -0,0 +1,240 @@
|
||||
import warnings
|
||||
warnings.simplefilter(action='ignore', category=FutureWarning)
|
||||
import itertools
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import json
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
from torch.utils.data import DistributedSampler, DataLoader
|
||||
import torch.multiprocessing as mp
|
||||
from torch.distributed import init_process_group
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
from vocoder.hifigan.env import AttrDict, build_env
|
||||
from vocoder.hifigan.meldataset import MelDataset, mel_spectrogram, get_dataset_filelist
|
||||
from vocoder.hifigan.models import Generator, MultiPeriodDiscriminator, MultiScaleDiscriminator, feature_loss, generator_loss,\
|
||||
discriminator_loss
|
||||
from vocoder.hifigan.utils import plot_spectrogram, scan_checkpoint, load_checkpoint, save_checkpoint
|
||||
|
||||
torch.backends.cudnn.benchmark = True
|
||||
|
||||
|
||||
def train(rank, a, h):
|
||||
|
||||
a.checkpoint_path = a.models_dir.joinpath(a.run_id+'_hifigan')
|
||||
a.checkpoint_path.mkdir(exist_ok=True)
|
||||
a.training_epochs = 3100
|
||||
a.stdout_interval = 5
|
||||
a.checkpoint_interval = 25000
|
||||
a.summary_interval = 5000
|
||||
a.validation_interval = 1000
|
||||
a.fine_tuning = True
|
||||
|
||||
a.input_wavs_dir = a.syn_dir.joinpath("audio")
|
||||
a.input_mels_dir = a.syn_dir.joinpath("mels")
|
||||
|
||||
if h.num_gpus > 1:
|
||||
init_process_group(backend=h.dist_config['dist_backend'], init_method=h.dist_config['dist_url'],
|
||||
world_size=h.dist_config['world_size'] * h.num_gpus, rank=rank)
|
||||
|
||||
torch.cuda.manual_seed(h.seed)
|
||||
device = torch.device('cuda:{:d}'.format(rank))
|
||||
|
||||
generator = Generator(h).to(device)
|
||||
mpd = MultiPeriodDiscriminator().to(device)
|
||||
msd = MultiScaleDiscriminator().to(device)
|
||||
|
||||
if rank == 0:
|
||||
print(generator)
|
||||
os.makedirs(a.checkpoint_path, exist_ok=True)
|
||||
print("checkpoints directory : ", a.checkpoint_path)
|
||||
|
||||
if os.path.isdir(a.checkpoint_path):
|
||||
cp_g = scan_checkpoint(a.checkpoint_path, 'g_')
|
||||
cp_do = scan_checkpoint(a.checkpoint_path, 'do_')
|
||||
|
||||
steps = 0
|
||||
if cp_g is None or cp_do is None:
|
||||
state_dict_do = None
|
||||
last_epoch = -1
|
||||
else:
|
||||
state_dict_g = load_checkpoint(cp_g, device)
|
||||
state_dict_do = load_checkpoint(cp_do, device)
|
||||
generator.load_state_dict(state_dict_g['generator'])
|
||||
mpd.load_state_dict(state_dict_do['mpd'])
|
||||
msd.load_state_dict(state_dict_do['msd'])
|
||||
steps = state_dict_do['steps'] + 1
|
||||
last_epoch = state_dict_do['epoch']
|
||||
|
||||
if h.num_gpus > 1:
|
||||
generator = DistributedDataParallel(generator, device_ids=[rank]).to(device)
|
||||
mpd = DistributedDataParallel(mpd, device_ids=[rank]).to(device)
|
||||
msd = DistributedDataParallel(msd, device_ids=[rank]).to(device)
|
||||
|
||||
optim_g = torch.optim.AdamW(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2])
|
||||
optim_d = torch.optim.AdamW(itertools.chain(msd.parameters(), mpd.parameters()),
|
||||
h.learning_rate, betas=[h.adam_b1, h.adam_b2])
|
||||
|
||||
if state_dict_do is not None:
|
||||
optim_g.load_state_dict(state_dict_do['optim_g'])
|
||||
optim_d.load_state_dict(state_dict_do['optim_d'])
|
||||
|
||||
scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=h.lr_decay, last_epoch=last_epoch)
|
||||
scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=h.lr_decay, last_epoch=last_epoch)
|
||||
|
||||
training_filelist, validation_filelist = get_dataset_filelist(a)
|
||||
|
||||
# print(training_filelist)
|
||||
# exit()
|
||||
|
||||
trainset = MelDataset(training_filelist, h.segment_size, h.n_fft, h.num_mels,
|
||||
h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, n_cache_reuse=0,
|
||||
shuffle=False if h.num_gpus > 1 else True, fmax_loss=h.fmax_for_loss, device=device,
|
||||
fine_tuning=a.fine_tuning, base_mels_path=a.input_mels_dir)
|
||||
|
||||
train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None
|
||||
|
||||
train_loader = DataLoader(trainset, num_workers=h.num_workers, shuffle=False,
|
||||
sampler=train_sampler,
|
||||
batch_size=h.batch_size,
|
||||
pin_memory=True,
|
||||
drop_last=True)
|
||||
|
||||
if rank == 0:
|
||||
validset = MelDataset(validation_filelist, h.segment_size, h.n_fft, h.num_mels,
|
||||
h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, False, False, n_cache_reuse=0,
|
||||
fmax_loss=h.fmax_for_loss, device=device, fine_tuning=a.fine_tuning,
|
||||
base_mels_path=a.input_mels_dir)
|
||||
validation_loader = DataLoader(validset, num_workers=1, shuffle=False,
|
||||
sampler=None,
|
||||
batch_size=1,
|
||||
pin_memory=True,
|
||||
drop_last=True)
|
||||
|
||||
sw = SummaryWriter(os.path.join(a.checkpoint_path, 'logs'))
|
||||
|
||||
generator.train()
|
||||
mpd.train()
|
||||
msd.train()
|
||||
for epoch in range(max(0, last_epoch), a.training_epochs):
|
||||
if rank == 0:
|
||||
start = time.time()
|
||||
print("Epoch: {}".format(epoch+1))
|
||||
|
||||
if h.num_gpus > 1:
|
||||
train_sampler.set_epoch(epoch)
|
||||
|
||||
for i, batch in enumerate(train_loader):
|
||||
if rank == 0:
|
||||
start_b = time.time()
|
||||
x, y, _, y_mel = batch
|
||||
x = torch.autograd.Variable(x.to(device, non_blocking=True))
|
||||
y = torch.autograd.Variable(y.to(device, non_blocking=True))
|
||||
y_mel = torch.autograd.Variable(y_mel.to(device, non_blocking=True))
|
||||
y = y.unsqueeze(1)
|
||||
|
||||
y_g_hat = generator(x)
|
||||
y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size,
|
||||
h.fmin, h.fmax_for_loss)
|
||||
|
||||
optim_d.zero_grad()
|
||||
|
||||
# MPD
|
||||
y_df_hat_r, y_df_hat_g, _, _ = mpd(y, y_g_hat.detach())
|
||||
loss_disc_f, losses_disc_f_r, losses_disc_f_g = discriminator_loss(y_df_hat_r, y_df_hat_g)
|
||||
|
||||
# MSD
|
||||
y_ds_hat_r, y_ds_hat_g, _, _ = msd(y, y_g_hat.detach())
|
||||
loss_disc_s, losses_disc_s_r, losses_disc_s_g = discriminator_loss(y_ds_hat_r, y_ds_hat_g)
|
||||
|
||||
loss_disc_all = loss_disc_s + loss_disc_f
|
||||
|
||||
loss_disc_all.backward()
|
||||
optim_d.step()
|
||||
|
||||
# Generator
|
||||
optim_g.zero_grad()
|
||||
|
||||
# L1 Mel-Spectrogram Loss
|
||||
loss_mel = F.l1_loss(y_mel, y_g_hat_mel) * 45
|
||||
|
||||
y_df_hat_r, y_df_hat_g, fmap_f_r, fmap_f_g = mpd(y, y_g_hat)
|
||||
y_ds_hat_r, y_ds_hat_g, fmap_s_r, fmap_s_g = msd(y, y_g_hat)
|
||||
loss_fm_f = feature_loss(fmap_f_r, fmap_f_g)
|
||||
loss_fm_s = feature_loss(fmap_s_r, fmap_s_g)
|
||||
loss_gen_f, losses_gen_f = generator_loss(y_df_hat_g)
|
||||
loss_gen_s, losses_gen_s = generator_loss(y_ds_hat_g)
|
||||
loss_gen_all = loss_gen_s + loss_gen_f + loss_fm_s + loss_fm_f + loss_mel
|
||||
|
||||
loss_gen_all.backward()
|
||||
optim_g.step()
|
||||
|
||||
if rank == 0:
|
||||
# STDOUT logging
|
||||
if steps % a.stdout_interval == 0:
|
||||
with torch.no_grad():
|
||||
mel_error = F.l1_loss(y_mel, y_g_hat_mel).item()
|
||||
|
||||
print('Steps : {:d}, Gen Loss Total : {:4.3f}, Mel-Spec. Error : {:4.3f}, s/b : {:4.3f}'.
|
||||
format(steps, loss_gen_all, mel_error, time.time() - start_b))
|
||||
|
||||
# checkpointing
|
||||
if steps % a.checkpoint_interval == 0 and steps != 0:
|
||||
checkpoint_path = "{}/g_{:08d}.pt".format(a.checkpoint_path, steps)
|
||||
save_checkpoint(checkpoint_path,
|
||||
{'generator': (generator.module if h.num_gpus > 1 else generator).state_dict()})
|
||||
checkpoint_path = "{}/do_{:08d}".format(a.checkpoint_path, steps)
|
||||
save_checkpoint(checkpoint_path,
|
||||
{'mpd': (mpd.module if h.num_gpus > 1
|
||||
else mpd).state_dict(),
|
||||
'msd': (msd.module if h.num_gpus > 1
|
||||
else msd).state_dict(),
|
||||
'optim_g': optim_g.state_dict(), 'optim_d': optim_d.state_dict(), 'steps': steps,
|
||||
'epoch': epoch})
|
||||
|
||||
# Tensorboard summary logging
|
||||
if steps % a.summary_interval == 0:
|
||||
sw.add_scalar("training/gen_loss_total", loss_gen_all, steps)
|
||||
sw.add_scalar("training/mel_spec_error", mel_error, steps)
|
||||
|
||||
# Validation
|
||||
if steps % a.validation_interval == 0: # and steps != 0:
|
||||
generator.eval()
|
||||
torch.cuda.empty_cache()
|
||||
val_err_tot = 0
|
||||
with torch.no_grad():
|
||||
for j, batch in enumerate(validation_loader):
|
||||
x, y, _, y_mel = batch
|
||||
y_g_hat = generator(x.to(device))
|
||||
y_mel = torch.autograd.Variable(y_mel.to(device, non_blocking=True))
|
||||
y_g_hat_mel = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels, h.sampling_rate,
|
||||
h.hop_size, h.win_size,
|
||||
h.fmin, h.fmax_for_loss)
|
||||
# val_err_tot += F.l1_loss(y_mel, y_g_hat_mel).item()
|
||||
|
||||
if j <= 4:
|
||||
if steps == 0:
|
||||
sw.add_audio('gt/y_{}'.format(j), y[0], steps, h.sampling_rate)
|
||||
sw.add_figure('gt/y_spec_{}'.format(j), plot_spectrogram(x[0]), steps)
|
||||
|
||||
sw.add_audio('generated/y_hat_{}'.format(j), y_g_hat[0], steps, h.sampling_rate)
|
||||
y_hat_spec = mel_spectrogram(y_g_hat.squeeze(1), h.n_fft, h.num_mels,
|
||||
h.sampling_rate, h.hop_size, h.win_size,
|
||||
h.fmin, h.fmax)
|
||||
sw.add_figure('generated/y_hat_spec_{}'.format(j),
|
||||
plot_spectrogram(y_hat_spec.squeeze(0).cpu().numpy()), steps)
|
||||
|
||||
val_err = val_err_tot / (j+1)
|
||||
sw.add_scalar("validation/mel_spec_error", val_err, steps)
|
||||
|
||||
generator.train()
|
||||
|
||||
steps += 1
|
||||
|
||||
scheduler_g.step()
|
||||
scheduler_d.step()
|
||||
|
||||
if rank == 0:
|
||||
print('Time taken for epoch {} is {} sec\n'.format(epoch + 1, int(time.time() - start)))
|
||||
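
The HiFi-GAN train() entry point above hard-codes fine-tuning mode and expects the preprocessed synthesizer folder to provide an audio/ directory (waveforms stored as .npy) and a mels/ directory with the matching ground-truth-aligned mels; get_dataset_filelist then splits the waveform list 95/5 into training and validation sets. A small sketch for checking that layout before launching training, with the SV2TTS root path assumed:

    from pathlib import Path

    syn_dir = Path("../audiodata/SV2TTS/synthesizer")    # hypothetical datasets_root layout
    wav_dir, mel_dir = syn_dir / "audio", syn_dir / "mels"
    wavs = sorted(wav_dir.glob("*.npy"))
    print(f"{len(wavs)} waveform files found; mel dir exists: {mel_dir.exists()}")
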
58
vocoder/hifigan/utils.py
Normal file
58
vocoder/hifigan/utils.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import glob
|
||||
import os
|
||||
import matplotlib
|
||||
import torch
|
||||
from torch.nn.utils import weight_norm
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pylab as plt
|
||||
|
||||
|
||||
def plot_spectrogram(spectrogram):
|
||||
fig, ax = plt.subplots(figsize=(10, 2))
|
||||
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
|
||||
interpolation='none')
|
||||
plt.colorbar(im, ax=ax)
|
||||
|
||||
fig.canvas.draw()
|
||||
plt.close()
|
||||
|
||||
return fig
|
||||
|
||||
|
||||
def init_weights(m, mean=0.0, std=0.01):
|
||||
classname = m.__class__.__name__
|
||||
if classname.find("Conv") != -1:
|
||||
m.weight.data.normal_(mean, std)
|
||||
|
||||
|
||||
def apply_weight_norm(m):
|
||||
classname = m.__class__.__name__
|
||||
if classname.find("Conv") != -1:
|
||||
weight_norm(m)
|
||||
|
||||
|
||||
def get_padding(kernel_size, dilation=1):
|
||||
return int((kernel_size*dilation - dilation)/2)
|
||||
|
||||
|
||||
def load_checkpoint(filepath, device):
|
||||
assert os.path.isfile(filepath)
|
||||
print("Loading '{}'".format(filepath))
|
||||
checkpoint_dict = torch.load(filepath, map_location=device)
|
||||
print("Complete.")
|
||||
return checkpoint_dict
|
||||
|
||||
|
||||
def save_checkpoint(filepath, obj):
|
||||
print("Saving checkpoint to {}".format(filepath))
|
||||
torch.save(obj, filepath)
|
||||
print("Complete.")
|
||||
|
||||
|
||||
def scan_checkpoint(cp_dir, prefix):
|
||||
pattern = os.path.join(cp_dir, prefix + '????????')
|
||||
cp_list = glob.glob(pattern)
|
||||
if len(cp_list) == 0:
|
||||
return None
|
||||
return sorted(cp_list)[-1]
|
||||
|
||||
BIN vocoder/saved_models/pretrained/g_hifigan.pt Normal file
Binary file not shown.
@@ -1,7 +1,7 @@
|
||||
from torch.utils.data import Dataset
|
||||
from pathlib import Path
|
||||
from vocoder import audio
|
||||
import vocoder.hparams as hp
|
||||
from vocoder.wavernn import audio
|
||||
import vocoder.wavernn.hparams as hp
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import math
|
||||
import numpy as np
|
||||
import librosa
|
||||
import vocoder.hparams as hp
|
||||
import vocoder.wavernn.hparams as hp
|
||||
from scipy.signal import lfilter
|
||||
import soundfile as sf
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from vocoder.models.fatchord_version import WaveRNN
|
||||
from vocoder.audio import *
|
||||
from vocoder.wavernn.models.fatchord_version import WaveRNN
|
||||
from vocoder.wavernn.audio import *
|
||||
|
||||
|
||||
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path):
|
||||
@@ -1,5 +1,5 @@
|
||||
from vocoder.models.fatchord_version import WaveRNN
|
||||
from vocoder import hparams as hp
|
||||
from vocoder.wavernn.models.fatchord_version import WaveRNN
|
||||
from vocoder.wavernn import hparams as hp
|
||||
import torch
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from vocoder.distribution import sample_from_discretized_mix_logistic
|
||||
from vocoder.display import *
|
||||
from vocoder.audio import *
|
||||
from vocoder.wavernn.audio import *
|
||||
|
||||
|
||||
class ResBlock(nn.Module):
|
||||
@@ -1,13 +1,13 @@
|
||||
from vocoder.models.fatchord_version import WaveRNN
|
||||
from vocoder.wavernn.models.fatchord_version import WaveRNN
|
||||
from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder
|
||||
from vocoder.distribution import discretized_mix_logistic_loss
|
||||
from vocoder.display import stream, simple_table
|
||||
from vocoder.gen_wavernn import gen_testset
|
||||
from vocoder.wavernn.gen_wavernn import gen_testset
|
||||
from torch.utils.data import DataLoader
|
||||
from pathlib import Path
|
||||
from torch import optim
|
||||
import torch.nn.functional as F
|
||||
import vocoder.hparams as hp
|
||||
import vocoder.wavernn.hparams as hp
|
||||
import numpy as np
|
||||
import time
|
||||
import torch
|
||||
@@ -16,8 +16,8 @@ if __name__ == "__main__":
|
||||
parser.add_argument("datasets_root", type=str, help=\
|
||||
"Path to the directory containing your SV2TTS directory. If you specify both --in_dir and "
|
||||
"--out_dir, this argument won't be used.")
|
||||
parser.add_argument("--model_dir", type=str,
|
||||
default="synthesizer/saved_models/train3/", help=\
|
||||
parser.add_argument("-m", "--model_dir", type=str,
|
||||
default="synthesizer/saved_models/mandarin/", help=\
|
||||
"Path to the pretrained model directory.")
|
||||
parser.add_argument("-i", "--in_dir", type=str, default=argparse.SUPPRESS, help= \
|
||||
"Path to the synthesizer directory that contains the mel spectrograms, the wavs and the "
|
||||
|
||||
@@ -1,7 +1,10 @@
from utils.argutils import print_args
from vocoder.train import train
from vocoder.wavernn.train import train
from vocoder.hifigan.train import train as train_hifigan
from vocoder.hifigan.env import AttrDict
from pathlib import Path
import argparse
import json


if __name__ == "__main__":
@@ -18,6 +21,9 @@ if __name__ == "__main__":
parser.add_argument("datasets_root", type=str, help= \
"Path to the directory containing your SV2TTS directory. Specifying --syn_dir or --voc_dir "
"will take priority over this argument.")
parser.add_argument("vocoder_type", type=str, default="wavernn", help= \
"Choose the vocoder type to train. Defaults to wavernn. "
"Supported values are <wavernn> and <hifigan>.")
parser.add_argument("--syn_dir", type=str, default=argparse.SUPPRESS, help= \
"Path to the synthesizer directory that contains the ground truth mel spectrograms, "
"the wavs and the embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/.")
@@ -37,9 +43,9 @@ if __name__ == "__main__":
"model.")
parser.add_argument("-f", "--force_restart", action="store_true", help= \
"Do not load any saved model and restart from scratch.")
parser.add_argument("--config", type=str, default="vocoder/hifigan/config_16k_.json")
args = parser.parse_args()

# Process the arguments
if not hasattr(args, "syn_dir"):
args.syn_dir = Path(args.datasets_root, "SV2TTS", "synthesizer")
args.syn_dir = Path(args.syn_dir)
@@ -50,7 +56,16 @@ if __name__ == "__main__":
args.models_dir = Path(args.models_dir)
args.models_dir.mkdir(exist_ok=True)

# Run the training
print_args(args, parser)
train(**vars(args))


# Process the arguments
if args.vocoder_type == "wavernn":
# Run the training wavernn
train(**vars(args))
elif args.vocoder_type == "hifigan":
with open(args.config) as f:
json_config = json.load(f)
h = AttrDict(json_config)
train_hifigan(0, args, h)
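
With the new positional vocoder_type argument, vocoder training branches between the two trainers: passing wavernn keeps the original WaveRNN path, while passing hifigan loads the JSON config given by --config into an AttrDict and calls train_hifigan(0, args, h) for single-GPU fine-tuning. Assuming the run-id positional that precedes datasets_root is unchanged (it is not shown in this hunk), an invocation would look roughly like `python vocoder_train.py my_run ../audiodata hifigan`.
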
11 web.py Normal file
@@ -0,0 +1,11 @@
from web import webApp
from gevent import pywsgi as wsgi


if __name__ == "__main__":
app = webApp()
host = app.config.get("HOST")
port = app.config.get("PORT")
print(f"Web server: http://{host}:{port}")
server = wsgi.WSGIServer((host, port), app)
server.serve_forever()
10 web/DOCKERFILE Normal file
@@ -0,0 +1,10 @@

FROM python:3.7

RUN pip install gevent uwsgi flask

COPY app.py /app.py

EXPOSE 3000

ENTRYPOINT ["uwsgi", "--http", ":3000", "--master", "--module", "app:app"]
132
web/__init__.py
Normal file
132
web/__init__.py
Normal file
@@ -0,0 +1,132 @@
|
||||
from web.api import api_blueprint
|
||||
from pathlib import Path
|
||||
from gevent import pywsgi as wsgi
|
||||
from flask import Flask, Response, request, render_template
|
||||
from synthesizer.inference import Synthesizer
|
||||
from encoder import inference as encoder
|
||||
from vocoder.hifigan import inference as gan_vocoder
|
||||
from vocoder.wavernn import inference as rnn_vocoder
|
||||
import numpy as np
|
||||
import re
|
||||
from scipy.io.wavfile import write
|
||||
import librosa
|
||||
import io
|
||||
import base64
|
||||
from flask_cors import CORS
|
||||
from flask_wtf import CSRFProtect
|
||||
import webbrowser
|
||||
|
||||
def webApp():
|
||||
# Init and load config
|
||||
app = Flask(__name__, instance_relative_config=True)
|
||||
app.config.from_object("web.config.default")
|
||||
app.config['RESTPLUS_MASK_SWAGGER'] = False
|
||||
app.register_blueprint(api_blueprint)
|
||||
|
||||
# CORS(app)  # allow cross-origin requests; keep this line commented out to block them
|
||||
csrf = CSRFProtect(app)
|
||||
csrf.init_app(app)
|
||||
|
||||
syn_models_dirt = "synthesizer/saved_models"
|
||||
synthesizers = list(Path(syn_models_dirt).glob("**/*.pt"))
|
||||
synthesizers_cache = {}
|
||||
encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
|
||||
# rnn_vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))
|
||||
gan_vocoder.load_model(Path("vocoder/saved_models/pretrained/g_hifigan.pt"))
|
||||
|
||||
def pcm2float(sig, dtype='float32'):
|
||||
"""Convert PCM signal to floating point with a range from -1 to 1.
|
||||
Use dtype='float32' for single precision.
|
||||
Parameters
|
||||
----------
|
||||
sig : array_like
|
||||
Input array, must have integral type.
|
||||
dtype : data type, optional
|
||||
Desired (floating point) data type.
|
||||
Returns
|
||||
-------
|
||||
numpy.ndarray
|
||||
Normalized floating point data.
|
||||
See Also
|
||||
--------
|
||||
float2pcm, dtype
|
||||
"""
|
||||
sig = np.asarray(sig)
|
||||
if sig.dtype.kind not in 'iu':
|
||||
raise TypeError("'sig' must be an array of integers")
|
||||
dtype = np.dtype(dtype)
|
||||
if dtype.kind != 'f':
|
||||
raise TypeError("'dtype' must be a floating point type")
|
||||
|
||||
i = np.iinfo(sig.dtype)
|
||||
abs_max = 2 ** (i.bits - 1)
|
||||
offset = i.min + abs_max
|
||||
return (sig.astype(dtype) - offset) / abs_max
|
||||
|
||||
# Cache for synthesizer
|
||||
@csrf.exempt
|
||||
@app.route("/api/synthesize", methods=["POST"])
|
||||
def synthesize():
|
||||
# TODO Implementation with json to support more platform
|
||||
# Load synthesizer
|
||||
if "synt_path" in request.form:
|
||||
synt_path = request.form["synt_path"]
|
||||
else:
|
||||
synt_path = synthesizers[0]
|
||||
print("NO synthsizer is specified, try default first one.")
|
||||
if synthesizers_cache.get(synt_path) is None:
|
||||
current_synt = Synthesizer(Path(synt_path))
|
||||
synthesizers_cache[synt_path] = current_synt
|
||||
else:
|
||||
current_synt = synthesizers_cache[synt_path]
|
||||
print("using synthesizer model: " + str(synt_path))
|
||||
# Load input wav
|
||||
if "upfile_b64" in request.form:
|
||||
wav_base64 = request.form["upfile_b64"]
|
||||
wav = base64.b64decode(bytes(wav_base64, 'utf-8'))
|
||||
wav = pcm2float(np.frombuffer(wav, dtype=np.int16), dtype=np.float32)
|
||||
sample_rate = Synthesizer.sample_rate
|
||||
else:
|
||||
wav, sample_rate, = librosa.load(request.files['file'])
|
||||
write("temp.wav", sample_rate, wav) #Make sure we get the correct wav
|
||||
|
||||
encoder_wav = encoder.preprocess_wav(wav, sample_rate)
|
||||
embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
|
||||
|
||||
# Load input text
|
||||
texts = request.form["text"].split("\n")
|
||||
punctuation = '!,。、,' # punctuate and split/clean text
|
||||
processed_texts = []
|
||||
for text in texts:
|
||||
for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
|
||||
if processed_text:
|
||||
processed_texts.append(processed_text.strip())
|
||||
texts = processed_texts
|
||||
|
||||
# synthesize and vocode
|
||||
embeds = [embed] * len(texts)
|
||||
specs = current_synt.synthesize_spectrograms(texts, embeds)
|
||||
spec = np.concatenate(specs, axis=1)
|
||||
# wav = rnn_vocoder.infer_waveform(spec)
|
||||
wav = gan_vocoder.infer_waveform(spec)
|
||||
|
||||
# Return cooked wav
|
||||
out = io.BytesIO()
|
||||
write(out, Synthesizer.sample_rate, wav)
|
||||
return Response(out, mimetype="audio/wav")
|
||||
|
||||
@app.route('/', methods=['GET'])
|
||||
def index():
|
||||
return render_template("index.html")
|
||||
|
||||
host = app.config.get("HOST")
|
||||
port = app.config.get("PORT")
|
||||
web_address = 'http://{}:{}'.format(host, port)
|
||||
print(f"Web server:" + web_address)
|
||||
webbrowser.open(web_address)
|
||||
server = wsgi.WSGIServer((host, port), app)
|
||||
server.serve_forever()
|
||||
return app
|
||||
|
||||
if __name__ == "__main__":
|
||||
webApp()
|
||||
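
The /api/synthesize route above takes multipart form fields: an optional synt_path, the reference voice either as base64-encoded raw 16-bit PCM (upfile_b64) or as an uploaded audio file, and the text to clone; it answers with a WAV stream. A minimal client sketch, assuming the server is running on the localhost:8080 defaults from web/config/default.py and that requests is available on the client side:

    import requests

    with open("ref.wav", "rb") as ref:
        resp = requests.post(
            "http://localhost:8080/api/synthesize",
            data={"text": "欢迎使用语音克隆工具"},   # hypothetical input text; split server-side on punctuation
            files={"file": ref},                      # the upfile_b64 field expects headerless int16 PCM instead
        )
    resp.raise_for_status()
    with open("cloned.wav", "wb") as out:
        out.write(resp.content)
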
16
web/api/__init__.py
Normal file
16
web/api/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from flask import Blueprint
|
||||
from flask_restx import Api
|
||||
from .audio import api as audio
|
||||
from .synthesizer import api as synthesizer
|
||||
|
||||
api_blueprint = Blueprint('api', __name__, url_prefix='/api')
|
||||
|
||||
api = Api(
|
||||
app=api_blueprint,
|
||||
title='Mocking Bird',
|
||||
version='1.0',
|
||||
description='My API'
|
||||
)
|
||||
|
||||
api.add_namespace(audio)
|
||||
api.add_namespace(synthesizer)
|
||||
43
web/api/audio.py
Normal file
43
web/api/audio.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from flask_restx import Namespace, Resource, fields
|
||||
from flask import Response, current_app
|
||||
|
||||
api = Namespace('audios', description='Audios related operations')
|
||||
|
||||
audio = api.model('Audio', {
|
||||
'name': fields.String(required=True, description='The audio name'),
|
||||
})
|
||||
|
||||
def generate(wav_path):
|
||||
with open(wav_path, "rb") as fwav:
|
||||
data = fwav.read(1024)
|
||||
while data:
|
||||
yield data
|
||||
data = fwav.read(1024)
|
||||
|
||||
@api.route('/')
|
||||
class AudioList(Resource):
|
||||
@api.doc('list_audios')
|
||||
@api.marshal_list_with(audio)
|
||||
def get(self):
|
||||
'''List all audios'''
|
||||
audio_samples = []
|
||||
AUDIO_SAMPLES_DIR = current_app.config.get("AUDIO_SAMPLES_DIR")
|
||||
if os.path.isdir(AUDIO_SAMPLES_DIR):
|
||||
audio_samples = list(Path(AUDIO_SAMPLES_DIR).glob("*.wav"))
|
||||
return list(a.name for a in audio_samples)
|
||||
|
||||
@api.route('/<name>')
|
||||
@api.param('name', 'The name of audio')
|
||||
@api.response(404, 'audio not found')
|
||||
class Audio(Resource):
|
||||
@api.doc('get_audio')
|
||||
@api.marshal_with(audio)
|
||||
def get(self, name):
|
||||
'''Fetch an audio file given its name'''
|
||||
AUDIO_SAMPLES_DIR = current_app.config.get("AUDIO_SAMPLES_DIR")
|
||||
if Path(AUDIO_SAMPLES_DIR + name).exists():
|
||||
return Response(generate(AUDIO_SAMPLES_DIR + name), mimetype="audio/x-wav")
|
||||
api.abort(404)
|
||||
|
||||
23
web/api/synthesizer.py
Normal file
23
web/api/synthesizer.py
Normal file
@@ -0,0 +1,23 @@
from pathlib import Path
from flask_restx import Namespace, Resource, fields

api = Namespace('synthesizers', description='Synthesizers related operations')

synthesizer = api.model('Synthesizer', {
    'name': fields.String(required=True, description='The synthesizer name'),
    'path': fields.String(required=True, description='The synthesizer path'),
})

synthesizers_cache = {}
syn_models_dir = "synthesizer/saved_models"
synthesizers = list(Path(syn_models_dir).glob("**/*.pt"))
print("Found synthesizer models: " + str(len(synthesizers)))

@api.route('/')
class SynthesizerList(Resource):
    @api.doc('list_synthesizers')
    @api.marshal_list_with(synthesizer)
    def get(self):
        '''List all synthesizers'''
        return list({"name": e.name, "path": str(e)} for e in synthesizers)
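Editor's sketch: the synthesizer listing can be checked the same way (same host/port assumption as above):

import requests

for m in requests.get("http://localhost:8080/api/synthesizers/").json():
    print(m["name"], "->", m["path"])   # *.pt files found under synthesizer/saved_models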
0
web/config/__init__.py
Normal file
8
web/config/default.py
Normal file
@@ -0,0 +1,8 @@
AUDIO_SAMPLES_DIR = 'samples\\'
DEVICE = '0'
HOST = 'localhost'
PORT = 8080
MAX_CONTENT_PATH = 1024 * 1024 * 4  # uploaded mp3 files are limited to 4 MB
SECRET_KEY = "mockingbird_key"
WTF_CSRF_SECRET_KEY = "mockingbird_key"
TEMPLATES_AUTO_RELOAD = True
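Editor's note: web.py presumably pulls this module into the Flask config; the exact call is not shown in this hunk, but a typical pattern would be:

from flask import Flask

app = Flask(__name__)
app.config.from_object('web.config.default')   # loads HOST, PORT, AUDIO_SAMPLES_DIR, ...
print(app.config["PORT"])                      # 8080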
BIN
web/static/img/bird-sm.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 40 KiB
BIN
web/static/img/bird.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 39 KiB
BIN
web/static/img/mockingbird.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 89 KiB
2
web/static/js/eruda.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
338
web/static/js/frequency.histogram.view.js
Normal file
@@ -0,0 +1,338 @@
|
||||
/*
|
||||
录音 Recorder扩展,频率直方图显示
|
||||
使用本扩展需要引入lib.fft.js支持,直方图特意优化主要显示0-5khz语音部分,其他高频显示区域较小,不适合用来展示音乐频谱
|
||||
|
||||
https://github.com/xiangyuecn/Recorder
|
||||
|
||||
本扩展核心算法主要参考了Java开源库jmp123 版本0.3 的代码:
|
||||
https://www.iteye.com/topic/851459
|
||||
https://sourceforge.net/projects/jmp123/files/
|
||||
*/
|
||||
(function(){
|
||||
"use strict";
|
||||
|
||||
var FrequencyHistogramView=function(set){
|
||||
return new fn(set);
|
||||
};
|
||||
var fn=function(set){
|
||||
var This=this;
|
||||
var o={
|
||||
/*
|
||||
elem:"css selector" //自动显示到dom,并以此dom大小为显示大小
|
||||
//或者配置显示大小,手动把frequencyObj.elem显示到别的地方
|
||||
,width:0 //显示宽度
|
||||
,height:0 //显示高度
|
||||
|
||||
以上配置二选一
|
||||
*/
|
||||
|
||||
scale:2 //缩放系数,应为正整数,使用2(3? no!)倍宽高进行绘制,避免移动端绘制模糊
|
||||
|
||||
,fps:20 //绘制帧率,不可过高
|
||||
|
||||
,lineCount:30 //直方图柱子数量,数量的多少对性能影响不大,密集运算集中在FFT算法中
|
||||
,widthRatio:0.6 //柱子线条宽度占比,为所有柱子占用整个视图宽度的比例,剩下的空白区域均匀插入柱子中间;默认值也基本相当于一根柱子占0.6,一根空白占0.4;设为1不留空白,当视图不足容下所有柱子时也不留空白
|
||||
,spaceWidth:0 //柱子间空白固定基础宽度,柱子宽度自适应,当不为0时widthRatio无效,当视图不足容下所有柱子时将不会留空白,允许为负数,让柱子发生重叠
|
||||
,minHeight:0 //柱子保留基础高度,position不为±1时应该保留点高度
|
||||
,position:-1 //绘制位置,取值-1到1,-1为最底下,0为中间,1为最顶上,小数为百分比
|
||||
,mirrorEnable:false //是否启用镜像,如果启用,视图宽度会分成左右两块,右边这块进行绘制,左边这块进行镜像(以中间这根柱子的中心进行镜像)
|
||||
|
||||
,stripeEnable:true //是否启用柱子顶上的峰值小横条,position不是-1时应当关闭,否则会很丑
|
||||
,stripeHeight:3 //峰值小横条基础高度
|
||||
,stripeMargin:6 //峰值小横条和柱子保持的基础距离
|
||||
|
||||
,fallDuration:1000 //柱子从最顶上下降到最底部最长时间ms
|
||||
,stripeFallDuration:3500 //峰值小横条从最顶上下降到底部最长时间ms
|
||||
|
||||
//柱子颜色配置:[位置,css颜色,...] 位置: 取值0.0-1.0之间
|
||||
,linear:[0,"rgba(0,187,17,1)",0.5,"rgba(255,215,0,1)",1,"rgba(255,102,0,1)"]
|
||||
//峰值小横条渐变颜色配置,取值格式和linear一致,留空为柱子的渐变颜色
|
||||
,stripeLinear:null
|
||||
|
||||
,shadowBlur:0 //柱子阴影基础大小,设为0不显示阴影,如果柱子数量太多时请勿开启,非常影响性能
|
||||
,shadowColor:"#bbb" //柱子阴影颜色
|
||||
,stripeShadowBlur:-1 //峰值小横条阴影基础大小,设为0不显示阴影,-1为柱子的大小,如果柱子数量太多时请勿开启,非常影响性能
|
||||
,stripeShadowColor:"" //峰值小横条阴影颜色,留空为柱子的阴影颜色
|
||||
|
||||
//当发生绘制时会回调此方法,参数为当前绘制的频率数据和采样率,可实现多个直方图同时绘制,只消耗一个input输入和计算时间
|
||||
,onDraw:function(frequencyData,sampleRate){}
|
||||
};
|
||||
for(var k in set){
|
||||
o[k]=set[k];
|
||||
};
|
||||
This.set=set=o;
|
||||
|
||||
var elem=set.elem;
|
||||
if(elem){
|
||||
if(typeof(elem)=="string"){
|
||||
elem=document.querySelector(elem);
|
||||
}else if(elem.length){
|
||||
elem=elem[0];
|
||||
};
|
||||
};
|
||||
if(elem){
|
||||
set.width=elem.offsetWidth;
|
||||
set.height=elem.offsetHeight;
|
||||
};
|
||||
|
||||
var scale=set.scale;
|
||||
var width=set.width*scale;
|
||||
var height=set.height*scale;
|
||||
|
||||
var thisElem=This.elem=document.createElement("div");
|
||||
var lowerCss=["","transform-origin:0 0;","transform:scale("+(1/scale)+");"];
|
||||
thisElem.innerHTML='<div style="width:'+set.width+'px;height:'+set.height+'px;overflow:hidden"><div style="width:'+width+'px;height:'+height+'px;'+lowerCss.join("-webkit-")+lowerCss.join("-ms-")+lowerCss.join("-moz-")+lowerCss.join("")+'"><canvas/></div></div>';
|
||||
|
||||
var canvas=This.canvas=thisElem.querySelector("canvas");
|
||||
var ctx=This.ctx=canvas.getContext("2d");
|
||||
canvas.width=width;
|
||||
canvas.height=height;
|
||||
|
||||
if(elem){
|
||||
elem.innerHTML="";
|
||||
elem.appendChild(thisElem);
|
||||
};
|
||||
|
||||
if(!Recorder.LibFFT){
|
||||
throw new Error("需要lib.fft.js支持");
|
||||
};
|
||||
This.fft=Recorder.LibFFT(1024);
|
||||
|
||||
//柱子所在高度
|
||||
This.lastH=[];
|
||||
//峰值小横条所在高度
|
||||
This.stripesH=[];
|
||||
};
|
||||
fn.prototype=FrequencyHistogramView.prototype={
|
||||
genLinear:function(ctx,colors,from,to){
|
||||
var rtv=ctx.createLinearGradient(0,from,0,to);
|
||||
for(var i=0;i<colors.length;){
|
||||
rtv.addColorStop(colors[i++],colors[i++]);
|
||||
};
|
||||
return rtv;
|
||||
}
|
||||
,input:function(pcmData,powerLevel,sampleRate){
|
||||
var This=this;
|
||||
This.sampleRate=sampleRate;
|
||||
This.pcmData=pcmData;
|
||||
This.pcmPos=0;
|
||||
|
||||
This.inputTime=Date.now();
|
||||
This.schedule();
|
||||
}
|
||||
,schedule:function(){
|
||||
var This=this,set=This.set;
|
||||
var interval=Math.floor(1000/set.fps);
|
||||
if(!This.timer){
|
||||
This.timer=setInterval(function(){
|
||||
This.schedule();
|
||||
},interval);
|
||||
};
|
||||
|
||||
var now=Date.now();
|
||||
var drawTime=This.drawTime||0;
|
||||
if(now-This.inputTime>set.stripeFallDuration*1.3){
|
||||
//超时没有输入,顶部横条已全部落下,干掉定时器
|
||||
clearInterval(This.timer);
|
||||
This.timer=0;
|
||||
return;
|
||||
};
|
||||
if(now-drawTime<interval){
|
||||
//没到间隔时间,不绘制
|
||||
return;
|
||||
};
|
||||
This.drawTime=now;
|
||||
|
||||
//调用FFT计算频率数据
|
||||
var bufferSize=This.fft.bufferSize;
|
||||
var pcm=This.pcmData;
|
||||
var pos=This.pcmPos;
|
||||
var arr=new Int16Array(bufferSize);
|
||||
for(var i=0;i<bufferSize&&pos<pcm.length;i++,pos++){
|
||||
arr[i]=pcm[pos];
|
||||
};
|
||||
This.pcmPos=pos;
|
||||
|
||||
var frequencyData=This.fft.transform(arr);
|
||||
|
||||
//推入绘制
|
||||
This.draw(frequencyData,This.sampleRate);
|
||||
}
|
||||
,draw:function(frequencyData,sampleRate){
|
||||
var This=this,set=This.set;
|
||||
var ctx=This.ctx;
|
||||
var scale=set.scale;
|
||||
var width=set.width*scale;
|
||||
var height=set.height*scale;
|
||||
var lineCount=set.lineCount;
|
||||
var bufferSize=This.fft.bufferSize;
|
||||
|
||||
|
||||
//计算高度位置
|
||||
var position=set.position;
|
||||
var posAbs=Math.abs(set.position);
|
||||
var originY=position==1?0:height;//y轴原点
|
||||
var heightY=height;//最高的一边高度
|
||||
if(posAbs<1){
|
||||
heightY=heightY/2;
|
||||
originY=heightY;
|
||||
heightY=Math.floor(heightY*(1+posAbs));
|
||||
originY=Math.floor(position>0?originY*(1-posAbs):originY*(1+posAbs));
|
||||
};
|
||||
|
||||
var lastH=This.lastH;
|
||||
var stripesH=This.stripesH;
|
||||
var speed=Math.ceil(heightY/(set.fallDuration/(1000/set.fps)));
|
||||
var stripeSpeed=Math.ceil(heightY/(set.stripeFallDuration/(1000/set.fps)));
|
||||
var stripeMargin=set.stripeMargin*scale;
|
||||
|
||||
var Y0=1 << (Math.round(Math.log(bufferSize)/Math.log(2) + 3) << 1);
|
||||
var logY0 = Math.log(Y0)/Math.log(10);
|
||||
var dBmax=20*Math.log(0x7fff)/Math.log(10);
|
||||
|
||||
var fftSize=bufferSize/2;
|
||||
var fftSize5k=Math.min(fftSize,Math.floor(fftSize*5000/(sampleRate/2)));//5khz所在位置,8000采样率及以下最高只有4khz
|
||||
var fftSize5kIsAll=fftSize5k==fftSize;
|
||||
var line80=fftSize5kIsAll?lineCount:Math.round(lineCount*0.8);//80%的柱子位置
|
||||
var fftSizeStep1=fftSize5k/line80;
|
||||
var fftSizeStep2=fftSize5kIsAll?0:(fftSize-fftSize5k)/(lineCount-line80);
|
||||
var fftIdx=0;
|
||||
for(var i=0;i<lineCount;i++){
|
||||
//不采用jmp123的非线性划分频段,录音语音并不适用于音乐的频率,应当弱化高频部分
|
||||
//80%关注0-5khz主要人声部分 20%关注剩下的高频,这样不管什么采样率都能做到大部分频率显示一致。
|
||||
var start=Math.ceil(fftIdx);
|
||||
if(i<line80){
|
||||
//5khz以下
|
||||
fftIdx+=fftSizeStep1;
|
||||
}else{
|
||||
//5khz以上
|
||||
fftIdx+=fftSizeStep2;
|
||||
};
|
||||
var end=Math.min(Math.ceil(fftIdx),fftSize);
|
||||
|
||||
|
||||
//参考AudioGUI.java .drawHistogram方法
|
||||
|
||||
//查找当前频段的最大"幅值"
|
||||
var maxAmp=0;
|
||||
for (var j=start; j<end; j++) {
|
||||
maxAmp=Math.max(maxAmp,Math.abs(frequencyData[j]));
|
||||
};
|
||||
|
||||
//计算音量
|
||||
var dB= (maxAmp > Y0) ? Math.floor((Math.log(maxAmp)/Math.log(10) - logY0) * 17) : 0;
|
||||
var h=heightY*Math.min(dB/dBmax,1);
|
||||
|
||||
//使柱子匀速下降
|
||||
lastH[i]=(lastH[i]||0)-speed;
|
||||
if(h<lastH[i]){h=lastH[i];};
|
||||
if(h<0){h=0;};
|
||||
lastH[i]=h;
|
||||
|
||||
var shi=stripesH[i]||0;
|
||||
if(h&&h+stripeMargin>shi) {
|
||||
stripesH[i]=h+stripeMargin;
|
||||
}else{
|
||||
//使峰值小横条匀速度下落
|
||||
var sh =shi-stripeSpeed;
|
||||
if(sh < 0){sh = 0;};
|
||||
stripesH[i] = sh;
|
||||
};
|
||||
};
|
||||
|
||||
//开始绘制图形
|
||||
ctx.clearRect(0,0,width,height);
|
||||
|
||||
var linear1=This.genLinear(ctx,set.linear,originY,originY-heightY);//上半部分的填充
|
||||
var stripeLinear1=set.stripeLinear&&This.genLinear(ctx,set.stripeLinear,originY,originY-heightY)||linear1;//上半部分的峰值小横条填充
|
||||
|
||||
var linear2=This.genLinear(ctx,set.linear,originY,originY+heightY);//下半部分的填充
|
||||
var stripeLinear2=set.stripeLinear&&This.genLinear(ctx,set.stripeLinear,originY,originY+heightY)||linear2;//上半部分的峰值小横条填充
|
||||
|
||||
//计算柱子间距
|
||||
ctx.shadowBlur=set.shadowBlur*scale;
|
||||
ctx.shadowColor=set.shadowColor;
|
||||
var mirrorEnable=set.mirrorEnable;
|
||||
var mirrorCount=mirrorEnable?lineCount*2-1:lineCount;//镜像柱子数量翻一倍-1根
|
||||
|
||||
var widthRatio=set.widthRatio;
|
||||
var spaceWidth=set.spaceWidth*scale;
|
||||
if(spaceWidth!=0){
|
||||
widthRatio=(width-spaceWidth*(mirrorCount+1))/width;
|
||||
};
|
||||
|
||||
var lineWidth=Math.max(1*scale,Math.floor((width*widthRatio)/mirrorCount));//柱子宽度至少1个单位
|
||||
var spaceFloat=(width-mirrorCount*lineWidth)/(mirrorCount+1);//均匀间隔,首尾都留空,可能为负数,柱子将发生重叠
|
||||
|
||||
//绘制柱子
|
||||
var minHeight=set.minHeight*scale;
|
||||
var mirrorSubX=spaceFloat+lineWidth/2;
|
||||
var XFloat=mirrorEnable?width/2-mirrorSubX:0;//镜像时,中间柱子位于正中心
|
||||
for(var i=0,xFloat=XFloat,x,y,h;i<lineCount;i++){
|
||||
xFloat+=spaceFloat;
|
||||
x=Math.floor(xFloat);
|
||||
h=Math.max(lastH[i],minHeight);
|
||||
|
||||
//绘制上半部分
|
||||
if(originY!=0){
|
||||
y=originY-h;
|
||||
ctx.fillStyle=linear1;
|
||||
ctx.fillRect(x, y, lineWidth, h);
|
||||
};
|
||||
//绘制下半部分
|
||||
if(originY!=height){
|
||||
ctx.fillStyle=linear2;
|
||||
ctx.fillRect(x, originY, lineWidth, h);
|
||||
};
|
||||
|
||||
xFloat+=lineWidth;
|
||||
};
|
||||
|
||||
//绘制柱子顶上峰值小横条
|
||||
if(set.stripeEnable){
|
||||
var stripeShadowBlur=set.stripeShadowBlur;
|
||||
ctx.shadowBlur=(stripeShadowBlur==-1?set.shadowBlur:stripeShadowBlur)*scale;
|
||||
ctx.shadowColor=set.stripeShadowColor||set.shadowColor;
|
||||
var stripeHeight=set.stripeHeight*scale;
|
||||
for(var i=0,xFloat=XFloat,x,y,h;i<lineCount;i++){
|
||||
xFloat+=spaceFloat;
|
||||
x=Math.floor(xFloat);
|
||||
h=stripesH[i];
|
||||
|
||||
//绘制上半部分
|
||||
if(originY!=0){
|
||||
y=originY-h-stripeHeight;
|
||||
if(y<0){y=0;};
|
||||
ctx.fillStyle=stripeLinear1;
|
||||
ctx.fillRect(x, y, lineWidth, stripeHeight);
|
||||
};
|
||||
//绘制下半部分
|
||||
if(originY!=height){
|
||||
y=originY+h;
|
||||
if(y+stripeHeight>height){
|
||||
y=height-stripeHeight;
|
||||
};
|
||||
ctx.fillStyle=stripeLinear2;
|
||||
ctx.fillRect(x, y, lineWidth, stripeHeight);
|
||||
};
|
||||
|
||||
xFloat+=lineWidth;
|
||||
};
|
||||
};
|
||||
|
||||
//镜像,从中间直接镜像即可
|
||||
if(mirrorEnable){
|
||||
var srcW=Math.floor(width/2);
|
||||
ctx.save();
|
||||
ctx.scale(-1,1);
|
||||
ctx.drawImage(This.canvas,Math.ceil(width/2),0,srcW,height,-srcW,0,srcW,height);
|
||||
ctx.restore();
|
||||
};
|
||||
|
||||
set.onDraw(frequencyData,sampleRate);
|
||||
}
|
||||
};
|
||||
Recorder.FrequencyHistogramView=FrequencyHistogramView;
|
||||
|
||||
|
||||
})();
|
||||
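Editor's note (not part of the diff): the bar height in draw() above comes from a dB-style mapping of each band's peak FFT value, via the Y0 / logY0 / dBmax constants near the top of draw(). Restated in Python for a bufferSize of 1024, matching Recorder.LibFFT(1024) in the constructor:

import math

buffer_size = 1024                                         # This.fft.bufferSize in the JS
Y0 = 1 << (round(math.log2(buffer_size) + 3) << 1)         # 2**26
logY0 = math.log10(Y0)
dB_max = 20 * math.log10(0x7FFF)                           # ~90.3

def bar_height(max_amp, height_y):
    # mirrors: dB = floor((log10(maxAmp) - logY0) * 17) when maxAmp > Y0, else 0
    dB = math.floor((math.log10(max_amp) - logY0) * 17) if max_amp > Y0 else 0
    return height_y * min(dB / dB_max, 1)

print(bar_height(1e14, 100))   # ~100, a very strong band is clamped to full height
print(bar_height(1e9, 100))    # ~21, a moderate band
print(bar_height(1000, 100))   # 0, values below Y0 stay at the floor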
10881
web/static/js/jquery.js
vendored
Normal file
File diff suppressed because it is too large
111
web/static/js/lib.fft.js
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
时域转频域,快速傅里叶变换(FFT)
|
||||
https://github.com/xiangyuecn/Recorder
|
||||
|
||||
var fft=Recorder.LibFFT(bufferSize)
|
||||
bufferSize取值2的n次方
|
||||
|
||||
fft.bufferSize 实际采用的bufferSize
|
||||
fft.transform(inBuffer)
|
||||
inBuffer:[Int16,...] 数组长度必须是bufferSize
|
||||
返回[Float64(Long),...],长度为bufferSize/2
|
||||
*/
|
||||
|
||||
/*
|
||||
从FFT.java 移植,Java开源库:jmp123 版本0.3
|
||||
https://www.iteye.com/topic/851459
|
||||
https://sourceforge.net/projects/jmp123/files/
|
||||
*/
|
||||
Recorder.LibFFT=function(bufferSize){
|
||||
"use strict";
|
||||
|
||||
var FFT_N_LOG,FFT_N,MINY;
|
||||
var real, imag, sintable, costable;
|
||||
var bitReverse;
|
||||
|
||||
var FFT_Fn=function(bufferSize) {//bufferSize只能取值2的n次方
|
||||
FFT_N_LOG=Math.round(Math.log(bufferSize)/Math.log(2));
|
||||
FFT_N = 1 << FFT_N_LOG;
|
||||
MINY = ((FFT_N << 2) * Math.sqrt(2));
|
||||
|
||||
real = [];
|
||||
imag = [];
|
||||
sintable = [0];
|
||||
costable = [0];
|
||||
bitReverse = [];
|
||||
|
||||
var i, j, k, reve;
|
||||
for (i = 0; i < FFT_N; i++) {
|
||||
k = i;
|
||||
for (j = 0, reve = 0; j != FFT_N_LOG; j++) {
|
||||
reve <<= 1;
|
||||
reve |= (k & 1);
|
||||
k >>>= 1;
|
||||
}
|
||||
bitReverse[i] = reve;
|
||||
}
|
||||
|
||||
var theta, dt = 2 * Math.PI / FFT_N;
|
||||
for (i = (FFT_N >> 1) - 1; i > 0; i--) {
|
||||
theta = i * dt;
|
||||
costable[i] = Math.cos(theta);
|
||||
sintable[i] = Math.sin(theta);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
用于频谱显示的快速傅里叶变换
|
||||
inBuffer 输入FFT_N个实数,返回 FFT_N/2个输出值(复数模的平方)。
|
||||
*/
|
||||
var getModulus=function(inBuffer) {
|
||||
var i, j, k, ir, j0 = 1, idx = FFT_N_LOG - 1;
|
||||
var cosv, sinv, tmpr, tmpi;
|
||||
for (i = 0; i != FFT_N; i++) {
|
||||
real[i] = inBuffer[bitReverse[i]];
|
||||
imag[i] = 0;
|
||||
}
|
||||
|
||||
for (i = FFT_N_LOG; i != 0; i--) {
|
||||
for (j = 0; j != j0; j++) {
|
||||
cosv = costable[j << idx];
|
||||
sinv = sintable[j << idx];
|
||||
for (k = j; k < FFT_N; k += j0 << 1) {
|
||||
ir = k + j0;
|
||||
tmpr = cosv * real[ir] - sinv * imag[ir];
|
||||
tmpi = cosv * imag[ir] + sinv * real[ir];
|
||||
real[ir] = real[k] - tmpr;
|
||||
imag[ir] = imag[k] - tmpi;
|
||||
real[k] += tmpr;
|
||||
imag[k] += tmpi;
|
||||
}
|
||||
}
|
||||
j0 <<= 1;
|
||||
idx--;
|
||||
}
|
||||
|
||||
j = FFT_N >> 1;
|
||||
var outBuffer=new Float64Array(j);
|
||||
/*
|
||||
* 输出模的平方:
|
||||
* for(i = 1; i <= j; i++)
|
||||
* inBuffer[i-1] = real[i] * real[i] + imag[i] * imag[i];
|
||||
*
|
||||
* 如果FFT只用于频谱显示,可以"淘汰"幅值较小的而减少浮点乘法运算. MINY的值
|
||||
* 和Spectrum.Y0,Spectrum.logY0对应.
|
||||
*/
|
||||
sinv = MINY;
|
||||
cosv = -MINY;
|
||||
for (i = j; i != 0; i--) {
|
||||
tmpr = real[i];
|
||||
tmpi = imag[i];
|
||||
if (tmpr > cosv && tmpr < sinv && tmpi > cosv && tmpi < sinv)
|
||||
outBuffer[i - 1] = 0;
|
||||
else
|
||||
outBuffer[i - 1] = Math.round(tmpr * tmpr + tmpi * tmpi);
|
||||
}
|
||||
return outBuffer;
|
||||
}
|
||||
|
||||
FFT_Fn(bufferSize);
|
||||
return {transform:getModulus,bufferSize:FFT_N};
|
||||
};
|
||||
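Editor's note: transform() above returns the squared magnitudes of FFT bins 1..N/2 (values below the MINY threshold are zeroed). For a rough cross-check outside the browser, numpy gives an equivalent spectrum; this helper is hypothetical and not part of the repository:

import numpy as np

def spectrum_like_libfft(frame_int16):
    """Squared magnitudes of bins 1..N/2, roughly what Recorder.LibFFT(N).transform() yields."""
    n = len(frame_int16)                       # must be a power of two, e.g. 1024
    spec = np.fft.rfft(np.asarray(frame_int16, dtype=np.float64))
    return np.round(np.abs(spec[1:n // 2 + 1]) ** 2)

frame = (np.sin(2 * np.pi * 440 * np.arange(1024) / 16000) * 20000).astype(np.int16)
out = spectrum_like_libfft(frame)
print(out.argmax() + 1)   # bin index near 440 Hz * 1024 / 16000 ≈ 28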
14173
web/static/js/mp3-engine.js
Normal file
File diff suppressed because it is too large
424
web/static/js/mp3.js
Normal file
@@ -0,0 +1,424 @@
|
||||
/*
|
||||
mp3编码器,需带上mp3-engine.js引擎使用
|
||||
https://github.com/xiangyuecn/Recorder
|
||||
|
||||
当然最佳推荐使用mp3、wav格式,代码也是优先照顾这两种格式
|
||||
浏览器支持情况
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTML/Supported_media_formats
|
||||
*/
|
||||
(function(){
|
||||
"use strict";
|
||||
|
||||
Recorder.prototype.enc_mp3={
|
||||
stable:true
|
||||
,testmsg:"采样率范围48000, 44100, 32000, 24000, 22050, 16000, 12000, 11025, 8000"
|
||||
};
|
||||
|
||||
|
||||
|
||||
//*******标准UI线程转码支持函数************
|
||||
|
||||
Recorder.prototype.mp3=function(res,True,False){
|
||||
var This=this,set=This.set,size=res.length;
|
||||
|
||||
//优先采用worker编码,太低版本下面用老方法提供兼容
|
||||
var ctx=This.mp3_start(set);
|
||||
if(ctx){
|
||||
This.mp3_encode(ctx,res);
|
||||
This.mp3_complete(ctx,True,False,1);
|
||||
return;
|
||||
};
|
||||
|
||||
//https://github.com/wangpengfei15975/recorder.js
|
||||
//https://github.com/zhuker/lamejs bug:采样率必须和源一致,不然8k时没有声音,有问题fix:https://github.com/zhuker/lamejs/pull/11
|
||||
var mp3=new Recorder.lamejs.Mp3Encoder(1,set.sampleRate,set.bitRate);
|
||||
|
||||
var blockSize=57600;
|
||||
var data=[];
|
||||
|
||||
var idx=0,mp3Size=0;
|
||||
var run=function(){
|
||||
if(idx<size){
|
||||
var buf=mp3.encodeBuffer(res.subarray(idx,idx+blockSize));
|
||||
if(buf.length>0){
|
||||
mp3Size+=buf.buffer.byteLength;
|
||||
data.push(buf.buffer);
|
||||
};
|
||||
idx+=blockSize;
|
||||
setTimeout(run);//尽量避免卡ui
|
||||
}else{
|
||||
var buf=mp3.flush();
|
||||
if(buf.length>0){
|
||||
mp3Size+=buf.buffer.byteLength;
|
||||
data.push(buf.buffer);
|
||||
};
|
||||
|
||||
//去掉开头的标记信息帧
|
||||
var meta=mp3TrimFix.fn(data,mp3Size,size,set.sampleRate);
|
||||
mp3TrimFixSetMeta(meta,set);
|
||||
|
||||
True(new Blob(data,{type:"audio/mp3"}));
|
||||
};
|
||||
};
|
||||
run();
|
||||
}
|
||||
|
||||
|
||||
//********边录边转码(Worker)支持函数,如果提供就代表可能支持,否则只支持标准转码*********
|
||||
|
||||
//全局共享一个Worker,后台串行执行。如果每次都开一个新的,编码速度可能会慢很多,可能是浏览器运行缓存的因素,并且可能瞬间产生多个并行操作占用大量cpu
|
||||
var mp3Worker;
|
||||
Recorder.BindDestroy("mp3Worker",function(){
|
||||
console.log("mp3Worker Destroy");
|
||||
mp3Worker&&mp3Worker.terminate();
|
||||
mp3Worker=null;
|
||||
});
|
||||
|
||||
|
||||
Recorder.prototype.mp3_envCheck=function(envInfo,set){//检查环境下配置是否可用
|
||||
var errMsg="";
|
||||
//需要实时编码返回数据,此时需要检查环境是否有实时特性、和是否可实时编码
|
||||
if(set.takeoffEncodeChunk){
|
||||
if(!envInfo.canProcess){
|
||||
errMsg=envInfo.envName+"环境不支持实时处理";
|
||||
}else if(!newContext()){//浏览器不能创建实时编码环境
|
||||
errMsg="当前浏览器版本太低,无法实时处理";
|
||||
};
|
||||
};
|
||||
return errMsg;
|
||||
};
|
||||
Recorder.prototype.mp3_start=function(set){//如果返回null代表不支持
|
||||
return newContext(set);
|
||||
};
|
||||
var openList={id:0};
|
||||
var newContext=function(setOrNull){
|
||||
var worker=mp3Worker;
|
||||
try{
|
||||
if(!worker){
|
||||
var onmsg=function(e){
|
||||
var ed=e.data;
|
||||
var cur=wk_ctxs[ed.id];
|
||||
if(ed.action=="init"){
|
||||
wk_ctxs[ed.id]={
|
||||
sampleRate:ed.sampleRate
|
||||
,bitRate:ed.bitRate
|
||||
,takeoff:ed.takeoff
|
||||
|
||||
,mp3Size:0
|
||||
,pcmSize:0
|
||||
,encArr:[]
|
||||
,encObj:new wk_lame.Mp3Encoder(1,ed.sampleRate,ed.bitRate)
|
||||
};
|
||||
}else if(!cur){
|
||||
return;
|
||||
};
|
||||
|
||||
switch(ed.action){
|
||||
case "stop":
|
||||
cur.encObj=null;
|
||||
delete wk_ctxs[ed.id];
|
||||
break;
|
||||
case "encode":
|
||||
cur.pcmSize+=ed.pcm.length;
|
||||
var buf=cur.encObj.encodeBuffer(ed.pcm);
|
||||
if(buf.length>0){
|
||||
if(cur.takeoff){
|
||||
self.postMessage({action:"takeoff",id:ed.id,chunk:buf});
|
||||
}else{
|
||||
cur.mp3Size+=buf.buffer.byteLength;
|
||||
cur.encArr.push(buf.buffer);
|
||||
};
|
||||
};
|
||||
break;
|
||||
case "complete":
|
||||
var buf=cur.encObj.flush();
|
||||
if(buf.length>0){
|
||||
if(cur.takeoff){
|
||||
self.postMessage({action:"takeoff",id:ed.id,chunk:buf});
|
||||
}else{
|
||||
cur.mp3Size+=buf.buffer.byteLength;
|
||||
cur.encArr.push(buf.buffer);
|
||||
};
|
||||
};
|
||||
|
||||
//去掉开头的标记信息帧
|
||||
var meta=wk_mp3TrimFix.fn(cur.encArr,cur.mp3Size,cur.pcmSize,cur.sampleRate);
|
||||
|
||||
self.postMessage({
|
||||
action:ed.action
|
||||
,id:ed.id
|
||||
,blob:new Blob(cur.encArr,{type:"audio/mp3"})
|
||||
,meta:meta
|
||||
});
|
||||
break;
|
||||
};
|
||||
};
|
||||
|
||||
//创建一个新Worker
|
||||
var jsCode=");wk_lame();var wk_ctxs={};self.onmessage="+onmsg;
|
||||
jsCode+=";var wk_mp3TrimFix={rm:"+mp3TrimFix.rm+",fn:"+mp3TrimFix.fn+"}";
|
||||
|
||||
var lamejsCode=Recorder.lamejs.toString();
|
||||
var url=(window.URL||webkitURL).createObjectURL(new Blob(["var wk_lame=(",lamejsCode,jsCode], {type:"text/javascript"}));
|
||||
|
||||
worker=new Worker(url);
|
||||
setTimeout(function(){
|
||||
(window.URL||webkitURL).revokeObjectURL(url);//必须要释放,不然每次调用内存都明显泄露内存
|
||||
},10000);//chrome 83 file协议下如果直接释放,将会使WebWorker无法启动
|
||||
|
||||
worker.onmessage=function(e){
|
||||
var data=e.data;
|
||||
var ctx=openList[data.id];
|
||||
if(ctx){
|
||||
if(data.action=="takeoff"){
|
||||
//取走实时生成的mp3数据
|
||||
ctx.set.takeoffEncodeChunk(new Uint8Array(data.chunk.buffer));
|
||||
}else{
|
||||
//complete
|
||||
ctx.call&&ctx.call(data);
|
||||
ctx.call=null;
|
||||
};
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
var ctx={worker:worker,set:setOrNull,takeoffQueue:[]};
|
||||
if(setOrNull){
|
||||
ctx.id=++openList.id;
|
||||
openList[ctx.id]=ctx;
|
||||
|
||||
worker.postMessage({
|
||||
action:"init"
|
||||
,id:ctx.id
|
||||
,sampleRate:setOrNull.sampleRate
|
||||
,bitRate:setOrNull.bitRate
|
||||
,takeoff:!!setOrNull.takeoffEncodeChunk
|
||||
|
||||
,x:new Int16Array(5)//低版本浏览器不支持序列化TypedArray
|
||||
});
|
||||
}else{
|
||||
worker.postMessage({
|
||||
x:new Int16Array(5)//低版本浏览器不支持序列化TypedArray
|
||||
});
|
||||
};
|
||||
|
||||
|
||||
mp3Worker=worker;
|
||||
return ctx;
|
||||
}catch(e){//出错了就不要提供了
|
||||
worker&&worker.terminate();
|
||||
|
||||
console.error(e);
|
||||
return null;
|
||||
};
|
||||
};
|
||||
Recorder.prototype.mp3_stop=function(startCtx){
|
||||
if(startCtx&&startCtx.worker){
|
||||
startCtx.worker.postMessage({
|
||||
action:"stop"
|
||||
,id:startCtx.id
|
||||
});
|
||||
startCtx.worker=null;
|
||||
delete openList[startCtx.id];
|
||||
|
||||
//疑似泄露检测 排除id
|
||||
var opens=-1;
|
||||
for(var k in openList){
|
||||
opens++;
|
||||
};
|
||||
if(opens){
|
||||
console.warn("mp3 worker剩"+opens+"个在串行等待");
|
||||
};
|
||||
};
|
||||
};
|
||||
Recorder.prototype.mp3_encode=function(startCtx,pcm){
|
||||
if(startCtx&&startCtx.worker){
|
||||
startCtx.worker.postMessage({
|
||||
action:"encode"
|
||||
,id:startCtx.id
|
||||
,pcm:pcm
|
||||
});
|
||||
};
|
||||
};
|
||||
Recorder.prototype.mp3_complete=function(startCtx,True,False,autoStop){
|
||||
var This=this;
|
||||
if(startCtx&&startCtx.worker){
|
||||
startCtx.call=function(data){
|
||||
mp3TrimFixSetMeta(data.meta,startCtx.set);
|
||||
True(data.blob);
|
||||
|
||||
if(autoStop){
|
||||
This.mp3_stop(startCtx);
|
||||
};
|
||||
};
|
||||
startCtx.worker.postMessage({
|
||||
action:"complete"
|
||||
,id:startCtx.id
|
||||
});
|
||||
}else{
|
||||
False("mp3编码器未打开");
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//*******辅助函数************
|
||||
|
||||
/*读取lamejs编码出来的mp3信息,只能读特定格式,如果读取失败返回null
|
||||
mp3Buffers=[ArrayBuffer,...]
|
||||
length=mp3Buffers的数据二进制总长度
|
||||
*/
|
||||
Recorder.mp3ReadMeta=function(mp3Buffers,length){
|
||||
//kill babel-polyfill ES6 Number.parseInt 不然放到Worker里面找不到方法
|
||||
var parseInt_ES3=typeof(window)=="object"?window.parseInt:self.parseInt;
|
||||
|
||||
var u8arr0=new Uint8Array(mp3Buffers[0]||[]);
|
||||
if(u8arr0.length<4){
|
||||
return null;
|
||||
};
|
||||
var byteAt=function(idx,u8){
|
||||
return ("0000000"+((u8||u8arr0)[idx]||0).toString(2)).substr(-8);
|
||||
};
|
||||
var b2=byteAt(0)+byteAt(1);
|
||||
var b4=byteAt(2)+byteAt(3);
|
||||
|
||||
if(!/^1{11}/.test(b2)){//未发现帧同步
|
||||
return null;
|
||||
};
|
||||
var version=({"00":2.5,"10":2,"11":1})[b2.substr(11,2)];
|
||||
var layer=({"01":3})[b2.substr(13,2)];//仅支持Layer3
|
||||
var sampleRate=({ //lamejs -> Tables.samplerate_table
|
||||
"1":[44100, 48000, 32000]
|
||||
,"2":[22050, 24000, 16000]
|
||||
,"2.5":[11025, 12000, 8000]
|
||||
})[version];
|
||||
sampleRate&&(sampleRate=sampleRate[parseInt_ES3(b4.substr(4,2),2)]);
|
||||
var bitRate=[ //lamejs -> Tables.bitrate_table
|
||||
[0, 8, 16, 24, 32, 40, 48, 56, 64, 80, 96, 112, 128, 144, 160] //MPEG 2 2.5
|
||||
,[0, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 256, 320]//MPEG 1
|
||||
][version==1?1:0][parseInt_ES3(b4.substr(0,4),2)];
|
||||
|
||||
if(!version || !layer || !bitRate || !sampleRate){
|
||||
return null;
|
||||
};
|
||||
|
||||
var duration=Math.round(length*8/bitRate);
|
||||
var frame=layer==1?384:layer==2?1152:version==1?1152:576;
|
||||
var frameDurationFloat=frame/sampleRate*1000;
|
||||
var frameSize=Math.floor((frame*bitRate)/8/sampleRate*1000);
|
||||
|
||||
//检测是否存在Layer3帧填充1字节。这里只获取第二帧的填充信息,首帧永远没有填充。其他帧可能隔一帧出现一个填充,或者隔很多帧出现一个填充;目测是取决于frameSize未舍入时的小数部分,因为有些采样率的frameSize会出现小数(11025、22050、44100 典型的除不尽),然后字节数无法表示这种小数,就通过一定步长来填充弥补小数部分丢失
|
||||
var hasPadding=0,seek=0;
|
||||
for(var i=0;i<mp3Buffers.length;i++){
|
||||
//寻找第二帧
|
||||
var buf=mp3Buffers[i];
|
||||
seek+=buf.byteLength;
|
||||
if(seek>=frameSize+3){
|
||||
var buf8=new Uint8Array(buf);
|
||||
var idx=buf.byteLength-(seek-(frameSize+3)+1);
|
||||
var ib4=byteAt(idx,buf8);
|
||||
hasPadding=ib4.charAt(6)=="1";
|
||||
break;
|
||||
};
|
||||
};
|
||||
if(hasPadding){
|
||||
frameSize++;
|
||||
};
|
||||
|
||||
|
||||
return {
|
||||
version:version //1 2 2.5 -> MPEG1 MPEG2 MPEG2.5
|
||||
,layer:layer//3 -> Layer3
|
||||
,sampleRate:sampleRate //采样率 hz
|
||||
,bitRate:bitRate //比特率 kbps
|
||||
|
||||
,duration:duration //音频时长 ms
|
||||
,size:length //总长度 byte
|
||||
,hasPadding:hasPadding //是否存在1字节填充,首帧永远没有,这个值其实代表的第二帧是否有填充,并不代表其他帧的
|
||||
,frameSize:frameSize //每帧最大长度,含可能存在的1字节padding byte
|
||||
,frameDurationFloat:frameDurationFloat //每帧时长,含小数 ms
|
||||
};
|
||||
};
|
||||
|
||||
//去掉lamejs开头的标记信息帧,免得mp3解码出来的时长比pcm的长太多
|
||||
var mp3TrimFix={//minfiy keep name
|
||||
rm:Recorder.mp3ReadMeta
|
||||
,fn:function(mp3Buffers,length,pcmLength,pcmSampleRate){
|
||||
var meta=this.rm(mp3Buffers,length);
|
||||
if(!meta){
|
||||
return {err:"mp3非预定格式"};
|
||||
};
|
||||
var pcmDuration=Math.round(pcmLength/pcmSampleRate*1000);
|
||||
|
||||
//开头多出这么多帧,移除掉;正常情况下最多为2帧
|
||||
var num=Math.floor((meta.duration-pcmDuration)/meta.frameDurationFloat);
|
||||
if(num>0){
|
||||
var size=num*meta.frameSize-(meta.hasPadding?1:0);//首帧没有填充,第二帧可能有填充,这里假设最多为2帧(测试并未出现3帧以上情况),其他帧不管,就算出现了并且导致了错误后面自动容错
|
||||
length-=size;
|
||||
var arr0=0,arrs=[];
|
||||
for(var i=0;i<mp3Buffers.length;i++){
|
||||
var arr=mp3Buffers[i];
|
||||
if(size<=0){
|
||||
break;
|
||||
};
|
||||
if(size>=arr.byteLength){
|
||||
size-=arr.byteLength;
|
||||
arrs.push(arr);
|
||||
mp3Buffers.splice(i,1);
|
||||
i--;
|
||||
}else{
|
||||
mp3Buffers[i]=arr.slice(size);
|
||||
arr0=arr;
|
||||
size=0;
|
||||
};
|
||||
};
|
||||
var checkMeta=this.rm(mp3Buffers,length);
|
||||
if(!checkMeta){
|
||||
//还原变更,应该不太可能会出现
|
||||
arr0&&(mp3Buffers[0]=arr0);
|
||||
for(var i=0;i<arrs.length;i++){
|
||||
mp3Buffers.splice(i,0,arrs[i]);
|
||||
};
|
||||
meta.err="fix后数据错误,已还原,错误原因不明";
|
||||
};
|
||||
|
||||
var fix=meta.trimFix={};
|
||||
fix.remove=num;
|
||||
fix.removeDuration=Math.round(num*meta.frameDurationFloat);
|
||||
fix.duration=Math.round(length*8/meta.bitRate);
|
||||
};
|
||||
return meta;
|
||||
}
|
||||
};
|
||||
var mp3TrimFixSetMeta=function(meta,set){
|
||||
var tag="MP3信息 ";
|
||||
if(meta.sampleRate&&meta.sampleRate!=set.sampleRate || meta.bitRate&&meta.bitRate!=set.bitRate){
|
||||
console.warn(tag+"和设置的不匹配set:"+set.bitRate+"kbps "+set.sampleRate+"hz,已更新set:"+meta.bitRate+"kbps "+meta.sampleRate+"hz",set);
|
||||
set.sampleRate=meta.sampleRate;
|
||||
set.bitRate=meta.bitRate;
|
||||
};
|
||||
|
||||
var trimFix=meta.trimFix;
|
||||
if(trimFix){
|
||||
tag+="Fix移除"+trimFix.remove+"帧"+trimFix.removeDuration+"ms -> "+trimFix.duration+"ms";
|
||||
if(trimFix.remove>2){
|
||||
meta.err=(meta.err?meta.err+", ":"")+"移除帧数过多";
|
||||
};
|
||||
}else{
|
||||
tag+=(meta.duration||"-")+"ms";
|
||||
};
|
||||
|
||||
if(meta.err){
|
||||
console.error(tag,meta.err,meta);
|
||||
}else{
|
||||
console.log(tag,meta);
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
||||
})();
|
||||
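Editor's note: Recorder.mp3ReadMeta above derives the frame geometry from the header fields. With the recording defaults set in recorder-core.js further down (16 kbps, 16000 Hz, so MPEG2 Layer3 with 576 samples per frame; the byte total below is a made-up example), the arithmetic works out as:

bit_rate = 16          # kbps (recorder-core.js default)
sample_rate = 16000    # Hz  (recorder-core.js default)
frame = 576            # samples per MPEG2/2.5 Layer3 frame

frame_duration_ms = frame / sample_rate * 1000              # 36.0 ms, matches frameDurationFloat
frame_size = (frame * bit_rate) // 8 * 1000 // sample_rate  # 72 bytes, matches frameSize (before padding)
total_bytes = 7200                                          # hypothetical mp3 payload size
duration_ms = total_bytes * 8 / bit_rate                    # 3600 ms, matches the duration formula
print(frame_duration_ms, frame_size, duration_ms)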
950
web/static/js/recorder-core.js
Normal file
@@ -0,0 +1,950 @@
|
||||
/*
|
||||
录音
|
||||
https://github.com/xiangyuecn/Recorder
|
||||
*/
|
||||
(function(factory){
|
||||
factory(window);
|
||||
//umd returnExports.js
|
||||
if(typeof(define)=='function' && define.amd){
|
||||
define(function(){
|
||||
return Recorder;
|
||||
});
|
||||
};
|
||||
if(typeof(module)=='object' && module.exports){
|
||||
module.exports=Recorder;
|
||||
};
|
||||
}(function(window){
|
||||
"use strict";
|
||||
|
||||
//兼容环境
|
||||
var LM="2021-08-03 20:01:03";
|
||||
var NOOP=function(){};
|
||||
//end 兼容环境 ****从以下开始copy源码*****
|
||||
|
||||
var Recorder=function(set){
|
||||
return new initFn(set);
|
||||
};
|
||||
//是否已经打开了全局的麦克风录音,所有工作都已经准备好了,就等接收音频数据了
|
||||
Recorder.IsOpen=function(){
|
||||
var stream=Recorder.Stream;
|
||||
if(stream){
|
||||
var tracks=stream.getTracks&&stream.getTracks()||stream.audioTracks||[];
|
||||
var track=tracks[0];
|
||||
if(track){
|
||||
var state=track.readyState;
|
||||
return state=="live"||state==track.LIVE;
|
||||
};
|
||||
};
|
||||
return false;
|
||||
};
|
||||
/*H5录音时的AudioContext缓冲大小。会影响H5录音时的onProcess调用速率,相对于AudioContext.sampleRate=48000时,4096接近12帧/s,调节此参数可生成比较流畅的回调动画。
|
||||
取值256, 512, 1024, 2048, 4096, 8192, or 16384
|
||||
注意,取值不能过低,2048开始不同浏览器可能回调速率跟不上造成音质问题。
|
||||
一般无需调整,调整后需要先close掉已打开的录音,再open时才会生效。
|
||||
*/
|
||||
Recorder.BufferSize=4096;
|
||||
//销毁已持有的所有全局资源,当要彻底移除Recorder时需要显式的调用此方法
|
||||
Recorder.Destroy=function(){
|
||||
CLog("Recorder Destroy");
|
||||
Disconnect();//断开可能存在的全局Stream、资源
|
||||
|
||||
for(var k in DestroyList){
|
||||
DestroyList[k]();
|
||||
};
|
||||
};
|
||||
var DestroyList={};
|
||||
//登记一个需要销毁全局资源的处理方法
|
||||
Recorder.BindDestroy=function(key,call){
|
||||
DestroyList[key]=call;
|
||||
};
|
||||
//判断浏览器是否支持录音,随时可以调用。注意:仅仅是检测浏览器支持情况,不会判断和调起用户授权,不会判断是否支持特定格式录音。
|
||||
Recorder.Support=function(){
|
||||
var AC=window.AudioContext;
|
||||
if(!AC){
|
||||
AC=window.webkitAudioContext;
|
||||
};
|
||||
if(!AC){
|
||||
return false;
|
||||
};
|
||||
var scope=navigator.mediaDevices||{};
|
||||
if(!scope.getUserMedia){
|
||||
scope=navigator;
|
||||
scope.getUserMedia||(scope.getUserMedia=scope.webkitGetUserMedia||scope.mozGetUserMedia||scope.msGetUserMedia);
|
||||
};
|
||||
if(!scope.getUserMedia){
|
||||
return false;
|
||||
};
|
||||
|
||||
Recorder.Scope=scope;
|
||||
if(!Recorder.Ctx||Recorder.Ctx.state=="closed"){
|
||||
//不能反复构造,低版本number of hardware contexts reached maximum (6)
|
||||
Recorder.Ctx=new AC();
|
||||
|
||||
Recorder.BindDestroy("Ctx",function(){
|
||||
var ctx=Recorder.Ctx;
|
||||
if(ctx&&ctx.close){//能关掉就关掉,关不掉就保留着
|
||||
ctx.close();
|
||||
Recorder.Ctx=0;
|
||||
};
|
||||
});
|
||||
};
|
||||
return true;
|
||||
};
|
||||
/*初始化H5音频采集连接。如果自行提供了sourceStream将只进行一次简单的连接处理。如果是普通麦克风录音,此时的Stream是全局的,Safari上断开后就无法再次进行连接使用,表现为静音,因此使用全部使用全局处理避免调用到disconnect;全局处理也有利于屏蔽底层细节,start时无需再调用底层接口,提升兼容、可靠性。*/
|
||||
var Connect=function(streamStore){
|
||||
streamStore=streamStore||Recorder;
|
||||
var bufferSize=streamStore.BufferSize||Recorder.BufferSize;
|
||||
|
||||
var ctx=Recorder.Ctx,stream=streamStore.Stream;
|
||||
var media=stream._m=ctx.createMediaStreamSource(stream);
|
||||
var process=stream._p=(ctx.createScriptProcessor||ctx.createJavaScriptNode).call(ctx,bufferSize,1,1);//单声道,省的数据处理复杂
|
||||
|
||||
media.connect(process);
|
||||
process.connect(ctx.destination);
|
||||
|
||||
var calls=stream._call;
|
||||
process.onaudioprocess=function(e){
|
||||
for(var k0 in calls){//has item
|
||||
var o=e.inputBuffer.getChannelData(0);//块是共享的,必须复制出来
|
||||
var size=o.length;
|
||||
|
||||
var pcm=new Int16Array(size);
|
||||
var sum=0;
|
||||
for(var j=0;j<size;j++){//floatTo16BitPCM
|
||||
var s=Math.max(-1,Math.min(1,o[j]));
|
||||
s=s<0?s*0x8000:s*0x7FFF;
|
||||
pcm[j]=s;
|
||||
sum+=Math.abs(s);
|
||||
};
|
||||
|
||||
for(var k in calls){
|
||||
calls[k](pcm,sum);
|
||||
};
|
||||
|
||||
return;
|
||||
};
|
||||
};
|
||||
};
|
||||
var Disconnect=function(streamStore){
|
||||
streamStore=streamStore||Recorder;
|
||||
var isGlobal=streamStore==Recorder;
|
||||
|
||||
var stream=streamStore.Stream;
|
||||
if(stream){
|
||||
if(stream._m){
|
||||
stream._m.disconnect();
|
||||
stream._p.disconnect();
|
||||
stream._p.onaudioprocess=stream._p=stream._m=null;
|
||||
};
|
||||
|
||||
if(isGlobal){//全局的时候,要把流关掉(麦克风),直接提供的流不处理
|
||||
var tracks=stream.getTracks&&stream.getTracks()||stream.audioTracks||[];
|
||||
for(var i=0;i<tracks.length;i++){
|
||||
var track=tracks[i];
|
||||
track.stop&&track.stop();
|
||||
};
|
||||
stream.stop&&stream.stop();
|
||||
};
|
||||
};
|
||||
streamStore.Stream=0;
|
||||
};
|
||||
|
||||
/*对pcm数据的采样率进行转换
|
||||
pcmDatas: [[Int16,...]] pcm片段列表
|
||||
pcmSampleRate:48000 pcm数据的采样率
|
||||
newSampleRate:16000 需要转换成的采样率,newSampleRate>=pcmSampleRate时不会进行任何处理,小于时会进行重新采样
|
||||
prevChunkInfo:{} 可选,上次调用时的返回值,用于连续转换,本次调用将从上次结束位置开始进行处理。或可自行定义一个ChunkInfo从pcmDatas指定的位置开始进行转换
|
||||
option:{ 可选,配置项
|
||||
frameSize:123456 帧大小,每帧的PCM Int16的数量,采样率转换后的pcm长度为frameSize的整数倍,用于连续转换。目前仅在mp3格式时才有用,frameSize取值为1152,这样编码出来的mp3时长和pcm的时长完全一致,否则会因为mp3最后一帧录音不够填满时添加填充数据导致mp3的时长变长。
|
||||
frameType:"" 帧类型,一般为rec.set.type,提供此参数时无需提供frameSize,会自动使用最佳的值给frameSize赋值,目前仅支持mp3=1152(MPEG1 Layer3的每帧采采样数),其他类型=1。
|
||||
以上两个参数用于连续转换时使用,最多使用一个,不提供时不进行帧的特殊处理,提供时必须同时提供prevChunkInfo才有作用。最后一段数据处理时无需提供帧大小以便输出最后一丁点残留数据。
|
||||
}
|
||||
|
||||
返回ChunkInfo:{
|
||||
//可定义,从指定位置开始转换到结尾
|
||||
index:0 pcmDatas已处理到的索引
|
||||
offset:0.0 已处理到的index对应的pcm中的偏移的下一个位置
|
||||
|
||||
//仅作为返回值
|
||||
frameNext:null||[Int16,...] 下一帧的部分数据,frameSize设置了的时候才可能会有
|
||||
sampleRate:16000 结果的采样率,<=newSampleRate
|
||||
data:[Int16,...] 转换后的PCM结果;如果是连续转换,并且pcmDatas中并没有新数据时,data的长度可能为0
|
||||
}
|
||||
*/
|
||||
Recorder.SampleData=function(pcmDatas,pcmSampleRate,newSampleRate,prevChunkInfo,option){
|
||||
prevChunkInfo||(prevChunkInfo={});
|
||||
var index=prevChunkInfo.index||0;
|
||||
var offset=prevChunkInfo.offset||0;
|
||||
|
||||
var frameNext=prevChunkInfo.frameNext||[];
|
||||
option||(option={});
|
||||
var frameSize=option.frameSize||1;
|
||||
if(option.frameType){
|
||||
frameSize=option.frameType=="mp3"?1152:1;
|
||||
};
|
||||
|
||||
var size=0;
|
||||
for(var i=index;i<pcmDatas.length;i++){
|
||||
size+=pcmDatas[i].length;
|
||||
};
|
||||
size=Math.max(0,size-Math.floor(offset));
|
||||
|
||||
//采样 https://www.cnblogs.com/blqw/p/3782420.html
|
||||
var step=pcmSampleRate/newSampleRate;
|
||||
if(step>1){//新采样低于录音采样,进行抽样
|
||||
size=Math.floor(size/step);
|
||||
}else{//新采样高于录音采样不处理,省去了插值处理
|
||||
step=1;
|
||||
newSampleRate=pcmSampleRate;
|
||||
};
|
||||
|
||||
size+=frameNext.length;
|
||||
var res=new Int16Array(size);
|
||||
var idx=0;
|
||||
//添加上一次不够一帧的剩余数据
|
||||
for(var i=0;i<frameNext.length;i++){
|
||||
res[idx]=frameNext[i];
|
||||
idx++;
|
||||
};
|
||||
//处理数据
|
||||
for (var nl=pcmDatas.length;index<nl;index++) {
|
||||
var o=pcmDatas[index];
|
||||
var i=offset,il=o.length;
|
||||
while(i<il){
|
||||
//res[idx]=o[Math.round(i)]; 直接简单抽样
|
||||
|
||||
//https://www.cnblogs.com/xiaoqi/p/6993912.html
|
||||
//当前点=当前点+到后面一个点之间的增量,音质比直接简单抽样好些
|
||||
var before = Math.floor(i);
|
||||
var after = Math.ceil(i);
|
||||
var atPoint = i - before;
|
||||
|
||||
var beforeVal=o[before];
|
||||
var afterVal=after<il ? o[after]
|
||||
: (//后个点越界了,查找下一个数组
|
||||
(pcmDatas[index+1]||[beforeVal])[0]||0
|
||||
);
|
||||
res[idx]=beforeVal+(afterVal-beforeVal)*atPoint;
|
||||
|
||||
idx++;
|
||||
i+=step;//抽样
|
||||
};
|
||||
offset=i-il;
|
||||
};
|
||||
//帧处理
|
||||
frameNext=null;
|
||||
var frameNextSize=res.length%frameSize;
|
||||
if(frameNextSize>0){
|
||||
var u8Pos=(res.length-frameNextSize)*2;
|
||||
frameNext=new Int16Array(res.buffer.slice(u8Pos));
|
||||
res=new Int16Array(res.buffer.slice(0,u8Pos));
|
||||
};
|
||||
|
||||
return {
|
||||
index:index
|
||||
,offset:offset
|
||||
|
||||
,frameNext:frameNext
|
||||
,sampleRate:newSampleRate
|
||||
,data:res
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
/*计算音量百分比的一个方法
|
||||
pcmAbsSum: pcm Int16所有采样的绝对值的和
|
||||
pcmLength: pcm长度
|
||||
返回值:0-100,主要当做百分比用
|
||||
注意:这个不是分贝,因此没用volume当做名称*/
|
||||
Recorder.PowerLevel=function(pcmAbsSum,pcmLength){
|
||||
/*计算音量 https://blog.csdn.net/jody1989/article/details/73480259
|
||||
更高灵敏度算法:
|
||||
限定最大感应值10000
|
||||
线性曲线:低音量不友好
|
||||
power/10000*100
|
||||
对数曲线:低音量友好,但需限定最低感应值
|
||||
(1+Math.log10(power/10000))*100
|
||||
*/
|
||||
var power=(pcmAbsSum/pcmLength) || 0;//NaN
|
||||
var level;
|
||||
if(power<1251){//1250的结果10%,更小的音量采用线性取值
|
||||
level=Math.round(power/1250*10);
|
||||
}else{
|
||||
level=Math.round(Math.min(100,Math.max(0,(1+Math.log(power/10000)/Math.log(10))*100)));
|
||||
};
|
||||
return level;
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
//带时间的日志输出,CLog(msg,errOrLogMsg, logMsg...) err为数字时代表日志类型1:error 2:log默认 3:warn,否则当做内容输出,第一个参数不能是对象因为要拼接时间,后面可以接无数个输出参数
|
||||
var CLog=function(msg,err){
|
||||
var now=new Date();
|
||||
var t=("0"+now.getMinutes()).substr(-2)
|
||||
+":"+("0"+now.getSeconds()).substr(-2)
|
||||
+"."+("00"+now.getMilliseconds()).substr(-3);
|
||||
var arr=["["+t+" Recorder]"+msg];
|
||||
var a=arguments;
|
||||
var i=2,fn=console.log;
|
||||
if(typeof(err)=="number"){
|
||||
fn=err==1?console.error:err==3?console.warn:fn;
|
||||
}else{
|
||||
i=1;
|
||||
};
|
||||
for(;i<a.length;i++){
|
||||
arr.push(a[i]);
|
||||
};
|
||||
fn.apply(console,arr);
|
||||
};
|
||||
Recorder.CLog=CLog;
|
||||
|
||||
|
||||
|
||||
|
||||
var ID=0;
|
||||
function initFn(set){
|
||||
this.id=++ID;
|
||||
|
||||
//如果开启了流量统计,这里将发送一个图片请求
|
||||
Recorder.Traffic&&Recorder.Traffic();
|
||||
|
||||
|
||||
var o={
|
||||
type:"mp3" //输出类型:mp3,wav,wav输出文件尺寸超大不推荐使用,但mp3编码支持会导致js文件超大,如果不需支持mp3可以使js文件大幅减小
|
||||
,bitRate:16 //比特率 wav:16或8位,MP3:8kbps 1k/s,8kbps 2k/s 录音文件很小
|
||||
|
||||
,sampleRate:16000 //采样率,wav格式大小=sampleRate*时间;mp3此项对低比特率有影响,高比特率几乎无影响。
|
||||
//wav任意值,mp3取值范围:48000, 44100, 32000, 24000, 22050, 16000, 12000, 11025, 8000
|
||||
//采样率参考https://www.cnblogs.com/devin87/p/mp3-recorder.html
|
||||
|
||||
,onProcess:NOOP //fn(buffers,powerLevel,bufferDuration,bufferSampleRate,newBufferIdx,asyncEnd) buffers=[[Int16,...],...]:缓冲的PCM数据,为从开始录音到现在的所有pcm片段;powerLevel:当前缓冲的音量级别0-100,bufferDuration:已缓冲时长,bufferSampleRate:缓冲使用的采样率(当type支持边录边转码(Worker)时,此采样率和设置的采样率相同,否则不一定相同);newBufferIdx:本次回调新增的buffer起始索引;asyncEnd:fn() 如果onProcess是异步的(返回值为true时),处理完成时需要调用此回调,如果不是异步的请忽略此参数,此方法回调时必须是真异步(不能真异步时需用setTimeout包裹)。onProcess返回值:如果返回true代表开启异步模式,在某些大量运算的场合异步是必须的,必须在异步处理完成时调用asyncEnd(不能真异步时需用setTimeout包裹),在onProcess执行后新增的buffer会全部替换成空数组,因此本回调开头应立即将newBufferIdx到本次回调结尾位置的buffer全部保存到另外一个数组内,处理完成后写回buffers中本次回调的结尾位置。
|
||||
|
||||
//*******高级设置******
|
||||
//,sourceStream:MediaStream Object
|
||||
//可选直接提供一个媒体流,从这个流中录制、实时处理音频数据(当前Recorder实例独享此流);不提供时为普通的麦克风录音,由getUserMedia提供音频流(所有Recorder实例共享同一个流)
|
||||
//比如:audio、video标签dom节点的captureStream方法(实验特性,不同浏览器支持程度不高)返回的流;WebRTC中的remote流;自己创建的流等
|
||||
//注意:流内必须至少存在一条音轨(Audio Track),比如audio标签必须等待到可以开始播放后才会有音轨,否则open会失败
|
||||
|
||||
//,audioTrackSet:{ deviceId:"",groupId:"", autoGainControl:true, echoCancellation:true, noiseSuppression:true }
|
||||
//普通麦克风录音时getUserMedia方法的audio配置参数,比如指定设备id,回声消除、降噪开关;注意:提供的任何配置值都不一定会生效
|
||||
//由于麦克风是全局共享的,所以新配置后需要close掉以前的再重新open
|
||||
//更多参考: https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints
|
||||
|
||||
//,disableEnvInFix:false 内部参数,禁用设备卡顿时音频输入丢失补偿功能
|
||||
|
||||
//,takeoffEncodeChunk:NOOP //fn(chunkBytes) chunkBytes=[Uint8,...]:实时编码环境下接管编码器输出,当编码器实时编码出一块有效的二进制音频数据时实时回调此方法;参数为二进制的Uint8Array,就是编码出来的音频数据片段,所有的chunkBytes拼接在一起即为完整音频。本实现的想法最初由QQ2543775048提出
|
||||
//当提供此回调方法时,将接管编码器的数据输出,编码器内部将放弃存储生成的音频数据;环境要求比较苛刻:如果当前环境不支持实时编码处理,将在open时直接走fail逻辑
|
||||
//因此提供此回调后调用stop方法将无法获得有效的音频数据,因为编码器内没有音频数据,因此stop时返回的blob将是一个字节长度为0的blob
|
||||
//目前只有mp3格式实现了实时编码,在支持实时处理的环境中将会实时的将编码出来的mp3片段通过此方法回调,所有的chunkBytes拼接到一起即为完整的mp3,此种拼接的结果比mock方法实时生成的音质更加,因为天然避免了首尾的静默
|
||||
//目前除mp3外其他格式不可以提供此回调,提供了将在open时直接走fail逻辑
|
||||
};
|
||||
|
||||
for(var k in set){
|
||||
o[k]=set[k];
|
||||
};
|
||||
this.set=o;
|
||||
|
||||
this._S=9;//stop同步锁,stop可以阻止open过程中还未运行的start
|
||||
this.Sync={O:9,C:9};//和Recorder.Sync一致,只不过这个是非全局的,仅用来简化代码逻辑,无实际作用
|
||||
};
|
||||
//同步锁,控制对Stream的竞争;用于close时中断异步的open;一个对象open如果变化了都要阻止close,Stream的控制权交个新的对象
|
||||
Recorder.Sync={/*open*/O:9,/*close*/C:9};
|
||||
|
||||
Recorder.prototype=initFn.prototype={
|
||||
//流相关的数据存储在哪个对象里面;如果提供了sourceStream,数据直接存储在当前对象中,否则存储在全局
|
||||
_streamStore:function(){
|
||||
if(this.set.sourceStream){
|
||||
return this;
|
||||
}else{
|
||||
return Recorder;
|
||||
}
|
||||
}
|
||||
|
||||
//打开录音资源True(),False(msg,isUserNotAllow),需要调用close。注意:此方法是异步的;一般使用时打开,用完立即关闭;可重复调用,可用来测试是否能录音
|
||||
,open:function(True,False){
|
||||
var This=this,streamStore=This._streamStore();
|
||||
True=True||NOOP;
|
||||
var failCall=function(errMsg,isUserNotAllow){
|
||||
isUserNotAllow=!!isUserNotAllow;
|
||||
CLog("录音open失败:"+errMsg+",isUserNotAllow:"+isUserNotAllow,1);
|
||||
False&&False(errMsg,isUserNotAllow);
|
||||
};
|
||||
|
||||
var ok=function(){
|
||||
CLog("open成功");
|
||||
True();
|
||||
|
||||
This._SO=0;//解除stop对open中的start调用的阻止
|
||||
};
|
||||
|
||||
|
||||
//同步锁
|
||||
var Lock=streamStore.Sync;
|
||||
var lockOpen=++Lock.O,lockClose=Lock.C;
|
||||
This._O=This._O_=lockOpen;//记住当前的open,如果变化了要阻止close,这里假定了新对象已取代当前对象并且不再使用
|
||||
This._SO=This._S;//记住open过程中的stop,中途任何stop调用后都不能继续open中的start
|
||||
var lockFail=function(){
|
||||
//允许多次open,但不允许任何一次close,或者自身已经调用了关闭
|
||||
if(lockClose!=Lock.C || !This._O){
|
||||
var err="open被取消";
|
||||
if(lockOpen==Lock.O){
|
||||
//无新的open,已经调用了close进行取消,此处应让上次的close明确生效
|
||||
This.close();
|
||||
}else{
|
||||
err="open被中断";
|
||||
};
|
||||
failCall(err);
|
||||
return true;
|
||||
};
|
||||
};
|
||||
|
||||
//环境配置检查
|
||||
var checkMsg=This.envCheck({envName:"H5",canProcess:true});
|
||||
if(checkMsg){
|
||||
failCall("不能录音:"+checkMsg);
|
||||
return;
|
||||
};
|
||||
|
||||
|
||||
//***********已直接提供了音频流************
|
||||
if(This.set.sourceStream){
|
||||
if(!Recorder.Support()){
|
||||
failCall("不支持此浏览器从流中获取录音");
|
||||
return;
|
||||
};
|
||||
|
||||
Disconnect(streamStore);//可能已open过,直接先尝试断开
|
||||
This.Stream=This.set.sourceStream;
|
||||
This.Stream._call={};
|
||||
|
||||
try{
|
||||
Connect(streamStore);
|
||||
}catch(e){
|
||||
failCall("从流中打开录音失败:"+e.message);
|
||||
return;
|
||||
}
|
||||
ok();
|
||||
return;
|
||||
};
|
||||
|
||||
|
||||
//***********打开麦克风得到全局的音频流************
|
||||
var codeFail=function(code,msg){
|
||||
try{//跨域的优先检测一下
|
||||
window.top.a;
|
||||
}catch(e){
|
||||
failCall('无权录音(跨域,请尝试给iframe添加麦克风访问策略,如allow="camera;microphone")');
|
||||
return;
|
||||
};
|
||||
|
||||
if(/Permission|Allow/i.test(code)){
|
||||
failCall("用户拒绝了录音权限",true);
|
||||
}else if(window.isSecureContext===false){
|
||||
failCall("无权录音(需https)");
|
||||
}else if(/Found/i.test(code)){//可能是非安全环境导致的没有设备
|
||||
failCall(msg+",无可用麦克风");
|
||||
}else{
|
||||
failCall(msg);
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
//如果已打开并且有效就不要再打开了
|
||||
if(Recorder.IsOpen()){
|
||||
ok();
|
||||
return;
|
||||
};
|
||||
if(!Recorder.Support()){
|
||||
codeFail("","此浏览器不支持录音");
|
||||
return;
|
||||
};
|
||||
|
||||
//请求权限,如果从未授权,一般浏览器会弹出权限请求弹框
|
||||
var f1=function(stream){
|
||||
Recorder.Stream=stream;
|
||||
stream._call={};//此时is open,但并未connect,是允许绑定接收数据的
|
||||
if(lockFail())return;
|
||||
|
||||
//https://github.com/xiangyuecn/Recorder/issues/14 获取到的track.readyState!="live",刚刚回调时可能是正常的,但过一下可能就被关掉了,原因不明。延迟一下保证真异步。对正常浏览器不影响
|
||||
setTimeout(function(){
|
||||
if(lockFail())return;
|
||||
|
||||
if(Recorder.IsOpen()){
|
||||
Connect();
|
||||
ok();
|
||||
}else{
|
||||
failCall("录音功能无效:无音频流");
|
||||
};
|
||||
},100);
|
||||
};
|
||||
var f2=function(e){
|
||||
var code=e.name||e.message||e.code+":"+e;
|
||||
CLog("请求录音权限错误",1,e);
|
||||
|
||||
codeFail(code,"无法录音:"+code);
|
||||
};
|
||||
var pro=Recorder.Scope.getUserMedia({audio:This.set.audioTrackSet||true},f1,f2);
|
||||
if(pro&&pro.then){
|
||||
pro.then(f1)[True&&"catch"](f2); //fix 关键字,保证catch压缩时保持字符串形式
|
||||
};
|
||||
}
|
||||
//关闭释放录音资源
|
||||
,close:function(call){
|
||||
call=call||NOOP;
|
||||
|
||||
var This=this,streamStore=This._streamStore();
|
||||
This._stop();
|
||||
|
||||
var Lock=streamStore.Sync;
|
||||
This._O=0;
|
||||
if(This._O_!=Lock.O){
|
||||
//唯一资源Stream的控制权已交给新对象,这里不能关闭。此处在每次都弹权限的浏览器内可能存在泄漏,新对象被拒绝权限可能不会调用close,忽略这种不处理
|
||||
CLog("close被忽略",3);
|
||||
call();
|
||||
return;
|
||||
};
|
||||
Lock.C++;//获得控制权
|
||||
|
||||
Disconnect(streamStore);
|
||||
|
||||
CLog("close");
|
||||
call();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*模拟一段录音数据,后面可以调用stop进行编码,需提供pcm数据[1,2,3...],pcm的采样率*/
|
||||
,mock:function(pcmData,pcmSampleRate){
|
||||
var This=this;
|
||||
This._stop();//清理掉已有的资源
|
||||
|
||||
This.isMock=1;
|
||||
This.mockEnvInfo=null;
|
||||
This.buffers=[pcmData];
|
||||
This.recSize=pcmData.length;
|
||||
This.srcSampleRate=pcmSampleRate;
|
||||
return This;
|
||||
}
|
||||
,envCheck:function(envInfo){//平台环境下的可用性检查,任何时候都可以调用检查,返回errMsg:""正常,"失败原因"
|
||||
//envInfo={envName:"H5",canProcess:true}
|
||||
var errMsg,This=this,set=This.set;
|
||||
|
||||
//编码器检查环境下配置是否可用
|
||||
if(!errMsg){
|
||||
if(This[set.type+"_envCheck"]){//编码器已实现环境检查
|
||||
errMsg=This[set.type+"_envCheck"](envInfo,set);
|
||||
}else{//未实现检查的手动检查配置是否有效
|
||||
if(set.takeoffEncodeChunk){
|
||||
errMsg=set.type+"类型不支持设置takeoffEncodeChunk";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
return errMsg||"";
|
||||
}
|
||||
,envStart:function(mockEnvInfo,sampleRate){//平台环境相关的start调用
|
||||
var This=this,set=This.set;
|
||||
This.isMock=mockEnvInfo?1:0;//非H5环境需要启用mock,并提供envCheck需要的环境信息
|
||||
This.mockEnvInfo=mockEnvInfo;
|
||||
This.buffers=[];//数据缓冲
|
||||
This.recSize=0;//数据大小
|
||||
|
||||
This.envInLast=0;//envIn接收到最后录音内容的时间
|
||||
This.envInFirst=0;//envIn接收到的首个录音内容的录制时间
|
||||
This.envInFix=0;//补偿的总时间
|
||||
This.envInFixTs=[];//补偿计数列表
|
||||
|
||||
set.sampleRate=Math.min(sampleRate,set.sampleRate);//engineCtx需要提前确定最终的采样率
|
||||
This.srcSampleRate=sampleRate;
|
||||
|
||||
This.engineCtx=0;
|
||||
//此类型有边录边转码(Worker)支持
|
||||
if(This[set.type+"_start"]){
|
||||
var engineCtx=This.engineCtx=This[set.type+"_start"](set);
|
||||
if(engineCtx){
|
||||
engineCtx.pcmDatas=[];
|
||||
engineCtx.pcmSize=0;
|
||||
};
|
||||
};
|
||||
}
|
||||
,envResume:function(){//和平台环境无关的恢复录音
|
||||
//重新开始计数
|
||||
this.envInFixTs=[];
|
||||
}
|
||||
,envIn:function(pcm,sum){//和平台环境无关的pcm[Int16]输入
|
||||
var This=this,set=This.set,engineCtx=This.engineCtx;
|
||||
var bufferSampleRate=This.srcSampleRate;
|
||||
var size=pcm.length;
|
||||
var powerLevel=Recorder.PowerLevel(sum,size);
|
||||
|
||||
var buffers=This.buffers;
|
||||
var bufferFirstIdx=buffers.length;//之前的buffer都是经过onProcess处理好的,不允许再修改
|
||||
buffers.push(pcm);
|
||||
|
||||
//有engineCtx时会被覆盖,这里保存一份
|
||||
var buffersThis=buffers;
|
||||
var bufferFirstIdxThis=bufferFirstIdx;
|
||||
|
||||
//卡顿丢失补偿:因为设备很卡的时候导致H5接收到的数据量不够造成播放时候变速,结果比实际的时长要短,此处保证了不会变短,但不能修复丢失的音频数据造成音质变差。当前算法采用输入时间侦测下一帧是否需要添加补偿帧,需要(6次输入||超过1秒)以上才会开始侦测,如果滑动窗口内丢失超过1/3就会进行补偿
|
||||
var now=Date.now();
|
||||
var pcmTime=Math.round(size/bufferSampleRate*1000);
|
||||
This.envInLast=now;
|
||||
if(This.buffers.length==1){//记下首个录音数据的录制时间
|
||||
This.envInFirst=now-pcmTime;
|
||||
};
|
||||
var envInFixTs=This.envInFixTs;
|
||||
envInFixTs.splice(0,0,{t:now,d:pcmTime});
|
||||
//保留3秒的计数滑动窗口,另外超过3秒的停顿不补偿
|
||||
var tsInStart=now,tsPcm=0;
|
||||
for(var i=0;i<envInFixTs.length;i++){
|
||||
var o=envInFixTs[i];
|
||||
if(now-o.t>3000){
|
||||
envInFixTs.length=i;
|
||||
break;
|
||||
};
|
||||
tsInStart=o.t;
|
||||
tsPcm+=o.d;
|
||||
};
|
||||
//达到需要的数据量,开始侦测是否需要补偿
|
||||
var tsInPrev=envInFixTs[1];
|
||||
var tsIn=now-tsInStart;
|
||||
var lost=tsIn-tsPcm;
|
||||
if( lost>tsIn/3 && (tsInPrev&&tsIn>1000 || envInFixTs.length>=6) ){
|
||||
//丢失过多,开始执行补偿
|
||||
var addTime=now-tsInPrev.t-pcmTime;//距离上次输入丢失这么多ms
|
||||
if(addTime>pcmTime/5){//丢失超过本帧的1/5
|
||||
var fixOpen=!set.disableEnvInFix;
|
||||
CLog("["+now+"]"+(fixOpen?"":"未")+"补偿"+addTime+"ms",3);
|
||||
This.envInFix+=addTime;
|
||||
|
||||
//用静默进行补偿
|
||||
if(fixOpen){
|
||||
var addPcm=new Int16Array(addTime*bufferSampleRate/1000);
|
||||
size+=addPcm.length;
|
||||
buffers.push(addPcm);
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
var sizeOld=This.recSize,addSize=size;
|
||||
var bufferSize=sizeOld+addSize;
|
||||
This.recSize=bufferSize;//此值在onProcess后需要修正,可能新数据被修改
|
||||
|
||||
|
||||
//此类型有边录边转码(Worker)支持,开启实时转码
|
||||
if(engineCtx){
|
||||
//转换成set的采样率
|
||||
var chunkInfo=Recorder.SampleData(buffers,bufferSampleRate,set.sampleRate,engineCtx.chunkInfo);
|
||||
engineCtx.chunkInfo=chunkInfo;
|
||||
|
||||
sizeOld=engineCtx.pcmSize;
|
||||
addSize=chunkInfo.data.length;
|
||||
bufferSize=sizeOld+addSize;
|
||||
engineCtx.pcmSize=bufferSize;//此值在onProcess后需要修正,可能新数据被修改
|
||||
|
||||
buffers=engineCtx.pcmDatas;
|
||||
bufferFirstIdx=buffers.length;
|
||||
buffers.push(chunkInfo.data);
|
||||
bufferSampleRate=chunkInfo.sampleRate;
|
||||
};
|
||||
|
||||
var duration=Math.round(bufferSize/bufferSampleRate*1000);
|
||||
var bufferNextIdx=buffers.length;
|
||||
var bufferNextIdxThis=buffersThis.length;
|
||||
|
||||
//允许异步处理buffer数据
|
||||
var asyncEnd=function(){
|
||||
//重新计算size,异步的早已减去添加的,同步的需去掉本次添加的然后重新计算
|
||||
var num=asyncBegin?0:-addSize;
|
||||
var hasClear=buffers[0]==null;
|
||||
for(var i=bufferFirstIdx;i<bufferNextIdx;i++){
|
||||
var buffer=buffers[i];
|
||||
if(buffer==null){//已被主动释放内存,比如长时间实时传输录音时
|
||||
hasClear=1;
|
||||
}else{
|
||||
num+=buffer.length;
|
||||
|
||||
//推入后台边录边转码
|
||||
if(engineCtx&&buffer.length){
|
||||
This[set.type+"_encode"](engineCtx,buffer);
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
//同步清理This.buffers,不管buffers到底清了多少个,buffersThis是使用不到的进行全清
|
||||
if(hasClear && engineCtx){
|
||||
var i=bufferFirstIdxThis;
|
||||
if(buffersThis[0]){
|
||||
i=0;
|
||||
};
|
||||
for(;i<bufferNextIdxThis;i++){
|
||||
buffersThis[i]=null;
|
||||
};
|
||||
};
|
||||
|
||||
//统计修改后的size,如果异步发生clear要原样加回来,同步的无需操作
|
||||
if(hasClear){
|
||||
num=asyncBegin?addSize:0;
|
||||
|
||||
buffers[0]=null;//彻底被清理
|
||||
};
|
||||
if(engineCtx){
|
||||
engineCtx.pcmSize+=num;
|
||||
}else{
|
||||
This.recSize+=num;
|
||||
};
|
||||
};
|
||||
//实时回调处理数据,允许修改或替换上次回调以来新增的数据 ,但是不允许修改已处理过的,不允许增删第一维数组 ,允许将第二维数组任意修改替换成空数组也可以
|
||||
var asyncBegin=set.onProcess(buffers,powerLevel,duration,bufferSampleRate,bufferFirstIdx,asyncEnd);
|
||||
|
||||
if(asyncBegin===true){
|
||||
//开启了异步模式,onProcess已接管buffers新数据,立即清空,避免出现未处理的数据
|
||||
var hasClear=0;
|
||||
for(var i=bufferFirstIdx;i<bufferNextIdx;i++){
|
||||
if(buffers[i]==null){//已被主动释放内存,比如长时间实时传输录音时 ,但又要开启异步模式,此种情况是非法的
|
||||
hasClear=1;
|
||||
}else{
|
||||
buffers[i]=new Int16Array(0);
|
||||
};
|
||||
};
|
||||
|
||||
if(hasClear){
|
||||
CLog("未进入异步前不能清除buffers",3);
|
||||
}else{
|
||||
//还原size,异步结束后再统计仅修改后的size,如果发生clear要原样加回来
|
||||
if(engineCtx){
|
||||
engineCtx.pcmSize-=addSize;
|
||||
}else{
|
||||
This.recSize-=addSize;
|
||||
};
|
||||
};
|
||||
}else{
|
||||
asyncEnd();
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//开始录音,需先调用open;只要open成功时,调用此方法是安全的,如果未open强行调用导致的内部错误将不会有任何提示,stop时自然能得到错误
|
||||
,start:function(){
|
||||
var This=this,ctx=Recorder.Ctx;
|
||||
|
||||
var isOpen=1;
|
||||
if(This.set.sourceStream){//直接提供了流,仅判断是否调用了open
|
||||
if(!This.Stream){
|
||||
isOpen=0;
|
||||
}
|
||||
}else if(!Recorder.IsOpen()){//监测全局麦克风是否打开并且有效
|
||||
isOpen=0;
|
||||
};
|
||||
if(!isOpen){
|
||||
CLog("未open",1);
|
||||
return;
|
||||
};
|
||||
CLog("开始录音");
|
||||
|
||||
This._stop();
|
||||
This.state=0;
|
||||
This.envStart(null,ctx.sampleRate);
|
||||
|
||||
//检查open过程中stop是否已经调用过
|
||||
if(This._SO&&This._SO+1!=This._S){//上面调用过一次 _stop
|
||||
//open未完成就调用了stop,此种情况终止start。也应尽量避免出现此情况
|
||||
CLog("start被中断",3);
|
||||
return;
|
||||
};
|
||||
This._SO=0;
|
||||
|
||||
var end=function(){
|
||||
This.state=1;
|
||||
This.resume();
|
||||
};
|
||||
if(ctx.state=="suspended"){
|
||||
ctx.resume().then(function(){
|
||||
CLog("ctx resume");
|
||||
end();
|
||||
});
|
||||
}else{
|
||||
end();
|
||||
};
|
||||
}
|
||||
/*暂停录音*/
|
||||
,pause:function(){
|
||||
var This=this;
|
||||
if(This.state){
|
||||
This.state=2;
|
||||
CLog("pause");
|
||||
delete This._streamStore().Stream._call[This.id];
|
||||
};
|
||||
}
|
||||
/*恢复录音*/
|
||||
,resume:function(){
|
||||
var This=this;
|
||||
if(This.state){
|
||||
			This.state=1;
			CLog("resume");
			This.envResume();
			
			This._streamStore().Stream._call[This.id]=function(pcm,sum){
				if(This.state==1){
					This.envIn(pcm,sum);
				};
			};
		};
	}
	
	
	,_stop:function(keepEngine){
		var This=this,set=This.set;
		if(!This.isMock){
			This._S++;
		};
		if(This.state){
			This.pause();
			This.state=0;
		};
		if(!keepEngine && This[set.type+"_stop"]){
			This[set.type+"_stop"](This.engineCtx);
			This.engineCtx=0;
		};
	}
	/*
	Stop recording and return the recorded data as a blob object.
		True(blob,duration) blob: recording data in audio/mp3|wav format
			duration: recording length in milliseconds
		False(msg)
		autoClose: false, optional; whether to call close() automatically, defaults to false
	*/
	,stop:function(True,False,autoClose){
		var This=this,set=This.set,t1;
		CLog("Stop "+(This.envInLast?This.envInLast-This.envInFirst+"ms 补"+This.envInFix+"ms":"-"));
		
		var end=function(){
			This._stop();//completely shut down the engineCtx
			if(autoClose){
				This.close();
			};
		};
		var err=function(msg){
			CLog("结束录音失败:"+msg,1);
			False&&False(msg);
			end();
		};
		var ok=function(blob,duration){
			CLog("结束录音 编码"+(Date.now()-t1)+"ms 音频"+duration+"ms/"+blob.size+"b");
			if(set.takeoffEncodeChunk){//output has been taken over, so the blob length is 0 here
				CLog("启用takeoffEncodeChunk后stop返回的blob长度为0不提供音频数据",3);
			}else if(blob.size<Math.max(100,duration/2)){//less than 0.5k per second of audio?
				err("生成的"+set.type+"无效");
				return;
			};
			True&&True(blob,duration);
			end();
		};
		if(!This.isMock){
			if(!This.state){
				err("未开始录音");
				return;
			};
			This._stop(true);
		};
		var size=This.recSize;
		if(!size){
			err("未采集到录音");
			return;
		};
		if(!This.buffers[0]){
			err("音频被释放");
			return;
		};
		if(!This[set.type]){
			err("未加载"+set.type+"编码器");
			return;
		};
		
		//environment check; only needed here for mock calls, since open() has already checked
		if(This.isMock){
			var checkMsg=This.envCheck(This.mockEnvInfo||{envName:"mock",canProcess:false});//a mock without environment info has no onProcess callback
			if(checkMsg){
				err("录音错误:"+checkMsg);
				return;
			};
		};
		
		//this type supports transcoding while recording (in a Worker)
		var engineCtx=This.engineCtx;
		if(This[set.type+"_complete"]&&engineCtx){
			var duration=Math.round(engineCtx.pcmSize/set.sampleRate*1000);//the resampled data length may differ slightly from the buffers length; this is a precision issue of continuous sample-rate conversion
			
			t1=Date.now();
			This[set.type+"_complete"](engineCtx,function(blob){
				ok(blob,duration);
			},err);
			return;
		};
		
		//standard UI-thread transcoding; adjust the sample rate
		t1=Date.now();
		var chunk=Recorder.SampleData(This.buffers,This.srcSampleRate,set.sampleRate);
		
		set.sampleRate=chunk.sampleRate;
		var res=chunk.data;
		var duration=Math.round(res.length/set.sampleRate*1000);
		
		CLog("采样"+size+"->"+res.length+" 花:"+(Date.now()-t1)+"ms");
		
		setTimeout(function(){
			t1=Date.now();
			This[set.type](res,function(blob){
				ok(blob,duration);
			},function(msg){
				err(msg);
			});
		});
	}
	
};


if(window.Recorder){
	window.Recorder.Destroy();
};
window.Recorder=Recorder;

//end ****end of the copied source*****
Recorder.LM=LM;

//1-pixel image URL used for traffic statistics; set it to an empty string to opt out of statistics
Recorder.TrafficImgUrl="//ia.51.la/go1?id=20469973&pvFlag=1";
Recorder.Traffic=function(){
	var imgUrl=Recorder.TrafficImgUrl;
	if(imgUrl){
		var data=Recorder.Traffic;
		var idf=location.href.replace(/#.*/,"");
		
		if(imgUrl.indexOf("//")==0){
			//add an http(s) prefix to the url; under the file: protocol it will not load without one
			if(/^https:/i.test(idf)){
				imgUrl="https:"+imgUrl;
			}else{
				imgUrl="http:"+imgUrl;
			};
		};
		
		if(!data[idf]){
			data[idf]=1;
			
			var img=new Image();
			img.src=imgUrl;
			CLog("Traffic Analysis Image: Recorder.TrafficImgUrl="+Recorder.TrafficImgUrl);
		};
	};
};

}));
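The stop() contract documented above is easiest to read as a short usage sketch. The snippet below is a minimal, hypothetical example (the 3-second timeout and console logging are illustrative, not part of the library); it uses only the open/start/stop/close calls and the True/False callbacks described in the comments, and assumes recorder-core.js plus the wav encoder from web/static/js/wav.js are already loaded on the page.

// Minimal usage sketch of the Recorder API documented above (assumed setup: scripts loaded).
var rec = Recorder({ type: "wav", sampleRate: 16000, bitRate: 16 });

rec.open(function () {            // microphone permission granted, resources acquired
    rec.start();                  // begin buffering PCM

    setTimeout(function () {      // stop after 3 seconds, purely for illustration
        rec.stop(function (blob, duration) {
            // success callback ("True"): blob is audio/wav here, duration is in milliseconds
            console.log("recorded " + duration + "ms, " + blob.size + " bytes");
            rec.close();          // release the microphone when done
        }, function (msg) {
            // failure callback ("False")
            console.error("stop failed: " + msg);
            rec.close();
        });
    }, 3000);
}, function (msg, isUserNotAllow) {
    // open failed: permission denied (isUserNotAllow) or environment unsupported
    console.error("open failed: " + msg);
});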
86
web/static/js/wav.js
Normal file
@@ -0,0 +1,86 @@
/*
wav encoder + encoding engine
https://github.com/xiangyuecn/Recorder

mp3 and wav are the recommended formats, and the code gives these two formats priority.
Browser support:
https://developer.mozilla.org/en-US/docs/Web/HTML/Supported_media_formats

Encoding principle: prepend a 44-byte wav header to the pcm data and it becomes a wav file; the pcm data is simply the raw buffers data from Recorder (resampled), stored little-endian (LE) when 16-bit, i.e. essentially no encoding is applied at all.
*/
(function(){
"use strict";

Recorder.prototype.enc_wav={
	stable:true
	,testmsg:"支持位数8位、16位(填在比特率里面),采样率取值无限制"
};
Recorder.prototype.wav=function(res,True,False){
	var This=this,set=This.set
		,size=res.length
		,sampleRate=set.sampleRate
		,bitRate=set.bitRate==8?8:16;
	
	//encoded data layout references: https://github.com/mattdiamond/Recorderjs https://www.cnblogs.com/blqw/p/3782420.html https://www.cnblogs.com/xiaoqi/p/6993912.html
	var dataLength=size*(bitRate/8);
	var buffer=new ArrayBuffer(44+dataLength);
	var data=new DataView(buffer);
	
	var offset=0;
	var writeString=function(str){
		for (var i=0;i<str.length;i++,offset++) {
			data.setUint8(offset,str.charCodeAt(i));
		};
	};
	var write16=function(v){
		data.setUint16(offset,v,true);
		offset+=2;
	};
	var write32=function(v){
		data.setUint32(offset,v,true);
		offset+=4;
	};
	
	/* RIFF identifier */
	writeString('RIFF');
	/* RIFF chunk length */
	write32(36+dataLength);
	/* RIFF type */
	writeString('WAVE');
	/* format chunk identifier */
	writeString('fmt ');
	/* format chunk length */
	write32(16);
	/* sample format (raw) */
	write16(1);
	/* channel count */
	write16(1);
	/* sample rate */
	write32(sampleRate);
	/* byte rate (sample rate * block align) */
	write32(sampleRate*(bitRate/8));
	/* block align (channel count * bytes per sample) */
	write16(bitRate/8);
	/* bits per sample */
	write16(bitRate);
	/* data chunk identifier */
	writeString('data');
	/* data chunk length */
	write32(dataLength);
	//write the sample data
	if(bitRate==8) {
		for(var i=0;i<size;i++,offset++) {
			//the 16-to-8-bit conversion is said to come from Lei Xiaohua https://blog.csdn.net/sevennight1989/article/details/85376149 ; the details are a bit clearer than blqw's proportional algorithm, although both produce audible noise
			var val=(res[i]>>8)+128;
			data.setInt8(offset,val,true);
		};
	}else{
		for (var i=0;i<size;i++,offset+=2){
			data.setInt16(offset,res[i],true);
		};
	};
	
	True(new Blob([data.buffer],{type:"audio/wav"}));
}
})();
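To make the 44-byte header written above easier to check, here is a small, hypothetical helper (not part of wav.js) that reads the header back from the Blob returned by Recorder.prototype.wav and logs each field at the offset where it was written. It relies only on the standard Blob.slice/arrayBuffer and DataView APIs available in current browsers.

// Hypothetical helper: parse the 44-byte RIFF/WAVE header produced above and log its fields.
function inspectWavHeader(blob) {
    return blob.slice(0, 44).arrayBuffer().then(function (buf) {
        var v = new DataView(buf);
        var tag = function (off) { // read a 4-character ASCII tag
            return String.fromCharCode(v.getUint8(off), v.getUint8(off + 1),
                                       v.getUint8(off + 2), v.getUint8(off + 3));
        };
        var info = {
            riff: tag(0),                        // "RIFF"
            riffSize: v.getUint32(4, true),      // 36 + dataLength
            wave: tag(8),                        // "WAVE"
            fmt: tag(12),                        // "fmt "
            audioFormat: v.getUint16(20, true),  // 1 = raw PCM
            channels: v.getUint16(22, true),     // 1 (mono)
            sampleRate: v.getUint32(24, true),
            byteRate: v.getUint32(28, true),
            blockAlign: v.getUint16(32, true),
            bitsPerSample: v.getUint16(34, true),
            data: tag(36),                       // "data"
            dataLength: v.getUint32(40, true)
        };
        console.log(info);
        return info;
    });
}

For a 16 kHz, 16-bit mono recording the logged object should show audioFormat 1, channels 1, sampleRate 16000, blockAlign 2 and bitsPerSample 16.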
509
web/templates/index.html
Normal file
@@ -0,0 +1,509 @@
<!DOCTYPE HTML>
<html>

<head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0">
	<link rel="shortcut icon" type="image/png" href="../static/img/bird-sm.png">

	<title>MockingBird Web Server</title>

	<script src="{{ url_for('static',filename='js/recorder-core.js') }}"></script>
	<script src="{{ url_for('static',filename='js/mp3.js') }}"></script>
	<script src="{{ url_for('static',filename='js/wav.js') }}"></script>
	<script src="{{ url_for('static',filename='js/mp3-engine.js') }}"></script>
	<script src="{{ url_for('static',filename='js/frequency.histogram.view.js') }}"></script>
	<script src="{{ url_for('static',filename='js/lib.fft.js') }}"></script>

	<script src="{{ url_for('static',filename='js/jquery.js') }}"></script>
</head>

<body>

	<div class="main">

		<div class="mainBox">
			<div class="title">
				<div style="width: 15%;float: left;margin-left: 5%;">
					<img src="../static/img/bird.png" style="width: 100%;border-radius:50%;" />
				</div>
				<div style="width: 80%;height: 15%;margin-left: 15%;overflow: hidden;">
					<div style="margin-left: 5%;margin-top: 15px;font-size: xx-large;font-weight: bolder;">
						拟声鸟工具箱
					</div>
					<div style="margin-left: 5%;margin-top: 3px;font-size: large;">
						<a href="https://github.com/babysor/MockingBird" target="_blank">https://github.com/babysor/MockingBird</a>
					</div>
				</div>
			</div>

			<div style="margin-left: 5%;margin-top: 50px;width: 90%;">
				<div style="font-size: larger;font-weight: bolder;">1. 请输入中文</div>
				<textarea id="user_input_text"
					style="border:1px solid #ccc; width: 100%; height: 100px; font-size: 15px; margin-top: 10px;"></textarea>
			</div>
			<div class="pd btns" style="margin-left: 5%;margin-top: 20px;width: 90%;">
				<!-- <div>
					<button onclick="recOpen()" style="margin-right:10px">打开录音,请求权限</button>
					<button onclick="recClose()" style="margin-right:0">关闭录音,释放资源</button>
				</div> -->
				<div style="font-size: larger;font-weight: bolder;">2. 请直接录音,点击停止结束</div>
				<button onclick="recStart()">录制</button>
				<button onclick="recStop()">停止</button>
				<button onclick="recPlay()">播放</button>
			</div>
			<div class="pd btns" style="margin-left: 5%;margin-top: 20px;width: 90%;">
				<div style="font-size: larger;font-weight: bolder;">或上传音频</div>
				<input type="file" id="fileInput" accept=".wav" />
				<label for="fileInput">选择音频</label>
				<div id="audio1"></div>
			</div>
			<div class="pd btns" style="margin-left: 5%;margin-top: 20px;width: 90%;">
				<div style="font-size: larger;font-weight: bolder;">3. 选择Synthesizer模型</div>
				<span class="box">
					<select id="select">
					</select>
				</span>
			</div>
			<div class="pd btns" style="margin-left: 5%;margin-top: 20px;width: 90%; text-align:right;">
				<button id="upload" onclick="recUpload()">上传合成</button>
			</div>

			<!-- waveform drawing area -->
			<!-- <div class="pd recpower">
				<div style="height:40px;width:100%;background:#fff;position:relative;">
					<div class="recpowerx" style="height:40px;background:#ff3295;position:absolute;"></div>
					<div class="recpowert" style="padding-left:50px; line-height:40px; position: relative;"></div>
				</div>
			</div> -->
			<!-- <div class="pd waveBox" style="height:100px;">
				<div style="border:1px solid #ccc;display:inline-block; width: 100%; height: 100px;">
					<div style="height:100px; width: 100%; background-color: #5da1f5; position: relative;left: 0px;top: 0px;z-index: 10;"
						class="recwave"></div>
					<div
						style="background-color: transparent;position: relative;top: -80px;left: 30%;z-index: 20;font-size: 48px;color: #fff;">
						音频预览</div>
				</div>
			</div> -->
			<div class="reclog" style="margin-left: 5%;margin-top: 20px;width: 90%;"></div>
		</div>
	</div>


	<script>

		$("#fileInput").change(function () {
			var file = $("#fileInput").get(0).files;
			if (file.length > 0) {
				var path = URL.createObjectURL(file[0]);
				var audio = document.createElement('audio');
				audio.src = path;
				audio.controls = true;
				$('#audio1').empty().append(audio);
			}
		});

		fetch("/api/synthesizers", {
			method: 'get',
			headers: {
				"X-CSRFToken": "{{ csrf_token() }}"
			}
		}).then(function (res) {
			if (!res.ok) throw Error(res.statusText);
			return res.json();
		}).then(function (data) {
			for (var synt of data) {
				var option = document.createElement('option');
				option.text = synt.name;
				option.value = synt.path;
				$("#select").append(option);
			}
		}).catch(function (err) {
			console.log('Error: ' + err.message);
		})

		var rec, wave, recBlob;
		/**Call open to request recording permission**/
		var recOpen = function () {//usually called when the record button or recording UI is shown, so the later click to start recording goes through without friction
			rec = null;
			wave = null;
			recBlob = null;
			var newRec = Recorder({
				type: "wav", bitRate: 16, sampleRate: 16000
				, onProcess: function (buffers, powerLevel, bufferDuration, bufferSampleRate, newBufferIdx, asyncEnd) {
					//real-time recording callback, invoked roughly 12 times per second
					// document.querySelector(".recpowerx").style.width = powerLevel + "%";
					// document.querySelector(".recpowert").innerText = bufferDuration + " / " + powerLevel;

					//visual waveform drawing
					// wave.input(buffers[buffers.length - 1], powerLevel, bufferSampleRate);
				}
			});

			createDelayDialog(); //optionally show a dialog, in case a mobile browser ignores the permission request and (especially UC-style domestic browsers) never fires any callback; the dialog markup itself is omitted in this demo
			newRec.open(function () {//microphone permission granted and resources acquired
				dialogCancel(); //if the dialog was opened, cancel it here

				rec = newRec;

				//creating the audio visualization here is well supported by browsers
				// wave = Recorder.FrequencyHistogramView({ elem: ".recwave" });

				reclog("已打开录音,可以点击录制开始录音了", 2);
			}, function (msg, isUserNotAllow) {//the user denied permission, or recording is unsupported
				dialogCancel(); //if the dialog was opened, cancel it here
				reclog((isUserNotAllow ? "UserNotAllow," : "") + "打开录音失败:" + msg, 1);
			});

			window.waitDialogClick = function () {
				dialogCancel();
				reclog("打开失败:权限请求被忽略,<span style='color:#f00'>用户主动点击的弹窗</span>", 1);
			};
		};


		/**Close recording and release resources**/
		function recClose() {
			if (rec) {
				rec.close();
				reclog("已关闭");
			} else {
				reclog("未打开录音", 1);
			};
		};


		/**Start recording**/
		function recStart() {//start and stop may only be called after recording has been opened
			if (rec && Recorder.IsOpen()) {
				recBlob = null;
				rec.start();
				reclog("已开始录音...");
			} else {
				reclog("未打开录音,请求录音权限,如已允许录音权限,请再次点击录制", 1);
				recOpen();
			};
		};

		function recStop() {
			rec.stop(function (blob, duration) {
				rec.close();//release recording resources
				console.log(blob, (window.URL || webkitURL).createObjectURL(blob), "时长:" + duration + "ms");
				recBlob = blob;
				reclog("已录制wav:" + duration + "ms " + blob.size + "字节,可以点击播放、上传了", 2);
			}, function (msg) {
				reclog("录音失败:" + msg, 1);
			});
		};


		/**Playback**/
		function recPlay() {
			if (!recBlob) {
				reclog("请先录音,然后停止后再播放", 1);
				return;
			};
			var cls = ("a" + Math.random()).replace(".", "");
			reclog('播放中: <span class="' + cls + '"></span>');
			var audio = document.createElement("audio");
			audio.controls = true;
			document.querySelector("." + cls).appendChild(audio);
			//simply use an object URL as the playback address; remember to revokeObjectURL when no longer needed, otherwise it keeps holding memory
			audio.src = (window.URL || webkitURL).createObjectURL(recBlob);
			audio.play();

			setTimeout(function () {
				(window.URL || webkitURL).revokeObjectURL(audio.src);
			}, 5000);
		};

		function playResult(resultBlob) {
			if (!resultBlob) {
				reclog("服务端出错,请重试", 1);
				return;
			};
			var cls = ("a" + Math.random()).replace(".", "");
			reclog('播放中: <span class="' + cls + '"></span>');
			var audio = document.createElement("audio");
			audio.controls = true;
			document.querySelector("." + cls).appendChild(audio);
			//simply use an object URL as the playback address; remember to revokeObjectURL when no longer needed, otherwise it keeps holding memory
			audio.src = (window.URL || webkitURL).createObjectURL(resultBlob);
			audio.play();

			setTimeout(function () {
				(window.URL || webkitURL).revokeObjectURL(audio.src);
			}, 12000);
		};

		/**Upload**/
		function recUpload() {
			var blob;
			var loadedAudios = $("#fileInput").get(0).files;
			if (loadedAudios.length > 0) {
				blob = loadedAudios[0];
			} else {
				blob = recBlob;
			}
			if (!blob) {
				reclog("请先录音或选择音频,然后停止后再上传", 1);
				return;
			};

			//this demo posts the blob directly with fetch + FormData; adjust to your own request method in real use
			//once recording ends the blob can be read with FileReader, or uploaded with FormData as done below
			var api = "/api/synthesize";

			reclog("开始上传到" + api + ",请求稍后...");

			var reader = new FileReader();
			reader.onloadend = function () {
				var csrftoken = "{{ csrf_token() }}";
				var user_input_text = document.getElementById("user_input_text");
				var input_text = user_input_text.value;
				var postData = new FormData();
				postData.append("text", input_text);
				postData.append("file", blob);
				var sel = document.getElementById("select");
				var path = sel.options[sel.selectedIndex].value;
				if (!!path) {
					postData.append("synt_path", path);
				}

				fetch(api, {
					method: 'post',
					headers: {
						"X-CSRFToken": csrftoken
					},
					body: postData
				}).then(function (res) {
					if (!res.ok) throw Error(res.statusText);
					return res.blob();
				}).then(function (blob) {
					playResult(blob);
				}).catch(function (err) {
					console.log('Error: ' + err.message);
				})
			};
			reader.readAsDataURL(blob);//note: the FileReader result itself is not used; the upload posts the raw blob via FormData once the read completes
		};

		//recOpen: we can optionally show a dialog, to guard against a third case on mobile browsers: the user ignores the request and (especially UC-style domestic browsers) no callback ever fires
		var showDialog = function () {
			if (!/mobile/i.test(navigator.userAgent)) {
				return;//only enable the "no permission request" detection on mobile
			};
			dialogCancel();

			//show the dialog; you should use your own dialog mechanism
			var div = document.createElement("div");
			document.body.appendChild(div);
			div.innerHTML = (''
				+ '<div class="waitDialog" style="z-index:99999;width:100%;height:100%;top:0;left:0;position:fixed;background:rgba(0,0,0,0.3);">'
				+ '<div style="display:flex;height:100%;align-items:center;">'
				+ '<div style="flex:1;"></div>'
				+ '<div style="width:240px;background:#fff;padding:15px 20px;border-radius: 10px;">'
				+ '<div style="padding-bottom:10px;">录音功能需要麦克风权限,请允许;如果未看到任何请求,请点击忽略~</div>'
				+ '<div style="text-align:center;"><a onclick="waitDialogClick()" style="color:#0B1">忽略</a></div>'
				+ '</div>'
				+ '<div style="flex:1;"></div>'
				+ '</div>'
				+ '</div>');
		};
		var createDelayDialog = function () {
			dialogInt = setTimeout(function () {//open the dialog after 8 seconds, to detect the case where the browser never issued a permission request; placing the timer before open makes it easy to cancel as soon as a callback arrives (whether open calls back synchronously or asynchronously)
				showDialog();
			}, 8000);
		};
		var dialogInt;
		var dialogCancel = function () {
			clearTimeout(dialogInt);

			//close the dialog; you should use your own dialog mechanism
			var elems = document.querySelectorAll(".waitDialog");
			for (var i = 0; i < elems.length; i++) {
				elems[i].parentNode.removeChild(elems[i]);
			};
		};
		//end of the recOpen dialog
	</script>

	<!--the following can be ignored-->
	<script>
		function reclog(s, color) {
			var now = new Date();
			var t = ("0" + now.getHours()).substr(-2)
				+ ":" + ("0" + now.getMinutes()).substr(-2)
				+ ":" + ("0" + now.getSeconds()).substr(-2);
			var div = document.createElement("div");
			var elem = document.querySelector(".reclog");
			elem.insertBefore(div, elem.firstChild);
			div.innerHTML = '<div style="color:' + (!color ? "" : color == 1 ? "#327de8" : color == 2 ? "#5da1f5" : color) + '">[' + t + ']' + s + '</div>';
		};
		window.onerror = function (message, url, lineNo, columnNo, error) {
			reclog('<span style="color:red">【Uncaught Error】' + message + '<pre>' + "at:" + lineNo + ":" + columnNo + " url:" + url + "\n" + (error && error.stack || "不能获得错误堆栈") + '</pre></span>');
		};
	</script>

	<script>
		if (/mobile/i.test(navigator.userAgent)) {
			//load the console component on mobile
			var elem = document.createElement("script");
			elem.setAttribute("type", "text/javascript");

			elem.setAttribute("src", "{{ url_for('static',filename='js/eruda.min.js') }}");
			document.body.appendChild(elem);
			elem.onload = function () {
				eruda.init();
			};
		};
	</script>


	<style>
		body {
			word-wrap: break-word;
			background: #f5f5f5 center top no-repeat;
			background-size: auto 680px;
		}

		pre {
			white-space: pre-wrap;
		}

		a {
			text-decoration: none;
			color: #327de8;
		}

		a:hover {
			color: #5da1f5;
		}

		.main {
			max-width: 700px;
			margin: 0 auto;
			padding-bottom: 80px
		}

		.mainBox {
			margin-top: 12px;
			padding: 12px;
			border-radius: 6px;
			background: #fff;
			box-shadow: 2px 2px 3px #aaa;
		}


		.btns button {
			display: inline-block;
			cursor: pointer;
			border: none;
			border-radius: 3px;
			background: #5698c3;
			color: #fff;
			padding: 0 15px;
			margin: 3px 10px 3px 0;
			width: 70px;
			line-height: 36px;
			height: 36px;
			overflow: hidden;
			vertical-align: middle;
		}

		.btns #upload {
			background: #5698c3;
			color: #fff;
			width: 100px;
			height: 42px;
		}

		.btns button:active {
			background: #5da1f5
		}

		.btns button:hover {
			background: #5da1f5
		}

		.pd {
			padding: 0 0 6px 0;
		}

		.lb {
			display: inline-block;
			vertical-align: middle;
			background: #327de8;
			color: #fff;
			font-size: 14px;
			padding: 2px 8px;
			border-radius: 99px;
		}

		#fileInput {
			width: 0.1px;
			height: 0.1px;
			opacity: 0;
			overflow: hidden;
			position: absolute;
			z-index: -1;
		}

		#fileInput + label {
			padding: 0 15px;
			border-radius: 4px;
			color: white;
			background-color: #5698c3;
			display: inline-block;
			width: 70px;
			line-height: 36px;
			height: 36px;
		}

		#fileInput + label {
			cursor: pointer; /* "hand" cursor */
		}

		#fileInput:focus + label,
		#fileInput + label:hover {
			background-color: #5da1f5;
		}

		.box select {
			background-color: #5698c3;
			color: white;
			padding: 8px;
			width: 120px;
			border: none;
			border-radius: 4px;
			font-size: 0.5em;
			outline: none;
			margin: 3px 10px 3px 0;
		}

		.box::before {
			content: "\f13a";
			position: absolute;
			top: 0;
			right: 0;
			width: 20%;
			height: 100%;
			text-align: center;
			font-size: 28px;
			line-height: 45px;
			color: rgba(255, 255, 255, 0.5);
			background-color: rgba(255, 255, 255, 0.1);
			pointer-events: none;
		}

		.box:hover::before {
			color: rgba(255, 255, 255, 0.6);
			background-color: rgba(255, 255, 255, 0.2);
		}

		.box select option {
			padding: 30px;
		}
	</style>

</body>

</html>
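The page above assumes two server endpoints: GET /api/synthesizers returns a JSON array of {name, path} objects used to fill the model dropdown, and POST /api/synthesize accepts multipart form data (text, file, optional synt_path) plus an X-CSRFToken header and responds with the synthesized audio as a binary body. The helper below is a hypothetical distillation of that client-side contract (the function name and error handling are illustrative, not part of the template); it mirrors the fetch call inside recUpload.

// Hypothetical helper mirroring the /api/synthesize contract used by the page above.
// csrfToken comes from the server-rendered template ("{{ csrf_token() }}" in this page).
function synthesize(text, audioBlob, syntPath, csrfToken) {
    var form = new FormData();
    form.append("text", text);          // the Chinese text to synthesize
    form.append("file", audioBlob);     // reference voice: recorded or uploaded wav
    if (syntPath) {
        form.append("synt_path", syntPath);  // optional synthesizer model path
    }
    return fetch("/api/synthesize", {
        method: "post",
        headers: { "X-CSRFToken": csrfToken },
        body: form
    }).then(function (res) {
        if (!res.ok) throw Error(res.statusText);
        return res.blob();              // synthesized audio, playable via an object URL
    });
}

A caller would hand the resolved Blob straight to playResult, exactly as the inline recUpload handler does.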