From 37f11ab9ceadcd200ff861dc3003bb78f95cd8b4 Mon Sep 17 00:00:00 2001
From: babysor00
Date: Mon, 11 Oct 2021 21:52:15 +0800
Subject: [PATCH] Commit with working GST

Add a Global Style Token (GST) module made of a convolutional + GRU
reference encoder and a style token layer that attends over a learned
token bank with multi-head attention. Tacotron now builds a
GlobalStyleToken, feeds it the speaker embedding and adds the resulting
style embedding to the encoder output in both forward() and generate().

Also in this patch:
- rename the forward() arguments x/m to texts/mels;
- lower the default generate() step limit from 2000 to 200;
- add a "Python: Synth Train" VS Code launch configuration;
- add tensorboard to requirements.txt;
- shorten the "model files not found" message in check_model_paths().
---
Review notes (ignored by git-am):
- GlobalStyleToken.forward returns the (style_embed, keys) tuple produced
  by STL.forward. generate() now unpacks it the same way forward() does;
  calling .expand_as() on the tuple raised AttributeError at inference.
- The shape comment on both self.gst(...) calls now reads [N, 1, E], which
  is what MultiHeadAttention returns for a single query step.

 .vscode/launch.json                |  10 ++-
 requirements.txt                   |   3 +-
 synthesizer/global_style_token.py  | 135 +++++++++++++++++++++++++++++
 synthesizer/gst_hyperparameters.py |  13 +++
 synthesizer/models/tacotron.py     |  26 ++++--
 utils/modelutils.py                |   3 +-
 6 files changed, 178 insertions(+), 12 deletions(-)
 create mode 100644 synthesizer/global_style_token.py
 create mode 100644 synthesizer/gst_hyperparameters.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 2d83598..0f6b728 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -34,6 +34,14 @@
             "program": "demo_toolbox.py",
             "console": "integratedTerminal",
             "args": ["-d","..\\..\\chs"]
-        }
+        },
+        {
+            "name": "Python: Synth Train",
+            "type": "python",
+            "request": "launch",
+            "program": "synthesizer_train.py",
+            "console": "integratedTerminal",
+            "args": ["my_run", "..\\"]
+        },
     ]
 }
diff --git a/requirements.txt b/requirements.txt
index 27cac28..9010c30 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,4 +19,5 @@ flask
 flask_wtf
 flask_cors
 gevent==21.8.0
-flask_restx
\ No newline at end of file
+flask_restx
+tensorboard
\ No newline at end of file
diff --git a/synthesizer/global_style_token.py b/synthesizer/global_style_token.py
new file mode 100644
index 0000000..a884867
--- /dev/null
+++ b/synthesizer/global_style_token.py
@@ -0,0 +1,135 @@
+import torch
+import torch.nn as nn
+import torch.nn.init as init
+import torch.nn.functional as tFunctional
+from synthesizer.gst_hyperparameters import GSTHyperparameters as hp
+
+
+class GlobalStyleToken(nn.Module):
+
+    def __init__(self):
+
+        super().__init__()
+        self.encoder = ReferenceEncoder()
+        self.stl = STL()
+
+    def forward(self, inputs):
+        enc_out = self.encoder(inputs)
+        style_embed = self.stl(enc_out)
+
+        return style_embed
+
+
+class ReferenceEncoder(nn.Module):
+    '''
+    inputs --- [N, Ty/r, n_mels*r]  mels
+    outputs --- [N, ref_enc_gru_size]
+    '''
+
+    def __init__(self):
+
+        super().__init__()
+        K = len(hp.ref_enc_filters)
+        filters = [1] + hp.ref_enc_filters
+        convs = [nn.Conv2d(in_channels=filters[i],
+                           out_channels=filters[i + 1],
+                           kernel_size=(3, 3),
+                           stride=(2, 2),
+                           padding=(1, 1)) for i in range(K)]
+        self.convs = nn.ModuleList(convs)
+        self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=hp.ref_enc_filters[i]) for i in range(K)])
+
+        out_channels = self.calculate_channels(hp.n_mels, 3, 2, 1, K)
+        self.gru = nn.GRU(input_size=hp.ref_enc_filters[-1] * out_channels,
+                          hidden_size=hp.E // 2,
+                          batch_first=True)
+
+    def forward(self, inputs):
+        N = inputs.size(0)
+        out = inputs.view(N, 1, -1, hp.n_mels)  # [N, 1, Ty, n_mels]
+        for conv, bn in zip(self.convs, self.bns):
+            out = conv(out)
+            out = bn(out)
+            out = tFunctional.relu(out)  # [N, 128, Ty//2^K, n_mels//2^K]
+
+        out = out.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
+        T = out.size(1)
+        N = out.size(0)
+        out = out.contiguous().view(N, T, -1)  # [N, Ty//2^K, 128*n_mels//2^K]
+
+        self.gru.flatten_parameters()
+        memory, out = self.gru(out)  # out --- [1, N, E//2]
+
+        return out.squeeze(0)
+
+    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
+        for i in range(n_convs):
+            L = (L - kernel_size + 2 * pad) // stride + 1
+        return L
+
+
+class STL(nn.Module):
+    '''
+    inputs --- [N, E//2]
+    '''
+
+    def __init__(self):
+
+        super().__init__()
+        self.embed = nn.Parameter(torch.FloatTensor(hp.token_num, hp.E // hp.num_heads))
+        d_q = hp.E // 2
+        d_k = hp.E // hp.num_heads
+        # self.attention = MultiHeadAttention(hp.num_heads, d_model, d_q, d_v)
+        self.attention = MultiHeadAttention(query_dim=d_q, key_dim=d_k, num_units=hp.E, num_heads=hp.num_heads)
+
+        init.normal_(self.embed, mean=0, std=0.5)
+
+    def forward(self, inputs):
+        N = inputs.size(0)
+        query = inputs.unsqueeze(1)  # [N, 1, E//2]
+        keys = tFunctional.tanh(self.embed).unsqueeze(0).expand(N, -1, -1)  # [N, token_num, E // num_heads]
+        style_embed = self.attention(query, keys)
+
+        return style_embed, keys
+
+
+class MultiHeadAttention(nn.Module):
+    '''
+    input:
+        query --- [N, T_q, query_dim]
+        key --- [N, T_k, key_dim]
+    output:
+        out --- [N, T_q, num_units]
+    '''
+
+    def __init__(self, query_dim, key_dim, num_units, num_heads):
+
+        super().__init__()
+        self.num_units = num_units
+        self.num_heads = num_heads
+        self.key_dim = key_dim
+
+        self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
+        self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+        self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
+
+    def forward(self, query, key):
+        querys = self.W_query(query)  # [N, T_q, num_units]
+        keys = self.W_key(key)  # [N, T_k, num_units]
+        values = self.W_value(key)
+
+        split_size = self.num_units // self.num_heads
+        querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0)  # [h, N, T_q, num_units/h]
+        keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]
+        values = torch.stack(torch.split(values, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]
+
+        # score = softmax(QK^T / (d_k ** 0.5))
+        scores = torch.matmul(querys, keys.transpose(2, 3))  # [h, N, T_q, T_k]
+        scores = scores / (self.key_dim ** 0.5)
+        scores = tFunctional.softmax(scores, dim=3)
+
+        # out = score * V
+        out = torch.matmul(scores, values)  # [h, N, T_q, num_units/h]
+        out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0)  # [N, T_q, num_units]
+
+        return out
diff --git a/synthesizer/gst_hyperparameters.py b/synthesizer/gst_hyperparameters.py
new file mode 100644
index 0000000..1403144
--- /dev/null
+++ b/synthesizer/gst_hyperparameters.py
@@ -0,0 +1,13 @@
+class GSTHyperparameters():
+    E = 512
+
+    # reference encoder
+    ref_enc_filters = [32, 32, 64, 64, 128, 128]
+
+    # style token layer
+    token_num = 10
+    # token_emb_size = 256
+    num_heads = 8
+
+    n_mels = 256 # Number of Mel banks to generate
+
diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py
index 769f7f9..44407e3 100644
--- a/synthesizer/models/tacotron.py
+++ b/synthesizer/models/tacotron.py
@@ -3,8 +3,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from pathlib import Path
-from typing import Union
+from synthesizer.global_style_token import GlobalStyleToken
 
 
 class HighwayNetwork(nn.Module):
@@ -338,6 +337,7 @@ class Tacotron(nn.Module):
         self.encoder = Encoder(embed_dims, num_chars, encoder_dims,
                                encoder_K, num_highways, dropout)
         self.encoder_proj = nn.Linear(encoder_dims + speaker_embedding_size, decoder_dims, bias=False)
+        self.gst = GlobalStyleToken()
         self.decoder = Decoder(n_mels, encoder_dims, decoder_dims, lstm_dims,
                                dropout, speaker_embedding_size)
         self.postnet = CBHG(postnet_K, n_mels, postnet_dims,
@@ -358,11 +358,11 @@ class Tacotron(nn.Module):
     def r(self, value):
         self.decoder.r = self.decoder.r.new_tensor(value, requires_grad=False)
 
-    def forward(self, x, m, speaker_embedding):
+    def forward(self, texts, mels, speaker_embedding):
         device = next(self.parameters()).device  # use same device as parameters
 
         self.step += 1
-        batch_size, _, steps = m.size()
+        batch_size, _, steps = mels.size()
 
         # Initialise all hidden states and pack into tuple
         attn_hidden = torch.zeros(batch_size, self.decoder_dims, device=device)
@@ -383,7 +383,12 @@ class Tacotron(nn.Module):
 
         # SV2TTS: Run the encoder with the speaker embedding
         # The projection avoids unnecessary matmuls in the decoder loop
-        encoder_seq = self.encoder(x, speaker_embedding)
+        encoder_seq = self.encoder(texts, speaker_embedding)
+        # put after encoder
+        style_embed, _ = self.gst(speaker_embedding) # [N, 1, E]
+        style_embed = style_embed.expand_as(encoder_seq)
+        encoder_seq = encoder_seq + style_embed
+
         encoder_seq_proj = self.encoder_proj(encoder_seq)
 
         # Need a couple of lists for outputs
@@ -391,10 +396,10 @@ class Tacotron(nn.Module):
 
         # Run the decoder loop
         for t in range(0, steps, self.r):
-            prenet_in = m[:, :, t - 1] if t > 0 else go_frame
+            prenet_in = mels[:, :, t - 1] if t > 0 else go_frame
             mel_frames, scores, hidden_states, cell_states, context_vec, stop_tokens = \
                 self.decoder(encoder_seq, encoder_seq_proj, prenet_in,
-                             hidden_states, cell_states, context_vec, t, x)
+                             hidden_states, cell_states, context_vec, t, texts)
             mel_outputs.append(mel_frames)
             attn_scores.append(scores)
             stop_outputs.extend([stop_tokens] * self.r)
@@ -414,7 +419,7 @@ class Tacotron(nn.Module):
 
         return mel_outputs, linear, attn_scores, stop_outputs
 
-    def generate(self, x, speaker_embedding=None, steps=2000):
+    def generate(self, x, speaker_embedding=None, steps=200):
         self.eval()
         device = next(self.parameters()).device  # use same device as parameters
 
@@ -440,6 +445,11 @@ class Tacotron(nn.Module):
         # SV2TTS: Run the encoder with the speaker embedding
         # The projection avoids unnecessary matmuls in the decoder loop
         encoder_seq = self.encoder(x, speaker_embedding)
+
+        # put after encoder
+        style_embed, _ = self.gst(speaker_embedding) # [N, 1, E]
+        style_embed = style_embed.expand_as(encoder_seq)
+        encoder_seq = encoder_seq + style_embed
         encoder_seq_proj = self.encoder_proj(encoder_seq)
 
         # Need a couple of lists for outputs
diff --git a/utils/modelutils.py b/utils/modelutils.py
index 6acaa98..4b2efc7 100644
--- a/utils/modelutils.py
+++ b/utils/modelutils.py
@@ -11,7 +11,6 @@ def check_model_paths(encoder_path: Path, synthesizer_path: Path, vocoder_path:
 
     # If none of the paths exist, remind the user to download models if needed
     print("********************************************************************************")
-    print("Error: Model files not found. Follow these instructions to get and install the models:")
-    print("https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models")
+    print("Error: Model files not found. Please download the models")
     print("********************************************************************************\n")
     quit(-1)