From c5998bfe71d40e4947a04037c8a9f7e2e8eccfd0 Mon Sep 17 00:00:00 2001 From: babysor00 Date: Sat, 30 Apr 2022 10:22:28 +0800 Subject: [PATCH] Add vc mode --- mkgui/app.py | 6 +- mkgui/app_vc.py | 164 ++++++++++++++++++++++++++++++++++ mkgui/base/ui/streamlit_ui.py | 6 +- ppg2mel/__init__.py | 9 +- toolbox/__init__.py | 7 +- vocoder/hifigan/inference.py | 6 +- 6 files changed, 182 insertions(+), 16 deletions(-) create mode 100644 mkgui/app_vc.py diff --git a/mkgui/app.py b/mkgui/app.py index 775f695..5e681ef 100644 --- a/mkgui/app.py +++ b/mkgui/app.py @@ -53,11 +53,11 @@ class Input(BaseModel): ) synthesizer: synthesizers = Field( ..., alias="合成模型", - description="选择语音编码模型文件." + description="选择语音合成模型文件." ) vocoder: vocoders = Field( - ..., alias="语音编码模型", - description="选择语音编码模型文件(目前只支持HifiGan类型)." + ..., alias="语音解码模型", + description="选择语音解码模型文件(目前只支持HifiGan类型)." ) class AudioEntity(BaseModel): diff --git a/mkgui/app_vc.py b/mkgui/app_vc.py new file mode 100644 index 0000000..8f55a9c --- /dev/null +++ b/mkgui/app_vc.py @@ -0,0 +1,164 @@ +from asyncio.windows_events import NULL +from synthesizer.inference import Synthesizer +from pydantic import BaseModel, Field +from encoder import inference as speacker_encoder +import torch +import os +from pathlib import Path +from enum import Enum +import ppg_extractor as Extractor +import ppg2mel as Convertor +import librosa +from scipy.io.wavfile import write +import re +import numpy as np +from mkgui.base.components.types import FileContent +from vocoder.hifigan import inference as gan_vocoder +from typing import Any +import matplotlib.pyplot as plt + + +# Constants +AUDIO_SAMPLES_DIR = 'samples\\' +EXT_MODELS_DIRT = "ppg_extractor\\saved_models" +CONV_MODELS_DIRT = "ppg2mel\\saved_models" +VOC_MODELS_DIRT = "vocoder\\saved_models" +TEMP_SOURCE_AUDIO = "wavs/temp_source.wav" +TEMP_TARGET_AUDIO = "wavs/temp_target.wav" +TEMP_RESULT_AUDIO = "wavs/temp_result.wav" + +# Load local sample audio as options TODO: load dataset +if 
os.path.isdir(AUDIO_SAMPLES_DIR): + audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav"))) +# Pre-Load models +if os.path.isdir(EXT_MODELS_DIRT): + extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt"))) + print("Loaded extractor models: " + str(len(extractors))) +if os.path.isdir(CONV_MODELS_DIRT): + convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth"))) + print("Loaded convertor models: " + str(len(convertors))) + +if os.path.isdir(VOC_MODELS_DIRT): + vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt"))) + print("Loaded vocoders models: " + str(len(vocoders))) + + +class Input(BaseModel): + message: str = Field( + ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容" + ) + local_audio_file: audio_input_selection = Field( + ..., alias="输入语音(本地wav)", + description="选择本地语音文件." + ) + upload_audio_file: FileContent = Field(default=None, alias="或上传语音", + description="拖拽或点击上传.", mime_type="audio/wav") + local_audio_file_target: audio_input_selection = Field( + ..., alias="目标语音(本地wav)", + description="选择本地语音文件." + ) + upload_audio_file_target: FileContent = Field(default=None, alias="或上传目标语音", + description="拖拽或点击上传.", mime_type="audio/wav") + extractor: extractors = Field( + ..., alias="编码模型", + description="选择语音编码模型文件." + ) + convertor: convertors = Field( + ..., alias="转换模型", + description="选择语音转换模型文件." + ) + vocoder: vocoders = Field( + ..., alias="语音解码模型", + description="选择语音解码模型文件(目前只支持HifiGan类型)." + ) + +class AudioEntity(BaseModel): + content: bytes + mel: Any + +class Output(BaseModel): + __root__: tuple[AudioEntity, AudioEntity, AudioEntity] + + def render_output_ui(self, streamlit_app, input) -> None: # type: ignore + """Custom output UI. + If this method is implemented, it will be used instead of the default Output UI renderer. 
+ """ + src, target, result = self.__root__ + + streamlit_app.subheader("Synthesized Audio") + streamlit_app.audio(result.content, format="audio/wav") + + fig, ax = plt.subplots() + ax.imshow(src.mel, aspect="equal", interpolation="none") + ax.set_title("mel spectrogram(Source Audio)") + streamlit_app.pyplot(fig) + fig, ax = plt.subplots() + ax.imshow(target.mel, aspect="equal", interpolation="none") + ax.set_title("mel spectrogram(Target Audio)") + streamlit_app.pyplot(fig) + fig, ax = plt.subplots() + ax.imshow(result.mel, aspect="equal", interpolation="none") + ax.set_title("mel spectrogram(Result Audio)") + streamlit_app.pyplot(fig) + +def main(input: Input) -> Output: + """convert(转换)""" + # load models + extractor = Extractor.load_model(Path(input.extractor.value)) + convertor = Convertor.load_model(Path(input.convertor.value)) + # current_synt = Synthesizer(Path(input.synthesizer.value)) + gan_vocoder.load_model(Path(input.vocoder.value)) + + # load file + if input.upload_audio_file != None: + with open(TEMP_SOURCE_AUDIO, "w+b") as f: + f.write(input.upload_audio_file.as_bytes()) + f.seek(0) + src_wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO) + else: + src_wav, sample_rate = librosa.load(input.local_audio_file.value) + write(TEMP_SOURCE_AUDIO, sample_rate, src_wav) #Make sure we get the correct wav + + if input.upload_audio_file_target != None: + with open(TEMP_TARGET_AUDIO, "w+b") as f: + f.write(input.upload_audio_file_target.as_bytes()) + f.seek(0) + ref_wav, _ = librosa.load(TEMP_TARGET_AUDIO) + else: + ref_wav, _ = librosa.load(input.local_audio_file_target.value) + write(TEMP_TARGET_AUDIO, sample_rate, ref_wav) #Make sure we get the correct wav + + ppg = extractor.extract_from_wav(src_wav) + # Import necessary dependency of Voice Conversion + from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv + ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav))) + 
speacker_encoder.load_model(Path("encoder/saved_models/pretrained_bak_5805000.pt")) + embed = speacker_encoder.embed_utterance(ref_wav) + lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True) + min_len = min(ppg.shape[1], len(lf0_uv)) + ppg = ppg[:, :min_len] + lf0_uv = lf0_uv[:min_len] + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + _, mel_pred, att_ws = convertor.inference( + ppg, + logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device), + spembs=torch.from_numpy(embed).unsqueeze(0).to(device), + ) + mel_pred= mel_pred.transpose(0, 1) + breaks = [mel_pred.shape[1]] + mel_pred= mel_pred.detach().cpu().numpy() + + # synthesize and vocode + wav, sample_rate = gan_vocoder.infer_waveform(mel_pred) + + # write and output + write(TEMP_RESULT_AUDIO, sample_rate, wav) #Make sure we get the correct wav + with open(TEMP_SOURCE_AUDIO, "rb") as f: + source_file = f.read() + with open(TEMP_TARGET_AUDIO, "rb") as f: + target_file = f.read() + with open(TEMP_RESULT_AUDIO, "rb") as f: + result_file = f.read() + + + return Output(__root__=(AudioEntity(content=source_file, mel=Synthesizer.make_spectrogram(src_wav)), AudioEntity(content=target_file, mel=Synthesizer.make_spectrogram(ref_wav)), AudioEntity(content=result_file, mel=Synthesizer.make_spectrogram(wav)))) \ No newline at end of file diff --git a/mkgui/base/ui/streamlit_ui.py b/mkgui/base/ui/streamlit_ui.py index 7401957..9708668 100644 --- a/mkgui/base/ui/streamlit_ui.py +++ b/mkgui/base/ui/streamlit_ui.py @@ -801,9 +801,9 @@ class OutputUI: def getOpyrator(mode: str) -> Opyrator: - # if mode == None or mode.startswith('VC'): - # from mkgui.app_vc import vc - # return Opyrator(vc) + if mode == None or mode.startswith('VC'): + from mkgui.app_vc import main + return Opyrator(main) from mkgui.app import main return Opyrator(main) diff --git a/ppg2mel/__init__.py b/ppg2mel/__init__.py index 53ee3b2..cc54db8 100644 --- a/ppg2mel/__init__.py +++ b/ppg2mel/__init__.py 
@@ -191,12 +191,15 @@ class MelDecoderMOLv2(AbsMelDecoder): return mel_outputs[0], mel_outputs_postnet[0], alignments[0] -def load_model(train_config, model_file, device=None): - +def load_model(model_file, device=None): + # search a config file + model_config_fpaths = list(model_file.parent.rglob("*.yaml")) + if len(model_config_fpaths) == 0: + raise FileNotFoundError("No model yaml config found for convertor") if device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model_config = HpsYaml(train_config) + model_config = HpsYaml(model_config_fpaths[0]) ppg2mel_model = MelDecoderMOLv2( **model_config["model"] ).to(device) diff --git a/toolbox/__init__.py b/toolbox/__init__.py index d0cd20e..f83ae23 100644 --- a/toolbox/__init__.py +++ b/toolbox/__init__.py @@ -405,16 +405,11 @@ class Toolbox: if self.ui.current_convertor_fpath is None: return model_fpath = self.ui.current_convertor_fpath - # search a config file - model_config_fpaths = list(model_fpath.parent.rglob("*.yaml")) - if self.ui.current_convertor_fpath is None: - return - model_config_fpath = model_config_fpaths[0] self.ui.log("Loading the convertor %s... " % model_fpath) self.ui.set_loading(1) start = timer() import ppg2mel as convertor - self.convertor = convertor.load_model(model_fpath) + self.convertor = convertor.load_model(model_fpath) self.ui.log("Done (%dms)." 
% int(1000 * (timer() - start)), "append") self.ui.set_loading(0) diff --git a/vocoder/hifigan/inference.py b/vocoder/hifigan/inference.py index edbcd38..423cbc6 100644 --- a/vocoder/hifigan/inference.py +++ b/vocoder/hifigan/inference.py @@ -26,7 +26,11 @@ def load_model(weights_fpath, config_fpath=None, verbose=True): print("Building hifigan") if config_fpath == None: - config_fpath = "./vocoder/hifigan/config_16k_.json" + model_config_fpaths = list(weights_fpath.parent.rglob("*.json")) + if len(model_config_fpaths) > 0: + config_fpath = model_config_fpaths[0] + else: + config_fpath = "./vocoder/hifigan/config_16k_.json" with open(config_fpath) as f: data = f.read() json_config = json.loads(data)