mirror of
https://github.com/babysor/Realtime-Voice-Clone-Chinese.git
synced 2026-04-13 17:50:05 +08:00
Add vc mode
This commit is contained in:
@@ -53,11 +53,11 @@ class Input(BaseModel):
|
||||
)
|
||||
synthesizer: synthesizers = Field(
|
||||
..., alias="合成模型",
|
||||
description="选择语音编码模型文件."
|
||||
description="选择语音合成模型文件."
|
||||
)
|
||||
vocoder: vocoders = Field(
|
||||
..., alias="语音编码模型",
|
||||
description="选择语音编码模型文件(目前只支持HifiGan类型)."
|
||||
..., alias="语音解码模型",
|
||||
description="选择语音解码模型文件(目前只支持HifiGan类型)."
|
||||
)
|
||||
|
||||
class AudioEntity(BaseModel):
|
||||
|
||||
164
mkgui/app_vc.py
Normal file
164
mkgui/app_vc.py
Normal file
@@ -0,0 +1,164 @@
|
||||
# NOTE: the original `from asyncio.windows_events import NULL` import was
# removed here: the name was never used anywhere in this module, and
# `asyncio.windows_events` only exists on Windows, so the import crashed
# this file on Linux/macOS.
from synthesizer.inference import Synthesizer
from pydantic import BaseModel, Field
from encoder import inference as speacker_encoder
import torch
import os
from pathlib import Path
from enum import Enum
import ppg_extractor as Extractor
import ppg2mel as Convertor
import librosa
from scipy.io.wavfile import write
import re
import numpy as np
from mkgui.base.components.types import FileContent
from vocoder.hifigan import inference as gan_vocoder
from typing import Any
import matplotlib.pyplot as plt


# Constants.
# Forward slashes are used so the paths work on every OS (Windows accepts
# them too); the original backslash literals only resolved on Windows.
AUDIO_SAMPLES_DIR = 'samples/'
EXT_MODELS_DIRT = "ppg_extractor/saved_models"
CONV_MODELS_DIRT = "ppg2mel/saved_models"
VOC_MODELS_DIRT = "vocoder/saved_models"
TEMP_SOURCE_AUDIO = "wavs/temp_source.wav"
TEMP_TARGET_AUDIO = "wavs/temp_target.wav"
TEMP_RESULT_AUDIO = "wavs/temp_result.wav"

# Load local sample audio as options TODO: load dataset
# NOTE(review): if a directory below is missing, the corresponding Enum is
# never defined and the annotations in `Input` raise NameError at import
# time — presumably these directories are guaranteed to exist; confirm.
if os.path.isdir(AUDIO_SAMPLES_DIR):
    audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")))

# Pre-Load models: each Enum maps a model file name to its path, and is
# used as the type of a selection field in `Input`.
if os.path.isdir(EXT_MODELS_DIRT):
    extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded extractor models: " + str(len(extractors)))

if os.path.isdir(CONV_MODELS_DIRT):
    convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth")))
    print("Loaded convertor models: " + str(len(convertors)))

if os.path.isdir(VOC_MODELS_DIRT):
    vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
    print("Loaded vocoders models: " + str(len(vocoders)))
|
||||
|
||||
|
||||
class Input(BaseModel):
    """Input schema for the voice-conversion (VC) UI form.

    The Chinese ``alias``/``description`` strings are the labels shown in
    the generated UI. For both source and target audio, an uploaded file
    takes precedence over the locally selected sample (see ``main``).
    """
    message: str = Field(
        ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
    )
    local_audio_file: audio_input_selection = Field(
        ..., alias="输入语音(本地wav)",
        description="选择本地语音文件."
    )
    upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
        description="拖拽或点击上传.", mime_type="audio/wav")
    local_audio_file_target: audio_input_selection = Field(
        ..., alias="目标语音(本地wav)",
        description="选择本地语音文件."
    )
    upload_audio_file_target: FileContent = Field(default=None, alias="或上传目标语音",
        description="拖拽或点击上传.", mime_type="audio/wav")
    extractor: extractors = Field(
        ..., alias="编码模型",
        description="选择语音编码模型文件."
    )
    convertor: convertors = Field(
        ..., alias="转换模型",
        description="选择语音转换模型文件."
    )
    # FIX: the alias previously read "语音编码模型" (encoder) while the
    # description says decoder (解码); renamed to "语音解码模型" so the UI
    # label matches the description and the equivalent field in mkgui.app.
    vocoder: vocoders = Field(
        ..., alias="语音解码模型",
        description="选择语音解码模型文件(目前只支持HifiGan类型)."
    )
|
||||
|
||||
class AudioEntity(BaseModel):
    """One audio item to display: raw wav bytes plus its mel spectrogram."""
    # Raw bytes of the wav file (fed straight to the audio player widget).
    content: bytes
    # Mel spectrogram of the audio; rendered with imshow in the output UI.
    mel: Any
|
||||
|
||||
class Output(BaseModel):
    """Result of a conversion: (source, target, result) audio entities."""
    __root__: tuple[AudioEntity, AudioEntity, AudioEntity]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.
        If this method is implemented, it will be used instead of the default Output UI renderer.
        """
        src, target, result = self.__root__

        streamlit_app.subheader("Synthesized Audio")
        streamlit_app.audio(result.content, format="audio/wav")

        # The three spectrograms are rendered identically, so loop instead
        # of repeating the plotting stanza per audio.
        panels = (
            (src, "Source Audio"),
            (target, "Target Audio"),
            (result, "Result Audio"),
        )
        for entity, label in panels:
            fig, ax = plt.subplots()
            ax.imshow(entity.mel, aspect="equal", interpolation="none")
            ax.set_title(f"mel spectrogram({label})")
            streamlit_app.pyplot(fig)
|
||||
|
||||
def _load_input_wav(uploaded, local_file, temp_path):
    """Resolve one audio input and cache it as a wav on disk.

    Prefers the uploaded file over the locally selected sample. Either way
    the decoded waveform ends up written at ``temp_path`` so it can be read
    back as bytes for the UI later.

    Returns (wav, sample_rate) as produced by ``librosa.load`` (which
    resamples to its default rate).
    """
    if uploaded is not None:
        with open(temp_path, "w+b") as f:
            f.write(uploaded.as_bytes())
            f.seek(0)
        wav, sample_rate = librosa.load(temp_path)
    else:
        wav, sample_rate = librosa.load(local_file.value)
        write(temp_path, sample_rate, wav)  # make sure we get the correct wav
    return wav, sample_rate


def main(input: Input) -> Output:
    """convert(转换)"""
    # Voice conversion: re-speak the *source* audio's content in the
    # *target* speaker's voice, then vocode the predicted mel spectrogram
    # back into a waveform.

    # Load models selected in the UI.
    extractor = Extractor.load_model(Path(input.extractor.value))
    convertor = Convertor.load_model(Path(input.convertor.value))
    gan_vocoder.load_model(Path(input.vocoder.value))

    # Load source and target audio (each cached to its temp wav path).
    # FIX: `!= None` replaced with `is not None`; the duplicated load logic
    # is factored into _load_input_wav. The original also wrote the target
    # temp wav with the *source* sample rate — harmless in practice because
    # librosa.load resamples both to the same default rate, but fixed here.
    src_wav, sample_rate = _load_input_wav(
        input.upload_audio_file, input.local_audio_file, TEMP_SOURCE_AUDIO)
    ref_wav, _ = _load_input_wav(
        input.upload_audio_file_target, input.local_audio_file_target, TEMP_TARGET_AUDIO)

    ppg = extractor.extract_from_wav(src_wav)
    # Import necessary dependency of Voice Conversion lazily (heavy module).
    from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
    ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
    speacker_encoder.load_model(Path("encoder/saved_models/pretrained_bak_5805000.pt"))
    embed = speacker_encoder.embed_utterance(ref_wav)
    lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
    # PPG and lf0 sequences can differ by a few frames; truncate both to
    # their common length before decoding.
    min_len = min(ppg.shape[1], len(lf0_uv))
    ppg = ppg[:, :min_len]
    lf0_uv = lf0_uv[:min_len]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    _, mel_pred, _att_ws = convertor.inference(
        ppg,
        logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device),
        spembs=torch.from_numpy(embed).unsqueeze(0).to(device),
    )
    mel_pred = mel_pred.transpose(0, 1).detach().cpu().numpy()

    # Vocode the predicted mel back into a waveform.
    wav, sample_rate = gan_vocoder.infer_waveform(mel_pred)

    # Write the result, then read all three wavs back as bytes for the UI.
    write(TEMP_RESULT_AUDIO, sample_rate, wav)  # make sure we get the correct wav
    with open(TEMP_SOURCE_AUDIO, "rb") as f:
        source_file = f.read()
    with open(TEMP_TARGET_AUDIO, "rb") as f:
        target_file = f.read()
    with open(TEMP_RESULT_AUDIO, "rb") as f:
        result_file = f.read()

    return Output(__root__=(
        AudioEntity(content=source_file, mel=Synthesizer.make_spectrogram(src_wav)),
        AudioEntity(content=target_file, mel=Synthesizer.make_spectrogram(ref_wav)),
        AudioEntity(content=result_file, mel=Synthesizer.make_spectrogram(wav)),
    ))
|
||||
@@ -801,9 +801,9 @@ class OutputUI:
|
||||
|
||||
|
||||
def getOpyrator(mode: str) -> Opyrator:
    """Return the Opyrator app for the requested UI mode.

    A missing mode or a mode string starting with 'VC' selects the
    voice-conversion app; any other mode falls back to the default app.
    """
    # FIX: `mode == None` replaced with `mode is None` (identity check for
    # None); the superseded commented-out variant of this branch removed.
    if mode is None or mode.startswith('VC'):
        from mkgui.app_vc import main
        return Opyrator(main)
    from mkgui.app import main
    return Opyrator(main)
|
||||
|
||||
|
||||
@@ -191,12 +191,15 @@ class MelDecoderMOLv2(AbsMelDecoder):
|
||||
|
||||
return mel_outputs[0], mel_outputs_postnet[0], alignments[0]
|
||||
|
||||
def load_model(train_config, model_file, device=None):
|
||||
|
||||
def load_model(model_file, device=None):
|
||||
# search a config file
|
||||
model_config_fpaths = list(model_file.parent.rglob("*.yaml"))
|
||||
if len(model_config_fpaths) == 0:
|
||||
raise "No model yaml config found for convertor"
|
||||
if device is None:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
model_config = HpsYaml(train_config)
|
||||
model_config = HpsYaml(model_config_fpaths[0])
|
||||
ppg2mel_model = MelDecoderMOLv2(
|
||||
**model_config["model"]
|
||||
).to(device)
|
||||
|
||||
@@ -405,16 +405,11 @@ class Toolbox:
|
||||
if self.ui.current_convertor_fpath is None:
|
||||
return
|
||||
model_fpath = self.ui.current_convertor_fpath
|
||||
# search a config file
|
||||
model_config_fpaths = list(model_fpath.parent.rglob("*.yaml"))
|
||||
if self.ui.current_convertor_fpath is None:
|
||||
return
|
||||
model_config_fpath = model_config_fpaths[0]
|
||||
self.ui.log("Loading the convertor %s... " % model_fpath)
|
||||
self.ui.set_loading(1)
|
||||
start = timer()
|
||||
import ppg2mel as convertor
|
||||
self.convertor = convertor.load_model(model_config_fpath, model_fpath)
|
||||
self.convertor = convertor.load_model( model_fpath)
|
||||
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
|
||||
self.ui.set_loading(0)
|
||||
|
||||
|
||||
@@ -26,7 +26,11 @@ def load_model(weights_fpath, config_fpath=None, verbose=True):
|
||||
print("Building hifigan")
|
||||
|
||||
if config_fpath == None:
|
||||
config_fpath = "./vocoder/hifigan/config_16k_.json"
|
||||
model_config_fpaths = list(weights_fpath.parent.rglob("*.json"))
|
||||
if len(model_config_fpaths) > 0:
|
||||
config_fpath = model_config_fpaths[0]
|
||||
else:
|
||||
config_fpath = "./vocoder/hifigan/config_16k_.json"
|
||||
with open(config_fpath) as f:
|
||||
data = f.read()
|
||||
json_config = json.loads(data)
|
||||
|
||||
Reference in New Issue
Block a user