10 Commits

Author SHA1 Message Date
Vega
d889235518 Update README.md 2024-11-01 12:54:16 +08:00
Bob Conan
42789babd8 Update README.md, fix a typo (#1007) 2024-10-22 10:21:44 +08:00
Vega
2354bb42d1 Update README.md (#1005) 2024-10-16 22:48:15 +08:00
Vega
4358f6f353 Update README.md 2024-08-29 17:52:56 +08:00
xxxxx
5971555319 Update requirements.txt (#747)
Ubuntu 20.04.1 with CUDA 11.3 was missing dependencies and also had dependency conflicts

Co-authored-by: Vega <babysor00@gmail.com>
2024-08-22 15:06:40 +08:00
Emma Thompson
6f84026c51 Env update: add environment requirement notes (#660)
* Update Readme Doc

Add environment requirement notes

* Update Readme Doc

Add environment requirement notes

---------

Co-authored-by: Limingrui0 <65227354+Limingrui0@users.noreply.github.com>
2024-07-06 10:13:09 +08:00
Terminal
a30657ecf5 fix: preprocess_audio.py failed to save the .npy file (#988) 2024-07-06 10:12:36 +08:00
Terminal
cc250af1f6 fix requirements monotonic-align error (#989) 2024-07-06 10:12:06 +08:00
Vega
156723e37c Skip embedding (#950)
* Skip embedding

* Skip earlier

* Remove unused parameter

* Pass param
2023-09-05 23:15:04 +08:00
Vega
1862d2145b Merge pull request #953 from babysor/babysor-patch-3
Update README.md
2023-08-31 11:42:15 +08:00
6 changed files with 46 additions and 70 deletions

View File

@@ -29,6 +29,7 @@
> If you see `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)` when installing with pip, your Python version is probably too old; 3.9 installs successfully
* Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
* Run `pip install -r requirements.txt` to install the remaining required packages.
> The recommended environment here is `Repo Tag 0.0.1` `Pytorch1.9.0 with Torchvision0.10.0 and cudatoolkit10.2` `requirements.txt` `webrtcvad-wheels`, because `requirements.txt` was exported several months ago and does not work with newer versions
* Install webrtcvad: `pip install webrtcvad-wheels`
or

View File

@@ -1,3 +1,5 @@
> 🚧 While I no longer actively update this repo, I'm continuing to push this tech forward, for good and in the open. Join me at [MaskGCT](https://github.com/open-mmlab/Amphion/tree/main/models/tts/maskgct). I'm also building an optimized, cloud-hosted version: https://noiz.ai/
>
![mockingbird](https://user-images.githubusercontent.com/12797292/131216767-6eb251d6-14fc-4951-8324-2722f0cd4c63.jpg)
@@ -29,6 +31,7 @@
> If you get `ERROR: Could not find a version that satisfies the requirement torch==1.9.0+cu102 (from versions: 0.1.2, 0.1.2.post1, 0.1.2.post2)`, the error is probably due to an old Python version; try 3.9 and it should install successfully
* Install [ffmpeg](https://ffmpeg.org/download.html#get-packages).
* Run `pip install -r requirements.txt` to install the remaining necessary packages.
> The recommended environment here is `Repo Tag 0.0.1` `Pytorch1.9.0 with Torchvision0.10.0 and cudatoolkit10.2` `requirements.txt` `webrtcvad-wheels`, because `requirements.txt` was exported a few months ago and doesn't work with newer versions
* Install webrtcvad with `pip install webrtcvad-wheels` (if needed)
or
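Before running the preprocessing scripts it can help to confirm the environment matches the recommendation above. A minimal sanity-check sketch (not a script from this repo), assuming torch and torchvision are already installed:

```python
# Quick environment sanity check for the setup recommended above.
# The expected versions in the comments come from the README note; nothing is enforced here.
import sys

import torch
import torchvision

print("python        :", sys.version.split()[0])      # 3.9.x is the version the README suggests
print("torch         :", torch.__version__)           # e.g. 1.9.0+cu102
print("torchvision   :", torchvision.__version__)     # e.g. 0.10.0
print("CUDA available:", torch.cuda.is_available())   # True once the CUDA toolkit is set up
```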

View File

@@ -39,6 +39,9 @@ data_info = {
}
}
def should_skip(fpath: Path, skip_existing: bool) -> bool:
return skip_existing and fpath.exists()
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
skip_existing: bool, hparams, no_alignments: bool,
dataset: str, emotion_extract = False, encoder_model_fpath=None):
@@ -99,7 +102,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
def embed_utterance(fpaths, encoder_model_fpath):
def _embed_utterance(fpaths: str, encoder_model_fpath: str):
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
@@ -110,15 +113,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
embed = encoder.embed_utterance(wav)
np.save(embed_fpath, embed, allow_pickle=False)
def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
if skip_existing and fpaths.exists():
return
def _emo_extract_from_utterance(fpaths, hparams):
wav_fpath, emo_fpath = fpaths
wav = np.load(wav_fpath)
emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):
wav_dir = synthesizer_root.joinpath("audio")
metadata_fpath = synthesizer_root.joinpath("train.txt")
assert wav_dir.exists() and metadata_fpath.exists()
@@ -128,11 +129,11 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
# Gather the input wave filepath and the target output embed filepath
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata if not should_skip(embed_dir.joinpath(m[2]), skip_existing)]
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
# Embed the utterances in separate threads
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
func = partial(_embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
@@ -142,14 +143,14 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
assert wav_dir.exists() and metadata_fpath.exists()
emo_dir = synthesizer_root.joinpath("emo")
emo_dir.mkdir(exist_ok=True)
# Gather the input wave filepath and the target output embed filepath
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]
fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata if not should_skip(emo_dir.joinpath(m[0].replace("audio-", "emo-")), skip_existing)]
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
# Embed the utterances in separate threads
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
func = partial(_emo_extract_from_utterance, hparams=hparams)
job = Pool(n_processes).imap(func, fpaths)
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
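Taken together, these hunks standardize the skip-existing behaviour: outputs that already exist are filtered out of `fpaths` before the worker pool starts, instead of being checked inside each worker. A standalone sketch of that pattern, with illustrative file names and a stand-in worker body in place of the repo's encoder code:

```python
# Sketch of the "filter before the pool" pattern used by create_embeddings/create_emo above.
# Assumes audio/ contains audio-*.npy waveforms; the worker body is a stand-in, not the
# repo's encoder.embed_utterance.
from functools import partial
from multiprocessing import Pool
from pathlib import Path

import numpy as np
from tqdm import tqdm


def should_skip(fpath: Path, skip_existing: bool) -> bool:
    # Same helper as in the diff: skip only when requested and the output already exists.
    return skip_existing and fpath.exists()


def _embed_one(fpaths, scale: float):
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    np.save(embed_fpath, wav * scale, allow_pickle=False)  # stand-in for the real embedding


if __name__ == "__main__":
    wav_dir, embed_dir = Path("audio"), Path("embeds")
    embed_dir.mkdir(exist_ok=True)
    names = ["audio-000.npy", "audio-001.npy"]

    # Existing embed files are dropped here, so workers never re-process them.
    fpaths = [(wav_dir / n, embed_dir / n.replace("audio-", "embed-"))
              for n in names
              if not should_skip(embed_dir / n.replace("audio-", "embed-"), skip_existing=True)]

    func = partial(_embed_one, scale=0.5)
    job = Pool(2).imap(func, fpaths)
    tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
```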

View File

@@ -45,7 +45,7 @@ def extract_emo(
return y
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams, encoder_model_fpath):
mel_fpath: str, wav_fpath: str, hparams, encoder_model_fpath):
## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer.
@@ -58,13 +58,6 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
# without extra padding. This means that you won't have an exact relation between the length
# of the wav and of the mel spectrogram. See the vocoder data loader.
# Skip existing utterances if needed
mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
if skip_existing and mel_fpath.exists() and wav_fpath.exists():
return None
# Trim silence
if hparams.trim_silence:
if not encoder.is_loaded():
@@ -112,50 +105,27 @@ def _split_on_silences(wav_fpath, words, hparams):
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = []
extensions = ("*.wav", "*.flac", "*.mp3")
-    if skip_existing:
-        for extension in extensions:
-            wav_fpath_list = speaker_dir.glob(extension)
-            # Iterate over each wav
-            for wav_fpath in wav_fpath_list:
-                words = dict_info.get(wav_fpath.name.split(".")[0])
-                if not words:
-                    words = dict_info.get(wav_fpath.name)  # try with extension
-                if not words:
-                    print("no wordS")
-                    continue
-                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-                mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
-                wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
-                if mel_fpath.exists() and wav_fpath_.exists():
-                    continue
-                wav, text = _split_on_silences(wav_fpath, words, hparams)
-                result = _process_utterance(wav, text, out_dir, sub_basename,
-                                            False, hparams, encoder_model_fpath) # accelarate
-                if result is None:
-                    continue
-                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-                metadata.append((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
-    else:
-        for extension in extensions:
-            wav_fpath_list = speaker_dir.glob(extension)
-            # Iterate over each wav
-            for wav_fpath in wav_fpath_list:
-                words = dict_info.get(wav_fpath.name.split(".")[0])
-                if not words:
-                    words = dict_info.get(wav_fpath.name)  # try with extension
-                if not words:
-                    print("no wordS")
-                    continue
-                sub_basename = "%s_%02d" % (wav_fpath.name, 0)
-                wav, text = _split_on_silences(wav_fpath, words, hparams)
-                result = _process_utterance(wav, text, out_dir, sub_basename,
-                                            False, hparams, encoder_model_fpath)
-                if result is None:
-                    continue
-                wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
-                metadata.append((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
+    for extension in extensions:
+        wav_fpath_list = speaker_dir.glob(extension)
+        # Iterate over each wav
+        for wav_fpath in wav_fpath_list:
+            words = dict_info.get(wav_fpath.name.split(".")[0])
+            if not words:
+                words = dict_info.get(wav_fpath.name)  # try with extension
+            if not words:
+                print(f"No word found in dict_info for {wav_fpath.name}, skip it")
+                continue
+            sub_basename = "%s_%02d" % (wav_fpath.name, 0)
+            mel_fpath_out = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
+            wav_fpath_out = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
+            if skip_existing and mel_fpath_out.exists() and wav_fpath_out.exists():
+                continue
+            wav, text = _split_on_silences(wav_fpath, words, hparams)
+            result = _process_utterance(wav, text, out_dir, sub_basename, mel_fpath_out, wav_fpath_out, hparams, encoder_model_fpath)
+            if result is None:
+                continue
+            wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
+            metadata.append((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
return metadata
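The companion change in `_process_utterance` follows the same idea: the caller now builds the output paths and applies `skip_existing`, so the processing function no longer needs the flag at all. A minimal sketch of that calling convention, with a placeholder body where the repo computes and saves the mel spectrogram:

```python
# Caller-side skip pattern from the diff above; names mirror the diff, the body is a placeholder.
from pathlib import Path


def _process_utterance(wav, text, out_dir: Path, basename: str,
                       mel_fpath: Path, wav_fpath: Path, hparams=None, encoder_model_fpath=None):
    # The real function trims silence, computes the mel spectrogram and writes both .npy files.
    return (wav_fpath.name, mel_fpath.name, f"embed-{basename}.npy", wav, 0, text)


def process_one(out_dir: Path, wav_fpath: Path, words: str, skip_existing: bool, hparams=None):
    sub_basename = "%s_%02d" % (wav_fpath.name, 0)
    mel_fpath_out = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
    wav_fpath_out = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
    if skip_existing and mel_fpath_out.exists() and wav_fpath_out.exists():
        return None  # both outputs already on disk, nothing to do
    wav, text = [0.0], words  # stand-in for _split_on_silences(wav_fpath, words, hparams)
    return _process_utterance(wav, text, out_dir, sub_basename,
                              mel_fpath_out, wav_fpath_out, hparams, None)
```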

pre.py
View File

@@ -71,7 +71,7 @@ if __name__ == "__main__":
del args.n_processes_embed
preprocess_dataset(**vars(args))
create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath)
create_embeddings(synthesizer_root=args.out_dir, n_processes=n_processes_embed, encoder_model_fpath=encoder_model_fpath, skip_existing=args.skip_existing)
if args.emotion_extract:
create_emo(synthesizer_root=args.out_dir, n_processes=n_processes_embed, skip_existing=args.skip_existing, hparams=args.hparams)

View File

@@ -2,7 +2,8 @@ umap-learn
visdom
librosa
matplotlib>=3.3.0
numpy
numpy==1.19.3; platform_system == "Windows"
numpy==1.20.3; platform_system != "Windows"
scipy>=1.0.0
tqdm
sounddevice
@@ -12,8 +13,8 @@ inflect
PyQt5
multiprocess
numba
webrtcvad
pypinyin
webrtcvad; platform_system != "Windows"
pypinyin==0.44.0
flask
flask_wtf
flask_cors
@@ -25,9 +26,9 @@ PyYAML
torch_complex
espnet
PyWavelets
monotonic-align==0.0.3
transformers
fastapi
loguru
typer[all]
click
click==8.0.4
typer
monotonic-align==1.0.0
transformers
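The numpy and webrtcvad lines above use PEP 508 environment markers, which pip evaluates at install time so each platform only receives the pin that applies to it. A small sketch showing how such a marker resolves, using the third-party `packaging` library (not part of this requirements file):

```python
# Evaluate the PEP 508 markers used in requirements.txt above.
from packaging.markers import Marker
from packaging.requirements import Requirement

req = Requirement('numpy==1.19.3; platform_system == "Windows"')
print(req.specifier)          # ==1.19.3
print(req.marker.evaluate())  # True only when this interpreter runs on Windows

# The complementary non-Windows pin:
print(Marker('platform_system != "Windows"').evaluate())
```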