Mirror of https://github.com/babysor/Realtime-Voice-Clone-Chinese.git (synced 2026-03-20 03:55:09 +08:00)
Some changes to make it easier to install the dependencies
This commit is contained in:
@@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
|
||||
|
||||
# Resample the wav if needed
|
||||
if source_sr is not None and source_sr != sampling_rate:
|
||||
wav = librosa.resample(wav, source_sr, sampling_rate)
|
||||
wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)
|
||||
|
||||
# Apply the preprocessing: normalize volume and shorten long silences
|
||||
if normalize:
|
||||
@@ -99,7 +99,7 @@ def trim_long_silences(wav):
|
||||
return ret[width - 1:] / width
|
||||
|
||||
audio_mask = moving_average(voice_flags, vad_moving_average_width)
|
||||
audio_mask = np.round(audio_mask).astype(np.bool)
|
||||
audio_mask = np.round(audio_mask).astype(bool)
|
||||
|
||||
# Dilate the voiced regions
|
||||
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
|
||||
|
||||
@@ -21,7 +21,7 @@ colormap = np.array([
|
||||
[33, 0, 127],
|
||||
[0, 0, 0],
|
||||
[183, 183, 183],
|
||||
], dtype=np.float) / 255
|
||||
], dtype=float) / 255
|
||||
|
||||
|
||||
class Visualizations:
|
||||
|
||||
@@ -31,14 +31,13 @@ class LogMel(torch.nn.Module):
|
||||
fs: int = 16000,
|
||||
n_fft: int = 512,
|
||||
n_mels: int = 80,
|
||||
fmin: float = None,
|
||||
fmin: float = 0,
|
||||
fmax: float = None,
|
||||
htk: bool = False,
|
||||
norm=1,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
fmin = 0 if fmin is None else fmin
|
||||
fmax = fs / 2 if fmax is None else fmax
|
||||
_mel_options = dict(
|
||||
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm
|
||||
|
||||
@@ -107,7 +107,7 @@ def _griffin_lim(S, hparams):
|
||||
Based on https://github.com/librosa/librosa/issues/434
|
||||
"""
|
||||
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
|
||||
S_complex = np.abs(S).astype(np.complex)
|
||||
S_complex = np.abs(S).astype(complex)
|
||||
y = _istft(S_complex * angles, hparams)
|
||||
for i in range(hparams.griffin_lim_iters):
|
||||
angles = np.exp(1j * np.angle(_stft(y, hparams)))
|
||||
|
||||
@@ -78,12 +78,12 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
||||
|
||||
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
|
||||
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
|
||||
job = Pool(n_processes).imap(func, speaker_dirs)
|
||||
job = Pool(n_processes).imap_unordered(func, speaker_dirs)
|
||||
|
||||
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
|
||||
if speaker_metadata is not None:
|
||||
for metadatum in speaker_metadata:
|
||||
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
|
||||
metadata_file.write("|".join(map(str,metadatum)) + "\n")
|
||||
metadata_file.close()
|
||||
|
||||
# Verify the contents of the metadata file
|
||||
@@ -134,7 +134,7 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
|
||||
# Embed the utterances in separate threads
|
||||
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
||||
job = Pool(n_processes).imap(func, fpaths)
|
||||
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
||||
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
||||
|
||||
def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
|
||||
wav_dir = synthesizer_root.joinpath("audio")
|
||||
@@ -152,4 +152,4 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
|
||||
# Embed the utterances in separate threads
|
||||
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
|
||||
job = Pool(n_processes).imap(func, fpaths)
|
||||
list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
|
||||
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
|
||||
|
||||
@@ -104,29 +104,58 @@ def _split_on_silences(wav_fpath, words, hparams):
|
||||
wav = logmmse.denoise(wav, profile, eta=0)
|
||||
|
||||
resp = pinyin(words, style=Style.TONE3)
|
||||
res = [v[0] for v in resp if v[0].strip()]
|
||||
res = filter(lambda v : not v.isspace(),map(lambda v: v[0],resp))
|
||||
res = " ".join(res)
|
||||
|
||||
return wav, res
|
||||
|
||||
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
|
||||
metadata = []
|
||||
extensions = ["*.wav", "*.flac", "*.mp3"]
|
||||
for extension in extensions:
|
||||
wav_fpath_list = speaker_dir.glob(extension)
|
||||
# Iterate over each wav
|
||||
for wav_fpath in wav_fpath_list:
|
||||
words = dict_info.get(wav_fpath.name.split(".")[0])
|
||||
words = dict_info.get(wav_fpath.name) if not words else words # try with extension
|
||||
if not words:
|
||||
print("no wordS")
|
||||
continue
|
||||
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
||||
wav, text = _split_on_silences(wav_fpath, words, hparams)
|
||||
result = _process_utterance(wav, text, out_dir, sub_basename,
|
||||
skip_existing, hparams, encoder_model_fpath)
|
||||
if result is None:
|
||||
continue
|
||||
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
|
||||
metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
|
||||
return [m for m in metadata if m is not None]
|
||||
extensions = ("*.wav", "*.flac", "*.mp3")
|
||||
if skip_existing:
|
||||
for extension in extensions:
|
||||
wav_fpath_list = speaker_dir.glob(extension)
|
||||
# Iterate over each wav
|
||||
for wav_fpath in wav_fpath_list:
|
||||
words = dict_info.get(wav_fpath.name.split(".")[0])
|
||||
if not words:
|
||||
words = dict_info.get(wav_fpath.name) # try with extension
|
||||
if not words:
|
||||
print("no wordS")
|
||||
continue
|
||||
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
||||
|
||||
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
|
||||
wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
|
||||
|
||||
if mel_fpath.exists() and wav_fpath_.exists():
|
||||
continue
|
||||
|
||||
wav, text = _split_on_silences(wav_fpath, words, hparams)
|
||||
result = _process_utterance(wav, text, out_dir, sub_basename,
|
||||
False, hparams, encoder_model_fpath) # existence was already checked above, so pass skip_existing=False to accelerate
|
||||
if result is None:
|
||||
continue
|
||||
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
|
||||
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
|
||||
else:
|
||||
for extension in extensions:
|
||||
wav_fpath_list = speaker_dir.glob(extension)
|
||||
# Iterate over each wav
|
||||
for wav_fpath in wav_fpath_list:
|
||||
words = dict_info.get(wav_fpath.name.split(".")[0])
|
||||
if not words:
|
||||
words = dict_info.get(wav_fpath.name) # try with extension
|
||||
if not words:
|
||||
print("no wordS")
|
||||
continue
|
||||
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
|
||||
|
||||
wav, text = _split_on_silences(wav_fpath, words, hparams)
|
||||
result = _process_utterance(wav, text, out_dir, sub_basename,
|
||||
False, hparams, encoder_model_fpath)
|
||||
if result is None:
|
||||
continue
|
||||
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
|
||||
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
|
||||
return metadata
|
||||
|
||||
@@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):
|
||||
|
||||
|
||||
def build_mel_basis():
|
||||
return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
|
||||
return librosa.filters.mel(sr = hp.sample_rate, n_fft = hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
|
||||
|
||||
|
||||
def normalize(S):
|
||||
|
||||
Reference in New Issue
Block a user