Some changes to make it easier to install the dependencies

This commit is contained in:
0warning0error
2023-06-02 17:22:38 +08:00
parent b78d0d2a26
commit 9f1dbeeecc
13 changed files with 93 additions and 46 deletions

View File

@@ -39,7 +39,7 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
# Resample the wav if needed
if source_sr is not None and source_sr != sampling_rate:
wav = librosa.resample(wav, source_sr, sampling_rate)
wav = librosa.resample(wav, orig_sr = source_sr, target_sr = sampling_rate)
# Apply the preprocessing: normalize volume and shorten long silences
if normalize:
@@ -99,7 +99,7 @@ def trim_long_silences(wav):
return ret[width - 1:] / width
audio_mask = moving_average(voice_flags, vad_moving_average_width)
audio_mask = np.round(audio_mask).astype(np.bool)
audio_mask = np.round(audio_mask).astype(bool)
# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))

View File

@@ -21,7 +21,7 @@ colormap = np.array([
[33, 0, 127],
[0, 0, 0],
[183, 183, 183],
], dtype=np.float) / 255
], dtype=float) / 255
class Visualizations:

View File

@@ -31,14 +31,13 @@ class LogMel(torch.nn.Module):
fs: int = 16000,
n_fft: int = 512,
n_mels: int = 80,
fmin: float = None,
fmin: float = 0,
fmax: float = None,
htk: bool = False,
norm=1,
):
super().__init__()
fmin = 0 if fmin is None else fmin
fmax = fs / 2 if fmax is None else fmax
_mel_options = dict(
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=htk, norm=norm

View File

@@ -107,7 +107,7 @@ def _griffin_lim(S, hparams):
Based on https://github.com/librosa/librosa/issues/434
"""
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
S_complex = np.abs(S).astype(complex)
y = _istft(S_complex * angles, hparams)
for i in range(hparams.griffin_lim_iters):
angles = np.exp(1j * np.angle(_stft(y, hparams)))

View File

@@ -78,12 +78,12 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, speaker_dirs)
job = Pool(n_processes).imap_unordered(func, speaker_dirs)
for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
if speaker_metadata is not None:
for metadatum in speaker_metadata:
metadata_file.write("|".join(str(x) for x in metadatum) + "\n")
metadata_file.write("|".join(map(str,metadatum)) + "\n")
metadata_file.close()
# Verify the contents of the metadata file
@@ -134,7 +134,7 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
# Embed the utterances in separate threads
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
wav_dir = synthesizer_root.joinpath("audio")
@@ -152,4 +152,4 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
# Embed the utterances in separate threads
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))

View File

@@ -104,29 +104,58 @@ def _split_on_silences(wav_fpath, words, hparams):
wav = logmmse.denoise(wav, profile, eta=0)
resp = pinyin(words, style=Style.TONE3)
res = [v[0] for v in resp if v[0].strip()]
res = filter(lambda v : not v.isspace(),map(lambda v: v[0],resp))
res = " ".join(res)
return wav, res
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = []
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
words = dict_info.get(wav_fpath.name) if not words else words # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
return [m for m in metadata if m is not None]
extensions = ("*.wav", "*.flac", "*.mp3")
if skip_existing:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
mel_fpath = out_dir.joinpath("mels", f"mel-{sub_basename}.npy")
wav_fpath_ = out_dir.joinpath("audio", f"audio-{sub_basename}.npy")
if mel_fpath.exists() and wav_fpath_.exists():
continue
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath) # accelarate
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
else:
for extension in extensions:
wav_fpath_list = speaker_dir.glob(extension)
# Iterate over each wav
for wav_fpath in wav_fpath_list:
words = dict_info.get(wav_fpath.name.split(".")[0])
if not words:
words = dict_info.get(wav_fpath.name) # try with extension
if not words:
print("no wordS")
continue
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
False, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
metadata.append ((wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text))
return metadata

View File

@@ -50,7 +50,7 @@ def linear_to_mel(spectrogram):
def build_mel_basis():
return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
return librosa.filters.mel(sr = hp.sample_rate, n_fft = hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin)
def normalize(S):