Spaces:
Running
Running
import numpy as np | |
import os | |
import random | |
import librosa | |
import parselmouth | |
from utils.tools import load_wav | |
np.random.seed(0) | |
random.seed(0) | |
def REAPER_F0(wav_path, sr=24000, frame_period=0.01):  # frame_period s
    """Extract an F0 track for ``wav_path`` using the external REAPER binary.

    If ``<wav_path>.f0`` does not exist yet, ``REAPER/build/reaper`` is run to
    create it. The file is then read, deleted, and parsed.

    Args:
        wav_path: Path to the input audio file.
        sr: Unused by this function; kept for signature parity with the other
            F0 extractors in this module.
        frame_period: Frame interval in seconds, passed to REAPER via ``-e``.

    Returns:
        numpy.ndarray | None: F0 values taken from the third whitespace
        separated column of every line after the first seven lines of the
        ``.f0`` file, with ``-1`` entries replaced by ``0``. ``None`` if the
        ``.f0`` file could not be found (e.g. REAPER is unavailable or failed).
    """
    import subprocess  # local import: only this function launches a process

    f0_path = f'{wav_path}.f0'

    if not os.path.isfile(f0_path):
        # Argument list instead of a shell string, so paths containing spaces
        # or shell metacharacters are passed through verbatim.
        cmd = [
            'REAPER/build/reaper',
            '-i', str(wav_path),
            '-f', f0_path,
            '-e', str(frame_period),
            '-x', '1000',
            '-m', '65',
            '-a',
        ]
        try:
            subprocess.run(cmd, check=False)
        except OSError:
            # Binary missing or not executable: stay best-effort; the read
            # below then reports the failure by returning None.
            pass

    try:
        with open(f0_path, 'r') as rf:
            # The first 7 lines are skipped (presumably the file header).
            data_lines = rf.readlines()[7:]
    except FileNotFoundError:
        return None

    # Remove the intermediate file; ignore failures like `rm -f` did.
    try:
        os.remove(f0_path)
    except OSError:
        pass

    f0 = []
    for line in data_lines:
        fields = line.split()
        if len(fields) < 3:
            # Blank or truncated line: nothing to parse.
            continue
        f0.append(float(fields[2]))

    f0 = np.array(f0)
    f0[f0 == -1] = 0  # REAPER's -1 marker is mapped to 0 like the other extractors
    return f0
def ParselMouth_F0(wav, sr=24000, frame_period=0.01):
    """Extract an F0 track with Parselmouth.

    Args:
        wav: Audio samples handed to ``parselmouth.Sound``.
        sr: Sampling frequency passed to ``parselmouth.Sound``.
        frame_period: Time step passed to ``to_pitch``.

    Returns:
        The ``'frequency'`` field of the resulting pitch object's
        ``selected_array``.
    """
    sound = parselmouth.Sound(wav, sampling_frequency=sr)
    pitch_track = sound.to_pitch(
        time_step=frame_period,
        pitch_floor=65,
        pitch_ceiling=1000,
    )
    return pitch_track.selected_array['frequency']
def PYIN_F0(wav, sr=24000, frame_period=10):
    """Extract an F0 track with librosa's pYIN, mapping NaN frames to 0.

    Args:
        wav: Audio samples handed to ``librosa.pyin``.
        sr: Sampling rate of ``wav``.
        frame_period: Frame interval; the analysis window is
            ``int(sr * frame_period / 1000 * 4)`` samples, so this value is
            used as milliseconds.

    Returns:
        numpy.ndarray: F0 values with every NaN entry replaced by ``0.0``.
    """
    lowest_hz = librosa.note_to_hz('C2')  # ~65Hz
    highest_hz = librosa.note_to_hz('C7')  # ~2093Hz
    window = int(sr * frame_period / 1000 * 4)

    # Only the F0 estimate is used; the voicing flag/probability are ignored.
    track, _voiced_flag, _voiced_prob = librosa.pyin(
        wav,
        fmin=lowest_hz,
        fmax=highest_hz,
        sr=sr,
        frame_length=window,
    )
    nan_mask = np.isnan(track)
    return np.where(nan_mask, 0.0, track)
def pad_arrays(arrays: list[np.ndarray], std_len: int):
    """
    Edge-pad every array on both sides until it has ``std_len`` elements.

    The missing length is split between the two ends; when it is odd, the
    extra element goes to the right end.

    Args:
        arrays (List[numpy.ndarray]): Arrays to pad.
        std_len (int): Target length of every returned array.

    Returns:
        List[numpy.ndarray]: The padded arrays, in input order.

    Raises:
        ValueError: If any input array is longer than ``std_len``.
    """
    result = []
    for item in arrays:
        n = len(item)
        if n > std_len:
            raise ValueError(f'cur_len: {n}, std_len: {std_len}.')
        missing = std_len - n
        before = missing // 2
        after = missing - before
        result.append(np.pad(item, (before, after), 'edge'))
    return result
def compute_pitch(wav_path: str, pitch_path: str = None, frame_period=0.01):
    """
    Compute a per-frame pitch (F0) track for an audio file.

    Args:
        wav_path (str): Path to the audio file; it is loaded through
            ``load_wav(wav_path, 24000)``.
        pitch_path (str or os.PathLike, optional): Cache location. If the
            file exists, its contents are loaded with ``np.load`` and returned
            without touching the audio. Otherwise the pitch is computed and
            written there with ``np.save`` (parent directories are created).
            Defaults to None (no caching).
        frame_period (float, optional): Frame interval in seconds.
            Defaults to 0.01.

    Returns:
        numpy.ndarray: Element-wise median of the available F0 tracks.

    Notes:
        The tracks come from PYIN, Parselmouth and, when it yields a result,
        REAPER. Each track is edge-padded to a common length before the
        median is taken. With only two tracks the median is their mean.
        ``np.save`` appends ``.npy`` to names lacking that extension, so
        ``pitch_path`` should end in ``.npy`` for the cache lookup to hit.
    """
    if pitch_path is not None and os.path.isfile(pitch_path):
        return np.load(pitch_path)

    # extract pitch using 24kHz audio
    wav, fs = load_wav(wav_path, 24000)
    f0_std_len = wav.shape[0] // int(frame_period * fs) + 1

    candidates = [
        # PYIN_F0 takes the frame period in milliseconds.
        PYIN_F0(wav, sr=fs, frame_period=frame_period * 1000),
        ParselMouth_F0(wav, sr=fs, frame_period=frame_period),
    ]
    # REAPER is optional: it returns None when no result could be read.
    reaper_f0 = REAPER_F0(wav_path, sr=fs, frame_period=frame_period)
    if reaper_f0 is not None:
        candidates.append(reaper_f0)

    # Align the tracks to a common length, then take the per-frame median.
    stacked = np.array(pad_arrays(candidates, f0_std_len))
    median_f0 = np.median(stacked, axis=0)

    if pitch_path is not None:
        # Works for both str and path-like objects (a plain str has no
        # `.parent` attribute).
        parent_dir = os.path.dirname(os.fspath(pitch_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        np.save(pitch_path, median_f0)
    return median_f0
def coarse_f0(f0):
    """Quantize an F0 array (Hz) into integer bins 1..255 on a mel scale.

    Positive inputs are converted to mel and mapped linearly so that 65 Hz
    lands on bin 1 and 1000 Hz on bin 255; results are clamped to that range
    and rounded. Non-positive inputs end up in bin 1.

    Args:
        f0 (numpy.ndarray): F0 values in Hz. The input is not modified.

    Returns:
        numpy.ndarray: Integer array of the same shape with values in
        [1, 255].
    """
    n_bins = 256
    hz_hi = 1000.0
    hz_lo = 65.0
    mel_lo = 1127 * np.log(1 + hz_lo / 700)
    mel_hi = 1127 * np.log(1 + hz_hi / 700)

    mel = 1127 * np.log(1 + f0 / 700)
    positive = mel > 0
    mel[positive] = (mel[positive] - mel_lo) * (
        n_bins - 2
    ) / (mel_hi - mel_lo) + 1

    # Clamp into [1, n_bins - 1]; everything at or below 1 shares bin 1.
    too_low = mel <= 1
    mel[too_low] = 1
    too_high = mel > n_bins - 1
    mel[too_high] = n_bins - 1

    quantized = np.rint(mel).astype(int)
    assert quantized.max() <= 255 and quantized.min() >= 1, (
        quantized.max(),
        quantized.min(),
    )
    return quantized
def extract_pitch_ref(wav_path: str, ref_path: str, predefined_factor=0, speech_enroll=False):
    """
    Extract the pitch of an audio file and scale it towards a reference voice.

    Args:
        wav_path (str): Path to the source audio file.
        ref_path (str): Path to the reference audio file. Only read when
            ``predefined_factor`` is zero.
        predefined_factor (float, optional): If non-zero, used directly as
            the pitch shift factor instead of deriving one from the
            reference audio. Defaults to 0.
        speech_enroll (bool, optional): If True, the factor derived from the
            reference audio is additionally multiplied by 1.2.
            Defaults to False.

    Returns:
        Tuple[numpy.ndarray, float]: The source F0 multiplied by the factor,
        and the factor itself.

    Notes:
        The derived factor is the ratio of the mean non-zero F0 of the
        reference to the mean non-zero F0 of the source. If either signal has
        no non-zero F0 frame the ratio is undefined, so 1.0 (no shift) is
        used instead of propagating NaN into the whole pitch track.
    """
    source_f0 = compute_pitch(wav_path)

    if predefined_factor != 0.:
        print(f'Using predefined factor {predefined_factor}.')
        factor = predefined_factor
    else:
        # Derive the factor from the mean voiced pitch of both signals.
        ref_wav, fs = load_wav(ref_path)
        ref_f0 = ParselMouth_F0(ref_wav, fs)

        source_voiced = source_f0[np.nonzero(source_f0)]
        ref_voiced = ref_f0[np.nonzero(ref_f0)]

        if source_voiced.size == 0 or ref_voiced.size == 0:
            # A mean over zero frames is NaN; keep the pitch unshifted.
            print('No voiced frames found; leaving pitch unshifted.')
            factor = 1.0
        else:
            factor = np.mean(ref_voiced, axis=0) / np.mean(source_voiced, axis=0)

        # NOTE(review): the original nesting was lost; this scaling and the
        # print below are taken to belong to the derived-factor branch.
        if speech_enroll:
            factor = factor * 1.2
        print(f'pitch shift factor: {factor:.2f}')

    # Modify f0 to fit with different persons
    source_f0 = source_f0 * factor
    return source_f0, factor