Spaces:

KasugaiSakura
/

so-vits-svc-Sora

Runtime error

App Files Files Community

so-vits-svc-Sora / inference /infer_tool_grad.py

KasugaiSakura

Upload folder using huggingface_hub

58fbdee verified 8 months ago

raw

history blame

5.57 kB

	import io
	import logging
	import os

	import librosa
	import numpy as np
	import parselmouth
	import soundfile
	import torch
	import torchaudio

	import utils
	from inference import slicer
	from models import SynthesizerTrn

	# Keep chatty third-party loggers at WARNING so they do not flood the output.
	for _quiet_logger in ('numba', 'matplotlib'):
	    logging.getLogger(_quiet_logger).setLevel(logging.WARNING)

	def resize2d_f0(x, target_len):
	    """Resample an F0 contour to ``target_len`` points by linear interpolation.

	    Values below 0.001 are replaced by NaN before interpolating, so every
	    output point that is interpolated from such a sample becomes NaN and is
	    finally mapped to 0 by ``np.nan_to_num``.

	    Args:
	        x: One-dimensional sequence of F0 values (any numeric dtype).
	        target_len: Number of points in the returned contour.

	    Returns:
	        A float ``numpy.ndarray`` of length ``target_len``.
	    """
	    # Force a float copy: an integer array cannot store NaN, and the copy
	    # keeps the caller's data untouched.
	    source = np.array(x, dtype=float)
	    source[source < 0.001] = np.nan
	    n_src = len(source)
	    # Positions, expressed in source-index units, at which to sample.
	    sample_at = np.arange(0, n_src * target_len, n_src) / target_len
	    target = np.interp(sample_at, np.arange(0, n_src), source)
	    return np.nan_to_num(target)

	def get_f0(x, p_len, f0_up_key=0):
	    """Extract an F0 contour with Praat and quantise it to coarse mel bins.

	    Args:
	        x: Waveform handed to ``parselmouth.Sound`` with a sampling
	            frequency of 16000.
	        p_len: Desired number of frames. A shorter contour is zero-padded on
	            both sides to this length; a longer contour is left as it is.
	        f0_up_key: Transposition in semitones; F0 is multiplied by
	            ``2 ** (f0_up_key / 12)``.

	    Returns:
	        ``(f0_coarse, f0)``: integer bins clamped to ``[1, 255]`` and the
	        padded, transposed values taken from Praat's ``frequency`` field.
	    """
	    time_step = 160 / 16000 * 1000  # 10 ms: 160 samples at 16000 Hz
	    f0_min = 50
	    f0_max = 1100
	    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
	    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

	    f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
	        time_step=time_step / 1000, voicing_threshold=0.6,
	        pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']

	    # Centre the contour inside p_len frames; the left side gets the extra
	    # frame when the difference is odd.
	    pad_size = (p_len - len(f0) + 1) // 2
	    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
	        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')

	    f0 *= pow(2, f0_up_key / 12)
	    f0_mel = 1127 * np.log(1 + f0 / 700)
	    # Only frames with a positive mel value are rescaled onto 1..255; the
	    # rest fall through to the clamp below and end up in bin 1.
	    voiced = f0_mel > 0
	    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
	    f0_mel[f0_mel <= 1] = 1
	    f0_mel[f0_mel > 255] = 255
	    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is what that
	    # alias always pointed to.
	    f0_coarse = np.rint(f0_mel).astype(int)
	    return f0_coarse, f0

	def clean_pitch(input_pitch):
	    """Flatten a pitch track to all ones when it is almost entirely ones.

	    If more than 90% of the entries equal 1, every remaining entry is
	    overwritten with 1 in place. The (possibly modified) input object is
	    returned.
	    """
	    ones_count = np.sum(input_pitch == 1)
	    mostly_ones = ones_count / len(input_pitch) > 0.9
	    if mostly_ones:
	        input_pitch[input_pitch != 1] = 1
	    return input_pitch


	def plt_pitch(input_pitch):
	    """Return a float copy of ``input_pitch`` with every 1 replaced by NaN.

	    The input array itself is not modified.
	    """
	    as_float = input_pitch.astype(float)
	    return np.where(as_float == 1, np.nan, as_float)


	def f0_to_pitch(ff):
	    """Map ``ff`` to ``69 + 12 * log2(ff / 440)``.

	    With this formula an input of 440 yields 69 and each doubling of the
	    input adds 12.
	    """
	    octaves_from_440 = np.log2(ff / 440)
	    return 69 + 12 * octaves_from_440


	def fill_a_to_b(a, b):
	    """Grow list ``a`` in place to the length of ``b`` by repeating ``a[0]``.

	    Nothing happens when ``a`` is already at least as long as ``b``.
	    Returns ``None``.
	    """
	    shortfall = len(b) - len(a)
	    if shortfall > 0:
	        a.extend([a[0]] * shortfall)


	def mkdir(paths: list):
	    """Create each directory in ``paths`` that does not exist yet.

	    Uses ``os.mkdir``, so parent directories are not created.
	    """
	    # The generator is lazy, so each existence check still runs immediately
	    # before its own ``os.mkdir`` call.
	    for missing in (p for p in paths if not os.path.exists(p)):
	        os.mkdir(missing)


	class VitsSvc(object):
	    """Voice-conversion front end built on a ``SynthesizerTrn`` model.

	    Holds the unit encoder returned by ``utils.get_hubert_model()``, the
	    synthesizer loaded by :meth:`loadCheckpoint`, and the device both run on.
	    """

	    def __init__(self):
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        self.SVCVITS = None  # synthesizer, set by loadCheckpoint()
	        self.hps = None  # hyper-parameters read from the checkpoint's config.json
	        self.speakers = None  # ``hps.spk``: lookup from speaker key to numeric id
	        self.hubert_soft = utils.get_hubert_model()
	        # get_units() moves its input to self.device, so the encoder has to
	        # live there as well (same call set_device() makes).
	        self.hubert_soft.to(self.device)

	    def set_device(self, device):
	        """Move the encoder and, if loaded, the synthesizer to ``device``."""
	        self.device = torch.device(device)
	        self.hubert_soft.to(self.device)
	        if self.SVCVITS is not None:
	            self.SVCVITS.to(self.device)

	    def loadCheckpoint(self, path):
	        """Load ``checkpoints/{path}/config.json`` and ``model.pth``.

	        The synthesizer is put in eval mode on the current device and
	        ``self.speakers`` is taken from ``hps.spk``.
	        """
	        self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
	        self.SVCVITS = SynthesizerTrn(
	            self.hps.data.filter_length // 2 + 1,
	            self.hps.train.segment_size // self.hps.data.hop_length,
	            **self.hps.model)
	        utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
	        self.SVCVITS.eval().to(self.device)
	        self.speakers = self.hps.spk

	    def get_units(self, source, sr):
	        """Run the unit encoder on ``source`` (a batch dimension is added).

	        ``sr`` is accepted for interface compatibility and is not used.
	        """
	        source = source.unsqueeze(0).to(self.device)
	        with torch.inference_mode():
	            units = self.hubert_soft.units(source)
	            return units

	    def get_unit_pitch(self, in_path, tran):
	        """Return ``(units, f0)`` for the audio at ``in_path``.

	        The audio is resampled to 16000 Hz and mixed down to one channel.
	        ``tran`` is forwarded to ``get_f0`` as its ``f0_up_key`` argument.
	        """
	        source, sr = torchaudio.load(in_path)
	        source = torchaudio.functional.resample(source, sr, 16000)
	        # torchaudio.load yields (channels, frames) by default, so the channel
	        # count is shape[0]; the old test looked at the frame count instead.
	        if source.dim() == 2 and source.shape[0] > 1:
	            source = torch.mean(source, dim=0, keepdim=True)
	        soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
	        # Units are repeated twice along time in infer(), hence 2 F0 frames per unit.
	        _, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0] * 2, tran)
	        return soft, f0

	    def infer(self, speaker_id, tran, raw_path):
	        """Convert one audio clip.

	        Args:
	            speaker_id: Key into ``self.speakers``.
	            tran: Transposition forwarded to :meth:`get_unit_pitch`.
	            raw_path: Path or file-like object readable by ``torchaudio.load``.

	        Returns:
	            ``(audio, n_samples)`` where ``audio`` is a 1-D float tensor.
	        """
	        speaker_id = self.speakers[speaker_id]
	        sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
	        soft, pitch = self.get_unit_pitch(raw_path, tran)
	        f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
	        stn_tst = torch.FloatTensor(soft)
	        with torch.no_grad():
	            x_tst = stn_tst.unsqueeze(0).to(self.device)
	            x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
	            result = self.SVCVITS.infer(x_tst, f0=f0, g=sid)
	            # NOTE(review): some SynthesizerTrn versions return
	            # ``(audio, f0)`` and others just ``audio`` — confirm against
	            # models.py. Either way the waveform is selected BEFORE
	            # indexing; the old ``audio, _ = (...)[0, 0]`` tried to unpack
	            # the waveform samples themselves.
	            if isinstance(result, (tuple, list)):
	                result = result[0]
	            audio = result[0, 0].data.float()
	        return audio, audio.shape[-1]

	    @staticmethod
	    def _as_float32(audio):
	        """Return ``audio`` as float32; integer PCM is divided by its dtype maximum."""
	        audio = np.asarray(audio)
	        if np.issubdtype(audio.dtype, np.integer):
	            return (audio / np.iinfo(audio.dtype).max).astype(np.float32)
	        # Float input is passed through unscaled (np.iinfo rejects float dtypes).
	        return audio.astype(np.float32)

	    @staticmethod
	    def _to_pcm16(pieces):
	        """Concatenate float chunks and convert them to clipped int16 samples."""
	        if not pieces:
	            return np.zeros(0, dtype='int16')
	        samples = np.concatenate(pieces) * 32768.0
	        # Clip first: a sample at or beyond +1.0 would otherwise overflow int16.
	        return np.clip(samples, -32768, 32767).astype('int16')

	    def inference(self, srcaudio, chara, tran, slice_db):
	        """Convert a whole recording, chunk by chunk.

	        Args:
	            srcaudio: ``(sampling_rate, samples)`` tuple; ``samples`` is a
	                NumPy array, multi-channel data being ``(frames, channels)``.
	            chara: Speaker key passed to :meth:`infer`.
	            tran: Transposition passed to :meth:`infer`.
	            slice_db: Threshold given to ``slicer.cut`` as ``db_thresh``.

	        Returns:
	            ``(hps.data.sampling_rate, int16_samples)``.
	        """
	        sampling_rate, audio = srcaudio
	        audio = self._as_float32(audio)
	        if len(audio.shape) > 1:
	            audio = librosa.to_mono(audio.transpose(1, 0))
	        if sampling_rate != 16000:
	            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
	        # NOTE(review): fixed file name in the working directory; calls that
	        # overlap in time would overwrite each other's input.
	        soundfile.write("tmpwav.wav", audio, 16000, format="wav")
	        chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
	        audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)
	        pieces = []
	        for slice_tag, data in audio_data:
	            if slice_tag:
	                # Tagged chunks are not synthesised; emit zeros of the
	                # equivalent duration at the output sampling rate.
	                length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
	                pieces.append(np.zeros(length))
	            else:
	                raw_path = io.BytesIO()
	                soundfile.write(raw_path, data, audio_sr, format="wav")
	                raw_path.seek(0)
	                out_audio, _ = self.infer(chara, tran, raw_path)
	                pieces.append(out_audio.cpu().numpy())
	        return (self.hps.data.sampling_rate, self._to_pcm16(pieces))