# FAcodecV2 / webui.py
import spaces
import gradio as gr
import torch
import torchaudio
import librosa
import numpy as np
import os
from huggingface_hub import hf_hub_download
import yaml
from modules.commons import recursive_munch, build_model
# setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# load model
def load_model(repo_id):
    ckpt_path = hf_hub_download(repo_id, "pytorch_model.bin", cache_dir="./checkpoints")
    config_path = hf_hub_download(repo_id, "config.yml", cache_dir="./checkpoints")
    with open(config_path) as f:
        config = yaml.safe_load(f)
    model_params = recursive_munch(config['model_params'])
    if "redecoder" in repo_id:
        model = build_model(model_params, stage="redecoder")
    else:
        model = build_model(model_params, stage="codec")
    ckpt_params = torch.load(ckpt_path, map_location="cpu")
    for key in model:
        model[key].load_state_dict(ckpt_params[key])
        model[key].eval()
        model[key].to(device)
    return model
# load models
codec_model = load_model("Plachta/FAcodec")
redecoder_model = load_model("Plachta/FAcodec-redecoder")
# preprocess audio
def preprocess_audio(audio_path, sr=24000):
    audio = librosa.load(audio_path, sr=sr)[0]
    # if audio has two channels, take the first one
    if len(audio.shape) > 1:
        audio = audio[0]
    audio = audio[:sr * 180]  # crop to the first 180 seconds
    return torch.tensor(audio).unsqueeze(0).float().to(device)
# audio reconstruction function
@spaces.GPU
@torch.no_grad()
def reconstruct_audio(audio):
    source_audio = preprocess_audio(audio)
    z = codec_model.encoder(source_audio[None, ...])
    z, _, _, _, _ = codec_model.quantizer(z, source_audio[None, ...], n_c=2)
    reconstructed_wave = codec_model.decoder(z)
    return (24000, reconstructed_wave[0, 0].cpu().numpy())
# voice conversion function
@spaces.GPU
@torch.no_grad()
def voice_conversion(source_audio, target_audio):
    source_audio = preprocess_audio(source_audio)
    target_audio = preprocess_audio(target_audio)
    z = codec_model.encoder(source_audio[None, ...])
    z, _, _, _, timbre, codes = codec_model.quantizer(z, source_audio[None, ...], n_c=2, return_codes=True)
    z_target = codec_model.encoder(target_audio[None, ...])
    _, _, _, _, timbre_target, _ = codec_model.quantizer(z_target, target_audio[None, ...], n_c=2, return_codes=True)
    z_converted = redecoder_model.encoder(codes[0], codes[1], timbre_target, use_p_code=False, n_c=1)
    converted_wave = redecoder_model.decoder(z_converted)
    return (24000, converted_wave[0, 0].cpu().numpy())
# gradio interface
def gradio_interface():
    with gr.Blocks() as demo:
        gr.Markdown(
            "# FAcodec reconstruction and voice conversion\n"
            "[![GitHub stars](https://img.shields.io/github/stars/Plachtaa/FAcodec)](https://github.com/Plachtaa/FAcodec)"
        )
        gr.Markdown(
            "FAcodec from [NaturalSpeech 3](https://arxiv.org/pdf/2403.03100). <br>The checkpoint used in this demo is trained with an improved pipeline "
            "that requires no annotations of any kind, enabling the training data to be scaled up. <br>This model is "
            "trained on 50k hours of 24000Hz speech data from over 1 million speakers, with greatly improved timbre diversity compared to "
            "the [original FAcodec](https://huggingface.co/spaces/amphion/naturalspeech3_facodec)."
            "<br><br>This project is supported by [Amphion](https://github.com/open-mmlab/Amphion)."
        )
with gr.Tab("reconstruction"):
with gr.Row():
input_audio = gr.Audio(type="filepath", label="Input audio")
output_audio = gr.Audio(label="Reconstructed audio")
reconstruct_btn = gr.Button("Reconstruct")
reconstruct_btn.click(reconstruct_audio, inputs=[input_audio], outputs=[output_audio])
with gr.Tab("voice conversion"):
with gr.Row():
source_audio = gr.Audio(type="filepath", label="Source audio")
target_audio = gr.Audio(type="filepath", label="Reference audio")
converted_audio = gr.Audio(label="Converted audio")
convert_btn = gr.Button("Convert")
convert_btn.click(voice_conversion, inputs=[source_audio, target_audio], outputs=[converted_audio])
return demo
if __name__ == "__main__":
    iface = gradio_interface()
    iface.launch()
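
# --- Example: using the pipeline without the web UI ---
# A minimal sketch of calling the functions above directly from Python; "source.wav"
# and "reference.wav" are placeholder paths, not files shipped with this repo.
#
#     sr, recon = reconstruct_audio("source.wav")
#     torchaudio.save("reconstructed.wav", torch.from_numpy(recon)[None, :], sr)
#
#     sr, converted = voice_conversion("source.wav", "reference.wav")
#     torchaudio.save("converted.wav", torch.from_numpy(converted)[None, :], sr)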