Spaces:

GitMylo
/

bark-voice-cloning

Running

Mylo

Fix flat audio inputs

388b9f1 over 1 year ago

3.51 kB

	import math
	import os.path
	import uuid

	import gradio
	import numpy
	import torch

	from hubert.hubert_manager import HuBERTManager
	from hubert.pre_kmeans_hubert import CustomHubert
	from hubert.customtokenizer import CustomTokenizer
	from encodec import EncodecModel
	from encodec.utils import convert_audio


	# The three models below are created once at import time and reused by every
	# request. NOTE(review): the make_sure_*_installed helpers presumably fetch
	# the checkpoint if missing and return its path -- confirm in HuBERTManager.
	_hubert_checkpoint = HuBERTManager.make_sure_hubert_installed()
	hubert_model = CustomHubert(_hubert_checkpoint)

	_tokenizer_checkpoint = HuBERTManager.make_sure_tokenizer_installed(
	    model='quantifier_V1_hubert_base_ls960_23.pth'
	)
	# The tokenizer checkpoint is mapped onto the CPU when it is loaded.
	tokenizer_model = CustomTokenizer.load_from_checkpoint(
	    _tokenizer_checkpoint,
	    map_location=torch.device('cpu'),
	)

	# 24 kHz EnCodec model used to produce the coarse/fine prompts.
	encodec_model = EncodecModel.encodec_model_24khz()



	_MAX_PROMPT_SECONDS = 20  # Only this much trailing audio is kept for cloning.


	def _prepare_waveform(sr, wav):
	    """Return *wav* as a 1-D float32 mono tensor holding the last 20 seconds.

	    Args:
	        sr: Sample rate of ``wav`` in Hz.
	        wav: Tensor of samples, either 1-D (mono) or 2-D with a channel axis.

	    Returns:
	        A 1-D float32 tensor of at most ``sr * 20`` samples.

	    Raises:
	        ValueError: If ``wav`` contains no samples.
	    """
	    if wav.numel() == 0:
	        raise ValueError('The audio input is empty.')

	    if wav.is_floating_point():
	        wav = wav.float()
	    else:
	        # Scale any integer PCM width by its own maximum (32767 for int16),
	        # so int32 input is normalised too instead of being left as integers.
	        wav = wav.float() / torch.iinfo(wav.dtype).max

	    if wav.dim() == 2:
	        # Stereo to mono: the shorter axis is taken to be the channel axis,
	        # which covers both (samples, channels) and (channels, samples).
	        channel_axis = 0 if wav.shape[0] < wav.shape[1] else 1
	        wav = wav.mean(channel_axis)

	    # Trim after the mono reduction so the slice always runs along time.
	    return wav[-int(sr * _MAX_PROMPT_SECONDS):]


	def clone(audio, *args):
	    """Create a Bark speaker prompt file from an audio clip.

	    Args:
	        audio: ``(sample_rate, samples)`` pair. ``samples`` is converted with
	            ``torch.tensor`` and may be mono or have a channel axis
	            (presumably the numpy array produced by the gradio 'audio'
	            input -- TODO confirm).
	        *args: Ignored. The interface lists a Markdown component as a second
	            input, so its value is passed here as well.

	    Returns:
	        Path of the ``.npz`` file written under ``data/speakers``. It holds
	        the arrays ``semantic_prompt``, ``fine_prompt`` and ``coarse_prompt``.

	    Raises:
	        ValueError: If no audio was supplied or the clip has no samples.
	    """
	    if audio is None:
	        raise ValueError('No audio was provided.')

	    sr, wav = audio
	    wav = _prepare_waveform(sr, torch.tensor(wav))
	    wav = wav.reshape(1, -1)  # HuBERT is fed a (1, N) batch.

	    # Pure inference: no gradients are needed for any of the models.
	    with torch.no_grad():
	        semantic_vectors = hubert_model.forward(wav, input_sample_hz=sr)
	        semantic_tokens = tokenizer_model.get_token(semantic_vectors)

	        encodec_model.set_target_bandwidth(6.0)
	        wav = convert_audio(wav, sr, encodec_model.sample_rate, 1)
	        encoded_frames = encodec_model.encode(wav.unsqueeze(0))

	    # [B, n_q, T] -> [n_q, T]. Only the batch axis is dropped, so a clip that
	    # encodes to a single frame still yields a 2-D code matrix.
	    codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze(0)

	    os.makedirs('data/speakers', exist_ok=True)
	    file_path = f'data/speakers/{uuid.uuid4().hex}.npz'

	    numpy.savez(
	        file_path,
	        semantic_prompt=semantic_tokens,
	        fine_prompt=codes,
	        coarse_prompt=codes[:2, :],  # First two rows of the code matrix.
	    )

	    return file_path



	# Usage notes shown in the UI. The component is passed as a second *input*
	# of the interface, so its value reaches clone() through *args.
	_cloning_guide = gradio.Markdown(
	    '''
	# Bark text to speech voice cloning
	[Model](https://huggingface.co/GitMylo/bark-voice-cloning/), [Model GitHub](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer), [Webui GitHub](https://github.com/gitmylo/audio-webui)

	For faster creation of voice clones [Duplicate this space](https://huggingface.co/spaces/GitMylo/bark-voice-cloning?duplicate=true)

	Uploaded audio files get cut to 20 seconds in order to keep it fast for everyone. Only the last 20 seconds will be used. (Bark only uses the last 14 seconds anyway)

	## Tips for better cloning
	### Make sure these things are NOT in your voice input: (in no particular order)
	* Noise (You can use a noise remover before)
	* Music (There are also music remover tools) (Unless you want music in the background)
	* A cut-off at the end (This will cause it to try and continue on the generation)
	* Under 1 second of training data (i personally suggest around 10 seconds for good potential, but i've had great results with 5 seconds as well.)

	### What makes for good prompt audio? (in no particular order)
	* Clearly spoken
	* No weird background noises
	* Only one speaker
	* Audio which ends after a sentence ends
	* Regular/common voice (They usually have more success, it's still capable of cloning complex voices, but not as good at it)
	* Around 10 seconds of data
	'''
	)

	# Audio in, generated speaker file out.
	_interface_inputs = ['audio', _cloning_guide]
	iface = gradio.interface.Interface(
	    fn=clone,
	    inputs=_interface_inputs,
	    outputs='file',
	)
	iface.launch()