# kotoba-speech — fam/llm/decoders.py
# Author: yuta0306 — "first commit" (565faca)
import os
import pathlib
import uuid
from abc import ABC, abstractmethod
from typing import Callable, Optional, Union
import julius
import torch
from audiocraft.data.audio import audio_read, audio_write
from audiocraft.models import MultiBandDiffusion # type: ignore
from IPython import embed
class Decoder(ABC):
    """Abstract interface for turning model token sequences into audio."""

    @abstractmethod
    def decode(self, tokens: list[int], ref_audio_path: Optional[str] = None, causal: Optional[bool] = None):
        """Decode ``tokens`` into audio; concrete subclasses must override.

        Args:
            tokens: flat sequence of model token ids.
            ref_audio_path: optional reference audio for conditioning.
            causal: implementation-specific decoding mode flag.
        """
        raise NotImplementedError
class EncodecDecoder(Decoder):
    """Decode (text + audio) token streams to 24 kHz waveforms via MultiBandDiffusion.

    The EnCodec codec inside MultiBandDiffusion is used for audio tokenisation
    (:meth:`get_tokens`) and the diffusion model for waveform synthesis
    (:meth:`decode`). Synthesised files are written under ``output_dir``.
    """

    def __init__(
        self,
        tokeniser_decode_fn: Callable[[list[int]], str],
        data_adapter_fn: Callable[[list[list[int]]], tuple[list[int], list[list[int]]]],
        output_dir: str,
    ):
        """
        Args:
            tokeniser_decode_fn: maps text-token ids back to a string.
            data_adapter_fn: splits a raw token stream into
                ``(text_ids, audio_ids)``.
            output_dir: directory for synthesised audio (created if missing).
        """
        self._mbd_bandwidth = 6  # kbps; 1.5 is the lower-bandwidth alternative
        self._mbd_sample_rate = 24_000
        self._end_of_audio_token = 1024
        self._num_codebooks = 8
        self.mbd = MultiBandDiffusion.get_mbd_24khz(bw=self._mbd_bandwidth)

        self.tokeniser_decode_fn = tokeniser_decode_fn
        self._data_adapter_fn = data_adapter_fn

        self.output_dir = pathlib.Path(output_dir).resolve()
        os.makedirs(self.output_dir, exist_ok=True)

    def _save_audio(self, name: str, wav: torch.Tensor):
        """Write ``wav`` to ``<name>.wav`` with loudness normalisation + compression."""
        audio_write(
            name,
            wav.squeeze(0).cpu(),
            self._mbd_sample_rate,
            strategy="loudness",
            loudness_compressor=True,
        )

    def get_tokens(self, audio_path: str) -> list[list[int]]:
        """
        Utility method to get tokens from audio. Useful when you want to test reconstruction in some form (e.g.
        limited codebook reconstruction or sampling from second stage model only).
        """
        wav, sr = audio_read(audio_path)
        if sr != self._mbd_sample_rate:
            # EnCodec expects audio at the codec's native sample rate.
            wav = julius.resample_frac(wav, sr, self._mbd_sample_rate)
        if wav.ndim == 2:
            wav = wav.unsqueeze(1)  # insert channel dim expected by the codec
        wav = wav.to("cuda")
        tokens = self.mbd.codec_model.encode(wav)
        # encode() returns a nested structure; [0][0] selects the code tensor
        # for the single batch item.
        tokens = tokens[0][0]
        return tokens.tolist()

    def decode(
        self, tokens: list[list[int]], causal: bool = True, ref_audio_path: Optional[str] = None
    ) -> Union[str, torch.Tensor]:
        """Decode a raw token stream into audio tokens or a saved waveform.

        Args:
            tokens: raw (text + audio) token stream from the model.
            causal: if True, return the zero-padded audio-token tensor of shape
                (1, num_codebooks, T) instead of synthesising audio.
            ref_audio_path: unused here; kept for interface compatibility.

        Returns:
            The audio-token tensor when ``causal`` is True; otherwise the path
            of the saved file (without the ``.wav`` suffix that ``audio_write``
            appends).

        Raises:
            Exception: if the synthesised waveform is shorter than 400 ms.
        """
        # TODO: this has strange behaviour -- if causal is True, it returns tokens. if causal is False, it SAVES the audio file.
        text_ids, extracted_audio_ids = self._data_adapter_fn(tokens)
        text = self.tokeniser_decode_fn(text_ids)
        print(f"Text: {text}")

        tokens = torch.tensor(extracted_audio_ids, device="cuda").unsqueeze(0)

        # Zero-pad missing codebooks so the MBD decoder always receives
        # self._num_codebooks code streams.
        missing = self._num_codebooks - tokens.shape[1]
        if missing > 0:
            tokens = torch.cat([tokens] + [torch.zeros_like(tokens[:, :1])] * missing, dim=1)

        if causal:
            return tokens

        with torch.amp.autocast(device_type="cuda", dtype=torch.float32):
            wav = self.mbd.tokens_to_wav(tokens)
            # NOTE: we couldn't just return wav here as it goes through loudness compression etc :)

            # 9600 samples at 24 kHz == 400 ms.
            if wav.shape[-1] < 9600:
                # this causes problem for the code below, and is also odd :)
                # first happened for tokens (1, 8, 28) -> wav (1, 1, 8960) (~320x factor in time dimension!)
                raise Exception("wav predicted is shorter than 400ms!")

            try:
                wav_file_name = self.output_dir / f"synth_{text.replace(' ', '_')[:25]}_{uuid.uuid4()}"
                self._save_audio(wav_file_name, wav)
                print(f"\nSaved audio to {wav_file_name}.wav")
                return wav_file_name
            except Exception as e:
                # Best-effort fallback: the text-derived filename may be
                # unwritable (e.g. characters the filesystem rejects), so retry
                # with a bare UUID name rather than failing the whole decode.
                print(f"Failed to save audio! Reason: {e}")
                wav_file_name = self.output_dir / f"synth_{uuid.uuid4()}"
                self._save_audio(wav_file_name, wav)
                print(f"\nSaved audio to {wav_file_name}.wav")
                return wav_file_name