# kotoba-speech — fam/llm/decoders.py
# Author: yuta0306 — "first commit" (565faca)
import os
import pathlib
import uuid
from abc import ABC, abstractmethod
from typing import Callable, Optional, Union
import julius
import torch
from audiocraft.data.audio import audio_read, audio_write
from audiocraft.models import MultiBandDiffusion # type: ignore
from IPython import embed
class Decoder(ABC):
    """Abstract interface for turning model token sequences into audio."""

    @abstractmethod
    def decode(self, tokens: list[int], ref_audio_path: Optional[str] = None, causal: Optional[bool] = None):
        """Decode ``tokens`` into audio; concrete subclasses must override.

        Args:
            tokens: flat sequence of model token ids.
            ref_audio_path: optional reference audio for conditioning.
            causal: implementation-specific decoding mode flag.
        """
        raise NotImplementedError
class EncodecDecoder(Decoder):
    """Decode (text + audio) token streams to 24 kHz waveforms via MultiBandDiffusion.

    The EnCodec codec inside MultiBandDiffusion is used for audio tokenisation
    (:meth:`get_tokens`) and the diffusion model for waveform synthesis
    (:meth:`decode`). Synthesised files are written under ``output_dir``.
    """

    def __init__(
        self,
        tokeniser_decode_fn: Callable[[list[int]], str],
        data_adapter_fn: Callable[[list[list[int]]], tuple[list[int], list[list[int]]]],
        output_dir: str,
    ):
        """
        Args:
            tokeniser_decode_fn: maps text-token ids back to a string.
            data_adapter_fn: splits a raw token stream into
                ``(text_ids, audio_ids)``.
            output_dir: directory for synthesised audio (created if missing).
        """
        self._mbd_bandwidth = 6  # kbps; 1.5 is the lower-bandwidth alternative
        self._mbd_sample_rate = 24_000
        self._end_of_audio_token = 1024
        self._num_codebooks = 8
        self.mbd = MultiBandDiffusion.get_mbd_24khz(bw=self._mbd_bandwidth)

        self.tokeniser_decode_fn = tokeniser_decode_fn
        self._data_adapter_fn = data_adapter_fn

        self.output_dir = pathlib.Path(output_dir).resolve()
        os.makedirs(self.output_dir, exist_ok=True)

    def _save_audio(self, name: str, wav: torch.Tensor):
        """Write ``wav`` to ``<name>.wav`` with loudness normalisation + compression."""
        audio_write(
            name,
            wav.squeeze(0).cpu(),
            self._mbd_sample_rate,
            strategy="loudness",
            loudness_compressor=True,
        )

    def get_tokens(self, audio_path: str) -> list[list[int]]:
        """
        Utility method to get tokens from audio. Useful when you want to test reconstruction in some form (e.g.
        limited codebook reconstruction or sampling from second stage model only).
        """
        wav, sr = audio_read(audio_path)
        if sr != self._mbd_sample_rate:
            # EnCodec expects audio at the codec's native sample rate.
            wav = julius.resample_frac(wav, sr, self._mbd_sample_rate)
        if wav.ndim == 2:
            wav = wav.unsqueeze(1)  # insert channel dim expected by the codec
        wav = wav.to("cuda")
        tokens = self.mbd.codec_model.encode(wav)
        # encode() returns a nested structure; [0][0] selects the code tensor
        # for the single batch item.
        tokens = tokens[0][0]
        return tokens.tolist()

    def decode(
        self, tokens: list[list[int]], causal: bool = True, ref_audio_path: Optional[str] = None
    ) -> Union[str, torch.Tensor]:
        """Decode a raw token stream into audio tokens or a saved waveform.

        Args:
            tokens: raw (text + audio) token stream from the model.
            causal: if True, return the zero-padded audio-token tensor of shape
                (1, num_codebooks, T) instead of synthesising audio.
            ref_audio_path: unused here; kept for interface compatibility.

        Returns:
            The audio-token tensor when ``causal`` is True; otherwise the path
            of the saved file (without the ``.wav`` suffix that ``audio_write``
            appends).

        Raises:
            Exception: if the synthesised waveform is shorter than 400 ms.
        """
        # TODO: this has strange behaviour -- if causal is True, it returns tokens. if causal is False, it SAVES the audio file.
        text_ids, extracted_audio_ids = self._data_adapter_fn(tokens)
        text = self.tokeniser_decode_fn(text_ids)
        print(f"Text: {text}")

        tokens = torch.tensor(extracted_audio_ids, device="cuda").unsqueeze(0)

        # Zero-pad missing codebooks so the MBD decoder always receives
        # self._num_codebooks code streams.
        missing = self._num_codebooks - tokens.shape[1]
        if missing > 0:
            tokens = torch.cat([tokens] + [torch.zeros_like(tokens[:, :1])] * missing, dim=1)

        if causal:
            return tokens

        with torch.amp.autocast(device_type="cuda", dtype=torch.float32):
            wav = self.mbd.tokens_to_wav(tokens)
            # NOTE: we couldn't just return wav here as it goes through loudness compression etc :)

            # 9600 samples at 24 kHz == 400 ms.
            if wav.shape[-1] < 9600:
                # this causes problem for the code below, and is also odd :)
                # first happened for tokens (1, 8, 28) -> wav (1, 1, 8960) (~320x factor in time dimension!)
                raise Exception("wav predicted is shorter than 400ms!")

            try:
                wav_file_name = self.output_dir / f"synth_{text.replace(' ', '_')[:25]}_{uuid.uuid4()}"
                self._save_audio(wav_file_name, wav)
                print(f"\nSaved audio to {wav_file_name}.wav")
                return wav_file_name
            except Exception as e:
                # Best-effort fallback: the text-derived filename may be
                # unwritable (e.g. characters the filesystem rejects), so retry
                # with a bare UUID name rather than failing the whole decode.
                print(f"Failed to save audio! Reason: {e}")
                wav_file_name = self.output_dir / f"synth_{uuid.uuid4()}"
                self._save_audio(wav_file_name, wav)
                print(f"\nSaved audio to {wav_file_name}.wav")
                return wav_file_name