vlsp2023-tts / tts.py
hahunavth
add source code
614861a
raw
history blame
2.13 kB
import base64
from abc import ABC, abstractmethod
from gtts import gTTS
from io import BytesIO
import numpy as np
class ExpressiveModel(ABC):
    """
    Interface for TTS models that synthesize speech with an emotion label.

    Concrete models must implement both ``load`` and ``synthesize``.
    """

    @abstractmethod
    def load(self):
        """
        Prepare the model for use.

        What gets loaded is left to the concrete subclass; presumably this
        has to be called before ``synthesize`` -- TODO confirm with the
        implementations.
        """

    @abstractmethod
    def synthesize(self, text: str, emotion: str) -> np.ndarray:
        """
        Synthesize audio for ``text`` with the given emotion.

        :param text: (str) text to speak
        :param emotion: (str) emotion label: neutral | happy | ...
        :return: np.ndarray containing the synthesized audio
        """
class StyleTransferModel(ABC):
    """
    Interface for TTS models that synthesize speech guided by reference audio.

    Concrete models must implement both ``load`` and ``synthesize``.
    """

    @abstractmethod
    def load(self):
        """
        Prepare the model for use.

        What gets loaded is left to the concrete subclass; presumably this
        has to be called before ``synthesize`` -- TODO confirm with the
        implementations.
        """

    @abstractmethod
    def synthesize(self, text: str, ref_audio: np.ndarray) -> np.ndarray:
        """
        Synthesize audio for ``text`` using a reference audio.

        :param text: (str) text to speak
        :param ref_audio: (np.ndarray) reference audio
        :return: np.ndarray containing the synthesized audio
        """
class TTSService:
    """
    Text-to-speech entry points that return audio as a base64 string.

    Intended pipeline (per the original notes): take the input text plus an
    emotion label (str) or a reference audio (np.ndarray), synthesize the
    audio, then convert the audio to base64.  The current implementation is
    a placeholder that delegates synthesis to gTTS.
    """

    @staticmethod
    def _encode_base64(payload: bytes) -> str:
        """Return ``payload`` base64-encoded as a text string."""
        return base64.b64encode(payload).decode('utf-8')

    @staticmethod
    def _speak(text: str, lang: str) -> str:
        """Run gTTS on ``text`` and return the produced audio as base64."""
        # The audio is kept in memory rather than written to a temporary
        # file and read back; the buffer is released on leaving the block.
        with BytesIO() as buffer:
            gTTS(text, lang=lang).write_to_fp(buffer)
            return TTSService._encode_base64(buffer.getvalue())

    @staticmethod
    def synthesis(text: str, *, lang: str = "en") -> str:
        """
        Synthesize ``text`` with gTTS and return the audio as base64.

        :param text: (str) text to speak
        :param lang: (str) language tag passed to gTTS; the default "en"
            matches gTTS's own default, so existing callers are unaffected
        :return: (str) base64-encoded audio bytes as written by gTTS
        """
        return TTSService._speak(text, lang)

    @staticmethod
    def transfer(input_text: str, ref_audio: np.ndarray, *, lang: str = "en") -> str:
        """
        Synthesize ``input_text`` and return the audio as base64.

        NOTE: this is an example implementation.  ``ref_audio`` is accepted
        but not used yet; the text is simply spoken through gTTS.

        :param input_text: (str) text to speak
        :param ref_audio: (np.ndarray) reference audio, currently ignored
        :param lang: (str) language tag passed to gTTS; the default "en"
            matches gTTS's own default, so existing callers are unaffected
        :return: (str) base64-encoded audio bytes as written by gTTS
        """
        # TODO: process ``ref_audio`` with a style-transfer model.  The model
        # output (np.ndarray) would then be turned into bytes with
        # ``tobytes()`` before being base64-encoded.
        return TTSService._speak(input_text, lang)