# vlsp2023-tts / tts.py
import base64
from abc import ABC, abstractmethod
from gtts import gTTS
from io import BytesIO
import numpy as np


class ExpressiveModel(ABC):
    @abstractmethod
    def load(self):
        pass

    @abstractmethod
    def synthesize(self, text: str, emotion: str):
        """
        Synthesize audio conditioned on an emotion label.

        :param text: (str) input text
        :param emotion: (str) neutral | happy | ...
        :return: np.ndarray of audio samples
        """
        pass


class StyleTransferModel(ABC):
    @abstractmethod
    def load(self):
        pass

    @abstractmethod
    def synthesize(self, text: str, ref_audio):
        """
        Synthesize audio in the style of a reference recording.

        :param text: (str) input text
        :param ref_audio: (np.ndarray) reference audio
        :return: np.ndarray of audio samples
        """
        pass
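

# A minimal sketch of a concrete ExpressiveModel, assuming we simply fall back
# to gTTS and ignore the emotion label (gTTS has no emotion control). The class
# name and return convention below are illustrative, not the project's actual
# model implementation.
class DummyExpressiveModel(ExpressiveModel):
    def load(self):
        # gTTS wraps a web service, so there is nothing to load locally.
        pass

    def synthesize(self, text: str, emotion: str):
        # The emotion label is accepted but not used in this placeholder.
        tts = gTTS(text)
        buffer = BytesIO()
        tts.write_to_fp(buffer)
        # Return the raw MP3 bytes wrapped in a numpy array of uint8 values.
        return np.frombuffer(buffer.getvalue(), dtype=np.uint8)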


class TTSService:
    """
    Take input text (str) plus either an emotion label (str) or a reference
    audio (np.ndarray), synthesize audio, and return it as a base64 string.
    """

    @staticmethod
    def synthesis(text: str) -> str:
        tts = gTTS(text)
        # Keep the MP3 output in memory instead of writing a temporary file.
        audio_data = BytesIO()
        tts.write_to_fp(audio_data)
        encoded_audio = base64.b64encode(audio_data.getvalue()).decode('utf-8')
        return encoded_audio

        # Alternative: write the MP3 to disk and read the file back.
        # tts.save("output.mp3")
        # with open("output.mp3", "rb") as audio_file:
        #     audio_data = audio_file.read()
        # encoded_audio = base64.b64encode(audio_data).decode('utf-8')
        # return encoded_audio
    @staticmethod
    def transfer(input_text: str, ref_audio: np.ndarray) -> str:
        # Process the reference audio here, then encode the synthesized
        # np.ndarray output directly, e.g.:
        # tts_output_np_array = np.array([0, 1, 0, 1])
        # tts_output_bytes = tts_output_np_array.tobytes()
        # audio_data = base64.b64encode(tts_output_bytes).decode('utf-8')
        # return audio_data

        # Placeholder example: ignore ref_audio and fall back to plain gTTS.
        tts = gTTS(input_text)
        audio_data = BytesIO()
        tts.write_to_fp(audio_data)
        encoded_audio = base64.b64encode(audio_data.getvalue()).decode('utf-8')
        return encoded_audio
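

# A quick usage sketch, assuming network access for gTTS. It decodes the
# base64 string returned by TTSService back into MP3 bytes and writes them
# to "demo.mp3" (a hypothetical output path, not part of the project).
if __name__ == "__main__":
    encoded = TTSService.synthesis("Hello from the TTS service")
    mp3_bytes = base64.b64decode(encoded)
    with open("demo.mp3", "wb") as f:
        f.write(mp3_bytes)
    print(f"Wrote {len(mp3_bytes)} bytes of MP3 audio to demo.mp3")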