import soundfile as sf
import torch
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)
# Initialize the model and processor from Hugging Face | |
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") | |
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") | |
def text_to_speech(text): | |
# Process the input text into tokens | |
inputs = processor(text, return_tensors="pt") | |
# Generate speech | |
with torch.no_grad(): | |
speech = model.generate_speech(inputs.input_ids) | |
# Save the generated speech as a WAV file | |
sf.write('output.wav', speech.squeeze().cpu().numpy(), 16000) | |
return "output.wav" | |