Spaces:

Kartheesh
/

speech-to-speech

Sleeping

File size: 2,240 Bytes

import fitz  # PyMuPDF
from transformers import VitsModel, MBartForConditionalGeneration, AutoTokenizer
import torch
import soundfile as sf
import gradio as gr

# Hugging Face Hub checkpoint identifiers used by this app.
TRANSLATION_CHECKPOINT = "facebook/mbart-large-50-one-to-many-mmt"
TTS_CHECKPOINT = "facebook/mms-tts-hin"

# Translation stage: mBART-50 tokenizer (slow implementation requested via
# use_fast=False) and the matching conditional-generation model.
translation_tokenizer = AutoTokenizer.from_pretrained(
    TRANSLATION_CHECKPOINT,
    use_fast=False,
)
translation_model = MBartForConditionalGeneration.from_pretrained(
    TRANSLATION_CHECKPOINT,
)

# Speech stage: VITS text-to-speech model and its tokenizer.
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_CHECKPOINT)
tts_model = VitsModel.from_pretrained(TTS_CHECKPOINT)

def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_file: Location of the PDF to read; passed unchanged to
            ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction fails part-way through.
    with fitz.open(pdf_file) as doc:
        return "".join(page.get_text() for page in doc)

def process_pdf(pdf_file):
    """Translate a PDF's text to Hindi and synthesize the result as speech.

    The text is extracted with ``extract_text_from_pdf``, translated with the
    module-level mBART model (target language ``hi_IN``), and converted to a
    waveform with the module-level VITS model.

    Args:
        pdf_file: Location of the PDF to process.

    Returns:
        Path of the WAV file containing the synthesized speech
        (``"output.wav"`` in the current working directory).

    Raises:
        gr.Error: If the PDF contains no extractable text, or if the TTS
            model raises a ``RuntimeError`` during synthesis.
    """
    # Extract text from the PDF; bail out early when there is nothing to read
    # (e.g. an image-only PDF) instead of feeding empty input to the models.
    input_text = extract_text_from_pdf(pdf_file)
    if not input_text.strip():
        raise gr.Error("No extractable text was found in the PDF.")

    # NOTE: the whole document is encoded as one sequence, so truncation=True
    # drops whatever exceeds the tokenizer's maximum input length.
    model_inputs = translation_tokenizer(
        input_text, return_tensors="pt", padding=True, truncation=True
    )

    # Translate to Hindi by forcing the Hindi language code as the first
    # generated token.
    generated_tokens = translation_model.generate(
        **model_inputs,
        forced_bos_token_id=translation_tokenizer.lang_code_to_id["hi_IN"],
    )

    # Decode the translated tokens and join the decoded sequences into one
    # string for the TTS stage.
    translation = translation_tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True
    )
    translated_text = " ".join(translation)

    # Tokenize the translated text for TTS
    tts_inputs = tts_tokenizer(translated_text, return_tensors="pt")

    # Generate the waveform. Failures are raised rather than returned as a
    # string: the caller treats the return value as an audio file path, so an
    # error message returned here would be mistaken for a path.
    try:
        with torch.no_grad():
            tts_output = tts_model(**tts_inputs)
            waveform = tts_output.waveform.squeeze().cpu().numpy()
    except RuntimeError as e:
        raise gr.Error(f"Runtime Error: {e}") from e

    # Write the audio at the rate the TTS model actually produces; a
    # mismatched hard-coded rate changes playback speed and pitch.
    audio_path = "output.wav"
    sf.write(audio_path, waveform, tts_model.config.sampling_rate)

    return audio_path

def gradio_interface(pdf_file):
    """Gradio callback: convert an uploaded PDF into a speech audio file.

    Args:
        pdf_file: Value delivered by the ``gr.File`` input. Accepted forms
            are a filesystem path string, or an object exposing the path as
            its ``name`` attribute. ``None`` means nothing was uploaded.

    Returns:
        Path of the generated audio file, as returned by ``process_pdf``.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF file.")

    # A plain string is already the path; otherwise read it from ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    return process_pdf(pdf_path)

# Build the UI: one uploaded file in, one audio output produced by
# gradio_interface.
iface = gr.Interface(fn=gradio_interface, inputs=gr.File(file_count="single"), outputs="audio")

# Start the app with debug mode enabled.
iface.launch(debug=True)