"""Gradio app that summarizes text, extracts bullet points and translates them."""

import time
from functools import lru_cache

import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, pipeline

# Download NLTK data.
# Newer NLTK releases load the "punkt_tab" resource in sent_tokenize instead
# of the pickled "punkt" models, so fetch both to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Translation models
translation_models = {
    'Vietnamese': "Helsinki-NLP/opus-mt-en-vi",
    'Japanese': "Helsinki-NLP/opus-mt-en-jap",
    'Thai': "Helsinki-NLP/opus-mt-en-tha",
    'Spanish': "Helsinki-NLP/opus-mt-en-es"
}

# Summarization models
summarization_models = {
    'Scientific': "facebook/bart-large-cnn",
    'Literature': "google/pegasus-xsum"
}

# Initialize tokenizer (default token counter for split_text)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Upper bound on chunk size, and room left for the special tokens the model
# tokenizer adds around each chunk.
_DEFAULT_MAX_TOKENS = 1024
_SPECIAL_TOKEN_HEADROOM = 2


@lru_cache(maxsize=len(summarization_models) + len(translation_models))
def _load_pipeline(task, model_name):
    """Build a transformers pipeline once and reuse it on later calls.

    Constructing a pipeline loads the model weights, so doing it per request
    is very slow. The cache is bounded by the number of configured models.
    """
    return pipeline(task, model=model_name)


# Helper function to initialize summarization pipeline
def get_summarizer(model_name):
    """Return the (cached) summarization pipeline for ``model_name``."""
    return _load_pipeline("summarization", model_name)


# Initialize translation pipeline
def get_translator(language):
    """Return the (cached) translation pipeline for ``language``.

    Returns ``None`` when ``language`` has no entry in ``translation_models``.
    """
    model_name = translation_models.get(language)
    if model_name:
        return _load_pipeline("translation", model_name)
    return None


# Helper function to split text into chunks
def split_text(text, max_tokens=1024, count_tokenizer=None):
    """Group the sentences of ``text`` into chunks of at most ``max_tokens``.

    Args:
        text: Text to split; sentence boundaries come from ``sent_tokenize``.
        max_tokens: Token budget per chunk (sum of per-sentence token counts).
        count_tokenizer: Tokenizer used for counting; defaults to the
            module-level ``tokenizer``.

    Returns:
        A list of non-empty chunk strings. A single sentence longer than
        ``max_tokens`` becomes its own (oversized) chunk.
    """
    counter = tokenizer if count_tokenizer is None else count_tokenizer
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sent_tokenize(text):
        sentence_length = len(counter.tokenize(sentence))
        # Flush only when there is something to flush; otherwise an oversized
        # first sentence would produce an empty chunk.
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


# Helper function to summarize text
def summarize_text(text, model_name):
    """Summarize ``text`` chunk by chunk with the given model.

    Returns the chunk summaries joined by spaces, or ``""`` when the text is
    missing or shorter than 200 characters. Chunks that fail to summarize are
    logged and skipped.
    """
    if not text or len(text) < 200:  # Adjust the threshold as needed
        print("Input text is too short for summarization. Please provide longer text.")
        return ""

    summarizer = get_summarizer(model_name)

    # Size chunks for the model actually in use: its tokenizer may report a
    # smaller input limit than the 1024-token default.
    model_limit = getattr(summarizer.tokenizer, "model_max_length", None) or _DEFAULT_MAX_TOKENS
    max_tokens = max(1, min(_DEFAULT_MAX_TOKENS, model_limit) - _SPECIAL_TOKEN_HEADROOM)
    chunks = split_text(text, max_tokens=max_tokens, count_tokenizer=summarizer.tokenizer)

    summaries = []
    for chunk in chunks:
        try:
            # truncation=True is a safety net for chunks that still exceed the
            # model limit (e.g. one very long sentence).
            result = summarizer(
                chunk,
                max_length=150,
                min_length=20,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]['summary_text'])
        except Exception as e:
            print(f"Error summarizing chunk: {chunk}\nError: {e}")
    return " ".join(summaries)


# Helper function to translate text
def translate_text(text, language):
    """Translate ``text`` line by line into ``language``.

    Each non-empty line is translated on its own so the line structure of a
    bullet list is kept; a leading ``"- "`` marker is preserved untranslated.
    Returns ``text`` unchanged when the language has no configured model or
    when loading or running the translator fails.
    """
    try:
        translator = get_translator(language)
        if translator is None:
            return text

        translated_lines = []
        for line in text.splitlines():
            prefix = "- " if line.startswith("- ") else ""
            body = line[len(prefix):]
            if not body.strip():
                # Nothing to translate on this line; keep it as-is.
                translated_lines.append(line)
                continue
            translated = translator(body)[0]['translation_text']
            translated_lines.append(prefix + translated)
        return "\n".join(translated_lines)
    except Exception as e:
        print(f"Error translating text: {text}\nError: {e}")
        return text


def process_text(input_text, model, language):
    """Run the summarize -> bullet points -> translate flow for the UI.

    Args:
        input_text: Raw text from the input textbox.
        model: Key of ``summarization_models`` selected in the UI.
        language: Key of ``translation_models`` selected in the UI.

    Returns:
        ``(bullet_points, translated_text)``; both are ``""`` when the model
        selection is invalid, summarization yields nothing, or no bullet
        points could be produced.
    """
    start_time = time.time()
    input_text = input_text or ""
    print(f"Input text: {input_text[:500]}...")  # Show only the first 500 characters for brevity

    # The model selector may arrive as None (nothing selected); avoid a KeyError.
    model_name = summarization_models.get(model)
    if model_name is None:
        print(f"Unknown or missing summarization model: {model!r}.")
        return "", ""

    summary = summarize_text(input_text, model_name)
    if not summary:
        print("Summarization failed. Please provide longer text or try a different model.")
        return "", ""
    print(f"Summary: {summary[:500]}...")  # Show only the first 500 characters for brevity

    bullet_points = generate_bullet_points(summary)
    if not bullet_points:
        print("Bullet points generation failed.")
        return "", ""
    print(f"Bullet Points: {bullet_points}")

    translated_text = translate_text(bullet_points, language)
    print(f"Translated Text: {translated_text}")

    end_time = time.time()
    print(f"Processing time: {end_time - start_time} seconds")
    return bullet_points, translated_text


def generate_bullet_points(summary):
    """Format the first three sentences of ``summary`` as ``- `` bullet lines.

    Returns ``""`` when ``summary`` contains no sentences.
    """
    print("Summary Text:", summary)

    # Extract key sentences
    sentences = sent_tokenize(summary)
    if not sentences:
        return ""

    key_sentences = sentences[:3]  # Extract the first three sentences as key points
    bullet_points = "\n".join(f"- {sentence}" for sentence in key_sentences)
    print("Bullet Points:", bullet_points)
    return bullet_points


# Create Gradio interface
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Paste your text here...", lines=10),
        gr.Radio(
            choices=["Scientific", "Literature"],
            label="Summarization Model",
            value="Scientific",  # default selection, like the language dropdown
        ),
        gr.Dropdown(
            choices=["Vietnamese", "Japanese", "Thai", "Spanish"],
            label="Translate to",
            value="Vietnamese",
        ),
    ],
    outputs=[
        gr.Textbox(label="Bullet Points", lines=10),
        gr.Textbox(label="Translated Bullet Points", lines=10),
    ],
    title="Text to Bullet Points and Translation",
    description="Paste any text, choose the summarization model, and optionally translate the bullet points into Vietnamese, Japanese, Thai, or Spanish.",
)

if __name__ == "__main__":
    iface.launch()