"""Gradio app: summarize pasted text into bullet points and translate them.

Pipeline: input text -> chunked abstractive summary -> one bullet per cluster
of similar summary sentences -> optional translation of each bullet.
"""

import math
import re
from functools import lru_cache
from typing import List, Tuple

import gradio as gr
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, pipeline

# Translation models, keyed by the language name shown in the UI dropdown.
# NOTE(review): confirm that every model ID below exists on the Hugging Face
# Hub; a missing ID makes `pipeline(...)` raise when that language is chosen.
translation_models = {
    'Vietnamese': "Helsinki-NLP/opus-mt-en-vi",
    'Japanese': "Helsinki-NLP/opus-mt-en-jap",
    'Thai': "Helsinki-NLP/opus-mt-en-tha",
    'Spanish': "Helsinki-NLP/opus-mt-en-es"
}

# Initialize summarization pipeline with a specified model
model_name = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sentence-embedding model used to group similar summary sentences.
_SENTENCE_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'

# Marker placed in front of every bullet line.
_BULLET_PREFIX = "- "

# A sentence boundary is whitespace that follows '.', '!' or '?'.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")


@lru_cache(maxsize=None)
def get_translator(language):
    """Return the translation pipeline for *language*, or None if unsupported.

    Results are cached so each translation model is constructed at most once
    per process instead of on every request.
    """
    translator_model = translation_models.get(language)
    if translator_model:
        return pipeline("translation", model=translator_model)
    return None


@lru_cache(maxsize=1)
def _get_sentence_model():
    """Build the sentence-embedding model on first use and reuse it afterwards."""
    return SentenceTransformer(_SENTENCE_MODEL_NAME)


def _split_sentences(text: str) -> List[str]:
    """Split *text* into non-empty, stripped sentences, keeping punctuation."""
    pieces = _SENTENCE_BOUNDARY.split(text.strip())
    return [piece.strip() for piece in pieces if piece.strip()]


def generate_bullet_points(text: str, threshold: float = 0.75) -> str:
    """Turn *text* into a newline-separated list of "- " bullet points.

    Sentences are embedded and grouped with ``util.community_detection``;
    the first member of each group becomes one bullet.

    Args:
        text: Text to convert (here: the summary).
        threshold: Cosine-similarity threshold passed to community detection.

    Returns:
        One "- sentence" line per group, or "" when *text* has no sentences.
    """
    sentences = _split_sentences(text) if text else []
    if not sentences:
        return ""
    if len(sentences) == 1:
        # Nothing to cluster; avoid an embedding pass for a single sentence.
        return f"{_BULLET_PREFIX}{sentences[0]}"

    embeddings = _get_sentence_model().encode(sentences, convert_to_tensor=True)
    # min_community_size defaults to 10, which discards every group for the
    # handful of sentences a summary contains; 1 keeps standalone sentences.
    clusters = util.community_detection(
        embeddings, threshold=threshold, min_community_size=1
    )
    bullet_points = [sentences[cluster[0]] for cluster in clusters if cluster]
    return "\n".join(f"{_BULLET_PREFIX}{point}" for point in bullet_points)


def split_text(text: str, max_tokens: int = 1024) -> List[str]:
    """Split *text* into chunks that fit the summarizer's token budget.

    Args:
        text: Text to split.
        max_tokens: Total token budget per chunk, special tokens included.

    Returns:
        Decoded text chunks in order; [] for empty or whitespace-only text.

    Raises:
        ValueError: If *max_tokens* is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be at least 1")
    if not text or not text.strip():
        return []

    # The summarizer re-tokenizes each chunk and adds special tokens again,
    # so leave room for them instead of filling the whole budget with text.
    reserved = tokenizer.num_special_tokens_to_add()
    budget = max(1, max_tokens - reserved)

    token_ids = tokenizer(
        text, add_special_tokens=False, truncation=False
    )["input_ids"]

    chunks = []
    for start in range(0, len(token_ids), budget):
        window = token_ids[start:start + budget]
        chunk = tokenizer.decode(window, skip_special_tokens=True).strip()
        if chunk:
            chunks.append(chunk)
    return chunks


def summarize_text(text: str, max_length: int = 150, min_length: int = 40) -> str:
    """Summarize *text* chunk by chunk and join the partial summaries.

    Args:
        text: Text to summarize.
        max_length: ``max_length`` forwarded to the summarizer per chunk.
        min_length: ``min_length`` forwarded to the summarizer per chunk.

    Returns:
        The chunk summaries joined by single spaces; "" for empty input.
    """
    summaries = []
    for chunk in split_text(text):
        # truncation=True is a backstop: decoding and re-encoding a chunk is
        # not guaranteed to reproduce exactly the same number of tokens.
        result = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        summaries.append(result[0]['summary_text'].strip())
    return " ".join(summaries)


def translate_text(text: str, language: str) -> str:
    """Translate *text* into *language*, one line at a time.

    Each line is sent to the translator separately and a leading "- " bullet
    marker is re-attached afterwards, so the bullet layout is preserved.

    Args:
        text: Text to translate (here: newline-separated bullets).
        language: A key of ``translation_models``.

    Returns:
        The translated text, or *text* unchanged when it is empty or no
        translator is available for *language*.
    """
    if not text:
        return text
    translator = get_translator(language)
    if translator is None:
        return text

    translated_lines = []
    for line in text.split("\n"):
        prefix = _BULLET_PREFIX if line.startswith(_BULLET_PREFIX) else ""
        content = line[len(prefix):].strip()
        if not content:
            # Keep blank or marker-only lines exactly as they were.
            translated_lines.append(line)
            continue
        result = translator(content, truncation=True)
        translated_lines.append(prefix + result[0]['translation_text'])
    return "\n".join(translated_lines)


def process_text(input_text: str, language: str) -> Tuple[str, str]:
    """Summarize, bullet-point and translate *input_text*.

    Args:
        input_text: Raw text pasted by the user.
        language: Target language chosen in the dropdown.

    Returns:
        ``(bullet_points, translated_bullet_points)``; two empty strings when
        the input is empty or whitespace-only.
    """
    if not input_text or not input_text.strip():
        return "", ""
    summary = summarize_text(input_text)
    bullet_points = generate_bullet_points(summary)
    translated_text = translate_text(bullet_points, language)
    return bullet_points, translated_text


# Create Gradio interface
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Paste your text here...", lines=10),
        # Choices come from the model table so the two cannot drift apart.
        gr.Dropdown(choices=list(translation_models), label="Translate to", value="Vietnamese")
    ],
    outputs=[
        gr.Textbox(label="Bullet Points", lines=10),
        gr.Textbox(label="Translated Bullet Points", lines=10)
    ],
    title="Text to Bullet Points and Translation",
    description="Paste any text, and the program will summarize it into bullet points. Optionally, translate the bullet points into Vietnamese, Japanese, Thai, or Spanish."
)

iface.launch()