|
import gradio as gr |
|
from transformers import pipeline, AutoTokenizer |
|
from sentence_transformers import SentenceTransformer, util |
|
import math |
|
|
|
|
|
# Maps the language name shown in the UI to the checkpoint id that
# get_translator() hands to the translation pipeline.
# NOTE(review): the ids are used verbatim as model names; confirm that each
# one is actually published before relying on it.
translation_models = dict(
    Vietnamese="Helsinki-NLP/opus-mt-en-vi",
    Japanese="Helsinki-NLP/opus-mt-en-jap",
    Thai="Helsinki-NLP/opus-mt-en-tha",
    Spanish="Helsinki-NLP/opus-mt-en-es",
)
|
|
|
|
|
# Checkpoint shared by the summarization pipeline and its tokenizer.
model_name = "sshleifer/distilbart-cnn-12-6"

# Tokenizer for the same checkpoint; split_text() uses it to cut long inputs
# into chunks before they are summarized.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Summarization pipeline called once per chunk by summarize_text().
summarizer = pipeline(task="summarization", model=model_name)
|
|
|
|
|
# Translation pipelines that have already been built, keyed by checkpoint id.
# Keys only ever come from the values of translation_models, so the cache
# cannot grow beyond the number of configured languages.
_translator_cache = {}


def get_translator(language):
    """Return the translation pipeline for *language*, or ``None``.

    Args:
        language: A key of ``translation_models`` (e.g. ``"Spanish"``).

    Returns:
        The translation pipeline for the configured checkpoint, or ``None``
        when *language* has no entry in ``translation_models``.

    The pipeline is created on first use and reused afterwards, so the model
    is not reloaded on every request.
    """
    checkpoint = translation_models.get(language)
    if not checkpoint:
        return None

    translator = _translator_cache.get(checkpoint)
    if translator is None:
        translator = pipeline("translation", model=checkpoint)
        _translator_cache[checkpoint] = translator
    return translator
|
|
|
|
|
# Sentence-embedding model, created on first use and reused across calls.
_sentence_model = None


def _get_sentence_model():
    """Load the sentence-embedding model once and return the shared instance."""
    global _sentence_model
    if _sentence_model is None:
        _sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    return _sentence_model


def generate_bullet_points(text):
    """Condense *text* into a newline-separated list of ``- `` bullets.

    The text is split on ``'. '``, the resulting sentences are embedded and
    grouped with ``util.community_detection`` (similarity threshold 0.75), and
    the first sentence of each group becomes one bullet.

    Args:
        text: The text to condense (in this file, the generated summary).

    Returns:
        One ``"- <sentence>"`` line per group joined with ``"\\n"``, or an
        empty string when *text* contains no non-blank sentence.
    """
    if not text:
        return ""

    # Drop blank fragments so they neither reach the encoder nor turn into
    # empty "- " bullets.
    sentences = [part.strip() for part in text.split('. ')]
    sentences = [sentence for sentence in sentences if sentence]
    if not sentences:
        return ""

    embeddings = _get_sentence_model().encode(sentences, convert_to_tensor=True)

    # min_community_size defaults to 10, which rejects every group smaller
    # than that; a summary of a handful of sentences would then produce no
    # bullets at all. Allowing singleton groups keeps every distinct sentence.
    clusters = util.community_detection(
        embeddings,
        threshold=0.75,
        min_community_size=1,
    )

    bullet_points = [sentences[cluster[0]] for cluster in clusters if cluster]
    return "\n".join(f"- {point}" for point in bullet_points)
|
|
|
|
|
def split_text(text, max_tokens=1024):
    """Split *text* into chunks that fit a *max_tokens* model window.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk, counting the special
            tokens the tokenizer adds when a chunk is encoded on its own.

    Returns:
        A list of decoded chunk strings in document order; an empty list when
        *text* produces no tokens.
    """
    # Encode without special tokens so they neither occupy chunk slots nor
    # have to be stripped again when decoding.
    encoded = tokenizer(text, add_special_tokens=False, truncation=False)
    token_ids = encoded['input_ids']

    # Each chunk is re-encoded as a standalone input later on, which adds the
    # tokenizer's special tokens again; reserve room for them so a full chunk
    # still fits inside max_tokens.
    window = max(1, max_tokens - tokenizer.num_special_tokens_to_add())

    return [
        tokenizer.decode(token_ids[start:start + window], skip_special_tokens=True)
        for start in range(0, len(token_ids), window)
    ]
|
|
|
|
|
def summarize_text(text):
    """Summarize *text* chunk by chunk and join the partial summaries.

    Args:
        text: The text to summarize; it may be longer than the model window.

    Returns:
        The chunk summaries joined with single spaces, or an empty string when
        *text* is empty or only whitespace.
    """
    if not text or not text.strip():
        return ""

    summaries = []
    for chunk in split_text(text):
        if not chunk.strip():
            # Nothing to summarize in a blank chunk.
            continue
        result = summarizer(
            chunk,
            max_length=150,
            min_length=40,
            do_sample=False,
            # Decoding and re-encoding a chunk does not guarantee the same
            # token count, so let the pipeline clip anything that still
            # exceeds the model's input limit instead of failing.
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])

    return " ".join(summaries)
|
|
|
|
|
def translate_text(text, language):
    """Translate *text* into *language*, keeping its line layout.

    Args:
        text: The text to translate; may span several lines, each optionally
            starting with a ``"- "`` bullet marker.
        language: A language name understood by ``get_translator``.

    Returns:
        The translated text, or *text* unchanged when ``get_translator``
        returns no translator for *language*.
    """
    translator = get_translator(language)
    if not translator:
        return text

    # Translate line by line: sending the whole bullet list as one sequence
    # would not preserve the line breaks, so the bullet layout would be lost.
    translated_lines = []
    for line in text.split("\n"):
        # Keep the bullet marker out of the model input and re-attach it.
        marker = "- " if line.startswith("- ") else ""
        content = line[len(marker):]
        if content.strip():
            content = translator(content)[0]['translation_text']
        translated_lines.append(marker + content)

    return "\n".join(translated_lines)
|
|
|
def process_text(input_text, language):
    """Run the summarize -> bullet points -> translate chain for the UI.

    Args:
        input_text: The raw text entered by the user.
        language: The target language selected in the dropdown.

    Returns:
        A ``(bullet_points, translated_text)`` tuple of strings. Both are
        empty when *input_text* is empty, ``None`` or only whitespace.
    """
    # Nothing to do for blank input; skip the model calls entirely.
    if not input_text or not input_text.strip():
        return "", ""

    summary = summarize_text(input_text)
    bullet_points = generate_bullet_points(summary)
    translated_text = translate_text(bullet_points, language)
    return bullet_points, translated_text
|
|
|
|
|
# Gradio UI: one text box and a language dropdown in, two text boxes out
# (the bullet points and their translation), wired to process_text().
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Paste your text here...", lines=10),
        gr.Dropdown(
            # Offer exactly the languages that have a configured translation
            # model, so the dropdown and translation_models cannot drift apart.
            choices=list(translation_models),
            label="Translate to",
            value="Vietnamese",
        ),
    ],
    outputs=[
        gr.Textbox(label="Bullet Points", lines=10),
        gr.Textbox(label="Translated Bullet Points", lines=10),
    ],
    title="Text to Bullet Points and Translation",
    description="Paste any text, and the program will summarize it into bullet points. Optionally, translate the bullet points into Vietnamese, Japanese, Thai, or Spanish.",
)

# Start the web server as soon as this module is executed.
iface.launch()
|
|
|
|
|
|
|
|
|
|