# Gradio app: summarize pasted text into bullet points and optionally
# translate the bullets (Vietnamese / Japanese / Thai / Spanish).
# Standard library
from functools import lru_cache

# Third-party
import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoTokenizer
# Download the NLTK sentence-tokenizer data used by sent_tokenize().
# NLTK >= 3.8.2 looks the tokenizer up under "punkt_tab" (the old "punkt"
# pickle alone raises a LookupError there); older releases still want
# "punkt" — fetch both so the app works on either version. quiet=True
# suppresses the per-download progress chatter; nltk.download returns
# False (it does not raise) when a resource id is unknown, so requesting
# both is safe on every NLTK version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
# Translation models
# Maps the UI language label (Gradio dropdown choice) to a Hugging Face
# Helsinki-NLP / OPUS-MT checkpoint id used by get_translator().
# NOTE(review): verify these ids against the HF hub — e.g. "opus-mt-en-jap"
# and "opus-mt-en-tha" may not be the intended/available checkpoints for
# general-domain Japanese/Thai; confirm before relying on output quality.
translation_models = {
'Vietnamese': "Helsinki-NLP/opus-mt-en-vi",
'Japanese': "Helsinki-NLP/opus-mt-en-jap",
'Thai': "Helsinki-NLP/opus-mt-en-tha",
'Spanish': "Helsinki-NLP/opus-mt-en-es"
}
# Initialize summarization pipeline with a specified model.
# Loading happens once at import time (downloads weights on first run).
# The tokenizer is loaded separately so split_text() can count tokens with
# the same vocabulary the summarization model uses.
model_name = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Initialize translation pipeline
@lru_cache(maxsize=None)
def get_translator(language):
    """Return a translation pipeline for *language*, or None if unsupported.

    Args:
        language: UI label, must be a key of ``translation_models``.

    Returns:
        A Hugging Face ``translation`` pipeline, or None for unknown labels.

    The original rebuilt the pipeline (a full model-weight load from disk
    or the network) on every call, i.e. on every translate request; the
    ``lru_cache`` makes that a one-time cost per language. The key space is
    bounded by the dropdown choices, so an unbounded cache cannot grow.
    """
    model_name = translation_models.get(language)
    if model_name:
        return pipeline("translation", model=model_name)
    return None
# Helper function to split text into chunks
def split_text(text, max_tokens=1024):
inputs = tokenizer(text, return_tensors='pt', truncation=False)
input_ids = inputs['input_ids'][0]
total_tokens = len(input_ids)
chunks = []
start = 0
while start < total_tokens:
end = min(start + max_tokens, total_tokens)
chunk_ids = input_ids[start:end]
chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
chunks.append(chunk_text)
start = end
return chunks
# Helper function to summarize text
def summarize_text(text):
chunks = split_text(text)
summaries = []
for chunk in chunks:
try:
summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
summaries.append(summary)
except Exception as e:
print(f"Error summarizing chunk: {chunk}\nError: {e}")
return " ".join(summaries)
# Helper function to translate text
def translate_text(text, language):
translator = get_translator(language)
if translator:
try:
translated_text = translator(text)[0]['translation_text']
return translated_text
except Exception as e:
print(f"Error translating text: {text}\nError: {e}")
return text
return text
def process_text(input_text, language):
    """Gradio callback: summarize, bulletize, and translate *input_text*.

    Returns a (bullet_points, translated_bullet_points) pair matching the
    two output Textboxes. The prints are debug logging; long texts are
    truncated to 500 characters for readability.
    """
    print(f"Input text: {input_text[:500]}...")  # first 500 chars only
    summary = summarize_text(input_text)
    print(f"Summary: {summary[:500]}...")  # first 500 chars only
    bullets = generate_bullet_points(summary)
    print(f"Bullet Points: {bullets}")
    translated = translate_text(bullets, language)
    print(f"Translated Text: {translated}")
    return bullets, translated
def generate_bullet_points(text):
    """Render each sentence of *text* as a "- " bullet, one per line."""
    print("Original Text:", text)
    sentences = sent_tokenize(text)
    print("Sentences:", sentences)
    # One bullet per sentence, whitespace-trimmed.
    result = "\n".join(f"- {sentence.strip()}" for sentence in sentences)
    print("Bullet Points:", result)
    return result
# Create Gradio interface
# Wires process_text to one text input + language dropdown, producing the
# bullet-point summary and its translation in two output boxes.
iface = gr.Interface(
fn=process_text,
inputs=[
gr.Textbox(label="Input Text", placeholder="Paste your text here...", lines=10),
# Dropdown choices must stay in sync with the translation_models keys.
gr.Dropdown(choices=["Vietnamese", "Japanese", "Thai", "Spanish"], label="Translate to", value="Vietnamese")
],
outputs=[
gr.Textbox(label="Bullet Points", lines=10),
gr.Textbox(label="Translated Bullet Points", lines=10)
],
title="Text to Bullet Points and Translation",
description="Paste any text, and the program will summarize it into bullet points. Optionally, translate the bullet points into Vietnamese, Japanese, Thai, or Spanish."
)
# Blocking call: starts the local web server and serves the UI.
iface.launch()