from functools import lru_cache

import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoTokenizer

# Download the NLTK sentence-tokenizer data used by sent_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases resolve sent_tokenize via punkt_tab

# Opus-MT (MarianMT) translation models, keyed by target language
translation_models = {
    'Vietnamese': "Helsinki-NLP/opus-mt-en-vi",
    'Japanese': "Helsinki-NLP/opus-mt-en-jap",
    'Thai': "Helsinki-NLP/opus-mt-en-tha",
    'Spanish': "Helsinki-NLP/opus-mt-en-es"
}
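# To add another target language, include its Opus-MT checkpoint above and
# list the language in the gr.Dropdown choices at the bottom of this file.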

# Summarization pipeline: DistilBART fine-tuned on CNN/DailyMail. The matching
# tokenizer is reused below to split long inputs into model-sized chunks.
model_name = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline("summarization", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Lazily build and cache one translation pipeline per language, so each
# model is downloaded and initialized only on first use
@lru_cache(maxsize=None)
def get_translator(language):
    model_name = translation_models.get(language)
    if model_name:
        return pipeline("translation", model=model_name)
    return None

# Helper function to split text into model-sized chunks
def split_text(text, max_tokens=1000):
    # Tokenize once, then slice the token ids into fixed-size windows.
    # max_tokens stays below the model's 1024-token limit to leave headroom
    # for the special tokens the pipeline re-adds when encoding each chunk.
    inputs = tokenizer(text, return_tensors='pt', truncation=False)
    input_ids = inputs['input_ids'][0]
    total_tokens = len(input_ids)
    
    chunks = []
    start = 0
    while start < total_tokens:
        end = min(start + max_tokens, total_tokens)
        chunk_ids = input_ids[start:end]
        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
        chunks.append(chunk_text)
        start = end
    
    return chunks

# Helper function to summarize text
def summarize_text(text):
    chunks = split_text(text)
    summaries = []
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)[0]['summary_text']
            summaries.append(summary)
        except Exception as e:
            print(f"Error summarizing chunk: {chunk}\nError: {e}")
    return " ".join(summaries)

# Helper function to translate text line by line, preserving the bullet
# layout and keeping each request well under the translation model's input limit
def translate_text(text, language):
    translator = get_translator(language)
    if not translator:
        return text
    translated_lines = []
    for line in text.split("\n"):
        if not line.strip():
            translated_lines.append(line)
            continue
        try:
            translated_lines.append(translator(line)[0]['translation_text'])
        except Exception as e:
            print(f"Error translating line: {line}\nError: {e}")
            translated_lines.append(line)  # fall back to the untranslated line
    return "\n".join(translated_lines)

# End-to-end pipeline: summarize the input, convert the summary to bullet
# points, then translate the bullets into the selected language
def process_text(input_text, language):
    print(f"Input text: {input_text[:500]}...")  # Show only the first 500 characters for brevity
    summary = summarize_text(input_text)
    print(f"Summary: {summary[:500]}...")  # Show only the first 500 characters for brevity
    bullet_points = generate_bullet_points(summary)
    print(f"Bullet Points: {bullet_points}")
    translated_text = translate_text(bullet_points, language)
    print(f"Translated Text: {translated_text}")
    return bullet_points, translated_text

# Turn a summary into a bulleted list, one sentence per bullet
def generate_bullet_points(text):
    print("Original Text:", text)
    sentences = sent_tokenize(text)
    print("Sentences:", sentences)

    result = "\n".join(f"- {sentence.strip()}" for sentence in sentences)
    print("Bullet Points:", result)

    return result

# Create the Gradio interface: the two inputs map to process_text's parameters,
# and the two outputs map to its (bullet_points, translated_text) return tuple
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(label="Input Text", placeholder="Paste your text here...", lines=10),
        gr.Dropdown(choices=["Vietnamese", "Japanese", "Thai", "Spanish"], label="Translate to", value="Vietnamese")
    ],
    outputs=[
        gr.Textbox(label="Bullet Points", lines=10),
        gr.Textbox(label="Translated Bullet Points", lines=10)
    ],
    title="Text to Bullet Points and Translation",
    description="Paste any text and the app will summarize it into bullet points, then translate those bullet points into Vietnamese, Japanese, Thai, or Spanish."
)

if __name__ == "__main__":
    iface.launch()
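
# Quick smoke test without the UI (hypothetical sample text):
#   bullets, translated = process_text(
#       "Gradio lets you build a web demo around any Python function. "
#       "This app chains summarization, bullet extraction, and translation.",
#       "Spanish",
#   )
#   print(bullets)
#   print(translated)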