# NOTE(review): removed web-scrape residue (file-size banner, commit hashes,
# and a line-number gutter) that was accidentally prepended to this file and
# made it invalid Python. The real module starts below.
import streamlit as st
from transformers import MarianTokenizer, MarianMTModel , BertTokenizer, AutoModelForSeq2SeqLM, pipeline
from ar_corrector.corrector import Corrector

import mishkal.tashkeel 

from arabert.preprocess import ArabertPreprocessor


# --- Module-level model initialization -------------------------------------
# All models/tokenizers are loaded once at import time so Streamlit reruns of
# main() reuse them. NOTE(review): first run downloads weights from the
# Hugging Face hub — this block performs network I/O; confirm that is intended.

# Initialize Mishkal vocalizer (adds Arabic diacritics / tashkeel)
vocalizer = mishkal.tashkeel.TashkeelClass()

# Initialize Marian tokenizer and model for translation
# NOTE(review): despite the "en-ar" model name, main() feeds it Arabic input —
# verify direction against the model card.
mname = "marefa-nlp/marefa-mt-en-ar"
tokenizer = MarianTokenizer.from_pretrained(mname)
model = MarianMTModel.from_pretrained(mname)

# Initialize BERT tokenizer and model for summarization
model_name = "malmarjeh/mbert2mbert-arabic-text-summarization"
# model_name="" selects the preprocessor's default behavior — TODO confirm
# this matches the summarization model's expected preprocessing.
preprocessor = ArabertPreprocessor(model_name="")
tokenizer_summarization = BertTokenizer.from_pretrained(model_name)
model_summarization = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipeline_summarization = pipeline("text2text-generation", model=model_summarization, tokenizer=tokenizer_summarization)

# Contextual Arabic spelling corrector
corr = Corrector()
def main():
    """Render the U3reb demo UI: diacritization, translation, correction, and summarization.

    Reads one Arabic text input and, when non-empty, runs it through the four
    module-level models, writing each result under its own subheader.
    """
    st.title("U3reb Demo")

    # Text Input
    input_text = st.text_area("Enter Arabic Text:")

    # Diacritization (the original labeled this "Tokenization", but
    # mishkal's tashkeel() adds diacritics — it does not tokenize).
    st.subheader("Diacritization (Mishkal)")
    if input_text:
        text_mishkal = vocalizer.tashkeel(input_text)
        st.write("Text with diacritics:", text_mishkal)

    # Translation
    st.subheader("Translation")
    if input_text:
        # prepare_seq2seq_batch() is deprecated and removed in modern
        # transformers; calling the tokenizer directly is the supported API.
        batch = tokenizer([input_text], return_tensors="pt")
        translated_tokens = model.generate(**batch)
        # Join the per-sequence decodes into one display string instead of
        # writing a raw one-element list to the UI.
        translated_text = " ".join(
            tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens
        )
        st.write("Translated Text:", translated_text)

    # Arabic Text Correction
    st.subheader("Arabic Text Correction (ar_correct)")
    if input_text:
        corrected_text = corr.contextual_correct(input_text)
        st.write("Corrected Text:", corrected_text)

    # Text Summarization
    st.subheader("Text Summarization")
    if input_text:
        preprocessed_text = preprocessor.preprocess(input_text)
        # NOTE(review): BertTokenizer usually has no eos_token, so
        # eos_token_id may be None here — confirm against the model card.
        result = pipeline_summarization(preprocessed_text,
                                        pad_token_id=tokenizer_summarization.eos_token_id,
                                        num_beams=3,
                                        repetition_penalty=3.0,
                                        max_length=200,
                                        length_penalty=1.0,
                                        no_repeat_ngram_size=3)[0]['generated_text']
        st.write("Summarized Text:", result)


# Standard script entry point: run the Streamlit app only when executed
# directly, not when imported as a module.
if __name__ == "__main__":
    main()