abdalrahmanshahrour committed
Commit
b581cfa
1 Parent(s): 5d711dd
Files changed (1)
  1. summarize.py +0 -171
summarize.py DELETED
@@ -1,171 +0,0 @@
- import logging
- import os
- import re
- from functools import lru_cache
- from urllib.parse import unquote
-
- import streamlit as st
- from codetiming import Timer
- from transformers import pipeline
- from arabert.preprocess import ArabertPreprocessor
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
- import tokenizers
- import re
- import heapq
- from string import punctuation
- import nltk
- from nltk.corpus import stopwords
-
- punctuation = punctuation + '\n'
- logger = logging.getLogger(__name__)
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
- logger.info("Loading models...")
- reader_time = Timer("loading", text="Time: {:.2f}", logger=logging.info)
- reader_time.start()
-
-
- reader_time.stop()
-
-
- logger.info("Finished loading the models...")
- logger.info(f"Time spent loading: {reader_time.last}")
-
- @lru_cache(maxsize=200)
- def get_results(text, model_selected, num_beams, length_penalty, number_of_sentence):
-     logger.info("\n=================================================================")
-     logger.info(f"Text: {text}")
-     logger.info(f"model_selected: {model_selected}")
-     logger.info(f"length_penalty: {length_penalty}")
-     reader_time = Timer("summarize", text="Time: {:.2f}", logger=logging.info)
-     reader_time.start()
-     if model_selected == 'GPT-2':
-         number_of_tokens_limit = 80
-     else:
-         number_of_tokens_limit = 150
-     logger.info(f"input length: {len(text.split())}")
-
-     if model_selected == 'arabartsummarization':
-         model_name = "abdalrahmanshahrour/arabartsummarization"
-         preprocessor = ArabertPreprocessor(model_name="")
-
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-         pipeline1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
-         result = pipeline1(text,
-                            pad_token_id=tokenizer.eos_token_id,
-                            num_beams=num_beams,
-                            repetition_penalty=3.0,
-                            max_length=200,
-                            length_penalty=length_penalty,
-                            no_repeat_ngram_size=3)[0]['generated_text']
-         logger.info('arabartsummarization')
-     elif model_selected == 'AraBART':
-
-         model_name = "abdalrahmanshahrour/AraBART-summ"
-         preprocessor = ArabertPreprocessor(model_name="")
-
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-         pipeline1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
-         result = pipeline1(text,
-                            pad_token_id=tokenizer.eos_token_id,
-                            num_beams=num_beams,
-                            repetition_penalty=3.0,
-                            max_length=200,
-                            length_penalty=length_penalty,
-                            no_repeat_ngram_size=3)[0]['generated_text']
-         logger.info('AraBART')
-
-     elif model_selected == "auto-arabic-summarization":
-
-         model_name = "abdalrahmanshahrour/auto-arabic-summarization"
-         preprocessor = ArabertPreprocessor(model_name="")
-
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-         pipeline1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
-         result = pipeline1(text,
-                            pad_token_id=tokenizer.eos_token_id,
-                            num_beams=num_beams,
-                            repetition_penalty=3.0,
-                            max_length=200,
-                            length_penalty=length_penalty,
-                            no_repeat_ngram_size=3)[0]['generated_text']
-         logger.info('auto-arabic-summarization')
-
-     elif model_selected == 'BERT2BERT':
-
-         model_name = "malmarjeh/bert2bert"
-         preprocessor = ArabertPreprocessor(model_name="")
-
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-         pipeline1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
-         result = pipeline1(text,
-                            pad_token_id=tokenizer.eos_token_id,
-                            num_beams=num_beams,
-                            repetition_penalty=3.0,
-                            max_length=200,
-                            length_penalty=length_penalty,
-                            no_repeat_ngram_size=3)[0]['generated_text']
-         logger.info('BERT2BERT')
-
-     elif model_selected == "xlmroberta2xlmroberta":
-         model_name = "ahmeddbahaa/xlmroberta2xlmroberta-finetune-summarization-ar"
-         preprocessor = ArabertPreprocessor(model_name="")
-
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-         pipeline1 = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
-         result = pipeline1(text,
-                            pad_token_id=tokenizer.eos_token_id,
-                            num_beams=num_beams,
-                            repetition_penalty=3.0,
-                            max_length=200,
-                            length_penalty=length_penalty,
-                            no_repeat_ngram_size=3)[0]['generated_text']
-         logger.info('xlmroberta2xlmroberta')
-
-     elif model_selected == "nltk_summarizer":
-         # number_of_sentence = 3
-         stopWords = set(nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english"))
-         word_frequencies = {}
-         for word in nltk.word_tokenize(text):
-             if word not in stopWords:
-                 if word not in punctuation:
-                     if word not in word_frequencies.keys():
-                         word_frequencies[word] = 1
-                     else:
-                         word_frequencies[word] += 1
-
-         maximum_frequncy = max(list(word_frequencies.values()), default=3)
-
-         for word in word_frequencies.keys():
-             word_frequencies[word] = (word_frequencies[word] / maximum_frequncy)
-
-         sentence_list = nltk.sent_tokenize(text)
-         sentence_scores = {}
-         for sent in sentence_list:
-             for word in nltk.word_tokenize(sent.lower()):
-                 if word in word_frequencies.keys():
-                     if len(sent.split(' ')) < 30:
-                         if sent not in sentence_scores.keys():
-                             sentence_scores[sent] = word_frequencies[word]
-                         else:
-                             sentence_scores[sent] += word_frequencies[word]
-
-         summary_sentences = heapq.nlargest(number_of_sentence, sentence_scores, key=sentence_scores.get)
-
-         result = ' '.join(summary_sentences)
-     else:
-         result = "الرجاء اختيار نموذج"  # "Please choose a model"
-
-     reader_time.stop()
-     logger.info(f"Time spent summarizing: {reader_time.last}")
-
-     return result
-
-
- if __name__ == "__main__":
-     results_dict = ""
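
The five transformer branches in the deleted function differ only in the model checkpoint; everything else (tokenizer, model, pipeline, generation arguments) is repeated verbatim. Below is a minimal sketch of how that logic folds into one helper. It assumes the same Hugging Face checkpoints used in the file above; `MODEL_CHECKPOINTS`, `_load_pipeline`, and `summarize_with_model` are illustrative names, not part of the original code.

```python
from functools import lru_cache

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Checkpoint IDs taken from the deleted file above.
MODEL_CHECKPOINTS = {
    "arabartsummarization": "abdalrahmanshahrour/arabartsummarization",
    "AraBART": "abdalrahmanshahrour/AraBART-summ",
    "auto-arabic-summarization": "abdalrahmanshahrour/auto-arabic-summarization",
    "BERT2BERT": "malmarjeh/bert2bert",
    "xlmroberta2xlmroberta": "ahmeddbahaa/xlmroberta2xlmroberta-finetune-summarization-ar",
}

@lru_cache(maxsize=8)
def _load_pipeline(checkpoint: str):
    # Cache loaded pipelines so repeated calls do not reload the weights.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    return pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def summarize_with_model(text: str, model_selected: str,
                         num_beams: int = 3, length_penalty: float = 1.0) -> str:
    # Same generation arguments as every branch of the deleted elif chain.
    summarizer = _load_pipeline(MODEL_CHECKPOINTS[model_selected])
    return summarizer(text,
                      pad_token_id=summarizer.tokenizer.eos_token_id,
                      num_beams=num_beams,
                      repetition_penalty=3.0,
                      max_length=200,
                      length_penalty=length_penalty,
                      no_repeat_ngram_size=3)[0]["generated_text"]
```

A call like `summarize_with_model(text, "AraBART", num_beams=4, length_penalty=2.0)` would then replace the whole elif chain, and the `lru_cache` also fixes the cost the original code paid by re-downloading and re-instantiating the model on every cache-miss call.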
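The `nltk_summarizer` branch is a classic frequency-based extractive summarizer: weight each non-stopword, non-punctuation token by its normalized frequency, score each sentence by the sum of its token weights, and keep the top `number_of_sentence` sentences. A self-contained sketch of the same algorithm, assuming the NLTK `punkt` and `stopwords` data have already been downloaded (`extractive_summary` is an illustrative name):

```python
import heapq
from string import punctuation

import nltk

def extractive_summary(text: str, num_sentences: int = 3) -> str:
    """Frequency-based extractive summary, as in the deleted nltk_summarizer branch."""
    stop_words = set(nltk.corpus.stopwords.words("arabic") +
                     nltk.corpus.stopwords.words("english"))
    # Count how often each non-stopword, non-punctuation token occurs.
    freq = {}
    for word in nltk.word_tokenize(text):
        if word not in stop_words and word not in punctuation:
            freq[word] = freq.get(word, 0) + 1
    if not freq:
        return ""
    # Normalize counts to [0, 1] so the scale is independent of text length.
    max_freq = max(freq.values())
    freq = {w: c / max_freq for w, c in freq.items()}
    # Score each reasonably short sentence by the sum of its word weights.
    scores = {}
    for sent in nltk.sent_tokenize(text):
        if len(sent.split(' ')) < 30:
            for word in nltk.word_tokenize(sent.lower()):
                if word in freq:
                    scores[sent] = scores.get(sent, 0.0) + freq[word]
    best = heapq.nlargest(num_sentences, scores, key=scores.get)
    return ' '.join(best)
```

Unlike the transformer branches, this path needs no model download, which is why the original app kept it as a lightweight fallback option.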