Commit 60121a2
Parent(s): 1b74db6

Using gliner for keyword extraction (#3)

- Using gliner for keyword extraction (66e214b665ada0bf8ce14b58bb0d69527ea87e21)

Co-authored-by: Thakkar Aneri Pareshkumar <AneriThakkar@users.noreply.huggingface.co>
app.py CHANGED
@@ -41,6 +41,9 @@ from email.mime.base import MIMEBase
 from email.mime.application import MIMEApplication
 from email import encoders
 # '------------------'
+from gliner import GLiNER
+# -------------------
+
 print("***************************************************************")
 
 st.set_page_config(
@@ -323,14 +326,24 @@ def segment_text(text, max_segment_length=700, batch_size=7):
 # Function to extract keywords using combined techniques
 def extract_keywords(text, extract_all):
     try:
+        gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
+        labels = ["person", "organization", "email", "Award", "Date", "Competitions", "Teams", "location", "percentage", "money"]
+        entities = gliner_model.predict_entities(text, labels, threshold=0.7)
+
+        gliner_keywords = list(set([ent["text"] for ent in entities]))
+        print(f"Gliner keywords:{gliner_keywords}")
+        # Use Only Gliner Entities
+        if extract_all is False:
+            return list(gliner_keywords)
+
         doc = nlp(text)
         spacy_keywords = set([ent.text for ent in doc.ents])
         spacy_entities = spacy_keywords
         print(f"\n\nSpacy Entities: {spacy_entities} \n\n")
 
-        #
-        if extract_all is False:
-            return list(spacy_entities)
+        #
+        # if extract_all is False:
+        #     return list(spacy_entities)
 
         # Use RAKE
         rake = Rake()
@@ -347,7 +360,7 @@ def extract_keywords(text, extract_all):
         print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
 
         # Combine all keywords
-        combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
+        combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords).union(gliner_keywords)
 
         return list(combined_keywords)
     except Exception as e:
@@ -427,9 +440,12 @@ def map_keywords_to_sentences(text, keywords, context_window_size):
         for i, sentence in enumerate(sentences):
             if keyword in sentence:
                 # Combine current sentence with surrounding sentences for context
-                start = max(0, i - context_window_size)
-                end = min(len(sentences), i + context_window_size + 1)
-                context = ' '.join(sentences[start:end])
+                # start = max(0, i - context_window_size)
+                # end = min(len(sentences), i + context_window_size + 1)
+                start = max(0,i - context_window_size)
+                context_sentenses = sentences[start:i+1]
+                context = ' '.join(context_sentenses)
+                # context = ' '.join(sentences[start:end])
                 if keyword not in keyword_sentence_mapping:
                     keyword_sentence_mapping[keyword] = context
                 else:
@@ -647,8 +663,8 @@ def main():
         text = clean_text(text)
         with st.expander("Show text"):
             st.write(text)
-        generate_questions_button = st.button("Generate Questions")
-        st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
+        generate_questions_button = st.button("Generate Questions",help="This is the generate questions button")
+        # st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
 
         # if generate_questions_button:
         if generate_questions_button and text:
|