DevBM AneriThakkar committed on
Commit
60121a2
1 Parent(s): 1b74db6

Using gliner for keyword extraction (#3)

Browse files

- Using gliner for keyword extraction (66e214b665ada0bf8ce14b58bb0d69527ea87e21)


Co-authored-by: Thakkar Aneri Pareshkumar <AneriThakkar@users.noreply.huggingface.co>

Files changed (1) hide show
  1. app.py +25 -9
app.py CHANGED
@@ -41,6 +41,9 @@ from email.mime.base import MIMEBase
41
  from email.mime.application import MIMEApplication
42
  from email import encoders
43
  # '------------------'
 
 
 
44
  print("***************************************************************")
45
 
46
  st.set_page_config(
@@ -323,14 +326,24 @@ def segment_text(text, max_segment_length=700, batch_size=7):
323
  # Function to extract keywords using combined techniques
324
  def extract_keywords(text, extract_all):
325
  try:
 
 
 
 
 
 
 
 
 
 
326
  doc = nlp(text)
327
  spacy_keywords = set([ent.text for ent in doc.ents])
328
  spacy_entities = spacy_keywords
329
  print(f"\n\nSpacy Entities: {spacy_entities} \n\n")
330
 
331
- # Use Only Spacy Entities
332
- if extract_all is False:
333
- return list(spacy_entities)
334
 
335
  # Use RAKE
336
  rake = Rake()
@@ -347,7 +360,7 @@ def extract_keywords(text, extract_all):
347
  print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
348
 
349
  # Combine all keywords
350
- combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords)
351
 
352
  return list(combined_keywords)
353
  except Exception as e:
@@ -427,9 +440,12 @@ def map_keywords_to_sentences(text, keywords, context_window_size):
427
  for i, sentence in enumerate(sentences):
428
  if keyword in sentence:
429
  # Combine current sentence with surrounding sentences for context
430
- start = max(0, i - context_window_size)
431
- end = min(len(sentences), i + context_window_size + 1)
432
- context = ' '.join(sentences[start:end])
 
 
 
433
  if keyword not in keyword_sentence_mapping:
434
  keyword_sentence_mapping[keyword] = context
435
  else:
@@ -647,8 +663,8 @@ def main():
647
  text = clean_text(text)
648
  with st.expander("Show text"):
649
  st.write(text)
650
- generate_questions_button = st.button("Generate Questions")
651
- st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
652
 
653
  # if generate_questions_button:
654
  if generate_questions_button and text:
 
41
  from email.mime.application import MIMEApplication
42
  from email import encoders
43
  # '------------------'
44
+ from gliner import GLiNER
45
+ # -------------------
46
+
47
  print("***************************************************************")
48
 
49
  st.set_page_config(
 
326
  # Function to extract keywords using combined techniques
327
  def extract_keywords(text, extract_all):
328
  try:
329
+ gliner_model = GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5")
330
+ labels = ["person", "organization", "email", "Award", "Date", "Competitions", "Teams", "location", "percentage", "money"]
331
+ entities = gliner_model.predict_entities(text, labels, threshold=0.7)
332
+
333
+ gliner_keywords = list(set([ent["text"] for ent in entities]))
334
+ print(f"Gliner keywords:{gliner_keywords}")
335
+ # Use Only Gliner Entities
336
+ if extract_all is False:
337
+ return list(gliner_keywords)
338
+
339
  doc = nlp(text)
340
  spacy_keywords = set([ent.text for ent in doc.ents])
341
  spacy_entities = spacy_keywords
342
  print(f"\n\nSpacy Entities: {spacy_entities} \n\n")
343
 
344
+ #
345
+ # if extract_all is False:
346
+ # return list(spacy_entities)
347
 
348
  # Use RAKE
349
  rake = Rake()
 
360
  print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")
361
 
362
  # Combine all keywords
363
+ combined_keywords = rake_keywords.union(spacy_keywords).union(tfidf_keywords).union(gliner_keywords)
364
 
365
  return list(combined_keywords)
366
  except Exception as e:
 
440
  for i, sentence in enumerate(sentences):
441
  if keyword in sentence:
442
  # Combine current sentence with surrounding sentences for context
443
+ # start = max(0, i - context_window_size)
444
+ # end = min(len(sentences), i + context_window_size + 1)
445
+ start = max(0,i - context_window_size)
446
+ context_sentenses = sentences[start:i+1]
447
+ context = ' '.join(context_sentenses)
448
+ # context = ' '.join(sentences[start:end])
449
  if keyword not in keyword_sentence_mapping:
450
  keyword_sentence_mapping[keyword] = context
451
  else:
 
663
  text = clean_text(text)
664
  with st.expander("Show text"):
665
  st.write(text)
666
+ generate_questions_button = st.button("Generate Questions",help="This is the generate questions button")
667
+ # st.markdown('<span aria-label="Generate questions button">Above is the generate questions button</span>', unsafe_allow_html=True)
668
 
669
  # if generate_questions_button:
670
  if generate_questions_button and text: