Spaces:
Running
Running
File size: 6,204 Bytes
7557bc5 c9ff03b 7557bc5 f7842f6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
from nltk.corpus import stopwords
from rake_nltk import Rake
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from transformers import pipeline
from gliner import GLiNER
from load_models import load_nlp_models
# Load the shared NLP models once, at import time, via the project's
# load_models helper. `nlp` is called on the input text in extract_keywords
# (its result exposes `.ents`); `s2v` is not referenced in the visible part of
# this file.
nlp, s2v = load_nlp_models()
# Tokens that are never useful as keywords. Built once at import time as a
# frozenset so each call only pays for a set difference instead of rebuilding
# (and re-hashing) the whole list.
_UNWANTED_KEYWORDS = frozenset({
    # Common punctuation marks
    '.', ',', '!', '?', ':', ';', '-', '_', '(', ')', '[', ']', '{', '}',
    '/', '\\', '|', '@', '#', '$', '%', '^', '&', '*', '+', '=', '<', '>',
    '`', '~', '"', "'",
    # Common contractions (if not already removed as stopwords)
    "n't", "'s", "'m", "'re", "'ll", "'ve", "'d",
    # Common abbreviations
    'etc', 'eg', 'ie', 'ex', 'vs', 'viz', 'e.g', 'i.e',
    'tbd', 'tba',  # To be determined/announced
    'na', 'n/a',  # Not applicable
    'vol', 'sec', 'pg', 'id', 'ref', 'eq',
    # Single characters (this also covers the roman numerals i, v and x)
    *'abcdefghijklmnopqrstuvwxyz',
    # Roman numerals commonly used in bullet points or references
    'ii', 'iii', 'iv', 'vi', 'vii', 'viii', 'ix', 'xi', 'xii',
    # HTML-related tags (if the text contains any HTML content)
    '<html>', '</html>', '<body>', '</body>', '<head>', '</head>',
    '<div>', '</div>', '<p>', '</p>', '<br>', '<hr>',
    '<h1>', '</h1>', '<h2>', '</h2>', '<h3>', '</h3>',
    # URL fragments and media/file type tokens
    'www', 'com', 'http', 'https', 'ftp', 'pdf', 'doc', 'img', 'gif',
    'jpeg', 'jpg', 'png', 'mp4', 'mp3', 'org', 'net', 'edu',
    # Placeholder names
    'untitled', 'noname', 'unknown', 'undefined',
    # Common file extensions (if filenames are included in the text)
    '.jpg', '.png', '.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls',
    '.xlsx', '.csv', '.txt', '.zip', '.tar', '.gz', '.exe', '.bat', '.sh',
    '.py', '.cpp', '.java',
    # Tokens related to formatting or document structure
    'chapter', 'section', 'figure', 'table', 'appendix',
    # Miscellaneous general noise terms
    'note', 'item', 'items', 'number', 'numbers', 'case', 'cases',
    'example', 'examples', 'type', 'types', 'part', 'parts',
})


def filter_keywords(extracted_keywords):
    """Return the distinct keywords that are not known noise tokens, sorted.

    Args:
        extracted_keywords: Iterable of keyword strings. Duplicates are
            collapsed. Matching against the noise list is exact and
            case-sensitive, so callers should lower-case beforehand if they
            want case-insensitive filtering.

    Returns:
        A new, alphabetically sorted list of the keywords that do not appear
        in ``_UNWANTED_KEYWORDS``.
    """
    return sorted(set(extracted_keywords) - _UNWANTED_KEYWORDS)
def remove_stopwords(keywords):
    """Drop keywords that are English stopwords, keeping the rest unchanged.

    Each keyword is compared in a normalised form (all whitespace removed,
    lower-cased) against NLTK's English stopword list. Keywords that survive
    are returned exactly as they were passed in, in their original order.

    The filter runs in a single pass over the input. Doing the mapping back
    to the original spelling in a second nested loop would be quadratic and
    would return the first keyword twice whenever two inputs share the same
    normalised form (e.g. "New York" and "newyork"), losing the second one.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A list of the input keywords whose normalised form is not a stopword.
    """
    # NOTE(review): assumes the NLTK "stopwords" corpus is installed; the
    # lookup is done per call, so a missing corpus surfaces here.
    stop_words = set(stopwords.words('english'))
    return [
        keyword
        for keyword in keywords
        if ''.join(keyword.split()).lower() not in stop_words
    ]
# def enhanced_ner(text):
# nlp = spacy.load("en_core_web_trf")
# ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
# doc = nlp(text)
# spacy_entities = set((ent.text, ent.label_) for ent in doc.ents)
# hf_entities = set((ent['word'], ent['entity']) for ent in ner_pipeline(text))
# combined_entities = spacy_entities.union(hf_entities)
# keywords = [entity[0] for entity in combined_entities]
# return list(keywords)
_GLINER_MODEL_NAME = "knowledgator/gliner-multitask-large-v0.5"

# Lazily-initialised GLiNER model shared by all extract_keywords calls.
_gliner_model = None


def _get_gliner_model():
    """Return the shared GLiNER model, loading it on first use only."""
    global _gliner_model
    if _gliner_model is None:
        _gliner_model = GLiNER.from_pretrained(_GLINER_MODEL_NAME)
    return _gliner_model


def extract_keywords(text, extract_all):
    """Extract a sorted, de-duplicated list of keywords from ``text``.

    The text is lower-cased and run through GLiNER and the module-level
    ``nlp`` pipeline to collect named entities. Unless ``extract_all`` is
    exactly ``False``, RAKE phrases and TF-IDF vocabulary terms are added as
    well. Every candidate set goes through ``remove_stopwords`` and the union
    goes through ``filter_keywords``.

    Args:
        text: Input string to analyse.
        extract_all: Pass ``False`` to return entity-based keywords only.
            The check is an identity test, so any other value (including
            ``None`` or ``0``) selects the full extraction.

    Returns:
        The list produced by ``filter_keywords`` for the combined candidates.

    Raises:
        Exception: Any failure during extraction is re-raised as a generic
            ``Exception`` with an "Error in keyword extraction" message,
            chained to the original error.
    """
    try:
        # NOTE(review): lower-casing before entity recognition may reduce
        # recall for case-sensitive models; kept because downstream noise
        # filtering is case-sensitive. Confirm this is intended.
        text = text.lower()

        labels = ["person", "organization", "phone number", "address", "email", "date of birth",
                  "mobile phone number", "medication", "ip address", "email address",
                  "landline phone number", "blood type", "digital signature", "postal code",
                  "date"]
        # Reuse the cached model: from_pretrained is far too expensive to
        # repeat on every call.
        entities = _get_gliner_model().predict_entities(text, labels, threshold=0.5)
        gliner_keywords = set(remove_stopwords([ent["text"] for ent in entities]))
        print(f"Gliner keywords:{gliner_keywords}")

        doc = nlp(text)
        spacy_keywords = set(remove_stopwords([ent.text for ent in doc.ents]))
        print(f"\n\nSpacy Entities: {spacy_keywords} \n\n")

        if extract_all is False:
            filtered_results = filter_keywords(spacy_keywords | gliner_keywords)
            print("Keywords returned: ",filtered_results)
            return filtered_results

        rake = Rake()
        rake.extract_keywords_from_text(text)
        rake_keywords = set(remove_stopwords(rake.get_ranked_phrases()))
        print(f"\n\nRake Keywords: {rake_keywords} \n\n")

        # Only the learned vocabulary is needed, so fit() is enough; the
        # TF-IDF matrix itself was never used.
        vectorizer = TfidfVectorizer(stop_words='english')
        try:
            vectorizer.fit([text])
        except ValueError:
            # scikit-learn raises ValueError when the vocabulary is empty
            # (e.g. the text contains only stopwords). TF-IDF then simply
            # contributes no terms instead of aborting the whole extraction.
            tfidf_terms = []
        else:
            tfidf_terms = vectorizer.get_feature_names_out()
        tfidf_keywords = set(remove_stopwords(tfidf_terms))
        print(f"\n\nTFIDF Entities: {tfidf_keywords} \n\n")

        combined_keywords = rake_keywords | spacy_keywords | tfidf_keywords | gliner_keywords
        filtered_results = filter_keywords(combined_keywords)
        print("Keywords returned: ",filtered_results)
        return filtered_results
    except Exception as e:
        # Chain the cause so the original traceback stays attached.
        raise Exception(f"Error in keyword extraction: {str(e)}") from e