import asyncio
import random

import nltk
from nltk.corpus import wordnet
from sentence_transformers import util

from load_models import load_nlp_models, load_llama, load_qa_models
from utils import QuestionGenerationError

nltk.download('wordnet')

# Shared models: spaCy pipeline + sense2vec vectors, the Llama LLM, and the
# sentence-transformer (reused as the context encoder) plus the spell checker.
nlp, s2v = load_nlp_models()
llm = load_llama()
similarity_model, spell = load_qa_models()
context_model = similarity_model

def get_similar_words_sense2vec(word, n=3):
    # Try the word with its most likely part-of-speech tag first
    word_with_pos = word + "|NOUN"
    if word_with_pos in s2v:
        similar_words = s2v.most_similar(word_with_pos, n=n)
        return [w.split("|")[0] for w, _ in similar_words]
    # If not found, try without POS
    if word in s2v:
        similar_words = s2v.most_similar(word, n=n)
        return [w.split("|")[0] for w, _ in similar_words]
    return []

def get_synonyms(word, n=3):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() != word and lemma.name() not in synonyms:
                synonyms.append(lemma.name())
                if len(synonyms) == n:
                    return synonyms
    return synonyms
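
# Hypothetical example: get_synonyms("car", 3) could return lemma names such as
# ['auto', 'automobile', 'machine']. Multi-word WordNet lemmas keep underscores
# (e.g. 'sports_car'), so callers may want a .replace('_', ' ') before showing
# them as answer options.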

def gen_options(answer, context, question):
    prompt = f'''Given the following context, question, and correct answer,
generate 4 incorrect but plausible answer options. The options should be:
1. Contextually related to the given context
2. Grammatically consistent with the question
3. Different from the correct answer
4. Not explicitly mentioned in the given context

Context: {context}
Question: {question}
Correct Answer: {answer}

Provide the options as a semicolon-separated list. Output must contain only the options and nothing else.
'''
    options = [answer]
    response = llm.invoke(prompt, stop=['<|eot_id|>'])
    incorrect_options = [option.strip() for option in response.split(';')]
    options.extend(incorrect_options)
    random.shuffle(options)
    print(options)
    return options
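
# Assumed (hypothetical) response format that the split(';') above relies on:
#   "Golgi apparatus; Ribosomes; Cell wall; Chloroplast"
# Real Llama output may add extra whitespace or stray text, so callers should
# not assume exactly four clean distractors come back.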

def generate_options(answer, context, n=3):
    options = [answer]

    # Rank context words by embedding similarity to the answer
    answer_embedding = context_model.encode(answer)
    context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
    similarity_scores = [util.pytorch_cos_sim(context_model.encode(word), answer_embedding).item() for word in context_words]
    sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
    options.extend(sorted_context_words[:n])

    # Try to get similar words based on sense2vec
    similar_words = get_similar_words_sense2vec(answer, n)
    options.extend(similar_words)

    # If we don't have enough options, try synonyms
    if len(options) < n + 1:
        synonyms = get_synonyms(answer, n - len(options) + 1)
        options.extend(synonyms)

    # If we still don't have enough options, extract other named entities from the context
    if len(options) < n + 1:
        doc = nlp(context)
        entities = [ent.text for ent in doc.ents if ent.text.lower() != answer.lower()]
        options.extend(entities[:n - len(options) + 1])

    # If we still need more options, add some random words from the context
    if len(options) < n + 1:
        options.extend(random.sample(context_words, min(n - len(options) + 1, len(context_words))))

    print(f"\n\nAll Possible Options: {options}\n\n")

    # Keep the correct number of unique options, then shuffle
    options = list(dict.fromkeys(options))[:n + 1]
    random.shuffle(options)
    return options

async def generate_options_async(answer, context, n=3):
    try:
        options = [answer]

        # Rank context words by embedding similarity to the answer,
        # running the encoder calls off the event loop
        answer_embedding = await asyncio.to_thread(context_model.encode, answer)
        context_words = [token.text for token in nlp(context) if token.is_alpha and token.text.lower() != answer.lower()]
        similarity_scores = [util.pytorch_cos_sim(await asyncio.to_thread(context_model.encode, word), answer_embedding).item() for word in context_words]
        sorted_context_words = [word for _, word in sorted(zip(similarity_scores, context_words), reverse=True)]
        options.extend(sorted_context_words[:n])

        # Try to get similar words based on sense2vec
        similar_words = await asyncio.to_thread(get_similar_words_sense2vec, answer, n)
        options.extend(similar_words)

        # If we don't have enough options, try synonyms
        if len(options) < n + 1:
            synonyms = await asyncio.to_thread(get_synonyms, answer, n - len(options) + 1)
            options.extend(synonyms)

        # Keep the correct number of unique options, then shuffle
        options = list(dict.fromkeys(options))[:n + 1]
        random.shuffle(options)
        return options
    except Exception as e:
        raise QuestionGenerationError(f"Error in generating options: {str(e)}") from e
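

# Minimal usage sketch. The models loaded at import time are assumed to be
# available; the context/answer/question strings below are made-up example
# data, not part of the original pipeline.
if __name__ == "__main__":
    sample_context = ("The mitochondrion is an organelle that produces most of "
                      "the cell's ATP through cellular respiration.")
    sample_answer = "mitochondrion"
    sample_question = "Which organelle produces most of the cell's ATP?"

    # Distractors from embeddings, sense2vec, and WordNet (no LLM call)
    print(generate_options(sample_answer, sample_context, n=3))

    # Distractors generated by the Llama model
    print(gen_options(sample_answer, sample_context, sample_question))

    # Async variant on a fresh event loop
    print(asyncio.run(generate_options_async(sample_answer, sample_context, n=3)))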