|
from tqdm import tqdm
|
|
from commons.Configs import configs
|
|
from commons.File import file
|
|
from commons.OpenAIClient import openaiClient
|
|
from commons.SpacyUtils import spacyUtils
|
|
|
|
|
|
class Dataset:
    """Generate a question/answer dataset from a text file and load it back.

    Both generation and loading use ``configs.generatedDatasetPath`` as the
    dataset location.
    """

    def __init__(self, debug=False):
        """Remember whether debug output is wanted.

        Args:
            debug: When true, ``generateDatasetFromFile`` prints every
                generated question/answer pair next to its source sentence.
        """
        self.debug = debug

    def _buildPersonPrompt(self, sentence):
        """Return the "generateQuestionsPerson" prompt for one sentence.

        The person-related placeholders are read from ``configs``; only the
        SENTENCE placeholder changes from call to call.
        """
        placeholders = dict(
            NAME=configs.PROMPT_PERSON_NAME,
            SOCIALNAME=configs.PROMPT_PERSON_SOCIALNAME,
            TITLE=configs.PROMPT_PERSON_TITLE,
            HESHEIT=configs.PROMPT_PERSON_HESHEIT,
            BIRTHDAY=configs.PROMPT_PERSON_BIRTHDAY,
            DEATHDAY=configs.PROMPT_PERSON_DEATHDAY,
            BIRTHPLACE=configs.PROMPT_PERSON_BIRTHPLACE,
            DEATHPLACE=configs.PROMPT_PERSON_DEATHPLACE,
            NUMBER_OF_QUESTIONS=configs.PROMPT_PERSON_NUMBER_OF_QUESTIONS,
            SENTENCE=sentence,
        )
        return openaiClient.buildPrompt("generateQuestionsPerson", placeholders)

    def generateDatasetFromFile(self, inputFile):
        """Build the dataset from ``inputFile`` and write it to disk.

        The file is read with ``file.readFile``, split into sentences with
        ``spacyUtils.splitSentences``, and each sentence is turned into a
        prompt that is passed to ``openaiClient.generateSyntheticQuestions``.
        Everything returned for every sentence is collected into one list,
        which is written to ``configs.generatedDatasetPath``.

        Args:
            inputFile: Path of the text file to read.
        """
        # Resolve the destination up front, before any work is done.
        targetPath = configs.generatedDatasetPath
        collectedRows = []

        print("Reading input file: ", inputFile)
        sourceText = file.readFile(inputFile)

        print("Generating questions and answers for each sentence")
        sentences = spacyUtils.splitSentences(sourceText)
        for sentence in tqdm(sentences):
            generated = openaiClient.generateSyntheticQuestions(
                self._buildPersonPrompt(sentence), debugSentence=sentence)
            collectedRows.extend(generated)

            if not self.debug:
                continue
            # Debug mode: echo each pair together with its source sentence.
            # Rows are indexed by 'question' and 'answer' here.
            for row in generated:
                print("Sentence: ", sentence)
                print("Q: ", row['question'])
                print("A: ", row['answer'])

        print("Writing dataset to file: ", targetPath)
        file.writeFile(targetPath, collectedRows)

    def loadDataset(self):
        """Return the content of ``configs.generatedDatasetPath``.

        The file is read through ``file.readJsonFile`` and its result is
        returned unchanged.
        """
        datasetPath = configs.generatedDatasetPath
        return file.readJsonFile(datasetPath)
|
|
|
|
|
|
# Module-level Dataset instance, created at import time with the default
# debug=False.
dataset = Dataset()
|
|
|