JohnnyBoy00 committed on
Commit
ee73a88
1 Parent(s): 256a288

Upload preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +23 -0
preprocessing.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Token length the model inputs are padded/truncated to during tokenization.
MAX_INPUT_LENGTH = 256
# Token length the target (label) sequences are padded/truncated to.
MAX_TARGET_LENGTH = 128

def preprocess_function(examples):
    """
    Preprocess entries of the given dataset (should be used with a `map` function)

    Each entry is turned into an input string of the form
    "Answer: <provided_answer> Reference: <reference_answer> Question: <question>"
    and a target string of the form "<score> Feedback: <answer_feedback>".
    Both are tokenized with the module-level `tokenizer`, which is not defined
    in this file and must be available in the global scope before calling.

    Params:
        examples (Dataset): batch to be preprocessed; must provide the columns
            'question', 'provided_answer', 'reference_answer', 'score' and
            'answer_feedback', indexable by column name
    Returns:
        model_inputs (BatchEncoding): tokenized inputs, with the tokenized
            targets' input ids stored under the 'labels' key
    """
    # walk the columns in lockstep instead of indexing each one by position
    columns = zip(
        examples['provided_answer'],
        examples['reference_answer'],
        examples['question'],
        examples['score'],
        examples['answer_feedback'],
    )

    inputs, targets = [], []
    for answer, reference, question, score, feedback in columns:
        inputs.append(f"Answer: {answer} Reference: {reference} Question: {question}")
        targets.append(f"{score} Feedback: {feedback}")

    # apply tokenization to inputs and labels; both are padded/truncated to a fixed length
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, padding='max_length', truncation=True)
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, padding='max_length', truncation=True)

    # NOTE(review): padding token ids are kept in the labels as-is; confirm
    # whether the training setup expects them to be masked out of the loss.
    model_inputs['labels'] = labels['input_ids']

    return model_inputs