import argparse import pickle import random import copy import pandas as pd import numpy as np from collections import Counter import os from data_preprocessor import DataPreprocessor def prepare_pretraining_files(data_processor, options): # steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") val_file = open(options.val_file_path, "w") val_info = open(options.val_info_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): # if options.workspace_name == section: if "ratio_proportion_change3" == section: for student, student_groups in section_groups.groupby("Anon Student Id"): # step_names_token = [step for step in prob_groups['Step Name'] if str(step) != 'nan'] # print(step_names_token) # writtenTrain = False # writtenTest = False student_groups.sort_values(by="Time", inplace=True) # prob_list = list(pd.unique(student_groups["Problem Name"])) # print(len(prob_list), prob_list) # first_prob_list = prob_list[:3] # last_prob_list = prob_list[-3:] # print(len(first_prob_list), first_prob_list) # print(len(last_prob_list), last_prob_list) # final_prob_list = first_prob_list + last_prob_list # print(len(prob_list), len(final_prob_list), final_prob_list) for prob, prob_groups in student_groups.groupby("Problem Name"): # For first 3 and last 3 only # if not prob in final_prob_list: # continue # print(prob) actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups["Step Name"])) unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)])) if unique_steps_len < 4: continue time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 1800: time_stamps_list.add(time_stamps[i+1]) # progress = "" step_names_token = [] outcome = [] help_level = [] auto_complete = False means_and_extremes = False for index, row in prob_groups[['Time', 'Step Name', 'CF (Etalon)', 'Outcome', 'Help Level','CF (Workspace Progress Status)']].iterrows(): step = row["Step Name"] progress = row["CF (Workspace Progress Status)"] etalon = row["CF (Etalon)"] if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: # if row["Time"] in time_stamps_list: auto_complete = True # print(row) continue if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)])) # 4 and more in sequence if step_names_token and unique_steps_len > 4: # and len(step_names_token) > 3 # For information # indices = 
[str(i) for i in prob_groups.index] where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") proba = random.random() # if prob in first_prob_list: if proba <= 0.8: # writtenTrain = True train_file.write("\t".join(step_names_token)) train_file.write("\n") # train_info.write(",".join([str(progress),str(prob), str(student), str(len(step_names_token)), # "\t".join(map(str, outcome)), "\t".join(map(str, help_level))])) # progress, problem name, student id, auto_complete, total steps length, er or me, outcome seq, help_level seq, encoding in steps length train_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), f"{1 if means_and_extremes else 0}", "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))])) train_info.write("\n") elif proba > 0.9: # elif prob in last_prob_list: # writtenTest = True test_file.write("\t".join(step_names_token)) test_file.write("\n") # test_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), # "\t".join(map(str, outcome)), "\t".join(map(str, help_level))])) # progress, problem name, student id, total steps length, er or me, outcome seq, help_level seq, encoding in steps length test_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), f"{1 if means_and_extremes else 0}", "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))])) test_info.write("\n") else: val_file.write("\t".join(step_names_token)) val_file.write("\n") # test_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), # "\t".join(map(str, outcome)), "\t".join(map(str, help_level))])) # progress, problem name, student id, total steps length, er or me, outcome seq, help_level seq, encoding in steps length val_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), f"{1 if means_and_extremes else 0}", "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))])) val_info.write("\n") # Indicates actions of next student # Indicates next problem # if writtenTrain: # train_file.write("\n") # train_info.write("\n") # if writtenTest: # test_file.write("\n") # test_info.write("\n") # if not writtenTrain and not writtenTest: # print(f"Student {student} is not involved in workspace : {options.workspace_name}.") train_file.close() train_info.close() val_file.close() val_info.close() test_file.close() test_info.close() def prepare_school_pretraining_files(data_processor, options): # steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator(sep=",") train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") val_file = open(options.val_file_path, "w") val_info = open(options.val_info_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") for chunk_data in chunk_iterator: for school, school_group in chunk_data.groupby('CF (Anon School Id)'): if not options.school or school in options.school: for class_id, class_group in school_group.groupby('CF (Anon Class Id)'): for student, student_groups in class_group.groupby("Anon Student Id"): student_groups.sort_values(by="Time") # 
prob_list = list(pd.unique(student_groups["Problem Name"])) for prob, prob_groups in student_groups.groupby("Problem Name"): # For first 3 and last 3 only # if not prob in final_prob_list: # continue # print(prob) step_names_token = [] means_and_extremes = False for index, row in prob_groups[['Time', 'Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'CF (Workspace Progress Status)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): progress = row["CF (Workspace Progress Status)"] action = row["Action"] attempt = row["Attempt At Step"] autofilled = row["CF (Is Autofilled)"] step = row["Step Name"] scenario = row['CF (Problem Scenario Tags)'] if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: etalon = row["CF (Etalon)"] if not pd.isna(etalon): etalon = etalon.strip('{}') key, value = etalon.split('=') etalon = value try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True except Exception as e: pass if not autofilled: new_step = f"{step}:{action}:{attempt}" step_names_token.append(new_step) if step_names_token: where_opt = [] step1 = False step2 = False strategy_data = False for step_oh in step_names_token: step = step_oh.split(":") if len(step) == 3: step = step[0] else: step = ":".join(step[:2]) # print(f"changed {step_oh} = ? {step}") if step == options.opt_step1[0]: where_opt.append("_1") step1 = True elif step == options.opt_step2[0]: where_opt.append("_2") step2 = True elif step in options.opt_step1[1:]: where_opt.append("1") if step1: strategy_data = True elif step in options.opt_step2[1:]: where_opt.append("2") if step2: strategy_data = True else: where_opt.append("0") if strategy_data and step_names_token[-1].split(":")[-2] != "Done": strategy_data = False if strategy_data: proba = random.random() step_names_tokens = [":".join(s.split(":")[:-2]) for s in step_names_token] step_names_token = [] for s in step_names_tokens: if s != "nan": if not step_names_token or s != step_names_token[-1]: step_names_token.append(s) # if prob in first_prob_list: if proba <= 0.8: train_file.write("\t".join(step_names_token)) train_file.write("\n") # school, class, student id, progress, problem name, scenario, prefered ER or ME, total steps length, encoding in steps length train_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))])) train_info.write("\n") elif proba > 0.9: # elif prob in last_prob_list: test_file.write("\t".join(step_names_token)) test_file.write("\n") # school, class, student id, progress, problem name, scenario, prefered ER or ME, total steps length, encoding in steps length test_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))])) test_info.write("\n") else: val_file.write("\t".join(step_names_token)) val_file.write("\n") # school, class, student id, progress, problem name, scenario, prefered ER or ME, total steps length, encoding in steps length val_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))])) val_info.write("\n") # break # break # break # break # break train_file.close() train_info.close() val_file.close() val_info.close() 
test_file.close() test_info.close() def prepare_school_coded_pretraining_files(data_processor, options): # steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator(sep=",") train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") val_file = open(options.val_file_path, "w") val_info = open(options.val_info_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") for chunk_data in chunk_iterator: for school, school_group in chunk_data.groupby('CF (Anon School Id)'): if not options.school or school in options.school: print(f"{school} : {school_group.shape}") school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & (school_group['CF (Encounter)'] == 0) & (school_group['CF (Is Review Mode)'] == -1) ] print(f"{school} : {school_group.shape}") # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): for student, student_groups in school_group.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) # At least 3 last problems are selected prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"]) prob_list = prob_list[-int(len(prob_list)/2):] for prob, prob_groups in student_groups.groupby("Problem Name"): if not prob in prob_list: continue progress = list(pd.unique(prob_groups["CF (Workspace Progress Status)"]))[0] if progress != "GRADUATED": continue actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) if unique_steps_len < 4: continue class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) step_names_token = [] original_steps_actions_attempts_help_levels_outcomes = [] original_steps = [] means_and_extremes = False for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): step = row["Step Name"] action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] attempt = row["Attempt At Step"] # number outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] help_level = row["Help Level"] # number # progress = row["CF (Workspace Progress Status)"] scenario = row['CF (Problem Scenario Tags)'] if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: etalon = row["CF (Etalon)"] if not pd.isna(etalon): etalon = etalon.strip('{}') key, value = etalon.split('=') etalon = value try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True except Exception as e: pass if row['CF (Is Autofilled)'] == True: continue prev = step_names_token[-1] if step_names_token else "" prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" if not step_names_token or step != prev_step: if step in options.opt_step1 or step in options.opt_step2: new_step = step else: if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" step_names_token.append(new_step) else: if not (step in 
options.opt_step1 or step in options.opt_step2): if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if prev < new_step: step_names_token[-1] = new_step original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") original_steps.append(step) unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) if step_names_token and unique_steps_len > 4: proba = random.random() # if prob in first_prob_list: if proba <= 0.8: train_file.write("\t".join(step_names_token)) train_file.write("\n") # school, class, student id, progress, problem name, scenario, # prefered ER or ME, total steps length, # original seq-action-attempt-help_level-outcome train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) train_info.write("\n") elif proba > 0.9: # elif prob in last_prob_list: test_file.write("\t".join(step_names_token)) test_file.write("\n") # school, class, student id, progress, problem name, scenario, # prefered ER or ME, total steps length, # original seq-action-attempt-help_level-outcome test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) test_info.write("\n") else: val_file.write("\t".join(step_names_token)) val_file.write("\n") # school, class, student id, progress, problem name, scenario, # prefered ER or ME, total steps length, # original seq-action-attempt-help_level-outcome val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) val_info.write("\n") # break # break # break # break # break train_file.close() train_info.close() val_file.close() val_info.close() test_file.close() test_info.close() def prepare_school_attention_files(data_processor, options): # steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator(sep=",") train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") val_file = open(options.val_file_path, "w") val_info = open(options.val_info_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") for chunk_data in chunk_iterator: for school, school_group in chunk_data.groupby('CF (Anon School Id)'): if not options.school or school in options.school: for class_id, class_group in school_group.groupby('CF (Anon Class Id)'): for student, student_groups in class_group.groupby("Anon Student Id"): student_groups.sort_values(by="Time") # prob_list = list(pd.unique(student_groups["Problem Name"])) # if len(prob_list) > 0 : # first_fews = int(len(prob_list)/2) # last_fews = len(prob_list) - first_fews # first_prob_list = prob_list[:first_fews] # last_prob_list = prob_list[-last_fews:] # final_prob_list = first_prob_list + last_prob_list for prob, prob_groups in student_groups.groupby("Problem Name"): step_names_token = [] means_and_extremes = False for 
index, row in prob_groups[['Time', 'Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'CF (Workspace Progress Status)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): progress = row["CF (Workspace Progress Status)"] action = row["Action"] attempt = row["Attempt At Step"] autofilled = row["CF (Is Autofilled)"] step = row["Step Name"] scenario = row['CF (Problem Scenario Tags)'] if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: etalon = row["CF (Etalon)"] if not pd.isna(etalon): etalon = etalon.strip('{}') key, value = etalon.split('=') etalon = value try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True except Exception as e: pass if not autofilled: new_step = f"{step}:{action}:{attempt}" step_names_token.append(new_step) if step_names_token: where_opt = [] step1 = False step2 = False strategy_data = False for step_oh in step_names_token: step = step_oh.split(":") if len(step) == 3: step = step[0] else: step = ":".join(step[:2]) # print(f"changed {step_oh} = ? {step}") if step == options.opt_step1[0]: where_opt.append("_1") step1 = True elif step == options.opt_step2[0]: where_opt.append("_2") step2 = True elif step in options.opt_step1[1:]: where_opt.append("1") if step1: strategy_data = True elif step in options.opt_step2[1:]: where_opt.append("2") if step2: strategy_data = True else: where_opt.append("0") if strategy_data and step_names_token[-1].split(":")[-2] != "Done": strategy_data = False if strategy_data: # proba = random.random() step_names_tokens = [":".join(s.split(":")[:-2]) for s in step_names_token] step_names_token = [] for s in step_names_tokens: if s != "nan": if not step_names_token or s != step_names_token[-1]: step_names_token.append(s) # if prob in first_prob_list: if progress == "GRADUATED":# and means_and_extremes:# and prob in first_prob_list: train_file.write("\t".join(step_names_token)) train_file.write("\n") # school, class, student id, progress, problem name, scenario, prefered ER or ME, total steps length, encoding in steps length train_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))])) train_info.write("\n") elif progress == "PROMOTED":# and means_and_extremes:# and prob in last_prob_list: # elif prob in last_prob_list: test_file.write("\t".join(step_names_token)) test_file.write("\n") # school, class, student id, progress, problem name, scenario, prefered ER or ME, total steps length, encoding in steps length test_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))])) test_info.write("\n") # else: # val_file.write("\t".join(step_names_token)) # val_file.write("\n") # # school, class, student id, progress, problem name, scenario, prefered ER or ME, total steps length, encoding in steps length # val_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))])) # val_info.write("\n") # break # break # break # break # break train_file.close() train_info.close() val_file.close() val_info.close() test_file.close() test_info.close() def prepare_finetuning_10per_files(data_processor, options): ''' Used for L@S paper. 
Only two strategies were defined as: 0: non-opt strategy 1: opt used strategy ''' chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") # val_file = open(options.val_file_path, "w") # val_info = open(options.val_info_path, "w") # val_label = open(options.val_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") overall_data = [] overall_labels = [] for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if "ratio_proportion_change3" == section: for student, student_groups in section_groups.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) for prob, prob_groups in student_groups.groupby("Problem Name"): actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups["Step Name"])) unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)])) if unique_steps_len < 4: continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 1800: time_stamps_list.add(time_stamps[i+1]) outcome = [] help_level = [] auto_complete = False means_and_extremes = False for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: auto_complete = True continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)])) # 4 and more in sequence if step_names_token and unique_steps_len > 4: where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if options.opt_step1: any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) if any_opt1: label_opt = "1" if options.opt_step2: any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) if any_opt2: label_opt = "1" # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, 
where_opt)), f"{1 if means_and_extremes else 0}"]) overall_data.append(["\t".join(step_names_token), info]) overall_labels.append(label_opt) # overall_data.append('') # overall_labels.append('') overall_labels = np.array(overall_labels) indices_of_zeros = list(np.where(overall_labels == '0')[0]) indices_of_ones = list(np.where(overall_labels == '1')[0]) train_len = int(len(overall_labels) * 0.10) sample_size = int(train_len/2) print(f"sample_size: {sample_size}") sampled_instances = random.sample(indices_of_zeros, sample_size) sampled_instances.extend(random.sample(indices_of_ones, sample_size)) balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) test_sampled_instances = random.sample(indices_of_zeros, balanced_test) test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) # writtenTrain = False # writtenTest = False for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): steps_seq = all_data[0] info = all_data[1] if index in sampled_instances: train_file.write(steps_seq) train_file.write("\n") train_info.write(info) train_info.write("\n") train_label.write(label) train_label.write("\n") elif index in test_sampled_instances: # proba = random.random() # if proba <0.5: test_file.write(steps_seq) test_file.write("\n") test_info.write(info) test_info.write("\n") test_label.write(label) test_label.write("\n") # else: # val_file.write(steps_seq) # val_file.write("\n") # val_info.write(info) # val_info.write("\n") # val_label.write(label) # val_label.write("\n") train_file.close() train_info.close() train_label.close() # val_file.close() # val_info.close() # val_label.close() test_file.close() test_info.close() test_label.close() def prepare_finetuning_IS_FS_files(data_processor, options): ''' Used for L@S paper. This function gathers first three problems of each student. Only two strategies were defined as: 0: non-opt strategy 1: opt used strategy train: IS test: FS ''' chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") # val_file = open(options.val_file_path, "w") # val_info = open(options.val_info_path, "w") # val_label = open(options.val_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if "ratio_proportion_change3" == section: for student, student_groups in section_groups.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) prob_list = list(pd.unique(student_groups["Problem Name"])) if len(prob_list) < 3: continue selected = 3 #1. int(len(prob_list)/2) #2. 3 & <6 #3. 3 & <3 first_prob_list = prob_list[:selected] last_prob_list = prob_list[-selected:] for prob, prob_groups in student_groups.groupby("Problem Name"): actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. 
if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups["Step Name"])) unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)])) if unique_steps_len < 4: continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 1800: time_stamps_list.add(time_stamps[i+1]) outcome = [] help_level = [] auto_complete = False means_and_extremes = False for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: auto_complete = True continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)])) # 4 and more in sequence if step_names_token and unique_steps_len > 4: where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if options.opt_step1: any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) if any_opt1: label_opt = "1" if options.opt_step2: any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) if any_opt2: label_opt = "1" # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), f"{1 if means_and_extremes else 0}"]) if prob in first_prob_list: train_file.write("\t".join(step_names_token)) train_file.write("\n") train_info.write(info) train_info.write("\n") train_label.write(label_opt) train_label.write("\n") elif prob in last_prob_list: test_file.write("\t".join(step_names_token)) test_file.write("\n") test_info.write(info) test_info.write("\n") test_label.write(label_opt) test_label.write("\n") train_file.close() train_info.close() train_label.close() # val_file.close() # val_info.close() # val_label.close() test_file.close() test_info.close() test_label.close() def prepare_finetuning_IS_files_old(data_processor, opts): ''' Used for L@S paper. This function gathers first three problems of each student. 
Only two strategies were defined as: 0: non-opt strategy 1: opt used strategy ''' options = copy.deepcopy(opts) for k,v in vars(opts).items(): if k.startswith("train") or k.startswith("test"): if v: f_path = v.split("/") f_path = f_path[0]+"/"+f_path[1]+"/IS/"+f_path[2] setattr(options, f"{k}", f_path) print(f"options.{k} : {getattr(options, f'{k}')}") steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") trainr_label = open(options.trainr_label_path, "w") train_gt_label = open(options.train_gt_label_path, "w") # test_file = open(options.test_file_path, "w") # test_info = open(options.test_info_path, "w") # test_label = open(options.test_label_path, "w") # testr_label = open(options.testr_label_path, "w") # test_gt_label = open(options.test_gt_label_path, "w") overall_data = [] overall_labels = [] for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if options.workspace_name == section: for student, student_groups in section_groups.groupby("Anon Student Id"): student_groups.sort_values(by="Time") prob_list = list(pd.unique(student_groups["Problem Name"])) if len(prob_list) < 3: continue first_prob_list = prob_list[:3] # last_prob_list = prob_list[-3:] # # print(len(first_prob_list), len(last_prob_list)) # final_prob_list = first_prob_list + last_prob_list # print(len(prob_list), len(final_prob_list)) #, final_prob_list) for prob, prob_groups in student_groups.groupby("Problem Name"): # For first 3 and last 3 only if not prob in first_prob_list: continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 2000: time_stamps_list.add(time_stamps[i+1]) progress = "" outcome = [] help_level = [] auto_complete = False means_and_extremes = False finals = len(options.final_step) totals = 0 for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: # if row["Time"] in time_stamps_list: auto_complete = True # print(row) continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) if finals == 0: totals += 1 else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] errors = 0 for step, out in zip(step_names_token, outcome): if (finals and step in options.final_step) or totals > 0: out = out.split(":") if any(any(ind in o for o in out) for ind in error_ind): errors +=1 if finals: totals = finals # 4 and more in sequence if step_names_token: # and len(step_names_token) > 3 where_opt = 
[] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if options.opt_step1: any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) if any_opt1: label_opt = "1" if options.opt_step2: any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) if any_opt2: label_opt = "1" correctness = 1 - errors/totals strat_correct = "0" if correctness > 0.75: strat_correct = "1" # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)]) overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"]) overall_labels.append(label_opt) overall_data.append('') overall_labels.append('') # overall_labels = np.array(overall_labels) # indices_of_zeros = list(np.where(overall_labels == '0')[0]) # indices_of_ones = list(np.where(overall_labels == '1')[0]) # zeros_instances_size = int(1 * len(indices_of_zeros)) # ones_instances_size = int(1 * len(indices_of_ones)) # sample_size = min(zeros_instances_size, ones_instances_size) # sampled_instances = random.sample(indices_of_zeros, sample_size) # sampled_instances.extend(random.sample(indices_of_ones, sample_size)) writtenTrain = False # writtenTest = False for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): if all_data: steps_seq = all_data[0] strat_correct = all_data[1] info = all_data[2] me_opt = all_data[3] # if index in sampled_instances: writtenTrain = True train_file.write(steps_seq) train_file.write("\n") train_label.write(label) train_label.write("\n") trainr_label.write(strat_correct) trainr_label.write("\n") train_info.write(info) train_info.write("\n") train_gt_label.write(me_opt) train_gt_label.write("\n") # else: # writtenTest = True # test_file.write(steps_seq) # test_file.write("\n") # # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length # test_label.write(label) # test_label.write("\n") # # testr_label.write(str(correctness)) # testr_label.write(strat_correct) # testr_label.write("\n") # test_info.write(info) # test_info.write("\n") # test_gt_label.write(me_opt) # test_gt_label.write("\n") else: # Indicates actions of next student # Indicates next problem if writtenTrain: writtenTrain = False train_file.write("\n") train_info.write("\n") train_label.write("\n") trainr_label.write("\n") train_gt_label.write("\n") # if writtenTest: # writtenTest = False # test_file.write("\n") # test_info.write("\n") # test_label.write("\n") # testr_label.write("\n") # test_gt_label.write("\n") train_file.close() train_info.close() train_label.close() trainr_label.close() train_gt_label.close() # test_file.close() # test_info.close() # test_label.close() # testr_label.close() # test_gt_label.close() def prepare_finetuning_FS_files_old(data_processor, opts): ''' Used for L@S paper. This function gathers last three problems of each student. 
Only two strategies were defined as: 0: non-opt strategy 1: opt used strategy ''' options = copy.deepcopy(opts) for k,v in vars(opts).items(): if k.startswith("train") or k.startswith("test"): if v: f_path = v.split("/") f_path = f_path[0]+"/"+f_path[1]+"/FS/"+f_path[2] setattr(options, f"{k}", f_path) print(f"options.{k} : {getattr(options, f'{k}')}") steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") trainr_label = open(options.trainr_label_path, "w") train_gt_label = open(options.train_gt_label_path, "w") # test_file = open(options.test_file_path, "w") # test_info = open(options.test_info_path, "w") # test_label = open(options.test_label_path, "w") # testr_label = open(options.testr_label_path, "w") # test_gt_label = open(options.test_gt_label_path, "w") overall_data = [] overall_labels = [] for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if options.workspace_name == section: for student, student_groups in section_groups.groupby("Anon Student Id"): student_groups.sort_values(by="Time") prob_list = list(pd.unique(student_groups["Problem Name"])) if len(prob_list) < 3: continue # first_prob_list = prob_list[:3] last_prob_list = prob_list[-3:] # # print(len(first_prob_list), len(last_prob_list)) # final_prob_list = first_prob_list + last_prob_list # print(len(prob_list), len(final_prob_list)) #, final_prob_list) for prob, prob_groups in student_groups.groupby("Problem Name"): # For first 3 and last 3 only if not prob in last_prob_list: continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 2000: time_stamps_list.add(time_stamps[i+1]) progress = "" outcome = [] help_level = [] auto_complete = False means_and_extremes = False finals = len(options.final_step) totals = 0 for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: # if row["Time"] in time_stamps_list: auto_complete = True # print(row) continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) if finals == 0: totals += 1 else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] errors = 0 for step, out in zip(step_names_token, outcome): if (finals and step in options.final_step) or totals > 0: out = out.split(":") if any(any(ind in o for o in out) for ind in error_ind): errors +=1 if finals: totals = finals # 4 and more in sequence if step_names_token: # and len(step_names_token) > 3 where_opt = 
[] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if options.opt_step1: any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) if any_opt1: label_opt = "1" if options.opt_step2: any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) if any_opt2: label_opt = "1" correctness = 1 - errors/totals strat_correct = "0" if correctness > 0.75: strat_correct = "1" # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)]) overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"]) overall_labels.append(label_opt) overall_data.append('') overall_labels.append('') # overall_labels = np.array(overall_labels) # indices_of_zeros = list(np.where(overall_labels == '0')[0]) # indices_of_ones = list(np.where(overall_labels == '1')[0]) # zeros_instances_size = int(0.10 * len(indices_of_zeros)) # ones_instances_size = int(0.10 * len(indices_of_ones)) # sample_size = min(zeros_instances_size, ones_instances_size) # sampled_instances = random.sample(indices_of_zeros, sample_size) # sampled_instances.extend(random.sample(indices_of_ones, sample_size)) writtenTrain = False # writtenTest = False for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): if all_data: steps_seq = all_data[0] strat_correct = all_data[1] info = all_data[2] me_opt = all_data[3] # if index in sampled_instances: writtenTrain = True train_file.write(steps_seq) train_file.write("\n") train_label.write(label) train_label.write("\n") trainr_label.write(strat_correct) trainr_label.write("\n") train_info.write(info) train_info.write("\n") train_gt_label.write(me_opt) train_gt_label.write("\n") # else: # writtenTest = True # test_file.write(steps_seq) # test_file.write("\n") # # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length # test_label.write(label) # test_label.write("\n") # # testr_label.write(str(correctness)) # testr_label.write(strat_correct) # testr_label.write("\n") # test_info.write(info) # test_info.write("\n") # test_gt_label.write(me_opt) # test_gt_label.write("\n") else: # Indicates actions of next student # Indicates next problem if writtenTrain: writtenTrain = False train_file.write("\n") train_info.write("\n") train_label.write("\n") trainr_label.write("\n") train_gt_label.write("\n") # if writtenTest: # writtenTest = False # test_file.write("\n") # test_info.write("\n") # test_label.write("\n") # testr_label.write("\n") # test_gt_label.write("\n") train_file.close() train_info.close() train_label.close() trainr_label.close() train_gt_label.close() # test_file.close() # test_info.close() # test_label.close() # testr_label.close() # test_gt_label.close() def prepare_finetuning_correctness_files(data_processor, options): ''' Ongoing research. Student strategy learning/predicting. 
FinalAnswer step Correct: 1 , correctness of final strategy > 0.75 Incorrect: 0 , else < 0.75 ''' chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") # val_file = open(options.val_file_path, "w") # val_info = open(options.val_info_path, "w") # val_label = open(options.val_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") overall_data = [] overall_labels = [] for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if "ratio_proportion_change3" == section: for student, student_groups in section_groups.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) for prob, prob_groups in student_groups.groupby("Problem Name"): actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups["Step Name"])) unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)])) if unique_steps_len < 4: continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 1800: time_stamps_list.add(time_stamps[i+1]) outcome = [] help_level = [] auto_complete = False means_and_extremes = False final_correct = 0 for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: auto_complete = True continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) if step == "FinalAnswer": final_correct += 1 unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)])) # 4 and more in sequence if step_names_token and unique_steps_len > 4: where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if final_correct == 1: label_opt = "1" # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), f"{1 if means_and_extremes else 0}"]) overall_data.append(["\t".join(step_names_token), info]) overall_labels.append(label_opt) # overall_data.append('') # 
overall_labels.append('') overall_labels = np.array(overall_labels) indices_of_zeros = list(np.where(overall_labels == '0')[0]) indices_of_ones = list(np.where(overall_labels == '1')[0]) train_len = int(len(overall_labels) * 0.10) sample_size = int(train_len/2) print(f"sample_size: {sample_size}") sampled_instances = random.sample(indices_of_zeros, sample_size) sampled_instances.extend(random.sample(indices_of_ones, sample_size)) # writtenTrain = False # writtenTest = False for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): steps_seq = all_data[0] info = all_data[1] if index in sampled_instances: train_file.write(steps_seq) train_file.write("\n") train_info.write(info) train_info.write("\n") train_label.write(label) train_label.write("\n") else: # proba = random.random() # if proba <0.5: test_file.write(steps_seq) test_file.write("\n") test_info.write(info) test_info.write("\n") test_label.write(label) test_label.write("\n") # else: # val_file.write(steps_seq) # val_file.write("\n") # val_info.write(info) # val_info.write("\n") # val_label.write(label) # val_label.write("\n") train_file.close() train_info.close() train_label.close() # val_file.close() # val_info.close() # val_label.close() test_file.close() test_info.close() test_label.close() def prepare_finetuning_correctness_files_old(data_processor, opts): ''' Ongoing research. Student strategy learning/predicting. Correct, 1: correctness of final strategy > 0.75 Incorrect, 0: else < 0.75 ''' options = copy.deepcopy(opts) for k,v in vars(opts).items(): if k.startswith("train") or k.startswith("test"): if v: f_path = v.split("/") f_path = f_path[0]+"/"+f_path[1]+"/fa_correctness/"+f_path[2] # f_path = f_path[0]+"/"+f_path[1]+"/check2/"+f_path[2] setattr(options, f"{k}", f_path) print(f"options.{k} : {getattr(options, f'{k}')}") steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") # trainr_label = open(options.trainr_label_path, "w") # train_gt_label = open(options.train_gt_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") # testr_label = open(options.testr_label_path, "w") # test_gt_label = open(options.test_gt_label_path, "w") ws = "_".join(options.workspace_name.split("_")[:-1]) print("Workspace: ", ws) overall_data = [] overall_labels = [] for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if ws == section: for student, student_groups in section_groups.groupby("Anon Student Id"): student_groups.sort_values(by="Time") prob_list = list(pd.unique(student_groups["Problem Name"])) # if len(prob_list) < 3: # continue # first_prob_list = prob_list[:3] # last_prob_list = prob_list[-3:] # # print(len(first_prob_list), len(last_prob_list)) # final_prob_list = first_prob_list + last_prob_list # print(len(prob_list), len(final_prob_list)) #, final_prob_list) for prob, prob_groups in student_groups.groupby("Problem Name"): # For first 3 and last 3 only # if not prob in last_prob_list: # continue # print(options.final_step in list(prob_groups["Step Name"])) # if not (options.final_step in list(prob_groups["Step Name"])): # continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in 
range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 2000: time_stamps_list.add(time_stamps[i+1]) progress = "" outcome = [] help_level = [] auto_complete = False means_and_extremes = False # finals = len(options.final_step) totals = 0 for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: # if row["Time"] in time_stamps_list: auto_complete = True # print(row) continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) # if finals == 0: # totals += 1 else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] errors = 0 for step, out in zip(step_names_token, outcome): if (step in options.final_step):# or totals > 0: out = out.split(":") totals = len(out) # print(totals) for ind in error_ind: if ind in out: errors +=1 # if finals: # totals = finals # 4 and more in sequence if step_names_token and totals>0: # and len(step_names_token) > 3 where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if options.opt_step1: all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1) any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) if any_opt1: label_opt = "2" if all_opt1: label_opt = "1" if options.opt_step2: all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2) any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) if any_opt2: label_opt = "4" if all_opt2: label_opt = "3" if any_opt1 and any_opt2: label_opt = "5" if any_opt1 and all_opt2: label_opt = "6" if all_opt1 and any_opt2: label_opt = "7" if all_opt1 and all_opt2: label_opt = "8" correctness = 1 - errors/totals strat_correct = "0" if correctness > 0.75: strat_correct = "1" # if not means_and_extremes and label_opt == "2": # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness), f"{1 if means_and_extremes else 0}"]) overall_data.append(["\t".join(step_names_token), label_opt, info]) overall_labels.append(strat_correct) overall_data.append('') overall_labels.append('') overall_labels = np.array(overall_labels, dtype=str) indices_of_zeros = list(np.where(overall_labels == '0')[0]) indices_of_ones = list(np.where(overall_labels == '1')[0]) per = 0.20 zeros_instances_size = int(per * len(indices_of_zeros)) ones_instances_size = int(per * 
len(indices_of_ones)) sample_size = min(zeros_instances_size, ones_instances_size) sampled_instances = random.sample(indices_of_zeros, sample_size) sampled_instances.extend(random.sample(indices_of_ones, sample_size)) writtenTrain = False writtenTest = False for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): if all_data: steps_seq = all_data[0] label_opt = all_data[1] info = all_data[2] # me_opt = all_data[3] if index in sampled_instances: writtenTrain = True train_file.write(steps_seq) train_file.write("\n") train_label.write(label) train_label.write("\n") # trainr_label.write(label_opt) # trainr_label.write("\n") train_info.write(info) train_info.write("\n") # train_gt_label.write(me_opt) # train_gt_label.write("\n") else: writtenTest = True test_file.write(steps_seq) test_file.write("\n") # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length test_label.write(label) test_label.write("\n") # testr_label.write(str(correctness)) # testr_label.write(label_opt) # testr_label.write("\n") test_info.write(info) test_info.write("\n") # test_gt_label.write(me_opt) # test_gt_label.write("\n") else: # Indicates actions of next student # Indicates next problem if writtenTrain: writtenTrain = False train_file.write("\n") train_info.write("\n") train_label.write("\n") # trainr_label.write("\n") # train_gt_label.write("\n") if writtenTest: writtenTest = False test_file.write("\n") test_info.write("\n") test_label.write("\n") # testr_label.write("\n") # test_gt_label.write("\n") train_file.close() train_info.close() train_label.close() # trainr_label.close() # train_gt_label.close() test_file.close() test_info.close() test_label.close() # testr_label.close() # test_gt_label.close() def prepare_finetuning_correctness_aaai_files(data_processor, opts): ''' Ongoing research. Student strategy learning/predicting. 
Correct, 1: correctness of final strategy > 0.75 Incorrect, 0: else < 0.75 ''' options = copy.deepcopy(opts) for k,v in vars(opts).items(): if k.startswith("train") or k.startswith("test") or k.startswith("val"): if v: f_path = v.split("/") # f_path = f_path[0]+"/"+f_path[1]+"/correctness/"+f_path[2] f_path = f_path[0]+"/"+f_path[1]+"/aaai/"+f_path[2] setattr(options, f"{k}", f_path) print(f"options.{k} : {getattr(options, f'{k}')}") # steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") val_file = open(options.val_file_path, "w") val_info = open(options.val_info_path, "w") val_label = open(options.val_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") high_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_high_performers.pkl", "rb")) mid_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_mid_performers.pkl", "rb")) low_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_low_performers.pkl", "rb")) prob_sel_list = pickle.load(open(f"{options.workspace_name}/aaai/change3_problem_list.pkl", "rb")) ws = "_".join(options.workspace_name.split("_")[:-1]) print(ws, len(high_performer), len(mid_performer), len(low_performer), len(prob_sel_list)) overall_data = [] overall_labels = [] for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): # if options.workspace_name == section: if ws == section: for student, student_groups in section_groups.groupby("Anon Student Id"): if student in high_performer or student in mid_performer or student in low_performer: student_groups.sort_values(by="Time") prob_list = list(pd.unique(student_groups["Problem Name"])) for prob, prob_groups in student_groups.groupby("Problem Name"): # For first 3 and last 3 only if not prob in prob_sel_list: continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 2000: time_stamps_list.add(time_stamps[i+1]) progress = "" outcome = [] help_level = [] auto_complete = False means_and_extremes = False totals = 0 for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: # if row["Time"] in time_stamps_list: auto_complete = True # print(row) continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] 
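# NOTE: the loop below scores a problem attempt from the outcomes recorded on the
# final-answer step(s). The colon-joined outcome string of any step in
# options.final_step is split; `totals` becomes the number of recorded outcomes and
# `errors` is incremented once for every indicator in `error_ind` that appears among
# them. Correctness is then 1 - errors/totals, and the sequence is labeled "1"
# (correct strategy) only when correctness > 0.75.
#
# Illustrative sketch (hypothetical helper name, mirrors the logic below; not used by the pipeline):
# def final_step_correctness(step_names_token, outcome, final_steps, error_ind):
#     errors, totals = 0, 0
#     for step, out in zip(step_names_token, outcome):
#         if step in final_steps:
#             parts = out.split(":")
#             totals = len(parts)
#             errors += sum(1 for ind in error_ind if ind in parts)
#     return 1 - errors / totals if totals else 0.0
#
# Example with hypothetical data: outcome "ERROR:INITIAL_HINT:OK" on FinalAnswer
# -> totals = 3, errors = 2, correctness = 1/3 -> strat_correct = "0".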
errors = 0 for step, out in zip(step_names_token, outcome): if (step in options.final_step): out = out.split(":") totals = len(out) # print(totals) for ind in error_ind: if ind in out: errors +=1 # 4 and more in sequence if step_names_token and totals>0: # and len(step_names_token) > 3 where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") correctness = 1 - errors/totals strat_correct = "0" if correctness > 0.75: strat_correct = "1" # if not means_and_extremes and label_opt == "2": # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length info = ",".join([str(progress), str(correctness), f"{1 if means_and_extremes else 0}",str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))]) overall_data.append(["\t".join(step_names_token), info]) overall_labels.append(strat_correct) # overall_data.append('') # overall_labels.append('') overall_labels = np.array(overall_labels) for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): if all_data: steps_seq = all_data[0] info = all_data[1] student = info.split(",")[4] if student in high_performer: train_file.write(steps_seq) train_file.write("\n") train_label.write(label) train_label.write("\n") train_info.write(info) train_info.write("\n") elif student in mid_performer: val_file.write(steps_seq) val_file.write("\n") val_label.write(label) val_label.write("\n") val_info.write(info) val_info.write("\n") elif student in low_performer: test_file.write(steps_seq) test_file.write("\n") test_label.write(label) test_label.write("\n") test_info.write(info) test_info.write("\n") train_file.close() train_info.close() train_label.close() val_file.close() val_info.close() val_label.close() test_file.close() test_info.close() test_label.close() def prepare_finetuning_SL_files(data_processor, opts): ''' Ongoing research. Student strategy learning/predicting. 
We have defined 9 strategy as: Notation; Label UU; 0 CU; 1 PU; 2 UC; 3 UP; 4 PP; 5 PC; 6 CP; 7 CC; 8 ''' options = copy.deepcopy(opts) for k,v in vars(opts).items(): if k.startswith("train") or k.startswith("test"): if v: f_path = v.split("/") f_path = f_path[0]+"/"+f_path[1]+"/SL/"+f_path[2] setattr(options, f"{k}", f_path) print(f"options.{k} : {getattr(options, f'{k}')}") steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") trainr_label = open(options.trainr_label_path, "w") train_gt_label = open(options.train_gt_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") testr_label = open(options.testr_label_path, "w") test_gt_label = open(options.test_gt_label_path, "w") overall_data = [] overall_labels = [] for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if options.workspace_name == section: for student, student_groups in section_groups.groupby("Anon Student Id"): student_groups.sort_values(by="Time") prob_list = list(pd.unique(student_groups["Problem Name"])) # if len(prob_list) < 3: # continue # first_prob_list = prob_list[:3] # last_prob_list = prob_list[-3:] # # print(len(first_prob_list), len(last_prob_list)) # final_prob_list = first_prob_list + last_prob_list # print(len(prob_list), len(final_prob_list)) #, final_prob_list) for prob, prob_groups in student_groups.groupby("Problem Name"): # For first 3 and last 3 only # if not prob in last_prob_list: # continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 2000: time_stamps_list.add(time_stamps[i+1]) progress = "" outcome = [] help_level = [] auto_complete = False means_and_extremes = False finals = len(options.final_step) totals = 0 for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: # if row["Time"] in time_stamps_list: auto_complete = True # print(row) continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) if finals == 0: totals += 1 else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] errors = 0 for step, out in zip(step_names_token, outcome): if (finals and step in options.final_step) or totals > 0: out = out.split(":") if any(any(ind in o for o in out) for ind in error_ind): errors +=1 if finals: totals = finals # 4 and more in sequence if step_names_token: # and 
len(step_names_token) > 3 where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if options.opt_step1: all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1) any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) if any_opt1: label_opt = "2" if all_opt1: label_opt = "1" if options.opt_step2: all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2) any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) if any_opt2: label_opt = "4" if all_opt2: label_opt = "3" if any_opt1 and any_opt2: label_opt = "5" if any_opt1 and all_opt2: label_opt = "6" if all_opt1 and any_opt2: label_opt = "7" if all_opt1 and all_opt2: label_opt = "8" correctness = 1 - errors/totals strat_correct = "0" if correctness > 0.75: strat_correct = "1" # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)]) overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"]) overall_labels.append(label_opt) overall_data.append('') overall_labels.append('') overall_labels = np.array(overall_labels) indices_of_zeros = list(np.where(overall_labels == '0')[0]) indices_of_ones = list(np.where(overall_labels == '1')[0]) indices_of_twos = list(np.where(overall_labels == '2')[0]) indices_of_threes = list(np.where(overall_labels == '3')[0]) indices_of_fours = list(np.where(overall_labels == '4')[0]) indices_of_fives = list(np.where(overall_labels == '5')[0]) indices_of_sixes = list(np.where(overall_labels == '6')[0]) indices_of_sevens = list(np.where(overall_labels == '7')[0]) indices_of_eights = list(np.where(overall_labels == '8')[0]) per = 0.20 zeros_instances_size = int(per * len(indices_of_zeros)) ones_instances_size = int(per * len(indices_of_ones)) twos_instances_size = int(per * len(indices_of_twos)) threes_instances_size = int(per * len(indices_of_threes)) fours_instances_size = int(per * len(indices_of_fours)) fives_instances_size = int(per * len(indices_of_fives)) sixes_instances_size = int(per * len(indices_of_sixes)) sevens_instances_size = int(per * len(indices_of_sevens)) eights_instances_size = int(per * len(indices_of_eights)) sample_size = min(zeros_instances_size, ones_instances_size, twos_instances_size, threes_instances_size, fours_instances_size, fives_instances_size, sixes_instances_size, sevens_instances_size, eights_instances_size) print(f"Sample size.... 
{sample_size}") sampled_instances = random.sample(indices_of_zeros, sample_size) sampled_instances.extend(random.sample(indices_of_ones, sample_size)) sampled_instances.extend(random.sample(indices_of_twos, sample_size)) sampled_instances.extend(random.sample(indices_of_threes, sample_size)) sampled_instances.extend(random.sample(indices_of_fours, sample_size)) sampled_instances.extend(random.sample(indices_of_fives, sample_size)) sampled_instances.extend(random.sample(indices_of_sixes, sample_size)) sampled_instances.extend(random.sample(indices_of_sevens, sample_size)) sampled_instances.extend(random.sample(indices_of_eights, sample_size)) writtenTrain = False writtenTest = False for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): if all_data: steps_seq = all_data[0] strat_correct = all_data[1] info = all_data[2] me_opt = all_data[3] if index in sampled_instances: writtenTrain = True train_file.write(steps_seq) train_file.write("\n") train_label.write(label) train_label.write("\n") trainr_label.write(strat_correct) trainr_label.write("\n") train_info.write(info) train_info.write("\n") train_gt_label.write(me_opt) train_gt_label.write("\n") else: writtenTest = True test_file.write(steps_seq) test_file.write("\n") # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length test_label.write(label) test_label.write("\n") # testr_label.write(str(correctness)) testr_label.write(strat_correct) testr_label.write("\n") test_info.write(info) test_info.write("\n") test_gt_label.write(me_opt) test_gt_label.write("\n") else: # Indicates actions of next student # Indicates next problem if writtenTrain: writtenTrain = False train_file.write("\n") train_info.write("\n") train_label.write("\n") trainr_label.write("\n") train_gt_label.write("\n") if writtenTest: writtenTest = False test_file.write("\n") test_info.write("\n") test_label.write("\n") testr_label.write("\n") test_gt_label.write("\n") train_file.close() train_info.close() train_label.close() trainr_label.close() train_gt_label.close() test_file.close() test_info.close() test_label.close() testr_label.close() test_gt_label.close() def prepare_finetuning_effectiveness_files(data_processor, opts): ''' Ongoing research. Student strategy learning/predicting. 
We have defined 9 strategy as: Notation; Label UU; 0 CU; 1 PU; 2 UC; 3 UP; 4 PP; 5 PC; 6 CP; 7 CC; 8 if UU and CU and PU and gt = ER and correct, a positive instance if UU and UC and UP and gt = ME and correct, a positive instance else a strategy PP, PC, CP, CC and gt = ER/ME or incorrect, a negative instance ''' options = copy.deepcopy(opts) for k,v in vars(opts).items(): if k.startswith("train") or k.startswith("test"): if v: f_path = v.split("/") f_path = f_path[0]+"/"+f_path[1]+"/effectiveness/"+f_path[2] setattr(options, f"{k}", f_path) print(f"options.{k} : {getattr(options, f'{k}')}") steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") trainr_label = open(options.trainr_label_path, "w") train_gt_label = open(options.train_gt_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") testr_label = open(options.testr_label_path, "w") test_gt_label = open(options.test_gt_label_path, "w") overall_data = [] overall_labels = [] for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if options.workspace_name == section: for student, student_groups in section_groups.groupby("Anon Student Id"): student_groups.sort_values(by="Time") prob_list = list(pd.unique(student_groups["Problem Name"])) # if len(prob_list) < 3: # continue # first_prob_list = prob_list[:3] # last_prob_list = prob_list[-3:] # # print(len(first_prob_list), len(last_prob_list)) # final_prob_list = first_prob_list + last_prob_list # print(len(prob_list), len(final_prob_list)) #, final_prob_list) for prob, prob_groups in student_groups.groupby("Problem Name"): # For first 3 and last 3 only # if not prob in last_prob_list: # continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 2000: time_stamps_list.add(time_stamps[i+1]) progress = "" outcome = [] help_level = [] auto_complete = False means_and_extremes = False finals = len(options.final_step) totals = 0 for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: # if row["Time"] in time_stamps_list: auto_complete = True # print(row) continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) if finals == 0: totals += 1 else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] errors = 0 for step, out in zip(step_names_token, outcome): if (finals and step 
in options.final_step) or totals > 0: out = out.split(":") if any(any(ind in o for o in out) for ind in error_ind): errors +=1 if finals: totals = finals # 4 and more in sequence if step_names_token: # and len(step_names_token) > 3 where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if options.opt_step1: all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1) any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) if any_opt1: label_opt = "2" if all_opt1: label_opt = "1" if options.opt_step2: all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2) any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) if any_opt2: label_opt = "4" if all_opt2: label_opt = "3" if any_opt1 and any_opt2: label_opt = "5" if any_opt1 and all_opt2: label_opt = "6" if all_opt1 and any_opt2: label_opt = "7" if all_opt1 and all_opt2: label_opt = "8" correctness = 1 - errors/totals strat_correct = "0" if correctness > 0.75: strat_correct = "1" label_effectiveness = "0" if label_opt in ["0", "1", "2"] and not means_and_extremes and strat_correct == "1": label_effectiveness = "1" elif label_opt in ["0", "3", "4"] and means_and_extremes and strat_correct == "1": label_effectiveness = "1" # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness), label_opt, f"{1 if means_and_extremes else 0}"]) overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"]) overall_labels.append(label_effectiveness) overall_data.append('') overall_labels.append('') overall_labels = np.array(overall_labels) indices_of_zeros = list(np.where(overall_labels == '0')[0]) indices_of_ones = list(np.where(overall_labels == '1')[0]) per = 0.20 zeros_instances_size = int(per * len(indices_of_zeros)) ones_instances_size = int(per * len(indices_of_ones)) sample_size = min(zeros_instances_size, ones_instances_size) sampled_instances = random.sample(indices_of_zeros, sample_size) sampled_instances.extend(random.sample(indices_of_ones, sample_size)) writtenTrain = False writtenTest = False for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): if all_data: steps_seq = all_data[0] strat_correct = all_data[1] info = all_data[2] me_opt = all_data[3] if index in sampled_instances: writtenTrain = True train_file.write(steps_seq) train_file.write("\n") train_label.write(label) train_label.write("\n") trainr_label.write(strat_correct) trainr_label.write("\n") train_info.write(info) train_info.write("\n") train_gt_label.write(me_opt) train_gt_label.write("\n") else: writtenTest = True test_file.write(steps_seq) test_file.write("\n") # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length test_label.write(label) test_label.write("\n") # testr_label.write(str(correctness)) testr_label.write(strat_correct) testr_label.write("\n") test_info.write(info) test_info.write("\n") test_gt_label.write(me_opt) test_gt_label.write("\n") else: # Indicates actions of next student # Indicates next problem if 
writtenTrain: writtenTrain = False train_file.write("\n") train_info.write("\n") train_label.write("\n") trainr_label.write("\n") train_gt_label.write("\n") if writtenTest: writtenTest = False test_file.write("\n") test_info.write("\n") test_label.write("\n") testr_label.write("\n") test_gt_label.write("\n") train_file.close() train_info.close() train_label.close() trainr_label.close() train_gt_label.close() test_file.close() test_info.close() test_label.close() testr_label.close() test_gt_label.close() def prepare_attn_test_files(data_processor, opts): options = copy.deepcopy(opts) if options.code: new_folder = f"{options.workspace_name}/{options.code}" if not os.path.exists(new_folder): os.makedirs(new_folder) for k,v in vars(opts).items(): if k.startswith("train") or k.startswith("test"): if v: f_path = (f"/{options.code}/").join(v.split("/")) setattr(options, f"{k}", f_path) print(f"options.{k} : {getattr(options, f'{k}')}") steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") if options.code != "full": test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if options.workspace_name == section: for student, student_groups in section_groups.groupby("Anon Student Id"): student_groups.sort_values(by="Time") prob_list = list(pd.unique(student_groups["Problem Name"])) for prob, prob_groups in student_groups.groupby("Problem Name"): step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 2000: time_stamps_list.add(time_stamps[i+1]) progress = "" outcome = [] help_level = [] auto_complete = False means_and_extremes = False finals = len(options.final_step) totals = 0 for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: # if row["Time"] in time_stamps_list: auto_complete = True # print(row) continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) if finals == 0: totals += 1 else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] errors = 0 for step, out in zip(step_names_token, outcome): if (finals and step in options.final_step) or totals > 0: out = out.split(":") if any(any(ind in o for o in out) for ind in error_ind): errors +=1 if finals: totals = finals # 4 and more in sequence if step_names_token: # and len(step_names_token) > 3 where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") 
elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if options.opt_step1: all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1) any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) if any_opt1: label_opt = "2" if all_opt1: label_opt = "1" if options.opt_step2: all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2) any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) if any_opt2: label_opt = "4" if all_opt2: label_opt = "3" if any_opt1 and any_opt2: label_opt = "5" if any_opt1 and all_opt2: label_opt = "6" if all_opt1 and any_opt2: label_opt = "7" if all_opt1 and all_opt2: label_opt = "8" correctness = 1 - errors/totals opt_correct = "0" if correctness > 0.75: opt_correct = "1" proba = random.random() # if proba <= 0.1: # if not means_and_extremes: # if prob in first_prob_list: if options.code == "full" or (options.code == "gt" and not means_and_extremes) or (options.code == "correct" and opt_correct == "1") or (options.code == "progress" and progress == "GRADUATED"): if label_opt == "0": continue train_file.write("\t".join(step_names_token)) train_file.write("\n") # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length train_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness), f"{1 if means_and_extremes else 0}", label_opt])) train_info.write("\n") # if means_and_extremes: # if prob in last_prob_list: else: test_file.write("\t".join(step_names_token)) test_file.write("\n") # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length test_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness), f"{1 if means_and_extremes else 0}", label_opt])) test_info.write("\n") train_file.close() train_info.close() if options.code != "full": test_file.close() test_info.close() def prepare_finetuning_future_files(data_processor, opts): options = copy.deepcopy(opts) for k,v in vars(opts).items(): if k.startswith("train") or k.startswith("test"): if v: f_path = ("/effectiveness/").join(v.split("/")) setattr(options, f"{k}", f_path) print(f"options.{k} : {getattr(options, f'{k}')}") steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) chunk_iterator = data_processor.load_file_iterator() train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") trainr_label = open(options.trainr_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") testr_label = open(options.testr_label_path, "w") for chunk_data in chunk_iterator: for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): if options.workspace_name == section: for student, student_groups in section_groups.groupby("Anon Student Id"): writtenTrain = False writtenTest = False student_groups.sort_values(by="Time") prob_list = list(pd.unique(student_groups["Problem Name"])) # if len(prob_list) < 6: # 
continue # first_prob_list = prob_list[:3] # last_prob_list = prob_list[-3:] # # print(len(first_prob_list), len(last_prob_list)) # final_prob_list = first_prob_list + last_prob_list # print(len(prob_list), len(final_prob_list)) #, final_prob_list) for prob, prob_groups in student_groups.groupby("Problem Name"): # For first 3 and last 3 only # if not prob in final_prob_list: # continue step_names_token = [] time_stamps = list(prob_groups["Time"]) time_stamps_list = set() for i in range(len(time_stamps)-1): if (time_stamps[i+1] - time_stamps[i]) < 2000: time_stamps_list.add(time_stamps[i+1]) progress = "" outcome = [] help_level = [] auto_complete = False errors = 0 totals = 0 means_and_extremes = False for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): step = row["Step Name"] etalon = row["CF (Etalon)"] progress = row["CF (Workspace Progress Status)"] if not pd.isna(step): if step in options.opt_step1: try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True # break except Exception as e: pass if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: # if row["Time"] in time_stamps_list: auto_complete = True # print(row) continue # if not step_names_token or step != step_names_token[-1]: # step_names_token.append(step) if not step_names_token or step != step_names_token[-1]: step_names_token.append(step) # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] outcome.append(row['Outcome']) help_level.append(str(row["Help Level"])) totals += 1 else: outcome[-1] = outcome[-1]+":"+row['Outcome'] help_level[-1] = help_level[-1]+":"+str(row['Help Level']) error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] for out in outcome: out = out.split(":") if any(any(ind in o for o in out) for ind in error_ind): errors +=1 # 4 and more in sequence if step_names_token: # and len(step_names_token) > 3 where_opt = [] for stp in step_names_token: if stp in options.opt_step1: where_opt.append("1") elif stp in options.opt_step2: where_opt.append("2") else: where_opt.append("0") label_opt = "0" if options.opt_step1: all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1) any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) if any_opt1: label_opt = "2" if all_opt1: label_opt = "1" if options.opt_step2: all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2) any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) if any_opt2: label_opt = "4" if all_opt2: label_opt = "3" if any_opt1 and any_opt2: label_opt = "5" if any_opt1 and all_opt2: label_opt = "6" if all_opt1 and any_opt2: label_opt = "7" if all_opt1 and all_opt2: label_opt = "8" correctness = 1 - errors/totals opt_correct = "0" if correctness < 0.25: opt_correct = "0" elif correctness < 0.5: opt_correct = "1" elif correctness < 0.75: opt_correct = "2" else: opt_correct = "3" proba = random.random() # if proba <= 0.1: if not means_and_extremes: # if prob in first_prob_list: writtenTrain = True train_file.write("\t".join(step_names_token)) train_file.write("\n") # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length train_label.write(label_opt) train_label.write("\n") # trainr_label.write(str(correctness)) trainr_label.write(opt_correct) trainr_label.write("\n") 
train_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))])) train_info.write("\n") if means_and_extremes: # if prob in last_prob_list: # else: writtenTest = True test_file.write("\t".join(step_names_token)) test_file.write("\n") # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length test_label.write(label_opt) test_label.write("\n") # testr_label.write(str(correctness)) testr_label.write(opt_correct) testr_label.write("\n") test_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))])) test_info.write("\n") # Indicates actions of next student # Indicates next problem if writtenTrain: train_file.write("\n") train_info.write("\n") train_label.write("\n") trainr_label.write("\n") if writtenTest: test_file.write("\n") test_info.write("\n") test_label.write("\n") testr_label.write("\n") train_file.close() train_info.close() train_label.close() trainr_label.close() test_file.close() test_info.close() test_label.close() testr_label.close()
def prepare_school_coded_finetuning_partial_seq_files(data_processor, options): ''' Ongoing research. FinalAnswer step correctness (evaluated once an optional step has been used): label 1 if the first attempt at FinalAnswer is correct (Attempt At Step == 1 and Outcome OK); label 0 otherwise (attempt at step > 1 or an incorrect first attempt). ''' chunk_iterator = data_processor.load_file_iterator(sep=",") train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") # val_file = open(options.val_file_path, "w") # val_info = open(options.val_info_path, "w") # val_label = open(options.val_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") overall_data = [] overall_labels = [] # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) # kcs = [kc if not pd.isna(kc) for kc in kcs] for chunk_data in chunk_iterator: for school, school_group in chunk_data.groupby('CF (Anon School Id)'): if not options.school or school in options.school: print(f"{school} : {school_group.shape}") school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & (school_group['CF (Encounter)'] == 0) & (school_group['CF (Is Review Mode)'] == -1) ] print(f"{school} : {school_group.shape}") # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): for student, student_groups in school_group.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) # prob_list = list(pd.unique(student_groups["Problem Name"])) for prob, prob_groups in student_groups.groupby("Problem Name"): actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button.
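# NOTE on the per-problem filters and tokenization below: a problem attempt is kept
# only if the student clicked Done, it contains at least 4 distinct non-autofilled
# steps outside options.opt_step1/opt_step2, and at least 2 of the optional
# sub-steps (opt_step1[1:]/opt_step2[1:]) appear; autofilled rows are skipped inside
# the row loop. Step tokens encode how a step was handled: a wrong attempt becomes
# "<step>-2", a hint request "<step>-1", anything else (e.g. a correct attempt)
# "<step>-0", while optional steps and FinalAnswer keep their bare names
# ("<step>" is a placeholder here, not a real step name).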
if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) if unique_steps_len < 4: continue unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) if unique_opt_steps_len < 2: continue class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) step_names_token = [] original_steps_actions_attempts_help_levels_outcomes = [] original_steps = [] means_and_extremes = False correctness = "0" opt_used = False for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): step = row["Step Name"] action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] attempt = row["Attempt At Step"] # number outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] help_level = row["Help Level"] # number progress = row["CF (Workspace Progress Status)"] scenario = row['CF (Problem Scenario Tags)'] if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: etalon = row["CF (Etalon)"] if not pd.isna(etalon): etalon = etalon.strip('{}') key, value = etalon.split('=') etalon = value try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True except Exception as e: pass if row['CF (Is Autofilled)'] == True: continue prev = step_names_token[-1] if step_names_token else "" prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" if not step_names_token or step != prev_step: if step in options.opt_step1 or step in options.opt_step2: new_step = step opt_used = True else: if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if step != "FinalAnswer": step_names_token.append(new_step) else: step_names_token.append("FinalAnswer") else: if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if prev < new_step: step_names_token[-1] = new_step if step == "FinalAnswer" and opt_used: if attempt == 1 and outcome == "OK": correctness = "1" else: correctness = "0" original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") original_steps.append(step) unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) if step_names_token and unique_steps_len > 4: info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) overall_data.append(["\t".join(step_names_token), info]) overall_labels.append(correctness) # proba = random.random() # # if prob in first_prob_list: # if proba <= 0.8: # train_file.write("\t".join(step_names_token)) # train_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # 
train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # train_info.write("\n") # elif proba > 0.9: # # elif prob in last_prob_list: # test_file.write("\t".join(step_names_token)) # test_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # test_info.write("\n") # else: # val_file.write("\t".join(step_names_token)) # val_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # val_info.write("\n") # break # break # break # break # break overall_labels = np.array(overall_labels) indices_of_zeros = list(np.where(overall_labels == '0')[0]) indices_of_ones = list(np.where(overall_labels == '1')[0]) train_len = int(len(overall_labels) * 0.10) sample_size = int(train_len/2) print(f"sample_size: {sample_size}") sampled_instances = random.sample(indices_of_zeros, sample_size) sampled_instances.extend(random.sample(indices_of_ones, sample_size)) indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ] indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) print(f"balanced_test: {balanced_test}") test_sampled_instances = random.sample(indices_of_zeros, balanced_test) test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): steps_seq = all_data[0] info = all_data[1] if index in sampled_instances: train_file.write(steps_seq) train_file.write("\n") train_info.write(info) train_info.write("\n") train_label.write(label) train_label.write("\n") elif index in test_sampled_instances: # proba = random.random() # if proba <0.5: test_file.write(steps_seq) test_file.write("\n") test_info.write(info) test_info.write("\n") test_label.write(label) test_label.write("\n") # else: # val_file.write(steps_seq) # val_file.write("\n") # val_info.write(info) # val_info.write("\n") # val_label.write(label) # val_label.write("\n") train_file.close() train_info.close() train_label.close() # val_file.close() # val_info.close() # val_label.close() test_file.close() test_info.close() test_label.close() def prepare_school_coded_finetuning_opts_files(data_processor, options): ''' Ongoing research. 
Labels: 0 - Opt 1 1 - Opt 2 2 - Both Opt ''' chunk_iterator = data_processor.load_file_iterator(sep=",") train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") # val_file = open(options.val_file_path, "w") # val_info = open(options.val_info_path, "w") # val_label = open(options.val_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") overall_data = [] overall_labels = [] # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) # kcs = [kc if not pd.isna(kc) for kc in kcs] for chunk_data in chunk_iterator: for school, school_group in chunk_data.groupby('CF (Anon School Id)'): if not options.school or school in options.school: print(f"{school} : {school_group.shape}") school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & (school_group['CF (Encounter)'] == 0) & (school_group['CF (Is Review Mode)'] == -1) ] print(f"{school} : {school_group.shape}") # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): for student, student_groups in school_group.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) # prob_list = list(pd.unique(student_groups["Problem Name"])) # prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"]) # prob_list = prob_list[-int(len(prob_list)/2):] for prob, prob_groups in student_groups.groupby("Problem Name"): # if not prob in prob_list: # continue actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) if unique_steps_len < 4: continue unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) if unique_opt_steps_len < 2: continue print(unique_steps, unique_opt_steps_len) class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) step_names_token = [] original_steps_actions_attempts_help_levels_outcomes = [] original_steps = [] means_and_extremes = False opt1_used = False opt2_used = False for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): step = row["Step Name"] action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] attempt = row["Attempt At Step"] # number outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] help_level = row["Help Level"] # number progress = row["CF (Workspace Progress Status)"] scenario = row['CF (Problem Scenario Tags)'] if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: etalon = row["CF (Etalon)"] if not pd.isna(etalon): etalon = etalon.strip('{}') key, value = etalon.split('=') etalon = value try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True except Exception as e: pass if row['CF (Is Autofilled)'] == True: continue prev = step_names_token[-1] if step_names_token else "" prev_step = 
step_names_token[-1].split("-")[0] if step_names_token else "" if not step_names_token or step != prev_step: if step in options.opt_step1 or step in options.opt_step2: new_step = step if step in options.opt_step1[1:]: opt1_used = True elif step in options.opt_step2[2:]: opt2_used = True else: if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" step_names_token.append(new_step) else: if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if prev < new_step: step_names_token[-1] = new_step original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") original_steps.append(step) if (not opt1_used) and (not opt2_used): continue unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) if step_names_token and unique_steps_len > 4: info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) overall_data.append(["\t".join(step_names_token), info]) label = None if opt1_used and opt2_used: label = "2" if (not opt1_used) and opt2_used: label = "1" if opt1_used and (not opt2_used): label = "0" print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}") overall_labels.append(label) # proba = random.random() # # if prob in first_prob_list: # if proba <= 0.8: # train_file.write("\t".join(step_names_token)) # train_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # train_info.write("\n") # elif proba > 0.9: # # elif prob in last_prob_list: # test_file.write("\t".join(step_names_token)) # test_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # test_info.write("\n") # else: # val_file.write("\t".join(step_names_token)) # val_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # val_info.write("\n") # break # break # break # break # break overall_labels = np.array(overall_labels) indices_of_zeros = list(np.where(overall_labels == '0')[0]) indices_of_ones = list(np.where(overall_labels == '1')[0]) 
indices_of_twos = list(np.where(overall_labels == '2')[0]) train_len = int(len(overall_labels) * 0.10) sample_size = int(train_len/3) print(f"sample_size: {sample_size}") sampled_instances = random.sample(indices_of_zeros, sample_size) sampled_instances.extend(random.sample(indices_of_ones, sample_size)) sampled_instances.extend(random.sample(indices_of_twos, sample_size)) indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ] indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] balanced_test = min(len(indices_of_zeros), len(indices_of_ones), len(indices_of_twos)) print(f"balanced_test: {balanced_test}") test_sampled_instances = random.sample(indices_of_zeros, balanced_test) test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): steps_seq = all_data[0] info = all_data[1] if index in sampled_instances: train_file.write(steps_seq) train_file.write("\n") train_info.write(info) train_info.write("\n") train_label.write(label) train_label.write("\n") elif index in test_sampled_instances: # proba = random.random() # if proba <0.5: test_file.write(steps_seq) test_file.write("\n") test_info.write(info) test_info.write("\n") test_label.write(label) test_label.write("\n") # else: # val_file.write(steps_seq) # val_file.write("\n") # val_info.write(info) # val_info.write("\n") # val_label.write(label) # val_label.write("\n") train_file.close() train_info.close() train_label.close() # val_file.close() # val_info.close() # val_label.close() test_file.close() test_info.close() test_label.close() def prepare_school_coded_finetuning_opts_intentional_files(data_processor, options): ''' Ongoing research. 
Labels: 0 - Opt 1 1 - Opt 2 2 - Both Opt ''' chunk_iterator = data_processor.load_file_iterator(sep=",") train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") val_file = open(options.val_file_path, "w") val_info = open(options.val_info_path, "w") val_label = open(options.val_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") # overall_data = [] # overall_labels = [] # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) # kcs = [kc if not pd.isna(kc) for kc in kcs] for chunk_data in chunk_iterator: for school, school_group in chunk_data.groupby('CF (Anon School Id)'): if not options.school or school in options.school: print(f"{school} : {school_group.shape}") school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & (school_group['CF (Encounter)'] == 0) & (school_group['CF (Is Review Mode)'] == -1) ] print(f"{school} : {school_group.shape}") # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): for student, student_groups in school_group.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) # prob_list = list(pd.unique(student_groups["Problem Name"])) prob_list= list(pd.unique(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"])) # prob_list = prob_list[-int(len(prob_list)/2):] if len(prob_list) == 0: continue for prob, prob_groups in student_groups.groupby("Problem Name"): # if not prob in prob_list: # continue actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. 
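# NOTE: unlike prepare_school_coded_finetuning_opts_files above, this "intentional"
# variant does not build balanced label files: the opt1/opt2 label block is left
# commented out, students are only considered if they have at least one problem with
# CF (Workspace Progress Status) == "GRADUATED", and every kept sequence is written
# directly with a random split (proba <= 0.8 -> train, proba > 0.9 -> test,
# otherwise val, i.e. roughly 80/10/10).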
if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) if unique_steps_len < 4: continue unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) if unique_opt_steps_len < 2: continue # print(unique_steps, unique_opt_steps_len) class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) step_names_token = [] original_steps_actions_attempts_help_levels_outcomes = [] original_steps = [] means_and_extremes = False opt1_used = False opt2_used = False for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): step = row["Step Name"] action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] attempt = row["Attempt At Step"] # number outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] help_level = row["Help Level"] # number progress = row["CF (Workspace Progress Status)"] scenario = row['CF (Problem Scenario Tags)'] if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: etalon = row["CF (Etalon)"] if not pd.isna(etalon): etalon = etalon.strip('{}') key, value = etalon.split('=') etalon = value try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True except Exception as e: pass if row['CF (Is Autofilled)'] == True: continue prev = step_names_token[-1] if step_names_token else "" prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" if not step_names_token or step != prev_step: if step in options.opt_step1 or step in options.opt_step2: new_step = step if step in options.opt_step1[1:]: opt1_used = True elif step in options.opt_step2[2:]: opt2_used = True else: if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" step_names_token.append(new_step) else: if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if prev < new_step: step_names_token[-1] = new_step original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") original_steps.append(step) # if (not opt1_used) and (not opt2_used): # continue unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) if step_names_token and unique_steps_len > 4: info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) # overall_data.append(["\t".join(step_names_token), info]) # label = None # if opt1_used and opt2_used: # label = "2" # if (not opt1_used) and opt2_used: # label = "1" # if opt1_used and (not opt2_used): # label = "0" # print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}") # overall_labels.append(label) proba = random.random() # if prob in first_prob_list: if proba <= 0.8: 
train_file.write("\t".join(step_names_token)) train_file.write("\n") # school, class, student id, progress, problem name, scenario, # prefered ER or ME, total steps length, # original seq-action-attempt-help_level-outcome train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) train_info.write("\n") elif proba > 0.9: # elif prob in last_prob_list: test_file.write("\t".join(step_names_token)) test_file.write("\n") # school, class, student id, progress, problem name, scenario, # prefered ER or ME, total steps length, # original seq-action-attempt-help_level-outcome test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) test_info.write("\n") else: val_file.write("\t".join(step_names_token)) val_file.write("\n") # school, class, student id, progress, problem name, scenario, # prefered ER or ME, total steps length, # original seq-action-attempt-help_level-outcome val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) val_info.write("\n") # break # break # break # break # break # overall_labels = np.array(overall_labels) # indices_of_zeros = list(np.where(overall_labels == '0')[0]) # indices_of_ones = list(np.where(overall_labels == '1')[0]) # indices_of_twos = list(np.where(overall_labels == '2')[0]) # train_len = int(len(overall_labels) * 0.10) # sample_size = int(train_len/3) # print(f"sample_size: {sample_size}") # sampled_instances = random.sample(indices_of_zeros, sample_size) # sampled_instances.extend(random.sample(indices_of_ones, sample_size)) # sampled_instances.extend(random.sample(indices_of_twos, sample_size)) # indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ] # indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] # balanced_test = min(len(indices_of_zeros), len(indices_of_ones), len(indices_of_twos)) # print(f"balanced_test: {balanced_test}") # test_sampled_instances = random.sample(indices_of_zeros, balanced_test) # test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) # test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) # for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): # steps_seq = all_data[0] # info = all_data[1] # if index in sampled_instances: # train_file.write(steps_seq) # train_file.write("\n") # train_info.write(info) # train_info.write("\n") # train_label.write(label) # train_label.write("\n") # elif index in test_sampled_instances: # # proba = random.random() # # if proba <0.5: # test_file.write(steps_seq) # test_file.write("\n") # test_info.write(info) # test_info.write("\n") # test_label.write(label) # test_label.write("\n") # # else: # # val_file.write(steps_seq) # # val_file.write("\n") # # val_info.write(info) # # val_info.write("\n") # # val_label.write(label) # # val_label.write("\n") train_file.close() train_info.close() train_label.close() val_file.close() 
val_info.close() val_label.close() test_file.close() test_info.close() test_label.close() def prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options): ''' Ongoing research. FinalAnswer step correctness Correctness after opts: 0 if attempt at step>1 1 if attempt at step==1 ''' kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) kcs = [kc for kc in kcs if not pd.isna(kc)] kcs = np.array(sorted(list(kcs))) print(kcs, type(kcs)) print(f"KCs: {kcs}") chunk_iterator = data_processor.load_file_iterator(sep=",") train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") # val_file = open(options.val_file_path, "w") # val_info = open(options.val_info_path, "w") # val_label = open(options.val_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") overall_data = [] overall_labels = [] # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) # kcs = [kc if not pd.isna(kc) for kc in kcs] for chunk_data in chunk_iterator: for school, school_group in chunk_data.groupby('CF (Anon School Id)'): if not options.school or school in options.school: print(f"{school} : {school_group.shape}") school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & (school_group['CF (Encounter)'] == 0) & (school_group['CF (Is Review Mode)'] == -1) ] print(f"{school} : {school_group.shape}") # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): for student, student_groups in school_group.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) # prob_list = list(pd.unique(student_groups["Problem Name"])) # prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"]) # prob_list = prob_list[-int(len(prob_list)/2):] for prob, prob_groups in student_groups.groupby("Problem Name"): # if not prob in prob_list: # continue actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. 
if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) if unique_steps_len < 4: continue unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) if unique_opt_steps_len < 2: continue # print(unique_steps, unique_opt_steps_len) class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) step_names_token = [] original_steps_actions_attempts_help_levels_outcomes = [] original_steps = [] means_and_extremes = False opt1_used = False opt2_used = False final_after_opts = False correctness = "0" kcs_skills = [0 for i in kcs] diff_skills = [0 for i in kcs] finalanswer_skill = [0 for i in kcs] for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): step = row["Step Name"] action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] attempt = row["Attempt At Step"] # number outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] help_level = row["Help Level"] # number progress = row["CF (Workspace Progress Status)"] scenario = row['CF (Problem Scenario Tags)'] kc = row['KC Model(MATHia)'] prev_skill = row['CF (Skill Previous p-Known)'] curr_skill = row['CF (Skill New p-Known)'] # print(kc, prev_skill) if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: etalon = row["CF (Etalon)"] if not pd.isna(etalon): etalon = etalon.strip('{}') key, value = etalon.split('=') etalon = value try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True except Exception as e: pass if row['CF (Is Autofilled)'] == True: continue prev = step_names_token[-1] if step_names_token else "" prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" if not step_names_token or step != prev_step: if step in options.opt_step1 or step in options.opt_step2: new_step = step if step in options.opt_step1[1:]: opt1_used = True elif step in options.opt_step2[2:]: opt2_used = True else: if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts: final_after_opts = True if outcome == "OK": correctness = "1" step_names_token.append(new_step) else: if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if prev < new_step: step_names_token[-1] = new_step if not pd.isna(kc): index = np.argwhere(kcs==kc).flatten()[0] # print(index, type(index)) kcs_skills[index] = prev_skill diff_skills[index] = prev_skill - curr_skill if step == "FinalAnswer": finalanswer_skill[index] = prev_skill original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") original_steps.append(step) if (not opt1_used) and (not opt2_used): continue unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in 
options.opt_step2)]) if step_names_token and unique_steps_len > 4: label = None if opt1_used and opt2_used: label = "2" if (not opt1_used) and opt2_used: label = "1" if opt1_used and (not opt2_used): label = "0" # print(f"opt1_used: {opt1_ßused}, opt2_used: {opt2_used} label : {label}") info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label, "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)), "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)]) overall_data.append(["\t".join(step_names_token), info]) overall_labels.append(correctness) # proba = random.random() # # if prob in first_prob_list: # if proba <= 0.8: # train_file.write("\t".join(step_names_token)) # train_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # train_info.write("\n") # elif proba > 0.9: # # elif prob in last_prob_list: # test_file.write("\t".join(step_names_token)) # test_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # test_info.write("\n") # else: # val_file.write("\t".join(step_names_token)) # val_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # val_info.write("\n") # break # break # break # break # break overall_labels = np.array(overall_labels) indices_of_zeros = list(np.where(overall_labels == '0')[0]) indices_of_ones = list(np.where(overall_labels == '1')[0]) # indices_of_twos = list(np.where(overall_labels == '2')[0]) train_len = int(len(overall_labels) * 0.10) sample_size = int(train_len/2) print(f"sample_size: {sample_size}") sampled_instances = random.sample(indices_of_zeros, sample_size) sampled_instances.extend(random.sample(indices_of_ones, sample_size)) # sampled_instances.extend(random.sample(indices_of_twos, sample_size)) indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ] indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, len(indices_of_twos)) print(f"balanced_test: {balanced_test}") test_sampled_instances = random.sample(indices_of_zeros, balanced_test) test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) 
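# At this point sampled_instances holds a class-balanced training sample of roughly 10% of the
# labeled sequences (equal counts of correctness labels "0" and "1"), and test_sampled_instances
# holds a balanced test pool drawn from the remaining indices; the loop below writes each selected
# step sequence, its info line, and its label to the corresponding train/test files.
# A minimal sanity check of the balance (illustrative only, not part of the pipeline):
#   from collections import Counter
#   print(Counter(overall_labels[i] for i in sampled_instances))       # labels should be evenly split
#   print(Counter(overall_labels[i] for i in test_sampled_instances))  # labels should be evenly split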
# test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): steps_seq = all_data[0] info = all_data[1] if index in sampled_instances: train_file.write(steps_seq) train_file.write("\n") train_info.write(info) train_info.write("\n") train_label.write(label) train_label.write("\n") elif index in test_sampled_instances: # proba = random.random() # if proba <0.5: test_file.write(steps_seq) test_file.write("\n") test_info.write(info) test_info.write("\n") test_label.write(label) test_label.write("\n") # else: # val_file.write(steps_seq) # val_file.write("\n") # val_info.write(info) # val_info.write("\n") # val_label.write(label) # val_label.write("\n") train_file.close() train_info.close() train_label.close() # val_file.close() # val_info.close() # val_label.close() test_file.close() test_info.close() test_label.close() def prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options): ''' Ongoing research. FinalAnswer step correctness Correctness after opts: 0 if attempt at step>1 1 if attempt at step==1 ''' kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) kcs = [kc for kc in kcs if not pd.isna(kc)] kcs = np.array(sorted(list(kcs))) print(kcs, type(kcs)) print(f"KCs: {kcs}") chunk_iterator = data_processor.load_file_iterator(sep=",") train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") # val_file = open(options.val_file_path, "w") # val_info = open(options.val_info_path, "w") # val_label = open(options.val_label_path, "w") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") train_data = [] train_labels = [] test_data = [] test_labels = [] # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) # kcs = [kc if not pd.isna(kc) for kc in kcs] for chunk_data in chunk_iterator: for school, school_group in chunk_data.groupby('CF (Anon School Id)'): if not options.school or school in options.school: print(f"{school} : {school_group.shape}") school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & (school_group['CF (Encounter)'] == 0) & (school_group['CF (Is Review Mode)'] == -1) ] print(f"{school} : {school_group.shape}") # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): for student, student_groups in school_group.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) train = True proba = random.random() if proba < 0.5: train = False # prob_list = list(pd.unique(student_groups["Problem Name"])) # prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"]) # prob_list = prob_list[-int(len(prob_list)/2):] prev_kcs_skills = [0 for i in kcs] for pi, (prob, prob_groups) in enumerate(student_groups.groupby("Problem Name")): # if not prob in prob_list: # continue actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. 
if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) if unique_steps_len < 4: continue unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) if unique_opt_steps_len < 2: continue # print(unique_steps, unique_opt_steps_len) class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) step_names_token = [] original_steps_actions_attempts_help_levels_outcomes = [] original_steps = [] means_and_extremes = False opt1_used = False opt2_used = False final_after_opts = False correctness = "0" kcs_skills = [0 for i in kcs] diff_skills = [0 for i in kcs] finalanswer_skill = [0 for i in kcs] for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): step = row["Step Name"] action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] attempt = row["Attempt At Step"] # number outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] help_level = row["Help Level"] # number progress = row["CF (Workspace Progress Status)"] scenario = row['CF (Problem Scenario Tags)'] kc = row['KC Model(MATHia)'] prev_skill = row['CF (Skill Previous p-Known)'] curr_skill = row['CF (Skill New p-Known)'] # print(kc, prev_skill) if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: etalon = row["CF (Etalon)"] if not pd.isna(etalon): etalon = etalon.strip('{}') key, value = etalon.split('=') etalon = value try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True except Exception as e: pass if row['CF (Is Autofilled)'] == True: continue prev = step_names_token[-1] if step_names_token else "" prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" if not step_names_token or step != prev_step: if step in options.opt_step1 or step in options.opt_step2: new_step = step if step in options.opt_step1[1:]: opt1_used = True elif step in options.opt_step2[2:]: opt2_used = True else: if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts: final_after_opts = True if outcome == "OK": correctness = "1" step_names_token.append(new_step) else: if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if prev < new_step: step_names_token[-1] = new_step if not pd.isna(kc): index = np.argwhere(kcs==kc).flatten()[0] # print(index, type(index)) kcs_skills[index] = prev_skill if pi != 0: diff_skills[index] = prev_skill - prev_kcs_skills[index] prev_kcs_skills[index] = prev_skill if step == "FinalAnswer": finalanswer_skill[index] = prev_skill original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") original_steps.append(step) if (not opt1_used) and (not opt2_used): continue unique_steps_len = len([s for s in 
original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) if step_names_token and unique_steps_len > 4: label = None if opt1_used and opt2_used: label = "2" if (not opt1_used) and opt2_used: label = "1" if opt1_used and (not opt2_used): label = "0" # print(f"opt1_used: {opt1_ßused}, opt2_used: {opt2_used} label : {label}") info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label, "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)), "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)]) if train: train_data.append(["\t".join(step_names_token), info]) train_labels.append(correctness) else: test_data.append(["\t".join(step_names_token), info]) test_labels.append(correctness) # proba = random.random() # # if prob in first_prob_list: # if proba <= 0.8: # train_file.write("\t".join(step_names_token)) # train_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # train_info.write("\n") # elif proba > 0.9: # # elif prob in last_prob_list: # test_file.write("\t".join(step_names_token)) # test_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # test_info.write("\n") # else: # val_file.write("\t".join(step_names_token)) # val_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # val_info.write("\n") # break # break # break # break # break # overall_labels = np.array(overall_labels) # indices_of_zeros = list(np.where(overall_labels == '0')[0]) # indices_of_ones = list(np.where(overall_labels == '1')[0]) # # indices_of_twos = list(np.where(overall_labels == '2')[0]) # train_len = int(len(overall_labels) * 0.10) # sample_size = int(train_len/2) # print(f"sample_size: {sample_size}") # sampled_instances = random.sample(indices_of_zeros, sample_size) # sampled_instances.extend(random.sample(indices_of_ones, sample_size)) # # sampled_instances.extend(random.sample(indices_of_twos, sample_size)) # indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ] # indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] # # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] # balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, len(indices_of_twos)) # 
print(f"balanced_test: {balanced_test}") # test_sampled_instances = random.sample(indices_of_zeros, balanced_test) # test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) # # test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) for index, (all_data, label) in enumerate(zip(train_data, train_labels)): steps_seq = all_data[0] info = all_data[1] train_file.write(steps_seq) train_file.write("\n") train_info.write(info) train_info.write("\n") train_label.write(label) train_label.write("\n") train_file.close() train_info.close() train_label.close() for index, (all_data, label) in enumerate(zip(test_data, test_labels)): steps_seq = all_data[0] info = all_data[1] test_file.write(steps_seq) test_file.write("\n") test_info.write(info) test_info.write("\n") test_label.write(label) test_label.write("\n") test_file.close() test_info.close() test_label.close() def prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options): ''' Ongoing research. FinalAnswer step correctness Correctness after opts: 0 if attempt at step>1 1 if attempt at step==1 ''' kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) kcs = [kc for kc in kcs if not pd.isna(kc)] kcs = np.array(sorted(list(kcs))) print(kcs, type(kcs)) print(f"KCs: {kcs}") chunk_iterator = data_processor.load_file_iterator(sep=",") train_file = open(options.train_file_path, "w") train_info = open(options.train_info_path, "w") train_label = open(options.train_label_path, "w") val_file = open(options.val_file_path, "a") val_info = open(options.val_info_path, "a") val_label = open(options.val_label_path, "a") test_file = open(options.test_file_path, "w") test_info = open(options.test_info_path, "w") test_label = open(options.test_label_path, "w") overall_data = [] overall_labels = [] # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) # kcs = [kc if not pd.isna(kc) for kc in kcs] for chunk_data in chunk_iterator: for school, school_group in chunk_data.groupby('CF (Anon School Id)'): if not options.school or school in options.school: print(f"{school} : {school_group.shape}") school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & (school_group['CF (Encounter)'] == 0) & (school_group['CF (Is Review Mode)'] == -1) ] print(f"{school} : {school_group.shape}") # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): for student, student_groups in school_group.groupby("Anon Student Id"): student_groups.sort_values(by="Time", inplace=True) # prob_list = list(pd.unique(student_groups["Problem Name"])) # prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"]) # prob_list = prob_list[-int(len(prob_list)/2):] for prob, prob_groups in student_groups.groupby("Problem Name"): # if not prob in prob_list: # continue actions = list(prob_groups["Action"]) # A problem should be completed by a student clicking Done button. 
if not "Done" in actions: continue unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) if unique_steps_len < 4: continue unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) if unique_opt_steps_len < 2: continue # print(unique_steps, unique_opt_steps_len) class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) step_names_token = [] original_steps_actions_attempts_help_levels_outcomes = [] original_steps = [] means_and_extremes = False opt1_used = False opt2_used = False final_after_opts = False correctness = "0" kcs_skills = [0 for i in kcs] diff_skills = [0 for i in kcs] finalanswer_skill = [0 for i in kcs] for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): step = row["Step Name"] action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] attempt = row["Attempt At Step"] # number outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] help_level = row["Help Level"] # number progress = row["CF (Workspace Progress Status)"] scenario = row['CF (Problem Scenario Tags)'] kc = row['KC Model(MATHia)'] prev_skill = row['CF (Skill Previous p-Known)'] curr_skill = row['CF (Skill New p-Known)'] # print(kc, prev_skill) if not pd.isna(step): if step in options.opt_step1 and not means_and_extremes: etalon = row["CF (Etalon)"] if not pd.isna(etalon): etalon = etalon.strip('{}') key, value = etalon.split('=') etalon = value try: etalon = int(etalon) except Exception as e: try: etalon = float(etalon) means_and_extremes = True except Exception as e: pass if row['CF (Is Autofilled)'] == True: continue prev = step_names_token[-1] if step_names_token else "" prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" if not step_names_token or step != prev_step: if step in options.opt_step1 or step in options.opt_step2: new_step = step if step in options.opt_step1[1:]: opt1_used = True elif step in options.opt_step2[2:]: opt2_used = True else: if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts: final_after_opts = True if outcome == "OK": correctness = "1" step_names_token.append(new_step) else: if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): if action == "Attempt" and outcome != "OK": new_step = step+"-2" elif "Hint" in action: new_step = step+"-1" else: new_step = step+"-0" if prev < new_step: step_names_token[-1] = new_step if not pd.isna(kc): index = np.argwhere(kcs==kc).flatten()[0] # print(index, type(index)) kcs_skills[index] = prev_skill diff_skills[index] = prev_skill - curr_skill if step == "FinalAnswer": finalanswer_skill[index] = prev_skill original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") original_steps.append(step) if (not opt1_used) and (not opt2_used): continue unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in 
options.opt_step2)]) if step_names_token and unique_steps_len > 4: label = None if opt1_used and opt2_used: label = "2" if (not opt1_used) and opt2_used: label = "1" if opt1_used and (not opt2_used): label = "0" # print(f"opt1_used: {opt1_ßused}, opt2_used: {opt2_used} label : {label}") info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label, "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)), "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)]) overall_data.append(["\t".join(step_names_token), info]) overall_labels.append(correctness) # proba = random.random() # # if prob in first_prob_list: # if proba <= 0.8: # train_file.write("\t".join(step_names_token)) # train_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # train_info.write("\n") # elif proba > 0.9: # # elif prob in last_prob_list: # test_file.write("\t".join(step_names_token)) # test_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # test_info.write("\n") # else: # val_file.write("\t".join(step_names_token)) # val_file.write("\n") # # school, class, student id, progress, problem name, scenario, # # prefered ER or ME, total steps length, # # original seq-action-attempt-help_level-outcome # val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), # f"{1 if means_and_extremes else 0}", str(len(step_names_token)), # "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) # val_info.write("\n") # break # break # break # break # break overall_labels = np.array(overall_labels) indices_of_zeros = list(np.where(overall_labels == '0')[0]) indices_of_ones = list(np.where(overall_labels == '1')[0]) # indices_of_twos = list(np.where(overall_labels == '2')[0]) # train_len = int(len(overall_labels) * 0.10) train_len = int(len(overall_labels) * float(options.per)) sample_size = int(train_len/2) if float(options.per) == 1: sample_size = min(len(indices_of_zeros), len(indices_of_ones)) elif float(options.per) > 1: sample_size = int(options.per) print(f"sample_size: {sample_size}") sampled_instances = random.sample(indices_of_zeros, sample_size) sampled_instances.extend(random.sample(indices_of_ones, sample_size)) # sampled_instances.extend(random.sample(indices_of_twos, sample_size)) indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ] indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, 
len(indices_of_twos)) print(f"balanced_test: {balanced_test}") test_sampled_instances = random.sample(indices_of_zeros, balanced_test) test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) # test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): steps_seq = all_data[0] info = all_data[1] if index in sampled_instances: train_file.write(steps_seq) train_file.write("\n") train_info.write(info) train_info.write("\n") train_label.write(label) train_label.write("\n") if float(options.per) == 1.0: val_file.write(steps_seq) val_file.write("\n") val_info.write(info) val_info.write("\n") val_label.write(label) val_label.write("\n") elif index in test_sampled_instances: # proba = random.random() # if proba <0.5: test_file.write(steps_seq) test_file.write("\n") test_info.write(info) test_info.write("\n") test_label.write(label) test_label.write("\n") if float(options.per) != 1.0: val_file.write(steps_seq) val_file.write("\n") val_info.write(info) val_info.write("\n") val_label.write(label) val_label.write("\n") train_file.close() train_info.close() train_label.close() val_file.close() val_info.close() val_label.close() test_file.close() test_info.close() test_label.close() def prepare_pretraining_vocab_file(options): # kc = pickle.load(open("dataset/unique/unique_kcs_list.pkl","rb")) # kc_token = {"KC"+str(i):k for i, k in enumerate(kc)} # pickle.dump(kc_token, open("pretraining/unique_dict_kc_token.pkl", "wb")) # steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) # step_token = {"step"+str(i):k for i, k in enumerate(steps)} # folder_name = options.workspace_name+"/" if options.workspace_name else "" # pickle.dump(step_token, open(f"{folder_name}pretraining/unique_dict_step_token.pkl", "wb")) # steps = pickle.load(open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_list.pkl","rb")) steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) # print("No of unique kc", len(kc)) print("No of unique steps ", len(steps)) # print("No of unique problem", len(prob)) # print("Size of vocab ", len(steps)) ordered_steps = sorted(list(steps)) with (open(options.vocab_file_path,"w")) as vb_file: vb_file.write("[PAD]\n") vb_file.write("[UNK]\n") vb_file.write("[MASK]\n") vb_file.write("[CLS]\n") vb_file.write("[SEP]\n") # vb_file.write("\n".join(kc_token.keys())) # vb_file.write("\n") # vb_file.write("\n".join(step_token.keys())) # vb_file.write("\n".join(ordered_steps)) for step in ordered_steps: if step in options.opt_step1 or step in options.opt_step2: vb_file.write(f"{step}\n") else: for i in range(3): vb_file.write(f"{step}-{i}\n") vb_file.close() with open(options.vocab_file_path,"r") as f: l = f.readlines() print(l, len(l)) f.close() def main(opt): options = copy.deepcopy(opt) if opt.workspace_name: options.dataset_folder = opt.dataset_folder+opt.workspace_name+"/" data_processor = DataPreprocessor(input_file_path=opt.dataset) if opt.analyze_dataset_by_section: print(f"Analyzing dataset by section for workspace: {opt.workspace_name}") data_processor.analyze_dataset_by_section(opt.workspace_name) pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb")) pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb")) pickle.dump(data_processor.unique_prob_hierarchy, open(f"{options.dataset_folder}unique_hierarchy_list.pkl", "wb")) 
pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb")) pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb")) if opt.analyze_dataset_by_school: print(f"Analyzing dataset of all school for workspace: {opt.workspace_name}") data_processor.analyze_dataset_by_school(opt.workspace_name) if not os.path.exists(options.dataset_folder): os.makedirs(options.dataset_folder) pickle.dump(data_processor.unique_schools, open(f"{options.dataset_folder}unique_schools_list.pkl", "wb")) pickle.dump(data_processor.unique_class, open(f"{options.dataset_folder}unique_class_list.pkl", "wb")) pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb")) pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb")) pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb")) pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb")) pickle.dump(data_processor.unique_new_steps_w_action_attempt, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_list.pkl", "wb")) pickle.dump(data_processor.unique_new_steps_w_action_attempt_kcs, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_kcs.pkl", "wb")) pickle.dump(data_processor.unique_new_steps_w_kcs, open(f"{options.dataset_folder}unique_new_steps_w_kcs_list.pkl", "wb")) if opt.workspace_name: for k,v in vars(opt).items(): if 'path' in k: if v: redirect_path = opt.workspace_name+"/" if opt.school and opt.pretrain: sch = f"sch_largest_{len(opt.school)}-coded" #f"sch_largest_655" redirect_path = redirect_path + sch+"/" if opt.school_folder: redirect_path = redirect_path + opt.school_folder+"/" # else: # sch = "sch_largest_655" if k != "vocab_file_path": if opt.pretrain: redirect_path = redirect_path + "pretraining/" else: if opt.code: redirect_path = redirect_path + f"{opt.code}/" elif opt.finetune_task: if opt.diff_val_folder and "val" in v: redirect_path = redirect_path + f"finetuning/" else: redirect_path = redirect_path + f"finetuning/{opt.finetune_task}/" if not os.path.exists(redirect_path): os.makedirs(redirect_path) else: if not os.path.exists(redirect_path+"/pretraining/"): os.makedirs(redirect_path+"/pretraining/") setattr(options, f"{k}", redirect_path+v) # setattr(options, f"{k}", opt.workspace_name+"/check/"+v) print(f"options.{k} : {getattr(options, f'{k}')}") if options.pretrain: print("Preparing vocab...") prepare_pretraining_vocab_file(options) print("Preparing pre-training dataset...") # old non-repeated steps # prepare_pretraining_files(data_processor, options) # coded # prepare_school_coded_pretraining_files(data_processor, options) prepare_school_coded_finetuning_opts_intentional_files(data_processor, options) # prepare_pretraining_files(data_processor, options) # prepare_school_pretraining_files(data_processor, options) # else: # print("Preparing attention dataset...") # prepare_school_attention_files(data_processor, options) else: print("Preparing fine-tuning dataset...") # _1920 # prepare_finetuning_10per_files(data_processor, options) # prepare_finetuning_IS_FS_files(data_processor, options) # prepare_finetuning_correctness_files(data_processor, options) # _2223 # prepare_school_coded_finetuning_partial_seq_files(data_processor, options) # prepare_school_coded_finetuning_opts_files(data_processor, options) 
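# Active fine-tuning preparation: the balanced correctness-after-opts split whose training size is
# controlled by -per (prepare_school_coded_finetuning_correctness_after_opts_per_files, defined above).
# The calls kept commented around it are alternative preparations retained for reference.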
prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options) # prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options) # prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options) # prepare_finetuning_IS_files(data_processor, options) # # prepare_finetuning_FS_files(data_processor, options) # prepare_finetuning_correctness_aaai_files(data_processor, options) # # prepare_finetuning_SL_files(data_processor, options) # # prepare_finetuning_effectiveness_files(data_processor, options) # prepare_attn_test_files(data_processor, options) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-dataset_folder', type=str, default="dataset/CL4999_1920/") parser.add_argument('-analyze_dataset_by_section', type=bool, default=False) parser.add_argument('-analyze_dataset_by_school', type=bool, default=False) parser.add_argument('-workspace_name', type=str, default=None) parser.add_argument('-school', nargs='+', type=str, default=None) parser.add_argument('-school_folder', type=str, default=None) # parser.add_argument('-highGRschool', nargs='+', type=str, default=None) # parser.add_argument('-lowGRschool', nargs='+', type=str, default=None) parser.add_argument('-code', type=str, default=None) parser.add_argument('-finetune_task', type=str, default=None) parser.add_argument('-per', type=float, default=None) parser.add_argument("-diff_val_folder", type=bool, default=False, help="use for different val folder") parser.add_argument('-opt_step1', nargs='+', type=str, help='List of optional steps 1') parser.add_argument('-opt_step2', nargs='+', type=str, help='List of optional steps 2') parser.add_argument('-final_step', nargs='+', type=str, help='List of final step') parser.add_argument('-dataset', type=str, default="dataset/CL4999_1920/course2_1920_4999_students_datashop.txt") parser.add_argument('-pretrain', type=bool, default=False) parser.add_argument('-vocab_file_path', type=str, default="pretraining/vocab.txt") #pretraining/vocab.txt # Prepare for pretraining parser.add_argument('-train_file_path', type=str, default="train.txt") #pretraining/pretrain.txt parser.add_argument('-train_info_path', type=str, default="train_info.txt") #pretraining/pretrain_info.txt parser.add_argument('-train_label_path', type=str, default="train_label.txt") #finetuning/train_label.txt parser.add_argument('-val_file_path', type=str, default="val.txt") #pretraining/val.txt parser.add_argument('-val_info_path', type=str, default="val_info.txt") #pretraining/val_info.txt parser.add_argument('-val_label_path', type=str, default="val_label.txt") #finetuning/val_label.txt parser.add_argument('-test_file_path', type=str, default="test.txt") #pretraining/test.txt parser.add_argument('-test_info_path', type=str, default="test_info.txt") #pretraining/test_info.txt parser.add_argument('-test_label_path', type=str, default="test_label.txt") #finetuning/test_label.txt # parser.add_argument('-train_gt_label_path', type=str, default="finetuning/train_gt_label.txt") # parser.add_argument('-test_gt_label_path', type=str, default="finetuning/test_gt_label.txt") options = parser.parse_args() if not options.opt_step1: setattr(options, "opt_step1", []) print("Optional steps 1: ", options.opt_step1) if not options.opt_step2: setattr(options, "opt_step2", []) print("Optional steps 2: ", options.opt_step2) if not options.final_step: setattr(options, "final_step", []) print("Final steps: ", options.final_step) main(options)
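# Example invocation for preparing the balanced correctness-after-opts fine-tuning split
# (a sketch only: the script filename, dataset paths, workspace name, school ids, finetune_task
# value, and optional-step names below are placeholders to adapt to the actual data layout):
#
#   python prepare_data.py \
#       -dataset dataset/CL_2223/course_2223_datashop.txt \
#       -dataset_folder dataset/CL_2223/ \
#       -workspace_name ratio_proportion_change3_2223 \
#       -school SchoolA SchoolB \
#       -finetune_task correctness \
#       -per 0.10 \
#       -opt_step1 OptionalTask_1 \
#       -opt_step2 OptionalTask_2
#
# Omitting -pretrain keeps its default (False), so main() takes the fine-tuning branch. Note that
# because these flags use argparse's type=bool, any non-empty value (including "False") is parsed
# as True; pass the flag only when the pre-training or analysis branch is actually wanted.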