import time
import pandas as pd
import sys


class DataPreprocessor:
    """Streams a tab-separated MATHia/DataShop-style interaction log in chunks and
    collects the unique students, problem hierarchies, problems, steps, and
    knowledge components (KCs) found in it."""

    def __init__(self, input_file_path):
        self.input_file_path = input_file_path
        self.unique_students = None
        self.unique_problems = None
        self.unique_prob_hierarchy = None
        self.unique_steps = None
        self.unique_kcs = None

    def analyze_dataset(self):
        """Scan the whole dataset and report counts of unique students,
        problem hierarchies, problems, and knowledge components."""
        file_iterator = self.load_file_iterator()
        start_time = time.time()
        self.unique_students = set()
        self.unique_problems = set()
        self.unique_prob_hierarchy = set()
        self.unique_kcs = set()
        for chunk_data in file_iterator:
            for student_id, std_groups in chunk_data.groupby('Anon Student Id'):
                self.unique_students.add(student_id)
                prob_hierarchy = std_groups.groupby('Level (Workspace Id)')
                for hierarchy, hierarchy_groups in prob_hierarchy:
                    self.unique_prob_hierarchy.add(hierarchy)
                    prob_name = hierarchy_groups.groupby('Problem Name')
                    for problem_name, prob_name_groups in prob_name:
                        self.unique_problems.add(problem_name)
                        sub_skills = prob_name_groups['KC Model(MATHia)']
                        for a in sub_skills:
                            # A cell may list several KCs joined by "~~"; skip NaN cells.
                            if str(a) != "nan":
                                for kc in a.split("~~"):
                                    self.unique_kcs.add(kc)
        end_time = time.time()
        print("Time taken to analyze dataset = ", end_time - start_time)
        print("Number of unique students ->", len(self.unique_students))
        print("Number of unique problems ->", len(self.unique_problems))
        print("Number of unique problem hierarchies ->", len(self.unique_prob_hierarchy))
        print("Number of unique knowledge components ->", len(self.unique_kcs))

    def analyze_dataset_by_section(self, workspace_name):
        """Scan the dataset and report counts of unique students, problems,
        steps, and knowledge components for a single workspace."""
        file_iterator = self.load_file_iterator()
        start_time = time.time()
        self.unique_students = set()
        self.unique_problems = set()
        self.unique_prob_hierarchy = set()
        self.unique_steps = set()
        self.unique_kcs = set()
        # with open("workspace_info.txt", 'a') as f:
        #     sys.stdout = f
        for chunk_data in file_iterator:
            for student_id, std_groups in chunk_data.groupby('Anon Student Id'):
                prob_hierarchy = std_groups.groupby('Level (Workspace Id)')
                for hierarchy, hierarchy_groups in prob_hierarchy:
                    if workspace_name == hierarchy:
                        # print("Workspace : ", hierarchy)
                        self.unique_students.add(student_id)
                        self.unique_prob_hierarchy.add(hierarchy)
                        prob_name = hierarchy_groups.groupby('Problem Name')
                        for problem_name, prob_name_groups in prob_name:
                            self.unique_problems.add(problem_name)
                            step_names = prob_name_groups['Step Name']
                            sub_skills = prob_name_groups['KC Model(MATHia)']
                            for step in step_names:
                                if str(step) != "nan":
                                    self.unique_steps.add(step)
                            for a in sub_skills:
                                if str(a) != "nan":
                                    for kc in a.split("~~"):
                                        self.unique_kcs.add(kc)
        end_time = time.time()
        print("Time taken to analyze dataset = ", end_time - start_time)
        print("Workspace ->", workspace_name)
        print("Number of unique students ->", len(self.unique_students))
        print("Number of unique problems ->", len(self.unique_problems))
        print("Number of unique problem hierarchies ->", len(self.unique_prob_hierarchy))
        print("Number of unique step names ->", len(self.unique_steps))
        print("Number of unique knowledge components ->", len(self.unique_kcs))
        # f.close()
        # sys.stdout = sys.__stdout__

    def load_file_iterator(self):
        # Read the tab-separated log lazily in 1,000,000-row chunks so large
        # datasets do not have to fit in memory at once.
        chunk_iterator = pd.read_csv(self.input_file_path, sep="\t", header=0, iterator=True, chunksize=1000000)
        return chunk_iterator
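

# Example usage (a minimal sketch, not part of the original module): the file
# path and workspace id below are placeholders; substitute the actual
# tab-separated export path and a workspace value present in the
# 'Level (Workspace Id)' column of your data.
if __name__ == "__main__":
    preprocessor = DataPreprocessor("path/to/student_interaction_log.txt")  # placeholder path
    preprocessor.analyze_dataset()
    preprocessor.analyze_dataset_by_section("example_workspace_id")  # placeholder workspace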