File size: 9,047 Bytes
5c72fe4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import time
import pandas as pd

import sys

class DataPreprocessor:
    """Collect the distinct entities found in a delimited interaction log.

    The input file is read in chunks through :meth:`load_file_iterator`.
    Each ``analyze_*`` method scans every chunk, stores the distinct values
    it found as sets on the ``unique_*`` attributes and prints their sizes.
    """

    # Column headers the analyses read from the input file.
    _COL_SCHOOL = 'CF (Anon School Id)'
    _COL_CLASS = 'CF (Anon Class Id)'
    _COL_STUDENT = 'Anon Student Id'
    _COL_WORKSPACE = 'Level (Workspace Id)'
    _COL_PROBLEM = 'Problem Name'
    _COL_STEP = 'Step Name'
    _COL_KC = 'KC Model(MATHia)'
    _COL_ACTION = 'Action'
    _COL_OUTCOME = 'Outcome'
    _COL_ATTEMPT = 'Attempt At Step'
    # A single KC cell may list several knowledge components joined by this.
    _KC_SEPARATOR = "~~"

    def __init__(self, input_file_path):
        """Remember the input path; result attributes stay ``None`` until an
        ``analyze_*`` method fills them.

        Args:
            input_file_path: Path (or anything ``pandas.read_csv`` accepts)
                of the delimited file to analyse.
        """
        self.input_file_path = input_file_path
        self.unique_students = None
        self.unique_problems = None
        self.unique_prob_hierarchy = None
        self.unique_steps = None
        self.unique_kcs = None
        # Only populated by analyze_dataset_by_school().
        self.unique_schools = None
        self.unique_class = None
        self.unique_actions = None
        self.unique_outcomes = None
        self.unique_new_steps_w_action_attempt = None
        self.unique_new_steps_w_kcs = None
        self.unique_new_steps_w_action_attempt_kcs = None

    @staticmethod
    def _distinct(frame, column):
        """Return the distinct non-missing values of *column* as a list."""
        return frame[column].dropna().unique().tolist()

    @classmethod
    def _split_kcs(cls, raw_values):
        """Yield each knowledge-component name contained in raw KC cells.

        Missing cells are skipped; every other cell is split on
        ``_KC_SEPARATOR``.
        """
        for raw in raw_values:
            if pd.isna(raw):
                continue
            yield from str(raw).split(cls._KC_SEPARATOR)

    def analyze_dataset(self):
        """Collect distinct students, workspaces, problems and KCs of the
        whole tab-separated file and print how many of each were found.

        Results are stored on ``unique_students``, ``unique_prob_hierarchy``,
        ``unique_problems`` and ``unique_kcs``.
        """
        file_iterator = self.load_file_iterator()

        start_time = time.time()
        # Plain empty sets (no placeholder seed element): a real id that
        # happens to equal a placeholder can therefore never be dropped.
        students, problems, workspaces, kcs = set(), set(), set(), set()
        for chunk in file_iterator:
            # A row only contributes to a level when every key above that
            # level is present, so rows are narrowed step by step.
            rows = chunk.dropna(subset=[self._COL_STUDENT])
            students.update(self._distinct(rows, self._COL_STUDENT))
            rows = rows.dropna(subset=[self._COL_WORKSPACE])
            workspaces.update(self._distinct(rows, self._COL_WORKSPACE))
            rows = rows.dropna(subset=[self._COL_PROBLEM])
            problems.update(self._distinct(rows, self._COL_PROBLEM))
            kcs.update(self._split_kcs(self._distinct(rows, self._COL_KC)))
        self.unique_students = students
        self.unique_problems = problems
        self.unique_prob_hierarchy = workspaces
        self.unique_kcs = kcs
        end_time = time.time()
        print("Time Taken to analyze dataset = ", end_time - start_time)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique problem hierarchy->", len(self.unique_prob_hierarchy))
        print("Length of Unique Knowledge components ->", len(self.unique_kcs))

    def analyze_dataset_by_section(self, workspace_name):
        """Collect distinct students, problems, steps and KCs of one
        workspace in the tab-separated file and print their counts.

        Args:
            workspace_name: Value of the workspace column to restrict the
                analysis to; rows of other workspaces are ignored.

        Results are stored on ``unique_students``, ``unique_prob_hierarchy``,
        ``unique_problems``, ``unique_steps`` and ``unique_kcs``.
        """
        file_iterator = self.load_file_iterator()

        start_time = time.time()
        # Empty sets keep every count exact; nothing has to be removed
        # again after the scan.
        students, problems, workspaces = set(), set(), set()
        steps, kcs = set(), set()
        for chunk in file_iterator:
            rows = chunk.dropna(subset=[self._COL_STUDENT])
            # Missing workspace cells never compare equal, so they drop out.
            rows = rows[rows[self._COL_WORKSPACE] == workspace_name]
            if rows.empty:
                continue
            students.update(self._distinct(rows, self._COL_STUDENT))
            workspaces.update(self._distinct(rows, self._COL_WORKSPACE))
            rows = rows.dropna(subset=[self._COL_PROBLEM])
            problems.update(self._distinct(rows, self._COL_PROBLEM))
            steps.update(self._distinct(rows, self._COL_STEP))
            kcs.update(self._split_kcs(self._distinct(rows, self._COL_KC)))
        self.unique_students = students
        self.unique_problems = problems
        self.unique_prob_hierarchy = workspaces
        self.unique_steps = steps
        self.unique_kcs = kcs
        end_time = time.time()
        print("Time Taken to analyze dataset = ", end_time - start_time)
        print("Workspace-> ",workspace_name)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique problem hierarchy->", len(self.unique_prob_hierarchy))
        print("Length of unique step names ->", len(self.unique_steps))
        print("Length of unique knowledge components ->", len(self.unique_kcs))

    def analyze_dataset_by_school(self, workspace_name, school_id=None):
        """Collect school/class/student/problem level statistics from a
        comma-separated file and print their counts.

        Args:
            workspace_name: Only used as a label in the printed report; the
                rows are not filtered by it.
            school_id: Currently unused; accepted so existing callers keep
                working.

        NOTE: unlike the other analyses, step, KC, action and outcome cells
        are recorded as-is here: missing values are kept and multi-KC cells
        are not split.
        """
        file_iterator = self.load_file_iterator(sep=",")

        start_time = time.time()
        self.unique_schools = set()
        self.unique_class = set()
        self.unique_students = set()
        self.unique_problems = set()
        self.unique_steps = set()
        self.unique_kcs = set()
        self.unique_actions = set()
        self.unique_outcomes = set()
        self.unique_new_steps_w_action_attempt = set()
        self.unique_new_steps_w_kcs = set()
        self.unique_new_steps_w_action_attempt_kcs = set()

        for chunk_data in file_iterator:
            for school, school_group in chunk_data.groupby(self._COL_SCHOOL):
                self.unique_schools.add(school)
                for class_id, class_group in school_group.groupby(self._COL_CLASS):
                    self.unique_class.add(class_id)
                    for student_id, std_group in class_group.groupby(self._COL_STUDENT):
                        self.unique_students.add(student_id)
                        for prob, prob_group in std_group.groupby(self._COL_PROBLEM):
                            self.unique_problems.add(prob)
                            self._record_problem(prob_group)

        end_time = time.time()
        print("Time Taken to analyze dataset = ", end_time - start_time)
        print("Workspace-> ",workspace_name)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique classes->", len(self.unique_class))
        print("Length of unique step names ->", len(self.unique_steps))
        print("Length of unique knowledge components ->", len(self.unique_kcs))
        print("Length of unique actions ->", len(self.unique_actions))
        print("Length of unique outcomes ->", len(self.unique_outcomes))
        print("Length of unique new step names with actions and attempts ->", len(self.unique_new_steps_w_action_attempt))
        print("Length of unique new step names with actions, attempts and kcs ->", len(self.unique_new_steps_w_action_attempt_kcs))
        print("Length of unique new step names with kcs ->", len(self.unique_new_steps_w_kcs))

    def _record_problem(self, prob_group):
        """Record the raw step/KC/action/outcome values of one problem's
        rows, then the composite step labels of each of its steps."""
        step_names = set(prob_group[self._COL_STEP])
        sub_skills = set(prob_group[self._COL_KC])
        actions = set(prob_group[self._COL_ACTION])
        outcomes = set(prob_group[self._COL_OUTCOME])

        self.unique_steps.update(step_names)
        self.unique_kcs.update(sub_skills)
        self.unique_actions.update(actions)
        self.unique_outcomes.update(outcomes)

        for step in step_names:
            # A missing step name never equals itself, so it needs its own
            # row selection.
            if pd.isna(step):
                step_group = prob_group[pd.isna(prob_group[self._COL_STEP])]
            else:
                step_group = prob_group[prob_group[self._COL_STEP] == step]
            self._record_step_variants(step, step_group)

    def _record_step_variants(self, step, step_group):
        """Record the ``step:kc``, ``step:action:attempt`` and
        ``step:action:attempt:kc`` labels occurring in one step's rows."""
        for kc in set(step_group[self._COL_KC]):
            self.unique_new_steps_w_kcs.add(f"{step}:{kc}")

        for action, action_group in step_group.groupby(self._COL_ACTION):
            for attempt, attempt_group in action_group.groupby(self._COL_ATTEMPT):
                self.unique_new_steps_w_action_attempt.add(f"{step}:{action}:{attempt}")
                for kc in set(attempt_group[self._COL_KC]):
                    self.unique_new_steps_w_action_attempt_kcs.add(
                        f"{step}:{action}:{attempt}:{kc}"
                    )

    def load_file_iterator(self, sep="\t", chunksize=1000000):
        """Open the input file for chunked reading.

        Args:
            sep: Field delimiter handed to ``pandas.read_csv``.
            chunksize: Maximum number of rows per yielded DataFrame.

        Returns:
            The chunk iterator produced by ``pandas.read_csv``; the first
            line of the file is used as the header.
        """
        chunk_iterator = pd.read_csv(
            self.input_file_path,
            sep=sep,
            header=0,
            iterator=True,
            chunksize=chunksize,
        )
        return chunk_iterator