Spaces:

suryadev1
/

astra

Running

File size: 4,821 Bytes

6a34fd4

import time
import pandas as pd

import sys

class DataPreprocessor:
    """Collect the unique entities present in a tab-separated log file.

    The file at ``input_file_path`` is read in chunks with pandas and must
    contain the columns named by the ``_*_COL`` constants below. After an
    ``analyze_*`` call the ``unique_*`` attributes hold plain ``set`` objects
    with the distinct values that were seen.
    """

    # Column names expected in the input file.
    _STUDENT_COL = 'Anon Student Id'
    _WORKSPACE_COL = 'Level (Workspace Id)'
    _PROBLEM_COL = 'Problem Name'
    _STEP_COL = 'Step Name'
    _KC_COL = 'KC Model(MATHia)'
    # Several knowledge components may share one cell, joined by this token.
    _KC_SEPARATOR = "~~"

    def __init__(self, input_file_path):
        """Remember the input path; no file access happens here."""
        self.input_file_path = input_file_path
        self.unique_students = None
        self.unique_problems = None
        self.unique_prob_hierarchy = None
        self.unique_steps = None
        self.unique_kcs = None

    @classmethod
    def _split_kcs(cls, kc_values):
        """Return the set of individual KC names found in *kc_values*.

        Missing cells are skipped; every other cell is split on
        ``_KC_SEPARATOR``.
        """
        kcs = set()
        for value in kc_values:
            if pd.notna(value):
                kcs.update(value.split(cls._KC_SEPARATOR))
        return kcs

    def analyze_dataset(self):
        """Scan the whole file and record unique students, problems,
        workspaces and knowledge components, then print a summary.

        ``unique_steps`` is not touched by this method.
        """
        file_iterator = self.load_file_iterator()

        start_time = time.time()
        self.unique_students = set()
        self.unique_problems = set()
        self.unique_prob_hierarchy = set()
        self.unique_kcs = set()
        for chunk_data in file_iterator:
            # groupby drops rows whose key is missing, so a problem is only
            # counted when its student and workspace are both present.
            for student_id, student_rows in chunk_data.groupby(self._STUDENT_COL):
                self.unique_students.add(student_id)
                for hierarchy, hierarchy_rows in student_rows.groupby(self._WORKSPACE_COL):
                    self.unique_prob_hierarchy.add(hierarchy)
                    for problem_name, problem_rows in hierarchy_rows.groupby(self._PROBLEM_COL):
                        self.unique_problems.add(problem_name)
                        self.unique_kcs.update(self._split_kcs(problem_rows[self._KC_COL]))
        end_time = time.time()
        print("Time Taken to analyze dataset = ", end_time - start_time)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique problem hierarchy->", len(self.unique_prob_hierarchy))
        print("Length of Unique Knowledge components ->", len(self.unique_kcs))

    def analyze_dataset_by_section(self, workspace_name):
        """Like :meth:`analyze_dataset`, restricted to one workspace.

        Only rows whose workspace column equals *workspace_name* contribute.
        In addition to the sets filled by :meth:`analyze_dataset`, the
        distinct step names are stored in ``unique_steps``.
        """
        file_iterator = self.load_file_iterator()

        start_time = time.time()
        self.unique_students = set()
        self.unique_problems = set()
        self.unique_prob_hierarchy = set()
        self.unique_steps = set()
        self.unique_kcs = set()
        for chunk_data in file_iterator:
            # Filter once per chunk instead of grouping by workspace and
            # comparing every group key against the requested name.
            workspace_rows = chunk_data[chunk_data[self._WORKSPACE_COL] == workspace_name]
            for student_id, student_rows in workspace_rows.groupby(self._STUDENT_COL):
                self.unique_students.add(student_id)
                self.unique_prob_hierarchy.add(workspace_name)
                for problem_name, problem_rows in student_rows.groupby(self._PROBLEM_COL):
                    self.unique_problems.add(problem_name)
                    self.unique_steps.update(problem_rows[self._STEP_COL].dropna())
                    self.unique_kcs.update(self._split_kcs(problem_rows[self._KC_COL]))
        end_time = time.time()
        print("Time Taken to analyze dataset = ", end_time - start_time)
        print("Workspace-> ",workspace_name)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique problem hierarchy->", len(self.unique_prob_hierarchy))
        print("Length of unique step names ->", len(self.unique_steps))
        print("Length of unique knowledge components ->", len(self.unique_kcs))

    def load_file_iterator(self, chunksize=1000000):
        """Return a chunked pandas reader over the tab-separated input file.

        Args:
            chunksize: Number of rows per yielded DataFrame.
        """
        chunk_iterator = pd.read_csv(
            self.input_file_path,
            sep="\t",
            header=0,
            iterator=True,
            chunksize=chunksize,
        )
        return chunk_iterator