Spaces:
Running
Running
Update my_model/dataset/dataset_processor.py
Browse files
my_model/dataset/dataset_processor.py
CHANGED
@@ -35,6 +35,7 @@ class OKVQADatasetProcessor:
|
|
35 |
self.df_answers = pd.DataFrame(self.annotations)
|
36 |
self.merged_df = None
|
37 |
|
|
|
38 |
def load_data_files(self) -> Tuple[List[dict], List[dict]]:
|
39 |
"""
|
40 |
Loads the question and annotation data from JSON files.
|
@@ -52,6 +53,7 @@ class OKVQADatasetProcessor:
|
|
52 |
|
53 |
return questions, annotations
|
54 |
|
|
|
55 |
@staticmethod
|
56 |
def find_most_frequent(my_list: List[str]) -> Optional[str]:
|
57 |
"""
|
@@ -69,6 +71,7 @@ class OKVQADatasetProcessor:
|
|
69 |
most_common = counter.most_common(1)
|
70 |
return most_common[0][0]
|
71 |
|
|
|
72 |
def merge_data(self) -> None:
|
73 |
"""
|
74 |
Merges the question and answer DataFrames on a common key.
|
@@ -81,9 +84,10 @@ class OKVQADatasetProcessor:
|
|
81 |
self.merged_df = pd.merge(self.df_questions, self.df_answers, on=['question_id', 'image_id'])
|
82 |
|
83 |
def join_words_with_hyphen(self, sentence):
|
84 |
-
|
85 |
return '-'.join(sentence.split())
|
86 |
|
|
|
87 |
def process_answers(self) -> None:
|
88 |
"""
|
89 |
Processes answers from merged DataFrame by extracting and identifying the most frequent answers.
|
@@ -103,6 +107,7 @@ class OKVQADatasetProcessor:
|
|
103 |
self.merged_df['single_word_answers'] = self.merged_df['most_frequent_processed_answer'].apply(
|
104 |
self.join_words_with_hyphen)
|
105 |
|
|
|
106 |
def get_processed_data(self) -> Optional[pd.DataFrame]:
|
107 |
"""
|
108 |
Retrieves the processed DataFrame.
|
@@ -117,6 +122,7 @@ class OKVQADatasetProcessor:
|
|
117 |
print("DataFrame is empty or not processed yet.")
|
118 |
return None
|
119 |
|
|
|
120 |
def save_to_csv(self, df: pd.DataFrame, saved_file_name: Optional[str]) -> None:
|
121 |
"""
|
122 |
Saves the DataFrame to a CSV file.
|
@@ -134,6 +140,7 @@ class OKVQADatasetProcessor:
|
|
134 |
else:
|
135 |
df.to_csv("data.csv", index=None)
|
136 |
|
|
|
137 |
def display_dataframe(self) -> None:
|
138 |
"""
|
139 |
Displays the processed DataFrame.
|
|
|
35 |
self.df_answers = pd.DataFrame(self.annotations)
|
36 |
self.merged_df = None
|
37 |
|
38 |
+
|
39 |
def load_data_files(self) -> Tuple[List[dict], List[dict]]:
|
40 |
"""
|
41 |
Loads the question and annotation data from JSON files.
|
|
|
53 |
|
54 |
return questions, annotations
|
55 |
|
56 |
+
|
57 |
@staticmethod
|
58 |
def find_most_frequent(my_list: List[str]) -> Optional[str]:
|
59 |
"""
|
|
|
71 |
most_common = counter.most_common(1)
|
72 |
return most_common[0][0]
|
73 |
|
74 |
+
|
75 |
def merge_data(self) -> None:
|
76 |
"""
|
77 |
Merges the question and answer DataFrames on a common key.
|
|
|
84 |
self.merged_df = pd.merge(self.df_questions, self.df_answers, on=['question_id', 'image_id'])
|
85 |
|
86 |
def join_words_with_hyphen(self, sentence):
|
87 |
+
|
88 |
return '-'.join(sentence.split())
|
89 |
|
90 |
+
|
91 |
def process_answers(self) -> None:
|
92 |
"""
|
93 |
Processes answers from merged DataFrame by extracting and identifying the most frequent answers.
|
|
|
107 |
self.merged_df['single_word_answers'] = self.merged_df['most_frequent_processed_answer'].apply(
|
108 |
self.join_words_with_hyphen)
|
109 |
|
110 |
+
|
111 |
def get_processed_data(self) -> Optional[pd.DataFrame]:
|
112 |
"""
|
113 |
Retrieves the processed DataFrame.
|
|
|
122 |
print("DataFrame is empty or not processed yet.")
|
123 |
return None
|
124 |
|
125 |
+
|
126 |
def save_to_csv(self, df: pd.DataFrame, saved_file_name: Optional[str]) -> None:
|
127 |
"""
|
128 |
Saves the DataFrame to a CSV file.
|
|
|
140 |
else:
|
141 |
df.to_csv("data.csv", index=None)
|
142 |
|
143 |
+
|
144 |
def display_dataframe(self) -> None:
|
145 |
"""
|
146 |
Displays the processed DataFrame.
|