Spaces:
Running
Running
Update my_model/utilities/gen_utilities.py
Browse files
my_model/utilities/gen_utilities.py
CHANGED
@@ -1,6 +1,4 @@
|
|
1 |
import pandas as pd
|
2 |
-
from collections import Counter
|
3 |
-
import json
|
4 |
import os
|
5 |
from PIL import Image
|
6 |
import numpy as np
|
@@ -12,160 +10,6 @@ import gc
|
|
12 |
import streamlit as st
|
13 |
|
14 |
|
15 |
-
class VQADataProcessor:
    """
    A class to process OKVQA dataset.

    Attributes:
        questions_file_path (str): The file path for the questions JSON file.
        annotations_file_path (str): The file path for the annotations JSON file.
        questions (list): List of questions extracted from the JSON file.
        annotations (list): List of annotations extracted from the JSON file.
        df_questions (DataFrame): DataFrame created from the questions list.
        df_answers (DataFrame): DataFrame created from the annotations list.
        merged_df (DataFrame): DataFrame resulting from merging questions and answers.
            None until merge_dataframes() has been called.
    """

    def __init__(self, questions_file_path, annotations_file_path):
        """
        Initializes the VQADataProcessor with file paths for questions and annotations.

        Both JSON files are read immediately and converted to DataFrames.

        Parameters:
            questions_file_path (str): The file path for the questions JSON file.
            annotations_file_path (str): The file path for the annotations JSON file.
        """
        self.questions_file_path = questions_file_path
        self.annotations_file_path = annotations_file_path
        self.questions, self.annotations = self.read_json_files()
        self.df_questions = pd.DataFrame(self.questions)
        self.df_answers = pd.DataFrame(self.annotations)
        self.merged_df = None

    def read_json_files(self):
        """
        Reads the JSON files for questions and annotations.

        Returns:
            tuple: A tuple containing two lists: the value stored under the
            'questions' key of the questions file and the value stored under
            the 'annotations' key of the annotations file.

        Raises:
            KeyError: If the expected top-level key is missing from a file.
        """
        # Local import keeps this block self-contained regardless of the
        # module-level import list.
        import json

        # JSON text is UTF-8 by specification; do not rely on the platform
        # default encoding.
        with open(self.questions_file_path, 'r', encoding='utf-8') as file:
            questions = json.load(file)['questions']

        with open(self.annotations_file_path, 'r', encoding='utf-8') as file:
            annotations = json.load(file)['annotations']

        return questions, annotations

    @staticmethod
    def find_most_frequent(my_list):
        """
        Finds the most frequent item in a list.

        Parameters:
            my_list (list): A list of items.

        Returns:
            The most frequent item in the list. Returns None if the list is empty.
        """
        from collections import Counter

        if not my_list:
            return None
        return Counter(my_list).most_common(1)[0][0]

    def merge_dataframes(self):
        """
        Merges the questions and answers DataFrames on 'question_id' and 'image_id'.

        The result is stored in self.merged_df.
        """
        self.merged_df = pd.merge(self.df_questions, self.df_answers, on=['question_id', 'image_id'])

    def join_words_with_hyphen(self, sentence):
        """
        Joins the whitespace-separated words of a sentence with hyphens.

        Parameters:
            sentence (str): The text to convert, e.g. "hot dog".

        Returns:
            str: The hyphen-joined text, e.g. "hot-dog". Returns None when
            sentence is None (find_most_frequent yields None for an empty
            answer list).
        """
        if sentence is None:
            return None
        return '-'.join(sentence.split())

    def process_answers(self):
        """
        Processes the answers by extracting raw and processed answers and finding the most frequent ones.

        Adds the columns 'raw_answers', 'processed_answers',
        'most_frequent_raw_answer', 'most_frequent_processed_answer' and
        'single_word_answers' to self.merged_df and drops the 'answers' column.
        Does nothing (other than printing a notice) when the DataFrames have
        not been merged yet.
        """
        if self.merged_df is None:
            print("DataFrames have not been merged yet.")
            # Stop here: there is no DataFrame to add columns to.
            return

        merged = self.merged_df
        merged['raw_answers'] = merged['answers'].apply(lambda x: [ans['raw_answer'] for ans in x])
        merged['processed_answers'] = merged['answers'].apply(lambda x: [ans['answer'] for ans in x])
        merged['most_frequent_raw_answer'] = merged['raw_answers'].apply(self.find_most_frequent)
        merged['most_frequent_processed_answer'] = merged['processed_answers'].apply(self.find_most_frequent)
        merged.drop(columns=['answers'], inplace=True)

        # Collapse multi-word answers into a single hyphenated token.
        merged['single_word_answers'] = merged['most_frequent_processed_answer'].apply(
            self.join_words_with_hyphen)

    def get_processed_data(self):
        """
        Retrieves the processed DataFrame.

        Returns:
            DataFrame: The processed DataFrame. Returns None if the DataFrames have not been merged yet.
        """
        if self.merged_df is not None:
            return self.merged_df

        print("DataFrame is empty or not processed yet.")
        return None

    def save_to_csv(self, df, saved_file_name):
        """
        Writes a DataFrame to a CSV file without the index column.

        Parameters:
            df (DataFrame): The DataFrame to save.
            saved_file_name (str): Target file name. A ".csv" suffix is
                appended when it is missing. When None, "data.csv" is used.
        """
        if saved_file_name is None:
            target = "data.csv"
        elif str(saved_file_name).lower().endswith(".csv"):
            target = saved_file_name
        else:
            # Append the extension to the name itself; os.path.join would
            # create a file literally called ".csv" inside a directory.
            target = f"{saved_file_name}.csv"

        df.to_csv(target, index=False)

    def display_dataframe(self):
        """
        Displays the processed DataFrame.
        """
        if self.merged_df is not None:
            print(self.merged_df)
        else:
            print("DataFrame is empty.")
|
140 |
-
|
141 |
-
|
142 |
-
def process_okvqa_dataset(questions_file_path, annotations_file_path, save_to_csv=False, saved_file_name=None):
    """
    Runs the full OK-VQA preparation pipeline for one questions/annotations file pair.

    A VQADataProcessor is created from the two paths, its DataFrames are
    merged, the answers are processed, and the resulting DataFrame is
    optionally written to CSV.

    Parameters:
        questions_file_path (str): The file path for the questions JSON file.
        annotations_file_path (str): The file path for the annotations JSON file.
        save_to_csv (bool): When truthy, the result is also passed to the
            processor's save_to_csv method. Defaults to False.
        saved_file_name (str): File name forwarded to save_to_csv; only used
            when save_to_csv is truthy. Defaults to None.

    Returns:
        DataFrame: The value returned by the processor's get_processed_data().
    """
    vqa = VQADataProcessor(questions_file_path, annotations_file_path)

    # Order matters: answers can only be processed on the merged frame.
    vqa.merge_dataframes()
    vqa.process_answers()
    result = vqa.get_processed_data()

    if save_to_csv:
        vqa.save_to_csv(result, saved_file_name)

    return result
|
167 |
-
|
168 |
-
|
169 |
def show_image(image):
|
170 |
"""
|
171 |
Display an image in various environments (Jupyter, PyCharm, Hugging Face Spaces).
|
@@ -307,11 +151,3 @@ def free_gpu_resources():
|
|
307 |
gc.collect()
|
308 |
gc.collect()
|
309 |
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
if __name__ == "__main__":
    # Running this module directly is intentionally a no-op.
    pass
    # Example invocations, kept disabled:
    #val_data = process_okvqa_dataset('OpenEnded_mscoco_val2014_questions.json', 'mscoco_val2014_annotations.json', save_to_csv=True, saved_file_name="okvqa_val.csv")
    #train_data = process_okvqa_dataset('OpenEnded_mscoco_train2014_questions.json', 'mscoco_train2014_annotations.json', save_to_csv=True, saved_file_name="okvqa_train.csv")
|
|
|
1 |
import pandas as pd
|
|
|
|
|
2 |
import os
|
3 |
from PIL import Image
|
4 |
import numpy as np
|
|
|
10 |
import streamlit as st
|
11 |
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
def show_image(image):
|
14 |
"""
|
15 |
Display an image in various environments (Jupyter, PyCharm, Hugging Face Spaces).
|
|
|
151 |
gc.collect()
|
152 |
gc.collect()
|
153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|