Spaces:
Sleeping
Sleeping
Update my_model/results/evaluation.py
Browse files
my_model/results/evaluation.py
CHANGED
@@ -31,6 +31,7 @@ class KBVQAEvaluator:
|
|
31 |
gpt4_max_tokens (int): Maximum tokens for GPT-4 responses.
|
32 |
gpt4_temperature (float): Temperature setting for GPT-4 responses.
|
33 |
"""
|
|
|
34 |
|
35 |
def __init__(self) -> None:
|
36 |
"""
|
@@ -55,6 +56,7 @@ class KBVQAEvaluator:
|
|
55 |
self.gpt4_max_tokens = config.GPT4_MAX_TOKENS
|
56 |
self.gpt4_temperature = config.GPT4_TEMPERATURE
|
57 |
|
|
|
58 |
def stem_answers(self, answers: Union[str, List[str]]) -> Union[str, List[str]]:
|
59 |
"""
|
60 |
Apply Porter Stemmer to either a single string or a list of strings.
|
@@ -72,6 +74,7 @@ class KBVQAEvaluator:
|
|
72 |
words = answers.split()
|
73 |
return " ".join(self.stemmer.stem(word.strip()) for word in words)
|
74 |
|
|
|
75 |
def calculate_vqa_score(self, ground_truths: List[str], model_answer: str) -> float:
|
76 |
"""
|
77 |
Calculate VQA score based on the number of matching answers, with optional fuzzy matching.
|
@@ -91,6 +94,7 @@ class KBVQAEvaluator:
|
|
91 |
count = Counter(ground_truths)
|
92 |
return min(count.get(model_answer, 0) / 3, 1)
|
93 |
|
|
|
94 |
def calculate_exact_match_score(self, ground_truths: List[str], model_answer: str) -> int:
|
95 |
"""
|
96 |
Calculate Exact Match score, with optional fuzzy matching.
|
@@ -108,10 +112,13 @@ class KBVQAEvaluator:
|
|
108 |
else:
|
109 |
return int(model_answer in ground_truths)
|
110 |
|
|
|
111 |
def syntactic_evaluation(self) -> None:
|
112 |
"""
|
113 |
Process the DataFrame: stem answers, calculate scores, and store results.
|
114 |
|
|
|
|
|
115 |
"""
|
116 |
|
117 |
self.df['raw_answers_stemmed'] = self.df['raw_answers'].apply(literal_eval).apply(self.stem_answers)
|
@@ -127,6 +134,7 @@ class KBVQAEvaluator:
|
|
127 |
self.vqa_scores[full_config] = round(self.df[f'vqa_score_{full_config}'].mean()*100, 2)
|
128 |
self.exact_match_scores[full_config] = round(self.df[f'exact_match_score_{full_config}'].mean()*100, 2)
|
129 |
|
|
|
130 |
def create_GPT4_messages_template(self, question: str, ground_truths: List[str], model_answer: str) -> List[dict]:
|
131 |
"""
|
132 |
Create a message list for the GPT-4 API call based on the question, ground truths, and model answer.
|
@@ -158,6 +166,9 @@ class KBVQAEvaluator:
|
|
158 |
def semantic_evaluation(self) -> None:
|
159 |
"""
|
160 |
Perform semantic evaluation using GPT-4 for each model configuration.
|
|
|
|
|
|
|
161 |
"""
|
162 |
openai.api_key = self.openai_api_key
|
163 |
model_configurations_for_semantic_evaluation = self.model_configurations[:2]  # Only the main model configs ['caption+detic', 'caption+yolov5'] are considered; ablation configs are skipped due to the cost involved.
|
@@ -192,6 +203,8 @@ class KBVQAEvaluator:
|
|
192 |
self.df.to_excel(writer, sheet_name='Main Data', index=False)
|
193 |
scores_df.to_excel(writer, sheet_name='Scores', index=False)
|
194 |
|
|
|
|
|
195 |
def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
|
196 |
"""
|
197 |
Run the full evaluation process using KBVQAEvaluator and save the results to an Excel file.
|
@@ -199,6 +212,9 @@ def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
|
|
199 |
Args:
|
200 |
save (bool): Whether to save the results to an Excel file. Defaults to False.
|
201 |
save_filename (str): The filename to save the results if save is True. Defaults to "results".
|
|
|
|
|
|
|
202 |
"""
|
203 |
|
204 |
# Instantiate the evaluator
|
|
|
31 |
gpt4_max_tokens (int): Maximum tokens for GPT-4 responses.
|
32 |
gpt4_temperature (float): Temperature setting for GPT-4 responses.
|
33 |
"""
|
34 |
+
|
35 |
|
36 |
def __init__(self) -> None:
|
37 |
"""
|
|
|
56 |
self.gpt4_max_tokens = config.GPT4_MAX_TOKENS
|
57 |
self.gpt4_temperature = config.GPT4_TEMPERATURE
|
58 |
|
59 |
+
|
60 |
def stem_answers(self, answers: Union[str, List[str]]) -> Union[str, List[str]]:
|
61 |
"""
|
62 |
Apply Porter Stemmer to either a single string or a list of strings.
|
|
|
74 |
words = answers.split()
|
75 |
return " ".join(self.stemmer.stem(word.strip()) for word in words)
|
76 |
|
77 |
+
|
78 |
def calculate_vqa_score(self, ground_truths: List[str], model_answer: str) -> float:
|
79 |
"""
|
80 |
Calculate VQA score based on the number of matching answers, with optional fuzzy matching.
|
|
|
94 |
count = Counter(ground_truths)
|
95 |
return min(count.get(model_answer, 0) / 3, 1)
|
96 |
|
97 |
+
|
98 |
def calculate_exact_match_score(self, ground_truths: List[str], model_answer: str) -> int:
|
99 |
"""
|
100 |
Calculate Exact Match score, with optional fuzzy matching.
|
|
|
112 |
else:
|
113 |
return int(model_answer in ground_truths)
|
114 |
|
115 |
+
|
116 |
def syntactic_evaluation(self) -> None:
|
117 |
"""
|
118 |
Process the DataFrame: stem answers, calculate scores, and store results.
|
119 |
|
120 |
+
Returns:
|
121 |
+
None.
|
122 |
"""
|
123 |
|
124 |
self.df['raw_answers_stemmed'] = self.df['raw_answers'].apply(literal_eval).apply(self.stem_answers)
|
|
|
134 |
self.vqa_scores[full_config] = round(self.df[f'vqa_score_{full_config}'].mean()*100, 2)
|
135 |
self.exact_match_scores[full_config] = round(self.df[f'exact_match_score_{full_config}'].mean()*100, 2)
|
136 |
|
137 |
+
|
138 |
def create_GPT4_messages_template(self, question: str, ground_truths: List[str], model_answer: str) -> List[dict]:
|
139 |
"""
|
140 |
Create a message list for the GPT-4 API call based on the question, ground truths, and model answer.
|
|
|
166 |
def semantic_evaluation(self) -> None:
|
167 |
"""
|
168 |
Perform semantic evaluation using GPT-4 for each model configuration.
|
169 |
+
|
170 |
+
Returns:
|
171 |
+
None.
|
172 |
"""
|
173 |
openai.api_key = self.openai_api_key
|
174 |
model_configurations_for_semantic_evaluation = self.model_configurations[:2]  # Only the main model configs ['caption+detic', 'caption+yolov5'] are considered; ablation configs are skipped due to the cost involved.
|
|
|
203 |
self.df.to_excel(writer, sheet_name='Main Data', index=False)
|
204 |
scores_df.to_excel(writer, sheet_name='Scores', index=False)
|
205 |
|
206 |
+
|
207 |
+
|
208 |
def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
|
209 |
"""
|
210 |
Run the full evaluation process using KBVQAEvaluator and save the results to an Excel file.
|
|
|
212 |
Args:
|
213 |
save (bool): Whether to save the results to an Excel file. Defaults to False.
|
214 |
save_filename (str): The filename to save the results if save is True. Defaults to "results".
|
215 |
+
|
216 |
+
Returns:
|
217 |
+
None.
|
218 |
"""
|
219 |
|
220 |
# Instantiate the evaluator
|