File size: 13,958 Bytes
fb754b1 dab7e6b fb754b1 dab7e6b fb754b1 dab7e6b fb754b1 bd62fcd fb754b1 bd62fcd fb754b1 bd62fcd fb754b1 bd62fcd fb754b1 bd62fcd fb754b1 4b5b9d2 fb754b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 |
import os
import altair as alt
import evaluation_config as config
import streamlit as st
from PIL import Image
import pandas as pd
import random
class ResultDemonstrator:
    """
    A class to demonstrate the results of the Knowledge-Based Visual Question Answering (KB-VQA) model.

    Attributes:
        main_data (pd.DataFrame): Data loaded from the "Main Data" sheet of the evaluation Excel file.
        sample_img_pool (list[str]): Names of the entries found in the demo directory.
        model_names (list[str]): Model names as defined in the configuration module.
        model_configs (list[str]): Model configurations as defined in the configuration module.
    """

    # Workbook holding the evaluation results and the folder holding the demo images.
    RESULTS_FILE = 'evaluation_results.xlsx'
    DEMO_DIR = 'demo'

    # Score type -> template of the column that stores it for a given "<model>_<conf>" name.
    _SCORE_COLUMN_TEMPLATES = {
        'vqa': 'vqa_score_{config}',
        'vqa_gpt4': 'gpt4_vqa_score_{config}',
        'em': 'exact_match_score_{config}',
        'em_gpt4': 'gpt4_em_score_{config}'
    }

    # Score type -> column header used in the per-sample summary table.
    _SCORE_DISPLAY_LABELS = {
        'vqa': 'VQA Score',
        'vqa_gpt4': 'VQA Score (GPT-4)',
        'em': 'EM Score',
        'em_gpt4': 'EM Score (GPT-4)'
    }

    def __init__(self) -> None:
        """
        Initializes the ResultDemonstrator class by loading the data from an Excel file
        and listing the contents of the demo directory.
        """
        self.main_data = pd.read_excel(self.RESULTS_FILE, sheet_name="Main Data")
        # os.listdir already returns a list, no extra copy needed.
        self.sample_img_pool = os.listdir(self.DEMO_DIR)
        self.model_names = config.MODEL_NAMES
        self.model_configs = config.MODEL_CONFIGURATIONS

    @staticmethod
    def display_table(data: pd.DataFrame) -> None:
        """
        Displays a DataFrame using Streamlit's dataframe display function.

        Args:
            data (pd.DataFrame): The data to display.
        """
        st.dataframe(data)

    @staticmethod
    def _apply_elementwise(target, func, **kwargs):
        """
        Applies `func` to every element of a DataFrame or Styler.

        Uses `map` when the object provides it and falls back to `applymap`
        otherwise, so the call works across pandas releases where `applymap`
        is deprecated as well as older ones where `map` does not exist.

        Args:
            target: A pandas DataFrame or Styler.
            func: Callable applied to each element.
            **kwargs: Extra keyword arguments forwarded unchanged (e.g. `subset`).

        Returns:
            Whatever the underlying `map` / `applymap` call returns.
        """
        method = getattr(target, "map", None)
        if not callable(method):
            method = target.applymap
        return method(func, **kwargs)

    def calculate_and_append_data(self, data_list: list, score_column: str, model_config: str) -> None:
        """
        Calculates mean scores by category and appends them to the data list.

        Nothing is appended when `score_column` is not a column of `main_data`.

        Args:
            data_list (list): List to append new data rows to (mutated in place).
            score_column (str): Name of the column to calculate mean scores for.
            model_config (str): Configuration of the model, stored in each appended row.
        """
        if score_column not in self.main_data.columns:
            return
        category_means = self.main_data.groupby('question_category')[score_column].mean()
        for category, mean_value in category_means.items():
            data_list.append({
                "Category": category,
                "Configuration": model_config,
                # Means are stored as percentages rounded to two decimals.
                "Mean Value": round(mean_value * 100, 2)
            })

    def display_ablation_results_per_question_category(self) -> None:
        """Displays ablation results per question category for each model configuration."""
        data_lists = {score_type: [] for score_type in self._SCORE_COLUMN_TEMPLATES}

        for model_name in self.model_names:
            for conf in self.model_configs:
                model_config = f"{model_name}_{conf}"
                for score_type, col_template in self._SCORE_COLUMN_TEMPLATES.items():
                    self.calculate_and_append_data(data_lists[score_type],
                                                   col_template.format(config=model_config),
                                                   model_config)

        # Process and display results for each score type
        for score_type, data_list in data_lists.items():
            with st.expander(f"{score_type.upper()} Scores per Question Category and Model Configuration"):
                if not data_list:
                    # Pivoting an empty frame would raise KeyError on the missing columns.
                    st.write("No data available for this score type.")
                    continue
                pivoted = pd.DataFrame(data_list).pivot(index='Category', columns='Configuration',
                                                        values='Mean Value')
                results_df = self._apply_elementwise(pivoted, lambda x: f"{x:.2f}%")
                self.display_table(results_df)

    def display_main_results(self) -> None:
        """Displays the main model results from the Scores sheet, these are displayed from the file directly."""
        main_scores = pd.read_excel(self.RESULTS_FILE, sheet_name="Scores", index_col=0)
        st.markdown("### Main Model Results (Inclusive of Ablation Experiments)")
        # The previous `main_scores.reset_index()` call discarded its result and had no
        # effect, so it was dropped; the table is still shown with the first column as index.
        self.display_table(main_scores)

    def plot_token_count_vs_scores(self, conf: str, model_name: str, score_name: str = 'VQA Score') -> None:
        """
        Plots an interactive scatter plot comparing token counts to VQA or EM scores using Altair.

        Args:
            conf (str): The configuration name.
            model_name (str): The name of the model.
            score_name (str): The type of score to plot. 'VQA Score' selects the VQA
                score column; any other value selects the exact-match score column.
        """
        # Construct the full model configuration name
        model_configuration = f"{model_name}_{conf}"

        # Determine the score column name and legend mapping based on the score type
        if score_name == 'VQA Score':
            scores = self.main_data[f"vqa_score_{model_configuration}"]
            # Map scores to categories for the legend
            legend_map = ['Correct' if score == 1
                          else 'Partially Correct' if round(score, 2) == 0.67
                          else 'Incorrect'
                          for score in scores]
            color_scale = alt.Scale(domain=['Correct', 'Partially Correct', 'Incorrect'],
                                    range=['green', 'orange', 'red'])
        else:
            scores = self.main_data[f"exact_match_score_{model_configuration}"]
            # Map scores to categories for the legend
            legend_map = ['Correct' if score == 1 else 'Incorrect' for score in scores]
            color_scale = alt.Scale(domain=['Correct', 'Incorrect'], range=['green', 'red'])

        # Retrieve token counts from the data (keyed by configuration only, not by model)
        token_counts = self.main_data[f'tokens_count_{conf}']

        # Create a DataFrame for the scatter plot
        scatter_data = pd.DataFrame({
            'Index': range(len(token_counts)),
            'Token Counts': token_counts,
            score_name: legend_map
        })

        # Create an interactive scatter plot using Altair
        chart = alt.Chart(scatter_data).mark_circle(
            size=60,
            fillOpacity=1,  # Sets the fill opacity to maximum
            strokeWidth=1,  # Adjusts the border width making the circles bolder
            stroke='black'  # Sets the border color to black
        ).encode(
            x=alt.X('Index', scale=alt.Scale(domain=[0, 1020])),
            y=alt.Y('Token Counts',
                    scale=alt.Scale(domain=[token_counts.min() - 200, token_counts.max() + 200])),
            color=alt.Color(score_name, scale=color_scale, legend=alt.Legend(title=score_name)),
            tooltip=['Index', 'Token Counts', score_name]
        ).interactive()  # Enables zoom & pan

        chart = chart.properties(
            title={
                # The literal " + Score + " that used to appear here was rendered
                # verbatim in the chart title; it has been removed.
                "text": f"Token Counts vs {score_name} ({model_configuration})",
                "color": "black",  # Optional color
                "fontSize": 20,  # Optional font size
                "anchor": "middle",  # Optional anchor position
                "offset": 0  # Optional offset
            },
            width=700,
            height=500
        )

        # Display the interactive plot in Streamlit
        st.altair_chart(chart, use_container_width=True)

    @staticmethod
    def color_scores(value: float) -> str:
        """
        Applies color coding based on the score value.

        Args:
            value (float): The score value (anything accepted by `float()`).

        Returns:
            str: CSS color style based on score value: green for 1.0, red for 0.0,
            orange for a value that rounds to 0.67, black otherwise (including
            values that cannot be converted to a number).
        """
        try:
            value = float(value)  # Convert to float to handle numerical comparisons
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric objects, ValueError covers
            # strings such as the '-' placeholder.
            return 'color: black;'

        if value == 1.0:
            return 'color: green;'
        if value == 0.0:
            return 'color: red;'
        # Rounded comparison so an unrounded 2/3 score is treated the same way the
        # scatter-plot legend treats it.
        if round(value, 2) == 0.67:
            return 'color: orange;'
        return 'color: black;'

    def _build_sample_summary(self, record: pd.Series, model_configs: list) -> pd.DataFrame:
        """
        Builds the per-image summary table: one row per model configuration with its
        answer and the four score columns.

        Args:
            record (pd.Series): The data row of the image being shown.
            model_configs (list): Full "<model>_<conf>" configuration names.

        Returns:
            pd.DataFrame: Summary with a 'Model Configuration' column, an 'Answer'
            column and one column per score label. Missing entries are shown as '-'.
        """
        rows = []
        for model_config in model_configs:
            row = {
                'Model Configuration': model_config,
                'Answer': record.get(f'{model_config}', '-')
            }
            for score_type, score_template in self._SCORE_COLUMN_TEMPLATES.items():
                score_value = record.get(score_template.format(config=model_config), '-')
                if pd.notna(score_value) and not isinstance(score_value, str):
                    # Format score to two decimals if it's a valid number
                    score_value = f"{float(score_value):.2f}"
                row[self._SCORE_DISPLAY_LABELS[score_type]] = score_value
            rows.append(row)

        columns = ['Model Configuration', 'Answer', *self._SCORE_DISPLAY_LABELS.values()]
        return pd.DataFrame(rows, columns=columns)

    def show_samples(self, num_samples: int = 3) -> None:
        """
        Displays random sample images and their associated models answers and evaluations.

        Args:
            num_samples (int): Number of sample images to display. Capped at the
                number of images available in the pool.
        """
        # random.sample raises ValueError when asked for more items than the pool holds.
        sample_count = min(num_samples, len(self.sample_img_pool))
        if sample_count <= 0:
            st.write("No sample images available.")
            return

        # Sample images from the pool
        target_imgs = random.sample(self.sample_img_pool, sample_count)

        # Generate model configurations
        model_configs = [f"{model_name}_{conf}"
                         for model_name in self.model_names
                         for conf in self.model_configs]
        score_labels = list(self._SCORE_DISPLAY_LABELS.values())

        for img_filename in target_imgs:
            image_data = self.main_data[self.main_data['image_filename'] == img_filename]
            # Guard every row access: an image without a matching row has no record.
            record = None if image_data.empty else image_data.iloc[0]

            col1, col2 = st.columns([1, 2])  # to display images side by side with their data.

            # Create a container for each image
            with st.container():
                st.write("-------------------------------")

                with col1:
                    # Context manager closes the underlying file once the image is rendered.
                    with Image.open(f"{self.DEMO_DIR}/{img_filename}") as im:
                        st.image(im, use_column_width=True)
                    if record is not None:
                        with st.expander('Show Caption'):
                            st.text(record['caption'])
                        with st.expander('Show DETIC Objects'):
                            st.text(record['objects_detic_trimmed'])
                        with st.expander('Show YOLOv5 Objects'):
                            st.text(record['objects_yolov5'])

                with col2:
                    if record is None:
                        st.write("No data available for this image.")
                        continue

                    st.write(f"**Question: {record['question']}**")
                    st.write(f"**Ground Truth Answers:** {record['raw_answers']}")

                    summary_data = self._build_sample_summary(record, model_configs)

                    # Apply styling to DataFrame for score coloring
                    styled_summary = self._apply_elementwise(summary_data.style, self.color_scores,
                                                             subset=score_labels)
                    st.markdown(styled_summary.to_html(escape=False, index=False), unsafe_allow_html=True)

    def run_demo(self) -> None:
        """
        Run the interactive Streamlit demo for visualizing model evaluation results and analysis.
        """
        # Widgets below are only created on some branches; start from explicit defaults
        # so every name is bound regardless of the selection.
        analysis_type = None
        model_name = None
        score_name = None
        samples_button = False

        col1, col2 = st.columns([1, 4])

        with col1:
            # User selects the evaluation analysis aspect
            section_type = st.radio("Select Evaluation Aspect",
                                    ["Evaluation Results & Analysis", 'Evaluation Samples'])

            # Only show analysis type if the section type is "Evaluation Results & Analysis"
            if section_type == "Evaluation Results & Analysis":
                analysis_type = st.radio("Select Type", ["Main & Ablation Results", "Results per Question Category",
                                                         "Prompt Length (token count) Impact on Performance"], index=2)
                if analysis_type == "Prompt Length (token count) Impact on Performance":
                    # Based on the selection, other options appear
                    model_name = st.radio("Select Model Size", self.model_names)
                    score_name = st.radio("Select Score Type", ["VQA Score", "Exact Match"])
            elif section_type == 'Evaluation Samples':
                samples_button = st.button("Generate Random Samples")

        with col2:
            if section_type == "Evaluation Results & Analysis":
                if analysis_type == "Prompt Length (token count) Impact on Performance":
                    for conf in self.model_configs:
                        with st.expander(conf):
                            self.plot_token_count_vs_scores(conf, model_name, score_name)
                elif analysis_type == "Main & Ablation Results":
                    self.display_main_results()
                elif analysis_type == "Results per Question Category":
                    self.display_ablation_results_per_question_category()
            elif section_type == 'Evaluation Samples':
                if samples_button:
                    self.show_samples(3)
|