import streamlit as st
import json
from collections import Counter
import contractions
import csv
import random
import pandas as pd
import altair as alt
from typing import Dict, List
from my_model.dataset.dataset_processor import process_okvqa_dataset
from my_model.config import dataset_config as config
class OKVQADatasetAnalyzer:
"""
    Provides tools for analyzing and visualizing the distribution of question types within given question
    datasets. It supports loading data, categorizing questions by keyword, visualizing the question-type
    distribution, and exporting the results to CSV files.
Attributes:
train_file_path (str): Path to the training dataset file.
test_file_path (str): Path to the testing dataset file.
data_choice (str): Choice of dataset(s) to analyze; options include 'train', 'test', or 'train_test'.
questions (List[str]): List of questions aggregated based on the dataset choice.
question_types (Counter): Counter object tracking the frequency of each question type.
Qs (Dict[str, List[str]]): Dictionary mapping question types to lists of corresponding questions.
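
    Example (illustrative; the JSON paths below are placeholders):
        analyzer = OKVQADatasetAnalyzer('train_questions.json', 'val_questions.json', 'train_test')
        analyzer.categorize_questions()
        analyzer.export_to_csv('questions_by_type.csv', 'question_type_counts.csv')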
"""
def __init__(self, train_file_path: str, test_file_path: str, data_choice: str):
"""
Initializes the OKVQADatasetAnalyzer with paths to dataset files and a choice of which datasets to analyze.
Parameters:
train_file_path (str): Path to the training dataset JSON file. This file should contain a list of questions.
test_file_path (str): Path to the testing dataset JSON file. This file should also contain a list of
questions.
        data_choice (str): Specifies which dataset(s) to load and analyze. Valid options are 'train', 'test', or
                           'train_test', indicating whether to load training data, testing data, or both.
The constructor initializes the paths, selects the dataset based on the choice, and loads the initial data by
calling the `load_data` method. It also prepares structures for categorizing questions and storing the results.
"""
self.train_file_path = train_file_path
self.test_file_path = test_file_path
self.data_choice = data_choice
        self.questions: List[str] = []
        self.question_types: Counter = Counter()
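        # One bucket per configured question keyword, plus a catch-all 'others' bucket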
        self.Qs: Dict[str, List[str]] = {keyword: [] for keyword in config.QUESTION_KEYWORDS + ['others']}
self.load_data()
def load_data(self) -> None:
"""
Loads the dataset(s) from the specified JSON file(s) based on the user's choice of 'train', 'test', or
'train_test'.
This method updates the internal list of questions depending on the chosen dataset.
"""
if self.data_choice in ['train', 'train_test']:
with open(self.train_file_path, 'r') as file:
train_data = json.load(file)
self.questions += [q['question'] for q in train_data['questions']]
if self.data_choice in ['test', 'train_test']:
with open(self.test_file_path, 'r') as file:
test_data = json.load(file)
self.questions += [q['question'] for q in test_data['questions']]
def categorize_questions(self) -> None:
"""
Categorizes each question in the loaded data into predefined categories based on keywords.
This method updates the internal dictionary `self.Qs` and the Counter `self.question_types` with categorized
questions.
"""
question_keywords = config.QUESTION_KEYWORDS
for question in self.questions:
question = contractions.fix(question)
words = question.lower().split()
question_keyword = None
if words[:2] == ['name', 'the']:
question_keyword = 'name the'
else:
for word in words:
if word in question_keywords:
question_keyword = word
break
if question_keyword:
self.question_types[question_keyword] += 1
self.Qs[question_keyword].append(question)
else:
self.question_types["others"] += 1
self.Qs["others"].append(question)
def plot_question_distribution(self) -> None:
"""
Plots an interactive bar chart of question types using Altair and Streamlit, displaying the count and percentage
of each type.
The chart sorts question types by count in descending order and includes detailed tooltips for interaction.
This method is intended for visualization in a Streamlit application.
"""
# Prepare data
total_questions = sum(self.question_types.values())
items = [(key, value, (value / total_questions) * 100) for key, value in self.question_types.items()]
df = pd.DataFrame(items, columns=['Question Keyword', 'Count', 'Percentage'])
# Sort data and handle 'others' category specifically if present
df = df[df['Question Keyword'] != 'others'].sort_values('Count', ascending=False)
if 'others' in self.question_types:
others_df = pd.DataFrame([('others', self.question_types['others'],
(self.question_types['others'] / total_questions) * 100)],
columns=['Question Keyword', 'Count', 'Percentage'])
df = pd.concat([df, others_df], ignore_index=True)
# Explicitly set the order of the x-axis based on the sorted DataFrame
order = df['Question Keyword'].tolist()
# Create the bar chart
bars = alt.Chart(df).mark_bar().encode(
x=alt.X('Question Keyword:N', sort=order, title='Question Keyword', axis=alt.Axis(labelAngle=-45)),
y=alt.Y('Count:Q', title='Question Count'),
color=alt.Color('Question Keyword:N', scale=alt.Scale(scheme='category20'), legend=None),
tooltip=[alt.Tooltip('Question Keyword:N', title='Type'),
alt.Tooltip('Count:Q', title='Count'),
alt.Tooltip('Percentage:Q', title='Percentage', format='.1f')]
)
# Create text labels for the bars with count and percentage
text = bars.mark_text(
align='center',
baseline='bottom',
dy=-5 # Nudges text up so it appears above the bar
).encode(
text=alt.Text('PercentageText:N')
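            # 'PercentageText' is computed below with a Vega expression so each bar
            # is labeled as "<count> (<percentage>%)"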
).transform_calculate(
PercentageText="datum.Count + ' (' + format(datum.Percentage, '.1f') + '%)'"
)
# Combine the bar and text layers
chart = (bars + text).properties(
width=800,
height=600,
).configure_axis(
labelFontSize=12,
titleFontSize=16,
labelFontWeight='bold',
titleFontWeight='bold',
grid=False
).configure_text(
fontWeight='bold'
).configure_title(
fontSize=20,
        fontWeight='bold',
anchor='middle'
)
# Display the chart in Streamlit
st.altair_chart(chart, use_container_width=True)
def plot_bar_chart(self, df: pd.DataFrame, category_col: str, value_col: str, chart_title: str) -> None:
"""
Plots an interactive bar chart using Altair and Streamlit.
Args:
df (pd.DataFrame): DataFrame containing the data for the bar chart.
category_col (str): Name of the column containing the categories.
value_col (str): Name of the column containing the values.
chart_title (str): Title of the chart.
Returns:
None
"""
        # Work on a copy so the caller's DataFrame is not mutated, then compute
        # the percentage share and a preformatted label for each category
        df = df.copy()
        df['Percentage'] = (df[value_col] / df[value_col].sum()) * 100
        df['PercentageText'] = df['Percentage'].round(1).astype(str) + '%'
# Create the bar chart
bars = alt.Chart(df).mark_bar().encode(
            x=alt.X(field=category_col, type='nominal', title='Category', sort='-y', axis=alt.Axis(labelAngle=-45)),
y=alt.Y(field=value_col, type='quantitative', title='Percentage'),
color=alt.Color(field=category_col, type='nominal', legend=None),
            tooltip=[
                alt.Tooltip(field=category_col, type='nominal', title='Category'),
                alt.Tooltip(field=value_col, type='quantitative', title='Value'),
                alt.Tooltip(field='Percentage', type='quantitative', title='Percentage', format='.1f')
            ]
).properties(
width=800,
height=600
)
# Add text labels to the bars
text = bars.mark_text(
align='center',
baseline='bottom',
dy=-10 # Nudges text up so it appears above the bar
).encode(
text=alt.Text('PercentageText:N')
)
# Combine the bar chart and text labels
        chart = (bars + text).properties(
            title=chart_title
        ).configure_title(
            fontSize=20
).configure_axis(
labelFontSize=12,
titleFontSize=16,
labelFontWeight='bold',
titleFontWeight='bold',
grid=False
).configure_text(
fontWeight='bold')
# Display the chart in Streamlit
st.altair_chart(chart, use_container_width=True)
def export_to_csv(self, qs_filename: str, question_types_filename: str) -> None:
"""
Exports the categorized questions and their counts to two separate CSV files.
Parameters:
qs_filename (str): The filename or path for exporting the `self.Qs` dictionary data.
question_types_filename (str): The filename or path for exporting the `self.question_types` Counter data.
This method writes the contents of `self.Qs` and `self.question_types` to the specified files in CSV format.
Each CSV file includes headers for better understanding and use of the exported data.
"""
# Export self.Qs dictionary
with open(qs_filename, mode='w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
writer.writerow(['Question Type', 'Questions'])
for q_type, questions in self.Qs.items():
for question in questions:
writer.writerow([q_type, question])
# Export self.question_types Counter
with open(question_types_filename, mode='w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
writer.writerow(['Question Type', 'Count'])
for q_type, count in self.question_types.items():
writer.writerow([q_type, count])
def run_dataset_analyzer() -> None:
"""
Executes the dataset analysis process and displays the results using Streamlit.
    This function provides an overview of the KB-VQA datasets and uses the OKVQADatasetAnalyzer to visualize
    the OK-VQA data.
"""
# Load datasets from Excel
datasets_comparison_table = pd.read_excel(config.DATASET_ANALYSES_PATH, sheet_name="VQA Datasets Comparison")
okvqa_dataset_characteristics = pd.read_excel(config.DATASET_ANALYSES_PATH, sheet_name="OK-VQA Dataset Characteristics")
# Process OK-VQA datasets for validation and training
val_data = process_okvqa_dataset(config.DATASET_VAL_QUESTIONS_PATH, config.DATASET_VAL_ANNOTATIONS_PATH, save_to_csv=False)
train_data = process_okvqa_dataset(config.DATASET_TRAIN_QUESTIONS_PATH, config.DATASET_TRAIN_ANNOTATIONS_PATH, save_to_csv=False)
# Initialize the dataset analyzer
dataset_analyzer = OKVQADatasetAnalyzer(config.DATASET_TRAIN_QUESTIONS_PATH, config.DATASET_VAL_QUESTIONS_PATH, 'train_test')
# Display KB-VQA datasets overview
with st.container():
st.markdown("## Overview of KB-VQA Datasets")
col1, col2 = st.columns([2, 1])
with col1:
st.write(" ")
with st.expander("1 - Knowledge-Based VQA (KB-VQA)"):
st.markdown(""" [Knowledge-Based VQA (KB-VQA)](https://arxiv.org/abs/1511.02570): One of the earliest datasets in this domain, KB-VQA
comprises 700 images and 2,402 questions, with each question associated with both an image
and a knowledge base (KB). The KB encapsulates facts about the world, including object
names, properties, and relationships, aiming to foster models capable of answering
questions through reasoning over both the image and the KB.\n""")
with st.expander("2 - Factual VQA (FVQA)"):
st.markdown(""" [Factual VQA (FVQA)](https://arxiv.org/abs/1606.05433): This dataset includes 2,190
images and 5,826 questions, accompanied by a knowledge base containing 193,449 facts.
The FVQA's questions are predominantly factual and less open-ended compared to those
in KB-VQA, offering a different challenge in knowledge-based reasoning.\n""")
with st.expander("3 - Outside-Knowledge VQA (OK-VQA)"):
st.markdown(""" [Outside-Knowledge VQA (OK-VQA)](https://arxiv.org/abs/1906.00067): OK-VQA poses a more
demanding challenge than KB-VQA, featuring an open-ended knowledge base that can be
updated during model training. This dataset contains 14,055 questions and 14,031 images.
Questions are carefully curated to ensure they require reasoning beyond the image
content alone.\n""")
with st.expander("4 - Augmented OK-VQA (A-OKVQA)"):
st.markdown(""" [Augmented OK-VQA (A-OKVQA)](https://arxiv.org/abs/2206.01718): Augmented successor of
OK-VQA dataset, focused on common-sense knowledge and reasoning rather than purely
factual knowledge, A-OKVQA offers approximately 24,903 questions across 23,692 images.
Questions in this dataset demand commonsense reasoning about the scenes depicted in the
images, moving beyond straightforward knowledge base queries. It also provides
rationales for answers, aiming to be a significant testbed for the development of AI
models that integrate visual and natural language reasoning.\n""")
with col2:
st.markdown("#### KB-VQA Datasets Comparison")
            st.dataframe(datasets_comparison_table, use_container_width=True)
st.write("-----------------------")
# Display OK-VQA dataset details
with st.container():
st.write("\n" * 10)
st.markdown("## OK-VQA Dataset")
st.write("This model was fine-tuned and evaluated using OK-VQA dataset.\n")
with st.expander("OK-VQA Dataset Characteristics"):
st.markdown("#### OK-VQA Dataset Characteristics")
st.write(okvqa_dataset_characteristics)
with st.expander("Questions Distribution over Knowledge Category"):
df = pd.read_excel(config.DATASET_ANALYSES_PATH, sheet_name="Question Category Dist")
st.markdown("#### Questions Distribution over Knowledge Category")
dataset_analyzer.plot_bar_chart(df, "Knowledge Category", "Percentage", "Questions Distribution over Knowledge Category")
with st.expander("Distribution of Question Keywords"):
dataset_analyzer.categorize_questions()
st.markdown("#### Distribution of Question Keywords")
dataset_analyzer.plot_question_distribution()
# Display sample data
with st.container():
with st.expander("Show Dataset Samples"):
            n = random.randint(0, len(train_data) - 10)
            # Display 10 consecutive samples starting at a random offset
            st.write(train_data[n:n + 10])
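

# Typical usage (illustrative): this module is imported by the hosting Streamlit
# app, which calls run_dataset_analyzer() from the page that renders the
# dataset-analysis view (the import path depends on where this file lives):
#
#     run_dataset_analyzer()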