Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 6,940 Bytes
319b0b7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
import pandas as pd
import altair as alt
import pickle
from datetime import datetime, timezone
from typing import List, Dict, Tuple, Any, Union
# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
# TruthfulQA human baseline is 0.94 (source: https://arxiv.org/pdf/2109.07958.pdf)
# Define the human baselines
# Human baseline per metric, expressed as a percentage: each published
# 0-1 fraction is multiplied by 100 when the mapping is built.
HUMAN_BASELINES = {
    metric_name: fraction * 100
    for metric_name, fraction in (
        ("Average ⬆️", 0.897),
        ("ARC", 0.80),
        ("HellaSwag", 0.95),
        ("MMLU", 0.898),
        ("TruthfulQA", 0.94),
    )
}
def to_datetime(model_info: Tuple[str, Any]) -> datetime:
    """
    Parse the ``lastModified`` timestamp carried by a (name, object) pair.

    :param model_info: A two-item tuple of a name and an object. Only the
                       object is used; it must expose a ``lastModified``
                       string formatted as ``%Y-%m-%dT%H:%M:%S.%fZ``.
    :return: The parsed timestamp as a timezone-aware datetime tagged as UTC.
    """
    # The name half of the pair is irrelevant for the conversion.
    _, info = model_info
    parsed = datetime.strptime(info.lastModified, "%Y-%m-%dT%H:%M:%S.%fZ")
    # strptime yields a naive datetime here; attach UTC explicitly.
    return parsed.replace(tzinfo=timezone.utc)
def join_model_info_with_results(results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Stamps each result row with its model's last-modified time, matched on 'Model sha'.

    The 'Results Date' column of ``results_df`` is (re)created in place. Every row
    first receives the current UTC time; rows whose 'Model sha' equals the ``sha``
    of an entry in the on-disk cache ``model_info_cache.pkl`` are then overwritten
    with that entry's ``lastModified`` time. A missing or empty cache file is
    treated as an empty cache.

    :param results_df: A DataFrame containing results information including 'Model sha' column.
    :return: The same DataFrame object, with its 'Results Date' column updated.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. This is only safe while model_info_cache.pkl is produced by a
    # trusted process; do not point it at externally supplied data.
    try:
        with open("model_info_cache.pkl", "rb") as f:
            model_info_cache = pickle.load(f)
    except (EOFError, FileNotFoundError):
        model_info_cache = {}

    # Parse every cached lastModified string exactly once.
    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    dated_entries = [
        (datetime.strptime(obj.lastModified, date_format).replace(tzinfo=timezone.utc), obj.sha)
        for obj in model_info_cache.values()
    ]
    # Newest first (stable sort). When several cache entries share a sha, the
    # last assignment below - i.e. the oldest timestamp - is the one that sticks.
    dated_entries.sort(key=lambda entry: entry[0], reverse=True)

    # Default for rows without a cache match: the real current UTC time.
    # datetime.now(timezone.utc) converts correctly, whereas tagging the naive
    # local time with tzinfo=UTC would be off by the host's UTC offset.
    results_df["Results Date"] = datetime.now(timezone.utc)

    for last_modified_datetime, sha in dated_entries:
        # Update the "Results Date" column where "Model sha" equals the cached sha
        results_df.loc[results_df["Model sha"] == sha, "Results Date"] = last_modified_datetime
    return results_df
def create_scores_df(results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Builds the running best score per metric, ordered by result date.

    Note that ``results_df`` is modified in place: its 'Results Date' column is
    converted with ``pd.to_datetime`` and its rows are sorted by that column.

    :param results_df: A DataFrame with a 'Results Date' column and one column per
                       metric ('Average ⬆️', 'ARC', 'HellaSwag', 'MMLU', 'TruthfulQA').
    :return: A new DataFrame with one row per input row, holding for each metric the
             maximum seen so far, plus a 'Result Date' column with the row's date.
    """
    metric_names = ("Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA")

    # Normalise the date column and order the rows chronologically (in place).
    results_df["Results Date"] = pd.to_datetime(results_df["Results Date"])
    results_df.sort_values(by="Results Date", inplace=True)

    # Metric columns first, then the date column, to fix the output column order.
    running = {metric: [] for metric in metric_names}
    running["Result Date"] = []
    seen_dates = running["Result Date"]

    for _, record in results_df.iterrows():
        when = record["Results Date"]

        # Extend every metric's history with its best value so far.
        for metric in metric_names:
            history = running[metric]
            best_so_far = history[-1] if history else float("-inf")
            history.append(max(best_so_far, record[metric]))

        # Record the date only if it does not go backwards in time.
        if not seen_dates or seen_dates[-1] <= when:
            seen_dates.append(when)

    return pd.DataFrame(running)
def create_plot_df(scores_df: pd.DataFrame) -> pd.DataFrame:
    """
    Reshapes the wide scores DataFrame into a long, plot-ready DataFrame.

    :param scores_df: A DataFrame with a 'Result Date' column and one column per
                      metric ('Average ⬆️', 'ARC', 'HellaSwag', 'MMLU', 'TruthfulQA').
    :return: A new DataFrame with columns 'Metric Value', 'Result Date' and
             'Metric Name', sorted by 'Result Date', keeping only the first row
             for each distinct ('Metric Name', 'Metric Value') pair.
    """
    metric_columns = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]

    # One long-format frame per metric: value, date, and the metric's name.
    per_metric_frames = [
        scores_df[[metric, "Result Date"]]
        .reset_index(drop=True)
        .assign(**{"Metric Name": metric})
        .rename(columns={metric: "Metric Value"})
        for metric in metric_columns
    ]

    # Stack the frames and order them chronologically.
    stacked = (
        pd.concat(per_metric_frames, ignore_index=True)
        .sort_values(by="Result Date")
        .reset_index(drop=True)
    )

    # After the date sort, the first occurrence of a (metric, value) pair is
    # the earliest one; later repeats of the same value are dropped.
    deduplicated = stacked.drop_duplicates(subset=["Metric Name", "Metric Value"], keep="first")
    return deduplicated.reset_index(drop=True)
def create_metric_plot_obj(df: pd.DataFrame, metrics: List[str], human_baselines: Dict[str, float]) -> alt.LayerChart:
    """
    Builds a layered chart of metric values over time with human-baseline rules.

    :param df: A DataFrame containing 'Metric Name', 'Metric Value', and 'Result Date' columns.
    :param metrics: The metric names to plot; rows and baselines for other metrics are ignored.
    :param human_baselines: A dictionary mapping metric names to their human baseline values.
    :return: An Altair LayerChart: one line per metric plus one dashed horizontal
             rule per selected human baseline.
    """
    # Keep only the rows that belong to the requested metrics.
    selected_rows = df[df["Metric Name"].isin(metrics)]

    # Baselines for the requested metrics, as (name, value) records.
    baseline_records = [(name, value) for name, value in human_baselines.items() if name in metrics]
    baselines_frame = pd.DataFrame(baseline_records, columns=["Metric Name", "Metric Value"])

    # Metric lines over time, with the y axis fixed to the 0-100 range.
    metric_lines = (
        alt.Chart(selected_rows)
        .encode(x="Result Date:T")
        .mark_line()
        .encode(
            alt.Y("Metric Value:Q", scale=alt.Scale(domain=[0, 100])),
            color="Metric Name:N",
        )
    )

    # Dashed horizontal rules marking the human baselines.
    baseline_rules = alt.Chart(baselines_frame).mark_rule(strokeDash=[12, 6], size=2)
    baseline_rules = baseline_rules.encode(y="Metric Value:Q", color="Metric Name:N")

    # Layer the rules on top of the lines.
    return metric_lines + baseline_rules
|