Spaces:

allenai
/

ZebraLogic

Running

App Files Files Community

ZebraLogic / constants.py

yuchenlin

update title

9abf560 2 months ago

raw

history blame contribute delete

No virus

7.47 kB

	from pathlib import Path
	from collections import OrderedDict

	# Default "K" setting, stored as a string (here the infinity sign).
	DEFAULT_K = "∞"
	# DEFAULT_K = "1500"

	# Banner image location; the URL points into the ZeroEval GitHub repository.
	banner_url = "https://github.com/yuchenlin/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true"

	# HTML snippet: a left-aligned flex container wrapping the banner image.
	BANNER = "".join(
		[
			'<div style="display: flex; justify-content: flex-start;">',
			'<img src="',
			banner_url,
			'" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;">',
			" </div>",
		]
	)

	# TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"


	# BibTeX entries shown to users who want to cite the benchmark.
	# A raw string is used so that LaTeX escapes such as {\"i} keep their
	# backslash; in a normal string literal `\"` would collapse to `"`.
	# Author lists use " and " as the separator, as BibTeX requires.
	CITATION_TEXT = r"""

	@misc{zebralogicbench2024,
	title={ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models},
	author={Bill Yuchen Lin and Ronan Le Bras and Yejin Choi},
	url={https://hf.co/spaces/allenai/ZebraLogicBench-Leaderboard},
	year={2024}
	}

	@article{dziri2024faith,
	title={Faith and fate: Limits of transformers on compositionality},
	author={Nouha Dziri and Ximing Lu and Melanie Sclar and Xiang Lorraine Li and Liwei Jiang and Bill Yuchen Lin and Peter West and Chandra Bhagavatula and Ronan Le Bras and Jena D. Hwang and Soumya Sanyal and Sean Welleck and Xiang Ren and Allyson Ettinger and Za{\"i}d Harchaoui and Yejin Choi},
	journal={Advances in Neural Information Processing Systems},
	volume={36},
	year={2024}
	}

	"""

	# make column_names as an ordered dict



	# Ordered mapping from column key to display label. Every key currently
	# maps to itself; insertion order below is the mapping's iteration order.
	column_names = OrderedDict(
		(label, label)
		for label in (
			"Model",
			"Mode",
			"Puzzle Acc",
			"Cell Acc",
			"No answer",
			"Easy Puzzle Acc",
			"Hard Puzzle Acc",
			# "Total Puzzles",
			# "Reason Lens",
		)
	)



	# Explanatory text describing how a "WB Reward" is computed from pairwise
	# comparisons (+/-1, +/-0.5, or 0 for a tie).
	# NOTE(review): this wording mentions "WB Reward" rather than puzzle accuracy
	# and looks inherited from another leaderboard -- confirm whether it is still
	# displayed anywhere.
	LEADERBOARD_REMARKS = """WB Reward: for each comparison (A vs B), a reward for A is +/-1 if A is much better/worse than B, and +/-0.5 if A is slightly better/worse than B; when there is a Tie, the reward is 0.
	"""

	# WB Reward: for each pairwise comparison, a reward for A is +/-1 if A is much better/worse than B, and +/-0.5 if A is slightly better/worse than B; 0 for a Tie.
	# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
	# WB Score individually scores each model based on checklists.
	# Evaluator is GPT-4-Turbo.
	# Second remarks string; it currently contains only whitespace (the earlier
	# wording is preserved in the comments above).
	LEADERBOARD_REMARKS_MAIN = """
	"""

	# Name of the column designated as the ranking column.
	RANKING_COLUMN = "Puzzle Acc"

	# Column names listed in a fixed order: identity columns first, then the
	# puzzle-level accuracies, then the remaining metrics.
	ORDERED_COLUMN_NAMES = [
		"Model", "Mode",
		"Puzzle Acc", "Easy Puzzle Acc", "Hard Puzzle Acc",
		"Cell Acc", "No answer",
	]


	# JavaScript source defining `refresh()`. The function:
	#   1. forces the `__theme=light` URL parameter, reloading the page if needed;
	#   2. looks up the element with id "rank-column-radio" and, when it exists,
	#      wraps its children in a flex row prefixed by a bold "Decoding Mode:" label.
	# The lookup is null-checked because `getElementById` returns null when the
	# element is absent, and dereferencing it would throw a TypeError.
	js_light = """
	function refresh() {
	const url = new URL(window.location);

	if (url.searchParams.get('__theme') !== 'light') {
	url.searchParams.set('__theme', 'light');
	window.location.href = url.href;
	}

	// Find the fieldset with the given id
	const fieldset = document.getElementById("rank-column-radio");

	// Nothing to decorate if the radio group is not on the page
	if (!fieldset) {
	return;
	}

	// Create a new span element with the text "Decoding Mode:"
	const rankBySpan = document.createElement("span");
	rankBySpan.textContent = "Decoding Mode: ";
	rankBySpan.style.fontWeight = "bold"; // Optional: make the text bold
	rankBySpan.style.fontSize = "19px"; // Larger font size
	rankBySpan.style.paddingRight = "18px"; // Add padding on the right

	// Wrap the span and the labels in a flex container
	const flexContainer = document.createElement("div");
	flexContainer.style.display = "flex";
	flexContainer.style.alignItems = "center";

	// Insert the rankBySpan at the beginning of the flex container
	flexContainer.appendChild(rankBySpan);

	// Move all existing labels into the flex container
	while (fieldset.firstChild) {
	flexContainer.appendChild(fieldset.firstChild);
	}

	// Append the flex container back to the fieldset
	fieldset.appendChild(flexContainer);
	}
	"""

	# JavaScript source defining `scroll_top()`: it logs a greeting to the
	# console, then sets `scrollTop = 0` on every element matching `.bubble-wrap`,
	# staggering the resets by 100 ms per element via `setTimeout`.
	js_code = """
	function scroll_top() {
	console.log("Hello from Gradio!");
	const bubbles = document.querySelectorAll('.bubble-wrap');
	bubbles.forEach((bubble, index) => {
	setTimeout(() => {
	bubble.scrollTop = 0;
	}, index * 100); // Delay of 100ms between each iteration
	});

	}
	"""


	# Legend listing task categories together with their abbreviations.
	TASK_TYPE_STR = (
		"Tasks: Info seeking (InfoSek), Creative Writing (CrtWrt), "
		"Coding&Debugging (Code), Reasoning (Reason), Editing (Edit), Math, "
		"Planning (Plan), Brainstorming (Brnstrm), Role playing (RolPly), "
		"Advice seeking (AdvSek), Data Analysis (DataAna)"
	)

	# Custom stylesheet, kept as one CSS string.
	# Cleanups relative to the earlier version of this string:
	#   * `font-color` (not a CSS property) removed; `color: red` already applies.
	#   * `font-weight: italic` replaced by `font-style: italic`.
	#   * `font-decoration: bold` replaced by `font-weight: bold`.
	#   * `# ...` pseudo-comments replaced by real `/* ... */` CSS comments.
	#   * the first `.markdown-text-tiny` rule (10pt) removed; it was always
	#     overridden by the later 12pt rule for the same selector.
	css = """



	code {
	font-size: large;
	}
	footer {visibility: hidden}
	.top-left-LP{
	margin-top: 6px;
	margin-left: 5px;
	}
	.no_margin{
	margin-top: 0px;
	margin-left: 0px;
	margin-right: 0px;
	margin-bottom: 0px;
	padding-top: 0px;
	padding-left: 0px;
	padding-right: 0px;
	padding-bottom: 0px;
	}
	.markdown-text{font-size: 14pt}
	.markdown-text-small{font-size: 13pt}
	.markdown-text-tiny{font-size: 12pt}
	.markdown-text-tiny-red{
	font-size: 12pt;
	color: red;
	background-color: yellow;
	font-weight: bold;
	}
	th {
	text-align: center;
	font-size: 17px; /* Adjust the font size as needed */
	}
	td {
	font-size: 15px; /* Adjust the font size as needed */
	text-align: center;
	}

	.sample_button{
	border: 2px solid #000000;
	border-radius: 10px;
	padding: 10px;
	font-size: 17pt;
	font-weight: bold;
	margin: 5px;
	background-color: #D8BFD8;
	}

	.chat-common{
	height: auto;
	max-height: 400px;
	min-height: 100px;
	}
	.chat-specific{
	height: auto;
	max-height: 600px;
	min-height: 200px;
	}
	#od-benchmark-tab-table-button{
	font-size: 15pt;
	font-weight: bold;
	}

	.btn_boderline{
	border: 1px solid #000000;
	border-radius: 5px;
	padding: 5px;
	margin: 5px;
	font-size: 15pt;
	font-weight: bold;
	}

	.btn_boderline_next{
	border: 0.1px solid #000000;
	border-radius: 5px;
	padding: 5px;
	margin: 5px;
	font-size: 15pt;
	font-weight: bold;
	}

	.btn_boderline_gray{
	border: 0.5px solid gray;
	border-radius: 5px;
	padding: 5px;
	margin: 5px;
	font-size: 15pt;
	font-style: italic;
	}
	.btn_boderline_selected{
	border: 2px solid purple;
	background-color: #f2f2f2;
	border-radius: 5px;
	padding: 5px;
	margin: 5px;
	font-size: 15pt;
	font-weight: bold;
	}
	.accordion-label button span{
	font-size: 14pt;
	font-weight: bold;
	}

	#show-task-categorized span{
	font-size: 13pt;
	font-weight: bold;
	}

	#show-open-source-models span{
	font-size: 13pt;
	font-weight: bold;
	}

	#select-models span{
	font-size: 10pt;
	}

	#select-tasks span{
	font-size: 10pt;
	}


	.markdown-text-details{
	margin: 10px;
	padding: 10px;
	}


	button.selected[role="tab"][aria-selected="true"] {
	font-size: 18px; /* or any other size you prefer */
	font-weight: bold;
	}

	#od-benchmark-tab-table-ablation-button {
	font-size: larger; /* Adjust the font size as needed */
	}


	.plotly-plot{
	height: auto;
	max-height: 600px;
	min-height: 600px;
	}

	#length-margin-radio{
	font-size: 10pt;
	/* padding: 0px; */
	/* margin: 1px; */
	}

	#show-task-categorized{
	font-size: 12pt;
	font-weight: bold;
	}

	#show-open-source-models{
	font-size: 12pt;
	font-weight: bold;
	}

	.box_md{
	border: 1px solid #000000;
	border-radius: 10px;
	padding: 10px;
	font-size: 12pt;
	margin: 5px;
	}
	"""