Spaces:

HuanzhiMao
/

dual_window

Running

dual_window / eval_runner_helper.py

Huanzhi (Hans) Mao

init

4d1746c 15 days ago

3.61 kB

	import json
	import os
	import re
	import statistics
	from pathlib import Path
	from typing import Union

	import numpy as np
	from constant import *

	from tqdm import tqdm


	def is_multi_turn(test_category):
	    """Report whether ``"multi_turn"`` occurs in ``test_category``."""
	    marker = "multi_turn"
	    return marker in test_category

	def contain_multi_turn_irrelevance(test_category):
	    """Report whether ``test_category`` contains ``"miss_func"`` or ``"miss_param"``."""
	    markers = ("miss_func", "miss_param")
	    return any(marker in test_category for marker in markers)

	def is_executable(test_category):
	    """Report whether ``test_category`` contains ``"exec"`` or ``"rest"``."""
	    markers = ("exec", "rest")
	    return any(marker in test_category for marker in markers)


	def is_rest(test_category):
	    """Report whether ``"rest"`` occurs in ``test_category``."""
	    marker = "rest"
	    return marker in test_category


	def is_relevance_or_irrelevance(test_category):
	    """Report whether ``test_category`` contains ``"relevance"`` or ``"irrelevance"``."""
	    markers = ("relevance", "irrelevance")
	    return any(marker in test_category for marker in markers)


	def is_chatable(test_category):
	    """Report whether ``"chatable"`` occurs in ``test_category``."""
	    marker = "chatable"
	    return marker in test_category


	def is_java(test_category):
	    """Report whether ``test_category`` denotes Java (and not JavaScript).

	    ``"java"`` is a substring of ``"javascript"``, so a plain substring test
	    would also match JavaScript categories. Those are explicitly excluded
	    here so that this predicate and the JavaScript one never both match.
	    """
	    return "java" in test_category and "javascript" not in test_category


	def is_js(test_category):
	    """Report whether ``"javascript"`` occurs in ``test_category``."""
	    marker = "javascript"
	    return marker in test_category


	def is_sql(test_category):
	    """Report whether ``"sql"`` occurs in ``test_category``."""
	    marker = "sql"
	    return marker in test_category


	def load_file(file_path):
	    """Load a JSON Lines file into a list.

	    Each non-blank line of the file is parsed with ``json.loads`` and the
	    parsed values are returned in file order.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        A list with one parsed JSON value per non-blank line.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    # JSON text is UTF-8 by specification, so do not depend on the locale.
	    with open(file_path, encoding="utf-8") as f:
	        # Whitespace-only lines (e.g. a trailing newline at end of file) are
	        # skipped instead of being handed to json.loads, which would raise.
	        return [json.loads(line) for line in f if line.strip()]


	def get_handler(model_name):
	    """Build the handler registered for ``model_name``.

	    Looks ``model_name`` up in ``handler_map`` (not defined in this part of
	    the file; presumably provided by the ``constant`` star import -- TODO
	    confirm) and calls the result with the model name and ``temperature=0``.
	    """
	    handler_factory = handler_map[model_name]
	    # Temperature doesn't matter for evaluation
	    return handler_factory(model_name, temperature=0)


	def write_list_of_dicts_to_file(filename, data, subdir=None):
	    """Write dictionaries to a file, one JSON object per line.

	    Values that ``json.dumps`` cannot serialize are replaced by ``str(value)``.
	    The replacement is made in place, so the dictionaries in ``data`` are
	    modified. Lines are separated by ``"\\n"`` with no trailing newline after
	    the last one.

	    Args:
	        filename: Name of the output file.
	        data: Iterable of dictionaries to write.
	        subdir: Optional directory to write into. When given (and truthy) it
	            is created if missing and ``filename`` is joined onto it.

	    Raises:
	        OSError: If the directory or file cannot be created or written.
	        TypeError: If an entry still cannot be serialized after its values
	            were sanitized (e.g. because of an unsupported key type).
	    """
	    if subdir:
	        # Ensure the subdirectory exists, then write inside it
	        os.makedirs(subdir, exist_ok=True)
	        filename = os.path.join(subdir, filename)

	    # Serialize everything before opening the file so that a failure does not
	    # leave a truncated, half-written output file behind.
	    lines = []
	    for entry in data:
	        for key, value in entry.items():
	            try:
	                json.dumps(value)
	            except (TypeError, ValueError, RecursionError):
	                # Not JSON serializable: fall back to its string form.
	                # Only existing keys are reassigned, so the dictionary size
	                # does not change while it is being iterated.
	                entry[key] = str(value)
	        lines.append(json.dumps(entry))

	    with open(filename, "w", encoding="utf-8") as f:
	        f.write("\n".join(lines))


	def is_function_calling_format_output(decoded_output):
	    """Report whether ``decoded_output`` is a list whose items are all dicts.

	    An empty list counts as valid. ``isinstance`` is used so that subclasses
	    (for example ``collections.OrderedDict`` items) are accepted as well.
	    """
	    if not isinstance(decoded_output, list):
	        return False
	    return all(isinstance(item, dict) for item in decoded_output)


	def is_executable_format_output(decoded_output):
	    """Report whether ``decoded_output`` is a non-empty list of strings.

	    ``isinstance`` is used so that list and str subclasses are accepted too.
	    """
	    if not isinstance(decoded_output, list) or not decoded_output:
	        return False
	    return all(isinstance(item, str) for item in decoded_output)


	def is_rest_format_output(decoded_output):
	    """Report whether ``decoded_output`` is a list holding exactly one string.

	    ``isinstance`` is used so that list and str subclasses are accepted too.
	    """
	    return (
	        isinstance(decoded_output, list)
	        and len(decoded_output) == 1
	        and isinstance(decoded_output[0], str)
	    )


	def is_empty_output(decoded_output):
	    """Report whether ``decoded_output`` contains no real function call.

	    This is a patch to the AST decoder for relevance detection: the decoder
	    sometimes parses successfully even though the input doesn't really have
	    a function call. ``[]``, ``[{}]``, and anything that is not in function
	    calling format is considered empty (and thus should be marked as correct).
	    """
	    well_formed = is_function_calling_format_output(decoded_output)
	    if well_formed:
	        call_count = len(decoded_output)
	        # Either no call at all, or a single call that is an empty dict.
	        return call_count == 0 or (call_count == 1 and len(decoded_output[0]) == 0)
	    return True