dual_window / eval_runner_helper.py
Huanzhi (Hans) Mao
init
4d1746c
raw
history blame
3.61 kB
import json
import os
import re
import statistics
from pathlib import Path
from typing import Union
import numpy as np
from constant import *
from tqdm import tqdm
def is_multi_turn(test_category):
    """Tell whether ``test_category`` contains the ``"multi_turn"`` marker.

    Args:
        test_category: Test category identifier; the check is a plain
            containment test (presumably against a string name).

    Returns:
        True when ``"multi_turn"`` is contained in ``test_category``.
    """
    marker = "multi_turn"
    return marker in test_category
def contain_multi_turn_irrelevance(test_category):
    """Tell whether ``test_category`` carries a missing-function/parameter marker.

    Args:
        test_category: Test category identifier; checked by containment.

    Returns:
        True when either ``"miss_func"`` or ``"miss_param"`` is contained in
        ``test_category``.
    """
    markers = ("miss_func", "miss_param")
    return any(marker in test_category for marker in markers)
def is_executable(test_category):
    """Tell whether ``test_category`` carries an executable-style marker.

    Args:
        test_category: Test category identifier; checked by containment.

    Returns:
        True when either ``"exec"`` or ``"rest"`` is contained in
        ``test_category``.
    """
    markers = ("exec", "rest")
    return any(marker in test_category for marker in markers)
def is_rest(test_category):
    """Tell whether ``test_category`` contains the ``"rest"`` marker.

    Args:
        test_category: Test category identifier; checked by containment.

    Returns:
        True when ``"rest"`` is contained in ``test_category``.
    """
    marker = "rest"
    return marker in test_category
def is_relevance_or_irrelevance(test_category):
    """Tell whether ``test_category`` is a relevance or irrelevance category.

    Args:
        test_category: Test category identifier; checked by containment.

    Returns:
        True when either ``"relevance"`` or ``"irrelevance"`` is contained in
        ``test_category``.
    """
    markers = ("relevance", "irrelevance")
    return any(marker in test_category for marker in markers)
def is_chatable(test_category):
    """Tell whether ``test_category`` contains the ``"chatable"`` marker.

    Args:
        test_category: Test category identifier; checked by containment.

    Returns:
        True when ``"chatable"`` is contained in ``test_category``.
    """
    marker = "chatable"
    return marker in test_category
def is_java(test_category):
    """Tell whether ``test_category`` denotes a Java test category.

    Args:
        test_category: Test category identifier; checked by containment.

    Returns:
        True when ``"java"`` is contained in ``test_category`` and
        ``"javascript"`` is not.
    """
    # "javascript" itself contains "java", so a bare containment test would
    # misreport every JavaScript category as Java; rule that case out.
    return "java" in test_category and "javascript" not in test_category
def is_js(test_category):
    """Tell whether ``test_category`` contains the ``"javascript"`` marker.

    Args:
        test_category: Test category identifier; checked by containment.

    Returns:
        True when ``"javascript"`` is contained in ``test_category``.
    """
    marker = "javascript"
    return marker in test_category
def is_sql(test_category):
    """Tell whether ``test_category`` contains the ``"sql"`` marker.

    Args:
        test_category: Test category identifier; checked by containment.

    Returns:
        True when ``"sql"`` is contained in ``test_category``.
    """
    marker = "sql"
    return marker in test_category
def load_file(file_path):
    """Load a JSON Lines file into a list of decoded values.

    Args:
        file_path: Path of the file to read. Every non-blank line must hold
            exactly one JSON document.

    Returns:
        A list with one decoded value per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # locale encoding.
    with open(file_path, encoding="utf-8") as f:
        # Iterate the handle lazily instead of materializing every line, and
        # skip whitespace-only lines (e.g. a trailing newline at end of file)
        # which would otherwise make json.loads raise.
        return [json.loads(line) for line in f if line.strip()]
def get_handler(model_name):
    """Build the handler registered for ``model_name``.

    Looks ``model_name`` up in ``handler_map`` (not defined in this block;
    presumably supplied by the ``constant`` wildcard import) and calls the
    result with the model name and ``temperature=0``.

    Args:
        model_name: Key into ``handler_map``.

    Returns:
        Whatever the looked-up callable returns.

    Raises:
        KeyError: If ``model_name`` is not a key of ``handler_map``
            (assuming it is a plain mapping).
    """
    handler_factory = handler_map[model_name]
    # Temperature doesn't matter for evaluation
    return handler_factory(model_name, temperature=0)
def write_list_of_dicts_to_file(filename, data, subdir=None):
    """Write dictionaries to a file as JSON Lines (one JSON object per line).

    Lines are separated by ``"\\n"`` with no trailing newline after the last
    entry. Any top-level value that cannot be JSON-serialized is written as
    ``str(value)`` instead. The dictionaries in ``data`` are not modified.

    Args:
        filename: Name (or path) of the output file.
        data: Iterable of dictionaries to write.
        subdir: Optional directory; when given it is created if missing and
            ``filename`` is placed inside it.

    Raises:
        OSError: If the directory or file cannot be created/written.
        TypeError: If a dictionary has keys that JSON cannot represent.
    """
    if subdir:
        # Ensure the subdirectory exists, then place the file inside it
        os.makedirs(subdir, exist_ok=True)
        filename = os.path.join(subdir, filename)

    # Serialize everything before opening the file so that a serialization
    # failure cannot leave a truncated, half-written output file behind.
    lines = []
    for entry in data:
        # Build a sanitized copy rather than rewriting the caller's dict.
        safe_entry = {}
        for key, value in entry.items():
            try:
                json.dumps(value)
            except Exception:
                # Best effort: anything json cannot encode is stored as its
                # string form. ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt/SystemExit propagate.
                value = str(value)
            safe_entry[key] = value
        lines.append(json.dumps(safe_entry))

    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
def is_function_calling_format_output(decoded_output):
    """Check that ``decoded_output`` is a list made up solely of dictionaries.

    An empty list satisfies the check. ``isinstance`` is used so that list
    and dict subclasses (e.g. ``collections.OrderedDict``) are accepted too.

    Args:
        decoded_output: The value to inspect.

    Returns:
        True when ``decoded_output`` is a list and every item is a dict;
        False otherwise.
    """
    return isinstance(decoded_output, list) and all(
        isinstance(item, dict) for item in decoded_output
    )
def is_executable_format_output(decoded_output):
    """Check that ``decoded_output`` is a non-empty list made up solely of strings.

    ``isinstance`` is used so that list and str subclasses are accepted too.

    Args:
        decoded_output: The value to inspect.

    Returns:
        True when ``decoded_output`` is a list with at least one item and
        every item is a string; False otherwise (including the empty list).
    """
    return (
        isinstance(decoded_output, list)
        and len(decoded_output) > 0
        and all(isinstance(item, str) for item in decoded_output)
    )
def is_rest_format_output(decoded_output):
    """Check that ``decoded_output`` is a list holding exactly one string.

    ``isinstance`` is used so that list and str subclasses are accepted too.

    Args:
        decoded_output: The value to inspect.

    Returns:
        True when ``decoded_output`` is a list of length one whose only item
        is a string; False otherwise.
    """
    return (
        isinstance(decoded_output, list)
        and len(decoded_output) == 1
        and isinstance(decoded_output[0], str)
    )
def is_empty_output(decoded_output):
    """Decide whether ``decoded_output`` should be treated as "no function call".

    This is a patch to the AST decoder for relevance detection: sometimes the
    decoder parses successfully even though the input doesn't really contain
    a function call. ``[]``, ``[{}]``, and anything that is not in function
    calling format is considered empty (and thus should be marked as correct).

    Args:
        decoded_output: The decoded model output to inspect.

    Returns:
        True when the output is not in function calling format, is an empty
        list, or is a single-item list whose item is empty; False otherwise.
    """
    # Anything that is not a list of dicts counts as empty.
    if not is_function_calling_format_output(decoded_output):
        return True

    call_count = len(decoded_output)
    # Two or more entries can never be "empty".
    if call_count > 1:
        return False
    # Either no entries at all, or exactly one entry with no content.
    return call_count == 0 or len(decoded_output[0]) == 0