Spaces:
Running
Running
import json | |
import os | |
import re | |
import statistics | |
from pathlib import Path | |
from typing import Union | |
import numpy as np | |
from constant import * | |
from tqdm import tqdm | |
def is_multi_turn(test_category):
    """Return True when ``test_category`` contains the ``"multi_turn"`` marker."""
    marker = "multi_turn"
    return marker in test_category
def contain_multi_turn_irrelevance(test_category):
    """Return True when ``test_category`` contains ``"miss_func"`` or ``"miss_param"``."""
    markers = ("miss_func", "miss_param")
    return any(marker in test_category for marker in markers)
def is_executable(test_category):
    """Return True when ``test_category`` contains ``"exec"`` or ``"rest"``."""
    markers = ("exec", "rest")
    return any(marker in test_category for marker in markers)
def is_rest(test_category):
    """Return True when ``test_category`` contains the ``"rest"`` marker."""
    marker = "rest"
    return marker in test_category
def is_relevance_or_irrelevance(test_category):
    """Return True when ``test_category`` contains ``"relevance"`` or ``"irrelevance"``."""
    # Both markers are checked explicitly to mirror the function name, even
    # though the string "irrelevance" itself contains "relevance".
    markers = ("relevance", "irrelevance")
    return any(marker in test_category for marker in markers)
def is_chatable(test_category):
    """Return True when ``test_category`` contains the ``"chatable"`` marker."""
    marker = "chatable"
    return marker in test_category
def is_java(test_category):
    """Return True when ``test_category`` denotes a Java (not JavaScript) category.

    A plain ``"java" in test_category`` check also succeeds for every
    ``"javascript"`` category because "javascript" starts with "java".
    JavaScript categories are therefore excluded explicitly.
    """
    if "javascript" in test_category:
        # "java" would match here only as the prefix of "javascript"
        return False
    return "java" in test_category
def is_js(test_category):
    """Return True when ``test_category`` contains the ``"javascript"`` marker."""
    marker = "javascript"
    return marker in test_category
def is_sql(test_category):
    """Return True when ``test_category`` contains the ``"sql"`` marker."""
    marker = "sql"
    return marker in test_category
def load_file(file_path):
    """Load a JSON Lines file and return its decoded entries as a list.

    Args:
        file_path: Path of the file to read; each non-blank line must hold
            one JSON document.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, encoding="utf-8") as f:
        # Iterate the file lazily and skip blank / whitespace-only lines
        # (e.g. trailing newlines), which json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]
def get_handler(model_name):
    """Build the handler registered for ``model_name`` in ``handler_map``.

    ``handler_map`` is not defined in this part of the file (presumably it
    comes from the ``constant`` star-import -- confirm). The entry is called
    with the model name and ``temperature=0``.
    """
    handler_factory = handler_map[model_name]
    # Temperature doesn't matter for evaluation
    return handler_factory(model_name, temperature=0)
def write_list_of_dicts_to_file(filename, data, subdir=None):
    """Write ``data`` to ``filename`` in JSON Lines format (one dict per line).

    Args:
        filename: Name (or path) of the output file.
        data: Iterable of dictionaries to serialize.
        subdir: Optional directory. When given it is created if missing and
            ``filename`` is placed inside it.

    Any top-level value that ``json.dumps`` cannot encode is written as
    ``str(value)`` instead. The dictionaries in ``data`` are not modified.
    Lines are joined with a newline character and no trailing newline is
    written.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If a dictionary has keys that JSON cannot represent.
    """
    if subdir:
        # Ensure the subdirectory exists, then place the file inside it
        os.makedirs(subdir, exist_ok=True)
        filename = os.path.join(subdir, filename)

    lines = []
    for entry in data:
        # Build a serializable copy instead of overwriting the caller's values
        sanitized = {}
        for key, value in entry.items():
            try:
                json.dumps(value)
            except (TypeError, ValueError, RecursionError):
                # Not JSON serializable (unsupported type, circular reference,
                # or nesting too deep): fall back to its string form
                value = str(value)
            sanitized[key] = value
        lines.append(json.dumps(sanitized))

    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
def is_function_calling_format_output(decoded_output):
    """Return True when ``decoded_output`` is a list whose items are all dicts.

    An empty list qualifies. ``isinstance`` is used so that list and dict
    subclasses (for example ``collections.OrderedDict``) are accepted too.
    """
    if not isinstance(decoded_output, list):
        return False
    return all(isinstance(item, dict) for item in decoded_output)
def is_executable_format_output(decoded_output):
    """Return True when ``decoded_output`` is a non-empty list of strings.

    ``isinstance`` is used so that list and str subclasses are accepted too.
    """
    # An empty list holds no call strings, so it does not qualify
    if not isinstance(decoded_output, list) or not decoded_output:
        return False
    return all(isinstance(item, str) for item in decoded_output)
def is_rest_format_output(decoded_output):
    """Return True when ``decoded_output`` is a list holding exactly one string.

    ``isinstance`` is used so that list and str subclasses are accepted too.
    """
    return (
        isinstance(decoded_output, list)
        and len(decoded_output) == 1
        and isinstance(decoded_output[0], str)
    )
def is_empty_output(decoded_output):
    """Return True when ``decoded_output`` holds no real function call.

    This is a patch to the AST decoder for relevance detection: sometimes
    the decoder parses successfully even though the input doesn't really
    contain a function call. ``[]``, ``[{}]`` and anything that is not in
    function calling format are considered empty (and thus should be marked
    as correct).
    """
    # Anything that is not a list of dicts counts as empty
    if not is_function_calling_format_output(decoded_output):
        return True
    # []
    if not decoded_output:
        return True
    # [{}] -- a single call entry with no content
    return len(decoded_output) == 1 and len(decoded_output[0]) == 0