import json
import os
import re
import statistics
from pathlib import Path
from typing import Union

import numpy as np
# NOTE(review): `handler_map` (used by get_handler) is presumably provided by
# this star import; no other definition is visible here. The import order is
# kept as-is so that name shadowing by the star import is unchanged.
from constant import *
from tqdm import tqdm


def is_multi_turn(test_category: str) -> bool:
    """Return True if the category name contains "multi_turn"."""
    return "multi_turn" in test_category


def contain_multi_turn_irrelevance(test_category: str) -> bool:
    """Return True if the category name contains "miss_func" or "miss_param"."""
    return "miss_func" in test_category or "miss_param" in test_category


def is_executable(test_category: str) -> bool:
    """Return True if the category name contains "exec" or "rest"."""
    return "exec" in test_category or "rest" in test_category


def is_rest(test_category: str) -> bool:
    """Return True if the category name contains "rest"."""
    return "rest" in test_category


def is_relevance_or_irrelevance(test_category: str) -> bool:
    """Return True if the category name contains "relevance" or "irrelevance"."""
    # "irrelevance" itself contains "relevance", so one substring test covers
    # both spellings.
    return "relevance" in test_category


def is_chatable(test_category: str) -> bool:
    """Return True if the category name contains "chatable"."""
    return "chatable" in test_category


def is_java(test_category: str) -> bool:
    """Return True if the category name contains "java".

    Note that this is a plain substring test, so a name containing
    "javascript" also matches.
    """
    return "java" in test_category


def is_js(test_category: str) -> bool:
    """Return True if the category name contains "javascript"."""
    return "javascript" in test_category


def is_sql(test_category: str) -> bool:
    """Return True if the category name contains "sql"."""
    return "sql" in test_category


def load_file(file_path: Union[str, Path]) -> list:
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of a file holding one JSON document per line.

    Returns:
        The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform
    # locale, and stream the file instead of materialising it with readlines().
    with open(file_path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) carry no record; skipping
        # them avoids a spurious JSONDecodeError.
        return [json.loads(line) for line in f if line.strip()]


def get_handler(model_name):
    """Instantiate the handler registered for ``model_name`` in ``handler_map``.

    Raises:
        KeyError: If ``model_name`` has no entry in ``handler_map``.
    """
    # Temperature doesn't matter for evaluation
    return handler_map[model_name](model_name, temperature=0)


def write_list_of_dicts_to_file(filename, data, subdir=None):
    """Write dictionaries to ``filename`` as JSON Lines.

    Entries are separated by a single newline and no trailing newline is
    written. Values that cannot be JSON-encoded are replaced by ``str(value)``;
    this replacement is done in place, so the caller's dictionaries are
    modified.

    Args:
        filename: Target file name (joined onto ``subdir`` when given).
        data: Iterable of dictionaries to serialise.
        subdir: Optional directory to create (if missing) and write into.
    """
    if subdir:
        # Ensure the subdirectory exists
        os.makedirs(subdir, exist_ok=True)
        # Construct the full path to the file
        filename = os.path.join(subdir, filename)

    # json.dumps escapes non-ASCII by default; the explicit encoding just keeps
    # the output independent of the platform locale.
    with open(filename, "w", encoding="utf-8") as f:
        for index, entry in enumerate(data):
            # Snapshot the items so reassigning values cannot disturb iteration.
            for key, value in list(entry.items()):
                try:
                    json.dumps(value)
                except Exception:
                    # Best effort: any value the encoder rejects is stored as
                    # its string form. Unlike a bare ``except``, this lets
                    # KeyboardInterrupt / SystemExit propagate.
                    entry[key] = str(value)
            json_str = json.dumps(entry)
            # Writing the separator before every entry but the first yields the
            # same output as "newline after all but the last" without needing
            # len(data).
            if index > 0:
                f.write("\n")
            f.write(json_str)


def is_function_calling_format_output(decoded_output) -> bool:
    """Return True if ``decoded_output`` is a list whose items are all dicts.

    An empty list satisfies the check.
    """
    return isinstance(decoded_output, list) and all(
        isinstance(item, dict) for item in decoded_output
    )


def is_executable_format_output(decoded_output) -> bool:
    """Return True if ``decoded_output`` is a non-empty list of strings."""
    return (
        isinstance(decoded_output, list)
        and len(decoded_output) > 0
        and all(isinstance(item, str) for item in decoded_output)
    )


def is_rest_format_output(decoded_output) -> bool:
    """Return True if ``decoded_output`` is a list holding exactly one string."""
    return (
        isinstance(decoded_output, list)
        and len(decoded_output) == 1
        and isinstance(decoded_output[0], str)
    )


def is_empty_output(decoded_output) -> bool:
    """Return True if ``decoded_output`` holds no actual function call.

    This function is a patch to the ast decoder for relevance detection.
    Sometimes the ast decoder will parse successfully, but the input doesn't
    really have a function call. ``[]``, ``[{}]``, and anything that is not in
    function calling format is considered empty (and thus should be marked as
    correct).
    """
    if not is_function_calling_format_output(decoded_output):
        return True
    if len(decoded_output) == 0:
        return True
    # A single empty dict means the decoder produced a call with no content.
    return len(decoded_output) == 1 and len(decoded_output[0]) == 0