# Open questions for this script:
# 1. Choose a more sensible unit for the perplexity computation: one value
#    per text, or one value per sentence.
# 2. Reconsider how the perplexity itself is computed.
# 3. Sampling introduces randomness, so the perplexity should be computed
#    several times and averaged.

import sys
sys.path.append("..")  # must run before the project-local import below

from utils_qwen import CHECKPOINT_READ_PATH, PERTURBATIONS, BABYLM_DATA_PATH, PAREN_MODELS
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import pandas as pd
import torch
import argparse
import os
from glob import glob
from tqdm import tqdm
from numpy.random import default_rng

# Constants
FILE_SAMPLE_SIZE = 1500  # consider 3000 for more stable estimates
# NOTE(review): BATCH_SIZE is not passed to the Trainer anywhere in this file,
# so evaluation runs with the Trainer's own default batch size.
BATCH_SIZE = 8
# Fall back to CPU so the script does not crash on hosts without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "Qwen/Qwen2.5-0.5B"
MODEL_NAME_SAVE = "Qwen2.5-0.5B"

# Label value that the Hugging Face causal-LM loss skips.
IGNORE_INDEX = -100


class CustomDataset(torch.utils.data.Dataset):
    """Dataset yielding ``input_ids``/``labels`` dicts for ``Trainer.evaluate``.

    Args:
        input_ids: Indexable collection of token-id sequences (here, the
            padded ``input_ids`` tensor returned by the tokenizer).
        attention_mask: Optional mask aligned with ``input_ids`` (non-zero for
            real tokens, 0 for padding). When given, it is included in every
            item and the labels of padded positions are set to
            ``IGNORE_INDEX`` so padding does not contribute to the loss.
            When omitted, labels are the unmodified ``input_ids``.
    """

    def __init__(self, input_ids, attention_mask=None):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        if attention_mask is None:
            self.labels = input_ids
        else:
            # Clone so the caller's input_ids are never modified in place.
            labels = torch.as_tensor(input_ids).clone()
            labels[torch.as_tensor(attention_mask) == 0] = IGNORE_INDEX
            self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        item = {"input_ids": self.input_ids[idx], "labels": self.labels[idx]}
        if self.attention_mask is not None:
            item["attention_mask"] = self.attention_mask[idx]
        return item


def get_perplexities(model, eval_dataset):
    """Evaluate ``model`` on ``eval_dataset`` and return its perplexity.

    Args:
        model: Model handed to ``Trainer``.
        eval_dataset: Dataset whose items carry the model inputs and labels.

    Returns:
        ``exp(eval_loss)`` as a Python float, where ``eval_loss`` is the value
        reported by ``Trainer.evaluate``.
    """
    # Use Trainer to evaluate and get the loss
    trainer = Trainer(model=model)
    eval_results = trainer.evaluate(eval_dataset)
    loss = eval_results['eval_loss']

    # Perplexity is the exponential of the evaluation loss.
    return torch.exp(torch.tensor(loss)).item()


def _sample_token_sequences(test_files, rng, sample_size):
    """Read token-id lines from each file and sample up to ``sample_size`` of them.

    Each line of a file is parsed as whitespace-separated integers. Sampling
    is without replacement and is clamped to the number of lines available, so
    files shorter than ``sample_size`` contribute all of their lines instead
    of raising.
    """
    sampled = []
    for test_file in test_files:
        print(test_file)
        with open(test_file, 'r') as f:
            file_token_sequences = [[int(s) for s in line.split()] for line in f]

        if not file_token_sequences:
            continue

        n_samples = min(sample_size, len(file_token_sequences))
        sample_indices = rng.choice(
            len(file_token_sequences), size=n_samples, replace=False)
        sampled.extend(file_token_sequences[i] for i in sample_indices)
    return sampled


if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog='Edge probing', description='Edge probing experiments')
    parser.add_argument('test_perturbation_type',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=PERTURBATIONS.keys(),
                        help='Perturbation function used to transform test BabyLM dataset')
    parser.add_argument('checkpoint_path',
                        type=str,
                        nargs='?',
                        default='default-checkpoint',
                        help='Train checkpoint')
    parser.add_argument('random_seed',
                        type=int,
                        nargs='?',
                        default=0,
                        help='Random seed')
    args = parser.parse_args()

    test_files_pattern = (
        f"../data/babylm_data_perturbed_qwen/babylm_{args.test_perturbation_type}/babylm_test_affected/*")
    test_files = sorted(glob(test_files_pattern))
    rng = default_rng(args.random_seed)

    # checkpoint_path = 'checkpoint-1000'
    checkpoint_dir = f'../train/checkpoints/babylm/babylm_{args.test_perturbation_type}_10M_seed0/runs/{args.checkpoint_path}'

    print("Sampling BabyLM affected test files to extract surprisals...")
    print("test_files:", test_files)
    if not test_files:
        # Fail before loading the model rather than later on an empty batch.
        raise FileNotFoundError(f"No test files match {test_files_pattern}")
    token_sequences = _sample_token_sequences(test_files, rng, FILE_SAMPLE_SIZE)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

    print("Loading the Qwen model...")
    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir).to(device)

    # Decode token_sequences into text
    test_texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in token_sequences]

    # Tokenize the input sequences and prepare the dataset in the Trainer format
    tokenized_sequences = tokenizer(test_texts, padding=True, truncation=True,
                                    return_tensors="pt", max_length=1024)

    # Pass the attention mask along so padded positions are neither attended
    # to nor counted in the loss.
    dataset = CustomDataset(tokenized_sequences['input_ids'],
                            tokenized_sequences['attention_mask'])

    # Calculate perplexity
    perplexity = get_perplexities(model, dataset)

    # Prepare DataFrame for saving the results
    ppl_df = pd.DataFrame({"Perplexity": [perplexity]})

    directory = f"perplexity_results/{MODEL_NAME_SAVE}/{args.test_perturbation_type}"
    os.makedirs(directory, exist_ok=True)
    output_file = f"{directory}/{MODEL_NAME_SAVE}_seed{args.random_seed}_test_{args.test_perturbation_type}_{args.checkpoint_path}.csv"
    print(f"Writing results to CSV: {output_file}")
    ppl_df.to_csv(output_file, index=False)

    print(f"Calculated Perplexity: {perplexity}")