# Source: Impossible_llm / perplexities / perplexities_qwen.py
# Uploaded by Yaning1001 using the upload-large-folder tool (commit 94011a1, verified)
# TODO:
# 1. Choose a more sensible perplexity granularity: one score per document or per sentence?
# 2. Reconsider how perplexity itself is computed.
# 3. Results are noisy, so perplexity should be computed several times and averaged.
import sys
sys.path.append("..")
from utils_qwen import CHECKPOINT_READ_PATH, PERTURBATIONS, BABYLM_DATA_PATH, PAREN_MODELS
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import pandas as pd
import torch
import argparse
import os
from glob import glob
from tqdm import tqdm
from numpy.random import default_rng
# Constants
FILE_SAMPLE_SIZE = 1500 ## lines sampled per test file; consider 3000 for more stable estimates
BATCH_SIZE = 8  # NOTE(review): defined but never passed to the Trainer below — confirm intended
device = "cuda"  # assumes a CUDA device is available; no CPU fallback
MODEL_NAME = "Qwen/Qwen2.5-0.5B"  # Hugging Face hub id of the base model
MODEL_NAME_SAVE = "Qwen2.5-0.5B"  # short name used to build output file paths
class CustomDataset(torch.utils.data.Dataset):
    """Minimal dataset wrapper for causal-LM evaluation.

    Each example is a dict where the same token-id sequence serves as both
    the model input and the label, as required for computing LM loss.
    """

    def __init__(self, input_ids):
        # Sequence of token-id sequences (e.g. a 2-D tensor or list of lists).
        self.input_ids = input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        ids = self.input_ids[idx]
        return {"input_ids": ids, "labels": ids}
def get_perplexities(model, eval_dataset):
    """Evaluate *model* on *eval_dataset* with a default Trainer and return perplexity.

    Perplexity is exp(mean cross-entropy loss) over the evaluation set.
    """
    evaluator = Trainer(model=model)
    metrics = evaluator.evaluate(eval_dataset)
    mean_loss = metrics['eval_loss']
    # exp(loss) converts mean negative log-likelihood into perplexity
    return torch.exp(torch.tensor(mean_loss)).item()
if __name__ == "__main__":
    # CLI: perturbation type, checkpoint subdirectory, and sampling seed.
    parser = argparse.ArgumentParser(prog='Edge probing', description='Edge probing experiments')
    parser.add_argument('test_perturbation_type',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=PERTURBATIONS.keys(),
                        help='Perturbation function used to transform test BabyLM dataset')
    parser.add_argument('checkpoint_path',
                        type=str,
                        nargs='?',
                        default='default-checkpoint',
                        help='Train checkpoint')
    parser.add_argument('random_seed',
                        type=int,
                        nargs='?',
                        default=0,
                        help='Random seed')
    args = parser.parse_args()

    # Perturbed BabyLM test files: one whitespace-separated token-id sequence per line.
    test_files = sorted(glob(
        f"../data/babylm_data_perturbed_qwen/babylm_{args.test_perturbation_type}/babylm_test_affected/*"))
    rng = default_rng(args.random_seed)
    checkpoint_dir = f'../train/checkpoints/babylm/babylm_{args.test_perturbation_type}_10M_seed0/runs/{args.checkpoint_path}'

    print("Sampling BabyLM affected test files to extract surprisals...")
    token_sequences = []
    print("test_files:", test_files)
    for test_file in test_files:
        print(test_file)
        with open(test_file, 'r') as f:
            file_token_sequences = [
                [int(s) for s in l.split()] for l in f.readlines()]
        # Clamp the sample size: rng.choice with replace=False raises when asked
        # for more samples than the file has lines, so short files no longer crash.
        sample_size = min(FILE_SAMPLE_SIZE, len(file_token_sequences))
        sample_indices = rng.choice(
            len(file_token_sequences), sample_size, replace=False)
        token_sequences.extend(file_token_sequences[i] for i in sample_indices)

    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    print("Loading the Qwen model...")
    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir).to(device)

    # Decode the sampled token ids back to text, then re-tokenize as one padded batch.
    test_texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in token_sequences]
    tokenized_sequences = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt", max_length=1024)
    # NOTE(review): labels are the padded input_ids, so pad positions contribute to the
    # eval loss and inflate perplexity; consider masking pad labels with -100 — confirm.
    dataset = CustomDataset(tokenized_sequences['input_ids'])

    # Single corpus-level perplexity from the mean evaluation loss.
    perplexity = get_perplexities(model, dataset)

    # Persist the result as a one-row CSV under perplexity_results/.
    ppl_df = pd.DataFrame({"Perplexity": [perplexity]})
    directory = f"perplexity_results/{MODEL_NAME_SAVE}/{args.test_perturbation_type}"
    # exist_ok=True replaces the racy exists()-then-makedirs pattern.
    os.makedirs(directory, exist_ok=True)
    file = f"{directory}/{MODEL_NAME_SAVE}_seed{args.random_seed}_test_{args.test_perturbation_type}_{args.checkpoint_path}.csv"
    print(f"Writing results to CSV: {file}")
    ppl_df.to_csv(file, index=False)
    print(f"Calculated Perplexity: {perplexity}")