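"""Evaluate the perplexity of a fine-tuned Qwen2.5-0.5B checkpoint on the
perturbed BabyLM test set.

Positional arguments (all optional): a perturbation type (any key of
utils_qwen.PERTURBATIONS), the name of the training checkpoint to load, and
a random seed for sampling test sequences. Writes a one-row CSV with the
resulting perplexity under perplexity_results/.
"""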
import sys |
|
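# Make utils_qwen (one directory up) importable.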
sys.path.append("..") |
|
|
|
from utils_qwen import PERTURBATIONS
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import pandas as pd
import torch
import argparse
import os
from glob import glob
from numpy.random import default_rng


FILE_SAMPLE_SIZE = 1500  # sequences sampled from each affected test file
BATCH_SIZE = 8
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "Qwen/Qwen2.5-0.5B"
MODEL_NAME_SAVE = "Qwen2.5-0.5B"


class CustomDataset(torch.utils.data.Dataset):
|
    def __init__(self, input_ids, attention_mask):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        # Use the input IDs as labels, masking padded positions with -100 so
        # the loss (and hence the perplexity) ignores pad tokens.
        self.labels = input_ids.clone()
        self.labels[attention_mask == 0] = -100

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {"input_ids": self.input_ids[idx],
                "attention_mask": self.attention_mask[idx],
                "labels": self.labels[idx]}


def get_perplexity(model, eval_dataset):
    # Perplexity is the exponential of the mean token-level cross-entropy.
    eval_args = TrainingArguments(output_dir="tmp_eval",  # required, only used for logs
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  report_to="none")
    trainer = Trainer(model=model, args=eval_args)
    eval_results = trainer.evaluate(eval_dataset)
    loss = eval_results['eval_loss']
    return torch.exp(torch.tensor(loss)).item()
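
# Script flow: sample token-ID sequences from the affected BabyLM test files,
# decode them to text, re-tokenize them into padded batches, and report
# exp(eval_loss) as the perplexity.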
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog='Perplexity evaluation',
        description='Compute perplexity of a trained Qwen model on perturbed BabyLM test data')
    parser.add_argument('test_perturbation_type',
                        default='all',
                        const='all',
                        nargs='?',
                        choices=PERTURBATIONS.keys(),
                        help='Perturbation function used to transform test BabyLM dataset')
    parser.add_argument('checkpoint_path',
                        type=str,
                        nargs='?',
                        default='default-checkpoint',
                        help='Name of the training checkpoint to evaluate')
    parser.add_argument('random_seed',
                        type=int,
                        nargs='?',
                        default=0,
                        help='Random seed for sampling test sequences')

    args = parser.parse_args()
|
|
|
    test_files = sorted(glob(
        f"../data/babylm_data_perturbed_qwen/babylm_{args.test_perturbation_type}/babylm_test_affected/*"))

    rng = default_rng(args.random_seed)

    checkpoint_dir = f'../train/checkpoints/babylm/babylm_{args.test_perturbation_type}_10M_seed0/runs/{args.checkpoint_path}'

    print("Sampling BabyLM affected test files to compute perplexity...")
    token_sequences = []
    print("test_files:", test_files)
    for test_file in test_files:
        print(test_file)
        with open(test_file, 'r') as f:
            file_token_sequences = [
                [int(s) for s in l.split()] for l in f.readlines()]
        # Sample at most FILE_SAMPLE_SIZE sequences per file, without replacement.
        sample_size = min(FILE_SAMPLE_SIZE, len(file_token_sequences))
        sample_indices = rng.choice(
            len(file_token_sequences), sample_size, replace=False)
        file_token_sequences = [file_token_sequences[i] for i in sample_indices]
        token_sequences.extend(file_token_sequences)
|
|
|
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    if tokenizer.pad_token is None:
        # Fall back to the EOS token for padding if the checkpoint defines no pad token.
        tokenizer.pad_token = tokenizer.eos_token

    print("Loading the Qwen model...")
    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir).to(device)

    # The affected test files store token IDs; decode them back to text,
    # then re-tokenize into a padded, truncated batch.
    test_texts = [tokenizer.decode(seq, skip_special_tokens=True) for seq in token_sequences]
    tokenized_sequences = tokenizer(test_texts, padding=True, truncation=True,
                                    return_tensors="pt", max_length=1024)

    dataset = CustomDataset(tokenized_sequences['input_ids'],
                            tokenized_sequences['attention_mask'])

    perplexity = get_perplexity(model, dataset)

    ppl_df = pd.DataFrame({"Perplexity": [perplexity]})

    directory = f"perplexity_results/{MODEL_NAME_SAVE}/{args.test_perturbation_type}"
    os.makedirs(directory, exist_ok=True)
    out_file = f"{directory}/{MODEL_NAME_SAVE}_seed{args.random_seed}_test_{args.test_perturbation_type}_{args.checkpoint_path}.csv"
    print(f"Writing results to CSV: {out_file}")
    ppl_df.to_csv(out_file, index=False)

    print(f"Calculated Perplexity: {perplexity}")