import sys
sys.path.append("..")  # make utils_qwen (in the parent directory) importable
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_qwen import (
    PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH,
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file,
)
from peft import get_peft_model, LoraConfig, TaskType # Import PEFT components for LoRA
# import wandb
# === CONFIGURATION SETTINGS ===
perturbation = "shuffle_deterministic21"
train_set = "10M"
seed = 0
ckpt_path = "./checkpoints"
effective_bsz = 512  # target effective batch size (per-device batch size * gradient accumulation steps)
# === FILE PATHS BASED ON CONFIGURATION ===
run_id = f"babylm_{perturbation}_{train_set}_seed{seed}"
cache_dir = os.path.join(ckpt_path, "babylm_lora", run_id, "artifacts")
run_dir = os.path.join(ckpt_path, "babylm_lora", run_id, "runs")
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(run_dir, exist_ok=True)
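# cache_dir stores downloaded model artifacts (passed to from_pretrained);
# run_dir receives Trainer checkpoints and outputs (passed as output_dir).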
# Setup for Weights & Biases
# wandb.init(project="kallini", group="babylm-perturbation-experiments", name=run_id)
# === DATASET LOADING ===
dataset_name = f"babylm_{perturbation}_{train_set}_seed{seed}"
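# babylm_dataset.py is a local dataset loading script; trust_remote_code=True
# is required for script-based datasets in recent versions of the datasets library.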
dataset = load_dataset('babylm_dataset.py', name=dataset_name, trust_remote_code=True)
train_dataset = dataset['train']
# === TOKENIZER & MODEL LOADING ===
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = PERTURBATIONS[perturbation]['qwen_tokenizer']
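# The perturbation-specific tokenizer may not define a pad token (assumption:
# reusing the EOS token as padding is acceptable for causal LM training).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token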
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
# === APPLYING LoRA ===
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # causal language modeling task
    r=16,                          # rank of the low-rank update matrices
    lora_alpha=16,                 # scaling factor for the LoRA updates (effective scale = lora_alpha / r)
    lora_dropout=0.1,              # dropout applied to the LoRA layers
)
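# Note: target_modules is left unset; recent PEFT releases fall back to a
# built-in per-architecture default (for Qwen2, the attention q/v projections).
# Set target_modules explicitly to adapt additional layers.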
model = get_peft_model(model, lora_config)
# print("model:", model)
# for name, param in model.named_parameters():
# if param.requires_grad:
# print(f"Trainable parameter: {name}, shape: {param.shape}")
# === TOKENIZATION ===
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# === DATA COLLATOR ===
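# With mlm=False the collator prepares causal-LM batches: labels are a copy of
# input_ids with pad positions set to -100 so padding is ignored by the loss.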
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# === TRAINING ARGUMENTS ===
training_args = TrainingArguments(
    output_dir=run_dir,
    # evaluation_strategy="steps",  # use with load_best_model_at_end=True
    evaluation_strategy="no",  # renamed to eval_strategy in newer transformers releases
    per_device_train_batch_size=1,  # set based on your hardware capabilities
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    # save_total_limit=5,
    learning_rate=5e-4,  # may need tuning for LoRA
    num_train_epochs=10,  # fewer epochs may suffice given LoRA's efficiency
    seed=seed,
    # load_best_model_at_end=True,
    gradient_accumulation_steps=effective_bsz,  # per-device batch size is 1, so this reaches the target effective batch size of 512 (assuming single-device training)
    fp16=True,
    warmup_ratio=0.1,
    # report_to="wandb"
)
# === TRAINER ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# === TRAIN MODEL ===
trainer.train()
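# Persist the trained LoRA adapter (a sketch; the subdirectory name is arbitrary).
# For a PEFT-wrapped model, save_model() writes only the adapter weights and config.
trainer.save_model(os.path.join(run_dir, "final_adapter"))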
# End logging
# wandb.finish()