import sys
sys.path.append("..")
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_qwen import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
from peft import get_peft_model, LoraConfig, TaskType  # PEFT components for LoRA
# import wandb

# === CONFIGURATION SETTINGS ===
perturbation = "shuffle_deterministic21"
train_set = "10M"
seed = 0
ckpt_path = "./checkpoints"
effective_bsz = 512  # Target effective batch size (not currently wired into gradient accumulation below)

# === FILE PATHS BASED ON CONFIGURATION ===
run_id = f"babylm_{perturbation}_{train_set}_seed{seed}"
cache_dir = os.path.join(ckpt_path, "babylm_lora", run_id, "artifacts")
run_dir = os.path.join(ckpt_path, "babylm_lora", run_id, "runs")
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(run_dir, exist_ok=True)

# Setup for Weights & Biases
# wandb.init(project="kallini", group="babylm-perturbation-experiments", name=run_id)

# === DATASET LOADING ===
dataset_name = f"babylm_{perturbation}_{train_set}_seed{seed}"
dataset = load_dataset('babylm_dataset.py', name=dataset_name, trust_remote_code=True)
train_dataset = dataset['train']

# === TOKENIZER & MODEL LOADING ===
model_name = "Qwen/Qwen2.5-0.5B"
tokenizer = PERTURBATIONS[perturbation]['qwen_tokenizer']
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)

# === APPLYING LoRA ===
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Causal language modeling task
    r=16,                          # Rank of the low-rank update matrices
    lora_alpha=16,                 # Scaling factor applied to the LoRA updates
    lora_dropout=0.1,              # Dropout applied to the LoRA layers
)
model = get_peft_model(model, lora_config)
# print("model:", model)
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(f"Trainable parameter: {name}, shape: {param.shape}")

# === TOKENIZATION ===
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)

tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# === DATA COLLATOR ===
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# === TRAINING ARGUMENTS ===
training_args = TrainingArguments(
    output_dir=run_dir,
    # evaluation_strategy="steps",  # use with load_best_model_at_end=True
    evaluation_strategy="no",
    per_device_train_batch_size=1,  # Set based on your hardware capabilities
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    # save_total_limit=5,
    learning_rate=5e-4,             # May need tuning for LoRA
    num_train_epochs=10,            # Fewer epochs may suffice given LoRA's efficiency
    seed=seed,
    # load_best_model_at_end=True,
    gradient_accumulation_steps=1,
    fp16=True,
    warmup_ratio=0.1,
    # report_to="wandb"
)

# === TRAINER ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# === TRAIN MODEL ===
trainer.train()

# End logging
# wandb.finish()
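
# === OPTIONAL: SAVE THE LoRA ADAPTER (sketch) ===
# A minimal, hedged sketch of how the trained adapter could be persisted with PEFT's
# standard save_pretrained API. `adapter_dir` is a hypothetical path introduced here,
# not something defined elsewhere in this script.
adapter_dir = os.path.join(run_dir, "lora_adapter")
model.save_pretrained(adapter_dir)      # writes only the adapter weights and adapter_config.json
tokenizer.save_pretrained(adapter_dir)  # keep the matching tokenizer alongside (assumes an HF tokenizer)

# For later inference, the adapter can be re-attached to the base model and, if desired,
# merged into it so no PEFT wrapper is needed at generation time, e.g.:
# from peft import PeftModel
# base = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
# lora_model = PeftModel.from_pretrained(base, adapter_dir)
# merged = lora_model.merge_and_unload()  # folds the LoRA deltas into the base weights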