import sys
import torch
sys.path.append("..")
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from utils_llama import PERTURBATIONS, BABYLM_SPLITS, BABYLM_DATA_PATH, \
    GENRES, MARKER_TOKEN_IDS, marker_sg_token, marker_pl_token, marker_rev_token, write_file
import wandb
import argparse
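# Avoid tokenizers fork-parallelism warnings/deadlocks when the Trainer spawns dataloader workers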
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if __name__ == "__main__":
    # === CONFIGURATION SETTINGS ===
    parser = argparse.ArgumentParser(description="Training configuration.")
    parser.add_argument('--perturbation', type=str, default='hop_tokens4', help='Type of perturbation to use.')
    parser.add_argument('--train_set', type=str, default='10M', help='Dataset size for training.')
    parser.add_argument('--batch_size', type=int, default=4, help='Per-device batch size for training.')
    parser.add_argument('--epoch', type=int, default=20, help='Number of training epochs.')
    parser.add_argument('--seed', type=int, default=0, help='Random seed.')
    args = parser.parse_args()

    # no_pos_encodings_underscore = ""  # Ex: "_nopos" if needed
    ckpt_path = "./checkpoints"
    # effective_bsz = 512
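    # Hugging Face model id to fine-tune; the short name below is only used to build local checkpoint paths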
    model_name = "meta-llama/Llama-3.2-3B"
    model_save_name = "Llama-3.2-3B"

    # === FILE PATHS BASED ON CONFIGURATION ===
    wandb_id = f"{model_save_name}_{args.perturbation}_train_set_{args.train_set}_epoch_{args.epoch}_batch_size_{args.batch_size}_seed_{args.seed}"
    wandb.init(project="impossible_llm_shuffle", group="train-experiments", name=wandb_id)
    wandb.config.update(args)
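    # Local layout: <ckpt_path>/<model_save_name>/<run_id>/artifacts caches HF downloads, .../runs holds Trainer output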
run_id = f"babylm_{args.perturbation}_{args.train_set}_seed{args.seed}" | |
cache_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "artifacts") | |
run_dir = os.path.join(ckpt_path, f"{model_save_name}", run_id, "runs") | |
os.makedirs(cache_dir, exist_ok=True) | |
os.makedirs(run_dir, exist_ok=True) | |
# === DATASET LOADING === | |
dataset_name = f"babylm_{args.perturbation}_{args.train_zset}_seed{args.seed}" | |
    dataset = load_dataset('babylm_dataset_llama.py', name=dataset_name, trust_remote_code=True)
    train_dataset = dataset['train']
    valid_dataset = dataset['validation']

    # === TOKENIZER & MODEL LOADING ===
    # model_name = f"gpt2{'' if no_pos_encodings_underscore == '' else '-no-pos'}-small-{perturbation}-{paren_model}"
    # tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
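    # Each perturbation provides its own Llama tokenizer; it is assumed to already define a pad token,
    # which the max-length padding and the data collator below require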
    tokenizer = PERTURBATIONS[args.perturbation]['llama_tokenizer']
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # device_map="auto",  # must not be set when training with DeepSpeed
        cache_dir=cache_dir,
    )
    # print("model:", model)

    # === TOKENIZATION ===
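    # Pad/truncate every example to a fixed 1024-token window so batches have a uniform shape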
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=1024)

    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    tokenized_valid = valid_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    # Evaluate on a small, reproducible slice of the validation set to keep periodic evaluation cheap
    shuffled_valid = tokenized_valid.shuffle(seed=args.seed)
    tokenized_valid = shuffled_valid.select(range(min(500, len(shuffled_valid))))
    print("tokenized_valid:", tokenized_valid)
    # print(train_dataset)

    # === DATA COLLATOR ===
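    # mlm=False -> standard causal-LM objective: the collator copies input_ids into labels and masks pad tokens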
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # === TRAINING ARGUMENTS ===
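    # Batch size, gradient accumulation, learning rate, and fp16 must stay consistent with
    # deepspeed_config/train_dp_config.json; "auto" values there are filled in from these arguments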
    training_args = TrainingArguments(
        output_dir=run_dir,
        evaluation_strategy="steps",
        eval_steps=5,  # evaluate on the (up to) 500-example validation slice every 5 training steps
        per_device_train_batch_size=args.batch_size,  # set to "auto" in the DeepSpeed config; this value is used
        logging_dir='./logs',
        logging_steps=1,
        save_steps=1500000,  # set very high, so checkpoints are rarely (if ever) written during training
        learning_rate=5e-5,  # must match the DeepSpeed config
        num_train_epochs=args.epoch,
        seed=args.seed,
        gradient_accumulation_steps=4,  # set to "auto" in the DeepSpeed config; this value is used
        fp16=True,  # must match the DeepSpeed config
        report_to="wandb",
        deepspeed="deepspeed_config/train_dp_config.json",
    )

    # === TRAINER ===
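    # Passing the tokenizer lets the Trainer save it alongside model checkpoints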
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # === TRAIN MODEL ===
    trainer.train()
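    # With save_steps set so high, training may finish without writing any checkpoint, so save the final
    # model (and tokenizer) explicitly; using run_dir as the destination is an assumption
    trainer.save_model(run_dir)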
    # End logging
    wandb.finish()