|
import os |
|
import json |
|
|
|
# Upper bound on tokens kept per training example; it is passed as
# `max_length` (with truncation enabled) wherever chats are tokenized.
CONTEXT_WINDOW = 1024

# Hugging Face access token read from the environment; None when the
# HF_TOKEN variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
from transformers import ( |
|
AutoTokenizer, AutoModelForCausalLM, TrainingArguments, |
|
Trainer, DataCollatorForLanguageModeling |
|
) |
|
import torch |
|
from datasets import load_dataset |
|
from huggingface_hub import login |
|
from optimize_lr import best_lr |
|
|
|
|
|
# Tokenizer belonging to the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(
    "Zyphra/Zamba2-1.2B-instruct",
    token=HF_TOKEN,
)

# Batches need a pad token; when the tokenizer ships without one, reuse EOS.
# NOTE(review): if EOS doubles as the pad token, a collator that ignores pad
# ids in the labels would presumably also ignore genuine EOS tokens - confirm
# against the collator's documented behaviour.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Padding is added on the left side of each sequence.
tokenizer.padding_side = "left"
|
|
|
|
|
# Load the causal-LM weights in bfloat16; `device_map="auto"` delegates device
# placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    "Zyphra/Zamba2-1.2B-instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # Authenticate the same way the tokenizer load does, so a gated or
    # private checkpoint cannot succeed for the tokenizer yet fail for the
    # weights. HF_TOKEN is None when the variable is unset, which matches the
    # previous behaviour of not passing a token at all.
    token=HF_TOKEN,
)

# Keep the model config's pad id in sync with the tokenizer's pad token.
model.config.pad_token_id = tokenizer.pad_token_id
|
|
|
|
|
# Training data: the "train_sft" split of the Dutch Dolly-15k dataset.
dataset = load_dataset(
    "BramVanroy/dolly-15k-dutch",
    split="train_sft",
)
|
|
|
def _render_fallback_chat(messages):
    """Render *messages* as plain text in the ``<|role|>`` fallback layout."""
    # One join instead of repeated `+=` keeps the build linear in the number
    # of messages; the emitted text is identical.
    return "".join(
        f"<|{message['role']}|>\n{message['content']}</s>\n"
        for message in messages
    )


def prepare_chat_format(examples):
    """Tokenize a batch of chats into token-id lists.

    Each entry of ``examples['messages']`` is a list of messages with
    ``"role"`` and ``"content"`` keys. The tokenizer's chat template is tried
    first; if applying it raises, the error is printed and the chat is
    rendered with a simple ``<|role|>`` layout and tokenized directly instead.
    Both paths truncate to ``CONTEXT_WINDOW`` tokens.

    Args:
        examples: Batched mapping with a ``'messages'`` column.

    Returns:
        ``{"input_ids": [...]}`` with one token-id list per input chat.
    """
    chats = []
    for messages in examples['messages']:
        try:
            chat = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                max_length=CONTEXT_WINDOW,
                truncation=True,
                return_tensors=None,
            )
        except Exception as e:
            # Deliberate best-effort: a chat the template cannot handle is
            # still kept, using the plain-text rendering below.
            print(f"Error applying chat template: {e}")
            chat = tokenizer(
                _render_fallback_chat(messages),
                max_length=CONTEXT_WINDOW,
                truncation=True,
                return_tensors=None,
            )["input_ids"]
        chats.append(chat)
    return {"input_ids": chats}
|
|
|
|
|
# Tokenize the chats batch-wise and drop every original column, so only the
# "input_ids" produced by prepare_chat_format remains.
tokenized_dataset = dataset.map(
    prepare_chat_format,
    remove_columns=dataset.column_names,
    batched=True,
)
|
|
|
|
|
# Fine-tuning hyper-parameters, grouped by concern.
training_args = TrainingArguments(
    # Run length and batch sizing: 4 examples per device, accumulated over
    # 8 steps before each optimizer update.
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    # Optimization; the learning rate is imported from the optimize_lr module.
    learning_rate=best_lr,
    weight_decay=0.01,
    warmup_steps=100,
    max_grad_norm=1.0,
    # Precision and memory: bf16 on, fp16 off, gradient checkpointing on.
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,
    # Data loading.
    dataloader_num_workers=4,
    # Logging and checkpointing.
    output_dir="./zamba2-finetuned",
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
)
|
|
|
# Batch collator for causal language modeling: mlm=False turns off
# masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
|
|
|
|
class CustomTrainer(Trainer):
    """Trainer that never relocates the model it is given.

    The model in this script is loaded with ``device_map="auto"``, so its
    placement is already decided at load time; this subclass disables the
    Trainer hook that would move it again.

    The constructor is inherited unchanged. An earlier ``__init__`` override
    re-assigned ``self.model`` from the module-level ``model`` variable, which
    silently ignored whatever model was passed to the constructor; the base
    constructor already stores the passed model, so that override was dropped.
    """

    def _move_model_to_device(self, model, device):
        """Do nothing on purpose: leave *model* where it was loaded."""
|
|
|
# Wire the model, hyper-parameters, tokenized data and collator together.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)
|
|
|
|
|
# Run the fine-tuning loop.
trainer.train()

# Save the model first, then the tokenizer, into the same output directory.
for component in (model, tokenizer):
    component.save_pretrained("./zamba2-finetuned-final")