# train4.py: fine-tune sshleifer/tiny-gpt2 on a plain-text file with the Hugging Face Trainer
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch
# Define parameters
model_name = "sshleifer/tiny-gpt2"
data_file = "data.txt"
output_dir = "./fine-tuned-tiny-gpt2"
block_size = 512 # Adjust block size based on your GPU memory and text length
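# Note: sshleifer/tiny-gpt2 is a very small checkpoint intended mainly for quick tests;
# for fine-tuning that should produce usable text, consider a full checkpoint such as "gpt2".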
# Step 1: Load the Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Optionally add a pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))
else:
    model = GPT2LMHeadModel.from_pretrained(model_name)
# Ensure tokenizer and model vocabulary sizes are consistent
assert len(tokenizer) == model.config.vocab_size, "Tokenizer vocabulary size does not match the model's embedding size."
# Step 2: Load and Preprocess Dataset
def load_and_preprocess_dataset(file_path, tokenizer, block_size):
    dataset = load_dataset('text', data_files=file_path, split='train')
    def tokenize_function(examples):
        # Pad/truncate each line to block_size; with a batched map the tokenizer
        # returns lists of equal-length sequences, so no tensor conversion is needed here
        encodings = tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=block_size,
        )
        return {
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask']
        }
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])
    return tokenized_dataset
# Load and preprocess dataset
dataset = load_and_preprocess_dataset(data_file, tokenizer, block_size)
# Split dataset into train and validation
split = dataset.train_test_split(test_size=0.1)
train_dataset = split['train']
eval_dataset = split['test']
# Step 3: Define Data Collator
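# With mlm=False the collator builds causal-LM labels by copying input_ids and
# masking pad-token positions out of the loss (they are set to -100)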
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, pad_to_multiple_of=block_size)
# Step 4: Set Up TrainingArguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
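    # effective per-device batch size = 2 x 2 = 4 (batch size x gradient accumulation)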
    fp16=torch.cuda.is_available(),  # use mixed precision only when a GPU is available
    logging_dir=os.path.join(output_dir, 'logs'),
    logging_steps=200,
    save_steps=1000,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)
# Step 5: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
# Step 6: Train and Evaluate
trainer.train()
eval_metrics = trainer.evaluate()
print(f"Final evaluation metrics: {eval_metrics}")
# Step 7: Save the Fine-Tuned Model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")