import os

import torch
from datasets import load_dataset
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

# Configuration: base checkpoint, training text file, output directory, and sequence length.
model_name = "sshleifer/tiny-gpt2"
data_file = "data.txt"
output_dir = "./fine-tuned-tiny-gpt2"
block_size = 512

# GPT-2 checkpoints ship without a padding token; add one when it is missing and
# resize the model's token embeddings to cover the enlarged vocabulary.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

assert len(tokenizer) == model.config.vocab_size, "Tokenizer vocabulary size does not match the model's embedding size."


def load_and_preprocess_dataset(file_path, tokenizer, block_size):
    """Load a plain-text file and tokenize each line to a fixed block_size."""
    dataset = load_dataset('text', data_files=file_path, split='train')
    # Drop empty lines so no example consists purely of padding.
    dataset = dataset.filter(lambda example: example['text'].strip() != '')

    def tokenize_function(examples):
        # With batched=True, examples['text'] is a list of strings; return the
        # tokenizer output directly (plain lists) rather than squeezed tensors.
        return tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=block_size,
        )

    return dataset.map(tokenize_function, batched=True, remove_columns=['text'])


dataset = load_and_preprocess_dataset(data_file, tokenizer, block_size)

# Hold out 10% of the examples for evaluation.
split = dataset.train_test_split(test_size=0.1)
train_dataset = split['train']
eval_dataset = split['test']

# Causal-LM collator: mlm=False copies input_ids into labels (padding masked to -100)
# instead of applying masked-language-model corruption. Inputs are already padded to
# block_size above, so no extra padding options are needed here.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    fp16=torch.cuda.is_available(),  # use mixed precision only when a CUDA device is present
    logging_dir=os.path.join(output_dir, 'logs'),
    logging_steps=200,
    save_steps=1000,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

trainer.train()

# Persist the best checkpoint (restored by load_best_model_at_end) together with the tokenizer.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")
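
# Optional sanity check: a minimal sketch (not part of the original pipeline) that
# reloads the saved checkpoint and generates a short continuation. The prompt below
# is an arbitrary example, and sshleifer/tiny-gpt2 is a tiny test model, so the text
# will not be coherent; the point is only to confirm the saved files load and run.
reloaded_tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
reloaded_model = GPT2LMHeadModel.from_pretrained(output_dir)

prompt = "Once upon a time"  # hypothetical example prompt, not taken from data.txt
inputs = reloaded_tokenizer(prompt, return_tensors="pt")
generated = reloaded_model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=True,
    top_p=0.9,
    pad_token_id=reloaded_tokenizer.pad_token_id,
)
print(reloaded_tokenizer.decode(generated[0], skip_special_tokens=True))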