import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch

# Define parameters
model_name = "sshleifer/tiny-gpt2"
data_file = "data.txt"
output_dir = "./fine-tuned-tiny-gpt2"
block_size = 512  # Adjust block size based on your GPU memory and text length

# Step 1: Load the Tokenizer and Model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 has no pad token by default; add one so padding and the data collator work
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to cover the new token
else:
    model = GPT2LMHeadModel.from_pretrained(model_name)

# Ensure tokenizer and model vocabulary sizes are consistent
assert len(tokenizer) == model.config.vocab_size, "Tokenizer vocabulary size does not match model's embedding size."

# Step 2: Load and Preprocess the Dataset
def load_and_preprocess_dataset(file_path, tokenizer, block_size):
    dataset = load_dataset('text', data_files=file_path, split='train')

    def tokenize_function(examples):
        # Return plain Python lists; datasets stores them, and the data collator
        # builds the tensors at batch time. (Returning squeezed PyTorch tensors here
        # breaks when a mapped batch happens to contain a single example.)
        return tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=block_size,
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])
    return tokenized_dataset

# Load and preprocess the dataset
dataset = load_and_preprocess_dataset(data_file, tokenizer, block_size)

# Split the dataset into train and validation sets
split = dataset.train_test_split(test_size=0.1)
train_dataset = split['train']
eval_dataset = split['test']

# Step 3: Define the Data Collator
# mlm=False gives causal language modeling: labels are copied from input_ids, with pad
# positions ignored in the loss. Sequences are already padded to block_size above.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 4: Set Up TrainingArguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_dir=os.path.join(output_dir, 'logs'),
    logging_steps=200,
    save_steps=1000,
    save_total_limit=3,
    eval_strategy="steps",  # called evaluation_strategy in older transformers releases
    eval_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

# Step 5: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Step 6: Train (evaluation runs every eval_steps)
trainer.train()

# Step 7: Save the Fine-Tuned Model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model and tokenizer saved to {output_dir}")
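
# Optional sanity check (a minimal sketch): reload the saved checkpoint and generate
# a short sample to confirm the fine-tuned weights and tokenizer load correctly.
# The prompt string below is a hypothetical placeholder, not taken from the training
# data; adjust the generation parameters to taste.
ft_tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
ft_model = GPT2LMHeadModel.from_pretrained(output_dir)
ft_model.eval()

prompt = "Once upon a time"  # hypothetical example prompt
inputs = ft_tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    generated = ft_model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        top_p=0.95,
        pad_token_id=ft_tokenizer.pad_token_id,
    )
print(ft_tokenizer.decode(generated[0], skip_special_tokens=True))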