# train-t5-efficient-tiny.py
# Fine-tunes google/t5-efficient-tiny on a plain-text file with the Hugging Face Trainer.
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
# Load and preprocess data: one training example per non-empty line.
def load_and_preprocess_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    data = {'text': [line.strip() for line in lines if line.strip()]}
    return Dataset.from_dict(data)
# Tokenize each line as both input and target (a self-supervised copy task).
# Pad positions in the labels are replaced with -100 so the loss ignores them.
def tokenize_function(examples, tokenizer):
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)
    targets = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)
    inputs['labels'] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label]
        for label in targets['input_ids']
    ]
    return inputs
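# On recent transformers versions the two tokenizer calls above can be collapsed
# into one by passing text_target=, which fills in 'labels' directly.
# A minimal sketch, assuming a tokenizer version that supports text_target
# (the -100 pad masking above would still be needed afterwards):
#
#     batch = tokenizer(examples['text'], text_target=examples['text'],
#                       truncation=True, padding='max_length', max_length=256)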
# Load dataset and tokenizer
data_file = 'data.txt'
dataset = load_and_preprocess_data(data_file)
tokenizer = T5Tokenizer.from_pretrained('google/t5-efficient-tiny')
# Tokenize dataset
tokenized_datasets = dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
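# Quick sanity check (illustrative): inputs and labels should share the fixed length.
sample = tokenized_datasets[0]
print(len(sample['input_ids']), len(sample['labels']))  # expected: 256 256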
# Split dataset into training and evaluation datasets
split_datasets = tokenized_datasets.train_test_split(test_size=0.1)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']
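# For a reproducible split, train_test_split also accepts a seed, e.g.
# split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)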
# Load model
model = T5ForConditionalGeneration.from_pretrained('google/t5-efficient-tiny')
# Ensure all parameter tensors are contiguous; some safetensors versions
# (used by save_pretrained) refuse to serialize non-contiguous tensors.
def make_contiguous(model):
    for _, param in model.named_parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()

# Apply the conversion to contiguous tensors
make_contiguous(model)
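# Sanity check (optional): every parameter should now report as contiguous.
assert all(p.is_contiguous() for p in model.parameters())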
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=6,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    evaluation_strategy='steps',  # renamed to eval_strategy in newer transformers
    save_total_limit=1,
    learning_rate=8e-5,
    weight_decay=0.01,
    report_to='tensorboard',
    fp16=torch.cuda.is_available(),  # fp16=True would raise an error on CPU-only machines
    gradient_accumulation_steps=2
)
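# With a per-device batch size of 2 and 2 accumulation steps, the effective batch
# size is 4 per device; evaluation runs every logging_steps (10) since eval_steps is unset.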
# Initialize Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
tokenizer=tokenizer
)
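# Alternative sketch: with dynamic padding instead of padding='max_length',
# a DataCollatorForSeq2Seq would pad each batch on the fly and pad labels
# with -100 automatically (the class is from transformers; settings are assumed):
#
#     from transformers import DataCollatorForSeq2Seq
#     collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)
#     trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
#                       eval_dataset=eval_dataset, tokenizer=tokenizer, data_collator=collator)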
# Train and fine-tune the model
trainer.train()
# Save the fine-tuned model
output_dir = './fine-tuned-t5-efficient-tiny'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Also save a raw state dict as a fallback in case the safetensors
# serialization in save_pretrained fails.
torch.save(model.state_dict(), f'{output_dir}/pytorch_model.bin')
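# Usage sketch: reload the saved model and generate once as a smoke test.
# The prompt is a placeholder; generation settings are illustrative, not tuned.
ft_model = T5ForConditionalGeneration.from_pretrained(output_dir)
ft_tokenizer = T5Tokenizer.from_pretrained(output_dir)
input_ids = ft_tokenizer('example prompt', return_tensors='pt').input_ids
output_ids = ft_model.generate(input_ids, max_new_tokens=64)
print(ft_tokenizer.decode(output_ids[0], skip_special_tokens=True))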