|
import torch |
|
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments |
|
from datasets import Dataset |
|
|
|
|
|
def load_and_preprocess_data(file_path):
    """Read a text file and build a ``datasets.Dataset`` of its non-empty lines.

    Args:
        file_path: Path to a UTF-8 text file, one training example per line.

    Returns:
        datasets.Dataset with a single 'text' column; blank lines are dropped.
    """
    # Explicit encoding: without it, open() falls back to the platform default
    # (e.g. cp1252 on Windows), which can corrupt or reject UTF-8 training data.
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    data = {'text': [line.strip() for line in lines if line.strip()]}
    return Dataset.from_dict(data)
|
|
|
|
|
def tokenize_function(examples, tokenizer):
    """Tokenize a batch of examples for seq2seq training.

    The same text is used as both the model input and the target (a
    reconstruction objective). Padding positions in the labels are replaced
    with -100 so the cross-entropy loss ignores them — leaving the pad token
    id in the labels would train the model to emit padding.

    Args:
        examples: Batch dict with a 'text' list, as passed by ``Dataset.map``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.

    Returns:
        The tokenizer output dict for the inputs, with an added 'labels' key.
    """
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)
    targets = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=256)
    pad_id = tokenizer.pad_token_id
    # -100 is the index ignored by PyTorch CrossEntropyLoss (and the HF
    # convention for masked label positions).
    inputs['labels'] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in targets['input_ids']
    ]
    return inputs
|
|
|
|
|
# --- Data preparation --------------------------------------------------------
data_file = 'data.txt'
raw_dataset = load_and_preprocess_data(data_file)

# The tokenizer is needed both for the map() step below and by the Trainer.
tokenizer = T5Tokenizer.from_pretrained('google/t5-efficient-tiny')

tokenized = raw_dataset.map(
    lambda batch: tokenize_function(batch, tokenizer),
    batched=True,
)

# Hold out 10% of the examples for evaluation.
splits = tokenized.train_test_split(test_size=0.1)
train_dataset = splits['train']
eval_dataset = splits['test']

# --- Model -------------------------------------------------------------------
model = T5ForConditionalGeneration.from_pretrained('google/t5-efficient-tiny')
|
|
|
|
|
def make_contiguous(model):
    """Make every parameter tensor of *model* contiguous in memory, in place.

    Some weight manipulations leave parameters with non-contiguous storage,
    which can break serialization (e.g. safetensors requires contiguous
    tensors).

    Args:
        model: A ``torch.nn.Module``; modified in place.
    """
    # named_parameters() was previously used with the name ignored;
    # parameters() iterates the same tensors without the unused unpacking.
    for param in model.parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()
|
|
|
|
|
# Repack any non-contiguous parameters before training/saving.
make_contiguous(model)

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,  # effective train batch = 2 * 2 per device
    num_train_epochs=6,
    learning_rate=8e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    save_steps=10,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    report_to='tensorboard',
    # BUG FIX: fp16=True raises at startup on machines without CUDA; enable
    # mixed precision only when a GPU is actually present (identical behavior
    # on GPU hosts, no crash on CPU-only hosts).
    fp16=torch.cuda.is_available(),
)
|
|
|
|
|
# Assemble the Trainer from the model, the arguments, and the split datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
|
|
|
|
|
# Run the fine-tuning loop (checkpoints go to training_args.output_dir).
trainer.train()


# Persist the fine-tuned model and tokenizer in HuggingFace format so they
# can be reloaded with from_pretrained().
output_dir = './fine-tuned-t5-efficient-tiny'

model.save_pretrained(output_dir)

tokenizer.save_pretrained(output_dir)


# NOTE(review): save_pretrained() above already writes the model weights; this
# extra raw state_dict dump looks redundant — confirm whether a downstream
# consumer specifically needs a pytorch_model.bin file before removing it.
torch.save(model.state_dict(), f'{output_dir}/pytorch_model.bin')
|
|