import os

# Allow duplicate OpenMP runtimes; works around the libomp initialization
# error PyTorch can hit on macOS.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import sys
import torch
from transformers import (GPT2Tokenizer, GPT2LMHeadModel, TextDataset,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments, get_linear_schedule_with_warmup)
class GPT2Assistant:
    def __init__(self):
        # Load the GPT-2 tokenizer from the local checkpoint directory
        self.tokenizer = GPT2Tokenizer.from_pretrained("/Users/migueldeguzman/Desktop/gpt2xl_algos/RLLMv12/layer6/")
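        # Note: self.model is not set here; it is assigned later, either by
        # fine_tune() or by loading a checkpoint in main(). Calling
        # generate_answer() before that would raise an AttributeError.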
    def fine_tune(self, answer_file_path, model_output_dir, epochs=1.0):
        # Load the pre-trained GPT-2 model from the same local checkpoint
        self.model = GPT2LMHeadModel.from_pretrained("/Users/migueldeguzman/Desktop/gpt2xl_algos/RLLMv12/layer6/")

        # Build the training dataset from the text file, tokenized in blocks of 128 tokens
        train_dataset = TextDataset(
            tokenizer=self.tokenizer,
            file_path=answer_file_path,
            block_size=128
        )
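        # Caveat: TextDataset is deprecated in recent transformers releases
        # (it emits a FutureWarning pointing to the separate `datasets`
        # library); this script follows the older API.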
        # Collator for causal language modeling (mlm=False disables masked-LM labels)
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False
        )

        # Estimate the total number of optimizer steps from the dataset length
        # and epoch count; cast to int, since the scheduler expects integers
        total_steps = int(len(train_dataset) * epochs)
        # Warm up over the first 10% of the estimated steps
        warmup_steps = int(0.1 * total_steps)

        # Adam optimizer with a small learning rate and mild weight decay
        optimizer = torch.optim.Adam(self.model.parameters(), lr=42e-6, weight_decay=0.005)
        # Linear decay learning-rate schedule with warmup
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
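        # The estimate above ignores batching, so it overstates the real step
        # count. A sketch of a more exact figure, using the per-device batch
        # size (4) and gradient accumulation (8) configured below:
        #   steps_per_epoch = max(1, len(train_dataset) // (4 * 8))
        #   total_steps = int(steps_per_epoch * epochs)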
        # Define the training arguments
        training_args = TrainingArguments(
            output_dir=model_output_dir,
            overwrite_output_dir=True,
            num_train_epochs=epochs,
            per_device_train_batch_size=4,
            save_steps=10_000,
            save_total_limit=2,
            gradient_accumulation_steps=8,
            lr_scheduler_type='cosine',
            warmup_steps=500
        )
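        # Note: because a custom (optimizer, scheduler) pair is passed to the
        # Trainer below, lr_scheduler_type='cosine' and warmup_steps=500 above
        # are effectively overridden by the linear schedule created earlier.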
        # Create a Trainer with the model, arguments, collator, dataset, and
        # the custom optimizer/scheduler pair
        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            optimizers=(optimizer, scheduler)
        )

        # Fine-tune the model
        trainer.train()

        # Save the fine-tuned model and tokenizer to the output directory
        self.model.save_pretrained(model_output_dir)
        self.tokenizer.save_pretrained(model_output_dir)
    def generate_answer(self, prompt, max_length=1000):
        # Encode the prompt into input ids
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt")

        # GPT-2 has no pad token by default; fall back to the EOS token
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Attend to every non-pad position (for a single unpadded prompt this
        # is all ones unless the prompt itself contains the EOS token)
        attention_mask = (input_ids != self.tokenizer.pad_token_id).long()

        # Generate with sampling; the near-zero temperature makes sampling
        # effectively greedy, i.e. near-deterministic output
        output = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.000000000000001
        )

        # Decode the generated tokens, skipping special tokens
        answer = self.tokenizer.decode(output[0], skip_special_tokens=True)
        # Return only the newly generated text; slicing by len(prompt) assumes
        # the decoded output reproduces the prompt verbatim
        return answer[len(prompt):]
    def query(self, prompt):
        # Generate, print, and return an answer for the given prompt
        generated_answer = self.generate_answer(prompt)
        print(generated_answer)
        return generated_answer
def main():
    # Text file to fine-tune on and output directory for the fine-tuned model
    text_file_path = "/Users/migueldeguzman/Desktop/gpt2xl_algos/RLLMv12/layer7/truth_v2.text"
    model_output_dir = "/Users/migueldeguzman/Desktop/gpt2xl_algos/RLLMv12/layer7/"

    assistant = GPT2Assistant()

    # Ask whether to fine-tune a new model or load an existing one
    choice = input("Do you want to fine-tune a new model (n) or load an existing one (e)? (n/e): ")

    if choice.lower() == "n":
        print("Fine-tuning the model...")
        assistant.fine_tune(text_file_path, model_output_dir)
        print("Model fine-tuning complete.")
    elif choice.lower() == "e":
        print("Loading the existing model...")
        assistant.model = GPT2LMHeadModel.from_pretrained(model_output_dir)
        print("Existing model loaded.")
    else:
        print("Invalid choice. Exiting the program.")
        sys.exit()
    # Interactive question loop
    while True:
        prompt = input("Enter your question (or type 'exit' to stop): ")
        if prompt.lower() == "exit":
            break
        print("Answering in progress...")
        assistant.query(prompt)
        print("\n")


if __name__ == "__main__":
    main()
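# Usage sketch (the filename is hypothetical; the prompts come from main()):
#   $ python gpt2_assistant.py
#   Do you want to fine-tune a new model (n) or load an existing one (e)? (n/e): e
#   Enter your question (or type 'exit' to stop): ...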