Update app.py
app.py CHANGED
@@ -32,6 +32,7 @@ def fine_tune_model(base_model_name, dataset_name):
 
     print("### Dataset")
    print(dataset)
+    print(dataset.head())
     print("###")
 
     # Load model
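Note on the line added above: if `dataset` comes from `datasets.load_dataset`, it is a `DatasetDict`, which has no pandas-style `.head()` method, so `print(dataset.head())` would raise `AttributeError`. A minimal sketch of one way to peek at a few rows instead, assuming the `datasets` API and the function's `dataset_name` argument:

from datasets import load_dataset

dataset = load_dataset(dataset_name)  # DatasetDict with e.g. "train" and "test" splits
print(dataset)                        # split names, features, and row counts
print(dataset["train"][:3])           # first three rows of the train split, as a dict of columns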
@@ -44,43 +45,46 @@ def fine_tune_model(base_model_name, dataset_name):
     print("###")
 
     # Pre-process dataset
+
     def preprocess(examples):
-        model_inputs = tokenizer(examples["sql_prompt"], text_target=examples["sql"], max_length=512, padding="max_length", truncation=True)
+        model_inputs = tokenizer(examples["sql_prompt"], text_target=examples["sql"]) #, max_length=512, padding="max_length", truncation=True)
         return model_inputs
+
     dataset = dataset.map(preprocess, batched=True)
 
     print("### Pre-processed dataset")
     print(dataset)
+    print(dataset.head())
     print("###")
 
     # Split dataset into training and validation sets
-    train_dataset = dataset["train"]
-    test_dataset = dataset["test"]
+    train_dataset = dataset["train"] #.shuffle(seed=42).select(range(1000))
+    test_dataset = dataset["test"] #.shuffle(seed=42).select(range(100))
 
     print("### Training dataset")
-    print(
+    print(train_dataset)
     print("### Validation dataset")
     print(test_dataset)
     print("###")
 
     # Configure training arguments
     training_args = Seq2SeqTrainingArguments(
-        output_dir="./
-        logging_dir="./
+        output_dir="./output",
+        logging_dir="./logging",
         num_train_epochs=1,
-        per_device_train_batch_size=16,
-        per_device_eval_batch_size=64,
-        eval_strategy="steps",
-        save_total_limit=2,
-        save_steps=500,
-        eval_steps=500,
-        warmup_steps=500,
-        weight_decay=0.01,
-        metric_for_best_model="accuracy",
-        greater_is_better=True,
-        load_best_model_at_end=True,
-        push_to_hub=True,
-        save_on_each_node=True,
+        #per_device_train_batch_size=16,
+        #per_device_eval_batch_size=64,
+        #eval_strategy="steps",
+        #save_total_limit=2,
+        #save_steps=500,
+        #eval_steps=500,
+        #warmup_steps=500,
+        #weight_decay=0.01,
+        #metric_for_best_model="accuracy",
+        #greater_is_better=True,
+        #load_best_model_at_end=True,
+        #push_to_hub=True,
+        #save_on_each_node=True,
     )
 
     print("### Training arguments")
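With max_length, padding, and truncation commented out of the tokenizer call above, the mapped dataset stores variable-length examples, so something has to pad each batch at training time. A hedged sketch of how that is commonly handled for seq2seq fine-tuning, assuming `model` and `tokenizer` are the objects loaded earlier (the collator is not part of this diff):

from transformers import DataCollatorForSeq2Seq

def preprocess(examples):
    # Truncate to keep sequences bounded, but defer padding to the collator,
    # which pads each batch only to that batch's longest sequence.
    return tokenizer(examples["sql_prompt"], text_target=examples["sql"], truncation=True)

dataset = dataset.map(preprocess, batched=True)

# Pads input_ids/attention_mask per batch and pads labels with -100,
# which the loss function ignores.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# ...then pass data_collator=data_collator to Seq2SeqTrainer.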
@@ -93,13 +97,9 @@ def fine_tune_model(base_model_name, dataset_name):
         args=training_args,
         train_dataset=train_dataset,
         eval_dataset=test_dataset,
-        compute_metrics=lambda pred: {"accuracy": torch.sum(pred.label_ids == pred.predictions.argmax(-1))},
+        #compute_metrics=lambda pred: {"accuracy": torch.sum(pred.label_ids == pred.predictions.argmax(-1))},
     )
 
-    print("### Trainer")
-    print(trainer)
-    print("###")
-
     # Train and save model
     #trainer.train()
     #trainer.save_model()
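The metric commented out above has two quiet problems: torch.sum returns a raw match count (a tensor), not an accuracy, and Seq2SeqTrainer hands compute_metrics logits as numpy arrays unless predict_with_generate=True is set. A sketch of what a working token-level accuracy might look like (an illustration, not the author's metric):

import numpy as np

def compute_metrics(pred):
    preds = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = preds.argmax(-1)                   # logits -> predicted token ids
    labels = pred.label_ids
    mask = labels != -100                      # skip padded label positions
    accuracy = (preds == labels)[mask].mean()  # a fraction, not a raw count
    return {"accuracy": float(accuracy)}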
@@ -128,8 +128,8 @@ def prompt_model(model_name, system_prompt, user_prompt, sql_schema):
 
 def load_model(model_name):
     model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
+    #tokenizer.pad_token = tokenizer.eos_token
 
     return model, tokenizer
 
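One thing to watch in this hunk: the model is loaded from model_name while the tokenizer now comes from the BASE_MODEL_NAME constant, so the pair can drift apart once a fine-tuned checkpoint ships its own tokenizer. A defensive sketch (the try/except fallback is an assumption, not the author's code):

from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_name):
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except OSError:
        # Repo ships no tokenizer files; fall back to the base model's tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # many causal-LM tokenizers define no pad token
    return model, tokenizer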
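Finally, the sampling left commented out next to the train/test split is handy for smoke tests; re-enabled, it gives a small, reproducible subset (seed and sizes are the diff's own values):

train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
test_dataset = dataset["test"].shuffle(seed=42).select(range(100))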