bstraehle committed on
Commit 88543e6
1 Parent(s): b850fa7

Update app.py

Files changed (1)
  1. app.py +74 -81

app.py CHANGED
@@ -3,7 +3,6 @@ import os, torch
 from datasets import load_dataset
 from huggingface_hub import HfApi, login
 from transformers import AutoModelForCausalLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, pipeline
-qTrainingArguments

 hf_profile = "bstraehle"

@@ -12,140 +11,134 @@ action_2 = "Prompt fine-tuned model"

 system_prompt = "You are a text to SQL query translator. Given a question in English, generate a SQL query based on the provided SCHEMA. Do not generate any additional text. SCHEMA: {schema}"
 user_prompt = "What is the total trade value and average price for each trader and stock in the trade_history table?"
-schema = "CREATE TABLE trade_history (id INT, trader_id INT, stock VARCHAR(255), price DECIMAL(5,2), quantity INT, trade_time TIMESTAMP);"
+sql_schema = "CREATE TABLE trade_history (id INT, trader_id INT, stock VARCHAR(255), price DECIMAL(5,2), quantity INT, trade_time TIMESTAMP);"

-base_model_id = "bstraehle/Meta-Llama-3.1-8B-Instruct-text-to-sql"
-dataset = "gretelai/synthetic_text_to_sql"
+model_name = "bstraehle/Meta-Llama-3.1-8B-Instruct-text-to-sql"
+dataset_name = "gretelai/synthetic_text_to_sql"

-def process(action, base_model_id, dataset, system_prompt, user_prompt, schema):
+def process(action, model_name, dataset_name, system_prompt, user_prompt, sql_schema):
     #raise gr.Error("Please clone and bring your own credentials.")
     if action == action_1:
-        result = fine_tune_model(base_model_id, dataset)
+        result = fine_tune_model(model_name, dataset_name)
     elif action == action_2:
-        fine_tuned_model_id = replace_hf_profile(base_model_id)
-        result = prompt_model(fine_tuned_model_id, system_prompt, user_prompt, schema)
+        result = prompt_model(model_name, system_prompt, user_prompt, sql_schema)
     return result

-def fine_tune_model(base_model_id, dataset):
-    # tokenizer = download_model(base_model_id)
-    # upload_model(base_model_id, tokenizer)
-
-    # Load the dataset
-    dataset = load_dataset("gretelai/synthetic_text_to_sql")
-
-    # Load pre-trained model and tokenizer
-    model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+def fine_tune_model(model_name, dataset_name):
+    # Load dataset
+    dataset = load_dataset(dataset_name)
+
+    print("### Dataset")
+    print(dataset)
+    print("###")
+
+    # Load model
+    model, tokenizer = load_model(model_name)
+
+    print("### Model")
     print(model)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    if not tokenizer.pad_token:
-        tokenizer.pad_token = tokenizer.eos_token
+    print("### Tokenizer")
+    print(tokenizer)
+    print("###")

-    # Preprocess the dataset
+    # Pre-process dataset
     def preprocess(examples):
         model_inputs = tokenizer(examples["sql_prompt"], text_target=examples["sql"], max_length=512, padding="max_length", truncation=True)
         return model_inputs
-
     dataset = dataset.map(preprocess, batched=True)
-
-    # Split dataset to training and validation sets
-    train_dataset = dataset["train"].shuffle(seed=42).select(range(1000)) # Adjust the range as needed
-    val_dataset = dataset["test"].shuffle(seed=42).select(range(100)) # Adjust the range as needed

-    # Set training arguments
+    print("### Pre-processed dataset")
+    print(dataset)
+    print("###")
+
+    # Split dataset into training and validation sets
+    train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
+    test_dataset = dataset["test"].shuffle(seed=42).select(range(100))
+
+    print("### Training dataset")
+    print(train_dataset)
+    print("### Validation dataset")
+    print(test_dataset)
+    print("###")
+
+    # Configure training arguments
     training_args = Seq2SeqTrainingArguments(
         output_dir="./results",
-        num_train_epochs=1, # Adjust as needed
+        logging_dir="./logs",
+        num_train_epochs=1,
         per_device_train_batch_size=16,
         per_device_eval_batch_size=64,
-        warmup_steps=500,
-        weight_decay=0.01,
-        logging_dir="./logs",
+        eval_strategy="steps",
         save_total_limit=2,
         save_steps=500,
         eval_steps=500,
+        warmup_steps=500,
+        weight_decay=0.01,
         metric_for_best_model="accuracy",
         greater_is_better=True,
-        save_on_each_node=True,
         load_best_model_at_end=True,
-        eval_strategy="steps",
         push_to_hub=True,
+        save_on_each_node=True,
     )

-    # Create Trainer instance
+    print("### Training arguments")
+    print(training_args)
+    print("###")
+
+    # Create trainer
     trainer = Seq2SeqTrainer(
         model=model,
         args=training_args,
         train_dataset=train_dataset,
-        eval_dataset=val_dataset,
+        eval_dataset=test_dataset,
         compute_metrics=lambda pred: {"accuracy": torch.sum(pred.label_ids == pred.predictions.argmax(-1))},
     )
-
-    # Train the model
-    trainer.train()

-    # Save the trained model
-    trainer.save_model("./fine_tuned_model")
-
-    # Create a repository object
-    repo = Repository(
-        local_dir="./fine_tuned_model",
-        repo_type="model",
-        repo_id="bstraehle/Meta-Llama-3.1-8B-Instruct-text-to-sql",
-    )
+    print("### Trainer")
+    print(trainer)
+    print("###")

-    # Login to the Hugging Face hub
-    repo.login(token=os.environ["HF_TOKEN"])
-
-    # Push the model to the hub
-    repo.push_to_hub(commit_message="Initial commit")
+    # Train model
+    #trainer.train()

-def prompt_model(model_id, system_prompt, user_prompt, schema):
+def prompt_model(model_name, system_prompt, user_prompt, sql_schema):
     pipe = pipeline("text-generation",
-                    model=model_id,
-                    model_kwargs={"torch_dtype": torch.bfloat16},
+                    model=model_name,
+                    #model_kwargs={"torch_dtype": torch.bfloat16},
                     device_map="auto",
                     max_new_tokens=1000)
+
     messages = [
-        {"role": "system", "content": system_prompt.format(schema=schema)},
+        {"role": "system", "content": system_prompt.format(schema=sql_schema)},
         {"role": "user", "content": user_prompt},
         {"role": "assistant", "content": ""}
     ]
+
     output = pipe(messages)
+
     result = output[0]["generated_text"][-1]["content"]
-    print(result)
+
+    print("###")
+    print(result)
+    print("###")
+
     return result

-def download_model(base_model_id):
-    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
-    model = AutoModelForCausalLM.from_pretrained(base_model_id)
-    model.save_pretrained(base_model_id)
-    return tokenizer
-
-def upload_model(base_model_id, tokenizer):
-    fine_tuned_model_id = replace_hf_profile(base_model_id)
-    login(token=os.environ["HF_TOKEN"])
-    api = HfApi()
-    #api.delete_repo(repo_id=fine_tuned_model_id, repo_type="model")
-    api.create_repo(repo_id=fine_tuned_model_id)
-    api.upload_folder(
-        folder_path=base_model_id,
-        repo_id=fine_tuned_model_id
-    )
-    tokenizer.push_to_hub(fine_tuned_model_id)
-    return fine_tuned_model_id
-
-def replace_hf_profile(base_model_id):
-    model_id = base_model_id[base_model_id.rfind('/')+1:]
-    return f"{hf_profile}/{model_id}"
+def load_model(model_name):
+    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+    if not tokenizer.pad_token:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    return model, tokenizer

 demo = gr.Interface(fn=process,
                     inputs=[gr.Radio([action_1, action_2], label = "Action", value = action_1),
-                            gr.Textbox(label = "Base Model ID", value = base_model_id, lines = 1),
-                            gr.Textbox(label = "Dataset", value = dataset, lines = 1),
+                            gr.Textbox(label = "Model Name", value = model_name, lines = 1),
+                            gr.Textbox(label = "Dataset Name", value = dataset_name, lines = 1),
                             gr.Textbox(label = "System Prompt", value = system_prompt, lines = 2),
                             gr.Textbox(label = "User Prompt", value = user_prompt, lines = 2),
-                            gr.Textbox(label = "Schema", value = schema, lines = 2)],
-                    outputs=[gr.Textbox(label = "Completion", value = os.environ["OUTPUT"])])
+                            gr.Textbox(label = "SQL Schema", value = sql_schema, lines = 2)],
+                    outputs=[gr.Textbox(label = "Prompt Completion", value = os.environ["OUTPUT"])])
 demo.launch()
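A note on the preprocessing step kept in the diff: the tokenizer's text_target argument tokenizes the sql column into a labels field alongside the prompt's input_ids, which is what gives Seq2SeqTrainer something to compute a loss against. A minimal sketch of that behavior, using gpt2 as a stand-in tokenizer (an assumption, chosen only because it downloads without gated access; the app uses the Llama 3.1 tokenizer):

from transformers import AutoTokenizer

# Stand-in tokenizer (assumption); the app loads the Llama 3.1 one via load_model().
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token  # same fallback as load_model()

# Mirrors preprocess(): the prompt becomes input_ids, text_target becomes labels.
batch = tokenizer(
    ["How many trades are in trade_history?"],            # stands in for examples["sql_prompt"]
    text_target=["SELECT COUNT(*) FROM trade_history;"],  # stands in for examples["sql"]
    max_length=512,
    padding="max_length",
    truncation=True,
)
print(list(batch.keys()))  # ['input_ids', 'attention_mask', 'labels']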
 
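One caveat worth flagging: the compute_metrics lambda retained by this commit calls torch.sum on pred.label_ids and pred.predictions, but Trainer hands compute_metrics an EvalPrediction whose fields are numpy arrays, and predictions are logits rather than token ids, so the raw sum is neither robust nor normalized. A hedged numpy rewrite, a sketch rather than the author's metric (the -100 mask assumes ignored label positions are marked the way Hugging Face collators do):

import numpy as np

def compute_metrics(pred):
    # EvalPrediction fields are numpy arrays; reduce logits to token ids first.
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids
    # Assumption: ignored label positions are marked -100, as HF collators do.
    mask = labels != -100
    return {"accuracy": float((preds == labels)[mask].mean())}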
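The prompting path can also be exercised outside Gradio. A minimal sketch, assuming the fine-tuned checkpoint is accessible on the Hub and that the chat-style pipeline returns the full transcript with the assistant reply as the last message, which is the assumption behind output[0]["generated_text"][-1]["content"]:

from transformers import pipeline

pipe = pipeline("text-generation",
                model="bstraehle/Meta-Llama-3.1-8B-Instruct-text-to-sql",
                device_map="auto",
                max_new_tokens=1000)

sql_schema = "CREATE TABLE trade_history (id INT, trader_id INT, stock VARCHAR(255), price DECIMAL(5,2), quantity INT, trade_time TIMESTAMP);"
messages = [
    {"role": "system", "content": "You are a text to SQL query translator. Given a question in English, generate a SQL query based on the provided SCHEMA. Do not generate any additional text. SCHEMA: " + sql_schema},
    {"role": "user", "content": "What is the total trade value and average price for each trader and stock in the trade_history table?"},
]

output = pipe(messages)
print(output[0]["generated_text"][-1]["content"])  # the assistant turn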
 
 
 
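Finally, gr.Interface feeds the input components to process positionally: the radio arrives as action, the first textbox as model_name, and so on down the inputs list, which is why the component order must match the function signature. A stripped-down sketch of the same wiring (hypothetical labels, no environment variables):

import gradio as gr

def process(action, model_name):
    # Inputs arrive positionally, in the order of the inputs list below.
    return f"{action}: {model_name}"

demo = gr.Interface(fn=process,
                    inputs=[gr.Radio(["Fine-tune model", "Prompt fine-tuned model"], label="Action"),
                            gr.Textbox(label="Model Name")],
                    outputs=[gr.Textbox(label="Completion")])
demo.launch()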