import spaces
import glob
import json
import os
import uuid
from datetime import datetime
from pathlib import Path

import gradio as gr
import torch
import transformers
from huggingface_hub import CommitScheduler, login, snapshot_download
from transformers import AutoTokenizer

print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

HF_TOKEN = os.getenv("HF_TOKEN")
login(HF_TOKEN)

# Load the model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, add_special_tokens=True)
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",
)

# Load the model configuration
with open("model_configs.json", "r") as f:
    model_configs = json.load(f)

model_config = model_configs[model_id]

# Extract instruction template
extract_input = model_config["extract_input"]

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# Set up dataset storage
dataset_folder = Path("dataset")
dataset_folder.mkdir(exist_ok=True)


# Function to get the latest dataset file
def get_latest_dataset_file():
    if files := glob.glob(str(dataset_folder / "data_*.jsonl")):
        return max(files, key=os.path.getctime)
    return None


# Check for existing dataset and create or append to it
if latest_file := get_latest_dataset_file():
    dataset_file = Path(latest_file)
    print(f"Appending to existing dataset file: {dataset_file}")
else:
    dataset_file = dataset_folder / f"data_{uuid.uuid4()}.jsonl"
    print(f"Creating new dataset file: {dataset_file}")

# Set up CommitScheduler for dataset uploads
repo_id = "davanstrien/magpie-preference"  # Replace with your desired dataset repo
scheduler = CommitScheduler(
    repo_id=repo_id,
    repo_type="dataset",
    folder_path=dataset_folder,
    path_in_repo="data",
    every=5,  # Upload every 5 minutes
)


# Function to download existing dataset files
def download_existing_dataset():
    try:
        # hf_hub_download fetches a single file, so mirror the repo's data/
        # folder with snapshot_download and an allow pattern instead
        snapshot_path = snapshot_download(
            repo_id=repo_id, repo_type="dataset", allow_patterns="data/*.jsonl"
        )
        for file in glob.glob(os.path.join(snapshot_path, "data", "*.jsonl")):
            dest_file = dataset_folder / os.path.basename(file)
            if not dest_file.exists():
                dest_file.write_bytes(Path(file).read_bytes())
                print(f"Downloaded existing dataset file: {dest_file}")
    except Exception as e:
        print(f"Error downloading existing dataset: {e}")


# Download existing dataset files at startup
download_existing_dataset()


# Function to generate a session ID
def generate_session_id():
    return str(uuid.uuid4())


# Function to save feedback and generated data
def save_data(generated_input, generated_response, vote, session_id):
    data = {
        "timestamp": datetime.now().isoformat(),
        "prompt": generated_input,
        "completion": generated_response,
        "label": vote,
        "session_id": session_id,
    }
    # Hold the scheduler lock so a background commit never sees a partial write
    with scheduler.lock:
        with dataset_file.open("a") as f:
            f.write(json.dumps(data) + "\n")
    return "Data saved and will be uploaded to the dataset repository."
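

# Each call to save_data appends one JSON line to the dataset file. An
# illustrative record (all field values hypothetical) looks like:
#
#   {"timestamp": "2024-06-01T12:00:00.000000",
#    "prompt": "<generated instruction>",
#    "completion": "<generated response>",
#    "label": "<vote value>",
#    "session_id": "b9f1c2e4-..."}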
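

# The generation flow below follows the Magpie technique: extract_input is a
# chat template truncated immediately after the user-turn header, so sampling a
# continuation makes the model invent a plausible user instruction. For Llama 3
# the template is typically the following (assumption: the exact string is
# loaded from model_configs.json, which is not shown here):
#
#   <|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n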
@spaces.GPU
def generate_instruction_response():
    """Generate an instruction/response pair, yielding intermediate UI updates."""
    prompt_info = f"""### Generating user prompt using the template:

```
{extract_input}
```
"""
    yield (
        prompt_info,
        "",
        "",
        gr.update(interactive=False),
        gr.update(interactive=False),
        "",
        gr.update(interactive=False),
    )
    # Step 1: let the model continue the truncated template to invent a user prompt
    instruction = pipeline(
        extract_input,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=1,
        top_p=1,
    )
    # Keep only the first line of the continuation as the instruction
    sanitized_instruction = instruction[0]["generated_text"][
        len(extract_input) :
    ].split("\n")[0]
    first_step = (
        f"{prompt_info}### LLM generated instruction:\n\n{sanitized_instruction}"
    )
    yield (
        first_step + "\n\n### Generating LLM response...",
        sanitized_instruction,
        "",
        gr.update(interactive=False),
        gr.update(interactive=False),
        "",
        gr.update(interactive=False),
    )
    # Step 2: wrap the sampled instruction in a full user turn and generate the
    # assistant response to it
    response_template = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{sanitized_instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""
    response = pipeline(
        response_template,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=1,
        top_p=1,
    )
    assistant_response = response[0]["generated_text"][len(response_template) :]
    final_output = f"""### Template used for generating instruction:

```
{extract_input}
```

### LLM Generated Instruction:

{sanitized_instruction}

### LLM Generated Response:

{assistant_response}
"""
    yield (
        final_output,
        sanitized_instruction,
        assistant_response,
        gr.update(interactive=True),
        gr.update(interactive=True),
        "",
        gr.update(interactive=True),
    )


title = """