kwabs22 committed
Commit
198db95
1 Parent(s): 659c0ce

Merge my new zerospace functions

Files changed (1)
  1. app.py +329 -108
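For context, the merged functions rely on the Hugging Face ZeroGPU pattern that appears throughout this diff: GPU work is wrapped in a function decorated with @spaces.GPU, and a small probe tensor created at import time reports 'cpu' outside the decorated call but 'cuda:0' inside it, showing when the GPU is actually attached. Below is a minimal sketch of that pattern, assuming the Qwen/Qwen2-0.5B-Instruct checkpoint listed in the diff; the generate helper is illustrative and not the exact code added in this commit.

import spaces  # Hugging Face ZeroGPU helper
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

zero = torch.Tensor([0]).cuda()  # probe tensor; .device reports 'cpu' at import time
print(zero.device)

# Illustrative checkpoint (one of the entries in the diff's modelnames list)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

@spaces.GPU  # a GPU is attached only for the duration of this call
def generate(prompt):
    print(zero.device)  # reports 'cuda:0' inside the decorated function
    inputs = tokenizer(prompt, return_tensors="pt").to(zero.device)
    output_ids = model.generate(inputs.input_ids, max_new_tokens=64)
    return tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)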
app.py CHANGED
@@ -14,15 +14,20 @@ import psutil
14
  from sentence_transformers import SentenceTransformer
15
  import textwrap
16
  from gradio_client import Client
17
 
18
  #Imported Long Variables - comment for each move to search
19
  from relatively_constant_variables import *
20
 
21
- # # Initialize the zero tensor on CUDA
22
  # zero = torch.Tensor([0]).cuda()
23
  # print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
24
 
25
- # # Load the model and tokenizer
26
  # llmguide_model = AutoModelForCausalLM.from_pretrained(
27
  # "Qwen/Qwen2-0.5B-Instruct",
28
  # torch_dtype="auto",
@@ -30,10 +35,25 @@ from relatively_constant_variables import *
30
  # )
31
  # llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
32
 
33
  # @spaces.GPU
34
- # def llmguide_generate_response(prompt, stream=False):
35
  # print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
36
-
37
  # messages = [
38
  # {"role": "system", "content": "You are a helpful assistant."},
39
  # {"role": "user", "content": prompt}
@@ -43,7 +63,7 @@ from relatively_constant_variables import *
43
  # tokenize=False,
44
  # add_generation_prompt=True
45
  # )
46
- # model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(zero.device)
47
 
48
  # start_time = time.time()
49
  # total_tokens = 0
@@ -65,7 +85,10 @@ from relatively_constant_variables import *
65
  # total_tokens += 1
66
  # current_time = time.time()
67
  # tokens_per_second = total_tokens / (current_time - start_time)
68
- # yield generated_text, f"{tokens_per_second:.2f}"
69
  # else:
70
  # generated_ids = llmguide_model.generate(
71
  # model_inputs.input_ids,
@@ -78,27 +101,117 @@ from relatively_constant_variables import *
78
  # total_tokens = len(generated_ids[0])
79
  # end_time = time.time()
80
  # tokens_per_second = total_tokens / (end_time - start_time)
81
- # yield response, f"{tokens_per_second:.2f}"
82
 
 
83
 
84
- #---------
85
 
86
- # # Initialize the zero tensor on CUDA
87
  zero = torch.Tensor([0]).cuda()
88
  print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
89
 
90
  # Load the embedding model
91
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
92
 
93
- # Load the Qwen model and tokenizer
94
- llmguide_model = AutoModelForCausalLM.from_pretrained(
95
- "Qwen/Qwen2-0.5B-Instruct",
96
- torch_dtype="auto",
97
- device_map="auto"
98
- )
99
- llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
100
 
101
- #import knowledge_base from relatively_constant_variables
102
 
103
  # Create embeddings for the knowledge base
104
  knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])
@@ -113,84 +226,144 @@ def get_ram_usage():
113
  ram = psutil.virtual_memory()
114
  return f"RAM Usage: {ram.percent:.2f}%, Available: {ram.available / (1024 ** 3):.2f}GB, Total: {ram.total / (1024 ** 3):.2f}GB"
115
 
116
  @spaces.GPU
117
- def llmguide_generate_response(prompt, doc_ids=None, stream=False):
118
  print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
119
-
120
  messages = [
121
  {"role": "system", "content": "You are a helpful assistant."},
122
- {"role": "user", "content": prompt}
123
  ]
124
- text = llmguide_tokenizer.apply_chat_template(
125
  messages,
126
  tokenize=False,
127
  add_generation_prompt=True
128
  )
129
- model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(llmguide_model.device)
130
-
131
  start_time = time.time()
132
  total_tokens = 0
133
 
134
  if stream:
135
- streamer = TextIteratorStreamer(llmguide_tokenizer, skip_special_tokens=True)
136
  generation_kwargs = dict(
137
  model_inputs,
138
  streamer=streamer,
139
  max_new_tokens=512,
140
  temperature=0.7,
141
  )
142
- thread = Thread(target=llmguide_model.generate, kwargs=generation_kwargs)
143
  thread.start()
144
-
145
- generated_text = ""
146
  for new_text in streamer:
147
- generated_text += new_text
148
  total_tokens += 1
149
  current_time = time.time()
150
  tokens_per_second = total_tokens / (current_time - start_time)
151
- yield generated_text, f"{tokens_per_second:.2f}", "", ", ".join(doc_ids) if doc_ids else "N/A"
152
-
153
- ram_usage = get_ram_usage()
154
- yield generated_text, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
155
  else:
156
- generated_ids = llmguide_model.generate(
157
  model_inputs.input_ids,
158
  max_new_tokens=512
159
  )
160
  generated_ids = [
161
  output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
162
  ]
163
- response = llmguide_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
164
  total_tokens = len(generated_ids[0])
165
  end_time = time.time()
166
  tokens_per_second = total_tokens / (end_time - start_time)
167
  ram_usage = get_ram_usage()
168
- yield response, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
169
-
170
- def process_query(query, use_rag, stream=False):
171
- if use_rag:
172
- retrieved_docs = retrieve(query)
173
- context = " ".join([doc for doc, _ in retrieved_docs])
174
- doc_ids = [doc_id for _, doc_id in retrieved_docs]
175
- prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
176
- else:
177
- prompt = query
178
- doc_ids = None
179
-
180
- generator = llmguide_generate_response(prompt, doc_ids, stream)
181
-
182
- if stream:
183
- def stream_output():
184
- for generated_text, tokens_per_second, ram_usage, doc_references in generator:
185
- yield generated_text, tokens_per_second, ram_usage, doc_references
186
- return stream_output()
187
  else:
188
- # For non-streaming, we just need to get the final output
189
- for generated_text, tokens_per_second, ram_usage, doc_references in generator:
190
- pass # This will iterate to the last yield
191
- return generated_text, tokens_per_second, ram_usage, doc_references
192
-
193
- #importing FAQAllprompts from relatively_constant_variables
194
 
195
  #--------------------------------------------------------------------------------------------------------------------------------
196
 
@@ -781,16 +954,22 @@ def LinPEWFformat_prompt(current_prompt, prev_messages):
781
  #-----------------------------------------------------------------------------------------------------------------------------------
782
 
783
  def TestGradioClientQwen270b(text):
784
- client = Client("Qwen/Qwen2-72B-Instruct")
785
  result = client.predict(
786
- query=text, #"Hello!!",
787
- history=[],
788
- system="You are a helpful assistant.",
789
- api_name="/model_chat"
790
  )
 
791
  #print(result[1][0]) #All messages in the conversation
792
  #print(result[2]) # System prompt
793
- return result[1][0][1] # If supporting conversations this needs to return the last message instead
794
 
795
  #-----------------------------------------------------------------------------------------------------------------------------------
796
 
@@ -819,30 +998,30 @@ with gr.Blocks() as demo:
819
  llmguide_output = gr.Textbox(lines=10, label="Generated Response")
820
  llmguide_tokens_per_second = gr.Textbox(label="Tokens per Second")
821
 
822
- llmguide_submit_button.click(
823
- llmguide_generate_response,
824
- inputs=[llmguide_prompt, llmguide_stream_checkbox],
825
- outputs=[llmguide_output, llmguide_tokens_per_second],
826
- )
827
  with gr.Tab("General RAG (Pathfinder?) Attempt"):
828
  gr.HTML("https://huggingface.co/spaces/mteb/leaderboard - Source for SOTA - currently using all-MiniLM-L6-v2")
829
  gr.HTML("Placeholder for weak RAG Type Character interaction test aka input for JSON 'Knowledge Base' Input")
830
- gr.Interface(
831
- fn=process_query,
832
- inputs=[
833
- gr.Textbox(lines=2, placeholder="Enter your question here..."),
834
- gr.Checkbox(label="Use RAG"),
835
- gr.Checkbox(label="Stream output")
836
- ],
837
- outputs=[
838
- gr.Textbox(label="Generated Response"),
839
- gr.Textbox(label="Tokens per second"),
840
- gr.Textbox(label="RAM Usage"),
841
- gr.Textbox(label="Referenced Documents")
842
- ],
843
- title="RAG/Non-RAG Q&A System",
844
- description="Ask a question with or without using RAG. The response is generated using a GPU-accelerated model. RAM usage and referenced document IDs (for RAG) are logged."
845
- )
846
  with gr.Tab("General FAQ Attempt"):
847
  with gr.Tab("Front end as FAQ"):
848
  FAQMainOutput = gr.TextArea(placeholder='Output will show here', value='')
@@ -853,17 +1032,59 @@ with gr.Blocks() as demo:
853
  with gr.Group():
854
  for index, (prompt, _) in enumerate(category_prompts):
855
  button = gr.Button(prompt)
856
- button.click(llmguide_generate_response, inputs=[FAQCustomButtonInput, gr.State(index), gr.State(category_name)], outputs=FAQMainOutput)
857
  with gr.Tab("Function Call as FAQ"):
858
  gr.HTML("Placeholder for media task query routing as dual purpose in workflow and for user queries as pseudo RAG engine")
859
  gr.HTML("https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#built-in-tooling - The three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt")
860
  with gr.Tab("Hugging Chat"):
861
  gr.HTML("https://huggingface.co/chat<br>Huggingface chat supports - State Management (Threads), Image Generation and editing, Websearch, Document parsing (PDF?), Assistants and larger models than zero gpu can support in July 2024 (Unquantised 30B and above)")
862
  gr.HTML("Existing Assistants to use and planning custom assistants placeholder")
863
  with gr.Tab("Embedded Spaces and gradio client"):
864
  gr.HTML("In Asset Generation Tab under Text")
865
  with gr.Tab("Gradio Client"):
866
- gr.Interface(fn=TestGradioClientQwen270b, inputs="text", outputs="markdown", description="Single response test of gradio client - Qwen/Qwen2-72B-Instruct, Use for testing like using a space and duplicate for heavy testing")
867
  with gr.Tab("Preview APIs"):
868
  gr.HTML("July 2024 - Gemini, Cohere and Groq rate limit free APIs")
869
  gr.Markdown("# Current Workflow = Mermaid Diagram to (1) Story to (2) Initial JSON (through LLM and fix JSON by hand) to JSON Corrections (through LLM and fix JSON by hand) to (4) Media prompts to (5) Asset Generation to (6) JSON Media field population")
@@ -888,7 +1109,7 @@ with gr.Blocks() as demo:
888
  gr.HTML("Model switch across modalities order to complete eg. ")
889
  with gr.Tab("Asset Generation to (6) JSON Media field population"):
890
  gr.HTML("Model switch across modalities order to complete eg. ")
891
- with gr.Tab("New Config Proto Assist"):
892
  gr.HTML("Trying to abstract the process into one workflow is beyond me so multiple paths to goal (config) is the aim now")
893
  with gr.Tab("Branching - Decisions / Timeline Creation to Story to Config Conversation"):
894
  gr.HTML("Structures for interesting timeline progression")
@@ -955,28 +1176,28 @@ with gr.Blocks() as demo:
955
  # input = gr.State(item)
956
  # output = gr.Textbox("", label=item)
957
  # outputbtn = gr.Button(item).click(fn=llmguide_generate_response, inputs=input, outputs=output)
958
- for i, item in enumerate(Storycraftprompts, 1):
959
- input = gr.State(item)
960
- previous_input = gr.State(lambda: LinPEWFprevious_messages)
961
- output = gr.Textbox("", label=f"Output {i}")
962
 
963
- def LinPEWF_update_and_generate(prompt, prev_msgs):
964
- prev_msgs.append(prompt)
965
- formatted_prompt = LinPEWFformat_prompt(prompt, prev_msgs)
966
- response = llmguide_generate_response(formatted_prompt)
967
- full_response = ""
968
- for chunk in response:
969
- full_response += chunk
970
- prev_msgs.append(f"Response: {full_response}")
971
- return full_response
972
 
973
- outputbtn = gr.Button(f"Generate {i}").click(
974
- fn=LinPEWF_update_and_generate,
975
- inputs=[input, previous_input],
976
- outputs=output
977
- )
978
 
979
- LinPEWFprevious_messages.append(item)
980
 
981
  #with gr.Accordion("Decisions / Timeline Creation to Story to Config Conversation", open=False):
982
  with gr.Tab("Branching - Network analysis to Game config"):
 
14
  from sentence_transformers import SentenceTransformer
15
  import textwrap
16
  from gradio_client import Client
17
+ import gc
18
+ import sys
19
 
20
  #Imported Long Variables - comment for each move to search
21
  from relatively_constant_variables import *
22
 
23
+ # # # Initialize the zero tensor on CUDA
24
  # zero = torch.Tensor([0]).cuda()
25
  # print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
26
 
27
+ # # Load the embedding model
28
+ # embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
29
+
30
+ # # Load the Qwen model and tokenizer
31
  # llmguide_model = AutoModelForCausalLM.from_pretrained(
32
  # "Qwen/Qwen2-0.5B-Instruct",
33
  # torch_dtype="auto",
 
35
  # )
36
  # llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
37
 
38
+ # #import knowledge_base from relatively_constant_variables
39
+
40
+ # # Create embeddings for the knowledge base
41
+ # knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])
42
+
43
+ # def retrieve(query, k=2):
44
+ # query_embedding = embedding_model.encode([query])
45
+ # similarities = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(knowledge_base_embeddings))
46
+ # top_k_indices = similarities.argsort(descending=True)[:k]
47
+ # return [(knowledge_base[i]["content"], knowledge_base[i]["id"]) for i in top_k_indices]
48
+
49
+ # def get_ram_usage():
50
+ # ram = psutil.virtual_memory()
51
+ # return f"RAM Usage: {ram.percent:.2f}%, Available: {ram.available / (1024 ** 3):.2f}GB, Total: {ram.total / (1024 ** 3):.2f}GB"
52
+
53
  # @spaces.GPU
54
+ # def llmguide_generate_response(prompt, doc_ids=None, stream=False):
55
  # print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
56
+
57
  # messages = [
58
  # {"role": "system", "content": "You are a helpful assistant."},
59
  # {"role": "user", "content": prompt}
 
63
  # tokenize=False,
64
  # add_generation_prompt=True
65
  # )
66
+ # model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(llmguide_model.device)
67
 
68
  # start_time = time.time()
69
  # total_tokens = 0
 
85
  # total_tokens += 1
86
  # current_time = time.time()
87
  # tokens_per_second = total_tokens / (current_time - start_time)
88
+ # yield generated_text, f"{tokens_per_second:.2f}", "", ", ".join(doc_ids) if doc_ids else "N/A"
89
+
90
+ # ram_usage = get_ram_usage()
91
+ # yield generated_text, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
92
  # else:
93
  # generated_ids = llmguide_model.generate(
94
  # model_inputs.input_ids,
 
101
  # total_tokens = len(generated_ids[0])
102
  # end_time = time.time()
103
  # tokens_per_second = total_tokens / (end_time - start_time)
104
+ # ram_usage = get_ram_usage()
105
+ # yield response, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
106
+
107
+ # def process_query(query, use_rag, stream=False):
108
+ # if use_rag:
109
+ # retrieved_docs = retrieve(query)
110
+ # context = " ".join([doc for doc, _ in retrieved_docs])
111
+ # doc_ids = [doc_id for _, doc_id in retrieved_docs]
112
+ # prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
113
+ # else:
114
+ # prompt = query
115
+ # doc_ids = None
116
+
117
+ # generator = llmguide_generate_response(prompt, doc_ids, stream)
118
+
119
+ # if stream:
120
+ # def stream_output():
121
+ # for generated_text, tokens_per_second, ram_usage, doc_references in generator:
122
+ # yield generated_text, tokens_per_second, ram_usage, doc_references
123
+ # return stream_output()
124
+ # else:
125
+ # # For non-streaming, we just need to get the final output
126
+ # for generated_text, tokens_per_second, ram_usage, doc_references in generator:
127
+ # pass # This will iterate to the last yield
128
+ # return generated_text, tokens_per_second, ram_usage, doc_references
129
 
130
+ #importing FAQAllprompts from relatively_constant_variables
131
 
132
+ #----Refactor-----
133
 
134
+ # Initialize the zero tensor on CUDA
135
  zero = torch.Tensor([0]).cuda()
136
  print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
137
 
138
+ modelnames = ["Qwen/Qwen2-0.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-MoE-A2.7B-Chat", "HuggingFaceTB/SmolLM-135M-Instruct", "microsoft/Phi-3-mini-4k-instruct",
139
+ "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", "Groq/Llama-3-Groq-8B-Tool-Use", "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4", "SpectraSuite/TriLM_3.9B_Unpacked", "h2oai/h2o-danube3-500m-chat",
140
+ "OuteAI/Lite-Mistral-150M-v2-Instruct"]
141
+ current_model_index = 5
142
+ modelname = modelnames[current_model_index]
143
+ lastmodelnameinloadfunction = None
144
+
145
  # Load the embedding model
146
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
147
 
148
+ # Initialize model and tokenizer as global variables
149
+ model = None
150
+ tokenizer = None
151
+
152
+ def get_size_str(bytes):
153
+ for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
154
+ if bytes < 1024:
155
+ return f"{bytes:.2f} {unit}"
156
+ bytes /= 1024
157
 
158
+ def load_model(model_name):
159
+ global model, tokenizer, lastmodelnameinloadfunction
160
+
161
+ print(f"Loading model and tokenizer: {model_name}")
162
+
163
+ # Record initial GPU memory usage
164
+ initial_memory = torch.cuda.memory_allocated()
165
+
166
+ # Clear old model and tokenizer if they exist
167
+ if 'model' in globals() and model is not None:
168
+ model = None
169
+ if 'tokenizer' in globals() and tokenizer is not None:
170
+ tokenizer = None
171
+
172
+ torch.cuda.empty_cache()
173
+ gc.collect()
174
+
175
+ model = AutoModelForCausalLM.from_pretrained(
176
+ model_name,
177
+ torch_dtype="auto",
178
+ device_map="auto"
179
+ )
180
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
181
+
182
+ # Calculate memory usage
183
+ final_memory = torch.cuda.memory_allocated()
184
+ memory_used = final_memory - initial_memory
185
+
186
+ model_size = sum(p.numel() * p.element_size() for p in model.parameters())
187
+ tokenizer_size = sum(sys.getsizeof(v) for v in tokenizer.__dict__.values())
188
+
189
+ lastmodelnameinloadfunction = (model_name, model_size, tokenizer_size)
190
+ print(f"Model and tokenizer {model_name} loaded successfully")
191
+ print(f"Model size: {get_size_str(model_size)}")
192
+ print(f"Tokenizer size: {get_size_str(tokenizer_size)}")
193
+ print(f"GPU memory used: {get_size_str(memory_used)}")
194
+
195
+ return (f"Model and tokenizer {model_name} loaded successfully. "
196
+ f"Model size: {get_size_str(model_size)}, "
197
+ f"Tokenizer size: {get_size_str(tokenizer_size)}, "
198
+ f"GPU memory used: {get_size_str(memory_used)}")
199
+
200
+
201
+ # Initial model load
202
+ load_model(modelname)
203
+
204
+ # For this example, let's use a knowledge base with close queries
205
+ # knowledge_base = [
206
+ # {"id": "1", "content": "The capital of France is Paris. It's known for the Eiffel Tower."},
207
+ # {"id": "2", "content": "The capital of Italy is Rome. It's famous for the Colosseum."},
208
+ # {"id": "3", "content": "Python is a popular programming language, known for its simplicity."},
209
+ # {"id": "4", "content": "Java is a widely-used programming language, valued for its portability."},
210
+ # {"id": "5", "content": "Machine learning is a subset of artificial intelligence focused on data-driven learning."},
211
+ # {"id": "6", "content": "Deep learning is a part of machine learning based on artificial neural networks."},
212
+ # {"id": "7", "content": "Law is a Tekken character"},
213
+ # {"id": "8", "content": "The law is very complicated"},
214
+ # ]
215
 
216
  # Create embeddings for the knowledge base
217
  knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])
 
226
  ram = psutil.virtual_memory()
227
  return f"RAM Usage: {ram.percent:.2f}%, Available: {ram.available / (1024 ** 3):.2f}GB, Total: {ram.total / (1024 ** 3):.2f}GB"
228
 
229
+ # Global dictionary to store outputs
230
+ output_dict = {}
231
+
232
+ def empty_output_dict():
233
+ global output_dict
234
+ output_dict = {}
235
+ print("Output dictionary has been emptied.")
236
+
237
+ def get_model_details(model):
238
+ return {
239
+ "name": model.config.name_or_path,
240
+ "architecture": model.config.architectures[0] if model.config.architectures else "Unknown",
241
+ "num_parameters": sum(p.numel() for p in model.parameters()),
242
+ }
243
+
244
+ def get_tokenizer_details(tokenizer):
245
+ return {
246
+ "name": tokenizer.__class__.__name__,
247
+ "vocab_size": tokenizer.vocab_size,
248
+ "model_max_length": tokenizer.model_max_length,
249
+ }
250
+
251
  @spaces.GPU
252
+ def generate_response(prompt, use_rag, stream=False):
253
+ global output_dict
254
+
255
  print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
256
+ if use_rag:
257
+ retrieved_docs = retrieve(prompt)
258
+ context = " ".join([doc for doc, _ in retrieved_docs])
259
+ doc_ids = [doc_id for _, doc_id in retrieved_docs]
260
+ full_prompt = f"Context: {context}\nQuestion: {prompt}\nAnswer:"
261
+ else:
262
+ full_prompt = prompt
263
+ doc_ids = None
264
  messages = [
265
  {"role": "system", "content": "You are a helpful assistant."},
266
+ {"role": "user", "content": full_prompt}
267
  ]
268
+ text = tokenizer.apply_chat_template(
269
  messages,
270
  tokenize=False,
271
  add_generation_prompt=True
272
  )
273
+ model_inputs = tokenizer([text], return_tensors="pt").to(zero.device)
 
274
  start_time = time.time()
275
  total_tokens = 0
276
+
277
+ print(output_dict)
278
+ output_key = f"output_{len(output_dict) + 1}"
279
+ print(output_key)
280
+ output_dict[output_key] = {
281
+ "input_prompt": prompt,
282
+ "full_prompt": full_prompt,
283
+ "use_rag": use_rag,
284
+ "generated_text": "",
285
+ "tokens_per_second": 0,
286
+ "ram_usage": "",
287
+ "doc_ids": doc_ids if doc_ids else "N/A",
288
+ "model_details": get_model_details(model),
289
+ "tokenizer_details": get_tokenizer_details(tokenizer),
290
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))
291
+ }
292
+ print(output_dict)
293
 
294
  if stream:
295
+ streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
296
  generation_kwargs = dict(
297
  model_inputs,
298
  streamer=streamer,
299
  max_new_tokens=512,
300
  temperature=0.7,
301
  )
302
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
303
  thread.start()
304
  for new_text in streamer:
305
+ output_dict[output_key]["generated_text"] += new_text
306
  total_tokens += 1
307
  current_time = time.time()
308
  tokens_per_second = total_tokens / (current_time - start_time)
309
+ ram_usage = get_ram_usage()
310
+ output_dict[output_key]["tokens_per_second"] = f"{tokens_per_second:.2f}"
311
+ output_dict[output_key]["ram_usage"] = ram_usage
312
+ yield (output_dict[output_key]["generated_text"],
313
+ output_dict[output_key]["tokens_per_second"],
314
+ output_dict[output_key]["ram_usage"],
315
+ output_dict[output_key]["doc_ids"])
316
  else:
317
+ generated_ids = model.generate(
318
  model_inputs.input_ids,
319
  max_new_tokens=512
320
  )
321
  generated_ids = [
322
  output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
323
  ]
324
+ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
325
  total_tokens = len(generated_ids[0])
326
  end_time = time.time()
327
  tokens_per_second = total_tokens / (end_time - start_time)
328
  ram_usage = get_ram_usage()
329
+
330
+ output_dict[output_key]["generated_text"] = response
331
+ output_dict[output_key]["tokens_per_second"] = f"{tokens_per_second:.2f}"
332
+ output_dict[output_key]["ram_usage"] = ram_usage
333
+ print(output_dict)
334
+
335
+ yield (output_dict[output_key]["generated_text"],
336
+ output_dict[output_key]["tokens_per_second"],
337
+ output_dict[output_key]["ram_usage"],
338
+ output_dict[output_key]["doc_ids"])
339
+
340
+ def get_output_details(output_key):
341
+ if output_key in output_dict:
342
+ return output_dict[output_key]
343
  else:
344
+ return f"No output found for key: {output_key}"
345
+
346
+ # Update the switch_model function to return the load_model message
347
+ def switch_model(choice):
348
+ global modelname
349
+ modelname = choice
350
+ load_message = load_model(modelname)
351
+ return load_message, f"Current model: {modelname}"
352
+
353
+ # Update the model_change_handler function
354
+ def model_change_handler(choice):
355
+ message, current_model = switch_model(choice)
356
+ return message, current_model, message # Use the same message for both outputs
357
+
358
+ def format_output_dict():
359
+ global output_dict
360
+ formatted_output = ""
361
+ for key, value in output_dict.items():
362
+ formatted_output += f"Key: {key}\n"
363
+ formatted_output += json.dumps(value, indent=2)
364
+ formatted_output += "\n\n"
365
+ print(formatted_output)
366
+ return formatted_output
367
 
368
  #--------------------------------------------------------------------------------------------------------------------------------
369
 
 
954
  #-----------------------------------------------------------------------------------------------------------------------------------
955
 
956
  def TestGradioClientQwen270b(text):
957
+ # client = Client("Qwen/Qwen2-72B-Instruct")
958
+ # result = client.predict(
959
+ # query=text, #"Hello!!",
960
+ # history=[],
961
+ # system="You are a helpful assistant.",
962
+ # api_name="/model_chat"
963
+ # )
964
+ client = Client("CohereForAI/c4ai-command-r-v01")
965
  result = client.predict(
966
+ user_message=text, #"Hello!!",
967
 
968
  )
969
+ print(result)
970
  #print(result[1][0]) #All messages in the conversation
971
  #print(result[2]) # System prompt
972
+ return result #result[1][0][1] # If supporting conversations this needs to return the last message instead
973
 
974
  #-----------------------------------------------------------------------------------------------------------------------------------
975
 
 
998
  llmguide_output = gr.Textbox(lines=10, label="Generated Response")
999
  llmguide_tokens_per_second = gr.Textbox(label="Tokens per Second")
1000
 
1001
+ # llmguide_submit_button.click(
1002
+ # llmguide_generate_response,
1003
+ # inputs=[llmguide_prompt, llmguide_stream_checkbox],
1004
+ # outputs=[llmguide_output, llmguide_tokens_per_second],
1005
+ # )
1006
  with gr.Tab("General RAG (Pathfinder?) Attempt"):
1007
  gr.HTML("https://huggingface.co/spaces/mteb/leaderboard - Source for SOTA - currently using all-MiniLM-L6-v2")
1008
  gr.HTML("Placeholder for weak RAG Type Character interaction test aka input for JSON 'Knowledge Base' Input")
1009
+ # gr.Interface(
1010
+ # fn=process_query,
1011
+ # inputs=[
1012
+ # gr.Textbox(lines=2, placeholder="Enter your question here..."),
1013
+ # gr.Checkbox(label="Use RAG"),
1014
+ # gr.Checkbox(label="Stream output")
1015
+ # ],
1016
+ # outputs=[
1017
+ # gr.Textbox(label="Generated Response"),
1018
+ # gr.Textbox(label="Tokens per second"),
1019
+ # gr.Textbox(label="RAM Usage"),
1020
+ # gr.Textbox(label="Referenced Documents")
1021
+ # ],
1022
+ # title="RAG/Non-RAG Q&A System",
1023
+ # description="Ask a question with or without using RAG. The response is generated using a GPU-accelerated model. RAM usage and referenced document IDs (for RAG) are logged."
1024
+ # )
1025
  with gr.Tab("General FAQ Attempt"):
1026
  with gr.Tab("Front end as FAQ"):
1027
  FAQMainOutput = gr.TextArea(placeholder='Output will show here', value='')
 
1032
  with gr.Group():
1033
  for index, (prompt, _) in enumerate(category_prompts):
1034
  button = gr.Button(prompt)
1035
+ # button.click(llmguide_generate_response, inputs=[FAQCustomButtonInput, gr.State(index), gr.State(category_name)], outputs=FAQMainOutput)
1036
  with gr.Tab("Function Call as FAQ"):
1037
  gr.HTML("Placeholder for media task query routing as dual purpose in workflow and for user queries as pseudo RAG engine")
1038
  gr.HTML("https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#built-in-tooling - The three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt")
1039
+
1040
+ with gr.Tab("ZeroGPU refactor"):
1041
+ model_name = gr.State(modelname)
1042
+ gr.Markdown(f"# Language Model with RAG and Model Switching")
1043
+ gr.Markdown("This demo allows you to switch between different instruct models and use Retrieval-Augmented Generation (RAG).")
1044
+ gr.Markdown("**Note:** Model switching is intended for testing output quality. Due to GPU limitations, speed differences may not be apparent. Models requiring over 50GB to load will likely not work.")
1045
+ gr.Markdown("Need to add support for switching models and loading GGUF and GPTQ and BNB")
1046
+ gr.Markdown("57B MoE takes 6 min to load and gets the workload evicted - storage limit over 100G")
1047
+
1048
+ with gr.Row():
1049
+ with gr.Column():
1050
+ model_dropdown = gr.Dropdown(choices=modelnames, value=modelname, label="Select Model")
1051
+ current_model_info = gr.Markdown(f"Current model: {modelname}")
1052
+ current_model_info2 = gr.Interface(lambda: f"Current model: {lastmodelnameinloadfunction[0]} ({lastmodelnameinloadfunction[1]}) (tokeniser = {lastmodelnameinloadfunction[2]})", inputs=None, outputs=["markdown"], description="Check what was last loaded (As the space has memory and I haven't figured out how spaces work enough eg. how do multiple users affect this)") # gr.Markdown(f"Current model: {lastmodelnameinloadfunction}")
1053
+ gr.HTML("Need to figure out my test function calling for groq-8b as it doesn't seem to answer chat properly - will need a separate space - eg. letter counting, plural counting, using a function as a form for the llm to fill (like choosing which model and input parameters for media in game)?")
1054
+ prompt = gr.Textbox(lines=2, placeholder="Enter your prompt here...")
1055
+ stream_checkbox = gr.Checkbox(label="Enable streaming")
1056
+ rag_checkbox = gr.Checkbox(label="Enable RAG")
1057
+ submit_button = gr.Button("Generate")
1058
+
1059
+ with gr.Column():
1060
+ with gr.Tab("Current Response"):
1061
+ output = gr.Textbox(lines=10, label="Generated Response")
1062
+ tokens_per_second = gr.Textbox(label="Tokens per Second")
1063
+ ram_usage = gr.Textbox(label="RAM Usage")
1064
+ doc_references = gr.Textbox(label="Document References")
1065
+ with gr.Tab("All Responses So far"):
1066
+ gr.Markdown("As we want an iterative process, all old responses are saved for now - will figure out how to make a per-user solution - need some kind of hook onto the loading of a space to assign a random usercount with timestamp")
1067
+ gr.Interface(format_output_dict, inputs=None, outputs=["textbox"])
1068
+
1069
+
1070
+ model_dropdown.change(
1071
+ model_change_handler,
1072
+ inputs=[model_dropdown],
1073
+ outputs=[model_name, current_model_info, output]
1074
+ )
1075
+
1076
+ submit_button.click(
1077
+ generate_response,
1078
+ inputs=[prompt, rag_checkbox, stream_checkbox],
1079
+ outputs=[output, tokens_per_second, ram_usage, doc_references],
1080
+ )
1081
  with gr.Tab("Hugging Chat"):
1082
  gr.HTML("https://huggingface.co/chat<br>Huggingface chat supports - State Management (Threads), Image Generation and editing, Websearch, Document parsing (PDF?), Assistants and larger models than zero gpu can support in July 2024 (Unquantised 30B and above)")
1083
  gr.HTML("Existing Assistants to use and planning custom assistants placeholder")
1084
  with gr.Tab("Embedded Spaces and gradio client"):
1085
  gr.HTML("In Asset Generation Tab under Text")
1086
  with gr.Tab("Gradio Client"):
1087
+ gr.Interface(fn=TestGradioClientQwen270b, inputs="text", outputs="markdown", description="Single response test of gradio client - Cohere used for the test as the API is not working on Qwen/Qwen2-72B-Instruct. Use for testing like using a space, and duplicate for heavy testing")
1088
  with gr.Tab("Preview APIs"):
1089
  gr.HTML("July 2024 - Gemini, Cohere and Groq rate limit free APIs")
1090
  gr.Markdown("# Current Workflow = Mermaid Diagram to (1) Story to (2) Initial JSON (through LLM and fix JSON by hand) to JSON Corrections (through LLM and fix JSON by hand) to (4) Media prompts to (5) Asset Generation to (6) JSON Media field population")
 
1109
  gr.HTML("Model switch across modalities order to complete eg. ")
1110
  with gr.Tab("Asset Generation to (6) JSON Media field population"):
1111
  gr.HTML("Model switch across modalities order to complete eg. ")
1112
+ with gr.Tab("All Variations / Themes for entire workflow process - to be completed"):
1113
  gr.HTML("Trying to abstract the process into one workflow is beyond me so multiple paths to goal (config) is the aim now")
1114
  with gr.Tab("Branching - Decisions / Timeline Creation to Story to Config Conversation"):
1115
  gr.HTML("Structures for interesting timeline progression")
 
1176
  # input = gr.State(item)
1177
  # output = gr.Textbox("", label=item)
1178
  # outputbtn = gr.Button(item).click(fn=llmguide_generate_response, inputs=input, outputs=output)
1179
+ # for i, item in enumerate(Storycraftprompts, 1):
1180
+ # input = gr.State(item)
1181
+ # previous_input = gr.State(lambda: LinPEWFprevious_messages)
1182
+ # output = gr.Textbox("", label=f"Output {i}")
1183
 
1184
+ # def LinPEWF_update_and_generate(prompt, prev_msgs):
1185
+ # prev_msgs.append(prompt)
1186
+ # formatted_prompt = LinPEWFformat_prompt(prompt, prev_msgs)
1187
+ # response = llmguide_generate_response(formatted_prompt)
1188
+ # full_response = ""
1189
+ # for chunk in response:
1190
+ # full_response += chunk
1191
+ # prev_msgs.append(f"Response: {full_response}")
1192
+ # return full_response
1193
 
1194
+ # outputbtn = gr.Button(f"Generate {i}").click(
1195
+ # fn=LinPEWF_update_and_generate,
1196
+ # inputs=[input, previous_input],
1197
+ # outputs=output
1198
+ # )
1199
 
1200
+ # LinPEWFprevious_messages.append(item)
1201
 
1202
  #with gr.Accordion("Decisions / Timeline Creation to Story to Config Conversation", open=False):
1203
  with gr.Tab("Branching - Network analysis to Game config"):