kwabs22 committed
Commit
198db95
1 Parent(s): 659c0ce

Merge my new zerospace functions

Files changed (1)
  1. app.py +329 -108
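For context, the merged functions rely on the Hugging Face ZeroGPU pattern that appears throughout this diff: GPU work is wrapped in a function decorated with @spaces.GPU, and a small probe tensor created at import time reports 'cpu' outside the decorated call but 'cuda:0' inside it, showing when the GPU is actually attached. Below is a minimal sketch of that pattern, assuming the Qwen/Qwen2-0.5B-Instruct checkpoint listed in the diff; the generate helper is illustrative and not the exact code added in this commit.

import spaces  # Hugging Face ZeroGPU helper
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

zero = torch.Tensor([0]).cuda()  # probe tensor; .device reports 'cpu' at import time
print(zero.device)

# Illustrative checkpoint (one of the entries in the diff's modelnames list)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

@spaces.GPU  # a GPU is attached only for the duration of this call
def generate(prompt):
    print(zero.device)  # reports 'cuda:0' inside the decorated function
    inputs = tokenizer(prompt, return_tensors="pt").to(zero.device)
    output_ids = model.generate(inputs.input_ids, max_new_tokens=64)
    return tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)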
app.py CHANGED
@@ -14,15 +14,20 @@ import psutil
14
  from sentence_transformers import SentenceTransformer
15
  import textwrap
16
  from gradio_client import Client
17
 
18
  #Imported Long Variables - comment for each move to search
19
  from relatively_constant_variables import *
20
 
21
- # # Initialize the zero tensor on CUDA
22
  # zero = torch.Tensor([0]).cuda()
23
  # print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
24
 
25
- # # Load the model and tokenizer
26
  # llmguide_model = AutoModelForCausalLM.from_pretrained(
27
  # "Qwen/Qwen2-0.5B-Instruct",
28
  # torch_dtype="auto",
@@ -30,10 +35,25 @@ from relatively_constant_variables import *
30
  # )
31
  # llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
32
 
33
  # @spaces.GPU
34
- # def llmguide_generate_response(prompt, stream=False):
35
  # print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
36
-
37
  # messages = [
38
  # {"role": "system", "content": "You are a helpful assistant."},
39
  # {"role": "user", "content": prompt}
@@ -43,7 +63,7 @@ from relatively_constant_variables import *
43
  # tokenize=False,
44
  # add_generation_prompt=True
45
  # )
46
- # model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(zero.device)
47
 
48
  # start_time = time.time()
49
  # total_tokens = 0
@@ -65,7 +85,10 @@ from relatively_constant_variables import *
65
  # total_tokens += 1
66
  # current_time = time.time()
67
  # tokens_per_second = total_tokens / (current_time - start_time)
68
- # yield generated_text, f"{tokens_per_second:.2f}"
69
  # else:
70
  # generated_ids = llmguide_model.generate(
71
  # model_inputs.input_ids,
@@ -78,27 +101,117 @@ from relatively_constant_variables import *
78
  # total_tokens = len(generated_ids[0])
79
  # end_time = time.time()
80
  # tokens_per_second = total_tokens / (end_time - start_time)
81
- # yield response, f"{tokens_per_second:.2f}"
82
 
 
83
 
84
- #---------
85
 
86
- # # Initialize the zero tensor on CUDA
87
  zero = torch.Tensor([0]).cuda()
88
  print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
89
 
90
  # Load the embedding model
91
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
92
 
93
- # Load the Qwen model and tokenizer
94
- llmguide_model = AutoModelForCausalLM.from_pretrained(
95
- "Qwen/Qwen2-0.5B-Instruct",
96
- torch_dtype="auto",
97
- device_map="auto"
98
- )
99
- llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
100
 
101
- #import knowledge_base from relatively_constant_variables
102
 
103
  # Create embeddings for the knowledge base
104
  knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])
@@ -113,84 +226,144 @@ def get_ram_usage():
113
  ram = psutil.virtual_memory()
114
  return f"RAM Usage: {ram.percent:.2f}%, Available: {ram.available / (1024 ** 3):.2f}GB, Total: {ram.total / (1024 ** 3):.2f}GB"
115
 
116
  @spaces.GPU
117
- def llmguide_generate_response(prompt, doc_ids=None, stream=False):
118
  print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
119
-
120
  messages = [
121
  {"role": "system", "content": "You are a helpful assistant."},
122
- {"role": "user", "content": prompt}
123
  ]
124
- text = llmguide_tokenizer.apply_chat_template(
125
  messages,
126
  tokenize=False,
127
  add_generation_prompt=True
128
  )
129
- model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(llmguide_model.device)
130
-
131
  start_time = time.time()
132
  total_tokens = 0
133
 
134
  if stream:
135
- streamer = TextIteratorStreamer(llmguide_tokenizer, skip_special_tokens=True)
136
  generation_kwargs = dict(
137
  model_inputs,
138
  streamer=streamer,
139
  max_new_tokens=512,
140
  temperature=0.7,
141
  )
142
- thread = Thread(target=llmguide_model.generate, kwargs=generation_kwargs)
143
  thread.start()
144
-
145
- generated_text = ""
146
  for new_text in streamer:
147
- generated_text += new_text
148
  total_tokens += 1
149
  current_time = time.time()
150
  tokens_per_second = total_tokens / (current_time - start_time)
151
- yield generated_text, f"{tokens_per_second:.2f}", "", ", ".join(doc_ids) if doc_ids else "N/A"
152
-
153
- ram_usage = get_ram_usage()
154
- yield generated_text, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
155
  else:
156
- generated_ids = llmguide_model.generate(
157
  model_inputs.input_ids,
158
  max_new_tokens=512
159
  )
160
  generated_ids = [
161
  output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
162
  ]
163
- response = llmguide_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
164
  total_tokens = len(generated_ids[0])
165
  end_time = time.time()
166
  tokens_per_second = total_tokens / (end_time - start_time)
167
  ram_usage = get_ram_usage()
168
- yield response, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
169
-
170
- def process_query(query, use_rag, stream=False):
171
- if use_rag:
172
- retrieved_docs = retrieve(query)
173
- context = " ".join([doc for doc, _ in retrieved_docs])
174
- doc_ids = [doc_id for _, doc_id in retrieved_docs]
175
- prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
176
- else:
177
- prompt = query
178
- doc_ids = None
179
-
180
- generator = llmguide_generate_response(prompt, doc_ids, stream)
181
-
182
- if stream:
183
- def stream_output():
184
- for generated_text, tokens_per_second, ram_usage, doc_references in generator:
185
- yield generated_text, tokens_per_second, ram_usage, doc_references
186
- return stream_output()
187
  else:
188
- # For non-streaming, we just need to get the final output
189
- for generated_text, tokens_per_second, ram_usage, doc_references in generator:
190
- pass # This will iterate to the last yield
191
- return generated_text, tokens_per_second, ram_usage, doc_references
192
-
193
- #importing FAQAllprompts from relatively_constant_variables
194
 
195
  #--------------------------------------------------------------------------------------------------------------------------------
196
 
@@ -781,16 +954,22 @@ def LinPEWFformat_prompt(current_prompt, prev_messages):
781
  #-----------------------------------------------------------------------------------------------------------------------------------
782
 
783
  def TestGradioClientQwen270b(text):
784
- client = Client("Qwen/Qwen2-72B-Instruct")
785
  result = client.predict(
786
- query=text, #"Hello!!",
787
- history=[],
788
- system="You are a helpful assistant.",
789
- api_name="/model_chat"
790
  )
 
791
  #print(result[1][0]) #All messages in the conversation
792
  #print(result[2]) # System prompt
793
- return result[1][0][1] # If supporting conversations this needs to return the last message instead
794
 
795
  #-----------------------------------------------------------------------------------------------------------------------------------
796
 
@@ -819,30 +998,30 @@ with gr.Blocks() as demo:
819
  llmguide_output = gr.Textbox(lines=10, label="Generated Response")
820
  llmguide_tokens_per_second = gr.Textbox(label="Tokens per Second")
821
 
822
- llmguide_submit_button.click(
823
- llmguide_generate_response,
824
- inputs=[llmguide_prompt, llmguide_stream_checkbox],
825
- outputs=[llmguide_output, llmguide_tokens_per_second],
826
- )
827
  with gr.Tab("General RAG (Pathfinder?) Attempt"):
828
  gr.HTML("https://huggingface.co/spaces/mteb/leaderboard - Source for SOTA - currently using all-MiniLM-L6-v2")
829
  gr.HTML("Placeholder for weak RAG Type Character interaction test aka input for JSON 'Knowledge Base' Input")
830
- gr.Interface(
831
- fn=process_query,
832
- inputs=[
833
- gr.Textbox(lines=2, placeholder="Enter your question here..."),
834
- gr.Checkbox(label="Use RAG"),
835
- gr.Checkbox(label="Stream output")
836
- ],
837
- outputs=[
838
- gr.Textbox(label="Generated Response"),
839
- gr.Textbox(label="Tokens per second"),
840
- gr.Textbox(label="RAM Usage"),
841
- gr.Textbox(label="Referenced Documents")
842
- ],
843
- title="RAG/Non-RAG Q&A System",
844
- description="Ask a question with or without using RAG. The response is generated using a GPU-accelerated model. RAM usage and referenced document IDs (for RAG) are logged."
845
- )
846
  with gr.Tab("General FAQ Attempt"):
847
  with gr.Tab("Front end as FAQ"):
848
  FAQMainOutput = gr.TextArea(placeholder='Output will show here', value='')
@@ -853,17 +1032,59 @@ with gr.Blocks() as demo:
853
  with gr.Group():
854
  for index, (prompt, _) in enumerate(category_prompts):
855
  button = gr.Button(prompt)
856
- button.click(llmguide_generate_response, inputs=[FAQCustomButtonInput, gr.State(index), gr.State(category_name)], outputs=FAQMainOutput)
857
  with gr.Tab("Function Call as FAQ"):
858
  gr.HTML("Placeholder for media task query routing as dual purpose in workflow and for user queries as pseudo RAG engine")
859
  gr.HTML("https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#built-in-tooling - The three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt")
860
  with gr.Tab("Hugging Chat"):
861
  gr.HTML("https://huggingface.co/chat<br>Huggingface chat supports - State Management (Threads), Image Generation and editing, Websearch, Document parsing (PDF?), Assistants and larger models than zero gpu can support in July 2024 (Unquantised 30B and above)")
862
  gr.HTML("Existing Assistants to use and planning custom assistants placeholder")
863
  with gr.Tab("Embedded Spaces and gradio client"):
864
  gr.HTML("In Asset Generation Tab under Text")
865
  with gr.Tab("Gradio Client"):
866
- gr.Interface(fn=TestGradioClientQwen270b, inputs="text", outputs="markdown", description="Single response test of gradio client - Qwen/Qwen2-72B-Instruct, Use for testing like using a space and duplicate for heavy testing")
867
  with gr.Tab("Preview APIs"):
868
  gr.HTML("July 2024 - Gemini, Cohere and Groq rate limit free APIs")
869
  gr.Markdown("# Current Workflow = Mermaid Diagram to (1) Story to (2) Initial JSON (through LLM and fix JSON by hand) to JSON Corrections (through LLM and fix JSON by hand) to (4) Media prompts to (5) Asset Generation to (6) JSON Media field population")
@@ -888,7 +1109,7 @@ with gr.Blocks() as demo:
888
  gr.HTML("Model switch across modalities order to complete eg. ")
889
  with gr.Tab("Asset Generation to (6) JSON Media field population"):
890
  gr.HTML("Model switch across modalities order to complete eg. ")
891
- with gr.Tab("New Config Proto Assist"):
892
  gr.HTML("Trying to abstract the process into one workflow is beyond me so multiple paths to goal (config) is the aim now")
893
  with gr.Tab("Branching - Decisions / Timeline Creation to Story to Config Conversation"):
894
  gr.HTML("Structures for interesting timeline progression")
@@ -955,28 +1176,28 @@ with gr.Blocks() as demo:
955
  # input = gr.State(item)
956
  # output = gr.Textbox("", label=item)
957
  # outputbtn = gr.Button(item).click(fn=llmguide_generate_response, inputs=input, outputs=output)
958
- for i, item in enumerate(Storycraftprompts, 1):
959
- input = gr.State(item)
960
- previous_input = gr.State(lambda: LinPEWFprevious_messages)
961
- output = gr.Textbox("", label=f"Output {i}")
962
 
963
- def LinPEWF_update_and_generate(prompt, prev_msgs):
964
- prev_msgs.append(prompt)
965
- formatted_prompt = LinPEWFformat_prompt(prompt, prev_msgs)
966
- response = llmguide_generate_response(formatted_prompt)
967
- full_response = ""
968
- for chunk in response:
969
- full_response += chunk
970
- prev_msgs.append(f"Response: {full_response}")
971
- return full_response
972
 
973
- outputbtn = gr.Button(f"Generate {i}").click(
974
- fn=LinPEWF_update_and_generate,
975
- inputs=[input, previous_input],
976
- outputs=output
977
- )
978
 
979
- LinPEWFprevious_messages.append(item)
980
 
981
  #with gr.Accordion("Decisions / Timeline Creation to Story to Config Conversation", open=False):
982
  with gr.Tab("Branching - Network analysis to Game config"):
 
14
  from sentence_transformers import SentenceTransformer
15
  import textwrap
16
  from gradio_client import Client
17
+ import gc
18
+ import sys
19
 
20
  #Imported Long Variables - comment for each move to search
21
  from relatively_constant_variables import *
22
 
23
+ # # # Initialize the zero tensor on CUDA
24
  # zero = torch.Tensor([0]).cuda()
25
  # print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
26
 
27
+ # # Load the embedding model
28
+ # embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
29
+
30
+ # # Load the Qwen model and tokenizer
31
  # llmguide_model = AutoModelForCausalLM.from_pretrained(
32
  # "Qwen/Qwen2-0.5B-Instruct",
33
  # torch_dtype="auto",
 
35
  # )
36
  # llmguide_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
37
 
38
+ # #import knowledge_base from relatively_constant_variables
39
+
40
+ # # Create embeddings for the knowledge base
41
+ # knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])
42
+
43
+ # def retrieve(query, k=2):
44
+ # query_embedding = embedding_model.encode([query])
45
+ # similarities = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(knowledge_base_embeddings))
46
+ # top_k_indices = similarities.argsort(descending=True)[:k]
47
+ # return [(knowledge_base[i]["content"], knowledge_base[i]["id"]) for i in top_k_indices]
48
+
49
+ # def get_ram_usage():
50
+ # ram = psutil.virtual_memory()
51
+ # return f"RAM Usage: {ram.percent:.2f}%, Available: {ram.available / (1024 ** 3):.2f}GB, Total: {ram.total / (1024 ** 3):.2f}GB"
52
+
53
  # @spaces.GPU
54
+ # def llmguide_generate_response(prompt, doc_ids=None, stream=False):
55
  # print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
56
+
57
  # messages = [
58
  # {"role": "system", "content": "You are a helpful assistant."},
59
  # {"role": "user", "content": prompt}
 
63
  # tokenize=False,
64
  # add_generation_prompt=True
65
  # )
66
+ # model_inputs = llmguide_tokenizer([text], return_tensors="pt").to(llmguide_model.device)
67
 
68
  # start_time = time.time()
69
  # total_tokens = 0
 
85
  # total_tokens += 1
86
  # current_time = time.time()
87
  # tokens_per_second = total_tokens / (current_time - start_time)
88
+ # yield generated_text, f"{tokens_per_second:.2f}", "", ", ".join(doc_ids) if doc_ids else "N/A"
89
+
90
+ # ram_usage = get_ram_usage()
91
+ # yield generated_text, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
92
  # else:
93
  # generated_ids = llmguide_model.generate(
94
  # model_inputs.input_ids,
 
101
  # total_tokens = len(generated_ids[0])
102
  # end_time = time.time()
103
  # tokens_per_second = total_tokens / (end_time - start_time)
104
+ # ram_usage = get_ram_usage()
105
+ # yield response, f"{tokens_per_second:.2f}", ram_usage, ", ".join(doc_ids) if doc_ids else "N/A"
106
+
107
+ # def process_query(query, use_rag, stream=False):
108
+ # if use_rag:
109
+ # retrieved_docs = retrieve(query)
110
+ # context = " ".join([doc for doc, _ in retrieved_docs])
111
+ # doc_ids = [doc_id for _, doc_id in retrieved_docs]
112
+ # prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"
113
+ # else:
114
+ # prompt = query
115
+ # doc_ids = None
116
+
117
+ # generator = llmguide_generate_response(prompt, doc_ids, stream)
118
+
119
+ # if stream:
120
+ # def stream_output():
121
+ # for generated_text, tokens_per_second, ram_usage, doc_references in generator:
122
+ # yield generated_text, tokens_per_second, ram_usage, doc_references
123
+ # return stream_output()
124
+ # else:
125
+ # # For non-streaming, we just need to get the final output
126
+ # for generated_text, tokens_per_second, ram_usage, doc_references in generator:
127
+ # pass # This will iterate to the last yield
128
+ # return generated_text, tokens_per_second, ram_usage, doc_references
129
 
130
+ #importing FAQAllprompts from relatively_constant_variables
131
 
132
+ #----Refactor-----
133
 
134
+ # Initialize the zero tensor on CUDA
135
  zero = torch.Tensor([0]).cuda()
136
  print(zero.device) # This will print 'cpu' outside the @spaces.GPU decorated function
137
 
138
+ modelnames = ["Qwen/Qwen2-0.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-MoE-A2.7B-Chat", "HuggingFaceTB/SmolLM-135M-Instruct", "microsoft/Phi-3-mini-4k-instruct",
139
+ "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit", "Groq/Llama-3-Groq-8B-Tool-Use", "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4", "SpectraSuite/TriLM_3.9B_Unpacked", "h2oai/h2o-danube3-500m-chat",
140
+ "OuteAI/Lite-Mistral-150M-v2-Instruct"]
141
+ current_model_index = 5
142
+ modelname = modelnames[current_model_index]
143
+ lastmodelnameinloadfunction = None
144
+
145
  # Load the embedding model
146
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
147
 
148
+ # Initialize model and tokenizer as global variables
149
+ model = None
150
+ tokenizer = None
151
+
152
+ def get_size_str(bytes):
153
+ for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
154
+ if bytes < 1024:
155
+ return f"{bytes:.2f} {unit}"
156
+ bytes /= 1024
157
 
158
+ def load_model(model_name):
159
+ global model, tokenizer, lastmodelnameinloadfunction
160
+
161
+ print(f"Loading model and tokenizer: {model_name}")
162
+
163
+ # Record initial GPU memory usage
164
+ initial_memory = torch.cuda.memory_allocated()
165
+
166
+ # Clear old model and tokenizer if they exist
167
+ if 'model' in globals() and model is not None:
168
+ model = None
169
+ if 'tokenizer' in globals() and tokenizer is not None:
170
+ tokenizer = None
171
+
172
+ torch.cuda.empty_cache()
173
+ gc.collect()
174
+
175
+ model = AutoModelForCausalLM.from_pretrained(
176
+ model_name,
177
+ torch_dtype="auto",
178
+ device_map="auto"
179
+ )
180
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
181
+
182
+ # Calculate memory usage
183
+ final_memory = torch.cuda.memory_allocated()
184
+ memory_used = final_memory - initial_memory
185
+
186
+ model_size = sum(p.numel() * p.element_size() for p in model.parameters())
187
+ tokenizer_size = sum(sys.getsizeof(v) for v in tokenizer.__dict__.values())
188
+
189
+ lastmodelnameinloadfunction = (model_name, model_size, tokenizer_size)
190
+ print(f"Model and tokenizer {model_name} loaded successfully")
191
+ print(f"Model size: {get_size_str(model_size)}")
192
+ print(f"Tokenizer size: {get_size_str(tokenizer_size)}")
193
+ print(f"GPU memory used: {get_size_str(memory_used)}")
194
+
195
+ return (f"Model and tokenizer {model_name} loaded successfully. "
196
+ f"Model size: {get_size_str(model_size)}, "
197
+ f"Tokenizer size: {get_size_str(tokenizer_size)}, "
198
+ f"GPU memory used: {get_size_str(memory_used)}")
199
+
200
+
201
+ # Initial model load
202
+ load_model(modelname)
203
+
204
+ # For this example, let's use a knowledge base with close queries
205
+ # knowledge_base = [
206
+ # {"id": "1", "content": "The capital of France is Paris. It's known for the Eiffel Tower."},
207
+ # {"id": "2", "content": "The capital of Italy is Rome. It's famous for the Colosseum."},
208
+ # {"id": "3", "content": "Python is a popular programming language, known for its simplicity."},
209
+ # {"id": "4", "content": "Java is a widely-used programming language, valued for its portability."},
210
+ # {"id": "5", "content": "Machine learning is a subset of artificial intelligence focused on data-driven learning."},
211
+ # {"id": "6", "content": "Deep learning is a part of machine learning based on artificial neural networks."},
212
+ # {"id": "7", "content": "Law is a Tekken character"},
213
+ # {"id": "8", "content": "The law is very complicated"},
214
+ # ]
215
 
216
  # Create embeddings for the knowledge base
217
  knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])
 
226
  ram = psutil.virtual_memory()
227
  return f"RAM Usage: {ram.percent:.2f}%, Available: {ram.available / (1024 ** 3):.2f}GB, Total: {ram.total / (1024 ** 3):.2f}GB"
228
 
229
+ # Global dictionary to store outputs
230
+ output_dict = {}
231
+
232
+ def empty_output_dict():
233
+ global output_dict
234
+ output_dict = {}
235
+ print("Output dictionary has been emptied.")
236
+
237
+ def get_model_details(model):
238
+ return {
239
+ "name": model.config.name_or_path,
240
+ "architecture": model.config.architectures[0] if model.config.architectures else "Unknown",
241
+ "num_parameters": sum(p.numel() for p in model.parameters()),
242
+ }
243
+
244
+ def get_tokenizer_details(tokenizer):
245
+ return {
246
+ "name": tokenizer.__class__.__name__,
247
+ "vocab_size": tokenizer.vocab_size,
248
+ "model_max_length": tokenizer.model_max_length,
249
+ }
250
+
251
  @spaces.GPU
252
+ def generate_response(prompt, use_rag, stream=False):
253
+ global output_dict
254
+
255
  print(zero.device) # This will print 'cuda:0' inside the @spaces.GPU decorated function
256
+ if use_rag:
257
+ retrieved_docs = retrieve(prompt)
258
+ context = " ".join([doc for doc, _ in retrieved_docs])
259
+ doc_ids = [doc_id for _, doc_id in retrieved_docs]
260
+ full_prompt = f"Context: {context}\nQuestion: {prompt}\nAnswer:"
261
+ else:
262
+ full_prompt = prompt
263
+ doc_ids = None
264
  messages = [
265
  {"role": "system", "content": "You are a helpful assistant."},
266
+ {"role": "user", "content": full_prompt}
267
  ]
268
+ text = tokenizer.apply_chat_template(
269
  messages,
270
  tokenize=False,
271
  add_generation_prompt=True
272
  )
273
+ model_inputs = tokenizer([text], return_tensors="pt").to(zero.device)
 
274
  start_time = time.time()
275
  total_tokens = 0
276
+
277
+ print(output_dict)
278
+ output_key = f"output_{len(output_dict) + 1}"
279
+ print(output_key)
280
+ output_dict[output_key] = {
281
+ "input_prompt": prompt,
282
+ "full_prompt": full_prompt,
283
+ "use_rag": use_rag,
284
+ "generated_text": "",
285
+ "tokens_per_second": 0,
286
+ "ram_usage": "",
287
+ "doc_ids": doc_ids if doc_ids else "N/A",
288
+ "model_details": get_model_details(model),
289
+ "tokenizer_details": get_tokenizer_details(tokenizer),
290
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))
291
+ }
292
+ print(output_dict)
293
 
294
  if stream:
295
+ streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
296
  generation_kwargs = dict(
297
  model_inputs,
298
  streamer=streamer,
299
  max_new_tokens=512,
300
  temperature=0.7,
301
  )
302
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
303
  thread.start()
304
  for new_text in streamer:
305
+ output_dict[output_key]["generated_text"] += new_text
306
  total_tokens += 1
307
  current_time = time.time()
308
  tokens_per_second = total_tokens / (current_time - start_time)
309
+ ram_usage = get_ram_usage()
310
+ output_dict[output_key]["tokens_per_second"] = f"{tokens_per_second:.2f}"
311
+ output_dict[output_key]["ram_usage"] = ram_usage
312
+ yield (output_dict[output_key]["generated_text"],
313
+ output_dict[output_key]["tokens_per_second"],
314
+ output_dict[output_key]["ram_usage"],
315
+ output_dict[output_key]["doc_ids"])
316
  else:
317
+ generated_ids = model.generate(
318
  model_inputs.input_ids,
319
  max_new_tokens=512
320
  )
321
  generated_ids = [
322
  output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
323
  ]
324
+ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
325
  total_tokens = len(generated_ids[0])
326
  end_time = time.time()
327
  tokens_per_second = total_tokens / (end_time - start_time)
328
  ram_usage = get_ram_usage()
329
+
330
+ output_dict[output_key]["generated_text"] = response
331
+ output_dict[output_key]["tokens_per_second"] = f"{tokens_per_second:.2f}"
332
+ output_dict[output_key]["ram_usage"] = ram_usage
333
+ print(output_dict)
334
+
335
+ yield (output_dict[output_key]["generated_text"],
336
+ output_dict[output_key]["tokens_per_second"],
337
+ output_dict[output_key]["ram_usage"],
338
+ output_dict[output_key]["doc_ids"])
339
+
340
+ def get_output_details(output_key):
341
+ if output_key in output_dict:
342
+ return output_dict[output_key]
343
  else:
344
+ return f"No output found for key: {output_key}"
345
+
346
+ # Update the switch_model function to return the load_model message
347
+ def switch_model(choice):
348
+ global modelname
349
+ modelname = choice
350
+ load_message = load_model(modelname)
351
+ return load_message, f"Current model: {modelname}"
352
+
353
+ # Update the model_change_handler function
354
+ def model_change_handler(choice):
355
+ message, current_model = switch_model(choice)
356
+ return message, current_model, message # Use the same message for both outputs
357
+
358
+ def format_output_dict():
359
+ global output_dict
360
+ formatted_output = ""
361
+ for key, value in output_dict.items():
362
+ formatted_output += f"Key: {key}\n"
363
+ formatted_output += json.dumps(value, indent=2)
364
+ formatted_output += "\n\n"
365
+ print(formatted_output)
366
+ return formatted_output
367
 
368
  #--------------------------------------------------------------------------------------------------------------------------------
369
 
 
954
  #-----------------------------------------------------------------------------------------------------------------------------------
955
 
956
  def TestGradioClientQwen270b(text):
957
+ # client = Client("Qwen/Qwen2-72B-Instruct")
958
+ # result = client.predict(
959
+ # query=text, #"Hello!!",
960
+ # history=[],
961
+ # system="You are a helpful assistant.",
962
+ # api_name="/model_chat"
963
+ # )
964
+ client = Client("CohereForAI/c4ai-command-r-v01")
965
  result = client.predict(
966
+ user_message=text, #"Hello!!",
967
 
968
  )
969
+ print(result)
970
  #print(result[1][0]) #All messages in the conversation
971
  #print(result[2]) # System prompt
972
+ return result #result[1][0][1] # If supporting conversations this needs to return the last message instead
973
 
974
  #-----------------------------------------------------------------------------------------------------------------------------------
975
 
 
998
  llmguide_output = gr.Textbox(lines=10, label="Generated Response")
999
  llmguide_tokens_per_second = gr.Textbox(label="Tokens per Second")
1000
 
1001
+ # llmguide_submit_button.click(
1002
+ # llmguide_generate_response,
1003
+ # inputs=[llmguide_prompt, llmguide_stream_checkbox],
1004
+ # outputs=[llmguide_output, llmguide_tokens_per_second],
1005
+ # )
1006
  with gr.Tab("General RAG (Pathfinder?) Attempt"):
1007
  gr.HTML("https://huggingface.co/spaces/mteb/leaderboard - Source for SOTA - currently using all-MiniLM-L6-v2")
1008
  gr.HTML("Placeholder for weak RAG Type Character interaction test aka input for JSON 'Knowledge Base' Input")
1009
+ # gr.Interface(
1010
+ # fn=process_query,
1011
+ # inputs=[
1012
+ # gr.Textbox(lines=2, placeholder="Enter your question here..."),
1013
+ # gr.Checkbox(label="Use RAG"),
1014
+ # gr.Checkbox(label="Stream output")
1015
+ # ],
1016
+ # outputs=[
1017
+ # gr.Textbox(label="Generated Response"),
1018
+ # gr.Textbox(label="Tokens per second"),
1019
+ # gr.Textbox(label="RAM Usage"),
1020
+ # gr.Textbox(label="Referenced Documents")
1021
+ # ],
1022
+ # title="RAG/Non-RAG Q&A System",
1023
+ # description="Ask a question with or without using RAG. The response is generated using a GPU-accelerated model. RAM usage and referenced document IDs (for RAG) are logged."
1024
+ # )
1025
  with gr.Tab("General FAQ Attempt"):
1026
  with gr.Tab("Front end as FAQ"):
1027
  FAQMainOutput = gr.TextArea(placeholder='Output will show here', value='')
 
1032
  with gr.Group():
1033
  for index, (prompt, _) in enumerate(category_prompts):
1034
  button = gr.Button(prompt)
1035
+ # button.click(llmguide_generate_response, inputs=[FAQCustomButtonInput, gr.State(index), gr.State(category_name)], outputs=FAQMainOutput)
1036
  with gr.Tab("Function Call as FAQ"):
1037
  gr.HTML("Placeholder for media task query routing as dual purpose in workflow and for user queries as pseudo RAG engine")
1038
  gr.HTML("https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#built-in-tooling - The three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt")
1039
+
1040
+ with gr.Tab("ZeroGPU refactor"):
1041
+ model_name = gr.State(modelname)
1042
+ gr.Markdown(f"# Language Model with RAG and Model Switching")
1043
+ gr.Markdown("This demo allows you to switch between different instruct models and use Retrieval-Augmented Generation (RAG).")
1044
+ gr.Markdown("**Note:** Model switching is intended for testing output quality. Due to GPU limitations, speed differences may not be apparent. Models requiring over 50GB to load will likely not work.")
1045
+ gr.Markdown("Need to add support for switching models and loading GGUF and GPTQ and BNB")
1046
+ gr.Markdown("57B MoE takes 6 min to load and gets the workload evicted - storage limit over 100G")
1047
+
1048
+ with gr.Row():
1049
+ with gr.Column():
1050
+ model_dropdown = gr.Dropdown(choices=modelnames, value=modelname, label="Select Model")
1051
+ current_model_info = gr.Markdown(f"Current model: {modelname}")
1052
+ current_model_info2 = gr.Interface(lambda: f"Current model: {lastmodelnameinloadfunction[0]} ({lastmodelnameinloadfunction[1]}) (tokeniser = {lastmodelnameinloadfunction[2]})", inputs=None, outputs=["markdown"], description="Check what was last loaded (As the space has memory and I haven't figured out how spaces work enough eg. how do multiple users affect this)") # gr.Markdown(f"Current model: {lastmodelnameinloadfunction}")
1053
+ gr.HTML("Need to figure out my test function calling for groq-8b as it doesn't seem to answer chat properly - will need a separate space - eg. letter counting, plural counting, using a function as a form for the llm to fill (like choosing which model and input parameters for media in game)?")
1054
+ prompt = gr.Textbox(lines=2, placeholder="Enter your prompt here...")
1055
+ stream_checkbox = gr.Checkbox(label="Enable streaming")
1056
+ rag_checkbox = gr.Checkbox(label="Enable RAG")
1057
+ submit_button = gr.Button("Generate")
1058
+
1059
+ with gr.Column():
1060
+ with gr.Tab("Current Response"):
1061
+ output = gr.Textbox(lines=10, label="Generated Response")
1062
+ tokens_per_second = gr.Textbox(label="Tokens per Second")
1063
+ ram_usage = gr.Textbox(label="RAM Usage")
1064
+ doc_references = gr.Textbox(label="Document References")
1065
+ with gr.Tab("All Responses So far"):
1066
+ gr.Markdown("As we want an iterative process, all old responses are saved for now - will figure out how to make a per-user solution - need some kind of hook onto the loading of a space to assign a random usercount with timestamp")
1067
+ gr.Interface(format_output_dict, inputs=None, outputs=["textbox"])
1068
+
1069
+
1070
+ model_dropdown.change(
1071
+ model_change_handler,
1072
+ inputs=[model_dropdown],
1073
+ outputs=[model_name, current_model_info, output]
1074
+ )
1075
+
1076
+ submit_button.click(
1077
+ generate_response,
1078
+ inputs=[prompt, rag_checkbox, stream_checkbox],
1079
+ outputs=[output, tokens_per_second, ram_usage, doc_references],
1080
+ )
1081
  with gr.Tab("Hugging Chat"):
1082
  gr.HTML("https://huggingface.co/chat<br>Huggingface chat supports - State Management (Threads), Image Generation and editing, Websearch, Document parsing (PDF?), Assistants and larger models than zero gpu can support in July 2024 (Unquantised 30B and above)")
1083
  gr.HTML("Existing Assistants to use and planning custom assistants placeholder")
1084
  with gr.Tab("Embedded Spaces and gradio client"):
1085
  gr.HTML("In Asset Generation Tab under Text")
1086
  with gr.Tab("Gradio Client"):
1087
+ gr.Interface(fn=TestGradioClientQwen270b, inputs="text", outputs="markdown", description="Single response test of gradio client - Cohere used for the test as the API is not working on Qwen/Qwen2-72B-Instruct. Use for testing like using a space, and duplicate for heavy testing")
1088
  with gr.Tab("Preview APIs"):
1089
  gr.HTML("July 2024 - Gemini, Cohere and Groq rate limit free APIs")
1090
  gr.Markdown("# Current Workflow = Mermaid Diagram to (1) Story to (2) Initial JSON (through LLM and fix JSON by hand) to JSON Corrections (through LLM and fix JSON by hand) to (4) Media prompts to (5) Asset Generation to (6) JSON Media field population")
 
1109
  gr.HTML("Model switch across modalities order to complete eg. ")
1110
  with gr.Tab("Asset Generation to (6) JSON Media field population"):
1111
  gr.HTML("Model switch across modalities order to complete eg. ")
1112
+ with gr.Tab("All Variations / Themes for entire workflow process - to be completed"):
1113
  gr.HTML("Trying to abstract the process into one workflow is beyond me so multiple paths to goal (config) is the aim now")
1114
  with gr.Tab("Branching - Decisions / Timeline Creation to Story to Config Conversation"):
1115
  gr.HTML("Structures for interesting timeline progression")
 
1176
  # input = gr.State(item)
1177
  # output = gr.Textbox("", label=item)
1178
  # outputbtn = gr.Button(item).click(fn=llmguide_generate_response, inputs=input, outputs=output)
1179
+ # for i, item in enumerate(Storycraftprompts, 1):
1180
+ # input = gr.State(item)
1181
+ # previous_input = gr.State(lambda: LinPEWFprevious_messages)
1182
+ # output = gr.Textbox("", label=f"Output {i}")
1183
 
1184
+ # def LinPEWF_update_and_generate(prompt, prev_msgs):
1185
+ # prev_msgs.append(prompt)
1186
+ # formatted_prompt = LinPEWFformat_prompt(prompt, prev_msgs)
1187
+ # response = llmguide_generate_response(formatted_prompt)
1188
+ # full_response = ""
1189
+ # for chunk in response:
1190
+ # full_response += chunk
1191
+ # prev_msgs.append(f"Response: {full_response}")
1192
+ # return full_response
1193
 
1194
+ # outputbtn = gr.Button(f"Generate {i}").click(
1195
+ # fn=LinPEWF_update_and_generate,
1196
+ # inputs=[input, previous_input],
1197
+ # outputs=output
1198
+ # )
1199
 
1200
+ # LinPEWFprevious_messages.append(item)
1201
 
1202
  #with gr.Accordion("Decisions / Timeline Creation to Story to Config Conversation", open=False):
1203
  with gr.Tab("Branching - Network analysis to Game config"):