eswardivi committed on
Commit
2cdab2a
1 Parent(s): e9cb74c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -7
app.py CHANGED
@@ -14,7 +14,7 @@ import time
14
  token = os.environ["HF_TOKEN"]
15
 
16
  quantization_config = BitsAndBytesConfig(
17
- load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
18
  )
19
 
20
  model = AutoModelForCausalLM.from_pretrained(
@@ -34,7 +34,7 @@ else:
34
 
35
 
36
  @spaces.GPU(duration=150)
37
- def chat(message, history, temperature, top_p, top_k, max_tokens):
38
  start_time = time.time()
39
  chat = []
40
  for item in history:
@@ -52,7 +52,6 @@ def chat(message, history, temperature, top_p, top_k, max_tokens):
52
  streamer=streamer,
53
  max_new_tokens=max_tokens,
54
  do_sample=True,
55
- top_p=top_p,
56
  top_k=top_k,
57
  temperature=temperature,
58
  )
@@ -86,9 +85,7 @@ demo = gr.ChatInterface(
86
  gr.Slider(
87
  minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
88
  ),
89
- gr.Slider(
90
- minimum=0, maximum=1, step=0.1, value=0.95, label="top_p", render=False
91
- ),
92
  gr.Slider(
93
  minimum=1, maximum=10000, step=5, value=1000, label="top_k", render=False
94
  ),
@@ -103,6 +100,6 @@ demo = gr.ChatInterface(
103
  ],
104
  stop_btn="Stop Generation",
105
  title="Chat With LLMs",
106
- description="Now Running [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) in 4bit"
107
  )
108
  demo.launch()
 
14
  token = os.environ["HF_TOKEN"]
15
 
16
  quantization_config = BitsAndBytesConfig(
17
+ load_in_8bit=True, bnb_4bit_compute_dtype=torch.float16
18
  )
19
 
20
  model = AutoModelForCausalLM.from_pretrained(
 
34
 
35
 
36
  @spaces.GPU(duration=150)
37
+ def chat(message, history, temperature,do_sample, top_k, max_tokens):
38
  start_time = time.time()
39
  chat = []
40
  for item in history:
 
52
  streamer=streamer,
53
  max_new_tokens=max_tokens,
54
  do_sample=True,
 
55
  top_k=top_k,
56
  temperature=temperature,
57
  )
 
85
  gr.Slider(
86
  minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
87
  ),
88
+ gr.Checkbox(label="Sampling",value=True),
 
 
89
  gr.Slider(
90
  minimum=1, maximum=10000, step=5, value=1000, label="top_k", render=False
91
  ),
 
100
  ],
101
  stop_btn="Stop Generation",
102
  title="Chat With LLMs",
103
+ description="Now Running [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) in 8bit"
104
  )
105
  demo.launch()