Spaces: CreitinGameplays (Paused)
CreitinGameplays committed on
Commit 1dc34d7 • 1 parent: 067742a
Update app.py
app.py CHANGED
@@ -1,5 +1,3 @@
-#!/usr/bin/env python
-
 import os
 from threading import Thread
 from typing import Iterator
@@ -9,32 +7,39 @@ import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
-
-
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+DESCRIPTION = """\
+# ConvAI 9b
+"""
 
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
 
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 512
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 if torch.cuda.is_available():
     model_id = "CreitinGameplays/ConvAI-9b"
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto"
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
     tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer.use_default_system_prompt = False
+
 
 @spaces.GPU
 def generate(
     message: str,
     chat_history: list[tuple[str, str]],
-
-
+    system_prompt: str,
+    max_new_tokens: int = 1024,
+    temperature: float = 0.4,
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
     conversation = []
+    if system_prompt:
+        conversation.append({"role": "system", "content": system_prompt})
     for user, assistant in chat_history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": message})
@@ -45,7 +50,7 @@ def generate(
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
 
-    streamer = TextIteratorStreamer(tokenizer, timeout=
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
@@ -69,6 +74,7 @@ def generate(
 chat_interface = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
+        gr.Textbox(label="System prompt", lines=6),
         gr.Slider(
             label="Max new tokens",
             minimum=1,
@@ -81,7 +87,7 @@ chat_interface = gr.ChatInterface(
             minimum=0.1,
             maximum=4.0,
             step=0.1,
-            value=0.
+            value=0.4,
         ),
         gr.Slider(
             label="Top-p (nucleus sampling)",
@@ -115,14 +121,5 @@ chat_interface = gr.ChatInterface(
     ],
 )
 
-with gr.Blocks(css="style.css") as demo:
-    gr.Markdown(DESCRIPTION)
-    gr.DuplicateButton(
-        value="Duplicate Space for private use",
-        elem_id="duplicate-button",
-        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
-    )
-    chat_interface.render()
-
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch()
+    demo.queue(max_size=20).launch()
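
The body of generate between the hunks shown above is not part of this commit, but the added TextIteratorStreamer line, the Thread import, and the generate_kwargs dict suggest the usual transformers streaming pattern. A rough sketch of that pattern follows; the helper name stream_reply is hypothetical, the defaults are copied from the new signature, and the Space's actual code may differ.

from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(model, tokenizer, input_ids, max_new_tokens=1024, temperature=0.4,
                 top_p=0.9, top_k=50, repetition_penalty=1.2):
    # skip_prompt drops the echoed prompt; skip_special_tokens hides e.g. EOS tokens.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # sampling is an assumption; the diff only shows the sampling parameters
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    # model.generate blocks until generation finishes, so it runs in a background
    # thread while the streamer is consumed incrementally by the caller.
    Thread(target=model.generate, kwargs=generate_kwargs).start()
    outputs = []
    for text in streamer:  # yields decoded text chunks as they are produced
        outputs.append(text)
        yield "".join(outputs)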
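
In the last hunk the gr.Blocks block that previously defined demo is removed, while the final demo.queue(max_size=20).launch() call is kept, and no line shown in this diff defines demo. A minimal sketch of one way the launch could still be wired up, reusing only names that already appear in the file (DESCRIPTION, chat_interface, style.css); this is an assumption about intent, not the Space's actual code.

import gradio as gr

# Recreate a top-level Blocks container so that `demo` exists before
# demo.queue(max_size=20).launch() runs.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    chat_interface.render()

if __name__ == "__main__":
    demo.queue(max_size=20).launch()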