Spaces:

CreitinGameplays
/

ConvAIChat

Paused

App Files Community

CreitinGameplays committed on May 19

Commit

f15ed8e

•

1 Parent(s): 5b58842

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -64

app.py CHANGED Viewed

@@ -1,83 +1,128 @@
 import gradio as gr
-import torch
 import spaces
-import bitsandbytes as bnb
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-# Define the model name
-model_name = "CreitinGameplays/ConvAI-9b"
-# Quantization configuration with bitsandbytes settings
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-# Load tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, low_cpu_mem_usage=True)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#model.to(device)
-# Initialize chat history
-chat_history = []
-@spaces.GPU(duration=120)
-def generate_text(user_prompt, top_p, top_k, temperature):
-    """Generates text using the ConvAI model from Hugging Face Transformers and maintains conversation history."""
-    # System introduction
-    system = "You are a helpful AI language model called ChatGPT, your goal is helping users with their questions."
-    # Append user prompt to chat history
-    chat_history.append(f"User: {user_prompt}")
-    # Construct the full prompt with system introduction, user prompt, and assistant role
-    prompt = f"{system} </s> {' '.join(chat_history)} </s>"
-    # Encode the entire prompt into tokens
-    prompt_encoded = tokenizer.encode(prompt, return_tensors="pt").to(device)
-    # Generate text with the complete prompt and limit the maximum length to 256 tokens
-    output = model.generate(
-        input_ids=prompt_encoded,
-        max_length=1550,
-        num_beams=1,
-        num_return_sequences=1,
         do_sample=True,
-        top_k=top_k,
         top_p=top_p,
         temperature=temperature,
-        repetition_penalty=1.2
     )
-    # Decode the generated token sequence back to text
-    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
-    # Extract the assistant's response
-    assistant_response = generated_text.split("User:")[-1].strip()
-    chat_history.append(f"Assistant: {assistant_response}")
-    return "\n".join(chat_history)
-def reset_history():
-  global chat_history
-  chat_history = []
-  return "Chat history reset."
-# Define the Gradio interface
-interface = gr.Interface(
-    fn=generate_text,
-    inputs=[
-        gr.Textbox(label="Text Prompt", value="What's an AI?"),
-        gr.Slider(0, 1, value=0.9, label="Top-p"),
-        gr.Slider(1, 100, value=50, step=1, label="Top-k"),
-        gr.Slider(0.01, 1, value=0.2, label="Temperature")
     ],
-    outputs="text",
-    description="Interact with ConvAI (Loaded with Hugging Face Transformers)",
-    live=True
 )
-# Launch the Gradio interface
-interface.launch()

+#!/usr/bin/env python
+import os
+from threading import Thread
+from typing import Iterator
 import gradio as gr
 import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+DESCRIPTION = "# Mistral-7B v0.2"
+if not torch.cuda.is_available():
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+if torch.cuda.is_available():
+    model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+@spaces.GPU
+def generate(
+    message: str,
+    chat_history: list[tuple[str, str]],
+    max_new_tokens: int = 1024,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
+) -> Iterator[str]:
+    conversation = []
+    for user, assistant in chat_history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
         do_sample=True,
         top_p=top_p,
+        top_k=top_k,
         temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
     )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+chat_interface = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0.1,
+            maximum=4.0,
+            step=0.1,
+            value=0.6,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=0.9,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.2,
+        ),
+    ],
+    stop_btn=None,
+    examples=[
+        ["Hello there! How are you doing?"],
+        ["Can you explain briefly to me what is the Python programming language?"],
+        ["Explain the plot of Cinderella in a sentence."],
+        ["How many hours does it take a man to eat a Helicopter?"],
+        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
     ],
 )
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(
+        value="Duplicate Space for private use",
+        elem_id="duplicate-button",
+        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+    )
+    chat_interface.render()
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch()