import os
import gradio as gr
from llama_cpp import Llama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
# Initialize the embedding model | |
embeddings = HuggingFaceEmbeddings( | |
model_name="sentence-transformers/all-MiniLM-L6-v2", | |
model_kwargs={'device': 'cpu'}, | |
encode_kwargs={'normalize_embeddings': True} | |
) | |
# Load the existing Chroma vector store
persist_directory = os.path.join(os.path.dirname(__file__), 'mydb')
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
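# NOTE: the 'mydb' directory is assumed to already contain a persisted Chroma
# index built with the same embedding model. A rough sketch of how such an index
# could be created (the loader, file name, and chunk sizes are assumptions, not
# taken from this Space):
#
#     from langchain_community.document_loaders import PyPDFLoader
#     from langchain_text_splitters import RecursiveCharacterTextSplitter
#     pages = PyPDFLoader("document.pdf").load()
#     splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(pages)
#     Chroma.from_documents(splits, embedding=embeddings, persist_directory=persist_directory)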
# Initialize the Llama model
llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
    n_ctx=2048,  # the default context window (512) is too small for 5 retrieved chunks plus the answer
)
# Create the RAG prompt template
template = """Answer the question based only on the following context:

{context}

Question: {question}

Answer the question clearly. If you cannot find the answer in the context, just say "I don't have enough information to answer this question."
Make sure to:
1. Only use information from the provided context
2. If you're unsure, acknowledge it
"""
prompt = PromptTemplate.from_template(template)
def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    # top_p,
):
    # Build the messages list from the system prompt and prior turns
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:  # history is expected as (user, assistant) pairs
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    # Retrieve the most relevant chunks from the vector store
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    docs = retriever.invoke(message)
    context = "\n\n".join([doc.page_content for doc in docs])

    # Format the RAG prompt with the retrieved context
    final_prompt = prompt.format(context=context, question=message)

    # Add the formatted prompt to messages as the latest user turn
    messages.append({"role": "user", "content": final_prompt})
    # Generate a response with the Llama model
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        # top_p=top_p,
    )

    # Extract the assistant's reply
    assistant_reply = response['choices'][0]['message']['content']
    return assistant_reply
# Create the Gradio chat interface
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly chatbot.", label="System Message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        # gr.Slider(
        #     minimum=0.1,
        #     maximum=1.0,
        #     value=0.95,
        #     step=0.05,
        #     label="Top-p (Nucleus Sampling)",
        # ),
    ],
    title="Document-Based QA with Llama",
    description="A PDF chat interface powered by the Llama model.",
    examples=["What is a Computer?"],
    theme="default",
)
if __name__ == "__main__":
    demo.launch()
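# Assumed dependencies, inferred from the imports above (exact pins for this
# Space are not shown here); a requirements.txt would likely need at least:
#     gradio
#     llama-cpp-python
#     langchain
#     langchain-community
#     chromadb
#     sentence-transformers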