# CHAT-PDF / app.py
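# Gradio chat app for document-based QA (RAG): retrieves context from a
# pre-built Chroma vector store and answers with a local Llama 3.2 1B GGUF model.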
import os
import gradio as gr
from llama_cpp import Llama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
# Initialize the embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)
# Load the existing Chroma vector store
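# NOTE: the "mydb" store is assumed to have been built with the same embedding
# model as above; a mismatched model would make similarity search unreliable.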
persist_directory = os.path.join(os.path.dirname(__file__), 'mydb')
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
# Initialize the Llama model
llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
)
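# from_pretrained downloads the GGUF file from the Hugging Face Hub on first run.
# If the retrieved context plus max_tokens may exceed the model's default context
# window, consider passing n_ctx explicitly here.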
# Create the RAG prompt template
template = """Answer the question based only on the following context:
{context}
Question: {question}
Answer the question clearly. If you cannot find the answer in the context, just say "I don't have enough information to answer this question."
Make sure to:
1. Only use information from the provided context
2. If you're unsure, acknowledge it
"""
prompt = PromptTemplate.from_template(template)
def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    # top_p,
):
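    """Answer `message` using context retrieved from the Chroma store.

    `history` is the list of (user, assistant) pairs that gr.ChatInterface
    passes in tuple format; `system_message`, `max_tokens`, and `temperature`
    come from the additional inputs defined below.
    """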
    # Build the messages list
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Search the vector store
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    docs = retriever.get_relevant_documents(message)
    context = "\n\n".join([doc.page_content for doc in docs])

    # Format the prompt
    final_prompt = prompt.format(context=context, question=message)

    # Add the formatted prompt to messages
    messages.append({"role": "user", "content": final_prompt})

    # Generate response using the Llama model
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        # top_p=top_p,
    )

    # Extract the assistant's reply
    assistant_reply = response['choices'][0]['message']['content']
    return assistant_reply
# Create Gradio Chat Interface
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly chatbot.", label="System Message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        # gr.Slider(
        #     minimum=0.1,
        #     maximum=1.0,
        #     value=0.95,
        #     step=0.05,
        #     label="Top-p (Nucleus Sampling)",
        # ),
    ],
    title="Document-Based QA with Llama",
    description="A PDF Chat interface powered by the Llama model.",
    examples=["What is a Computer?"],
    theme="default",
)
if __name__ == "__main__":
    demo.launch()