import os
import gradio as gr
from llama_cpp import Llama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate

# Initialize the embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Load the existing Chroma vector store
persist_directory = os.path.join(os.path.dirname(__file__), 'mydb')
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
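# The 'mydb' directory is expected to already hold a persisted Chroma collection
# built with the same embedding model; this script only queries it and does not
# index any documents itself.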

# Initialize the Llama model
llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q8_0.gguf",
)
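# Note: the GGUF weights are downloaded from the Hugging Face Hub on first run and
# cached locally. llama-cpp-python's default context window (n_ctx=512) can be tight
# once the retrieved chunks are included; passing e.g. n_ctx=4096 to from_pretrained
# is an option if prompts or answers get truncated.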

# Create the RAG prompt template
template = """Answer the question based only on the following context:

{context}

Question: {question}

Answer the question in a clear way. If you cannot find the answer in the context, just say "I don't have enough information to answer this question."

Make sure to:
1. Only use information from the provided context
2. If you're unsure, acknowledge it
"""

prompt = PromptTemplate.from_template(template)

def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    # top_p,
):
    # Build the messages list
    messages = [{"role": "system", "content": system_message}]

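    # history is assumed to arrive as (user, assistant) pairs, Gradio's tuple-style
    # chat history; if the interface were switched to the "messages" format, this
    # loop would need to read role/content dicts instead.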
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Search the vector store
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    docs = retriever.get_relevant_documents(message)
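    # (Newer LangChain releases deprecate get_relevant_documents in favor of
    # retriever.invoke(message); either call returns a list of Documents.)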
    context = "\n\n".join([doc.page_content for doc in docs])

    # Format the prompt
    final_prompt = prompt.format(context=context, question=message)

    # Add the formatted prompt to messages
    messages.append({"role": "user", "content": final_prompt})

    # Generate response using the Llama model
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        # top_p=top_p,
    )

    # Extract the assistant's reply
    assistant_reply = response['choices'][0]['message']['content']

    return assistant_reply

# Create Gradio Chat Interface
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly chatbot.", label="System Message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        # gr.Slider(
        #     minimum=0.1,
        #     maximum=1.0,
        #     value=0.95,
        #     step=0.05,
        #     label="Top-p (Nucleus Sampling)",
        # ),
    ],
    title="Document-Based QA with Llama",
    description="A PDF Chat interface powered by the Llama model.",
    examples=["What is a Computer?"],
    theme="default",
)

if __name__ == "__main__":
    demo.launch()
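
# A minimal, assumed sketch of how a compatible 'mydb' store could be built
# elsewhere (not part of this app), reusing the same embedding model:
#
#   from langchain_community.vectorstores import Chroma
#   docs = [...]  # chunked Document objects from a loader/splitter of your choice
#   Chroma.from_documents(docs, embeddings, persist_directory="mydb")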