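"""Gradio chat app for question answering over local PDFs.

Builds a RAG pipeline with LlamaIndex: PDFs in ./data are embedded with a
HuggingFace model and queried through a llama.cpp-hosted Llama 3.2 Instruct
GGUF model, served behind a Gradio ChatInterface.
"""
# Expected dependencies (assumed): gradio, llama-index-core,
# llama-index-llms-llama-cpp, llama-index-embeddings-huggingface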
import gradio as gr
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
# LlamaIndex's native HuggingFace embedding wrapper (swapped in for the
# deprecated langchain_community import, which also needed a bridge package)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

# Initialize the LLM: llama.cpp downloads the GGUF model from this URL on first run
model_url = 'https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_K_M.gguf'
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    temperature=0.1,
    max_new_tokens=256,
    context_window=2048,
    # kwargs passed through to the model's __call__()
    generate_kwargs={},
    # kwargs passed through to the model's __init__();
    # set n_gpu_layers to at least 1 to use the GPU (-1 offloads all layers)
    model_kwargs={"n_gpu_layers": 1},
    # Format chat messages/completions into a prompt string. Note that these
    # helpers emit the Llama 2 [INST] format; Llama 3.x uses a different chat
    # template, so results may improve with custom formatters.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

# Initialize the embedding model used to vectorize document chunks
embeddings = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

def initialize_index():
    """Initialize the vector store index from PDF files in the data directory"""
    # Load documents from the data directory
    loader = SimpleDirectoryReader(
        input_dir="data",
        required_exts=[".pdf"]
    )
    documents = loader.load_data()
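    # load_data() yields one Document per PDF page by default; these are
    # chunked and embedded by the HuggingFace model above.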
    
    # Create index
    index = VectorStoreIndex.from_documents(
        documents,
        embed_model=embeddings,
    )
    
    # Build a query engine that answers questions with the local Llama model
    return index.as_query_engine(llm=llm)

# Initialize the query engine at startup
query_engine = initialize_index()
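# Note: initialize_index() re-embeds every PDF at each startup. For larger
# corpora the index is often persisted instead, e.g. (assumed, standard
# LlamaIndex APIs) index.storage_context.persist(persist_dir="storage") and
# reloaded via load_index_from_storage().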

def process_query(
    message: str,
    history: list[tuple[str, str]],
) -> str:
    """Process a query using the RAG system"""
    try:
        # Get response from the query engine (query() takes no streaming
        # kwarg; streaming is configured on the engine itself, see below)
        response = query_engine.query(message)
        return str(response)
    except Exception as e:
        return f"Error processing query: {str(e)}"

# Create the Gradio chat interface
demo = gr.ChatInterface(
    process_query,
    title="PDF Question Answering with RAG + Llama",
    description="Ask questions about the content of the loaded PDF documents using a local Llama model.",
)

if __name__ == "__main__":
    demo.launch(debug=True)