Commit 811a1e3 · Parent(s): 1187c2e
souravmighty committed

Changed to snowflake embedding

Files changed:
- .chainlit/config.toml +1 -1
- Dockerfile +1 -0
- app.py +50 -33
- assets/conversational_rag_architecture.gif +0 -0
- chainlit.md +6 -0
- requirements.txt +0 -1
.chainlit/config.toml
CHANGED
@@ -49,7 +49,7 @@ auto_tag_thread = true
 name = "Chatbot"
 
 # Show the readme while the thread is empty.
-show_readme_as_default = true
+show_readme_as_default = false
 
 # Description of the app and chatbot. This is used for HTML tags.
 # description = ""
Dockerfile
CHANGED
@@ -7,5 +7,6 @@ WORKDIR $HOME/app
 COPY --chown=user . $HOME/app
 COPY ./requirements.txt ~/app/requirements.txt
 RUN pip install -r requirements.txt
+RUN pip install git+https://github.com/UKPLab/sentence-transformers.git
 COPY . .
 CMD ["chainlit", "run", "app.py", "--port", "7860"]
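Note: the added GitHub install layers the development build of sentence-transformers over the release install, presumably because the Snowflake arctic-embed models were newer than the PyPI release at the time (this same commit drops the `sentence-transformers` line from requirements.txt). A minimal sanity check, as a hedged sketch assumed to run inside the built image (this snippet is not part of the repo):

```python
# Hedged sketch: confirm the git-installed sentence-transformers can load the
# embedding model this commit switches to. Assumption: run inside the image.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m")
embedding = model.encode("Hello, GroqDoc!")
print(embedding.shape)  # (768,): arctic-embed-m produces 768-dimensional vectors
```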
app.py
CHANGED
@@ -10,6 +10,7 @@ from chainlit.input_widget import Select
 import os
 
 
+
 @cl.cache
 def get_memory():
     # Initialize message history for conversation
@@ -41,26 +42,6 @@ async def on_chat_start():
         ]
     ).send()
 
-    await setup_agent(settings)
-
-
-@cl.on_settings_update
-async def setup_agent(settings):
-
-    user_env = cl.user_session.get("env")
-    os.environ["GROQ_API_KEY"] = user_env.get("GROQ_API_KEY")
-
-    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
-    # embeddings = SentenceTransformerEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")
-    # memory=get_memory()
-
-    # docsearch = await cl.make_async(Chroma)(
-    #     persist_directory="./chroma_db",
-    #     embedding_function=embeddings
-    # )
-
-    msg = cl.Message(content = f"You are using '{settings['Model']}' as LLM.")
-    await msg.send()
 
     files = None #Initialize variable to store uploaded files
 
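Review note: `setup_agent` is not deleted here; it moves below the document-indexing code (see the fourth hunk) so that the Chroma store exists in the session before the chain is built. The resulting pattern, as a minimal sketch assuming the `docsearch` and `chain` session keys used in app.py:

```python
# Hedged sketch of the rebuild-on-settings-change pattern this commit adopts.
# Assumes the "docsearch" and "chain" session keys, matching app.py.
import chainlit as cl
from langchain.chains import ConversationalRetrievalChain
from langchain_groq import ChatGroq

@cl.on_settings_update
async def setup_agent(settings):
    # Reuse the vector store built while processing the PDFs; only the LLM changes.
    docsearch = cl.user_session.get("docsearch")
    chain = ConversationalRetrievalChain.from_llm(
        llm=ChatGroq(model=settings["Model"]),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
    )
    cl.user_session.set("chain", chain)
```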
@@ -71,23 +52,25 @@ async def setup_agent(settings):
         accept=["application/pdf"],
         max_size_mb=100,
         timeout=180,
+        max_files = 10,
     ).send()
 
-    file = files[0] # Get the first uploaded file
-
-    # Inform the user that processing has started
-    msg = cl.Message(content=f"Processing `{file.name}`...")
-    await msg.send()
 
-    # Read the PDF file
-    pdf = PyPDF2.PdfReader(file.path)
     pdf_text = ""
-    for page in pdf.pages:
-        pdf_text += page.extract_text()
+    for file in files:
+        # Inform the user that processing has started
+        msg = cl.Message(content=f"Processing `{file.name}`...")
+        await msg.send()
+
+        # Read the PDF file
+        pdf = PyPDF2.PdfReader(file.path)
+        for page in pdf.pages:
+            pdf_text += page.extract_text()
+
 
 
     # Split the text into chunks
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    texts = text_splitter.split_text(pdf_text)
 
     # Create a metadata for each chunk
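The upload flow now accepts up to 10 PDFs and concatenates all pages into a single string before splitting. For reference, a standalone sketch of the new splitter settings (chunk_size=500 and chunk_overlap=0 are the values this hunk adds; the sample text is illustrative):

```python
# Hedged sketch of the chunking added in this hunk, runnable outside Chainlit.
from langchain.text_splitter import RecursiveCharacterTextSplitter

pdf_text = "Lorem ipsum dolor sit amet. " * 200  # stand-in for extracted PDF text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_text(pdf_text)
print(len(texts), max(len(t) for t in texts))  # every chunk is at most 500 chars
```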
@@ -95,20 +78,40 @@ async def setup_agent(settings):
 
     # Create a Chroma vector store
     # embeddings = OllamaEmbeddings(model="nomic-embed-text")
-    embeddings = SentenceTransformerEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")
+    # embeddings = SentenceTransformerEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2")
+    embeddings = SentenceTransformerEmbeddings(model_name = "Snowflake/snowflake-arctic-embed-m")
+
+
     #embeddings = OllamaEmbeddings(model="llama2:7b")
     docsearch = await cl.make_async(Chroma.from_texts)(
         texts, embeddings, metadatas=metadatas
     )
+    cl.user_session.set("docsearch", docsearch)
 
     # Let the user know that the system is ready
     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
     await msg.update()
 
+    await setup_agent(settings)
+
+
+@cl.on_settings_update
+async def setup_agent(settings):
+
+    user_env = cl.user_session.get("env")
+    os.environ["GROQ_API_KEY"] = user_env.get("GROQ_API_KEY")
+
+    memory=get_memory()
+    docsearch = cl.user_session.get("docsearch")
+
+    msg = cl.Message(content = f"You are using `{settings['Model']}` as LLM. You can change model in `Settings Panel` in the chat box.")
+    await msg.send()
+
+
     memory=get_memory()
 
 
-    # Create a chain that uses the Chroma vector
+    # Create a chain that uses the Chroma vector stores
     chain = ConversationalRetrievalChain.from_llm(
         llm = ChatGroq(model=settings["Model"]),
         chain_type="stuff",
@@ -158,4 +161,18 @@ async def main(message: cl.Message):
     else:
         answer += "\nNo sources found"
     #return results
-    await cl.Message(content=answer, elements=text_elements).send()
+    await cl.Message(content=answer, elements=text_elements).send()
+
+
+@cl.on_stop
+def on_stop():
+    print("The user wants to stop the task!")
+    docsearch = cl.user_session.get("docsearch")
+    docsearch.delete_collection()
+
+
+@cl.on_chat_end
+def on_chat_end():
+    print("The user disconnected!")
+    docsearch = cl.user_session.get("docsearch")
+    docsearch.delete_collection()
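The two new lifecycle handlers drop the Chroma collection when the user stops a task or disconnects, so one session's embeddings do not leak into the next. One caveat, sketched below with a guard that is an editorial suggestion rather than part of this commit: if the chat ends before any PDF is processed, `docsearch` is None and `delete_collection()` would raise.

```python
# Hedged sketch: the same cleanup as the new handlers, plus a None guard
# (the guard is a suggestion, not part of this commit).
import chainlit as cl

@cl.on_chat_end
def on_chat_end():
    print("The user disconnected!")
    docsearch = cl.user_session.get("docsearch")
    if docsearch is not None:  # skip cleanup if no PDF was ever indexed
        docsearch.delete_collection()
```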
assets/conversational_rag_architecture.gif
ADDED
chainlit.md
ADDED
@@ -0,0 +1,6 @@
+# Welcome to GroqDoc!
+
+## Useful Links
+
+- **Groq API KEY:** Generate Groq API Key for free [Groq API Key](https://console.groq.com/keys)
+
requirements.txt
CHANGED
@@ -5,4 +5,3 @@ PyPDF2
 chromadb
 groq
 langchain-groq
-sentence-transformers