get latest code from orca-2 space
- .env.example +10 -3
- app.py +87 -169
- app_modules/init.py +92 -82
- app_modules/llm_chat_chain.py +19 -7
- app_modules/llm_inference.py +34 -4
- app_modules/llm_loader.py +135 -99
- app_modules/llm_qa_chain.py +0 -3
- app_modules/llm_qa_chain_with_memory.py +32 -0
- app_modules/utils.py +39 -13
- requirements-mac.txt +127 -0
- requirements.txt +129 -38
.env.example
CHANGED
@@ -11,7 +11,7 @@ LLM_MODEL_TYPE=hftgi
 
 OPENLLM_SERVER_URL=http://localhost:64300
 
-HFTGI_SERVER_URL=
+HFTGI_SERVER_URL=
 
 OPENAI_API_KEY=
 
@@ -28,6 +28,7 @@ HF_PIPELINE_DEVICE_TYPE=
 
 # USE_LLAMA_2_PROMPT_TEMPLATE=true
 DISABLE_MODEL_PRELOADING=true
+USER_CONVERSATION_SUMMARY_BUFFER_MEMORY=true
 CHAT_HISTORY_ENABLED=false
 SHOW_PARAM_SETTINGS=false
 SHARE_GRADIO_APP=false
@@ -47,15 +48,21 @@ USING_TORCH_BFLOAT16=true
 # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-12b"
 
 # LLM_MODEL_TYPE must be set to huggingface
+# HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-3b"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-7b"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-12b"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/wizardLM-7B-HF"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/vicuna-7B-1.1-HF"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-j"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-falcon"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="lmsys/fastchat-t5-3b-v1.0"
-HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-13b-chat-hf"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-70b-chat-hf"
-# HUGGINGFACE_MODEL_NAME_OR_PATH="
+# HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-7b"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-13b"
+HUGGINGFACE_MODEL_NAME_OR_PATH="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="FlagAlpha/Llama2-Chinese-13b-Chat"
 
 STABLELM_MODEL_NAME_OR_PATH="OpenAssistant/stablelm-7b-sft-v7-epoch-3"
 
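For reference, these settings are consumed elsewhere in this commit (app_modules/init.py loads the file, app.py compares the boolean toggles against the string "true"). A minimal sketch of that pattern, not part of the diff itself:

```python
# Minimal sketch (not in the repo) of how the app reads these settings,
# mirroring app_modules/init.py and app.py in this commit.
import os

from dotenv import find_dotenv, load_dotenv

# Fall back to .env.example when no .env is present, as init.py does.
found_dotenv = find_dotenv(".env") or find_dotenv(".env.example")
load_dotenv(found_dotenv, override=False)

use_summary_memory = os.environ.get("USER_CONVERSATION_SUMMARY_BUFFER_MEMORY") == "true"
model_name = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
print(use_summary_memory, model_name)
```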
app.py
CHANGED
@@ -1,5 +1,7 @@
 """Main entrypoint for the app."""
+
 import os
+from threading import Thread
 import time
 from queue import Queue
 from timeit import default_timer as timer
@@ -13,14 +15,13 @@ from app_modules.utils import print_llm_response, remove_extra_spaces
 
 llm_loader, qa_chain = app_init()
 
-show_param_settings = os.environ.get("SHOW_PARAM_SETTINGS") == "true"
 share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
 using_openai = os.environ.get("LLM_MODEL_TYPE") == "openai"
-chat_with_llama_2 = (
-    not using_openai and os.environ.get("USE_LLAMA_2_PROMPT_TEMPLATE") == "true"
+chat_with_orca_2 = (
+    not using_openai and os.environ.get("USE_ORCA_2_PROMPT_TEMPLATE") == "true"
 )
 chat_history_enabled = (
-    not chat_with_llama_2 and os.environ.get("CHAT_HISTORY_ENABLED") == "true"
+    not chat_with_orca_2 and os.environ.get("CHAT_HISTORY_ENABLED") == "true"
 )
 
 model = (
@@ -34,180 +35,97 @@ href = (
     else f"https://huggingface.co/{model}"
 )
 
-if chat_with_llama_2:
+if chat_with_orca_2:
     qa_chain = ChatChain(llm_loader)
-    name = "
+    name = "Orca-2"
 else:
-    name = "
-
-title = f"
+    name = "PCI DSS v4"
+
+title = f"Chat with {name}"
+examples = (
+    ["How to cook a fish?", "Who is the president of US now?"]
+    if chat_with_orca_2
+    else [
+        "What's PCI DSS?",
+        "Can you summarize the changes made from PCI DSS version 3.2.1 to version 4.0?",
+    ]
+)
+description = f"""\
 <div align="left">
 <p> Currently Running: <a href="{href}">{model}</a></p>
 </div>
 """
 
-description = """\
-<div align="center" style="margin:16px 0">
-The demo is built on <a href="https://github.com/hwchase17/langchain">LangChain</a>.
-</div>
-"""
-
-
-def qa(chatbot):
-    user_msg = chatbot[-1][0]
-    q = Queue()
-    result = Queue()
-    ...
-    chatbot[-1][1] += "\n\nSources:\n"
-    ret = result.get()
-    titles = []
-    for doc in ret["source_documents"]:
-        page = doc.metadata["page"] + 1
-        url = f"{doc.metadata['url']}#page={page}"
-        file_name = doc.metadata["source"].split("/")[-1]
-        title = f"{file_name} Page: {page}"
-        if title not in titles:
-            titles.append(title)
-            chatbot[-1][1] += f"1. [{title}]({url})\n"
-
-    yield chatbot
-
-
-with open("assets/custom.css", "r", encoding="utf-8") as f:
-    customCSS = f.read()
-
-with gr.Blocks(css=customCSS) as demo:
-    user_question = gr.State("")
-    with gr.Row():
-        gr.HTML(title)
-        gr.Markdown(description_top)
-    with gr.Row().style(equal_height=True):
-        with gr.Column(scale=5):
-            with gr.Row():
-                chatbot = gr.Chatbot(elem_id="inflaton_chatbot").style(height="100%")
-            with gr.Row():
-                with gr.Column(scale=2):
-                    user_input = gr.Textbox(
-                        show_label=False, placeholder="Enter your question here"
-                    ).style(container=False)
-                with gr.Column(
-                    min_width=70,
-                ):
-                    submitBtn = gr.Button("Send")
-                with gr.Column(
-                    min_width=70,
-                ):
-                    clearBtn = gr.Button("Clear")
-        if show_param_settings:
-            with gr.Column():
-                with gr.Column(
-                    min_width=50,
-                ):
-                    with gr.Tab(label="Parameter Setting"):
-                        gr.Markdown("# Parameters")
-                        top_p = gr.Slider(
-                            minimum=-0,
-                            maximum=1.0,
-                            value=0.95,
-                            step=0.05,
-                            # interactive=True,
-                            label="Top-p",
-                        )
-                        temperature = gr.Slider(
-                            minimum=0.1,
-                            maximum=2.0,
-                            value=0,
-                            step=0.1,
-                            # interactive=True,
-                            label="Temperature",
-                        )
-                        max_new_tokens = gr.Slider(
-                            minimum=0,
-                            maximum=2048,
-                            value=2048,
-                            step=8,
-                            # interactive=True,
-                            label="Max Generation Tokens",
-                        )
-                        max_context_length_tokens = gr.Slider(
-                            minimum=0,
-                            maximum=4096,
-                            value=4096,
-                            step=128,
-                            # interactive=True,
-                            label="Max Context Tokens",
-                        )
-    gr.Markdown(description)
-
-    def chat(user_message, history):
-        return "", history + [[user_message, None]]
-
-    user_input.submit(
-        chat, [user_input, chatbot], [user_input, chatbot], queue=True
-    ).then(qa, chatbot, chatbot)
-
-    submitBtn.click(
-        chat, [user_input, chatbot], [user_input, chatbot], queue=True, api_name="chat"
-    ).then(qa, chatbot, chatbot)
-
-    def reset():
-        return "", []
-
-    clearBtn.click(
-        reset,
-        outputs=[user_input, chatbot],
-        show_progress=True,
-        api_name="reset",
-    )
-
-demo.title = "Chat with AI Books" if chat_with_llama_2 else "Chat with Llama-2"
-demo.queue(concurrency_count=CONCURRENT_COUNT).launch(share=share_gradio_app)
+
+def task(question, chat_history, q, result):
+    start = timer()
+    inputs = {"question": question, "chat_history": chat_history}
+    ret = qa_chain.call_chain(inputs, None, q)
+    end = timer()
+
+    print(f"Completed in {end - start:.3f}s")
+    print_llm_response(ret)
+
+    result.put(ret)
+
+
+def predict(message, history):
+    print("predict:", message, history)
+
+    chat_history = []
+    if chat_history_enabled:
+        for element in history:
+            item = (element[0] or "", element[1] or "")
+            chat_history.append(item)
+
+    if not chat_history:
+        qa_chain.reset()
+
+    q = Queue()
+    result = Queue()
+    t = Thread(target=task, args=(message, chat_history, q, result))
+    t.start()  # Starting the generation in a separate thread.
+
+    partial_message = ""
+    count = 2 if len(chat_history) > 0 else 1
+
+    while count > 0:
+        while q.empty():
+            print("nothing generated yet - retry in 0.5s")
+            time.sleep(0.5)
+
+        for next_token in llm_loader.streamer:
+            partial_message += next_token or ""
+            # partial_message = remove_extra_spaces(partial_message)
+            yield partial_message
+
+        if count == 2:
+            partial_message += "\n\n"
+
+        count -= 1
+
+    if not chat_with_orca_2:
+        partial_message += "\n\nSources:\n"
+        ret = result.get()
+        titles = []
+        for doc in ret["source_documents"]:
+            page = doc.metadata["page"] + 1
+            url = f"{doc.metadata['url']}#page={page}"
+            file_name = doc.metadata["source"].split("/")[-1]
+            title = f"{file_name} Page: {page}"
+            if title not in titles:
+                titles.append(title)
+                partial_message += f"1. [{title}]({url})\n"
+
+        yield partial_message
+
+
+# Setting up the Gradio chat interface.
+gr.ChatInterface(
+    predict,
+    title=title,
+    description=description,
+    examples=examples,
+).launch(
+    share=share_gradio_app
+)  # Launching the web interface.
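The new app.py replaces the hand-built gr.Blocks UI with gr.ChatInterface: predict() is a generator that yields ever-longer strings, and Gradio re-renders each yielded value as the current partial answer. A small self-contained sketch of the same streaming pattern with a dummy echo bot (no models or chains involved):

```python
import time

import gradio as gr


def predict(message, history):
    # Yield progressively longer strings; ChatInterface shows each one
    # as the in-progress answer, producing a streaming effect.
    partial = ""
    for word in f"You said: {message}".split():
        partial += word + " "
        time.sleep(0.1)
        yield partial


gr.ChatInterface(
    predict,
    title="Streaming demo",
    examples=["How to cook a fish?"],
).launch()
```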
app_modules/init.py
CHANGED
@@ -1,82 +1,92 @@
+"""Main entrypoint for the app."""
+
+import os
+from timeit import default_timer as timer
+from typing import List, Optional
+
+from dotenv import find_dotenv, load_dotenv
+from langchain_community.embeddings import HuggingFaceInstructEmbeddings
+from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS
+
+from app_modules.llm_loader import LLMLoader
+from app_modules.utils import get_device_types, init_settings
+
+found_dotenv = find_dotenv(".env")
+
+if len(found_dotenv) == 0:
+    found_dotenv = find_dotenv(".env.example")
+print(f"loading env vars from: {found_dotenv}")
+load_dotenv(found_dotenv, override=False)
+
+# Constants
+init_settings()
+
+if os.environ.get("LANGCHAIN_DEBUG") == "true":
+    import langchain
+
+    langchain.debug = True
+
+if os.environ.get("USER_CONVERSATION_SUMMARY_BUFFER_MEMORY") == "true":
+    from app_modules.llm_qa_chain_with_memory import QAChain
+
+    print("using llm_qa_chain_with_memory")
+else:
+    from app_modules.llm_qa_chain import QAChain
+
+    print("using llm_qa_chain")
+
+
+def app_init():
+    # https://github.com/huggingface/transformers/issues/17611
+    os.environ["CURL_CA_BUNDLE"] = ""
+
+    hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
+    print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
+    print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
+
+    hf_embeddings_model_name = (
+        os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
+    )
+
+    n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
+    index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get(
+        "CHROMADB_INDEX_PATH"
+    )
+    using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
+    llm_model_type = os.environ.get("LLM_MODEL_TYPE")
+
+    start = timer()
+    embeddings = HuggingFaceInstructEmbeddings(
+        model_name=hf_embeddings_model_name,
+        model_kwargs={"device": hf_embeddings_device_type},
+    )
+    end = timer()
+
+    print(f"Completed in {end - start:.3f}s")
+
+    start = timer()
+
+    print(f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}")
+
+    if not os.path.isdir(index_path):
+        raise ValueError(f"{index_path} does not exist!")
+    elif using_faiss:
+        vectorstore = FAISS.load_local(index_path, embeddings)
+    else:
+        vectorstore = Chroma(
+            embedding_function=embeddings, persist_directory=index_path
+        )
+
+    end = timer()
+
+    print(f"Completed in {end - start:.3f}s")
+
+    start = timer()
+    llm_loader = LLMLoader(llm_model_type)
+    llm_loader.init(n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type)
+    qa_chain = QAChain(vectorstore, llm_loader)
+    end = timer()
+    print(f"Completed in {end - start:.3f}s")
+
+    return llm_loader, qa_chain
app_modules/llm_chat_chain.py
CHANGED
@@ -1,11 +1,13 @@
 import os
 from typing import List, Optional
 
-from langchain import ConversationChain,
+from langchain.chains import ConversationChain, LLMChain
+from langchain.prompts import PromptTemplate
 from langchain.chains.base import Chain
 from langchain.memory import ConversationSummaryBufferMemory
 
 from app_modules.llm_inference import LLMInference
+from app_modules.utils import CustomizedConversationSummaryBufferMemory
 
 
 def get_llama_2_prompt_template():
@@ -23,6 +25,13 @@ def get_llama_2_prompt_template():
     return prompt_template
 
 
+def get_orca_2_prompt_template():
+    system_message = "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
+    user_message = "Chat History:\n\n{history} \n\nUser: {input}"
+    prompt_template = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
+    return prompt_template
+
+
 class ChatChain(LLMInference):
     def __init__(self, llm_loader):
         super().__init__(llm_loader)
@@ -31,28 +40,31 @@ class ChatChain(LLMInference):
         template = (
             get_llama_2_prompt_template()
            if os.environ.get("USE_LLAMA_2_PROMPT_TEMPLATE") == "true"
-            else """You are a chatbot having a conversation with a human.
+            else (
+                get_orca_2_prompt_template()
+                if os.environ.get("USE_ORCA_2_PROMPT_TEMPLATE") == "true"
+                else """You are a chatbot having a conversation with a human.
 {history}
 Human: {input}
 Chatbot:"""
+            )
         )
 
         print(f"template: {template}")
 
         prompt = PromptTemplate(input_variables=["history", "input"], template=template)
-        memory = ConversationSummaryBufferMemory(
-            llm=self.llm_loader.llm, max_token_limit=1024, return_messages=True
+        memory = CustomizedConversationSummaryBufferMemory(
+            llm=self.llm_loader.llm, max_token_limit=1024, return_messages=False
         )
 
         llm_chain = ConversationChain(
             llm=self.llm_loader.llm,
             prompt=prompt,
-            verbose=
+            verbose=False,
             memory=memory,
         )
 
         return llm_chain
 
     def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
-        return chain
+        return super().run_chain(chain, {"input": inputs["question"]}, callbacks)
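To make the new Orca-2 template concrete, here is a small sketch (not part of the diff, and with the system message shortened) of what the ChatML-style prompt looks like once LangChain's PromptTemplate substitutes the history and input; the history and question strings are made up for illustration:

```python
from langchain.prompts import PromptTemplate

# Shortened stand-in for the Orca-2 system message used in the repo.
system_message = "You are Orca, an AI language model created by Microsoft."
user_message = "Chat History:\n\n{history} \n\nUser: {input}"
template = (
    f"<|im_start|>system\n{system_message}<|im_end|>\n"
    f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
)

prompt = PromptTemplate(input_variables=["history", "input"], template=template)
# Prints the fully assembled ChatML prompt: system block, then the chat history
# and question inside the user block, ending with the open assistant tag.
print(prompt.format(history="Human: Hi\nAI: Hello!", input="How to cook a fish?"))
```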
app_modules/llm_inference.py
CHANGED
@@ -5,6 +5,7 @@ import urllib
 from queue import Queue
 from threading import Thread
 from typing import List, Optional
+from urllib.parse import quote, urlparse, urlunparse
 
 from langchain.chains.base import Chain
 
@@ -13,9 +14,6 @@ from app_modules.utils import remove_extra_spaces
 
 
 class LLMInference(metaclass=abc.ABCMeta):
-    llm_loader: LLMLoader
-    chain: Chain
-
     def __init__(self, llm_loader):
         self.llm_loader = llm_loader
         self.chain = None
@@ -30,8 +28,15 @@ class LLMInference(metaclass=abc.ABCMeta):
 
         return self.chain
 
+    def reset(self) -> None:
+        self.chain = None
+
     def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
+        result = chain.invoke(inputs, {"callbacks": callbacks})
+        if "text" in result:
+            result["response"] = result["text"]
+            del result["text"]
+        return result
 
     def call_chain(
         self,
@@ -59,6 +64,7 @@ class LLMInference(metaclass=abc.ABCMeta):
             if "answer" in result:
                 result["answer"] = remove_extra_spaces(result["answer"])
 
+            source_path = os.environ.get("SOURCE_PATH")
             base_url = os.environ.get("PDF_FILE_BASE_URL")
             if base_url is not None and len(base_url) > 0:
                 documents = result["source_documents"]
@@ -66,6 +72,30 @@ class LLMInference(metaclass=abc.ABCMeta):
                     source = doc.metadata["source"]
                    title = source.split("/")[-1]
                    doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
+            elif source_path is not None and len(source_path) > 0:
+                documents = result["source_documents"]
+                for doc in documents:
+                    source = doc.metadata["source"]
+                    url = source.replace(source_path, "https://")
+                    url = url.replace(".html", "")
+                    parsed_url = urlparse(url)
+
+                    # Encode path, query, and fragment
+                    encoded_path = quote(parsed_url.path)
+                    encoded_query = quote(parsed_url.query)
+                    encoded_fragment = quote(parsed_url.fragment)
+
+                    # Construct the encoded URL
+                    doc.metadata["url"] = urlunparse(
+                        (
+                            parsed_url.scheme,
+                            parsed_url.netloc,
+                            encoded_path,
+                            parsed_url.params,
+                            encoded_query,
+                            encoded_fragment,
+                        )
+                    )
 
             return result
         finally:
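A quick standalone sketch of what the new SOURCE_PATH branch does to a document's source path; the SOURCE_PATH value and the document path below are made up for illustration:

```python
from urllib.parse import quote, urlparse, urlunparse

source_path = "data/html/"  # hypothetical SOURCE_PATH value
source = "data/html/docs.example.com/guide/section 1.html"  # hypothetical doc source

# Turn the local path into an https URL and drop the .html suffix,
# then percent-encode the path/query/fragment components.
url = source.replace(source_path, "https://").replace(".html", "")
parsed = urlparse(url)
encoded = urlunparse(
    (
        parsed.scheme,
        parsed.netloc,
        quote(parsed.path),      # "/guide/section 1" -> "/guide/section%201"
        parsed.params,
        quote(parsed.query),
        quote(parsed.fragment),
    )
)
print(encoded)  # https://docs.example.com/guide/section%201
```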
app_modules/llm_loader.py
CHANGED
@@ -5,17 +5,18 @@ from queue import Queue
 from typing import Any, Optional
 
 import torch
-from langchain import HuggingFaceTextGenInference
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.llms import (
+from langchain_openai.chat_models import ChatOpenAI
+from langchain_openai.llms import OpenAI
+from langchain_community.llms import (
+    HuggingFaceTextGenInference,
     CTransformers,
     GPT4All,
     HuggingFacePipeline,
     LlamaCpp,
-    OpenLLM,
 )
+from langchain_community.chat_models import ChatOllama
 from langchain.schema import LLMResult
 from transformers import (
     AutoConfig,
@@ -30,7 +31,6 @@ from transformers import (
     pipeline,
 )
 
-from app_modules.instruct_pipeline import InstructionTextGenerationPipeline
 from app_modules.utils import ensure_model_is_downloaded
 
@@ -49,6 +49,7 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
         self.timeout = timeout
         self.total_tokens = 0
         self.for_huggingface = for_huggingface
+        self.end_token = ""
 
     def on_finalized_text(self, text: str, stream_end: bool = False):
         super().on_finalized_text(text, stream_end=stream_end)
@@ -61,11 +62,23 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
             self.text_queue.put("\n", timeout=self.timeout)
             self.text_queue.put(self.stop_signal, timeout=self.timeout)
 
+    def check_end_token(self, token):
+        new_token = self.end_token + token
+        if "<|im_end|>".startswith(new_token):
+            self.end_token = "" if new_token == "<|im_end|>" else new_token
+            return None
+        elif self.end_token != "":
+            self.end_token = ""
+
+        return new_token
+
     def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
+        token = self.check_end_token(token)
+        if token:
+            sys.stdout.write(token)
+            sys.stdout.flush()
+            self.text_queue.put(token, timeout=self.timeout)
+            self.total_tokens = self.total_tokens + 1
 
     def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
         print("\n")
@@ -85,18 +98,13 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
     def reset(self, q: Queue = None):
         # print("resetting TextIteratorStreamer")
         self.text_queue = q if q is not None else Queue()
+        self.end_token = ""
 
     def empty(self):
         return self.text_queue.empty()
 
 
 class LLMLoader:
-    llm_model_type: str
-    llm: any
-    streamer: any
-    max_tokens_limit: int
-    lock: any
-
     def __init__(self, llm_model_type):
         self.llm_model_type = llm_model_type
         self.llm = None
@@ -129,9 +137,11 @@ class LLMLoader:
             hf_pipeline_device_type = "cpu"
 
         using_cuda = hf_pipeline_device_type.startswith("cuda")
+        using_mps = hf_pipeline_device_type.startswith("mps")
+        torch_dtype = torch.float16 if using_cuda or using_mps else torch.float32
+        if not using_mps and os.environ.get("USING_TORCH_BFLOAT16") == "true":
             torch_dtype = torch.bfloat16
+
         load_quantized_model = os.environ.get("LOAD_QUANTIZED_MODEL")
 
         print(f" hf_pipeline_device_type: {hf_pipeline_device_type}")
@@ -139,6 +149,8 @@ class LLMLoader:
         print(f" torch_dtype: {torch_dtype}")
         print(f" n_threds: {n_threds}")
 
+        torch.set_default_dtype(torch_dtype)
+
         double_quant_config = BitsAndBytesConfig(
             load_in_4bit=load_quantized_model == "4bit",
             bnb_4bit_use_double_quant=load_quantized_model == "4bit",
@@ -156,20 +168,22 @@ class LLMLoader:
         if self.llm_model_type == "openai":
             MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-3.5-turbo"
             print(f" using model: {MODEL_NAME}")
+            self.llm = (
+                OpenAI(
+                    model_name=MODEL_NAME,
+                    streaming=True,
+                    callbacks=callbacks,
+                    verbose=True,
+                    temperature=0,
+                )
+                if "instruct" in MODEL_NAME
+                else ChatOpenAI(
+                    model_name=MODEL_NAME,
+                    streaming=True,
+                    callbacks=callbacks,
+                    verbose=True,
+                    temperature=0,
+                )
             )
         elif self.llm_model_type.startswith("gpt4all"):
             MODEL_PATH = ensure_model_is_downloaded(self.llm_model_type)
@@ -209,6 +223,9 @@ class LLMLoader:
             )
         elif self.llm_model_type == "hftgi":
             HFTGI_SERVER_URL = os.environ.get("HFTGI_SERVER_URL")
+            HFTGI_RP = os.environ.get("HFTGI_RP")
+            repetition_penalty = 1.120 if HFTGI_RP is None else float(HFTGI_RP)
+            print(f" repetition_penalty: {repetition_penalty}")
             self.max_tokens_limit = 4096
             self.llm = HuggingFaceTextGenInference(
                 inference_server_url=HFTGI_SERVER_URL,
@@ -217,11 +234,20 @@ class LLMLoader:
                 top_p=0.95,
                 # typical_p=0.95,
                 temperature=0.01,
-                repetition_penalty=
+                repetition_penalty=repetition_penalty,
                 callbacks=callbacks,
                 timeout=600,
                 streaming=True,
             )
+        elif self.llm_model_type == "ollama":
+            MODEL_NAME = os.environ.get("OLLAMA_MODEL_NAME") or "dolphin-phi"
+            print(f" loading model: {MODEL_NAME}")
+            self.llm = ChatOllama(
+                model=MODEL_NAME,
+                callbacks=callbacks,
+                temperature=0,
+                repeat_penalty=1.15,
+            )
         elif self.llm_model_type.startswith("huggingface"):
             MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
             print(f" loading model: {MODEL_NAME_OR_PATH}")
@@ -243,6 +269,27 @@ class LLMLoader:
 
             if "Llama-2" in MODEL_NAME_OR_PATH:
                 self.max_tokens_limit = 4096
+            elif "TinyLlama" in MODEL_NAME_OR_PATH:
+                self.max_tokens_limit = 1024
+
+            class StopOnTokens(StoppingCriteria):
+                def __call__(
+                    self,
+                    input_ids: torch.LongTensor,
+                    scores: torch.FloatTensor,
+                    **kwargs,
+                ) -> bool:
+                    stop_ids = [
+                        2
+                    ]  # IDs of tokens where the generation should stop.
+                    for stop_id in stop_ids:
+                        if (
+                            input_ids[0][-1] == stop_id
+                        ):  # Checking if the last generated token is a stop token.
+                            return True
+                    return False
+
+            stopping_criteria = StoppingCriteriaList([StopOnTokens()])
 
             is_t5 = "t5" in MODEL_NAME_OR_PATH
             temperature = (
@@ -250,7 +297,9 @@ class LLMLoader:
                 if "gpt4all-j" in MODEL_NAME_OR_PATH
                 or "dolly" in MODEL_NAME_OR_PATH
                 or "Qwen" in MODEL_NAME_OR_PATH
-                or "Llama-2" in MODEL_NAME_OR_PATH
+                or "Llama" in MODEL_NAME_OR_PATH
+                or "Orca-2" in MODEL_NAME_OR_PATH
+                or "phi-2" in MODEL_NAME_OR_PATH
                 else 0
             )
             use_fast = (
@@ -314,6 +363,11 @@ class LLMLoader:
                 else (1.25 if "dolly" in MODEL_NAME_OR_PATH else 1.1)
             )
 
+            HF_RP = os.environ.get("HF_RP")
+            if HF_RP is not None and len(HF_RP) > 0:
+                repetition_penalty = float(HF_RP)
+            print(f" repetition_penalty: {repetition_penalty}")
+
             if load_quantized_model is not None:
                 model = (
                     AutoModelForSeq2SeqLM.from_pretrained(
@@ -342,71 +396,40 @@ class LLMLoader:
                     pad_token_id = eos_token_id
 
                 pipe = (
+                    pipeline(
+                        task,
                         model=model,
                         tokenizer=tokenizer,
+                        eos_token_id=eos_token_id,
+                        pad_token_id=pad_token_id,
                         streamer=self.streamer,
-                        max_new_tokens=2048,
-                        temperature=temperature,
                         return_full_text=return_full_text,  # langchain expects the full text
+                        device_map="auto",
+                        trust_remote_code=True,
+                        max_new_tokens=2048,
+                        do_sample=True,
+                        temperature=0.01,
+                        top_p=0.95,
+                        top_k=50,
                         repetition_penalty=repetition_penalty,
                     )
+                    if eos_token_id != -1
+                    else pipeline(
+                        task,
+                        model=model,
+                        tokenizer=tokenizer,
+                        streamer=self.streamer,
+                        return_full_text=return_full_text,  # langchain expects the full text
+                        device_map="auto",
+                        trust_remote_code=True,
+                        max_new_tokens=2048,
+                        do_sample=True,
+                        temperature=temperature,
+                        top_p=0.95,
+                        top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
+                        repetition_penalty=repetition_penalty,
+                    )
                 )
-            elif "dolly" in MODEL_NAME_OR_PATH:
-                model = AutoModelForCausalLM.from_pretrained(
-                    MODEL_NAME_OR_PATH,
-                    device_map=hf_pipeline_device_type,
-                    torch_dtype=torch_dtype,
-                )
-
-                pipe = InstructionTextGenerationPipeline(
-                    task=task,
-                    model=model,
-                    tokenizer=tokenizer,
-                    streamer=self.streamer,
-                    max_new_tokens=2048,
-                    temperature=temperature,
-                    return_full_text=True,
-                    repetition_penalty=repetition_penalty,
-                    token=token,
-                )
             else:
                 if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
                     model = (
@@ -456,10 +479,11 @@ class LLMLoader:
                             torch_dtype=torch_dtype,
                             max_new_tokens=2048,
                             trust_remote_code=True,
+                            do_sample=True,
                             temperature=temperature,
                             top_p=0.95,
                             top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
-                            repetition_penalty=
+                            repetition_penalty=repetition_penalty,
                         )
                         if token is None
                         else pipeline(
@@ -475,11 +499,12 @@ class LLMLoader:
                             temperature=temperature,
                             top_p=0.95,
                             top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
-                            repetition_penalty=
+                            repetition_penalty=repetition_penalty,
                             token=token,
                         )
                     )
 
+                pipe.model.config.pad_token_id = pipe.model.config.eos_token_id
                 self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
             elif self.llm_model_type == "mosaicml":
                 MODEL_NAME_OR_PATH = os.environ.get("MOSAICML_MODEL_NAME_OR_PATH")
@@ -534,11 +559,13 @@ class LLMLoader:
 
                 max_new_tokens = 8192 if "30b" in MODEL_NAME_OR_PATH else 2048
                 self.max_tokens_limit = max_new_tokens
-                self.search_kwargs = (
-                    {"k": 8} if "30b" in MODEL_NAME_OR_PATH else self.search_kwargs
-                )
                 repetition_penalty = 1.05 if "30b" in MODEL_NAME_OR_PATH else 1.02
 
+                ML_RP = os.environ.get("ML_RP")
+                if ML_RP is not None and len(ML_RP) > 0:
+                    repetition_penalty = float(ML_RP)
+                print(f" repetition_penalty: {repetition_penalty}")
+
                 pipe = (
                     pipeline(
                         model=model,
@@ -549,7 +576,8 @@ class LLMLoader:
                         device_map="auto",
                         # we pass model parameters here too
                         stopping_criteria=stopping_criteria,  # without this model will ramble
+                        do_sample=True,
+                        temperature=0.01,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
                         top_p=0.95,  # select from top tokens whose probability add up to 15%
                         top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                         max_new_tokens=max_new_tokens,  # mex number of tokens to generate in the output
@@ -565,7 +593,8 @@ class LLMLoader:
                         device=config.init_device,
                         # we pass model parameters here too
                         stopping_criteria=stopping_criteria,  # without this model will ramble
+                        do_sample=True,
+                        temperature=0.01,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
                         top_p=0.95,  # select from top tokens whose probability add up to 15%
                         top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                         max_new_tokens=max_new_tokens,  # mex number of tokens to generate in the output
@@ -584,6 +613,13 @@ class LLMLoader:
                 # config.max_seq_len = 4096
                 config.init_device = hf_pipeline_device_type
 
+                SL_RP = os.environ.get("SL_RP")
+                if SL_RP is not None and len(SL_RP) > 0:
+                    repetition_penalty = float(SL_RP)
+                else:
+                    repetition_penalty = 1.05
+                print(f" repetition_penalty: {repetition_penalty}")
+
                 model = (
                     AutoModelForCausalLM.from_pretrained(
                         MODEL_NAME_OR_PATH,
@@ -635,7 +671,7 @@ class LLMLoader:
                         top_p=0.95,  # select from top tokens whose probability add up to 15%
                         top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                         max_new_tokens=2048,  # mex number of tokens to generate in the output
-                        repetition_penalty=
+                        repetition_penalty=repetition_penalty,  # without this output begins repeating
                     )
                     if load_quantized_model is not None
                     else pipeline(
@@ -651,7 +687,7 @@ class LLMLoader:
                         top_p=0.95,  # select from top tokens whose probability add up to 15%
                         top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                         max_new_tokens=2048,  # mex number of tokens to generate in the output
-                        repetition_penalty=
+                        repetition_penalty=repetition_penalty,  # without this output begins repeating
                     )
                 )
                 self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
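The streamer's new check_end_token buffering can be hard to follow from the diff alone. Here is a standalone sketch of the same logic outside the class (the token list is made up): any prefix of "<|im_end|>" is buffered and only emitted if it turns out not to be part of the end marker, so an end marker that arrives split across tokens never reaches the UI.

```python
# Standalone sketch of the check_end_token logic added to TextIteratorStreamer.
end_token = ""


def check_end_token(token: str):
    global end_token
    new_token = end_token + token
    if "<|im_end|>".startswith(new_token):
        # Either the full marker (swallow and reset) or a prefix (keep buffering).
        end_token = "" if new_token == "<|im_end|>" else new_token
        return None
    elif end_token != "":
        # The buffered text was not the marker after all; flush it with this token.
        end_token = ""
    return new_token


for t in ["Hello", " world", "<|im_", "end|>"]:
    print(repr(check_end_token(t)))
# 'Hello', ' world', None, None -> the split "<|im_end|>" marker is suppressed
```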
app_modules/llm_qa_chain.py
CHANGED
@@ -1,13 +1,10 @@
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chains.base import Chain
-from langchain.vectorstores.base import VectorStore
 
 from app_modules.llm_inference import LLMInference
 
 
 class QAChain(LLMInference):
-    vectorstore: VectorStore
-
     def __init__(self, vectorstore, llm_loader):
         super().__init__(llm_loader)
         self.vectorstore = vectorstore
app_modules/llm_qa_chain_with_memory.py
ADDED
@@ -0,0 +1,32 @@
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chains.base import Chain
+
+from app_modules.llm_inference import LLMInference
+from app_modules.utils import CustomizedConversationSummaryBufferMemory
+
+
+class QAChain(LLMInference):
+    def __init__(self, vectorstore, llm_loader):
+        super().__init__(llm_loader)
+        self.vectorstore = vectorstore
+
+    def create_chain(self) -> Chain:
+        memory = CustomizedConversationSummaryBufferMemory(
+            llm=self.llm_loader.llm,
+            output_key="answer",
+            memory_key="chat_history",
+            max_token_limit=1024,
+            return_messages=True,
+        )
+        qa = ConversationalRetrievalChain.from_llm(
+            self.llm_loader.llm,
+            memory=memory,
+            chain_type="stuff",
+            retriever=self.vectorstore.as_retriever(
+                search_kwargs=self.llm_loader.search_kwargs
+            ),
+            get_chat_history=lambda h: h,
+            return_source_documents=True,
+        )
+
+        return qa
app_modules/utils.py
CHANGED
@@ -10,6 +10,7 @@ from pathlib import Path
 import requests
 import torch
 from tqdm import tqdm
+from langchain.memory import ConversationSummaryBufferMemory
 
 
 class LogRecord(logging.LogRecord):
@@ -69,21 +70,31 @@ def print_llm_response(llm_response):
         llm_response["source_documents"] if "source_documents" in llm_response else None
     )
     if source_documents is None:
-        source_documents =
-
-    print("\nSources:")
-    for source in source_documents:
-        metadata = source["metadata"] if "metadata" in source else source.metadata
-        print(
-            " Page: "
-            + str(metadata["page"])
-            + " Source: "
-            + str(metadata["url"] if "url" in metadata else metadata["source"])
-        )
-        print(
-            source["page_content"] if "page_content" in source else source.page_content
-        )
+        source_documents = (
+            llm_response["sourceDocs"] if "sourceDocs" in llm_response else None
+        )
+
+    if source_documents is not None:
+        print("\nSources:")
+        for source in source_documents:
+            metadata = source["metadata"] if "metadata" in source else source.metadata
+            if "page" in metadata:
+                print(f" Page: {metadata['page']}", end="")
+
+            print(
+                " Source: "
+                + str(metadata["url"] if "url" in metadata else metadata["source"])
+            )
+            print(
+                source["page_content"]
+                if "page_content" in source
+                else source.page_content
+            )
+
+    if "chat_history" in llm_response:
+        print("\nChat History:")
+        print(llm_response["chat_history"])
+
 
 def get_device_types():
     print("Running on: ", platform.platform())
@@ -159,6 +170,21 @@ def ensure_model_is_downloaded(llm_model_type):
     return local_path
 
 
+class CustomizedConversationSummaryBufferMemory(ConversationSummaryBufferMemory):
+    def save_context(self, inputs, outputs) -> None:
+        for key in outputs:
+            if isinstance(outputs[key], str):
+                outputs[key] = outputs[key].replace("<|im_end|>", "")
+        return super().save_context(inputs, outputs)
+
+    def predict_new_summary(self, messages, existing_summary) -> str:
+        return (
+            super()
+            .predict_new_summary(messages, existing_summary)
+            .replace("<|im_end|>", "")
+        )
+
+
 if __name__ == "__main__":
     hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
     print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
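A small sketch (not in the repo) of why CustomizedConversationSummaryBufferMemory exists: Orca-2 style outputs can end with "<|im_end|>", which would otherwise be written into the conversation memory verbatim and leak into later prompts. The example assumes langchain-community's FakeListLLM as a stand-in for the real model and the pinned LangChain 0.1.x behavior:

```python
# Sketch only: FakeListLLM substitutes for the real model; the exact memory
# formatting depends on the pinned LangChain version.
from langchain_community.llms import FakeListLLM

from app_modules.utils import CustomizedConversationSummaryBufferMemory

memory = CustomizedConversationSummaryBufferMemory(
    llm=FakeListLLM(responses=["summary"]),  # only used if the buffer needs summarizing
    max_token_limit=1024,
    return_messages=False,
)
memory.save_context(
    {"input": "How to cook a fish?"},
    {"response": "Season it, then grill it.<|im_end|>"},
)
print(memory.load_memory_variables({})["history"])
# -> "Human: How to cook a fish?\nAI: Season it, then grill it."  (end marker stripped)
```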
requirements-mac.txt
ADDED
@@ -0,0 +1,127 @@
+accelerate==0.26.1
+aiofiles==23.2.1
+aiohttp==3.9.1
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.2.0
+attrs==23.2.0
+black==24.1.0
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.0
+cycler==0.12.1
+dataclasses-json==0.6.3
+faiss-cpu==1.7.4
+fastapi==0.109.0
+ffmpy==0.3.1
+filelock==3.13.1
+fonttools==4.47.2
+frozenlist==1.4.1
+fsspec==2023.12.2
+gradio==4.16.0
+gradio_client==0.8.1
+greenlet==3.0.3
+h11==0.14.0
+httpcore==1.0.2
+httpx==0.26.0
+huggingface-hub==0.20.3
+idna==3.6
+importlib-resources==6.1.1
+InstructorEmbedding==1.0.1
+isort==5.13.2
+Jinja2==3.1.3
+joblib==1.3.2
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+langchain==0.1.4
+langchain-community==0.0.16
+langchain-core==0.1.16
+langsmith==0.0.83
+markdown-it-py==3.0.0
+MarkupSafe==2.1.4
+marshmallow==3.20.2
+matplotlib==3.8.2
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.4
+mypy-extensions==1.0.0
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.3
+# nvidia-cublas-cu12==12.1.3.1
+# nvidia-cuda-cupti-cu12==12.1.105
+# nvidia-cuda-nvrtc-cu12==12.1.105
+# nvidia-cuda-runtime-cu12==12.1.105
+# nvidia-cudnn-cu12==8.9.2.26
+# nvidia-cufft-cu12==11.0.2.54
+# nvidia-curand-cu12==10.3.2.106
+# nvidia-cusolver-cu12==11.4.5.107
+# nvidia-cusparse-cu12==12.1.0.106
+# nvidia-nccl-cu12==2.18.1
+# nvidia-nvjitlink-cu12==12.3.101
+# nvidia-nvtx-cu12==12.1.105
+orjson==3.9.12
+packaging==23.2
+pandas==2.2.0
+pathspec==0.12.1
+peft @ git+https://github.com/huggingface/peft.git@1c1c7fdaa6e6abaa53939b865dee1eded82ad032
+pillow==10.2.0
+platformdirs==4.1.0
+protobuf==4.25.2
+psutil==5.9.8
+pydantic==2.5.3
+pydantic_core==2.14.6
+pydub==0.25.1
+Pygments==2.17.2
+pyparsing==3.1.1
+python-dateutil==2.8.2
+python-dotenv==1.0.1
+python-multipart==0.0.6
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.1
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.17.1
+ruff==0.1.14
+safetensors==0.4.2
+scikit-learn==1.4.0
+scipy==1.12.0
+semantic-version==2.10.0
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.0
+SQLAlchemy==2.0.25
+starlette==0.35.1
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.2.0
+tokenizers==0.15.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.1.2
+torchvision==0.16.2
+tqdm==4.66.1
+transformers @ git+https://github.com/huggingface/transformers.git@de13a951b38b85195984164819f1ab05fe508677
+# triton==2.1.0
+typer==0.9.0
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.1.0
+uvicorn==0.27.0
+websockets==11.0.3
+yarl==1.9.4
+einops==0.7.0
+Pyarrow==15.0.0
+openpyxl==3.1.2
+tabulate==0.9.0
requirements.txt
CHANGED
@@ -1,38 +1,129 @@
+accelerate==0.26.1
+aiofiles==23.2.1
+aiohttp==3.9.1
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.2.0
+attrs==23.2.0
+black==24.1.0
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.0
+cycler==0.12.1
+dataclasses-json==0.6.3
+faiss-cpu==1.7.4
+fastapi==0.109.0
+ffmpy==0.3.1
+filelock==3.13.1
+fonttools==4.47.2
+frozenlist==1.4.1
+fsspec==2023.12.2
+gradio==4.16.0
+gradio_client==0.8.1
+greenlet==3.0.3
+h11==0.14.0
+httpcore==1.0.2
+httpx==0.26.0
+huggingface-hub==0.20.3
+idna==3.6
+importlib-resources==6.1.1
+InstructorEmbedding==1.0.1
+isort==5.13.2
+Jinja2==3.1.3
+joblib==1.3.2
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+langchain==0.1.4
+langchain-community==0.0.16
+langchain-openai==0.0.5
+langchain-core==0.1.16
+langsmith==0.0.83
+markdown-it-py==3.0.0
+MarkupSafe==2.1.4
+marshmallow==3.20.2
+matplotlib==3.8.2
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.4
+mypy-extensions==1.0.0
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.3
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.18.1
+nvidia-nvjitlink-cu12==12.3.101
+nvidia-nvtx-cu12==12.1.105
+orjson==3.9.12
+packaging==23.2
+pandas==2.2.0
+pathspec==0.12.1
+peft @ git+https://github.com/huggingface/peft.git@1c1c7fdaa6e6abaa53939b865dee1eded82ad032
+pillow==10.2.0
+platformdirs==4.1.0
+protobuf==4.25.2
+psutil==5.9.8
+pydantic==2.5.3
+pydantic_core==2.14.6
+pydub==0.25.1
+Pygments==2.17.2
+pyparsing==3.1.1
+python-dateutil==2.8.2
+python-dotenv==1.0.1
+python-multipart==0.0.6
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.1
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.17.1
+ruff==0.1.14
+safetensors==0.4.2
+scikit-learn==1.4.0
+scipy==1.12.0
+semantic-version==2.10.0
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.0
+SQLAlchemy==2.0.25
+starlette==0.35.1
+sympy==1.12
+tenacity==8.2.3
+threadpoolctl==3.2.0
+tokenizers==0.15.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.1.2
+torchvision==0.16.2
+tqdm==4.66.1
+transformers @ git+https://github.com/huggingface/transformers.git@de13a951b38b85195984164819f1ab05fe508677
+triton==2.1.0
+typer==0.9.0
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.1.0
+uvicorn==0.27.0
+websockets==11.0.3
+yarl==1.9.4
+einops==0.7.0
+Pyarrow==15.0.0
+openpyxl==3.1.2
+text_generation==0.6.1
+tabulate==0.9.0