dh-mc committed on
Commit
a28a4f8
1 Parent(s): 2f3b7d0

get latest code from orca-2 space

.env.example CHANGED
@@ -11,7 +11,7 @@ LLM_MODEL_TYPE=hftgi
 
 OPENLLM_SERVER_URL=http://localhost:64300
 
-HFTGI_SERVER_URL=https://enabled-factually-cougar.ngrok-free.app
+HFTGI_SERVER_URL=
 
 OPENAI_API_KEY=
 
@@ -28,6 +28,7 @@ HF_PIPELINE_DEVICE_TYPE=
 
 # USE_LLAMA_2_PROMPT_TEMPLATE=true
 DISABLE_MODEL_PRELOADING=true
+USER_CONVERSATION_SUMMARY_BUFFER_MEMORY=true
 CHAT_HISTORY_ENABLED=false
 SHOW_PARAM_SETTINGS=false
 SHARE_GRADIO_APP=false
@@ -47,15 +48,21 @@ USING_TORCH_BFLOAT16=true
 # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-12b"
 
 # LLM_MODEL_TYPE must be set to huggingface
+# HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-3b"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-7b"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-12b"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/wizardLM-7B-HF"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/vicuna-7B-1.1-HF"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-j"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-falcon"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="lmsys/fastchat-t5-3b-v1.0"
-HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-13b-chat-hf"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-70b-chat-hf"
-# HUGGINGFACE_MODEL_NAME_OR_PATH="Qwen/Qwen-7B-Chat"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-7b"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-13b"
+HUGGINGFACE_MODEL_NAME_OR_PATH="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="FlagAlpha/Llama2-Chinese-13b-Chat"
 
 STABLELM_MODEL_NAME_OR_PATH="OpenAssistant/stablelm-7b-sft-v7-epoch-3"
 
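For context, the new USER_CONVERSATION_SUMMARY_BUFFER_MEMORY flag is read at import time in app_modules/init.py (see that file's diff below). A minimal sketch of how such a flag is consumed, assuming the .env/.env.example file above is on the dotenv load path; nothing in this snippet is part of the commit:

# Minimal sketch: read the new flag the same way init.py does.
import os
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv(".env.example"), override=False)

if os.environ.get("USER_CONVERSATION_SUMMARY_BUFFER_MEMORY") == "true":
    print("QA chain will use a conversation summary buffer memory")
else:
    print("QA chain will run without summary memory")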
app.py CHANGED
@@ -1,5 +1,7 @@
1
  """Main entrypoint for the app."""
 
2
  import os
 
3
  import time
4
  from queue import Queue
5
  from timeit import default_timer as timer
@@ -13,14 +15,13 @@ from app_modules.utils import print_llm_response, remove_extra_spaces
13
 
14
  llm_loader, qa_chain = app_init()
15
 
16
- show_param_settings = os.environ.get("SHOW_PARAM_SETTINGS") == "true"
17
  share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
18
  using_openai = os.environ.get("LLM_MODEL_TYPE") == "openai"
19
- chat_with_llama_2 = (
20
- not using_openai and os.environ.get("USE_LLAMA_2_PROMPT_TEMPLATE") == "true"
21
  )
22
  chat_history_enabled = (
23
- not chat_with_llama_2 and os.environ.get("CHAT_HISTORY_ENABLED") == "true"
24
  )
25
 
26
  model = (
@@ -34,180 +35,97 @@ href = (
34
  else f"https://huggingface.co/{model}"
35
  )
36
 
37
- if chat_with_llama_2:
38
  qa_chain = ChatChain(llm_loader)
39
- name = "Llama-2"
40
  else:
41
- name = "AI Books"
42
-
43
- title = f"""<h1 align="left" style="min-width:200px; margin-top:0;"> Chat with {name} </h1>"""
44
-
45
- description_top = f"""\
46
  <div align="left">
47
  <p> Currently Running: <a href="{href}">{model}</a></p>
48
  </div>
49
  """
50
 
51
- description = """\
52
- <div align="center" style="margin:16px 0">
53
- The demo is built on <a href="https://github.com/hwchase17/langchain">LangChain</a>.
54
- </div>
55
- """
56
 
57
- CONCURRENT_COUNT = 1
58
 
 
 
59
 
60
- def qa(chatbot):
61
- user_msg = chatbot[-1][0]
62
  q = Queue()
63
  result = Queue()
64
- job_done = object()
65
-
66
- def task(question, chat_history):
67
- start = timer()
68
- inputs = {"question": question}
69
- if not chat_with_llama_2:
70
- inputs["chat_history"] = chat_history
71
- ret = qa_chain.call_chain(inputs, None, q)
72
- end = timer()
73
-
74
- print(f"Completed in {end - start:.3f}s")
75
- print_llm_response(ret)
76
-
77
- q.put(job_done)
78
- result.put(ret)
79
-
80
- with start_blocking_portal() as portal:
81
- chat_history = []
82
- if chat_history_enabled:
83
- for i in range(len(chatbot) - 1):
84
- element = chatbot[i]
85
- item = (element[0] or "", element[1] or "")
86
- chat_history.append(item)
87
-
88
- portal.start_task_soon(task, user_msg, chat_history)
89
-
90
- content = ""
91
- count = 2 if len(chat_history) > 0 else 1
92
-
93
- while count > 0:
94
- while q.empty():
95
- print("nothing generated yet - retry in 0.5s")
96
- time.sleep(0.5)
97
-
98
- for next_token in llm_loader.streamer:
99
- if next_token is job_done:
100
- break
101
- content += next_token or ""
102
- chatbot[-1][1] = remove_extra_spaces(content)
103
-
104
- if count == 1:
105
- yield chatbot
106
-
107
- count -= 1
108
-
109
- if not chat_with_llama_2:
110
- chatbot[-1][1] += "\n\nSources:\n"
111
- ret = result.get()
112
- titles = []
113
- for doc in ret["source_documents"]:
114
- page = doc.metadata["page"] + 1
115
- url = f"{doc.metadata['url']}#page={page}"
116
- file_name = doc.metadata["source"].split("/")[-1]
117
- title = f"{file_name} Page: {page}"
118
- if title not in titles:
119
- titles.append(title)
120
- chatbot[-1][1] += f"1. [{title}]({url})\n"
121
-
122
- yield chatbot
123
-
124
-
125
- with open("assets/custom.css", "r", encoding="utf-8") as f:
126
- customCSS = f.read()
127
-
128
- with gr.Blocks(css=customCSS) as demo:
129
- user_question = gr.State("")
130
- with gr.Row():
131
- gr.HTML(title)
132
- gr.Markdown(description_top)
133
- with gr.Row().style(equal_height=True):
134
- with gr.Column(scale=5):
135
- with gr.Row():
136
- chatbot = gr.Chatbot(elem_id="inflaton_chatbot").style(height="100%")
137
- with gr.Row():
138
- with gr.Column(scale=2):
139
- user_input = gr.Textbox(
140
- show_label=False, placeholder="Enter your question here"
141
- ).style(container=False)
142
- with gr.Column(
143
- min_width=70,
144
- ):
145
- submitBtn = gr.Button("Send")
146
- with gr.Column(
147
- min_width=70,
148
- ):
149
- clearBtn = gr.Button("Clear")
150
- if show_param_settings:
151
- with gr.Column():
152
- with gr.Column(
153
- min_width=50,
154
- ):
155
- with gr.Tab(label="Parameter Setting"):
156
- gr.Markdown("# Parameters")
157
- top_p = gr.Slider(
158
- minimum=-0,
159
- maximum=1.0,
160
- value=0.95,
161
- step=0.05,
162
- # interactive=True,
163
- label="Top-p",
164
- )
165
- temperature = gr.Slider(
166
- minimum=0.1,
167
- maximum=2.0,
168
- value=0,
169
- step=0.1,
170
- # interactive=True,
171
- label="Temperature",
172
- )
173
- max_new_tokens = gr.Slider(
174
- minimum=0,
175
- maximum=2048,
176
- value=2048,
177
- step=8,
178
- # interactive=True,
179
- label="Max Generation Tokens",
180
- )
181
- max_context_length_tokens = gr.Slider(
182
- minimum=0,
183
- maximum=4096,
184
- value=4096,
185
- step=128,
186
- # interactive=True,
187
- label="Max Context Tokens",
188
- )
189
- gr.Markdown(description)
190
-
191
- def chat(user_message, history):
192
- return "", history + [[user_message, None]]
193
-
194
- user_input.submit(
195
- chat, [user_input, chatbot], [user_input, chatbot], queue=True
196
- ).then(qa, chatbot, chatbot)
197
-
198
- submitBtn.click(
199
- chat, [user_input, chatbot], [user_input, chatbot], queue=True, api_name="chat"
200
- ).then(qa, chatbot, chatbot)
201
-
202
- def reset():
203
- return "", []
204
-
205
- clearBtn.click(
206
- reset,
207
- outputs=[user_input, chatbot],
208
- show_progress=True,
209
- api_name="reset",
210
- )
211
-
212
- demo.title = "Chat with AI Books" if chat_with_llama_2 else "Chat with Llama-2"
213
- demo.queue(concurrency_count=CONCURRENT_COUNT).launch(share=share_gradio_app)
 
1
  """Main entrypoint for the app."""
2
+
3
  import os
4
+ from threading import Thread
5
  import time
6
  from queue import Queue
7
  from timeit import default_timer as timer
 
15
 
16
  llm_loader, qa_chain = app_init()
17
 
 
18
  share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
19
  using_openai = os.environ.get("LLM_MODEL_TYPE") == "openai"
20
+ chat_with_orca_2 = (
21
+ not using_openai and os.environ.get("USE_ORCA_2_PROMPT_TEMPLATE") == "true"
22
  )
23
  chat_history_enabled = (
24
+ not chat_with_orca_2 and os.environ.get("CHAT_HISTORY_ENABLED") == "true"
25
  )
26
 
27
  model = (
 
35
  else f"https://huggingface.co/{model}"
36
  )
37
 
38
+ if chat_with_orca_2:
39
  qa_chain = ChatChain(llm_loader)
40
+ name = "Orca-2"
41
  else:
42
+ name = "PCI DSS v4"
43
+
44
+ title = f"Chat with {name}"
45
+ examples = (
46
+ ["How to cook a fish?", "Who is the president of US now?"]
47
+ if chat_with_orca_2
48
+ else [
49
+ "What's PCI DSS?",
50
+ "Can you summarize the changes made from PCI DSS version 3.2.1 to version 4.0?",
51
+ ]
52
+ )
53
+ description = f"""\
54
  <div align="left">
55
  <p> Currently Running: <a href="{href}">{model}</a></p>
56
  </div>
57
  """
58
 
 
 
 
 
 
59
 
60
+ def task(question, chat_history, q, result):
61
+ start = timer()
62
+ inputs = {"question": question, "chat_history": chat_history}
63
+ ret = qa_chain.call_chain(inputs, None, q)
64
+ end = timer()
65
+
66
+ print(f"Completed in {end - start:.3f}s")
67
+ print_llm_response(ret)
68
+
69
+ result.put(ret)
70
+
71
+
72
+ def predict(message, history):
73
+ print("predict:", message, history)
74
+
75
+ chat_history = []
76
+ if chat_history_enabled:
77
+ for element in history:
78
+ item = (element[0] or "", element[1] or "")
79
+ chat_history.append(item)
80
 
81
+ if not chat_history:
82
+ qa_chain.reset()
83
 
 
 
84
  q = Queue()
85
  result = Queue()
86
+ t = Thread(target=task, args=(message, chat_history, q, result))
87
+ t.start() # Starting the generation in a separate thread.
88
+
89
+ partial_message = ""
90
+ count = 2 if len(chat_history) > 0 else 1
91
+
92
+ while count > 0:
93
+ while q.empty():
94
+ print("nothing generated yet - retry in 0.5s")
95
+ time.sleep(0.5)
96
+
97
+ for next_token in llm_loader.streamer:
98
+ partial_message += next_token or ""
99
+ # partial_message = remove_extra_spaces(partial_message)
100
+ yield partial_message
101
+
102
+ if count == 2:
103
+ partial_message += "\n\n"
104
+
105
+ count -= 1
106
+
107
+ if not chat_with_orca_2:
108
+ partial_message += "\n\nSources:\n"
109
+ ret = result.get()
110
+ titles = []
111
+ for doc in ret["source_documents"]:
112
+ page = doc.metadata["page"] + 1
113
+ url = f"{doc.metadata['url']}#page={page}"
114
+ file_name = doc.metadata["source"].split("/")[-1]
115
+ title = f"{file_name} Page: {page}"
116
+ if title not in titles:
117
+ titles.append(title)
118
+ partial_message += f"1. [{title}]({url})\n"
119
+
120
+ yield partial_message
121
+
122
+
123
+ # Setting up the Gradio chat interface.
124
+ gr.ChatInterface(
125
+ predict,
126
+ title=title,
127
+ description=description,
128
+ examples=examples,
129
+ ).launch(
130
+ share=share_gradio_app
131
+ ) # Launching the web interface.
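For reference, the streaming pattern introduced above (a worker thread pushes tokens into a queue while predict() yields growing partial messages to gr.ChatInterface) can be reduced to a minimal, self-contained sketch. The fake_token_stream generator below is a stand-in for llm_loader.streamer and is not part of the commit:

# Minimal sketch of the thread + queue streaming pattern used in the new app.py.
import time
from queue import Queue
from threading import Thread

import gradio as gr


def fake_token_stream(message):
    # Stand-in for llm_loader.streamer: yields tokens one by one.
    for word in f"Echo: {message}".split():
        time.sleep(0.1)
        yield word + " "


def predict(message, history):
    q = Queue()

    def worker():
        for token in fake_token_stream(message):
            q.put(token)
        q.put(None)  # sentinel marking the end of generation

    Thread(target=worker).start()  # generation runs in a separate thread

    partial = ""
    while True:
        token = q.get()
        if token is None:
            break
        partial += token
        yield partial  # Gradio re-renders the bot message on each yield


gr.ChatInterface(predict, title="Streaming demo").launch()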
app_modules/init.py CHANGED
@@ -1,82 +1,92 @@
1
- """Main entrypoint for the app."""
2
- import os
3
- from timeit import default_timer as timer
4
- from typing import List, Optional
5
-
6
- from dotenv import find_dotenv, load_dotenv
7
- from langchain.embeddings import HuggingFaceInstructEmbeddings
8
- from langchain.vectorstores.chroma import Chroma
9
- from langchain.vectorstores.faiss import FAISS
10
-
11
- from app_modules.llm_loader import LLMLoader
12
- from app_modules.llm_qa_chain import QAChain
13
- from app_modules.utils import get_device_types, init_settings
14
-
15
- found_dotenv = find_dotenv(".env")
16
-
17
- if len(found_dotenv) == 0:
18
- found_dotenv = find_dotenv(".env.example")
19
- print(f"loading env vars from: {found_dotenv}")
20
- load_dotenv(found_dotenv, override=False)
21
-
22
- # Constants
23
- init_settings()
24
-
25
-
26
- def app_init(initQAChain: bool = True):
27
- # https://github.com/huggingface/transformers/issues/17611
28
- os.environ["CURL_CA_BUNDLE"] = ""
29
-
30
- llm_model_type = os.environ.get("LLM_MODEL_TYPE")
31
- n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
32
-
33
- hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
34
- print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
35
- print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
36
-
37
- if initQAChain:
38
- hf_embeddings_model_name = (
39
- os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
40
- )
41
-
42
- index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get(
43
- "CHROMADB_INDEX_PATH"
44
- )
45
- using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
46
-
47
- start = timer()
48
- embeddings = HuggingFaceInstructEmbeddings(
49
- model_name=hf_embeddings_model_name,
50
- model_kwargs={"device": hf_embeddings_device_type},
51
- )
52
- end = timer()
53
-
54
- print(f"Completed in {end - start:.3f}s")
55
-
56
- start = timer()
57
-
58
- print(
59
- f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}"
60
- )
61
-
62
- if not os.path.isdir(index_path):
63
- raise ValueError(f"{index_path} does not exist!")
64
- elif using_faiss:
65
- vectorstore = FAISS.load_local(index_path, embeddings)
66
- else:
67
- vectorstore = Chroma(
68
- embedding_function=embeddings, persist_directory=index_path
69
- )
70
-
71
- end = timer()
72
-
73
- print(f"Completed in {end - start:.3f}s")
74
-
75
- start = timer()
76
- llm_loader = LLMLoader(llm_model_type)
77
- llm_loader.init(n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type)
78
- qa_chain = QAChain(vectorstore, llm_loader) if initQAChain else None
79
- end = timer()
80
- print(f"Completed in {end - start:.3f}s")
81
-
82
- return llm_loader, qa_chain
1
+ """Main entrypoint for the app."""
2
+
3
+ import os
4
+ from timeit import default_timer as timer
5
+ from typing import List, Optional
6
+
7
+ from dotenv import find_dotenv, load_dotenv
8
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
9
+ from langchain.vectorstores.chroma import Chroma
10
+ from langchain.vectorstores.faiss import FAISS
11
+
12
+ from app_modules.llm_loader import LLMLoader
13
+ from app_modules.utils import get_device_types, init_settings
14
+
15
+ found_dotenv = find_dotenv(".env")
16
+
17
+ if len(found_dotenv) == 0:
18
+ found_dotenv = find_dotenv(".env.example")
19
+ print(f"loading env vars from: {found_dotenv}")
20
+ load_dotenv(found_dotenv, override=False)
21
+
22
+ # Constants
23
+ init_settings()
24
+
25
+ if os.environ.get("LANGCHAIN_DEBUG") == "true":
26
+ import langchain
27
+
28
+ langchain.debug = True
29
+
30
+ if os.environ.get("USER_CONVERSATION_SUMMARY_BUFFER_MEMORY") == "true":
31
+ from app_modules.llm_qa_chain_with_memory import QAChain
32
+
33
+ print("using llm_qa_chain_with_memory")
34
+ else:
35
+ from app_modules.llm_qa_chain import QAChain
36
+
37
+ print("using llm_qa_chain")
38
+
39
+
40
+ def app_init():
41
+ # https://github.com/huggingface/transformers/issues/17611
42
+ os.environ["CURL_CA_BUNDLE"] = ""
43
+
44
+ hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
45
+ print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
46
+ print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
47
+
48
+ hf_embeddings_model_name = (
49
+ os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
50
+ )
51
+
52
+ n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
53
+ index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get(
54
+ "CHROMADB_INDEX_PATH"
55
+ )
56
+ using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
57
+ llm_model_type = os.environ.get("LLM_MODEL_TYPE")
58
+
59
+ start = timer()
60
+ embeddings = HuggingFaceInstructEmbeddings(
61
+ model_name=hf_embeddings_model_name,
62
+ model_kwargs={"device": hf_embeddings_device_type},
63
+ )
64
+ end = timer()
65
+
66
+ print(f"Completed in {end - start:.3f}s")
67
+
68
+ start = timer()
69
+
70
+ print(f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}")
71
+
72
+ if not os.path.isdir(index_path):
73
+ raise ValueError(f"{index_path} does not exist!")
74
+ elif using_faiss:
75
+ vectorstore = FAISS.load_local(index_path, embeddings)
76
+ else:
77
+ vectorstore = Chroma(
78
+ embedding_function=embeddings, persist_directory=index_path
79
+ )
80
+
81
+ end = timer()
82
+
83
+ print(f"Completed in {end - start:.3f}s")
84
+
85
+ start = timer()
86
+ llm_loader = LLMLoader(llm_model_type)
87
+ llm_loader.init(n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type)
88
+ qa_chain = QAChain(vectorstore, llm_loader)
89
+ end = timer()
90
+ print(f"Completed in {end - start:.3f}s")
91
+
92
+ return llm_loader, qa_chain
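A minimal usage sketch of the refactored app_init(), assuming a .env with the variables shown earlier and a FAISS or Chroma index on disk; the caller below is hypothetical, but the printed attributes exist on LLMLoader per the code in this commit:

# Hypothetical caller of app_init(); nothing here is part of the commit.
from app_modules.init import app_init

llm_loader, qa_chain = app_init()   # loads embeddings, vector store and the LLM
print(type(qa_chain).__name__)      # QAChain, with or without summary memory
print(llm_loader.llm_model_type)    # e.g. "huggingface" or "hftgi"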
app_modules/llm_chat_chain.py CHANGED
@@ -1,11 +1,13 @@
1
  import os
2
  from typing import List, Optional
3
 
4
- from langchain import ConversationChain, PromptTemplate
 
5
  from langchain.chains.base import Chain
6
  from langchain.memory import ConversationSummaryBufferMemory
7
 
8
  from app_modules.llm_inference import LLMInference
 
9
 
10
 
11
  def get_llama_2_prompt_template():
@@ -23,6 +25,13 @@ def get_llama_2_prompt_template():
23
  return prompt_template
24
 
25
 
 
 
 
 
 
 
 
26
  class ChatChain(LLMInference):
27
  def __init__(self, llm_loader):
28
  super().__init__(llm_loader)
@@ -31,28 +40,31 @@ class ChatChain(LLMInference):
31
  template = (
32
  get_llama_2_prompt_template()
33
  if os.environ.get("USE_LLAMA_2_PROMPT_TEMPLATE") == "true"
34
- else """You are a chatbot having a conversation with a human.
 
 
 
35
  {history}
36
  Human: {input}
37
  Chatbot:"""
 
38
  )
39
 
40
  print(f"template: {template}")
41
 
42
  prompt = PromptTemplate(input_variables=["history", "input"], template=template)
43
-
44
- memory = ConversationSummaryBufferMemory(
45
- llm=self.llm_loader.llm, max_token_limit=1024, return_messages=True
46
  )
47
 
48
  llm_chain = ConversationChain(
49
  llm=self.llm_loader.llm,
50
  prompt=prompt,
51
- verbose=True,
52
  memory=memory,
53
  )
54
 
55
  return llm_chain
56
 
57
  def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
58
- return chain({"input": inputs["question"]}, callbacks)
 
1
  import os
2
  from typing import List, Optional
3
 
4
+ from langchain.chains import ConversationChain, LLMChain
5
+ from langchain.prompts import PromptTemplate
6
  from langchain.chains.base import Chain
7
  from langchain.memory import ConversationSummaryBufferMemory
8
 
9
  from app_modules.llm_inference import LLMInference
10
+ from app_modules.utils import CustomizedConversationSummaryBufferMemory
11
 
12
 
13
  def get_llama_2_prompt_template():
 
25
  return prompt_template
26
 
27
 
28
+ def get_orca_2_prompt_template():
29
+ system_message = "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
30
+ user_message = "Chat History:\n\n{history} \n\nUser: {input}"
31
+ prompt_template = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
32
+ return prompt_template
33
+
34
+
35
  class ChatChain(LLMInference):
36
  def __init__(self, llm_loader):
37
  super().__init__(llm_loader)
 
40
  template = (
41
  get_llama_2_prompt_template()
42
  if os.environ.get("USE_LLAMA_2_PROMPT_TEMPLATE") == "true"
43
+ else (
44
+ get_orca_2_prompt_template()
45
+ if os.environ.get("USE_ORCA_2_PROMPT_TEMPLATE") == "true"
46
+ else """You are a chatbot having a conversation with a human.
47
  {history}
48
  Human: {input}
49
  Chatbot:"""
50
+ )
51
  )
52
 
53
  print(f"template: {template}")
54
 
55
  prompt = PromptTemplate(input_variables=["history", "input"], template=template)
56
+ memory = CustomizedConversationSummaryBufferMemory(
57
+ llm=self.llm_loader.llm, max_token_limit=1024, return_messages=False
 
58
  )
59
 
60
  llm_chain = ConversationChain(
61
  llm=self.llm_loader.llm,
62
  prompt=prompt,
63
+ verbose=False,
64
  memory=memory,
65
  )
66
 
67
  return llm_chain
68
 
69
  def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
70
+ return super().run_chain(chain, {"input": inputs["question"]}, callbacks)
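To see what the new Orca-2 template looks like once LangChain substitutes {history} and {input}, here is a standalone sketch that restates the template string rather than importing the module; the sample history and question are made up:

# Standalone sketch: render the Orca-2 ChatML-style prompt with sample values.
system_message = (
    "You are Orca, an AI language model created by Microsoft. You are a cautious "
    "assistant. You carefully follow instructions. You are helpful and harmless and "
    "you follow ethical guidelines and promote positive behavior."
)
user_message = "Chat History:\n\n{history} \n\nUser: {input}"
template = (
    f"<|im_start|>system\n{system_message}<|im_end|>\n"
    f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
)

print(template.format(history="Human: Hi\nAI: Hello!", input="How to cook a fish?"))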
app_modules/llm_inference.py CHANGED
@@ -5,6 +5,7 @@ import urllib
5
  from queue import Queue
6
  from threading import Thread
7
  from typing import List, Optional
 
8
 
9
  from langchain.chains.base import Chain
10
 
@@ -13,9 +14,6 @@ from app_modules.utils import remove_extra_spaces
13
 
14
 
15
  class LLMInference(metaclass=abc.ABCMeta):
16
- llm_loader: LLMLoader
17
- chain: Chain
18
-
19
  def __init__(self, llm_loader):
20
  self.llm_loader = llm_loader
21
  self.chain = None
@@ -30,8 +28,15 @@ class LLMInference(metaclass=abc.ABCMeta):
30
 
31
  return self.chain
32
 
 
 
 
33
  def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
34
- return chain(inputs, callbacks)
 
 
 
 
35
 
36
  def call_chain(
37
  self,
@@ -59,6 +64,7 @@ class LLMInference(metaclass=abc.ABCMeta):
59
  if "answer" in result:
60
  result["answer"] = remove_extra_spaces(result["answer"])
61
 
 
62
  base_url = os.environ.get("PDF_FILE_BASE_URL")
63
  if base_url is not None and len(base_url) > 0:
64
  documents = result["source_documents"]
@@ -66,6 +72,30 @@ class LLMInference(metaclass=abc.ABCMeta):
66
  source = doc.metadata["source"]
67
  title = source.split("/")[-1]
68
  doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
69
 
70
  return result
71
  finally:
 
5
  from queue import Queue
6
  from threading import Thread
7
  from typing import List, Optional
8
+ from urllib.parse import quote, urlparse, urlunparse
9
 
10
  from langchain.chains.base import Chain
11
 
 
14
 
15
 
16
  class LLMInference(metaclass=abc.ABCMeta):
 
 
 
17
  def __init__(self, llm_loader):
18
  self.llm_loader = llm_loader
19
  self.chain = None
 
28
 
29
  return self.chain
30
 
31
+ def reset(self) -> None:
32
+ self.chain = None
33
+
34
  def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
35
+ result = chain.invoke(inputs, {"callbacks": callbacks})
36
+ if "text" in result:
37
+ result["response"] = result["text"]
38
+ del result["text"]
39
+ return result
40
 
41
  def call_chain(
42
  self,
 
64
  if "answer" in result:
65
  result["answer"] = remove_extra_spaces(result["answer"])
66
 
67
+ source_path = os.environ.get("SOURCE_PATH")
68
  base_url = os.environ.get("PDF_FILE_BASE_URL")
69
  if base_url is not None and len(base_url) > 0:
70
  documents = result["source_documents"]
 
72
  source = doc.metadata["source"]
73
  title = source.split("/")[-1]
74
  doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
75
+ elif source_path is not None and len(source_path) > 0:
76
+ documents = result["source_documents"]
77
+ for doc in documents:
78
+ source = doc.metadata["source"]
79
+ url = source.replace(source_path, "https://")
80
+ url = url.replace(".html", "")
81
+ parsed_url = urlparse(url)
82
+
83
+ # Encode path, query, and fragment
84
+ encoded_path = quote(parsed_url.path)
85
+ encoded_query = quote(parsed_url.query)
86
+ encoded_fragment = quote(parsed_url.fragment)
87
+
88
+ # Construct the encoded URL
89
+ doc.metadata["url"] = urlunparse(
90
+ (
91
+ parsed_url.scheme,
92
+ parsed_url.netloc,
93
+ encoded_path,
94
+ parsed_url.params,
95
+ encoded_query,
96
+ encoded_fragment,
97
+ )
98
+ )
99
 
100
  return result
101
  finally:
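The new SOURCE_PATH branch above rewrites a local document path into an encoded https URL. A standalone sketch of that transformation; the source_path and source values are made up for illustration:

# Standalone sketch of the SOURCE_PATH -> URL rewriting added in call_chain().
from urllib.parse import quote, urlparse, urlunparse

source_path = "data/docs/"  # hypothetical SOURCE_PATH value
source = "data/docs/pcisecuritystandards.org/glossary a b.html"  # hypothetical doc.metadata["source"]

url = source.replace(source_path, "https://").replace(".html", "")
parsed = urlparse(url)

encoded = urlunparse(
    (
        parsed.scheme,
        parsed.netloc,
        quote(parsed.path),      # encode the path
        parsed.params,
        quote(parsed.query),     # encode the query
        quote(parsed.fragment),  # encode the fragment
    )
)
print(encoded)  # https://pcisecuritystandards.org/glossary%20a%20b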
app_modules/llm_loader.py CHANGED
@@ -5,17 +5,18 @@ from queue import Queue
5
  from typing import Any, Optional
6
 
7
  import torch
8
- from langchain import HuggingFaceTextGenInference
9
  from langchain.callbacks.base import BaseCallbackHandler
10
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
11
- from langchain.chat_models import ChatOpenAI
12
- from langchain.llms import (
 
 
13
  CTransformers,
14
  GPT4All,
15
  HuggingFacePipeline,
16
  LlamaCpp,
17
- OpenLLM,
18
  )
 
19
  from langchain.schema import LLMResult
20
  from transformers import (
21
  AutoConfig,
@@ -30,7 +31,6 @@ from transformers import (
30
  pipeline,
31
  )
32
 
33
- from app_modules.instruct_pipeline import InstructionTextGenerationPipeline
34
  from app_modules.utils import ensure_model_is_downloaded
35
 
36
 
@@ -49,6 +49,7 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
49
  self.timeout = timeout
50
  self.total_tokens = 0
51
  self.for_huggingface = for_huggingface
 
52
 
53
  def on_finalized_text(self, text: str, stream_end: bool = False):
54
  super().on_finalized_text(text, stream_end=stream_end)
@@ -61,11 +62,23 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
61
  self.text_queue.put("\n", timeout=self.timeout)
62
  self.text_queue.put(self.stop_signal, timeout=self.timeout)
63
 
 
 
 
 
 
 
 
 
 
 
64
  def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
65
- sys.stdout.write(token)
66
- sys.stdout.flush()
67
- self.text_queue.put(token, timeout=self.timeout)
68
- self.total_tokens = self.total_tokens + 1
 
 
69
 
70
  def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
71
  print("\n")
@@ -85,18 +98,13 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
85
  def reset(self, q: Queue = None):
86
  # print("resetting TextIteratorStreamer")
87
  self.text_queue = q if q is not None else Queue()
 
88
 
89
  def empty(self):
90
  return self.text_queue.empty()
91
 
92
 
93
  class LLMLoader:
94
- llm_model_type: str
95
- llm: any
96
- streamer: any
97
- max_tokens_limit: int
98
- lock: any
99
-
100
  def __init__(self, llm_model_type):
101
  self.llm_model_type = llm_model_type
102
  self.llm = None
@@ -129,9 +137,11 @@ class LLMLoader:
129
  hf_pipeline_device_type = "cpu"
130
 
131
  using_cuda = hf_pipeline_device_type.startswith("cuda")
132
- torch_dtype = torch.float16 if using_cuda else torch.float32
133
- if os.environ.get("USING_TORCH_BFLOAT16") == "true":
 
134
  torch_dtype = torch.bfloat16
 
135
  load_quantized_model = os.environ.get("LOAD_QUANTIZED_MODEL")
136
 
137
  print(f" hf_pipeline_device_type: {hf_pipeline_device_type}")
@@ -139,6 +149,8 @@ class LLMLoader:
139
  print(f" torch_dtype: {torch_dtype}")
140
  print(f" n_threds: {n_threds}")
141
 
 
 
142
  double_quant_config = BitsAndBytesConfig(
143
  load_in_4bit=load_quantized_model == "4bit",
144
  bnb_4bit_use_double_quant=load_quantized_model == "4bit",
@@ -156,20 +168,22 @@ class LLMLoader:
156
  if self.llm_model_type == "openai":
157
  MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-3.5-turbo"
158
  print(f" using model: {MODEL_NAME}")
159
- self.llm = ChatOpenAI(
160
- model_name=MODEL_NAME,
161
- streaming=True,
162
- callbacks=callbacks,
163
- verbose=True,
164
- temperature=0,
165
- )
166
- elif self.llm_model_type == "openllm":
167
- server_url = os.environ.get("OPENLLM_SERVER_URL")
168
- print(f" server url: {server_url}")
169
- self.llm = OpenLLM(
170
- server_url=server_url,
171
- # callbacks=callbacks,
172
- verbose=True,
 
 
173
  )
174
  elif self.llm_model_type.startswith("gpt4all"):
175
  MODEL_PATH = ensure_model_is_downloaded(self.llm_model_type)
@@ -209,6 +223,9 @@ class LLMLoader:
209
  )
210
  elif self.llm_model_type == "hftgi":
211
  HFTGI_SERVER_URL = os.environ.get("HFTGI_SERVER_URL")
 
 
 
212
  self.max_tokens_limit = 4096
213
  self.llm = HuggingFaceTextGenInference(
214
  inference_server_url=HFTGI_SERVER_URL,
@@ -217,11 +234,20 @@ class LLMLoader:
217
  top_p=0.95,
218
  # typical_p=0.95,
219
  temperature=0.01,
220
- repetition_penalty=1.12,
221
  callbacks=callbacks,
222
  timeout=600,
223
  streaming=True,
224
  )
 
 
 
 
 
 
 
 
 
225
  elif self.llm_model_type.startswith("huggingface"):
226
  MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
227
  print(f" loading model: {MODEL_NAME_OR_PATH}")
@@ -243,6 +269,27 @@ class LLMLoader:
243
 
244
  if "Llama-2" in MODEL_NAME_OR_PATH:
245
  self.max_tokens_limit = 4096
246
 
247
  is_t5 = "t5" in MODEL_NAME_OR_PATH
248
  temperature = (
@@ -250,7 +297,9 @@ class LLMLoader:
250
  if "gpt4all-j" in MODEL_NAME_OR_PATH
251
  or "dolly" in MODEL_NAME_OR_PATH
252
  or "Qwen" in MODEL_NAME_OR_PATH
253
- or "Llama-2" in MODEL_NAME_OR_PATH
 
 
254
  else 0
255
  )
256
  use_fast = (
@@ -314,6 +363,11 @@ class LLMLoader:
314
  else (1.25 if "dolly" in MODEL_NAME_OR_PATH else 1.1)
315
  )
316
 
 
 
 
 
 
317
  if load_quantized_model is not None:
318
  model = (
319
  AutoModelForSeq2SeqLM.from_pretrained(
@@ -342,71 +396,40 @@ class LLMLoader:
342
  pad_token_id = eos_token_id
343
 
344
  pipe = (
345
- InstructionTextGenerationPipeline(
346
- task=task,
347
  model=model,
348
  tokenizer=tokenizer,
 
 
349
  streamer=self.streamer,
350
- max_new_tokens=2048,
351
- temperature=temperature,
352
  return_full_text=return_full_text, # langchain expects the full text
 
 
 
 
 
 
 
353
  repetition_penalty=repetition_penalty,
354
  )
355
- if "dolly" in MODEL_NAME_OR_PATH
356
- else (
357
- pipeline(
358
- task,
359
- model=model,
360
- tokenizer=tokenizer,
361
- eos_token_id=eos_token_id,
362
- pad_token_id=pad_token_id,
363
- streamer=self.streamer,
364
- return_full_text=return_full_text, # langchain expects the full text
365
- device_map="auto",
366
- trust_remote_code=True,
367
- max_new_tokens=2048,
368
- do_sample=True,
369
- temperature=0.01,
370
- top_p=0.95,
371
- top_k=50,
372
- repetition_penalty=repetition_penalty,
373
- )
374
- if eos_token_id != -1
375
- else pipeline(
376
- task,
377
- model=model,
378
- tokenizer=tokenizer,
379
- streamer=self.streamer,
380
- return_full_text=return_full_text, # langchain expects the full text
381
- device_map="auto",
382
- trust_remote_code=True,
383
- max_new_tokens=2048,
384
- # verbose=True,
385
- temperature=temperature,
386
- top_p=0.95,
387
- top_k=0, # select from top 0 tokens (because zero, relies on top_p)
388
- repetition_penalty=repetition_penalty,
389
- )
390
  )
391
  )
392
- elif "dolly" in MODEL_NAME_OR_PATH:
393
- model = AutoModelForCausalLM.from_pretrained(
394
- MODEL_NAME_OR_PATH,
395
- device_map=hf_pipeline_device_type,
396
- torch_dtype=torch_dtype,
397
- )
398
-
399
- pipe = InstructionTextGenerationPipeline(
400
- task=task,
401
- model=model,
402
- tokenizer=tokenizer,
403
- streamer=self.streamer,
404
- max_new_tokens=2048,
405
- temperature=temperature,
406
- return_full_text=True,
407
- repetition_penalty=repetition_penalty,
408
- token=token,
409
- )
410
  else:
411
  if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
412
  model = (
@@ -456,10 +479,11 @@ class LLMLoader:
456
  torch_dtype=torch_dtype,
457
  max_new_tokens=2048,
458
  trust_remote_code=True,
 
459
  temperature=temperature,
460
  top_p=0.95,
461
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
462
- repetition_penalty=1.115,
463
  )
464
  if token is None
465
  else pipeline(
@@ -475,11 +499,12 @@ class LLMLoader:
475
  temperature=temperature,
476
  top_p=0.95,
477
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
478
- repetition_penalty=1.115,
479
  token=token,
480
  )
481
  )
482
 
 
483
  self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
484
  elif self.llm_model_type == "mosaicml":
485
  MODEL_NAME_OR_PATH = os.environ.get("MOSAICML_MODEL_NAME_OR_PATH")
@@ -534,11 +559,13 @@ class LLMLoader:
534
 
535
  max_new_tokens = 8192 if "30b" in MODEL_NAME_OR_PATH else 2048
536
  self.max_tokens_limit = max_new_tokens
537
- self.search_kwargs = (
538
- {"k": 8} if "30b" in MODEL_NAME_OR_PATH else self.search_kwargs
539
- )
540
  repetition_penalty = 1.05 if "30b" in MODEL_NAME_OR_PATH else 1.02
541
 
 
 
 
 
 
542
  pipe = (
543
  pipeline(
544
  model=model,
@@ -549,7 +576,8 @@ class LLMLoader:
549
  device_map="auto",
550
  # we pass model parameters here too
551
  stopping_criteria=stopping_criteria, # without this model will ramble
552
- temperature=0, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
 
553
  top_p=0.95, # select from top tokens whose probability add up to 15%
554
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
555
  max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
@@ -565,7 +593,8 @@ class LLMLoader:
565
  device=config.init_device,
566
  # we pass model parameters here too
567
  stopping_criteria=stopping_criteria, # without this model will ramble
568
- temperature=0, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
 
569
  top_p=0.95, # select from top tokens whose probability add up to 15%
570
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
571
  max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
@@ -584,6 +613,13 @@ class LLMLoader:
584
  # config.max_seq_len = 4096
585
  config.init_device = hf_pipeline_device_type
586
 
 
 
 
 
 
 
 
587
  model = (
588
  AutoModelForCausalLM.from_pretrained(
589
  MODEL_NAME_OR_PATH,
@@ -635,7 +671,7 @@ class LLMLoader:
635
  top_p=0.95, # select from top tokens whose probability add up to 15%
636
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
637
  max_new_tokens=2048, # mex number of tokens to generate in the output
638
- repetition_penalty=1.25, # without this output begins repeating
639
  )
640
  if load_quantized_model is not None
641
  else pipeline(
@@ -651,7 +687,7 @@ class LLMLoader:
651
  top_p=0.95, # select from top tokens whose probability add up to 15%
652
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
653
  max_new_tokens=2048, # mex number of tokens to generate in the output
654
- repetition_penalty=1.05, # without this output begins repeating
655
  )
656
  )
657
  self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
 
5
  from typing import Any, Optional
6
 
7
  import torch
 
8
  from langchain.callbacks.base import BaseCallbackHandler
9
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
10
+ from langchain_openai.chat_models import ChatOpenAI
11
+ from langchain_openai.llms import OpenAI
12
+ from langchain_community.llms import (
13
+ HuggingFaceTextGenInference,
14
  CTransformers,
15
  GPT4All,
16
  HuggingFacePipeline,
17
  LlamaCpp,
 
18
  )
19
+ from langchain_community.chat_models import ChatOllama
20
  from langchain.schema import LLMResult
21
  from transformers import (
22
  AutoConfig,
 
31
  pipeline,
32
  )
33
 
 
34
  from app_modules.utils import ensure_model_is_downloaded
35
 
36
 
 
49
  self.timeout = timeout
50
  self.total_tokens = 0
51
  self.for_huggingface = for_huggingface
52
+ self.end_token = ""
53
 
54
  def on_finalized_text(self, text: str, stream_end: bool = False):
55
  super().on_finalized_text(text, stream_end=stream_end)
 
62
  self.text_queue.put("\n", timeout=self.timeout)
63
  self.text_queue.put(self.stop_signal, timeout=self.timeout)
64
 
65
+ def check_end_token(self, token):
66
+ new_token = self.end_token + token
67
+ if "<|im_end|>".startswith(new_token):
68
+ self.end_token = "" if new_token == "<|im_end|>" else new_token
69
+ return None
70
+ elif self.end_token != "":
71
+ self.end_token = ""
72
+
73
+ return new_token
74
+
75
  def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
76
+ token = self.check_end_token(token)
77
+ if token:
78
+ sys.stdout.write(token)
79
+ sys.stdout.flush()
80
+ self.text_queue.put(token, timeout=self.timeout)
81
+ self.total_tokens = self.total_tokens + 1
82
 
83
  def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
84
  print("\n")
 
98
  def reset(self, q: Queue = None):
99
  # print("resetting TextIteratorStreamer")
100
  self.text_queue = q if q is not None else Queue()
101
+ self.end_token = ""
102
 
103
  def empty(self):
104
  return self.text_queue.empty()
105
 
106
 
107
  class LLMLoader:
 
 
 
 
 
 
108
  def __init__(self, llm_model_type):
109
  self.llm_model_type = llm_model_type
110
  self.llm = None
 
137
  hf_pipeline_device_type = "cpu"
138
 
139
  using_cuda = hf_pipeline_device_type.startswith("cuda")
140
+ using_mps = hf_pipeline_device_type.startswith("mps")
141
+ torch_dtype = torch.float16 if using_cuda or using_mps else torch.float32
142
+ if not using_mps and os.environ.get("USING_TORCH_BFLOAT16") == "true":
143
  torch_dtype = torch.bfloat16
144
+
145
  load_quantized_model = os.environ.get("LOAD_QUANTIZED_MODEL")
146
 
147
  print(f" hf_pipeline_device_type: {hf_pipeline_device_type}")
 
149
  print(f" torch_dtype: {torch_dtype}")
150
  print(f" n_threds: {n_threds}")
151
 
152
+ torch.set_default_dtype(torch_dtype)
153
+
154
  double_quant_config = BitsAndBytesConfig(
155
  load_in_4bit=load_quantized_model == "4bit",
156
  bnb_4bit_use_double_quant=load_quantized_model == "4bit",
 
168
  if self.llm_model_type == "openai":
169
  MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-3.5-turbo"
170
  print(f" using model: {MODEL_NAME}")
171
+ self.llm = (
172
+ OpenAI(
173
+ model_name=MODEL_NAME,
174
+ streaming=True,
175
+ callbacks=callbacks,
176
+ verbose=True,
177
+ temperature=0,
178
+ )
179
+ if "instruct" in MODEL_NAME
180
+ else ChatOpenAI(
181
+ model_name=MODEL_NAME,
182
+ streaming=True,
183
+ callbacks=callbacks,
184
+ verbose=True,
185
+ temperature=0,
186
+ )
187
  )
188
  elif self.llm_model_type.startswith("gpt4all"):
189
  MODEL_PATH = ensure_model_is_downloaded(self.llm_model_type)
 
223
  )
224
  elif self.llm_model_type == "hftgi":
225
  HFTGI_SERVER_URL = os.environ.get("HFTGI_SERVER_URL")
226
+ HFTGI_RP = os.environ.get("HFTGI_RP")
227
+ repetition_penalty = 1.120 if HFTGI_RP is None else float(HFTGI_RP)
228
+ print(f" repetition_penalty: {repetition_penalty}")
229
  self.max_tokens_limit = 4096
230
  self.llm = HuggingFaceTextGenInference(
231
  inference_server_url=HFTGI_SERVER_URL,
 
234
  top_p=0.95,
235
  # typical_p=0.95,
236
  temperature=0.01,
237
+ repetition_penalty=repetition_penalty,
238
  callbacks=callbacks,
239
  timeout=600,
240
  streaming=True,
241
  )
242
+ elif self.llm_model_type == "ollama":
243
+ MODEL_NAME = os.environ.get("OLLAMA_MODEL_NAME") or "dolphin-phi"
244
+ print(f" loading model: {MODEL_NAME}")
245
+ self.llm = ChatOllama(
246
+ model=MODEL_NAME,
247
+ callbacks=callbacks,
248
+ temperature=0,
249
+ repeat_penalty=1.15,
250
+ )
251
  elif self.llm_model_type.startswith("huggingface"):
252
  MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
253
  print(f" loading model: {MODEL_NAME_OR_PATH}")
 
269
 
270
  if "Llama-2" in MODEL_NAME_OR_PATH:
271
  self.max_tokens_limit = 4096
272
+ elif "TinyLlama" in MODEL_NAME_OR_PATH:
273
+ self.max_tokens_limit = 1024
274
+
275
+ class StopOnTokens(StoppingCriteria):
276
+ def __call__(
277
+ self,
278
+ input_ids: torch.LongTensor,
279
+ scores: torch.FloatTensor,
280
+ **kwargs,
281
+ ) -> bool:
282
+ stop_ids = [
283
+ 2
284
+ ] # IDs of tokens where the generation should stop.
285
+ for stop_id in stop_ids:
286
+ if (
287
+ input_ids[0][-1] == stop_id
288
+ ): # Checking if the last generated token is a stop token.
289
+ return True
290
+ return False
291
+
292
+ stopping_criteria = StoppingCriteriaList([StopOnTokens()])
293
 
294
  is_t5 = "t5" in MODEL_NAME_OR_PATH
295
  temperature = (
 
297
  if "gpt4all-j" in MODEL_NAME_OR_PATH
298
  or "dolly" in MODEL_NAME_OR_PATH
299
  or "Qwen" in MODEL_NAME_OR_PATH
300
+ or "Llama" in MODEL_NAME_OR_PATH
301
+ or "Orca-2" in MODEL_NAME_OR_PATH
302
+ or "phi-2" in MODEL_NAME_OR_PATH
303
  else 0
304
  )
305
  use_fast = (
 
363
  else (1.25 if "dolly" in MODEL_NAME_OR_PATH else 1.1)
364
  )
365
 
366
+ HF_RP = os.environ.get("HF_RP")
367
+ if HF_RP is not None and len(HF_RP) > 0:
368
+ repetition_penalty = float(HF_RP)
369
+ print(f" repetition_penalty: {repetition_penalty}")
370
+
371
  if load_quantized_model is not None:
372
  model = (
373
  AutoModelForSeq2SeqLM.from_pretrained(
 
396
  pad_token_id = eos_token_id
397
 
398
  pipe = (
399
+ pipeline(
400
+ task,
401
  model=model,
402
  tokenizer=tokenizer,
403
+ eos_token_id=eos_token_id,
404
+ pad_token_id=pad_token_id,
405
  streamer=self.streamer,
 
 
406
  return_full_text=return_full_text, # langchain expects the full text
407
+ device_map="auto",
408
+ trust_remote_code=True,
409
+ max_new_tokens=2048,
410
+ do_sample=True,
411
+ temperature=0.01,
412
+ top_p=0.95,
413
+ top_k=50,
414
  repetition_penalty=repetition_penalty,
415
  )
416
+ if eos_token_id != -1
417
+ else pipeline(
418
+ task,
419
+ model=model,
420
+ tokenizer=tokenizer,
421
+ streamer=self.streamer,
422
+ return_full_text=return_full_text, # langchain expects the full text
423
+ device_map="auto",
424
+ trust_remote_code=True,
425
+ max_new_tokens=2048,
426
+ do_sample=True,
427
+ temperature=temperature,
428
+ top_p=0.95,
429
+ top_k=0, # select from top 0 tokens (because zero, relies on top_p)
430
+ repetition_penalty=repetition_penalty,
431
  )
432
  )
433
  else:
434
  if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
435
  model = (
 
479
  torch_dtype=torch_dtype,
480
  max_new_tokens=2048,
481
  trust_remote_code=True,
482
+ do_sample=True,
483
  temperature=temperature,
484
  top_p=0.95,
485
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
486
+ repetition_penalty=repetition_penalty,
487
  )
488
  if token is None
489
  else pipeline(
 
499
  temperature=temperature,
500
  top_p=0.95,
501
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
502
+ repetition_penalty=repetition_penalty,
503
  token=token,
504
  )
505
  )
506
 
507
+ pipe.model.config.pad_token_id = pipe.model.config.eos_token_id
508
  self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
509
  elif self.llm_model_type == "mosaicml":
510
  MODEL_NAME_OR_PATH = os.environ.get("MOSAICML_MODEL_NAME_OR_PATH")
 
559
 
560
  max_new_tokens = 8192 if "30b" in MODEL_NAME_OR_PATH else 2048
561
  self.max_tokens_limit = max_new_tokens
 
 
 
562
  repetition_penalty = 1.05 if "30b" in MODEL_NAME_OR_PATH else 1.02
563
 
564
+ ML_RP = os.environ.get("ML_RP")
565
+ if ML_RP is not None and len(ML_RP) > 0:
566
+ repetition_penalty = float(ML_RP)
567
+ print(f" repetition_penalty: {repetition_penalty}")
568
+
569
  pipe = (
570
  pipeline(
571
  model=model,
 
576
  device_map="auto",
577
  # we pass model parameters here too
578
  stopping_criteria=stopping_criteria, # without this model will ramble
579
+ do_sample=True,
580
+ temperature=0.01, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
581
  top_p=0.95, # select from top tokens whose probability add up to 15%
582
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
583
  max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
 
593
  device=config.init_device,
594
  # we pass model parameters here too
595
  stopping_criteria=stopping_criteria, # without this model will ramble
596
+ do_sample=True,
597
+ temperature=0.01, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
598
  top_p=0.95, # select from top tokens whose probability add up to 15%
599
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
600
  max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
 
613
  # config.max_seq_len = 4096
614
  config.init_device = hf_pipeline_device_type
615
 
616
+ SL_RP = os.environ.get("SL_RP")
617
+ if SL_RP is not None and len(SL_RP) > 0:
618
+ repetition_penalty = float(SL_RP)
619
+ else:
620
+ repetition_penalty = 1.05
621
+ print(f" repetition_penalty: {repetition_penalty}")
622
+
623
  model = (
624
  AutoModelForCausalLM.from_pretrained(
625
  MODEL_NAME_OR_PATH,
 
671
  top_p=0.95, # select from top tokens whose probability add up to 15%
672
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
673
  max_new_tokens=2048, # mex number of tokens to generate in the output
674
+ repetition_penalty=repetition_penalty, # without this output begins repeating
675
  )
676
  if load_quantized_model is not None
677
  else pipeline(
 
687
  top_p=0.95, # select from top tokens whose probability add up to 15%
688
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
689
  max_new_tokens=2048, # mex number of tokens to generate in the output
690
+ repetition_penalty=repetition_penalty, # without this output begins repeating
691
  )
692
  )
693
  self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
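The new check_end_token logic in the streamer suppresses the Orca-2 "<|im_end|>" stop marker even when it arrives split across several streamed tokens. A standalone sketch of the same buffering idea, independent of the TextIteratorStreamer class:

# Standalone sketch of incremental "<|im_end|>" filtering, as done by check_end_token().
END = "<|im_end|>"


class EndTokenFilter:
    def __init__(self):
        self.buffer = ""

    def feed(self, token):
        candidate = self.buffer + token
        if END.startswith(candidate):
            # Could still be the start of the end marker: hold it back.
            self.buffer = "" if candidate == END else candidate
            return None
        self.buffer = ""
        return candidate


f = EndTokenFilter()
for tok in ["Hello", " world", "<|im_", "end|>"]:
    out = f.feed(tok)
    if out:
        print(out, end="")  # prints "Hello world"; the split end marker is suppressed
print()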
app_modules/llm_qa_chain.py CHANGED
@@ -1,13 +1,10 @@
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chains.base import Chain
-from langchain.vectorstores.base import VectorStore
 
 from app_modules.llm_inference import LLMInference
 
 
 class QAChain(LLMInference):
-    vectorstore: VectorStore
-
     def __init__(self, vectorstore, llm_loader):
         super().__init__(llm_loader)
         self.vectorstore = vectorstore
app_modules/llm_qa_chain_with_memory.py ADDED
@@ -0,0 +1,32 @@
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chains.base import Chain
+
+from app_modules.llm_inference import LLMInference
+from app_modules.utils import CustomizedConversationSummaryBufferMemory
+
+
+class QAChain(LLMInference):
+    def __init__(self, vectorstore, llm_loader):
+        super().__init__(llm_loader)
+        self.vectorstore = vectorstore
+
+    def create_chain(self) -> Chain:
+        memory = CustomizedConversationSummaryBufferMemory(
+            llm=self.llm_loader.llm,
+            output_key="answer",
+            memory_key="chat_history",
+            max_token_limit=1024,
+            return_messages=True,
+        )
+        qa = ConversationalRetrievalChain.from_llm(
+            self.llm_loader.llm,
+            memory=memory,
+            chain_type="stuff",
+            retriever=self.vectorstore.as_retriever(
+                search_kwargs=self.llm_loader.search_kwargs
+            ),
+            get_chat_history=lambda h: h,
+            return_source_documents=True,
+        )
+
+        return qa
app_modules/utils.py CHANGED
@@ -10,6 +10,7 @@ from pathlib import Path
10
  import requests
11
  import torch
12
  from tqdm import tqdm
 
13
 
14
 
15
  class LogRecord(logging.LogRecord):
@@ -69,21 +70,31 @@ def print_llm_response(llm_response):
69
  llm_response["source_documents"] if "source_documents" in llm_response else None
70
  )
71
  if source_documents is None:
72
- source_documents = llm_response["sourceDocs"]
73
-
74
- print("\nSources:")
75
- for source in source_documents:
76
- metadata = source["metadata"] if "metadata" in source else source.metadata
77
- print(
78
- " Page: "
79
- + str(metadata["page"])
80
- + " Source: "
81
- + str(metadata["url"] if "url" in metadata else metadata["source"])
82
- )
83
- print(
84
- source["page_content"] if "page_content" in source else source.page_content
85
  )
86
 
 
87
 
88
  def get_device_types():
89
  print("Running on: ", platform.platform())
@@ -159,6 +170,21 @@ def ensure_model_is_downloaded(llm_model_type):
159
  return local_path
160
 
161
162
  if __name__ == "__main__":
163
  hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
164
  print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
 
10
  import requests
11
  import torch
12
  from tqdm import tqdm
13
+ from langchain.memory import ConversationSummaryBufferMemory
14
 
15
 
16
  class LogRecord(logging.LogRecord):
 
70
  llm_response["source_documents"] if "source_documents" in llm_response else None
71
  )
72
  if source_documents is None:
73
+ source_documents = (
74
+ llm_response["sourceDocs"] if "sourceDocs" in llm_response else None
75
  )
76
 
77
+ if source_documents is not None:
78
+ print("\nSources:")
79
+ for source in source_documents:
80
+ metadata = source["metadata"] if "metadata" in source else source.metadata
81
+ if "page" in metadata:
82
+ print(f" Page: {metadata['page']}", end="")
83
+
84
+ print(
85
+ " Source: "
86
+ + str(metadata["url"] if "url" in metadata else metadata["source"])
87
+ )
88
+ print(
89
+ source["page_content"]
90
+ if "page_content" in source
91
+ else source.page_content
92
+ )
93
+
94
+ if "chat_history" in llm_response:
95
+ print("\nChat History:")
96
+ print(llm_response["chat_history"])
97
+
98
 
99
  def get_device_types():
100
  print("Running on: ", platform.platform())
 
170
  return local_path
171
 
172
 
173
+ class CustomizedConversationSummaryBufferMemory(ConversationSummaryBufferMemory):
174
+ def save_context(self, inputs, outputs) -> None:
175
+ for key in outputs:
176
+ if isinstance(outputs[key], str):
177
+ outputs[key] = outputs[key].replace("<|im_end|>", "")
178
+ return super().save_context(inputs, outputs)
179
+
180
+ def predict_new_summary(self, messages, existing_summary) -> str:
181
+ return (
182
+ super()
183
+ .predict_new_summary(messages, existing_summary)
184
+ .replace("<|im_end|>", "")
185
+ )
186
+
187
+
188
  if __name__ == "__main__":
189
  hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
190
  print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
requirements-mac.txt ADDED
@@ -0,0 +1,127 @@
1
+ accelerate==0.26.1
2
+ aiofiles==23.2.1
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ altair==5.2.0
6
+ annotated-types==0.6.0
7
+ anyio==4.2.0
8
+ attrs==23.2.0
9
+ black==24.1.0
10
+ certifi==2023.11.17
11
+ charset-normalizer==3.3.2
12
+ click==8.1.7
13
+ colorama==0.4.6
14
+ contourpy==1.2.0
15
+ cycler==0.12.1
16
+ dataclasses-json==0.6.3
17
+ faiss-cpu==1.7.4
18
+ fastapi==0.109.0
19
+ ffmpy==0.3.1
20
+ filelock==3.13.1
21
+ fonttools==4.47.2
22
+ frozenlist==1.4.1
23
+ fsspec==2023.12.2
24
+ gradio==4.16.0
25
+ gradio_client==0.8.1
26
+ greenlet==3.0.3
27
+ h11==0.14.0
28
+ httpcore==1.0.2
29
+ httpx==0.26.0
30
+ huggingface-hub==0.20.3
31
+ idna==3.6
32
+ importlib-resources==6.1.1
33
+ InstructorEmbedding==1.0.1
34
+ isort==5.13.2
35
+ Jinja2==3.1.3
36
+ joblib==1.3.2
37
+ jsonpatch==1.33
38
+ jsonpointer==2.4
39
+ jsonschema==4.21.1
40
+ jsonschema-specifications==2023.12.1
41
+ kiwisolver==1.4.5
42
+ langchain==0.1.4
43
+ langchain-community==0.0.16
44
+ langchain-core==0.1.16
45
+ langsmith==0.0.83
46
+ markdown-it-py==3.0.0
47
+ MarkupSafe==2.1.4
48
+ marshmallow==3.20.2
49
+ matplotlib==3.8.2
50
+ mdurl==0.1.2
51
+ mpmath==1.3.0
52
+ multidict==6.0.4
53
+ mypy-extensions==1.0.0
54
+ networkx==3.2.1
55
+ nltk==3.8.1
56
+ numpy==1.26.3
57
+ # nvidia-cublas-cu12==12.1.3.1
58
+ # nvidia-cuda-cupti-cu12==12.1.105
59
+ # nvidia-cuda-nvrtc-cu12==12.1.105
60
+ # nvidia-cuda-runtime-cu12==12.1.105
61
+ # nvidia-cudnn-cu12==8.9.2.26
62
+ # nvidia-cufft-cu12==11.0.2.54
63
+ # nvidia-curand-cu12==10.3.2.106
64
+ # nvidia-cusolver-cu12==11.4.5.107
65
+ # nvidia-cusparse-cu12==12.1.0.106
66
+ # nvidia-nccl-cu12==2.18.1
67
+ # nvidia-nvjitlink-cu12==12.3.101
68
+ # nvidia-nvtx-cu12==12.1.105
69
+ orjson==3.9.12
70
+ packaging==23.2
71
+ pandas==2.2.0
72
+ pathspec==0.12.1
73
+ peft @ git+https://github.com/huggingface/peft.git@1c1c7fdaa6e6abaa53939b865dee1eded82ad032
74
+ pillow==10.2.0
75
+ platformdirs==4.1.0
76
+ protobuf==4.25.2
77
+ psutil==5.9.8
78
+ pydantic==2.5.3
79
+ pydantic_core==2.14.6
80
+ pydub==0.25.1
81
+ Pygments==2.17.2
82
+ pyparsing==3.1.1
83
+ python-dateutil==2.8.2
84
+ python-dotenv==1.0.1
85
+ python-multipart==0.0.6
86
+ pytz==2023.3.post1
87
+ PyYAML==6.0.1
88
+ referencing==0.32.1
89
+ regex==2023.12.25
90
+ requests==2.31.0
91
+ rich==13.7.0
92
+ rpds-py==0.17.1
93
+ ruff==0.1.14
94
+ safetensors==0.4.2
95
+ scikit-learn==1.4.0
96
+ scipy==1.12.0
97
+ semantic-version==2.10.0
98
+ sentence-transformers==2.2.2
99
+ sentencepiece==0.1.99
100
+ shellingham==1.5.4
101
+ six==1.16.0
102
+ sniffio==1.3.0
103
+ SQLAlchemy==2.0.25
104
+ starlette==0.35.1
105
+ sympy==1.12
106
+ tenacity==8.2.3
107
+ threadpoolctl==3.2.0
108
+ tokenizers==0.15.1
109
+ tomlkit==0.12.0
110
+ toolz==0.12.1
111
+ torch==2.1.2
112
+ torchvision==0.16.2
113
+ tqdm==4.66.1
114
+ transformers @ git+https://github.com/huggingface/transformers.git@de13a951b38b85195984164819f1ab05fe508677
115
+ # triton==2.1.0
116
+ typer==0.9.0
117
+ typing-inspect==0.9.0
118
+ typing_extensions==4.9.0
119
+ tzdata==2023.4
120
+ urllib3==2.1.0
121
+ uvicorn==0.27.0
122
+ websockets==11.0.3
123
+ yarl==1.9.4
124
+ einops==0.7.0
125
+ Pyarrow==15.0.0
126
+ openpyxl==3.1.2
127
+ tabulate==0.9.0
requirements.txt CHANGED
@@ -1,38 +1,129 @@
1
- gradio
2
- mdtex2html
3
- pypinyin
4
- tiktoken
5
- socksio
6
- tqdm
7
- colorama
8
- accelerate
9
- langchain
10
- torch
11
- langchain-serve
12
- protobuf
13
- faiss-cpu
14
- sentence_transformers
15
- InstructorEmbedding
16
- python-dotenv
17
- openai
18
- gpt4all
19
- pyllama
20
- git+https://github.com/huggingface/peft.git
21
- git+https://github.com/huggingface/transformers.git
22
- SentencePiece
23
- isort
24
- black
25
- pygpt4all
26
- tiktoken
27
- safetensors
28
- xformers
29
- bitsandbytes
30
- einops
31
- gevent
32
- pydantic >= 1.10.11
33
- pypdf
34
- python-telegram-bot
35
- transformers_stream_generator
36
- openllm
37
- openllm[llama]
38
- text_generation
1
+ accelerate==0.26.1
2
+ aiofiles==23.2.1
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ altair==5.2.0
6
+ annotated-types==0.6.0
7
+ anyio==4.2.0
8
+ attrs==23.2.0
9
+ black==24.1.0
10
+ certifi==2023.11.17
11
+ charset-normalizer==3.3.2
12
+ click==8.1.7
13
+ colorama==0.4.6
14
+ contourpy==1.2.0
15
+ cycler==0.12.1
16
+ dataclasses-json==0.6.3
17
+ faiss-cpu==1.7.4
18
+ fastapi==0.109.0
19
+ ffmpy==0.3.1
20
+ filelock==3.13.1
21
+ fonttools==4.47.2
22
+ frozenlist==1.4.1
23
+ fsspec==2023.12.2
24
+ gradio==4.16.0
25
+ gradio_client==0.8.1
26
+ greenlet==3.0.3
27
+ h11==0.14.0
28
+ httpcore==1.0.2
29
+ httpx==0.26.0
30
+ huggingface-hub==0.20.3
31
+ idna==3.6
32
+ importlib-resources==6.1.1
33
+ InstructorEmbedding==1.0.1
34
+ isort==5.13.2
35
+ Jinja2==3.1.3
36
+ joblib==1.3.2
37
+ jsonpatch==1.33
38
+ jsonpointer==2.4
39
+ jsonschema==4.21.1
40
+ jsonschema-specifications==2023.12.1
41
+ kiwisolver==1.4.5
42
+ langchain==0.1.4
43
+ langchain-community==0.0.16
44
+ langchain-openai==0.0.5
45
+ langchain-core==0.1.16
46
+ langsmith==0.0.83
47
+ markdown-it-py==3.0.0
48
+ MarkupSafe==2.1.4
49
+ marshmallow==3.20.2
50
+ matplotlib==3.8.2
51
+ mdurl==0.1.2
52
+ mpmath==1.3.0
53
+ multidict==6.0.4
54
+ mypy-extensions==1.0.0
55
+ networkx==3.2.1
56
+ nltk==3.8.1
57
+ numpy==1.26.3
58
+ nvidia-cublas-cu12==12.1.3.1
59
+ nvidia-cuda-cupti-cu12==12.1.105
60
+ nvidia-cuda-nvrtc-cu12==12.1.105
61
+ nvidia-cuda-runtime-cu12==12.1.105
62
+ nvidia-cudnn-cu12==8.9.2.26
63
+ nvidia-cufft-cu12==11.0.2.54
64
+ nvidia-curand-cu12==10.3.2.106
65
+ nvidia-cusolver-cu12==11.4.5.107
66
+ nvidia-cusparse-cu12==12.1.0.106
67
+ nvidia-nccl-cu12==2.18.1
68
+ nvidia-nvjitlink-cu12==12.3.101
69
+ nvidia-nvtx-cu12==12.1.105
70
+ orjson==3.9.12
71
+ packaging==23.2
72
+ pandas==2.2.0
73
+ pathspec==0.12.1
74
+ peft @ git+https://github.com/huggingface/peft.git@1c1c7fdaa6e6abaa53939b865dee1eded82ad032
75
+ pillow==10.2.0
76
+ platformdirs==4.1.0
77
+ protobuf==4.25.2
78
+ psutil==5.9.8
79
+ pydantic==2.5.3
80
+ pydantic_core==2.14.6
81
+ pydub==0.25.1
82
+ Pygments==2.17.2
83
+ pyparsing==3.1.1
84
+ python-dateutil==2.8.2
85
+ python-dotenv==1.0.1
86
+ python-multipart==0.0.6
87
+ pytz==2023.3.post1
88
+ PyYAML==6.0.1
89
+ referencing==0.32.1
90
+ regex==2023.12.25
91
+ requests==2.31.0
92
+ rich==13.7.0
93
+ rpds-py==0.17.1
94
+ ruff==0.1.14
95
+ safetensors==0.4.2
96
+ scikit-learn==1.4.0
97
+ scipy==1.12.0
98
+ semantic-version==2.10.0
99
+ sentence-transformers==2.2.2
100
+ sentencepiece==0.1.99
101
+ shellingham==1.5.4
102
+ six==1.16.0
103
+ sniffio==1.3.0
104
+ SQLAlchemy==2.0.25
105
+ starlette==0.35.1
106
+ sympy==1.12
107
+ tenacity==8.2.3
108
+ threadpoolctl==3.2.0
109
+ tokenizers==0.15.1
110
+ tomlkit==0.12.0
111
+ toolz==0.12.1
112
+ torch==2.1.2
113
+ torchvision==0.16.2
114
+ tqdm==4.66.1
115
+ transformers @ git+https://github.com/huggingface/transformers.git@de13a951b38b85195984164819f1ab05fe508677
116
+ triton==2.1.0
117
+ typer==0.9.0
118
+ typing-inspect==0.9.0
119
+ typing_extensions==4.9.0
120
+ tzdata==2023.4
121
+ urllib3==2.1.0
122
+ uvicorn==0.27.0
123
+ websockets==11.0.3
124
+ yarl==1.9.4
125
+ einops==0.7.0
126
+ Pyarrow==15.0.0
127
+ openpyxl==3.1.2
128
+ text_generation==0.6.1
129
+ tabulate==0.9.0