dh-mc committed
Commit 99d65c0
1 Parent(s): e1a6c78

get latest code from chat-with-pci-dss-v4

.env.example CHANGED
@@ -8,6 +8,9 @@ LLM_MODEL_TYPE=huggingface
 
 OPENAI_API_KEY=
 
+# if unset, default to "gpt-4"
+OPENAI_MODEL_NAME=
+
 # cpu, mps or cuda:0 - if unset, use whatever detected
 HF_EMBEDDINGS_DEVICE_TYPE=
 HF_PIPELINE_DEVICE_TYPE=
@@ -16,9 +19,11 @@ HF_PIPELINE_DEVICE_TYPE=
 # LOAD_QUANTIZED_MODEL=4bit
 # LOAD_QUANTIZED_MODEL=8bit
 
+DISABLE_MODEL_PRELOADING=false
 CHAT_HISTORY_ENABLED=true
 SHOW_PARAM_SETTINGS=false
-PDF_FILE_BASE_URL=https://ai-engd.netlify.app/pdfs/books/
+SHARE_GRADIO_APP=false
+PDF_FILE_BASE_URL=https://ai-engd.netlify.app/pdfs/pci_dss_v4/
 
 # if unset, default to "hkunlp/instructor-xl"
 HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
@@ -26,6 +31,8 @@ HF_EMBEDDINGS_MODEL_NAME="hkunlp/instructor-large"
 # number of cpu cores - used to set n_threads for GPT4ALL & LlamaCpp models
 NUMBER_OF_CPU_CORES=
 
+HUGGINGFACE_AUTH_TOKEN=
+
 USING_TORCH_BFLOAT16=true
 # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-3b"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-7b"
@@ -36,14 +43,14 @@ USING_TORCH_BFLOAT16=true
 # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/vicuna-7B-1.1-HF"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-j"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-falcon"
-HUGGINGFACE_MODEL_NAME_OR_PATH="lmsys/fastchat-t5-3b-v1.0"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="lmsys/fastchat-t5-3b-v1.0"
+HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-13b-chat-hf"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-70b-chat-hf"
 
-# STABLELM_MODEL_NAME_OR_PATH="./models/stablelm-base-alpha-7b"
-# STABLELM_MODEL_NAME_OR_PATH="./models/stablelm-tuned-alpha-7b"
 STABLELM_MODEL_NAME_OR_PATH="OpenAssistant/stablelm-7b-sft-v7-epoch-3"
 
-# MOSAICML_MODEL_NAME_OR_PATH="mosaicml/mpt-7b-instruct"
-MOSAICML_MODEL_NAME_OR_PATH="mosaicml/mpt-1b-redpajama-200b-dolly"
+MOSAICML_MODEL_NAME_OR_PATH="mosaicml/mpt-7b-instruct"
 
 FALCON_MODEL_NAME_OR_PATH="tiiuae/falcon-7b-instruct"
 
@@ -66,6 +73,6 @@ TOKENIZERS_PARALLELISM=true
 
 # env variables for ingesting source PDF files
 SOURCE_PDFS_PATH="./data/pdfs/"
-SOURCE_URLS=
+SOURCE_URLS="./data/pci_dss_urls.txt"
 CHUNCK_SIZE=1024
 CHUNK_OVERLAP=512

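This commit adds OPENAI_MODEL_NAME, DISABLE_MODEL_PRELOADING, SHARE_GRADIO_APP, HUGGINGFACE_AUTH_TOKEN and a concrete SOURCE_URLS file. A minimal sketch of how such settings are typically read, assuming the python-dotenv loader already listed in requirements.txt and the "os.environ.get(...) or default" fallback style used elsewhere in this commit; the snippet itself is illustrative, not part of the repo:

    import os

    from dotenv import load_dotenv  # python-dotenv is in requirements.txt

    load_dotenv()  # copy key=value pairs from .env into os.environ

    # "if unset, default to gpt-4", mirroring the comment in .env.example
    openai_model_name = os.environ.get("OPENAI_MODEL_NAME") or "gpt-4"
    share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
    disable_preloading = os.environ.get("DISABLE_MODEL_PRELOADING") == "true"
    hf_auth_token = os.environ.get("HUGGINGFACE_AUTH_TOKEN") or None

    print(openai_model_name, share_gradio_app, disable_preloading, bool(hf_auth_token))
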
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 data/faiss_1024_512/index.faiss filter=lfs diff=lfs merge=lfs -text
 data/faiss_1024_512/index.pkl filter=lfs diff=lfs merge=lfs -text
+data/pci_dss_v4/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/pci_dss_v4/index.pkl filter=lfs diff=lfs merge=lfs -text

Makefile CHANGED
@@ -2,6 +2,13 @@
 start:
 	python app.py
 
+serve:
+ifeq ("$(PORT)", "")
+	JINA_HIDE_SURVEY=1 TRANSFORMERS_OFFLINE=1 python -m lcserve deploy local server
+else
+	JINA_HIDE_SURVEY=1 TRANSFORMERS_OFFLINE=1 python -m lcserve deploy local server --port=${PORT}
+endif
+
 test:
 	PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 TRANSFORMERS_OFFLINE=1 python test.py
 
@@ -21,9 +28,14 @@ format:
 	black .
 
 install:
-	CXX=g++-11 CC=gcc-11 pip install -U -r requirements.txt
-	pip show langchain llama-cpp-python transformers
-
-mac-install:
 	pip install -U -r requirements.txt
 	pip show langchain transformers
+
+install-extra:
+	CXX=g++-11 CC=gcc-11 pip install -U -r requirements_extra.txt
+	pip show langchain llama-cpp-python transformers
+
+install-extra-mac:
+	# brew install llvm libomp
+	CXX=/usr/local/opt/llvm/bin/clang++ CC=/usr/local/opt/llvm/bin/clang pip install -U -r requirements_extra.txt
+	pip show langchain llama-cpp-python transformers

app.py CHANGED
@@ -33,6 +33,7 @@ using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
 llm_model_type = os.environ.get("LLM_MODEL_TYPE")
 chat_history_enabled = os.environ.get("CHAT_HISTORY_ENABLED") == "true"
 show_param_settings = os.environ.get("SHOW_PARAM_SETTINGS") == "true"
+share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
 
 
 streaming_enabled = True  # llm_model_type in ["openai", "llamacpp"]
@@ -76,7 +77,9 @@ def qa(chatbot):
 
     def task(question, chat_history):
         start = timer()
-        ret = qa_chain.call({"question": question, "chat_history": chat_history}, q)
+        ret = qa_chain.call(
+            {"question": question, "chat_history": chat_history}, None, q
+        )
         end = timer()
 
         print(f"Completed in {end - start:.3f}s")
@@ -203,7 +206,7 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
     ).then(qa, chatbot, chatbot)
 
     submitBtn.click(
-        chat, [user_input, chatbot], [user_input, chatbot], queue=True
+        chat, [user_input, chatbot], [user_input, chatbot], queue=True, api_name="chat"
     ).then(qa, chatbot, chatbot)
 
     def reset():
@@ -213,7 +216,8 @@ with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
         reset,
         outputs=[user_input, chatbot],
        show_progress=True,
+        api_name="reset",
     )
 
-demo.title = "Chat with AI Books"
-demo.queue(concurrency_count=1).launch()
+demo.title = "Chat with PCI DSS v4"
+demo.queue(concurrency_count=1).launch(share=share_gradio_app)

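For context on the two Gradio changes: api_name registers the click handler as a named endpoint on the app's API, and launch(share=...) toggles the public share link from the new SHARE_GRADIO_APP variable. A stripped-down sketch, with a placeholder echo handler standing in for the real QA chain:

    import os

    import gradio as gr


    def chat(message, history):
        # placeholder handler; the real app streams answers from the QA chain
        return "", history + [[message, f"echo: {message}"]]


    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        user_input = gr.Textbox()
        user_input.submit(
            chat, [user_input, chatbot], [user_input, chatbot], api_name="chat"
        )

    # share=True asks Gradio for a temporary public URL; by default it stays local
    demo.queue(concurrency_count=1).launch(
        share=os.environ.get("SHARE_GRADIO_APP") == "true"
    )
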
app_modules/presets.py CHANGED
@@ -3,15 +3,19 @@ import os
 
 import gradio as gr
 
+from app_modules.utils import *
+
 using_openai = os.environ.get("LLM_MODEL_TYPE") == "openai"
 href = (
     "https://openai.com/gpt-4"
     if using_openai
     else "https://huggingface.co/lmsys/fastchat-t5-3b-v1.0"
 )
-model = "OpenAI GPT-4" if using_openai else "lmsys/fastchat-t5-3b-v1.0"
+model = (
+    "OpenAI GPT-4" if using_openai else os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
+)
 
-title = """<h1 align="left" style="min-width:200px; margin-top:0;"> Chat with AI Books </h1>"""
+title = """<h1 align="left" style="min-width:200px; margin-top:0;"> Chat with PCI DSS v4 </h1>"""
 
 description_top = f"""\
 <div align="left">

app_modules/qa_chain.py CHANGED
@@ -1,7 +1,9 @@
 import os
 import sys
+import time
 import urllib
 from queue import Queue
+from threading import Thread
 from typing import Any, Optional
 
 import torch
@@ -78,6 +80,9 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
         # print("resetting TextIteratorStreamer")
         self.text_queue = q if q is not None else Queue()
 
+    def empty(self):
+        return self.text_queue.empty()
+
 
 class QAChain:
     llm_model_type: str
@@ -177,6 +182,17 @@ class QAChain:
         MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
         print(f" loading model: {MODEL_NAME_OR_PATH}")
 
+        hf_auth_token = os.environ.get("HUGGINGFACE_AUTH_TOKEN")
+        transformers_offline = os.environ.get("TRANSFORMERS_OFFLINE") == "1"
+        token = (
+            hf_auth_token
+            if hf_auth_token is not None
+            and len(hf_auth_token) > 0
+            and not transformers_offline
+            else None
+        )
+        print(f" HF auth token: {str(token)[-5:]}")
+
         is_t5 = "t5" in MODEL_NAME_OR_PATH
         temperature = (
             0.01
@@ -192,20 +208,26 @@ class QAChain:
         padding_side = "left"  # if "dolly" in MODEL_NAME_OR_PATH else None
 
         config = AutoConfig.from_pretrained(
-            MODEL_NAME_OR_PATH, trust_remote_code=True
+            MODEL_NAME_OR_PATH,
+            trust_remote_code=True,
+            token=token,
         )
         # config.attn_config["attn_impl"] = "triton"
         # config.max_seq_len = 4096
         config.init_device = hf_pipeline_device_type
 
         tokenizer = (
-            T5Tokenizer.from_pretrained(MODEL_NAME_OR_PATH)
+            T5Tokenizer.from_pretrained(
+                MODEL_NAME_OR_PATH,
+                token=token,
+            )
             if is_t5
             else AutoTokenizer.from_pretrained(
                 MODEL_NAME_OR_PATH,
                 use_fast=use_fast,
                 trust_remote_code=True,
                 padding_side=padding_side,
+                token=token,
             )
         )
 
@@ -228,6 +250,7 @@ class QAChain:
                 config=config,
                 quantization_config=double_quant_config,
                 trust_remote_code=True,
+                token=token,
             )
             if is_t5
             else AutoModelForCausalLM.from_pretrained(
@@ -235,6 +258,7 @@ class QAChain:
                 config=config,
                 quantization_config=double_quant_config,
                 trust_remote_code=True,
+                token=token,
             )
         )
 
@@ -310,11 +334,34 @@ class QAChain:
                 temperature=temperature,
                 return_full_text=True,
                 repetition_penalty=repetition_penalty,
+                token=token,
             )
         else:
+            if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
+                use_auth_token = None
+                model = (
+                    AutoModelForSeq2SeqLM.from_pretrained(
+                        MODEL_NAME_OR_PATH,
+                        config=config,
+                        trust_remote_code=True,
+                        token=token,
+                    )
+                    if is_t5
+                    else AutoModelForCausalLM.from_pretrained(
+                        MODEL_NAME_OR_PATH,
+                        config=config,
+                        trust_remote_code=True,
+                        token=token,
+                    )
+                )
+                print(f"Model memory footprint: {model.get_memory_footprint()}")
+            else:
+                use_auth_token = token
+                model = MODEL_NAME_OR_PATH
+
             pipe = pipeline(
-                task,  # model=model,
-                model=MODEL_NAME_OR_PATH,
+                task,
+                model=model,
                 tokenizer=tokenizer,
                 streamer=self.streamer,
                 return_full_text=return_full_text,  # langchain expects the full text
@@ -322,11 +369,11 @@ class QAChain:
                 torch_dtype=torch_dtype,
                 max_new_tokens=2048,
                 trust_remote_code=True,
-                # verbose=True,
                 temperature=temperature,
                 top_p=0.95,
                 top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                 repetition_penalty=1.115,
+                token=use_auth_token,
             )
 
             self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
@@ -386,7 +433,7 @@ class QAChain:
             self.search_kwargs = (
                 {"k": 8} if "30b" in MODEL_NAME_OR_PATH else self.search_kwargs
             )
-            repetition_penalty = 1.0005 if "30b" in MODEL_NAME_OR_PATH else 1.02
+            repetition_penalty = 1.05 if "30b" in MODEL_NAME_OR_PATH else 1.02
 
             pipe = (
                 pipeline(
@@ -524,7 +571,7 @@ class QAChain:
 
         return qa
 
-    def call(self, inputs, q: Queue = None, tracing: bool = False):
+    def call(self, inputs, streaming_handler, q: Queue = None, tracing: bool = False):
        print(inputs)
 
         if self.streamer is not None and isinstance(
@@ -533,7 +580,15 @@ class QAChain:
             self.streamer.reset(q)
 
         qa = self.get_chain(tracing)
-        result = qa(inputs)
+        result = (
+            self._run_qa_chain(
+                qa,
+                inputs,
+                streaming_handler,
+            )
+            if streaming_handler is not None
+            else qa(inputs)
+        )
 
         result["answer"] = remove_extra_spaces(result["answer"])
 
@@ -546,3 +601,31 @@ class QAChain:
             doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
 
         return result
+
+    def _run_qa_chain(self, qa, inputs, streaming_handler):
+        que = Queue()
+
+        t = Thread(
+            target=lambda qa, inputs, q, sh: q.put(qa(inputs, callbacks=[sh])),
+            args=(qa, inputs, que, streaming_handler),
+        )
+        t.start()
+
+        if self.streamer is not None and isinstance(
+            self.streamer, TextIteratorStreamer
+        ):
+            count = 2 if len(inputs.get("chat_history")) > 0 else 1
+
+            while count > 0:
+                try:
+                    for token in self.streamer:
+                        streaming_handler.on_llm_new_token(token)
+
+                    self.streamer.reset()
+                    count -= 1
+                except Exception:
+                    print("nothing generated yet - retry in 0.5s")
+                    time.sleep(0.5)
+
+        t.join()
+        return que.get()

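The new _run_qa_chain method is a producer/consumer setup: the chain runs on a worker thread and drops its final result into a queue, while the calling thread drains tokens from the streamer and forwards each one to the websocket callback. A self-contained toy sketch of that pattern, with a fake generator standing in for the LLM (all names here are illustrative, not repo code):

    import time
    from queue import Queue
    from threading import Thread

    token_queue: Queue = Queue()   # tokens streamed out of the "model"
    result_queue: Queue = Queue()  # final chain result


    def fake_chain(inputs):
        # stand-in for qa(inputs): emits tokens, then returns a result dict
        for token in ["PCI ", "DSS ", "v4"]:
            token_queue.put(token)
            time.sleep(0.1)
        token_queue.put(None)  # sentinel: generation finished
        return {"answer": "PCI DSS v4", "source_documents": []}


    # producer: run the chain on a worker thread, park the result in a queue
    worker = Thread(
        target=lambda: result_queue.put(fake_chain({"question": "What is PCI DSS?"}))
    )
    worker.start()

    # consumer: print tokens as they arrive; the real code calls
    # streaming_handler.on_llm_new_token(token) instead of printing
    while (token := token_queue.get()) is not None:
        print(token, end="", flush=True)
    print()

    worker.join()
    print(result_queue.get())
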
app_modules/utils.py CHANGED
@@ -88,7 +88,9 @@ def print_llm_response(llm_response):
             + " Source: "
             + str(metadata["url"] if "url" in metadata else metadata["source"])
         )
-        print(source.page_content)
+        print(
+            source["page_content"] if "page_content" in source else source.page_content
+        )
 
 
 def get_device_types():

data/pci_dss_v4/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98e8c49e6c3ef2bcd0b258fb51ffe58fa92a63544b672f1c0c75857593afa2a8
+size 5987373

data/pci_dss_v4/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8467d3647bf272f11151a512f54515ef6dd83f2081686156a437132380b28b4b
+size 2035755

data/questions.txt CHANGED
@@ -1,4 +1,3 @@
-What's AI?
-life in AI era
-machine learning
-generative model
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f61185685e79b9b115f1b3d34c6bca2913174a18c014b210e749e419beb2211
+size 60
 
requirements.txt CHANGED
@@ -6,14 +6,11 @@ socksio
 tqdm
 colorama
 accelerate
-Pygments
-llama_index
 langchain
 torch
-langchain
-protobuf==3.20.*
+langchain-serve
+protobuf
 faiss-cpu
-chromadb
 sentence_transformers
 InstructorEmbedding
 python-dotenv
@@ -25,8 +22,6 @@ git+https://github.com/huggingface/transformers.git
 SentencePiece
 isort
 black
-llama-cpp-python
-pyllamacpp
 pygpt4all
 tiktoken
 safetensors
@@ -34,4 +29,5 @@ xformers
 bitsandbytes
 einops
 gevent
-pydantic >= 1.10.11
+pydantic >= 1.10.11
+pypdf

requirements_extra.txt ADDED
@@ -0,0 +1,3 @@
+llama-cpp-python
+pyllamacpp
+chromadb

server.py ADDED
@@ -0,0 +1,109 @@
+"""Main entrypoint for the app."""
+import json
+import os
+import time
+from queue import Queue
+from timeit import default_timer as timer
+from typing import List, Optional
+
+from langchain.embeddings import HuggingFaceInstructEmbeddings
+from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS
+from lcserve import serving
+from pydantic import BaseModel
+
+from app_modules.presets import *
+from app_modules.qa_chain import QAChain
+from app_modules.utils import *
+
+# Constants
+init_settings()
+
+# https://github.com/huggingface/transformers/issues/17611
+os.environ["CURL_CA_BUNDLE"] = ""
+
+hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
+print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
+print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
+
+hf_embeddings_model_name = (
+    os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
+)
+n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
+index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
+using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
+llm_model_type = os.environ.get("LLM_MODEL_TYPE")
+chat_history_enabled = os.environ.get("CHAT_HISTORY_ENABLED") == "true"
+show_param_settings = os.environ.get("SHOW_PARAM_SETTINGS") == "true"
+share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
+
+
+streaming_enabled = True  # llm_model_type in ["openai", "llamacpp"]
+
+start = timer()
+embeddings = HuggingFaceInstructEmbeddings(
+    model_name=hf_embeddings_model_name,
+    model_kwargs={"device": hf_embeddings_device_type},
+)
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")
+
+start = timer()
+
+print(f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}")
+
+if not os.path.isdir(index_path):
+    raise ValueError(f"{index_path} does not exist!")
+elif using_faiss:
+    vectorstore = FAISS.load_local(index_path, embeddings)
+else:
+    vectorstore = Chroma(embedding_function=embeddings, persist_directory=index_path)
+
+end = timer()
+
+print(f"Completed in {end - start:.3f}s")
+
+start = timer()
+qa_chain = QAChain(vectorstore, llm_model_type)
+qa_chain.init(n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type)
+end = timer()
+print(f"Completed in {end - start:.3f}s")
+
+
+class ChatResponse(BaseModel):
+    """Chat response schema."""
+
+    token: Optional[str] = None
+    error: Optional[str] = None
+    sourceDocs: Optional[List] = None
+
+
+@serving(websocket=True)
+def chat(question: str, history: Optional[List], **kwargs) -> str:
+    # Get the `streaming_handler` from `kwargs`. This is used to stream data to the client.
+    streaming_handler = kwargs.get("streaming_handler") if streaming_enabled else None
+    chat_history = []
+    if chat_history_enabled:
+        for element in history:
+            item = (element[0] or "", element[1] or "")
+            chat_history.append(item)
+
+    start = timer()
+    result = qa_chain.call(
+        {"question": question, "chat_history": chat_history}, streaming_handler
+    )
+    end = timer()
+    print(f"Completed in {end - start:.3f}s")
+
+    resp = ChatResponse(sourceDocs=result["source_documents"])
+
+    if not streaming_enabled:
+        resp.token = remove_extra_spaces(result["answer"])
+        print(resp.token)
+
+    return json.dumps(resp.dict())
+
+
+if __name__ == "__main__":
+    print_llm_response(json.loads(chat("What is PCI DSS?", [])))

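server.py exposes chat over a websocket via langchain-serve (the new "make serve" target runs python -m lcserve deploy local server). A hypothetical client sketch using the websockets package; the endpoint path and message schema are assumptions based on lc-serve routing by function name, so check the lc-serve documentation for the exact protocol:

    # Hypothetical client: assumes lc-serve serves the chat() function at
    # ws://localhost:8080/chat and accepts the function's keyword arguments
    # as a single JSON message. Verify both against the lc-serve docs.
    import asyncio
    import json

    import websockets


    async def main():
        async with websockets.connect("ws://localhost:8080/chat") as ws:
            await ws.send(json.dumps({"question": "What is PCI DSS?", "history": []}))
            async for frame in ws:  # streamed tokens, then the final JSON payload
                print(frame)


    asyncio.run(main())
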
test.py CHANGED
@@ -29,8 +29,9 @@ hf_embeddings_model_name = (
     os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
 )
 n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
-index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
-using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
+faiss_index_path = os.environ.get("FAISS_INDEX_PATH") or ""
+using_faiss = len(faiss_index_path) > 0
+index_path = faiss_index_path if using_faiss else os.environ.get("CHROMADB_INDEX_PATH")
 llm_model_type = os.environ.get("LLM_MODEL_TYPE")
 chatting = len(sys.argv) > 1 and sys.argv[1] == "chat"
 questions_file_path = os.environ.get("QUESTIONS_FILE_PATH")

test.sh CHANGED
@@ -11,56 +11,69 @@ echo Using extension: $EXT
 
 [ ! -f .env ] || export $(grep -v '^#' .env | xargs)
 
-LLM_MODEL_TYPE=huggingface
-
-HUGGINGFACE_MODEL_NAME_OR_PATH="lmsys/fastchat-t5-3b-v1.0"
+export LLM_MODEL_TYPE=openai
+export OPENAI_MODEL_NAME="gpt-3.5-turbo"
+echo Testing openai-${OPENAI_MODEL_NAME}
+python test.py 2>&1 | tee ./data/logs/openai-${OPENAI_MODEL_NAME}_${EXT}.log
+
+export OPENAI_MODEL_NAME="gpt-4"
+echo Testing openai-${OPENAI_MODEL_NAME}
+python test.py 2>&1 | tee ./data/logs/openai-${OPENAI_MODEL_NAME}_${EXT}.log
+
+export LLM_MODEL_TYPE=huggingface
+
+export HUGGINGFACE_MODEL_NAME_OR_PATH="lmsys/fastchat-t5-3b-v1.0"
 echo Testing $HUGGINGFACE_MODEL_NAME_OR_PATH
 python test.py 2>&1 | tee ./data/logs/fastchat-t5-3b-v1.0_${EXT}.log
 
 
-HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/wizardLM-7B-HF"
+export HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/wizardLM-7B-HF"
 echo Testing $HUGGINGFACE_MODEL_NAME_OR_PATH
 python test.py 2>&1 | tee ./data/logs/wizardLM-7B-HF_${EXT}.log
 
 
-HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/vicuna-7B-1.1-HF"
+export HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/vicuna-7B-1.1-HF"
 echo Testing $HUGGINGFACE_MODEL_NAME_OR_PATH
 python test.py 2>&1 | tee ./data/logs/vicuna-7B-1.1-HF_${EXT}.log
 
 
-HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-j"
+export HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-j"
 echo Testing $HUGGINGFACE_MODEL_NAME_OR_PATH
 python test.py 2>&1 | tee ./data/logs/gpt4all-j_${EXT}.log
 
 
-# HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-falcon"
+# export HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-falcon"
 # echo Testing $HUGGINGFACE_MODEL_NAME_OR_PATH
 # python test.py 2>&1 | tee ./data/logs/gpt4all-falcon_${EXT}.log
 
-LLM_MODEL_TYPE=stablelm
+export LLM_MODEL_TYPE=stablelm
 
-STABLELM_MODEL_NAME_OR_PATH="stabilityai/stablelm-tuned-alpha-7b"
-echo Testing $STABLELM_MODEL_NAME_OR_PATH
-python test.py 2>&1 | tee ./data/logs/stablelm-tuned-alpha-7b_${EXT}.log
+# export STABLELM_MODEL_NAME_OR_PATH="stabilityai/stablelm-tuned-alpha-7b"
+# echo Testing $STABLELM_MODEL_NAME_OR_PATH
+# python test.py 2>&1 | tee ./data/logs/stablelm-tuned-alpha-7b_${EXT}.log
 
 
-STABLELM_MODEL_NAME_OR_PATH="OpenAssistant/stablelm-7b-sft-v7-epoch-3"
+export STABLELM_MODEL_NAME_OR_PATH="OpenAssistant/stablelm-7b-sft-v7-epoch-3"
 echo Testing $STABLELM_MODEL_NAME_OR_PATH
 python test.py 2>&1 | tee ./data/logs/stablelm-7b-sft-v7-epoch-3_${EXT}.log
 
 
-LLM_MODEL_TYPE=mosaicml
-MOSAICML_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-mpt"
+export LLM_MODEL_TYPE=mosaicml
+export MOSAICML_MODEL_NAME_OR_PATH="mosaicml/mpt-7b-instruct"
 echo Testing $MOSAICML_MODEL_NAME_OR_PATH
-python test.py 2>&1 | tee ./data/logs/gpt4all-mpt_${EXT}.log
+python test.py 2>&1 | tee ./data/logs/mpt-7b-instruct_${EXT}.log
 
 
-LLM_MODEL_TYPE=huggingface
-HUGGINGFACE_MODEL_NAME_OR_PATH="HuggingFaceH4/starchat-beta"
+# export MOSAICML_MODEL_NAME_OR_PATH="mosaicml/mpt-30b-instruct"
+# echo Testing $MOSAICML_MODEL_NAME_OR_PATH
+# LOAD_QUANTIZED_MODEL=4bit python test.py 2>&1 | tee ./data/logs/mpt-30b-instruct_${EXT}.log
+
+export LLM_MODEL_TYPE=huggingface
+export HUGGINGFACE_MODEL_NAME_OR_PATH="HuggingFaceH4/starchat-beta"
 echo Testing $HUGGINGFACE_MODEL_NAME_OR_PATH
 LOAD_QUANTIZED_MODEL=8bit python test.py 2>&1 | tee ./data/logs/starchat-beta_${EXT}.log
 
 
-HUGGINGFACE_MODEL_NAME_OR_PATH="../../models/starcoder"
-echo Testing $HUGGINGFACE_MODEL_NAME_OR_PATH
-LOAD_QUANTIZED_MODEL=8bit python test.py 2>&1 | tee ./data/logs/starcoder_${EXT}.log
+# export HUGGINGFACE_MODEL_NAME_OR_PATH="../../models/starcoder"
+# echo Testing $HUGGINGFACE_MODEL_NAME_OR_PATH
+# LOAD_QUANTIZED_MODEL=8bit python test.py 2>&1 | tee ./data/logs/starcoder_${EXT}.log