inflaton committed
Commit c2cb992
1 Parent: 7f9d16c

tested app_modules/llm_loader.py

Files changed (3):
  1. .env.example +3 -3
  2. app_modules/llm_loader.py +15 -18
  3. test.py +19 -10
.env.example CHANGED
@@ -54,13 +54,13 @@ MOSAICML_MODEL_NAME_OR_PATH="mosaicml/mpt-7b-instruct"
 
 FALCON_MODEL_NAME_OR_PATH="tiiuae/falcon-7b-instruct"
 
-GPT4ALL_J_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_0.bin"
+GPT4ALL_J_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
 GPT4ALL_J_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_0.bin
 
-GPT4ALL_MODEL_PATH="./models/ggml-nous-gpt4-vicuna-13b.bin"
+GPT4ALL_MODEL_PATH="../models/ggml-nous-gpt4-vicuna-13b.bin"
 GPT4ALL_DOWNLOAD_LINK=https://gpt4all.io/models/ggml-nous-gpt4-vicuna-13b.bin
 
-LLAMACPP_MODEL_PATH="./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
+LLAMACPP_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
 LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin
 
 # Index for AI Books PDF files - chunk_size=1024 chunk_overlap=512
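All three local model paths now point at a ../models/ directory one level above the project root. A minimal sketch of how such a MODEL_PATH / DOWNLOAD_LINK pair is typically consumed, assuming python-dotenv; the download helper here is illustrative only and is not the repo's ensure_model_is_downloaded:

import os
import urllib.request

from dotenv import load_dotenv

load_dotenv()  # read .env into os.environ

model_path = os.environ.get("LLAMACPP_MODEL_PATH")
download_link = os.environ.get("LLAMACPP_DOWNLOAD_LINK")

if model_path and not os.path.isfile(model_path):
    # fetch the GGML file once so later runs can load it locally
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    urllib.request.urlretrieve(download_link, model_path)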
app_modules/llm_loader.py CHANGED
@@ -30,7 +30,7 @@ from transformers import (
 )
 
 from app_modules.instruct_pipeline import InstructionTextGenerationPipeline
-from app_modules.utils import ensure_model_is_downloaded, remove_extra_spaces
+from app_modules.utils import ensure_model_is_downloaded
 
 
 class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
@@ -336,7 +336,6 @@ class LLMLoader:
                 )
             else:
                 if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
-                    use_auth_token = None
                     model = (
                         AutoModelForSeq2SeqLM.from_pretrained(
                             MODEL_NAME_OR_PATH,
@@ -354,25 +353,23 @@ class LLMLoader:
                     )
                     print(f"Model memory footprint: {model.get_memory_footprint()}")
                 else:
-                    use_auth_token = token
                     model = MODEL_NAME_OR_PATH
 
                 pipe = pipeline(
-                    task,
-                    model=model,
-                    tokenizer=tokenizer,
-                    streamer=self.streamer,
-                    return_full_text=return_full_text,  # langchain expects the full text
-                    device=hf_pipeline_device_type,
-                    torch_dtype=torch_dtype,
-                    max_new_tokens=2048,
-                    trust_remote_code=True,
-                    temperature=temperature,
-                    top_p=0.95,
-                    top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
-                    repetition_penalty=1.115,
-                    token=use_auth_token,
-                )
+                    task,
+                    model=model,
+                    tokenizer=tokenizer,
+                    streamer=self.streamer,
+                    return_full_text=return_full_text,  # langchain expects the full text
+                    device=hf_pipeline_device_type,
+                    torch_dtype=torch_dtype,
+                    max_new_tokens=2048,
+                    trust_remote_code=True,
+                    temperature=temperature,
+                    top_p=0.95,
+                    top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
+                    repetition_penalty=1.115,
+                )
 
                 self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
             elif self.llm_model_type == "mosaicml":
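This change drops the use_auth_token bookkeeping and the token= keyword from the pipeline() call; the remaining arguments are unchanged. For context, a minimal standalone sketch of the same construction, assuming a small public model (google/flan-t5-small) in place of the loader's configured MODEL_NAME_OR_PATH, CPU in place of hf_pipeline_device_type, and no streamer:

from langchain.llms import HuggingFacePipeline
from transformers import pipeline

pipe = pipeline(
    "text2text-generation",        # the seq2seq task matching AutoModelForSeq2SeqLM
    model="google/flan-t5-small",  # passing a name also works; the loader may
    device=-1,                     # preload the model object instead (CPU = -1)
    max_new_tokens=256,
    do_sample=True,                # needed for temperature/top_p/top_k to apply
    temperature=0.1,
    top_p=0.95,
    top_k=0,                       # 0 disables top-k, so sampling relies on top_p
    repetition_penalty=1.115,
)

llm = HuggingFacePipeline(pipeline=pipe)
print(llm("What's the capital city of Malaysia?"))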
test.py CHANGED
@@ -1,14 +1,21 @@
 # project/test.py
 
+import os
 import unittest
+from timeit import default_timer as timer
 
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.schema import HumanMessage
 
 from app_modules.llm_loader import LLMLoader
-from timeit import default_timer as timer
+from app_modules.utils import *
 
-USER_QUESTION = "What's the capital city of Malaysia?"
+user_question = "What's the capital city of Malaysia?"
+n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
+
+hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
+print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
+print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
 
 
 class MyCustomHandler(BaseCallbackHandler):
@@ -32,7 +39,9 @@ class TestLLMLoader(unittest.TestCase):
     def run_test_case(self, llm_model_type, query):
         llm_loader = LLMLoader(llm_model_type)
         start = timer()
-        llm_loader.init(n_threds=8, hf_pipeline_device_type="cpu")
+        llm_loader.init(
+            n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type
+        )
         end = timer()
         print(f"Model loaded in {end - start:.3f}s")
 
@@ -43,17 +52,17 @@ class TestLLMLoader(unittest.TestCase):
         print(f"Inference completed in {end2 - end:.3f}s")
         print(result)
 
-    def xtest_openai(self):
-        self.run_test_case("openai", USER_QUESTION)
+    def test_openai(self):
+        self.run_test_case("openai", user_question)
 
-    def xtest_llamacpp(self):
-        self.run_test_case("llamacpp", USER_QUESTION)
+    def test_llamacpp(self):
+        self.run_test_case("llamacpp", user_question)
 
-    def xtest_gpt4all_j(self):
-        self.run_test_case("gpt4all-j", USER_QUESTION)
+    def test_gpt4all_j(self):
+        self.run_test_case("gpt4all-j", user_question)
 
     def test_huggingface(self):
-        self.run_test_case("huggingface", USER_QUESTION)
+        self.run_test_case("huggingface", user_question)
 
 
 if __name__ == "__main__":
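Renaming xtest_* to test_* is what re-enables the openai, llamacpp, and gpt4all-j cases: unittest only discovers methods whose names start with "test". The get_device_types() helper now comes in via the wildcard import from app_modules.utils; its implementation isn't shown in this commit, but a plausible sketch (an assumption, not the repo's code) is:

import torch

def get_device_types():
    """Pick a device string for embeddings and for the HF pipeline."""
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"  # Apple Silicon
    else:
        device = "cpu"
    # both consumers can target the same device by default
    return device, device

With the suite enabled, a single backend can still be exercised in isolation, e.g. python -m unittest test.TestLLMLoader.test_huggingface -v.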