dh-mc committed
Commit 7f9d16c
1 Parent(s): ab49330

refactor code

Files changed (4):
  1. .env.example +5 -5
  2. Makefile +1 -1
  3. app_modules/llm_loader.py +553 -0
  4. test.py +31 -127
.env.example CHANGED
@@ -8,7 +8,7 @@ LLM_MODEL_TYPE=huggingface
 
 OPENAI_API_KEY=
 
-# if unset, default to "gpt-4"
+# if unset, default to "gpt-3.5-turbo"
 OPENAI_MODEL_NAME=
 
 # cpu, mps or cuda:0 - if unset, use whatever detected
@@ -54,14 +54,14 @@ MOSAICML_MODEL_NAME_OR_PATH="mosaicml/mpt-7b-instruct"
 
 FALCON_MODEL_NAME_OR_PATH="tiiuae/falcon-7b-instruct"
 
-GPT4ALL_J_MODEL_PATH="./models/ggml-gpt4all-j-v1.3-groovy.bin"
-GPT4ALL_J_DOWNLOAD_LINK=https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin
+GPT4ALL_J_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_0.bin"
+GPT4ALL_J_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_0.bin
 
 GPT4ALL_MODEL_PATH="./models/ggml-nous-gpt4-vicuna-13b.bin"
 GPT4ALL_DOWNLOAD_LINK=https://gpt4all.io/models/ggml-nous-gpt4-vicuna-13b.bin
 
-LLAMACPP_MODEL_PATH="./models/wizardLM-7B.ggmlv3.q4_1.bin"
-LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/wizardLM-7B-GGML/resolve/main/wizardLM-7B.ggmlv3.q4_1.bin
+LLAMACPP_MODEL_PATH="./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
+LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin
 
 # Index for AI Books PDF files - chunk_size=1024 chunk_overlap=512
 # CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
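
Note: the updated GPT4ALL_J_* and LLAMACPP_* defaults now point at Llama-2-7B-Chat GGML weights from TheBloke's Hugging Face repo. These variables are resolved by app_modules.utils.ensure_model_is_downloaded (imported in llm_loader.py below), whose body is not part of this commit; the snippet below is only a rough sketch of that pattern, under the assumption that the helper downloads the file when the configured path is missing.

import os
import urllib.request


def resolve_llamacpp_model() -> str:
    # Illustrative sketch only -- not the repo's actual helper.
    # Returns the local GGML path from .env, downloading the file if it is missing.
    model_path = os.environ.get(
        "LLAMACPP_MODEL_PATH", "./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
    )
    download_link = os.environ.get("LLAMACPP_DOWNLOAD_LINK")
    if not os.path.isfile(model_path) and download_link:
        os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
        print(f"downloading {download_link} to {model_path}")
        urllib.request.urlretrieve(download_link, model_path)
    return model_path
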
Makefile CHANGED
@@ -10,7 +10,7 @@ else
 endif
 
 test:
-	PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 TRANSFORMERS_OFFLINE=1 python test.py
+	PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 python test.py
 
 chat:
 	python test.py chat
app_modules/llm_loader.py ADDED
@@ -0,0 +1,553 @@
import os
import sys
import time
import urllib
from queue import Queue
from threading import Thread
from typing import Any, Optional

import torch
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.tracers import LangChainTracer
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import GPT4All, HuggingFacePipeline, LlamaCpp
from langchain.schema import LLMResult
from langchain.vectorstores import VectorStore
from langchain.vectorstores.base import VectorStore
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    StoppingCriteria,
    StoppingCriteriaList,
    T5Tokenizer,
    TextStreamer,
    pipeline,
)

from app_modules.instruct_pipeline import InstructionTextGenerationPipeline
from app_modules.utils import ensure_model_is_downloaded, remove_extra_spaces


class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
    def __init__(
        self,
        tokenizer: "AutoTokenizer",
        skip_prompt: bool = False,
        timeout: Optional[float] = None,
        **decode_kwargs,
    ):
        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
        self.text_queue = Queue()
        self.stop_signal = None
        self.timeout = timeout

    def on_finalized_text(self, text: str, stream_end: bool = False):
        super().on_finalized_text(text, stream_end=stream_end)

        """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
        self.text_queue.put(text, timeout=self.timeout)
        if stream_end:
            print("\n")
            self.text_queue.put("\n", timeout=self.timeout)
            self.text_queue.put(self.stop_signal, timeout=self.timeout)

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        sys.stdout.write(token)
        sys.stdout.flush()
        self.text_queue.put(token, timeout=self.timeout)

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        print("\n")
        self.text_queue.put("\n", timeout=self.timeout)
        self.text_queue.put(self.stop_signal, timeout=self.timeout)

    def __iter__(self):
        return self

    def __next__(self):
        value = self.text_queue.get(timeout=self.timeout)
        if value == self.stop_signal:
            raise StopIteration()
        else:
            return value

    def reset(self, q: Queue = None):
        # print("resetting TextIteratorStreamer")
        self.text_queue = q if q is not None else Queue()

    def empty(self):
        return self.text_queue.empty()


class LLMLoader:
    llm_model_type: str
    llm: any
    streamer: any

    def __init__(self, llm_model_type):
        self.llm_model_type = llm_model_type
        self.llm = None
        self.streamer = TextIteratorStreamer("")
        self.max_tokens_limit = 2048
        self.search_kwargs = {"k": 4}

    def _init_streamer(self, tokenizer, custom_handler):
        self.streamer = (
            TextIteratorStreamer(
                tokenizer,
                timeout=10.0,
                skip_prompt=True,
                skip_special_tokens=True,
            )
            if custom_handler is None
            else TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        )

    def init(
        self,
        custom_handler: Optional[BaseCallbackHandler] = None,
        n_threds: int = 4,
        hf_pipeline_device_type: str = None,
    ):
        print("initializing LLM: " + self.llm_model_type)

        if hf_pipeline_device_type is None:
            hf_pipeline_device_type = "cpu"

        using_cuda = hf_pipeline_device_type.startswith("cuda")
        torch_dtype = torch.float16 if using_cuda else torch.float32
        if os.environ.get("USING_TORCH_BFLOAT16") == "true":
            torch_dtype = torch.bfloat16
        load_quantized_model = os.environ.get("LOAD_QUANTIZED_MODEL")

        print(f" hf_pipeline_device_type: {hf_pipeline_device_type}")
        print(f" load_quantized_model: {load_quantized_model}")
        print(f" torch_dtype: {torch_dtype}")
        print(f" n_threds: {n_threds}")

        double_quant_config = BitsAndBytesConfig(
            load_in_4bit=load_quantized_model == "4bit",
            bnb_4bit_use_double_quant=load_quantized_model == "4bit",
            load_in_8bit=load_quantized_model == "8bit",
            bnb_8bit_use_double_quant=load_quantized_model == "8bit",
        )

        callbacks = [self.streamer]
        if custom_handler is not None:
            callbacks.append(custom_handler)

        if self.llm is None:
            if self.llm_model_type == "openai":
                MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-3.5-turbo"
                print(f" using model: {MODEL_NAME}")
                self.llm = ChatOpenAI(
                    model_name=MODEL_NAME,
                    streaming=True,
                    callbacks=callbacks,
                    verbose=True,
                    temperature=0,
                )
            elif self.llm_model_type.startswith("gpt4all"):
                MODEL_PATH = ensure_model_is_downloaded(self.llm_model_type)
                self.llm = GPT4All(
                    model=MODEL_PATH,
                    max_tokens=2048,
                    n_threads=n_threds,
                    backend="gptj" if self.llm_model_type == "gpt4all-j" else "llama",
                    callbacks=callbacks,
                    verbose=True,
                    use_mlock=True,
                )
            elif self.llm_model_type == "llamacpp":
                MODEL_PATH = ensure_model_is_downloaded(self.llm_model_type)
                self.llm = LlamaCpp(
                    model_path=MODEL_PATH,
                    n_ctx=8192,
                    n_threads=n_threds,
                    seed=0,
                    temperature=0,
                    max_tokens=2048,
                    callbacks=callbacks,
                    verbose=True,
                    use_mlock=True,
                )
            elif self.llm_model_type.startswith("huggingface"):
                MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
                print(f" loading model: {MODEL_NAME_OR_PATH}")

                hf_auth_token = os.environ.get("HUGGINGFACE_AUTH_TOKEN")
                transformers_offline = os.environ.get("TRANSFORMERS_OFFLINE") == "1"
                token = (
                    hf_auth_token
                    if hf_auth_token is not None
                    and len(hf_auth_token) > 0
                    and not transformers_offline
                    else None
                )
                print(f" HF auth token: {str(token)[-5:]}")

                is_t5 = "t5" in MODEL_NAME_OR_PATH
                temperature = (
                    0.01
                    if "gpt4all-j" in MODEL_NAME_OR_PATH
                    or "dolly" in MODEL_NAME_OR_PATH
                    else 0
                )
                use_fast = (
                    "stable" in MODEL_NAME_OR_PATH
                    or "RedPajama" in MODEL_NAME_OR_PATH
                    or "dolly" in MODEL_NAME_OR_PATH
                )
                padding_side = "left"  # if "dolly" in MODEL_NAME_OR_PATH else None

                config = AutoConfig.from_pretrained(
                    MODEL_NAME_OR_PATH,
                    trust_remote_code=True,
                    token=token,
                )
                # config.attn_config["attn_impl"] = "triton"
                # config.max_seq_len = 4096
                config.init_device = hf_pipeline_device_type

                tokenizer = (
                    T5Tokenizer.from_pretrained(
                        MODEL_NAME_OR_PATH,
                        token=token,
                    )
                    if is_t5
                    else AutoTokenizer.from_pretrained(
                        MODEL_NAME_OR_PATH,
                        use_fast=use_fast,
                        trust_remote_code=True,
                        padding_side=padding_side,
                        token=token,
                    )
                )

                self._init_streamer(tokenizer, custom_handler)

                task = "text2text-generation" if is_t5 else "text-generation"

                return_full_text = True if "dolly" in MODEL_NAME_OR_PATH else None

                repetition_penalty = (
                    1.15
                    if "falcon" in MODEL_NAME_OR_PATH
                    else (1.25 if "dolly" in MODEL_NAME_OR_PATH else 1.1)
                )

                if load_quantized_model is not None:
                    model = (
                        AutoModelForSeq2SeqLM.from_pretrained(
                            MODEL_NAME_OR_PATH,
                            config=config,
                            quantization_config=double_quant_config,
                            trust_remote_code=True,
                            token=token,
                        )
                        if is_t5
                        else AutoModelForCausalLM.from_pretrained(
                            MODEL_NAME_OR_PATH,
                            config=config,
                            quantization_config=double_quant_config,
                            trust_remote_code=True,
                            token=token,
                        )
                    )

                    print(f"Model memory footprint: {model.get_memory_footprint()}")

                    eos_token_id = -1
                    # starchat-beta uses a special <|end|> token with ID 49155 to denote ends of a turn
                    if "starchat" in MODEL_NAME_OR_PATH:
                        eos_token_id = 49155
                    pad_token_id = eos_token_id

                    pipe = (
                        InstructionTextGenerationPipeline(
                            task=task,
                            model=model,
                            tokenizer=tokenizer,
                            streamer=self.streamer,
                            max_new_tokens=2048,
                            temperature=temperature,
                            return_full_text=return_full_text,  # langchain expects the full text
                            repetition_penalty=repetition_penalty,
                        )
                        if "dolly" in MODEL_NAME_OR_PATH
                        else (
                            pipeline(
                                task,
                                model=model,
                                tokenizer=tokenizer,
                                eos_token_id=eos_token_id,
                                pad_token_id=pad_token_id,
                                streamer=self.streamer,
                                return_full_text=return_full_text,  # langchain expects the full text
                                device_map="auto",
                                trust_remote_code=True,
                                max_new_tokens=2048,
                                do_sample=True,
                                temperature=0.01,
                                top_p=0.95,
                                top_k=50,
                                repetition_penalty=repetition_penalty,
                            )
                            if eos_token_id != -1
                            else pipeline(
                                task,
                                model=model,
                                tokenizer=tokenizer,
                                streamer=self.streamer,
                                return_full_text=return_full_text,  # langchain expects the full text
                                device_map="auto",
                                trust_remote_code=True,
                                max_new_tokens=2048,
                                # verbose=True,
                                temperature=temperature,
                                top_p=0.95,
                                top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                                repetition_penalty=repetition_penalty,
                            )
                        )
                    )
                elif "dolly" in MODEL_NAME_OR_PATH:
                    model = AutoModelForCausalLM.from_pretrained(
                        MODEL_NAME_OR_PATH,
                        device_map=hf_pipeline_device_type,
                        torch_dtype=torch_dtype,
                    )

                    pipe = InstructionTextGenerationPipeline(
                        task=task,
                        model=model,
                        tokenizer=tokenizer,
                        streamer=self.streamer,
                        max_new_tokens=2048,
                        temperature=temperature,
                        return_full_text=True,
                        repetition_penalty=repetition_penalty,
                        token=token,
                    )
                else:
                    if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
                        use_auth_token = None
                        model = (
                            AutoModelForSeq2SeqLM.from_pretrained(
                                MODEL_NAME_OR_PATH,
                                config=config,
                                trust_remote_code=True,
                                token=token,
                            )
                            if is_t5
                            else AutoModelForCausalLM.from_pretrained(
                                MODEL_NAME_OR_PATH,
                                config=config,
                                trust_remote_code=True,
                                token=token,
                            )
                        )
                        print(f"Model memory footprint: {model.get_memory_footprint()}")
                    else:
                        use_auth_token = token
                        model = MODEL_NAME_OR_PATH

                    pipe = pipeline(
                        task,
                        model=model,
                        tokenizer=tokenizer,
                        streamer=self.streamer,
                        return_full_text=return_full_text,  # langchain expects the full text
                        device=hf_pipeline_device_type,
                        torch_dtype=torch_dtype,
                        max_new_tokens=2048,
                        trust_remote_code=True,
                        temperature=temperature,
                        top_p=0.95,
                        top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                        repetition_penalty=1.115,
                        token=use_auth_token,
                    )

                self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
            elif self.llm_model_type == "mosaicml":
                MODEL_NAME_OR_PATH = os.environ.get("MOSAICML_MODEL_NAME_OR_PATH")
                print(f" loading model: {MODEL_NAME_OR_PATH}")

                config = AutoConfig.from_pretrained(
                    MODEL_NAME_OR_PATH, trust_remote_code=True
                )
                # config.attn_config["attn_impl"] = "triton"
                config.max_seq_len = 16384 if "30b" in MODEL_NAME_OR_PATH else 4096
                config.init_device = hf_pipeline_device_type

                model = (
                    AutoModelForCausalLM.from_pretrained(
                        MODEL_NAME_OR_PATH,
                        config=config,
                        quantization_config=double_quant_config,
                        trust_remote_code=True,
                    )
                    if load_quantized_model is not None
                    else AutoModelForCausalLM.from_pretrained(
                        MODEL_NAME_OR_PATH,
                        config=config,
                        torch_dtype=torch_dtype,
                        trust_remote_code=True,
                    )
                )

                print(f"Model loaded on {config.init_device}")
                print(f"Model memory footprint: {model.get_memory_footprint()}")

                tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
                self._init_streamer(tokenizer, custom_handler)

                # mpt-7b is trained to add "<|endoftext|>" at the end of generations
                stop_token_ids = tokenizer.convert_tokens_to_ids(["<|endoftext|>"])

                # define custom stopping criteria object
                class StopOnTokens(StoppingCriteria):
                    def __call__(
                        self,
                        input_ids: torch.LongTensor,
                        scores: torch.FloatTensor,
                        **kwargs,
                    ) -> bool:
                        for stop_id in stop_token_ids:
                            if input_ids[0][-1] == stop_id:
                                return True
                        return False

                stopping_criteria = StoppingCriteriaList([StopOnTokens()])

                max_new_tokens = 8192 if "30b" in MODEL_NAME_OR_PATH else 2048
                self.max_tokens_limit = max_new_tokens
                self.search_kwargs = (
                    {"k": 8} if "30b" in MODEL_NAME_OR_PATH else self.search_kwargs
                )
                repetition_penalty = 1.05 if "30b" in MODEL_NAME_OR_PATH else 1.02

                pipe = (
                    pipeline(
                        model=model,
                        tokenizer=tokenizer,
                        streamer=self.streamer,
                        return_full_text=True,  # langchain expects the full text
                        task="text-generation",
                        device_map="auto",
                        # we pass model parameters here too
                        stopping_criteria=stopping_criteria,  # without this model will ramble
                        temperature=0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
                        top_p=0.95,  # select from top tokens whose probability adds up to 95%
                        top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                        max_new_tokens=max_new_tokens,  # max number of tokens to generate in the output
                        repetition_penalty=repetition_penalty,  # without this output begins repeating
                    )
                    if load_quantized_model is not None
                    else pipeline(
                        model=model,
                        tokenizer=tokenizer,
                        streamer=self.streamer,
                        return_full_text=True,  # langchain expects the full text
                        task="text-generation",
                        device=config.init_device,
                        # we pass model parameters here too
                        stopping_criteria=stopping_criteria,  # without this model will ramble
                        temperature=0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
                        top_p=0.95,  # select from top tokens whose probability adds up to 95%
                        top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                        max_new_tokens=max_new_tokens,  # max number of tokens to generate in the output
                        repetition_penalty=repetition_penalty,  # without this output begins repeating
                    )
                )
                self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
            elif self.llm_model_type == "stablelm":
                MODEL_NAME_OR_PATH = os.environ.get("STABLELM_MODEL_NAME_OR_PATH")
                print(f" loading model: {MODEL_NAME_OR_PATH}")

                config = AutoConfig.from_pretrained(
                    MODEL_NAME_OR_PATH, trust_remote_code=True
                )
                # config.attn_config["attn_impl"] = "triton"
                # config.max_seq_len = 4096
                config.init_device = hf_pipeline_device_type

                model = (
                    AutoModelForCausalLM.from_pretrained(
                        MODEL_NAME_OR_PATH,
                        config=config,
                        quantization_config=double_quant_config,
                        trust_remote_code=True,
                    )
                    if load_quantized_model is not None
                    else AutoModelForCausalLM.from_pretrained(
                        MODEL_NAME_OR_PATH,
                        config=config,
                        torch_dtype=torch_dtype,
                        trust_remote_code=True,
                    )
                )

                print(f"Model loaded on {config.init_device}")
                print(f"Model memory footprint: {model.get_memory_footprint()}")

                tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
                self._init_streamer(tokenizer, custom_handler)

                class StopOnTokens(StoppingCriteria):
                    def __call__(
                        self,
                        input_ids: torch.LongTensor,
                        scores: torch.FloatTensor,
                        **kwargs,
                    ) -> bool:
                        stop_ids = [50278, 50279, 50277, 1, 0]
                        for stop_id in stop_ids:
                            if input_ids[0][-1] == stop_id:
                                return True
                        return False

                stopping_criteria = StoppingCriteriaList([StopOnTokens()])

                pipe = (
                    pipeline(
                        model=model,
                        tokenizer=tokenizer,
                        streamer=self.streamer,
                        return_full_text=True,  # langchain expects the full text
                        task="text-generation",
                        device_map="auto",
                        # we pass model parameters here too
                        stopping_criteria=stopping_criteria,  # without this model will ramble
                        temperature=0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
                        top_p=0.95,  # select from top tokens whose probability adds up to 95%
                        top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                        max_new_tokens=2048,  # max number of tokens to generate in the output
                        repetition_penalty=1.25,  # without this output begins repeating
                    )
                    if load_quantized_model is not None
                    else pipeline(
                        model=model,
                        tokenizer=tokenizer,
                        streamer=self.streamer,
                        return_full_text=True,  # langchain expects the full text
                        task="text-generation",
                        device=config.init_device,
                        # we pass model parameters here too
                        stopping_criteria=stopping_criteria,  # without this model will ramble
                        temperature=0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
                        top_p=0.95,  # select from top tokens whose probability adds up to 95%
                        top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                        max_new_tokens=2048,  # max number of tokens to generate in the output
                        repetition_penalty=1.05,  # without this output begins repeating
                    )
                )
                self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)

        print("initialization complete")
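
A minimal usage sketch of the new loader (the rewritten test.py below exercises the same path). It assumes the .env variables for the chosen backend are set and the model file has already been downloaded; the streamer registered as a callback echoes tokens to stdout while the call runs.

from app_modules.llm_loader import LLMLoader

llm_loader = LLMLoader("llamacpp")  # or "openai", "gpt4all-j", "huggingface", ...
llm_loader.init(n_threds=8, hf_pipeline_device_type="cpu")

# LangChain LLMs are callable; the TextIteratorStreamer callback streams
# tokens to stdout during generation, then the full answer is returned.
answer = llm_loader.llm("What's the capital city of Malaysia?")
print(answer)
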
test.py CHANGED
@@ -1,45 +1,14 @@
-import os
-import sys
-from timeit import default_timer as timer
-from typing import List
-
-from langchain.callbacks.base import BaseCallbackHandler
-from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.chains import ConversationalRetrievalChain
-from langchain.embeddings import HuggingFaceInstructEmbeddings
-from langchain.llms import GPT4All
-from langchain.schema import LLMResult
-from langchain.vectorstores.chroma import Chroma
-from langchain.vectorstores.faiss import FAISS
-
-from app_modules.qa_chain import *
-from app_modules.utils import *
-
-# Constants
-init_settings()
-
-# https://github.com/huggingface/transformers/issues/17611
-os.environ["CURL_CA_BUNDLE"] = ""
-
-hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
-print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
-print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
-
-hf_embeddings_model_name = (
-    os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
-)
-n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
-faiss_index_path = os.environ.get("FAISS_INDEX_PATH") or ""
-using_faiss = len(faiss_index_path) > 0
-index_path = faiss_index_path if using_faiss else os.environ.get("CHROMADB_INDEX_PATH")
-llm_model_type = os.environ.get("LLM_MODEL_TYPE")
-chatting = len(sys.argv) > 1 and sys.argv[1] == "chat"
-questions_file_path = os.environ.get("QUESTIONS_FILE_PATH")
-chat_history_enabled = os.environ.get("CHAT_HISTORY_ENABLED") or "true"
-
-## utility functions
-
-import os
+# project/test.py
+
+import unittest
+
+from langchain.callbacks.base import BaseCallbackHandler
+from langchain.schema import HumanMessage
+
+from app_modules.llm_loader import LLMLoader
+from timeit import default_timer as timer
+
+USER_QUESTION = "What's the capital city of Malaysia?"
 
 
 class MyCustomHandler(BaseCallbackHandler):
@@ -52,105 +21,40 @@ class MyCustomHandler(BaseCallbackHandler):
     def get_standalone_question(self) -> str:
         return self.texts[0].strip() if len(self.texts) > 0 else None
 
-    def on_llm_end(self, response: LLMResult, **kwargs) -> None:
+    def on_llm_end(self, response, **kwargs) -> None:
         """Run when chain ends running."""
         print("\non_llm_end - response:")
         print(response)
         self.texts.append(response.generations[0][0].text)
 
 
-start = timer()
-embeddings = HuggingFaceInstructEmbeddings(
-    model_name=hf_embeddings_model_name,
-    model_kwargs={"device": hf_embeddings_device_type},
-)
-end = timer()
-
-print(f"Completed in {end - start:.3f}s")
-
-start = timer()
-
-print(f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}")
-
-if not os.path.isdir(index_path):
-    raise ValueError(f"{index_path} does not exist!")
-elif using_faiss:
-    vectorstore = FAISS.load_local(index_path, embeddings)
-else:
-    vectorstore = Chroma(embedding_function=embeddings, persist_directory=index_path)
-
-end = timer()
-
-print(f"Completed in {end - start:.3f}s")
-
-start = timer()
-qa_chain = QAChain(vectorstore, llm_model_type)
-custom_handler = MyCustomHandler()
-qa_chain.init(
-    custom_handler, n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type
-)
-end = timer()
-print(f"Completed in {end - start:.3f}s")
-
-# input("Press Enter to continue...")
-# exit()
-
-# Chatbot loop
-chat_history = []
-print("Welcome to the ChatPDF! Type 'exit' to stop.")
-
-# Open the file for reading
-file = open(questions_file_path, "r")
-
-# Read the contents of the file into a list of strings
-queue = file.readlines()
-for i in range(len(queue)):
-    queue[i] = queue[i].strip()
-
-# Close the file
-file.close()
-
-queue.append("exit")
-
-chat_start = timer()
-
-while True:
-    if chatting:
-        query = input("Please enter your question: ")
-    else:
-        query = queue.pop(0)
-
-    query = query.strip()
-    if query.lower() == "exit":
-        break
-
-    print("\nQuestion: " + query)
-    custom_handler.reset()
-
-    start = timer()
-    result = qa_chain.call({"question": query, "chat_history": chat_history}, None)
-    end = timer()
-    print(f"Completed in {end - start:.3f}s")
-
-    print_llm_response(result)
-
-    if len(chat_history) == 0:
-        standalone_question = query
-    else:
-        standalone_question = custom_handler.get_standalone_question()
-
-    if standalone_question is not None:
-        print(f"Load relevant documents for standalone question: {standalone_question}")
-        start = timer()
-        qa = qa_chain.get_chain()
-        docs = qa.retriever.get_relevant_documents(standalone_question)
-        end = timer()
-
-        # print(docs)
-        print(f"Completed in {end - start:.3f}s")
-
-    if chat_history_enabled == "true":
-        chat_history.append((query, result["answer"]))
-
-chat_end = timer()
-print(f"Total time used: {chat_end - chat_start:.3f}s")
+
+class TestLLMLoader(unittest.TestCase):
+    def run_test_case(self, llm_model_type, query):
+        llm_loader = LLMLoader(llm_model_type)
+        start = timer()
+        llm_loader.init(n_threds=8, hf_pipeline_device_type="cpu")
+        end = timer()
+        print(f"Model loaded in {end - start:.3f}s")
+
+        result = llm_loader.llm(
+            [HumanMessage(content=query)] if llm_model_type == "openai" else query
+        )
+        end2 = timer()
+        print(f"Inference completed in {end2 - end:.3f}s")
+        print(result)
+
+    def xtest_openai(self):
+        self.run_test_case("openai", USER_QUESTION)
+
+    def xtest_llamacpp(self):
+        self.run_test_case("llamacpp", USER_QUESTION)
+
+    def xtest_gpt4all_j(self):
+        self.run_test_case("gpt4all-j", USER_QUESTION)
+
+    def test_huggingface(self):
+        self.run_test_case("huggingface", USER_QUESTION)
+
+
+if __name__ == "__main__":
+    unittest.main()