dh-mc committed on
Commit
a28a4f8
1 Parent(s): 2f3b7d0

get latest code from orca-2 space

.env.example CHANGED
@@ -11,7 +11,7 @@ LLM_MODEL_TYPE=hftgi
 
 OPENLLM_SERVER_URL=http://localhost:64300
 
-HFTGI_SERVER_URL=https://enabled-factually-cougar.ngrok-free.app
+HFTGI_SERVER_URL=
 
 OPENAI_API_KEY=
 
@@ -28,6 +28,7 @@ HF_PIPELINE_DEVICE_TYPE=
 
 # USE_LLAMA_2_PROMPT_TEMPLATE=true
 DISABLE_MODEL_PRELOADING=true
+USER_CONVERSATION_SUMMARY_BUFFER_MEMORY=true
 CHAT_HISTORY_ENABLED=false
 SHOW_PARAM_SETTINGS=false
 SHARE_GRADIO_APP=false
@@ -47,15 +48,21 @@ USING_TORCH_BFLOAT16=true
 # HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-12b"
 
 # LLM_MODEL_TYPE must be set to huggingface
+# HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-3b"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-7b"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="databricks/dolly-v2-12b"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/wizardLM-7B-HF"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="TheBloke/vicuna-7B-1.1-HF"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-j"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="nomic-ai/gpt4all-falcon"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="lmsys/fastchat-t5-3b-v1.0"
-HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-13b-chat-hf"
 # HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-70b-chat-hf"
-# HUGGINGFACE_MODEL_NAME_OR_PATH="Qwen/Qwen-7B-Chat"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-7b"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="microsoft/Orca-2-13b"
+HUGGINGFACE_MODEL_NAME_OR_PATH="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+# HUGGINGFACE_MODEL_NAME_OR_PATH="FlagAlpha/Llama2-Chinese-13b-Chat"
 
 STABLELM_MODEL_NAME_OR_PATH="OpenAssistant/stablelm-7b-sft-v7-epoch-3"
 
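For context, the new USER_CONVERSATION_SUMMARY_BUFFER_MEMORY flag is read at import time in app_modules/init.py (see that file's diff below). A minimal sketch of how such a flag is consumed, assuming the .env/.env.example file above is on the dotenv load path; nothing in this snippet is part of the commit:

# Minimal sketch: read the new flag the same way init.py does.
import os
from dotenv import find_dotenv, load_dotenv

load_dotenv(find_dotenv(".env.example"), override=False)

if os.environ.get("USER_CONVERSATION_SUMMARY_BUFFER_MEMORY") == "true":
    print("QA chain will use a conversation summary buffer memory")
else:
    print("QA chain will run without summary memory")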
app.py CHANGED
@@ -1,5 +1,7 @@
1
  """Main entrypoint for the app."""
 
2
  import os
 
3
  import time
4
  from queue import Queue
5
  from timeit import default_timer as timer
@@ -13,14 +15,13 @@ from app_modules.utils import print_llm_response, remove_extra_spaces
13
 
14
  llm_loader, qa_chain = app_init()
15
 
16
- show_param_settings = os.environ.get("SHOW_PARAM_SETTINGS") == "true"
17
  share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
18
  using_openai = os.environ.get("LLM_MODEL_TYPE") == "openai"
19
- chat_with_llama_2 = (
20
- not using_openai and os.environ.get("USE_LLAMA_2_PROMPT_TEMPLATE") == "true"
21
  )
22
  chat_history_enabled = (
23
- not chat_with_llama_2 and os.environ.get("CHAT_HISTORY_ENABLED") == "true"
24
  )
25
 
26
  model = (
@@ -34,180 +35,97 @@ href = (
34
  else f"https://huggingface.co/{model}"
35
  )
36
 
37
- if chat_with_llama_2:
38
  qa_chain = ChatChain(llm_loader)
39
- name = "Llama-2"
40
  else:
41
- name = "AI Books"
42
-
43
- title = f"""<h1 align="left" style="min-width:200px; margin-top:0;"> Chat with {name} </h1>"""
44
-
45
- description_top = f"""\
46
  <div align="left">
47
  <p> Currently Running: <a href="{href}">{model}</a></p>
48
  </div>
49
  """
50
 
51
- description = """\
52
- <div align="center" style="margin:16px 0">
53
- The demo is built on <a href="https://github.com/hwchase17/langchain">LangChain</a>.
54
- </div>
55
- """
56
 
57
- CONCURRENT_COUNT = 1
58
 
 
 
59
 
60
- def qa(chatbot):
61
- user_msg = chatbot[-1][0]
62
  q = Queue()
63
  result = Queue()
64
- job_done = object()
65
-
66
- def task(question, chat_history):
67
- start = timer()
68
- inputs = {"question": question}
69
- if not chat_with_llama_2:
70
- inputs["chat_history"] = chat_history
71
- ret = qa_chain.call_chain(inputs, None, q)
72
- end = timer()
73
-
74
- print(f"Completed in {end - start:.3f}s")
75
- print_llm_response(ret)
76
-
77
- q.put(job_done)
78
- result.put(ret)
79
-
80
- with start_blocking_portal() as portal:
81
- chat_history = []
82
- if chat_history_enabled:
83
- for i in range(len(chatbot) - 1):
84
- element = chatbot[i]
85
- item = (element[0] or "", element[1] or "")
86
- chat_history.append(item)
87
-
88
- portal.start_task_soon(task, user_msg, chat_history)
89
-
90
- content = ""
91
- count = 2 if len(chat_history) > 0 else 1
92
-
93
- while count > 0:
94
- while q.empty():
95
- print("nothing generated yet - retry in 0.5s")
96
- time.sleep(0.5)
97
-
98
- for next_token in llm_loader.streamer:
99
- if next_token is job_done:
100
- break
101
- content += next_token or ""
102
- chatbot[-1][1] = remove_extra_spaces(content)
103
-
104
- if count == 1:
105
- yield chatbot
106
-
107
- count -= 1
108
-
109
- if not chat_with_llama_2:
110
- chatbot[-1][1] += "\n\nSources:\n"
111
- ret = result.get()
112
- titles = []
113
- for doc in ret["source_documents"]:
114
- page = doc.metadata["page"] + 1
115
- url = f"{doc.metadata['url']}#page={page}"
116
- file_name = doc.metadata["source"].split("/")[-1]
117
- title = f"{file_name} Page: {page}"
118
- if title not in titles:
119
- titles.append(title)
120
- chatbot[-1][1] += f"1. [{title}]({url})\n"
121
-
122
- yield chatbot
123
-
124
-
125
- with open("assets/custom.css", "r", encoding="utf-8") as f:
126
- customCSS = f.read()
127
-
128
- with gr.Blocks(css=customCSS) as demo:
129
- user_question = gr.State("")
130
- with gr.Row():
131
- gr.HTML(title)
132
- gr.Markdown(description_top)
133
- with gr.Row().style(equal_height=True):
134
- with gr.Column(scale=5):
135
- with gr.Row():
136
- chatbot = gr.Chatbot(elem_id="inflaton_chatbot").style(height="100%")
137
- with gr.Row():
138
- with gr.Column(scale=2):
139
- user_input = gr.Textbox(
140
- show_label=False, placeholder="Enter your question here"
141
- ).style(container=False)
142
- with gr.Column(
143
- min_width=70,
144
- ):
145
- submitBtn = gr.Button("Send")
146
- with gr.Column(
147
- min_width=70,
148
- ):
149
- clearBtn = gr.Button("Clear")
150
- if show_param_settings:
151
- with gr.Column():
152
- with gr.Column(
153
- min_width=50,
154
- ):
155
- with gr.Tab(label="Parameter Setting"):
156
- gr.Markdown("# Parameters")
157
- top_p = gr.Slider(
158
- minimum=-0,
159
- maximum=1.0,
160
- value=0.95,
161
- step=0.05,
162
- # interactive=True,
163
- label="Top-p",
164
- )
165
- temperature = gr.Slider(
166
- minimum=0.1,
167
- maximum=2.0,
168
- value=0,
169
- step=0.1,
170
- # interactive=True,
171
- label="Temperature",
172
- )
173
- max_new_tokens = gr.Slider(
174
- minimum=0,
175
- maximum=2048,
176
- value=2048,
177
- step=8,
178
- # interactive=True,
179
- label="Max Generation Tokens",
180
- )
181
- max_context_length_tokens = gr.Slider(
182
- minimum=0,
183
- maximum=4096,
184
- value=4096,
185
- step=128,
186
- # interactive=True,
187
- label="Max Context Tokens",
188
- )
189
- gr.Markdown(description)
190
-
191
- def chat(user_message, history):
192
- return "", history + [[user_message, None]]
193
-
194
- user_input.submit(
195
- chat, [user_input, chatbot], [user_input, chatbot], queue=True
196
- ).then(qa, chatbot, chatbot)
197
-
198
- submitBtn.click(
199
- chat, [user_input, chatbot], [user_input, chatbot], queue=True, api_name="chat"
200
- ).then(qa, chatbot, chatbot)
201
-
202
- def reset():
203
- return "", []
204
-
205
- clearBtn.click(
206
- reset,
207
- outputs=[user_input, chatbot],
208
- show_progress=True,
209
- api_name="reset",
210
- )
211
-
212
- demo.title = "Chat with AI Books" if chat_with_llama_2 else "Chat with Llama-2"
213
- demo.queue(concurrency_count=CONCURRENT_COUNT).launch(share=share_gradio_app)
 
1
  """Main entrypoint for the app."""
2
+
3
  import os
4
+ from threading import Thread
5
  import time
6
  from queue import Queue
7
  from timeit import default_timer as timer
 
15
 
16
  llm_loader, qa_chain = app_init()
17
 
 
18
  share_gradio_app = os.environ.get("SHARE_GRADIO_APP") == "true"
19
  using_openai = os.environ.get("LLM_MODEL_TYPE") == "openai"
20
+ chat_with_orca_2 = (
21
+ not using_openai and os.environ.get("USE_ORCA_2_PROMPT_TEMPLATE") == "true"
22
  )
23
  chat_history_enabled = (
24
+ not chat_with_orca_2 and os.environ.get("CHAT_HISTORY_ENABLED") == "true"
25
  )
26
 
27
  model = (
 
35
  else f"https://huggingface.co/{model}"
36
  )
37
 
38
+ if chat_with_orca_2:
39
  qa_chain = ChatChain(llm_loader)
40
+ name = "Orca-2"
41
  else:
42
+ name = "PCI DSS v4"
43
+
44
+ title = f"Chat with {name}"
45
+ examples = (
46
+ ["How to cook a fish?", "Who is the president of US now?"]
47
+ if chat_with_orca_2
48
+ else [
49
+ "What's PCI DSS?",
50
+ "Can you summarize the changes made from PCI DSS version 3.2.1 to version 4.0?",
51
+ ]
52
+ )
53
+ description = f"""\
54
  <div align="left">
55
  <p> Currently Running: <a href="{href}">{model}</a></p>
56
  </div>
57
  """
58
 
 
 
 
 
 
59
 
60
+ def task(question, chat_history, q, result):
61
+ start = timer()
62
+ inputs = {"question": question, "chat_history": chat_history}
63
+ ret = qa_chain.call_chain(inputs, None, q)
64
+ end = timer()
65
+
66
+ print(f"Completed in {end - start:.3f}s")
67
+ print_llm_response(ret)
68
+
69
+ result.put(ret)
70
+
71
+
72
+ def predict(message, history):
73
+ print("predict:", message, history)
74
+
75
+ chat_history = []
76
+ if chat_history_enabled:
77
+ for element in history:
78
+ item = (element[0] or "", element[1] or "")
79
+ chat_history.append(item)
80
 
81
+ if not chat_history:
82
+ qa_chain.reset()
83
 
 
 
84
  q = Queue()
85
  result = Queue()
86
+ t = Thread(target=task, args=(message, chat_history, q, result))
87
+ t.start() # Starting the generation in a separate thread.
88
+
89
+ partial_message = ""
90
+ count = 2 if len(chat_history) > 0 else 1
91
+
92
+ while count > 0:
93
+ while q.empty():
94
+ print("nothing generated yet - retry in 0.5s")
95
+ time.sleep(0.5)
96
+
97
+ for next_token in llm_loader.streamer:
98
+ partial_message += next_token or ""
99
+ # partial_message = remove_extra_spaces(partial_message)
100
+ yield partial_message
101
+
102
+ if count == 2:
103
+ partial_message += "\n\n"
104
+
105
+ count -= 1
106
+
107
+ if not chat_with_orca_2:
108
+ partial_message += "\n\nSources:\n"
109
+ ret = result.get()
110
+ titles = []
111
+ for doc in ret["source_documents"]:
112
+ page = doc.metadata["page"] + 1
113
+ url = f"{doc.metadata['url']}#page={page}"
114
+ file_name = doc.metadata["source"].split("/")[-1]
115
+ title = f"{file_name} Page: {page}"
116
+ if title not in titles:
117
+ titles.append(title)
118
+ partial_message += f"1. [{title}]({url})\n"
119
+
120
+ yield partial_message
121
+
122
+
123
+ # Setting up the Gradio chat interface.
124
+ gr.ChatInterface(
125
+ predict,
126
+ title=title,
127
+ description=description,
128
+ examples=examples,
129
+ ).launch(
130
+ share=share_gradio_app
131
+ ) # Launching the web interface.
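For reference, the streaming pattern introduced above (a worker thread pushes tokens into a queue while predict() yields growing partial messages to gr.ChatInterface) can be reduced to a minimal, self-contained sketch. The fake_token_stream generator below is a stand-in for llm_loader.streamer and is not part of the commit:

# Minimal sketch of the thread + queue streaming pattern used in the new app.py.
import time
from queue import Queue
from threading import Thread

import gradio as gr


def fake_token_stream(message):
    # Stand-in for llm_loader.streamer: yields tokens one by one.
    for word in f"Echo: {message}".split():
        time.sleep(0.1)
        yield word + " "


def predict(message, history):
    q = Queue()

    def worker():
        for token in fake_token_stream(message):
            q.put(token)
        q.put(None)  # sentinel marking the end of generation

    Thread(target=worker).start()  # generation runs in a separate thread

    partial = ""
    while True:
        token = q.get()
        if token is None:
            break
        partial += token
        yield partial  # Gradio re-renders the bot message on each yield


gr.ChatInterface(predict, title="Streaming demo").launch()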
app_modules/init.py CHANGED
@@ -1,82 +1,92 @@
1
- """Main entrypoint for the app."""
2
- import os
3
- from timeit import default_timer as timer
4
- from typing import List, Optional
5
-
6
- from dotenv import find_dotenv, load_dotenv
7
- from langchain.embeddings import HuggingFaceInstructEmbeddings
8
- from langchain.vectorstores.chroma import Chroma
9
- from langchain.vectorstores.faiss import FAISS
10
-
11
- from app_modules.llm_loader import LLMLoader
12
- from app_modules.llm_qa_chain import QAChain
13
- from app_modules.utils import get_device_types, init_settings
14
-
15
- found_dotenv = find_dotenv(".env")
16
-
17
- if len(found_dotenv) == 0:
18
- found_dotenv = find_dotenv(".env.example")
19
- print(f"loading env vars from: {found_dotenv}")
20
- load_dotenv(found_dotenv, override=False)
21
-
22
- # Constants
23
- init_settings()
24
-
25
-
26
- def app_init(initQAChain: bool = True):
27
- # https://github.com/huggingface/transformers/issues/17611
28
- os.environ["CURL_CA_BUNDLE"] = ""
29
-
30
- llm_model_type = os.environ.get("LLM_MODEL_TYPE")
31
- n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
32
-
33
- hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
34
- print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
35
- print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
36
-
37
- if initQAChain:
38
- hf_embeddings_model_name = (
39
- os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
40
- )
41
-
42
- index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get(
43
- "CHROMADB_INDEX_PATH"
44
- )
45
- using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
46
-
47
- start = timer()
48
- embeddings = HuggingFaceInstructEmbeddings(
49
- model_name=hf_embeddings_model_name,
50
- model_kwargs={"device": hf_embeddings_device_type},
51
- )
52
- end = timer()
53
-
54
- print(f"Completed in {end - start:.3f}s")
55
-
56
- start = timer()
57
-
58
- print(
59
- f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}"
60
- )
61
-
62
- if not os.path.isdir(index_path):
63
- raise ValueError(f"{index_path} does not exist!")
64
- elif using_faiss:
65
- vectorstore = FAISS.load_local(index_path, embeddings)
66
- else:
67
- vectorstore = Chroma(
68
- embedding_function=embeddings, persist_directory=index_path
69
- )
70
-
71
- end = timer()
72
-
73
- print(f"Completed in {end - start:.3f}s")
74
-
75
- start = timer()
76
- llm_loader = LLMLoader(llm_model_type)
77
- llm_loader.init(n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type)
78
- qa_chain = QAChain(vectorstore, llm_loader) if initQAChain else None
79
- end = timer()
80
- print(f"Completed in {end - start:.3f}s")
81
-
82
- return llm_loader, qa_chain
1
+ """Main entrypoint for the app."""
2
+
3
+ import os
4
+ from timeit import default_timer as timer
5
+ from typing import List, Optional
6
+
7
+ from dotenv import find_dotenv, load_dotenv
8
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
9
+ from langchain.vectorstores.chroma import Chroma
10
+ from langchain.vectorstores.faiss import FAISS
11
+
12
+ from app_modules.llm_loader import LLMLoader
13
+ from app_modules.utils import get_device_types, init_settings
14
+
15
+ found_dotenv = find_dotenv(".env")
16
+
17
+ if len(found_dotenv) == 0:
18
+ found_dotenv = find_dotenv(".env.example")
19
+ print(f"loading env vars from: {found_dotenv}")
20
+ load_dotenv(found_dotenv, override=False)
21
+
22
+ # Constants
23
+ init_settings()
24
+
25
+ if os.environ.get("LANGCHAIN_DEBUG") == "true":
26
+ import langchain
27
+
28
+ langchain.debug = True
29
+
30
+ if os.environ.get("USER_CONVERSATION_SUMMARY_BUFFER_MEMORY") == "true":
31
+ from app_modules.llm_qa_chain_with_memory import QAChain
32
+
33
+ print("using llm_qa_chain_with_memory")
34
+ else:
35
+ from app_modules.llm_qa_chain import QAChain
36
+
37
+ print("using llm_qa_chain")
38
+
39
+
40
+ def app_init():
41
+ # https://github.com/huggingface/transformers/issues/17611
42
+ os.environ["CURL_CA_BUNDLE"] = ""
43
+
44
+ hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
45
+ print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
46
+ print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
47
+
48
+ hf_embeddings_model_name = (
49
+ os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
50
+ )
51
+
52
+ n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
53
+ index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get(
54
+ "CHROMADB_INDEX_PATH"
55
+ )
56
+ using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
57
+ llm_model_type = os.environ.get("LLM_MODEL_TYPE")
58
+
59
+ start = timer()
60
+ embeddings = HuggingFaceInstructEmbeddings(
61
+ model_name=hf_embeddings_model_name,
62
+ model_kwargs={"device": hf_embeddings_device_type},
63
+ )
64
+ end = timer()
65
+
66
+ print(f"Completed in {end - start:.3f}s")
67
+
68
+ start = timer()
69
+
70
+ print(f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}")
71
+
72
+ if not os.path.isdir(index_path):
73
+ raise ValueError(f"{index_path} does not exist!")
74
+ elif using_faiss:
75
+ vectorstore = FAISS.load_local(index_path, embeddings)
76
+ else:
77
+ vectorstore = Chroma(
78
+ embedding_function=embeddings, persist_directory=index_path
79
+ )
80
+
81
+ end = timer()
82
+
83
+ print(f"Completed in {end - start:.3f}s")
84
+
85
+ start = timer()
86
+ llm_loader = LLMLoader(llm_model_type)
87
+ llm_loader.init(n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type)
88
+ qa_chain = QAChain(vectorstore, llm_loader)
89
+ end = timer()
90
+ print(f"Completed in {end - start:.3f}s")
91
+
92
+ return llm_loader, qa_chain
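A minimal usage sketch of the refactored app_init(), assuming a .env with the variables shown earlier and a FAISS or Chroma index on disk; the caller below is hypothetical, but the printed attributes exist on LLMLoader per the code in this commit:

# Hypothetical caller of app_init(); nothing here is part of the commit.
from app_modules.init import app_init

llm_loader, qa_chain = app_init()   # loads embeddings, vector store and the LLM
print(type(qa_chain).__name__)      # QAChain, with or without summary memory
print(llm_loader.llm_model_type)    # e.g. "huggingface" or "hftgi"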
app_modules/llm_chat_chain.py CHANGED
@@ -1,11 +1,13 @@
1
  import os
2
  from typing import List, Optional
3
 
4
- from langchain import ConversationChain, PromptTemplate
 
5
  from langchain.chains.base import Chain
6
  from langchain.memory import ConversationSummaryBufferMemory
7
 
8
  from app_modules.llm_inference import LLMInference
 
9
 
10
 
11
  def get_llama_2_prompt_template():
@@ -23,6 +25,13 @@ def get_llama_2_prompt_template():
23
  return prompt_template
24
 
25
 
 
 
 
 
 
 
 
26
  class ChatChain(LLMInference):
27
  def __init__(self, llm_loader):
28
  super().__init__(llm_loader)
@@ -31,28 +40,31 @@ class ChatChain(LLMInference):
31
  template = (
32
  get_llama_2_prompt_template()
33
  if os.environ.get("USE_LLAMA_2_PROMPT_TEMPLATE") == "true"
34
- else """You are a chatbot having a conversation with a human.
 
 
 
35
  {history}
36
  Human: {input}
37
  Chatbot:"""
 
38
  )
39
 
40
  print(f"template: {template}")
41
 
42
  prompt = PromptTemplate(input_variables=["history", "input"], template=template)
43
-
44
- memory = ConversationSummaryBufferMemory(
45
- llm=self.llm_loader.llm, max_token_limit=1024, return_messages=True
46
  )
47
 
48
  llm_chain = ConversationChain(
49
  llm=self.llm_loader.llm,
50
  prompt=prompt,
51
- verbose=True,
52
  memory=memory,
53
  )
54
 
55
  return llm_chain
56
 
57
  def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
58
- return chain({"input": inputs["question"]}, callbacks)
 
1
  import os
2
  from typing import List, Optional
3
 
4
+ from langchain.chains import ConversationChain, LLMChain
5
+ from langchain.prompts import PromptTemplate
6
  from langchain.chains.base import Chain
7
  from langchain.memory import ConversationSummaryBufferMemory
8
 
9
  from app_modules.llm_inference import LLMInference
10
+ from app_modules.utils import CustomizedConversationSummaryBufferMemory
11
 
12
 
13
  def get_llama_2_prompt_template():
 
25
  return prompt_template
26
 
27
 
28
+ def get_orca_2_prompt_template():
29
+ system_message = "You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
30
+ user_message = "Chat History:\n\n{history} \n\nUser: {input}"
31
+ prompt_template = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
32
+ return prompt_template
33
+
34
+
35
  class ChatChain(LLMInference):
36
  def __init__(self, llm_loader):
37
  super().__init__(llm_loader)
 
40
  template = (
41
  get_llama_2_prompt_template()
42
  if os.environ.get("USE_LLAMA_2_PROMPT_TEMPLATE") == "true"
43
+ else (
44
+ get_orca_2_prompt_template()
45
+ if os.environ.get("USE_ORCA_2_PROMPT_TEMPLATE") == "true"
46
+ else """You are a chatbot having a conversation with a human.
47
  {history}
48
  Human: {input}
49
  Chatbot:"""
50
+ )
51
  )
52
 
53
  print(f"template: {template}")
54
 
55
  prompt = PromptTemplate(input_variables=["history", "input"], template=template)
56
+ memory = CustomizedConversationSummaryBufferMemory(
57
+ llm=self.llm_loader.llm, max_token_limit=1024, return_messages=False
 
58
  )
59
 
60
  llm_chain = ConversationChain(
61
  llm=self.llm_loader.llm,
62
  prompt=prompt,
63
+ verbose=False,
64
  memory=memory,
65
  )
66
 
67
  return llm_chain
68
 
69
  def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
70
+ return super().run_chain(chain, {"input": inputs["question"]}, callbacks)
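To see what the new Orca-2 template looks like once LangChain substitutes {history} and {input}, here is a standalone sketch that restates the template string rather than importing the module; the sample history and question are made up:

# Standalone sketch: render the Orca-2 ChatML-style prompt with sample values.
system_message = (
    "You are Orca, an AI language model created by Microsoft. You are a cautious "
    "assistant. You carefully follow instructions. You are helpful and harmless and "
    "you follow ethical guidelines and promote positive behavior."
)
user_message = "Chat History:\n\n{history} \n\nUser: {input}"
template = (
    f"<|im_start|>system\n{system_message}<|im_end|>\n"
    f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
)

print(template.format(history="Human: Hi\nAI: Hello!", input="How to cook a fish?"))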
app_modules/llm_inference.py CHANGED
@@ -5,6 +5,7 @@ import urllib
5
  from queue import Queue
6
  from threading import Thread
7
  from typing import List, Optional
 
8
 
9
  from langchain.chains.base import Chain
10
 
@@ -13,9 +14,6 @@ from app_modules.utils import remove_extra_spaces
13
 
14
 
15
  class LLMInference(metaclass=abc.ABCMeta):
16
- llm_loader: LLMLoader
17
- chain: Chain
18
-
19
  def __init__(self, llm_loader):
20
  self.llm_loader = llm_loader
21
  self.chain = None
@@ -30,8 +28,15 @@ class LLMInference(metaclass=abc.ABCMeta):
30
 
31
  return self.chain
32
 
 
 
 
33
  def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
34
- return chain(inputs, callbacks)
 
 
 
 
35
 
36
  def call_chain(
37
  self,
@@ -59,6 +64,7 @@ class LLMInference(metaclass=abc.ABCMeta):
59
  if "answer" in result:
60
  result["answer"] = remove_extra_spaces(result["answer"])
61
 
 
62
  base_url = os.environ.get("PDF_FILE_BASE_URL")
63
  if base_url is not None and len(base_url) > 0:
64
  documents = result["source_documents"]
@@ -66,6 +72,30 @@ class LLMInference(metaclass=abc.ABCMeta):
66
  source = doc.metadata["source"]
67
  title = source.split("/")[-1]
68
  doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
69
 
70
  return result
71
  finally:
 
5
  from queue import Queue
6
  from threading import Thread
7
  from typing import List, Optional
8
+ from urllib.parse import quote, urlparse, urlunparse
9
 
10
  from langchain.chains.base import Chain
11
 
 
14
 
15
 
16
  class LLMInference(metaclass=abc.ABCMeta):
 
 
 
17
  def __init__(self, llm_loader):
18
  self.llm_loader = llm_loader
19
  self.chain = None
 
28
 
29
  return self.chain
30
 
31
+ def reset(self) -> None:
32
+ self.chain = None
33
+
34
  def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
35
+ result = chain.invoke(inputs, {"callbacks": callbacks})
36
+ if "text" in result:
37
+ result["response"] = result["text"]
38
+ del result["text"]
39
+ return result
40
 
41
  def call_chain(
42
  self,
 
64
  if "answer" in result:
65
  result["answer"] = remove_extra_spaces(result["answer"])
66
 
67
+ source_path = os.environ.get("SOURCE_PATH")
68
  base_url = os.environ.get("PDF_FILE_BASE_URL")
69
  if base_url is not None and len(base_url) > 0:
70
  documents = result["source_documents"]
 
72
  source = doc.metadata["source"]
73
  title = source.split("/")[-1]
74
  doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
75
+ elif source_path is not None and len(source_path) > 0:
76
+ documents = result["source_documents"]
77
+ for doc in documents:
78
+ source = doc.metadata["source"]
79
+ url = source.replace(source_path, "https://")
80
+ url = url.replace(".html", "")
81
+ parsed_url = urlparse(url)
82
+
83
+ # Encode path, query, and fragment
84
+ encoded_path = quote(parsed_url.path)
85
+ encoded_query = quote(parsed_url.query)
86
+ encoded_fragment = quote(parsed_url.fragment)
87
+
88
+ # Construct the encoded URL
89
+ doc.metadata["url"] = urlunparse(
90
+ (
91
+ parsed_url.scheme,
92
+ parsed_url.netloc,
93
+ encoded_path,
94
+ parsed_url.params,
95
+ encoded_query,
96
+ encoded_fragment,
97
+ )
98
+ )
99
 
100
  return result
101
  finally:
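The new SOURCE_PATH branch above rewrites a local document path into an encoded https URL. A standalone sketch of that transformation; the source_path and source values are made up for illustration:

# Standalone sketch of the SOURCE_PATH -> URL rewriting added in call_chain().
from urllib.parse import quote, urlparse, urlunparse

source_path = "data/docs/"  # hypothetical SOURCE_PATH value
source = "data/docs/pcisecuritystandards.org/glossary a b.html"  # hypothetical doc.metadata["source"]

url = source.replace(source_path, "https://").replace(".html", "")
parsed = urlparse(url)

encoded = urlunparse(
    (
        parsed.scheme,
        parsed.netloc,
        quote(parsed.path),      # encode the path
        parsed.params,
        quote(parsed.query),     # encode the query
        quote(parsed.fragment),  # encode the fragment
    )
)
print(encoded)  # https://pcisecuritystandards.org/glossary%20a%20b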
app_modules/llm_loader.py CHANGED
@@ -5,17 +5,18 @@ from queue import Queue
5
  from typing import Any, Optional
6
 
7
  import torch
8
- from langchain import HuggingFaceTextGenInference
9
  from langchain.callbacks.base import BaseCallbackHandler
10
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
11
- from langchain.chat_models import ChatOpenAI
12
- from langchain.llms import (
 
 
13
  CTransformers,
14
  GPT4All,
15
  HuggingFacePipeline,
16
  LlamaCpp,
17
- OpenLLM,
18
  )
 
19
  from langchain.schema import LLMResult
20
  from transformers import (
21
  AutoConfig,
@@ -30,7 +31,6 @@ from transformers import (
30
  pipeline,
31
  )
32
 
33
- from app_modules.instruct_pipeline import InstructionTextGenerationPipeline
34
  from app_modules.utils import ensure_model_is_downloaded
35
 
36
 
@@ -49,6 +49,7 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
49
  self.timeout = timeout
50
  self.total_tokens = 0
51
  self.for_huggingface = for_huggingface
 
52
 
53
  def on_finalized_text(self, text: str, stream_end: bool = False):
54
  super().on_finalized_text(text, stream_end=stream_end)
@@ -61,11 +62,23 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
61
  self.text_queue.put("\n", timeout=self.timeout)
62
  self.text_queue.put(self.stop_signal, timeout=self.timeout)
63
 
 
 
 
 
 
 
 
 
 
 
64
  def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
65
- sys.stdout.write(token)
66
- sys.stdout.flush()
67
- self.text_queue.put(token, timeout=self.timeout)
68
- self.total_tokens = self.total_tokens + 1
 
 
69
 
70
  def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
71
  print("\n")
@@ -85,18 +98,13 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
85
  def reset(self, q: Queue = None):
86
  # print("resetting TextIteratorStreamer")
87
  self.text_queue = q if q is not None else Queue()
 
88
 
89
  def empty(self):
90
  return self.text_queue.empty()
91
 
92
 
93
  class LLMLoader:
94
- llm_model_type: str
95
- llm: any
96
- streamer: any
97
- max_tokens_limit: int
98
- lock: any
99
-
100
  def __init__(self, llm_model_type):
101
  self.llm_model_type = llm_model_type
102
  self.llm = None
@@ -129,9 +137,11 @@ class LLMLoader:
129
  hf_pipeline_device_type = "cpu"
130
 
131
  using_cuda = hf_pipeline_device_type.startswith("cuda")
132
- torch_dtype = torch.float16 if using_cuda else torch.float32
133
- if os.environ.get("USING_TORCH_BFLOAT16") == "true":
 
134
  torch_dtype = torch.bfloat16
 
135
  load_quantized_model = os.environ.get("LOAD_QUANTIZED_MODEL")
136
 
137
  print(f" hf_pipeline_device_type: {hf_pipeline_device_type}")
@@ -139,6 +149,8 @@ class LLMLoader:
139
  print(f" torch_dtype: {torch_dtype}")
140
  print(f" n_threds: {n_threds}")
141
 
 
 
142
  double_quant_config = BitsAndBytesConfig(
143
  load_in_4bit=load_quantized_model == "4bit",
144
  bnb_4bit_use_double_quant=load_quantized_model == "4bit",
@@ -156,20 +168,22 @@ class LLMLoader:
156
  if self.llm_model_type == "openai":
157
  MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-3.5-turbo"
158
  print(f" using model: {MODEL_NAME}")
159
- self.llm = ChatOpenAI(
160
- model_name=MODEL_NAME,
161
- streaming=True,
162
- callbacks=callbacks,
163
- verbose=True,
164
- temperature=0,
165
- )
166
- elif self.llm_model_type == "openllm":
167
- server_url = os.environ.get("OPENLLM_SERVER_URL")
168
- print(f" server url: {server_url}")
169
- self.llm = OpenLLM(
170
- server_url=server_url,
171
- # callbacks=callbacks,
172
- verbose=True,
 
 
173
  )
174
  elif self.llm_model_type.startswith("gpt4all"):
175
  MODEL_PATH = ensure_model_is_downloaded(self.llm_model_type)
@@ -209,6 +223,9 @@ class LLMLoader:
209
  )
210
  elif self.llm_model_type == "hftgi":
211
  HFTGI_SERVER_URL = os.environ.get("HFTGI_SERVER_URL")
 
 
 
212
  self.max_tokens_limit = 4096
213
  self.llm = HuggingFaceTextGenInference(
214
  inference_server_url=HFTGI_SERVER_URL,
@@ -217,11 +234,20 @@ class LLMLoader:
217
  top_p=0.95,
218
  # typical_p=0.95,
219
  temperature=0.01,
220
- repetition_penalty=1.12,
221
  callbacks=callbacks,
222
  timeout=600,
223
  streaming=True,
224
  )
 
 
 
 
 
 
 
 
 
225
  elif self.llm_model_type.startswith("huggingface"):
226
  MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
227
  print(f" loading model: {MODEL_NAME_OR_PATH}")
@@ -243,6 +269,27 @@ class LLMLoader:
243
 
244
  if "Llama-2" in MODEL_NAME_OR_PATH:
245
  self.max_tokens_limit = 4096
246
 
247
  is_t5 = "t5" in MODEL_NAME_OR_PATH
248
  temperature = (
@@ -250,7 +297,9 @@ class LLMLoader:
250
  if "gpt4all-j" in MODEL_NAME_OR_PATH
251
  or "dolly" in MODEL_NAME_OR_PATH
252
  or "Qwen" in MODEL_NAME_OR_PATH
253
- or "Llama-2" in MODEL_NAME_OR_PATH
 
 
254
  else 0
255
  )
256
  use_fast = (
@@ -314,6 +363,11 @@ class LLMLoader:
314
  else (1.25 if "dolly" in MODEL_NAME_OR_PATH else 1.1)
315
  )
316
 
 
 
 
 
 
317
  if load_quantized_model is not None:
318
  model = (
319
  AutoModelForSeq2SeqLM.from_pretrained(
@@ -342,71 +396,40 @@ class LLMLoader:
342
  pad_token_id = eos_token_id
343
 
344
  pipe = (
345
- InstructionTextGenerationPipeline(
346
- task=task,
347
  model=model,
348
  tokenizer=tokenizer,
 
 
349
  streamer=self.streamer,
350
- max_new_tokens=2048,
351
- temperature=temperature,
352
  return_full_text=return_full_text, # langchain expects the full text
 
 
 
 
 
 
 
353
  repetition_penalty=repetition_penalty,
354
  )
355
- if "dolly" in MODEL_NAME_OR_PATH
356
- else (
357
- pipeline(
358
- task,
359
- model=model,
360
- tokenizer=tokenizer,
361
- eos_token_id=eos_token_id,
362
- pad_token_id=pad_token_id,
363
- streamer=self.streamer,
364
- return_full_text=return_full_text, # langchain expects the full text
365
- device_map="auto",
366
- trust_remote_code=True,
367
- max_new_tokens=2048,
368
- do_sample=True,
369
- temperature=0.01,
370
- top_p=0.95,
371
- top_k=50,
372
- repetition_penalty=repetition_penalty,
373
- )
374
- if eos_token_id != -1
375
- else pipeline(
376
- task,
377
- model=model,
378
- tokenizer=tokenizer,
379
- streamer=self.streamer,
380
- return_full_text=return_full_text, # langchain expects the full text
381
- device_map="auto",
382
- trust_remote_code=True,
383
- max_new_tokens=2048,
384
- # verbose=True,
385
- temperature=temperature,
386
- top_p=0.95,
387
- top_k=0, # select from top 0 tokens (because zero, relies on top_p)
388
- repetition_penalty=repetition_penalty,
389
- )
390
  )
391
  )
392
- elif "dolly" in MODEL_NAME_OR_PATH:
393
- model = AutoModelForCausalLM.from_pretrained(
394
- MODEL_NAME_OR_PATH,
395
- device_map=hf_pipeline_device_type,
396
- torch_dtype=torch_dtype,
397
- )
398
-
399
- pipe = InstructionTextGenerationPipeline(
400
- task=task,
401
- model=model,
402
- tokenizer=tokenizer,
403
- streamer=self.streamer,
404
- max_new_tokens=2048,
405
- temperature=temperature,
406
- return_full_text=True,
407
- repetition_penalty=repetition_penalty,
408
- token=token,
409
- )
410
  else:
411
  if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
412
  model = (
@@ -456,10 +479,11 @@ class LLMLoader:
456
  torch_dtype=torch_dtype,
457
  max_new_tokens=2048,
458
  trust_remote_code=True,
 
459
  temperature=temperature,
460
  top_p=0.95,
461
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
462
- repetition_penalty=1.115,
463
  )
464
  if token is None
465
  else pipeline(
@@ -475,11 +499,12 @@ class LLMLoader:
475
  temperature=temperature,
476
  top_p=0.95,
477
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
478
- repetition_penalty=1.115,
479
  token=token,
480
  )
481
  )
482
 
 
483
  self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
484
  elif self.llm_model_type == "mosaicml":
485
  MODEL_NAME_OR_PATH = os.environ.get("MOSAICML_MODEL_NAME_OR_PATH")
@@ -534,11 +559,13 @@ class LLMLoader:
534
 
535
  max_new_tokens = 8192 if "30b" in MODEL_NAME_OR_PATH else 2048
536
  self.max_tokens_limit = max_new_tokens
537
- self.search_kwargs = (
538
- {"k": 8} if "30b" in MODEL_NAME_OR_PATH else self.search_kwargs
539
- )
540
  repetition_penalty = 1.05 if "30b" in MODEL_NAME_OR_PATH else 1.02
541
 
 
 
 
 
 
542
  pipe = (
543
  pipeline(
544
  model=model,
@@ -549,7 +576,8 @@ class LLMLoader:
549
  device_map="auto",
550
  # we pass model parameters here too
551
  stopping_criteria=stopping_criteria, # without this model will ramble
552
- temperature=0, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
 
553
  top_p=0.95, # select from top tokens whose probability add up to 15%
554
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
555
  max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
@@ -565,7 +593,8 @@ class LLMLoader:
565
  device=config.init_device,
566
  # we pass model parameters here too
567
  stopping_criteria=stopping_criteria, # without this model will ramble
568
- temperature=0, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
 
569
  top_p=0.95, # select from top tokens whose probability add up to 15%
570
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
571
  max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
@@ -584,6 +613,13 @@ class LLMLoader:
584
  # config.max_seq_len = 4096
585
  config.init_device = hf_pipeline_device_type
586
 
 
 
 
 
 
 
 
587
  model = (
588
  AutoModelForCausalLM.from_pretrained(
589
  MODEL_NAME_OR_PATH,
@@ -635,7 +671,7 @@ class LLMLoader:
635
  top_p=0.95, # select from top tokens whose probability add up to 15%
636
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
637
  max_new_tokens=2048, # mex number of tokens to generate in the output
638
- repetition_penalty=1.25, # without this output begins repeating
639
  )
640
  if load_quantized_model is not None
641
  else pipeline(
@@ -651,7 +687,7 @@ class LLMLoader:
651
  top_p=0.95, # select from top tokens whose probability add up to 15%
652
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
653
  max_new_tokens=2048, # mex number of tokens to generate in the output
654
- repetition_penalty=1.05, # without this output begins repeating
655
  )
656
  )
657
  self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
 
5
  from typing import Any, Optional
6
 
7
  import torch
 
8
  from langchain.callbacks.base import BaseCallbackHandler
9
  from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
10
+ from langchain_openai.chat_models import ChatOpenAI
11
+ from langchain_openai.llms import OpenAI
12
+ from langchain_community.llms import (
13
+ HuggingFaceTextGenInference,
14
  CTransformers,
15
  GPT4All,
16
  HuggingFacePipeline,
17
  LlamaCpp,
 
18
  )
19
+ from langchain_community.chat_models import ChatOllama
20
  from langchain.schema import LLMResult
21
  from transformers import (
22
  AutoConfig,
 
31
  pipeline,
32
  )
33
 
 
34
  from app_modules.utils import ensure_model_is_downloaded
35
 
36
 
 
49
  self.timeout = timeout
50
  self.total_tokens = 0
51
  self.for_huggingface = for_huggingface
52
+ self.end_token = ""
53
 
54
  def on_finalized_text(self, text: str, stream_end: bool = False):
55
  super().on_finalized_text(text, stream_end=stream_end)
 
62
  self.text_queue.put("\n", timeout=self.timeout)
63
  self.text_queue.put(self.stop_signal, timeout=self.timeout)
64
 
65
+ def check_end_token(self, token):
66
+ new_token = self.end_token + token
67
+ if "<|im_end|>".startswith(new_token):
68
+ self.end_token = "" if new_token == "<|im_end|>" else new_token
69
+ return None
70
+ elif self.end_token != "":
71
+ self.end_token = ""
72
+
73
+ return new_token
74
+
75
  def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
76
+ token = self.check_end_token(token)
77
+ if token:
78
+ sys.stdout.write(token)
79
+ sys.stdout.flush()
80
+ self.text_queue.put(token, timeout=self.timeout)
81
+ self.total_tokens = self.total_tokens + 1
82
 
83
  def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
84
  print("\n")
 
98
  def reset(self, q: Queue = None):
99
  # print("resetting TextIteratorStreamer")
100
  self.text_queue = q if q is not None else Queue()
101
+ self.end_token = ""
102
 
103
  def empty(self):
104
  return self.text_queue.empty()
105
 
106
 
107
  class LLMLoader:
 
 
 
 
 
 
108
  def __init__(self, llm_model_type):
109
  self.llm_model_type = llm_model_type
110
  self.llm = None
 
137
  hf_pipeline_device_type = "cpu"
138
 
139
  using_cuda = hf_pipeline_device_type.startswith("cuda")
140
+ using_mps = hf_pipeline_device_type.startswith("mps")
141
+ torch_dtype = torch.float16 if using_cuda or using_mps else torch.float32
142
+ if not using_mps and os.environ.get("USING_TORCH_BFLOAT16") == "true":
143
  torch_dtype = torch.bfloat16
144
+
145
  load_quantized_model = os.environ.get("LOAD_QUANTIZED_MODEL")
146
 
147
  print(f" hf_pipeline_device_type: {hf_pipeline_device_type}")
 
149
  print(f" torch_dtype: {torch_dtype}")
150
  print(f" n_threds: {n_threds}")
151
 
152
+ torch.set_default_dtype(torch_dtype)
153
+
154
  double_quant_config = BitsAndBytesConfig(
155
  load_in_4bit=load_quantized_model == "4bit",
156
  bnb_4bit_use_double_quant=load_quantized_model == "4bit",
 
168
  if self.llm_model_type == "openai":
169
  MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-3.5-turbo"
170
  print(f" using model: {MODEL_NAME}")
171
+ self.llm = (
172
+ OpenAI(
173
+ model_name=MODEL_NAME,
174
+ streaming=True,
175
+ callbacks=callbacks,
176
+ verbose=True,
177
+ temperature=0,
178
+ )
179
+ if "instruct" in MODEL_NAME
180
+ else ChatOpenAI(
181
+ model_name=MODEL_NAME,
182
+ streaming=True,
183
+ callbacks=callbacks,
184
+ verbose=True,
185
+ temperature=0,
186
+ )
187
  )
188
  elif self.llm_model_type.startswith("gpt4all"):
189
  MODEL_PATH = ensure_model_is_downloaded(self.llm_model_type)
 
223
  )
224
  elif self.llm_model_type == "hftgi":
225
  HFTGI_SERVER_URL = os.environ.get("HFTGI_SERVER_URL")
226
+ HFTGI_RP = os.environ.get("HFTGI_RP")
227
+ repetition_penalty = 1.120 if HFTGI_RP is None else float(HFTGI_RP)
228
+ print(f" repetition_penalty: {repetition_penalty}")
229
  self.max_tokens_limit = 4096
230
  self.llm = HuggingFaceTextGenInference(
231
  inference_server_url=HFTGI_SERVER_URL,
 
234
  top_p=0.95,
235
  # typical_p=0.95,
236
  temperature=0.01,
237
+ repetition_penalty=repetition_penalty,
238
  callbacks=callbacks,
239
  timeout=600,
240
  streaming=True,
241
  )
242
+ elif self.llm_model_type == "ollama":
243
+ MODEL_NAME = os.environ.get("OLLAMA_MODEL_NAME") or "dolphin-phi"
244
+ print(f" loading model: {MODEL_NAME}")
245
+ self.llm = ChatOllama(
246
+ model=MODEL_NAME,
247
+ callbacks=callbacks,
248
+ temperature=0,
249
+ repeat_penalty=1.15,
250
+ )
251
  elif self.llm_model_type.startswith("huggingface"):
252
  MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
253
  print(f" loading model: {MODEL_NAME_OR_PATH}")
 
269
 
270
  if "Llama-2" in MODEL_NAME_OR_PATH:
271
  self.max_tokens_limit = 4096
272
+ elif "TinyLlama" in MODEL_NAME_OR_PATH:
273
+ self.max_tokens_limit = 1024
274
+
275
+ class StopOnTokens(StoppingCriteria):
276
+ def __call__(
277
+ self,
278
+ input_ids: torch.LongTensor,
279
+ scores: torch.FloatTensor,
280
+ **kwargs,
281
+ ) -> bool:
282
+ stop_ids = [
283
+ 2
284
+ ] # IDs of tokens where the generation should stop.
285
+ for stop_id in stop_ids:
286
+ if (
287
+ input_ids[0][-1] == stop_id
288
+ ): # Checking if the last generated token is a stop token.
289
+ return True
290
+ return False
291
+
292
+ stopping_criteria = StoppingCriteriaList([StopOnTokens()])
293
 
294
  is_t5 = "t5" in MODEL_NAME_OR_PATH
295
  temperature = (
 
297
  if "gpt4all-j" in MODEL_NAME_OR_PATH
298
  or "dolly" in MODEL_NAME_OR_PATH
299
  or "Qwen" in MODEL_NAME_OR_PATH
300
+ or "Llama" in MODEL_NAME_OR_PATH
301
+ or "Orca-2" in MODEL_NAME_OR_PATH
302
+ or "phi-2" in MODEL_NAME_OR_PATH
303
  else 0
304
  )
305
  use_fast = (
 
363
  else (1.25 if "dolly" in MODEL_NAME_OR_PATH else 1.1)
364
  )
365
 
366
+ HF_RP = os.environ.get("HF_RP")
367
+ if HF_RP is not None and len(HF_RP) > 0:
368
+ repetition_penalty = float(HF_RP)
369
+ print(f" repetition_penalty: {repetition_penalty}")
370
+
371
  if load_quantized_model is not None:
372
  model = (
373
  AutoModelForSeq2SeqLM.from_pretrained(
 
396
  pad_token_id = eos_token_id
397
 
398
  pipe = (
399
+ pipeline(
400
+ task,
401
  model=model,
402
  tokenizer=tokenizer,
403
+ eos_token_id=eos_token_id,
404
+ pad_token_id=pad_token_id,
405
  streamer=self.streamer,
 
 
406
  return_full_text=return_full_text, # langchain expects the full text
407
+ device_map="auto",
408
+ trust_remote_code=True,
409
+ max_new_tokens=2048,
410
+ do_sample=True,
411
+ temperature=0.01,
412
+ top_p=0.95,
413
+ top_k=50,
414
  repetition_penalty=repetition_penalty,
415
  )
416
+ if eos_token_id != -1
417
+ else pipeline(
418
+ task,
419
+ model=model,
420
+ tokenizer=tokenizer,
421
+ streamer=self.streamer,
422
+ return_full_text=return_full_text, # langchain expects the full text
423
+ device_map="auto",
424
+ trust_remote_code=True,
425
+ max_new_tokens=2048,
426
+ do_sample=True,
427
+ temperature=temperature,
428
+ top_p=0.95,
429
+ top_k=0, # select from top 0 tokens (because zero, relies on top_p)
430
+ repetition_penalty=repetition_penalty,
431
  )
432
  )
433
  else:
434
  if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
435
  model = (
 
479
  torch_dtype=torch_dtype,
480
  max_new_tokens=2048,
481
  trust_remote_code=True,
482
+ do_sample=True,
483
  temperature=temperature,
484
  top_p=0.95,
485
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
486
+ repetition_penalty=repetition_penalty,
487
  )
488
  if token is None
489
  else pipeline(
 
499
  temperature=temperature,
500
  top_p=0.95,
501
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
502
+ repetition_penalty=repetition_penalty,
503
  token=token,
504
  )
505
  )
506
 
507
+ pipe.model.config.pad_token_id = pipe.model.config.eos_token_id
508
  self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
509
  elif self.llm_model_type == "mosaicml":
510
  MODEL_NAME_OR_PATH = os.environ.get("MOSAICML_MODEL_NAME_OR_PATH")
 
559
 
560
  max_new_tokens = 8192 if "30b" in MODEL_NAME_OR_PATH else 2048
561
  self.max_tokens_limit = max_new_tokens
 
 
 
562
  repetition_penalty = 1.05 if "30b" in MODEL_NAME_OR_PATH else 1.02
563
 
564
+ ML_RP = os.environ.get("ML_RP")
565
+ if ML_RP is not None and len(ML_RP) > 0:
566
+ repetition_penalty = float(ML_RP)
567
+ print(f" repetition_penalty: {repetition_penalty}")
568
+
569
  pipe = (
570
  pipeline(
571
  model=model,
 
576
  device_map="auto",
577
  # we pass model parameters here too
578
  stopping_criteria=stopping_criteria, # without this model will ramble
579
+ do_sample=True,
580
+ temperature=0.01, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
581
  top_p=0.95, # select from top tokens whose probability add up to 15%
582
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
583
  max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
 
593
  device=config.init_device,
594
  # we pass model parameters here too
595
  stopping_criteria=stopping_criteria, # without this model will ramble
596
+ do_sample=True,
597
+ temperature=0.01, # 'randomness' of outputs, 0.0 is the min and 1.0 the max
598
  top_p=0.95, # select from top tokens whose probability add up to 15%
599
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
600
  max_new_tokens=max_new_tokens, # mex number of tokens to generate in the output
 
613
  # config.max_seq_len = 4096
614
  config.init_device = hf_pipeline_device_type
615
 
616
+ SL_RP = os.environ.get("SL_RP")
617
+ if SL_RP is not None and len(SL_RP) > 0:
618
+ repetition_penalty = float(SL_RP)
619
+ else:
620
+ repetition_penalty = 1.05
621
+ print(f" repetition_penalty: {repetition_penalty}")
622
+
623
  model = (
624
  AutoModelForCausalLM.from_pretrained(
625
  MODEL_NAME_OR_PATH,
 
671
  top_p=0.95, # select from top tokens whose probability add up to 15%
672
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
673
  max_new_tokens=2048, # mex number of tokens to generate in the output
674
+ repetition_penalty=repetition_penalty, # without this output begins repeating
675
  )
676
  if load_quantized_model is not None
677
  else pipeline(
 
687
  top_p=0.95, # select from top tokens whose probability add up to 15%
688
  top_k=0, # select from top 0 tokens (because zero, relies on top_p)
689
  max_new_tokens=2048, # mex number of tokens to generate in the output
690
+ repetition_penalty=repetition_penalty, # without this output begins repeating
691
  )
692
  )
693
  self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
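The new check_end_token logic in the streamer suppresses the Orca-2 "<|im_end|>" stop marker even when it arrives split across several streamed tokens. A standalone sketch of the same buffering idea, independent of the TextIteratorStreamer class:

# Standalone sketch of incremental "<|im_end|>" filtering, as done by check_end_token().
END = "<|im_end|>"


class EndTokenFilter:
    def __init__(self):
        self.buffer = ""

    def feed(self, token):
        candidate = self.buffer + token
        if END.startswith(candidate):
            # Could still be the start of the end marker: hold it back.
            self.buffer = "" if candidate == END else candidate
            return None
        self.buffer = ""
        return candidate


f = EndTokenFilter()
for tok in ["Hello", " world", "<|im_", "end|>"]:
    out = f.feed(tok)
    if out:
        print(out, end="")  # prints "Hello world"; the split end marker is suppressed
print()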
app_modules/llm_qa_chain.py CHANGED
@@ -1,13 +1,10 @@
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chains.base import Chain
-from langchain.vectorstores.base import VectorStore
 
 from app_modules.llm_inference import LLMInference
 
 
 class QAChain(LLMInference):
-    vectorstore: VectorStore
-
     def __init__(self, vectorstore, llm_loader):
         super().__init__(llm_loader)
         self.vectorstore = vectorstore
app_modules/llm_qa_chain_with_memory.py ADDED
@@ -0,0 +1,32 @@
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chains.base import Chain
+
+from app_modules.llm_inference import LLMInference
+from app_modules.utils import CustomizedConversationSummaryBufferMemory
+
+
+class QAChain(LLMInference):
+    def __init__(self, vectorstore, llm_loader):
+        super().__init__(llm_loader)
+        self.vectorstore = vectorstore
+
+    def create_chain(self) -> Chain:
+        memory = CustomizedConversationSummaryBufferMemory(
+            llm=self.llm_loader.llm,
+            output_key="answer",
+            memory_key="chat_history",
+            max_token_limit=1024,
+            return_messages=True,
+        )
+        qa = ConversationalRetrievalChain.from_llm(
+            self.llm_loader.llm,
+            memory=memory,
+            chain_type="stuff",
+            retriever=self.vectorstore.as_retriever(
+                search_kwargs=self.llm_loader.search_kwargs
+            ),
+            get_chat_history=lambda h: h,
+            return_source_documents=True,
+        )
+
+        return qa
app_modules/utils.py CHANGED
@@ -10,6 +10,7 @@ from pathlib import Path
10
  import requests
11
  import torch
12
  from tqdm import tqdm
 
13
 
14
 
15
  class LogRecord(logging.LogRecord):
@@ -69,21 +70,31 @@ def print_llm_response(llm_response):
69
  llm_response["source_documents"] if "source_documents" in llm_response else None
70
  )
71
  if source_documents is None:
72
- source_documents = llm_response["sourceDocs"]
73
-
74
- print("\nSources:")
75
- for source in source_documents:
76
- metadata = source["metadata"] if "metadata" in source else source.metadata
77
- print(
78
- " Page: "
79
- + str(metadata["page"])
80
- + " Source: "
81
- + str(metadata["url"] if "url" in metadata else metadata["source"])
82
- )
83
- print(
84
- source["page_content"] if "page_content" in source else source.page_content
85
  )
86
 
 
87
 
88
  def get_device_types():
89
  print("Running on: ", platform.platform())
@@ -159,6 +170,21 @@ def ensure_model_is_downloaded(llm_model_type):
159
  return local_path
160
 
161
162
  if __name__ == "__main__":
163
  hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
164
  print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
 
10
  import requests
11
  import torch
12
  from tqdm import tqdm
13
+ from langchain.memory import ConversationSummaryBufferMemory
14
 
15
 
16
  class LogRecord(logging.LogRecord):
 
70
  llm_response["source_documents"] if "source_documents" in llm_response else None
71
  )
72
  if source_documents is None:
73
+ source_documents = (
74
+ llm_response["sourceDocs"] if "sourceDocs" in llm_response else None
75
  )
76
 
77
+ if source_documents is not None:
78
+ print("\nSources:")
79
+ for source in source_documents:
80
+ metadata = source["metadata"] if "metadata" in source else source.metadata
81
+ if "page" in metadata:
82
+ print(f" Page: {metadata['page']}", end="")
83
+
84
+ print(
85
+ " Source: "
86
+ + str(metadata["url"] if "url" in metadata else metadata["source"])
87
+ )
88
+ print(
89
+ source["page_content"]
90
+ if "page_content" in source
91
+ else source.page_content
92
+ )
93
+
94
+ if "chat_history" in llm_response:
95
+ print("\nChat History:")
96
+ print(llm_response["chat_history"])
97
+
98
 
99
  def get_device_types():
100
  print("Running on: ", platform.platform())
 
170
  return local_path
171
 
172
 
173
+ class CustomizedConversationSummaryBufferMemory(ConversationSummaryBufferMemory):
174
+ def save_context(self, inputs, outputs) -> None:
175
+ for key in outputs:
176
+ if isinstance(outputs[key], str):
177
+ outputs[key] = outputs[key].replace("<|im_end|>", "")
178
+ return super().save_context(inputs, outputs)
179
+
180
+ def predict_new_summary(self, messages, existing_summary) -> str:
181
+ return (
182
+ super()
183
+ .predict_new_summary(messages, existing_summary)
184
+ .replace("<|im_end|>", "")
185
+ )
186
+
187
+
188
  if __name__ == "__main__":
189
  hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
190
  print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
requirements-mac.txt ADDED
@@ -0,0 +1,127 @@
1
+ accelerate==0.26.1
2
+ aiofiles==23.2.1
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ altair==5.2.0
6
+ annotated-types==0.6.0
7
+ anyio==4.2.0
8
+ attrs==23.2.0
9
+ black==24.1.0
10
+ certifi==2023.11.17
11
+ charset-normalizer==3.3.2
12
+ click==8.1.7
13
+ colorama==0.4.6
14
+ contourpy==1.2.0
15
+ cycler==0.12.1
16
+ dataclasses-json==0.6.3
17
+ faiss-cpu==1.7.4
18
+ fastapi==0.109.0
19
+ ffmpy==0.3.1
20
+ filelock==3.13.1
21
+ fonttools==4.47.2
22
+ frozenlist==1.4.1
23
+ fsspec==2023.12.2
24
+ gradio==4.16.0
25
+ gradio_client==0.8.1
26
+ greenlet==3.0.3
27
+ h11==0.14.0
28
+ httpcore==1.0.2
29
+ httpx==0.26.0
30
+ huggingface-hub==0.20.3
31
+ idna==3.6
32
+ importlib-resources==6.1.1
33
+ InstructorEmbedding==1.0.1
34
+ isort==5.13.2
35
+ Jinja2==3.1.3
36
+ joblib==1.3.2
37
+ jsonpatch==1.33
38
+ jsonpointer==2.4
39
+ jsonschema==4.21.1
40
+ jsonschema-specifications==2023.12.1
41
+ kiwisolver==1.4.5
42
+ langchain==0.1.4
43
+ langchain-community==0.0.16
44
+ langchain-core==0.1.16
45
+ langsmith==0.0.83
46
+ markdown-it-py==3.0.0
47
+ MarkupSafe==2.1.4
48
+ marshmallow==3.20.2
49
+ matplotlib==3.8.2
50
+ mdurl==0.1.2
51
+ mpmath==1.3.0
52
+ multidict==6.0.4
53
+ mypy-extensions==1.0.0
54
+ networkx==3.2.1
55
+ nltk==3.8.1
56
+ numpy==1.26.3
57
+ # nvidia-cublas-cu12==12.1.3.1
58
+ # nvidia-cuda-cupti-cu12==12.1.105
59
+ # nvidia-cuda-nvrtc-cu12==12.1.105
60
+ # nvidia-cuda-runtime-cu12==12.1.105
61
+ # nvidia-cudnn-cu12==8.9.2.26
62
+ # nvidia-cufft-cu12==11.0.2.54
63
+ # nvidia-curand-cu12==10.3.2.106
64
+ # nvidia-cusolver-cu12==11.4.5.107
65
+ # nvidia-cusparse-cu12==12.1.0.106
66
+ # nvidia-nccl-cu12==2.18.1
67
+ # nvidia-nvjitlink-cu12==12.3.101
68
+ # nvidia-nvtx-cu12==12.1.105
69
+ orjson==3.9.12
70
+ packaging==23.2
71
+ pandas==2.2.0
72
+ pathspec==0.12.1
73
+ peft @ git+https://github.com/huggingface/peft.git@1c1c7fdaa6e6abaa53939b865dee1eded82ad032
74
+ pillow==10.2.0
75
+ platformdirs==4.1.0
76
+ protobuf==4.25.2
77
+ psutil==5.9.8
78
+ pydantic==2.5.3
79
+ pydantic_core==2.14.6
80
+ pydub==0.25.1
81
+ Pygments==2.17.2
82
+ pyparsing==3.1.1
83
+ python-dateutil==2.8.2
84
+ python-dotenv==1.0.1
85
+ python-multipart==0.0.6
86
+ pytz==2023.3.post1
87
+ PyYAML==6.0.1
88
+ referencing==0.32.1
89
+ regex==2023.12.25
90
+ requests==2.31.0
91
+ rich==13.7.0
92
+ rpds-py==0.17.1
93
+ ruff==0.1.14
94
+ safetensors==0.4.2
95
+ scikit-learn==1.4.0
96
+ scipy==1.12.0
97
+ semantic-version==2.10.0
98
+ sentence-transformers==2.2.2
99
+ sentencepiece==0.1.99
100
+ shellingham==1.5.4
101
+ six==1.16.0
102
+ sniffio==1.3.0
103
+ SQLAlchemy==2.0.25
104
+ starlette==0.35.1
105
+ sympy==1.12
106
+ tenacity==8.2.3
107
+ threadpoolctl==3.2.0
108
+ tokenizers==0.15.1
109
+ tomlkit==0.12.0
110
+ toolz==0.12.1
111
+ torch==2.1.2
112
+ torchvision==0.16.2
113
+ tqdm==4.66.1
114
+ transformers @ git+https://github.com/huggingface/transformers.git@de13a951b38b85195984164819f1ab05fe508677
115
+ # triton==2.1.0
116
+ typer==0.9.0
117
+ typing-inspect==0.9.0
118
+ typing_extensions==4.9.0
119
+ tzdata==2023.4
120
+ urllib3==2.1.0
121
+ uvicorn==0.27.0
122
+ websockets==11.0.3
123
+ yarl==1.9.4
124
+ einops==0.7.0
125
+ Pyarrow==15.0.0
126
+ openpyxl==3.1.2
127
+ tabulate==0.9.0
requirements.txt CHANGED
@@ -1,38 +1,129 @@
1
- gradio
2
- mdtex2html
3
- pypinyin
4
- tiktoken
5
- socksio
6
- tqdm
7
- colorama
8
- accelerate
9
- langchain
10
- torch
11
- langchain-serve
12
- protobuf
13
- faiss-cpu
14
- sentence_transformers
15
- InstructorEmbedding
16
- python-dotenv
17
- openai
18
- gpt4all
19
- pyllama
20
- git+https://github.com/huggingface/peft.git
21
- git+https://github.com/huggingface/transformers.git
22
- SentencePiece
23
- isort
24
- black
25
- pygpt4all
26
- tiktoken
27
- safetensors
28
- xformers
29
- bitsandbytes
30
- einops
31
- gevent
32
- pydantic >= 1.10.11
33
- pypdf
34
- python-telegram-bot
35
- transformers_stream_generator
36
- openllm
37
- openllm[llama]
38
- text_generation
1
+ accelerate==0.26.1
2
+ aiofiles==23.2.1
3
+ aiohttp==3.9.1
4
+ aiosignal==1.3.1
5
+ altair==5.2.0
6
+ annotated-types==0.6.0
7
+ anyio==4.2.0
8
+ attrs==23.2.0
9
+ black==24.1.0
10
+ certifi==2023.11.17
11
+ charset-normalizer==3.3.2
12
+ click==8.1.7
13
+ colorama==0.4.6
14
+ contourpy==1.2.0
15
+ cycler==0.12.1
16
+ dataclasses-json==0.6.3
17
+ faiss-cpu==1.7.4
18
+ fastapi==0.109.0
19
+ ffmpy==0.3.1
20
+ filelock==3.13.1
21
+ fonttools==4.47.2
22
+ frozenlist==1.4.1
23
+ fsspec==2023.12.2
24
+ gradio==4.16.0
25
+ gradio_client==0.8.1
26
+ greenlet==3.0.3
27
+ h11==0.14.0
28
+ httpcore==1.0.2
29
+ httpx==0.26.0
30
+ huggingface-hub==0.20.3
31
+ idna==3.6
32
+ importlib-resources==6.1.1
33
+ InstructorEmbedding==1.0.1
34
+ isort==5.13.2
35
+ Jinja2==3.1.3
36
+ joblib==1.3.2
37
+ jsonpatch==1.33
38
+ jsonpointer==2.4
39
+ jsonschema==4.21.1
40
+ jsonschema-specifications==2023.12.1
41
+ kiwisolver==1.4.5
42
+ langchain==0.1.4
43
+ langchain-community==0.0.16
44
+ langchain-openai==0.0.5
45
+ langchain-core==0.1.16
46
+ langsmith==0.0.83
47
+ markdown-it-py==3.0.0
48
+ MarkupSafe==2.1.4
49
+ marshmallow==3.20.2
50
+ matplotlib==3.8.2
51
+ mdurl==0.1.2
52
+ mpmath==1.3.0
53
+ multidict==6.0.4
54
+ mypy-extensions==1.0.0
55
+ networkx==3.2.1
56
+ nltk==3.8.1
57
+ numpy==1.26.3
58
+ nvidia-cublas-cu12==12.1.3.1
59
+ nvidia-cuda-cupti-cu12==12.1.105
60
+ nvidia-cuda-nvrtc-cu12==12.1.105
61
+ nvidia-cuda-runtime-cu12==12.1.105
62
+ nvidia-cudnn-cu12==8.9.2.26
63
+ nvidia-cufft-cu12==11.0.2.54
64
+ nvidia-curand-cu12==10.3.2.106
65
+ nvidia-cusolver-cu12==11.4.5.107
66
+ nvidia-cusparse-cu12==12.1.0.106
67
+ nvidia-nccl-cu12==2.18.1
68
+ nvidia-nvjitlink-cu12==12.3.101
69
+ nvidia-nvtx-cu12==12.1.105
70
+ orjson==3.9.12
71
+ packaging==23.2
72
+ pandas==2.2.0
73
+ pathspec==0.12.1
74
+ peft @ git+https://github.com/huggingface/peft.git@1c1c7fdaa6e6abaa53939b865dee1eded82ad032
75
+ pillow==10.2.0
76
+ platformdirs==4.1.0
77
+ protobuf==4.25.2
78
+ psutil==5.9.8
79
+ pydantic==2.5.3
80
+ pydantic_core==2.14.6
81
+ pydub==0.25.1
82
+ Pygments==2.17.2
83
+ pyparsing==3.1.1
84
+ python-dateutil==2.8.2
85
+ python-dotenv==1.0.1
86
+ python-multipart==0.0.6
87
+ pytz==2023.3.post1
88
+ PyYAML==6.0.1
89
+ referencing==0.32.1
90
+ regex==2023.12.25
91
+ requests==2.31.0
92
+ rich==13.7.0
93
+ rpds-py==0.17.1
94
+ ruff==0.1.14
95
+ safetensors==0.4.2
96
+ scikit-learn==1.4.0
97
+ scipy==1.12.0
98
+ semantic-version==2.10.0
99
+ sentence-transformers==2.2.2
100
+ sentencepiece==0.1.99
101
+ shellingham==1.5.4
102
+ six==1.16.0
103
+ sniffio==1.3.0
104
+ SQLAlchemy==2.0.25
105
+ starlette==0.35.1
106
+ sympy==1.12
107
+ tenacity==8.2.3
108
+ threadpoolctl==3.2.0
109
+ tokenizers==0.15.1
110
+ tomlkit==0.12.0
111
+ toolz==0.12.1
112
+ torch==2.1.2
113
+ torchvision==0.16.2
114
+ tqdm==4.66.1
115
+ transformers @ git+https://github.com/huggingface/transformers.git@de13a951b38b85195984164819f1ab05fe508677
116
+ triton==2.1.0
117
+ typer==0.9.0
118
+ typing-inspect==0.9.0
119
+ typing_extensions==4.9.0
120
+ tzdata==2023.4
121
+ urllib3==2.1.0
122
+ uvicorn==0.27.0
123
+ websockets==11.0.3
124
+ yarl==1.9.4
125
+ einops==0.7.0
126
+ Pyarrow==15.0.0
127
+ openpyxl==3.1.2
128
+ text_generation==0.6.1
129
+ tabulate==0.9.0