dh-mc committed
Commit 3ca5bd8
1 Parent(s): 1e4d37b

added telegram bot

Makefile CHANGED
@@ -15,6 +15,9 @@ test:
 chat:
 	python test.py chat
 
+tele:
+	python telegram_bot.py
+
 ingest:
 	python ingest.py
 
app_modules/llm_inference.py CHANGED
@@ -38,71 +38,64 @@ class LLMInference(metaclass=abc.ABCMeta):
         self, inputs, streaming_handler, q: Queue = None, tracing: bool = False
     ):
         print(inputs)
+        self.llm_loader.lock.acquire()
 
-        if self.llm_loader.streamer is not None and isinstance(
-            self.llm_loader.streamer, TextIteratorStreamer
-        ):
+        try:
             self.llm_loader.streamer.reset(q)
 
-        chain = self.get_chain(tracing)
-        result = (
-            self._run_chain(
-                chain,
-                inputs,
-                streaming_handler,
-            )
-            if streaming_handler is not None
-            else chain(inputs)
-        )
+            chain = self.get_chain(tracing)
+            result = (
+                self._run_chain(
+                    chain,
+                    inputs,
+                    streaming_handler,
+                )
+                if streaming_handler is not None
+                and self.llm_loader.streamer.for_huggingface
+                else chain(inputs)
+            )
 
-        if "answer" in result:
-            result["answer"] = remove_extra_spaces(result["answer"])
+            if "answer" in result:
+                result["answer"] = remove_extra_spaces(result["answer"])
 
-        base_url = os.environ.get("PDF_FILE_BASE_URL")
-        if base_url is not None and len(base_url) > 0:
-            documents = result["source_documents"]
-            for doc in documents:
-                source = doc.metadata["source"]
-                title = source.split("/")[-1]
-                doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
+            base_url = os.environ.get("PDF_FILE_BASE_URL")
+            if base_url is not None and len(base_url) > 0:
+                documents = result["source_documents"]
+                for doc in documents:
+                    source = doc.metadata["source"]
+                    title = source.split("/")[-1]
+                    doc.metadata["url"] = f"{base_url}{urllib.parse.quote(title)}"
 
-        return result
+            return result
+        finally:
+            self.llm_loader.lock.release()
 
     def _execute_chain(self, chain, inputs, q, sh):
         q.put(chain(inputs, callbacks=[sh]))
 
     def _run_chain(self, chain, inputs, streaming_handler):
-        self.llm_loader.lock.acquire()
-        try:
-            que = Queue()
+        que = Queue()
 
-            t = Thread(
-                target=self._execute_chain,
-                args=(chain, inputs, que, streaming_handler),
-            )
-            t.start()
+        t = Thread(
+            target=self._execute_chain,
+            args=(chain, inputs, que, streaming_handler),
+        )
+        t.start()
 
-            if self.llm_loader.streamer is not None and isinstance(
-                self.llm_loader.streamer, TextIteratorStreamer
-            ):
-                count = (
-                    2
-                    if "chat_history" in inputs and len(inputs.get("chat_history")) > 0
-                    else 1
-                )
+        count = (
+            2 if "chat_history" in inputs and len(inputs.get("chat_history")) > 0 else 1
+        )
 
-                while count > 0:
-                    try:
-                        for token in self.llm_loader.streamer:
-                            streaming_handler.on_llm_new_token(token)
+        while count > 0:
+            try:
+                for token in self.llm_loader.streamer:
+                    streaming_handler.on_llm_new_token(token)
 
-                        self.llm_loader.streamer.reset()
-                        count -= 1
-                    except Exception:
-                        print("nothing generated yet - retry in 0.5s")
-                        time.sleep(0.5)
+                self.llm_loader.streamer.reset()
+                count -= 1
+            except Exception:
+                print("nothing generated yet - retry in 0.5s")
+                time.sleep(0.5)
 
-            t.join()
-            return que.get()
-        finally:
-            self.llm_loader.lock.release()
+        t.join()
+        return que.get()
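
For context (an editor's sketch, not part of the commit): the reworked method at the top of this hunk only routes through `_run_chain()` when a `streaming_handler` is supplied and the loader's streamer was initialized for a HuggingFace pipeline; `_run_chain()` then runs the chain on a worker thread and drains `llm_loader.streamer` on the calling thread. Any object exposing `on_llm_new_token()` can play the handler role, for example:

```python
# Minimal sketch of a compatible streaming handler (assumption, not from the
# repository): _run_chain() calls on_llm_new_token() for every token it reads
# off llm_loader.streamer, so printing them gives console streaming.
class PrintingHandler:
    def on_llm_new_token(self, token: str, **kwargs) -> None:
        print(token, end="", flush=True)
```

The `count` of 2 versus 1 in `_run_chain()` presumably mirrors the two LLM calls a conversational chain makes when chat history is present (question condensation plus answering), so the streamer is drained once per generation.
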
app_modules/llm_loader.py CHANGED
@@ -33,18 +33,22 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
         tokenizer: "AutoTokenizer",
         skip_prompt: bool = False,
         timeout: Optional[float] = None,
+        for_huggingface: bool = False,
         **decode_kwargs,
     ):
         super().__init__(tokenizer, skip_prompt, **decode_kwargs)
         self.text_queue = Queue()
         self.stop_signal = None
         self.timeout = timeout
+        self.total_tokens = 0
+        self.for_huggingface = for_huggingface
 
     def on_finalized_text(self, text: str, stream_end: bool = False):
         super().on_finalized_text(text, stream_end=stream_end)
 
         """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
         self.text_queue.put(text, timeout=self.timeout)
+        self.total_tokens = self.total_tokens + 1
         if stream_end:
             print("\n")
             self.text_queue.put("\n", timeout=self.timeout)
@@ -54,12 +58,16 @@ class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
         sys.stdout.write(token)
         sys.stdout.flush()
         self.text_queue.put(token, timeout=self.timeout)
+        self.total_tokens = self.total_tokens + 1
 
     def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
         print("\n")
         self.text_queue.put("\n", timeout=self.timeout)
         self.text_queue.put(self.stop_signal, timeout=self.timeout)
 
+    def for_huggingface(self) -> bool:
+        return self.tokenizer != ""
+
     def __iter__(self):
         return self
 
@@ -88,21 +96,18 @@ class LLMLoader:
     def __init__(self, llm_model_type, lc_serve: bool = False):
         self.llm_model_type = llm_model_type
         self.llm = None
-        self.streamer = None if lc_serve else TextIteratorStreamer("")
+        self.streamer = TextIteratorStreamer("")
         self.max_tokens_limit = 2048
         self.search_kwargs = {"k": 4}
         self.lock = threading.Lock()
 
-    def _init_streamer(self, tokenizer, custom_handler):
-        self.streamer = (
-            TextIteratorStreamer(
-                tokenizer,
-                timeout=10.0,
-                skip_prompt=True,
-                skip_special_tokens=True,
-            )
-            if custom_handler is None
-            else TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    def _init_hf_streamer(self, tokenizer):
+        self.streamer = TextIteratorStreamer(
+            tokenizer,
+            timeout=10.0,
+            skip_prompt=True,
+            skip_special_tokens=True,
+            for_huggingface=True,
         )
 
     def init(
@@ -179,7 +184,11 @@ class LLMLoader:
             MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
             print(f" loading model: {MODEL_NAME_OR_PATH}")
 
-            hf_auth_token = os.environ.get("HUGGINGFACE_AUTH_TOKEN")
+            hf_auth_token = (
+                os.environ.get("HUGGINGFACE_AUTH_TOKEN")
+                if "Llama-2" in MODEL_NAME_OR_PATH
+                else None
+            )
             transformers_offline = os.environ.get("TRANSFORMERS_OFFLINE") == "1"
             token = (
                 hf_auth_token
@@ -231,7 +240,7 @@ class LLMLoader:
                 )
             )
 
-            self._init_streamer(tokenizer, custom_handler)
+            self._init_hf_streamer(tokenizer)
 
             task = "text2text-generation" if is_t5 else "text-generation"
 
@@ -343,14 +352,21 @@ class LLMLoader:
                     MODEL_NAME_OR_PATH,
                     config=config,
                     trust_remote_code=True,
-                    token=token,
                 )
                 if is_t5
-                else AutoModelForCausalLM.from_pretrained(
-                    MODEL_NAME_OR_PATH,
-                    config=config,
-                    trust_remote_code=True,
-                    token=token,
+                else (
+                    AutoModelForCausalLM.from_pretrained(
+                        MODEL_NAME_OR_PATH,
+                        config=config,
+                        trust_remote_code=True,
+                    )
+                    if token is None
+                    else AutoModelForCausalLM.from_pretrained(
+                        MODEL_NAME_OR_PATH,
+                        config=config,
+                        trust_remote_code=True,
+                        token=token,
+                    )
                 )
             )
             print(f"Model memory footprint: {model.get_memory_footprint()}")
@@ -405,7 +421,7 @@ class LLMLoader:
             print(f"Model memory footprint: {model.get_memory_footprint()}")
 
             tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
-            self._init_streamer(tokenizer, custom_handler)
+            self._init_hf_streamer(tokenizer)
 
             # mtp-7b is trained to add "<|endoftext|>" at the end of generations
             stop_token_ids = tokenizer.convert_tokens_to_ids(["<|endoftext|>"])
@@ -497,7 +513,7 @@ class LLMLoader:
             print(f"Model memory footprint: {model.get_memory_footprint()}")
 
             tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
-            self._init_streamer(tokenizer, custom_handler)
+            self._init_hf_streamer(tokenizer)
 
             class StopOnTokens(StoppingCriteria):
                 def __call__(
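
A small illustration (editor's sketch, not in the commit) of the new streamer fields. Note that the `for_huggingface` attribute assigned in `__init__` shadows the `for_huggingface()` method added to the same class, so callers such as `LLMInference` see it as a plain boolean.

```python
# Sketch only: LLMLoader now always starts with a placeholder streamer built
# from an empty tokenizer string, and swaps in a real one via _init_hf_streamer()
# once a HuggingFace model is loaded.
from app_modules.llm_loader import LLMLoader

loader = LLMLoader("huggingface")
print(loader.streamer.for_huggingface)  # False until _init_hf_streamer() runs

# After loader.init(...) has loaded a HuggingFace pipeline:
#   loader.streamer.for_huggingface -> True
#   loader.streamer.total_tokens    -> running count of tokens pushed to the queue
```
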
requirements.txt CHANGED
@@ -31,3 +31,4 @@ einops
 gevent
 pydantic >= 1.10.11
 pypdf
+python-telegram-bot
server.py CHANGED
@@ -11,7 +11,7 @@ from app_modules.init import app_init
 from app_modules.llm_chat_chain import ChatChain
 from app_modules.utils import print_llm_response
 
-llm_loader, qa_chain = app_init(True)
+llm_loader, qa_chain = app_init(__name__ != "__main__")
 
 chat_history_enabled = os.environ.get("CHAT_HISTORY_ENABLED") == "true"
 
@@ -26,14 +26,13 @@ class ChatResponse(BaseModel):
     sourceDocs: Optional[List] = None
 
 
-@serving(websocket=True)
-def chat(
-    question: str, history: Optional[List] = [], uuid: Optional[str] = None, **kwargs
-) -> str:
-    print(f"uuid: {uuid}")
-    # Get the `streaming_handler` from `kwargs`. This is used to stream data to the client.
-    streaming_handler = kwargs.get("streaming_handler")
-    if uuid is None:
+def do_chat(
+    question: str,
+    history: Optional[List] = [],
+    chat_id: Optional[str] = None,
+    streaming_handler: any = None,
+):
+    if chat_id is None:
         chat_history = []
         if chat_history_enabled:
             for element in history:
@@ -48,21 +47,49 @@ def chat(
         print(f"Completed in {end - start:.3f}s")
 
         print(f"qa_chain result: {result}")
-        resp = ChatResponse(sourceDocs=result["source_documents"])
-
-        return json.dumps(resp.dict())
+        return result
     else:
-        if uuid in uuid_to_chat_chain_mapping:
-            chat = uuid_to_chat_chain_mapping[uuid]
+        if chat_id in uuid_to_chat_chain_mapping:
+            chat = uuid_to_chat_chain_mapping[chat_id]
         else:
             chat = ChatChain(llm_loader)
-            uuid_to_chat_chain_mapping[uuid] = chat
+            uuid_to_chat_chain_mapping[chat_id] = chat
         result = chat.call_chain({"question": question}, streaming_handler)
         print(f"chat result: {result}")
+        return result
+
 
-        resp = ChatResponse(sourceDocs=[])
-        return json.dumps(resp.dict())
+@serving(websocket=True)
+def chat(
+    question: str, history: Optional[List] = [], chat_id: Optional[str] = None, **kwargs
+) -> str:
+    print("question@chat:", question)
+    streaming_handler = kwargs.get("streaming_handler")
+    result = do_chat(question, history, chat_id, streaming_handler)
+    resp = ChatResponse(
+        sourceDocs=result["source_documents"] if chat_id is None else []
+    )
+    return json.dumps(resp.dict())
+
+
+@serving
+def chat_sync(
+    question: str, history: Optional[List] = [], chat_id: Optional[str] = None, **kwargs
+) -> str:
+    print("question@chat_sync:", question)
+    result = do_chat(question, history, chat_id, None)
+    return result["text"]
 
 
 if __name__ == "__main__":
-    print_llm_response(json.loads(chat("What's deep learning?", [])))
+    # print_llm_response(json.loads(chat("What's deep learning?", [])))
+    chat_start = timer()
+    chat_sync("What's generative AI?", chat_id="test_user")
+    chat_sync("more on finance", chat_id="test_user")
+    chat_end = timer()
+    total_time = chat_end - chat_start
+    print(f"Total time used: {total_time:.3f} s")
+    print(f"Number of tokens generated: {llm_loader.streamer.total_tokens}")
+    print(
+        f"Average generation speed: {llm_loader.streamer.total_tokens / total_time:.3f} tokens/s"
+    )
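
For reference (an assumption about deployment, not part of the commit): once `server.py` is served through lc-serve, the new `chat_sync` endpoint can be exercised with a plain HTTP POST. The payload shape and the `result` field mirror what `telegram_bot.py` sends and reads; the host, port, and path depend on where the server actually runs (whatever `CHAT_API_URL` points at).

```python
# Hypothetical chat_sync client; the URL below is a placeholder for a local
# lc-serve deployment and must match the real CHAT_API_URL.
import requests

payload = {"question": "What's generative AI?", "chat_id": "test_user"}
resp = requests.post("http://localhost:8080/chat_sync", json=payload).json()
print(resp["result"])  # telegram_bot.py reads the same "result" field
```
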
telegram_bot.py ADDED
@@ -0,0 +1,93 @@
+import os
+import ssl
+import time
+from threading import Thread
+
+import requests
+from telegram import Update
+from telegram import __version__ as TG_VER
+from telegram.ext import (
+    Application,
+    CommandHandler,
+    ContextTypes,
+    MessageHandler,
+    filters,
+)
+
+from app_modules.init import *
+
+ctx = ssl.create_default_context()
+ctx.set_ciphers("DEFAULT")
+
+try:
+    from telegram import __version_info__
+except ImportError:
+    __version_info__ = (0, 0, 0, 0, 0)  # type: ignore[assignment]
+
+if __version_info__ < (20, 0, 0, "alpha", 1):
+    raise RuntimeError(
+        f"This example is not compatible with your current PTB version {TG_VER}. To view the "
+        f"{TG_VER} version of this example, "
+        f"visit https://docs.python-telegram-bot.org/en/v{TG_VER}/examples.html"
+    )
+
+TOKEN = os.getenv("TELEGRAM_API_TOKEN")
+ENDPOINT = os.getenv("CHAT_API_URL")
+
+
+# Define a few command handlers. These usually take the two arguments update and
+# context.
+async def start_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
+    """Send a message when the command /start is issued."""
+    user = update.effective_user
+    await update.message.reply_html(
+        rf"Hi {user.mention_html()}! You are welcome to ask questions on anything!",
+    )
+
+
+async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
+    """Send a message when the command /help is issued."""
+    await update.message.reply_text("Help!")
+
+
+async def chat_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
+    """Forward the user message to the chat API and reply with its answer."""
+    tic = time.perf_counter()
+    try:
+        message = {
+            "question": update.message.text,
+            "chat_id": update.message.chat.username,
+        }
+        print(message)
+        x = requests.post(ENDPOINT, json=message).json()
+        temp = time.perf_counter()
+        print(f"Received response in {temp - tic:0.4f} seconds")
+        result = x["result"]
+        print(result)
+        await update.message.reply_text(result)
+        toc = time.perf_counter()
+        print(f"Response time in {toc - tic:0.4f} seconds")
+    except Exception as e:
+        print("error", e)
+
+
+def start_telegram_bot() -> None:
+    """Start the bot."""
+    print("starting telegram bot ...")
+    # Create the Application and pass it your bot's token.
+    application = Application.builder().token(TOKEN).build()
+
+    # on different commands - answer in Telegram
+    application.add_handler(CommandHandler("start_command", start_command))
+    application.add_handler(CommandHandler("help", help_command))
+
+    # on non-command text messages - hand the message to chat_command
+    application.add_handler(
+        MessageHandler(filters.TEXT & ~filters.COMMAND, chat_command)
+    )
+
+    application.run_polling()
+
+
+if __name__ == "__main__":
+    start_telegram_bot()
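
Usage note: running the bot requires `TELEGRAM_API_TOKEN` (the token issued by BotFather) and `CHAT_API_URL` (the chat_sync endpoint exposed by server.py) to be set in the environment; it can then be started with the new `make tele` target or `python telegram_bot.py` directly. Also note that the greeting handler is registered under the command name "start_command", so it responds to /start_command rather than the conventional /start.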