Tile committed on
Commit
4f234f9
1 Parent(s): 0c8c7b4

first commit

Files changed (4)
  1. app.py +391 -55
  2. conversation.py +247 -0
  3. requirements.txt +2 -1
  4. utils.py +86 -0
app.py CHANGED
@@ -1,63 +1,399 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
  )


  if __name__ == "__main__":
-     demo.launch()
+ import argparse
+ import sys
+ import os
+ # import cv2
+ import glob
  import gradio as gr
+ import numpy as np
+ import json
+ from PIL import Image
+ from tqdm import tqdm
+ from pathlib import Path
+ import uvicorn
+ from fastapi.staticfiles import StaticFiles
+ import random
+ import time
+ import requests
+
+ from fastapi import FastAPI
+ from conversation import SeparatorStyle, conv_templates, default_conversation
+ from utils import (
+     build_logger,
+     moderation_msg,
+     server_error_msg,
+ )
+ from config import cur_conv
+
+ logger = build_logger("gradio_web_server", "gradio_web_server.log")
+
+ headers = {"Content-Type": "application/json"}
+
+ # Create a FastAPI app
+ app = FastAPI()
+ # # Create a static directory to store the static files
+ # static_dir = Path('/data/Multimodal-RAG/GenerativeAIExamples/ChatQnA/langchain/redis/chips-making-deals/')
+ static_dir = Path('/data/')
+
+ # Mount a FastAPI StaticFiles server
+ app.mount("/static", StaticFiles(directory=static_dir), name="static")
+
+ theme = gr.themes.Base(
+     primary_hue=gr.themes.Color(
+         c100="#dbeafe", c200="#bfdbfe", c300="#93c5fd", c400="#60a5fa", c50="#eff6ff", c500="#0054ae", c600="#00377c", c700="#00377c", c800="#1e40af", c900="#1e3a8a", c950="#0a0c2b"),
+     secondary_hue=gr.themes.Color(
+         c100="#dbeafe", c200="#bfdbfe", c300="#93c5fd", c400="#60a5fa", c50="#eff6ff", c500="#0054ae", c600="#0054ae", c700="#0054ae", c800="#1e40af", c900="#1e3a8a", c950="#1d3660"),
+ ).set(
+     body_background_fill_dark='*primary_950',
+     body_text_color_dark='*neutral_300',
+     border_color_accent='*primary_700',
+     border_color_accent_dark='*neutral_800',
+     block_background_fill_dark='*primary_950',
+     block_border_width='2px',
+     block_border_width_dark='2px',
+     button_primary_background_fill_dark='*primary_500',
+     button_primary_border_color_dark='*primary_500'
  )
+
+ css = '''
+ @font-face {
+     font-family: IntelOne;
+     src: url("file/assets/intelone-bodytext-font-family-regular.ttf");
+ }
+ '''
+
+ ## <td style="border-bottom:0"><img src="file/assets/DCAI_logo.png" height="300" width="300"></td>
+ html_title = '''
+ <table>
+ <tr style="height:150px">
+ <td style="border-bottom:0"><img src="file/assets/intel-labs.png" height="100" width="100"></td>
+ <td style="border-bottom:0; vertical-align:bottom">
+ <p style="font-size:xx-large;font-family:IntelOne, Georgia, sans-serif;color: white;">
+ Cognitive AI:
+ <br>
+ Multimodal RAG on Videos
+ </p>
+ </td>
+ <td style="border-bottom:0;"><img src="file/assets/gaudi.png" width="100" height="100"></td>
+ <td style="border-bottom:0;"><img src="file/assets/xeon.png" width="100" height="100"></td>
+ <td style="border-bottom:0;"><img src="file/assets/IDC7.png" width="400" height="350"></td>
+ </tr>
+ </table>
+
+ '''
+
+ debug = False
+ def print_debug(t):
+     if debug:
+         print(t)
+
+ # https://stackoverflow.com/a/57781047
+ # Resize an image while maintaining its aspect ratio
+ # def maintain_aspect_ratio_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
+ #     # Grab the image size and initialize dimensions
+ #     dim = None
+ #     (h, w) = image.shape[:2]
+
+ #     # Return the original image if there is no need to resize
+ #     if width is None and height is None:
+ #         return image
+
+ #     # We are resizing height if width is None
+ #     if width is None:
+ #         # Calculate the ratio of the height and construct the dimensions
+ #         r = height / float(h)
+ #         dim = (int(w * r), height)
+ #     # We are resizing width if height is None
+ #     else:
+ #         # Calculate the ratio of the width and construct the dimensions
+ #         r = width / float(w)
+ #         dim = (width, int(h * r))
+
+ #     # Return the resized image
+ #     return cv2.resize(image, dim, interpolation=inter)
+
+ def time_to_frame(time, fps):
+     '''
+     Convert a time in seconds into a frame number.
+     '''
+     return int(time * fps - 1)
+
+ def str2time(strtime):
+     strtime = strtime.strip('"')
+     hrs, mins, seconds = [float(c) for c in strtime.split(':')]
+
+     total_seconds = hrs * 60**2 + mins * 60 + seconds
+
+     return total_seconds
+
+ def get_iframe(video_path: str, start: int = -1, end: int = -1):
+     return f"""<video controls="controls" preload="metadata" src="{video_path}" width="540" height="310"></video>"""
+
+ # TODO
+ # def place(galleries, evt: gr.SelectData):
+ #     print(evt.value)
+ #     start_time = evt.value.split('||')[0].strip()
+ #     print(start_time)
+ #     # sub_video_id = evt.value.split('|')[-1]
+ #     if start_time in start_time_index_map.keys():
+ #         sub_video_id = start_time_index_map[start_time]
+ #     else:
+ #         sub_video_id = 0
+ #     path_to_sub_video = f"/static/video_embeddings/mp4.keynotes23/sub-videos/keynotes23_split{sub_video_id}.mp4"
+ #     # return evt.value
+ #     return get_iframe(path_to_sub_video)
+
+ # def process(text_query):
+ #     tmp_dir = os.environ.get('VID_CACHE_DIR', os.environ.get('TMPDIR', './video_embeddings'))
+ #     frames, transcripts = run_query(text_query, path=tmp_dir)
+ #     # return video_file_path, [(image, caption) for image, caption in zip(frame_paths, transcripts)]
+ #     return [(frame, caption) for frame, caption in zip(frames, transcripts)], ""
+
+ description = "This Space lets you engage with multimodal RAG on a video through a chat box."
+
+ no_change_btn = gr.Button.update()
+ enable_btn = gr.Button.update(interactive=True)
+ disable_btn = gr.Button.update(interactive=False)
+
+ # textbox = gr.Textbox(
+ #     show_label=False, placeholder="Enter text and press ENTER", container=False
+ # )
+
+
+ def clear_history(request: gr.Request):
+     logger.info(f"clear_history. ip: {request.client.host}")
+     state = cur_conv.copy()
+     return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 1
+
+ def add_text(state, text, request: gr.Request):
+     logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
+     if len(text) <= 0:
+         state.skip_next = True
+         return (state, state.to_gradio_chatbot(), "", None) + (no_change_btn,) * 1
+
+     text = text[:1536]  # Hard cut-off
+
+     state.append_message(state.roles[0], text)
+     state.append_message(state.roles[1], None)
+     state.skip_next = False
+     return (state, state.to_gradio_chatbot(), "") + (disable_btn,) * 1
+
+ def http_bot(
+     state, request: gr.Request
+ ):
+     logger.info(f"http_bot. ip: {request.client.host}")
+     start_tstamp = time.time()
+
+     if state.skip_next:
+         # This generate call is skipped due to invalid inputs
+         path_to_sub_videos = state.get_path_to_subvideos()
+         yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (no_change_btn,) * 1
+         return
+
+     if len(state.messages) == state.offset + 2:
+         # First round of conversation
+         new_state = cur_conv.copy()
+         new_state.append_message(new_state.roles[0], state.messages[-2][1])
+         new_state.append_message(new_state.roles[1], None)
+         state = new_state
+
+     # Construct prompt
+     prompt = state.get_prompt()
+
+     all_images = state.get_images(return_pil=False)
+
+     # Make requests
+     is_very_first_query = True
+     if len(all_images) == 0:
+         # The first query needs to do RAG
+         pload = {
+             "query": prompt,
+         }
+     else:
+         # Subsequent queries need no retrieval
+         is_very_first_query = False
+         pload = {
+             "prompt": prompt,
+             "path-to-image": all_images[0],
+         }
+     if is_very_first_query:
+         url = worker_addr + "/v1/rag/chat"
+     else:
+         url = worker_addr + "/v1/rag/multi_turn_chat"
+     logger.info(f"==== request ====\n{pload}")
+     logger.info(f"==== url request ====\n{url}")
+     # Uncomment this for testing the UI only
+     # state.messages[-1][-1] = f"response {len(state.messages)}"
+     # yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 1
+     # return
+
+     state.messages[-1][-1] = "▌"
+     path_to_sub_videos = state.get_path_to_subvideos()
+     yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (disable_btn,) * 1
+
+     try:
+         # Stream output
+         response = requests.post(url, headers=headers, json=pload, timeout=100, stream=True)
+         for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
+             if chunk:
+                 res = json.loads(chunk.decode())
+                 ## Old method
+                 # if response.status_code == 200:
+                 #     cur_json = ""
+                 #     for chunk in response:
+                 #         # print('chunk is ---> ', chunk.decode('utf-8'))
+                 #         cur_json += chunk.decode('utf-8')
+                 #         try:
+                 #             res = json.loads(cur_json)
+                 #         except:
+                 #             # A whole JSON object is not contained in this chunk; concatenate with the next chunk
+                 #             continue
+                 #         # Successfully loaded the JSON into res
+                 #         cur_json = ""
+                 if state.path_to_img is None and 'path-to-image' in res:
+                     state.path_to_img = res['path-to-image']
+                 if state.video_title is None and 'title' in res:
+                     state.video_title = res['title']
+                 if 'answer' in res:
+                     # print(f"answer is {res['answer']}")
+                     output = res["answer"]
+                     # print(f"state.messages is {state.messages[-1][-1]}")
+                     state.messages[-1][-1] = state.messages[-1][-1][:-1] + output + "▌"
+                     path_to_sub_videos = state.get_path_to_subvideos()
+                     yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (disable_btn,) * 1
+                     time.sleep(0.03)
+         # else:
+         #     raise requests.exceptions.RequestException()
+     except requests.exceptions.RequestException as e:
+         state.messages[-1][-1] = server_error_msg
+         yield (state, state.to_gradio_chatbot(), None) + (
+             enable_btn,
+         )
+         return
+
+     state.messages[-1][-1] = state.messages[-1][-1][:-1]
+     path_to_sub_videos = state.get_path_to_subvideos()
+     logger.info(path_to_sub_videos)
+     yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (enable_btn,) * 1
+
+     finish_tstamp = time.time()
+     logger.info(f"{state.messages[-1][-1]}")
+
+     # with open(get_conv_log_filename(), "a") as fout:
+     #     data = {
+     #         "tstamp": round(finish_tstamp, 4),
+     #         "url": url,
+     #         "start": round(start_tstamp, 4),
+     #         "finish": round(start_tstamp, 4),
+     #         "state": state.dict(),
+     #     }
+     #     fout.write(json.dumps(data) + "\n")
+     return
+
+ dropdown_list = [
+     "What did Intel present at Nasdaq?",
+     "From Chips Act Funding Announcement, by which year is Intel committed to Net Zero gas emissions?",
+     "What percentage of renewable energy is Intel planning to use?",
+     "a band playing music",
+     "Which US state is Silicon Desert referred to?",
+     "and which US state is Silicon Forest referred to?",
+     "How do trigate fins work?",
+     "What is the advantage of trigate over planar transistors?",
+     "What are key objectives of transistor design?",
+     "How fast can transistors switch?",
+ ]
+
+ with gr.Blocks(theme=theme, css=css) as demo:
+     # gr.Markdown(description)
+     state = gr.State(default_conversation.copy())
+     gr.HTML(value=html_title)
+     with gr.Row():
+         with gr.Column(scale=4):
+             video = gr.Video(height=512, width=512, elem_id="video")
+         with gr.Column(scale=7):
+             chatbot = gr.Chatbot(
+                 elem_id="chatbot", label="Multimodal RAG Chatbot", height=450
+             )
+             with gr.Row():
+                 with gr.Column(scale=8):
+                     # textbox.render()
+                     textbox = gr.Dropdown(
+                         dropdown_list,
+                         allow_custom_value=True,
+                         # show_label=False,
+                         # container=False,
+                         label="Query",
+                         info="Enter your query here or choose a sample from the dropdown list!"
+                     )
+                 with gr.Column(scale=1, min_width=50):
+                     submit_btn = gr.Button(
+                         value="Send", variant="primary", interactive=True
+                     )
+             with gr.Row(elem_id="buttons") as button_row:
+                 clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)
+     # Register listeners
+     btn_list = [clear_btn]
+
+     clear_btn.click(
+         clear_history, None, [state, chatbot, textbox, video] + btn_list
+     )
+
+     # textbox.submit(
+     #     add_text,
+     #     [state, textbox],
+     #     [state, chatbot, textbox] + btn_list,
+     # ).then(
+     #     http_bot,
+     #     [state],
+     #     [state, chatbot, video] + btn_list,
+     # )
+
+     submit_btn.click(
+         add_text,
+         [state, textbox],
+         [state, chatbot, textbox] + btn_list,
+     ).then(
+         http_bot,
+         [state],
+         [state, chatbot, video] + btn_list,
+     )
+
+     print_debug('Beginning')
+     # btn.click(fn=process,
+     #           inputs=[text_query],
+     #           # outputs=[video_player, gallery],
+     #           outputs=[gallery, html],
+     #           )
+     # gallery.select(place, [gallery], [html])
+
+ demo.queue()
+ app = gr.mount_gradio_app(app, demo, path='/')
+ share = False
+ enable_queue = True
+ # try:
+ #     demo.queue(concurrency_count=3)  # , enable_queue=False)
+ #     demo.launch(enable_queue=enable_queue, share=share, server_port=17808, server_name='0.0.0.0')
+ #     #BATCH -w isl-gpu48
+ # except:
+ #     demo.launch(enable_queue=False, share=share, server_port=17808, server_name='0.0.0.0')
+
+ # Serve the app
  if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--host", type=str, default="0.0.0.0")
+     parser.add_argument("--port", type=int, default=7899)
+     parser.add_argument("--concurrency-count", type=int, default=20)
+     parser.add_argument("--share", action="store_true")
+     parser.add_argument("--worker-address", type=str, default="198.175.88.247")
+     parser.add_argument("--worker-port", type=int, default=7899)
+
+     args = parser.parse_args()
+     logger.info(f"args: {args}")
+     global worker_addr
+     worker_addr = f"http://{args.worker_address}:{args.worker_port}"
+     uvicorn.run(app, host=args.host, port=args.port)
+
+     # for i in examples:
+     #     print(f'Processing {i[0]}')
+     #     results = process(*i)
+     #     print(f'{len(results[0])} results returned')
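Note: `http_bot` above expects the worker to stream null-delimited JSON chunks whose optional `path-to-image`, `title`, and `answer` fields it folds into the conversation state. Below is a minimal sketch of a compatible stub worker for exercising the UI without the real RAG backend; the route and field names come from the request code above, while the stub itself and its sample values are hypothetical:

import json

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

stub = FastAPI()

@stub.post("/v1/rag/chat")
def rag_chat(payload: dict):
    # Emit null-delimited JSON chunks, mirroring what http_bot parses via
    # response.iter_lines(decode_unicode=False, delimiter=b"\0").
    def gen():
        # Hypothetical title and frame path, shaped like the fields http_bot reads.
        yield json.dumps({"title": "Innovation-2023", "path-to-image": "/data/frame_0.jpg"}).encode() + b"\0"
        for word in "This is a stubbed streaming answer.".split():
            yield json.dumps({"answer": word + " "}).encode() + b"\0"
    return StreamingResponse(gen(), media_type="application/json")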
conversation.py ADDED
@@ -0,0 +1,247 @@
+ import dataclasses
+ from enum import auto, Enum
+ from typing import List, Tuple
+ import os
+
+
+ class SeparatorStyle(Enum):
+     """Different separator styles."""
+     SINGLE = auto()
+     TWO = auto()
+     MPT = auto()
+     PLAIN = auto()
+     LLAMA_2 = auto()
+     MISTRAL = auto()
+
+ # video_helper_map = {
+ #     # 'Chips Making Deal Video' : {'path' : '/data/videos/ChipmakingDeal/sub-videos/', 'prefix' : 'ChipmakingDeal_split'},
+ #     'Keynote 2023' : {'path' : '/data/videos/PatsKeynote23/sub-videos/', 'prefix' : 'keynotes23_split'},
+ #     'Intel Behind the Bell' : {'path' : '/data/videos/BehindTheBell/sub-videos/', 'prefix' : 'Behind the Bell Intel_split'},
+ #     'CEOs Talk' : {'path' : '/data/videos/SamPatTalkAI/sub-videos/', 'prefix' : 'Sam Altman and Pat Gelsinger Talk Artificial Intelligence_split'},
+ #     'Chips Act Funding Announcement' : {'path' : '/data/videos/IntelChipsFundingAnnounce/sub-videos/', 'prefix' : 'Intel Celebrates CHIPS and Science Act Direct Funding Announcement (Replay)_split'},
+ #     '22nm-Chip Technology' : {'path' : '/data/videos/MarkBohrExplains22nm/sub-videos/', 'prefix' : 'Video Animation Mark Bohr Gets Small 22nm Explained Intel_split'},
+ #     '14nm-Chip Technology' : {'path' : '/data/videos/MarkBohrExplains14nm/sub-videos/', 'prefix' : 'Explanation of Intels 14nm Process_split'},
+ # }
+
+ video_helper_map = {
+     # 'Chips Making Deal Video' : {'path' : '/data/videos/ChipmakingDeal/sub-videos/', 'prefix' : 'ChipmakingDeal_split'},
+     'Innovation-2023' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/PatsKeynote23/sub-videos/', 'prefix' : 'keynotes23_split'},
+     'Behind-the-Bell-Intel' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/BehindTheBell/sub-videos/', 'prefix' : 'Behind the Bell Intel_split'},
+     'Foundry-Connect' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/SamPatTalkAI/sub-videos/', 'prefix' : 'Sam Altman and Pat Gelsinger Talk Artificial Intelligence_split'},
+     'Chips Act Funding Announcement' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/IntelChipsFundingAnnounce/sub-videos/', 'prefix' : 'Intel Celebrates CHIPS and Science Act Direct Funding Announcement (Replay)_split'},
+     '22nm-transistor-animation' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/MarkBohrExplains22nm/sub-videos/', 'prefix' : 'Video Animation Mark Bohr Gets Small 22nm Explained Intel_split'},
+     '14nm-transistor-animation' : {'path' : '/data1/tile_gh/Multimodal-RAG/videos/MarkBohrExplains14nm/sub-videos/', 'prefix' : 'Explanation of Intels 14nm Process_split'},
+ }
+
+ @dataclasses.dataclass
+ class Conversation:
+     """A class that keeps all conversation history."""
+     system: str
+     roles: List[str]
+     messages: List[List[str]]
+     offset: int
+     sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+     sep: str = "\n"
+     sep2: str = None
+     version: str = "Unknown"
+     path_to_img: str = None
+     video_title: str = None
+     caption: str = None
+
+     skip_next: bool = False
+
+     def _template_caption(self):
+         out = ""
+         if self.caption is not None:
+             out = f"The caption associated with the image is '{self.caption}'. "
+         return out
+
+     def get_prompt(self):
+         messages = self.messages
+         if len(messages) > 1 and messages[1][1] is not None and "<image>" not in messages[0][1]:
+             # If there is a history message and <image> is not yet in the user's first message,
+             # prepend "<image>\n" (plus the templated caption) to that message.
+             messages = self.messages.copy()
+             init_role, init_msg = messages[0].copy()
+             messages[0] = (init_role, "<image>\n" + self._template_caption() + init_msg)
+
+         if len(messages) > 1 and messages[1][1] is None:
+             # Need to do RAG; the prompt is the query only
+             ret = messages[0][1]
+         else:
+             if self.sep_style == SeparatorStyle.SINGLE:
+                 ret = ""
+                 for role, message in messages:
+                     if message:
+                         ret += role + ": " + message + self.sep
+                     else:
+                         ret += role + ":"
+             elif self.sep_style == SeparatorStyle.LLAMA_2:
+                 wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
+                 wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+                 ret = ""
+
+                 for i, (role, message) in enumerate(messages):
+                     if i == 0:
+                         assert message, "first message should not be none"
+                         assert role == self.roles[0], "first message should come from user"
+                     if message:
+                         if type(message) is tuple:
+                             message, _, _ = message
+                         if i == 0: message = wrap_sys(self.system) + message
+                         if i % 2 == 0:
+                             message = wrap_inst(message)
+                             ret += self.sep + message
+                         else:
+                             ret += " " + message + " " + self.sep2
+                     else:
+                         ret += ""
+                 ret = ret.lstrip(self.sep)
+             else:
+                 raise ValueError(f"Invalid style: {self.sep_style}")
+
+         return ret
+
+     def append_message(self, role, message):
+         self.messages.append([role, message])
+
+     def get_images(self, return_pil=False):
+         images = []
+         if self.path_to_img is not None:
+             path_to_image = self.path_to_img
+             images.append(path_to_image)
+             # import base64
+             # from io import BytesIO
+             # from PIL import Image
+             # image = Image.open(path_to_image)
+             # max_hw, min_hw = max(image.size), min(image.size)
+             # aspect_ratio = max_hw / min_hw
+             # max_len, min_len = 800, 400
+             # shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+             # longest_edge = int(shortest_edge * aspect_ratio)
+             # W, H = image.size
+             # if longest_edge != max(image.size):
+             #     if H > W:
+             #         H, W = longest_edge, shortest_edge
+             #     else:
+             #         H, W = shortest_edge, longest_edge
+             #     image = image.resize((W, H))
+             # if return_pil:
+             #     images.append(image)
+             # else:
+             #     # buffered = BytesIO()
+             #     # # image.save(buffered, format="PNG")
+             #     # img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+             #     images.append(path_to_image)
+         return images
+
+     def to_gradio_chatbot(self):
+         ret = []
+         for i, (role, msg) in enumerate(self.messages[self.offset:]):
+             if i % 2 == 0:
+                 if type(msg) is tuple:
+                     import base64
+                     from io import BytesIO
+                     msg, image, image_process_mode = msg
+                     max_hw, min_hw = max(image.size), min(image.size)
+                     aspect_ratio = max_hw / min_hw
+                     max_len, min_len = 800, 400
+                     shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+                     longest_edge = int(shortest_edge * aspect_ratio)
+                     W, H = image.size
+                     if H > W:
+                         H, W = longest_edge, shortest_edge
+                     else:
+                         H, W = shortest_edge, longest_edge
+                     image = image.resize((W, H))
+                     buffered = BytesIO()
+                     image.save(buffered, format="JPEG")
+                     img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+                     img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
+                     msg = img_str + msg.replace('<image>', '').strip()
+                     ret.append([msg, None])
+                 else:
+                     ret.append([msg, None])
+             else:
+                 ret[-1][-1] = msg
+         return ret
+
+     def copy(self):
+         return Conversation(
+             system=self.system,
+             roles=self.roles,
+             messages=[[x, y] for x, y in self.messages],
+             offset=self.offset,
+             sep_style=self.sep_style,
+             sep=self.sep,
+             sep2=self.sep2,
+             version=self.version,
+         )
+
+     def dict(self):
+         return {
+             "system": self.system,
+             "roles": self.roles,
+             "messages": self.messages,
+             "offset": self.offset,
+             "sep": self.sep,
+             "sep2": self.sep2,
+             "path_to_img": self.path_to_img,
+             "video_title": self.video_title,
+             "caption": self.caption,
+         }
+
+     def get_path_to_subvideos(self):
+         print(f"self.video_title {self.video_title}")
+         print(f"self.path_to_image {self.path_to_img}")
+         # NOTE: this early return disables the sub-video lookup below.
+         return None
+         if self.video_title is not None and self.path_to_img is not None:
+             info = video_helper_map[self.video_title]
+             path = info['path']
+             prefix = info['prefix']
+             vid_index = self.path_to_img.split('/')[-1]
+             vid_index = vid_index.split('_')[-1]
+             vid_index = vid_index.replace('.jpg', '')
+             ret = f"{prefix}{vid_index}.mp4"
+             ret = os.path.join(path, ret)
+             return ret
+         elif self.path_to_img is not None:
+             return self.path_to_img
+         return None
+
+ multimodal_rag = Conversation(
+     system="",
+     roles=("USER", "ASSISTANT"),
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.SINGLE,
+     sep="\n",
+     path_to_img=None,
+     video_title=None,
+     caption=None,
+ )
+
+ conv_mistral_instruct = Conversation(
+     system="",
+     roles=("USER", "ASSISTANT"),
+     version="llama_v2",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.LLAMA_2,
+     sep="",
+     sep2="</s>",
+     path_to_img=None,
+     video_title=None,
+     caption=None,
+ )
+
+
+ default_conversation = multimodal_rag
+ conv_templates = {
+     "default": multimodal_rag,
+     "multimodal_rag": multimodal_rag,
+     "llavamed_rag": conv_mistral_instruct,
+ }
+
+
+ if __name__ == "__main__":
+     print(default_conversation.get_prompt())
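A quick sanity check of the `Conversation` flow above (a minimal sketch; the message strings are sample values):

from conversation import default_conversation

state = default_conversation.copy()
state.append_message(state.roles[0], "What did Intel present at Nasdaq?")
state.append_message(state.roles[1], None)

# First round: the assistant slot is still None, so get_prompt() returns the
# bare query, which app.py posts to the /v1/rag/chat retrieval endpoint.
print(state.get_prompt())

state.messages[-1][-1] = "A sample answer."
# Later rounds prepend "<image>\n" to the first user turn and serialize the
# history with the SINGLE separator style ("USER: ...\nASSISTANT: ...\n").
print(state.get_prompt())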
requirements.txt CHANGED
@@ -1 +1,2 @@
- huggingface_hub==0.22.2
+ huggingface_hub==0.22.2
+ gradio==3.43.2
utils.py ADDED
@@ -0,0 +1,86 @@
+ import logging
+ import logging.handlers
+ import os
+ import sys
+
+ from constants import LOGDIR
+
+ server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+ moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
+
+ handler = None
+ save_log = False
+
+ def build_logger(logger_name, logger_filename):
+     global handler
+
+     formatter = logging.Formatter(
+         fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+         datefmt="%Y-%m-%d %H:%M:%S",
+     )
+
+     # Set the format of root handlers
+     if not logging.getLogger().handlers:
+         logging.basicConfig(level=logging.INFO)
+     logging.getLogger().handlers[0].setFormatter(formatter)
+
+     # Redirect stdout and stderr to loggers
+     stdout_logger = logging.getLogger("stdout")
+     stdout_logger.setLevel(logging.INFO)
+     sl = StreamToLogger(stdout_logger, logging.INFO)
+     sys.stdout = sl
+
+     stderr_logger = logging.getLogger("stderr")
+     stderr_logger.setLevel(logging.ERROR)
+     sl = StreamToLogger(stderr_logger, logging.ERROR)
+     sys.stderr = sl
+
+     # Get logger
+     logger = logging.getLogger(logger_name)
+     logger.setLevel(logging.INFO)
+
+     # Add a file handler for all loggers
+     if save_log and handler is None:
+         os.makedirs(LOGDIR, exist_ok=True)
+         filename = os.path.join(LOGDIR, logger_filename)
+         handler = logging.handlers.TimedRotatingFileHandler(
+             filename, when='D', utc=True)
+         handler.setFormatter(formatter)
+
+         for name, item in logging.root.manager.loggerDict.items():
+             if isinstance(item, logging.Logger):
+                 item.addHandler(handler)
+
+     return logger
+
+ class StreamToLogger(object):
+     """
+     Fake file-like stream object that redirects writes to a logger instance.
+     """
+     def __init__(self, logger, log_level=logging.INFO):
+         self.terminal = sys.stdout
+         self.logger = logger
+         self.log_level = log_level
+         self.linebuf = ''
+
+     def __getattr__(self, attr):
+         return getattr(self.terminal, attr)
+
+     def write(self, buf):
+         temp_linebuf = self.linebuf + buf
+         self.linebuf = ''
+         for line in temp_linebuf.splitlines(True):
+             # From the io.TextIOWrapper docs:
+             #   On output, if newline is None, any '\n' characters written
+             #   are translated to the system default line separator.
+             # By default sys.stdout.write() expects '\n' newlines and then
+             # translates them, so this is still cross-platform.
+             if line[-1] == '\n':
+                 self.logger.log(self.log_level, line.rstrip())
+             else:
+                 self.linebuf += line
+
+     def flush(self):
+         if self.linebuf != '':
+             self.logger.log(self.log_level, self.linebuf.rstrip())
+             self.linebuf = ''
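A short usage sketch for the logging helpers above (assumes the `constants` module that `utils.py` imports defines `LOGDIR`; file logging stays off while `save_log` is False):

from utils import build_logger

logger = build_logger("demo", "demo.log")
logger.info("direct log call")  # e.g. "2024-05-01 12:00:00 | INFO | demo | direct log call"
print("captured via stdout")    # print() is routed through StreamToLogger to the "stdout" logger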