MudeHui committed on
Commit
1fb65ae
1 Parent(s): 4cdc586

Add application file

GPT_prompts.py ADDED
@@ -0,0 +1,57 @@
1
+
2
+ # Diptych template v0: generation prompt; produces only a slight change between the panels.
3
+ TEMPLATE_0 = """Create a diptych image that consists of two images. The left image is {prompt1}; the right image keeps everything the same but {edit_action}."""
4
+
5
+ # Diptych template v0.1: generation prompt; this makes the pair follow the instruction better.
6
+ TEMPLATE_0_1 = """Create a diptych image consisting of two panels. On the left, {prompt1}; on the right, the same image but {edit_action}."""
7
+
8
+ # diptych template v1, generate prompt 1
9
+ TEMPLATE_1 = """Generate a wide diptych image consisting of a left image and a right image. \n \
10
+ The left image is an image from Prompt1 and the right image is the edited version of the left image from Prompt2 based on an \
11
+ Edit Action. \n Please place a white strip separating the two images. \
12
+ Make sure the right image has the minimum change from the left based on the Edit Action. \
13
+ Make sure the right image keeps all other aspects, such as the scene and image layout, other than those changed by the Edit Action, IDENTICAL. \
14
+ Prompt1 for the left image: {prompt1}, Prompt2 for the right image: {prompt2}, Edit Action: {edit_action} """
15
+
16
+
17
+ # given image generate prompt 1
18
+ TEMPLATE_2 = """Create a diptych with a similar layout of the provided image, consisting of two panels separated by a white strip. \
19
+ The left panel is to be generated following Prompt1 ('{prompt1}'). \
20
+ The right panel should be a slightly edited version of the left, created following Prompt2 ('{prompt2}') \
21
+ and incorporating a specific Edit Action ('{edit_action}'). \
22
+ The changes in the right image should be minimal, and the image should not be flipped."""
23
+
24
+
25
+ # Rewrite a DALL-E 3 prompt.
26
+ REWRITE_PROMPT_0 = """Please rewrite the following prompt to make it clearer and more concise, and easier for DALL-E 3 to generate this diptych image from the prompt. \
27
+ The original prompt is: {prompt1}. The output prompt should start with 'REVISED': """
28
+
29
+
30
+
31
+ EVALUATION_PROMPT_TEMPLATE_SIMPLE_V1 = """Text Caption: {caption}
32
+ From 0 to 100, how would you rate this Text Caption in terms of the correctness and comprehensiveness of its description of the image?
33
+ Do not let a single attribute such as recognition correctness dominate the rating; give an overall rating on the object/scene appearance, position, pose, action, shape, etc., and the contents of the background.
34
+ Do not consider the appropriateness of sensitive descriptors, such as "middle-aged western man"; judge based on whether it correctly specifies the objects and scenes in the image.
35
+ Provide a few lines of explanation and give the score at the end after "Final Score: ".
36
+ """
37
+
38
+
39
+ # This prompt asks GPT-4 to generate many more prompts, extending the prompt cases for training.
40
+ Extend_PROMPT = """Please help generate {num} more prompts like the provided PROMPT. \
41
+ Please vary as much as possible, e.g., the subject, background, and edit attributes. \
42
+ Make sure each prompt is clear, concise, and comprehensive, and easy for DALL-E 3 to generate a diptych image from. \
43
+ The output should be a list in JSON format, for example: [{{'prompt_0': 'xxx'}}, {{'prompt_0': 'xxx'}}...]. \
44
+ Do not output anything else; all examples should have the key 'prompt_0'. PROMPT: {PROMPT}"""
45
+
46
+
47
+ # This prompt asks GPT-4 to mix edit actions, extending the prompt cases for training.
48
+ MIX_TWO_PROMPT = """Please help generate {num} more prompts following a similar pattern to the provided PROMPT, with a mixed edit action. \
49
+ Please vary as much as possible, e.g., the subject, background, and edit attributes, based on the given edits. \
50
+ Make sure each prompt is clear, concise, and comprehensive, and easy for DALL-E 3 to generate a diptych image from. \
51
+ The output should be a list in JSON format, for example: [{{'prompt_mix_0': 'xxx'}}, {{'prompt_mix_0': 'xxx'}}...]. Do not output anything else; all examples should have the key 'prompt_mix_0'. \
52
+ PROMPT: Create a diptych image that consists of two images. The left image is {input}; the right image keeps everything the same but first add {edit0} and second {edit1}."""
53
+
54
+
55
+ # Enrich the description of the input prompt and fuse in the edit action.
56
+ REWRITE_INPUT_DESCRIPTIONS = """Please enrich the given PROMPT1: {prompt1}, and edit the enriched PROMPT1 using {edit_action}. \
57
+ The output prompt should start with 'EDITPROMPT': """
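These templates are plain str.format strings (which is why the literal JSON braces above are escaped as {{ }}). A minimal usage sketch; the subject and edit action values are invented examples, not taken from the dataset:

# Hypothetical usage sketch for the templates above; the example values are invented.
from GPT_prompts import TEMPLATE_0_1, Extend_PROMPT

diptych_prompt = TEMPLATE_0_1.format(
    prompt1="a wooden cabin by a frozen lake at dawn",   # invented example value
    edit_action="replace the cabin with a lighthouse",   # invented example value
)
# Ask GPT-4 to expand one seed prompt into several variations (a JSON list of {'prompt_0': ...} items).
extend_request = Extend_PROMPT.format(num=5, PROMPT=diptych_prompt)
print(diptych_prompt)
print(extend_request)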
app.py ADDED
@@ -0,0 +1,127 @@
1
+ from vis_common import *
2
+ import vis_utils as v_uts
3
+ import io_utils as io_uts
4
+
5
+ import os
6
+
7
+ import gradio as gr
8
+ from functools import partial
9
+ import yaml
10
+ import random
11
+ import numpy as np
12
+ import json
13
+
14
+ # Requires gradio 3.14; the next line echoes the host IP for debugging.
15
+ os.system("echo $BYTED_HOST_IP")
16
+
17
+ # Load the dataset change to your local path
18
+ root = "/home/mudehui/ChatEdit"
19
+ #prompt_version = "prompt_0_sd"
20
+ #prompt_version = "prompt_0_hd"
21
+ #prompt_version = "prompt_1_sd"
22
+ #prompt_version = "prompt_1_hd"
23
+ prompt_version = "prompt_0_rewrited_sd"
24
+
25
+ def load_json(file, existing_data=[]):
26
+ if not os.path.exists(file):
27
+ empty = {}
28
+ return empty
29
+ with open(file, "r") as f:
30
+ stats = json.load(f)
31
+
32
+ results = {name: score for name, score in stats.items() \
33
+ if name not in existing_data}
34
+ return results
35
+
36
+ all_items = f"{root}/full_val.jsonl"
37
+ all_samples = io_uts.load_jsonl(all_items)
38
+ all_samples = {f"{i:03}":all_samples[i] for i in range(len(all_samples))}
39
+
40
+ votes = {}
41
+ def update(name, picture_name, vote, start_idx=0, end_idx=1000):
42
+ record_file = f"./output/{prompt_version}/{name}.json"
43
+ v_uts.mkdir("", record_file)
44
+ start_idx, end_idx = int(start_idx), int(end_idx)
45
+ end_idx = min(end_idx, len(all_samples) - 1)
46
+ items = list(all_samples.items())[start_idx:end_idx]
47
+ label_samples = {name:prompt for name, prompt in items}
48
+
49
+ if name == "":
50
+ new_picture = None
51
+ picture_name = None
52
+ description = None
53
+ message = "Please enter your lark username"
54
+
55
+ elif picture_name in label_samples.keys() and vote is None:
56
+ new_picture = None
57
+ picture_name = None
58
+ description = None
59
+ message = "Please make selections! Click Next to continue..."
60
+
61
+ else:
62
+ # Read record
63
+ existing_data = load_json(record_file)
64
+
65
+ # Save record
66
+ if (picture_name in label_samples.keys()):
67
+ sample = label_samples[picture_name]
68
+ sample["vote"] = vote
69
+ existing_data[picture_name] = sample
70
+ with open(record_file, "w") as f:
71
+ json.dump(existing_data, f, indent=2)
72
+
73
+ # Find Next example
74
+ all_remaining = {}
75
+ for i, name in enumerate(label_samples.keys()):
76
+ if name in existing_data:
77
+ continue
78
+ else:
79
+ all_remaining[name] = label_samples[name]
80
+
81
+ if len(all_remaining) > 0:
82
+ new_sample = list(all_remaining.items())[0]
83
+ picture_name, data = new_sample
84
+ description = f"input: {data['input']}<br>output: {data['output']}<br>edit: {data['edit']}"
85
+ new_picture = f"{root}/{prompt_version}/{picture_name}.png"
86
+ message = f"{len(all_remaining)} examples remaining"
87
+ else:
88
+ new_picture = None
89
+ picture_name = None
90
+ description = None
91
+ message = "You have finished all examples! Thank you!"
92
+
93
+ outputs = [new_picture, picture_name, message, description]
94
+ print(outputs)
95
+ return tuple(outputs)
96
+
97
+
98
+ with gr.Blocks() as demo:
99
+
100
+ gr.Markdown("""
101
+ - Enter your user name and the start/end index, then click Next to begin. You are rating {prompt}
102
+ """.format(prompt=prompt_version))
103
+ with gr.Row():
104
+ with gr.Column():
105
+ picture_name = gr.Textbox(visible=False)
106
+ picture = gr.Image(label="Input Image")
107
+
108
+ with gr.Column():
109
+ name = gr.Textbox(label="User Name (enter and click Next to start)")
110
+ start_idx = gr.Textbox(label="Start Index (max 292)", default="0")
111
+ end_idx = gr.Textbox(label="End Index (max 292)", default="1000")
112
+ message = gr.Markdown()
113
+ description = gr.Markdown()
114
+ vote = gr.Radio([
115
+ ('1: Totally unrelated.', 1),
116
+ ('2: Does not follow the edit; there is some/little relation between the two images.', 2),
117
+ ('3: OK pair data; does not follow the edit; the image pair needs some editing effort [flip etc.] to form a good edit pair.', 3),
118
+ ('4: Good pair data; the instruction can be modified to form a good triplet.', 4),
119
+ ('5: Perfectly follows the edit instruction.', 5)
120
+ ], label="Score", min_width=400)
121
+ greet_btn = gr.Button("Next")
122
+ greet_btn.click(fn=update,
123
+ inputs=[name,picture_name,vote, start_idx, end_idx],
124
+ outputs=[picture,picture_name,message,description])
125
+
126
+ demo.queue(max_size=4)
127
+ demo.launch(share=True)
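app.py depends on the vis_utils / io_utils helpers from the cv_utils repo referenced below in call_assistant_api.py. Minimal stand-ins for the two calls it makes, with signatures inferred from the call sites (an assumption, not the actual library code):

# Minimal stand-ins for the cv_utils helpers used above; signatures are inferred, not official.
import json
import os

def load_jsonl(path):
    # io_uts.load_jsonl: one JSON object per line -> list of dicts
    with open(path, "r") as f:
        return [json.loads(line) for line in f if line.strip()]

def mkdir(prefix, file_path):
    # v_uts.mkdir("", record_file): ensure the parent directory of a file exists
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)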
call_assistant_api.py ADDED
@@ -0,0 +1,214 @@
1
+ # Requires the library from https://github.com/pengwangucla/cv_utils
2
+ from vis_common import *
3
+ import vis_utils as v_uts
4
+
5
+ import json
6
+ import os
7
+ import time
8
+ import base64
9
+ import requests
10
+ from openai import OpenAI
11
+ from tqdm import tqdm
+ from tenacity import retry, wait_random_exponential, stop_after_attempt, wait_fixed
12
+ from GPT_prompts import REWRITE_PROMPT_0
13
+
14
+
15
+ API_KEY = os.environ.get("BYTE_API_KEY")
16
+ class EditActionClassifier():
17
+ def __init__(self):
18
+ self.client = OpenAI()
19
+ self.assistant_key = "asst_57vfLupV8VCsCZx0BJOppSnw"
20
+ self.thread = self.client.beta.threads.create()
21
+
22
+ @retry(wait=wait_fixed(10), stop=stop_after_attempt(3))
23
+ def infer(self, edit_action):
24
+ message = self.client.beta.threads.messages.create(
25
+ thread_id=self.thread.id,
26
+ role="user",
27
+ content=edit_action
28
+ )
29
+ run = self.client.beta.threads.runs.create(
30
+ thread_id=self.thread.id,
31
+ assistant_id=self.assistant_key,
32
+ )
33
+ pbar = tqdm(total=100)
34
+ while run.status != 'completed':
35
+ run = self.client.beta.threads.runs.retrieve(
36
+ thread_id=self.thread.id,
37
+ run_id=run.id
38
+ )
39
+ time.sleep(.5) # Sleep and check run status again
40
+ pbar.update(1)
41
+ pbar.set_description('Run Status: ' + run.status)
42
+ if run.status == 'failed':
43
+ break
44
+
45
+ if run.status == 'failed':
46
+ print("Run failed")
47
+ return ""
48
+
49
+ messages = self.client.beta.threads.messages.list(
50
+ thread_id=self.thread.id
51
+ )
52
+ result = messages.data[0].content[0].text.value
53
+ if "edit class" in result:
54
+ try:
55
+ class_name = json.loads(result)["edit class"]
56
+ except Exception as e:
57
+ print(f"{result} cannot be parsed as JSON")
58
+ class_name = result
59
+
60
+ return class_name
61
+
62
+
63
+ def test_personal_dalle3():
64
+ # Call the API
65
+ client = OpenAI()
66
+ response = client.images.generate(
67
+ model="dall-e-3",
68
+ prompt="a cute cat with a hat on",
69
+ size="1792x1024",
70
+ quality="standard",
71
+ n=1,
72
+ )
73
+ image_url = response.data[0].url
75
+ # Download the image from the URL
76
+ image_response = requests.get(image_url)
77
+
78
+ # Check if the request was successful
79
+ if image_response.status_code == 200:
80
+ # Save the image to a file
81
+ with open('cute_cat_with_hat.jpg', 'wb') as file:
82
+ file.write(image_response.content)
83
+ else:
84
+ print("Failed to download the image.")
85
+
86
+
87
+ def test_call_gpt4_api():
88
+ from langchain_community.chat_models import AzureChatOpenAI
89
+ from langchain.schema import HumanMessage
90
+
91
+ BASE_URL = "https://search-us.byteintl.net/gpt/openapi/online/v2/crawl/"
92
+ DEPLOYMENT_NAME = "gpt-4-0613"
93
+ DEPLOYMENT_NAME = "gpt-4-1106-preview"
94
+ model = AzureChatOpenAI(
95
+ openai_api_base=BASE_URL,
96
+ openai_api_version="2023-03-15-preview",
97
+ deployment_name=DEPLOYMENT_NAME,
98
+ openai_api_key=API_KEY,
99
+ openai_api_type="azure",
100
+ temperature=0.5,
101
+ max_tokens=512,
102
+ )
103
+
104
+ content = REWRITE_PROMPT_0.format(prompt1="Create a diptych image that consists of two images. \
105
+ The left image is a front view of a real 12-year-old white man lying down. \
106
+ The right image keeps everything the same but changes the background of the subject to Europe.")
107
+ generate_log = model([HumanMessage(content=content)]).content
108
+ print(generate_log)
109
+
110
+
111
+ def test_call_gpt4v_api():
112
+ from langchain_community.chat_models import AzureChatOpenAI
113
+ from langchain.schema import HumanMessage
114
+
115
+ BASE_URL = "https://search-us.byteintl.net/gpt/openapi/online/v2/crawl/"
116
+ DEPLOYMENT_NAME = "openai_gpt-4-vision" # "gptv" or "openai_gpt-4-vision"
117
+ model = AzureChatOpenAI(
118
+ openai_api_base=BASE_URL,
119
+ openai_api_version="2023-07-01-preview",
120
+ deployment_name=DEPLOYMENT_NAME,
121
+ openai_api_key=API_KEY,
122
+ openai_api_type="azure",
123
+ temperature=0.5,
124
+ max_tokens=512,
125
+ )
126
+
127
+ image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
128
+ input_ip = {
129
+ "url": image_url
130
+ }
131
+
132
+ image_path = "./imgs/dataset.jpg"
133
+ base64_image = v_uts.encode_b64(image_path)
134
+ input_ip = {
135
+ "url": f"data:image/jpeg;base64,{base64_image}"
136
+ }
137
+
138
+ generate_log = model([HumanMessage(content=[
139
+ {
140
+ "type": "text",
141
+ "text": "What’s in this image?"
142
+ },
143
+ {
144
+ "type": "image_url",
145
+ "image_url": input_ip
146
+ }
147
+ ])])
148
+ print(generate_log)
149
+
150
+
151
+ # curl --location --request POST 'https://search.bytedance.net/gpt/openapi/online/v2/crawl?ak=<your team AK>' \
152
+ # --header 'Content-Type: application/json' \
153
+ # --header 'X-TT-LOGID: <requester log ID, used for troubleshooting>' \
154
+ # --data-raw '{
155
+ # "prompt": "A poster of Microsoft", // text description of the image to generate
156
+ # "size": "1024x1024", // image size; only 1024x1024 / 1024x1792 / 1792x1024 are supported
157
+ # "quality": "standard", // image quality, default "standard"
158
+ # "style": "vivid", // image style, default "vivid"
159
+ # "n": 1,
160
+ # "model": "dall-e-3" // model name, required
161
+ # }'
162
+
163
+ # // response
164
+ # {
165
+ # "created": 1702889995,
166
+ # "data": [
167
+ # {
168
+ # "url": "https://dalleprodsec.blob.core.windows.net/private/images/0811eacd-bf25-4961-814f-36d7f453907c/generated_00.png?se=2023-12-19T09%3A00%3A09Z&sig=cIRz7je1Qbjlt5GjeyLGKoxPRFggr7NAxLSeeCuGyYk%3D&ske=2023-12-22T11%3A18%3A13Z&skoid=e52d5ed7-0657-4f62-bc12-7e5dbb260a96&sks=b&skt=2023-12-15T11%3A18%3A13Z&sktid=33e01921-4d64-4f8c-a055-5bdaffd5e33d&skv=2020-10-02&sp=r&spr=https&sr=b&sv=2020-10-02",
169
+ # "revised_prompt": "A designed poster featuring the logo of a prominent technology company, accompanied by various emboldened text denoting the company's name and a motivational slogan. The distinct, four rectangular logo in bright colors is situated at the center of the poster, against a plain background. The composition strikes a balance between minimalism and impact, typifying the company's powerful image in the global technology industry."
170
+ # }
171
+ # ]
172
+ # }
173
+
174
+ def test_call_dalle3_api():
175
+ """ openai==1.2.0, httpx==0.23.0
176
+ """
177
+ from openai import AzureOpenAI
178
+ BASE_URL = "https://search-va.byteintl.net/gpt/openapi/online/v2/crawl"
179
+ DEPLOYMENT_NAME = "dall-e-3"
180
+ API_KEY = "hpjWvnz7wM2mzDg4Ggnt96xcOjeYcktj"
181
+ client = AzureOpenAI(
182
+ api_version="2023-12-01-preview",
183
+ api_key=API_KEY,
184
+ azure_endpoint=BASE_URL)
185
+
186
+ result = client.images.generate(
187
+ model=DEPLOYMENT_NAME, # the name of your DALL-E 3 deployment
188
+ prompt="A soldier girl holding a USA flag",
189
+ n=1,
190
+ size="1024x1024",
191
+ quality="standard",
192
+ style="vivid"
193
+ )
194
+ image_url = result.data[0].url
195
+ image_response = requests.get(image_url)
196
+
197
+ # Check if the request was successful
198
+ if image_response.status_code == 200:
199
+ # Save the image to a file
200
+ with open('.jpg', 'wb') as file:
201
+ file.write(image_response.content)
202
+ else:
203
+ print("Failed to download the image.")
204
+
205
+
206
+ if __name__ == "__main__":
207
+ # classifier = EditActionClassifier()
208
+ # class_name = classifier.infer("Remove the background of the image")
209
+ # print(class_name)
210
+ # test_personal_dalle3()
211
+
212
+ # test_call_gpt4_api()
213
+ # test_call_gpt4v_api()
214
+ test_call_dalle3_api()
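test_call_gpt4v_api above relies on v_uts.encode_b64 to inline a local image. A stand-in, assuming it simply returns the base64 string of the file bytes (it mirrors the encode_image helper defined later in generater_api.py and may differ from the real cv_utils function):

# Assumed stand-in for v_uts.encode_b64; the real helper lives in cv_utils.
import base64

def encode_b64(image_path):
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")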
call_assistant_api.sh ADDED
@@ -0,0 +1,14 @@
1
+ curl --location --request POST 'https://search.bytedance.net/gpt/openapi/online/v2/crawl?ak=hpjWvnz7wM2mzDg4Ggnt96xcOjeYcktj' \
2
+ --header 'Content-Type: application/json' \
3
+ --data-raw '{"prompt": "A poster of Microsoft","size": "1024x1024","quality": "standard", "style": "vivid", "n": 1, "model": "dall-e-3"}'
4
+
5
+ # // response
6
+ # {
7
+ # "created": 1702889995,
8
+ # "data": [
9
+ # {
10
+ # "url": "https://dalleprodsec.blob.core.windows.net/private/images/0811eacd-bf25-4961-814f-36d7f453907c/generated_00.png?se=2023-12-19T09%3A00%3A09Z&sig=cIRz7je1Qbjlt5GjeyLGKoxPRFggr7NAxLSeeCuGyYk%3D&ske=2023-12-22T11%3A18%3A13Z&skoid=e52d5ed7-0657-4f62-bc12-7e5dbb260a96&sks=b&skt=2023-12-15T11%3A18%3A13Z&sktid=33e01921-4d64-4f8c-a055-5bdaffd5e33d&skv=2020-10-02&sp=r&spr=https&sr=b&sv=2020-10-02",
11
+ # "revised_prompt": "A designed poster featuring the logo of a prominent technology company, accompanied by various emboldened text denoting the company's name and a motivational slogan. The distinct, four rectangular logo in bright colors is situated at the center of the poster, against a plain background. The composition strikes a balance between minimalism and impact, typifying the company's powerful image in the global technology industry."
12
+ # }
13
+ # ]
14
+ # }
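The same request can be issued from Python. A sketch using the requests library, with the access key replaced by a placeholder:

# Python equivalent of the curl call above; YOUR_AK is a placeholder for the access key.
import requests

payload = {
    "prompt": "A poster of Microsoft",
    "size": "1024x1024",
    "quality": "standard",
    "style": "vivid",
    "n": 1,
    "model": "dall-e-3",
}
resp = requests.post(
    "https://search.bytedance.net/gpt/openapi/online/v2/crawl",
    params={"ak": "YOUR_AK"},
    json=payload,
    timeout=60,
)
print(resp.json()["data"][0]["url"])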
cv_base.py ADDED
@@ -0,0 +1,18 @@
1
+ # Define objects similar to pytorch3d's, using numpy.
2
+ from collections import namedtuple
3
+ Faces = namedtuple("Faces", "verts_idx normals_idx textures_idx materials_idx")
4
+ Aux = namedtuple(
5
+ "Properties", "normals verts_uvs material_colors texture_images texture_atlas"
6
+ )
7
+ Obj = namedtuple("Obj", "verts faces properties")
8
+
9
+
10
+ DEFAULT_MATERIAL= {
11
+ 'material_1':
12
+ {
13
+ 'ambient_color': [1., 1., 1.],
14
+ 'diffuse_color': [1., 1., 1.],
15
+ 'specular_color': [0., 0., 0.],
16
+ 'shininess': 10.
17
+ }
18
+ }
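A toy example of how these containers might be populated; the vertex and face values below are made up:

# Toy example of populating the containers above; the array values are invented.
import numpy as np
from cv_base import Faces, Aux, Obj, DEFAULT_MATERIAL

verts = np.array([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.]])
faces = Faces(verts_idx=np.array([[0, 1, 2]]), normals_idx=None,
              textures_idx=None, materials_idx=None)
props = Aux(normals=None, verts_uvs=None, material_colors=DEFAULT_MATERIAL,
            texture_images=None, texture_atlas=None)
obj = Obj(verts=verts, faces=faces, properties=props)
print(obj.faces.verts_idx.shape)  # (1, 3)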
dataset_demo.py ADDED
@@ -0,0 +1,103 @@
1
+ from vis_common import *
2
+ import vis_utils as v_uts
3
+ import io_utils as io_uts
4
+
5
+ from datasets import Dataset
6
+ import os
+ import cv2
+ import pandas as pd
7
+ import gradio as gr
8
+
9
+ # Requires gradio 3.14; the next line echoes the host IP for debugging.
10
+ os.system("echo $BYTED_HOST_IP")
11
+
12
+ # Load the dataset change to your local path
13
+ root = "/mnt/bn/datacompv6/data/chat_edit/assets/ChatEdit/"
14
+
15
+ # method = "parquet"
16
+ # prompt_version = "prompt_0"
17
+ # append = ""
18
+ # parquet_file = f'{root}/data/{prompt_version}.parquet'
19
+ # df = pd.read_parquet(parquet_file)
20
+
21
+ jsonl_file = f"{root}/full_val.jsonl"
22
+
23
+ method = "raw_file"
24
+ print("reading data")
25
+ df = []
26
+ items = io_uts.load_jsonl(jsonl_file)
27
+ print("reading data finished", len(items))
28
+
29
+ all_prompts = ['prompt_0', 'prompt_1']
30
+
31
+ def find_key(name):
32
+ for prompt in all_prompts:
33
+ if prompt in name:
34
+ return prompt
35
+
36
+ def display_data(index, prompt_version):
37
+ try:
38
+ key = find_key(prompt_version)
39
+ if method == "parquet":
40
+ row = df.iloc[index]
41
+ image = v_uts.decode64(row['image'])[:, :, ::-1] # Ensure this returns a PIL image
42
+ prompt = row[key]
43
+ return image, prompt
44
+ elif method == "raw_file":
45
+ image_file = f"{root}/{prompt_version}/{index:03}.png"
46
+ image = cv2.imread(image_file)[:, :, ::-1]
47
+ prompt = items[index][key]
+ return image, prompt
48
+ else:
49
+ return "Invalid method", ""
50
+ except IndexError:
51
+ return "No more data", ""
52
+ except Exception as e:
53
+ return f"Error: {str(e)}", ""
54
+
55
+
56
+ def search_and_display(prompt_key, prompt_version):
57
+ try:
58
+ key = find_key(prompt_version)
59
+ if method == "parquet":
60
+ results = df[df['image_id'].astype(str).str.contains(prompt_key, case=False)]
61
+ if not results.empty:
62
+ image = v_uts.decode64(results.iloc[0]['image'])[:, :, ::-1] # Ensure this returns a PIL image
63
+ prompt = results.iloc[0][key]
64
+ return image, prompt
65
+
66
+ elif method == "raw_file":
67
+ index = int(prompt_key)
68
+ image_file = f"{root}/{prompt_version}/{index:03}.png"
69
+ assert os.path.exists(image_file), f"Image {image_file} file not found"
70
+ image = cv2.imread(image_file)[:, :, ::-1]
71
+ prompt = items[index][key]
72
+ return image, prompt
73
+
74
+ else:
75
+ return "No image found", "No matching prompt found"
76
+ except Exception as e:
77
+ return f"Error: {str(e)}", ""
78
+
79
+ def combined_function(prompt_key=None, prompt_name=None):
80
+ print(prompt_key, prompt_name)
81
+ return search_and_display(prompt_key, prompt_name)
82
+
83
+ max_len = len(df) # Set max_len to the length of the dataframe
84
+ iface = gr.Interface(
85
+ fn=combined_function,
86
+ inputs=[
87
+ gr.inputs.Textbox(default="", label="Or, enter image_id to search, 0-292"),
88
+ gr.Radio(["prompt_0_sd", "prompt_0_hd", "prompt_1_sd", "prompt_1_hd"]),
89
+ ],
90
+ outputs=[
91
+ gr.outputs.Image(label="Image", type="pil"),
92
+ gr.outputs.Textbox(label="Prompt")
93
+ ],
94
+ examples=[
95
+ ["1", "prompt_0_sd"],
96
+ ["2", "prompt_1_hd"], # Adjust these examples as per your dataset
97
+ ],
98
+ allow_flagging=False,
99
+ )
100
+
101
+ # iface.queue(concurrency_count=1)
102
+ # iface.launch(debug=True, share=True, inline=False, enable_queue=True, server_name="0.0.0.0")
103
+ iface.queue().launch(debug=True, share=True, inline=False, enable_queue=True, server_name="[::]")
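The parquet branch relies on v_uts.decode64 and then flips channels with [:, :, ::-1], which suggests the helper returns a BGR (OpenCV-style) array from a base64 string. A stand-in under that assumption:

# Assumed stand-in for v_uts.decode64; the real helper lives in cv_utils and may differ.
import base64
import cv2
import numpy as np

def decode64(b64_string):
    # base64 string -> raw bytes -> BGR image array (OpenCV convention)
    buf = np.frombuffer(base64.b64decode(b64_string), dtype=np.uint8)
    return cv2.imdecode(buf, cv2.IMREAD_COLOR)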
generate_img_dataset.py ADDED
@@ -0,0 +1,315 @@
1
+ import argparse
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import k_diffusion
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ from einops import rearrange, repeat
11
+ from omegaconf import OmegaConf
12
+ from PIL import Image
13
+ from pytorch_lightning import seed_everything
14
+ from tqdm import tqdm
15
+
16
+ sys.path.append("./")
17
+ sys.path.append("./stable_diffusion")
18
+
19
+ from ldm.modules.attention import CrossAttention, MemoryEfficientCrossAttention
20
+ from ldm.util import instantiate_from_config
21
+ from metrics.clip_similarity import ClipSimilarity
22
+
23
+
24
+ ################################################################################
25
+ # Modified K-diffusion Euler ancestral sampler with prompt-to-prompt.
26
+ # https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py
27
+
28
+
29
+ def append_dims(x, target_dims):
30
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
31
+ dims_to_append = target_dims - x.ndim
32
+ if dims_to_append < 0:
33
+ raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
34
+ return x[(...,) + (None,) * dims_to_append]
35
+
36
+
37
+ def to_d(x, sigma, denoised):
38
+ """Converts a denoiser output to a Karras ODE derivative."""
39
+ return (x - denoised) / append_dims(sigma, x.ndim)
40
+
41
+
42
+ def get_ancestral_step(sigma_from, sigma_to):
43
+ """Calculates the noise level (sigma_down) to step down to and the amount
44
+ of noise to add (sigma_up) when doing an ancestral sampling step."""
45
+ sigma_up = min(sigma_to, (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5)
46
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
47
+ return sigma_down, sigma_up
48
+
49
+
50
+ def sample_euler_ancestral(model, x, sigmas, prompt2prompt_threshold=0.0, **extra_args):
51
+ """Ancestral sampling with Euler method steps."""
52
+ s_in = x.new_ones([x.shape[0]])
53
+ for i in range(len(sigmas) - 1):
54
+ prompt_to_prompt = prompt2prompt_threshold > i / (len(sigmas) - 2)
55
+ for m in model.modules():
56
+ if isinstance(m, CrossAttention) or isinstance(m, MemoryEfficientCrossAttention):
57
+ m.prompt_to_prompt = prompt_to_prompt
58
+ denoised = model(x, sigmas[i] * s_in, **extra_args)
59
+ sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1])
60
+ d = to_d(x, sigmas[i], denoised)
61
+ # Euler method
62
+ dt = sigma_down - sigmas[i]
63
+ x = x + d * dt
64
+ if sigmas[i + 1] > 0:
65
+ # Make noise the same across all samples in batch.
66
+ x = x + torch.randn_like(x[:1]) * sigma_up
67
+ return x
68
+
69
+
70
+ ################################################################################
71
+
72
+
73
+ def load_model_from_config(config, ckpt, vae_ckpt=None, verbose=False):
74
+ print(f"Loading model from {ckpt}")
75
+ pl_sd = torch.load(ckpt, map_location="cpu")
76
+ if "global_step" in pl_sd:
77
+ print(f"Global Step: {pl_sd['global_step']}")
78
+ sd = pl_sd["state_dict"]
79
+ if vae_ckpt is not None:
80
+ print(f"Loading VAE from {vae_ckpt}")
81
+ vae_sd = torch.load(vae_ckpt, map_location="cpu")["state_dict"]
82
+ sd = {
83
+ k: vae_sd[k[len("first_stage_model.") :]] if k.startswith("first_stage_model.") else v
84
+ for k, v in sd.items()
85
+ }
86
+ model = instantiate_from_config(config.model)
87
+ m, u = model.load_state_dict(sd, strict=False)
88
+ if len(m) > 0 and verbose:
89
+ print("missing keys:")
90
+ print(m)
91
+ if len(u) > 0 and verbose:
92
+ print("unexpected keys:")
93
+ print(u)
94
+ return model
95
+
96
+
97
+ class CFGDenoiser(nn.Module):
98
+ def __init__(self, model):
99
+ super().__init__()
100
+ self.inner_model = model
101
+
102
+ def forward(self, x, sigma, uncond, cond, cfg_scale):
103
+ x_in = torch.cat([x] * 2)
104
+ sigma_in = torch.cat([sigma] * 2)
105
+ cond_in = torch.cat([uncond, cond])
106
+ uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
107
+ return uncond + (cond - uncond) * cfg_scale
108
+
109
+
110
+ def to_pil(image: torch.Tensor) -> Image.Image:
111
+ image = 255.0 * rearrange(image.cpu().numpy(), "c h w -> h w c")
112
+ image = Image.fromarray(image.astype(np.uint8))
113
+ return image
114
+
115
+
116
+ def main():
117
+ parser = argparse.ArgumentParser()
118
+ parser.add_argument(
119
+ "--out_dir",
120
+ type=str,
121
+ required=True,
122
+ help="Path to output dataset directory.",
123
+ )
124
+ parser.add_argument(
125
+ "--prompts_file",
126
+ type=str,
127
+ required=True,
128
+ help="Path to prompts .jsonl file.",
129
+ )
130
+ parser.add_argument(
131
+ "--ckpt",
132
+ type=str,
133
+ default="stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt",
134
+ help="Path to stable diffusion checkpoint.",
135
+ )
136
+ parser.add_argument(
137
+ "--vae-ckpt",
138
+ type=str,
139
+ default="stable_diffusion/models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt",
140
+ help="Path to vae checkpoint.",
141
+ )
142
+ parser.add_argument(
143
+ "--steps",
144
+ type=int,
145
+ default=100,
146
+ help="Number of sampling steps.",
147
+ )
148
+ parser.add_argument(
149
+ "--n-samples",
150
+ type=int,
151
+ default=100,
152
+ help="Number of samples to generate per prompt (before CLIP filtering).",
153
+ )
154
+ parser.add_argument(
155
+ "--max-out-samples",
156
+ type=int,
157
+ default=4,
158
+ help="Max number of output samples to save per prompt (after CLIP filtering).",
159
+ )
160
+ parser.add_argument(
161
+ "--n-partitions",
162
+ type=int,
163
+ default=1,
164
+ help="Number of total partitions.",
165
+ )
166
+ parser.add_argument(
167
+ "--partition",
168
+ type=int,
169
+ default=0,
170
+ help="Partition index.",
171
+ )
172
+ parser.add_argument(
173
+ "--min-p2p",
174
+ type=float,
175
+ default=0.1,
176
+ help="Min prompt2prompt threshold (portion of denoising for which to fix self attention maps).",
177
+ )
178
+ parser.add_argument(
179
+ "--max-p2p",
180
+ type=float,
181
+ default=0.9,
182
+ help="Max prompt2prompt threshold (portion of denoising for which to fix self attention maps).",
183
+ )
184
+ parser.add_argument(
185
+ "--min-cfg",
186
+ type=float,
187
+ default=7.5,
188
+ help="Min classifier free guidance scale.",
189
+ )
190
+ parser.add_argument(
191
+ "--max-cfg",
192
+ type=float,
193
+ default=15,
194
+ help="Max classifier free guidance scale.",
195
+ )
196
+ parser.add_argument(
197
+ "--clip-threshold",
198
+ type=float,
199
+ default=0.2,
200
+ help="CLIP threshold for text-image similarity of each image.",
201
+ )
202
+ parser.add_argument(
203
+ "--clip-dir-threshold",
204
+ type=float,
205
+ default=0.2,
206
+ help="Directional CLIP threshold for similarity of change between pairs of text and pairs of images.",
207
+ )
208
+ parser.add_argument(
209
+ "--clip-img-threshold",
210
+ type=float,
211
+ default=0.7,
212
+ help="CLIP threshold for image-image similarity.",
213
+ )
214
+ opt = parser.parse_args()
215
+
216
+ global_seed = torch.randint(1 << 32, ()).item()
217
+ print(f"Global seed: {global_seed}")
218
+ seed_everything(global_seed)
219
+
220
+ model = load_model_from_config(
221
+ OmegaConf.load("stable_diffusion/configs/stable-diffusion/v1-inference.yaml"),
222
+ ckpt=opt.ckpt,
223
+ vae_ckpt=opt.vae_ckpt,
224
+ )
225
+ model.cuda().eval()
226
+ model_wrap = k_diffusion.external.CompVisDenoiser(model)
227
+
228
+ clip_similarity = ClipSimilarity().cuda()
229
+
230
+ out_dir = Path(opt.out_dir)
231
+ out_dir.mkdir(exist_ok=True, parents=True)
232
+
233
+ with open(opt.prompts_file) as fp:
234
+ prompts = [json.loads(line) for line in fp]
235
+
236
+ print(f"Partition index {opt.partition} ({opt.partition + 1} / {opt.n_partitions})")
237
+ prompts = np.array_split(list(enumerate(prompts)), opt.n_partitions)[opt.partition]
238
+
239
+ with torch.no_grad(), torch.autocast("cuda"), model.ema_scope():
240
+ uncond = model.get_learned_conditioning(2 * [""])
241
+ sigmas = model_wrap.get_sigmas(opt.steps)
242
+
243
+ for i, prompt in tqdm(prompts, desc="Prompts"):
244
+ prompt_dir = out_dir.joinpath(f"{i:07d}")
245
+ prompt_dir.mkdir(exist_ok=True)
246
+
247
+ with open(prompt_dir.joinpath("prompt.json"), "w") as fp:
248
+ json.dump(prompt, fp)
249
+
250
+ cond = model.get_learned_conditioning([prompt["input"], prompt["output"]])
251
+ results = {}
252
+
253
+ with tqdm(total=opt.n_samples, desc="Samples") as progress_bar:
254
+
255
+ while len(results) < opt.n_samples:
256
+ seed = torch.randint(1 << 32, ()).item()
257
+ if seed in results:
258
+ continue
259
+ torch.manual_seed(seed)
260
+
261
+ x = torch.randn(1, 4, 512 // 8, 512 // 8, device="cuda") * sigmas[0]
262
+ x = repeat(x, "1 ... -> n ...", n=2)
263
+
264
+ model_wrap_cfg = CFGDenoiser(model_wrap)
265
+ p2p_threshold = opt.min_p2p + torch.rand(()).item() * (opt.max_p2p - opt.min_p2p)
266
+ cfg_scale = opt.min_cfg + torch.rand(()).item() * (opt.max_cfg - opt.min_cfg)
267
+ extra_args = {"cond": cond, "uncond": uncond, "cfg_scale": cfg_scale}
268
+ samples_ddim = sample_euler_ancestral(model_wrap_cfg, x, sigmas, p2p_threshold, **extra_args)
269
+ x_samples_ddim = model.decode_first_stage(samples_ddim)
270
+ x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
271
+
272
+ x0 = x_samples_ddim[0]
273
+ x1 = x_samples_ddim[1]
274
+
275
+ clip_sim_0, clip_sim_1, clip_sim_dir, clip_sim_image = clip_similarity(
276
+ x0[None], x1[None], [prompt["input"]], [prompt["output"]]
277
+ )
278
+
279
+ results[seed] = dict(
280
+ image_0=to_pil(x0),
281
+ image_1=to_pil(x1),
282
+ p2p_threshold=p2p_threshold,
283
+ cfg_scale=cfg_scale,
284
+ clip_sim_0=clip_sim_0[0].item(),
285
+ clip_sim_1=clip_sim_1[0].item(),
286
+ clip_sim_dir=clip_sim_dir[0].item(),
287
+ clip_sim_image=clip_sim_image[0].item(),
288
+ )
289
+
290
+ progress_bar.update()
291
+
292
+ # CLIP filter to get best samples for each prompt.
293
+ metadata = [
294
+ (result["clip_sim_dir"], seed)
295
+ for seed, result in results.items()
296
+ if result["clip_sim_image"] >= opt.clip_img_threshold
297
+ and result["clip_sim_dir"] >= opt.clip_dir_threshold
298
+ and result["clip_sim_0"] >= opt.clip_threshold
299
+ and result["clip_sim_1"] >= opt.clip_threshold
300
+ ]
301
+ metadata.sort(reverse=True)
302
+ for _, seed in metadata[: opt.max_out_samples]:
303
+ result = results[seed]
304
+ image_0 = result.pop("image_0")
305
+ image_1 = result.pop("image_1")
306
+ image_0.save(prompt_dir.joinpath(f"{seed}_0.jpg"), quality=100)
307
+ image_1.save(prompt_dir.joinpath(f"{seed}_1.jpg"), quality=100)
308
+ with open(prompt_dir.joinpath(f"metadata.jsonl"), "a") as fp:
309
+ fp.write(f"{json.dumps(dict(seed=seed, **result))}\n")
310
+
311
+ print("Done.")
312
+
313
+
314
+ if __name__ == "__main__":
315
+ main()
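The CLIP filtering step above keeps a sample only when all four similarity thresholds are met. Restated as a standalone predicate (same logic as the in-line condition, with opt carrying the thresholds defined by the argument parser):

# Restatement of the CLIP filter used above; `opt` is the parsed argument namespace.
def passes_clip_filter(result, opt):
    return (
        result["clip_sim_image"] >= opt.clip_img_threshold   # the two images stay similar
        and result["clip_sim_dir"] >= opt.clip_dir_threshold  # image change matches caption change
        and result["clip_sim_0"] >= opt.clip_threshold        # left image matches the input caption
        and result["clip_sim_1"] >= opt.clip_threshold        # right image matches the output caption
    )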
generate_txt_dataset.py ADDED
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+ from argparse import ArgumentParser
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import datasets
10
+ import numpy as np
11
+ import openai
12
+ from tqdm.auto import tqdm
13
+
14
+
15
+ DELIMITER_0 = "\n##\n"
16
+ DELIMITER_1 = "\n%%\n"
17
+ STOP = "\nEND"
18
+
19
+
20
+ def generate(
21
+ openai_model: str,
22
+ caption: str,
23
+ num_retries: int = 3,
24
+ max_tokens: int = 256,
25
+ temperature: float = 0.7,
26
+ top_p: float = 1.0,
27
+ frequency_penalty: float = 0.1,
28
+ presence_penalty: float = 0.0,
29
+ sleep_on_error: float = 1.0,
30
+ ) -> Optional[tuple[str, str]]:
31
+ for _ in range(1 + num_retries):
32
+ try:
33
+ response = openai.Completion.create(
34
+ model=openai_model,
35
+ prompt=caption + DELIMITER_0,
36
+ temperature=temperature,
37
+ max_tokens=max_tokens,
38
+ top_p=top_p,
39
+ frequency_penalty=frequency_penalty,
40
+ presence_penalty=presence_penalty,
41
+ stop=[STOP],
42
+ )
43
+ except Exception as e:
44
+ print(e)
45
+ time.sleep(sleep_on_error)
46
+ continue
47
+
48
+ output = response["choices"][0]["text"].split(DELIMITER_1)
49
+ if len(output) == 2:
50
+ instruction, edited_caption = output
51
+ results = openai.Moderation.create([instruction, edited_caption])["results"]
52
+ if results[0]["flagged"] or results[1]["flagged"]:
53
+ continue
54
+ if caption.strip().strip(".!?").lower() != edited_caption.strip().strip(".!?").lower():
55
+ return instruction, edited_caption
56
+
65
+
66
+
67
+ def main(openai_model: str, num_samples: int, num_partitions: int, partition: int, seed: int):
68
+ dataset = datasets.load_dataset("ChristophSchuhmann/improved_aesthetics_6.5plus", split="train")
69
+ # Other datasets we considered that may be worth trying:
70
+ # dataset = datasets.load_dataset("ChristophSchuhmann/MS_COCO_2017_URL_TEXT", split="train")
71
+ # dataset = datasets.load_dataset("laion/laion-coco", split="train")
72
+
73
+ np.random.seed(seed)
74
+ permutation = np.array_split(np.random.permutation(len(dataset)), num_partitions)[partition]
75
+ dataset = dataset[permutation]
76
+ captions = dataset["TEXT"]
77
+ urls = dataset["URL"]
78
+ output_path = f"data/dataset=laion-aesthetics-6.5_model={openai_model}_samples={num_samples}_partition={partition}.jsonl" # fmt: skip
79
+ print(f"Prompt file path: {output_path}")
80
+
81
+ count = 0
82
+ caption_set = set()
83
+ url_set = set()
84
+
85
+ if Path(output_path).exists():
86
+ with open(output_path, "r") as f:
87
+ for line in tqdm(f, desc="Resuming from existing prompts"):
88
+ prompt = json.loads(line)
89
+ if prompt["caption"] not in caption_set and prompt["url"] not in url_set:
90
+ caption_set.add(prompt["caption"])
91
+ url_set.add(prompt["url"])
92
+ count += 1
93
+
94
+ with open(output_path, "a") as fp:
95
+ with tqdm(total=num_samples - count, desc="Generating instructions and edited captions") as progress_bar:
96
+ for caption, url in zip(captions, urls):
97
+ if caption in caption_set or url in url_set:
98
+ continue
99
+ if openai.Moderation.create(caption)["results"][0]["flagged"]:
100
+ continue
101
+ edit_output = generate(openai_model, caption)
102
+ if edit_output is not None:
103
+ edit, output = edit_output
104
+ fp.write(f"{json.dumps(dict(caption=caption, edit=edit, output=output, url=url))}\n")
105
+ count += 1
106
+ progress_bar.update()
107
+ caption_set.add(caption)
108
+ url_set.add(url)
109
+ if count == num_samples:
110
+ break
111
+
112
+
113
+ if __name__ == "__main__":
114
+ parser = ArgumentParser()
115
+ parser.add_argument("--openai-api-key", required=True, type=str)
116
+ parser.add_argument("--openai-model", required=True, type=str)
117
+ parser.add_argument("--num-samples", default=10000, type=int)
118
+ parser.add_argument("--num-partitions", default=1, type=int)
119
+ parser.add_argument("--partition", default=0, type=int)
120
+ parser.add_argument("--seed", default=0, type=int)
121
+ args = parser.parse_args()
122
+ openai.api_key = args.openai_api_key
123
+ main(args.openai_model, args.num_samples, args.num_partitions, args.partition, args.seed)
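The fine-tuned completion model is prompted with caption + DELIMITER_0 and is expected to answer with instruction + DELIMITER_1 + edited_caption, terminated by STOP. A small parsing sketch with an invented completion:

# Parsing sketch for the completion format implied by generate(); the example text is invented.
DELIMITER_1 = "\n%%\n"

completion = "make it a watercolor painting" + DELIMITER_1 + "a watercolor painting of a red barn"
parts = completion.split(DELIMITER_1)
if len(parts) == 2:
    instruction, edited_caption = parts
    print(instruction)     # -> make it a watercolor painting
    print(edited_caption)  # -> a watercolor painting of a red barn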
generater_api.py ADDED
@@ -0,0 +1,486 @@
1
+ import sys
2
+ sys.path.append(
3
+ "/mnt/bn/wp-maliva-bytenas/mlx/users/peng.wang/playground/repo/cv_utils"
4
+ )
5
+ import io_utils as io_uts
6
+
7
+ import openai
8
+ from openai import OpenAI
9
+ import os, sys, re
10
+ import pandas as pd
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+ import argparse
14
+ import logging
15
+ import json
16
+ import jsonlines
17
+ import requests
18
+ from tenacity import retry, wait_random_exponential, stop_after_attempt, wait_fixed
19
+ import tenacity
20
+ from GPT_prompts import (
21
+ TEMPLATE_0,
22
+ TEMPLATE_1,
23
+ TEMPLATE_2,
24
+ )
25
+
26
+ import base64
27
+ import requests
28
+ import pdb
29
+
30
+ # OpenAI API Key
31
+ b = pdb.set_trace
32
+ api_key = "YOUR_OPENAI_API_KEY"
33
+
34
+
35
+ # Function to encode the image
36
+ def encode_image(image_path):
37
+ with open(image_path, "rb") as image_file:
38
+ return base64.b64encode(image_file.read()).decode("utf-8")
39
+
40
+
41
+ # # Path to your image
42
+ # image_path = "path_to_your_image.jpg"
43
+
44
+ # # Getting the base64 string
45
+ # base64_image = encode_image(image_path)
46
+
47
+ # headers = {
48
+ # "Content-Type": "application/json",
49
+ # "Authorization": f"Bearer {api_key}"
50
+ # }
51
+
52
+ os.environ["OPENAI_API_KEY"] = "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"
53
+
54
+ logging.basicConfig(
55
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
56
+ datefmt="%Y-%m-%d %H:%M:%S",
57
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
58
+ stream=sys.stdout,
59
+ )
60
+ logger = logging.getLogger("evaluation test")
61
+
62
+ EVALUATION_PROMPT_TEMPLATE = """Text Caption: {caption}
63
+
64
+ Based on the image and text caption, provide the following 4 scores and 4 rationales to explain the scores. Please be concise in the rationales and limit each rationale to two sentences:
65
+
66
+ Score 1 Image Text Matching: Please evaluate if the provided text caption accurately represents the main features and objects of the image. The caption doesn't need to detail every aspect of the image, but it should capture its primary theme. Rate the overall quality X1 of the text caption's match to the image on a scale of 1-100, considering the criteria mentioned.
67
+ Score 2 Object Detail Fulfillment: Please evaluate the text caption to determine if it provides detailed descriptions of objects that align with the image. Specifically, assess if the caption sufficiently describes the color, size, position, shape, material, etc., of the objects. Afterward, rate the caption's overall accuracy X2 in capturing object details from the image on a scale of 1-100, based on the criteria provided.
68
+ Score 3 Caption Text Quality: Please evaluate the text caption based on the following criteria: Grammatical Correctness, Diversity of Vocabulary (e.g., the range and uniqueness of words used), Fluency (e.g., smoothness and natural flow of sentences), Readability, Length, and Structure. Assign an overall quality score X3 on a scale of 1-100.
69
+ Score 4 Semantic Understanding: Evaluate the given text caption in relation to its corresponding image. Your goal is to determine if the text caption provides additional semantic information that isn't readily apparent just from the image itself.
70
+ For example:
71
+ 1. If the image mentions "a man" but the caption elaborates he is a "homeless man" or a "businessman," then the caption is enriching the semantic context.
72
+ 2. If the caption introduces concepts like the mathematical tangent function, which require in-depth knowledge to deduce, it is imparting external semantics.
73
+ 3. Captions revealing specific location addresses, festival details, or other nuanced data not easy to infer from the image also provide external semantic information.
74
+ 4. Directly identifying specific entities in the image such as buildings, people, bird species, animal breeds, car models, engines, etc., in the caption introduces additional insights.
75
+ 5. Should the image act as a contextual backdrop and the caption describes elements not explicitly showcased in the image, it has semantic depth.
76
+ 6. Lastly, if the caption depicts relationships between the subjects in the image, which need commonsense knowledge to understand, it should be considered semantically rich.
77
+ Please assess and determine the extent of semantic enrichment the caption provides over the image. Rate the text caption's semantic depth on a scale from 1 to 100.
78
+
79
+
80
+ X1, X2, X3, X4 are integers. Please do not include a title such as "X1" in the output. Ensure that your scoring is nuanced and uses the entire range from 0 to 100, reflecting the subtle differences. The scores should be given as integers, with each number between 0 and 100 considered a potential score, avoiding the tendency to round to multiples of 10. Output format should be: X1,X2,X3,X4\nX1 Rationale\nX2 Rationale\nX3 Rationale\nX4 Rationale
81
+ """
82
+
83
+ EVALUATION_PROMPT_TEMPLATE_SIMPLE = """Text Caption: {caption}
84
+
85
+ From 0 to 100, how would you rate this Text Caption in terms of the correctness and comprehensiveness of its description of the image?
86
+ Provide a few lines of explanation and give the score at the end after "Final Score: ".
87
+ """
88
+
89
+ EVALUATION_PROMPT_TEMPLATE_SIMPLE_V1 = """Text Caption: {caption}
90
+
91
+ From 0 to 100, how would you rate this Text Caption in terms of the correctness and comprehensiveness of its description of the image?
92
+ Do not let a single attribute such as recognition correctness dominate the rating; give an overall rating on the object/scene appearance, position, pose, action, shape, etc., and the contents of the background.
93
+ Do not consider the appropriateness of sensitive descriptors, such as "middle-aged western man"; judge based on whether it correctly specifies the objects and scenes in the image.
94
+ Provide a few lines of explanation and give the score at the end after "Final Score: ".
95
+ """
96
+
97
+ COMPARISON_PROMPT_TEMPLATE = """
98
+ Caption 0: {caption_0}
99
+ Caption 1: {caption_1}
100
+
101
+ Select between Caption 0 and Caption 1, according to which one you believe aligns most accurately with the provided image.
102
+ In cases where both captions seem to possess equal quality in adherence to the image, respond with ’Tie’.
103
+ DO NOT CONSIDER the appropriateness of sensitive descriptors, such as "middle-aged western man", as long as the caption correctly specifies the objects and scenes in the image.
104
+ DO NOT CONSIDER whether the text is concise or easier to read and understand, as long as it is correct and comprehensive.
105
+ Provide intermediate thinking step by step before giving the final response. Your final response must be 0, 1, or Tie.
106
+ Output your final answer at the end in the format "Final Answer: 0/1/Tie."
107
+ """
108
+
109
+ COMPARISON_PROMPT_TEMPLATE_W_ORG = """
110
+ Caption 0: {caption_0}
111
+ Caption 1: {caption_1}
112
+ Original Caption: {org_caption},
113
+
114
+ The Original Caption is the original information accompanying the image. Given the Original Caption, select between Caption 0 and Caption 1 according to which one you believe better combines the information of the Original Caption and aligns more with the provided image.
115
+ In cases where both captions seem to possess equal quality in adherence to the image, respond with 'Tie'.
116
+ Please consider the Original Caption if you think it is possibly correct.
117
+ IGNORE the appropriateness of sensitive descriptors, such as "middle-aged western man", as long as the caption correctly specifies the objects and scenes in the image.
118
+ IGNORE whether the text is concise or easier to read and understand, as long as it is correct and comprehensive.
119
+ Provide intermediate thinking step by step before giving the final response. Your final response must be 0, 1, or Tie.
120
+ Output your final answer at the end in the format "Final Answer: 0/1/Tie."
121
+ """
122
+
123
+ STRUCTURE_COMPARISON = """
124
+ Given an original caption of the image {caption_org},
125
+ Caption 0: {caption_0}
126
+ Caption 1: {caption_1}
127
+
128
+ Select between Caption 0 and Caption 1, according to which one you believe aligns most accurately with the provided image.
129
+ In cases where both captions seem to possess equal quality in adherence to the image, respond with ’Tie’.
130
+ DO NOT CONSIDER the appropriateness of sensitive descriptors, such as "middle-aged western man", as long as the caption correctly specifies the objects and scenes in the image.
131
+ DO NOT CONSIDER whether the text is concise or easier to read and understand, as long as it is correct and comprehensive.
132
+ Provide intermediate thinking step by step before giving the final response. Your final response must be 0, 1, or Tie.
133
+ Output your final answer at the end in the format "Final Answer: 0/1/Tie."
134
+ """
135
+
136
+
137
+ def read_captions(caption_file):
138
+ if caption_file.endswith(".json"):
139
+ captions = io_uts.load_json(caption_file)
140
+ elif caption_file.endswith(".txt"):
141
+ captions = io_uts.load_lines(caption_file)
142
+ else:
143
+ raise ValueError("not supported")
144
+
145
+ return captions
146
+
147
+
148
+ class Annotator(object):
149
+ def __init__(self, args):
150
+ self.args = args
151
+ self.model_name = args.model_name
152
+
153
+ @retry(wait=wait_fixed(10), stop=stop_after_attempt(3))
154
+ def dalle3(
155
+ self,
156
+ prompt,
157
+ is_local=False,
158
+ ):
159
+ client = OpenAI()
160
+
161
+ # Call the API
162
+ response = client.images.generate(
163
+ model="dall-e-3",
164
+ prompt=prompt,
165
+ size="1792x1024",
166
+ quality="standard",
167
+ n=1,
168
+ )
169
+ return response.data[0].url
170
+
171
+ @retry(wait=wait_fixed(10), stop=stop_after_attempt(3))
172
+ def get_multimodal_eval_score_openai(
173
+ self,
174
+ image_url,
175
+ prompt,
176
+ is_local=False,
177
+ ):
178
+ client = OpenAI()
179
+
180
+ response = client.chat.completions.create(
181
+ model="gpt-4-vision-preview",
182
+ messages=[
183
+ {
184
+ "role": "user",
185
+ "content": [
186
+ {"type": "text", "text": prompt},
187
+ {
188
+ "type": "image_url",
189
+ "image_url": image_url,
190
+ },
191
+ ],
192
+ }
193
+ ],
194
+ max_tokens=512,
195
+ )
196
+
197
+ return response.choices[0].message.content
198
+
199
+ @retry(wait=wait_fixed(10), stop=stop_after_attempt(3))
200
+ def get_prompt_results(self, base64_image, prompt):
201
+ client = OpenAI()
202
+ response = client.chat.completions.create(
203
+ model="gpt-4-vision-preview",
204
+ messages=[
205
+ {
206
+ "role": "user",
207
+ "content": [
208
+ {"type": "text", "text": prompt},
209
+ {
210
+ "type": "image_url",
211
+ "image_url": f"data:image/jpeg;base64,{base64_image}",
212
+ },
213
+ ],
214
+ }
215
+ ],
216
+ max_tokens=1024,
217
+ )
218
+ return response.choices[0].message.content
219
+
220
+ def highlight_max(self, s):
221
+ is_max = s == s.max()
222
+ return [
223
+ "background-color: purple" if v else "background-color: white"
224
+ for v in is_max
225
+ ]
226
+
227
+ def annotate_byte(self, image_folder, res_folder):
228
+ instruction = []
229
+ image_names = [
230
+ name.replace(".png", "")
231
+ for name in os.listdir(image_folder)
232
+ if "png" in name
233
+ ]
234
+ print(len(image_names))
235
+ subdir = image_folder.split("/")[-1]
236
+ prompt = "Please describe the provided image in detail; describe the attributes of objects and scenes that you are confident are correct."
237
+ # prompt = "You are a powerful image captioner. Instead of describing the imaginary content, only describing the content one can determine confidently from the image. Do not describe the contents by itemizing them in list form. Minimize aesthetic descriptions as much as possible."
238
+
239
+ # Getting the base64 string
240
+ for image_name in tqdm(image_names):
241
+ file_name = f"{res_folder}/{image_name}.json"
242
+ if os.path.exists(file_name):
243
+ continue
244
+
245
+ sample = {"id": f"{image_name}", "image": "", "conversations": []}
246
+ sample["image"] = f"{subdir}/{image_name}.png"
247
+ image_path = os.path.join(image_folder, f"{image_name}.png")
248
+ base64_image = encode_image(image_path)
249
+ try:
250
+ result = self.get_prompt_results(base64_image, prompt)
251
+ except (openai.BadRequestError, tenacity.RetryError):
252
+ print("error")
253
+ continue
254
+
255
+ sample["conversations"].append(
256
+ {"from": "human", "value": "<image>\n" + prompt}
257
+ )
258
+ sample["conversations"].append({"from": "gpt", "value": result})
259
+ io_uts.dump_json(file_name, sample)
260
+
261
+ def eval_byte(self, image_folder, caption_file, res_folder, rerun=False):
262
+ image_files = [
263
+ name.replace(".png", "")
264
+ for name in os.listdir(image_folder)
265
+ if "png" in name
266
+ ]
267
+ image_files.sort(key=lambda a: int(a.split("_")[0]))
268
+ print(len(image_files))
269
+
270
+ if caption_file.endswith(".json"):
271
+ captions = io_uts.load_json(caption_file)
272
+ elif caption_file.endswith(".txt"):
273
+ captions = io_uts.load_lines(caption_file)
274
+ else:
275
+ raise ValueError("not supported")
276
+
277
+ assert len(image_files) == len(captions)
278
+ os.makedirs(res_folder, exist_ok=True)
279
+
280
+ subdir = image_folder.split("/")[-1]
281
+ # prompt = "You are a powerful image captioner. Instead of describing the imaginary content, only describing the content one can determine confidently from the image. Do not describe the contents by itemizing them in list form. Minimize aesthetic descriptions as much as possible."
282
+
283
+ scores = []
284
+ score_file = f"{res_folder}/score.txt"
285
+ f = open(score_file, "w")
286
+ # Getting the base64 string
287
+ for image_name, caption in tqdm(zip(image_files, captions)):
288
+ # if image_name != "23_laion_big_193":
289
+ # continue
290
+
291
+ caption = caption.replace("|", "")
292
+ # prompt = EVALUATION_PROMPT_TEMPLATE_SIMPLE.format(caption=caption)
293
+ prompt = EVALUATION_PROMPT_TEMPLATE_SIMPLE_V1.format(caption=caption)
294
+ file_name = f"{res_folder}/{image_name}.json"
295
+ if os.path.exists(file_name) and (not rerun):
296
+ sample = io_uts.load_json(file_name)
297
+ else:
298
+ sample = {"id": f"{image_name}", "image": "", "conversations": []}
299
+ sample["image"] = f"{subdir}/{image_name}.png"
300
+ image_path = os.path.join(image_folder, f"{image_name}.png")
301
+ base64_image = encode_image(image_path)
302
+ try:
303
+ result = self.get_prompt_results(base64_image, prompt)
304
+ except (openai.BadRequestError, tenacity.RetryError):
305
+ print("error")
306
+ continue
307
+
308
+ sample["conversations"].append(
309
+ {"from": "human", "value": "<image>\n" + prompt}
310
+ )
311
+ sample["conversations"].append({"from": "gpt", "value": result})
312
+ io_uts.dump_json(file_name, sample)
313
+
314
+ result = sample["conversations"][-1]["value"]
315
+ try:
316
+ for split_key in ["Final Score: ", "Final score: "]:
317
+ if split_key in result:
318
+ score_format = result.split(split_key)[-1].split("\n")[0]
319
+ if "/" in score_format:
320
+ score = float(score_format.split("/")[0])
321
+ else:
322
+ score = float(score_format)
323
+ break
324
+ except:
325
+ print("Failed to obtain a score for:")
326
+ print(result)
327
+ continue
328
+
329
+ print(f"{image_name}: {score}")
330
+ scores.append(score)
331
+ f.write(f"{image_name}: {score}\n")
332
+
333
+ scores = np.array(scores).mean()
334
+ print(f"mean: {scores}")
335
+ f.write(f"mean: {scores}\n")
336
+ f.close()
337
+
338
+ def compare_byte(
339
+ self,
340
+ image_folder,
341
+ caption_file_0,
342
+ caption_file_1,
343
+ res_folder,
344
+ original_file=None,
345
+ ):
346
+ image_files = [
347
+ name.replace(".png", "")
348
+ for name in os.listdir(image_folder)
349
+ if "png" in name
350
+ ]
351
+ image_files.sort(key=lambda a: int(a.split("_")[0]))
352
+ print(len(image_files))
353
+
354
+ captions_0 = read_captions(caption_file_0)
355
+ captions_1 = read_captions(caption_file_1)
356
+ assert len(image_files) == len(captions_0) == len(captions_1)
357
+
358
+ Template = COMPARISON_PROMPT_TEMPLATE
359
+ with_original = False
360
+ if (original_file is not None) and (os.path.exists(original_file)):
361
+ with_original = True
362
+ org_captions = read_captions(original_file)
363
+ Template = COMPARISON_PROMPT_TEMPLATE_W_ORG
364
+ assert len(image_files) == len(org_captions)
365
+ print("we consider original captions for comparison")
366
+ else:
367
+ print("we consider image only comparison")
368
+
369
+ os.makedirs(res_folder, exist_ok=True)
370
+ subdir = image_folder.split("/")[-1]
371
+ # prompt = "You are a powerful image captioner. Instead of describing the imaginary content, only describing the content one can determine confidently from the image. Do not describe the contents by itemizing them in list form. Minimize aesthetic descriptions as much as possible."
372
+
373
+ scores = []
374
+ count = [0, 0, 0]
375
+ score_file = f"{res_folder}/score.txt"
376
+ f = open(score_file, "w")
377
+
378
+ # Getting the base64 string
379
+ for i, (image_name, caption_0, caption_1) in tqdm(
380
+ enumerate(zip(image_files, captions_0, captions_1))
381
+ ):
382
+ caption_0 = caption_0.replace("|", "")
383
+ caption_1 = caption_1.replace("|", "")
384
+ if with_original:
385
+ org_caption = org_captions[i]
386
+ prompt = Template.format(
387
+ caption_0=caption_0, caption_1=caption_1, org_caption=org_caption
388
+ )
389
+ else:
390
+ prompt = Template.format(caption_0=caption_0, caption_1=caption_1)
391
+
392
+ file_name = f"{res_folder}/{image_name}.json"
393
+ if os.path.exists(file_name):
394
+ sample = io_uts.load_json(file_name)
395
+ else:
396
+ sample = {"id": f"{image_name}", "image": "", "conversations": []}
397
+ sample["image"] = f"{subdir}/{image_name}.png"
398
+ image_path = os.path.join(image_folder, f"{image_name}.png")
399
+ base64_image = encode_image(image_path)
400
+ try:
401
+ result = self.get_prompt_results(base64_image, prompt)
402
+ except (openai.BadRequestError, tenacity.RetryError):
403
+ print("error")
404
+ continue
405
+
406
+ sample["conversations"].append(
407
+ {"from": "human", "value": "<image>\n" + prompt}
408
+ )
409
+ sample["conversations"].append({"from": "gpt", "value": result})
410
+ io_uts.dump_json(file_name, sample)
411
+
412
+ result = sample["conversations"][-1]["value"]
+ score = None
+ try:
+ for split_key in ["Final Answer: ", "Final answer: "]:
+ if split_key in result:
+ score_format = result.split(split_key)[-1].split("\n")[0]
+ if "/" in score_format:
+ score = score_format.split("/")[0].strip()
+ else:
+ score = score_format.strip()
+ break
+ except (ValueError, IndexError):
+ print("failed to parse an answer from:")
+ print(result)
+ continue
+ if score is None:
+ print("no 'Final Answer' line found in:")
+ print(result)
+ continue
426
+
427
+ print(f"{image_name}: {score}")
428
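+ # Tally the GSB (good/same/bad) outcome; "0" is read here as a win for
+ # caption_0, "1" as a win for caption_1, and anything else as a tie.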
+ if score == "0":
429
+ count[0] += 1
430
+ elif score == "1":
431
+ count[1] += 1
432
+ else:
433
+ count[2] += 1
434
+
435
+ scores.append(score)
436
+ f.write(f"{image_name}: {score}\n")
437
+
438
+ print(f"GSB counts: {count[0]}/{count[2]}/{count[1]}")
439
+ f.write(f"GSB counts: {count[0]}/{count[2]}/{count[1]}\n")
440
+ f.close()
441
+
442
+
443
+ if __name__ == "__main__":
444
+ parser = argparse.ArgumentParser()
445
+ parser.add_argument("--model-name", type=str, default="gpt-4")
446
+ parser.add_argument("--model-base", type=str, default=None)
447
+ parser.add_argument("--image-file", type=str, default="data_preprocessing/datacomp")
448
+ parser.add_argument(
449
+ "--caption-file", type=str, default="data_preprocessing/datacomp"
450
+ )
451
+ parser.add_argument(
452
+ "--caption-file_0", type=str, default="data_preprocessing/datacomp"
453
+ )
454
+ parser.add_argument(
455
+ "--caption-file_1", type=str, default="data_preprocessing/datacomp"
456
+ )
457
+ parser.add_argument(
458
+ "--original-file", type=str, default=None,
459
+ )
460
+ parser.add_argument(
461
+ "--image-folder", type=str, default="data_preprocessing/datacomp"
462
+ )
463
+ parser.add_argument(
464
+ "--output-folder", type=str, default="data_preprocessing/datacomp"
465
+ )
466
+ parser.add_argument(
467
+ "--tar-file-path",
468
+ type=str,
469
+ default="/mnt/bn/datacompv6/weizhi_multimodal/datacomp/medium_rules_filter_shard/",
470
+ )
471
+ parser.add_argument("--task", type=str, default="datacomp")
472
+ parser.add_argument("--num-gpus", type=int, default=1)
473
+ parser.add_argument("--conv-mode", type=str, default=None)
474
+ parser.add_argument("--temperature", type=float, default=0.2)
475
+ parser.add_argument("--max-new-tokens", type=int, default=512)
476
+ parser.add_argument("--load-8bit", action="store_true")
477
+ parser.add_argument("--load-4bit", action="store_true")
478
+ parser.add_argument("--debug", action="store_true")
479
+
480
+ args = parser.parse_args()
481
+ annotator = Annotator(args)
482
+ if args.task == "prompt_v0":
483
+ annotator.dalle3(
484
+ )
485
+ else:
486
+ raise ValueError(f"unsupported task: {args.task}")
io_utils.py ADDED
@@ -0,0 +1,1332 @@
1
+ import os
2
+ import sys
3
+ import re
4
+ import numpy as np
5
+ import cv2
6
+ import json
7
+ import yaml
8
+ import vis_utils as v_uts
9
+ import struct
10
+ from cv_base import (
11
+ Faces, Aux, Obj, DEFAULT_MATERIAL
12
+ )
13
+
14
+ hasTorch = True
15
+ try:
16
+ import torch
17
+ except:
18
+ hasTorch = False
19
+
20
+ import functools
21
+ import pandas as pd
22
+ from tqdm import tqdm
23
+ from PIL import Image
24
+
25
+ try:
26
+ from plyfile import PlyData
27
+ except ImportError:
+ pass  # plyfile is optional; only read_ply() requires it
29
+
30
+ import pdb
31
+ b=pdb.set_trace
32
+
33
+ def default(x, val):
34
+ return val if x is None else x
35
+
36
+
37
+ class IOShop:
38
+ def __init__(self, name, **kwargs):
39
+ ioFuncs = {'depth': DepthIO,
40
+ 'image': ImageIO,
41
+ 'flow': FlowIO,
42
+ 'segment': SegmentIO,
43
+ 'prob': ProbIO,
44
+ 'video': VideoIO}
45
+
46
+ self.io = ioFuncs[name](**kwargs)
47
+
48
+ def load(self, file_name, **kwargs):
49
+ return self.io.load(file_name, **kwargs)
50
+
51
+ def dump(self, file_name, file, **kwargs):
52
+ self.io.dump(file_name, file, **kwargs)
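+ # Usage sketch (hypothetical paths): IOShop('image').load('/data/frame_000') reads
+ # '/data/frame_000.jpg', while IOShop('depth').dump('/data/out', depth) writes
+ # '/data/out.pfm' plus a visualization png.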
53
+
54
+
55
+ class BaseIO:
56
+ def __init__(self, appex='jpg'):
57
+ self.type = 'image'
58
+ self.appex = appex
59
+
60
+ def load(self, file_name):
61
+ file_name = '%s.%s' % (file_name, self.appex)
62
+ image = cv2.imread(file_name, cv2.IMREAD_UNCHANGED)
63
+ assert not (image is None), '%s not exists' % file_name
64
+
65
+ return image
66
+
67
+ def dump(self, file_name, file):
68
+ v_uts.mkdir_if_need(os.path.dirname(file_name))
69
+ file_name = '%s.%s' % (file_name, self.appex)
70
+ cv2.imwrite(file_name, file)
71
+
72
+
73
+ class ImageIO(BaseIO):
74
+ def __init__(self, appex='jpg'):
75
+ super(ImageIO, self).__init__(appex=appex)
76
+ self.type = 'image'
77
+
78
+ def load(self, file_name):
79
+ if file_name.endswith('heic') or file_name.endswith('HEIC'):
80
+ byte = read2byte(file_name)
81
+ image = decodeImage(byte)
82
+ else:
83
+ image = super(ImageIO, self).load(file_name)
84
+
85
+ return image
86
+
87
+ @staticmethod
88
+ def imwrite(file_name, data, order='rgb'):
89
+ cv2.imwrite(file_name, data[:, :, ::-1])
90
+
91
+
92
+ class SegmentIO(BaseIO):
93
+ def __init__(self):
94
+ super(SegmentIO, self).__init__(appex='png')
95
+ self.type = 'segment'
96
+
97
+
98
+ class ProbIO(BaseIO):
99
+ def __init__(self):
100
+ super(ProbIO, self).__init__()
101
+ self.type = 'prob'
102
+ self.max_class = 4
103
+
104
+ def load(self, file_name, channels=None):
+ # NOTE: decoding the packed probability map back into per-class channels was
+ # never implemented in the original code; return the raw encoded image for now.
+ image = cv2.imread(file_name, cv2.IMREAD_UNCHANGED)
+ channels = default(channels, self.max_class)
+ return image
+
+ def dump(self, file_name, file):
+ """
+ file: probability map of shape [height, width, channel]
+ """
+ h, w, c = file.shape
+ output = np.zeros((h, w), dtype=np.uint16)
+ for i in range(c):
+ output = output + np.uint16(file[:, :, i] * 255) + i * 256
+
+ cv2.imwrite(file_name, output.astype('uint16'))
121
+
122
+
123
+
124
+ class MeshIO(BaseIO):
125
+ def __init__(self):
126
+ super().__init__(appex='obj')
127
+ self.type = 'mesh'
128
+
129
+ def dump_obj(self, filename, obj):
130
+ export_obj(filename, obj)
131
+
132
+ def load_obj(self, filename):
133
+ return load_obj(filename)
134
+
135
+
136
+ def normalize_normal(mat):
137
+ mat = (mat / 255.0 * 2.0 - 1.0).astype('float32')
138
+ l1 = np.linalg.norm(mat, axis=2)
139
+ for j in range(3):
140
+ mat[:,:,j] /= (l1 + 1e-9)
141
+ return mat
142
+
143
+
144
+ class NormalIO(BaseIO):
145
+ def __init__(self, xyz='rgb'):
146
+ """
147
+ rgb: means the normal saved in the order of x: r ...
148
+ """
149
+ self._xyz = xyz
150
+
151
+ def read(self, filename):
152
+ normal = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
153
+ if self._xyz == 'rgb':
154
+ normal = normal[:, :, ::-1]
155
+
156
+ normal = normalize_normal(normal)
157
+ return normal
158
+
159
+
160
+ class DepthIO(BaseIO):
161
+ def __init__(self, bit=8):
162
+ super(DepthIO, self).__init__(appex='pfm')
163
+ assert bit in [8, 16]
164
+ scale = {8: 1, 16: 2}
165
+ self.bits = scale[bit]
166
+ self.dump_vis = True
167
+
168
+ def load(self, path):
169
+ """Read pfm file.
170
+ Args:
171
+ path (str): path to file
172
+
173
+ Returns:
174
+ tuple: (data, scale)
175
+ """
176
+
177
+ path = '%s.%s' % (path, self.appex)
178
+ with open(path, "rb") as file:
179
+
180
+ color = None
181
+ width = None
182
+ height = None
183
+ scale = None
184
+ endian = None
185
+
186
+ header = file.readline().rstrip()
187
+ if header.decode("ascii") == "PF":
188
+ color = True
189
+ elif header.decode("ascii") == "Pf":
190
+ color = False
191
+ else:
192
+ raise Exception("Not a PFM file: " + path)
193
+
194
+ dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
195
+ if dim_match:
196
+ width, height = list(map(int, dim_match.groups()))
197
+ else:
198
+ raise Exception("Malformed PFM header.")
199
+
200
+ scale = float(file.readline().decode("ascii").rstrip())
201
+ if scale < 0:
202
+ # little-endian
203
+ endian = "<"
204
+ scale = -scale
205
+ else:
206
+ # big-endian
207
+ endian = ">"
208
+
209
+ data = np.fromfile(file, endian + "f")
210
+ shape = (height, width, 3) if color else (height, width)
211
+
212
+ data = np.reshape(data, shape)
213
+ data = np.flipud(data)
214
+
215
+ return data, scale
216
+
217
+ def dump(self, path, image, scale=1):
218
+ """Write pfm file.
219
+
220
+ Args:
221
+ path (str): pathto file
222
+ image (array): data
223
+ scale (int, optional): Scale. Defaults to 1.
224
+ """
225
+
226
+ v_uts.mkdir_if_need(os.path.dirname(path))
227
+ path = path + '.pfm'
228
+
229
+ with open(path, "wb") as file:
230
+ color = None
231
+
232
+ if image.dtype.name != "float32":
233
+ raise Exception("Image dtype must be float32.")
234
+
235
+ image = np.flipud(image)
236
+
237
+ if len(image.shape) == 3 and image.shape[2] == 3: # color image
238
+ color = True
239
+ elif (
240
+ len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
241
+ ): # greyscale
242
+ color = False
243
+ else:
244
+ raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
245
+
246
+ file.write("PF\n" if color else "Pf\n".encode())
247
+ file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
248
+
249
+ endian = image.dtype.byteorder
250
+
251
+ if endian == "<" or endian == "=" and sys.byteorder == "little":
252
+ scale = -scale
253
+
254
+ file.write("%f\n".encode() % scale)
255
+ image.tofile(file)
256
+
257
+ if self.dump_vis:
258
+ self.dump_visualize(path[:-4], image, self.bits)
259
+
260
+ @staticmethod
261
+ def to8UC3(depth, scale=1000):
262
+ """
263
+ Convert depth image to 8UC3 format.
264
+ """
265
+ h, w = depth.shape
266
+ max_depth = (256.0 ** 3 - 1) / scale
267
+
268
+ # Clip depth values exceeding the maximum depth
269
+ depth = np.clip(depth, 0, max_depth)
270
+
271
+ # Scale the depth values
272
+ value = depth * scale
273
+
274
+ # Split the depth values into three channels
275
+ ch = np.zeros((h, w, 3), dtype=np.uint8)
276
+ ch[:, :, 0] = np.uint8(value / (256 ** 2))
277
+ ch[:, :, 1] = np.uint8((value % (256 ** 2)) / 256)
278
+ ch[:, :, 2] = np.uint8(value % 256)
279
+
280
+ return ch
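+ # The three uint8 channels store v // 256**2, (v % 256**2) // 256 and v % 256 of the
+ # scaled depth v, i.e. a 24-bit encoding with 1/scale metre resolution (see read8UC3).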
281
+
282
+
283
+ @staticmethod
284
+ def read8UC3(depth, scale=1000):
285
+ """
286
+ Convert 8UC3 image to scaled depth representation.
287
+ """
288
+ if isinstance(depth, str):
289
+ depth = cv2.imread(depth, cv2.IMREAD_UNCHANGED)
290
+
291
+ # Merge the three channels into a single depth value
292
+ depth_uint16 = depth[:, :, 0] * (256 ** 2) + \
293
+ depth[:, :, 1] * 256 + depth[:, :, 2]
294
+ # Convert depth to the scaled representation
295
+ depth = depth_uint16.astype(np.float32) / scale
296
+
297
+ return depth
298
+
299
+ @staticmethod
300
+ def dump_visualize(path, depth, bits=1):
301
+
302
+ depth_min = depth.min()
303
+ depth_max = depth.max()
304
+
305
+ max_val = (2**(8*bits))-1
306
+
307
+ if depth_max - depth_min > np.finfo("float").eps:
308
+ out = max_val * (depth - depth_min) / (depth_max - depth_min)
309
+ else:
310
+ out = np.zeros(depth.shape, dtype=depth.dtype)
311
+
312
+ if bits == 1:
313
+ cv2.imwrite(path + ".png", out.astype("uint8"))
314
+ elif bits == 2:
315
+ cv2.imwrite(path + ".png", out.astype("uint16"))
316
+
317
+ return
318
+
319
+ @staticmethod
320
+ def load_png(path):
321
+ depth = cv2.imread(path, cv2.IMREAD_UNCHANGED)
322
+ return depth
323
+
324
+ @staticmethod
325
+ def dump_png(path, depth, bits=2, max_depth=20.0):
326
+ assert (path.endswith(".png"))
327
+ max_val = (2**(8*bits))-1
328
+ depth = depth / max_depth * max_val
329
+ cv2.imwrite(path, depth.astype("uint16"))
330
+
331
+ @staticmethod
332
+ def read_depth(filename, scale=6000, sz=None, is_disparity=False):
333
+ if not hasTorch:
334
+ return None
335
+
336
+ depth = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
337
+ depth = np.float32(depth) / scale
338
+ if sz:
339
+ h, w = sz
340
+ depth = cv2.resize(depth, (w, h),
341
+ interpolation=cv2.INTER_NEAREST)
342
+
343
+ depth = torch.from_numpy(depth)
344
+
345
+ if is_disparity: # convert to depth
346
+ depth = 1.0 / torch.clamp(depth, min=1e-10)
347
+
348
+ return depth
349
+
350
+ @staticmethod
+ def write_depth(path, depth, grayscale, bits=1):
351
+ """Write depth map to png file.
352
+
353
+ Args:
354
+ path (str): filepath without extension
355
+ depth (array): depth
356
+ grayscale (bool): use a grayscale colormap?
357
+ """
358
+ if not grayscale:
359
+ bits = 1
360
+
361
+ if not np.isfinite(depth).all():
362
+ depth=np.nan_to_num(depth, nan=0.0, posinf=0.0, neginf=0.0)
363
+ print("WARNING: Non-finite depth values present")
364
+
365
+ depth_min = depth.min()
366
+ depth_max = depth.max()
367
+
368
+ max_val = (2**(8*bits))-1
369
+
370
+ if depth_max - depth_min > np.finfo("float").eps:
371
+ out = max_val * (depth - depth_min) / (depth_max - depth_min)
372
+ else:
373
+ out = np.zeros(depth.shape, dtype=depth.dtype)
374
+
375
+ if not grayscale:
376
+ out = cv2.applyColorMap(np.uint8(out), cv2.COLORMAP_INFERNO)
377
+
378
+ if bits == 1:
379
+ cv2.imwrite(path + ".png", out.astype("uint8"))
380
+ elif bits == 2:
381
+ cv2.imwrite(path + ".png", out.astype("uint16"))
382
+
383
+ return
384
+
385
+
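+ # NOTE: this second NormalIO definition shadows the NormalIO(xyz=...) class defined above.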
386
+ class NormalIO(BaseIO):
387
+ def __init__(self):
388
+ super(NormalIO, self).__init__(appex='npy')
389
+ self.dump_vis = False
390
+
391
+ @staticmethod
392
+ def read_normal(filename, sz=None, to_torch=False):
393
+ if not hasTorch:
394
+ return None
395
+ if not os.path.exists(filename):
396
+ h, w = sz
397
+ return torch.ones((h, w, 3)) * 0.3
398
+
399
+ image = cv2.imread(filename)[:, :, ::-1]
400
+ image = np.float32(image)
401
+ image = (image / 127.5 - 1)
402
+ if sz:
403
+ h, w = sz
404
+ image = cv2.resize(image, (w, h),
405
+ interpolation=cv2.INTER_NEAREST)
406
+
407
+ return torch.from_numpy(image)
408
+
409
+ def to8UC3(self, normal):
410
+ return np.uint8((normal + 1) * 127.5)
411
+
412
+
413
+ class FlowIO(BaseIO):
414
+ def __init__(self):
415
+ super(FlowIO, self).__init__(appex='npy')
416
+ self.dump_vis = False
417
+
418
+ def normalize(self, flow, shape=None):
419
+ if shape is None:
420
+ shape = flow.shape[:2]
421
+
422
+ flow[:, :, 0] /= shape[1]
423
+ flow[:, :, 1] /= shape[0]
424
+ return flow
425
+
426
+ def denormalize(self, flow, shape=None):
427
+ if shape is None:
428
+ shape = flow.shape[:2]
429
+
430
+ flow[:, :, 0] *= shape[1]
431
+ flow[:, :, 1] *= shape[0]
432
+ return flow
433
+
434
+ def visualization(self, flow):
435
+ pass
436
+
437
+ def load(self, path, shape=None):
438
+ path = path + '.npy'
439
+ flow = np.load(path)
440
+ flow = self.denormalize(flow, shape)
441
+ assert flow is not None
442
+ return flow
443
+
444
+ def dump(self, path, flow):
445
+ v_uts.mkdir_if_need(os.path.dirname(path))
446
+ path = path + '.npy'
447
+ flow = self.normalize(flow)
448
+ np.save(path, flow)
449
+
450
+ if self.dump_vis:
451
+ self.dump_visualize(path[:-4], flow)
452
+
453
+ def dump_visualize(self, path, flow):
454
+ _, flow_c = v_uts.flow2color(flow)
455
+ cv2.imwrite(path + '.png', flow_c)
456
+
457
+
458
+ class VideoIO(BaseIO):
459
+ def __init__(self, longside_len=None):
460
+ super(VideoIO, self).__init__()
461
+ self.longside_len = longside_len
462
+
463
+ def get_fps(self, path):
464
+ vidcap = cv2.VideoCapture(path)
465
+ return vidcap.get(cv2.CAP_PROP_FPS)
466
+
467
+ def load_first_frame(self, path):
468
+ import skvideo.io as vio
469
+ video = vio.vreader(path)
470
+ frame = next(video)
471
+ if self.longside_len is not None:
472
+ frame = v_uts.resize2maxsize(frame, self.longside_len)
473
+
474
+ return frame
475
+
476
+ def load(self, path, sample_rate=1, max_len=1e10,
477
+ load_to_dir=False,
478
+ dir_name=None,
479
+ pre_len=5,
480
+ save_transform=None):
481
+ import skvideo.io as vio
482
+
483
+ def default_transform(x):
484
+ if x.ndim == 2:
485
+ return x
486
+ if x.ndim == 3 and x.shape[2] == 3:
487
+ return x[:, :, ::-1]
488
+ return x
489
+
490
+ frames = []
491
+ reader = vio.vreader(path)
492
+
493
+ if load_to_dir:
494
+ v_uts.mkdir(dir_name)
495
+
496
+ if save_transform is None:
497
+ save_transform = lambda x : x
498
+
499
+ for count, frame in enumerate(reader):
500
+ if count == max_len:
501
+ break
502
+ if count % sample_rate == 0:
503
+ if self.longside_len is not None:
504
+ frame = v_uts.resize2maxsize(
505
+ frame, self.longside_len)
506
+ if load_to_dir:
507
+ img_file = f"{dir_name}/{count:05}.png"
508
+ frame = save_transform(frame)
509
+ cv2.imwrite(img_file, frame)
510
+ else:
511
+ frames.append(frame)
512
+
513
+ if not load_to_dir:
514
+ return frames
515
+
516
+
517
+ def load_till_end(self, path, sample_rate=1):
518
+ import skvideo.io as vio
519
+ frames = []
520
+ reader = vio.vreader(path)
521
+ count = 0
522
+ while True:
523
+ try:
524
+ frame = next(reader)
525
+ except:
526
+ break
527
+
528
+ if count % sample_rate == 0:
529
+ if self.longside_len is not None:
530
+ frame = v_uts.resize2maxsize(
531
+ frame, self.longside_len)
532
+ frames.append(frame)
533
+ count += 1
534
+
535
+ return frames
536
+
537
+ def load_w_cv(self, path, out_dir, sample_rate = 1, ext="jpg"):
538
+ v_uts.video_to_frame(path,
539
+ out_dir,
540
+ max_len=self.longside_len,
541
+ sample_rate=sample_rate,
542
+ ext=ext)
543
+
544
+ def dump_to_images(self, frames, image_path):
545
+ v_uts.mkdir_if_need(image_path)
546
+ for count, frame in tqdm(enumerate(frames)):
547
+ image_file = '%s/%04d.jpg' % (image_path, count)
548
+ cv2.imwrite(image_file, frame[:, :, ::-1])
549
+
550
+ def dump(self, path, frames, fps=30, lossless=False):
551
+ from moviepy.editor import ImageSequenceClip, VideoFileClip
552
+ if isinstance(frames[0], str):
553
+ frame_np = []
554
+ for frame in tqdm(frames):
555
+ cur_frame = cv2.imread(frame, cv2.IMREAD_UNCHANGED)[:, :, ::-1]
556
+ frame_np.append(cur_frame)
557
+ frames = frame_np
558
+
559
+ clip = ImageSequenceClip(frames, fps)
560
+ if lossless:
561
+ assert path.endswith('avi')
562
+ clip.write_videofile(path, codec='png')
563
+ else:
564
+ clip.write_videofile(path, fps=fps)
565
+
566
+ def dump_skv(self, path, frames, fps=30):
567
+ if frames[0].ndim == 2:
568
+ frames = [cv2.cvtColor(frame,cv2.COLOR_GRAY2RGB) for frame in frames]
569
+ else:
570
+ frames = [frame[:, :, ::-1] for frame in frames]
571
+ v_uts.frame_to_video_simple(frames, fps, video_name=path)
572
+ # import skvideo.io as vio
573
+ # fps = str(int(fps))
574
+ # vid_out = vio.FFmpegWriter(path,
575
+ # inputdict={'-r': fps},
576
+ # outputdict={
577
+ # '-vcodec': 'libx264',
578
+ # '-pix_fmt': 'yuv420p',
579
+ # '-r': fps,
580
+ # },
581
+ # verbosity=1)
582
+ # for idx, frame in enumerate(frames):
583
+ # vid_out.writeFrame(frame)
584
+ # vid_out.close()
585
+
586
+ def resave_video(self, video_file, start, end,
587
+ outvideo_file):
588
+ """
589
+
590
+ :param start: sec start
591
+ :param end: sec end
592
+ :return:
593
+ """
594
+ fps = self.get_fps(video_file)
595
+ frames = self.load(video_file)
596
+ start_frame = int(start * fps)
597
+ end_frame = int(end * fps)
598
+ frames = frames[start_frame:end_frame]
599
+ self.dump_skv(outvideo_file, frames, fps)
600
+
601
+ def frame2video(self, folder, output, ext=".jpg"):
602
+ image_files = v_uts.list_all_files(folder, exts=[ext])
603
+ frames = []
604
+ for name in tqdm(image_files):
605
+ frames.append(cv2.imread(name)[:, :, ::-1])
606
+
607
+ self.dump(output, frames)
608
+
609
+
610
+ class NpEncoder(json.JSONEncoder):
611
+ def default(self, obj):
612
+ if isinstance(obj, np.integer):
613
+ return int(obj)
614
+ if isinstance(obj, np.floating):
615
+ return float(obj)
616
+ if isinstance(obj, np.ndarray):
617
+ return obj.tolist()
618
+ return super(NpEncoder, self).default(obj)
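+ # Usage: json.dump(obj, f, cls=NpEncoder), as dump_json(..., w_np=True) below does,
+ # so numpy ints, floats and arrays serialize cleanly.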
619
+
620
+
621
+ def read2byte(filename):
622
+ with open(filename, 'rb') as f:
623
+ file_data = f.read()
624
+ return file_data
625
+
626
+
627
+ def decodeImage(bytesIo):
628
+ import whatimage
629
+ import pyheif
630
+ from PIL import Image
631
+
632
+ fmt = whatimage.identify_image(bytesIo)
633
+ if fmt in ['heic', 'avif']:
634
+ i = pyheif.read_heif(bytesIo)
635
+ # Convert to other file format like jpeg
636
+ pi = Image.frombytes(
637
+ mode=i.mode, size=i.size, data=i.data)
638
+ image = np.asarray(pi)
639
+ image = image[:, :, ::-1] # to BGR
640
+ return image
641
+ else:
642
+ return None
643
+
644
+
645
+ def image2Normal(imagePath):
646
+ from skimage import io
647
+ normal = io.imread(imagePath)
648
+ normal = ((np.float32(normal) / 255.0) * 2 - 1.0 )
649
+ return normal
650
+
651
+ def normal2Image(normal):
652
+ nm_pred_val = (normal + 1.) / 2.
653
+ nm_pred_val = np.uint8(nm_pred_val*255.)
654
+ return nm_pred_val
655
+
656
+
657
+ def dump_normal(filename, normal):
658
+ normal = normal2Image(normal)
659
+ cv2.imwrite(filename + '.png', normal)
660
+
661
+
662
+ def dump_prob2image(filename, array):
663
+ """
664
+ dump probility map to image when
665
+ array: [x, height, width] (x = 1, 3, 4)
666
+ """
667
+ class_num = array.shape[0]
668
+ # assert class_num <= 4
669
+ if class_num >= 4 :
670
+ print('warning: only save the first 3 channels')
671
+ array = array[:3, :, :]
672
+
673
+ if class_num == 2:
674
+ raise ValueError('not implement')
675
+
676
+ array = np.transpose(np.uint8(array * 255), (1, 2, 0))
677
+ if filename.endswith('.png'):
678
+ cv2.imwrite(filename, array)
679
+ return
680
+
681
+ cv2.imwrite(filename + '.png', array)
682
+ assert os.path.exists(filename + '.png')
683
+
684
+
685
+ def load_image2prob(filename):
686
+ if not filename.endswith('.png'):
687
+ filename = filename + '.png'
688
+
689
+ array = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
690
+ array = np.transpose(array, (2, 0, 1)) / 255
691
+
692
+ return array
693
+
694
+
695
+ def shape_match(images):
696
+ assert len(images) > 1
697
+ shape = images[0].shape[:2]
698
+ for image in images[1:]:
699
+ cur_shape = image.shape[:2]
700
+ if np.sum(np.abs(np.array(shape) - \
701
+ np.array(cur_shape))):
702
+ return False
703
+
704
+ return True
705
+
706
+ def append_apex(filename, appex):
707
+ filename = filename.split('.')
708
+ prefix = '.'.join(filename[:-1])
709
+ filetype = filename[-1]
710
+ return '%s_%s.%s' % (prefix, appex, filetype)
711
+
712
+ def load_json(json_file):
713
+ with open(json_file) as f:
714
+ res = json.load(f)
715
+ return res
716
+
717
+ def dump_numpy(filename, x: np.ndarray):
718
+ np.savetxt(filename, x, delimiter=' ', fmt='%1.6f')
719
+
720
+ def dump_json(filename, odgt, w_np=False):
721
+ with open(filename, 'w') as f:
722
+ if not w_np:
723
+ json.dump(odgt, f, indent=4)
724
+ else:
725
+ json.dump(odgt, f, indent=4, cls=NpEncoder)
726
+
727
+ def dump_jsonl(filename, odgt):
728
+ with open(filename, 'w') as file:
729
+ for entry in odgt:
730
+ json.dump(entry, file)
731
+ file.write('\n')
732
+
733
+ def dump_pair_data(image_list,
734
+ label_list,
735
+ outfile,
736
+ root='',
737
+ data_type='txt',
738
+ fields=None):
739
+
740
+ if fields is None:
741
+ fields = ["image", "segment"]
742
+
743
+ if data_type == 'txt':
744
+ fp = open(outfile, 'w')
745
+ for imagefile, labelfile in zip(image_list, label_list):
746
+ imagefile = imagefile.replace(root, '.')
747
+ labelfile = labelfile.replace(root, '.')
748
+ fp.write('%s %s\n' % (imagefile, labelfile))
749
+ fp.close()
750
+
751
+ elif data_type == "odgt":
752
+ odgt = []
753
+ for imagefile, labelfile in zip(image_list, label_list):
754
+ imagefile = imagefile.replace(root, '.')
755
+ labelfile = labelfile.replace(root, '.')
756
+ item = {fields[0]: imagefile,
757
+ fields[1]: labelfile}
758
+ odgt.append(item)
759
+ dump_json(outfile, odgt)
760
+
761
+
762
+ def save_xlsx(filename, dicts, sheets=None):
763
+ """
764
+ Save a list of dicts to an xlsx file.
765
+ """
766
+ with pd.ExcelWriter(filename, mode='w') as writer:
767
+ if sheets is None:
768
+ df1 = pd.DataFrame(dicts)
769
+ df1.to_excel(writer, index=False)
770
+ return
771
+ for sheet in sheets:
772
+ df1 = pd.DataFrame(dicts[sheet])
773
+ df1.to_excel(writer, sheet_name=sheet, index=False)
774
+
775
+ def load_xlsx(filename, sheets=None):
776
+ assert os.path.exists(filename) , f"File not found: {filename}"
777
+ if sheets is None:
778
+ df = pd.read_excel(filename)
779
+ dict = {}
780
+ for column in df.columns:
781
+ dict[column] = df[column].tolist()
782
+ else:
783
+ dict = {}
784
+ for sheet in sheets:
785
+ df = pd.read_excel(filename, sheet_name=sheet)
786
+ cur_dict = {}
787
+ for column in df.columns:
788
+ cur_dict[column] = df[column].tolist()
789
+ print(cur_dict.keys())
790
+ dict[sheet] = cur_dict
791
+ print(dict.keys())
792
+ return dict
793
+
794
+ def dump_lines(filename, file_list):
795
+ f = open(filename, 'w')
796
+ tbar = tqdm(file_list)
797
+ for i, elements in enumerate(tbar):
798
+ if isinstance(elements, (tuple, list)):
799
+ line = ' '.join(elements)
800
+ elif isinstance(elements, str):
801
+ line = elements
802
+ appex = '' if i == len(file_list) - 1 else '\n'
803
+ f.write('%s%s' % (line, appex))
804
+
805
+ f.close()
806
+
807
+
808
+ def load_lines(txt_file):
809
+ lines = [line.strip() for line in open(txt_file, 'r')]
810
+ return lines
811
+
812
+
813
+ def load_jsonl(jsonl_file):
814
+ # List to hold all JSON objects
815
+ data = []
816
+
817
+ # Open the file and read line by line
818
+ with open(jsonl_file, 'r') as file:
819
+ for line in file:
820
+ # Each line is a JSON object, parse it and append to the list
821
+ json_object = json.loads(line)
822
+ data.append(json_object)
823
+ return data
824
+
825
+ def load_yaml(yaml_file):
826
+ with open(yaml_file, "r") as f:
827
+ yaml_dict = yaml.safe_load(f)
828
+ return yaml_dict
829
+
830
+
831
+ def load_odgt(odgt):
832
+ try:
833
+ samples = [json.loads(x.rstrip()) \
834
+ for x in open(odgt, 'r')][0]
835
+ except:
836
+ samples = load_json(odgt)
837
+
838
+ print(samples[0].keys())
839
+ return samples
840
+
841
+ def fuse_odgt(odgt_files):
842
+ """
843
+ odgt_files:
844
+ """
845
+ odgt_full = []
846
+ for odgt_file in odgt_files:
847
+ odgt = load_odgt(odgt_file)
848
+ odgt_full = odgt_full + odgt
849
+
850
+ return odgt_full
851
+
852
+
853
+ def load_video_first_frame(video_name):
854
+ cap = cv2.VideoCapture(video_name)
855
+ if(cap.isOpened()):
856
+ ret, frame = cap.read()
857
+ else:
858
+ raise ValueError("can not read %s" % video_name)
859
+
860
+ return frame
861
+
862
+
863
+
867
+
868
+ def load_csv(csv_file):
869
+ import csv
870
+ lines = []
871
+ with open(csv_file) as f:
872
+ reader = csv.reader(f, delimiter=',')
873
+ for row in reader:
874
+ lines.append(row)
875
+ return lines[1:]
876
+
877
+
878
+ # cat multi files in to a single file
879
+ def cat_files(files, output):
880
+ all_lines = []
881
+ for filename in files:
882
+ lines = load_lines(filename)
883
+ all_lines = all_lines + lines
884
+ dump_lines(output, all_lines)
885
+
886
+
887
+ class SkipExist:
888
+ def __init__(self,
889
+ processor,
890
+ ioType='image',
891
+ need_res=False,
892
+ rerun=False):
893
+ self.ioType = ioType
894
+ self.io = IOShop(self.ioType).io
895
+ self.processor = processor
896
+ self.rerun = rerun
897
+ self.need_res = need_res
898
+
899
+ def __call__(self, *args, **kwargs):
900
+ assert 'filename' in kwargs
901
+ true_file = '%s.%s' % (kwargs['filename'], self.io.appex)
902
+
903
+ if os.path.exists(true_file) and (not self.rerun):
+ if self.need_res:
+ res = self.io.load(kwargs['filename'])
+ return res
+ return None
+
+ filename = kwargs['filename']
+ del kwargs['filename']
+ res = self.processor(*args, **kwargs)
+ self.io.dump(filename, res)
+ if self.need_res:
+ return res
912
+
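+ # Usage sketch (hypothetical paths): cached = SkipExist(expensive_fn, ioType='image', need_res=True)
+ # cached(arg, filename='/cache/result') only re-runs expensive_fn when /cache/result.jpg is missing.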
913
+
914
+ def dump_pkl(filename, data):
915
+ import pickle as pkl
916
+ with open(filename, "wb") as fl:
917
+ pkl.dump(data, fl)
918
+
919
+
920
+ def load_pkl(filename):
921
+ import pickle as pkl
922
+ with open(filename, 'rb') as fl:
923
+ res = pkl.load(fl)
924
+ return res
925
+
926
+
927
+ def write_pointcloud(filename, xyz_points, faces=None, rgb_points=None):
928
+ """
929
+ creates a .pkl file of the point clouds generated
930
+ """
931
+
932
+ assert xyz_points.shape[1] == 3,'Input XYZ points should be Nx3 float array'
933
+ if rgb_points is None:
934
+ rgb_points = np.ones(xyz_points.shape).astype(np.uint8) * 255
935
+ else:
936
+ rgb_points = rgb_points.astype(np.uint8)
937
+
938
+ assert xyz_points.shape == rgb_points.shape,\
939
+ f'Input RGB colors should be Nx3 {rgb_points.shape} float array \
940
+ and have same size as input XYZ points {xyz_points.shape}'
941
+
942
+ # Write header of .ply file
943
+ fid = open(filename,'wb')
944
+ fid.write(bytes('ply\n', 'utf-8'))
945
+ fid.write(bytes('format binary_little_endian 1.0\n', 'utf-8'))
946
+ fid.write(bytes('element vertex %d\n'%xyz_points.shape[0], 'utf-8'))
947
+ fid.write(bytes('property float x\n', 'utf-8'))
948
+ fid.write(bytes('property float y\n', 'utf-8'))
949
+ fid.write(bytes('property float z\n', 'utf-8'))
950
+ fid.write(bytes('property uchar red\n', 'utf-8'))
951
+ fid.write(bytes('property uchar green\n', 'utf-8'))
952
+ fid.write(bytes('property uchar blue\n', 'utf-8'))
953
+ fid.write(bytes('end_header\n', 'utf-8'))
954
+
955
+ # Write 3D points to .ply file
956
+ for i in range(xyz_points.shape[0]):
957
+ fid.write(bytearray(struct.pack("fffccc",xyz_points[i,0],xyz_points[i,1],xyz_points[i,2],
958
+ rgb_points[i,0].tostring(),rgb_points[i,1].tostring(),
959
+ rgb_points[i,2].tostring())))
960
+ if faces is not None:
961
+ for face in faces:
962
+ fid.write(struct.pack("<B", face[0]))
963
+ fid.write(struct.pack("<{}i".format(face[0]), *face[1]))
964
+
965
+ fid.close()
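+ # The result is a binary_little_endian PLY: one packed "fffccc" record per point
+ # (xyz as float32, rgb as uint8), followed by optional raw face records.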
966
+
967
+
968
+ def read_ply(filename):
969
+ # Load the PLY file
970
+ ply_data = PlyData.read(filename)
971
+
972
+ # Access the vertex data
973
+ vertex_data = ply_data['vertex']
974
+
975
+ # Extract x, y, z coordinates as a numpy array
976
+ points = np.vstack((vertex_data['x'], vertex_data['y'], vertex_data['z'])).T
977
+
978
+ return points
979
+
980
+
981
+ def load_obj(file_path):
982
+ verts = []
983
+ normals = []
984
+ uvs = []
985
+ material_colors = []
986
+ texture_images = []
987
+ texture_atlas = []
988
+
989
+ faces_verts = []
990
+ faces_normals = []
991
+ faces_textures = []
992
+ faces_materials = []
993
+
994
+ with open(file_path, 'r') as file:
995
+ for line in file:
996
+ if line.startswith('v '):
997
+ vertex = [float(v) for v in line.split()[1:]]
998
+ verts.append(vertex)
999
+ elif line.startswith('vn '):
1000
+ normal = [float(n) for n in line.split()[1:]]
1001
+ normals.append(normal)
1002
+ elif line.startswith('vt '):
1003
+ uv = [float(u) for u in line.split()[1:]]
1004
+ uvs.append(uv)
1005
+ elif line.startswith("mtllib "):
1006
+ mtl_name = line.split()[1]
1007
+ elif line.startswith('vc '):
1008
+ color = [float(c) for c in line.split()[1:]]
1009
+ material_colors.append(color)
1010
+ elif line.startswith('usemtl '):
1011
+ material = line.split()[1]
1012
+ texture_images.append(material)
1013
+ elif line.startswith('f '):
1014
+ face_data = line.split()[1:]
1015
+ face_verts = []
1016
+ face_normals = []
1017
+ face_textures = []
1018
+ for face in face_data:
1019
+ res = face.split('/')
1020
+ vert = res[0]
1021
+ face_verts.append(int(vert))
1022
+ if len(res) >= 2 and res[1] != '':
+ texture = res[1]
+ face_textures.append(int(texture))
+ if len(res) == 3 and res[2] != '':
+ normal = res[2]
+ face_normals.append(int(normal))
1028
+ faces_verts.append(face_verts)
1029
+ faces_normals.append(face_normals)
1030
+ faces_textures.append(face_textures)
1031
+ faces_materials.append(len(texture_images) - 1)
1032
+
1033
+ mtl_file = f"{os.path.dirname(file_path)}/{mtl_name}"
1034
+ with open(mtl_file, 'r') as file:
1035
+ for line in file:
1036
+ if line.startswith("map_Kd"):
1037
+ image_name = line.split()[1]
1038
+ break
1039
+
1040
+ assert len(texture_images) == 1
1041
+ texture_name = texture_images[0]
1042
+
1043
+ image = cv2.imread(f"{os.path.dirname(file_path)}/{image_name}")
1044
+ properties = Aux(
1045
+ normals=np.array(normals),
1046
+ verts_uvs=np.array(uvs),
1047
+ material_colors=DEFAULT_MATERIAL,
1048
+ texture_images={texture_name: np.float32(image)/ 255.0},
1049
+ texture_atlas=None)
1050
+
1051
+ faces_verts=np.array(faces_verts)
1052
+ num_faces = faces_verts.shape[0]
1053
+ faces = Faces(
1054
+ verts_idx=faces_verts,
1055
+ normals_idx=np.ones(faces_verts.shape) * -1,
1056
+ textures_idx=np.array(faces_textures),
1057
+ materials_idx=np.zeros(num_faces))
1058
+
1059
+ obj = Obj(np.array(verts), faces, properties)
1060
+ return obj
1061
+
1062
+
1063
+ def export_obj(filename, obj,
1064
+ include_normals=False,
1065
+ include_textures=True):
1066
+ """
1067
+ Export the given object to an .obj file with optional normals and textures.
1068
+
1069
+ Args:
1070
+ filename (str): Path to the output .obj file (without the extension).
1071
+ obj (namedtuple): Object containing vertices, faces, and properties.
1072
+ include_normals (bool): Flag to include normals in the .obj file.
1073
+ include_textures (bool): Flag to include textures in the .obj file.
1074
+ """
1075
+ material_name = list(obj.properties.texture_images.keys())[0]
1076
+
1077
+ # Write obj file
1078
+ name = os.path.basename(filename)
1079
+ with open(filename + ".obj", "w") as f:
1080
+ f.write("\n")
1081
+
1082
+ if include_textures:
1083
+ f.write(f"mtllib {name}.mtl\n")
1084
+ f.write("\n")
1085
+
1086
+ for vert in obj.verts:
1087
+ x, y, z = vert
1088
+ f.write(f"v {x} {y} {z}\n")
1089
+
1090
+ if include_textures:
1091
+ for uv in obj.properties.verts_uvs:
1092
+ x, y = uv
1093
+ f.write(f"vt {x} {y}\n")
1094
+ f.write(f"usemtl {material_name}\n")
1095
+
1096
+ num_faces = obj.faces.verts_idx.shape[0]
1097
+ for i in range(num_faces):
1098
+ f0, f1, f2 = obj.faces.verts_idx[i]
1099
+ if include_textures:
1100
+ t0, t1, t2 = obj.faces.textures_idx[i]
1101
+ if t0 == -1:
1102
+ f.write(f"f {f0} {f1} {f2}\n")
1103
+ continue
1104
+ f.write(f"f {f0}/{t0} {f1}/{t1} {f2}/{t2}\n")
1105
+ else:
1106
+ f.write(f"f {f0} {f1} {f2}\n")
1107
+
1108
+ # Write mtl file
1109
+ if include_textures:
1110
+ output_dir = os.path.dirname(filename)
1111
+ with open(f"{output_dir}/{name}.mtl", "w") as f:
1112
+ f.write(f"newmtl {material_name}\n")
1113
+ f.write(f"map_Kd {name}.png\n")
1114
+
1115
+ material_colors = obj.properties.material_colors[material_name]
1116
+ r, g, b = material_colors["ambient_color"]
1117
+ f.write(f"Ka {r} {g} {b}\n")
1118
+ r, g, b = material_colors["diffuse_color"]
1119
+ f.write(f"Kd {r} {g} {b}\n")
1120
+ r, g, b = material_colors["specular_color"]
1121
+ f.write(f"Ks {r} {g} {b}\n")
1122
+ s = material_colors["shininess"]
1123
+ f.write(f"Ns {s}\n")
1124
+
1125
+ # Save texture image
1126
+ image = obj.properties.texture_images[material_name] * 255
1127
+ texture_img = f"{output_dir}/{name}.png"
1128
+ cv2.imwrite(texture_img, image)
1129
+
1130
+ return
1131
+
1132
+
1133
+ def resave_to_video():
1134
+ folder = "/Users/peng/Downloads/DenseAR/Mesh/"
1135
+
1136
+ vname = "0037438511"
1137
+ image_num = 125
1138
+ frames = []
1139
+ d_frames = []
1140
+ crop = [0, 650, 1080, 1270]
1141
+ for i in tqdm(range(image_num)):
1142
+ name = f"{folder}/{vname}/{i}.jpg"
1143
+ d_name = f"{folder}/{vname}/{i}.tiff"
1144
+ img = np.array(Image.open(name))
1145
+ depth = np.array(Image.open(d_name))
1146
+ if img is None:
1147
+ continue
1148
+ img = img[crop[0]:crop[2], crop[1]:crop[3]]
1149
+ depth = depth[crop[0]:crop[2], crop[1]:crop[3]]
1150
+ depth = 1.0 / np.maximum(depth, 1e-10)
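+ # NOTE: p_uts is not imported in this module; depth2color is assumed to come from
+ # an external visualization helper.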
1151
+ depth = p_uts.depth2color(depth, max_d=50)
1152
+ frames.append(img)
1153
+ d_frames.append(depth)
1154
+
1155
+ vio = VideoIO()
1156
+ video_file = f"{folder}/{vname}.mp4"
1157
+ d_video_file = f"{folder}/{vname}_d.mp4"
1158
+ vio.dump_skv(video_file, frames, fps=24)
1159
+ vio.dump_skv(d_video_file, d_frames, fps=24)
1160
+
1161
+
1162
+ def test_depth_8uc3_encode():
1163
+ depth = np.random.rand(480, 640) * 200
1164
+ dio = DepthIO()
1165
+ depth_encode = dio.to8UC3(depth)
1166
+ depth_decode = dio.read8UC3(depth_encode)
1167
+ print(depth, depth_decode)
1168
+ assert np.sum(np.abs(depth - depth_decode)) / (480 * 640) < 1e-3
1169
+
1170
+
1171
+ ########### copy from gta code ################
1172
+ @functools.lru_cache()
1173
+ def build_mesh(w, h):
1174
+ w = np.linspace(-1.0, 1.0, num=w, dtype=np.float32)
1175
+ h = np.linspace(1.0, -1.0, num=h, dtype=np.float32)
1176
+ return np.stack(np.meshgrid(w, h), axis=0)
1177
+
1178
+
1179
+ def build_proj_matrix(fov, aspect):
1180
+ proj = np.zeros((4, 4))
1181
+ proj[0, 0] = 1.0 / np.tan(np.radians(fov / 2)) / aspect
1182
+ proj[1, 1] = 1.0 / np.tan(np.radians(fov / 2))
1183
+ proj[2, 2] = 0.00001502 # reverse-engineered get from shader
1184
+ proj[2, 3] = 0.15000225 # reverse-engineered get from shader
1185
+ proj[3, 2] = -1.0
1186
+ return proj
1187
+
1188
+
1189
+ def zbuffer_to_depth(zbuffer, fov):
1190
+ height, width = zbuffer.shape[:2]
1191
+ aspect = width / height
1192
+
1193
+ mesh = build_mesh(width, height)
1194
+
1195
+ if len(zbuffer.shape) != 3:
1196
+ zbuffer = np.expand_dims(zbuffer, 0)
1197
+
1198
+ pcloud = np.concatenate((mesh, zbuffer, np.ones_like(zbuffer)), 0)
1199
+ pcloud = pcloud.reshape(4, height * width)
1200
+
1201
+ proj_matrix = build_proj_matrix(fov, aspect)
1202
+
1203
+ pcloud = np.linalg.inv(proj_matrix) @ pcloud
1204
+ depth = -pcloud[2] / pcloud[3]
1205
+
1206
+ focal_cv = proj_matrix[0, 0] * width / 2.0
1207
+
1208
+ return depth.reshape(height, width), focal_cv
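+ # The z-buffer is unprojected through the inverse of the reverse-engineered GTA
+ # projection matrix; focal_cv is the equivalent pinhole focal length in pixels.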
1209
+
1210
+ def test_zbuffer_to_depth():
1211
+ # root = "E:/Dataset/GTA/Stereo_0/"
1212
+ # name = root + "1-130423915874"
1213
+ name = "E:/depth_video/0036696165/1"
1214
+ config = load_json(name + ".json")
1215
+ fov = config["fov"]
1216
+ zbuffer = cv2.imread(name + ".tiff", cv2.IMREAD_UNCHANGED)
1217
+ depth, focal = zbuffer_to_depth(zbuffer, fov)
1218
+ print(depth)
1219
+
1220
+ def fuse_frames_of_depth_video():
1221
+ """
1222
+ frames: list of images or video
1223
+ """
1224
+ def frame_to_video(video_dir, video_name):
1225
+ frames = v_uts.list_all_files(video_dir, exts=['jpg'])
1226
+ rgb_video = f"{video_name}.mp4"
1227
+ depth_video = f"{video_name}_d.avi"
1228
+ cam_file = f"{video_name}.json"
1229
+
1230
+ dio = DepthIO()
1231
+ imgs = []
1232
+ depths = []
1233
+ cams = []
1234
+ print("seq len:", len(frames))
1235
+ for i, frame in tqdm(enumerate(frames)):
1236
+ name = f"{video_dir}/{i}.jpg"
1237
+ d_name = f"{video_dir}/{i}.tiff"
1238
+ c_name = f"{video_dir}/{i}.json"
1239
+ img = np.array(Image.open(name))
1240
+ depth = np.array(Image.open(d_name))
1241
+ cam = load_json(c_name)
1242
+ depth, focal = zbuffer_to_depth(depth, cam['fov'])
1243
+ depth = dio.to8UC3(depth)
1244
+ imgs.append(img)
1245
+ depths.append(depth)
1246
+ cam['focal'] = focal
1247
+ cams.append(cam)
1248
+ # if i > 30:
1249
+ # break
1250
+
1251
+ vio = VideoIO()
1252
+ vio.dump(rgb_video, imgs)
1253
+ vio.dump(depth_video, depths, lossless=True)
1254
+ dump_json(cam_file, cams)
1255
+
1256
+ folder = "E:/depth_video/"
1257
+ output = "E:/depth_video_resave/"
1258
+
1259
+ v_uts.mkdir_if_need(output)
1260
+ folder_names = v_uts.list_all_folders(folder)
1261
+ for folder_name in tqdm(folder_names[1:]):
1262
+ folder_name = folder_name.replace('\\', '/')
1263
+ vid_name = folder_name.split('/')[-2]
1264
+ print(folder_name, vid_name)
1265
+ output_video = f"{output}/{vid_name}"
1266
+ frame_to_video(folder_name, video_name=output_video)
1267
+ # break
1268
+
1269
+
1270
+
1299
+
1300
+ def get_sheet_list(sheet_dict, sheets=None, key="url"):
+ # guard against sheets=None, which would otherwise make the zip() below fail
+ sheet_names = ["default"] if sheets is None else sheets
+ images_list = [sheet_dict[key]] if sheets is None else [sheet_dict[name][key] for name in sheets]
+ images_full = []
+
+ for images, sheet in zip(images_list, sheet_names):
+ print(f"{sheet}: {len(images)}")
+ images_full = images_full + images
+
+ return images_full
1309
+
1310
+ def test_load_save_obj():
1311
+ image_name = "000000243355_zebra"
1312
+ obj = f"./unit_test/{image_name}.obj"
1313
+ obj = load_obj(obj)
1314
+ export_obj(f"./unit_test/{image_name}_resave", obj)
1315
+
1316
+
1317
+
1318
+
1319
+ if __name__ == '__main__':
1320
+ # test = [(1,2), (3,4)]
1321
+ # dump_pkl('test.pkl', test)
1322
+ # print(load_pkl('test.pkl'))
1323
+ # xyz = np.random.rand(1000, 3)
1324
+ # write_pointcloud("test.ply", xyz)
1325
+
1326
+ # xyz = np.random.rand(1000, 3)
1327
+ # write_pointcloud("test.ply", xyz)
1328
+ # pass
1329
+ # test_depth_8uc3_encode()
1330
+ # test_zbuffer_to_depth()
1331
+ # fuse_frames_of_depth_video()
1332
+ test_load_save_obj()
llm_requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ !pip install -q -U bitsandbytes
2
+ !pip install -q -U git+https://github.com/huggingface/transformers.git
3
+ !pip install -q -U git+https://github.com/huggingface/peft.git
4
+ !pip install -q -U git+https://github.com/huggingface/accelerate.git
5
+ !pip install -q -U datasets scipy ipywidgets matplotlib
mixtral_test.py ADDED
@@ -0,0 +1,46 @@
1
+ import torch
2
+ from mixtral_tune import formatting_func_Edit
3
+ from peft import PeftModel
4
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
5
+
6
+ model_root = "/mnt/bn/wp-maliva-bytenas/mlx/users/peng.wang/playground/model/checkpoint_bk/"
7
+ output_root = "/opt/tiger/llm"
8
+
9
+ ######### Tune model with Mixtral Instruct 7B #########
10
+ base_model_id = f"{model_root}/Mistral-7B-Instruct-v0.2"
11
+ base_model_id = f"{model_root}/Mixtral-8x7B-Instruct-v0.1"
12
+ base_model_name = "mixtral-7b"
13
+ project = "edit-finetune"
14
+ run_name = base_model_name + "-" + project
15
+ output_dir = f"{output_root}/{run_name}"
16
+ step=100
17
+
18
+ bnb_config = BitsAndBytesConfig(
19
+ load_in_4bit=True,
20
+ bnb_4bit_use_double_quant=True,
21
+ bnb_4bit_compute_dtype=torch.bfloat16
22
+ )
23
+ base_model = AutoModelForCausalLM.from_pretrained(
24
+ base_model_id,
25
+ quantization_config=bnb_config,
26
+ device_map="auto",
27
+ trust_remote_code=True,
28
+ use_auth_token=True
29
+ )
30
+ tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True)
31
+ ft_model = base_model
32
+ # ft_model = PeftModel.from_pretrained(base_model, f"{output_dir}/checkpoint-{step}")
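+ # NOTE: with the PeftModel line above commented out, this script evaluates the raw base model.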
33
+ # eval_prompt = " Given an Edit Action: apply a Gingham filter for an image,what is its edit type? "
34
+
35
+ example = {"edit": " apply a Gingham filter for an image"}
36
+ example = {"edit": " make the image modern furnished"}
37
+ eval_prompt = formatting_func_Edit(example, is_train=False)
38
+ model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
39
+
40
+ ft_model.eval()
41
+ with torch.no_grad():
42
+ output = tokenizer.decode(
43
+ ft_model.generate(**model_input, max_new_tokens=50, repetition_penalty=1.15)[0],
44
+ skip_special_tokens=True)
45
+ print(output)
46
+
mixtral_tune.py ADDED
@@ -0,0 +1,202 @@
1
+ import os
2
+ import torch
3
+ import transformers
4
+ import matplotlib.pyplot as plt
5
+
6
+ from datetime import datetime
7
+ from functools import partial
8
+
9
+ from peft import LoraConfig, get_peft_model
10
+ from peft import prepare_model_for_kbit_training
11
+
12
+ from datasets import load_dataset
13
+ from accelerate import FullyShardedDataParallelPlugin, Accelerator
14
+ from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
15
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
16
+
17
+
18
+ def formatting_func_QA(example):
19
+ text = f"### Question: Given an image prompt {example['input']}\n give me random Edit Action and the output prompt \n ### Answer: Here is the edit action {example['edit']}, and here is the output {example['output']}"
20
+ return text
21
+
22
+ def formatting_func_Edit(example, is_train=True):
23
+ text = f"### Categorizes image editing actions, outputting classifications in the format 'Edit Class: A,B,C'. In this format, 'A' represents whether the edit is 'Global' or 'Local', and 'B' denotes the specific type of manipulation, such as 'Filter', 'Stylization', 'SceneChange', etc. 'C' denotes a specified 'B' such as 'FujiFilter', 'Part' etc. This structured approach provides clear and concise information, facilitating easy understanding of the edit class. The GPT remains committed to a formal, user-friendly communication style, ensuring the classifications are accessible and precise, without delving into technical complexities.\
24
+ Question: Given the Edit Action {example['edit']}, what is its edit type?\n"
25
+ if is_train:
26
+ text = text + f"### Answer: Edit Class: {example['class']}"
27
+
28
+ return text
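+ # Example (hypothetical record): {"edit": "apply a Gingham filter", "class": "Global,Filter,Gingham"}
+ # renders the instruction above plus, at train time, "### Answer: Edit Class: Global,Filter,Gingham".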
29
+
30
+ def plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset):
31
+ lengths = [len(x['input_ids']) for x in tokenized_train_dataset]
32
+ lengths += [len(x['input_ids']) for x in tokenized_val_dataset]
33
+ print(len(lengths))
34
+
35
+ # Plotting the histogram
36
+ plt.figure(figsize=(10, 6))
37
+ plt.hist(lengths, bins=10, alpha=0.7, color='blue')
38
+ plt.xlabel('Length of input_ids')
39
+ plt.ylabel('Frequency')
40
+ plt.title('Distribution of Lengths of input_ids')
41
+
42
+ # Saving the figure to a file
43
+ plt.savefig('./experiments/figure.png') # Spe
44
+
45
+ def generate_and_tokenize_prompt(prompt, formatting=None):
46
+ return tokenizer(formatting(prompt))
47
+
48
+
49
+ def generate_and_tokenize_prompt2(prompt, max_length=512, formatting=None):
50
+ result = tokenizer(
51
+ formatting(prompt),
52
+ truncation=True,
53
+ max_length=max_length,
54
+ padding="max_length",
55
+ )
56
+ result["labels"] = result["input_ids"].copy()
57
+ return result
58
+
59
+
60
+ def print_trainable_parameters(model):
61
+ """
62
+ Prints the number of trainable parameters in the model.
63
+ """
64
+ trainable_params = 0
65
+ all_param = 0
66
+ for _, param in model.named_parameters():
67
+ all_param += param.numel()
68
+ if param.requires_grad:
69
+ trainable_params += param.numel()
70
+ print(
71
+ f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
72
+ )
73
+
74
+
75
+ def train():
76
+ generate_and_tokenize = partial(generate_and_tokenize_prompt2,
77
+ max_length=128,
78
+ formatting=formatting_func_Edit)
79
+
80
+ # configs here latter change
81
+ model_root = "/mnt/bn/wp-maliva-bytenas/mlx/users/peng.wang/playground/model/checkpoint_bk/"
82
+ output_root = "/mlx/users/peng.wang/playground/data/chat_edit/models/llm"
83
+ output_root = "/opt/tiger/llm"
84
+ os.makedirs(output_root, exist_ok=True)
85
+
86
+ ######### Tune model with Mixtral MoE #########
87
+ base_model_id = f"{model_root}/Mixtral-8x7B-v0.1"
88
+ base_model_id = f"{model_root}/Mixtral-8x7B-Instruct-v0.1"
89
+ base_model_name = "mixtral-8x7b"
90
+
91
+ # ######### Tune model with Mixtral Instruct 7B #########
92
+ # base_model_id = f"{model_root}/Mistral-7B-Instruct-v0.2"
93
+ # base_model_name = "mixtral-7b"
94
+
95
+ ######### Instructions #########
96
+ train_json = "./data/chat_edit/assets/test200/edit_instructions_v0.jsonl"
97
+ val_json = train_json
98
+ project = "edit-finetune"
99
+ run_name = base_model_name + "-" + project
100
+ output_dir = f"{output_root}/{run_name}"
101
+
102
+ fsdp_plugin = FullyShardedDataParallelPlugin(
103
+ state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
104
+ optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
105
+ )
106
+ accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
107
+
108
+ train_dataset = load_dataset('json', data_files=train_json, split='train')
109
+ eval_dataset = load_dataset('json', data_files=val_json, split='train')
110
+ tokenizer = AutoTokenizer.from_pretrained(
111
+ base_model_id,
112
+ padding_side="left",
113
+ add_eos_token=True,
114
+ add_bos_token=True,
115
+ )
116
+ tokenizer.pad_token = tokenizer.eos_token
117
+ tokenized_train_dataset = train_dataset.map(generate_and_tokenize)
118
+ tokenized_val_dataset = eval_dataset.map(generate_and_tokenize)
119
+ print(tokenized_train_dataset[1]['input_ids'])
120
+ plot_data_lengths(tokenized_train_dataset, tokenized_val_dataset)
121
+
122
+
123
+ # load model and do finetune
124
+ bnb_config = BitsAndBytesConfig(
125
+ load_in_4bit=True,
126
+ bnb_4bit_use_double_quant=True,
127
+ bnb_4bit_compute_dtype=torch.bfloat16
128
+ )
129
+ model = AutoModelForCausalLM.from_pretrained(
130
+ base_model_id, quantization_config=bnb_config, device_map="auto")
131
+ model.gradient_checkpointing_enable()
132
+ model = prepare_model_for_kbit_training(model)
133
+ print(model)
134
+
135
+ config = LoraConfig(
136
+ r=32,
137
+ lora_alpha=64,
138
+ target_modules=[
139
+ "q_proj",
140
+ "k_proj",
141
+ "v_proj",
142
+ "o_proj",
143
+ "w1",
144
+ "w2",
145
+ "w3",
146
+ "lm_head",
147
+ ],
148
+ bias="none",
149
+ lora_dropout=0.01, # Conventional
150
+ task_type="CAUSAL_LM",
151
+ )
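+ # LoRA adapters are attached to the attention projections (q/k/v/o), the Mixtral
+ # expert MLP weights (w1/w2/w3) and the lm_head; all other weights stay frozen.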
152
+
153
+ model = get_peft_model(model, config)
154
+ print_trainable_parameters(model)
155
+ print(model)
156
+
157
+ ## RUN training ##
158
+ tokenizer = AutoTokenizer.from_pretrained(
159
+ base_model_id,
160
+ padding_side="left",
161
+ add_eos_token=True,
162
+ add_bos_token=True,
163
+ )
164
+ tokenizer.pad_token = tokenizer.eos_token
165
+
166
+ if torch.cuda.device_count() > 1: # If more than 1 GPU
167
+ model.is_parallelizable = True
168
+ model.model_parallel = True
169
+
170
+ trainer = transformers.Trainer(
171
+ model=model,
172
+ train_dataset=tokenized_train_dataset,
173
+ eval_dataset=tokenized_val_dataset,
174
+ args=transformers.TrainingArguments(
175
+ output_dir=output_dir,
176
+ warmup_steps=1,
177
+ per_device_train_batch_size=2,
178
+ gradient_accumulation_steps=1,
179
+ gradient_checkpointing=True,
180
+ max_steps=100,
181
+ learning_rate=2.5e-5, # Want a small lr for finetuning
182
+ fp16=True,
183
+ optim="paged_adamw_8bit",
184
+ logging_steps=25, # When to start reporting loss
185
+ logging_dir="./experiments/logs", # Directory for storing logs
186
+ save_strategy="steps", # Save the model checkpoint every logging step
187
+ save_steps=100, # Save checkpoints every 50 steps
188
+ evaluation_strategy="steps", # Evaluate the model every logging step
189
+ eval_steps=25, # Evaluate and save checkpoints every 50 steps
190
+ do_eval=True, # Perform evaluation at the end of training
191
+ report_to="wandb", # Comment this out if you don't want to use weights & baises
192
+ run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}" # Name of the W&B run (optional)
193
+ ),
194
+ data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
195
+ )
196
+
197
+ model.config.use_cache = False # silence the warnings. Please re-enable for inference!
198
+ trainer.train()
199
+
200
+
201
+ if __name__ == '__main__':
202
+ train()
mixtral_tune.sh ADDED
@@ -0,0 +1,13 @@
1
+ pip install --upgrade pip
2
+ pip install -q -U bitsandbytes
3
+ pip install -q -U git+https://github.com/huggingface/transformers.git
4
+ pip install -q -U git+https://github.com/huggingface/peft.git
5
+ pip install -q -U git+https://github.com/huggingface/accelerate.git
6
+ pip install -q -U datasets scipy ipywidgets matplotlib
7
+
8
+
9
+ train_json="./data/chat_edit/assets/test200/edit_instructions_v0.jsonl"
10
+ output_dir="${output_root}/${run_name}"
11
+ python3 ./dataset_creation/mixtral_tune.py \
12
+ --train_json "${train_json}"
13
+
outlog.txt ADDED
@@ -0,0 +1,8 @@
1
+ nohup: ignoring input
2
+
3
+ /usr/local/anaconda3/envs/dalle-3/lib/python3.11/site-packages/gradio/deprecation.py:43: UserWarning: You have unused kwarg parameters in Textbox, please remove them: {'default': '0'}
4
+ warnings.warn(
5
+ /usr/local/anaconda3/envs/dalle-3/lib/python3.11/site-packages/gradio/deprecation.py:43: UserWarning: You have unused kwarg parameters in Textbox, please remove them: {'default': '1000'}
6
+ warnings.warn(
7
+ /usr/local/anaconda3/envs/dalle-3/lib/python3.11/site-packages/gradio/deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'min_width': 400}
8
+ warnings.warn(
prepare_dataset.py ADDED
@@ -0,0 +1,29 @@
1
+ import json
2
+ from argparse import ArgumentParser
3
+ from pathlib import Path
4
+
5
+ from tqdm.auto import tqdm
6
+
7
+
8
+ def main():
9
+ parser = ArgumentParser()
10
+ parser.add_argument("dataset_dir")
11
+ args = parser.parse_args()
12
+ dataset_dir = Path(args.dataset_dir)
13
+
14
+ seeds = []
15
+ with tqdm(desc="Listing dataset image seeds") as progress_bar:
16
+ for prompt_dir in dataset_dir.iterdir():
17
+ if prompt_dir.is_dir():
18
+ prompt_seeds = [image_path.name.split("_")[0] for image_path in sorted(prompt_dir.glob("*_0.jpg"))]
19
+ if len(prompt_seeds) > 0:
20
+ seeds.append((prompt_dir.name, prompt_seeds))
21
+ progress_bar.update()
22
+ seeds.sort()
23
+
24
+ with open(dataset_dir.joinpath("seeds.json"), "w") as f:
25
+ json.dump(seeds, f)
26
+
27
+
28
+ if __name__ == "__main__":
29
+ main()
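For illustration, assuming a dataset laid out as dataset_dir/prompt_dir/seed_0.jpg (which is what the glob above implies), seeds.json ends up as a sorted list of [prompt_dir, [seed, ...]] pairs. The directory names and seed ids in this sketch are hypothetical.

import json

# Hypothetical output of prepare_dataset.py for two prompt directories.
with open("dataset_dir/seeds.json") as f:        # path is an assumption
    seeds = json.load(f)

# Expected structure, e.g. [["prompt_000", ["1234", "5678"]], ["prompt_001", ["4242"]]]
for prompt_dir, prompt_seeds in seeds:
    print(prompt_dir, len(prompt_seeds))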
prepare_for_gpt.py ADDED
@@ -0,0 +1,39 @@
1
+ import json
2
+ from argparse import ArgumentParser
3
+
4
+ from generate_txt_dataset import DELIMITER_0, DELIMITER_1, STOP
5
+
6
+
7
+ def main(input_path: str, output_path: str):
8
+ with open(input_path) as f:
9
+ prompts = [json.loads(l) for l in f]
10
+
11
+ with open(output_path, "w") as f:
12
+ for prompt in prompts:
13
+ prompt_for_gpt = {
14
+ "prompt": f"{prompt['input']}{DELIMITER_0}",
15
+ "completion": f"{prompt['edit']}{DELIMITER_1}{prompt['output']}{STOP}",
16
+ }
17
+ f.write(f"{json.dumps(prompt_for_gpt)}\n")
18
+
19
+
20
+ def main_classify(input_path: str, output_path: str):
21
+ with open(input_path) as f:
22
+ prompts = [json.loads(l) for l in f]
23
+
24
+ with open(output_path, "w") as f:
25
+ for prompt in prompts:
26
+ prompt_for_gpt = {
27
+ "prompt": f"{prompt['edit']}{DELIMITER_0}",
28
+ "completion": f"{prompt['class']}{STOP}",
29
+ }
30
+ f.write(f"{json.dumps(prompt_for_gpt)}\n")
31
+
32
+
33
+ if __name__ == "__main__":
34
+ parser = ArgumentParser()
35
+ parser.add_argument("--input-path", required=False, type=str, default="/mlx/users/peng.wang/playground/data/chat_edit/assets/test200/edit_instructions_v0.jsonl")
36
+ parser.add_argument("--output-path", required=False, type=str, default="/mlx/users/peng.wang/playground/data/chat_edit/assets/test200/edit_class_for_gpt.jsonl")
37
+ args = parser.parse_args()
38
+ # main(args.input_path, args.output_path)
39
+ main_classify(args.input_path, args.output_path)
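To make the output format concrete, the sketch below reproduces one line written by main_classify(). DELIMITER_0 and STOP are imported from generate_txt_dataset, which is not part of this commit, so the values used here are placeholders, as is the sample record.

import json

DELIMITER_0 = " ->"    # placeholder: the real value comes from generate_txt_dataset
STOP = " END"          # placeholder: the real value comes from generate_txt_dataset

record = {"edit": "make the sky look like sunset", "class": "Global,Color"}  # hypothetical record
line = {
    "prompt": f"{record['edit']}{DELIMITER_0}",
    "completion": f"{record['class']}{STOP}",
}
print(json.dumps(line))
# {"prompt": "make the sky look like sunset ->", "completion": "Global,Color END"}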
reorganize_data.py ADDED
@@ -0,0 +1,272 @@
1
+ import os
2
+ import io_utils as io_uts
3
+ import vis_utils as v_uts
4
+ from vis_common import *
5
+ import pandas as pd
6
+
7
+ from GPT_prompts import (
8
+ TEMPLATE_0,
9
+ TEMPLATE_1,
10
+ TEMPLATE_2
11
+ )
12
+ from call_assistant_api import (
13
+ EditActionClassifier
14
+ )
15
+ import json
16
+ from datasets import Dataset
17
+
18
+ unknown_action = "Unknown"
19
+ def dfs(actions, res, res_set):
20
+ """
21
+ Enumerate all options in an edit action.
22
+ """
23
+ if len(actions) == 0:
24
+ res_set.append(res)
25
+ return
26
+
27
+ for word in actions[0]:
28
+ cur_res = res + [word]
29
+ dfs(actions[1:], cur_res, res_set)
30
+
31
+ return res_set
32
+
33
+ def split_actions(actions):
34
+ if '/' in actions:
35
+ words = actions.split(" ")
36
+ common = ""
37
+ cur_actions = [] # Changed from {} to []
38
+ counter = 0
39
+ for word in words:
40
+ if "/" in word:
41
+ action = unknown_action + f"{counter} "
42
+ cur_actions.append(word.split('/'))
43
+ counter += 1
44
+ else:
45
+ action = word + " "
46
+ common += action
47
+
48
+ actions_sets = dfs(cur_actions, [], [])
49
+ instructions = []
50
+ for action_set in actions_sets:
51
+ temp_common = common
52
+ for i, action in enumerate(action_set):
53
+ temp_common = temp_common.replace(unknown_action+f"{i}", action.replace('_', ''))
54
+ instructions.append(temp_common.strip())
55
+ return instructions
56
+
57
+ else:
58
+ return [actions]
59
+
60
+ def sample_prompt(sub, class_name, edit_action):
61
+ if not ("the subject" in edit_action):
62
+ if (" wall " in edit_action) or (" ground " in edit_action) or ("furnished" in edit_action):
63
+ prompt = "an indoor living room." if random.uniform(0, 1) < 0.5 else "a beautiful lobby"
64
+ return prompt
65
+ if (" sky " in edit_action):
66
+ prompt = "a natural image of sea, mountains and sky"
67
+ return prompt
68
+ if (" weather" in edit_action) or (" snow" in edit_action):
69
+ prompt = "a naturalistic scene with trees"
70
+ return prompt
71
+ p = random.uniform(0, 1)
72
+ if p < 0.5:
73
+ prompt = random.choice(sub["scenes"])
74
+ return prompt
75
+
76
+ p = random.uniform(0, 1)
77
+ person = ["view", "pose", "adj", "color", "human_age","people"]
78
+ subject = ["view", "pose", "adj", "color", "animal_age", "subjects"]
79
+ appends = [" of ", " ", " ", " ", " ", "."]
80
+ attri_set = person if p < 0.7 else subject
81
+
82
+ prompt = ""
83
+ for i, key in enumerate(attri_set):
84
+ attr = random.choice(sub[key])
85
+ prompt = prompt + attr + appends[i]
86
+
87
+ return prompt
88
+
89
+
90
+ def prepare_our_prompt_v0():
91
+ """
92
+ Prepare the prompt with our coverage, simple prompt, found good for person.
93
+ """
94
+ random.seed(0)
95
+ data_root="/mlx/users/peng.wang/playground/data/chat_edit/assets/test200"
96
+ edit_file = f"{data_root}/edit_class.txt"
97
+ edit_lines = io_uts.load_lines(edit_file)
98
+
99
+ sub_file = f"{data_root}/subject.yaml"
100
+ sub = io_uts.load_yaml(sub_file)
101
+ from_human = f"{data_root}/edit_instructions_v0.jsonl"
102
+
103
+ # sample an item or empty each feature
104
+ items = []
105
+ for edit_line in tqdm(edit_lines):
106
+ class_name, edit_actions = edit_line.split(":")
107
+ edit_actions = split_actions(edit_actions)
108
+ for edit_action in edit_actions:
109
+ prompt1 = sample_prompt(sub, class_name, edit_action)
110
+ prompt = TEMPLATE_0.format(prompt1=prompt1, edit_action=edit_action)
111
+ item = {}
112
+ item["prompt_0"] = prompt
113
+ item["class"] = class_name
114
+ item["input"] = prompt1
115
+ item["edit"] = edit_action
116
+ item["output"] = f"{prompt1} with {edit_action}"
117
+ items.append(item)
118
+
119
+ print("number of examples:", len(items))
120
+ io_uts.dump_jsonl(from_human, items)
121
+
122
+
123
+ def config_our_prompt_v1():
124
+ # if region wise, let first find and locate the region.
125
+ pass
126
+
127
+
128
+ def config_our_prompt_v2():
129
+ # if region wise, let first find and locate the region.
130
+ pass
131
+
132
+
133
+ def prepare_p2p_prompt_v0():
134
+ test_root="/mlx/users/peng.wang/playground/repo/instruct-pix2pix/data/chat_edit/assets/test200/"
135
+ cache_root="/mlx/users/peng.wang/playground/repo/instruct-pix2pix/data/chat_edit/assets/p2p700"
136
+ jsonl_file = f"{test_root}instruct_p2p_700.jsonl"
137
+ jsonl_file_out = f"{test_root}instruct_p2p_700_reformat.jsonl"
138
+
139
+ def classify_p2p_edit_action():
140
+ classifier = EditActionClassifier()
141
+ examples = io_uts.load_jsonl(jsonl_file)
142
+ examples_out = []
143
+ for count, example in tqdm(enumerate(examples)):
144
+ res_file = f"{cache_root}/{count}.json"
145
+ if os.path.exists(res_file):
146
+ example = io_uts.load_json(res_file)
147
+ examples_out.append(example)
148
+ continue
149
+
150
+ edit_class = classifier.infer(example["edit"])
151
+ example["class"] = edit_class
152
+ example["prompt_0"] = TEMPLATE_0.format(prompt1=example["input"], edit_action=example["edit"])
153
+ io_uts.dump_json(res_file, example)
154
+ examples_out.append(example)
155
+
156
+ io_uts.dump_jsonl(jsonl_file_out, examples_out)
157
+
158
+ def subsample_p2p():
159
+ jsonl_file_sample_out = f"{test_root}/instruct_p2p_val.jsonl"
160
+ examples = io_uts.load_jsonl(jsonl_file_out)
161
+ classes = {}
162
+ results = []
163
+ max_each_class = 1
164
+ for example in examples:
165
+ if example["class"] not in classes.keys():
166
+ classes[example["class"]] = 1
167
+ results.append(example)
168
+ else:
169
+ if classes[example["class"]] < max_each_class:
170
+ classes[example["class"]] += 1
171
+ results.append(example)
172
+ print("sample num: ", len(results))
173
+ io_uts.dump_jsonl(jsonl_file_sample_out, results)
174
+
175
+ # classify_p2p_edit_action()
176
+ subsample_p2p()
177
+
178
+
179
+ def prepare_emu_set():
180
+ test_root="/mlx/users/peng.wang/playground/repo/instruct-pix2pix/data/chat_edit/assets/emu_test/"
181
+ output_root="/mlx/users/peng.wang/playground/repo/instruct-pix2pix/data/chat_edit/assets/test200/"
182
+ items = []
183
+ files = v_uts.list_all_files(test_root, exts=["txt"])
184
+ class_map = {
185
+ "add": "Local,Add",
186
+ "background": "Global,Background",
187
+ "color": "Global,Color",
188
+ "global": "Global",
189
+ "local": "Local",
190
+ "remove": "Local,Remove",
191
+ "style": "Global,Stylization",
192
+ "text": "Local,Add,Text"
193
+ }
194
+ for edit_file in tqdm(files):
195
+ edit_action = io_uts.load_lines(edit_file)
196
+ item = {"input": edit_action[1], "edit": edit_action[0], "output": edit_action[2]}
197
+ item["prompt_0"] = TEMPLATE_0.format(prompt1=item["input"], edit_action=item["edit"])
198
+ class_name = edit_file.split('/')[-2]
199
+ item["class"] = class_map[class_name]
200
+ items.append(item)
201
+
202
+ io_uts.dump_jsonl(f"{output_root}/emu_val_90.jsonl", items)
203
+
204
+
205
+ def merge_prompts():
206
+ output_root="/mlx/users/peng.wang/playground/repo/instruct-pix2pix/data/chat_edit/assets/ChatEdit/"
207
+ our_set = "edit_instructions_val"
208
+ p2p_set = "instruct_p2p_val"
209
+ emu_set = "emu_val_90"
210
+
211
+ full_items = []
212
+ for val_set in [our_set, p2p_set, emu_set]:
213
+ items = io_uts.load_jsonl(f"{output_root}/{val_set}.jsonl")
214
+ print(val_set, len(items))
215
+ keynames = ["input", "edit", "output", "prompt_0", "class"]
216
+ items_out = []
217
+ for item in items:
218
+ # reorder the item keys based on keynames
219
+ item_out = {}
220
+ for key in keynames:
221
+ item_out[key] = item[key]
222
+ item_out["prompt_1"] = TEMPLATE_1.format(
223
+ prompt1=item["input"],
224
+ prompt2=item['output'],
225
+ edit_action=item["edit"])
226
+ item_out["prompt_2"] = TEMPLATE_2.format(
227
+ prompt1=item["input"],
228
+ prompt2=item['output'],
229
+ edit_action=item["edit"])
230
+ items_out.append(item_out)
231
+ full_items = full_items + items_out
232
+ print("num: ", len(full_items))
233
+ io_uts.dump_jsonl(f"{output_root}/full_val.jsonl", full_items)
234
+
235
+
236
+ def classify_and_sample_p2p_prompts():
237
+ pass
238
+
239
+
240
+ def write_dataset_toparquet():
241
+ dataroot = "/mnt/bn/datacompv6/data/chat_edit/assets/ChatEdit/"
242
+ jsonl_path = f"{dataroot}/full_val.jsonl"
243
+ folder_name = "prompt_0"
244
+ image_folder = f"{dataroot}/{folder_name}"
245
+ output_path = f"{dataroot}/data/"
246
+ v_uts.mkdir(output_path)
247
+
248
+ items = io_uts.load_jsonl(jsonl_path)
249
+ items_out = []
250
+ for i, item in enumerate(tqdm(items)):
251
+ image_path = f"{image_folder}/{i:03}.png"
252
+ item['image_id'] = f"{i:03}"
253
+ item['image'] = v_uts.encode_b64(image_path)
254
+ items_out.append(item)
255
+
256
+ # Convert the data to a pandas DataFrame
257
+ df = pd.DataFrame(items_out)
258
+ # Create a Hugging Face dataset from the DataFrame
259
+ dataset = Dataset.from_pandas(df)
260
+ # Save the dataset to a Parquet file
261
+ dataset.to_parquet(f"{output_path}/{folder_name}.parquet")
262
+
263
+
264
+ if __name__ == '__main__':
265
+ # res = "make firework/rainbow in sky/ground region in the image"
266
+ # print(split_actions(res))
267
+ # prepare_our_prompt_v0()
268
+ # prepare_p2p_prompt_v0()
269
+ # prepare_emu_set()
270
+ # merge_prompts()
271
+ write_dataset_toparquet()
272
+
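As a sanity check of dfs()/split_actions() above, the example commented out in __main__ expands into four instructions. The expansion below was derived by tracing the code; the snippet assumes it runs in an environment where this module's dependencies (io_utils, vis_utils, GPT_prompts, call_assistant_api) are importable.

from reorganize_data import split_actions   # requires this repo's modules on the path

instructions = split_actions("make firework/rainbow in sky/ground region in the image")
# -> ['make firework in sky region in the image',
#     'make firework in ground region in the image',
#     'make rainbow in sky region in the image',
#     'make rainbow in ground region in the image']
print(len(instructions))   # 4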
tune_gpt.sh ADDED
@@ -0,0 +1,5 @@
1
+ openai api fine_tunes.create \
2
+ -t ./data/chat_edit/assets/test200/edit_class_for_gpt.jsonl \
3
+ -m davinci \
4
+ --n_epochs 1 \
5
+ --suffix "edit-pix2pix-class"
vis_common.py ADDED
@@ -0,0 +1,37 @@
1
+ import numpy as np
2
+ import os
3
+ import os.path as osp
4
+ import io
5
+ import cv2
6
+ import time
7
+ import copy
8
+ import random
9
+ import yaml
10
+ import pdb
11
+ b=pdb.set_trace
12
+
13
+ from tqdm import tqdm
14
+ from pqdm.processes import pqdm
15
+ import logging
16
+ import argparse
17
+
18
+ # usage of pqdm(args, func, n_jobs)
19
+ def get_logger(name):
20
+ logger = logging.getLogger(name)
21
+ logger.setLevel(logging.INFO)
22
+ return logger
23
+
24
+ def get_parser(name):
25
+ parser = argparse.ArgumentParser(description=name)
26
+ return parser
27
+
28
+ def add_args(parser, name, type=str, default=None, **kwargs):
29
+ parser.add_argument('--%s' % name, type=type, default=default, **kwargs)
30
+ return parser
31
+
32
+ def add_flag(parser, name, des=''):
33
+ parser.add_argument('--%s' % name, action='store_true', help=des)
34
+ return parser
35
+
36
+ def debug_image(image):
37
+ cv2.imwrite('test.png', np.uint8(image))
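A small usage sketch for the helpers in vis_common.py; the option and flag names below ('data_root', 'debug') are made up for illustration.

import logging
from vis_common import get_logger, get_parser, add_args, add_flag

logging.basicConfig(level=logging.INFO)
logger = get_logger("demo")
parser = get_parser("demo")
parser = add_args(parser, "data_root", type=str, default="./data")
parser = add_flag(parser, "debug", des="enable debug mode")
args = parser.parse_args([])    # empty argv for the sketch
logger.info("data_root=%s debug=%s", args.data_root, args.debug)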
vis_utils.py ADDED
@@ -0,0 +1,2231 @@
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import time
5
+ from tqdm import tqdm
6
+
7
+ import random
8
+ # from shapely.geometry import Point, Polygon
9
+ from numpy.linalg import svd
10
+ from collections import namedtuple
11
+ from vis_common import get_logger
12
+ from typing import Any, Dict, List, Optional, Type, Union
13
+ logger = get_logger('v_utils')
14
+
15
+
16
+ import pdb
17
+ b = pdb.set_trace
18
+
19
+
20
+ IMAGE_EXTS = ['jpg', 'png', 'jpeg', 'JPG', 'PNG', 'JPEG']
21
+ PALETTE = [
22
+ (0.12156862745098039, 0.4666666666666667, 0.7058823529411765),
23
+ (0.6823529411764706, 0.7803921568627451, 0.9098039215686274),
24
+ (1.0, 0.4980392156862745, 0.054901960784313725),
25
+ (1.0, 0.7333333333333333, 0.47058823529411764),
26
+ (0.17254901960784313, 0.6274509803921569, 0.17254901960784313),
27
+ (0.596078431372549, 0.8745098039215686, 0.5411764705882353),
28
+ (0.8392156862745098, 0.15294117647058825, 0.1568627450980392),
29
+ (1.0, 0.596078431372549, 0.5882352941176471),
30
+ (0.5803921568627451, 0.403921568627451, 0.7411764705882353),
31
+ (0.7725490196078432, 0.6901960784313725, 0.8352941176470589),
32
+ (0.5490196078431373, 0.33725490196078434, 0.29411764705882354),
33
+ (0.7686274509803922, 0.611764705882353, 0.5803921568627451),
34
+ (0.8901960784313725, 0.4666666666666667, 0.7607843137254902),
35
+ (0.9686274509803922, 0.7137254901960784, 0.8235294117647058),
36
+ (0.4980392156862745, 0.4980392156862745, 0.4980392156862745),
37
+ (0.7803921568627451, 0.7803921568627451, 0.7803921568627451),
38
+ (0.7372549019607844, 0.7411764705882353, 0.13333333333333333),
39
+ (0.8588235294117647, 0.8588235294117647, 0.5529411764705883),
40
+ (0.09019607843137255, 0.7450980392156863, 0.8117647058823529),
41
+ (0.6196078431372549, 0.8549019607843137, 0.8980392156862745),
42
+ ]
43
+
44
+
45
+ def check_file_in_paths(paths, filename):
46
+ for path in paths:
47
+ file = os.path.join(path, filename)
48
+ print(file)
49
+ if os.path.exists(file):
50
+ print(file)
51
+ return True
52
+
53
+ return False
54
+
55
+
56
+ def clean_backslash(dir):
57
+ while dir[-1] == '/':
58
+ dir = dir[:-1]
59
+ return dir
60
+
61
+ def odgt2txt(odgt_file,
62
+ txt_file,
63
+ image_key='image',
64
+ segment_key='segment'):
65
+ import io_utils as io_uts
66
+ odgt = io_uts.load_odgt(odgt_file)
67
+ f = open(txt_file, 'w')
68
+ for item in odgt:
69
+ string = f"{item[image_key]} {item[segment_key]}\n"
70
+ f.write(string)
71
+ f.close()
72
+ print("done")
73
+
74
+ def single_thresh(args, mark_ignore=True):
75
+ """
76
+ threshold 255, 128, 0 type of label for a binary label
77
+ """
78
+ image_name, label_name, out_label_name = args
79
+ image = cv2.imread(image_name, cv2.IMREAD_UNCHANGED)
80
+ mask_org = cv2.imread(label_name, cv2.IMREAD_UNCHANGED)
81
+
82
+ if not (image.shape[0] / image.shape[1] == mask_org.shape[0] / mask_org.shape[1]):
83
+ # rotate match
84
+ if mask_org.shape[1] / mask_org.shape[0] == image.shape[0] / image.shape[1]:
85
+ mask_org = cv2.rotate(mask_org, cv2.ROTATE_90_CLOCKWISE)
86
+ print(image_name, label_name, f"shape not match {mask_org.shape} vs {image.shape}")
87
+ else:
88
+ print(image_name, label_name, "shape not match even rotation")
89
+ assert False
90
+
91
+ name = basename(label_name)
92
+ if mask_org.ndim == 3:
93
+ mask_org = mask_org[:, :, 0]
94
+
95
+ mask = np.zeros_like(mask_org)
96
+ mask[mask_org > 172] = 1
97
+ if mark_ignore:
98
+ ignore_region = np.logical_and(
99
+ mask_org <= 172,
100
+ mask_org >= 70)
101
+ mask[ignore_region] = 255
102
+ cv2.imwrite(out_label_name, np.uint8(mask))
103
+
104
+ def find_file_w_exts(filename, exts, w_dot=False):
105
+ appex = '.' if w_dot else ''
106
+ for ext in exts:
107
+ if os.path.exists(f"{filename}{appex}{ext}"):
108
+ return True, f"{filename}{appex}{ext}"
109
+ return False, None
110
+
111
+ def seg_folder_to_txt(image_folder, label_folder, root,
112
+ output_file):
113
+ exts = ['jpg', 'png', 'jpeg']
114
+ image_files = list_all_files(image_folder, exts)
115
+ f = open(output_file, 'w')
116
+ for image_file in tqdm(image_files):
117
+ image_name = basename(image_file)
118
+ label_file = f"{label_folder}/{image_name}.png"
119
+
120
+ assert os.path.exists(label_file), f"{image_file} {label_file}"
121
+
122
+ image_file = image_file.replace(root, '.')
123
+ label_file = label_file.replace(root, '.')
124
+ string = f"{image_file} {label_file}\n"
125
+ f.write(string)
126
+
127
+ f.close()
128
+ print("done")
129
+
130
+
131
+ def wait_for_file(filename, step=5.0):
132
+ count = 0.0
133
+ while not os.path.exists(filename):
134
+ time.sleep(step)
135
+ count += step
136
+
137
+ time.sleep(step)
138
+ print(f"found {filename} after {count}s")
139
+
140
+
141
+ def get_trimap_by_binary(img, eradius=20, dradius=20):
142
+ erode_kernel = np.ones((eradius, eradius), np.uint8)
143
+ erosion = cv2.erode(img, erode_kernel, iterations=1)
144
+ dilation = cv2.dilate(img, np.ones((dradius, dradius), np.uint8), iterations=1)
145
+ trimap = img.copy()
146
+ mask = np.logical_and(dilation > 0, erosion == 0)
147
+ trimap[mask] = 128
148
+ return trimap
149
+
150
+
151
+ def get_matting_trimap(segment, eradius = 30, dradius = 30):
152
+ # find the highest box, dilate segment
153
+ dilate_ker = np.ones((dradius, dradius), np.uint8)
154
+ shrink_ker = np.ones((eradius, eradius), np.uint8)
155
+
156
+ segment_out = cv2.dilate(segment, dilate_ker, iterations=1)
157
+ segment_in = cv2.erode(segment, shrink_ker, iterations=1)
158
+
159
+ segment_image = np.zeros_like(segment, dtype=np.uint8)
160
+ segment_image[segment_out > 0] = 128
161
+ segment_image[segment_in > 0] = 255
162
+
163
+ return segment_image
164
+
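A quick sketch of get_matting_trimap() on a synthetic binary mask; in the output, 255/128/0 encode foreground, the unknown band, and background respectively.

import numpy as np
from vis_utils import get_matting_trimap

mask = np.zeros((256, 256), dtype=np.uint8)
mask[64:192, 64:192] = 1                       # synthetic foreground square
trimap = get_matting_trimap(mask, eradius=15, dradius=15)
print(sorted(np.unique(trimap).tolist()))      # [0, 128, 255]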
165
+
166
+ def get_trimap_by_thresh():
167
+ pass
168
+
169
+
170
+ def Mat2EulerImage(mat: np.ndarray, Image):
171
+ channel = 1 if mat.ndim == 2 else mat.shape[-1]
172
+ return Image(
173
+ data=mat.tobytes(),
174
+ rows=mat.shape[0],
175
+ cols=mat.shape[1],
176
+ channel=channel
177
+ )
178
+
179
+ def EulerImagetoMat(res, channel=1):
180
+ """
181
+ For Euler thrift, an image is usually defined as
182
+ struct Image {
183
+ 1: binary data, // cv::imencode(".png", image), should be bgr image
184
+ 2: i32 rows,
185
+ 3: i32 cols,
186
+ 4: i32 channel
187
+ }
188
+ here we transform back
189
+ """
190
+ data = res.data
191
+ if channel > 1:
192
+ return np.frombuffer(data, dtype=np.uint8).copy().reshape(
193
+ (res.rows, res.cols, channel))
194
+ return np.frombuffer(data, dtype=np.uint8).copy().reshape(
195
+ (res.rows, res.cols))
196
+
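A round-trip sketch for Mat2EulerImage()/EulerImagetoMat() above. The thrift-generated Image class is not part of this file, so a namedtuple with the same four fields stands in for it here.

import numpy as np
from collections import namedtuple
from vis_utils import Mat2EulerImage, EulerImagetoMat

Image = namedtuple("Image", ["data", "rows", "cols", "channel"])   # stand-in for the thrift struct

mat = np.zeros((4, 6, 3), dtype=np.uint8)
msg = Mat2EulerImage(mat, Image)           # ndarray -> struct-like message
back = EulerImagetoMat(msg, channel=3)     # struct-like message -> ndarray
assert back.shape == (4, 6, 3)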
197
+
198
+ class NameCoder():
199
+ """
200
+ Encode an image name that may contain Chinese characters (converted to pinyin).
201
+ """
202
+ def __init__(self, root_dir):
203
+ self.root_dir = root_dir
204
+
205
+ def __call__(self, name):
206
+ import pinyin as py
207
+ return py.get(name.replace(
208
+ self.root_dir, '').replace('/', '_').replace(' ', '_'),
209
+ format='strip')
210
+
211
+
212
+ def basename(path):
213
+ return os.path.splitext(os.path.basename(path))[0]
214
+
215
+
216
+ def ext(path):
217
+ return os.path.splitext(os.path.basename(path))[1][1:]
218
+
219
+
220
+ def get_cur_abs_path(some_file):
221
+ return os.path.dirname(os.path.abspath(some_file))
222
+
223
+
224
+ def list_all_files(directory, exts=None, recursive=True):
225
+ import glob
226
+ all_files = []
227
+ if exts is None:
228
+ exts = IMAGE_EXTS
229
+
230
+ for ext in exts:
231
+ if not recursive:
232
+ files = glob.glob("%s/*%s" % (directory, ext),
233
+ recursive=recursive)
234
+ else:
235
+ files = glob.glob("%s/**/*%s" % (directory, ext),
236
+ recursive=recursive)
237
+ all_files = all_files + files
238
+ all_files = sorted(all_files)
239
+ return all_files
240
+
241
+
242
+ def list_all_folders(directory):
243
+ import glob
244
+ folders = glob.glob(f"{directory}/*/")
245
+ return folders
246
+
247
+
248
+ def list_all(folder, exts=None, recur=False):
249
+ if exts is None:
250
+ return list_all_folders(folder)
251
+ else:
252
+ return list_all_files(folder, exts, recur)
253
+
254
+ def split_path(folder):
255
+ blocks = folder.split('/')
256
+ return [name for name in blocks if name != '']
257
+
258
+
259
+ def dump_image(pred, res_file, score=True, dim='CHW'):
260
+ if score:
261
+ dump_prob2image(res_file, pred, dim=dim)
262
+ else:
263
+ res_file = res_file + '.png'
264
+ cv2.imwrite(res_file, np.uint8(pred))
265
+
266
+
267
+ def dump_prob2image(filename, array, dim='CHW'):
268
+ """
269
+ dump a probability map to an image, where
270
+ array: [x, height, width] (x = 1, 3, 4)
271
+ """
272
+ if dim == 'CHW':
273
+ array = np.transpose(np.uint8(array * 255), (1, 2, 0))
274
+
275
+ class_num = array.shape[2]
276
+
277
+ # assert class_num <= 4
278
+ if class_num >= 4 :
279
+ print('warning: only save the first 3 channels')
280
+ array = array[:, :, :3]
281
+
282
+ if class_num == 2:
283
+ array = array[:, :, 1]
284
+
285
+ cv2.imwrite(filename + '.png', array)
286
+
287
+ def load_image2prob(filename):
288
+ if not filename.endswith('.png'):
289
+ filename = filename + '.png'
290
+
291
+ array = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
292
+ array = np.transpose(array, (2, 0, 1)) / 255
293
+
294
+ return array
295
+
296
+ def mask2box(mask):
297
+ """
298
+ t, l, b, r
299
+ y0, x0, y1, x1
300
+ """
301
+ y, x = np.where(mask > 0)
302
+ return [np.min(y), np.min(x), np.max(y), np.max(x)]
303
+
304
+ def dilate_mask(mask, kernel=20):
305
+ mask = np.uint8(mask)
306
+ kernel = np.ones((kernel, kernel), np.uint8)
307
+ mask_out = cv2.dilate(mask, kernel, iterations=1)
308
+ return mask_out
309
+
310
+ def erode_mask(mask, kernel=20):
311
+ kernel = np.ones((kernel, kernel), np.uint8)
312
+ mask_out = cv2.erode(mask, kernel, iterations=1)
313
+ return mask_out
314
+
315
+ def pack_argument(args, arg_names):
316
+ """
317
+ args: object of all arguments
318
+ arg_names: list of string name for needed arguments
319
+ """
320
+ kwargs = {}
321
+ for arg_name in arg_names:
322
+ cur_args = getattr(args, arg_name) if hasattr(args, arg_name) else None
323
+ if cur_args:
324
+ kwargs[arg_name] = cur_args
325
+
326
+ return kwargs
327
+
328
+
329
+ def line_segment_cross(seg1, seg2):
330
+ """
331
+
332
+ :param seg1: [start, end]
333
+ :param seg2: [start, end]
334
+ :return:
335
+ True if cross, false otherwise
336
+ """
337
+ def ccw(A, B, C):
338
+ return (C.y - A.y) * (B.x - A.x) > (B.y - A.y) * (C.x - A.x)
339
+
340
+ # Return true if line segments AB and CD intersect
341
+ def intersect(A, B, C, D):
342
+ return ccw(A, C, D) != ccw(B, C, D) and ccw(A, B, C) != ccw(A, B, D)
343
+
344
+ Point = namedtuple('Point', 'x y')
345
+ A = Point(seg1[0][0], seg1[0][1])
346
+ B = Point(seg1[1][0], seg1[1][1])
347
+ C = Point(seg2[0][0], seg2[0][1])
348
+ D = Point(seg2[1][0], seg2[1][1])
349
+ return intersect(A, B, C, D)
350
+
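A hand-picked example for line_segment_cross(); each segment is given as [start(x, y), end(x, y)].

from vis_utils import line_segment_cross

crossing = line_segment_cross([(0, 0), (10, 10)], [(0, 10), (10, 0)])   # two diagonals of a square
parallel = line_segment_cross([(0, 0), (10, 0)], [(0, 5), (10, 5)])     # parallel horizontal segments
print(crossing, parallel)   # True False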
351
+
352
+ def pts_in_line(pts, lines, th=10):
353
+ """
354
+ pts: [x, y]
355
+ lines: [[x0, y0, x1, y1]]
356
+ """
357
+ count = 0
358
+ for line in lines:
359
+ x, y = pts
360
+ x0, y0, x1, y1 = line
361
+ dir0 = np.array([x - x0, y - y0])
362
+ dir1 = np.array([x1 - x0, y1 - y0])
363
+
364
+ diff = min(angle_diff(dir0, dir1),
365
+ angle_diff(-1 * dir0, dir1))
366
+ if diff < th:
367
+ count += 1
368
+
369
+ return count
370
+
371
+ def out_of_bound(pt, sz):
372
+ x, y = pt
373
+ h, w = sz
374
+ return x < 0 or y < 0 or x >= w or y >= h
375
+
376
+
377
+ def pts_in_mask(pts, mask, allow_out=True):
378
+ """
379
+ pts: n x 2 x, y location
380
+ return len n mask
381
+ """
382
+ idx = np.zeros(pts.shape[0]) > 0
383
+ for i, pt in enumerate(pts):
384
+ x, y = pt
385
+ if out_of_bound(pt, mask.shape):
386
+ continue
387
+ if mask[y, x] > 0:
388
+ idx[i] = True
389
+ return idx
390
+
391
+
392
+ def pts_in_poly(pts, poly, sz):
393
+ """
394
+ pts: n x 2 x, y location
395
+ return len n mask
396
+ """
397
+ mask = np.ones(sz)
398
+ cv2.fillPoly(mask,
399
+ pts=[np.int0(poly)],
400
+ color=(1,))
401
+ return pts_in_mask(pts, mask)
402
+
403
+
404
+
405
+ def line_intersect_pt(lines: np.array, randsac=True):
406
+ """
407
+ lines: n x 4, [s, e] of line
408
+ return: intersect_pt, is_parallel
409
+ """
410
+ if lines.shape[0] < 2:
411
+ raise ValueError('not enough line')
412
+
413
+ num = lines.shape[0]
414
+ line_id0 = 0
415
+ max_correct = 2
416
+ best_vp = None
417
+ for line_id0 in range(num):
418
+ for i in range(num):
419
+ if i == line_id0:
420
+ continue
421
+
422
+ lines_cur = lines[[line_id0, i], :]
423
+
424
+ N = 2
425
+ p1 = np.column_stack((lines_cur[:, :2], np.ones(N, dtype=np.float32)))
426
+ p2 = np.column_stack((lines_cur[:, 2:], np.ones(N, dtype=np.float32)))
427
+ cross_p = np.cross(p1, p2)
428
+ vp1 = np.cross(cross_p[0], cross_p[1])
429
+
430
+ if vp1[2] < 1e-5:
431
+ continue
432
+
433
+ vp1 /= vp1[2]
434
+ correct = pts_in_line(vp1[:2], lines)
435
+ if max_correct <= correct:
436
+ best_vp = vp1[:2]
437
+ max_correct = correct
438
+
439
+ if best_vp is not None:
440
+ return best_vp, False
441
+
442
+ return None, True
443
+
444
+
445
+ def angle_diff(ba, bc, axis=None):
446
+ norma = np.linalg.norm(ba, axis=axis)
447
+ normb = np.linalg.norm(bc, axis=axis)
448
+ dot_prod = np.sum(ba * bc, axis=axis)
449
+ cosine_angle = dot_prod / (norma * normb)
450
+ angle = np.arccos(cosine_angle) * 180.0 / np.pi
451
+ return angle
452
+
453
+
454
+ def on_right_side(rect, sz):
455
+ # judge whether rect side
456
+ h, w = sz
457
+ cx = w // 2
458
+
459
+ return all([pt[0] >= cx for pt in rect])
460
+
461
+
462
+ def pts_angle(pts):
463
+ """
464
+ pts [3 x 2]
465
+ """
466
+ ba = pts[0] - pts[1]
467
+ bc = pts[2] - pts[1]
468
+ angle = angle_diff(ba, bc)
469
+ return angle
470
+
471
+
472
+ def sample_points(mask, num_points=100):
473
+ # Get the indices where mask values are greater than 0
474
+ indices = np.argwhere(mask > 0)
475
+
476
+ # Randomly select num_points indices
477
+ selected_indices = np.random.choice(indices.shape[0], size=num_points, replace=False)
478
+
479
+ # Get the selected points
480
+ selected_points = indices[selected_indices]
481
+
482
+ return selected_points
483
+
484
+ def valid_para_ratio(pts, th=5):
485
+ """
486
+ pts: [4 x 2]
487
+ """
488
+ def valid_ratio(ratio):
489
+ return 1.0 / th < ratio < th
490
+
491
+ ratio0 = line_len(pts[0], pts[1]) / line_len(pts[2], pts[3])
492
+ if not valid_ratio(ratio0):
493
+ return False
494
+
495
+ ratio1 = line_len(pts[1], pts[2]) / line_len(pts[3], pts[0])
496
+ if not valid_ratio(ratio1):
497
+ return False
498
+
499
+ return True
500
+
501
+
502
+ def line_len(pt0, pt1):
503
+ """
504
+ pt0, 1: [1x2]
505
+ """
506
+ return np.linalg.norm(pt0 - pt1)
507
+
508
+
509
+ def split_list(seq, part):
510
+ """
511
+ split a list to sub lists
512
+ """
513
+ size = len(seq) / part + 1 if part > 0 else 1
514
+ size = int(size)
515
+
516
+ return [seq[i:i+size] for i in range(0, len(seq), size)]
517
+
518
+
519
+ def find_portion(mask, portion_x, portion_y, th=0):
520
+ if mask.ndim > 2:
521
+ raise ValueError(f"mask must be 2 dim, now {mask.ndim}")
522
+ y, x = np.where(mask > th)
523
+ x = np.percentile(x, portion_x)
524
+ y = np.percentile(y, portion_y)
525
+
526
+ return int(x), int(y)
527
+
528
+ def random_split(num, portion=0.1, max_num=1000):
529
+ """
530
+ num: length of list
531
+ max_num is val num
532
+
533
+ return:
534
+ train, val list
535
+ """
536
+ val_num = min(portion * num, max_num)
537
+ val_num = int(val_num)
538
+ idx = [i for i in range(num)]
539
+ random.shuffle(idx)
540
+ return idx[val_num:], idx[:val_num]
541
+
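A toy example for random_split() and pick(): 100 items with a 10% validation split.

import random
from vis_utils import random_split, pick

random.seed(0)
data = [f"sample_{i:03d}" for i in range(100)]
train_idx, val_idx = random_split(len(data), portion=0.1, max_num=1000)
train, val = pick(data, train_idx), pick(data, val_idx)
print(len(train), len(val))   # 90 10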
542
+ def shuffle_list(list_in):
543
+ return random.shuffle(list_in)
544
+
545
+ def pick(lst, idx):
546
+ return [lst[i] for i in idx]
547
+
548
+ def mkdir_if_need(folder):
549
+ if not os.path.exists(folder):
550
+ os.makedirs(folder)
551
+
552
+ def mkdir_if_exists(path, image_name):
553
+ target_path = os.path.join(path, os.path.dirname(image_name))
554
+ if not os.path.exists(target_path):
555
+ os.makedirs(target_path)
556
+
557
+ def mkdir(folder, image_name=None):
558
+ if image_name is not None:
559
+ mkdir_if_exists(folder, image_name)
560
+ return
561
+ mkdir_if_need(folder)
562
+ return folder
563
+
564
+
566
+
567
+ def save_image_w_pallete(segment, file_name):
568
+ import PIL.Image as Image
569
+ pallete = get_pallete(256)
570
+
571
+ segmentation_result = np.uint8(segment)
572
+ segmentation_result = Image.fromarray(segmentation_result)
573
+ segmentation_result.putpalette(pallete)
574
+ segmentation_result.save(file_name)
575
+
576
+ def get_max_size(out_size, max_len):
577
+ height, width = out_size
578
+ scale = max(height, width) / max_len
579
+ if scale > 1:
580
+ height, width = np.uint32( np.array(out_size) / scale)
581
+
582
+ return height ,width
583
+
584
+
585
+ def get_pallete(num_cls):
586
+ """
587
+ this function is to get the colormap for visualizing
588
+ the segmentation mask
589
+ :param num_cls: the number of visualized classes
590
+ :return: the pallete
591
+ """
592
+ n = num_cls
593
+ pallete = [0]*(n*3)
594
+ for j in range(0,n):
595
+ lab = j
596
+ pallete[j*3+0] = 0
597
+ pallete[j*3+1] = 0
598
+ pallete[j*3+2] = 0
599
+ i = 0
600
+ while (lab > 0):
601
+ pallete[j*3+0] |= (((lab >> 0) & 1) << (7-i))
602
+ pallete[j*3+1] |= (((lab >> 1) & 1) << (7-i))
603
+ pallete[j*3+2] |= (((lab >> 2) & 1) << (7-i))
604
+ i = i + 1
605
+ lab >>= 3
606
+ return pallete
607
+
608
+
609
+ def color2label(label_color, color_map=None):
610
+ """
611
+ Convert color image to semantic id based on color_map
612
+ color_map = {$rgb: $label_id}
613
+
614
+ if color map is None. Then we treat 0 as background and all none
615
+ zero ids as label id
616
+ """
617
+
618
+ # default bkg 255
619
+ label_color = np.int32(label_color)
620
+ height, width = label_color.shape[0:2]
621
+ label = label_color[:, :, 0] * (255 ** 2) + \
622
+ label_color[:, :, 1] * 255 + \
623
+ label_color[:, :, 2]
624
+
625
+ label_id = np.unique(label)
626
+ if color_map is None:
627
+ for i, id in enumerate(label_id):
628
+ if id == 0:
629
+ continue
630
+ mask = label == id
631
+ label[mask] = i
632
+ return label
633
+
634
+ for rgb, i in color_map.items():
635
+ cur_num = rgb[0] * (255 ** 2) + rgb[1] * 255 + rgb[2]
636
+ if cur_num in label_id:
637
+ mask = (label - cur_num) != 0
638
+ label = label * mask + i * (1 - mask)
639
+
640
+ return label
641
+
642
+
643
+ def flow2color(flow):
644
+ assert flow.shape[2] == 2
645
+ hsv = np.zeros((flow.shape[0],
646
+ flow.shape[1], 3),
647
+ dtype=np.float32)
648
+ hsv[...,1] = 255
649
+ mag, ang = cv2.cartToPolar(flow[...,0], flow[...,1])
650
+ hsv[...,0] = ang * 180 / np.pi / 2
651
+ hsv[...,2] = cv2.normalize(mag,None,0,255,cv2.NORM_MINMAX)
652
+ rgb = cv2.cvtColor(np.uint8(hsv), cv2.COLOR_HSV2BGR)
653
+ return hsv, rgb
654
+
655
+
656
+ def colorEncode(labelmap, colors, mode='RGB'):
657
+ labelmap = labelmap.astype('int')
658
+ labelmap_rgb = np.zeros((labelmap.shape[0], labelmap.shape[1], 3),
659
+ dtype=np.uint8)
660
+ for label in np.unique(labelmap):
661
+ if label < 0:
662
+ continue
663
+ labelmap_rgb += (labelmap == label)[:, :, np.newaxis] * \
664
+ np.tile(colors[label],
665
+ (labelmap.shape[0], labelmap.shape[1], 1))
666
+
667
+ if mode == 'BGR':
668
+ return labelmap_rgb[:, :, ::-1]
669
+ else:
670
+ return labelmap_rgb
671
+
672
+
673
+ def drawBoundingbox(image, boxes, colors=None):
674
+ """
675
+ boxes: t, l, b r
676
+ """
677
+ if colors is None:
678
+ colors = [[255, 255, 0]] * len(boxes)
679
+
680
+ for color, box in zip(colors, boxes):
681
+ box = box.astype(np.uint32)
682
+ t, l, b, r = box[0], box[1], box[2], box[3]
683
+ cv2.rectangle(image, (l, t), (r, b), color, 2)
684
+
685
+ return image
686
+
687
+
688
+ def round2stride(length, stride):
689
+ return (length // stride) * stride
690
+
691
+
692
+ def resize_rect(rect, sz_src, sz_tgt):
693
+ """
694
+ :param rect: n x 4 x 2 rectangles
695
+ :param sz_src: (height, width)
696
+ :param sz_tgt:
697
+ :return:
698
+ """
699
+ if len(rect) == 0:
700
+ return rect
701
+ height, width = sz_src
702
+ height_tgt, width_tgt = sz_tgt
703
+ rect[:, :, 0] = np.int64(rect[:, :, 0] * width_tgt / width)
704
+ rect[:, :, 1] = np.int64(rect[:, :, 1] * height_tgt / height)
705
+
706
+ return rect
707
+
708
+
709
+ def resize_lines(lines, sz_src, sz_tgt):
710
+ """
711
+
712
+ :param lines: [n x 4 ] each line [start (x, y), end (x, y)]
713
+ :param sz_src:
714
+ :param sz_tgt:
715
+ :return:
716
+ """
717
+
718
+ assert lines.shape[1] == 2
719
+ lines = lines.reshape([-1, 2, 2])
720
+ lines = resize_rect(lines, sz_src, sz_tgt)
721
+ lines = lines.reshape([-1, 4])
722
+ return lines
723
+
724
+
725
+ def resize_LShape(lShapes, sz_src, sz_tgt):
726
+ """
727
+
728
+ :param lShapes: [n x 6]
729
+ :param sz_src:
730
+ :param sz_tgt:
731
+ :return:
732
+ """
733
+
734
+ assert lShapes.shape[1] == 3
735
+ lShapes = lShapes.reshape([-1, 3, 2])
736
+ lShapes = resize_rect(lShapes, sz_src, sz_tgt)
737
+ lShapes = lShapes.reshape([-1, 6])
738
+ return lShapes
739
+
740
+
741
+ def resize_to_fix_side(image, size=960, fix_type='height'):
742
+ if fix_type == "height":
743
+ scale = size / image.shape[0]
744
+ height, width = size, int(scale * image.shape[1])
745
+ elif fix_type == "width":
746
+ scale = size / image.shape[1]
747
+ height, width = int(scale * image.shape[0]), size
748
+ else:
749
+ raise ValueError("fix_type must be in ['height', 'width']")
750
+
751
+ image = cv2.resize(image, (width, height))
752
+ return image
753
+
754
+
755
+ def resize_like(image, src, side="all", interpolation=None):
756
+ """
757
+ resize image like src
758
+ """
759
+ shape = src.shape[:2]
760
+ if interpolation is None:
761
+ interpolation = cv2.INTER_CUBIC
762
+ if side != "all":
763
+ size = shape[0] if side == "height" else shape[1]
764
+ image = resize_to_fix_side(image, size, fix_type=side)
765
+ return image
766
+
767
+ image = cv2.resize(image, (shape[1], shape[0]),
768
+ interpolation=interpolation)
769
+ return image
770
+
771
+
772
+ def getmaxsize(shape, size=720, fixSide=False):
773
+ """
774
+ input: [h, w, c]
775
+ output: [w, h]
776
+ """
777
+ height, width = shape[:2]
778
+ scale = max(height, width) / size
779
+ height, width = np.uint32(np.array(shape[:2]) / scale)
780
+
781
+ if fixSide:
782
+ return (width, height)
783
+ else:
784
+ if scale > 1:
785
+ return (width, height)
786
+ else:
787
+ return (shape[1], shape[0])
788
+
789
+
790
+ def resize2size(images, size, interpolations=None):
791
+ """
792
+
793
+ :param images:
794
+ :param size: width height
795
+ :param interpolations:
796
+ :return:
797
+ """
798
+ if interpolations is None:
799
+ interpolations = [cv2.INTER_LINEAR for _ in range(len(images))]
800
+
801
+ for i, (image, interpolation) in enumerate(zip(images, interpolations)):
802
+ if interpolation is None:
803
+ interpolation = cv2.INTER_LINEAR
804
+ if image is None:
805
+ print(f"{i}-th image is None")
+ continue
806
+ image = cv2.resize(image, tuple(size), interpolation=interpolation)
807
+ images[i] = image
808
+
809
+ return images
810
+
811
+
812
+ def resize2maxsize(image,
813
+ size=720,
814
+ interpolation=None,
815
+ fixSide=False):
816
+ """
817
+ Constrain the maximum side length of an image
818
+ Args:
819
+ fixSide: force the longest side to equal size even if the image is smaller
820
+ """
821
+ if interpolation is None:
822
+ interpolation = cv2.INTER_CUBIC
823
+ image_out = image.copy()
824
+
825
+ height, width = image.shape[:2]
826
+ scale = max(height, width) / size
827
+ if image_out.dtype == 'bool':
828
+ image_out = np.uint8(image_out)
829
+ height, width = np.uint32(np.array(image.shape[:2]) / scale)
830
+
831
+ if fixSide:
832
+ image_out = cv2.resize(image_out, (width, height),
833
+ interpolation=interpolation)
834
+ else:
835
+ if scale > 1:
836
+ image_out = cv2.resize(image_out, (width, height),
837
+ interpolation=interpolation)
838
+
839
+ if image.dtype == bool:
840
+ image_out = image_out > 0
841
+
842
+ return image_out
843
+
844
+
845
+ def resize2minsize(image, size=256, interpolation=None):
846
+ """
847
+ Constrain the minimum side length of an image
848
+ """
849
+ if size is None:
850
+ return image
851
+
852
+ if interpolation is None:
853
+ interpolation = cv2.INTER_CUBIC
854
+
855
+ height, width = image.shape[:2]
856
+ scale = min(height, width) / size
857
+ image_out = image.copy()
858
+ if image_out.dtype == 'bool':
859
+ image_out = np.uint8(image_out)
860
+
861
+ if scale > 1:
862
+ height, width = np.uint32(np.array(image.shape[:2]) / scale)
863
+ image_out = cv2.resize(image_out, (width, height),
864
+ interpolation=interpolation)
865
+
866
+ if image.dtype == bool:
867
+ image_out = image_out > 0
868
+
869
+ return image_out
870
+
894
+
895
+
896
+ def getimgsizeby(sz, size=960, fix_type='max', stride=1):
897
+ height, width = sz
898
+ if fix_type == 'min':
899
+ scale = min(height, width) / size
900
+ elif fix_type == "max":
901
+ scale = max(height, width) / size
902
+ elif fix_type == 'height':
903
+ scale = height / size
904
+ elif fix_type == 'width':
905
+ scale = width / size
906
+ else:
+ raise ValueError(f"fix_type must be one of 'min', 'max', 'height', 'width', got {fix_type}")
+
907
+ height, width = np.uint32(np.float32(sz) / scale)
908
+ if stride > 1:
909
+ height = round2stride(height, stride)
910
+ width = round2stride(width, stride)
911
+
912
+ return height, width
913
+
914
+
915
+ def resize2fixSize(image, size=960, fix_type='max', interpolation=None):
916
+
917
+ if interpolation is None:
918
+ interpolation = cv2.INTER_CUBIC
919
+
920
+ height, width = getimgsizeby(image.shape[:2], size, fix_type)
921
+ image_out = image.copy()
922
+ if image_out.dtype == 'bool':
923
+ image_out = np.uint8(image_out)
924
+
925
+ image_out = cv2.resize(image_out, (width, height),
926
+ interpolation=interpolation)
927
+
928
+ if image.dtype == bool:
929
+ image_out = image_out > 0
930
+
931
+ return image_out
932
+
933
+ def resize2range(image, max_size=720, min_size=480,
934
+ interpolation=None, stride=None):
935
+ """
936
+ Constrain the longest side to at most max_size and the shortest side to at least min_size
938
+ (the max_size constraint takes priority when both cannot be met)
938
+ """
939
+ if interpolation is None:
940
+ interpolation = cv2.INTER_LINEAR
941
+
942
+ height, width = image.shape[:2]
943
+
944
+ scale_to_max = max_size / max(height, width)
945
+ scale_to_min = min(min_size / min(height, width),
946
+ max_size / max(height, width))
947
+
948
+ image_out = image.copy()
949
+ if scale_to_max < 1:
950
+ height, width = np.uint32(np.array(image.shape[:2]) * scale_to_max)
951
+ if stride is not None:
952
+ height = round2stride(height, stride)
953
+ width = round2stride(width, stride)
954
+
955
+ image_out = cv2.resize(image_out, (width, height),
956
+ interpolation=interpolation)
957
+ return image_out
958
+ else:
959
+ if scale_to_min > 1:
960
+ height, width = np.uint32(np.array(image.shape[:2]) * scale_to_min)
961
+ image_out = cv2.resize(image_out, (width, height),
962
+ interpolation=interpolation)
963
+ return image_out
964
+
965
+ return image_out
966
+
967
+ def resize2maxshape(image, shape,
968
+ interpolation=None,
969
+ with_scale=False,
970
+ mean_value=0):
971
+ """
972
+ shape is the target video shape
973
+ resize an image to target shape by padding zeros
974
+ when the aspect ratio does not match
975
+ """
976
+ def get_start_end(scale_id, height_new, width_new):
977
+ if scale_id == 0:
978
+ s_v, e_v = 0, height_new
979
+ s_h = int((shape[1] - width_new) / 2)
980
+ e_h = s_h + width_new
981
+ else:
982
+ s_v = int((shape[0] - height_new) / 2)
983
+ e_v = s_v + height_new
984
+ s_h, e_h = 0, width_new
985
+ return s_v, e_v, s_h, e_h
986
+
987
+ if interpolation is None:
988
+ interpolation = cv2.INTER_CUBIC
989
+
990
+ shape = list(shape)
991
+ image_shape = shape if image.ndim == 2 else shape + [image.shape[-1]]
992
+ image_out = np.zeros(image_shape) + mean_value
993
+ height, width = image.shape[:2]
994
+ scale_rate = np.array([shape[0] / height, shape[1] / width])
995
+ scale_id = np.argmin(scale_rate)
996
+ scale = scale_rate[scale_id]
997
+ image = cv2.resize(image, (int(width * scale), int(height * scale)),
998
+ interpolation=interpolation)
999
+ height_new, width_new = image.shape[:2]
1000
+ s_v, e_v, s_h, e_h = get_start_end(scale_id, height_new, width_new)
1001
+ image_out[s_v:e_v, s_h:e_h] = image
1002
+ crop = [s_v, s_h, e_v, e_h] # top, left, bottom, right
1003
+
1004
+ if not with_scale:
1005
+ return image_out
1006
+ else:
1007
+ return image_out, scale, crop
1008
+
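A usage sketch for resize2maxshape() with with_scale=True, which also returns the resize scale and the crop box of the valid (non-padded) region; the numbers in the comment are what the code above produces for a 480x640 input padded into a 720x720 canvas.

import numpy as np
from vis_utils import resize2maxshape

image = np.zeros((480, 640, 3), dtype=np.uint8)
padded, scale, crop = resize2maxshape(image, (720, 720), with_scale=True)
print(padded.shape, scale, crop)   # (720, 720, 3) 1.125 [90, 0, 630, 720]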
1009
+
1010
+ def bilinear_interpolation(x, y, points):
1011
+ '''Interpolate (x,y) from values associated with four points.
1012
+
1013
+ The four points are a list of four triplets: (x, y, value).
1014
+ The four points can be in any order. They should form a rectangle.
1015
+
1016
+ >>> bilinear_interpolation(12, 5.5,
1017
+ ... [(10, 4, 100),
1018
+ ... (20, 4, 200),
1019
+ ... (10, 6, 150),
1020
+ ... (20, 6, 300)])
1021
+ 165.0
1022
+
1023
+ '''
1024
+ # See formula at: http://en.wikipedia.org/wiki/Bilinear_interpolation
1025
+
1026
+ points = sorted(points) # order points by x, then by y
1027
+ (x1, y1, q11), (_x1, y2, q12), (x2, _y1, q21), (_x2, _y2, q22) = points
1028
+
1029
+ if x1 != _x1 or x2 != _x2 or y1 != _y1 or y2 != _y2:
1030
+ raise ValueError('points do not form a rectangle')
1031
+ if not x1 <= x <= x2 or not y1 <= y <= y2:
1032
+ raise ValueError('(x, y) not within the rectangle')
1033
+
1034
+ return (q11 * (x2 - x) * (y2 - y) +
1035
+ q21 * (x - x1) * (y2 - y) +
1036
+ q12 * (x2 - x) * (y - y1) +
1037
+ q22 * (x - x1) * (y - y1)
1038
+ ) / ((x2 - x1) * (y2 - y1) + 0.0)
1039
+
1040
+
1041
+ def dump_to_npy(arrays, file_path=None):
1042
+ """
1043
+ dump set of images to array for local visualization
1044
+ arrays: the input arrays
1045
+ file_path: saving path
1046
+ """
1047
+ assert isinstance(arrays, dict)
1048
+ for k, v in arrays.items():
1049
+ np.save(os.path.join(file_path, k + '.npy'), v)
1050
+
1051
+
1052
+ def crop(image, box):
1053
+ """
1054
+ box: t, l, b, r
1055
+ """
1056
+ t, l, b, r = box
1057
+ return image[t:b, l:r]
1058
+
1059
+
1060
+ def padding_image(image_in,
1061
+ image_size,
1062
+ crop=None,
1063
+ interpolation=cv2.INTER_NEAREST,
1064
+ pad_val=0.):
1065
+
1066
+ """Pad image to target image_size based on a given crop
1067
+ """
1068
+ assert isinstance(pad_val, float) | isinstance(pad_val, list)
1069
+
1070
+ if image_size[0] <= image_in.shape[0] and \
1071
+ image_size[1] <= image_in.shape[1]:
1072
+ return image_in
1073
+
1074
+ image = image_in.copy()
1075
+ in_dim = np.ndim(image)
1076
+ if in_dim == 2:
1077
+ image = image[:, :, None]
1078
+
1079
+ if isinstance(pad_val, float):
1080
+ pad_val = [pad_val] * image.shape[-1]
1081
+ assert len(pad_val) == image.shape[-1]
1082
+
1083
+ dim = image.shape[2]
1084
+ image_pad = np.ones(image_size + [dim], dtype=image_in.dtype) * \
1085
+ np.array(pad_val)
1086
+
1087
+ if not (crop is None):
1088
+ h, w = image_size
1089
+ crop_cur = np.uint32([crop[0] * h, crop[1] * w,
1090
+ crop[2] * h, crop[3] * w])
1091
+ image = cv2.resize(
1092
+ image, (crop_cur[3] - crop_cur[1], crop_cur[2] - crop_cur[0]),
1093
+ interpolation=interpolation)
1094
+
1095
+ else:
1096
+ h, w = image_in.shape[:2]
1097
+ # default crop is padding center
1098
+ hp, wp = image_pad.shape[:2]
1099
+ t, l = int((hp - h) / 2), int((wp - w) / 2)
1100
+ crop_cur = [t, l, t + h, l + w]
1101
+
1102
+ image_pad[crop_cur[0]:crop_cur[2], crop_cur[1]:crop_cur[3], :] = image
1103
+
1104
+ if in_dim == 2:
1105
+ image_pad = np.squeeze(image_pad)
1106
+
1107
+ return image_pad
1108
+
1109
+ def enlighting_v2(image, value=30):
1110
+ hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
1111
+ h, s, v = cv2.split(hsv)
1112
+ value = (255 - np.mean(v)) * 0.6
1113
+ value = int(value)
1114
+ lim = 255 - value
1115
+ v[v > lim] = 255
1116
+ v[v <= lim] += value
1117
+ final_hsv = cv2.merge((h, s, v))
1118
+ img = cv2.cvtColor(final_hsv, cv2.COLOR_HSV2BGR)
1119
+ return img
1120
+
1121
+ def enlighting(image):
1122
+ hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
1123
+ h, s, v = cv2.split(hsv)
1124
+ # clahe = cv2.createCLAHE(clipLimit=30, tileGridSize=(8,8))
1125
+ # v = clahe.apply(v)
1126
+
1127
+ v = cv2.equalizeHist(v)
1128
+ # v = cv2.add(v, value)
1129
+ # v[v > 255] = 255
1130
+ # v[v < 0] = 0
1131
+ final_hsv = cv2.merge((h, s, v))
1132
+ img = cv2.cvtColor(final_hsv, cv2.COLOR_HSV2BGR)
1133
+
1134
+ return img
1135
+
1136
+ def white_balance(img):
1137
+ result = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
1138
+ avg_a = np.average(result[:, :, 1])
1139
+ avg_b = np.average(result[:, :, 2])
1140
+ result[:, :, 1] = result[:, :, 1] - ((avg_a - 128) * (result[:, :, 0] / 255.0) * 1.1)
1141
+ result[:, :, 2] = result[:, :, 2] - ((avg_b - 128) * (result[:, :, 0] / 255.0) * 1.1)
1142
+ result = cv2.cvtColor(result, cv2.COLOR_LAB2BGR)
1143
+ return result
1144
+
1145
+
1146
+ def one_hot(label_map, class_num):
1147
+ shape = np.array(label_map.shape)
1148
+ length = np.prod(shape)
1149
+ label_one_hot = np.zeros((length, class_num))
1150
+ label_flat = label_map.flatten()
1151
+ label_one_hot[range(length), label_flat] = 1
1152
+ label_one_hot = label_one_hot.reshape(shape.tolist() + [class_num])
1153
+
1154
+ return label_one_hot
1155
+
1156
+
1157
+ def prob2label(label_prob):
1158
+ """Convert probability to a descrete label map
1159
+ """
1160
+ assert label_prob.ndim == 3
1161
+ return np.argmax(label_prob, axis=2)
1162
+
1163
+ def prob2color(label_prob, color_map, bkg_color=[0, 0, 0]):
1164
+ """
1165
+ label_prob: [0, 1] probability map
1166
+ color_map: 0-255 [[x, x, x], ...] python list
1169
+ """
1170
+ assert isinstance(color_map, list)
1171
+
1172
+ height, width, dim = label_prob.shape
1173
+ color_map = color_map[:(dim - 1)]
1174
+ color_map_mat = np.matrix([bkg_color] + color_map)
1175
+ label_prob_mat = np.matrix(label_prob.reshape((height * width, dim)))
1176
+ label_color = np.array(label_prob_mat * color_map_mat)
1177
+ label_color = label_color.reshape((height, width, -1))
1178
+
1179
+ return np.uint8(label_color)
1180
+
1181
+ def mix_probimage(prob, image, alpha=0.7):
1182
+ """
1183
+ prob: [h, w, dim] or [h, w] uint8
1184
+ """
1185
+ if prob.ndim == 2:
1186
+ prob = prob[:, :, None]
1187
+
1188
+ if prob.dtype == 'uint8':
1189
+ prob = np.float32(prob) / 255.0
1190
+
1191
+ color_map = get_pallete(256)
1192
+ color_map = np.array(color_map).reshape([-1, 3])[1:, :]
1193
+ color_map = color_map.tolist()
1194
+ prob_color = prob2color(prob, color_map)
1195
+ image = resize_like(image, prob)
1196
+ mix_image = (1 - alpha) * image + alpha * prob_color
1197
+ return mix_image
1198
+
1199
+ def label2color(label, color_map=None, bkg_color=[0, 0, 0]):
1200
+ if color_map is None:
1201
+ color_map = np.uint8(np.array(PALETTE) * 255)
1202
+ color_map = color_map.tolist()
1203
+
1204
+ height, width = label.shape[0:2]
1205
+ class_num = len(color_map) + 1
1206
+ label_one_hot = one_hot(label, class_num)
1207
+ label_color = prob2color(label_one_hot, color_map, bkg_color)
1208
+
1209
+ return label_color
1210
+
1211
+ def gif_to_frames(in_path, out_path, max_frame=10000):
1212
+ import imageio
1213
+ gif = imageio.get_reader(in_path, '.gif')
1214
+ # Here's the number you're looking for
1215
+ for frame_id, frame in tqdm(enumerate(gif)):
1216
+ filename = '%s/%04d.png'% (out_path, frame_id)
1217
+ frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
1218
+ cv2.imwrite(filename, frame)
1219
+ if frame_id > max_frame:
1220
+ break
1221
+
1222
+ print('finished')
1223
+
1224
+ def speedx_video(video_in, video_out, speed):
1225
+ import moviepy.editor as me
1226
+ import moviepy
1227
+
1228
+ clip = me.VideoFileClip(video_in)
1229
+ clip = moviepy.video.fx.all.speedx(clip, factor=speed)
1230
+ clip.write_videofile(video_out)
1231
+
1232
+ def resize_boxes(boxes, image_shape):
1233
+ """
1234
+ boxes: n x 4 [t, l, b, r]
1235
+ image_shape: height, width
1236
+ """
1237
+ if len(boxes) == 0:
1238
+ return boxes
1239
+
1240
+ boxes = np.array(boxes)
1241
+ boxes[:, [0, 2]] *= image_shape[0]
1242
+ boxes[:, [1, 3]] *= image_shape[1]
1243
+
1244
+ return boxes
1245
+
1246
+ def lens_blur(img, depth_in, fg_depth,
1247
+ fg_mask=None, NUM_LAYERS = 20):
1248
+
1249
+ def layer_mask(dm, s, e):
1250
+ # copy image dimensions, but fill with zeros
1251
+ m = np.zeros(dm.shape)
1252
+ # set values above start threshold to white
1253
+ m[dm >= s] = 1
1254
+ # set values above end threshold to black
1255
+ m[dm > e] = 0
1256
+ return m
1257
+
1258
+ def to_multi_mask(mask, ch=3):
1259
+ return np.tile(mask[:, :, None] > 0, (1, 1, ch))
1260
+
1261
+ depth = depth_in.copy()
1262
+ out = np.zeros(img.shape)
1263
+
1264
+ min_depth = np.min(np.unique(depth))
1265
+ max_depth = np.max(np.unique(depth))
1266
+
1267
+ min_depth = int(min_depth / max_depth * 255)
1268
+ fg_depth = int(fg_depth / max_depth * 255)
1269
+ depth = np.uint8(depth * 255 / max_depth)
1270
+ s = (255 - min_depth) // NUM_LAYERS
1271
+ layers = np.array(range(min_depth, 255, s))
1272
+
1273
+ for i, a in enumerate(layers[:-1]):
1274
+ if layers[i] < fg_depth and layers[i+1] > fg_depth:
1275
+ fg_depth = layers[i]
1276
+ break
1277
+
1278
+ for a in layers:
1279
+ l_mask = layer_mask(depth, a, a+s)
1280
+ l_mask = to_multi_mask(l_mask)
1281
+ res = blur_filter(img, np.abs(a - fg_depth))
1282
+ out[l_mask] = res[l_mask]
1283
+
1284
+ if fg_mask is not None:
1285
+ fg_mask = np.tile(fg_mask[:, :, None] > 0, (1, 1, 3))
1286
+ out[fg_mask] = img[fg_mask]
1287
+
1288
+ return out
1289
+
1290
+
1291
+ ###############################################
1292
+ ### Filters
1293
+ ###############################################
1294
+
1295
+ # Change blur by epsilon value (a)
1296
+ def blur_filter(img, a):
1297
+ # increase kernel effect slowly, must be odd
1298
+ k = (a // 10) + 1 if (a // 10) % 2 == 0 else (a // 10) + 2
1299
+ # can't exceed 255
1300
+ k = k if k < 255 else 255
1301
+ kernel = (k, k)
1302
+ # blur filter
1303
+ o = cv2.GaussianBlur(img, kernel, 9)
1304
+ return o
1305
+
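+ # Illustrative usage sketch (added): blur_filter grows the Gaussian kernel with
+ # the depth distance `a`, which is how lens_blur makes far layers blurrier.
+ def _example_blur_filter():
+     img = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)
+     near = blur_filter(img, 0)    # 1x1 kernel, essentially unchanged
+     far = blur_filter(img, 100)   # 11x11 kernel, noticeably blurred
+     assert near.shape == far.shape == img.shape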
1306
+ def box_center(box):
1307
+ """
1308
+ box: [t, l, b, r]; returns (x_center, y_center)
1309
+ """
1310
+ return (box[1] + box[3]) // 2, (box[0] + box[2]) // 2
1311
+
1312
+
1313
+ def mean_value(value, mask):
1314
+ """
1315
+ mean of `value` over the pixels where mask is True
1316
+ """
1317
+ if value.ndim == 2:
1318
+ value = value[:, :, None]
1319
+ h, w, dim = value.shape
1320
+ test = value.reshape([-1, dim])
1321
+ mean = np.mean(test[mask.flatten(), :], axis=0)
1322
+ return mean
1323
+
1324
+
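+ # Illustrative usage sketch (added): mean of the values at the masked pixels.
+ def _example_mean_value():
+     value = np.array([[1.0, 2.0], [3.0, 4.0]])
+     mask = np.array([[True, False], [False, True]])
+     print(mean_value(value, mask))  # expected: [2.5]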
1325
+ def is_neighbor_mask(mask0, mask1, min_len=200, kernel=10):
1326
+ # at least 200 pixel connecting edge
1327
+ mask = dilate_mask(mask1, kernel=kernel)
1328
+ intern = np.sum(np.logical_and(mask0 > 0, mask > 0))
1329
+ return intern > min_len * kernel
1330
+
1331
+
1332
+ def get_salient_components(segment_in, th=0.1, min_th=25):
1333
+ """
1334
+
1335
+ :param segment_in: binary (0/1) mask
1336
+ :param th: keep components whose area is at least th * area of segment_in
+ :param min_th: minimum absolute pixel area for a component to be kept
1337
+ :return: list of boolean masks, one per kept connected component
1338
+ """
1339
+
1340
+ segment = segment_in.copy()
1341
+ area_org = np.sum(segment)
1342
+ segment = np.uint8(segment_in * 255)
1343
+ ret, labels = cv2.connectedComponents(segment)
1344
+ if ret == 2:
1345
+ return [segment_in]
1346
+
1347
+ masks = []
1348
+ for i in range(1, ret):
1349
+ mask = labels == i
1350
+ area = np.sum(mask)
1351
+ if area < area_org * th :
1352
+ continue
1353
+ if area < min_th:
1354
+ continue
1355
+ masks.append(mask)
1356
+
1357
+ return masks
1358
+
1359
+
1360
+ def get_component(segment, criteria='max'):
1361
+ """ find the largest connected component mask
1362
+ """
1363
+ ret, labels = cv2.connectedComponents(segment)
1364
+ if ret == 2:
1365
+ return segment
1366
+
1367
+ max_area = 0
1368
+ idx = 1
1369
+ for i in range(1, ret):
1370
+ area = np.sum(labels == i)
1371
+ if area > max_area:
1372
+ max_area = area
1373
+ idx = i
1374
+
1375
+ return np.uint8(255 * (labels == idx))
1376
+
1377
+
1378
+ def find_largest_mask(segment, ignore_ids=None):
1379
+ """ find the largest mask inside component
1380
+ """
1381
+ if ignore_ids is None:
1382
+ ignore_ids = []
1383
+
1384
+ ids = np.unique(segment)
1385
+ max_area = 0
1386
+ idx = 1
1387
+ for i in ids:
1388
+ if i in ignore_ids:
1389
+ continue
1390
+
1391
+ area = np.sum(segment == i)
1392
+ if area > max_area:
1393
+ max_area = area
1394
+ idx = i
1395
+
1396
+ return idx, segment == idx
1397
+
1398
+
1399
+ def find_center_mask(segment, ignore_ids, box = None):
1400
+ h, w = segment.shape
1401
+
1402
+ if box is None:
1403
+ box = [int(h / 4),
1404
+ int(w / 4),
1405
+ int(h * 3 / 4),
1406
+ int(w * 3 / 4)]
1407
+
1408
+ idx, _ = find_largest_mask(
1409
+ segment[box[0]:box[2], box[1]:box[3]], ignore_ids)
1410
+
1411
+ return idx, segment == idx
1412
+
1413
+
1414
+
1415
+ def get_largest_component(segment_in, criteria='max'):
1416
+ segment = segment_in.copy()
1417
+ thresh = 0.3
1418
+
1419
+ segment = np.uint8(255 * (np.float32(segment) / 255.0 > thresh))
1420
+ ret, labels = cv2.connectedComponents(segment)
1421
+ if ret == 2:
1422
+ return segment_in
1423
+
1424
+ max_area = 0
1425
+ idx = 1
1426
+ for i in range(1, ret):
1427
+ area = np.sum(labels == i)
1428
+ if area > max_area:
1429
+ max_area = area
1430
+ idx = i
1431
+
1432
+ mask = dilate_mask(np.uint8(labels == idx))
1433
+ segment = segment_in * mask
1434
+
1435
+ return np.uint8(segment)
1436
+
1437
+
1438
+ def fillholes(mask):
1439
+ """
1440
+ binary mask
1441
+ """
1442
+ des = np.uint8(mask > 0) * 255
1443
+ contour, hier = cv2.findContours(des,cv2.RETR_CCOMP,cv2.CHAIN_APPROX_SIMPLE)
1444
+ # des = cv2.merge([des, des, des])
1445
+ # cv2.drawContours(des, contour, -1, (0, 255, 0), 3)
1446
+ for i, cnt in enumerate(contour):
1447
+ cv2.drawContours(des, [cnt], -1, 255, -1)
1448
+ # mask = des == 0
1449
+ return des > 0
1450
+
1451
+ def video_to_frames(in_path, out_path, max_frame=100000):
1452
+ """separate video to frames
1453
+ """
1454
+ print("saving videos to frames at {}".format(out_path))
1455
+ cap = cv2.VideoCapture(in_path)
1456
+ frame_id = 0
1457
+ mkdir_if_need(out_path)
1458
+
1459
+ # cv2.namedWindow("video")
1460
+ while(cap.isOpened()):
1461
+ ret, frame = cap.read()
1462
+ if not ret:
1463
+ break
1464
+ filename = out_path + '/%04d.jpg' % frame_id
1465
+ cv2.imwrite(filename, frame)
1466
+
1467
+ frame_id += 1
1468
+ if frame_id > max_frame:
1469
+ break
1470
+
1471
+ cap.release()
1472
+ print("finished")
1473
+
1474
+
1475
+ def resize_video(in_path, out_path, sz, max_frame=10000):
1476
+ """resize a video and write the result to out_path
1477
+ Args:
1478
+ sz: height, width of new video
1479
+ """
1480
+ from moviepy.editor import ImageSequenceClip, VideoFileClip
1481
+ print("resizing video, writing result to {}".format(out_path))
1482
+ new_height, new_width = sz
1483
+ assert os.path.exists(in_path), f"must exist {in_path}"
1484
+ cap = cv2.VideoCapture(in_path)
1485
+ fps = cap.get(cv2.CAP_PROP_FPS)
1486
+
1487
+ progress_bar = tqdm(total=max_frame)
1488
+ progress_bar.set_description('Progress')
1489
+ frame_id = 0
1490
+ frames = []
1491
+ while(cap.isOpened()):
1492
+ ret, frame = cap.read()
1493
+ if not ret:
1494
+ break
1495
+ frame = cv2.resize(frame, (new_width, new_height))
1496
+ frames.append(frame[:, :, ::-1])
1497
+ frame_id += 1
1498
+ progress_bar.update(1)
1499
+ if frame_id > max_frame:
1500
+ break
1501
+
1502
+ clip = ImageSequenceClip(frames, fps)
1503
+ clip.write_videofile(out_path, fps=fps)
1504
+ cap.release()
1505
+ print("finished")
1506
+
1507
+ def frame_to_video_simple(frames,
1508
+ fps=10,
1509
+ video_name='video.avi',
1510
+ reader=cv2.IMREAD_UNCHANGED):
1511
+ """
1512
+ Combine frames to video
1513
+ frames: list of image file paths or already-loaded image arrays
1514
+ """
1515
+ import sys
1516
+ if video_name.endswith('.avi'):
1517
+ fourcc = cv2.VideoWriter_fourcc(*'XVID')
1518
+ elif video_name.endswith('.mp4'):
1519
+ fourcc = cv2.VideoWriter_fourcc(*'MP4V')
1520
+ is_str = False
1521
+ if isinstance(frames[0], str):
1522
+ frame = cv2.imread(frames[0], cv2.IMREAD_UNCHANGED)
1523
+ is_str = True
1524
+ else:
1525
+ frame = frames[0]
1526
+ sz = frame.shape[:2]
1527
+
1528
+ video = cv2.VideoWriter(video_name, fourcc, fps, (sz[1], sz[0]))
1529
+ for i, frame in enumerate(tqdm(frames)):
1530
+ sys.stdout.write('\r>>process %04d / %04d' % (i, len(frames)))
1531
+ sys.stdout.flush()
1532
+ if is_str:
1533
+ frame = cv2.imread(frame, reader)
1534
+ video.write(frame)
1535
+
1536
+ cv2.destroyAllWindows()
1537
+ video.release()
1538
+ print('save to %s' % video_name)
1539
+
1540
+
1541
+ def frame_to_video(image_path,
1542
+ label_path,
1543
+ frame_list,
1544
+ label_ext='',
1545
+ label_map_is_color=False,
1546
+ color_map=None,
1547
+ sz=None,
1548
+ fps=10,
1549
+ alpha=0.5,
1550
+ video_name='video.avi',
1551
+ exts=["jpg", "png"],
1552
+ is_probability=False):
1553
+ """
1554
+ Combine frames to video to visualize image & label image
1555
+ image_path: path of images
1556
+ exts: [image_ext, label_ext]; the 1st is the image extension, the 2nd the label extension
1557
+ """
1558
+ def to_color_map(label):
1559
+ assert color_map is not None
1560
+ bkg = [255, 255, 255]
1561
+ if is_probability:
1562
+ if label.ndim == 2:
1563
+ label = np.float32(label) / 255
1564
+ label = np.concatenate(
1565
+ [1 - label[:, :, None],
1566
+ label[:, :, None]], axis=2)
1567
+ label = prob2color(label, color_map, bkg_color=bkg)
1568
+ else:
1569
+ label[label > len(color_map)] = 0
1570
+ label = label2color(label, color_map, bkg)
1571
+ return label[:, :, ::-1]
1572
+
1573
+ import sys
1574
+ ext_image, ext_label = exts
1575
+ if sz is None:
1576
+ label = cv2.imread(f"{label_path}/{frame_list[0]}.{ext_label}", cv2.IMREAD_UNCHANGED)
1577
+ sz = label.shape[:2]
1578
+ if video_name.endswith('.avi'):
1579
+ fourcc = cv2.VideoWriter_fourcc(*'XVID')
1580
+ elif video_name.endswith('.mp4'):
1581
+ fourcc = cv2.VideoWriter_fourcc(*'MP4V')
1582
+
1583
+ video = cv2.VideoWriter(video_name, fourcc, fps, (sz[1], sz[0]))
1584
+ for i, image_name in enumerate(frame_list):
1585
+ sys.stdout.write('\r>>process %04d / %04d' % (i, len(frame_list)))
1586
+ sys.stdout.flush()
1587
+
1588
+ image = cv2.resize(
1589
+ cv2.imread(f"{image_path}/{image_name}.jpg", cv2.IMREAD_COLOR),
1590
+ (sz[1], sz[0]))
1591
+ label_name = image_name + label_ext
1592
+ label = cv2.resize(cv2.imread(f"{label_path}/{label_name}.{ext_label}",
1593
+ cv2.IMREAD_UNCHANGED),
1594
+ (sz[1], sz[0]), interpolation=cv2.INTER_NEAREST)
1595
+
1596
+ if not label_map_is_color:
1597
+ label = to_color_map(label)
1598
+
1599
+ frame = np.uint8(image * alpha + label * (1 - alpha))
1600
+ video.write(frame)
1601
+
1602
+ cv2.destroyAllWindows()
1603
+ video.release()
1604
+ print('save to %s' % video_name)
1605
+
1606
+
1607
+ def video_to_frame(video_path,
1608
+ image_folder_path=None,
1609
+ sample_rate=1,
1610
+ max_len=None,
1611
+ holder=None,
1612
+ ext="jpg"):
1613
+ """
1614
+ holder: the holder of image list
1615
+ """
1616
+ if image_folder_path is not None:
1617
+ mkdir_if_need(image_folder_path)
1618
+
1619
+ if video_path.split('.')[-1] == 'gif':
1620
+ gif_to_frames(video_path, image_folder_path)
1621
+ return
1622
+
1623
+ vidcap = cv2.VideoCapture(video_path)
1624
+ success, image = vidcap.read()
1625
+ assert success, video_path
1626
+ sz = image.shape[:2]
1627
+ count = 0
1628
+ while success:
1629
+ if count % sample_rate == 0:
1630
+ image_path = f'{image_folder_path}/{count:04}.{ext}'
1631
+ if max_len is not None:
1632
+ image = resize2maxsize(image, max_len)
1633
+ # height, width = image.shape[:2]
1634
+ # length = int(height / 2)
1635
+ # image = image[:length, :, :]
1636
+
1637
+ if image_folder_path is not None:
1638
+ cv2.imwrite(image_path, image) # save frame as JPEG file
1639
+ if holder is not None:
1640
+ holder.append(image)
1641
+
1642
+ success, image = vidcap.read()
1643
+ count += 1
1644
+
1645
+ print('success split %s' % video_path)
1646
+
1647
+ fps = vidcap.get(cv2.CAP_PROP_FPS)
1648
+
1649
+ return fps, sz
1650
+
1651
+ def box_intersect(box0, box1):
1652
+ # top, left, bottom, right
1653
+ box = [max(box0[0], box1[0]), max(box0[1], box1[1]),
1654
+ min(box0[2], box1[2]), min(box0[3], box1[3])]
1655
+
1656
+ return box
1657
+
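+ # Illustrative usage sketch (added): intersection of two [t, l, b, r] boxes.
+ def _example_box_intersect():
+     print(box_intersect([0, 0, 10, 10], [5, 5, 20, 20]))  # expected: [5, 5, 10, 10]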
1658
+ def timefunc(f):
1659
+ def f_timer(*args, **kwargs):
1660
+ start = time.time()
1661
+ result = f(*args, **kwargs)
1662
+ end = time.time()
1663
+ logger.debug('%s took %.4f seconds',
1664
+ f.__name__, end - start)
1665
+ return result
1666
+ return f_timer
1667
+
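+ # Illustrative usage sketch (added): timefunc is meant to be used as a
+ # decorator; `_slow_add` is a hypothetical function, and the timing message
+ # only shows up if `logger` is configured at DEBUG level.
+ @timefunc
+ def _slow_add(a, b):
+     time.sleep(0.01)
+     return a + b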
1668
+ def test_one_hot():
1669
+ label = np.array([[1, 2], [3, 4]])
1670
+ label_one_hot = one_hot(label, 5)
1671
+ print(label_one_hot)
1672
+
1673
+ def test_resize2range():
1674
+ test = np.ones([100, 200])
1675
+ test2 = resize2range(test, 200, 50)
1676
+ print(test2.shape)
1677
+
1678
+ def test_prob2image():
1679
+ test = np.random.random_sample((3, 10, 10))
1680
+ dump_prob2image('test', test)
1681
+ res = load_image2prob('test')
1682
+ np.testing.assert_allclose(test, res, rtol=0.5, atol=1e-02)
1683
+
1684
+ def shape_match(images):
1685
+ assert len(images) > 1
1686
+ shape = images[0].shape[:2]
1687
+ for image in images[1:]:
1688
+ cur_shape = image.shape[:2]
1689
+ if np.sum(np.abs(np.array(shape) - \
1690
+ np.array(cur_shape))):
1691
+ return False
1692
+
1693
+ return True
1694
+
1695
+ def append_apex(filename, appex):
1696
+ filename = filename.split('.')
1697
+ prefix = '.'.join(filename[:-1])
1698
+ filetype = filename[-1]
1699
+ return '%s_%s.%s' % (prefix, appex, filetype)
1700
+
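+ # Illustrative usage sketch (added): insert a suffix before the extension.
+ def _example_append_apex():
+     assert append_apex('frame_0001.png', 'mask') == 'frame_0001_mask.png'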
1701
+ def get_obj_center(mask, th=0):
1702
+ """
1703
+ mask: 2-D array; pixels with value > th belong to the object
1704
+ """
1705
+ y, x = np.where(mask > th)
1706
+ if len(y) == 0:
1707
+ return -1 , -1
1708
+ x, y = np.mean(x), np.mean(y)
1709
+ return int(x), int(y)
1710
+
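+ # Illustrative usage sketch (added): centroid (x, y) of the non-zero pixels.
+ def _example_get_obj_center():
+     mask = np.zeros((5, 5))
+     mask[2, 3] = 1
+     print(get_obj_center(mask))  # expected: (3, 2)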
1711
+ def poly_area(poly):
1712
+ """
1713
+ Args:
1714
+ poly: [n x 2] np.array [x, y]
1715
+ """
1716
+ return PolyArea(poly[:, 0], poly[:, 1])
1717
+
1718
+ def PolyArea(x, y):
1719
+ return 0.5*np.abs(np.dot(x, np.roll(y, 1))-np.dot(y, np.roll(x,1)))
1720
+
1721
+
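+ # Illustrative usage sketch (added): PolyArea is the shoelace formula, so a
+ # unit square gives an area of 1.
+ def _example_poly_area():
+     square = np.array([[0, 0], [1, 0], [1, 1], [0, 1]])
+     assert abs(poly_area(square) - 1.0) < 1e-9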
1722
+ def rect_size(rect):
1723
+ return np.linalg.norm(rect[0, :] - rect[2, :])
1724
+
1725
+ def avg_size(rects, option='median'):
1726
+ sizes = np.zeros(len(rects))
1727
+ for i, rect in enumerate(rects):
1728
+ sizes[i] = rect_size(rect)
1729
+ if option == 'median':
1730
+ return np.median(sizes)
1731
+ if option == 'mean':
1732
+ return np.mean(sizes)
1733
+
1734
+ return None
1735
+
1736
+ def poly_ratio(rect, type='min'):
1737
+
1738
+ if type == 'avg':
1739
+ l1 = np.linalg.norm(rect[0, :] - rect[1, :])
1740
+ l2 = np.linalg.norm(rect[1, :] - rect[2, :])
1741
+ l3 = np.linalg.norm(rect[2, :] - rect[3, :])
1742
+ l4 = np.linalg.norm(rect[3, :] - rect[0, :])
1743
+ return (l1 + l3) / (l2 + l4)
1744
+
1745
+ ratio = 0
1746
+ for i in range(4):
1747
+ s = i
1748
+ t = (i + 1) % 4
1749
+ e = (i + 2) % 4
1750
+ l1 = np.linalg.norm(rect[s, :] - rect[t, :])
1751
+ l2 = np.linalg.norm(rect[t, :] - rect[e, :])
1752
+ cur_ratio = max(l1 / (l2 + 1e-10), l2 / (l1 + 1e-10))
1753
+ if cur_ratio > ratio:
1754
+ ratio = cur_ratio
1755
+
1756
+ return ratio
1757
+
1758
+
1759
+ def rect_ratio(rect):
1760
+ """ x / y
1761
+
1762
+ :param rect:
1763
+ :return:
1764
+ """
1765
+ x_diff = np.max(rect[:, 0]) - np.min(rect[:, 0])
1766
+ y_diff = np.max(rect[:, 1]) - np.min(rect[:, 1])
1767
+
1768
+ return max(x_diff / y_diff, y_diff / x_diff)
1769
+
1770
+
1771
+ def rect_in_size(rect, image_sz, num_th=4):
1772
+ """rectangle inside image
1773
+ """
1774
+
1775
+ h, w = image_sz
1776
+
1777
+ def pt_in_size(pt):
1778
+ return 0 <= pt[0] < w and 0 <= pt[1] < h
1779
+
1780
+ valid = [False for i in range(rect.shape[0])]
1781
+ for i, pt in enumerate(rect):
1782
+ if pt_in_size(pt):
1783
+ valid[i] = True
1784
+
1785
+ return np.sum(valid) >= num_th
1786
+
1787
+
1788
+ def valid_rect(rect):
1789
+ l, r, t, b = rect
1790
+
1791
+ return l < r and t < b
1792
+
1793
+
1794
+ def compute_normal_deg_absvar(normal, mask):
1795
+ normal_cur = normal * mask[:, :, None]
1796
+ mean_normal = np.sum(normal_cur, axis=(0, 1)) / np.sum(mask)
1797
+ inner = np.sum(mean_normal[None, None, :] * normal_cur, axis=2)
1798
+ s = np.clip(np.abs(inner), 0, 1)
1799
+ diff = np.rad2deg(np.arccos(s))
1800
+ var = np.sum(diff * mask) / np.sum(mask)
1801
+
1802
+ return var
1803
+
1804
+
1805
+ def compute_ignore_mask(x, ignore_value=None):
1806
+ mask = 1
1807
+ if ignore_value is None:
1808
+ return mask
1809
+
1810
+ dim = x.ndim
1811
+ if x.ndim == 2:
1812
+ x = x[:, :, None]
1813
+
1814
+ if not isinstance(ignore_value, list):
1815
+ ignore_value = [ignore_value] * x.shape[-1]
1816
+
1817
+ for i, value in enumerate(ignore_value):
1818
+ cur_mask = x[:, :, i] == value
1819
+ mask = mask * cur_mask
1820
+
1821
+ if dim == 2:
1822
+ x = x.squeeze(-1)
1823
+
1824
+ return mask
1825
+
1826
+
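+ # Illustrative usage sketch (added): the returned mask is 1 where the pixel
+ # equals the ignore value and 0 elsewhere.
+ def _example_compute_ignore_mask():
+     x = np.array([[255, 0], [255, 3]])
+     print(compute_ignore_mask(x, ignore_value=255))  # expected: [[1 0] [1 0]]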
1827
+ def weight_reduce(res, weights):
1828
+ """
1829
+
1830
+ """
1831
+ dim = res[0].ndim
1832
+ result = 0
1833
+ weight_all = 0
1834
+ for i, x in enumerate(res):
1835
+ if dim == 2:
1836
+ x = x[:, :, None]
1837
+
1838
+ weight = weights[i]
1839
+ result = result + (x * weight[:, :, None])
1840
+ weight_all = weight_all + weight
1841
+
1842
+ result = result / np.maximum(weight_all, 1e-6)[:, :, None]
1843
+ if dim == 2:
1844
+ result = result.squeeze(-1)
1845
+ return result
1846
+
1847
+
1848
+ def mask_assign(x, mask, target):
1849
+ dim = x.ndim
1850
+
1851
+ if dim == 2:
1852
+ x = x[:, :, None]
1853
+
1854
+ for i in range(x.shape[-1]):
1855
+ cache = x[:, :, i]
1856
+ cache_tgt = target[:, :, i]
1857
+ cache[mask] = cache_tgt[mask]
1858
+ x[:, :, i] = cache
1859
+
1860
+ if dim == 2:
1861
+ x = x.squeeze(-1)
1862
+
1863
+ return x
1864
+
1865
+
1866
+ def overlap_poly(poly0, poly1, mask=None):
1867
+ sz = None
1868
+ if mask is None:
1869
+ h = max(np.max(poly0[:, 1]), np.max(poly1[:, 1]))
1870
+ w = max(np.max(poly0[:, 0]), np.max(poly1[:, 0]))
1871
+ sz = [h + 1, w + 1]
1872
+ else:
1873
+ sz = mask.shape[:2]
1874
+
1875
+ vis_map0 = np.zeros(sz)
1876
+ cv2.fillPoly(vis_map0,
1877
+ pts=[np.int0(poly0)],
1878
+ color=(1,))
1879
+ vis_map1 = np.zeros(sz)
1880
+ cv2.fillPoly(vis_map1,
1881
+ pts=[np.int0(poly1)],
1882
+ color=(1,))
1883
+ inter_area = np.sum(vis_map0 * vis_map1)
1884
+ return inter_area, inter_area / np.sum(vis_map0), inter_area / np.sum(vis_map1)
1885
+
1886
+ def overlap_rect_mask(rect, mask):
1887
+ """
1888
+ ratio that mask is in rectangle
1889
+ """
1890
+ vis_map = np.zeros(mask.shape)
1891
+ cv2.fillPoly(vis_map,
1892
+ pts=[np.int0(rect)],
1893
+ color=(1,))
1894
+ overlap = np.sum(np.int32(mask > 0) *
1895
+ np.int32(vis_map > 0))
1896
+ ratio = overlap / np.sum(vis_map > 0)
1897
+ return ratio
1898
+
1899
+
1900
+ def pt_in_poly(pt, poly):
1901
+ """
1902
+ poly: list of pt
1903
+ """
1904
+ from shapely.geometry import Point
1905
+ from shapely.geometry.polygon import Polygon
1906
+
1907
+ point = Point(pt[0], pt[1])
1908
+ polygon = Polygon(poly)
1909
+ return polygon.contains(point)
1910
+
1911
+
1912
+ def pt_in_poly_w_mask(pt, poly, sz, margin=None):
1913
+ """
1914
+ margin: ratio of area for expand
1915
+ """
1916
+ mask = np.zeros(np.int0(sz))
1917
+ cv2.fillPoly(mask,
1918
+ pts=[np.int0(poly)],
1919
+ color=(255,))
1920
+
1921
+ if margin is not None:
1922
+ rectArea = PolyArea(poly[:, 0], poly[:, 1])
1923
+ pixel = np.int0(margin * np.sqrt(rectArea))
1924
+ mask = dilate_mask(mask, pixel)
1925
+ pt = np.int0(pt)
1926
+ return mask[pt[1], pt[0]] > 0
1927
+
1928
+
1929
+ def is_overlap(r_cur, r_over, ths=None):
1930
+ """ whether two rects are overlapping
1931
+ r_cur: [l, r, t, b]
1932
+ """
1933
+ if ths is None:
1934
+ ths = [0, 0]
1935
+
1936
+ w_th, h_th = ths
1937
+ l, r, t, b = r_cur
1938
+ l0, r0, t0, b0 = r_over
1939
+
1940
+ if l >= (r0 + w_th) or r <= (l0 - w_th):
1941
+ return False
1942
+
1943
+ if b <= (t0 - h_th) or t >= (b0 + h_th):
1944
+ return False
1945
+
1946
+ return True
1947
+
1948
+
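+ # Illustrative usage sketch (added): rectangles are given as [l, r, t, b].
+ def _example_is_overlap():
+     assert is_overlap([0, 2, 0, 2], [1, 3, 1, 3]) is True
+     assert is_overlap([0, 1, 0, 1], [2, 3, 2, 3]) is False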
1949
+ def rect_from_poly(poly):
1950
+ min_x, max_x = np.min(poly[:, 0]), np.max(poly[:, 0])
1951
+ min_y, max_y = np.min(poly[:, 1]), np.max(poly[:, 1])
1952
+
1953
+ return min_x, max_x, min_y, max_y
1954
+
1955
+
1956
+ def rotate_image_if_needed(image):
1957
+ from PIL import Image, ExifTags
1958
+
1959
+ if hasattr(image, '_getexif'): # only present in JPEGs
1960
+ for orientation in ExifTags.TAGS.keys():
1961
+ if ExifTags.TAGS[orientation]=='Orientation':
1962
+ break
1963
+ e = image._getexif() # returns None if no EXIF data
1964
+ if e is not None:
1965
+ exif=dict(e.items())
1966
+ if orientation in exif:
1967
+ orientation = exif[orientation]
1968
+ if orientation == 3: image = image.transpose(Image.ROTATE_180)
1969
+ elif orientation == 6: image = image.transpose(Image.ROTATE_270)
1970
+ elif orientation == 8: image = image.transpose(Image.ROTATE_90)
1971
+ return image
1972
+
1973
+
1974
+ def is_night_scene(image, prob_map, sky_prob_threshold=200, brightness_threshold=100):
1975
+ """
1976
+ Return True if it's a night scene image
1977
+ image: original image
1978
+ prob_map: the probability map of image segmentation (red: sky; green: building; blue: background, value from 0 to 255)
1979
+ sky_prob_threshold: pixel val > sky_prob_threshold will be segmented as sky
1980
+ brightness_threshold: val < brightness_threshold will be considered as night scene
1981
+ """
1982
+ image = rotate_image_if_needed(image)
1983
+ image = np.array(image.convert('L'))
1984
+ sky, building, background = prob_map.split()
1985
+ # calculate average brightness of the sky:
1986
+ sky_mask = np.array(sky)
1987
+ sky_brightness = (sky_mask > sky_prob_threshold) * image
1988
+ if (np.count_nonzero(sky_brightness) == 0):
1989
+ return False
1990
+ else:
1991
+ avg_sky_brightness = sky_brightness[np.nonzero(sky_brightness)].mean()
1992
+ return avg_sky_brightness < brightness_threshold
1993
+
1994
+ def detect_lines(img,
1995
+ fg_mask=None,
1996
+ length_thresh=None):
1997
+ """
1998
+ Detects lines using OpenCV LSD Detector
1999
+ Return:
2000
+ n x 4 line start, line end
2001
+ """
2002
+ # Convert to grayscale if required
2003
+ if len(img.shape) == 3:
2004
+ img_copy = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
2005
+ else:
2006
+ img_copy = img
2007
+
2008
+ h, w = img.shape[:2]
2009
+ if length_thresh is None:
2010
+ length_thresh = int(max(h, w) * 0.04)
2011
+
2012
+ # Create LSD detector with default parameters
2013
+ lsd = cv2.createLineSegmentDetector(0)
2014
+
2015
+ # Detect lines in the image
2016
+ # Returns a NumPy array of type N x 1 x 4 of float32
2017
+ # such that the 4 numbers in the last dimension are (x1, y1, x2, y2)
2018
+ # These denote the start and end positions of a line
2019
+ lines = lsd.detect(img_copy)[0]
2020
+ # Remove singleton dimension
2021
+ lines = lines[:, 0]
2022
+
2023
+ # Filter out the lines whose length is lower than the threshold
2024
+ dx = lines[:, 2] - lines[:, 0]
2025
+ dy = lines[:, 3] - lines[:, 1]
2026
+ lengths = np.sqrt(dx * dx + dy * dy)
2027
+ mask = lengths >= length_thresh
2028
+ lines = lines[mask]
2029
+
2030
+ # todo remove lines at boundary
2031
+ if fg_mask is not None:
2032
+ fg_mask = cv2.distanceTransform(fg_mask, distanceType=cv2.DIST_C, maskSize=5).astype(np.float32)
2033
+ select_id = np.ones((len(lines),))
2034
+ for ind, l in enumerate(lines):
2035
+ ll = np.int0(l)
2036
+ dist = (fg_mask[ll[1], ll[0]] + fg_mask[ll[3], ll[2]]) * 0.5
2037
+ if dist < 8:
2038
+ select_id[ind] = 0
2039
+
2040
+ lines = lines[select_id > 0]
2041
+
2042
+ return lines
2043
+
2044
+
2045
+ def get_a_key(dict_data: Dict[str, Any]):
2046
+ """
2047
+ Get first iterated key value from a dictionary.
2048
+
2049
+ Args:
2050
+ dict_data (Dict[str, Any]): dict with string keys.
2051
+
2052
+ Returns:
2053
+ Optional[str]: str key if non-empty, else None.
2054
+ """
2055
+
2056
+ if dict_data:
2057
+ key = next(iter(dict_data))
2058
+ return key
2059
+ else:
2060
+ return None
2061
+
2062
+
2063
+ def shift_to_center(image, mask, shape=None):
2064
+ """
2065
+ shift image object to center at mask center
2066
+ """
2067
+ if shape is None:
2068
+ shape = image.shape[:2]
2069
+ assert mask.shape[0] == shape[0]
2070
+ cy, cx = shape[0] // 2, shape[1] // 2
2071
+
2072
+ positions = np.nonzero(mask)
2073
+ top = positions[0].min()
2074
+ bottom = positions[0].max()
2075
+ left = positions[1].min()
2076
+ right = positions[1].max()
2077
+
2078
+ new_l = cx - (right - left) // 2
2079
+ new_r = new_l + right - left
2080
+ new_top = cy - (bottom - top) // 2
2081
+ new_bottom = new_top + bottom - top
2082
+
2083
+ new_im = np.zeros(image.shape)
2084
+ new_im[new_top:new_bottom, new_l:new_r, :] = \
2085
+ image[top:bottom, left:right, :]
2086
+
2087
+ return new_im
2088
+
2089
+
2090
+ def ndarray_to_list(in_dict: dict):
2091
+ for key, item in in_dict.items():
2092
+ if isinstance(item, np.ndarray):
2093
+ in_dict[key] = item.tolist()
2094
+ if isinstance(item, dict):
2095
+ in_dict[key] = ndarray_to_list(item)
2096
+
2097
+ return in_dict
2098
+
2099
+ """
2100
+ encode image to string and decode it back
2101
+ """
2102
+ def encode_b64(mat, format='.png'):
2103
+ mat = cv2.imencode(format, mat)[1]
2104
+ return base64.b64encode(mat).decode('utf-8')
2105
+
2106
+ def decode64(string):
2107
+ jpg_original = base64.b64decode(string)
2108
+ jpg_as_np = np.frombuffer(jpg_original, dtype=np.uint8)
2109
+ img = cv2.imdecode(jpg_as_np, cv2.IMREAD_UNCHANGED)
2110
+ return img
2111
+
2112
+
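+ # Illustrative usage sketch (added): PNG encoding is lossless, so encoding an
+ # image to base64 and decoding it back reproduces the array exactly. Assumes
+ # base64 is imported at module level.
+ def _example_b64_roundtrip():
+     mat = np.random.randint(0, 255, (8, 8, 3), dtype=np.uint8)
+     assert np.array_equal(decode64(encode_b64(mat)), mat)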
2113
+ def remap_texture(triangle1, triangle2, texture):
2114
+ import numpy as np
2115
+ import cv2
2116
+
2117
+ # Convert input triangles to numpy arrays
2118
+ tri1 = np.array(triangle1, dtype=np.float32)
2119
+ tri2 = np.array(triangle2, dtype=np.float32)
2120
+
2121
+ # Find the bounding rectangle of each triangle
2122
+ rect1 = cv2.boundingRect(tri1)
2123
+ rect2 = cv2.boundingRect(tri2)
2124
+
2125
+ # Offset points by left top corner of the respective rectangles
2126
+ tri1_rect = np.float32(tri1 - rect1[:2])
2127
+ tri2_rect = np.float32(tri2 - rect2[:2])
2128
+
2129
+ # Apply the affine transformation to map the texture from triangle1 to triangle2
2130
+ warp_mat = cv2.getAffineTransform(tri1_rect, tri2_rect)
2131
+ warped_texture = cv2.warpAffine(texture, warp_mat, (rect2[2], rect2[3]))
2132
+
2133
+ # Create a mask for the destination triangle
2134
+ mask = np.zeros((rect2[3], rect2[2], 3), dtype=np.uint8)
2135
+ cv2.fillConvexPoly(mask, np.int32(tri2_rect), (1.0, 1.0, 1.0), 16, 0)
2136
+
2137
+ # Apply the mask to the warped texture
2138
+ remapped_texture = warped_texture * mask
2139
+
2140
+ return remapped_texture, mask
2141
+
2142
+
2143
+ def fuse_rgb_mask(image, mask):
2144
+ """
2145
+ image: h, w, [3,4] rgb or rgba image
2146
+ mask: h, w, [1,3] mask
2147
+ """
2148
+ if isinstance(image, str):
2149
+ image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
2150
+
2151
+ if isinstance(mask, str):
2152
+ mask = cv2.imread(mask, cv2.IMREAD_UNCHANGED)
2153
+
2154
+ if not shape_match([image, mask]):
2155
+ image = cv2.resize(image, (mask.shape[1], mask.shape[0]))
2156
+
2157
+ if image.shape[-1] == 4:
2158
+ image = image[:, :, :3]
2159
+
2160
+ if mask.shape[-1] == 3:
2161
+ mask = mask[:, :, 0]
2162
+
2163
+ mask = mask[:, :, None]
2164
+ if mask.max() == 1:
2165
+ mask = mask * 255
2166
+
2167
+ return np.concatenate([image, mask], axis=2)
2168
+
2169
+ def test_remap_texture():
2170
+ # Define test input values
2171
+ triangle1 = [(0, 0), (50, 0), (0, 50)]
2172
+ triangle2 = [(0, 0), (100, 0), (0, 100)]
2173
+ texture = np.ones((50, 50, 3), dtype=np.uint8) * 255
2174
+
2175
+ # Call the remap_texture function with the test input values
2176
+ remapped_texture, mask = remap_texture(triangle1, triangle2, texture)
2177
+ # Check if the output is as expected
2178
+ assert remapped_texture.shape == mask.shape, "Remapped texture shape is incorrect"
2179
+ assert np.all(remapped_texture[:50, :50] == texture), "Texture not correctly remapped in the destination triangle"
2180
+
2181
+ # Print a success message if the test passes
2182
+ print("Test passed: remap_texture function works as expected")
2183
+
2184
+ def test_line_seg_cross():
2185
+
2186
+ seg1 = np.array([[0, 0], [1, 1]])
2187
+ seg2 = np.array([[1, 0], [0, 1]])
2188
+ print(line_segment_cross(seg1, seg2))
2189
+
2190
+ seg1 = np.array([[0, 0], [1, 1]])
2191
+ seg2 = np.array([[1, 0], [1.5, 2]])
2192
+ print(line_segment_cross(seg1, seg2))
2193
+
2194
+
2195
+ if __name__ == '__main__':
2196
+ # test_one_hot()
2197
+ # test_resize2range()
2198
+ # test_prob2image()
2199
+ # test_line_seg_cross()
2200
+ # test = np.array([[0, 2], [1, 1], [1, 0], [0, 0]])
2201
+ # area = PolyArea(test[:, 0], test[:, 1])
2202
+ # print(area)
2203
+ # test_remap_texture()
2204
+
2205
+ # pt = np.array([0.5, 0.5])
2206
+ # rect = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
2207
+ # print(pt_in_poly(pt, rect))
2208
+ # test_file = "/opt/tiger/mzy-project/temp/BuildingAR/facader/test.png"
2209
+ # test_out = "/opt/tiger/mzy-project/temp/BuildingAR/facader/test2.png"
2210
+ # image = cv2.imread(test_file, cv2.IMREAD_UNCHANGED)
2211
+ # image = fillholes(image)
2212
+ # print(np.unique(image))
2213
+ # cv2.imwrite(test_out, image * 255)
2214
+
2215
+ # test = np.array([[0, 2], [1, 1], [1, 0], [0, 0]])
2216
+ # print(overlap_poly(test, test))
2217
+ # area = PolyArea(test[:, 0], test[s:, 1])
2218
+ # print(area)
2219
+ # import plot_utils as p_uts
2220
+ # image = np.zeros((480, 640, 3))
2221
+ # lines = np.array([[500.5 , 299.6 , 409.375, 235.375],
2222
+ # [504.575, 309.325, 415.625, 244.575]])
2223
+ # pt, _ = line_intersect_pt(lines)
2224
+ # print(pt)
2225
+ # cv2.circle(image, np.int32(pt), 1, (255, 0, 0), 2)
2226
+ # image = p_uts.drawLines(image, lines.reshape([-1, 2, 2]))
2227
+ # cv2.imwrite('test.png', image)
2228
+ paths = "/opt/tiger/spark_deploy/spark-3.0/spark-stable/bin:/opt/mlx_deploy/miniconda3/envs/mlx/bin:/opt/tiger/mlx_deploy:/opt/tiger/tce/tce_tools/bin:/home/tiger/.local/bin:/opt/common_tools:/usr/local/go/bin:/opt/tiger/mlx_deploy/vscode/code-server-4.7.1-linux-amd64/lib/vscode/bin/remote-cli:/opt/tiger/spark_deploy/spark-3.0/spark-stable/bin:/opt/mlx_deploy/miniconda3/envs/mlx/bin:/opt/tiger/mlx_deploy:/opt/tiger/spark_deploy/spark-3.0/spark-stable/bin:/opt/mlx_deploy/miniconda3/envs/mlx/bin:/opt/tiger/mlx_deploy:/opt/tiger/spark_deploy/spark-3.0/spark-stable/bin:/opt/mlx_deploy/miniconda3/envs/mlx/bin:/opt/tiger/mlx_deploy:/workspace:/opt/tiger/consul_deploy/bin/go:/root/miniconda3/bin:/root/miniconda3/condabin:/usr/local/cuda/bin:/workspace:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/tiger/ss_bin:/usr/local/jdk/bin:/usr/sbin:/opt/tiger/ss_lib/bin:/opt/tiger/ss_lib/python_package/lib/python2.7/site-packages/django/bin:/opt/tiger/yarn_deploy/hadoop/bin:/opt/tiger/yarn_deploy/hive/bin:/opt/tiger/yarn_deploy/jdk/bin:/opt/tiger/hadoop_deploy/jython-2.5.2/bin:/usr/local/bvc/bin:/opt/tiger/arnold/bin:/workspace/bernard/bin:/workspace://bin:/opt/tiger/ss_bin:/opt/tiger/ss_lib/bin:/opt/common_tools:/opt/tiger/yarn_deploy/hadoop/bin:/opt/tiger/yarn_deploy/hive/bin:/workspace:/workspace://bin:/opt/tiger/ss_bin:/opt/tiger/ss_lib/bin:/opt/common_tools:/opt/tiger/yarn_deploy/hadoop/bin:/opt/tiger/yarn_deploy/hive/bin:/workspace://bin:/opt/tiger/ss_bin:/opt/tiger/ss_lib/bin:/opt/common_tools:/opt/tiger/yarn_deploy/hadoop/bin:/opt/tiger/yarn_deploy/hive/bin:/opt/tiger/nastk/bin:/workspace://bin:/opt/tiger/ss_bin:/opt/tiger/ss_lib/bin:/opt/common_tools:/opt/tiger/yarn_deploy/hadoop/bin:/opt/tiger/yarn_deploy/hive/bin"
2229
+ paths = paths.split(":")
2230
+ check_file_in_paths(paths, "docker")
2231
+