import json
import os
from pathlib import Path
from threading import Lock
from uuid import uuid4

import gradio as gr
import jsonlines
from huggingface_hub import CommitScheduler, snapshot_download

# Pull the dataset files (source captions and any prior annotations) from the Hub.
snapshot_download(repo_id="TideDra/HDBench", local_dir="./", repo_type="dataset", allow_patterns=["data_dir/*"])

JSON_DATASET_DIR = Path("raw_annotations")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)

# Each app instance writes to its own uniquely named jsonlines shard.
JSON_DATASET_PATH = JSON_DATASET_DIR / f"{uuid4()}.json"

# Periodically commit the local annotation shards back to the dataset repo.
# Note: the bare repo_id resolves under the namespace of the HF_TOKEN owner.
scheduler = CommitScheduler(
    repo_id="HDBench",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data_dir/raw_annotations",
    token=os.environ["HF_TOKEN"],
)

with open("./data_dir/qwenvl_test2017.json") as f:
    dataset = json.load(f)

# Index by image path so already-annotated samples can be dropped below.
dataset = {d["image"]: d for d in dataset}

# Skip samples that already appear in previously committed annotation shards.
if os.path.exists("./data_dir/raw_annotations"):
    for anno in os.listdir("./data_dir/raw_annotations"):
        anno = os.path.join("./data_dir/raw_annotations", anno)
        with jsonlines.open(anno) as reader:
            for obj in reader:
                dataset.pop(obj["image"], None)

dataset = list(dataset.values())

# Serializes sample dispatch across concurrent annotator sessions.
dispatcher_lock = Lock()

def submit(name: str, answer: str, img: str) -> tuple:
    global JSON_DATASET_PATH
    with scheduler.lock:
        with JSON_DATASET_PATH.open("a") as f:
            json.dump({"annotator": name, "image": img, "caption": answer}, f)
            f.write("\n")
        # Rotate to a fresh shard once the current one exceeds 4 MiB, keeping
        # each committed file comfortably under the 5 MB target size.
        if JSON_DATASET_PATH.stat().st_size > 1024 * 1024 * 4:
            JSON_DATASET_PATH = JSON_DATASET_DIR / f"{uuid4()}.json"

    # Clear the image and caption widgets and refresh the remaining-sample count.
    return None, None, f"Number of samples to be annotated: {len(dataset)}"

def disable_and_enable_buttons():
    # Disable the button that was just clicked and enable the other one.
    return gr.update(interactive=False), gr.update(interactive=True)

def get_new_data():
    # Pop the next unannotated sample; return empty fields when none are left.
    with dispatcher_lock:
        if len(dataset) == 0:
            data = (None, None, None)
        else:
            data = dataset.pop()
            data = (data["image"], data["caption"], data["image"])
    return data

instruction = """
# Task Description:

You are given an image and a caption describing it; the caption may contain errors. Mark each erroneous span with \<f> and \</f> tags and, immediately after it, give your correction wrapped in \<t> and \</t> tags. If the caption is entirely correct, no changes are needed. Fill in a nickname under Your Name so we can track and review your contributions. Click the Next button to fetch the next sample; after editing the caption, click the Submit button to save your annotation. The annotation results can be viewed in [this repository](https://huggingface.co/datasets/TideDra/HDBench/tree/main/data_dir/raw_annotations) (updated every ten minutes). Please make sure your annotations are successfully stored in the repository, since your contribution will ultimately be counted from the annotations it contains.

Annotations must satisfy the following requirements, listed from highest to lowest priority. When requirements conflict, satisfy the higher-priority one:
1. After the erroneous span is replaced by the correction, the whole sentence must read fluently with no grammatical errors. Bad: "It is a cat eating a \<f>mouse\</f>\<t>rice\</t>". Good: "It is a cat eating \<f>a mouse\</f>\<t>rice\</t>". Explanation: "rice" is uncountable, so the "a" must be replaced as well.
2. Keep edits minimal and preserve the original sentence structure as far as possible. Bad: "There are \<f>two people in the image\</f>\<t>three people in the image\</t>". Good: "There are \<f>two\</f>\<t>three\</t> people in the image". Explanation: changing only "two" to "three" fixes the error with the smallest possible edit.
3. Do not leave redundant whitespace inside the tags. Bad: "There are\<f> two \</f>\<t> three \</t> people in the image". Good: "There are \<f>two\</f>\<t>three\</t> people in the image".
4. When the caption mentions something that does not exist in the image or is irrelevant to it, delete that part outright: an empty \<t>\</t> denotes deletion. Example: "In the image, there is a dog. \<f>There are also some cats.\</f>\<t>\</t>"
"""
example1 = ["./assets/example.png","The image features a misty canal with two wooden benches placed alongside it. One of the benches is positioned closer to the water, while the other is a bit further back. The foggy atmosphere creates a sense of serenity and calmness, as if the benches are the only beings in the scene.\n\nIn the distance, there <f>are two cars</f><t>is one car</t> parked near a bridge, adding to the serene ambiance. <f>A person can be seen in the far end of the scene, likely enjoying the peaceful environment.</f><t></t>  The bench placement and the misty canal make this scene an ideal spot for relaxation or reflection."]
example2 = ["./assets/example2.png","The image features a smiling stuffed lion sitting on a wooden picnic table. The picnic table is located in a park-like setting, with green grass surrounding the bench on which the stuffed lion is placed. \nThere <f>are two</f><t>is one</t> bench visible in the scene, with the main bench featuring the stuffed lion on it. <f>The other bench is situated a little to the right and is empty.</f><t></t>The arrangement creates a playful atmosphere, as if the lion is waiting for someone or enjoying the company of the empty bench."]
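
# Illustrative helper (an assumption, not wired into the UI): tally how many
# annotations each contributor has submitted, given the jsonlines record
# format written by submit() above.
def count_contributions(shard_dir: str = "data_dir/raw_annotations") -> dict[str, int]:
    from collections import Counter
    counts = Counter()
    for shard in Path(shard_dir).glob("*"):
        with jsonlines.open(shard) as reader:
            for obj in reader:
                counts[obj["annotator"]] += 1
    return dict(counts)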
with gr.Blocks() as demo:
    gr.Markdown(instruction)
    with gr.Row():
        image = gr.Image()
        with gr.Column():
            # Hidden state holding the path of the sample currently on screen.
            image_path = gr.State()
            rest_num = gr.Markdown(label="rest_num", value=f"Number of samples to be annotated: {len(dataset)}")
            name = gr.Textbox(label="Your Name", placeholder="Set your name here to mark your annotations")
            text = gr.Textbox(label="Caption", lines=20)
            with gr.Row():
                next_button = gr.Button("Next")
                submit_button = gr.Button("Submit", interactive=False)
    gr.Markdown("# Examples:")
    gr.Examples(
        examples=[example1, example2],
        inputs=[image, text],
    )
    # Toggle the two buttons after each successful action so every sample is
    # fetched exactly once and then submitted exactly once.
    submit_button.click(fn=submit, inputs=[name, text, image_path], outputs=[image, text, rest_num]).success(
        fn=disable_and_enable_buttons,
        outputs=[submit_button, next_button],
    )
    next_button.click(fn=get_new_data, outputs=[image_path, text, image]).success(
        fn=disable_and_enable_buttons,
        outputs=[next_button, submit_button],
    )


demo.launch(share=True)