Spaces:
Runtime error
Runtime error
update app.py
Browse files- app.py +0 -115
- multimodal/setup.py +1 -0
app.py
CHANGED
@@ -70,121 +70,6 @@ def get_outputs(
|
|
70 |
return outputs
|
71 |
|
72 |
|
73 |
-
def evaluate_refcoco(
|
74 |
-
model,
|
75 |
-
tokenizer,
|
76 |
-
image_processor,
|
77 |
-
batch_size,
|
78 |
-
tsvfile,
|
79 |
-
max_generation_length=20,
|
80 |
-
num_beams=3,
|
81 |
-
length_penalty=-2.0,
|
82 |
-
device=-1,
|
83 |
-
vis_embed_size=None,
|
84 |
-
rank=0,
|
85 |
-
world_size=1,
|
86 |
-
id=0,
|
87 |
-
):
|
88 |
-
model.eval().cuda()
|
89 |
-
loc_token_ids = []
|
90 |
-
for i in range(1000):
|
91 |
-
loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
|
92 |
-
media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
|
93 |
-
endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
|
94 |
-
pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
|
95 |
-
bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
|
96 |
-
prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
|
97 |
-
# all_ids = set(range(model.lang_encoder.lm_head.out_features))
|
98 |
-
# bad_words_ids = list(all_ids - set(loc_token_ids))
|
99 |
-
# bad_words_ids = [[b] for b in bad_words_ids]
|
100 |
-
# min_loc_token_id = min(loc_token_ids)
|
101 |
-
# max_loc_token_id = max(loc_token_ids)
|
102 |
-
total = 0
|
103 |
-
correct = 0
|
104 |
-
ious = []
|
105 |
-
if "refcocog" in tsvfile:
|
106 |
-
dataset_name = "refcocog"
|
107 |
-
elif "refcocoplus" in tsvfile:
|
108 |
-
dataset_name = "refcocoplus"
|
109 |
-
else:
|
110 |
-
dataset_name = "refcoco"
|
111 |
-
with open(tsvfile, "r") as f:
|
112 |
-
lines = f.readlines()
|
113 |
-
pbar = tqdm(lines, disable=(rank != 0))
|
114 |
-
for ii, line in enumerate(pbar):
|
115 |
-
if ii % world_size != rank:
|
116 |
-
continue
|
117 |
-
total += 1
|
118 |
-
line = line.rstrip()
|
119 |
-
uniq_id, image_id, text, region_coord, image = line.split("\t")
|
120 |
-
|
121 |
-
image = Image.open(BytesIO(base64.urlsafe_b64decode(image))).convert("RGB")
|
122 |
-
# image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/yolo.png").convert("RGB")
|
123 |
-
# image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/cat.png").convert("RGB")
|
124 |
-
# image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/262148000.png")
|
125 |
-
|
126 |
-
gt_box = np.array(list(map(float, region_coord.split(","))))
|
127 |
-
width = image.width
|
128 |
-
height = image.height
|
129 |
-
image = image.resize((224, 224))
|
130 |
-
gt_box = gt_box / np.array([width, height, width, height]) * 224
|
131 |
-
batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
|
132 |
-
prompt = [
|
133 |
-
f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|><|#object#|>{text.rstrip('.').strip()}<|#endofobject#|><|#visual#|>"]
|
134 |
-
# prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>the cat<|#visual#|>"]
|
135 |
-
# prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"]
|
136 |
-
# prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>a man<|#visual#|> is doing a trick on a skateboard<|#visual#|>"]
|
137 |
-
|
138 |
-
encodings = tokenizer(
|
139 |
-
prompt,
|
140 |
-
padding="longest",
|
141 |
-
truncation=True,
|
142 |
-
return_tensors="pt",
|
143 |
-
max_length=2000,
|
144 |
-
)
|
145 |
-
input_ids = encodings["input_ids"]
|
146 |
-
attention_mask = encodings["attention_mask"]
|
147 |
-
# attention_mask[input_ids == prebox_token_id] = 0
|
148 |
-
image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
|
149 |
-
image_start_index_list = [[x] for x in image_start_index_list]
|
150 |
-
image_nums = [1] * len(input_ids)
|
151 |
-
vision_x = batch_images.cuda()
|
152 |
-
lang_x = input_ids.cuda()
|
153 |
-
attention_mask = attention_mask.cuda()
|
154 |
-
|
155 |
-
model.debug_id = 0
|
156 |
-
with torch.inference_mode() and torch.cuda.amp.autocast(dtype=torch.float16):
|
157 |
-
outputs = model(
|
158 |
-
vision_x=vision_x,
|
159 |
-
lang_x=lang_x,
|
160 |
-
attention_mask=attention_mask,
|
161 |
-
labels=None,
|
162 |
-
image_nums=image_nums,
|
163 |
-
image_start_index_list=image_start_index_list,
|
164 |
-
added_bbox_list=None,
|
165 |
-
add_box=False,
|
166 |
-
)
|
167 |
-
boxes = outputs["boxes"]
|
168 |
-
scores = outputs["scores"]
|
169 |
-
if len(scores) > 0:
|
170 |
-
box = boxes[scores.argmax()]
|
171 |
-
iou = get_iou(box, gt_box)
|
172 |
-
else:
|
173 |
-
iou = 0.0
|
174 |
-
# tqdm.write(f"output: {tokenizer.batch_decode(outputs)}")
|
175 |
-
tqdm.write(f"no output for: {uniq_id}, {image_id}, {text}")
|
176 |
-
if iou >= 0.5:
|
177 |
-
correct += 1
|
178 |
-
pbar.set_description(f"iou: {iou:.2f} score: {correct / total:.4f}")
|
179 |
-
# open_cv_image = np.array(image)
|
180 |
-
# # Convert RGB to BGR
|
181 |
-
# open_cv_image = open_cv_image[:, :, ::-1].copy()
|
182 |
-
# for box, score in zip(boxes, scores):
|
183 |
-
# open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
|
184 |
-
# cv2.imwrite("output.jpg", open_cv_image)
|
185 |
-
# print(boxes)
|
186 |
-
# print(scores)
|
187 |
-
# exit()
|
188 |
|
189 |
|
190 |
def generate(
|
|
|
70 |
return outputs
|
71 |
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
74 |
|
75 |
def generate(
|
multimodal/setup.py
CHANGED
@@ -33,6 +33,7 @@ if __name__ == "__main__":
|
|
33 |
"inflection",
|
34 |
"sentencepiece",
|
35 |
"open_clip_torch",
|
|
|
36 |
]
|
37 |
|
38 |
setup(
|
|
|
33 |
"inflection",
|
34 |
"sentencepiece",
|
35 |
"open_clip_torch",
|
36 |
+
"opencv-python"
|
37 |
]
|
38 |
|
39 |
setup(
|