chendl committed
Commit
8c12524
1 Parent(s): 0332b28

update app.py

Files changed (2):
  1. app.py +0 -115
  2. multimodal/setup.py +1 -0
app.py CHANGED
@@ -70,121 +70,6 @@ def get_outputs(
     return outputs
 
 
-def evaluate_refcoco(
-    model,
-    tokenizer,
-    image_processor,
-    batch_size,
-    tsvfile,
-    max_generation_length=20,
-    num_beams=3,
-    length_penalty=-2.0,
-    device=-1,
-    vis_embed_size=None,
-    rank=0,
-    world_size=1,
-    id=0,
-):
-    model.eval().cuda()
-    loc_token_ids = []
-    for i in range(1000):
-        loc_token_ids.append(int(tokenizer(f"<loc_{i}>", add_special_tokens=False)["input_ids"][-1]))
-    media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]
-    endofmedia_token_id = tokenizer("<|#endofimage#|>", add_special_tokens=False)["input_ids"][-1]
-    pad_token_id = tokenizer(tokenizer.pad_token, add_special_tokens=False)["input_ids"][-1]
-    bos_token_id = tokenizer(tokenizer.bos_token, add_special_tokens=False)["input_ids"][-1]
-    prebox_token_id = tokenizer("<|#prebox#|>", add_special_tokens=False)["input_ids"][-1]
-    # all_ids = set(range(model.lang_encoder.lm_head.out_features))
-    # bad_words_ids = list(all_ids - set(loc_token_ids))
-    # bad_words_ids = [[b] for b in bad_words_ids]
-    # min_loc_token_id = min(loc_token_ids)
-    # max_loc_token_id = max(loc_token_ids)
-    total = 0
-    correct = 0
-    ious = []
-    if "refcocog" in tsvfile:
-        dataset_name = "refcocog"
-    elif "refcocoplus" in tsvfile:
-        dataset_name = "refcocoplus"
-    else:
-        dataset_name = "refcoco"
-    with open(tsvfile, "r") as f:
-        lines = f.readlines()
-    pbar = tqdm(lines, disable=(rank != 0))
-    for ii, line in enumerate(pbar):
-        if ii % world_size != rank:
-            continue
-        total += 1
-        line = line.rstrip()
-        uniq_id, image_id, text, region_coord, image = line.split("\t")
-
-        image = Image.open(BytesIO(base64.urlsafe_b64decode(image))).convert("RGB")
-        # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/yolo.png").convert("RGB")
-        # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/cat.png").convert("RGB")
-        # image = Image.open("/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal/temp/262148000.png")
-
-        gt_box = np.array(list(map(float, region_coord.split(","))))
-        width = image.width
-        height = image.height
-        image = image.resize((224, 224))
-        gt_box = gt_box / np.array([width, height, width, height]) * 224
-        batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0)
-        prompt = [
-            f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token * vis_embed_size}<|#endofimage#|><|#object#|>{text.rstrip('.').strip()}<|#endofobject#|><|#visual#|>"]
-        # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>the cat<|#visual#|>"]
-        # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>"]
-        # prompt = [f"<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|>a man<|#visual#|> is doing a trick on a skateboard<|#visual#|>"]
-
-        encodings = tokenizer(
-            prompt,
-            padding="longest",
-            truncation=True,
-            return_tensors="pt",
-            max_length=2000,
-        )
-        input_ids = encodings["input_ids"]
-        attention_mask = encodings["attention_mask"]
-        # attention_mask[input_ids == prebox_token_id] = 0
-        image_start_index_list = ((input_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
-        image_start_index_list = [[x] for x in image_start_index_list]
-        image_nums = [1] * len(input_ids)
-        vision_x = batch_images.cuda()
-        lang_x = input_ids.cuda()
-        attention_mask = attention_mask.cuda()
-
-        model.debug_id = 0
-        with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
-            outputs = model(
-                vision_x=vision_x,
-                lang_x=lang_x,
-                attention_mask=attention_mask,
-                labels=None,
-                image_nums=image_nums,
-                image_start_index_list=image_start_index_list,
-                added_bbox_list=None,
-                add_box=False,
-            )
-        boxes = outputs["boxes"]
-        scores = outputs["scores"]
-        if len(scores) > 0:
-            box = boxes[scores.argmax()]
-            iou = get_iou(box, gt_box)
-        else:
-            iou = 0.0
-            # tqdm.write(f"output: {tokenizer.batch_decode(outputs)}")
-            tqdm.write(f"no output for: {uniq_id}, {image_id}, {text}")
-        if iou >= 0.5:
-            correct += 1
-        pbar.set_description(f"iou: {iou:.2f} score: {correct / total:.4f}")
-        # open_cv_image = np.array(image)
-        # # Convert RGB to BGR
-        # open_cv_image = open_cv_image[:, :, ::-1].copy()
-        # for box, score in zip(boxes, scores):
-        #     open_cv_image = cv2.rectangle(open_cv_image, box[:2].astype(int), box[2:].astype(int), (255, 0, 0), 2)
-        # cv2.imwrite("output.jpg", open_cv_image)
-        # print(boxes)
-        # print(scores)
-        # exit()
 
 
 def generate(
 
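The removed loop counts a prediction as correct when the IoU between the top-scoring predicted box and the ground-truth box is at least 0.5. The get_iou helper it calls is defined elsewhere in app.py and is not part of this diff; what follows is a minimal sketch of the standard intersection-over-union computation it presumably performs, assuming boxes in [x1, y1, x2, y2] form:

def get_iou(box_a, box_b):
    # Overlap rectangle; width/height clamp to zero when the boxes are disjoint.
    inter_w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = inter_w * inter_h
    # Union = area(a) + area(b) - intersection.
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0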
multimodal/setup.py CHANGED
@@ -33,6 +33,7 @@ if __name__ == "__main__":
         "inflection",
         "sentencepiece",
         "open_clip_torch",
+        "opencv-python"
     ]
 
     setup(
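The one-line setup.py change adds opencv-python, which matches the commented-out cv2 visualization block in the deleted evaluate_refcoco code. A self-contained sketch of that box-drawing step, assuming a PIL image in RGB and predicted boxes as [x1, y1, x2, y2] values (draw_boxes and its parameters are illustrative, not part of the repository):

import cv2
import numpy as np
from PIL import Image

def draw_boxes(image: Image.Image, boxes, path: str = "output.jpg") -> None:
    # OpenCV expects BGR, so reverse the RGB channel order.
    canvas = np.array(image)[:, :, ::-1].copy()
    for box in boxes:
        x1, y1, x2, y2 = (int(v) for v in box)
        canvas = cv2.rectangle(canvas, (x1, y1), (x2, y2), (255, 0, 0), 2)
    cv2.imwrite(path, canvas)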