File size: 16,746 Bytes
f1bc325
 
 
 
 
 
 
 
 
 
 
94cd336
 
 
 
 
 
 
f1bc325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94cd336
f1bc325
 
 
 
 
 
 
 
 
94cd336
f1bc325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
import gradio as gr
import random
import numpy as np
import os
import requests
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
import cv2
import ast
import torch
from efficientnet_pytorch import EfficientNet
from torchvision import transforms
from PIL import Image
import gradio as gr
from super_gradients.training import models


class Kosmos2:
    def __init__(self):
        """Set up the drawing palette and load the Kosmos-2 model and processor.

        The model is moved to CUDA when it is available and stays on the CPU
        otherwise; the chosen device is stored in ``self.device``.
        """
        # Palette used for the boxes. The tuples are consumed as (B, G, R):
        # they are drawn onto a channel-reversed array and ``color_map`` below
        # emits index 2 first when building the "#rrggbb" string.
        self.colors = [
            (0, 255, 0),
            (0, 0, 255),
            (255, 255, 0),
            (255, 0, 255),
            (0, 255, 255),
            (114, 128, 250),
            (0, 165, 255),
            (0, 128, 0),
            (144, 238, 144),
            (238, 238, 175),
            (255, 191, 0),
            (0, 128, 0),
            (226, 43, 138),
            (255, 0, 255),
            (0, 215, 255),
            (255, 0, 0),
        ]

        # Colour index (as a string) -> "#rrggbb", used to colour text spans.
        self.color_map = {
            f"{color_id}": f"#{red:02x}{green:02x}{blue:02x}"
            for color_id, (blue, green, red) in enumerate(self.colors)
        }

        self.ckpt = "ydshieh/kosmos-2-patch14-224"
        # Fall back to the CPU instead of crashing on hosts without CUDA.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = AutoModelForVision2Seq.from_pretrained(self.ckpt, trust_remote_code=True).to(self.device)
        self.processor = AutoProcessor.from_pretrained(self.ckpt, trust_remote_code=True)

    def is_overlapping(self, rect1, rect2):
        x1, y1, x2, y2 = rect1
        x3, y3, x4, y4 = rect2
        return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4)

    def draw_entity_boxes_on_image(self, image, entities, show=False, save_path=None, entity_index=-1):
        """Draw labelled bounding boxes for ``entities`` on ``image``.

        Args:
            image: A ``PIL.Image.Image``, a path to an image file, or a
                ``torch.Tensor``. A tensor is multiplied by a hard-coded std
                and shifted by a hard-coded mean before being converted to a
                PIL image (presumably undoing the model's input
                normalisation - confirm against the preprocessing in use).
            entities: Sequence of ``(entity_name, (start, end), bboxes)``
                items. Every bbox is ``(x1, y1, x2, y2)``; the values are
                multiplied by the image width/height, so they are expected to
                be normalised. Only the first ``len(self.color_map)``
                entities are considered.
            show: When true, open the result with ``PIL.Image.show``.
            save_path: When set, the result is saved to this path.
            entity_index: When ``>= 0`` only the entity at this position is
                drawn; any negative value draws all of them.

        Returns:
            PIL.Image.Image: An RGB image. The return type is the same when
            ``entities`` is empty (the image is then returned without boxes).

        Raises:
            ValueError: If ``image`` is a path that does not exist or has an
                unsupported type.
        """
        if isinstance(image, Image.Image):
            pil_img = image
        elif isinstance(image, str):
            if not os.path.exists(image):
                raise ValueError(f"invaild image path, {image}")
            pil_img = Image.open(image)
        elif isinstance(image, torch.Tensor):
            image_tensor = image.cpu()
            reverse_norm_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])[:, None, None]
            reverse_norm_std = torch.tensor([0.26862954, 0.26130258, 0.27577711])[:, None, None]
            image_tensor = image_tensor * reverse_norm_std + reverse_norm_mean
            pil_img = T.ToPILImage()(image_tensor)
        else:
            raise ValueError(f"invaild image format, {type(image)} for {image}")

        # Force three channels so the channel swap below is always valid
        # (grayscale, palette and RGBA inputs would otherwise break or be
        # misread).
        pil_img = pil_img.convert("RGB")
        image_w, image_h = pil_img.size
        # Work on a BGR copy: the palette in ``self.colors`` is BGR. The
        # explicit copy also yields a contiguous array for OpenCV.
        new_image = np.array(pil_img)[:, :, [2, 1, 0]].copy()

        # Not to show too many bboxes
        entities = entities[:len(self.color_map)]

        previous_bboxes = []
        # size of text
        text_size = 1
        # thickness of text
        text_line = 1
        box_line = 3
        (c_width, text_height), _ = cv2.getTextSize("F", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
        base_height = int(text_height * 0.675)
        text_offset_original = text_height - base_height
        text_spaces = 3
        alpha = 0.5
        # Offsets that place the label just outside / inside the box border.
        l_o = box_line // 2 + box_line % 2
        r_o = l_o + 1

        # The colour of an entity is fixed by its position in ``entities``,
        # even when only a single entity is drawn.
        for color_id, (entity_name, _span, bboxes) in enumerate(entities):
            if entity_index >= 0 and color_id != entity_index:
                continue
            color = self.colors[color_id]
            label = f"  {entity_name}"

            for x1_norm, y1_norm, x2_norm, y2_norm in bboxes:
                orig_x1, orig_y1 = int(x1_norm * image_w), int(y1_norm * image_h)
                orig_x2, orig_y2 = int(x2_norm * image_w), int(y2_norm * image_h)

                # draw bbox
                new_image = cv2.rectangle(new_image, (orig_x1, orig_y1), (orig_x2, orig_y2), color, box_line)

                # Default label position is above the box; move it inside the
                # box when there is not enough room above.
                x1 = orig_x1 - l_o
                y1 = orig_y1 - l_o
                if y1 < text_height + text_offset_original + 2 * text_spaces:
                    y1 = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces
                    x1 = orig_x1 + r_o

                # add text background
                (text_width, text_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
                line_height = text_height + text_offset_original + 2 * text_spaces
                text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2 = x1, y1 - line_height, x1 + text_width, y1

                # Push the label down one line at a time until it no longer
                # collides with an earlier label, stopping at the bottom edge.
                for prev_bbox in previous_bboxes:
                    while self.is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox):
                        text_bg_y1 += line_height
                        text_bg_y2 += line_height
                        y1 += line_height

                        if text_bg_y2 >= image_h:
                            text_bg_y1 = max(0, image_h - line_height)
                            text_bg_y2 = image_h
                            y1 = image_h
                            break

                # Blend the label background in one vectorised step. The
                # region is clamped to the image so that negative coordinates
                # cannot wrap around to the opposite edge.
                row_start, row_stop = max(text_bg_y1, 0), min(text_bg_y2, image_h)
                col_start, col_stop = max(text_bg_x1, 0), min(text_bg_x2, image_w)
                if row_start < row_stop and col_start < col_stop:
                    columns = np.arange(col_start, col_stop)
                    # The leading strip of the label keeps the box colour,
                    # the remainder is white.
                    is_swatch = columns < text_bg_x1 + 1.35 * c_width
                    background = np.where(is_swatch[:, None], np.array(color), 255)
                    patch = new_image[row_start:row_stop, col_start:col_stop]
                    blended = alpha * patch + (1 - alpha) * background
                    new_image[row_start:row_stop, col_start:col_stop] = blended.astype(np.uint8)

                cv2.putText(
                    new_image, label, (x1, y1 - text_offset_original - 1 * text_spaces), cv2.FONT_HERSHEY_COMPLEX, text_size, (0, 0, 0), text_line, cv2.LINE_AA
                )
                previous_bboxes.append((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2))

        # Back to RGB for PIL.
        pil_image = Image.fromarray(new_image[:, :, [2, 1, 0]])
        if save_path:
            pil_image.save(save_path)
        if show:
            pil_image.show()

        return pil_image

    def generate_predictions(self, image_input, text_input):
        """Run Kosmos-2 grounding on an image and prepare the UI outputs.

        Args:
            image_input: PIL image to describe.
            text_input: ``"Brief"`` or ``"Detailed"`` to use one of the two
                built-in prompts; any other string is used as the prompt
                itself (prefixed with ``<grounding>``).

        Returns:
            tuple: ``(annotated_image, colored_text, entities_repr)`` where
            ``annotated_image`` is the result of
            ``draw_entity_boxes_on_image``, ``colored_text`` is a list of
            ``(text, label_or_None)`` pairs covering the generated text, and
            ``entities_repr`` is ``str()`` of the entities that have a
            non-empty text span.
        """
        import io

        # Round-trip the image through JPEG to match the original Kosmos-2 demo.
        # (https://github.com/microsoft/unilm/blob/f4695ed0244a275201fff00bee495f76670fbe70/kosmos-2/demo/gradio_app.py#L345-L346)
        # An in-memory buffer is used instead of a fixed file under /tmp so
        # that concurrent requests cannot overwrite each other's image.
        # JPEG cannot store alpha/palette modes, hence the RGB conversion.
        jpeg_buffer = io.BytesIO()
        image_input.convert("RGB").save(jpeg_buffer, format="JPEG")
        jpeg_buffer.seek(0)
        # This might give different results from the original argument `image_input`
        image_input = Image.open(jpeg_buffer)

        preset_prompts = {
            "Brief": "<grounding>An image of",
            "Detailed": "<grounding>Describe this image in detail:",
        }
        text_input = preset_prompts.get(text_input, f"<grounding>{text_input}")

        inputs = self.processor(text=text_input, images=image_input, return_tensors="pt")

        # Send the inputs to wherever the model lives instead of assuming CUDA.
        device = self.model.device
        generated_ids = self.model.generate(
            pixel_values=inputs["pixel_values"].to(device),
            input_ids=inputs["input_ids"][:, :-1].to(device),
            attention_mask=inputs["attention_mask"][:, :-1].to(device),
            img_features=None,
            img_attn_mask=inputs["img_attn_mask"][:, :-1].to(device),
            use_cache=True,
            max_new_tokens=128,
        )
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # By default, the generated text is cleaned up and the entities are extracted.
        processed_text, entities = self.processor.post_process_generation(generated_text)

        annotated_image = self.draw_entity_boxes_on_image(image_input, entities, show=False)

        # Build the highlighted-text segments. Entities without an associated
        # phrase (start == end) are skipped, and the label of a kept entity is
        # its position among the kept ones.
        # NOTE(review): the boxes above are drawn from the unfiltered list, so
        # box and text colours can differ once an entity has been skipped.
        filtered_entities = []
        colored_text = []
        cursor = 0
        for entity in entities:
            _name, (start, end), _bboxes = entity
            if start == end:
                continue
            color_id = len(filtered_entities)
            filtered_entities.append(entity)
            if start > cursor:
                colored_text.append((processed_text[cursor:start], None))
            colored_text.append((processed_text[start:end], f"{color_id}"))
            cursor = end

        if cursor < len(processed_text):
            colored_text.append((processed_text[cursor:], None))

        return annotated_image, colored_text, str(filtered_entities)

class VehiclePredictor:
    def __init__(self, model_path):
        """Load the YOLO-NAS detector and the classifier stored at ``model_path``.

        Args:
            model_path: Path to a file readable by ``torch.load`` that holds
                the classifier module.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.yolo_nas_l = models.get("yolo_nas_l", pretrained_weights="coco")
        # NOTE(security): torch.load unpickles the file, so only load
        # checkpoints from a trusted source.
        # map_location lets a checkpoint saved on a GPU be opened on a
        # CPU-only host, matching the device fallback above.
        self.classifier_model = torch.load(model_path, map_location=self.device)
        self.classifier_model = self.classifier_model.to(self.device)
        self.classifier_model.eval()  # Set the model to evaluation mode

    def bounding_boxes_overlap(self, box1, box2):
        """Check if two bounding boxes overlap or touch."""
        x1, y1, x2, y2 = box1
        x3, y3, x4, y4 = box2
        return not (x3 > x2 or x4 < x1 or y3 > y2 or y4 < y1)
    
    def merge_boxes(self, box1, box2):
        """Return the encompassing bounding box of two boxes."""
        x1, y1, x2, y2 = box1
        x3, y3, x4, y4 = box2
        x = min(x1, x3)
        y = min(y1, y3)
        w = max(x2, x4)
        h = max(y2, y4)
        return (x, y, w, h)
    
    def save_merged_boxes(self, predictions, image_np):
        """Save merged bounding boxes as separate images."""
        processed_boxes = set()
        roi = None  # Initialize roi to None

        for image_prediction in predictions:
            bboxes = image_prediction.prediction.bboxes_xyxy  
            for box1 in bboxes:
                for box2 in bboxes:
                    if np.array_equal(box1, box2):
                        continue
                    if self.bounding_boxes_overlap(box1, box2) and tuple(box1) not in processed_boxes and tuple(box2) not in processed_boxes:
                        merged_box = self.merge_boxes(box1, box2)
                        roi = image_np[int(merged_box[1]):int(merged_box[3]), int(merged_box[0]):int(merged_box[2])]
                        processed_boxes.add(tuple(box1))
                        processed_boxes.add(tuple(box2))
                        break  # Exit the inner loop once a match is found
                if roi is not None:
                    break  # Exit the outer loop once a match is found
        return roi

    # Perform inference on an image
    def predict_image(self, image, model=None):
        """Classify an image as 'Accident' or 'No accident'.

        YOLO-NAS is run first; when two detections overlap, the classifier
        sees only the crop covering both, otherwise the whole image.

        Args:
            image: Input image, a PIL image or anything ``np.array`` turns
                into an RGB array.
            model: Classifier to use. Defaults to ``self.classifier_model``
                when omitted or ``None``.

        Returns:
            tuple: ``(roi_image, prediction_text)`` - the PIL image fed to
            the classifier and ``'Accident'`` when the top-scoring output
            index is 0, otherwise ``'No accident'``.

        Raises:
            ValueError: If ``image`` is ``None``.
        """
        if image is None:
            raise ValueError("predict_image requires an image, got None")
        classifier = self.classifier_model if model is None else model

        # Normalise PIL inputs to three channels so the colour conversion
        # below always receives an RGB array.
        if isinstance(image, Image.Image):
            image = image.convert("RGB")

        # First, get the ROI using YOLO-NAS
        image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        predictions = self.yolo_nas_l.predict(image_np, iou=0.3, conf=0.35)
        roi_new = self.save_merged_boxes(predictions, image_np)

        if roi_new is None:
            roi_new = image_np  # Use the original image if no ROI is found

        # Convert ROI back to PIL Image for the classifier
        roi_image = Image.fromarray(cv2.cvtColor(roi_new, cv2.COLOR_BGR2RGB))

        # Resize, centre-crop and normalise with fixed per-channel mean/std.
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

        # Add the batch dimension and move the tensor to the model's device.
        roi_image_tensor = transform(roi_image).unsqueeze(0).to(self.device)

        with torch.no_grad():
            outputs = classifier(roi_image_tensor)
            predicted = outputs.argmax(dim=1)
            prediction_text = 'Accident' if predicted.item() == 0 else 'No accident'

        return roi_image, prediction_text  # Return both the roi_image and the prediction text


def main():
    """Build the Gradio demo and launch it with a public share link.

    Loads Kosmos-2 and the vehicle predictor once, wires a single "Run"
    action that feeds the uploaded image to both, and lets the user click a
    highlighted text span to redraw only that entity's boxes.
    """
    kosmos2 = Kosmos2()
    vehicle_predictor = VehiclePredictor('vehicle.pt')

    with gr.Blocks(title="Advanced Vehicle Contextualization & Collision Prediction", theme=gr.themes.Base()).queue() as demo:
        gr.Markdown(("""
            # Models used - 
            Kosmos-2: Grounding Multimodal Large Language Models to the World
            [[Paper]](https://arxiv.org/abs/2306.14824) [[Code]](https://github.com/microsoft/unilm/blob/master/kosmos-2)
            YOLO-NAS [[Code]](https://github.com/Deci-AI/super-gradients/blob/master/YOLONAS.md)
            EfficientNet-b0
            """))
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Test Image")
                text_input = gr.Radio(["Brief", "Detailed"], label="Description Type", value="Brief")
                run_button = gr.Button(label="Run", visible=True)

            with gr.Column():
                image_output_kosmos = gr.Image(type="pil", label="Kosmos-2 Output Image")
                text_output_kosmos = gr.HighlightedText(
                                    label="Generated Description by Kosmos-2",
                                    combine_adjacent=False,
                                    show_legend=True,
                                ).style(color_map=kosmos2.color_map)
                
                image_output_vehicle = gr.Image(type="pil", label="Collision Predictor Output Image", size=(112, 112))
                text_output_vehicle = gr.Textbox(label="Collision Predictor Result")

        # record which text span (label) is selected
        selected = gr.Number(-1, show_label=False, placeholder="Selected", visible=False)

        # record the current `entities` (stored as the str() of the list)
        entity_output = gr.Textbox(visible=False)

        # get the current selected span label
        def get_text_span_label(evt: gr.SelectData):
            """Return the label of the clicked span as an int, or -1 for unlabelled text."""
            if evt.value[-1] is None:
                return -1
            return int(evt.value[-1])
        # and set this information to `selected`
        text_output_kosmos.select(get_text_span_label, None, selected)
        
        # update output image when we change the span (entity) selection
        def update_output_image(img_input, image_output, entities, idx):
            """Redraw the input image with only the selected entity (all when idx < 0)."""
            # Nothing to redraw when the input image was cleared or no
            # entities have been stored yet; keep the current output.
            if img_input is None or not entities:
                return image_output
            entities = ast.literal_eval(entities)
            updated_image = kosmos2.draw_entity_boxes_on_image(img_input, entities, entity_index=idx)
            return updated_image
        selected.change(update_output_image, [image_input, image_output_kosmos, entity_output, selected], [image_output_kosmos])

        def combined_predictions(img, description_type):
            """Run both models on ``img`` and return the five UI outputs."""
            if img is None:
                # Surface a readable message instead of an AttributeError.
                raise gr.Error("Please provide an image before running.")

            # Kosmos2 predictions
            kosmos_image, kosmos_text, entities = kosmos2.generate_predictions(img, description_type)
            
            # VehiclePredictor predictions
            vehicle_image, vehicle_text = vehicle_predictor.predict_image(img, vehicle_predictor.classifier_model)
            
            return kosmos_image, kosmos_text, entities, vehicle_image, vehicle_text

        run_button.click(fn=combined_predictions,
                         inputs=[image_input, text_input],
                         outputs=[image_output_kosmos, text_output_kosmos, entity_output, image_output_vehicle, text_output_vehicle],
                         show_progress=True, queue=True)

    demo.launch(share=True)

# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()