Update app_merged.py
app_merged.py CHANGED (+28 -113)
@@ -841,8 +841,6 @@ def use_orientation(selected_image:gr.SelectData):
 def process_image(input_image, input_text):
     """Main processing function for the Gradio interface"""
 
-
-
     if isinstance(input_image, Image.Image):
         input_image = np.array(input_image)
 
@@ -857,7 +855,6 @@ def process_image(input_image, input_text):
     HEIGHT = 768
     WIDTH = 768
 
-
     # Initialize DDS client
     config = Config(API_TOKEN)
     client = Client(config)
@@ -867,8 +864,6 @@ def process_image(input_image, input_text):
     class_name_to_id = {name: id for id, name in enumerate(classes)}
    class_id_to_name = {id: name for name, id in class_name_to_id.items()}
 
-
-
     # Save input image to temp file and get URL
     with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmpfile:
         cv2.imwrite(tmpfile.name, input_image)
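One note on the unchanged context above: `cv2.imwrite` interprets arrays as BGR, while images arriving from a Gradio/PIL pipeline are RGB, so the saved temp file will have swapped channels unless it is converted first. A minimal sketch of the safer write; the `cvtColor` call is a suggested fix, not part of this commit:

```python
import tempfile

import cv2
import numpy as np

def save_rgb_to_tempfile(input_image: np.ndarray) -> str:
    """Write an RGB array to a temporary JPEG; OpenCV expects BGR order."""
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmpfile:
        cv2.imwrite(tmpfile.name, cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR))
        return tmpfile.name
```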
@@ -884,11 +879,11 @@ def process_image(input_image, input_text):
 
     if len(input_text) == 0:
         task = DinoxTask(
-
-
-
+            image_url=image_url,
+            prompts=[TextPrompt(text="<prompt_free>")],
+            # targets=[DetectionTarget.BBox, DetectionTarget.Mask]
         )
-
+
         client.run_task(task)
         predictions = task.result.objects
         classes = [pred.category for pred in predictions]
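The replacement arguments put the empty-prompt branch on DINO-X's "prompt-free" mode, where the special `<prompt_free>` token asks the model to detect everything it can rather than a fixed class list. A sketch of how the branch now reads, reusing the `dds-cloudapi-sdk` names (`Config`, `Client`, `DinoxTask`, `TextPrompt`) already imported by app_merged.py; `image_url` is assumed to come from the temp-file upload above:

```python
# Config, Client, DinoxTask and TextPrompt come from dds-cloudapi-sdk,
# imported elsewhere in app_merged.py; API_TOKEN is defined there too.
config = Config(API_TOKEN)
client = Client(config)

task = DinoxTask(
    image_url=image_url,                         # assumed: URL of the uploaded temp image
    prompts=[TextPrompt(text="<prompt_free>")],  # open-vocabulary: detect everything
)
client.run_task(task)

predictions = task.result.objects                # each object carries category/bbox/score
classes = [pred.category for pred in predictions]
```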
@@ -931,38 +926,24 @@ def process_image(input_image, input_text):
         if len(detections) > 0:
             # Get first mask
             first_mask = detections.mask[0]
-
+
             # Get original RGB image
             img = input_image.copy()
-
             H, W, C = img.shape
-
-            # Create RGBA image
+
+            # Create RGBA image with default 255 alpha
             alpha = np.zeros((H, W, 1), dtype=np.uint8)
-
-            alpha[first_mask] = 255
-
-
-
-            # Crop to mask bounds to minimize image size
-            # y_indices, x_indices = np.where(first_mask)
-            # y_min, y_max = y_indices.min(), y_indices.max()
-            # x_min, x_max = x_indices.min(), x_indices.max()
-
-            # Crop the RGBA image
-            # cropped_rgba = rgba[y_min:y_max+1, x_min:x_max+1]
-
-            # Set extracted foreground for mask mover
-            # mask_mover.set_extracted_fg(cropped_rgba)
+            alpha[~first_mask] = 0    # 128 for semi-transparent background
+            alpha[first_mask] = 255   # Make the foreground opaque
+            alpha = alpha.squeeze(-1) # Remove singleton dimension to become 2D
+            rgba = np.dstack((img, alpha)).astype(np.uint8)
 
-            # alpha = img[..., 3] > 0
-            H, W = alpha.shape
             # get the bounding box of alpha
             y, x = np.where(alpha > 0)
             y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)
             x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)
-
-            image_center =
+
+            image_center = rgba[y0:y1, x0:x1]
             # resize the longer side to H * 0.9
             H, W, _ = image_center.shape
             if H > W:
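The rewritten block is a standard cutout-and-crop: derive a single-channel alpha from the boolean mask, stack it onto the RGB frame, and crop to the alpha's bounding box with a one-pixel margin. The same technique as a self-contained sketch (function name and signature are illustrative, not from the app):

```python
import numpy as np

def cutout_to_rgba(img: np.ndarray, mask: np.ndarray) -> np.ndarray:
    """(H, W, 3) uint8 RGB + (H, W) bool mask -> tightly cropped RGBA cutout."""
    H, W = mask.shape
    alpha = np.where(mask, 255, 0).astype(np.uint8)    # opaque foreground, clear background
    rgba = np.dstack((img, alpha))                     # (H, W, 4)
    y, x = np.where(alpha > 0)                         # assumes a non-empty mask
    y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)  # 1px margin, clamped to bounds
    x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)
    return rgba[y0:y1, x0:x1]
```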
@@ -972,7 +953,7 @@ def process_image(input_image, input_text):
                 H = int(H * (WIDTH * 0.9) / W)
                 W = int(WIDTH * 0.9)
 
-            image_center = np.array(Image.fromarray(image_center).resize((W, H)))
+            image_center = np.array(Image.fromarray(image_center).resize((W, H), Image.LANCZOS))
             # pad to H, W
             start_h = (HEIGHT - H) // 2
             start_w = (WIDTH - W) // 2
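The only change here pins the resampling filter to `Image.LANCZOS` instead of PIL's default. For context, the surrounding scale-and-pad step fits the cutout onto the 768x768 canvas; a sketch under the assumption that the `H > W` branch mirrors the `else` branch shown in the diff:

```python
import numpy as np
from PIL import Image

def fit_on_canvas(image_center: np.ndarray, HEIGHT: int = 768, WIDTH: int = 768) -> np.ndarray:
    """Resize so the longer side fills 90% of the canvas, then center-pad."""
    H, W = image_center.shape[:2]
    if H > W:
        W = int(W * (HEIGHT * 0.9) / H)  # assumed symmetric to the else branch
        H = int(HEIGHT * 0.9)
    else:
        H = int(H * (WIDTH * 0.9) / W)
        W = int(WIDTH * 0.9)
    resized = np.array(Image.fromarray(image_center).resize((W, H), Image.LANCZOS))
    canvas = np.zeros((HEIGHT, WIDTH, image_center.shape[2]), dtype=np.uint8)
    start_h, start_w = (HEIGHT - H) // 2, (WIDTH - W) // 2
    canvas[start_h:start_h + H, start_w:start_w + W] = resized
    return canvas
```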
@@ -982,10 +963,9 @@ def process_image(input_image, input_text):
             image = image[:, :, :3] * image[:, :, 3:4] + (1 - image[:, :, 3:4]) * 0.5
             image = (image * 255).clip(0, 255).astype(np.uint8)
             image = Image.fromarray(image)
-
-            return annotated_frame, image, gr.update(visible=False), gr.update(visible=False)
 
-
+            return annotated_frame, image, gr.update(visible=False), gr.update(visible=False)
+        return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)
     else:
         # Run DINO-X detection
         task = DinoxTask(
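The unchanged compositing line above blends the padded RGBA over 50% gray: with alpha in [0, 1], `out = rgb * alpha + 0.5 * (1 - alpha)`. The same step in isolation, a sketch assuming the array was already scaled to [0, 1] as the `* 255` that follows implies:

```python
import numpy as np
from PIL import Image

def composite_over_gray(rgba: np.ndarray) -> Image.Image:
    """Blend a float RGBA array with values in [0, 1] over a mid-gray background."""
    rgb, a = rgba[:, :, :3], rgba[:, :, 3:4]
    out = rgb * a + (1 - a) * 0.5                  # gray (0.5) shows where fully transparent
    return Image.fromarray((out * 255).clip(0, 255).astype(np.uint8))
```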
@@ -998,18 +978,6 @@ def process_image(input_image, input_text):
         result = task.result
         objects = result.objects
 
-
-
-        # for obj in objects:
-        #     input_boxes.append(obj.bbox)
-        #     confidences.append(obj.score)
-        #     cls_name = obj.category.lower().strip()
-        #     class_names.append(cls_name)
-        #     class_ids.append(class_name_to_id[cls_name])
-
-        # input_boxes = np.array(input_boxes)
-        # class_ids = np.array(class_ids)
-
         predictions = task.result.objects
         classes = [x.strip().lower() for x in input_text.split('.') if x]
         class_name_to_id = {name: id for id, name in enumerate(classes)}
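The surviving context also documents the prompt convention: class names are period-separated, then lower-cased and indexed. For example:

```python
input_text = "black cat . red car"
classes = [x.strip().lower() for x in input_text.split('.') if x]
# -> ['black cat', 'red car']   (the `if x` drops empties from a trailing '.')
class_name_to_id = {name: id for id, name in enumerate(classes)}
# -> {'black cat': 0, 'red car': 1}
```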
@@ -1037,46 +1005,12 @@ def process_image(input_image, input_text):
             for class_name, confidence
             in zip(class_names, confidences)
         ]
-
-        # Initialize SAM2
-        # torch.autocast(device_type=DEVICE, dtype=torch.bfloat16).__enter__()
-        # if torch.cuda.get_device_properties(0).major >= 8:
-        #     torch.backends.cuda.matmul.allow_tf32 = True
-        #     torch.backends.cudnn.allow_tf32 = True
-
-        # sam2_model = build_sam2(SAM2_MODEL_CONFIG, SAM2_CHECKPOINT, device=DEVICE)
-        # sam2_predictor = SAM2ImagePredictor(sam2_model)
-        # sam2_predictor.set_image(input_image)
-
-        # sam2_predictor = run_sam_inference(SAM_IMAGE_MODEL, input_image, detections)
-
-
-        # Get masks from SAM2
-        # masks, scores, logits = sam2_predictor.predict(
-        #     point_coords=None,
-        #     point_labels=None,
-        #     box=input_boxes,
-        #     multimask_output=False,
-        # )
-
-        if masks.ndim == 4:
-            masks = masks.squeeze(1)
-
-        # Create visualization
-        # labels = [f"{class_name} {confidence:.2f}"
-        #          for class_name, confidence in zip(class_names, confidences)]
-
-        # detections = sv.Detections(
-        #     xyxy=input_boxes,
-        #     mask=masks.astype(bool),
-        #     class_id=class_ids
-        # )
 
         detections = sv.Detections(
-
-
-
-
+            xyxy=boxes,
+            mask=masks.astype(bool),
+            class_id=class_ids,
+        )
 
         box_annotator = sv.BoxAnnotator()
         label_annotator = sv.LabelAnnotator()
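With the commented-out SAM2 scaffolding deleted, `sv.Detections` is now built directly from the detection outputs and annotated with the `supervision` library. A minimal sketch of that pattern; the contents of `boxes`, `masks`, and `class_ids` here are illustrative placeholders:

```python
import numpy as np
import supervision as sv

# Illustrative stand-ins for the DINO-X outputs
boxes = np.array([[10, 10, 100, 100]], dtype=float)   # (N, 4) xyxy boxes
masks = np.zeros((1, 768, 768), dtype=bool)           # (N, H, W) boolean masks
masks[0, 10:100, 10:100] = True
class_ids = np.array([0])

detections = sv.Detections(xyxy=boxes, mask=masks.astype(bool), class_id=class_ids)

image = np.zeros((768, 768, 3), dtype=np.uint8)
annotated = sv.BoxAnnotator().annotate(scene=image.copy(), detections=detections)
annotated = sv.LabelAnnotator().annotate(scene=annotated, detections=detections,
                                         labels=["object 0.95"])
```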
@@ -1096,36 +1030,18 @@ def process_image(input_image, input_text):
             img = input_image.copy()
             H, W, C = img.shape
 
-
-
-
-
-            # Create RGBA image
+            # Create RGBA image with default 255 alpha
             alpha = np.zeros((H, W, 1), dtype=np.uint8)
-
-            alpha[first_mask] = 255
-
-
-
-            # Crop to mask bounds to minimize image size
-            # y_indices, x_indices = np.where(first_mask)
-            # y_min, y_max = y_indices.min(), y_indices.max()
-            # x_min, x_max = x_indices.min(), x_indices.max()
-
-            # Crop the RGBA image
-            # cropped_rgba = rgba[y_min:y_max+1, x_min:x_max+1]
-
-            # Set extracted foreground for mask mover
-            # mask_mover.set_extracted_fg(cropped_rgba)
-
-            # alpha = img[..., 3] > 0
-            H, W = alpha.shape
+            alpha[~first_mask] = 0    # 128 for semi-transparent background
+            alpha[first_mask] = 255   # Make the foreground opaque
+            alpha = alpha.squeeze(-1) # Remove singleton dimension to become 2D
+            rgba = np.dstack((img, alpha)).astype(np.uint8)
             # get the bounding box of alpha
             y, x = np.where(alpha > 0)
             y0, y1 = max(y.min() - 1, 0), min(y.max() + 1, H)
             x0, x1 = max(x.min() - 1, 0), min(x.max() + 1, W)
-
-            image_center =
+
+            image_center = rgba[y0:y1, x0:x1]
             # resize the longer side to H * 0.9
             H, W, _ = image_center.shape
             if H > W:
@@ -1135,7 +1051,7 @@ def process_image(input_image, input_text):
                 H = int(H * (WIDTH * 0.9) / W)
                 W = int(WIDTH * 0.9)
 
-            image_center = np.array(Image.fromarray(image_center).resize((W, H)))
+            image_center = np.array(Image.fromarray(image_center).resize((W, H), Image.LANCZOS))
             # pad to H, W
             start_h = (HEIGHT - H) // 2
             start_w = (WIDTH - W) // 2
@@ -1148,7 +1064,6 @@ def process_image(input_image, input_text):
 
             return annotated_frame, image, gr.update(visible=False), gr.update(visible=False)
         return annotated_frame, None, gr.update(visible=False), gr.update(visible=False)
-
 
 
 