Commit 778c8b4 by juliozhao
Parent(s): dfdfaf2

Upload 13 files

README.md CHANGED
@@ -1,14 +1,14 @@
 ---
-title: DocLayout YOLO
-emoji: 📚
-colorFrom: green
-colorTo: gray
+title: DocLayout YOLO Demo
+emoji: 🐒
+colorFrom: yellow
+colorTo: green
 sdk: gradio
 sdk_version: 5.1.0
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: Online demo for DocLayout-YOLO
+short_description: Demo for DocLayout-YOLO
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,108 @@
import os
os.environ["GRADIO_TEMP_DIR"] = "./tmp"

import torch
import torchvision
import gradio as gr
import numpy as np
from huggingface_hub import snapshot_download
from visualization import visualize_bbox

# == download weights ==
model_dir = snapshot_download('juliozhao/DocLayout-YOLO-DocStructBench', local_dir='./models/DocLayout-YOLO-DocStructBench')
# == select device ==
device = 'cuda' if torch.cuda.is_available() else 'cpu'

id_to_names = {
    0: 'title',
    1: 'plain text',
    2: 'abandon',
    3: 'figure',
    4: 'figure_caption',
    5: 'table',
    6: 'table_caption',
    7: 'table_footnote',
    8: 'isolate_formula',
    9: 'formula_caption'
}

def recognize_image(input_img, conf_threshold, iou_threshold):
    # Run layout detection at the model's training resolution.
    det_res = model.predict(
        input_img,
        imgsz=1024,
        conf=conf_threshold,
        device=device,
    )[0]
    boxes = det_res.boxes.xyxy
    classes = det_res.boxes.cls
    scores = det_res.boxes.conf

    # Class-agnostic NMS with the user-selected IoU threshold.
    indices = torchvision.ops.nms(boxes=torch.Tensor(boxes), scores=torch.Tensor(scores), iou_threshold=iou_threshold)
    boxes, scores, classes = boxes[indices], scores[indices], classes[indices]
    if len(boxes.shape) == 1:
        # A single surviving detection comes back 1-D; restore the batch dimension.
        boxes = np.expand_dims(boxes, 0)
        scores = np.expand_dims(scores, 0)
        classes = np.expand_dims(classes, 0)

    vis_result = visualize_bbox(input_img, boxes, classes, scores, id_to_names)
    return vis_result

def gradio_reset():
    return gr.update(value=None), gr.update(value=None)


if __name__ == "__main__":
    # == load model ==
    from doclayout_yolo import YOLOv10
    print(f"Using device: {device}")
    model = YOLOv10(os.path.join(os.path.dirname(__file__), "models", "DocLayout-YOLO-DocStructBench", "doclayout_yolo_docstructbench_imgsz1024.pt"))  # load the official checkpoint

    with open("header.html", "r") as file:
        header = file.read()
    with gr.Blocks() as demo:
        gr.HTML(header)

        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label=" ", interactive=True)
                with gr.Row():
                    clear = gr.Button(value="Clear")
                    predict = gr.Button(value="Detect", interactive=True, variant="primary")

                with gr.Row():
                    conf_threshold = gr.Slider(
                        label="Confidence Threshold",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.05,
                        value=0.25,
                    )

                with gr.Row():
                    iou_threshold = gr.Slider(
                        label="NMS IOU Threshold",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.05,
                        value=0.45,
                    )

                with gr.Accordion("Examples:"):
                    example_root = os.path.join(os.path.dirname(__file__), "assets", "example")
                    gr.Examples(
                        examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if _.endswith("jpg")],
                        inputs=[input_img],
                    )
            with gr.Column():
                gr.Button(value="Predict Result:", interactive=False)
                output_img = gr.Image(label=" ", interactive=False)

        clear.click(gradio_reset, inputs=None, outputs=[input_img, output_img])
        predict.click(recognize_image, inputs=[input_img, conf_threshold, iou_threshold], outputs=[output_img])

    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
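
For reference, here is a minimal sketch of the same pipeline run as a plain script, without the Gradio UI. It reuses only the calls that appear in app.py above; the input image path and output filename are placeholders:

import torch
import torchvision
import cv2
from huggingface_hub import snapshot_download
from doclayout_yolo import YOLOv10
from visualization import visualize_bbox

# Fetch the checkpoint exactly as app.py does.
model_dir = snapshot_download('juliozhao/DocLayout-YOLO-DocStructBench',
                              local_dir='./models/DocLayout-YOLO-DocStructBench')
model = YOLOv10(f"{model_dir}/doclayout_yolo_docstructbench_imgsz1024.pt")
device = 'cuda' if torch.cuda.is_available() else 'cpu'

image = "assets/example/academic.jpg"  # placeholder input
det_res = model.predict(image, imgsz=1024, conf=0.25, device=device)[0]
boxes, classes, scores = det_res.boxes.xyxy, det_res.boxes.cls, det_res.boxes.conf

# The same class-agnostic NMS step the Space applies after prediction.
keep = torchvision.ops.nms(boxes=torch.Tensor(boxes), scores=torch.Tensor(scores), iou_threshold=0.45)
id_to_names = {0: 'title', 1: 'plain text', 2: 'abandon', 3: 'figure', 4: 'figure_caption',
               5: 'table', 6: 'table_caption', 7: 'table_footnote',
               8: 'isolate_formula', 9: 'formula_caption'}
vis = visualize_bbox(image, boxes[keep], classes[keep], scores[keep], id_to_names)
cv2.imwrite("result.jpg", vis)  # visualize_bbox returns a BGR array, so imwrite is correct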
assets/.DS_Store ADDED
Binary file (6.15 kB)
 
assets/example/academic.jpg ADDED
assets/example/exam_paper.jpg ADDED
assets/example/financial.jpg ADDED
assets/example/fuzzy_scan.jpg ADDED
assets/example/poster.jpg ADDED
assets/example/ppt.jpg ADDED
assets/example/textbook.jpg ADDED
header.html ADDED
@@ -0,0 +1,109 @@
<html><head>
<!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css"> -->
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
<style>
  .link-block {
    border: 1px solid transparent;
    border-radius: 24px;
    background-color: rgba(54, 54, 54, 1);
    cursor: pointer !important;
  }
  .link-block:hover {
    background-color: rgba(54, 54, 54, 0.75) !important;
    cursor: pointer !important;
  }
  .external-link {
    display: inline-flex;
    align-items: center;
    height: 36px;
    line-height: 36px;
    padding: 0 16px;
    cursor: pointer !important;
  }
  .external-link,
  .external-link:hover {
    cursor: pointer !important;
  }
  a {
    text-decoration: none;
  }
</style></head>

<body>
<div style="
  display: flex;
  flex-direction: column;
  justify-content: center;
  align-items: center;
  text-align: center;
  background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
  padding: 24px;
  gap: 24px;
  border-radius: 8px;
">
  <div style="
    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 16px;
  ">
    <div style="display: flex; flex-direction: column; gap: 8px">
      <h1 style="
        font-size: 48px;
        color: #fafafa;
        margin: 0;
        font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
          'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
      ">
        DocLayout-YOLO
      </h1>
    </div>
  </div>

  <p style="
    margin: 0;
    line-height: 1.6rem;
    font-size: 16px;
    color: #fafafa;
    opacity: 0.8;
  ">
    An efficient and robust model for real-world document layout analysis.<br>
  </p>
  <style>
    .link-block {
      display: inline-block;
    }
    .link-block + .link-block {
      margin-left: 20px;
    }
  </style>

  <div class="column has-text-centered">
    <div class="publication-links">
      <!-- Code Link. -->
      <span class="link-block">
        <a href="https://github.com/opendatalab/DocLayout-YOLO" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 4px">
            <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
          </span>
          <span style="color: white">Code</span>
        </a>
      </span>

      <!-- Paper Link. -->
      <span class="link-block">
        <a href="https://arxiv.org/abs/2410.12628" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
          <span class="icon" style="margin-right: 8px">
            <i class="fas fa-globe" style="color: white"></i>
          </span>
          <span style="color: white">Paper</span>
        </a>
      </span>
    </div>
  </div>

  <!-- New Demo Links -->
</div>

</body></html>
requirements.txt ADDED
@@ -0,0 +1,4 @@
doclayout-yolo==0.0.2
gradio==5.1.0
gradio-client==1.4.0
huggingface_hub
visualization.py ADDED
@@ -0,0 +1,90 @@
import numpy as np
import cv2
from PIL import Image

def colormap(N=256, normalized=False):
    """
    Generate the color map.

    Args:
        N (int): Number of labels (default is 256).
        normalized (bool): If True, return colors normalized to [0, 1]. Otherwise, return [0, 255].

    Returns:
        np.ndarray: Color map array of shape (N, 3).
    """
    def bitget(byteval, idx):
        """
        Get the bit value at the specified index.

        Args:
            byteval (int): The byte value.
            idx (int): The index of the bit.

        Returns:
            int: The bit value (0 or 1).
        """
        return ((byteval & (1 << idx)) != 0)

    # Standard PASCAL VOC color map: spread the label bits across RGB channels.
    cmap = np.zeros((N, 3), dtype=np.uint8)
    for i in range(N):
        r = g = b = 0
        c = i
        for j in range(8):
            r = r | (bitget(c, 0) << (7 - j))
            g = g | (bitget(c, 1) << (7 - j))
            b = b | (bitget(c, 2) << (7 - j))
            c = c >> 3
        cmap[i] = np.array([r, g, b])

    if normalized:
        cmap = cmap.astype(np.float32) / 255.0

    return cmap

def visualize_bbox(image_path, bboxes, classes, scores, id_to_names, alpha=0.3):
    """
    Visualize layout detection results on an image.

    Args:
        image_path (str | PIL.Image.Image | np.ndarray): Input image, given as a file path or an in-memory image.
        bboxes (list): List of bounding boxes, each represented as [x_min, y_min, x_max, y_max].
        classes (list): List of class IDs corresponding to the bounding boxes.
        scores (list): List of confidence scores corresponding to the bounding boxes.
        id_to_names (dict): Dictionary mapping class IDs to class names.
        alpha (float): Transparency factor for the filled color (default is 0.3).

    Returns:
        np.ndarray: Image with visualized layout detection results.
    """
    # Accept PIL images and numpy arrays as well as file paths.
    if isinstance(image_path, (Image.Image, np.ndarray)):
        image = np.array(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # Convert RGB to BGR for OpenCV
    else:
        image = cv2.imread(image_path)

    overlay = image.copy()

    cmap = colormap(N=len(id_to_names), normalized=False)

    # Iterate over each bounding box
    for i, bbox in enumerate(bboxes):
        x_min, y_min, x_max, y_max = map(int, bbox)
        class_id = int(classes[i])
        class_name = id_to_names[class_id]

        text = class_name + f":{scores[i]:.3f}"

        color = tuple(int(c) for c in cmap[class_id])
        # Filled box on the overlay (for the translucent fill), outline on the image itself.
        cv2.rectangle(overlay, (x_min, y_min), (x_max, y_max), color, -1)
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)

        # Add the class name with a background rectangle
        (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.9, 2)
        cv2.rectangle(image, (x_min, y_min - text_height - baseline), (x_min + text_width, y_min), color, -1)
        cv2.putText(image, text, (x_min, y_min - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)

    # Blend the overlay with the original image
    cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0, image)

    return image
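
As a quick self-contained check, visualize_bbox can be exercised on synthetic data; the white canvas, the single fake 'title' box, and its score below are invented for illustration:

import numpy as np
from visualization import visualize_bbox

# Synthetic input: a white 400x300 canvas with one made-up detection.
img = np.full((300, 400, 3), 255, dtype=np.uint8)
vis = visualize_bbox(img, bboxes=[[20, 20, 380, 80]], classes=[0], scores=[0.98],
                     id_to_names={0: 'title'})
print(vis.shape)  # (300, 400, 3): a BGR array ready for cv2.imwrite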