from typing import Tuple
import base64
import io
import os

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
from ultralytics import YOLO

from utils import check_ocr_box, get_som_labeled_img

# Icon detector: YOLO weights fine-tuned for GUI icon/widget detection.
yolo_model = YOLO('weights/icon_detect/best.pt')

# Icon captioner: Florence-2 base processor paired with fine-tuned caption weights.
processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "weights/icon_caption_florence",
    torch_dtype=torch.float32,
    trust_remote_code=True
)
caption_model_processor = {'processor': processor, 'model': model}
print('Finished loading models.')
platform = 'pc'
# Styling for the annotated output image (label text size/thickness, box thickness).
draw_bbox_config = {
    'text_scale': 0.8,
    'text_thickness': 2,
    'text_padding': 2,
    'thickness': 2,
}
MARKDOWN = """
# OmniParser for Pure Vision Based General GUI Agent 🔥
<div>
<a href="https://arxiv.org/pdf/2408.00203">
<img src="https://img.shields.io/badge/arXiv-2408.00203-b31b1b.svg" alt="Arxiv" style="display:inline-block;">
</a>
</div>
OmniParser is a screen-parsing tool that converts general GUI screens into structured elements.
"""
@torch.inference_mode()
def process(
    image_input,
    box_threshold,
    iou_threshold
) -> Tuple[Image.Image, str, str]:
    """Parse a screenshot: return the annotated image, the parsed element
    descriptions, and the element coordinates."""
    image_save_path = 'imgs/saved_image_demo.png'
    os.makedirs(os.path.dirname(image_save_path), exist_ok=True)
    image_input.save(image_save_path)

    # OCR pass: extract visible text and its bounding boxes.
    ocr_bbox_rslt, _ = check_ocr_box(
        image_save_path,
        display_img=False,
        output_bb_format='xyxy',
        goal_filtering=None,
        easyocr_args={'paragraph': False, 'text_threshold': 0.9},
        use_paddleocr=True
    )
    text, ocr_bbox = ocr_bbox_rslt

    # Detection + captioning pass: label icons and merge them with the OCR boxes.
    # Note: BOX_TRESHOLD (sic) matches the keyword spelling in utils.
    dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
        image_save_path,
        yolo_model,
        BOX_TRESHOLD=box_threshold,
        output_coord_in_ratio=True,
        ocr_bbox=ocr_bbox,
        draw_bbox_config=draw_bbox_config,
        caption_model_processor=caption_model_processor,
        ocr_text=text,
        iou_threshold=iou_threshold
    )

    # The labeled image comes back base64-encoded; decode it for display.
    image = Image.open(io.BytesIO(base64.b64decode(dino_labeled_img)))
    print('Finished processing.')
    parsed_content_list_str = '\n'.join(parsed_content_list)
    # label_coordinates maps element id -> bounding box.
    label_coordinates_str = '\n'.join(
        f'{idx}: {coords}' for idx, coords in label_coordinates.items())
    return image, parsed_content_list_str, label_coordinates_str
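# Gradio UI: image upload plus the two threshold sliders on the left,
# annotated image and parsed-text outputs on the right.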
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(type='pil', label='Upload Image')
            box_threshold_component = gr.Slider(
                label='Box Threshold', minimum=0.01, maximum=1.0, step=0.01, value=0.05)
            iou_threshold_component = gr.Slider(
                label='IOU Threshold', minimum=0.01, maximum=1.0, step=0.01, value=0.1)
            submit_button_component = gr.Button(
                value='Submit', variant='primary')
        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(
                label='Parsed Screen Elements', placeholder='Text Output')
            coordinates_output_component = gr.Textbox(
                label='Coordinates', placeholder='Coordinates Output')

    submit_button_component.click(
        fn=process,
        inputs=[
            image_input_component,
            box_threshold_component,
            iou_threshold_component
        ],
        outputs=[
            image_output_component,
            text_output_component,
            coordinates_output_component
        ]
    )

demo.queue().launch(share=False)
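# To run the demo locally (assuming the weight files referenced above are present):
#   python gradio_demo.py   # script name is illustrative
# Gradio will print a local URL (http://127.0.0.1:7860 by default).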