ShowUI: One Vision-Language-Action Model for GUI Visual Agent
Paper
•
2411.17465
•
Published
•
46
import time
from askui import VisionAgent
# Demo script: drive a browser through a Google image search with askui's
# VisionAgent, passing a different model_name to each click call.

# Pause (in seconds) inserted after steps that change the page; presumably
# this gives the UI time to settle before the next action -- tune as needed.
PAUSE_SECONDS = 0.5

with VisionAgent() as agent:
    # Open the search page through the agent's webbrowser tool.
    agent.tools.webbrowser.open_new("http://www.google.com")
    time.sleep(PAUSE_SECONDS)

    # Click the search box (located from the text description), then submit
    # the query.
    agent.click(
        "search field in the center of the screen",
        model_name="Qwen/Qwen2-VL-7B-Instruct",
    )
    agent.type("cats")
    agent.keyboard("enter")
    time.sleep(PAUSE_SECONDS)

    # Click the element described as the 'Images' text.
    agent.click("text 'Images'", model_name="AskUI/PTA-1")
    time.sleep(PAUSE_SECONDS)

    # Click the element described as the second cat image.
    agent.click("second cat image", model_name="OS-Copilot/OS-Atlas-Base-7B")