"""Gradio demo: label objects in an image with Grounding DINO + GPT-4V.

Grounding DINO proposes regions for ``dino_prompt`` and GPT-4V classifies each
region into one of the classes listed in ``gpt_prompt``.
"""

import os
import tempfile

import gradio as gr
from autodistill_gpt_4v import GPT4V
from autodistill.detection import CaptionOntology
from autodistill_grounding_dino import GroundingDINO
from autodistill.utils import plot
import cv2
from autodistill.core.custom_detection_model import CustomDetectionModel

# The OpenAI key is read from the environment instead of being committed to
# source control. NOTE: the key that used to be hardcoded here has been
# exposed and must be revoked.
api_key = os.environ.get("OPENAI_API_KEY", "")

# Hardcoded prompts.
dino_prompt = "buildings . parks ."
gpt_prompt = "buildings"

MARKDOWN = """
# DINO-GPT4V

Use Grounding DINO and GPT-4V to label specific objects.

Visit [awesome-openai-vision-api-experiments](https://github.com/roboflow/awesome-openai-vision-api-experiments)
repository to find more OpenAI Vision API experiments or contribute your own."""


def respond(input_image):
    """Detect and label objects in ``input_image`` and return the annotated image.

    Args:
        input_image: Image from the Gradio input component (declared below with
            ``type="numpy"``), or ``None`` when nothing was uploaded.

    Returns:
        The annotated image as a numpy array in RGB channel order, suitable
        for the Gradio output component.

    Raises:
        gr.Error: If no image was provided, the API key is not configured, or
            the image could not be written to disk.
    """
    if input_image is None:
        raise gr.Error("Please upload an image before submitting.")
    if not api_key:
        raise gr.Error("The OPENAI_API_KEY environment variable is not set.")

    classes = gpt_prompt.split(", ")

    # Gradio supplies RGB; OpenCV reads and writes BGR.
    bgr_image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)

    # The model is given a file path, so the image goes through a scratch
    # directory that is removed again when the block exits (the previous
    # delete=False temp file was never cleaned up).
    with tempfile.TemporaryDirectory() as work_dir:
        image_path = os.path.join(work_dir, "input.jpg")
        if not cv2.imwrite(image_path, bgr_image):
            raise gr.Error("Could not write the uploaded image to a temporary file.")

        model = CustomDetectionModel(
            detection_model=GroundingDINO(
                CaptionOntology({dino_prompt: dino_prompt})
            ),
            classification_model=GPT4V(
                CaptionOntology({name: name for name in classes}),
                api_key=api_key,
            ),
        )

        results = model.predict(image_path)
        if isinstance(results, tuple):
            # NOTE(review): assumes the detections are the first tuple item --
            # confirm against the installed autodistill version.
            results = results[0]

        annotated = plot(
            image=cv2.imread(image_path),
            detections=results,
            classes=classes,
            raw=True,
        )

    # The frame passed to plot() came from cv2.imread (BGR); convert back so
    # the Gradio output, which expects RGB, does not show swapped red/blue.
    # NOTE(review): assumes plot(raw=True) returns the frame in the same
    # channel order it was given -- confirm.
    return cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="numpy", label="Input Image")
        with gr.Column():
            output_image = gr.Image(type="numpy", label="Output Image")
    submit_button = gr.Button("Submit")

    submit_button.click(
        fn=respond,
        inputs=[input_image],
        outputs=[output_image],
    )

demo.launch()