Spaces: Running on Zero
nhatipoglu committed
Commit • 8677efd
1 Parent(s): e13466a
add app files
Browse files
- .idea/.gitignore +8 -0
- .idea/demo-vit-v2.iml +8 -0
- .idea/inspectionProfiles/Project_Default.xml +14 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +7 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +13 -0
- README.md +43 -11
- app.py +180 -0
- ex.py +87 -0
- images/2024_09_10_10_56_40.png +0 -0
- images/2024_09_10_10_58_23.png +0 -0
- images/2024_09_10_10_58_40.png +0 -0
- images/2024_09_10_11_07_31.png +0 -0
- images/comics.jpeg +0 -0
- requirements.txt +106 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
.idea/demo-vit-v2.iml
ADDED
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,14 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="1">
            <item index="0" class="java.lang.String" itemvalue="yarl" />
          </list>
        </value>
      </option>
    </inspection_tool>
  </profile>
</component>
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
.idea/misc.xml
ADDED
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="GithubDefaultAccount">
    <option name="defaultAccountId" value="16dd0ba3-f1ec-4fdf-9c62-48bd69c3904d" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (linkedIn_auto_jobs_applier_with_AI)" project-jdk-type="Python SDK" />
</project>
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/demo-vit-v2.iml" filepath="$PROJECT_DIR$/.idea/demo-vit-v2.iml" />
    </modules>
  </component>
</project>
.idea/vcs.xml
ADDED
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="GitSharedSettings">
    <option name="FORCE_PUSH_PROHIBITED_PATTERNS">
      <list>
        <option value="master" />
      </list>
    </option>
  </component>
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>
README.md
CHANGED
@@ -1,13 +1,45 @@
----
-title: Demo Vit V2
-emoji: 📉
-colorFrom: purple
-colorTo: pink
-sdk: gradio
-sdk_version: 4.44.0
-app_file: app.py
-pinned: false
-license: mit
----
-
+### README.md for Multi-Model Object Detection Demo
+
+---
+
+# Multi-Model Object Detection Demo
+
+This repository provides a demo application that uses multiple state-of-the-art vision-language models for various tasks such as object detection, image captioning, visual question answering, and image-text matching. The demo is built using Gradio for the user interface and leverages Hugging Face's `transformers` library to load and run various pre-trained models.
+
+## Available Models
+
+The following models are available in the demo:
+
+- **Qwen2-VL (7B, 2B, 5B, 1B):** Vision-language models optimized for object detection, question-answering, and image description tasks.
+- **BLIP:** Specialized in image captioning and visual question-answering.
+- **CLIP:** Uses contrastive learning for image-text matching.
+
+## Usage
+
+To use the demo:
+
+1. **Input an Image:** Upload an image that you want to analyze.
+2. **Select a Model:** Choose a model from the dropdown list to perform the desired task.
+3. **Provide a System Prompt:** Optionally, enter a system prompt to guide the model's behavior.
+4. **Enter a User Prompt:** Describe the object or task you want the model to perform.
+5. **Submit:** Click the "Submit" button to run the model and display the results.
+
+## Getting Started
+
+### Example Inputs
+
+The demo provides some pre-configured examples to try:
+
+- **Image 1:** Detect goats in an image.
+- **Image 2:** Find a blue button in the image.
+- **Image 3:** Describe a person on a bike.
+- **Image 4:** Solve questions from a screenshot.
+- **Image 5:** Describe various images such as landscapes, animals, or objects.
+
+## Available Functions
+
+- `run_example`: Core function to process the input image and prompts, run the selected model, and return the results.
+- `image_to_base64`: Converts an image to a base64 encoded string for model processing.
+- `draw_bounding_boxes`: Draws bounding boxes around detected objects in the image.
+- `rescale_bounding_boxes`: Rescales bounding boxes to the original image dimensions.
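The bounding-box convention behind `rescale_bounding_boxes` (and the default system prompt in `app.py` below) is easiest to see in isolation: the model is asked to answer in `[xmin, ymin, xmax, ymax]` coordinates on a 1000×1000 grid, and the app maps those back to the real image size. A minimal self-contained sketch using the same regex as `app.py`; the sample model output and the 1920×1080 image size are made up for illustration:

```python
import re

# Pretend model output for a "detect goats" style prompt (illustrative only).
sample_output = "The goats are at [[120, 250, 480, 700], [510, 300, 820, 690]]."

# Same pattern app.py uses to pull [xmin, ymin, xmax, ymax] groups out of the text.
pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
parsed_boxes = [[int(n) for n in m] for m in re.findall(pattern, sample_output)]

# Map from the 1000x1000 prompt grid back to a pretend 1920x1080 image,
# mirroring the math in rescale_bounding_boxes.
original_width, original_height = 1920, 1080
x_scale, y_scale = original_width / 1000, original_height / 1000
scaled_boxes = [
    [xmin * x_scale, ymin * y_scale, xmax * x_scale, ymax * y_scale]
    for xmin, ymin, xmax, ymax in parsed_boxes
]

print(parsed_boxes)  # [[120, 250, 480, 700], [510, 300, 820, 690]]
print(scaled_boxes)  # [[230.4, 270.0, 921.6, 756.0], [979.2, 324.0, 1574.4, 745.2]]
```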
app.py
ADDED
@@ -0,0 +1,180 @@
import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, CLIPModel, \
    BlipForConditionalGeneration, CLIPProcessor, BlipProcessor
from qwen_vl_utils import process_vision_info
import torch
import base64
from PIL import Image, ImageDraw
from io import BytesIO
import re

# Every selectable checkpoint is instantiated once at import time and looked up by ID per request.
models = {
    "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct",
                                                                                 torch_dtype="auto", device_map="auto"),
    "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
                                                                                 torch_dtype="auto", device_map="auto"),
    "Qwen/Qwen2-VL-1B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-1B-Instruct",
                                                                                 torch_dtype="auto", device_map="auto"),
    "Qwen/Qwen2-VL-5B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-5B-Instruct",
                                                                                 torch_dtype="auto", device_map="auto"),
    "openai/clip-vit-base-patch32": CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
    "Salesforce/blip-image-captioning-base": BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"),
}

processors = {
    "Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct"),
    "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct"),
    "Qwen/Qwen2-VL-1B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-1B-Instruct"),
    "Qwen/Qwen2-VL-5B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-5B-Instruct"),
    "openai/clip-vit-base-patch32": CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
    "Salesforce/blip-image-captioning-base": BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base"),
}


def image_to_base64(image):
    # Encode a PIL image as a base64 PNG string so it can be embedded as a data URI in the chat message.
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str


def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2):
    draw = ImageDraw.Draw(image)
    for box in bounding_boxes:
        xmin, ymin, xmax, ymax = box
        draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
    return image


def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
    # Map boxes from the model's 1000x1000 coordinate space back to the original image size.
    x_scale = original_width / scaled_width
    y_scale = original_height / scaled_height
    rescaled_boxes = []
    for box in bounding_boxes:
        xmin, ymin, xmax, ymax = box
        rescaled_box = [
            xmin * x_scale,
            ymin * y_scale,
            xmax * x_scale,
            ymax * y_scale
        ]
        rescaled_boxes.append(rescaled_box)
    return rescaled_boxes


@spaces.GPU
def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Instruct"):
    model = models[model_id].eval()
    processor = processors[model_id]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"data:image;base64,{image_to_base64(image)}"},
                {"type": "text", "text": system_prompt},
                {"type": "text", "text": text_input},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(generated_ids_trimmed,
                                         skip_special_tokens=True,
                                         clean_up_tokenization_spaces=False)

    print(output_text)
    # Pull every [xmin, ymin, xmax, ymax] group out of the generated text.
    pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
    matches = re.findall(pattern, str(output_text))
    parsed_boxes = [[int(num) for num in match] for match in matches]
    scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)

    return output_text, parsed_boxes, draw_bounding_boxes(image, scaled_boxes)


css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""
default_system_prompt = ("You are a helpful assistant that detects objects in images. "
                         "When asked to detect elements based on a description you return bounding boxes for all "
                         "elements in the form of [xmin, ymin, xmax, ymax] with the "
                         "values being scaled to 1000 by 1000 pixels. When there is more than one result, "
                         "answer with a list of bounding boxes in the form of"
                         " [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...].")

with gr.Blocks(css=css) as demo:
    gr.Markdown(
        """
        # Multi-Model Object Detection Demo
        This demo uses various state-of-the-art models for object detection and image-text alignment tasks.

        **Available Models**:
        - **Qwen2-VL (7B, 2B, 5B, 1B)**: Vision-language models optimized for various tasks.
        - **BLIP**: Image captioning and visual question answering.
        - **CLIP**: Contrastive learning for image-text matching.
        - **Flamingo**: Few-shot learning for various visual tasks.
        - **LLaVA**: Balanced performance in visual understanding and interactive AI tasks.

        **Usage**: Input an image and a description of the target object you want to detect.
        """
    )
    with gr.Tab(label="Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Image", type="pil")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-2B-Instruct")
                system_prompt = gr.Textbox(label="System Prompt", value=default_system_prompt)
                text_input = gr.Textbox(label="User Prompt")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                model_output_text = gr.Textbox(label="Model Output Text")
                parsed_boxes = gr.Textbox(label="Parsed Boxes")
                annotated_image = gr.Image(label="Annotated Image")

        gr.Examples(
            examples=[
                ["images/2024_09_10_10_56_40.png", "solve the questions in Turkish", default_system_prompt],
                ["images/2024_09_10_10_58_23.png", "solve the questions in Turkish", default_system_prompt],
                ["images/2024_09_10_10_58_40.png", "solve the questions in Turkish", default_system_prompt],
                ["images/2024_09_10_11_07_31.png", "Describe the questions and write python code", default_system_prompt],
                ["images/IMG_3644", "Describe the image", default_system_prompt],
                ["images/IMG_3658", "Describe the image", default_system_prompt],
                ["images/IMG_4028", "Describe the image", default_system_prompt],
                ["images/IMG_4070", "Describe the image", default_system_prompt],
                ["images/comics.jpeg", "Describe the image", default_system_prompt],
            ],
            inputs=[input_img, text_input, system_prompt],
            outputs=[model_output_text, parsed_boxes, annotated_image],
            fn=run_example,
            cache_examples=True,
            label="Try examples"
        )

    submit_btn.click(run_example, [input_img, text_input, system_prompt, model_selector],
                     [model_output_text, parsed_boxes, annotated_image])

demo.launch(debug=True)
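Because `run_example` is wired to the Submit button, the deployed Space also exposes it over the Gradio client API. A hedged sketch of calling it remotely with `gradio_client` (pinned in `requirements.txt`); the Space ID `nhatipoglu/demo-vit-v2` and the `/run_example` endpoint name are assumptions inferred from this commit, not confirmed by it:

```python
# Hedged sketch: query the running Space instead of loading the models locally.
# The Space ID and api_name below are assumptions inferred from this repository.
from gradio_client import Client, handle_file

client = Client("nhatipoglu/demo-vit-v2")           # assumed Space ID
output_text, boxes, annotated_path = client.predict(
    handle_file("images/comics.jpeg"),              # input_img (example image shipped in this commit)
    "Describe the image",                           # text_input (user prompt)
    "You are a helpful assistant that detects objects in images.",  # system_prompt
    "Qwen/Qwen2-VL-2B-Instruct",                    # model_selector
    api_name="/run_example",                        # assumed default endpoint name for the click handler
)
print(output_text)
print(boxes)           # boxes as parsed in the model's 1000x1000 coordinate space
print(annotated_path)  # local path to the annotated image downloaded by the client
```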
ex.py
ADDED
@@ -0,0 +1,87 @@
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# %%

model = Qwen2VLForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="Qwen/Qwen2-VL-2B-Instruct",
                                                        torch_dtype="auto",
                                                        device_map="auto")

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")


# %%
def rescale_image_dimensions(original_width, original_height, max_size=1000):
    # Rescale if the original dimensions exceed 1000 pixels
    if original_width > max_size or original_height > max_size:
        aspect_ratio = original_width / original_height

        if aspect_ratio > 1:  # Width is greater than height
            scaled_width = max_size
            scaled_height = int(max_size / aspect_ratio)
        else:  # Height is greater than or equal to width
            scaled_height = max_size
            scaled_width = int(max_size * aspect_ratio)
    else:
        # Original dimensions already fit
        scaled_width = original_width
        scaled_height = original_height

    return scaled_width, scaled_height


# %%
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "/home/nuh-hatipoglu/Desktop/NewMind/demo-vit/images/IMG_3644.JPG",
            },
            {"type": "text", "text": "Describe image"},
        ],
    }
]
# %%
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# %%
image_inputs, video_inputs = process_vision_info(messages)

# %%

original_width, original_height = image_inputs[0].size
new_width, new_height = rescale_image_dimensions(original_width, original_height)
rescaled_image = image_inputs[0].resize((new_width, new_height), Image.Resampling.LANCZOS)
image_inputs = [rescaled_image]

# %%

image_inputs[0].show()
# %%
inputs = processor(text=[text],
                   images=image_inputs,
                   videos=video_inputs,
                   padding=True,
                   return_tensors="pt")
inputs = inputs.to("cuda")
# %%
# Open the image
image_path = "your_image_path.jpg"  # Path to the image file
image = Image.open(image_path)

# Get the original dimensions
original_width, original_height = image.size

# Compute the new dimensions
new_width, new_height = rescale_image_dimensions(original_width, original_height)

# Resize the image
rescaled_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

# Save the resized image
rescaled_image.save("rescaled_" + image_path)

print(f"Image successfully resized: {new_width}x{new_height}")
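For reference, the scaling rule implemented by `rescale_image_dimensions` in `ex.py` caps the longer side at 1000 px while preserving the aspect ratio. A small standalone sketch reproducing the helper with a few sample calls; the image sizes are illustrative:

```python
# rescale_image_dimensions as defined in ex.py, with sample calls showing the
# longer side being capped at 1000 px while the aspect ratio is preserved.
def rescale_image_dimensions(original_width, original_height, max_size=1000):
    if original_width > max_size or original_height > max_size:
        aspect_ratio = original_width / original_height
        if aspect_ratio > 1:             # wider than tall: cap the width
            scaled_width = max_size
            scaled_height = int(max_size / aspect_ratio)
        else:                            # taller than (or as tall as) wide: cap the height
            scaled_height = max_size
            scaled_width = int(max_size * aspect_ratio)
    else:                                # already within the limit: keep as-is
        scaled_width = original_width
        scaled_height = original_height
    return scaled_width, scaled_height


print(rescale_image_dimensions(4032, 3024))  # (1000, 750)  landscape photo
print(rescale_image_dimensions(3024, 4032))  # (750, 1000)  portrait photo
print(rescale_image_dimensions(800, 600))    # (800, 600)   already small enough
```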
images/2024_09_10_10_56_40.png
ADDED
images/2024_09_10_10_58_23.png
ADDED
images/2024_09_10_10_58_40.png
ADDED
images/2024_09_10_11_07_31.png
ADDED
images/comics.jpeg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,106 @@
accelerate==0.30.0
aiofiles==23.2.1
altair==5.4.1
annotated-types==0.7.0
anyio==4.4.0
attrs==24.2.0
blinker==1.8.2
cachetools==5.5.0
certifi==2024.8.30
charset-normalizer==3.3.2
click==8.1.7
contourpy==1.3.0
cycler==0.12.1
fastapi==0.114.1
ffmpy==0.4.0
filelock==3.16.0
fonttools==4.53.1
fsspec==2024.9.0
gitdb==4.0.11
GitPython==3.1.43
gradio==4.44.0
gradio_client==1.3.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.24.6
idna==3.8
importlib_resources==6.4.5
Jinja2==3.1.4
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
kiwisolver==1.4.7
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
mdurl==0.1.2
mpmath==1.3.0
narwhals==1.6.4
networkx==3.3
numpy==2.1.1
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.68
nvidia-nvtx-cu12==12.1.105
orjson==3.10.7
packaging==24.1
pandas==2.2.2
pillow==10.4.0
protobuf==5.28.0
psutil==5.9.8
pyarrow==17.0.0
pydantic==2.9.1
pydantic_core==2.23.3
pydeck==0.9.1
pydub==0.25.1
Pygments==2.18.0
pyparsing==3.1.4
python-dateutil==2.9.0.post0
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.2
qwen-vl-utils==0.0.4
referencing==0.35.1
regex==2024.7.24
requests==2.32.3
rich==13.8.1
rpds-py==0.20.0
ruff==0.6.4
safetensors==0.4.5
semantic-version==2.10.0
setuptools==74.1.2
shellingham==1.5.4
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
spaces==0.30.2
starlette==0.38.5
streamlit==1.38.0
sympy==1.13.2
tenacity==8.5.0
tokenizers==0.19.1
toml==0.10.2
tomlkit==0.12.0
torch==2.4.1
torchvision==0.19.1
tornado==6.4.1
tqdm==4.66.5
transformers @ git+https://github.com/huggingface/transformers.git@f38590dade57c1f8cf8a67e9409dae8935f8c478
triton==3.0.0
typer==0.12.5
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
uvicorn==0.30.6
watchdog==4.0.2
websockets==12.0
yarl==1.7.0
transformers~=4.45.0.dev0