Spaces: Running on Zero
nhatipoglu committed
Commit • 8677efd
1 Parent(s): e13466a
add app files
Browse files
- .idea/.gitignore +8 -0
- .idea/demo-vit-v2.iml +8 -0
- .idea/inspectionProfiles/Project_Default.xml +14 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +7 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +13 -0
- README.md +43 -11
- app.py +180 -0
- ex.py +87 -0
- images/2024_09_10_10_56_40.png +0 -0
- images/2024_09_10_10_58_23.png +0 -0
- images/2024_09_10_10_58_40.png +0 -0
- images/2024_09_10_11_07_31.png +0 -0
- images/comics.jpeg +0 -0
- requirements.txt +106 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
.idea/demo-vit-v2.iml
ADDED
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,14 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="1">
            <item index="0" class="java.lang.String" itemvalue="yarl" />
          </list>
        </value>
      </option>
    </inspection_tool>
  </profile>
</component>
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
.idea/misc.xml
ADDED
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="GithubDefaultAccount">
    <option name="defaultAccountId" value="16dd0ba3-f1ec-4fdf-9c62-48bd69c3904d" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (linkedIn_auto_jobs_applier_with_AI)" project-jdk-type="Python SDK" />
</project>
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/demo-vit-v2.iml" filepath="$PROJECT_DIR$/.idea/demo-vit-v2.iml" />
    </modules>
  </component>
</project>
.idea/vcs.xml
ADDED
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="GitSharedSettings">
    <option name="FORCE_PUSH_PROHIBITED_PATTERNS">
      <list>
        <option value="master" />
      </list>
    </option>
  </component>
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>
README.md
CHANGED
@@ -1,13 +1,45 @@
----
-title: Demo Vit V2
-emoji: 📉
-colorFrom: purple
-colorTo: pink
-sdk: gradio
-sdk_version: 4.44.0
-app_file: app.py
-pinned: false
-license: mit
----
-
+### README.md for Multi-Model Object Detection Demo
+
+---
+
+# Multi-Model Object Detection Demo
+
+This repository provides a demo application that uses multiple state-of-the-art vision-language models for various tasks such as object detection, image captioning, visual question answering, and image-text matching. The demo is built using Gradio for the user interface and leverages Hugging Face's `transformers` library to load and run various pre-trained models.
+
+## Available Models
+
+The following models are available in the demo:
+
+- **Qwen2-VL (7B, 2B, 5B, 1B):** Vision-language models optimized for object detection, question-answering, and image description tasks.
+- **BLIP:** Specialized in image captioning and visual question-answering.
+- **CLIP:** Uses contrastive learning for image-text matching.
+
+## Usage
+
+To use the demo:
+
+1. **Input an Image:** Upload an image that you want to analyze.
+2. **Select a Model:** Choose a model from the dropdown list to perform the desired task.
+3. **Provide a System Prompt:** Optionally, enter a system prompt to guide the model's behavior.
+4. **Enter a User Prompt:** Describe the object or task you want the model to perform.
+5. **Submit:** Click the "Submit" button to run the model and display the results.
+
+## Getting Started
+
+### Example Inputs
+
+The demo provides some pre-configured examples to try:
+
+- **Image 1:** Detect goats in an image.
+- **Image 2:** Find a blue button in the image.
+- **Image 3:** Describe a person on a bike.
+- **Image 4:** Solve questions from a screenshot.
+- **Image 5:** Describe various images such as landscapes, animals, or objects.
+
+## Available Functions
+
+- `run_example`: Core function to process the input image and prompts, run the selected model, and return the results.
+- `image_to_base64`: Converts an image to a base64 encoded string for model processing.
+- `draw_bounding_boxes`: Draws bounding boxes around detected objects in the image.
+- `rescale_bounding_boxes`: Rescales bounding boxes to the original image dimensions.
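The bounding-box convention behind `rescale_bounding_boxes` (and the default system prompt in `app.py` below) is easiest to see in isolation: the model is asked to answer in `[xmin, ymin, xmax, ymax]` coordinates on a 1000×1000 grid, and the app maps those back to the real image size. A minimal self-contained sketch using the same regex as `app.py`; the sample model output and the 1920×1080 image size are made up for illustration:

```python
import re

# Pretend model output for a "detect goats" style prompt (illustrative only).
sample_output = "The goats are at [[120, 250, 480, 700], [510, 300, 820, 690]]."

# Same pattern app.py uses to pull [xmin, ymin, xmax, ymax] groups out of the text.
pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
parsed_boxes = [[int(n) for n in m] for m in re.findall(pattern, sample_output)]

# Map from the 1000x1000 prompt grid back to a pretend 1920x1080 image,
# mirroring the math in rescale_bounding_boxes.
original_width, original_height = 1920, 1080
x_scale, y_scale = original_width / 1000, original_height / 1000
scaled_boxes = [
    [xmin * x_scale, ymin * y_scale, xmax * x_scale, ymax * y_scale]
    for xmin, ymin, xmax, ymax in parsed_boxes
]

print(parsed_boxes)  # [[120, 250, 480, 700], [510, 300, 820, 690]]
print(scaled_boxes)  # [[230.4, 270.0, 921.6, 756.0], [979.2, 324.0, 1574.4, 745.2]]
```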
app.py
ADDED
@@ -0,0 +1,180 @@
import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor, CLIPModel, \
    BlipForConditionalGeneration, CLIPProcessor, BlipProcessor
from qwen_vl_utils import process_vision_info
import torch
import base64
from PIL import Image, ImageDraw
from io import BytesIO
import re

# Every selectable checkpoint is instantiated once at import time and looked up by ID per request.
models = {
    "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct",
                                                                                 torch_dtype="auto", device_map="auto"),
    "Qwen/Qwen2-VL-2B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct",
                                                                                 torch_dtype="auto", device_map="auto"),
    "Qwen/Qwen2-VL-1B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-1B-Instruct",
                                                                                 torch_dtype="auto", device_map="auto"),
    "Qwen/Qwen2-VL-5B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-5B-Instruct",
                                                                                 torch_dtype="auto", device_map="auto"),
    "openai/clip-vit-base-patch32": CLIPModel.from_pretrained("openai/clip-vit-base-patch32"),
    "Salesforce/blip-image-captioning-base": BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"),
}

processors = {
    "Qwen/Qwen2-VL-7B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct"),
    "Qwen/Qwen2-VL-2B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct"),
    "Qwen/Qwen2-VL-1B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-1B-Instruct"),
    "Qwen/Qwen2-VL-5B-Instruct": AutoProcessor.from_pretrained("Qwen/Qwen2-VL-5B-Instruct"),
    "openai/clip-vit-base-patch32": CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32"),
    "Salesforce/blip-image-captioning-base": BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base"),
}


def image_to_base64(image):
    # Encode a PIL image as a base64 PNG string so it can be embedded as a data URI in the chat message.
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str


def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2):
    draw = ImageDraw.Draw(image)
    for box in bounding_boxes:
        xmin, ymin, xmax, ymax = box
        draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
    return image


def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
    # Map boxes from the model's 1000x1000 coordinate space back to the original image size.
    x_scale = original_width / scaled_width
    y_scale = original_height / scaled_height
    rescaled_boxes = []
    for box in bounding_boxes:
        xmin, ymin, xmax, ymax = box
        rescaled_box = [
            xmin * x_scale,
            ymin * y_scale,
            xmax * x_scale,
            ymax * y_scale
        ]
        rescaled_boxes.append(rescaled_box)
    return rescaled_boxes


@spaces.GPU
def run_example(image, text_input, system_prompt, model_id="Qwen/Qwen2-VL-7B-Instruct"):
    model = models[model_id].eval()
    processor = processors[model_id]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"data:image;base64,{image_to_base64(image)}"},
                {"type": "text", "text": system_prompt},
                {"type": "text", "text": text_input},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(generated_ids_trimmed,
                                         skip_special_tokens=True,
                                         clean_up_tokenization_spaces=False)

    print(output_text)
    # Pull every [xmin, ymin, xmax, ymax] group out of the generated text.
    pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
    matches = re.findall(pattern, str(output_text))
    parsed_boxes = [[int(num) for num in match] for match in matches]
    scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)

    return output_text, parsed_boxes, draw_bounding_boxes(image, scaled_boxes)


css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""
default_system_prompt = ("You are a helpful assistant that detects objects in images. "
                         "When asked to detect elements based on a description you return bounding boxes for all "
                         "elements in the form of [xmin, ymin, xmax, ymax] with the "
                         "values being scaled to 1000 by 1000 pixels. When there is more than one result, "
                         "answer with a list of bounding boxes in the form of"
                         " [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...].")

with gr.Blocks(css=css) as demo:
    gr.Markdown(
        """
        # Multi-Model Object Detection Demo
        This demo uses various state-of-the-art models for object detection and image-text alignment tasks.

        **Available Models**:
        - **Qwen2-VL (7B, 2B, 5B, 1B)**: Vision-language models optimized for various tasks.
        - **BLIP**: Image captioning and visual question answering.
        - **CLIP**: Contrastive learning for image-text matching.
        - **Flamingo**: Few-shot learning for various visual tasks.
        - **LLaVA**: Balanced performance in visual understanding and interactive AI tasks.

        **Usage**: Input an image and a description of the target object you want to detect.
        """
    )
    with gr.Tab(label="Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Image", type="pil")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-2B-Instruct")
                system_prompt = gr.Textbox(label="System Prompt", value=default_system_prompt)
                text_input = gr.Textbox(label="User Prompt")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                model_output_text = gr.Textbox(label="Model Output Text")
                parsed_boxes = gr.Textbox(label="Parsed Boxes")
                annotated_image = gr.Image(label="Annotated Image")

        gr.Examples(
            examples=[
                ["images/2024_09_10_10_56_40.png", "solve the questions in Turkish", default_system_prompt],
                ["images/2024_09_10_10_58_23.png", "solve the questions in Turkish", default_system_prompt],
                ["images/2024_09_10_10_58_40.png", "solve the questions in Turkish", default_system_prompt],
                ["images/2024_09_10_11_07_31.png", "Describe the questions and write python code", default_system_prompt],
                ["images/IMG_3644", "Describe the image", default_system_prompt],
                ["images/IMG_3658", "Describe the image", default_system_prompt],
                ["images/IMG_4028", "Describe the image", default_system_prompt],
                ["images/IMG_4070", "Describe the image", default_system_prompt],
                ["images/comics.jpeg", "Describe the image", default_system_prompt],
            ],
            inputs=[input_img, text_input, system_prompt],
            outputs=[model_output_text, parsed_boxes, annotated_image],
            fn=run_example,
            cache_examples=True,
            label="Try examples"
        )

    submit_btn.click(run_example, [input_img, text_input, system_prompt, model_selector],
                     [model_output_text, parsed_boxes, annotated_image])

demo.launch(debug=True)
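Because `run_example` is wired to the Submit button, the deployed Space also exposes it over the Gradio client API. A hedged sketch of calling it remotely with `gradio_client` (pinned in `requirements.txt`); the Space ID `nhatipoglu/demo-vit-v2` and the `/run_example` endpoint name are assumptions inferred from this commit, not confirmed by it:

```python
# Hedged sketch: query the running Space instead of loading the models locally.
# The Space ID and api_name below are assumptions inferred from this repository.
from gradio_client import Client, handle_file

client = Client("nhatipoglu/demo-vit-v2")           # assumed Space ID
output_text, boxes, annotated_path = client.predict(
    handle_file("images/comics.jpeg"),              # input_img (example image shipped in this commit)
    "Describe the image",                           # text_input (user prompt)
    "You are a helpful assistant that detects objects in images.",  # system_prompt
    "Qwen/Qwen2-VL-2B-Instruct",                    # model_selector
    api_name="/run_example",                        # assumed default endpoint name for the click handler
)
print(output_text)
print(boxes)           # boxes as parsed in the model's 1000x1000 coordinate space
print(annotated_path)  # local path to the annotated image downloaded by the client
```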
ex.py
ADDED
@@ -0,0 +1,87 @@
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# %%

model = Qwen2VLForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="Qwen/Qwen2-VL-2B-Instruct",
                                                        torch_dtype="auto",
                                                        device_map="auto")

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")


# %%
def rescale_image_dimensions(original_width, original_height, max_size=1000):
    # Rescale if the original dimensions exceed 1000 pixels
    if original_width > max_size or original_height > max_size:
        aspect_ratio = original_width / original_height

        if aspect_ratio > 1:  # Width is greater than height
            scaled_width = max_size
            scaled_height = int(max_size / aspect_ratio)
        else:  # Height is greater than or equal to width
            scaled_height = max_size
            scaled_width = int(max_size * aspect_ratio)
    else:
        # Original dimensions already fit
        scaled_width = original_width
        scaled_height = original_height

    return scaled_width, scaled_height


# %%
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "/home/nuh-hatipoglu/Desktop/NewMind/demo-vit/images/IMG_3644.JPG",
            },
            {"type": "text", "text": "Describe image"},
        ],
    }
]
# %%
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# %%
image_inputs, video_inputs = process_vision_info(messages)

# %%

original_width, original_height = image_inputs[0].size
new_width, new_height = rescale_image_dimensions(original_width, original_height)
rescaled_image = image_inputs[0].resize((new_width, new_height), Image.Resampling.LANCZOS)
image_inputs = [rescaled_image]

# %%

image_inputs[0].show()
# %%
inputs = processor(text=[text],
                   images=image_inputs,
                   videos=video_inputs,
                   padding=True,
                   return_tensors="pt")
inputs = inputs.to("cuda")
# %%
# Open the image
image_path = "your_image_path.jpg"  # Path to the image file
image = Image.open(image_path)

# Get the original dimensions
original_width, original_height = image.size

# Compute the new dimensions
new_width, new_height = rescale_image_dimensions(original_width, original_height)

# Resize the image
rescaled_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

# Save the resized image
rescaled_image.save("rescaled_" + image_path)

print(f"Image successfully resized: {new_width}x{new_height}")
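For reference, the scaling rule implemented by `rescale_image_dimensions` in `ex.py` caps the longer side at 1000 px while preserving the aspect ratio. A small standalone sketch reproducing the helper with a few sample calls; the image sizes are illustrative:

```python
# rescale_image_dimensions as defined in ex.py, with sample calls showing the
# longer side being capped at 1000 px while the aspect ratio is preserved.
def rescale_image_dimensions(original_width, original_height, max_size=1000):
    if original_width > max_size or original_height > max_size:
        aspect_ratio = original_width / original_height
        if aspect_ratio > 1:             # wider than tall: cap the width
            scaled_width = max_size
            scaled_height = int(max_size / aspect_ratio)
        else:                            # taller than (or as tall as) wide: cap the height
            scaled_height = max_size
            scaled_width = int(max_size * aspect_ratio)
    else:                                # already within the limit: keep as-is
        scaled_width = original_width
        scaled_height = original_height
    return scaled_width, scaled_height


print(rescale_image_dimensions(4032, 3024))  # (1000, 750)  landscape photo
print(rescale_image_dimensions(3024, 4032))  # (750, 1000)  portrait photo
print(rescale_image_dimensions(800, 600))    # (800, 600)   already small enough
```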
images/2024_09_10_10_56_40.png
ADDED
images/2024_09_10_10_58_23.png
ADDED
images/2024_09_10_10_58_40.png
ADDED
images/2024_09_10_11_07_31.png
ADDED
images/comics.jpeg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,106 @@
accelerate==0.30.0
aiofiles==23.2.1
altair==5.4.1
annotated-types==0.7.0
anyio==4.4.0
attrs==24.2.0
blinker==1.8.2
cachetools==5.5.0
certifi==2024.8.30
charset-normalizer==3.3.2
click==8.1.7
contourpy==1.3.0
cycler==0.12.1
fastapi==0.114.1
ffmpy==0.4.0
filelock==3.16.0
fonttools==4.53.1
fsspec==2024.9.0
gitdb==4.0.11
GitPython==3.1.43
gradio==4.44.0
gradio_client==1.3.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.24.6
idna==3.8
importlib_resources==6.4.5
Jinja2==3.1.4
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
kiwisolver==1.4.7
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
mdurl==0.1.2
mpmath==1.3.0
narwhals==1.6.4
networkx==3.3
numpy==2.1.1
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.68
nvidia-nvtx-cu12==12.1.105
orjson==3.10.7
packaging==24.1
pandas==2.2.2
pillow==10.4.0
protobuf==5.28.0
psutil==5.9.8
pyarrow==17.0.0
pydantic==2.9.1
pydantic_core==2.23.3
pydeck==0.9.1
pydub==0.25.1
Pygments==2.18.0
pyparsing==3.1.4
python-dateutil==2.9.0.post0
python-multipart==0.0.9
pytz==2024.1
PyYAML==6.0.2
qwen-vl-utils==0.0.4
referencing==0.35.1
regex==2024.7.24
requests==2.32.3
rich==13.8.1
rpds-py==0.20.0
ruff==0.6.4
safetensors==0.4.5
semantic-version==2.10.0
setuptools==74.1.2
shellingham==1.5.4
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
spaces==0.30.2
starlette==0.38.5
streamlit==1.38.0
sympy==1.13.2
tenacity==8.5.0
tokenizers==0.19.1
toml==0.10.2
tomlkit==0.12.0
torch==2.4.1
torchvision==0.19.1
tornado==6.4.1
tqdm==4.66.5
transformers @ git+https://github.com/huggingface/transformers.git@f38590dade57c1f8cf8a67e9409dae8935f8c478
triton==3.0.0
typer==0.12.5
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
uvicorn==0.30.6
watchdog==4.0.2
websockets==12.0
yarl==1.7.0
transformers~=4.45.0.dev0