Merve Noyan committed
Commit ad382c8
1 Parent(s): 5af142a
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import AutoProcessor, Idefics3ForConditionalGeneration
+from transformers import AutoProcessor, AutoModelForVision2Seq
 import re
 import time
 from PIL import Image
@@ -11,10 +11,10 @@ import subprocess
 
 processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
 
-model = Idefics3ForConditionalGeneration.from_pretrained("HuggingFaceTB/SmolVLM-Instruct",
+model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct",
         torch_dtype=torch.bfloat16,
         #_attn_implementation="flash_attention_2"
-        ).to("cuda")
+        ).to("cuda")
 
 @spaces.GPU
 def model_inference(
@@ -74,8 +74,8 @@ def model_inference(
     return generated_texts[0]
 
 
-with gr.Blocks(fill_height=True) as demo:
-    gr.Markdown("## SmolVLM")
+with gr.Blocks() as demo:
+    gr.Markdown("## SmolVLM: Small yet Mighty 💫")
     gr.Markdown("Play with [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) in this demo. To get started, upload an image and text or try one of the examples.")
     with gr.Column():
         image_input = gr.Image(label="Upload your Image", type="pil", scale=1)
@@ -85,88 +85,86 @@ with gr.Blocks(fill_height=True) as demo:
         submit_btn = gr.Button("Submit")
         output = gr.Textbox(label="Output")
 
-    with gr.Accordion(label="Example Inputs and Advanced Generation Parameters"):
     examples=[
-        ["example_images/mmmu_example.jpeg", "Chase wants to buy 4 kilograms of oval beads and 5 kilograms of star-shaped beads. How much will he spend?", "Let's think step by step.", "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/rococo_1.jpg", "What art era is this?", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/paper_with_text.png", "Read what's written on the paper", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/dragons_playing.png","What's unusual about this image?",None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/example_images_ai2d_example_2.jpeg", "What happens to fish if pelicans increase?", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/travel_tips.jpg", "I want to go somewhere similar to the one in the photo. Give me destinations and travel tips.", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/dummy_pdf.png", "How much percent is the order status?", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/art_critic.png", "As an art critic AI assistant, could you describe this painting in details and make a thorough critic?.",None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/s2w_example.png", "What is this UI about?", None,"Greedy", 0.4, 512, 1.2, 0.8]]
-
-        # Hyper-parameters for generation
-        max_new_tokens = gr.Slider(
-            minimum=8,
-            maximum=1024,
-            value=512,
-            step=1,
-            interactive=True,
-            label="Maximum number of new tokens to generate",
-        )
-        repetition_penalty = gr.Slider(
-            minimum=0.01,
-            maximum=5.0,
-            value=1.2,
-            step=0.01,
-            interactive=True,
-            label="Repetition penalty",
-            info="1.0 is equivalent to no penalty",
-        )
-        temperature = gr.Slider(
-            minimum=0.0,
-            maximum=5.0,
-            value=0.4,
-            step=0.1,
-            interactive=True,
-            label="Sampling temperature",
-            info="Higher values will produce more diverse outputs.",
-        )
-        top_p = gr.Slider(
-            minimum=0.01,
-            maximum=0.99,
-            value=0.8,
-            step=0.01,
-            interactive=True,
-            label="Top P",
-            info="Higher values is equivalent to sampling more low-probability tokens.",
-        )
-        decoding_strategy = gr.Radio(
-            [
-                "Greedy",
-                "Top P Sampling",
-            ],
-            value="Greedy",
-            label="Decoding strategy",
-            interactive=True,
-            info="Higher values is equivalent to sampling more low-probability tokens.",
-        )
-        decoding_strategy.change(
-            fn=lambda selection: gr.Slider(
-                visible=(
-                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
-                )
-            ),
-            inputs=decoding_strategy,
-            outputs=temperature,
-        )
-
-        decoding_strategy.change(
-            fn=lambda selection: gr.Slider(
-                visible=(
-                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
-                )
-            ),
-            inputs=decoding_strategy,
-            outputs=repetition_penalty,
-        )
-        decoding_strategy.change(
-            fn=lambda selection: gr.Slider(visible=(selection in ["Top P Sampling"])),
-            inputs=decoding_strategy,
-            outputs=top_p,
-        )
+        ["example_images/rococo.jpg", "What art era is this?", None, "Greedy", 0.4, 512, 1.2, 0.8],
+        ["example_images/examples_wat_arun.jpg", "Give me travel tips for the area around this monument.", None, "Greedy", 0.4, 512, 1.2, 0.8],
+        ["example_images/examples_invoice.png", "What is the due date and the invoice date?", None, "Greedy", 0.4, 512, 1.2, 0.8],
+        ["example_images/s2w_example.png", "What is this UI about?", None, "Greedy", 0.4, 512, 1.2, 0.8],
+        ["example_images/examples_weather_events.png", "Where do the severe droughts happen according to this diagram?", None, "Greedy", 0.4, 512, 1.2, 0.8],
+    ]
+
+    with gr.Accordion(label="Advanced Generation Parameters", open=False):
+
+        # Hyper-parameters for generation
+        max_new_tokens = gr.Slider(
+            minimum=8,
+            maximum=1024,
+            value=512,
+            step=1,
+            interactive=True,
+            label="Maximum number of new tokens to generate",
+        )
+        repetition_penalty = gr.Slider(
+            minimum=0.01,
+            maximum=5.0,
+            value=1.2,
+            step=0.01,
+            interactive=True,
+            label="Repetition penalty",
+            info="1.0 is equivalent to no penalty",
+        )
+        temperature = gr.Slider(
+            minimum=0.0,
+            maximum=5.0,
+            value=0.4,
+            step=0.1,
+            interactive=True,
+            label="Sampling temperature",
+            info="Higher values will produce more diverse outputs.",
+        )
+        top_p = gr.Slider(
+            minimum=0.01,
+            maximum=0.99,
+            value=0.8,
+            step=0.01,
+            interactive=True,
+            label="Top P",
+            info="Higher values is equivalent to sampling more low-probability tokens.",
+        )
+        decoding_strategy = gr.Radio(
+            [
+                "Greedy",
+                "Top P Sampling",
+            ],
+            value="Greedy",
+            label="Decoding strategy",
+            interactive=True,
+            info="Higher values is equivalent to sampling more low-probability tokens.",
+        )
+        decoding_strategy.change(
+            fn=lambda selection: gr.Slider(
+                visible=(
+                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
+                )
+            ),
+            inputs=decoding_strategy,
+            outputs=temperature,
+        )
+
+        decoding_strategy.change(
+            fn=lambda selection: gr.Slider(
+                visible=(
+                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
+                )
+            ),
+            inputs=decoding_strategy,
+            outputs=repetition_penalty,
+        )
+        decoding_strategy.change(
+            fn=lambda selection: gr.Slider(visible=(selection in ["Top P Sampling"])),
+            inputs=decoding_strategy,
+            outputs=top_p,
+        )
     gr.Examples(
         examples = examples,
        inputs=[image_input, query_input, assistant_prefix, decoding_strategy, temperature,
@@ -174,6 +172,7 @@ with gr.Blocks(fill_height=True) as demo:
         outputs=output,
         fn=model_inference
     )
+
 
     submit_btn.click(model_inference, inputs = [image_input, query_input, assistant_prefix, decoding_strategy, temperature,
                      max_new_tokens, repetition_penalty, top_p], outputs=output)
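
For reference, a minimal standalone sketch of the loading path this commit switches to. It reuses the checkpoint ID, dtype, and device from the diff and borrows the invoice image and question from the new examples list; the chat-template call and the greedy/top-p mapping in the comments are assumptions about what the hidden model_inference body (app.py lines 20-74, not shown in these hunks) roughly does, not a copy of it.

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

# Same checkpoint, dtype, and device as in the diff above.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Image and question taken from the Space's new examples list; any PIL image works.
image = Image.open("example_images/examples_invoice.png")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is the due date and the invoice date?"},
        ],
    }
]

# Assumed prompt construction via the processor's chat template; the Space's
# model_inference may differ in detail.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt").to("cuda")

# "Greedy" default from the demo: no sampling; max_new_tokens mirrors the slider default.
generated_ids = model.generate(**inputs, max_new_tokens=512)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])

# For the "Top P Sampling" option, the demo's sliders would presumably map to something like:
# model.generate(**inputs, do_sample=True, temperature=0.4, top_p=0.8,
#                repetition_penalty=1.2, max_new_tokens=512)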
example_images/art_critic.png DELETED (binary file, 87.1 kB)
example_images/chicken_on_money.png DELETED (binary file, 420 kB)
example_images/dragons_playing.png DELETED (binary file, 626 kB)
example_images/dummy_pdf.png DELETED (binary file, 76.9 kB)
example_images/example_images_ai2d_example_2.jpeg DELETED (binary file, 89.4 kB)
example_images/example_images_meme_french.jpg DELETED (binary file, 70.7 kB)
example_images/example_images_surfing_dog.jpg DELETED (binary file, 283 kB)
example_images/example_images_tree_fortress.jpg DELETED (binary file, 154 kB)
example_images/examples_invoice.png ADDED
example_images/examples_wat_arun.jpg ADDED
example_images/examples_weather_events.png ADDED
example_images/gaulois.png DELETED (Git LFS file, 1.13 MB, SHA256: 83dd9cd4a9fdb43350e9b87503620db33b1e5d8aeefb4b77a32b7a0293a627be)
example_images/mmmu_example.jpeg DELETED (binary file, 17.4 kB)
example_images/mmmu_example_2.png DELETED (binary file, 54.8 kB)
example_images/paper_with_text.png DELETED (binary file, 975 kB)
example_images/polar_bear_coke.png DELETED (binary file, 440 kB)
example_images/rococo_1.jpg DELETED (binary file, 849 kB)
example_images/travel_tips.jpg DELETED (binary file, 209 kB)