Merve Noyan committed
Commit ad382c8
1 Parent(s): 5af142a
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import AutoProcessor, Idefics3ForConditionalGeneration
+from transformers import AutoProcessor, AutoModelForVision2Seq
 import re
 import time
 from PIL import Image
@@ -11,10 +11,10 @@ import subprocess
 
 processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
 
-model = Idefics3ForConditionalGeneration.from_pretrained("HuggingFaceTB/SmolVLM-Instruct",
+model = AutoModelForVision2Seq.from_pretrained("HuggingFaceTB/SmolVLM-Instruct",
         torch_dtype=torch.bfloat16,
         #_attn_implementation="flash_attention_2"
-        ).to("cuda")
+        ).to("cuda")
 
 @spaces.GPU
 def model_inference(
@@ -74,8 +74,8 @@ def model_inference(
     return generated_texts[0]
 
 
-with gr.Blocks(fill_height=True) as demo:
-    gr.Markdown("## SmolVLM")
+with gr.Blocks() as demo:
+    gr.Markdown("## SmolVLM: Small yet Mighty 💫")
     gr.Markdown("Play with [HuggingFaceTB/SmolVLM-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct) in this demo. To get started, upload an image and text or try one of the examples.")
     with gr.Column():
         image_input = gr.Image(label="Upload your Image", type="pil", scale=1)
@@ -85,88 +85,86 @@ with gr.Blocks(fill_height=True) as demo:
         submit_btn = gr.Button("Submit")
         output = gr.Textbox(label="Output")
 
-    with gr.Accordion(label="Example Inputs and Advanced Generation Parameters"):
     examples=[
-        ["example_images/mmmu_example.jpeg", "Chase wants to buy 4 kilograms of oval beads and 5 kilograms of star-shaped beads. How much will he spend?", "Let's think step by step.", "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/rococo_1.jpg", "What art era is this?", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/paper_with_text.png", "Read what's written on the paper", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/dragons_playing.png","What's unusual about this image?",None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/example_images_ai2d_example_2.jpeg", "What happens to fish if pelicans increase?", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/travel_tips.jpg", "I want to go somewhere similar to the one in the photo. Give me destinations and travel tips.", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/dummy_pdf.png", "How much percent is the order status?", None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/art_critic.png", "As an art critic AI assistant, could you describe this painting in details and make a thorough critic?.",None, "Greedy", 0.4, 512, 1.2, 0.8],
-        ["example_images/s2w_example.png", "What is this UI about?", None,"Greedy", 0.4, 512, 1.2, 0.8]]
-
-        # Hyper-parameters for generation
-        max_new_tokens = gr.Slider(
-            minimum=8,
-            maximum=1024,
-            value=512,
-            step=1,
-            interactive=True,
-            label="Maximum number of new tokens to generate",
-        )
-        repetition_penalty = gr.Slider(
-            minimum=0.01,
-            maximum=5.0,
-            value=1.2,
-            step=0.01,
-            interactive=True,
-            label="Repetition penalty",
-            info="1.0 is equivalent to no penalty",
-        )
-        temperature = gr.Slider(
-            minimum=0.0,
-            maximum=5.0,
-            value=0.4,
-            step=0.1,
-            interactive=True,
-            label="Sampling temperature",
-            info="Higher values will produce more diverse outputs.",
-        )
-        top_p = gr.Slider(
-            minimum=0.01,
-            maximum=0.99,
-            value=0.8,
-            step=0.01,
-            interactive=True,
-            label="Top P",
-            info="Higher values is equivalent to sampling more low-probability tokens.",
-        )
-        decoding_strategy = gr.Radio(
-            [
-                "Greedy",
-                "Top P Sampling",
-            ],
-            value="Greedy",
-            label="Decoding strategy",
-            interactive=True,
-            info="Higher values is equivalent to sampling more low-probability tokens.",
-        )
-        decoding_strategy.change(
-            fn=lambda selection: gr.Slider(
-                visible=(
-                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
-                )
-            ),
-            inputs=decoding_strategy,
-            outputs=temperature,
-        )
-
-        decoding_strategy.change(
-            fn=lambda selection: gr.Slider(
-                visible=(
-                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
-                )
-            ),
-            inputs=decoding_strategy,
-            outputs=repetition_penalty,
-        )
-        decoding_strategy.change(
-            fn=lambda selection: gr.Slider(visible=(selection in ["Top P Sampling"])),
-            inputs=decoding_strategy,
-            outputs=top_p,
-        )
+        ["example_images/rococo.jpg", "What art era is this?", None, "Greedy", 0.4, 512, 1.2, 0.8],
+        ["example_images/examples_wat_arun.jpg", "Give me travel tips for the area around this monument.", None, "Greedy", 0.4, 512, 1.2, 0.8],
+        ["example_images/examples_invoice.png", "What is the due date and the invoice date?", None, "Greedy", 0.4, 512, 1.2, 0.8],
+        ["example_images/s2w_example.png", "What is this UI about?", None, "Greedy", 0.4, 512, 1.2, 0.8],
+        ["example_images/examples_weather_events.png", "Where do the severe droughts happen according to this diagram?", None, "Greedy", 0.4, 512, 1.2, 0.8],
+    ]
+
+    with gr.Accordion(label="Advanced Generation Parameters", open=False):
+
+        # Hyper-parameters for generation
+        max_new_tokens = gr.Slider(
+            minimum=8,
+            maximum=1024,
+            value=512,
+            step=1,
+            interactive=True,
+            label="Maximum number of new tokens to generate",
+        )
+        repetition_penalty = gr.Slider(
+            minimum=0.01,
+            maximum=5.0,
+            value=1.2,
+            step=0.01,
+            interactive=True,
+            label="Repetition penalty",
+            info="1.0 is equivalent to no penalty",
+        )
+        temperature = gr.Slider(
+            minimum=0.0,
+            maximum=5.0,
+            value=0.4,
+            step=0.1,
+            interactive=True,
+            label="Sampling temperature",
+            info="Higher values will produce more diverse outputs.",
+        )
+        top_p = gr.Slider(
+            minimum=0.01,
+            maximum=0.99,
+            value=0.8,
+            step=0.01,
+            interactive=True,
+            label="Top P",
+            info="Higher values is equivalent to sampling more low-probability tokens.",
+        )
+        decoding_strategy = gr.Radio(
+            [
+                "Greedy",
+                "Top P Sampling",
+            ],
+            value="Greedy",
+            label="Decoding strategy",
+            interactive=True,
+            info="Higher values is equivalent to sampling more low-probability tokens.",
+        )
+        decoding_strategy.change(
+            fn=lambda selection: gr.Slider(
+                visible=(
+                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
+                )
+            ),
+            inputs=decoding_strategy,
+            outputs=temperature,
+        )
+
+        decoding_strategy.change(
+            fn=lambda selection: gr.Slider(
+                visible=(
+                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
+                )
+            ),
+            inputs=decoding_strategy,
+            outputs=repetition_penalty,
+        )
+        decoding_strategy.change(
+            fn=lambda selection: gr.Slider(visible=(selection in ["Top P Sampling"])),
+            inputs=decoding_strategy,
+            outputs=top_p,
+        )
     gr.Examples(
         examples = examples,
        inputs=[image_input, query_input, assistant_prefix, decoding_strategy, temperature,
@@ -174,6 +172,7 @@ with gr.Blocks(fill_height=True) as demo:
         outputs=output,
         fn=model_inference
     )
+
 
     submit_btn.click(model_inference, inputs = [image_input, query_input, assistant_prefix, decoding_strategy, temperature,
                      max_new_tokens, repetition_penalty, top_p], outputs=output)
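
For reference, a minimal standalone sketch of the loading path this commit switches to. It reuses the checkpoint ID, dtype, and device from the diff and borrows the invoice image and question from the new examples list; the chat-template call and the greedy/top-p mapping in the comments are assumptions about what the hidden model_inference body (app.py lines 20-74, not shown in these hunks) roughly does, not a copy of it.

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

# Same checkpoint, dtype, and device as in the diff above.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Image and question taken from the Space's new examples list; any PIL image works.
image = Image.open("example_images/examples_invoice.png")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is the due date and the invoice date?"},
        ],
    }
]

# Assumed prompt construction via the processor's chat template; the Space's
# model_inference may differ in detail.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt").to("cuda")

# "Greedy" default from the demo: no sampling; max_new_tokens mirrors the slider default.
generated_ids = model.generate(**inputs, max_new_tokens=512)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])

# For the "Top P Sampling" option, the demo's sliders would presumably map to something like:
# model.generate(**inputs, do_sample=True, temperature=0.4, top_p=0.8,
#                repetition_penalty=1.2, max_new_tokens=512)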
example_images/art_critic.png DELETED (binary file, 87.1 kB)
example_images/chicken_on_money.png DELETED (binary file, 420 kB)
example_images/dragons_playing.png DELETED (binary file, 626 kB)
example_images/dummy_pdf.png DELETED (binary file, 76.9 kB)
example_images/example_images_ai2d_example_2.jpeg DELETED (binary file, 89.4 kB)
example_images/example_images_meme_french.jpg DELETED (binary file, 70.7 kB)
example_images/example_images_surfing_dog.jpg DELETED (binary file, 283 kB)
example_images/example_images_tree_fortress.jpg DELETED (binary file, 154 kB)
example_images/examples_invoice.png ADDED
example_images/examples_wat_arun.jpg ADDED
example_images/examples_weather_events.png ADDED
example_images/gaulois.png DELETED (Git LFS file, 1.13 MB, SHA256: 83dd9cd4a9fdb43350e9b87503620db33b1e5d8aeefb4b77a32b7a0293a627be)
example_images/mmmu_example.jpeg DELETED (binary file, 17.4 kB)
example_images/mmmu_example_2.png DELETED (binary file, 54.8 kB)
example_images/paper_with_text.png DELETED (binary file, 975 kB)
example_images/polar_bear_coke.png DELETED (binary file, 440 kB)
example_images/rococo_1.jpg DELETED (binary file, 849 kB)
example_images/travel_tips.jpg DELETED (binary file, 209 kB)