paligemma_2

Running on Zero

App Files Files Community

merve HF staff committed 6 days ago

Commit

834334c

•

1 Parent(s): 9fb3cfe

Upload 7 files

Browse files

Files changed (8) hide show

.gitattributes +1 -0
app.py +114 -0
cats.png +0 -0
examples_bowie.jpg +0 -0
howto.jpg +0 -0
password.jpg +0 -0
requirements.txt +3 -0
transformers-4.47.0.dev0-py3-none-any.whl +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+transformers-4.47.0.dev0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+# Install the bundled transformers development wheel at startup; this has to run before `transformers` is imported below.
+os.system('pip install ./transformers-4.47.0.dev0-py3-none-any.whl')
+import gradio as gr
+import PIL.Image
+import transformers
+from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
+import torch
+import string
+import functools
+import re
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+import spaces
+# Hub repositories: model weights are loaded from adapter_id, the processor from model_id.
+adapter_id = "merve/paligemma2-3b-vqav2"
+model_id = "gv-hf/paligemma2-3b-pt-448"
+# Use the GPU when CUDA is available, otherwise fall back to CPU.
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+else:
+    device = torch.device("cpu")
+# Load the weights, then switch to eval mode and move the model to the chosen device.
+model = PaliGemmaForConditionalGeneration.from_pretrained(adapter_id)
+model = model.eval().to(device)
+processor = PaliGemmaProcessor.from_pretrained(model_id)
+###### Transformers Inference
+@spaces.GPU
+def infer(
+    text,
+    image: PIL.Image.Image,
+    max_new_tokens: int
+) -> str:
+    """Answer a question about an image with the loaded PaliGemma model.
+
+    Args:
+        text: The question; the "answer en " task prefix is prepended here.
+        image: The input image as a PIL image.
+        max_new_tokens: Upper bound on the number of generated tokens.
+
+    Returns:
+        The decoded answer only, without the prompt and without leading newlines.
+
+    Raises:
+        gr.Error: If no image was supplied.
+    """
+    if image is None:
+        # Submitting without an image would otherwise fail inside the
+        # processor; show a readable message in the UI instead.
+        raise gr.Error("Please provide an input image.")
+    text = "answer en " + text
+    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
+    prompt_len = inputs["input_ids"].shape[-1]
+    with torch.inference_mode():
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=False,
+        )
+    # generate() returns prompt + continuation. Drop the prompt by token count
+    # rather than by len(text) on the decoded string, which only works when
+    # decoding reproduces the prompt text character for character.
+    answer_ids = generated_ids[0][prompt_len:]
+    return processor.decode(answer_ids, skip_special_tokens=True).lstrip("\n")
+######## Demo
+# Markdown introduction for the demo page; passed to gr.Markdown inside the Blocks layout.
+INTRO_TEXT = """## PaliGemma 2 demo\n\n
+| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
+| [Blogpost](https://huggingface.co/blog/paligemma)
+| [Fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
+|\n\n
+PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
+built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
+vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
+model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
+answering, text reading, object detection and object segmentation.
+\n\n
+This space includes a model LoRA fine-tuned by the team at Hugging Face on VQAv2, inferred using transformers.
+See the [Blogpost](https://huggingface.co/blog/paligemma2), the project
+[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) and the
+[fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
+for detailed information about how to use and fine-tune PaliGemma and PaliGemma 2 models.
+\n\n
+**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
+"""
+# Build the demo page: intro text, input/output widgets, submit wiring and clickable examples.
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(INTRO_TEXT)
+    with gr.Column():
+        question = gr.Text(label="Question")
+        image = gr.Image(label="Input Image", type="pil", height=500)
+        caption_btn = gr.Button(value="Submit")
+        text_output = gr.Text(label="Text Output")
+        tokens = gr.Slider(
+            minimum=20,
+            maximum=160,
+            step=10,
+            value=80,
+            label="Max New Tokens",
+            info="Set to larger for longer generation.",
+        )
+    # Order matches the positional parameters of infer: text, image, max_new_tokens.
+    caption_inputs = [question, image, tokens]
+    caption_outputs = [text_output]
+    caption_btn.click(fn=infer, inputs=caption_inputs, outputs=caption_outputs)
+    # Each example row follows the caption_inputs order: question, image path, max new tokens.
+    examples = [
+        ["What is the graphic about?", "./howto.jpg", 60],
+        ["What is the password", "./password.jpg", 20],
+        ["Who is in this image?", "./examples_bowie.jpg", 80],
+    ]
+    gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
+    gr.Examples(examples=examples, inputs=caption_inputs)
+#########
+if __name__ == "__main__":
+    # Enable the request queue with a maximum size of 10, then start the app in debug mode.
+    demo.queue(max_size=10)
+    demo.launch(debug=True)

cats.png ADDED Viewed

examples_bowie.jpg ADDED Viewed

howto.jpg ADDED Viewed

password.jpg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+torch
+spaces
+peft

transformers-4.47.0.dev0-py3-none-any.whl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89dfe59f0ccb645734d6597cfb3acc61dc767e2e7fac0b4c7ab4044e583f78d4
+size 10035778