Spaces:
Running
on
Zero
Running
on
Zero
Upload 7 files
Browse files- .gitattributes +1 -0
- app.py +114 -0
- cats.png +0 -0
- examples_bowie.jpg +0 -0
- howto.jpg +0 -0
- password.jpg +0 -0
- requirements.txt +3 -0
- transformers-4.47.0.dev0-py3-none-any.whl +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
transformers-4.47.0.dev0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import subprocess
import sys

# Install the bundled transformers wheel before `transformers` is imported below.
# Use the running interpreter's pip (not whatever `pip` is first on PATH) and an
# argument list instead of a shell string. The install stays best-effort, as
# before, but a failure is now reported instead of being silently ignored.
_wheel_install = subprocess.run(
    [sys.executable, "-m", "pip", "install", "./transformers-4.47.0.dev0-py3-none-any.whl"],
    check=False,
)
if _wheel_install.returncode != 0:
    print(
        f"WARNING: installing the bundled transformers wheel failed "
        f"(exit code {_wheel_install.returncode})",
        file=sys.stderr,
    )
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import PIL.Image
|
7 |
+
import transformers
|
8 |
+
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
|
9 |
+
import torch
|
10 |
+
import string
|
11 |
+
import functools
|
12 |
+
import re
|
13 |
+
import flax.linen as nn
|
14 |
+
import jax
|
15 |
+
import jax.numpy as jnp
|
16 |
+
import numpy as np
|
17 |
+
import spaces
|
18 |
+
|
19 |
+
|
20 |
+
# Hub repository id that the model weights are loaded from.
adapter_id = "merve/paligemma2-3b-vqav2"
# Hub repository id that the processor is loaded from.
model_id = "gv-hf/paligemma2-3b-pt-448"

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model, switch it to eval mode, then move it onto the chosen device.
model = PaliGemmaForConditionalGeneration.from_pretrained(adapter_id)
model = model.eval().to(device)

processor = PaliGemmaProcessor.from_pretrained(model_id)
|
25 |
+
|
26 |
+
###### Transformers Inference
|
27 |
+
@spaces.GPU
def infer(
    text,
    image: PIL.Image.Image,
    max_new_tokens: int
) -> str:
    """Answer a question about an image with the loaded PaliGemma model.

    Args:
        text: The user's question. It is prefixed with ``"answer en "`` to
            form the prompt; ``None`` is treated as an empty question.
        image: The input image as a PIL image.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer, without the prompt and without leading newlines.

    Raises:
        gr.Error: If no image was supplied, so the UI shows a readable
            message instead of a processor stack trace.
    """
    if image is None:
        raise gr.Error("Please provide an input image before submitting.")

    text = "answer en " + (text or "")
    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
    # Remember how many tokens the prompt occupies so it can be cut off by
    # token position. Cutting by character count breaks when decoding with
    # skip_special_tokens changes the length of the prompt text.
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding
        )

    # generate() returns prompt + continuation; keep only the continuation.
    answer_ids = generated_ids[:, prompt_len:]
    result = processor.batch_decode(answer_ids, skip_special_tokens=True)
    return result[0].lstrip("\n")
|
43 |
+
|
44 |
+
|
45 |
+
######## Demo
|
46 |
+
|
47 |
+
# Markdown source for the demo's introduction; it is passed to gr.Markdown when the UI is built.
INTRO_TEXT = """## PaliGemma 2 demo\n\n
|
48 |
+
| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
|
49 |
+
| [Blogpost](https://huggingface.co/blog/paligemma)
|
50 |
+
| [Fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
|
51 |
+
|\n\n
|
52 |
+
PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
|
53 |
+
built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
|
54 |
+
vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
|
55 |
+
model for transfer to a wide range of vision-language tasks such as image and short video caption, visual question
|
56 |
+
answering, text reading, object detection and object segmentation.
|
57 |
+
\n\n
|
58 |
+
This space includes a model LoRA fine-tuned by the team at Hugging Face on VQAv2, inferred using transformers.
|
59 |
+
See the [Blogpost](https://huggingface.co/blog/paligemma2), the project
|
60 |
+
[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) and the
|
61 |
+
[fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
|
62 |
+
for detailed information about how to use and fine-tune PaliGemma and PaliGemma 2 models.
|
63 |
+
\n\n
|
64 |
+
**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
|
65 |
+
"""
|
66 |
+
|
67 |
+
|
68 |
+
# Build the Gradio UI: intro markdown, question and image inputs, a submit button,
# an answer box, a token-budget slider and clickable examples.
# NOTE(review): "style.css" is not among the files listed as added in this commit — confirm it exists in the repo.
with gr.Blocks(css="style.css") as demo:
|
69 |
+
gr.Markdown(INTRO_TEXT)
|
70 |
+
with gr.Column():
|
71 |
+
question = gr.Text(label="Question")
|
72 |
+
image = gr.Image(label="Input Image", type="pil", height=500)
|
73 |
+
caption_btn = gr.Button(value="Submit")
|
74 |
+
text_output = gr.Text(label="Text Output")
|
75 |
+
|
76 |
+
# Slider value is forwarded to infer() as its third argument (max_new_tokens).
tokens = gr.Slider(
|
77 |
+
label="Max New Tokens",
|
78 |
+
info="Set to larger for longer generation.",
|
79 |
+
minimum=20,
|
80 |
+
maximum=160,
|
81 |
+
value=80,
|
82 |
+
step=10,
|
83 |
+
)
|
84 |
+
|
85 |
+
# Order matters: these components map positionally onto infer(text, image, max_new_tokens).
caption_inputs = [
|
86 |
+
question,
|
87 |
+
image,
|
88 |
+
tokens
|
89 |
+
]
|
90 |
+
caption_outputs = [
|
91 |
+
text_output
|
92 |
+
]
|
93 |
+
# Run infer() on the current inputs when the submit button is clicked.
caption_btn.click(
|
94 |
+
fn=infer,
|
95 |
+
inputs=caption_inputs,
|
96 |
+
outputs=caption_outputs,
|
97 |
+
)
|
98 |
+
|
99 |
+
|
100 |
+
# Each example row follows the caption_inputs order: question, image path, max new tokens.
examples = [
|
101 |
+
["What is the graphic about?", "./howto.jpg", 60],
|
102 |
+
["What is the password", "./password.jpg", 20],
|
103 |
+
["Who is in this image?", "./examples_bowie.jpg", 80],
|
104 |
+
]
|
105 |
+
gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
|
106 |
+
|
107 |
+
# Only `inputs` is given (no fn/outputs), so the examples are wired to the input components only.
gr.Examples(
|
108 |
+
examples=examples,
|
109 |
+
inputs=caption_inputs,
|
110 |
+
)
|
111 |
+
#########
|
112 |
+
|
113 |
+
if __name__ == "__main__":
    # Enable the request queue (capped at 10 entries), then launch in debug mode.
    queued_demo = demo.queue(max_size=10)
    queued_demo.launch(debug=True)
|
cats.png
ADDED
examples_bowie.jpg
ADDED
howto.jpg
ADDED
password.jpg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
spaces
|
3 |
+
peft
|
transformers-4.47.0.dev0-py3-none-any.whl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89dfe59f0ccb645734d6597cfb3acc61dc767e2e7fac0b4c7ab4044e583f78d4
|
3 |
+
size 10035778
|