merve (HF staff) committed on
Commit 834334c
1 Parent(s): 9fb3cfe

Upload 7 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+transformers-4.47.0.dev0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,114 @@
+import os
+
+os.system('pip install ./transformers-4.47.0.dev0-py3-none-any.whl')
+
+import gradio as gr
+import PIL.Image
+import transformers
+from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
+import torch
+import string
+import functools
+import re
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+import spaces
+
+
+adapter_id = "merve/paligemma2-3b-vqav2"
+model_id = "gv-hf/paligemma2-3b-pt-448"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = PaliGemmaForConditionalGeneration.from_pretrained(adapter_id).eval().to(device)
+processor = PaliGemmaProcessor.from_pretrained(model_id)
+
+###### Transformers Inference
+@spaces.GPU
+def infer(
+    text,
+    image: PIL.Image.Image,
+    max_new_tokens: int
+) -> str:
+    text = "answer en " + text
+    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
+    with torch.inference_mode():
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=False
+        )
+    result = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    return result[0][len(text):].lstrip("\n")
+
+
+######## Demo
+
+INTRO_TEXT = """## PaliGemma 2 demo\n\n
+| [Github](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md)
+| [Blogpost](https://huggingface.co/blog/paligemma)
+| [Fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
+|\n\n
+PaliGemma 2 is an open vision-language model by Google, inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and
+built with open components such as the [SigLIP](https://arxiv.org/abs/2303.15343)
+vision model and the [Gemma 2](https://arxiv.org/abs/2408.00118) language model. PaliGemma 2 is designed as a versatile
+model for transfer to a wide range of vision-language tasks such as image and short video captioning, visual question
+answering, text reading, object detection and object segmentation.
+\n\n
+This space includes a LoRA fine-tuned on VQAv2 by the team at Hugging Face, with inference running in transformers.
+See the [Blogpost](https://huggingface.co/blog/paligemma2), the project
+[README](https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md) and the
+[fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
+for detailed information about how to use and fine-tune PaliGemma and PaliGemma 2 models.
+\n\n
+**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
+"""
+
+
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(INTRO_TEXT)
+    with gr.Column():
+        question = gr.Text(label="Question")
+        image = gr.Image(label="Input Image", type="pil", height=500)
+        caption_btn = gr.Button(value="Submit")
+        text_output = gr.Text(label="Text Output")
+
+        tokens = gr.Slider(
+            label="Max New Tokens",
+            info="Set to larger for longer generation.",
+            minimum=20,
+            maximum=160,
+            value=80,
+            step=10,
+        )
+
+        caption_inputs = [
+            question,
+            image,
+            tokens
+        ]
+        caption_outputs = [
+            text_output
+        ]
+        caption_btn.click(
+            fn=infer,
+            inputs=caption_inputs,
+            outputs=caption_outputs,
+        )
+
+
+    examples = [
+        ["What is the graphic about?", "./howto.jpg", 60],
+        ["What is the password", "./password.jpg", 20],
+        ["Who is in this image?", "./examples_bowie.jpg", 80],
+    ]
+    gr.Markdown("Example images are licensed CC0 by [akolesnikoff@](https://github.com/akolesnikoff), [mbosnjak@](https://github.com/mbosnjak), [maximneumann@](https://github.com/maximneumann) and [merve](https://huggingface.co/merve).")
+
+    gr.Examples(
+        examples=examples,
+        inputs=caption_inputs,
+    )
+#########
+
+if __name__ == "__main__":
+    demo.queue(max_size=10).launch(debug=True)
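As a minimal sketch, the same inference path can be run outside the Gradio UI, mirroring infer() above. This assumes the checkpoints referenced in app.py are accessible and that a transformers build with PaliGemma 2 support (such as the bundled wheel) is installed; cats.png is one of the files added in this commit.

import torch
import PIL.Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

adapter_id = "merve/paligemma2-3b-vqav2"   # LoRA fine-tune loaded in app.py
model_id = "gv-hf/paligemma2-3b-pt-448"    # base checkpoint used for the processor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PaliGemmaForConditionalGeneration.from_pretrained(adapter_id).eval().to(device)
processor = PaliGemmaProcessor.from_pretrained(model_id)

# Prefix the question exactly as infer() does.
prompt = "answer en What is in this image?"
image = PIL.Image.open("./cats.png")

inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
with torch.inference_mode():
    generated_ids = model.generate(**inputs, max_new_tokens=80, do_sample=False)

decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(decoded[len(prompt):].lstrip("\n"))  # strip the echoed prompt, as in infer()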
cats.png ADDED
examples_bowie.jpg ADDED
howto.jpg ADDED
password.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
+torch
+spaces
+peft
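requirements.txt lists peft because adapter_id in app.py points to a LoRA adapter repo; with peft installed, from_pretrained resolves the adapter on top of its base checkpoint automatically. As an alternative sketch (an illustration under the same assumptions about repo access, not what app.py does), the adapter can also be attached explicitly:

import torch
from peft import PeftModel
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor

base_id = "gv-hf/paligemma2-3b-pt-448"     # base checkpoint from app.py
adapter_id = "merve/paligemma2-3b-vqav2"   # VQAv2 LoRA adapter from app.py

# Load the base model, then wrap it with the LoRA adapter via peft.
base = PaliGemmaForConditionalGeneration.from_pretrained(base_id, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base, adapter_id).eval()
processor = PaliGemmaProcessor.from_pretrained(base_id)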
transformers-4.47.0.dev0-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89dfe59f0ccb645734d6597cfb3acc61dc767e2e7fac0b4c7ab4044e583f78d4
+size 10035778