CallmeKaito committed on
Commit 10d1571
1 Parent(s): 27c4528

Delete LLaVa.py

Files changed (1)
  1. LLaVa.py +0 -140
LLaVa.py DELETED
@@ -1,140 +0,0 @@
- #!/usr/bin/env python
- # coding: utf-8
-
- # # Set-up environment
-
- # In[2]:
-
-
- get_ipython().system('pip install --upgrade -q accelerate bitsandbytes')
-
-
- # In[ ]:
-
-
- get_ipython().system('rm -r transformers')
- get_ipython().system('git clone -b llava_improvements https://github.com/NielsRogge/transformers.git')
- get_ipython().system('cd transformers')
- get_ipython().system('pip install -q ./transformers')
-
-
- # In[ ]:
-
-
- get_ipython().system('pip install git+https://github.com/huggingface/transformers.git')
-
-
- # ## Load model and processor
-
- # In[ ]:
-
-
- from transformers import AutoProcessor, LlavaForConditionalGeneration
- from transformers import BitsAndBytesConfig
- import torch
-
- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.float16
- )
-
-
- model_id = "llava-hf/llava-1.5-7b-hf"
-
- processor = AutoProcessor.from_pretrained(model_id)
- model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, device_map="auto")
-
-
- # ## Prepare image and text for the model
-
- # In[ ]:
-
-
- import requests
- from PIL import Image
-
- image1 = Image.open('data/clock.jpeg')
- display(image1)
-
-
- # In the prompt, you can refer to images using the special `<image>` token. To indicate which text comes from a human vs. the model, one uses USER and ASSISTANT respectively. The format looks as follows:
- #
- # ```bash
- # USER: <image>\n<prompt>\nASSISTANT:
- # ```
-
- # In other words, you always need to end your prompt with `ASSISTANT:`. Here we will perform batched generation (i.e. generating on several prompts).
-
- # In[ ]:
-
-
- caption = 'an old fashioned clock sitting on top of a table'
-
- user_input = "This is an intricately crafted old-fashioned clock created by a skilled Moroccan artisan back in 1988 from Chefchaouen... it reminds me of my mother."
-
- prompts = [
-     f"USER: <image>\nBased on the caption '{caption}' and the following user input: '{user_input}', generate a detailed product name and description for this Moroccan artisanal item; the description should be minimal yet it gives the essence of the product and convinces people to buy or express their interest in it.\nASSISTANT:"
-     # f"""
-     # USER: <image>\nBased on the image caption '{caption}' and the following background information: '{user_input}', generate an attention-grabbing yet concise product name and description for this authentic Moroccan artisanal item. The description should:
-     # Highlight the key features and unique selling points that make this product exceptional and desirable.
-     # Convey the cultural significance, craftsmanship, and rich heritage behind the item's creation.
-     # Use evocative language that resonates with potential buyers and piques their interest in owning this one-of-a-kind piece.
-     # Be concise, direct, and persuasive, leaving the reader eager to learn more or acquire the product.
-
-     # Your response should follow this format:
-     # Product Name: [Compelling and relevant product name]
-     # Product Description: [Concise yet captivating description addressing the points above]
-     # ASSISTANT:"""
-
- ]
-
- inputs = processor(prompts, images=[image1], padding=True, return_tensors="pt").to("cuda")
- for k, v in inputs.items():
-     print(k, v.shape)
-
-
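Note that the comment above mentions batched generation, but the deleted cell only builds a single prompt. As a minimal sketch (not part of the original file; the second image path is hypothetical), a true multi-prompt batch could be prepared like this, with one `<image>` token per prompt and the `images` list in matching order:

```python
# Sketch only: batching several prompts. 'data/rug.jpeg' is a hypothetical
# second image; processor, image1, and Image are defined in the cells above.
image2 = Image.open('data/rug.jpeg')

batched_prompts = [
    "USER: <image>\nDescribe this item in one sentence.\nASSISTANT:",
    "USER: <image>\nSuggest a product name for this item.\nASSISTANT:",
]

# padding=True is required so prompts of different lengths can share a batch.
batched_inputs = processor(
    batched_prompts, images=[image1, image2], padding=True, return_tensors="pt"
).to("cuda")
```

The resulting `batched_inputs` could then be passed to `model.generate` exactly as in the next section.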
- # ## Autoregressively generate completion
- #
- # Finally, we simply let the model predict the next tokens given the images + prompt. Of course one can adjust all the [generation parameters](https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/text_generation#transformers.GenerationMixin.generate). By default, greedy decoding is used.
-
- # In[ ]:
-
-
- output = model.generate(**inputs, max_new_tokens=200)
- generated_text = processor.batch_decode(output, skip_special_tokens=True)
- for text in generated_text:
-     print(text.split("ASSISTANT:")[-1])
-
-
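As the comment above notes, greedy decoding is the default. A small illustrative sketch (the parameter values are arbitrary examples, not taken from the original file) of switching to sampling via standard `generate` keyword arguments:

```python
# Sketch: sampling instead of greedy decoding; values are arbitrary examples.
output = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,   # enable sampling
    temperature=0.7,  # soften the token distribution
    top_p=0.9,        # nucleus sampling
)
generated_text = processor.batch_decode(output, skip_special_tokens=True)
for text in generated_text:
    print(text.split("ASSISTANT:")[-1])
```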
- # ## Pipeline API
- #
- # Alternatively, you can leverage the [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines) API, which abstracts all of the logic above away for the user. We also provide the quantization config to make sure we leverage 4-bit inference.
-
- # In[ ]:
-
-
- from transformers import pipeline
-
- pipe = pipeline("image-to-text", model=model_id, model_kwargs={"quantization_config": quantization_config})
-
-
- # In[ ]:
-
-
- max_new_tokens = 200
- prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"
-
- outputs = pipe(image1, prompt=prompt, generate_kwargs={"max_new_tokens": max_new_tokens})
-
-
- # In[ ]:
-
-
- print(outputs[0]["generated_text"])
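As a usage note (a sketch, not part of the original file): the image-to-text pipeline also accepts a local path or URL in place of a PIL image, so the same file loaded earlier could be passed directly:

```python
# Sketch: passing a local path instead of a PIL image to the pipeline.
outputs = pipe(
    "data/clock.jpeg",
    prompt="USER: <image>\nDescribe this clock in one sentence.\nASSISTANT:",
    generate_kwargs={"max_new_tokens": 100},
)
print(outputs[0]["generated_text"])
```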
-
-
- # In[ ]:
-
-
-
-