SupermanxKiaski committed on
Commit
3b40f46
1 Parent(s): 8366707

Upload 351 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. .gitignore +3 -0
  3. CLIP/CLIP.png +0 -0
  4. CLIP/LICENSE +22 -0
  5. CLIP/MANIFEST.in +1 -0
  6. CLIP/README.md +193 -0
  7. CLIP/__init__.py +0 -0
  8. CLIP/clip/__init__.py +1 -0
  9. CLIP/clip/bpe_simple_vocab_16e6.txt.gz +3 -0
  10. CLIP/clip/clip.py +231 -0
  11. CLIP/clip/model.py +484 -0
  12. CLIP/clip/simple_tokenizer.py +132 -0
  13. CLIP/clip_explainability/__init__.py +1 -0
  14. CLIP/clip_explainability/auxilary.py +422 -0
  15. CLIP/clip_explainability/bpe_simple_vocab_16e6.txt.gz +3 -0
  16. CLIP/clip_explainability/clip.py +196 -0
  17. CLIP/clip_explainability/model.py +442 -0
  18. CLIP/clip_explainability/simple_tokenizer.py +132 -0
  19. CLIP/data/country211.md +12 -0
  20. CLIP/data/prompts.md +3401 -0
  21. CLIP/data/rendered-sst2.md +11 -0
  22. CLIP/data/yfcc100m.md +14 -0
  23. CLIP/model-card.md +120 -0
  24. CLIP/requirements.txt +5 -0
  25. CLIP/setup.py +21 -0
  26. CLIP/tests/test_consistency.py +25 -0
  27. LICENSE +21 -0
  28. README.md +86 -13
  29. Text2LIVE-main/CLIP/__pycache__/__init__.cpython-37.pyc +0 -0
  30. Text2LIVE-main/CLIP/clip/__pycache__/__init__.cpython-37.pyc +0 -0
  31. Text2LIVE-main/CLIP/clip/__pycache__/clip.cpython-37.pyc +0 -0
  32. Text2LIVE-main/CLIP/clip/__pycache__/model.cpython-37.pyc +0 -0
  33. Text2LIVE-main/CLIP/clip/__pycache__/simple_tokenizer.cpython-37.pyc +0 -0
  34. Text2LIVE-main/CLIP/clip_explainability/__pycache__/__init__.cpython-37.pyc +0 -0
  35. Text2LIVE-main/CLIP/clip_explainability/__pycache__/auxilary.cpython-37.pyc +0 -0
  36. Text2LIVE-main/CLIP/clip_explainability/__pycache__/clip.cpython-37.pyc +0 -0
  37. Text2LIVE-main/CLIP/clip_explainability/__pycache__/model.cpython-37.pyc +0 -0
  38. Text2LIVE-main/CLIP/clip_explainability/__pycache__/simple_tokenizer.cpython-37.pyc +0 -0
  39. Text2LIVE-main/README.md +5 -7
  40. Text2LIVE-main/data/data/images/Thumbs.db +0 -0
  41. Text2LIVE-main/data/data/images/cake.jpeg +0 -0
  42. Text2LIVE-main/data/data/images/horse.jpg +0 -0
  43. Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint +3 -0
  44. Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint +3 -0
  45. Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint +3 -0
  46. Text2LIVE-main/data/data/videos/blackswan/00000.jpg +0 -0
  47. Text2LIVE-main/data/data/videos/blackswan/00001.jpg +0 -0
  48. Text2LIVE-main/data/data/videos/blackswan/00002.jpg +0 -0
  49. Text2LIVE-main/data/data/videos/blackswan/00003.jpg +0 -0
  50. Text2LIVE-main/data/data/videos/blackswan/00004.jpg +0 -0
.gitattributes CHANGED
@@ -35,3 +35,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  Text2LIVE-main/Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint filter=lfs diff=lfs merge=lfs -text
36
  Text2LIVE-main/Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint filter=lfs diff=lfs merge=lfs -text
37
  Text2LIVE-main/Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint filter=lfs diff=lfs merge=lfs -text
 
 
 
 
35
  Text2LIVE-main/Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint filter=lfs diff=lfs merge=lfs -text
36
  Text2LIVE-main/Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint filter=lfs diff=lfs merge=lfs -text
37
  Text2LIVE-main/Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint filter=lfs diff=lfs merge=lfs -text
38
+ Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint filter=lfs diff=lfs merge=lfs -text
39
+ Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint filter=lfs diff=lfs merge=lfs -text
40
+ Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ /wandb/
2
+ __pycache__/
3
+ /idea
CLIP/CLIP.png ADDED
CLIP/LICENSE ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2021 OpenAI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
CLIP/MANIFEST.in ADDED
@@ -0,0 +1 @@
 
 
1
+ include clip/bpe_simple_vocab_16e6.txt.gz
CLIP/README.md ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CLIP
2
+
3
+ [[Blog]](https://openai.com/blog/clip/) [[Paper]](https://arxiv.org/abs/2103.00020) [[Model Card]](model-card.md) [[Colab]](https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb)
4
+
5
+ CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. It can be instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing for the task, similarly to the zero-shot capabilities of GPT-2 and 3. We found CLIP matches the performance of the original ResNet50 on ImageNet “zero-shot” without using any of the original 1.28M labeled examples, overcoming several major challenges in computer vision.
6
+
7
+
8
+
9
+ ## Approach
10
+
11
+ ![CLIP](CLIP.png)
12
+
13
+
14
+
15
+ ## Usage
16
+
17
+ First, [install PyTorch 1.7.1](https://pytorch.org/get-started/locally/) and torchvision, as well as small additional dependencies, and then install this repo as a Python package. On a CUDA GPU machine, the following will do the trick:
18
+
19
+ ```bash
20
+ $ conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
21
+ $ pip install ftfy regex tqdm
22
+ $ pip install git+https://github.com/openai/CLIP.git
23
+ ```
24
+
25
+ Replace `cudatoolkit=11.0` above with the appropriate CUDA version on your machine or `cpuonly` when installing on a machine without a GPU.
26
+
27
+ ```python
28
+ import torch
29
+ import clip
30
+ from PIL import Image
31
+
32
+ device = "cuda" if torch.cuda.is_available() else "cpu"
33
+ model, preprocess = clip.load("ViT-B/32", device=device)
34
+
35
+ image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
36
+ text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)
37
+
38
+ with torch.no_grad():
39
+ image_features = model.encode_image(image)
40
+ text_features = model.encode_text(text)
41
+
42
+ logits_per_image, logits_per_text = model(image, text)
43
+ probs = logits_per_image.softmax(dim=-1).cpu().numpy()
44
+
45
+ print("Label probs:", probs) # prints: [[0.9927937 0.00421068 0.00299572]]
46
+ ```
47
+
48
+
49
+ ## API
50
+
51
+ The CLIP module `clip` provides the following methods:
52
+
53
+ #### `clip.available_models()`
54
+
55
+ Returns the names of the available CLIP models.
56
+
57
+ #### `clip.load(name, device=..., jit=False)`
58
+
59
+ Returns the model and the TorchVision transform needed by the model, specified by the model name returned by `clip.available_models()`. It will download the model as necessary. The `name` argument can also be a path to a local checkpoint.
60
+
61
+ The device to run the model can be optionally specified, and the default is to use the first CUDA device if there is any, otherwise the CPU. When `jit` is `False`, a non-JIT version of the model will be loaded.
62
+
63
+ #### `clip.tokenize(text: Union[str, List[str]], context_length=77)`
64
+
65
+ Returns a LongTensor containing tokenized sequences of given text input(s). This can be used as the input to the model.
66
+
67
+ ---
68
+
69
+ The model returned by `clip.load()` supports the following methods:
70
+
71
+ #### `model.encode_image(image: Tensor)`
72
+
73
+ Given a batch of images, returns the image features encoded by the vision portion of the CLIP model.
74
+
75
+ #### `model.encode_text(text: Tensor)`
76
+
77
+ Given a batch of text tokens, returns the text features encoded by the language portion of the CLIP model.
78
+
79
+ #### `model(image: Tensor, text: Tensor)`
80
+
81
+ Given a batch of images and a batch of text tokens, returns two Tensors, containing the logit scores corresponding to each image and text input. The values are cosine similarities between the corresponding image and text features, times 100.
82
+
83
+
84
+
85
+ ## More Examples
86
+
87
+ ### Zero-Shot Prediction
88
+
89
+ The code below performs zero-shot prediction using CLIP, as shown in Appendix B in the paper. This example takes an image from the [CIFAR-100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html), and predicts the most likely labels among the 100 textual labels from the dataset.
90
+
91
+ ```python
92
+ import os
93
+ import clip
94
+ import torch
95
+ from torchvision.datasets import CIFAR100
96
+
97
+ # Load the model
98
+ device = "cuda" if torch.cuda.is_available() else "cpu"
99
+ model, preprocess = clip.load('ViT-B/32', device)
100
+
101
+ # Download the dataset
102
+ cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)
103
+
104
+ # Prepare the inputs
105
+ image, class_id = cifar100[3637]
106
+ image_input = preprocess(image).unsqueeze(0).to(device)
107
+ text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device)
108
+
109
+ # Calculate features
110
+ with torch.no_grad():
111
+ image_features = model.encode_image(image_input)
112
+ text_features = model.encode_text(text_inputs)
113
+
114
+ # Pick the top 5 most similar labels for the image
115
+ image_features /= image_features.norm(dim=-1, keepdim=True)
116
+ text_features /= text_features.norm(dim=-1, keepdim=True)
117
+ similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
118
+ values, indices = similarity[0].topk(5)
119
+
120
+ # Print the result
121
+ print("\nTop predictions:\n")
122
+ for value, index in zip(values, indices):
123
+ print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%")
124
+ ```
125
+
126
+ The output will look like the following (the exact numbers may be slightly different depending on the compute device):
127
+
128
+ ```
129
+ Top predictions:
130
+
131
+ snake: 65.31%
132
+ turtle: 12.29%
133
+ sweet_pepper: 3.83%
134
+ lizard: 1.88%
135
+ crocodile: 1.75%
136
+ ```
137
+
138
+ Note that this example uses the `encode_image()` and `encode_text()` methods that return the encoded features of given inputs.
139
+
140
+
141
+ ### Linear-probe evaluation
142
+
143
+ The example below uses [scikit-learn](https://scikit-learn.org/) to perform logistic regression on image features.
144
+
145
+ ```python
146
import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Load the dataset
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)


def get_features(dataset):
    """Encode every image of `dataset` with CLIP and return (features, labels) as NumPy arrays."""
    all_features = []
    all_labels = []

    # Inference only: no gradients are needed for feature extraction
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device))

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
# `np.float` was removed in NumPy 1.24; the builtin `float` is the documented replacement
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")
191
+ ```
192
+
193
+ Note that the `C` value should be determined via a hyperparameter sweep using a validation split.
CLIP/__init__.py ADDED
File without changes
CLIP/clip/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .clip import *
CLIP/clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
CLIP/clip/clip.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import os
3
+ import urllib
4
+ import warnings
5
+ from typing import Any, Union, List
6
+ from pkg_resources import packaging
7
+
8
+ import torch
9
+ from PIL import Image
10
+ from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
11
+ from tqdm import tqdm
12
+
13
+ from .model import build_model
14
+ from .simple_tokenizer import SimpleTokenizer as _Tokenizer
15
+
16
+ try:
17
+ from torchvision.transforms import InterpolationMode
18
+ BICUBIC = InterpolationMode.BICUBIC
19
+ except ImportError:
20
+ BICUBIC = Image.BICUBIC
21
+
22
+
23
+ if packaging.version.parse(torch.__version__) < packaging.version.parse("1.7.1"):
24
+ warnings.warn("PyTorch version 1.7.1 or higher is recommended")
25
+
26
+
27
+ __all__ = ["available_models", "load", "tokenize"]
28
+ _tokenizer = _Tokenizer()
29
+
30
+ _MODELS = {
31
+ "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
32
+ "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
33
+ "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
34
+ "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
35
+ "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
36
+ "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
37
+ "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
38
+ "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
39
+ }
40
+
41
+
42
+ def _download(url: str, root: str):
43
+ os.makedirs(root, exist_ok=True)
44
+ filename = os.path.basename(url)
45
+
46
+ expected_sha256 = url.split("/")[-2]
47
+ download_target = os.path.join(root, filename)
48
+
49
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
50
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
51
+
52
+ if os.path.isfile(download_target):
53
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
54
+ return download_target
55
+ else:
56
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
57
+
58
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
59
+ with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
60
+ while True:
61
+ buffer = source.read(8192)
62
+ if not buffer:
63
+ break
64
+
65
+ output.write(buffer)
66
+ loop.update(len(buffer))
67
+
68
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
69
+ raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match")
70
+
71
+ return download_target
72
+
73
+
74
+ def _convert_image_to_rgb(image):
75
+ return image.convert("RGB")
76
+
77
+
78
def _transform(n_px):
    """Build the preprocessing pipeline for a model whose input side length is ``n_px``.

    The steps, in order: bicubic resize to ``n_px``, center crop to ``n_px``,
    conversion to RGB, conversion to a tensor, and per-channel normalization.
    """
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)

    steps = [
        Resize(n_px, interpolation=BICUBIC),
        CenterCrop(n_px),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize(channel_mean, channel_std),
    ]
    return Compose(steps)
86
+
87
+
88
def available_models() -> List[str]:
    """Returns the names of available CLIP models"""
    # Iterating the registry yields its keys in insertion order
    return [model_name for model_name in _MODELS]
91
+
92
+
93
def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit: bool = False, download_root: str = None):
    """Load a CLIP model

    Parameters
    ----------
    name : str
        A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict

    device : Union[str, torch.device]
        The device to put the loaded model

    jit : bool
        Whether to load the optimized JIT model or more hackable non-JIT model (default).

    download_root: str
        path to download the model files; by default, it uses "~/.cache/clip"

    Returns
    -------
    model : torch.nn.Module
        The CLIP model

    preprocess : Callable[[PIL.Image], torch.Tensor]
        A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input

    Raises
    ------
    RuntimeError
        If `name` is neither a known model name nor the path of an existing file
    """

    def _node_get(node, key: str):
        # Read a graph-node attribute through its kind-specific accessor
        # (node.s(key), node.i(key), ...). Subscripting a node (`node[key]`)
        # is not supported by every PyTorch release.
        selector = node.kindOf(key)
        return getattr(node, selector)(key)

    if name in _MODELS:
        model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")

    try:
        # loading JIT archive
        model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
        state_dict = None
    except RuntimeError:
        # loading saved state dict
        if jit:
            warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
            jit = False
        # NOTE(review): torch.load unpickles the file; only pass checkpoint paths from trusted sources
        state_dict = torch.load(model_path, map_location="cpu")

    if not jit:
        # Rebuild a regular nn.Module from the weights (taken from the JIT archive if that is what was loaded)
        model = build_model(state_dict or model.state_dict()).to(device)
        if str(device) == "cpu":
            model.float()
        return model, _transform(model.visual.input_resolution)

    # patch the device names
    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]

    def patch_device(module):
        try:
            graphs = [module.graph] if hasattr(module, "graph") else []
        except RuntimeError:
            graphs = []

        if hasattr(module, "forward1"):
            graphs.append(module.forward1.graph)

        for graph in graphs:
            for node in graph.findAllNodes("prim::Constant"):
                # Rewrite every constant whose value starts with "cuda" to the requested device
                if "value" in node.attributeNames() and str(_node_get(node, "value")).startswith("cuda"):
                    node.copyAttributes(device_node)

    model.apply(patch_device)
    patch_device(model.encode_image)
    patch_device(model.encode_text)

    # patch dtype to float32 on CPU
    if str(device) == "cpu":
        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
        float_node = float_input.node()

        def patch_float(module):
            try:
                graphs = [module.graph] if hasattr(module, "graph") else []
            except RuntimeError:
                graphs = []

            if hasattr(module, "forward1"):
                graphs.append(module.forward1.graph)

            for graph in graphs:
                for node in graph.findAllNodes("aten::to"):
                    inputs = list(node.inputs())
                    for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
                        # 5 is presumably the dtype code for float16 — TODO confirm against torch's ScalarType enum
                        if _node_get(inputs[i].node(), "value") == 5:
                            inputs[i].node().copyAttributes(float_node)

        model.apply(patch_float)
        patch_float(model.encode_image)
        patch_float(model.encode_text)

        model.float()

    return model, _transform(model.input_resolution.item())
193
+
194
+
195
def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor:
    """
    Tokenize one string or a list of strings into a zero-padded tensor of token ids

    Parameters
    ----------
    texts : Union[str, List[str]]
        A single input string, or a list of input strings

    context_length : int
        Number of columns of the returned tensor (default 77)

    truncate: bool
        If True, an encoding longer than `context_length` is cut to that length and its last
        token is replaced by the end-of-text token; if False, such an input raises RuntimeError

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]

    start_id = _tokenizer.encoder["<|startoftext|>"]
    end_id = _tokenizer.encoder["<|endoftext|>"]

    # Wrap every encoded text in start/end markers before any padding or truncation
    encoded = []
    for text in texts:
        encoded.append([start_id] + _tokenizer.encode(text) + [end_id])

    result = torch.zeros(len(encoded), context_length, dtype=torch.long)

    for row, ids in enumerate(encoded):
        if len(ids) > context_length:
            if not truncate:
                raise RuntimeError(f"Input {texts[row]} is too long for context length {context_length}")
            ids = ids[:context_length]
            ids[-1] = end_id
        result[row, :len(ids)] = torch.tensor(ids)

    return result
CLIP/clip/model.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ from typing import Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+ import math
9
+
10
+
11
class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 conv, 3x3 conv, optional average pool, 1x1 conv, plus a shortcut."""

    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion
        needs_pool = stride > 1

        # Every convolution uses stride 1; for stride > 1 the spatial reduction
        # is done by an average pool placed after the second convolution.
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.avgpool = nn.AvgPool2d(stride) if needs_pool else nn.Identity()

        self.conv3 = nn.Conv2d(planes, out_planes, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        if needs_pool or inplanes != planes * Bottleneck.expansion:
            # The shortcut path pools first and then applies a stride-1 convolution
            shortcut = OrderedDict()
            shortcut["-1"] = nn.AvgPool2d(stride)
            shortcut["0"] = nn.Conv2d(inplanes, out_planes, 1, stride=1, bias=False)
            shortcut["1"] = nn.BatchNorm2d(out_planes)
            self.downsample = nn.Sequential(shortcut)

    def forward(self, x: torch.Tensor):
        out = self.conv1(x)
        out = self.relu(self.bn1(out))
        out = self.conv2(out)
        out = self.relu(self.bn2(out))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        # Use the input itself as the shortcut unless a projection was built
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)
55
+
56
+
57
class AttentionPool2d(nn.Module):
    """Pool a feature map with multi-head attention over its positions plus a prepended mean token."""

    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        # One embedding per spatial position plus one for the prepended mean token
        num_positions = spacial_dim ** 2 + 1
        self.positional_embedding = nn.Parameter(torch.randn(num_positions, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        batch, channels = x.shape[0], x.shape[1]
        positions = x.shape[2] * x.shape[3]

        tokens = x.reshape(batch, channels, positions).permute(2, 0, 1)  # NCHW -> (HW)NC
        mean_token = tokens.mean(dim=0, keepdim=True)
        tokens = torch.cat([mean_token, tokens], dim=0)  # (HW+1)NC
        tokens = tokens + self.positional_embedding[:, None, :].to(tokens.dtype)  # (HW+1)NC

        stacked_bias = torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias])
        attended, _ = F.multi_head_attention_forward(
            query=tokens, key=tokens, value=tokens,
            embed_dim_to_check=tokens.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=stacked_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )

        # Only the output at the first position (the mean token) is returned
        return attended[0]
92
+
93
+
94
class ModifiedResNet(nn.Module):
    """
    A ResNet variant that differs from torchvision's in the following ways:
    - The stem uses 3 convolutions instead of 1, followed by an average pool instead of a max pool.
    - Strided convolutions are anti-aliased: an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution
        half_width = width // 2

        # the 3-layer stem
        self.conv1 = nn.Conv2d(3, half_width, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(half_width)
        self.conv2 = nn.Conv2d(half_width, half_width, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(half_width)
        self.conv3 = nn.Conv2d(half_width, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(2)
        self.relu = nn.ReLU(inplace=True)

        # residual layers; `_inplanes` is updated by each `_make_layer` call
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        # the ResNet feature dimension
        embed_dim = width * 32
        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        """Stack `blocks` bottlenecks; only the first one receives `stride`."""
        stage = [Bottleneck(self._inplanes, planes, stride)]

        self._inplanes = planes * Bottleneck.expansion
        stage.extend(Bottleneck(self._inplanes, planes) for _ in range(1, blocks))

        return nn.Sequential(*stage)

    def forward(self, x):
        # Match the input dtype to the dtype of the first convolution's weights
        x = x.type(self.conv1.weight.dtype)

        # stem: three conv/bn/relu stages followed by an average pool
        stem_stages = ((self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3))
        for conv, bn in stem_stages:
            x = self.relu(bn(conv(x)))
        x = self.avgpool(x)

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        return self.attnpool(x)
152
+
153
+
154
class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and returns the result in the input's dtype (fp16 support)."""

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        normalized = super().forward(x.type(torch.float32))
        return normalized.type(input_dtype)
161
+
162
+
163
class QuickGELU(nn.Module):
    """Activation computing ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor):
        gate = torch.sigmoid(1.702 * x)
        return x * gate
166
+
167
+
168
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention and an MLP, each wrapped in a residual connection."""

    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()
        hidden_dim = d_model * 4

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)

        feed_forward = OrderedDict()
        feed_forward["c_fc"] = nn.Linear(d_model, hidden_dim)
        feed_forward["gelu"] = QuickGELU()
        feed_forward["c_proj"] = nn.Linear(hidden_dim, d_model)
        self.mlp = nn.Sequential(feed_forward)

        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        """Self-attend over `x`, first moving the stored mask (if any) to `x`'s dtype and device."""
        mask = self.attn_mask
        if mask is not None:
            mask = mask.to(dtype=x.dtype, device=x.device)
        # The converted mask is stored back on the module, as before
        self.attn_mask = mask

        attended, _ = self.attn(x, x, x, need_weights=False, attn_mask=mask)
        return attended

    def forward(self, x: torch.Tensor):
        attended = self.attention(self.ln_1(x))
        x = x + attended
        projected = self.mlp(self.ln_2(x))
        return x + projected
190
+
191
+
192
class Transformer(nn.Module):
    """A sequential stack of `layers` residual attention blocks that share one attention mask."""

    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
        super().__init__()
        self.width = width
        self.layers = layers

        blocks = []
        for _ in range(layers):
            blocks.append(ResidualAttentionBlock(width, heads, attn_mask))
        self.resblocks = nn.Sequential(*blocks)

    def forward(self, x: torch.Tensor):
        out = self.resblocks(x)
        return out
201
+
202
+
203
class VisionTransformer(nn.Module):
    """ViT image encoder with resolution-adaptive positional embeddings.

    The image is cut into patches by a strided convolution, a class token is
    prepended, positional embeddings (bicubically resized when the input
    resolution differs from ``input_resolution``) are added, and the tokens are
    run through a Transformer. The class token is layer-normalised and
    projected to ``output_dim``.

    Args:
        input_resolution: side length the positional embedding grid is built for.
        patch_size: kernel size and stride of the patch convolution.
        width: token feature width.
        layers: number of transformer blocks.
        heads: attention heads per block.
        output_dim: size of the projected image embedding.
    """

    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    # https://github.com/facebookresearch/dino
    def interpolate_pos_encoding(self, x, w, h):
        """Return positional embeddings resized to the patch grid of a ``h`` x ``w`` image.

        Args:
            x: token tensor of shape [batch, 1 + n_patches, width] (class token first).
            w: input image width in pixels.
            h: input image height in pixels.

        Returns:
            Tensor of shape [1, 1 + n_patches, width]. The stored embedding is
            returned untouched when the patch count matches and the image is square.
        """
        positional_embedding = self.positional_embedding.unsqueeze(0)
        patch_size = self.conv1.kernel_size[0]

        npatch = x.shape[1] - 1
        N = positional_embedding.shape[1] - 1
        if npatch == N and w == h:
            return positional_embedding
        class_pos_embed = positional_embedding[:, 0]
        patch_pos_embed = positional_embedding[:, 1:]
        dim = x.shape[-1]

        grid_side = int(math.sqrt(N))
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        h0 = h // patch_size + 0.1
        w0 = w // patch_size + 0.1
        # The patch tokens are flattened row-major from a [rows = h // patch,
        # cols = w // patch] grid, so the resized embedding grid must have
        # height on dim -2 and width on dim -1. The previous (w0, h0) order
        # transposed the grid for non-square inputs; square inputs are unchanged.
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, grid_side, grid_side, dim).permute(0, 3, 1, 2),
            scale_factor=(h0 / math.sqrt(N), w0 / math.sqrt(N)),
            mode='bicubic',
        )
        assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, x: torch.Tensor):
        """Encode an image batch into [batch, output_dim] embeddings."""
        x = self.transformer_first_blocks_forward(x)
        x = self.transformer.resblocks[-1](x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        # Only the class token (position 0) is kept for the image embedding.
        x = self.ln_post(x[:, 0, :])

        if self.proj is not None:
            x = x @ self.proj

        return x

    def transformer_first_blocks_forward(self, x):
        """Run patch embedding and every transformer block except the last.

        Returns the tokens in LND layout ([tokens, batch, width]), class token first.
        """
        h, w = x.shape[-2:]
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        # Adding the class embedding to zeros broadcasts it to one token per batch item.
        x = torch.cat(
            [self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
             x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        positional_embedding = self.interpolate_pos_encoding(x, w, h)
        x = x + positional_embedding.to(x.dtype)
        x = self.ln_pre(x)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer.resblocks[:-1](x)
        return x

    @staticmethod
    def attn_cosine_sim(x, eps=1e-08):
        """Pairwise cosine similarity between tokens.

        ``x`` is [batch, tokens, dim]; the result is [batch, tokens, tokens].
        The norm product in the denominator is clamped to at least ``eps``.
        """
        norm = x.norm(dim=2, keepdim=True)
        factor = torch.clamp(norm @ norm.permute(0, 2, 1), min=eps)
        sim_matrix = (x @ x.permute(0, 2, 1)) / factor
        return sim_matrix
281
+
282
+
283
class CLIP(nn.Module):
    """Image tower plus text tower mapping both modalities into one ``embed_dim`` space.

    The vision tower is a ModifiedResNet when ``vision_layers`` is a tuple/list
    and a VisionTransformer when it is a single int. The text tower is a
    Transformer with a causal attention mask over ``context_length`` tokens.
    """

    def __init__(self,
                 embed_dim: int,
                 # vision
                 image_resolution: int,
                 vision_layers: Union[Tuple[int, int, int, int], int],
                 vision_width: int,
                 vision_patch_size: int,
                 # text
                 context_length: int,
                 vocab_size: int,
                 transformer_width: int,
                 transformer_heads: int,
                 transformer_layers: int
                 ):
        super().__init__()

        self.context_length = context_length

        if not isinstance(vision_layers, (tuple, list)):
            self.visual = VisionTransformer(
                input_resolution=image_resolution,
                patch_size=vision_patch_size,
                width=vision_width,
                layers=vision_layers,
                heads=vision_width // 64,
                output_dim=embed_dim
            )
        else:
            self.visual = ModifiedResNet(
                layers=vision_layers,
                output_dim=embed_dim,
                heads=vision_width * 32 // 64,
                input_resolution=image_resolution,
                width=vision_width
            )

        causal_mask = self.build_attention_mask()
        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=causal_mask
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)

        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.initialize_parameters()

    def initialize_parameters(self):
        """Initialise embeddings, the ResNet attention pool (if present) and the text transformer."""
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)

        if isinstance(self.visual, ModifiedResNet):
            pool = self.visual.attnpool
            if pool is not None:
                pool_std = pool.c_proj.in_features ** -0.5
                for projection in (pool.q_proj, pool.k_proj, pool.v_proj, pool.c_proj):
                    nn.init.normal_(projection.weight, std=pool_std)

            stages = (self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4)
            for stage in stages:
                for param_name, param in stage.named_parameters():
                    # Zero the scale of every parameter named "...bn3.weight".
                    if param_name.endswith("bn3.weight"):
                        nn.init.zeros_(param)

        width = self.transformer.width
        attn_std = width ** -0.5
        proj_std = (width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        fc_std = (2 * width) ** -0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        if self.text_projection is not None:
            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    def build_attention_mask(self):
        """Additive causal mask: ``-inf`` strictly above the diagonal, ``0`` elsewhere."""
        size = self.context_length
        causal = torch.full((size, size), float("-inf"))
        causal.triu_(1)  # keep -inf only above the diagonal
        return causal

    @property
    def dtype(self):
        """Parameter dtype, read from the vision tower's first convolution."""
        return self.visual.conv1.weight.dtype

    def calculate_self_sim(self, x: torch.Tensor):
        """Token-to-token cosine similarity of the vision tokens before the last block.

        Returns a [batch, tokens, tokens] tensor; the tokens include the class token.
        """
        image = x.type(self.dtype)
        tokens = self.visual.transformer_first_blocks_forward(image)
        tokens = tokens.permute(1, 0, 2)  # LND -> NLD
        return self.visual.attn_cosine_sim(tokens)

    def encode_image(self, image):
        """Embed an image batch with the vision tower."""
        return self.visual(image.type(self.dtype))

    def encode_text(self, text):
        """Embed a batch of token-id sequences with the text tower."""
        model_dtype = self.dtype
        x = self.token_embedding(text).type(model_dtype)  # [batch_size, n_ctx, d_model]
        x = x + self.positional_embedding.type(model_dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(model_dtype)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        batch_index = torch.arange(x.shape[0])
        eot_index = text.argmax(dim=-1)
        x = x[batch_index, eot_index] @ self.text_projection

        return x

    def forward(self, image, text):
        """Return ``(logits_per_image, logits_per_text)`` scaled cosine similarities."""
        image_features = self.encode_image(image)
        text_features = self.encode_text(text)

        # L2-normalise both embeddings so the dot product is a cosine similarity
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        # shape = [global_batch_size, global_batch_size]
        return logits_per_image, logits_per_text
419
+
420
+
421
def convert_weights(model: nn.Module):
    """Convert applicable model parameters to fp16, in place.

    Covers weights and biases of Conv1d/Conv2d/Linear layers, the projection
    weights and biases of nn.MultiheadAttention, and any ``text_projection`` /
    ``proj`` attribute found on a module.
    """
    mha_tensor_names = (
        "in_proj_weight", "q_proj_weight", "k_proj_weight", "v_proj_weight",
        "in_proj_bias", "bias_k", "bias_v",
    )

    def _halve(tensor):
        # Skip tensors that a layer was built without.
        if tensor is not None:
            tensor.data = tensor.data.half()

    def _convert_weights_to_fp16(l):
        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            l.weight.data = l.weight.data.half()
            _halve(l.bias)

        if isinstance(l, nn.MultiheadAttention):
            for tensor_name in mha_tensor_names:
                _halve(getattr(l, tensor_name))

        for projection_name in ("text_projection", "proj"):
            if hasattr(l, projection_name):
                _halve(getattr(l, projection_name))

    model.apply(_convert_weights_to_fp16)
443
+
444
+
445
def build_model(state_dict: dict):
    """Rebuild a CLIP model whose hyper-parameters are read from ``state_dict``.

    The architecture (ViT vs. ResNet tower, widths, depths, resolution, context
    length, vocabulary size) is derived from tensor shapes and key names. The
    model is converted to fp16, loaded with the weights and returned in eval mode.

    Note: the keys ``input_resolution``, ``context_length`` and ``vocab_size``
    are removed from the passed-in ``state_dict``.
    """
    is_vit = "visual.proj" in state_dict

    if is_vit:
        patch_conv = state_dict["visual.conv1.weight"]
        vision_width = patch_conv.shape[0]
        vision_layers = sum(
            1 for k in state_dict.keys()
            if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")
        )
        vision_patch_size = patch_conv.shape[-1]
        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
        image_resolution = vision_patch_size * grid_size
    else:
        # One entry per ResNet stage: the number of distinct block indices in its keys.
        vision_layers = tuple(
            len({k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{stage}")})
            for stage in (1, 2, 3, 4)
        )
        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
        pooled_positions = state_dict["visual.attnpool.positional_embedding"].shape[0]
        output_width = round((pooled_positions - 1) ** 0.5)
        vision_patch_size = None
        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
        image_resolution = output_width * 32

    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    transformer_heads = transformer_width // 64
    text_block_ids = {k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")}
    transformer_layers = len(text_block_ids)

    model = CLIP(
        embed_dim,
        image_resolution, vision_layers, vision_width, vision_patch_size,
        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
    )

    # These entries are metadata, not parameters; drop them before loading.
    for metadata_key in ("input_resolution", "context_length", "vocab_size"):
        state_dict.pop(metadata_key, None)

    convert_weights(model)
    model.load_state_dict(state_dict)
    return model.eval()
CLIP/clip/simple_tokenizer.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gzip
2
+ import html
3
+ import os
4
+ from functools import lru_cache
5
+
6
+ import ftfy
7
+ import regex as re
8
+
9
+
10
@lru_cache()
def default_bpe():
    """Absolute path of the BPE vocabulary file expected next to this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, "bpe_simple_vocab_16e6.txt.gz")
13
+
14
+
15
@lru_cache()
def bytes_to_unicode():
    """
    Returns a dict mapping every utf-8 byte value (0-255) to a unicode character.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    Bytes in the three ranges below map to themselves; every other byte is
    mapped to a code point from 256 upwards, which avoids the
    whitespace/control characters the bpe code barfs on.
    """
    self_mapped = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    byte_values = list(self_mapped)
    code_points = list(self_mapped)
    shifted = 0
    for byte in range(2 ** 8):
        if byte in byte_values:
            continue
        byte_values.append(byte)
        code_points.append(2 ** 8 + shifted)
        shifted += 1
    return dict(zip(byte_values, map(chr, code_points)))
36
+
37
+
38
def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    The result holds every pair of adjacent symbols. A word with fewer than two
    symbols, including an empty one, yields an empty set; the empty case used
    to raise ``IndexError``.
    """
    return set(zip(word, word[1:]))
48
+
49
+
50
def basic_clean(text):
    """Run ``ftfy.fix_text``, unescape HTML entities twice, and strip outer whitespace."""
    fixed = ftfy.fix_text(text)
    once_unescaped = html.unescape(fixed)
    twice_unescaped = html.unescape(once_unescaped)
    return twice_unescaped.strip()
54
+
55
+
56
def whitespace_clean(text):
    """Collapse every run of whitespace to one space and strip both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
60
+
61
+
62
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer driven by a gzip-compressed merges file.

    Args:
        bpe_path: path to the gzip-compressed merges file. Its first line is
            skipped and the following 49152 - 256 - 2 lines are used as merges.
    """

    def __init__(self, bpe_path: str = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Read through a context manager so the gzip handle is always closed.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode("utf-8").split('\n')
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        # Vocabulary order: byte symbols, byte symbols with the end-of-word
        # marker, merged symbols, then the two special tokens.
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        """Apply the learned merges to one token.

        Returns the resulting symbols joined by single spaces; the last symbol
        carries the ``</w>`` end-of-word marker. Results are memoised in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # Pick the adjacent pair with the lowest merge rank; unknown pairs rank as infinity.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # `first` does not occur again: copy the tail and stop.
                    # (Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lower-case ``text`` and return its list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Map the token's utf-8 bytes to the reversible unicode alphabet first.
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn token ids back into text; ``</w>`` markers become spaces."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text
CLIP/clip_explainability/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .clip import *
CLIP/clip_explainability/auxilary.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import warnings
3
+ from typing import Tuple, Optional
4
+
5
+ import torch
6
+ from torch import Tensor
7
+ from torch.nn.init import xavier_uniform_
8
+ from torch.nn.init import constant_
9
+ from torch.nn.init import xavier_normal_
10
+ from torch.nn.parameter import Parameter
11
+ from torch.nn import functional as F
12
+
13
+ # We define this function as _pad because it takes an argument
14
+ # named pad, which clobbers the recursive reference to the pad
15
+ # function needed for __torch_function__ support
16
+ pad = F._pad
17
+
18
+ # This class exists solely for Transformer; it has an annotation stating
19
+ # that bias is never None, which appeases TorchScript
20
+ class _LinearWithBias(torch.nn.Linear):
21
+ bias: Tensor
22
+
23
+ def __init__(self, in_features: int, out_features: int) -> None:
24
+ super().__init__(in_features, out_features, bias=True)
25
+
26
def multi_head_attention_forward(query: Tensor,
                                 key: Tensor,
                                 value: Tensor,
                                 embed_dim_to_check: int,
                                 num_heads: int,
                                 in_proj_weight: Tensor,
                                 in_proj_bias: Tensor,
                                 bias_k: Optional[Tensor],
                                 bias_v: Optional[Tensor],
                                 add_zero_attn: bool,
                                 dropout_p: float,
                                 out_proj_weight: Tensor,
                                 out_proj_bias: Tensor,
                                 training: bool = True,
                                 key_padding_mask: Optional[Tensor] = None,
                                 need_weights: bool = True,
                                 attn_mask: Optional[Tensor] = None,
                                 use_separate_proj_weight: bool = False,
                                 q_proj_weight: Optional[Tensor] = None,
                                 k_proj_weight: Optional[Tensor] = None,
                                 v_proj_weight: Optional[Tensor] = None,
                                 static_k: Optional[Tensor] = None,
                                 static_v: Optional[Tensor] = None,
                                 attention_probs_forward_hook=None,
                                 attention_probs_backwards_hook=None,
                                 ) -> Tuple[Tensor, Optional[Tensor]]:
    """Multi-head attention forward pass with optional hooks on the attention probabilities.

    Args:
        query: tensor of shape (L, N, E) - target length, batch size, embedding dim.
        key, value: tensors of shape (S, N, *) sharing their first two sizes.
        embed_dim_to_check: expected embedding dim of ``query``.
        num_heads: number of heads; must divide the embedding dim.
        in_proj_weight, in_proj_bias: packed q/k/v projection, used unless
            ``use_separate_proj_weight`` is set.
        bias_k, bias_v: optional extra key/value row appended along the sequence dim.
        add_zero_attn: append an all-zero key/value position.
        dropout_p: dropout probability on the attention probabilities.
        out_proj_weight, out_proj_bias: output projection.
        training: whether dropout is active.
        key_padding_mask: optional (N, S) mask of key positions to ignore.
        need_weights: also return the head-averaged attention probabilities.
        attn_mask: optional 2D (L, S) or 3D (N * num_heads, L, S) mask; bool
            masks are filled with ``-inf``, float masks are added.
        use_separate_proj_weight: use ``q_proj_weight``/``k_proj_weight``/
            ``v_proj_weight`` instead of ``in_proj_weight``.
        static_k, static_v: optional precomputed per-head keys/values.
        attention_probs_forward_hook: called with the attention probabilities
            (after softmax and dropout).
        attention_probs_backwards_hook: registered as a gradient hook on the
            attention probabilities. The hooks are used only when BOTH are given.

    Returns:
        ``(attn_output, attn_weights)``: ``attn_output`` has shape (L, N, E);
        ``attn_weights`` has shape (N, L, S) averaged over heads, or is ``None``
        when ``need_weights`` is false.
    """
    if not torch.jit.is_scripting():
        tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v,
                    out_proj_weight, out_proj_bias)
        if any([type(t) is not Tensor for t in tens_ops]) and F.has_torch_function(tens_ops):
            # Forward the hook arguments too so an override sees the full call.
            return F.handle_torch_function(
                multi_head_attention_forward, tens_ops, query, key, value,
                embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias,
                bias_k, bias_v, add_zero_attn, dropout_p, out_proj_weight,
                out_proj_bias, training=training, key_padding_mask=key_padding_mask,
                need_weights=need_weights, attn_mask=attn_mask,
                use_separate_proj_weight=use_separate_proj_weight,
                q_proj_weight=q_proj_weight, k_proj_weight=k_proj_weight,
                v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v,
                attention_probs_forward_hook=attention_probs_forward_hook,
                attention_probs_backwards_hook=attention_probs_backwards_hook)
    tgt_len, bsz, embed_dim = query.size()
    assert embed_dim == embed_dim_to_check
    # allow MHA to have different sizes for the feature dimension
    assert key.size(0) == value.size(0) and key.size(1) == value.size(1)

    head_dim = embed_dim // num_heads
    assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
    scaling = float(head_dim) ** -0.5

    if not use_separate_proj_weight:
        if torch.equal(query, key) and torch.equal(key, value):
            # self-attention: one packed projection, split into q, k, v
            q, k, v = F.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)

        elif torch.equal(key, value):
            # encoder-decoder attention: q from the first E rows, k and v from the rest
            q_bias = None if in_proj_bias is None else in_proj_bias[:embed_dim]
            q = F.linear(query, in_proj_weight[:embed_dim, :], q_bias)

            kv_bias = None if in_proj_bias is None else in_proj_bias[embed_dim:]
            k, v = F.linear(key, in_proj_weight[embed_dim:, :], kv_bias).chunk(2, dim=-1)

        else:
            # distinct query, key and value: project each with its own slice
            q_bias = None if in_proj_bias is None else in_proj_bias[:embed_dim]
            q = F.linear(query, in_proj_weight[:embed_dim, :], q_bias)

            k_bias = None if in_proj_bias is None else in_proj_bias[embed_dim:embed_dim * 2]
            k = F.linear(key, in_proj_weight[embed_dim:embed_dim * 2, :], k_bias)

            v_bias = None if in_proj_bias is None else in_proj_bias[embed_dim * 2:]
            v = F.linear(value, in_proj_weight[embed_dim * 2:, :], v_bias)
    else:
        q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
        len1, len2 = q_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == query.size(-1)

        k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
        len1, len2 = k_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == key.size(-1)

        v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
        len1, len2 = v_proj_weight_non_opt.size()
        assert len1 == embed_dim and len2 == value.size(-1)

        if in_proj_bias is not None:
            q = F.linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
            k = F.linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
            v = F.linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
        else:
            q = F.linear(query, q_proj_weight_non_opt, in_proj_bias)
            k = F.linear(key, k_proj_weight_non_opt, in_proj_bias)
            v = F.linear(value, v_proj_weight_non_opt, in_proj_bias)
    q = q * scaling

    if attn_mask is not None:
        assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \
            attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \
            'Only float, byte, and bool types are supported for attn_mask, not {}'.format(attn_mask.dtype)
        if attn_mask.dtype == torch.uint8:
            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
            attn_mask = attn_mask.to(torch.bool)

        if attn_mask.dim() == 2:
            attn_mask = attn_mask.unsqueeze(0)
            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 2D attn_mask is not correct.')
        elif attn_mask.dim() == 3:
            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 3D attn_mask is not correct.')
        else:
            raise RuntimeError("attn_mask's dimension {} is not supported".format(attn_mask.dim()))
        # attn_mask's dim is 3 now.

    # convert ByteTensor key_padding_mask to bool
    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
        warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
        key_padding_mask = key_padding_mask.to(torch.bool)

    if bias_k is not None and bias_v is not None:
        if static_k is None and static_v is None:
            # Append the bias row as one extra key/value position and widen the masks to match.
            k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = pad(attn_mask, (0, 1))
            if key_padding_mask is not None:
                key_padding_mask = pad(key_padding_mask, (0, 1))
        else:
            assert static_k is None, "bias cannot be added to static key."
            assert static_v is None, "bias cannot be added to static value."
    else:
        assert bias_k is None
        assert bias_v is None

    # Fold the heads into the batch dimension: (len, N * heads, head_dim) -> (N * heads, len, head_dim).
    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
    v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)

    if static_k is not None:
        assert static_k.size(0) == bsz * num_heads
        assert static_k.size(2) == head_dim
        k = static_k

    if static_v is not None:
        assert static_v.size(0) == bsz * num_heads
        assert static_v.size(2) == head_dim
        v = static_v

    src_len = k.size(1)

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if add_zero_attn:
        src_len += 1
        k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
        v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))

    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
    assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            attn_output_weights.masked_fill_(attn_mask, float('-inf'))
        else:
            attn_output_weights += attn_mask

    if key_padding_mask is not None:
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        attn_output_weights = attn_output_weights.masked_fill(
            key_padding_mask.unsqueeze(1).unsqueeze(2),
            float('-inf'),
        )
        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)

    attn_output_weights = F.softmax(
        attn_output_weights, dim=-1)
    attn_output_weights = F.dropout(attn_output_weights, p=dropout_p, training=training)

    # use hooks for the attention weights if necessary (both hooks must be supplied)
    if attention_probs_forward_hook is not None and attention_probs_backwards_hook is not None:
        attention_probs_forward_hook(attn_output_weights)
        attn_output_weights.register_hook(attention_probs_backwards_hook)

    attn_output = torch.bmm(attn_output_weights, v)
    assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None
263
+
264
+
265
class MultiheadAttention(torch.nn.Module):
    r"""Multi-head attention module whose forward accepts attention-probability hooks.

    See reference: Attention Is All You Need

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)

    Args:
        embed_dim: total dimension of the model.
        num_heads: parallel attention heads.
        dropout: dropout probability on attn_output_weights. Default: 0.0.
        bias: add bias as module parameter. Default: True.
        add_bias_kv: add bias to the key and value sequences at dim=0.
        add_zero_attn: add a new batch of zeros to the key and
                       value sequences at dim=1.
        kdim: total number of features in key. Default: None.
        vdim: total number of features in value. Default: None.

        Note: if kdim and vdim are None, they will be set to embed_dim such that
        query, key, and value have the same number of features.

    Examples::

        >>> multihead_attn = MultiheadAttention(embed_dim, num_heads)
        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
    """
    bias_k: Optional[torch.Tensor]
    bias_v: Optional[torch.Tensor]

    def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = embed_dim if kdim is None else kdim
        self.vdim = embed_dim if vdim is None else vdim
        self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        if self._qkv_same_embed_dim:
            # One packed projection for q, k and v; the separate slots stay empty.
            self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
            self.register_parameter('q_proj_weight', None)
            self.register_parameter('k_proj_weight', None)
            self.register_parameter('v_proj_weight', None)
        else:
            # Key/value feature sizes differ from embed_dim: separate projections.
            self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
            self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
            self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
            self.register_parameter('in_proj_weight', None)

        if not bias:
            self.register_parameter('in_proj_bias', None)
        else:
            self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        self.out_proj = _LinearWithBias(embed_dim, embed_dim)

        if not add_bias_kv:
            self.bias_k = self.bias_v = None
        else:
            self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
            self.bias_v = Parameter(torch.empty(1, 1, embed_dim))

        self.add_zero_attn = add_zero_attn

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise the projections and zero the biases."""
        if not self._qkv_same_embed_dim:
            xavier_uniform_(self.q_proj_weight)
            xavier_uniform_(self.k_proj_weight)
            xavier_uniform_(self.v_proj_weight)
        else:
            xavier_uniform_(self.in_proj_weight)

        if self.in_proj_bias is not None:
            constant_(self.in_proj_bias, 0.)
            constant_(self.out_proj.bias, 0.)
        if self.bias_k is not None:
            xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            xavier_normal_(self.bias_v)

    def __setstate__(self, state):
        # Support loading old MultiheadAttention checkpoints generated by v1.1.0
        state.setdefault('_qkv_same_embed_dim', True)

        super(MultiheadAttention, self).__setstate__(state)

    def forward(self, query, key, value, key_padding_mask=None,
                need_weights=True, attn_mask=None, attention_probs_forward_hook=None, attention_probs_backwards_hook=None):
        r"""
    Args:
        query, key, value: map a query and a set of key-value pairs to an output.
            See "Attention Is All You Need" for more details.
        key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. For a bool mask, ``True`` positions are
            ignored; for a byte mask, non-zero positions are ignored.
        need_weights: output attn_output_weights.
        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask is
            broadcast over the batch; a 3D mask gives a separate mask per batch entry and head.
        attention_probs_forward_hook, attention_probs_backwards_hook: passed through to
            ``multi_head_attention_forward``.

    Shape:
        - Inputs:
        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
          the embedding dimension.
        - key: :math:`(S, N, E)`, where S is the source sequence length.
        - value: :math:`(S, N, E)`.
        - key_padding_mask: :math:`(N, S)`.
        - attn_mask: 2D mask :math:`(L, S)` or 3D mask :math:`(N*num_heads, L, S)`. Byte masks block
          non-zero positions, bool masks block ``True`` positions, float masks are added to the
          attention weights.

        - Outputs:
        - attn_output: :math:`(L, N, E)`.
        - attn_output_weights: :math:`(N, L, S)`.
        """
        # Only the separate-projection configuration needs the extra weight arguments.
        projection_kwargs = {}
        if not self._qkv_same_embed_dim:
            projection_kwargs = dict(
                use_separate_proj_weight=True,
                q_proj_weight=self.q_proj_weight,
                k_proj_weight=self.k_proj_weight,
                v_proj_weight=self.v_proj_weight,
            )
        return multi_head_attention_forward(
            query, key, value, self.embed_dim, self.num_heads,
            self.in_proj_weight, self.in_proj_bias,
            self.bias_k, self.bias_v, self.add_zero_attn,
            self.dropout, self.out_proj.weight, self.out_proj.bias,
            training=self.training,
            key_padding_mask=key_padding_mask, need_weights=need_weights,
            attn_mask=attn_mask,
            attention_probs_forward_hook=attention_probs_forward_hook,
            attention_probs_backwards_hook=attention_probs_backwards_hook,
            **projection_kwargs)
CLIP/clip_explainability/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
CLIP/clip_explainability/clip.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import os
3
+ import urllib
4
+ import warnings
5
+ from typing import Union, List
6
+
7
+ import torch
8
+ from PIL import Image
9
+ from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
10
+ from tqdm import tqdm
11
+
12
+ from .model import build_model
13
+ from .simple_tokenizer import SimpleTokenizer as _Tokenizer
14
+
15
+ __all__ = ["available_models", "load", "tokenize"]
16
+ _tokenizer = _Tokenizer()
17
+
18
+ _MODELS = {
19
+ "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
20
+ "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
21
+ "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
22
+ "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
23
+ "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
24
+ "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
25
+ "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
26
+ "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
27
+ }
28
+
29
+ def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")):
30
+ os.makedirs(root, exist_ok=True)
31
+ filename = os.path.basename(url)
32
+
33
+ expected_sha256 = url.split("/")[-2]
34
+ download_target = os.path.join(root, filename)
35
+
36
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
37
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
38
+
39
+ if os.path.isfile(download_target):
40
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
41
+ return download_target
42
+ else:
43
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
44
+
45
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
46
+ with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
47
+ while True:
48
+ buffer = source.read(8192)
49
+ if not buffer:
50
+ break
51
+
52
+ output.write(buffer)
53
+ loop.update(len(buffer))
54
+
55
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
56
+ raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match")
57
+
58
+ return download_target
59
+
60
+
61
def _transform(n_px):
    """Build the image preprocessing pipeline for a square input of side ``n_px``.

    The pipeline resizes with bicubic interpolation, center-crops to
    ``n_px`` x ``n_px``, converts the image to RGB, turns it into a tensor and
    normalizes each channel with fixed mean / std constants.
    """
    # torchvision deprecates passing PIL's integer constants as ``interpolation``;
    # use its own enum when available and fall back to the PIL constant on
    # older torchvision releases that do not ship ``InterpolationMode``.
    try:
        from torchvision.transforms import InterpolationMode
        bicubic = InterpolationMode.BICUBIC
    except ImportError:
        bicubic = Image.BICUBIC

    return Compose([
        Resize(n_px, interpolation=bicubic),
        CenterCrop(n_px),
        lambda image: image.convert("RGB"),
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
69
+
70
+
71
def available_models() -> List[str]:
    """Return the model names that `load` can resolve to a download URL."""
    return [model_name for model_name in _MODELS]
74
+
75
+
76
def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True):
    """Load a CLIP model together with its image preprocessing transform.

    Parameters
    ----------
    name : str
        Either a key of ``_MODELS`` (see `available_models()`) or the path of a
        checkpoint file on disk.

    device : Union[str, torch.device]
        Device the returned model is placed on.

    jit : bool
        If true (default) the TorchScript archive is returned after patching;
        otherwise a regular module is rebuilt through `build_model`.

    Returns
    -------
    model : torch.nn.Module
        The loaded model.

    preprocess : Callable[[PIL.Image], torch.Tensor]
        Transform produced by `_transform` for the model's input resolution.

    Raises
    ------
    RuntimeError
        If ``name`` is neither a known model name nor an existing file.
    """
    # Resolve the checkpoint location.
    if name in _MODELS:
        checkpoint_path = _download(_MODELS[name])
    elif os.path.isfile(name):
        checkpoint_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")

    try:
        # First attempt: treat the file as a TorchScript (JIT) archive.
        model = torch.jit.load(checkpoint_path, map_location=device if jit else "cpu").eval()
        state_dict = None
    except RuntimeError:
        # Fallback: the file holds a plain state dict.
        if jit:
            warnings.warn(f"File {checkpoint_path} is not a JIT archive. Loading as a state dict instead")
            jit = False
        state_dict = torch.load(checkpoint_path, map_location="cpu")

    if not jit:
        model = build_model(state_dict or model.state_dict()).to(device)
        if str(device) == "cpu":
            model.float()
        return model, _transform(model.visual.input_resolution)

    def graphs_of(module):
        # Collect the TorchScript graphs exposed by a module or scripted method.
        found = [module.graph] if hasattr(module, "graph") else []
        if hasattr(module, "forward1"):
            found.append(module.forward1.graph)
        return found

    def apply_patch(patch):
        # Run a graph patch over every submodule and the two encode entry points.
        model.apply(patch)
        patch(model.encode_image)
        patch(model.encode_text)

    # Trace a tiny function to obtain a constant node that carries the target device.
    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]

    def patch_device(module):
        # Rewrite every constant whose value starts with "cuda" to the target device.
        for graph in graphs_of(module):
            for constant in graph.findAllNodes("prim::Constant"):
                if "value" in constant.attributeNames() and str(constant["value"]).startswith("cuda"):
                    constant.copyAttributes(device_node)

    apply_patch(patch_device)

    # On CPU the dtype constants of aten::to calls are rewritten to float32.
    if str(device) == "cpu":
        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
        float_node = float_input.node()

        def patch_float(module):
            for graph in graphs_of(module):
                for to_node in graph.findAllNodes("aten::to"):
                    operands = list(to_node.inputs())
                    # dtype can be the second or third argument to aten::to();
                    # NOTE(review): 5 is presumably the half-precision dtype code - confirm.
                    for position in [1, 2]:
                        if operands[position].node()["value"] == 5:
                            operands[position].node().copyAttributes(float_node)

        apply_patch(patch_float)
        model.float()

    return model, _transform(model.input_resolution.item())
165
+
166
+
167
def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize

    context_length : int
        The context length to use; all CLIP models use 77 as the context length

    truncate : bool
        When False (default, the previous behavior) an input whose encoding is
        longer than ``context_length`` raises. When True the encoding is cut to
        ``context_length`` tokens and its last token is replaced by the
        end-of-text token.

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]

    Raises
    ------
    RuntimeError
        If an input is too long for ``context_length`` and ``truncate`` is False.
    """
    if isinstance(texts, str):
        texts = [texts]

    sot_token = _tokenizer.encoder["<|startoftext|>"]
    eot_token = _tokenizer.encoder["<|endoftext|>"]
    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
    # Rows are zero-padded on the right up to context_length.
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            if not truncate:
                raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
            # Keep the sequence terminated by the end-of-text token after cutting.
            tokens = tokens[:context_length]
            tokens[-1] = eot_token
        result[i, :len(tokens)] = torch.tensor(tokens)

    return result
CLIP/clip_explainability/model.py ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ from typing import Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+ from .auxilary import *
9
+
10
class Bottleneck(nn.Module):
    """Residual bottleneck block (1x1 -> 3x3 -> 1x1 convolutions) with output width ``planes * expansion``.

    All convolutions use stride 1; when ``stride > 1`` the spatial reduction is
    done by an average pool placed after the second convolution and at the
    start of the shortcut branch.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()
        expanded_planes = planes * self.expansion

        # 1x1 reduction
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        # 3x3 convolution at the reduced width
        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # pooling replaces a strided convolution when stride > 1
        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

        # 1x1 expansion
        self.conv3 = nn.Conv2d(planes, expanded_planes, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(expanded_planes)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        needs_projection = stride > 1 or inplanes != planes * Bottleneck.expansion
        if needs_projection:
            # shortcut branch: avgpool first, then a stride-1 1x1 convolution and batch norm
            shortcut_layers = OrderedDict()
            shortcut_layers["-1"] = nn.AvgPool2d(stride)
            shortcut_layers["0"] = nn.Conv2d(inplanes, expanded_planes, 1, stride=1, bias=False)
            shortcut_layers["1"] = nn.BatchNorm2d(expanded_planes)
            self.downsample = nn.Sequential(shortcut_layers)

    def forward(self, x: torch.Tensor):
        """Apply the main branch, add the (optionally projected) input and apply ReLU."""
        features = self.conv1(x)
        features = self.relu(self.bn1(features))
        features = self.conv2(features)
        features = self.relu(self.bn2(features))
        features = self.avgpool(features)
        features = self.bn3(self.conv3(features))

        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        features += shortcut
        return self.relu(features)
54
+
55
+
56
class AttentionPool2d(nn.Module):
    """Pool a feature map with multi-head attention and return the first output token.

    The feature map is flattened to a token sequence, its mean is prepended as
    an extra token, a learned positional embedding is added, and the attention
    output at position 0 is returned.
    """

    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        num_positions = spacial_dim ** 2 + 1  # one slot per spatial location plus the mean token
        self.positional_embedding = nn.Parameter(torch.randn(num_positions, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        batch, channels, height, width = x.shape[0], x.shape[1], x.shape[2], x.shape[3]
        tokens = x.reshape(batch, channels, height * width).permute(2, 0, 1)  # NCHW -> (HW)NC
        mean_token = tokens.mean(dim=0, keepdim=True)
        tokens = torch.cat([mean_token, tokens], dim=0)  # (HW+1)NC
        tokens = tokens + self.positional_embedding[:, None, :].to(tokens.dtype)  # (HW+1)NC

        # q/k/v biases are passed concatenated because separate projection weights are used
        packed_bias = torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias])
        pooled, _ = multi_head_attention_forward(
            query=tokens, key=tokens, value=tokens,
            embed_dim_to_check=tokens.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=packed_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )

        return pooled[0]
91
+
92
+
93
class ModifiedResNet(nn.Module):
    """
    ResNet variant that differs from torchvision's in three ways:
    - the stem has 3 convolutions instead of 1 and ends in an average pool rather than a max pool
    - strided convolutions are anti-aliased: an avgpool precedes every convolution with stride > 1
    - the final pooling layer is QKV attention (`AttentionPool2d`) instead of an average pool
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution
        half_width = width // 2

        # stem: three 3x3 convolutions followed by an average pool
        self.conv1 = nn.Conv2d(3, half_width, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(half_width)
        self.conv2 = nn.Conv2d(half_width, half_width, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(half_width)
        self.conv3 = nn.Conv2d(half_width, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(2)
        self.relu = nn.ReLU(inplace=True)

        # residual stages; _inplanes is updated by _make_layer as stages are built
        self._inplanes = width
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        feature_dim = width * 32  # channel count produced by the last stage
        self.attnpool = AttentionPool2d(input_resolution // 32, feature_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        """Build one stage: a first block that may stride, then ``blocks - 1`` stride-1 blocks."""
        stage = [Bottleneck(self._inplanes, planes, stride)]
        self._inplanes = planes * Bottleneck.expansion
        stage.extend(Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        # match the dtype of the weights (they may have been converted to fp16)
        x = x.type(self.conv1.weight.dtype)

        # stem
        stem_layers = ((self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3))
        for conv, bn in stem_layers:
            x = self.relu(bn(conv(x)))
        x = self.avgpool(x)

        # residual stages
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        return self.attnpool(x)
151
+
152
+
153
class LayerNorm(nn.LayerNorm):
    """LayerNorm that computes in float32 and casts the result back to the input dtype (fp16-safe)."""

    def forward(self, x: torch.Tensor):
        input_dtype = x.dtype
        normalized = super().forward(x.type(torch.float32))
        return normalized.type(input_dtype)
160
+
161
+
162
class QuickGELU(nn.Module):
    """Activation computing ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor):
        gate = torch.sigmoid(1.702 * x)
        return x * gate
165
+
166
+
167
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: self-attention and an MLP, each wrapped in a residual connection.

    The attention probabilities and their gradients are handed to
    `set_attn_probs` / `set_attn_grad` through the hooks passed to the
    attention module, and stored on the block.
    """

    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        hidden_dim = d_model * 4
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, hidden_dim)),
            ("gelu", QuickGELU()),
            ("c_proj", nn.Linear(hidden_dim, d_model))
        ]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

        # filled in by the hooks during the forward / backward pass
        self.attn_probs = None
        self.attn_grad = None

    def set_attn_probs(self, attn_probs):
        """Hook target: remember the attention probabilities."""
        self.attn_probs = attn_probs

    def set_attn_grad(self, attn_grad):
        """Hook target: remember the gradient of the attention probabilities."""
        self.attn_grad = attn_grad

    def attention(self, x: torch.Tensor):
        """Run self-attention on ``x`` and return only the attention output."""
        if self.attn_mask is not None:
            # keep the mask on the same dtype/device as the activations
            self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device)
        attended = self.attn(
            x, x, x,
            need_weights=False,
            attn_mask=self.attn_mask,
            attention_probs_forward_hook=self.set_attn_probs,
            attention_probs_backwards_hook=self.set_attn_grad,
        )
        return attended[0]

    def forward(self, x: torch.Tensor):
        after_attention = x + self.attention(self.ln_1(x))
        return after_attention + self.mlp(self.ln_2(after_attention))
199
+
200
+
201
class Transformer(nn.Module):
    """A stack of ``layers`` `ResidualAttentionBlock`s sharing one width, head count and mask."""

    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
        super().__init__()
        self.width = width
        self.layers = layers
        blocks = []
        for _ in range(layers):
            blocks.append(ResidualAttentionBlock(width, heads, attn_mask))
        self.resblocks = nn.Sequential(*blocks)

    def forward(self, x: torch.Tensor):
        out = self.resblocks(x)
        return out
210
+
211
+
212
class VisualTransformer(nn.Module):
    """Patch-based image encoder: patch embedding, class token, transformer, projection of the class token."""

    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        # non-overlapping patches: kernel size equals stride
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        num_tokens = (input_resolution // patch_size) ** 2 + 1  # patches plus the class token
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn(num_tokens, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor):
        patches = self.conv1(x)  # shape = [*, width, grid, grid]
        patches = patches.reshape(patches.shape[0], patches.shape[1], -1)  # shape = [*, width, grid ** 2]
        patches = patches.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

        # broadcast the class embedding to one token per batch element
        class_tokens = self.class_embedding.to(patches.dtype) + torch.zeros(
            patches.shape[0], 1, patches.shape[-1], dtype=patches.dtype, device=patches.device)
        tokens = torch.cat([class_tokens, patches], dim=1)  # shape = [*, grid ** 2 + 1, width]
        tokens = tokens + self.positional_embedding.to(tokens.dtype)
        tokens = self.ln_pre(tokens)

        tokens = tokens.permute(1, 0, 2)  # NLD -> LND
        tokens = self.transformer(tokens)
        tokens = tokens.permute(1, 0, 2)  # LND -> NLD

        # only the class token (position 0) is used as the image representation
        pooled = self.ln_post(tokens[:, 0, :])

        if self.proj is not None:
            pooled = pooled @ self.proj

        return pooled
247
+
248
+
249
class CLIP(nn.Module):
    """Joint image/text model producing scaled cosine-similarity logits between image and text features.

    The image tower is a `ModifiedResNet` when ``vision_layers`` is a tuple or
    list, otherwise a `VisualTransformer`. The text tower is a `Transformer`
    with an additive causal attention mask.
    """

    def __init__(self,
                 embed_dim: int,
                 # vision
                 image_resolution: int,
                 vision_layers: Union[Tuple[int, int, int, int], int],
                 vision_width: int,
                 vision_patch_size: int,
                 # text
                 context_length: int,
                 vocab_size: int,
                 transformer_width: int,
                 transformer_heads: int,
                 transformer_layers: int
                 ):
        super().__init__()

        self.context_length = context_length

        use_resnet = isinstance(vision_layers, (tuple, list))
        if use_resnet:
            vision_heads = vision_width * 32 // 64
            self.visual = ModifiedResNet(
                layers=vision_layers,
                output_dim=embed_dim,
                heads=vision_heads,
                input_resolution=image_resolution,
                width=vision_width
            )
        else:
            vision_heads = vision_width // 64
            self.visual = VisualTransformer(
                input_resolution=image_resolution,
                patch_size=vision_patch_size,
                width=vision_width,
                layers=vision_layers,
                heads=vision_heads,
                output_dim=embed_dim
            )

        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=self.build_attention_mask()
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)

        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.initialize_parameters()

    def initialize_parameters(self):
        """Initialize embeddings, attention-pool projections, bn3 scales and text-transformer weights."""
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)

        if isinstance(self.visual, ModifiedResNet):
            pool = self.visual.attnpool
            if pool is not None:
                std = pool.c_proj.in_features ** -0.5
                for projection in (pool.q_proj, pool.k_proj, pool.v_proj, pool.c_proj):
                    nn.init.normal_(projection.weight, std=std)

            # zero the scale of the last batch norm in every bottleneck block
            for stage in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
                for param_name, param in stage.named_parameters():
                    if param_name.endswith("bn3.weight"):
                        nn.init.zeros_(param)

        width = self.transformer.width
        proj_std = (width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        attn_std = width ** -0.5
        fc_std = (2 * width) ** -0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        if self.text_projection is not None:
            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    def build_attention_mask(self):
        """Return the additive causal mask: -inf strictly above the diagonal, 0 elsewhere."""
        size = self.context_length
        causal = torch.full((size, size), float("-inf"))
        causal.triu_(1)  # zero out the diagonal and everything below it
        return causal

    @property
    def dtype(self):
        """dtype of the first visual convolution's weights, used as the model's compute dtype."""
        return self.visual.conv1.weight.dtype

    def encode_image(self, image):
        """Encode images with the visual tower after casting them to the model dtype."""
        return self.visual(image.type(self.dtype))

    def encode_text(self, text):
        """Encode token ids and project the features at each sequence's argmax token position."""
        embedded = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]

        embedded = embedded + self.positional_embedding.type(self.dtype)
        hidden = embedded.permute(1, 0, 2)  # NLD -> LND
        hidden = self.transformer(hidden)
        hidden = hidden.permute(1, 0, 2)  # LND -> NLD
        hidden = self.ln_final(hidden).type(self.dtype)

        # hidden.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        batch_index = torch.arange(hidden.shape[0])
        eot_index = text.argmax(dim=-1)
        return hidden[batch_index, eot_index] @ self.text_projection

    def forward(self, image, text):
        """Return ``(logits_per_image, logits_per_text)`` for a batch of images and token sequences."""
        image_features = self.encode_image(image)
        text_features = self.encode_text(text)

        # L2-normalize so the dot products below are cosine similarities
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        temperature = self.logit_scale.exp()
        logits_per_image = temperature * image_features @ text_features.t()
        logits_per_text = temperature * text_features @ image_features.t()

        # shape = [global_batch_size, global_batch_size]
        return logits_per_image, logits_per_text
379
+
380
+
381
def convert_weights(model: nn.Module):
    """Cast the weights of conv/linear/attention layers and projection parameters to fp16, in place."""

    attention_attrs = [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]

    def _convert_weights_to_fp16(l):
        # convolution and linear layers: weight and optional bias
        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            l.weight.data = l.weight.data.half()
            if l.bias is not None:
                l.bias.data = l.bias.data.half()

        # attention layers: packed / separate projection weights and biases
        if isinstance(l, MultiheadAttention):
            for attr_name in attention_attrs:
                tensor = getattr(l, attr_name)
                if tensor is None:
                    continue
                tensor.data = tensor.data.half()

        # bare projection parameters stored directly on a module
        for param_name in ["text_projection", "proj"]:
            if not hasattr(l, param_name):
                continue
            projection = getattr(l, param_name)
            if projection is not None:
                projection.data = projection.data.half()

    model.apply(_convert_weights_to_fp16)
403
+
404
+
405
def build_model(state_dict: dict):
    """Rebuild a `CLIP` module whose hyper-parameters are inferred from the shapes in ``state_dict``.

    The presence of ``"visual.proj"`` selects the transformer image tower;
    otherwise the ResNet tower is assumed. The keys ``input_resolution``,
    ``context_length`` and ``vocab_size`` are removed from ``state_dict``
    (in place) before loading. The model is converted to fp16 and returned in
    eval mode.
    """
    is_vit = "visual.proj" in state_dict

    if is_vit:
        patch_weight = state_dict["visual.conv1.weight"]
        vision_width = patch_weight.shape[0]
        vision_layers = sum(
            1 for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight"))
        vision_patch_size = patch_weight.shape[-1]
        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
        image_resolution = vision_patch_size * grid_size
    else:
        # count the distinct block indices of each residual stage
        counts: list = []
        for b in [1, 2, 3, 4]:
            block_ids = set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))
            counts.append(len(block_ids))
        vision_layers = tuple(counts)
        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
        num_positions = state_dict["visual.attnpool.positional_embedding"].shape[0]
        output_width = round((num_positions - 1) ** 0.5)
        vision_patch_size = None
        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
        image_resolution = output_width * 32

    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    transformer_heads = transformer_width // 64
    resblock_ids = set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks"))
    transformer_layers = len(resblock_ids)

    model = CLIP(
        embed_dim,
        image_resolution, vision_layers, vision_width, vision_patch_size,
        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
    )

    # drop the metadata entries before loading
    for meta_key in ["input_resolution", "context_length", "vocab_size"]:
        if meta_key in state_dict:
            del state_dict[meta_key]

    convert_weights(model)
    model.load_state_dict(state_dict)
    return model.eval()
CLIP/clip_explainability/simple_tokenizer.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gzip
2
+ import html
3
+ import os
4
+ from functools import lru_cache
5
+
6
+ import ftfy
7
+ import regex as re
8
+
9
+
10
@lru_cache()
def default_bpe():
    """Return the path of the BPE vocabulary file located next to this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, "bpe_simple_vocab_16e6.txt.gz")
13
+
14
+
15
@lru_cache()
def bytes_to_unicode():
    """
    Build a lookup table from every byte value (0-255) to a distinct unicode character.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with
    the same code point. Every other byte is mapped, in increasing byte order,
    to the code points 256, 257, ... so that no byte maps to one of the
    excluded (e.g. whitespace/control) characters.
    """
    kept = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    byte_values = list(kept)
    code_points = list(kept)
    shifted = 0
    for byte in range(2 ** 8):
        if byte in kept:
            continue
        byte_values.append(byte)
        code_points.append(2 ** 8 + shifted)
        shifted += 1
    return dict(zip(byte_values, map(chr, code_points)))
36
+
37
+
38
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    ``word`` is a sequence of symbols (a tuple of variable-length strings).
    A word with fewer than two symbols, including an empty one, has no pairs
    and yields an empty set instead of raising.
    """
    # zip stops at the shorter argument, so empty and single-symbol words are handled naturally
    return set(zip(word, word[1:]))
48
+
49
+
50
def basic_clean(text):
    """Run ``ftfy.fix_text`` on the text, unescape HTML entities twice and strip surrounding whitespace."""
    repaired = ftfy.fix_text(text)
    once = html.unescape(repaired)
    twice = html.unescape(once)  # second pass handles doubly-escaped entities
    return twice.strip()
54
+
55
+
56
def whitespace_clean(text):
    """Collapse every run of whitespace into a single space and strip both ends."""
    return re.sub(r'\s+', ' ', text).strip()
60
+
61
+
62
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer driven by a gzip-compressed merges file.

    The vocabulary is built from the byte-to-unicode table, the same symbols
    with an end-of-word marker (``</w>``), one entry per merge rule, and the
    two special tokens ``<|startoftext|>`` and ``<|endoftext|>``.
    """

    def __init__(self, bpe_path: str = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Use a context manager so the gzip handle is closed after reading.
        with gzip.open(bpe_path) as bpe_file:
            merges = bpe_file.read().decode("utf-8").split('\n')
        # Skip the first line and keep a fixed number of merge rules.
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        # Lower rank means the merge is applied earlier.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # The special tokens are pre-cached so they are never split.
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        """Apply the merge rules to one token and return its symbols joined by spaces.

        Results are memoized in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        # The last character carries the end-of-word marker.
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # Pick the adjacent pair with the best (lowest) merge rank.
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # ``first`` does not occur again: copy the remainder unchanged.
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lower-case ``text``, split it with ``self.pat`` and return the list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Map the token's UTF-8 bytes to their unicode stand-ins before applying BPE.
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn token ids back into text; end-of-word markers become spaces and invalid UTF-8 is replaced."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text
CLIP/data/country211.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The Country211 Dataset
2
+
3
+ In the paper, we used an image classification dataset called Country211 to evaluate the model's capability on geolocation. To do so, we filtered the YFCC100m dataset for images that have GPS coordinates corresponding to an [ISO-3166 country code](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes) and created a balanced dataset by sampling 150 train images, 50 validation images, and 100 test images for each country.
4
+
5
+ The following command will download an 11GB archive containing the images and extract it into a subdirectory `country211`:
6
+
7
+ ```bash
8
+ wget https://openaipublic.azureedge.net/clip/data/country211.tgz
9
+ tar zxvf country211.tgz
10
+ ```
11
+
12
+ These images are a subset of the YFCC100m dataset. Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders. For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/).
CLIP/data/prompts.md ADDED
@@ -0,0 +1,3401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prompts for Image Classification
2
+
3
+ Below are the class names and templates that are used for collecting the zero-shot classification scores in the paper. Each dataset has two lists, `classes` and `templates`, where the string `{}` in each template is to be replaced with the corresponding class name. For the Facial Emotion Recognition 2013 dataset specifically, we used multiple class names for certain classes.
4
+
5
+ This file contains prompt data for 26 of the 27 datasets shown in Table 9 of the paper; the text prompts for ImageNet (as well as other [ImageNet Testbed](https://modestyachts.github.io/imagenet-testbed/) datasets in Figure 13) can be found in [this notebook](https://github.com/openai/CLIP/blob/main/notebooks/Prompt_Engineering_for_ImageNet.ipynb), which also shows how to ensemble predictions from multiple prompts using these templates.
6
+
7
+ If you are viewing this document on GitHub, use the table of contents icon at the upper left to browse the datasets.
8
+
9
+
10
+ ## Birdsnap
11
+
12
+ ```python
13
+ classes = [
14
+ 'Acadian Flycatcher',
15
+ 'Acorn Woodpecker',
16
+ 'Alder Flycatcher',
17
+ 'Allens Hummingbird',
18
+ 'Altamira Oriole',
19
+ 'American Avocet',
20
+ 'American Bittern',
21
+ 'American Black Duck',
22
+ 'American Coot',
23
+ 'American Crow',
24
+ 'American Dipper',
25
+ 'American Golden Plover',
26
+ 'American Goldfinch',
27
+ 'American Kestrel',
28
+ 'American Oystercatcher',
29
+ 'American Pipit',
30
+ 'American Redstart',
31
+ 'American Robin',
32
+ 'American Three toed Woodpecker',
33
+ 'American Tree Sparrow',
34
+ 'American White Pelican',
35
+ 'American Wigeon',
36
+ 'American Woodcock',
37
+ 'Anhinga',
38
+ 'Annas Hummingbird',
39
+ 'Arctic Tern',
40
+ 'Ash throated Flycatcher',
41
+ 'Audubons Oriole',
42
+ 'Bairds Sandpiper',
43
+ 'Bald Eagle',
44
+ 'Baltimore Oriole',
45
+ 'Band tailed Pigeon',
46
+ 'Barn Swallow',
47
+ 'Barred Owl',
48
+ 'Barrows Goldeneye',
49
+ 'Bay breasted Warbler',
50
+ 'Bells Vireo',
51
+ 'Belted Kingfisher',
52
+ 'Bewicks Wren',
53
+ 'Black Guillemot',
54
+ 'Black Oystercatcher',
55
+ 'Black Phoebe',
56
+ 'Black Rosy Finch',
57
+ 'Black Scoter',
58
+ 'Black Skimmer',
59
+ 'Black Tern',
60
+ 'Black Turnstone',
61
+ 'Black Vulture',
62
+ 'Black and white Warbler',
63
+ 'Black backed Woodpecker',
64
+ 'Black bellied Plover',
65
+ 'Black billed Cuckoo',
66
+ 'Black billed Magpie',
67
+ 'Black capped Chickadee',
68
+ 'Black chinned Hummingbird',
69
+ 'Black chinned Sparrow',
70
+ 'Black crested Titmouse',
71
+ 'Black crowned Night Heron',
72
+ 'Black headed Grosbeak',
73
+ 'Black legged Kittiwake',
74
+ 'Black necked Stilt',
75
+ 'Black throated Blue Warbler',
76
+ 'Black throated Gray Warbler',
77
+ 'Black throated Green Warbler',
78
+ 'Black throated Sparrow',
79
+ 'Blackburnian Warbler',
80
+ 'Blackpoll Warbler',
81
+ 'Blue Grosbeak',
82
+ 'Blue Jay',
83
+ 'Blue gray Gnatcatcher',
84
+ 'Blue headed Vireo',
85
+ 'Blue winged Teal',
86
+ 'Blue winged Warbler',
87
+ 'Boat tailed Grackle',
88
+ 'Bobolink',
89
+ 'Bohemian Waxwing',
90
+ 'Bonapartes Gull',
91
+ 'Boreal Chickadee',
92
+ 'Brandts Cormorant',
93
+ 'Brant',
94
+ 'Brewers Blackbird',
95
+ 'Brewers Sparrow',
96
+ 'Bridled Titmouse',
97
+ 'Broad billed Hummingbird',
98
+ 'Broad tailed Hummingbird',
99
+ 'Broad winged Hawk',
100
+ 'Bronzed Cowbird',
101
+ 'Brown Creeper',
102
+ 'Brown Pelican',
103
+ 'Brown Thrasher',
104
+ 'Brown capped Rosy Finch',
105
+ 'Brown crested Flycatcher',
106
+ 'Brown headed Cowbird',
107
+ 'Brown headed Nuthatch',
108
+ 'Bufflehead',
109
+ 'Bullocks Oriole',
110
+ 'Burrowing Owl',
111
+ 'Bushtit',
112
+ 'Cackling Goose',
113
+ 'Cactus Wren',
114
+ 'California Gull',
115
+ 'California Quail',
116
+ 'California Thrasher',
117
+ 'California Towhee',
118
+ 'Calliope Hummingbird',
119
+ 'Canada Goose',
120
+ 'Canada Warbler',
121
+ 'Canvasback',
122
+ 'Canyon Towhee',
123
+ 'Canyon Wren',
124
+ 'Cape May Warbler',
125
+ 'Carolina Chickadee',
126
+ 'Carolina Wren',
127
+ 'Caspian Tern',
128
+ 'Cassins Finch',
129
+ 'Cassins Kingbird',
130
+ 'Cassins Sparrow',
131
+ 'Cassins Vireo',
132
+ 'Cattle Egret',
133
+ 'Cave Swallow',
134
+ 'Cedar Waxwing',
135
+ 'Cerulean Warbler',
136
+ 'Chestnut backed Chickadee',
137
+ 'Chestnut collared Longspur',
138
+ 'Chestnut sided Warbler',
139
+ 'Chihuahuan Raven',
140
+ 'Chimney Swift',
141
+ 'Chipping Sparrow',
142
+ 'Cinnamon Teal',
143
+ 'Clapper Rail',
144
+ 'Clarks Grebe',
145
+ 'Clarks Nutcracker',
146
+ 'Clay colored Sparrow',
147
+ 'Cliff Swallow',
148
+ 'Common Black Hawk',
149
+ 'Common Eider',
150
+ 'Common Gallinule',
151
+ 'Common Goldeneye',
152
+ 'Common Grackle',
153
+ 'Common Ground Dove',
154
+ 'Common Loon',
155
+ 'Common Merganser',
156
+ 'Common Murre',
157
+ 'Common Nighthawk',
158
+ 'Common Raven',
159
+ 'Common Redpoll',
160
+ 'Common Tern',
161
+ 'Common Yellowthroat',
162
+ 'Connecticut Warbler',
163
+ 'Coopers Hawk',
164
+ 'Cordilleran Flycatcher',
165
+ 'Costas Hummingbird',
166
+ 'Couchs Kingbird',
167
+ 'Crested Caracara',
168
+ 'Curve billed Thrasher',
169
+ 'Dark eyed Junco',
170
+ 'Dickcissel',
171
+ 'Double crested Cormorant',
172
+ 'Downy Woodpecker',
173
+ 'Dunlin',
174
+ 'Dusky Flycatcher',
175
+ 'Dusky Grouse',
176
+ 'Eared Grebe',
177
+ 'Eastern Bluebird',
178
+ 'Eastern Kingbird',
179
+ 'Eastern Meadowlark',
180
+ 'Eastern Phoebe',
181
+ 'Eastern Screech Owl',
182
+ 'Eastern Towhee',
183
+ 'Eastern Wood Pewee',
184
+ 'Elegant Trogon',
185
+ 'Elf Owl',
186
+ 'Eurasian Collared Dove',
187
+ 'Eurasian Wigeon',
188
+ 'European Starling',
189
+ 'Evening Grosbeak',
190
+ 'Ferruginous Hawk',
191
+ 'Ferruginous Pygmy Owl',
192
+ 'Field Sparrow',
193
+ 'Fish Crow',
194
+ 'Florida Scrub Jay',
195
+ 'Forsters Tern',
196
+ 'Fox Sparrow',
197
+ 'Franklins Gull',
198
+ 'Fulvous Whistling Duck',
199
+ 'Gadwall',
200
+ 'Gambels Quail',
201
+ 'Gila Woodpecker',
202
+ 'Glaucous Gull',
203
+ 'Glaucous winged Gull',
204
+ 'Glossy Ibis',
205
+ 'Golden Eagle',
206
+ 'Golden crowned Kinglet',
207
+ 'Golden crowned Sparrow',
208
+ 'Golden fronted Woodpecker',
209
+ 'Golden winged Warbler',
210
+ 'Grasshopper Sparrow',
211
+ 'Gray Catbird',
212
+ 'Gray Flycatcher',
213
+ 'Gray Jay',
214
+ 'Gray Kingbird',
215
+ 'Gray cheeked Thrush',
216
+ 'Gray crowned Rosy Finch',
217
+ 'Great Black backed Gull',
218
+ 'Great Blue Heron',
219
+ 'Great Cormorant',
220
+ 'Great Crested Flycatcher',
221
+ 'Great Egret',
222
+ 'Great Gray Owl',
223
+ 'Great Horned Owl',
224
+ 'Great Kiskadee',
225
+ 'Great tailed Grackle',
226
+ 'Greater Prairie Chicken',
227
+ 'Greater Roadrunner',
228
+ 'Greater Sage Grouse',
229
+ 'Greater Scaup',
230
+ 'Greater White fronted Goose',
231
+ 'Greater Yellowlegs',
232
+ 'Green Jay',
233
+ 'Green tailed Towhee',
234
+ 'Green winged Teal',
235
+ 'Groove billed Ani',
236
+ 'Gull billed Tern',
237
+ 'Hairy Woodpecker',
238
+ 'Hammonds Flycatcher',
239
+ 'Harlequin Duck',
240
+ 'Harriss Hawk',
241
+ 'Harriss Sparrow',
242
+ 'Heermanns Gull',
243
+ 'Henslows Sparrow',
244
+ 'Hepatic Tanager',
245
+ 'Hermit Thrush',
246
+ 'Herring Gull',
247
+ 'Hoary Redpoll',
248
+ 'Hooded Merganser',
249
+ 'Hooded Oriole',
250
+ 'Hooded Warbler',
251
+ 'Horned Grebe',
252
+ 'Horned Lark',
253
+ 'House Finch',
254
+ 'House Sparrow',
255
+ 'House Wren',
256
+ 'Huttons Vireo',
257
+ 'Iceland Gull',
258
+ 'Inca Dove',
259
+ 'Indigo Bunting',
260
+ 'Killdeer',
261
+ 'King Rail',
262
+ 'Ladder backed Woodpecker',
263
+ 'Lapland Longspur',
264
+ 'Lark Bunting',
265
+ 'Lark Sparrow',
266
+ 'Laughing Gull',
267
+ 'Lazuli Bunting',
268
+ 'Le Contes Sparrow',
269
+ 'Least Bittern',
270
+ 'Least Flycatcher',
271
+ 'Least Grebe',
272
+ 'Least Sandpiper',
273
+ 'Least Tern',
274
+ 'Lesser Goldfinch',
275
+ 'Lesser Nighthawk',
276
+ 'Lesser Scaup',
277
+ 'Lesser Yellowlegs',
278
+ 'Lewiss Woodpecker',
279
+ 'Limpkin',
280
+ 'Lincolns Sparrow',
281
+ 'Little Blue Heron',
282
+ 'Loggerhead Shrike',
283
+ 'Long billed Curlew',
284
+ 'Long billed Dowitcher',
285
+ 'Long billed Thrasher',
286
+ 'Long eared Owl',
287
+ 'Long tailed Duck',
288
+ 'Louisiana Waterthrush',
289
+ 'Magnificent Frigatebird',
290
+ 'Magnolia Warbler',
291
+ 'Mallard',
292
+ 'Marbled Godwit',
293
+ 'Marsh Wren',
294
+ 'Merlin',
295
+ 'Mew Gull',
296
+ 'Mexican Jay',
297
+ 'Mississippi Kite',
298
+ 'Monk Parakeet',
299
+ 'Mottled Duck',
300
+ 'Mountain Bluebird',
301
+ 'Mountain Chickadee',
302
+ 'Mountain Plover',
303
+ 'Mourning Dove',
304
+ 'Mourning Warbler',
305
+ 'Muscovy Duck',
306
+ 'Mute Swan',
307
+ 'Nashville Warbler',
308
+ 'Nelsons Sparrow',
309
+ 'Neotropic Cormorant',
310
+ 'Northern Bobwhite',
311
+ 'Northern Cardinal',
312
+ 'Northern Flicker',
313
+ 'Northern Gannet',
314
+ 'Northern Goshawk',
315
+ 'Northern Harrier',
316
+ 'Northern Hawk Owl',
317
+ 'Northern Mockingbird',
318
+ 'Northern Parula',
319
+ 'Northern Pintail',
320
+ 'Northern Rough winged Swallow',
321
+ 'Northern Saw whet Owl',
322
+ 'Northern Shrike',
323
+ 'Northern Waterthrush',
324
+ 'Nuttalls Woodpecker',
325
+ 'Oak Titmouse',
326
+ 'Olive Sparrow',
327
+ 'Olive sided Flycatcher',
328
+ 'Orange crowned Warbler',
329
+ 'Orchard Oriole',
330
+ 'Osprey',
331
+ 'Ovenbird',
332
+ 'Pacific Golden Plover',
333
+ 'Pacific Loon',
334
+ 'Pacific Wren',
335
+ 'Pacific slope Flycatcher',
336
+ 'Painted Bunting',
337
+ 'Painted Redstart',
338
+ 'Palm Warbler',
339
+ 'Pectoral Sandpiper',
340
+ 'Peregrine Falcon',
341
+ 'Phainopepla',
342
+ 'Philadelphia Vireo',
343
+ 'Pied billed Grebe',
344
+ 'Pigeon Guillemot',
345
+ 'Pileated Woodpecker',
346
+ 'Pine Grosbeak',
347
+ 'Pine Siskin',
348
+ 'Pine Warbler',
349
+ 'Piping Plover',
350
+ 'Plumbeous Vireo',
351
+ 'Prairie Falcon',
352
+ 'Prairie Warbler',
353
+ 'Prothonotary Warbler',
354
+ 'Purple Finch',
355
+ 'Purple Gallinule',
356
+ 'Purple Martin',
357
+ 'Purple Sandpiper',
358
+ 'Pygmy Nuthatch',
359
+ 'Pyrrhuloxia',
360
+ 'Red Crossbill',
361
+ 'Red Knot',
362
+ 'Red Phalarope',
363
+ 'Red bellied Woodpecker',
364
+ 'Red breasted Merganser',
365
+ 'Red breasted Nuthatch',
366
+ 'Red breasted Sapsucker',
367
+ 'Red cockaded Woodpecker',
368
+ 'Red eyed Vireo',
369
+ 'Red headed Woodpecker',
370
+ 'Red naped Sapsucker',
371
+ 'Red necked Grebe',
372
+ 'Red necked Phalarope',
373
+ 'Red shouldered Hawk',
374
+ 'Red tailed Hawk',
375
+ 'Red throated Loon',
376
+ 'Red winged Blackbird',
377
+ 'Reddish Egret',
378
+ 'Redhead',
379
+ 'Ring billed Gull',
380
+ 'Ring necked Duck',
381
+ 'Ring necked Pheasant',
382
+ 'Rock Pigeon',
383
+ 'Rock Ptarmigan',
384
+ 'Rock Sandpiper',
385
+ 'Rock Wren',
386
+ 'Rose breasted Grosbeak',
387
+ 'Roseate Tern',
388
+ 'Rosss Goose',
389
+ 'Rough legged Hawk',
390
+ 'Royal Tern',
391
+ 'Ruby crowned Kinglet',
392
+ 'Ruby throated Hummingbird',
393
+ 'Ruddy Duck',
394
+ 'Ruddy Turnstone',
395
+ 'Ruffed Grouse',
396
+ 'Rufous Hummingbird',
397
+ 'Rufous crowned Sparrow',
398
+ 'Rusty Blackbird',
399
+ 'Sage Thrasher',
400
+ 'Saltmarsh Sparrow',
401
+ 'Sanderling',
402
+ 'Sandhill Crane',
403
+ 'Sandwich Tern',
404
+ 'Says Phoebe',
405
+ 'Scaled Quail',
406
+ 'Scarlet Tanager',
407
+ 'Scissor tailed Flycatcher',
408
+ 'Scotts Oriole',
409
+ 'Seaside Sparrow',
410
+ 'Sedge Wren',
411
+ 'Semipalmated Plover',
412
+ 'Semipalmated Sandpiper',
413
+ 'Sharp shinned Hawk',
414
+ 'Sharp tailed Grouse',
415
+ 'Short billed Dowitcher',
416
+ 'Short eared Owl',
417
+ 'Snail Kite',
418
+ 'Snow Bunting',
419
+ 'Snow Goose',
420
+ 'Snowy Egret',
421
+ 'Snowy Owl',
422
+ 'Snowy Plover',
423
+ 'Solitary Sandpiper',
424
+ 'Song Sparrow',
425
+ 'Sooty Grouse',
426
+ 'Sora',
427
+ 'Spotted Owl',
428
+ 'Spotted Sandpiper',
429
+ 'Spotted Towhee',
430
+ 'Spruce Grouse',
431
+ 'Stellers Jay',
432
+ 'Stilt Sandpiper',
433
+ 'Summer Tanager',
434
+ 'Surf Scoter',
435
+ 'Surfbird',
436
+ 'Swainsons Hawk',
437
+ 'Swainsons Thrush',
438
+ 'Swallow tailed Kite',
439
+ 'Swamp Sparrow',
440
+ 'Tennessee Warbler',
441
+ 'Thayers Gull',
442
+ 'Townsends Solitaire',
443
+ 'Townsends Warbler',
444
+ 'Tree Swallow',
445
+ 'Tricolored Heron',
446
+ 'Tropical Kingbird',
447
+ 'Trumpeter Swan',
448
+ 'Tufted Titmouse',
449
+ 'Tundra Swan',
450
+ 'Turkey Vulture',
451
+ 'Upland Sandpiper',
452
+ 'Varied Thrush',
453
+ 'Veery',
454
+ 'Verdin',
455
+ 'Vermilion Flycatcher',
456
+ 'Vesper Sparrow',
457
+ 'Violet green Swallow',
458
+ 'Virginia Rail',
459
+ 'Wandering Tattler',
460
+ 'Warbling Vireo',
461
+ 'Western Bluebird',
462
+ 'Western Grebe',
463
+ 'Western Gull',
464
+ 'Western Kingbird',
465
+ 'Western Meadowlark',
466
+ 'Western Sandpiper',
467
+ 'Western Screech Owl',
468
+ 'Western Scrub Jay',
469
+ 'Western Tanager',
470
+ 'Western Wood Pewee',
471
+ 'Whimbrel',
472
+ 'White Ibis',
473
+ 'White breasted Nuthatch',
474
+ 'White crowned Sparrow',
475
+ 'White eyed Vireo',
476
+ 'White faced Ibis',
477
+ 'White headed Woodpecker',
478
+ 'White rumped Sandpiper',
479
+ 'White tailed Hawk',
480
+ 'White tailed Kite',
481
+ 'White tailed Ptarmigan',
482
+ 'White throated Sparrow',
483
+ 'White throated Swift',
484
+ 'White winged Crossbill',
485
+ 'White winged Dove',
486
+ 'White winged Scoter',
487
+ 'Wild Turkey',
488
+ 'Willet',
489
+ 'Williamsons Sapsucker',
490
+ 'Willow Flycatcher',
491
+ 'Willow Ptarmigan',
492
+ 'Wilsons Phalarope',
493
+ 'Wilsons Plover',
494
+ 'Wilsons Snipe',
495
+ 'Wilsons Warbler',
496
+ 'Winter Wren',
497
+ 'Wood Stork',
498
+ 'Wood Thrush',
499
+ 'Worm eating Warbler',
500
+ 'Wrentit',
501
+ 'Yellow Warbler',
502
+ 'Yellow bellied Flycatcher',
503
+ 'Yellow bellied Sapsucker',
504
+ 'Yellow billed Cuckoo',
505
+ 'Yellow billed Magpie',
506
+ 'Yellow breasted Chat',
507
+ 'Yellow crowned Night Heron',
508
+ 'Yellow eyed Junco',
509
+ 'Yellow headed Blackbird',
510
+ 'Yellow rumped Warbler',
511
+ 'Yellow throated Vireo',
512
+ 'Yellow throated Warbler',
513
+ 'Zone tailed Hawk',
514
+ ]
515
+
516
+ templates = [
517
+ 'a photo of a {}, a type of bird.',
518
+ ]
519
+ ```
520
+
521
+
522
+
523
+ ## CIFAR10
524
+
525
+ ```python
526
+ classes = [
527
+ 'airplane',
528
+ 'automobile',
529
+ 'bird',
530
+ 'cat',
531
+ 'deer',
532
+ 'dog',
533
+ 'frog',
534
+ 'horse',
535
+ 'ship',
536
+ 'truck',
537
+ ]
538
+
539
+ templates = [
540
+ 'a photo of a {}.',
541
+ 'a blurry photo of a {}.',
542
+ 'a black and white photo of a {}.',
543
+ 'a low contrast photo of a {}.',
544
+ 'a high contrast photo of a {}.',
545
+ 'a bad photo of a {}.',
546
+ 'a good photo of a {}.',
547
+ 'a photo of a small {}.',
548
+ 'a photo of a big {}.',
549
+ 'a photo of the {}.',
550
+ 'a blurry photo of the {}.',
551
+ 'a black and white photo of the {}.',
552
+ 'a low contrast photo of the {}.',
553
+ 'a high contrast photo of the {}.',
554
+ 'a bad photo of the {}.',
555
+ 'a good photo of the {}.',
556
+ 'a photo of the small {}.',
557
+ 'a photo of the big {}.',
558
+ ]
559
+ ```
560
+
561
+
562
+
563
+ ## CIFAR100
564
+
565
+ ```python
566
+ classes = [
567
+ 'apple',
568
+ 'aquarium fish',
569
+ 'baby',
570
+ 'bear',
571
+ 'beaver',
572
+ 'bed',
573
+ 'bee',
574
+ 'beetle',
575
+ 'bicycle',
576
+ 'bottle',
577
+ 'bowl',
578
+ 'boy',
579
+ 'bridge',
580
+ 'bus',
581
+ 'butterfly',
582
+ 'camel',
583
+ 'can',
584
+ 'castle',
585
+ 'caterpillar',
586
+ 'cattle',
587
+ 'chair',
588
+ 'chimpanzee',
589
+ 'clock',
590
+ 'cloud',
591
+ 'cockroach',
592
+ 'couch',
593
+ 'crab',
594
+ 'crocodile',
595
+ 'cup',
596
+ 'dinosaur',
597
+ 'dolphin',
598
+ 'elephant',
599
+ 'flatfish',
600
+ 'forest',
601
+ 'fox',
602
+ 'girl',
603
+ 'hamster',
604
+ 'house',
605
+ 'kangaroo',
606
+ 'keyboard',
607
+ 'lamp',
608
+ 'lawn mower',
609
+ 'leopard',
610
+ 'lion',
611
+ 'lizard',
612
+ 'lobster',
613
+ 'man',
614
+ 'maple tree',
615
+ 'motorcycle',
616
+ 'mountain',
617
+ 'mouse',
618
+ 'mushroom',
619
+ 'oak tree',
620
+ 'orange',
621
+ 'orchid',
622
+ 'otter',
623
+ 'palm tree',
624
+ 'pear',
625
+ 'pickup truck',
626
+ 'pine tree',
627
+ 'plain',
628
+ 'plate',
629
+ 'poppy',
630
+ 'porcupine',
631
+ 'possum',
632
+ 'rabbit',
633
+ 'raccoon',
634
+ 'ray',
635
+ 'road',
636
+ 'rocket',
637
+ 'rose',
638
+ 'sea',
639
+ 'seal',
640
+ 'shark',
641
+ 'shrew',
642
+ 'skunk',
643
+ 'skyscraper',
644
+ 'snail',
645
+ 'snake',
646
+ 'spider',
647
+ 'squirrel',
648
+ 'streetcar',
649
+ 'sunflower',
650
+ 'sweet pepper',
651
+ 'table',
652
+ 'tank',
653
+ 'telephone',
654
+ 'television',
655
+ 'tiger',
656
+ 'tractor',
657
+ 'train',
658
+ 'trout',
659
+ 'tulip',
660
+ 'turtle',
661
+ 'wardrobe',
662
+ 'whale',
663
+ 'willow tree',
664
+ 'wolf',
665
+ 'woman',
666
+ 'worm',
667
+ ]
668
+
669
+ templates = [
670
+ 'a photo of a {}.',
671
+ 'a blurry photo of a {}.',
672
+ 'a black and white photo of a {}.',
673
+ 'a low contrast photo of a {}.',
674
+ 'a high contrast photo of a {}.',
675
+ 'a bad photo of a {}.',
676
+ 'a good photo of a {}.',
677
+ 'a photo of a small {}.',
678
+ 'a photo of a big {}.',
679
+ 'a photo of the {}.',
680
+ 'a blurry photo of the {}.',
681
+ 'a black and white photo of the {}.',
682
+ 'a low contrast photo of the {}.',
683
+ 'a high contrast photo of the {}.',
684
+ 'a bad photo of the {}.',
685
+ 'a good photo of the {}.',
686
+ 'a photo of the small {}.',
687
+ 'a photo of the big {}.',
688
+ ]
689
+ ```
690
+
691
+
692
+
693
+ ## CLEVRCounts
694
+
695
+ ```python
696
+ classes = [
697
+ '10',
698
+ '3',
699
+ '4',
700
+ '5',
701
+ '6',
702
+ '7',
703
+ '8',
704
+ '9',
705
+ ]
706
+
707
+ templates = [
708
+ 'a photo of {} objects.',
709
+ ]
710
+ ```
711
+
712
+
713
+
714
+ ## Caltech101
715
+
716
+ ```python
717
+ classes = [
718
+ 'background',
719
+ 'off-center face',
720
+ 'centered face',
721
+ 'leopard',
722
+ 'motorbike',
723
+ 'accordion',
724
+ 'airplane',
725
+ 'anchor',
726
+ 'ant',
727
+ 'barrel',
728
+ 'bass',
729
+ 'beaver',
730
+ 'binocular',
731
+ 'bonsai',
732
+ 'brain',
733
+ 'brontosaurus',
734
+ 'buddha',
735
+ 'butterfly',
736
+ 'camera',
737
+ 'cannon',
738
+ 'side of a car',
739
+ 'ceiling fan',
740
+ 'cellphone',
741
+ 'chair',
742
+ 'chandelier',
743
+ 'body of a cougar cat',
744
+ 'face of a cougar cat',
745
+ 'crab',
746
+ 'crayfish',
747
+ 'crocodile',
748
+ 'head of a crocodile',
749
+ 'cup',
750
+ 'dalmatian',
751
+ 'dollar bill',
752
+ 'dolphin',
753
+ 'dragonfly',
754
+ 'electric guitar',
755
+ 'elephant',
756
+ 'emu',
757
+ 'euphonium',
758
+ 'ewer',
759
+ 'ferry',
760
+ 'flamingo',
761
+ 'head of a flamingo',
762
+ 'garfield',
763
+ 'gerenuk',
764
+ 'gramophone',
765
+ 'grand piano',
766
+ 'hawksbill',
767
+ 'headphone',
768
+ 'hedgehog',
769
+ 'helicopter',
770
+ 'ibis',
771
+ 'inline skate',
772
+ 'joshua tree',
773
+ 'kangaroo',
774
+ 'ketch',
775
+ 'lamp',
776
+ 'laptop',
777
+ 'llama',
778
+ 'lobster',
779
+ 'lotus',
780
+ 'mandolin',
781
+ 'mayfly',
782
+ 'menorah',
783
+ 'metronome',
784
+ 'minaret',
785
+ 'nautilus',
786
+ 'octopus',
787
+ 'okapi',
788
+ 'pagoda',
789
+ 'panda',
790
+ 'pigeon',
791
+ 'pizza',
792
+ 'platypus',
793
+ 'pyramid',
794
+ 'revolver',
795
+ 'rhino',
796
+ 'rooster',
797
+ 'saxophone',
798
+ 'schooner',
799
+ 'scissors',
800
+ 'scorpion',
801
+ 'sea horse',
802
+ 'snoopy (cartoon beagle)',
803
+ 'soccer ball',
804
+ 'stapler',
805
+ 'starfish',
806
+ 'stegosaurus',
807
+ 'stop sign',
808
+ 'strawberry',
809
+ 'sunflower',
810
+ 'tick',
811
+ 'trilobite',
812
+ 'umbrella',
813
+ 'watch',
814
+ 'water lilly',
815
+ 'wheelchair',
816
+ 'wild cat',
817
+ 'windsor chair',
818
+ 'wrench',
819
+ 'yin and yang symbol',
820
+ ]
821
+
822
+ templates = [
823
+ 'a photo of a {}.',
824
+ 'a painting of a {}.',
825
+ 'a plastic {}.',
826
+ 'a sculpture of a {}.',
827
+ 'a sketch of a {}.',
828
+ 'a tattoo of a {}.',
829
+ 'a toy {}.',
830
+ 'a rendition of a {}.',
831
+ 'a embroidered {}.',
832
+ 'a cartoon {}.',
833
+ 'a {} in a video game.',
834
+ 'a plushie {}.',
835
+ 'a origami {}.',
836
+ 'art of a {}.',
837
+ 'graffiti of a {}.',
838
+ 'a drawing of a {}.',
839
+ 'a doodle of a {}.',
840
+ 'a photo of the {}.',
841
+ 'a painting of the {}.',
842
+ 'the plastic {}.',
843
+ 'a sculpture of the {}.',
844
+ 'a sketch of the {}.',
845
+ 'a tattoo of the {}.',
846
+ 'the toy {}.',
847
+ 'a rendition of the {}.',
848
+ 'the embroidered {}.',
849
+ 'the cartoon {}.',
850
+ 'the {} in a video game.',
851
+ 'the plushie {}.',
852
+ 'the origami {}.',
853
+ 'art of the {}.',
854
+ 'graffiti of the {}.',
855
+ 'a drawing of the {}.',
856
+ 'a doodle of the {}.',
857
+ ]
858
+ ```
859
+
860
+
861
+
862
+ ## Country211
863
+
864
+ ```python
865
+ classes = [
866
+ 'Andorra',
867
+ 'United Arab Emirates',
868
+ 'Afghanistan',
869
+ 'Antigua and Barbuda',
870
+ 'Anguilla',
871
+ 'Albania',
872
+ 'Armenia',
873
+ 'Angola',
874
+ 'Antarctica',
875
+ 'Argentina',
876
+ 'Austria',
877
+ 'Australia',
878
+ 'Aruba',
879
+ 'Aland Islands',
880
+ 'Azerbaijan',
881
+ 'Bosnia and Herzegovina',
882
+ 'Barbados',
883
+ 'Bangladesh',
884
+ 'Belgium',
885
+ 'Burkina Faso',
886
+ 'Bulgaria',
887
+ 'Bahrain',
888
+ 'Benin',
889
+ 'Bermuda',
890
+ 'Brunei Darussalam',
891
+ 'Bolivia',
892
+ 'Bonaire, Saint Eustatius and Saba',
893
+ 'Brazil',
894
+ 'Bahamas',
895
+ 'Bhutan',
896
+ 'Botswana',
897
+ 'Belarus',
898
+ 'Belize',
899
+ 'Canada',
900
+ 'DR Congo',
901
+ 'Central African Republic',
902
+ 'Switzerland',
903
+ "Cote d'Ivoire",
904
+ 'Cook Islands',
905
+ 'Chile',
906
+ 'Cameroon',
907
+ 'China',
908
+ 'Colombia',
909
+ 'Costa Rica',
910
+ 'Cuba',
911
+ 'Cabo Verde',
912
+ 'Curacao',
913
+ 'Cyprus',
914
+ 'Czech Republic',
915
+ 'Germany',
916
+ 'Denmark',
917
+ 'Dominica',
918
+ 'Dominican Republic',
919
+ 'Algeria',
920
+ 'Ecuador',
921
+ 'Estonia',
922
+ 'Egypt',
923
+ 'Spain',
924
+ 'Ethiopia',
925
+ 'Finland',
926
+ 'Fiji',
927
+ 'Falkland Islands',
928
+ 'Faeroe Islands',
929
+ 'France',
930
+ 'Gabon',
931
+ 'United Kingdom',
932
+ 'Grenada',
933
+ 'Georgia',
934
+ 'French Guiana',
935
+ 'Guernsey',
936
+ 'Ghana',
937
+ 'Gibraltar',
938
+ 'Greenland',
939
+ 'Gambia',
940
+ 'Guadeloupe',
941
+ 'Greece',
942
+ 'South Georgia and South Sandwich Is.',
943
+ 'Guatemala',
944
+ 'Guam',
945
+ 'Guyana',
946
+ 'Hong Kong',
947
+ 'Honduras',
948
+ 'Croatia',
949
+ 'Haiti',
950
+ 'Hungary',
951
+ 'Indonesia',
952
+ 'Ireland',
953
+ 'Israel',
954
+ 'Isle of Man',
955
+ 'India',
956
+ 'Iraq',
957
+ 'Iran',
958
+ 'Iceland',
959
+ 'Italy',
960
+ 'Jersey',
961
+ 'Jamaica',
962
+ 'Jordan',
963
+ 'Japan',
964
+ 'Kenya',
965
+ 'Kyrgyz Republic',
966
+ 'Cambodia',
967
+ 'St. Kitts and Nevis',
968
+ 'North Korea',
969
+ 'South Korea',
970
+ 'Kuwait',
971
+ 'Cayman Islands',
972
+ 'Kazakhstan',
973
+ 'Laos',
974
+ 'Lebanon',
975
+ 'St. Lucia',
976
+ 'Liechtenstein',
977
+ 'Sri Lanka',
978
+ 'Liberia',
979
+ 'Lithuania',
980
+ 'Luxembourg',
981
+ 'Latvia',
982
+ 'Libya',
983
+ 'Morocco',
984
+ 'Monaco',
985
+ 'Moldova',
986
+ 'Montenegro',
987
+ 'Saint-Martin',
988
+ 'Madagascar',
989
+ 'Macedonia',
990
+ 'Mali',
991
+ 'Myanmar',
992
+ 'Mongolia',
993
+ 'Macau',
994
+ 'Martinique',
995
+ 'Mauritania',
996
+ 'Malta',
997
+ 'Mauritius',
998
+ 'Maldives',
999
+ 'Malawi',
1000
+ 'Mexico',
1001
+ 'Malaysia',
1002
+ 'Mozambique',
1003
+ 'Namibia',
1004
+ 'New Caledonia',
1005
+ 'Nigeria',
1006
+ 'Nicaragua',
1007
+ 'Netherlands',
1008
+ 'Norway',
1009
+ 'Nepal',
1010
+ 'New Zealand',
1011
+ 'Oman',
1012
+ 'Panama',
1013
+ 'Peru',
1014
+ 'French Polynesia',
1015
+ 'Papua New Guinea',
1016
+ 'Philippines',
1017
+ 'Pakistan',
1018
+ 'Poland',
1019
+ 'Puerto Rico',
1020
+ 'Palestine',
1021
+ 'Portugal',
1022
+ 'Palau',
1023
+ 'Paraguay',
1024
+ 'Qatar',
1025
+ 'Reunion',
1026
+ 'Romania',
1027
+ 'Serbia',
1028
+ 'Russia',
1029
+ 'Rwanda',
1030
+ 'Saudi Arabia',
1031
+ 'Solomon Islands',
1032
+ 'Seychelles',
1033
+ 'Sudan',
1034
+ 'Sweden',
1035
+ 'Singapore',
1036
+ 'St. Helena',
1037
+ 'Slovenia',
1038
+ 'Svalbard and Jan Mayen Islands',
1039
+ 'Slovakia',
1040
+ 'Sierra Leone',
1041
+ 'San Marino',
1042
+ 'Senegal',
1043
+ 'Somalia',
1044
+ 'South Sudan',
1045
+ 'El Salvador',
1046
+ 'Sint Maarten',
1047
+ 'Syria',
1048
+ 'Eswatini',
1049
+ 'Togo',
1050
+ 'Thailand',
1051
+ 'Tajikistan',
1052
+ 'Timor-Leste',
1053
+ 'Turkmenistan',
1054
+ 'Tunisia',
1055
+ 'Tonga',
1056
+ 'Turkey',
1057
+ 'Trinidad and Tobago',
1058
+ 'Taiwan',
1059
+ 'Tanzania',
1060
+ 'Ukraine',
1061
+ 'Uganda',
1062
+ 'United States',
1063
+ 'Uruguay',
1064
+ 'Uzbekistan',
1065
+ 'Vatican',
1066
+ 'Venezuela',
1067
+ 'British Virgin Islands',
1068
+ 'United States Virgin Islands',
1069
+ 'Vietnam',
1070
+ 'Vanuatu',
1071
+ 'Samoa',
1072
+ 'Kosovo',
1073
+ 'Yemen',
1074
+ 'South Africa',
1075
+ 'Zambia',
1076
+ 'Zimbabwe',
1077
+ ]
1078
+
1079
+ templates = [
1080
+ 'a photo i took in {}.',
1081
+ 'a photo i took while visiting {}.',
1082
+ 'a photo from my home country of {}.',
1083
+ 'a photo from my visit to {}.',
1084
+ 'a photo showing the country of {}.',
1085
+ ]
1086
+ ```
1087
+
1088
+
1089
+
1090
+ ## DescribableTextures
1091
+
1092
+ ```python
1093
+ classes = [
1094
+ 'banded',
1095
+ 'blotchy',
1096
+ 'braided',
1097
+ 'bubbly',
1098
+ 'bumpy',
1099
+ 'chequered',
1100
+ 'cobwebbed',
1101
+ 'cracked',
1102
+ 'crosshatched',
1103
+ 'crystalline',
1104
+ 'dotted',
1105
+ 'fibrous',
1106
+ 'flecked',
1107
+ 'freckled',
1108
+ 'frilly',
1109
+ 'gauzy',
1110
+ 'grid',
1111
+ 'grooved',
1112
+ 'honeycombed',
1113
+ 'interlaced',
1114
+ 'knitted',
1115
+ 'lacelike',
1116
+ 'lined',
1117
+ 'marbled',
1118
+ 'matted',
1119
+ 'meshed',
1120
+ 'paisley',
1121
+ 'perforated',
1122
+ 'pitted',
1123
+ 'pleated',
1124
+ 'polka-dotted',
1125
+ 'porous',
1126
+ 'potholed',
1127
+ 'scaly',
1128
+ 'smeared',
1129
+ 'spiralled',
1130
+ 'sprinkled',
1131
+ 'stained',
1132
+ 'stratified',
1133
+ 'striped',
1134
+ 'studded',
1135
+ 'swirly',
1136
+ 'veined',
1137
+ 'waffled',
1138
+ 'woven',
1139
+ 'wrinkled',
1140
+ 'zigzagged',
1141
+ ]
1142
+
1143
+ templates = [
1144
+ 'a photo of a {} texture.',
1145
+ 'a photo of a {} pattern.',
1146
+ 'a photo of a {} thing.',
1147
+ 'a photo of a {} object.',
1148
+ 'a photo of the {} texture.',
1149
+ 'a photo of the {} pattern.',
1150
+ 'a photo of the {} thing.',
1151
+ 'a photo of the {} object.',
1152
+ ]
1153
+ ```
1154
+
1155
+
1156
+
1157
+ ## EuroSAT
1158
+
1159
+ ```bash
1160
+ classes = [
1161
+ 'forest',
1162
+ 'permanent crop land',
1163
+ 'residential buildings or homes or apartments',
1164
+ 'river',
1165
+ 'pasture land',
1166
+ 'lake or sea',
1167
+ 'brushland or shrubland',
1168
+ 'annual crop land',
1169
+ 'industrial buildings or commercial buildings',
1170
+ 'highway or road',
1171
+ ]
1172
+
1173
+ templates = [
1174
+ 'a centered satellite photo of {}.',
1175
+ 'a centered satellite photo of a {}.',
1176
+ 'a centered satellite photo of the {}.',
1177
+ ]
1178
+ ```
1179
+
1180
+
1181
+
1182
+ ## FGVCAircraft
1183
+
1184
+ ```bash
1185
+ classes = [
1186
+ '707-320',
1187
+ '727-200',
1188
+ '737-200',
1189
+ '737-300',
1190
+ '737-400',
1191
+ '737-500',
1192
+ '737-600',
1193
+ '737-700',
1194
+ '737-800',
1195
+ '737-900',
1196
+ '747-100',
1197
+ '747-200',
1198
+ '747-300',
1199
+ '747-400',
1200
+ '757-200',
1201
+ '757-300',
1202
+ '767-200',
1203
+ '767-300',
1204
+ '767-400',
1205
+ '777-200',
1206
+ '777-300',
1207
+ 'A300B4',
1208
+ 'A310',
1209
+ 'A318',
1210
+ 'A319',
1211
+ 'A320',
1212
+ 'A321',
1213
+ 'A330-200',
1214
+ 'A330-300',
1215
+ 'A340-200',
1216
+ 'A340-300',
1217
+ 'A340-500',
1218
+ 'A340-600',
1219
+ 'A380',
1220
+ 'ATR-42',
1221
+ 'ATR-72',
1222
+ 'An-12',
1223
+ 'BAE 146-200',
1224
+ 'BAE 146-300',
1225
+ 'BAE-125',
1226
+ 'Beechcraft 1900',
1227
+ 'Boeing 717',
1228
+ 'C-130',
1229
+ 'C-47',
1230
+ 'CRJ-200',
1231
+ 'CRJ-700',
1232
+ 'CRJ-900',
1233
+ 'Cessna 172',
1234
+ 'Cessna 208',
1235
+ 'Cessna 525',
1236
+ 'Cessna 560',
1237
+ 'Challenger 600',
1238
+ 'DC-10',
1239
+ 'DC-3',
1240
+ 'DC-6',
1241
+ 'DC-8',
1242
+ 'DC-9-30',
1243
+ 'DH-82',
1244
+ 'DHC-1',
1245
+ 'DHC-6',
1246
+ 'DHC-8-100',
1247
+ 'DHC-8-300',
1248
+ 'DR-400',
1249
+ 'Dornier 328',
1250
+ 'E-170',
1251
+ 'E-190',
1252
+ 'E-195',
1253
+ 'EMB-120',
1254
+ 'ERJ 135',
1255
+ 'ERJ 145',
1256
+ 'Embraer Legacy 600',
1257
+ 'Eurofighter Typhoon',
1258
+ 'F-16A/B',
1259
+ 'F/A-18',
1260
+ 'Falcon 2000',
1261
+ 'Falcon 900',
1262
+ 'Fokker 100',
1263
+ 'Fokker 50',
1264
+ 'Fokker 70',
1265
+ 'Global Express',
1266
+ 'Gulfstream IV',
1267
+ 'Gulfstream V',
1268
+ 'Hawk T1',
1269
+ 'Il-76',
1270
+ 'L-1011',
1271
+ 'MD-11',
1272
+ 'MD-80',
1273
+ 'MD-87',
1274
+ 'MD-90',
1275
+ 'Metroliner',
1276
+ 'Model B200',
1277
+ 'PA-28',
1278
+ 'SR-20',
1279
+ 'Saab 2000',
1280
+ 'Saab 340',
1281
+ 'Spitfire',
1282
+ 'Tornado',
1283
+ 'Tu-134',
1284
+ 'Tu-154',
1285
+ 'Yak-42',
1286
+ ]
1287
+
1288
+ templates = [
1289
+ 'a photo of a {}, a type of aircraft.',
1290
+ 'a photo of the {}, a type of aircraft.',
1291
+ ]
1292
+ ```
1293
+
1294
+
1295
+
1296
+ ## FacialEmotionRecognition2013
1297
+
1298
+ ```bash
1299
+ classes = [
1300
+ ['angry'],
1301
+ ['disgusted'],
1302
+ ['fearful'],
1303
+ ['happy', 'smiling'],
1304
+ ['sad', 'depressed'],
1305
+ ['surprised', 'shocked', 'spooked'],
1306
+ ['neutral', 'bored'],
1307
+ ]
1308
+
1309
+ templates = [
1310
+ 'a photo of a {} looking face.',
1311
+ 'a photo of a face showing the emotion: {}.',
1312
+ 'a photo of a face looking {}.',
1313
+ 'a face that looks {}.',
1314
+ 'they look {}.',
1315
+ 'look at how {} they are.',
1316
+ ]
1317
+ ```
1318
+
1319
+
1320
+
1321
+ ## Flowers102
1322
+
1323
+ ```bash
1324
+ classes = [
1325
+ 'pink primrose',
1326
+ 'hard-leaved pocket orchid',
1327
+ 'canterbury bells',
1328
+ 'sweet pea',
1329
+ 'english marigold',
1330
+ 'tiger lily',
1331
+ 'moon orchid',
1332
+ 'bird of paradise',
1333
+ 'monkshood',
1334
+ 'globe thistle',
1335
+ 'snapdragon',
1336
+ "colt's foot",
1337
+ 'king protea',
1338
+ 'spear thistle',
1339
+ 'yellow iris',
1340
+ 'globe flower',
1341
+ 'purple coneflower',
1342
+ 'peruvian lily',
1343
+ 'balloon flower',
1344
+ 'giant white arum lily',
1345
+ 'fire lily',
1346
+ 'pincushion flower',
1347
+ 'fritillary',
1348
+ 'red ginger',
1349
+ 'grape hyacinth',
1350
+ 'corn poppy',
1351
+ 'prince of wales feathers',
1352
+ 'stemless gentian',
1353
+ 'artichoke',
1354
+ 'sweet william',
1355
+ 'carnation',
1356
+ 'garden phlox',
1357
+ 'love in the mist',
1358
+ 'mexican aster',
1359
+ 'alpine sea holly',
1360
+ 'ruby-lipped cattleya',
1361
+ 'cape flower',
1362
+ 'great masterwort',
1363
+ 'siam tulip',
1364
+ 'lenten rose',
1365
+ 'barbeton daisy',
1366
+ 'daffodil',
1367
+ 'sword lily',
1368
+ 'poinsettia',
1369
+ 'bolero deep blue',
1370
+ 'wallflower',
1371
+ 'marigold',
1372
+ 'buttercup',
1373
+ 'oxeye daisy',
1374
+ 'common dandelion',
1375
+ 'petunia',
1376
+ 'wild pansy',
1377
+ 'primula',
1378
+ 'sunflower',
1379
+ 'pelargonium',
1380
+ 'bishop of llandaff',
1381
+ 'gaura',
1382
+ 'geranium',
1383
+ 'orange dahlia',
1384
+ 'pink and yellow dahlia',
1385
+ 'cautleya spicata',
1386
+ 'japanese anemone',
1387
+ 'black-eyed susan',
1388
+ 'silverbush',
1389
+ 'californian poppy',
1390
+ 'osteospermum',
1391
+ 'spring crocus',
1392
+ 'bearded iris',
1393
+ 'windflower',
1394
+ 'tree poppy',
1395
+ 'gazania',
1396
+ 'azalea',
1397
+ 'water lily',
1398
+ 'rose',
1399
+ 'thorn apple',
1400
+ 'morning glory',
1401
+ 'passion flower',
1402
+ 'lotus',
1403
+ 'toad lily',
1404
+ 'anthurium',
1405
+ 'frangipani',
1406
+ 'clematis',
1407
+ 'hibiscus',
1408
+ 'columbine',
1409
+ 'desert-rose',
1410
+ 'tree mallow',
1411
+ 'magnolia',
1412
+ 'cyclamen',
1413
+ 'watercress',
1414
+ 'canna lily',
1415
+ 'hippeastrum',
1416
+ 'bee balm',
1417
+ 'air plant',
1418
+ 'foxglove',
1419
+ 'bougainvillea',
1420
+ 'camellia',
1421
+ 'mallow',
1422
+ 'mexican petunia',
1423
+ 'bromelia',
1424
+ 'blanket flower',
1425
+ 'trumpet creeper',
1426
+ 'blackberry lily',
1427
+ ]
1428
+
1429
+ templates = [
1430
+ 'a photo of a {}, a type of flower.',
1431
+ ]
1432
+ ```
1433
+
1434
+
1435
+
1436
+ ## Food101
1437
+
1438
+ ```bash
1439
+ classes = [
1440
+ 'apple pie',
1441
+ 'baby back ribs',
1442
+ 'baklava',
1443
+ 'beef carpaccio',
1444
+ 'beef tartare',
1445
+ 'beet salad',
1446
+ 'beignets',
1447
+ 'bibimbap',
1448
+ 'bread pudding',
1449
+ 'breakfast burrito',
1450
+ 'bruschetta',
1451
+ 'caesar salad',
1452
+ 'cannoli',
1453
+ 'caprese salad',
1454
+ 'carrot cake',
1455
+ 'ceviche',
1456
+ 'cheese plate',
1457
+ 'cheesecake',
1458
+ 'chicken curry',
1459
+ 'chicken quesadilla',
1460
+ 'chicken wings',
1461
+ 'chocolate cake',
1462
+ 'chocolate mousse',
1463
+ 'churros',
1464
+ 'clam chowder',
1465
+ 'club sandwich',
1466
+ 'crab cakes',
1467
+ 'creme brulee',
1468
+ 'croque madame',
1469
+ 'cup cakes',
1470
+ 'deviled eggs',
1471
+ 'donuts',
1472
+ 'dumplings',
1473
+ 'edamame',
1474
+ 'eggs benedict',
1475
+ 'escargots',
1476
+ 'falafel',
1477
+ 'filet mignon',
1478
+ 'fish and chips',
1479
+ 'foie gras',
1480
+ 'french fries',
1481
+ 'french onion soup',
1482
+ 'french toast',
1483
+ 'fried calamari',
1484
+ 'fried rice',
1485
+ 'frozen yogurt',
1486
+ 'garlic bread',
1487
+ 'gnocchi',
1488
+ 'greek salad',
1489
+ 'grilled cheese sandwich',
1490
+ 'grilled salmon',
1491
+ 'guacamole',
1492
+ 'gyoza',
1493
+ 'hamburger',
1494
+ 'hot and sour soup',
1495
+ 'hot dog',
1496
+ 'huevos rancheros',
1497
+ 'hummus',
1498
+ 'ice cream',
1499
+ 'lasagna',
1500
+ 'lobster bisque',
1501
+ 'lobster roll sandwich',
1502
+ 'macaroni and cheese',
1503
+ 'macarons',
1504
+ 'miso soup',
1505
+ 'mussels',
1506
+ 'nachos',
1507
+ 'omelette',
1508
+ 'onion rings',
1509
+ 'oysters',
1510
+ 'pad thai',
1511
+ 'paella',
1512
+ 'pancakes',
1513
+ 'panna cotta',
1514
+ 'peking duck',
1515
+ 'pho',
1516
+ 'pizza',
1517
+ 'pork chop',
1518
+ 'poutine',
1519
+ 'prime rib',
1520
+ 'pulled pork sandwich',
1521
+ 'ramen',
1522
+ 'ravioli',
1523
+ 'red velvet cake',
1524
+ 'risotto',
1525
+ 'samosa',
1526
+ 'sashimi',
1527
+ 'scallops',
1528
+ 'seaweed salad',
1529
+ 'shrimp and grits',
1530
+ 'spaghetti bolognese',
1531
+ 'spaghetti carbonara',
1532
+ 'spring rolls',
1533
+ 'steak',
1534
+ 'strawberry shortcake',
1535
+ 'sushi',
1536
+ 'tacos',
1537
+ 'takoyaki',
1538
+ 'tiramisu',
1539
+ 'tuna tartare',
1540
+ 'waffles',
1541
+ ]
1542
+
1543
+ templates = [
1544
+ 'a photo of {}, a type of food.',
1545
+ ]
1546
+ ```
1547
+
1548
+
1549
+
1550
+ ## GTSRB
1551
+
1552
+ ```bash
1553
+ classes = [
1554
+ 'red and white circle 20 kph speed limit',
1555
+ 'red and white circle 30 kph speed limit',
1556
+ 'red and white circle 50 kph speed limit',
1557
+ 'red and white circle 60 kph speed limit',
1558
+ 'red and white circle 70 kph speed limit',
1559
+ 'red and white circle 80 kph speed limit',
1560
+ 'end / de-restriction of 80 kph speed limit',
1561
+ 'red and white circle 100 kph speed limit',
1562
+ 'red and white circle 120 kph speed limit',
1563
+ 'red and white circle red car and black car no passing',
1564
+ 'red and white circle red truck and black car no passing',
1565
+ 'red and white triangle road intersection warning',
1566
+ 'white and yellow diamond priority road',
1567
+ 'red and white upside down triangle yield right-of-way',
1568
+ 'stop',
1569
+ 'empty red and white circle',
1570
+ 'red and white circle no truck entry',
1571
+ 'red circle with white horizonal stripe no entry',
1572
+ 'red and white triangle with exclamation mark warning',
1573
+ 'red and white triangle with black left curve approaching warning',
1574
+ 'red and white triangle with black right curve approaching warning',
1575
+ 'red and white triangle with black double curve approaching warning',
1576
+ 'red and white triangle rough / bumpy road warning',
1577
+ 'red and white triangle car skidding / slipping warning',
1578
+ 'red and white triangle with merging / narrow lanes warning',
1579
+ 'red and white triangle with person digging / construction / road work warning',
1580
+ 'red and white triangle with traffic light approaching warning',
1581
+ 'red and white triangle with person walking warning',
1582
+ 'red and white triangle with child and person walking warning',
1583
+ 'red and white triangle with bicyle warning',
1584
+ 'red and white triangle with snowflake / ice warning',
1585
+ 'red and white triangle with deer warning',
1586
+ 'white circle with gray strike bar no speed limit',
1587
+ 'blue circle with white right turn arrow mandatory',
1588
+ 'blue circle with white left turn arrow mandatory',
1589
+ 'blue circle with white forward arrow mandatory',
1590
+ 'blue circle with white forward or right turn arrow mandatory',
1591
+ 'blue circle with white forward or left turn arrow mandatory',
1592
+ 'blue circle with white keep right arrow mandatory',
1593
+ 'blue circle with white keep left arrow mandatory',
1594
+ 'blue circle with white arrows indicating a traffic circle',
1595
+ 'white circle with gray strike bar indicating no passing for cars has ended',
1596
+ 'white circle with gray strike bar indicating no passing for trucks has ended',
1597
+ ]
1598
+
1599
+ templates = [
1600
+ 'a zoomed in photo of a "{}" traffic sign.',
1601
+ 'a centered photo of a "{}" traffic sign.',
1602
+ 'a close up photo of a "{}" traffic sign.',
1603
+ ]
1604
+ ```
1605
+
1606
+
1607
+
1608
+ ## HatefulMemes
1609
+
1610
+ ```bash
1611
+ classes = [
1612
+ 'meme',
1613
+ 'hatespeech meme',
1614
+ ]
1615
+
1616
+ templates = [
1617
+ 'a {}.',
1618
+ ]
1619
+ ```
1620
+
1621
+
1622
+
1623
+ ## KITTI
1624
+
1625
+ ```bash
1626
+ classes = [
1627
+ 'a photo i took of a car on my left or right side.',
1628
+ 'a photo i took with a car nearby.',
1629
+ 'a photo i took with a car in the distance.',
1630
+ 'a photo i took with no car.',
1631
+ ]
1632
+
1633
+ templates = [
1634
+ '{}',
1635
+ ]
1636
+ ```
1637
+
1638
+
1639
+
1640
+ ## Kinetics700
1641
+
1642
+ ```bash
1643
+ classes = [
1644
+ 'abseiling',
1645
+ 'acting in play',
1646
+ 'adjusting glasses',
1647
+ 'air drumming',
1648
+ 'alligator wrestling',
1649
+ 'answering questions',
1650
+ 'applauding',
1651
+ 'applying cream',
1652
+ 'archaeological excavation',
1653
+ 'archery',
1654
+ 'arguing',
1655
+ 'arm wrestling',
1656
+ 'arranging flowers',
1657
+ 'arresting',
1658
+ 'assembling bicycle',
1659
+ 'assembling computer',
1660
+ 'attending conference',
1661
+ 'auctioning',
1662
+ 'baby waking up',
1663
+ 'backflip (human)',
1664
+ 'baking cookies',
1665
+ 'bandaging',
1666
+ 'barbequing',
1667
+ 'bartending',
1668
+ 'base jumping',
1669
+ 'bathing dog',
1670
+ 'battle rope training',
1671
+ 'beatboxing',
1672
+ 'bee keeping',
1673
+ 'being excited',
1674
+ 'being in zero gravity',
1675
+ 'belly dancing',
1676
+ 'bench pressing',
1677
+ 'bending back',
1678
+ 'bending metal',
1679
+ 'biking through snow',
1680
+ 'blasting sand',
1681
+ 'blending fruit',
1682
+ 'blowdrying hair',
1683
+ 'blowing bubble gum',
1684
+ 'blowing glass',
1685
+ 'blowing leaves',
1686
+ 'blowing nose',
1687
+ 'blowing out candles',
1688
+ 'bobsledding',
1689
+ 'bodysurfing',
1690
+ 'bookbinding',
1691
+ 'bottling',
1692
+ 'bouncing ball (not juggling)',
1693
+ 'bouncing on bouncy castle',
1694
+ 'bouncing on trampoline',
1695
+ 'bowling',
1696
+ 'braiding hair',
1697
+ 'breading or breadcrumbing',
1698
+ 'breakdancing',
1699
+ 'breaking boards',
1700
+ 'breaking glass',
1701
+ 'breathing fire',
1702
+ 'brush painting',
1703
+ 'brushing floor',
1704
+ 'brushing hair',
1705
+ 'brushing teeth',
1706
+ 'building cabinet',
1707
+ 'building lego',
1708
+ 'building sandcastle',
1709
+ 'building shed',
1710
+ 'bulldozing',
1711
+ 'bungee jumping',
1712
+ 'burping',
1713
+ 'busking',
1714
+ 'calculating',
1715
+ 'calligraphy',
1716
+ 'canoeing or kayaking',
1717
+ 'capoeira',
1718
+ 'capsizing',
1719
+ 'card stacking',
1720
+ 'card throwing',
1721
+ 'carrying baby',
1722
+ 'carrying weight',
1723
+ 'cartwheeling',
1724
+ 'carving ice',
1725
+ 'carving marble',
1726
+ 'carving pumpkin',
1727
+ 'carving wood with a knife',
1728
+ 'casting fishing line',
1729
+ 'catching fish',
1730
+ 'catching or throwing baseball',
1731
+ 'catching or throwing frisbee',
1732
+ 'catching or throwing softball',
1733
+ 'celebrating',
1734
+ 'changing gear in car',
1735
+ 'changing oil',
1736
+ 'changing wheel (not on bike)',
1737
+ 'chasing',
1738
+ 'checking tires',
1739
+ 'checking watch',
1740
+ 'cheerleading',
1741
+ 'chewing gum',
1742
+ 'chiseling stone',
1743
+ 'chiseling wood',
1744
+ 'chopping meat',
1745
+ 'chopping wood',
1746
+ 'clam digging',
1747
+ 'clapping',
1748
+ 'clay pottery making',
1749
+ 'clean and jerk',
1750
+ 'cleaning gutters',
1751
+ 'cleaning pool',
1752
+ 'cleaning shoes',
1753
+ 'cleaning toilet',
1754
+ 'cleaning windows',
1755
+ 'climbing a rope',
1756
+ 'climbing ladder',
1757
+ 'climbing tree',
1758
+ 'closing door',
1759
+ 'coloring in',
1760
+ 'combing hair',
1761
+ 'contact juggling',
1762
+ 'contorting',
1763
+ 'cooking chicken',
1764
+ 'cooking egg',
1765
+ 'cooking on campfire',
1766
+ 'cooking sausages (not on barbeque)',
1767
+ 'cooking scallops',
1768
+ 'cosplaying',
1769
+ 'coughing',
1770
+ 'counting money',
1771
+ 'country line dancing',
1772
+ 'cracking back',
1773
+ 'cracking knuckles',
1774
+ 'cracking neck',
1775
+ 'crawling baby',
1776
+ 'crocheting',
1777
+ 'crossing eyes',
1778
+ 'crossing river',
1779
+ 'crying',
1780
+ 'cumbia',
1781
+ 'curling (sport)',
1782
+ 'curling eyelashes',
1783
+ 'curling hair',
1784
+ 'cutting apple',
1785
+ 'cutting cake',
1786
+ 'cutting nails',
1787
+ 'cutting orange',
1788
+ 'cutting pineapple',
1789
+ 'cutting watermelon',
1790
+ 'dancing ballet',
1791
+ 'dancing charleston',
1792
+ 'dancing gangnam style',
1793
+ 'dancing macarena',
1794
+ 'deadlifting',
1795
+ 'dealing cards',
1796
+ 'decorating the christmas tree',
1797
+ 'decoupage',
1798
+ 'delivering mail',
1799
+ 'digging',
1800
+ 'dining',
1801
+ 'directing traffic',
1802
+ 'disc golfing',
1803
+ 'diving cliff',
1804
+ 'docking boat',
1805
+ 'dodgeball',
1806
+ 'doing aerobics',
1807
+ 'doing jigsaw puzzle',
1808
+ 'doing laundry',
1809
+ 'doing nails',
1810
+ 'doing sudoku',
1811
+ 'drawing',
1812
+ 'dribbling basketball',
1813
+ 'drinking shots',
1814
+ 'driving car',
1815
+ 'driving tractor',
1816
+ 'drooling',
1817
+ 'drop kicking',
1818
+ 'drumming fingers',
1819
+ 'dumpster diving',
1820
+ 'dunking basketball',
1821
+ 'dyeing eyebrows',
1822
+ 'dyeing hair',
1823
+ 'eating burger',
1824
+ 'eating cake',
1825
+ 'eating carrots',
1826
+ 'eating chips',
1827
+ 'eating doughnuts',
1828
+ 'eating hotdog',
1829
+ 'eating ice cream',
1830
+ 'eating nachos',
1831
+ 'eating spaghetti',
1832
+ 'eating watermelon',
1833
+ 'egg hunting',
1834
+ 'embroidering',
1835
+ 'entering church',
1836
+ 'exercising arm',
1837
+ 'exercising with an exercise ball',
1838
+ 'extinguishing fire',
1839
+ 'faceplanting',
1840
+ 'falling off bike',
1841
+ 'falling off chair',
1842
+ 'feeding birds',
1843
+ 'feeding fish',
1844
+ 'feeding goats',
1845
+ 'fencing (sport)',
1846
+ 'fidgeting',
1847
+ 'filling cake',
1848
+ 'filling eyebrows',
1849
+ 'finger snapping',
1850
+ 'fixing bicycle',
1851
+ 'fixing hair',
1852
+ 'flint knapping',
1853
+ 'flipping bottle',
1854
+ 'flipping pancake',
1855
+ 'fly tying',
1856
+ 'flying kite',
1857
+ 'folding clothes',
1858
+ 'folding napkins',
1859
+ 'folding paper',
1860
+ 'front raises',
1861
+ 'frying vegetables',
1862
+ 'gargling',
1863
+ 'geocaching',
1864
+ 'getting a haircut',
1865
+ 'getting a piercing',
1866
+ 'getting a tattoo',
1867
+ 'giving or receiving award',
1868
+ 'gold panning',
1869
+ 'golf chipping',
1870
+ 'golf driving',
1871
+ 'golf putting',
1872
+ 'gospel singing in church',
1873
+ 'grinding meat',
1874
+ 'grooming cat',
1875
+ 'grooming dog',
1876
+ 'grooming horse',
1877
+ 'gymnastics tumbling',
1878
+ 'hammer throw',
1879
+ 'hand washing clothes',
1880
+ 'head stand',
1881
+ 'headbanging',
1882
+ 'headbutting',
1883
+ 'helmet diving',
1884
+ 'herding cattle',
1885
+ 'high fiving',
1886
+ 'high jump',
1887
+ 'high kick',
1888
+ 'historical reenactment',
1889
+ 'hitting baseball',
1890
+ 'hockey stop',
1891
+ 'holding snake',
1892
+ 'home roasting coffee',
1893
+ 'hopscotch',
1894
+ 'hoverboarding',
1895
+ 'huddling',
1896
+ 'hugging (not baby)',
1897
+ 'hugging baby',
1898
+ 'hula hooping',
1899
+ 'hurdling',
1900
+ 'hurling (sport)',
1901
+ 'ice climbing',
1902
+ 'ice fishing',
1903
+ 'ice skating',
1904
+ 'ice swimming',
1905
+ 'inflating balloons',
1906
+ 'installing carpet',
1907
+ 'ironing',
1908
+ 'ironing hair',
1909
+ 'javelin throw',
1910
+ 'jaywalking',
1911
+ 'jetskiing',
1912
+ 'jogging',
1913
+ 'juggling balls',
1914
+ 'juggling fire',
1915
+ 'juggling soccer ball',
1916
+ 'jumping bicycle',
1917
+ 'jumping into pool',
1918
+ 'jumping jacks',
1919
+ 'jumping sofa',
1920
+ 'jumpstyle dancing',
1921
+ 'karaoke',
1922
+ 'kicking field goal',
1923
+ 'kicking soccer ball',
1924
+ 'kissing',
1925
+ 'kitesurfing',
1926
+ 'knitting',
1927
+ 'krumping',
1928
+ 'land sailing',
1929
+ 'laughing',
1930
+ 'lawn mower racing',
1931
+ 'laying bricks',
1932
+ 'laying concrete',
1933
+ 'laying decking',
1934
+ 'laying stone',
1935
+ 'laying tiles',
1936
+ 'leatherworking',
1937
+ 'letting go of balloon',
1938
+ 'licking',
1939
+ 'lifting hat',
1940
+ 'lighting candle',
1941
+ 'lighting fire',
1942
+ 'listening with headphones',
1943
+ 'lock picking',
1944
+ 'long jump',
1945
+ 'longboarding',
1946
+ 'looking at phone',
1947
+ 'looking in mirror',
1948
+ 'luge',
1949
+ 'lunge',
1950
+ 'making a cake',
1951
+ 'making a sandwich',
1952
+ 'making balloon shapes',
1953
+ 'making bubbles',
1954
+ 'making cheese',
1955
+ 'making horseshoes',
1956
+ 'making jewelry',
1957
+ 'making latte art',
1958
+ 'making paper aeroplanes',
1959
+ 'making pizza',
1960
+ 'making slime',
1961
+ 'making snowman',
1962
+ 'making sushi',
1963
+ 'making tea',
1964
+ 'making the bed',
1965
+ 'marching',
1966
+ 'marriage proposal',
1967
+ 'massaging back',
1968
+ 'massaging feet',
1969
+ 'massaging legs',
1970
+ 'massaging neck',
1971
+ "massaging person's head",
1972
+ 'metal detecting',
1973
+ 'milking cow',
1974
+ 'milking goat',
1975
+ 'mixing colours',
1976
+ 'moon walking',
1977
+ 'mopping floor',
1978
+ 'mosh pit dancing',
1979
+ 'motorcycling',
1980
+ 'mountain climber (exercise)',
1981
+ 'moving baby',
1982
+ 'moving child',
1983
+ 'moving furniture',
1984
+ 'mowing lawn',
1985
+ 'mushroom foraging',
1986
+ 'needle felting',
1987
+ 'news anchoring',
1988
+ 'opening bottle (not wine)',
1989
+ 'opening coconuts',
1990
+ 'opening door',
1991
+ 'opening present',
1992
+ 'opening refrigerator',
1993
+ 'opening wine bottle',
1994
+ 'packing',
1995
+ 'paragliding',
1996
+ 'parasailing',
1997
+ 'parkour',
1998
+ 'passing American football (in game)',
1999
+ 'passing American football (not in game)',
2000
+ 'passing soccer ball',
2001
+ 'peeling apples',
2002
+ 'peeling banana',
2003
+ 'peeling potatoes',
2004
+ 'person collecting garbage',
2005
+ 'petting animal (not cat)',
2006
+ 'petting cat',
2007
+ 'petting horse',
2008
+ 'photobombing',
2009
+ 'photocopying',
2010
+ 'picking apples',
2011
+ 'picking blueberries',
2012
+ 'pillow fight',
2013
+ 'pinching',
2014
+ 'pirouetting',
2015
+ 'planing wood',
2016
+ 'planting trees',
2017
+ 'plastering',
2018
+ 'playing accordion',
2019
+ 'playing american football',
2020
+ 'playing badminton',
2021
+ 'playing bagpipes',
2022
+ 'playing basketball',
2023
+ 'playing bass guitar',
2024
+ 'playing beer pong',
2025
+ 'playing billiards',
2026
+ 'playing blackjack',
2027
+ 'playing cards',
2028
+ 'playing cello',
2029
+ 'playing checkers',
2030
+ 'playing chess',
2031
+ 'playing clarinet',
2032
+ 'playing controller',
2033
+ 'playing cricket',
2034
+ 'playing cymbals',
2035
+ 'playing darts',
2036
+ 'playing didgeridoo',
2037
+ 'playing dominoes',
2038
+ 'playing drums',
2039
+ 'playing field hockey',
2040
+ 'playing flute',
2041
+ 'playing gong',
2042
+ 'playing guitar',
2043
+ 'playing hand clapping games',
2044
+ 'playing harmonica',
2045
+ 'playing harp',
2046
+ 'playing ice hockey',
2047
+ 'playing keyboard',
2048
+ 'playing kickball',
2049
+ 'playing laser tag',
2050
+ 'playing lute',
2051
+ 'playing mahjong',
2052
+ 'playing maracas',
2053
+ 'playing marbles',
2054
+ 'playing monopoly',
2055
+ 'playing netball',
2056
+ 'playing nose flute',
2057
+ 'playing oboe',
2058
+ 'playing ocarina',
2059
+ 'playing organ',
2060
+ 'playing paintball',
2061
+ 'playing pan pipes',
2062
+ 'playing piano',
2063
+ 'playing piccolo',
2064
+ 'playing pinball',
2065
+ 'playing ping pong',
2066
+ 'playing poker',
2067
+ 'playing polo',
2068
+ 'playing recorder',
2069
+ 'playing road hockey',
2070
+ 'playing rounders',
2071
+ 'playing rubiks cube',
2072
+ 'playing saxophone',
2073
+ 'playing scrabble',
2074
+ 'playing shuffleboard',
2075
+ 'playing slot machine',
2076
+ 'playing squash or racquetball',
2077
+ 'playing tennis',
2078
+ 'playing trombone',
2079
+ 'playing trumpet',
2080
+ 'playing ukulele',
2081
+ 'playing violin',
2082
+ 'playing volleyball',
2083
+ 'playing with trains',
2084
+ 'playing xylophone',
2085
+ 'poaching eggs',
2086
+ 'poking bellybutton',
2087
+ 'pole vault',
2088
+ 'polishing furniture',
2089
+ 'polishing metal',
2090
+ 'popping balloons',
2091
+ 'pouring beer',
2092
+ 'pouring milk',
2093
+ 'pouring wine',
2094
+ 'preparing salad',
2095
+ 'presenting weather forecast',
2096
+ 'pretending to be a statue',
2097
+ 'pull ups',
2098
+ 'pulling espresso shot',
2099
+ 'pulling rope (game)',
2100
+ 'pumping fist',
2101
+ 'pumping gas',
2102
+ 'punching bag',
2103
+ 'punching person (boxing)',
2104
+ 'push up',
2105
+ 'pushing car',
2106
+ 'pushing cart',
2107
+ 'pushing wheelbarrow',
2108
+ 'pushing wheelchair',
2109
+ 'putting in contact lenses',
2110
+ 'putting on eyeliner',
2111
+ 'putting on foundation',
2112
+ 'putting on lipstick',
2113
+ 'putting on mascara',
2114
+ 'putting on sari',
2115
+ 'putting on shoes',
2116
+ 'putting wallpaper on wall',
2117
+ 'raising eyebrows',
2118
+ 'reading book',
2119
+ 'reading newspaper',
2120
+ 'recording music',
2121
+ 'repairing puncture',
2122
+ 'riding a bike',
2123
+ 'riding camel',
2124
+ 'riding elephant',
2125
+ 'riding mechanical bull',
2126
+ 'riding mule',
2127
+ 'riding or walking with horse',
2128
+ 'riding scooter',
2129
+ 'riding snow blower',
2130
+ 'riding unicycle',
2131
+ 'ripping paper',
2132
+ 'roasting marshmallows',
2133
+ 'roasting pig',
2134
+ 'robot dancing',
2135
+ 'rock climbing',
2136
+ 'rock scissors paper',
2137
+ 'roller skating',
2138
+ 'rolling eyes',
2139
+ 'rolling pastry',
2140
+ 'rope pushdown',
2141
+ 'running on treadmill',
2142
+ 'sailing',
2143
+ 'salsa dancing',
2144
+ 'saluting',
2145
+ 'sanding floor',
2146
+ 'sanding wood',
2147
+ 'sausage making',
2148
+ 'sawing wood',
2149
+ 'scrambling eggs',
2150
+ 'scrapbooking',
2151
+ 'scrubbing face',
2152
+ 'scuba diving',
2153
+ 'seasoning food',
2154
+ 'separating eggs',
2155
+ 'setting table',
2156
+ 'sewing',
2157
+ 'shaking hands',
2158
+ 'shaking head',
2159
+ 'shaping bread dough',
2160
+ 'sharpening knives',
2161
+ 'sharpening pencil',
2162
+ 'shaving head',
2163
+ 'shaving legs',
2164
+ 'shearing sheep',
2165
+ 'shining flashlight',
2166
+ 'shining shoes',
2167
+ 'shoot dance',
2168
+ 'shooting basketball',
2169
+ 'shooting goal (soccer)',
2170
+ 'shooting off fireworks',
2171
+ 'shopping',
2172
+ 'shot put',
2173
+ 'shouting',
2174
+ 'shoveling snow',
2175
+ 'shredding paper',
2176
+ 'shucking oysters',
2177
+ 'shuffling cards',
2178
+ 'shuffling feet',
2179
+ 'side kick',
2180
+ 'sieving',
2181
+ 'sign language interpreting',
2182
+ 'silent disco',
2183
+ 'singing',
2184
+ 'sipping cup',
2185
+ 'situp',
2186
+ 'skateboarding',
2187
+ 'ski ballet',
2188
+ 'ski jumping',
2189
+ 'skiing crosscountry',
2190
+ 'skiing mono',
2191
+ 'skiing slalom',
2192
+ 'skipping rope',
2193
+ 'skipping stone',
2194
+ 'skydiving',
2195
+ 'slacklining',
2196
+ 'slapping',
2197
+ 'sled dog racing',
2198
+ 'sleeping',
2199
+ 'slicing onion',
2200
+ 'smashing',
2201
+ 'smelling feet',
2202
+ 'smoking',
2203
+ 'smoking hookah',
2204
+ 'smoking pipe',
2205
+ 'snatch weight lifting',
2206
+ 'sneezing',
2207
+ 'snorkeling',
2208
+ 'snowboarding',
2209
+ 'snowkiting',
2210
+ 'snowmobiling',
2211
+ 'somersaulting',
2212
+ 'spelunking',
2213
+ 'spinning plates',
2214
+ 'spinning poi',
2215
+ 'splashing water',
2216
+ 'spray painting',
2217
+ 'spraying',
2218
+ 'springboard diving',
2219
+ 'square dancing',
2220
+ 'squat',
2221
+ 'squeezing orange',
2222
+ 'stacking cups',
2223
+ 'stacking dice',
2224
+ 'standing on hands',
2225
+ 'staring',
2226
+ 'steer roping',
2227
+ 'steering car',
2228
+ 'sticking tongue out',
2229
+ 'stomping grapes',
2230
+ 'stretching arm',
2231
+ 'stretching leg',
2232
+ 'sucking lolly',
2233
+ 'surfing crowd',
2234
+ 'surfing water',
2235
+ 'surveying',
2236
+ 'sweeping floor',
2237
+ 'swimming backstroke',
2238
+ 'swimming breast stroke',
2239
+ 'swimming butterfly stroke',
2240
+ 'swimming front crawl',
2241
+ 'swimming with dolphins',
2242
+ 'swimming with sharks',
2243
+ 'swing dancing',
2244
+ 'swinging baseball bat',
2245
+ 'swinging on something',
2246
+ 'sword fighting',
2247
+ 'sword swallowing',
2248
+ 'tackling',
2249
+ 'tagging graffiti',
2250
+ 'tai chi',
2251
+ 'taking photo',
2252
+ 'talking on cell phone',
2253
+ 'tango dancing',
2254
+ 'tap dancing',
2255
+ 'tapping guitar',
2256
+ 'tapping pen',
2257
+ 'tasting beer',
2258
+ 'tasting food',
2259
+ 'tasting wine',
2260
+ 'testifying',
2261
+ 'texting',
2262
+ 'threading needle',
2263
+ 'throwing axe',
2264
+ 'throwing ball (not baseball or American football)',
2265
+ 'throwing discus',
2266
+ 'throwing knife',
2267
+ 'throwing snowballs',
2268
+ 'throwing tantrum',
2269
+ 'throwing water balloon',
2270
+ 'tickling',
2271
+ 'tie dying',
2272
+ 'tightrope walking',
2273
+ 'tiptoeing',
2274
+ 'tobogganing',
2275
+ 'tossing coin',
2276
+ 'tossing salad',
2277
+ 'training dog',
2278
+ 'trapezing',
2279
+ 'treating wood',
2280
+ 'trimming or shaving beard',
2281
+ 'trimming shrubs',
2282
+ 'trimming trees',
2283
+ 'triple jump',
2284
+ 'twiddling fingers',
2285
+ 'tying bow tie',
2286
+ 'tying knot (not on a tie)',
2287
+ 'tying necktie',
2288
+ 'tying shoe laces',
2289
+ 'unboxing',
2290
+ 'uncorking champagne',
2291
+ 'unloading truck',
2292
+ 'using a microscope',
2293
+ 'using a paint roller',
2294
+ 'using a power drill',
2295
+ 'using a sledge hammer',
2296
+ 'using a wrench',
2297
+ 'using atm',
2298
+ 'using bagging machine',
2299
+ 'using circular saw',
2300
+ 'using inhaler',
2301
+ 'using megaphone',
2302
+ 'using puppets',
2303
+ 'using remote controller (not gaming)',
2304
+ 'using segway',
2305
+ 'vacuuming car',
2306
+ 'vacuuming floor',
2307
+ 'visiting the zoo',
2308
+ 'wading through mud',
2309
+ 'wading through water',
2310
+ 'waiting in line',
2311
+ 'waking up',
2312
+ 'walking on stilts',
2313
+ 'walking the dog',
2314
+ 'walking through snow',
2315
+ 'walking with crutches',
2316
+ 'washing dishes',
2317
+ 'washing feet',
2318
+ 'washing hair',
2319
+ 'washing hands',
2320
+ 'watching tv',
2321
+ 'water skiing',
2322
+ 'water sliding',
2323
+ 'watering plants',
2324
+ 'waving hand',
2325
+ 'waxing armpits',
2326
+ 'waxing back',
2327
+ 'waxing chest',
2328
+ 'waxing eyebrows',
2329
+ 'waxing legs',
2330
+ 'weaving basket',
2331
+ 'weaving fabric',
2332
+ 'welding',
2333
+ 'whistling',
2334
+ 'windsurfing',
2335
+ 'winking',
2336
+ 'wood burning (art)',
2337
+ 'wrapping present',
2338
+ 'wrestling',
2339
+ 'writing',
2340
+ 'yarn spinning',
2341
+ 'yawning',
2342
+ 'yoga',
2343
+ 'zumba'
2344
+ ]
2345
+
2346
+ templates = [
2347
+ 'a photo of {}.',
2348
+ 'a photo of a person {}.',
2349
+ 'a photo of a person using {}.',
2350
+ 'a photo of a person doing {}.',
2351
+ 'a photo of a person during {}.',
2352
+ 'a photo of a person performing {}.',
2353
+ 'a photo of a person practicing {}.',
2354
+ 'a video of {}.',
2355
+ 'a video of a person {}.',
2356
+ 'a video of a person using {}.',
2357
+ 'a video of a person doing {}.',
2358
+ 'a video of a person during {}.',
2359
+ 'a video of a person performing {}.',
2360
+ 'a video of a person practicing {}.',
2361
+ 'a example of {}.',
2362
+ 'a example of a person {}.',
2363
+ 'a example of a person using {}.',
2364
+ 'a example of a person doing {}.',
2365
+ 'a example of a person during {}.',
2366
+ 'a example of a person performing {}.',
2367
+ 'a example of a person practicing {}.',
2368
+ 'a demonstration of {}.',
2369
+ 'a demonstration of a person {}.',
2370
+ 'a demonstration of a person using {}.',
2371
+ 'a demonstration of a person doing {}.',
2372
+ 'a demonstration of a person during {}.',
2373
+ 'a demonstration of a person performing {}.',
2374
+ 'a demonstration of a person practicing {}.',
2375
+ ]
2376
+ ```
2377
+
2378
+
2379
+
2380
+ ## MNIST
2381
+
2382
+ ```bash
2383
+ classes = [
2384
+ '0',
2385
+ '1',
2386
+ '2',
2387
+ '3',
2388
+ '4',
2389
+ '5',
2390
+ '6',
2391
+ '7',
2392
+ '8',
2393
+ '9',
2394
+ ]
2395
+
2396
+ templates = [
2397
+ 'a photo of the number: "{}".',
2398
+ ]
2399
+ ```
2400
+
2401
+
2402
+
2403
+ ## OxfordPets
2404
+
2405
+ ```bash
2406
+ classes = [
2407
+ 'Abyssinian',
2408
+ 'Bengal',
2409
+ 'Birman',
2410
+ 'Bombay',
2411
+ 'British Shorthair',
2412
+ 'Egyptian Mau',
2413
+ 'Maine Coon',
2414
+ 'Persian',
2415
+ 'Ragdoll',
2416
+ 'Russian Blue',
2417
+ 'Siamese',
2418
+ 'Sphynx',
2419
+ 'american bulldog',
2420
+ 'american pit bull terrier',
2421
+ 'basset hound',
2422
+ 'beagle',
2423
+ 'boxer',
2424
+ 'chihuahua',
2425
+ 'english cocker spaniel',
2426
+ 'english setter',
2427
+ 'german shorthaired',
2428
+ 'great pyrenees',
2429
+ 'havanese',
2430
+ 'japanese chin',
2431
+ 'keeshond',
2432
+ 'leonberger',
2433
+ 'miniature pinscher',
2434
+ 'newfoundland',
2435
+ 'pomeranian',
2436
+ 'pug',
2437
+ 'saint bernard',
2438
+ 'samoyed',
2439
+ 'scottish terrier',
2440
+ 'shiba inu',
2441
+ 'staffordshire bull terrier',
2442
+ 'wheaten terrier',
2443
+ 'yorkshire terrier',
2444
+ ]
2445
+
2446
+ templates = [
2447
+ 'a photo of a {}, a type of pet.',
2448
+ ]
2449
+ ```
2450
+
2451
+
2452
+
2453
+ ## PascalVOC2007
2454
+
2455
+ ```bash
2456
+ classes = [
2457
+ 'aeroplane',
2458
+ 'bicycle',
2459
+ 'bird',
2460
+ 'boat',
2461
+ 'bottle',
2462
+ 'bus',
2463
+ 'car',
2464
+ 'cat',
2465
+ 'chair',
2466
+ 'cow',
2467
+ 'dog',
2468
+ 'horse',
2469
+ 'motorbike',
2470
+ 'person',
2471
+ 'sheep',
2472
+ 'sofa',
2473
+ 'diningtable',
2474
+ 'pottedplant',
2475
+ 'train',
2476
+ 'tvmonitor',
2477
+ ]
2478
+
2479
+ templates = [
2480
+ 'a photo of a {}.',
2481
+ ]
2482
+ ```
2483
+
2484
+
2485
+
2486
+ ## PatchCamelyon
2487
+
2488
+ ```bash
2489
+ classes = [
2490
+ 'lymph node',
2491
+ 'lymph node containing metastatic tumor tissue',
2492
+ ]
2493
+
2494
+ templates = [
2495
+ 'this is a photo of {}',
2496
+ ]
2497
+ ```
2498
+
2499
+
2500
+
2501
+ ## RESISC45
2502
+
2503
+ ```bash
2504
+ classes = [
2505
+ 'airplane',
2506
+ 'airport',
2507
+ 'baseball diamond',
2508
+ 'basketball court',
2509
+ 'beach',
2510
+ 'bridge',
2511
+ 'chaparral',
2512
+ 'church',
2513
+ 'circular farmland',
2514
+ 'cloud',
2515
+ 'commercial area',
2516
+ 'dense residential',
2517
+ 'desert',
2518
+ 'forest',
2519
+ 'freeway',
2520
+ 'golf course',
2521
+ 'ground track field',
2522
+ 'harbor',
2523
+ 'industrial area',
2524
+ 'intersection',
2525
+ 'island',
2526
+ 'lake',
2527
+ 'meadow',
2528
+ 'medium residential',
2529
+ 'mobile home park',
2530
+ 'mountain',
2531
+ 'overpass',
2532
+ 'palace',
2533
+ 'parking lot',
2534
+ 'railway',
2535
+ 'railway station',
2536
+ 'rectangular farmland',
2537
+ 'river',
2538
+ 'roundabout',
2539
+ 'runway',
2540
+ 'sea ice',
2541
+ 'ship',
2542
+ 'snowberg',
2543
+ 'sparse residential',
2544
+ 'stadium',
2545
+ 'storage tank',
2546
+ 'tennis court',
2547
+ 'terrace',
2548
+ 'thermal power station',
2549
+ 'wetland',
2550
+ ]
2551
+
2552
+ templates = [
2553
+ 'satellite imagery of {}.',
2554
+ 'aerial imagery of {}.',
2555
+ 'satellite photo of {}.',
2556
+ 'aerial photo of {}.',
2557
+ 'satellite view of {}.',
2558
+ 'aerial view of {}.',
2559
+ 'satellite imagery of a {}.',
2560
+ 'aerial imagery of a {}.',
2561
+ 'satellite photo of a {}.',
2562
+ 'aerial photo of a {}.',
2563
+ 'satellite view of a {}.',
2564
+ 'aerial view of a {}.',
2565
+ 'satellite imagery of the {}.',
2566
+ 'aerial imagery of the {}.',
2567
+ 'satellite photo of the {}.',
2568
+ 'aerial photo of the {}.',
2569
+ 'satellite view of the {}.',
2570
+ 'aerial view of the {}.',
2571
+ ]
2572
+ ```
2573
+
2574
+
2575
+
2576
+ ## SST2
2577
+
2578
+ ```bash
2579
+ classes = [
2580
+ 'negative',
2581
+ 'positive',
2582
+ ]
2583
+
2584
+ templates = [
2585
+ 'a {} review of a movie.',
2586
+ ]
2587
+ ```
2588
+
2589
+
2590
+
2591
+ ## STL10
2592
+
2593
+ ```bash
2594
+ classes = [
2595
+ 'airplane',
2596
+ 'bird',
2597
+ 'car',
2598
+ 'cat',
2599
+ 'deer',
2600
+ 'dog',
2601
+ 'horse',
2602
+ 'monkey',
2603
+ 'ship',
2604
+ 'truck',
2605
+ ]
2606
+
2607
+ templates = [
2608
+ 'a photo of a {}.',
2609
+ 'a photo of the {}.',
2610
+ ]
2611
+ ```
2612
+
2613
+
2614
+
2615
+ ## SUN397
2616
+
2617
+ ```bash
2618
+ classes = [
2619
+ 'abbey',
2620
+ 'airplane cabin',
2621
+ 'airport terminal',
2622
+ 'alley',
2623
+ 'amphitheater',
2624
+ 'amusement arcade',
2625
+ 'amusement park',
2626
+ 'anechoic chamber',
2627
+ 'apartment building outdoor',
2628
+ 'apse indoor',
2629
+ 'aquarium',
2630
+ 'aqueduct',
2631
+ 'arch',
2632
+ 'archive',
2633
+ 'arrival gate outdoor',
2634
+ 'art gallery',
2635
+ 'art school',
2636
+ 'art studio',
2637
+ 'assembly line',
2638
+ 'athletic field outdoor',
2639
+ 'atrium public',
2640
+ 'attic',
2641
+ 'auditorium',
2642
+ 'auto factory',
2643
+ 'badlands',
2644
+ 'badminton court indoor',
2645
+ 'baggage claim',
2646
+ 'bakery shop',
2647
+ 'balcony exterior',
2648
+ 'balcony interior',
2649
+ 'ball pit',
2650
+ 'ballroom',
2651
+ 'bamboo forest',
2652
+ 'banquet hall',
2653
+ 'bar',
2654
+ 'barn',
2655
+ 'barndoor',
2656
+ 'baseball field',
2657
+ 'basement',
2658
+ 'basilica',
2659
+ 'basketball court outdoor',
2660
+ 'bathroom',
2661
+ 'batters box',
2662
+ 'bayou',
2663
+ 'bazaar indoor',
2664
+ 'bazaar outdoor',
2665
+ 'beach',
2666
+ 'beauty salon',
2667
+ 'bedroom',
2668
+ 'berth',
2669
+ 'biology laboratory',
2670
+ 'bistro indoor',
2671
+ 'boardwalk',
2672
+ 'boat deck',
2673
+ 'boathouse',
2674
+ 'bookstore',
2675
+ 'booth indoor',
2676
+ 'botanical garden',
2677
+ 'bow window indoor',
2678
+ 'bow window outdoor',
2679
+ 'bowling alley',
2680
+ 'boxing ring',
2681
+ 'brewery indoor',
2682
+ 'bridge',
2683
+ 'building facade',
2684
+ 'bullring',
2685
+ 'burial chamber',
2686
+ 'bus interior',
2687
+ 'butchers shop',
2688
+ 'butte',
2689
+ 'cabin outdoor',
2690
+ 'cafeteria',
2691
+ 'campsite',
2692
+ 'campus',
2693
+ 'canal natural',
2694
+ 'canal urban',
2695
+ 'candy store',
2696
+ 'canyon',
2697
+ 'car interior backseat',
2698
+ 'car interior frontseat',
2699
+ 'carrousel',
2700
+ 'casino indoor',
2701
+ 'castle',
2702
+ 'catacomb',
2703
+ 'cathedral indoor',
2704
+ 'cathedral outdoor',
2705
+ 'cavern indoor',
2706
+ 'cemetery',
2707
+ 'chalet',
2708
+ 'cheese factory',
2709
+ 'chemistry lab',
2710
+ 'chicken coop indoor',
2711
+ 'chicken coop outdoor',
2712
+ 'childs room',
2713
+ 'church indoor',
2714
+ 'church outdoor',
2715
+ 'classroom',
2716
+ 'clean room',
2717
+ 'cliff',
2718
+ 'cloister indoor',
2719
+ 'closet',
2720
+ 'clothing store',
2721
+ 'coast',
2722
+ 'cockpit',
2723
+ 'coffee shop',
2724
+ 'computer room',
2725
+ 'conference center',
2726
+ 'conference room',
2727
+ 'construction site',
2728
+ 'control room',
2729
+ 'control tower outdoor',
2730
+ 'corn field',
2731
+ 'corral',
2732
+ 'corridor',
2733
+ 'cottage garden',
2734
+ 'courthouse',
2735
+ 'courtroom',
2736
+ 'courtyard',
2737
+ 'covered bridge exterior',
2738
+ 'creek',
2739
+ 'crevasse',
2740
+ 'crosswalk',
2741
+ 'cubicle office',
2742
+ 'dam',
2743
+ 'delicatessen',
2744
+ 'dentists office',
2745
+ 'desert sand',
2746
+ 'desert vegetation',
2747
+ 'diner indoor',
2748
+ 'diner outdoor',
2749
+ 'dinette home',
2750
+ 'dinette vehicle',
2751
+ 'dining car',
2752
+ 'dining room',
2753
+ 'discotheque',
2754
+ 'dock',
2755
+ 'doorway outdoor',
2756
+ 'dorm room',
2757
+ 'driveway',
2758
+ 'driving range outdoor',
2759
+ 'drugstore',
2760
+ 'electrical substation',
2761
+ 'elevator door',
2762
+ 'elevator interior',
2763
+ 'elevator shaft',
2764
+ 'engine room',
2765
+ 'escalator indoor',
2766
+ 'excavation',
2767
+ 'factory indoor',
2768
+ 'fairway',
2769
+ 'fastfood restaurant',
2770
+ 'field cultivated',
2771
+ 'field wild',
2772
+ 'fire escape',
2773
+ 'fire station',
2774
+ 'firing range indoor',
2775
+ 'fishpond',
2776
+ 'florist shop indoor',
2777
+ 'food court',
2778
+ 'forest broadleaf',
2779
+ 'forest needleleaf',
2780
+ 'forest path',
2781
+ 'forest road',
2782
+ 'formal garden',
2783
+ 'fountain',
2784
+ 'galley',
2785
+ 'game room',
2786
+ 'garage indoor',
2787
+ 'garbage dump',
2788
+ 'gas station',
2789
+ 'gazebo exterior',
2790
+ 'general store indoor',
2791
+ 'general store outdoor',
2792
+ 'gift shop',
2793
+ 'golf course',
2794
+ 'greenhouse indoor',
2795
+ 'greenhouse outdoor',
2796
+ 'gymnasium indoor',
2797
+ 'hangar indoor',
2798
+ 'hangar outdoor',
2799
+ 'harbor',
2800
+ 'hayfield',
2801
+ 'heliport',
2802
+ 'herb garden',
2803
+ 'highway',
2804
+ 'hill',
2805
+ 'home office',
2806
+ 'hospital',
2807
+ 'hospital room',
2808
+ 'hot spring',
2809
+ 'hot tub outdoor',
2810
+ 'hotel outdoor',
2811
+ 'hotel room',
2812
+ 'house',
2813
+ 'hunting lodge outdoor',
2814
+ 'ice cream parlor',
2815
+ 'ice floe',
2816
+ 'ice shelf',
2817
+ 'ice skating rink indoor',
2818
+ 'ice skating rink outdoor',
2819
+ 'iceberg',
2820
+ 'igloo',
2821
+ 'industrial area',
2822
+ 'inn outdoor',
2823
+ 'islet',
2824
+ 'jacuzzi indoor',
2825
+ 'jail cell',
2826
+ 'jail indoor',
2827
+ 'jewelry shop',
2828
+ 'kasbah',
2829
+ 'kennel indoor',
2830
+ 'kennel outdoor',
2831
+ 'kindergarden classroom',
2832
+ 'kitchen',
2833
+ 'kitchenette',
2834
+ 'labyrinth outdoor',
2835
+ 'lake natural',
2836
+ 'landfill',
2837
+ 'landing deck',
2838
+ 'laundromat',
2839
+ 'lecture room',
2840
+ 'library indoor',
2841
+ 'library outdoor',
2842
+ 'lido deck outdoor',
2843
+ 'lift bridge',
2844
+ 'lighthouse',
2845
+ 'limousine interior',
2846
+ 'living room',
2847
+ 'lobby',
2848
+ 'lock chamber',
2849
+ 'locker room',
2850
+ 'mansion',
2851
+ 'manufactured home',
2852
+ 'market indoor',
2853
+ 'market outdoor',
2854
+ 'marsh',
2855
+ 'martial arts gym',
2856
+ 'mausoleum',
2857
+ 'medina',
2858
+ 'moat water',
2859
+ 'monastery outdoor',
2860
+ 'mosque indoor',
2861
+ 'mosque outdoor',
2862
+ 'motel',
2863
+ 'mountain',
2864
+ 'mountain snowy',
2865
+ 'movie theater indoor',
2866
+ 'museum indoor',
2867
+ 'music store',
2868
+ 'music studio',
2869
+ 'nuclear power plant outdoor',
2870
+ 'nursery',
2871
+ 'oast house',
2872
+ 'observatory outdoor',
2873
+ 'ocean',
2874
+ 'office',
2875
+ 'office building',
2876
+ 'oil refinery outdoor',
2877
+ 'oilrig',
2878
+ 'operating room',
2879
+ 'orchard',
2880
+ 'outhouse outdoor',
2881
+ 'pagoda',
2882
+ 'palace',
2883
+ 'pantry',
2884
+ 'park',
2885
+ 'parking garage indoor',
2886
+ 'parking garage outdoor',
2887
+ 'parking lot',
2888
+ 'parlor',
2889
+ 'pasture',
2890
+ 'patio',
2891
+ 'pavilion',
2892
+ 'pharmacy',
2893
+ 'phone booth',
2894
+ 'physics laboratory',
2895
+ 'picnic area',
2896
+ 'pilothouse indoor',
2897
+ 'planetarium outdoor',
2898
+ 'playground',
2899
+ 'playroom',
2900
+ 'plaza',
2901
+ 'podium indoor',
2902
+ 'podium outdoor',
2903
+ 'pond',
2904
+ 'poolroom establishment',
2905
+ 'poolroom home',
2906
+ 'power plant outdoor',
2907
+ 'promenade deck',
2908
+ 'pub indoor',
2909
+ 'pulpit',
2910
+ 'putting green',
2911
+ 'racecourse',
2912
+ 'raceway',
2913
+ 'raft',
2914
+ 'railroad track',
2915
+ 'rainforest',
2916
+ 'reception',
2917
+ 'recreation room',
2918
+ 'residential neighborhood',
2919
+ 'restaurant',
2920
+ 'restaurant kitchen',
2921
+ 'restaurant patio',
2922
+ 'rice paddy',
2923
+ 'riding arena',
2924
+ 'river',
2925
+ 'rock arch',
2926
+ 'rope bridge',
2927
+ 'ruin',
2928
+ 'runway',
2929
+ 'sandbar',
2930
+ 'sandbox',
2931
+ 'sauna',
2932
+ 'schoolhouse',
2933
+ 'sea cliff',
2934
+ 'server room',
2935
+ 'shed',
2936
+ 'shoe shop',
2937
+ 'shopfront',
2938
+ 'shopping mall indoor',
2939
+ 'shower',
2940
+ 'skatepark',
2941
+ 'ski lodge',
2942
+ 'ski resort',
2943
+ 'ski slope',
2944
+ 'sky',
2945
+ 'skyscraper',
2946
+ 'slum',
2947
+ 'snowfield',
2948
+ 'squash court',
2949
+ 'stable',
2950
+ 'stadium baseball',
2951
+ 'stadium football',
2952
+ 'stage indoor',
2953
+ 'staircase',
2954
+ 'street',
2955
+ 'subway interior',
2956
+ 'subway station platform',
2957
+ 'supermarket',
2958
+ 'sushi bar',
2959
+ 'swamp',
2960
+ 'swimming pool indoor',
2961
+ 'swimming pool outdoor',
2962
+ 'synagogue indoor',
2963
+ 'synagogue outdoor',
2964
+ 'television studio',
2965
+ 'temple east asia',
2966
+ 'temple south asia',
2967
+ 'tennis court indoor',
2968
+ 'tennis court outdoor',
2969
+ 'tent outdoor',
2970
+ 'theater indoor procenium',
2971
+ 'theater indoor seats',
2972
+ 'thriftshop',
2973
+ 'throne room',
2974
+ 'ticket booth',
2975
+ 'toll plaza',
2976
+ 'topiary garden',
2977
+ 'tower',
2978
+ 'toyshop',
2979
+ 'track outdoor',
2980
+ 'train railway',
2981
+ 'train station platform',
2982
+ 'tree farm',
2983
+ 'tree house',
2984
+ 'trench',
2985
+ 'underwater coral reef',
2986
+ 'utility room',
2987
+ 'valley',
2988
+ 'van interior',
2989
+ 'vegetable garden',
2990
+ 'veranda',
2991
+ 'veterinarians office',
2992
+ 'viaduct',
2993
+ 'videostore',
2994
+ 'village',
2995
+ 'vineyard',
2996
+ 'volcano',
2997
+ 'volleyball court indoor',
2998
+ 'volleyball court outdoor',
2999
+ 'waiting room',
3000
+ 'warehouse indoor',
3001
+ 'water tower',
3002
+ 'waterfall block',
3003
+ 'waterfall fan',
3004
+ 'waterfall plunge',
3005
+ 'watering hole',
3006
+ 'wave',
3007
+ 'wet bar',
3008
+ 'wheat field',
3009
+ 'wind farm',
3010
+ 'windmill',
3011
+ 'wine cellar barrel storage',
3012
+ 'wine cellar bottle storage',
3013
+ 'wrestling ring indoor',
3014
+ 'yard',
3015
+ 'youth hostel',
3016
+ ]
3017
+
3018
+ templates = [
3019
+ 'a photo of a {}.',
3020
+ 'a photo of the {}.',
3021
+ ]
3022
+ ```
3023
+
3024
+
3025
+
3026
+ ## StanfordCars
3027
+
3028
+ ```bash
3029
+ classes = [
3030
+ 'AM General Hummer SUV 2000',
3031
+ 'Acura RL Sedan 2012',
3032
+ 'Acura TL Sedan 2012',
3033
+ 'Acura TL Type-S 2008',
3034
+ 'Acura TSX Sedan 2012',
3035
+ 'Acura Integra Type R 2001',
3036
+ 'Acura ZDX Hatchback 2012',
3037
+ 'Aston Martin V8 Vantage Convertible 2012',
3038
+ 'Aston Martin V8 Vantage Coupe 2012',
3039
+ 'Aston Martin Virage Convertible 2012',
3040
+ 'Aston Martin Virage Coupe 2012',
3041
+ 'Audi RS 4 Convertible 2008',
3042
+ 'Audi A5 Coupe 2012',
3043
+ 'Audi TTS Coupe 2012',
3044
+ 'Audi R8 Coupe 2012',
3045
+ 'Audi V8 Sedan 1994',
3046
+ 'Audi 100 Sedan 1994',
3047
+ 'Audi 100 Wagon 1994',
3048
+ 'Audi TT Hatchback 2011',
3049
+ 'Audi S6 Sedan 2011',
3050
+ 'Audi S5 Convertible 2012',
3051
+ 'Audi S5 Coupe 2012',
3052
+ 'Audi S4 Sedan 2012',
3053
+ 'Audi S4 Sedan 2007',
3054
+ 'Audi TT RS Coupe 2012',
3055
+ 'BMW ActiveHybrid 5 Sedan 2012',
3056
+ 'BMW 1 Series Convertible 2012',
3057
+ 'BMW 1 Series Coupe 2012',
3058
+ 'BMW 3 Series Sedan 2012',
3059
+ 'BMW 3 Series Wagon 2012',
3060
+ 'BMW 6 Series Convertible 2007',
3061
+ 'BMW X5 SUV 2007',
3062
+ 'BMW X6 SUV 2012',
3063
+ 'BMW M3 Coupe 2012',
3064
+ 'BMW M5 Sedan 2010',
3065
+ 'BMW M6 Convertible 2010',
3066
+ 'BMW X3 SUV 2012',
3067
+ 'BMW Z4 Convertible 2012',
3068
+ 'Bentley Continental Supersports Conv. Convertible 2012',
3069
+ 'Bentley Arnage Sedan 2009',
3070
+ 'Bentley Mulsanne Sedan 2011',
3071
+ 'Bentley Continental GT Coupe 2012',
3072
+ 'Bentley Continental GT Coupe 2007',
3073
+ 'Bentley Continental Flying Spur Sedan 2007',
3074
+ 'Bugatti Veyron 16.4 Convertible 2009',
3075
+ 'Bugatti Veyron 16.4 Coupe 2009',
3076
+ 'Buick Regal GS 2012',
3077
+ 'Buick Rainier SUV 2007',
3078
+ 'Buick Verano Sedan 2012',
3079
+ 'Buick Enclave SUV 2012',
3080
+ 'Cadillac CTS-V Sedan 2012',
3081
+ 'Cadillac SRX SUV 2012',
3082
+ 'Cadillac Escalade EXT Crew Cab 2007',
3083
+ 'Chevrolet Silverado 1500 Hybrid Crew Cab 2012',
3084
+ 'Chevrolet Corvette Convertible 2012',
3085
+ 'Chevrolet Corvette ZR1 2012',
3086
+ 'Chevrolet Corvette Ron Fellows Edition Z06 2007',
3087
+ 'Chevrolet Traverse SUV 2012',
3088
+ 'Chevrolet Camaro Convertible 2012',
3089
+ 'Chevrolet HHR SS 2010',
3090
+ 'Chevrolet Impala Sedan 2007',
3091
+ 'Chevrolet Tahoe Hybrid SUV 2012',
3092
+ 'Chevrolet Sonic Sedan 2012',
3093
+ 'Chevrolet Express Cargo Van 2007',
3094
+ 'Chevrolet Avalanche Crew Cab 2012',
3095
+ 'Chevrolet Cobalt SS 2010',
3096
+ 'Chevrolet Malibu Hybrid Sedan 2010',
3097
+ 'Chevrolet TrailBlazer SS 2009',
3098
+ 'Chevrolet Silverado 2500HD Regular Cab 2012',
3099
+ 'Chevrolet Silverado 1500 Classic Extended Cab 2007',
3100
+ 'Chevrolet Express Van 2007',
3101
+ 'Chevrolet Monte Carlo Coupe 2007',
3102
+ 'Chevrolet Malibu Sedan 2007',
3103
+ 'Chevrolet Silverado 1500 Extended Cab 2012',
3104
+ 'Chevrolet Silverado 1500 Regular Cab 2012',
3105
+ 'Chrysler Aspen SUV 2009',
3106
+ 'Chrysler Sebring Convertible 2010',
3107
+ 'Chrysler Town and Country Minivan 2012',
3108
+ 'Chrysler 300 SRT-8 2010',
3109
+ 'Chrysler Crossfire Convertible 2008',
3110
+ 'Chrysler PT Cruiser Convertible 2008',
3111
+ 'Daewoo Nubira Wagon 2002',
3112
+ 'Dodge Caliber Wagon 2012',
3113
+ 'Dodge Caliber Wagon 2007',
3114
+ 'Dodge Caravan Minivan 1997',
3115
+ 'Dodge Ram Pickup 3500 Crew Cab 2010',
3116
+ 'Dodge Ram Pickup 3500 Quad Cab 2009',
3117
+ 'Dodge Sprinter Cargo Van 2009',
3118
+ 'Dodge Journey SUV 2012',
3119
+ 'Dodge Dakota Crew Cab 2010',
3120
+ 'Dodge Dakota Club Cab 2007',
3121
+ 'Dodge Magnum Wagon 2008',
3122
+ 'Dodge Challenger SRT8 2011',
3123
+ 'Dodge Durango SUV 2012',
3124
+ 'Dodge Durango SUV 2007',
3125
+ 'Dodge Charger Sedan 2012',
3126
+ 'Dodge Charger SRT-8 2009',
3127
+ 'Eagle Talon Hatchback 1998',
3128
+ 'FIAT 500 Abarth 2012',
3129
+ 'FIAT 500 Convertible 2012',
3130
+ 'Ferrari FF Coupe 2012',
3131
+ 'Ferrari California Convertible 2012',
3132
+ 'Ferrari 458 Italia Convertible 2012',
3133
+ 'Ferrari 458 Italia Coupe 2012',
3134
+ 'Fisker Karma Sedan 2012',
3135
+ 'Ford F-450 Super Duty Crew Cab 2012',
3136
+ 'Ford Mustang Convertible 2007',
3137
+ 'Ford Freestar Minivan 2007',
3138
+ 'Ford Expedition EL SUV 2009',
3139
+ 'Ford Edge SUV 2012',
3140
+ 'Ford Ranger SuperCab 2011',
3141
+ 'Ford GT Coupe 2006',
3142
+ 'Ford F-150 Regular Cab 2012',
3143
+ 'Ford F-150 Regular Cab 2007',
3144
+ 'Ford Focus Sedan 2007',
3145
+ 'Ford E-Series Wagon Van 2012',
3146
+ 'Ford Fiesta Sedan 2012',
3147
+ 'GMC Terrain SUV 2012',
3148
+ 'GMC Savana Van 2012',
3149
+ 'GMC Yukon Hybrid SUV 2012',
3150
+ 'GMC Acadia SUV 2012',
3151
+ 'GMC Canyon Extended Cab 2012',
3152
+ 'Geo Metro Convertible 1993',
3153
+ 'HUMMER H3T Crew Cab 2010',
3154
+ 'HUMMER H2 SUT Crew Cab 2009',
3155
+ 'Honda Odyssey Minivan 2012',
3156
+ 'Honda Odyssey Minivan 2007',
3157
+ 'Honda Accord Coupe 2012',
3158
+ 'Honda Accord Sedan 2012',
3159
+ 'Hyundai Veloster Hatchback 2012',
3160
+ 'Hyundai Santa Fe SUV 2012',
3161
+ 'Hyundai Tucson SUV 2012',
3162
+ 'Hyundai Veracruz SUV 2012',
3163
+ 'Hyundai Sonata Hybrid Sedan 2012',
3164
+ 'Hyundai Elantra Sedan 2007',
3165
+ 'Hyundai Accent Sedan 2012',
3166
+ 'Hyundai Genesis Sedan 2012',
3167
+ 'Hyundai Sonata Sedan 2012',
3168
+ 'Hyundai Elantra Touring Hatchback 2012',
3169
+ 'Hyundai Azera Sedan 2012',
3170
+ 'Infiniti G Coupe IPL 2012',
3171
+ 'Infiniti QX56 SUV 2011',
3172
+ 'Isuzu Ascender SUV 2008',
3173
+ 'Jaguar XK XKR 2012',
3174
+ 'Jeep Patriot SUV 2012',
3175
+ 'Jeep Wrangler SUV 2012',
3176
+ 'Jeep Liberty SUV 2012',
3177
+ 'Jeep Grand Cherokee SUV 2012',
3178
+ 'Jeep Compass SUV 2012',
3179
+ 'Lamborghini Reventon Coupe 2008',
3180
+ 'Lamborghini Aventador Coupe 2012',
3181
+ 'Lamborghini Gallardo LP 570-4 Superleggera 2012',
3182
+ 'Lamborghini Diablo Coupe 2001',
3183
+ 'Land Rover Range Rover SUV 2012',
3184
+ 'Land Rover LR2 SUV 2012',
3185
+ 'Lincoln Town Car Sedan 2011',
3186
+ 'MINI Cooper Roadster Convertible 2012',
3187
+ 'Maybach Landaulet Convertible 2012',
3188
+ 'Mazda Tribute SUV 2011',
3189
+ 'McLaren MP4-12C Coupe 2012',
3190
+ 'Mercedes-Benz 300-Class Convertible 1993',
3191
+ 'Mercedes-Benz C-Class Sedan 2012',
3192
+ 'Mercedes-Benz SL-Class Coupe 2009',
3193
+ 'Mercedes-Benz E-Class Sedan 2012',
3194
+ 'Mercedes-Benz S-Class Sedan 2012',
3195
+ 'Mercedes-Benz Sprinter Van 2012',
3196
+ 'Mitsubishi Lancer Sedan 2012',
3197
+ 'Nissan Leaf Hatchback 2012',
3198
+ 'Nissan NV Passenger Van 2012',
3199
+ 'Nissan Juke Hatchback 2012',
3200
+ 'Nissan 240SX Coupe 1998',
3201
+ 'Plymouth Neon Coupe 1999',
3202
+ 'Porsche Panamera Sedan 2012',
3203
+ 'Ram C/V Cargo Van Minivan 2012',
3204
+ 'Rolls-Royce Phantom Drophead Coupe Convertible 2012',
3205
+ 'Rolls-Royce Ghost Sedan 2012',
3206
+ 'Rolls-Royce Phantom Sedan 2012',
3207
+ 'Scion xD Hatchback 2012',
3208
+ 'Spyker C8 Convertible 2009',
3209
+ 'Spyker C8 Coupe 2009',
3210
+ 'Suzuki Aerio Sedan 2007',
3211
+ 'Suzuki Kizashi Sedan 2012',
3212
+ 'Suzuki SX4 Hatchback 2012',
3213
+ 'Suzuki SX4 Sedan 2012',
3214
+ 'Tesla Model S Sedan 2012',
3215
+ 'Toyota Sequoia SUV 2012',
3216
+ 'Toyota Camry Sedan 2012',
3217
+ 'Toyota Corolla Sedan 2012',
3218
+ 'Toyota 4Runner SUV 2012',
3219
+ 'Volkswagen Golf Hatchback 2012',
3220
+ 'Volkswagen Golf Hatchback 1991',
3221
+ 'Volkswagen Beetle Hatchback 2012',
3222
+ 'Volvo C30 Hatchback 2012',
3223
+ 'Volvo 240 Sedan 1993',
3224
+ 'Volvo XC90 SUV 2007',
3225
+ 'smart fortwo Convertible 2012',
3226
+ ]
3227
+
3228
+ templates = [
3229
+ 'a photo of a {}.',
3230
+ 'a photo of the {}.',
3231
+ 'a photo of my {}.',
3232
+ 'i love my {}!',
3233
+ 'a photo of my dirty {}.',
3234
+ 'a photo of my clean {}.',
3235
+ 'a photo of my new {}.',
3236
+ 'a photo of my old {}.',
3237
+ ]
3238
+ ```
3239
+
3240
+
3241
+
3242
+ ## UCF101
3243
+
3244
+ ```bash
3245
+ classes = [
3246
+ 'Apply Eye Makeup',
3247
+ 'Apply Lipstick',
3248
+ 'Archery',
3249
+ 'Baby Crawling',
3250
+ 'Balance Beam',
3251
+ 'Band Marching',
3252
+ 'Baseball Pitch',
3253
+ 'Basketball',
3254
+ 'Basketball Dunk',
3255
+ 'Bench Press',
3256
+ 'Biking',
3257
+ 'Billiards',
3258
+ 'Blow Dry Hair',
3259
+ 'Blowing Candles',
3260
+ 'Body Weight Squats',
3261
+ 'Bowling',
3262
+ 'Boxing Punching Bag',
3263
+ 'Boxing Speed Bag',
3264
+ 'Breast Stroke',
3265
+ 'Brushing Teeth',
3266
+ 'Clean And Jerk',
3267
+ 'Cliff Diving',
3268
+ 'Cricket Bowling',
3269
+ 'Cricket Shot',
3270
+ 'Cutting In Kitchen',
3271
+ 'Diving',
3272
+ 'Drumming',
3273
+ 'Fencing',
3274
+ 'Field Hockey Penalty',
3275
+ 'Floor Gymnastics',
3276
+ 'Frisbee Catch',
3277
+ 'Front Crawl',
3278
+ 'Golf Swing',
3279
+ 'Haircut',
3280
+ 'Hammer Throw',
3281
+ 'Hammering',
3282
+ 'Hand Stand Pushups',
3283
+ 'Handstand Walking',
3284
+ 'Head Massage',
3285
+ 'High Jump',
3286
+ 'Horse Race',
3287
+ 'Horse Riding',
3288
+ 'Hula Hoop',
3289
+ 'Ice Dancing',
3290
+ 'Javelin Throw',
3291
+ 'Juggling Balls',
3292
+ 'Jump Rope',
3293
+ 'Jumping Jack',
3294
+ 'Kayaking',
3295
+ 'Knitting',
3296
+ 'Long Jump',
3297
+ 'Lunges',
3298
+ 'Military Parade',
3299
+ 'Mixing',
3300
+ 'Mopping Floor',
3301
+ 'Nunchucks',
3302
+ 'Parallel Bars',
3303
+ 'Pizza Tossing',
3304
+ 'Playing Cello',
3305
+ 'Playing Daf',
3306
+ 'Playing Dhol',
3307
+ 'Playing Flute',
3308
+ 'Playing Guitar',
3309
+ 'Playing Piano',
3310
+ 'Playing Sitar',
3311
+ 'Playing Tabla',
3312
+ 'Playing Violin',
3313
+ 'Pole Vault',
3314
+ 'Pommel Horse',
3315
+ 'Pull Ups',
3316
+ 'Punch',
3317
+ 'Push Ups',
3318
+ 'Rafting',
3319
+ 'Rock Climbing Indoor',
3320
+ 'Rope Climbing',
3321
+ 'Rowing',
3322
+ 'Salsa Spin',
3323
+ 'Shaving Beard',
3324
+ 'Shotput',
3325
+ 'Skate Boarding',
3326
+ 'Skiing',
3327
+ 'Skijet',
3328
+ 'Sky Diving',
3329
+ 'Soccer Juggling',
3330
+ 'Soccer Penalty',
3331
+ 'Still Rings',
3332
+ 'Sumo Wrestling',
3333
+ 'Surfing',
3334
+ 'Swing',
3335
+ 'Table Tennis Shot',
3336
+ 'Tai Chi',
3337
+ 'Tennis Swing',
3338
+ 'Throw Discus',
3339
+ 'Trampoline Jumping',
3340
+ 'Typing',
3341
+ 'Uneven Bars',
3342
+ 'Volleyball Spiking',
3343
+ 'Walking With Dog',
3344
+ 'Wall Pushups',
3345
+ 'Writing On Board',
3346
+ 'Yo Yo',
3347
+ ]
3348
+
3349
+ templates = [
3350
+ 'a photo of a person {}.',
3351
+ 'a video of a person {}.',
3352
+ 'a example of a person {}.',
3353
+ 'a demonstration of a person {}.',
3354
+ 'a photo of the person {}.',
3355
+ 'a video of the person {}.',
3356
+ 'a example of the person {}.',
3357
+ 'a demonstration of the person {}.',
3358
+ 'a photo of a person using {}.',
3359
+ 'a video of a person using {}.',
3360
+ 'a example of a person using {}.',
3361
+ 'a demonstration of a person using {}.',
3362
+ 'a photo of the person using {}.',
3363
+ 'a video of the person using {}.',
3364
+ 'a example of the person using {}.',
3365
+ 'a demonstration of the person using {}.',
3366
+ 'a photo of a person doing {}.',
3367
+ 'a video of a person doing {}.',
3368
+ 'a example of a person doing {}.',
3369
+ 'a demonstration of a person doing {}.',
3370
+ 'a photo of the person doing {}.',
3371
+ 'a video of the person doing {}.',
3372
+ 'a example of the person doing {}.',
3373
+ 'a demonstration of the person doing {}.',
3374
+ 'a photo of a person during {}.',
3375
+ 'a video of a person during {}.',
3376
+ 'a example of a person during {}.',
3377
+ 'a demonstration of a person during {}.',
3378
+ 'a photo of the person during {}.',
3379
+ 'a video of the person during {}.',
3380
+ 'a example of the person during {}.',
3381
+ 'a demonstration of the person during {}.',
3382
+ 'a photo of a person performing {}.',
3383
+ 'a video of a person performing {}.',
3384
+ 'a example of a person performing {}.',
3385
+ 'a demonstration of a person performing {}.',
3386
+ 'a photo of the person performing {}.',
3387
+ 'a video of the person performing {}.',
3388
+ 'a example of the person performing {}.',
3389
+ 'a demonstration of the person performing {}.',
3390
+ 'a photo of a person practicing {}.',
3391
+ 'a video of a person practicing {}.',
3392
+ 'a example of a person practicing {}.',
3393
+ 'a demonstration of a person practicing {}.',
3394
+ 'a photo of the person practicing {}.',
3395
+ 'a video of the person practicing {}.',
3396
+ 'a example of the person practicing {}.',
3397
+ 'a demonstration of the person practicing {}.',
3398
+ ]
3399
+ ```
3400
+
3401
+
CLIP/data/rendered-sst2.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The Rendered SST2 Dataset
2
+
3
+ In the paper, we used an image classification dataset called Rendered SST2 to evaluate the model's capability on optical character recognition. To do so, we rendered the sentences in the [Stanford Sentiment Treebank v2](https://nlp.stanford.edu/sentiment/treebank.html) dataset and used those as the input to the CLIP image encoder.
4
+
5
+ The following command will download a 131MB archive containing the images and extract it into a subdirectory `rendered-sst2`:
6
+
7
+ ```bash
8
+ wget https://openaipublic.azureedge.net/clip/data/rendered-sst2.tgz
9
+ tar zxvf rendered-sst2.tgz
10
+ ```
11
+
CLIP/data/yfcc100m.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The YFCC100M Subset
2
+
3
+ In the paper, we performed a dataset ablation using a subset of the YFCC100M dataset and showed that the performance remained largely similar.
4
+
5
+ The subset contains 14,829,396 images, about 15% of the full dataset, which have been filtered to only keep those with natural language titles and/or descriptions in English.
6
+
7
+ We provide the list of (line number, photo identifier, photo hash) of each image contained in this subset. These correspond to the first three columns in the dataset's metadata TSV file.
8
+
9
+ ```bash
10
+ wget https://openaipublic.azureedge.net/clip/data/yfcc100m_subset_data.tsv.bz2
11
+ bunzip2 yfcc100m_subset_data.tsv.bz2
12
+ ```
13
+
14
+ Use of the underlying media files is subject to the Creative Commons licenses chosen by their creators/uploaders. For more information about the YFCC100M dataset, visit [the official website](https://multimediacommons.wordpress.com/yfcc100m-core-dataset/).
CLIP/model-card.md ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Card: CLIP
2
+
3
+ Inspired by [Model Cards for Model Reporting (Mitchell et al.)](https://arxiv.org/abs/1810.03993) and [Lessons from Archives (Jo & Gebru)](https://arxiv.org/pdf/1912.10389.pdf), we’re providing some accompanying information about the multimodal model.
4
+
5
+ ## Model Details
6
+
7
+ The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within.
8
+
9
+ ### Model Date
10
+
11
+ January 2021
12
+
13
+ ### Model Type
14
+
15
+ The base model uses a ResNet50 with several modifications as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. There is also a variant of the model where the ResNet image encoder is replaced with a Vision Transformer.
16
+
17
+ ### Model Versions
18
+
19
+ Initially, we’ve released one CLIP model based on the Vision Transformer architecture equivalent to ViT-B/32, along with the RN50 model, using the architecture equivalent to ResNet-50.
20
+
21
+ As part of the staged release process, we have also released the RN101 model, as well as RN50x4, a RN50 scaled up 4x according to the [EfficientNet](https://arxiv.org/abs/1905.11946) scaling rule. In July 2021, we additionally released the RN50x16 and ViT-B/16 models, and in January 2022, the RN50x64 and ViT-L/14 models were released.
22
+
23
+ Please see the paper linked below for further details about their specification.
24
+
25
+ ### Documents
26
+
27
+ - [Blog Post](https://openai.com/blog/clip/)
28
+ - [CLIP Paper](https://arxiv.org/abs/2103.00020)
29
+
30
+
31
+
32
+ ## Model Use
33
+
34
+ ### Intended Use
35
+
36
+ The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis.
37
+
38
+ #### Primary intended uses
39
+
40
+ The primary intended users of these models are AI researchers.
41
+
42
+ We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models.
43
+
44
+ ### Out-of-Scope Use Cases
45
+
46
+ **Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases, such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task-specific testing, especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful.
47
+
48
+ Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use.
49
+
50
+ Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases.
51
+
52
+
53
+
54
+ ## Data
55
+
56
+ The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users.
57
+
58
+ ### Data Mission Statement
59
+
60
+ Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset.
61
+
62
+
63
+
64
+ ## Performance and Limitations
65
+
66
+ ### Performance
67
+
68
+ We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets, with tasks ranging from OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets:
69
+
70
+ - Food101
71
+ - CIFAR10
72
+ - CIFAR100
73
+ - Birdsnap
74
+ - SUN397
75
+ - Stanford Cars
76
+ - FGVC Aircraft
77
+ - VOC2007
78
+ - DTD
79
+ - Oxford-IIIT Pet dataset
80
+ - Caltech101
81
+ - Flowers102
82
+ - MNIST
83
+ - SVHN
84
+ - IIIT5K
85
+ - Hateful Memes
86
+ - SST-2
87
+ - UCF101
88
+ - Kinetics700
89
+ - Country211
90
+ - CLEVR Counting
91
+ - KITTI Distance
92
+ - STL-10
93
+ - RareAct
94
+ - Flickr30
95
+ - MSCOCO
96
+ - ImageNet
97
+ - ImageNet-A
98
+ - ImageNet-R
99
+ - ImageNet Sketch
100
+ - ObjectNet (ImageNet Overlap)
101
+ - Youtube-BB
102
+ - ImageNet-Vid
103
+
104
+ ## Limitations
105
+
106
+ CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine-grained classification and counting objects. CLIP also poses issues with regard to fairness and bias, which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation: in many cases we have used linear probes to evaluate the performance of CLIP, and there is evidence suggesting that linear probes can underestimate model performance.
107
+
108
+ ### Bias and Fairness
109
+
110
+ We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper).
111
+
112
+ We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks.
113
+
114
+
115
+
116
+ ## Feedback
117
+
118
+ ### Where to send questions or comments about the model
119
+
120
+ Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9)
CLIP/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ftfy
2
+ regex
3
+ tqdm
4
+ torch
5
+ torchvision
CLIP/setup.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pkg_resources
4
+ from setuptools import setup, find_packages
5
+
6
+ setup(
7
+ name="clip",
8
+ py_modules=["clip"],
9
+ version="1.0",
10
+ description="",
11
+ author="OpenAI",
12
+ packages=find_packages(exclude=["tests*"]),
13
+ install_requires=[
14
+ str(r)
15
+ for r in pkg_resources.parse_requirements(
16
+ open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
17
+ )
18
+ ],
19
+ include_package_data=True,
20
+ extras_require={'dev': ['pytest']},
21
+ )
CLIP/tests/test_consistency.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pytest
3
+ import torch
4
+ from PIL import Image
5
+
6
+ import clip
7
+
8
+
9
+ @pytest.mark.parametrize('model_name', clip.available_models())
10
+ def test_consistency(model_name):
11
+ device = "cpu"
12
+ jit_model, transform = clip.load(model_name, device=device, jit=True)
13
+ py_model, _ = clip.load(model_name, device=device, jit=False)
14
+
15
+ image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device)
16
+ text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)
17
+
18
+ with torch.no_grad():
19
+ logits_per_image, _ = jit_model(image, text)
20
+ jit_probs = logits_per_image.softmax(dim=-1).cpu().numpy()
21
+
22
+ logits_per_image, _ = py_model(image, text)
23
+ py_probs = logits_per_image.softmax(dim=-1).cpu().numpy()
24
+
25
+ assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1)
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Omer Bar-Tal, Dolev Ofri-Amar, Rafail Fridman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,86 @@
1
- ---
2
- title: Text2live
3
- emoji: 🚀
4
- colorFrom: yellow
5
- colorTo: green
6
- sdk: streamlit
7
- sdk_version: 1.17.0
8
- app_file: app.py
9
- pinned: false
10
- license: unknown
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text2LIVE: Text-Driven Layered Image and Video Editing (ECCV 2022 - Oral)
2
+ ## [<a href="https://text2live.github.io/" target="_blank">Project Page</a>]
3
+
4
+ [![arXiv](https://img.shields.io/badge/arXiv-Text2LIVE-b31b1b.svg)](https://arxiv.org/abs/2204.02491)
5
+ ![Pytorch](https://img.shields.io/badge/PyTorch->=1.10.0-Red?logo=pytorch)
6
+ [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/weizmannscience/text2live)
7
+
8
+ ![teaser](https://user-images.githubusercontent.com/22198039/179798581-ca6f6652-600a-400a-b21b-713fc5c15d56.png)
9
+
10
+ **Text2LIVE** is a method for text-driven editing of real-world images and videos, as described in <a href="https://arxiv.org/abs/2204.02491" target="_blank">(link to paper)</a>.
11
+
12
+ [//]: # (. It can be used for localized and global edits that change the texture of existing objects or augment the scene with semi-transparent effects &#40;e.g. smoke, fire, snow&#41;.)
13
+
14
+ [//]: # (### Abstract)
15
+ >We present a method for zero-shot, text-driven appearance manipulation in natural images and videos. Specifically, given an input image or video and a target text prompt, our goal is to edit the appearance of existing objects (e.g., object's texture) or augment the scene with new visual effects (e.g., smoke, fire) in a semantically meaningful manner. Our framework trains a generator using an internal dataset of training examples, extracted from a single input (image or video and target text prompt), while leveraging an external pre-trained CLIP model to establish our losses. Rather than directly generating the edited output, our key idea is to generate an edit layer (color+opacity) that is composited over the original input. This allows us to constrain the generation process and maintain high fidelity to the original input via novel text-driven losses that are applied directly to the edit layer. Our method neither relies on a pre-trained generator nor requires user-provided edit masks. Thus, it can perform localized, semantic edits on high-resolution natural images and videos across a variety of objects and scenes.
16
+
17
+
18
+ ## Getting Started
19
+ ### Installation
20
+
21
+ ```
22
+ git clone https://github.com/omerbt/Text2LIVE.git
23
+ conda create --name text2live python=3.9
24
+ conda activate text2live
25
+ pip install -r requirements.txt
26
+ ```
27
+
28
+ ### Download sample images and videos
29
+ Download sample images and videos from the DAVIS dataset:
30
+ ```
31
+ cd Text2LIVE
32
+ gdown "https://drive.google.com/uc?id=1osN4PlPkY9uk6pFqJZo8lhJUjTIpa80J&export=download"
33
+ unzip data.zip
34
+ ```
35
+ It will create a folder `data`:
36
+ ```
37
+ Text2LIVE
38
+ ├── ...
39
+ ├── data
40
+ │ ├── pretrained_nla_models # NLA models are stored here
41
+ │ ├── images # sample images
42
+ │ └── videos # sample videos from DAVIS dataset
43
+ │ ├── car-turn # contains video frames
44
+ │ ├── ...
45
+ └── ...
46
+ ```
47
+ To enforce temporal consistency in video edits, we utilize Neural Layered Atlases (NLA). Pretrained NLA models are taken from <a href="https://layered-neural-atlases.github.io">here</a>, and are already inside the `data` folder.
48
+
49
+ ### Run examples
50
+ * Our method is designed to change textures of existing objects / augment the scene with semi-transparent effects (e.g., smoke, fire). It is not designed for adding new objects or significantly deviating from the original spatial layout.
51
+ * Training **Text2LIVE** multiple times with the same inputs can lead to slightly different results.
52
+ * CLIP sometimes exhibits bias towards specific solutions (see figure 9 in the paper), thus slightly different text prompts may lead to different flavors of edits.
53
+
54
+
55
+ The required GPU memory depends on the input image/video size, but you should be good with a Tesla V100 32GB :).
56
+ Currently mixed precision introduces some instability in the training process, but it could be added later.
57
+
58
+ #### Video Editing
59
+ Run the following command to start training:
60
+ ```
61
+ python train_video.py --example_config car-turn_winter.yaml
62
+ ```
63
+ #### Image Editing
64
+ Run the following command to start training:
65
+ ```
66
+ python train_image.py --example_config golden_horse.yaml
67
+ ```
68
+ Intermediate results will be saved to `results` during optimization. The frequency of saving intermediate results is indicated in the `log_images_freq` flag of the configuration.
69
+
70
+ ## Sample Results
71
+ https://user-images.githubusercontent.com/22198039/179797381-983e0453-2e5d-40e8-983d-578217b358e4.mov
72
+
73
+ For more results, see the [supplementary material](https://text2live.github.io/sm/index.html).
74
+
75
+
76
+ ## Citation
77
+ ```
78
+ @inproceedings{bar2022text2live,
79
+ title={Text2live: Text-driven layered image and video editing},
80
+ author={Bar-Tal, Omer and Ofri-Amar, Dolev and Fridman, Rafail and Kasten, Yoni and Dekel, Tali},
81
+ booktitle={European Conference on Computer Vision},
82
+ pages={707--723},
83
+ year={2022},
84
+ organization={Springer}
85
+ }
86
+ ```
Text2LIVE-main/CLIP/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (158 Bytes). View file
 
Text2LIVE-main/CLIP/clip/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (184 Bytes). View file
 
Text2LIVE-main/CLIP/clip/__pycache__/clip.cpython-37.pyc ADDED
Binary file (8.28 kB). View file
 
Text2LIVE-main/CLIP/clip/__pycache__/model.cpython-37.pyc ADDED
Binary file (16.6 kB). View file
 
Text2LIVE-main/CLIP/clip/__pycache__/simple_tokenizer.cpython-37.pyc ADDED
Binary file (5.78 kB). View file
 
Text2LIVE-main/CLIP/clip_explainability/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (199 Bytes). View file
 
Text2LIVE-main/CLIP/clip_explainability/__pycache__/auxilary.cpython-37.pyc ADDED
Binary file (12 kB). View file
 
Text2LIVE-main/CLIP/clip_explainability/__pycache__/clip.cpython-37.pyc ADDED
Binary file (7.59 kB). View file
 
Text2LIVE-main/CLIP/clip_explainability/__pycache__/model.cpython-37.pyc ADDED
Binary file (15.4 kB). View file
 
Text2LIVE-main/CLIP/clip_explainability/__pycache__/simple_tokenizer.cpython-37.pyc ADDED
Binary file (5.79 kB). View file
 
Text2LIVE-main/README.md CHANGED
@@ -75,12 +75,10 @@ For more see the [supplementary material](https://text2live.github.io/sm/index.h
75
 
76
  ## Citation
77
  ```
78
- @inproceedings{bar2022text2live,
79
- title={Text2live: Text-driven layered image and video editing},
80
- author={Bar-Tal, Omer and Ofri-Amar, Dolev and Fridman, Rafail and Kasten, Yoni and Dekel, Tali},
81
- booktitle={European Conference on Computer Vision},
82
- pages={707--723},
83
- year={2022},
84
- organization={Springer}
85
  }
86
  ```
 
75
 
76
  ## Citation
77
  ```
78
+ @article{bar2022text2live,
79
+ title = {Text2LIVE: Text-Driven Layered Image and Video Editing},
80
+ author = {Bar-Tal, Omer and Ofri-Amar, Dolev and Fridman, Rafail and Kasten, Yoni and Dekel, Tali},
81
+ journal = {arXiv preprint arXiv:2204.02491},
82
+ year = {2022}
 
 
83
  }
84
  ```
Text2LIVE-main/data/data/images/Thumbs.db ADDED
Binary file (13.8 kB). View file
 
Text2LIVE-main/data/data/images/cake.jpeg ADDED
Text2LIVE-main/data/data/images/horse.jpg ADDED
Text2LIVE-main/data/data/pretrained_nla_models/blackswan/checkpoint ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f50895f39815de243cb8166001771260d9720e6d1bda6289088a0366c7c70f2
3
+ size 14657387
Text2LIVE-main/data/data/pretrained_nla_models/car-turn/checkpoint ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:973953ed6f0f742df9ab3fd21e7369db541689c40a8cd22ddb12f912c2e84b95
3
+ size 14657387
Text2LIVE-main/data/data/pretrained_nla_models/libby/checkpoint ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2095f38eacee144175b08fdaaffd52e97991c08f0825be0d8cf836a5297ae535
3
+ size 14657387
Text2LIVE-main/data/data/videos/blackswan/00000.jpg ADDED
Text2LIVE-main/data/data/videos/blackswan/00001.jpg ADDED
Text2LIVE-main/data/data/videos/blackswan/00002.jpg ADDED
Text2LIVE-main/data/data/videos/blackswan/00003.jpg ADDED
Text2LIVE-main/data/data/videos/blackswan/00004.jpg ADDED