Sohan Anisetty committed on
Commit
de972ee
1 Parent(s): e143cff

added files

Browse files
Files changed (6) hide show
  1. README.md +50 -0
  2. config.json +52 -0
  3. generation_config.json +10 -0
  4. merges.txt +0 -0
  5. pytorch_model.bin +3 -0
  6. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ # OFA-tiny
6
+
7
+ ## Introduction
8
+ This is the **tiny** version of the OFA pretrained model, fine-tuned on CLEVR and a custom block stack dataset.
9
+
10
+ The directory includes 4 files, namely `config.json`, which contains the model configuration, `vocab.json` and `merges.txt` for our OFA tokenizer, and lastly `pytorch_model.bin`, which contains the model weights.
11
+
12
+
13
+ ## How to use
14
+ Download the models as shown below.
15
+ ```bash
16
+ git clone https://github.com/sohananisetty/OFA_VQA.git
17
+ git clone https://huggingface.co/SohanAnisetty/ofa-vqa-base
18
+ ```
19
+
20
+ Afterwards, set `ckpt_dir` to the path of the cloned `ofa-vqa-base` directory, and prepare an image for the test example below.
21
+
22
+ ```python
23
+ from PIL import Image
24
+ from torchvision import transforms
25
+ from transformers import OFATokenizer, OFAModelForVQA
26
+
27
+ mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
28
+ resolution = 480
29
+ patch_resize_transform = transforms.Compose([
30
+ lambda image: image.convert("RGB"),
31
+ transforms.Resize((resolution, resolution), interpolation=Image.BICUBIC),
32
+ transforms.ToTensor(),
33
+ transforms.Normalize(mean=mean, std=std)
34
+ ])
35
+
36
+
37
+ tokenizer = OFATokenizer.from_pretrained(ckpt_dir)
38
+
39
+ txt = " what does the image describe?"
40
+ inputs = tokenizer([txt], return_tensors="pt").input_ids
41
+ inputs = inputs.cuda()
42
+ img = Image.open(path_to_image)
43
+ patch_img = patch_resize_transform(img).unsqueeze(0).cuda()
44
+
45
+
46
+ model = OFAModelForVQA.from_pretrained(ckpt_dir, use_cache=False).cuda()
47
+ gen = model.generate(inputs, patch_images=patch_img, num_beams=5, no_repeat_ngram_size=3)
48
+
49
+ print(tokenizer.batch_decode(gen, skip_special_tokens=True))
50
+ ```
config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "add_type_embedding": true,
5
+ "architectures": [
6
+ "OFAModelForVQA"
7
+ ],
8
+ "attention_dropout": 0.0,
9
+ "attn_scale_factor": 2.0,
10
+ "bos_token_id": 0,
11
+ "classifier_dropout": 0.0,
12
+ "code_image_size": 128,
13
+ "code_layernorm_embedding": true,
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_drop_path_rate": 0.0,
17
+ "decoder_ffn_dim": 3072,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 6,
20
+ "decoder_normalize_before": true,
21
+ "decoder_start_token_id": 0,
22
+ "dropout": 0.1,
23
+ "encoder_attention_heads": 12,
24
+ "encoder_drop_path_rate": 0.0,
25
+ "encoder_ffn_dim": 3072,
26
+ "encoder_layerdrop": 0.0,
27
+ "encoder_layers": 6,
28
+ "encoder_normalize_before": true,
29
+ "entangle_position_embedding": false,
30
+ "eos_token_id": 2,
31
+ "forced_eos_token_id": 2,
32
+ "image_bucket_size": 42,
33
+ "init_std": 0.02,
34
+ "is_encoder_decoder": true,
35
+ "layernorm_embedding": true,
36
+ "max_position_embeddings": 1024,
37
+ "model_type": "ofa",
38
+ "normformer": true,
39
+ "num_hidden_layers": 6,
40
+ "pad_token_id": 1,
41
+ "patch_layernorm_embedding": true,
42
+ "resnet_drop_path_rate": 0.0,
43
+ "resnet_model_path": null,
44
+ "resnet_type": "resnet101",
45
+ "scale_embedding": false,
46
+ "share_decoder_input_output_embed": true,
47
+ "token_bucket_size": 256,
48
+ "torch_dtype": "float32",
49
+ "transformers_version": "4.26.1",
50
+ "use_cache": false,
51
+ "vocab_size": 59457
52
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 0,
5
+ "eos_token_id": 2,
6
+ "forced_eos_token_id": 2,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.26.1",
9
+ "use_cache": false
10
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3b36e31e3670185941cd270e0d7022ed80ab00f52fe9d4b245966e82e603353
3
+ size 796223833
vocab.json ADDED
The diff for this file is too large to render. See raw diff