# -*- coding: utf-8 -*-
'''
This script extracts image and text features for evaluation (single-GPU only).
'''
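
# Example invocation (a sketch: the script filename and checkpoint path below are
# placeholders; the data paths simply repeat the defaults defined in parse_args()):
#
#   python extract_features.py \
#       --extract-image-feats --extract-text-feats \
#       --image-data ../Multimodal_Retrieval/lmdb/test/imgs \
#       --text-data ../Multimodal_Retrieval/test_texts.jsonl \
#       --resume path/to/checkpoint.pt \
#       --vision-model ViT-B-16 --text-model RoBERTa-wwm-ext-base-chinese
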
import os
import argparse
import logging
from pathlib import Path
import json
import torch
from tqdm import tqdm
from clip.model import convert_weights, CLIP
from eval.data import get_eval_img_dataset, get_eval_txt_dataset


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--extract-image-feats',
        action="store_true",
        default=False,
        help="Whether to extract image features."
    )
    parser.add_argument(
        '--extract-text-feats',
        action="store_true",
        default=False,
        help="Whether to extract text features."
    )
    parser.add_argument(
        '--image-data',
        type=str,
        default="../Multimodal_Retrieval/lmdb/test/imgs",
        help="If --extract-image-feats is True, specify the path of the LMDB directory storing input image base64 strings."
    )
    parser.add_argument(
        '--text-data',
        type=str,
        default="../Multimodal_Retrieval/test_texts.jsonl",
        help="If --extract-text-feats is True, specify the path of the input text JSONL file."
    )
    parser.add_argument(
        '--image-feat-output-path',
        type=str,
        default=None,
        help="If --extract-image-feats is True, specify the output path of the image features."
    )
    parser.add_argument(
        '--text-feat-output-path',
        type=str,
        default=None,
        help="If --extract-text-feats is True, specify the output path of the text features."
    )
    parser.add_argument(
        "--img-batch-size", type=int, default=64, help="Image batch size."
    )
    parser.add_argument(
        "--text-batch-size", type=int, default=64, help="Text batch size."
    )
    parser.add_argument(
        "--context-length", type=int, default=64, help="The maximum length of input text (including [CLS] & [SEP] tokens)."
    )
    parser.add_argument(
        "--resume",
        default=None,
        type=str,
        help="Path to the latest checkpoint (default: none).",
    )
    parser.add_argument(
        "--precision",
        choices=["amp", "fp16", "fp32"],
        default="amp",
        help="Floating point precision."
    )
    parser.add_argument(
        "--vision-model",
        choices=["ViT-B-16", "ViT-L-14", "RN50"],
        default="ViT-B-16",
        help="Name of the vision backbone to use.",
    )
    parser.add_argument(
        "--text-model",
        choices=["RoBERTa-wwm-ext-base-chinese", "RoBERTa-wwm-ext-large-chinese", "RBT3-chinese"],
        default="RoBERTa-wwm-ext-base-chinese",
        help="Name of the text backbone to use.",
    )
    parser.add_argument(
        "--debug",
        default=False,
        action="store_true",
        help="If true, more information is logged."
    )
    args = parser.parse_args()
    return args


# Adapted from https://github.com/openai/CLIP/issues/83: cast all model parameters
# (and their gradients, if present) back to fp32.
def convert_models_to_fp32(model):
    for p in model.parameters():
        p.data = p.data.float()
        if p.grad is not None:
            p.grad.data = p.grad.data.float()


if __name__ == "__main__":
    args = parse_args()

    assert args.extract_image_feats or args.extract_text_feats, "--extract-image-feats and --extract-text-feats cannot both be False!"

    # Log params.
    print("Params:")
    for name in sorted(vars(args)):
        val = getattr(args, name)
        print(f"  {name}: {val}")

    args.gpu = 0
    torch.cuda.set_device(args.gpu)

    # Initialize the model.
    vision_model_config_file = Path(__file__).parent.parent.parent / f"clip/model_configs/{args.vision_model.replace('/', '-')}.json"
    print('Loading vision model config from', vision_model_config_file)
    assert os.path.exists(vision_model_config_file)

    text_model_config_file = Path(__file__).parent.parent.parent / f"clip/model_configs/{args.text_model.replace('/', '-')}.json"
    print('Loading text model config from', text_model_config_file)
    assert os.path.exists(text_model_config_file)

    with open(vision_model_config_file, 'r') as fv, open(text_model_config_file, 'r') as ft:
        model_info = json.load(fv)
        if isinstance(model_info['vision_layers'], str):
            model_info['vision_layers'] = eval(model_info['vision_layers'])
        for k, v in json.load(ft).items():
            model_info[k] = v

    model = CLIP(**model_info)
    convert_weights(model)
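
    # convert_weights() casts eligible layers to fp16. For "amp"/"fp32" precision the
    # parameters are cast back to fp32 below; for "fp16" they are kept in half precision.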
    # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372
    if args.precision == "amp" or args.precision == "fp32":
        convert_models_to_fp32(model)
    model.cuda(args.gpu)
    if args.precision == "fp16":
        convert_weights(model)

    # Get data.
    if args.extract_image_feats:
        print("Preparing image inference dataset.")
        img_data = get_eval_img_dataset(args)
    if args.extract_text_feats:
        print("Preparing text inference dataset.")
        text_data = get_eval_txt_dataset(args, max_txt_length=args.context_length)

    # Resume from a checkpoint.
    print("Loading model checkpoint from {}.".format(args.resume))
    assert os.path.exists(args.resume), "The checkpoint file {} does not exist!".format(args.resume)
    # The checkpoint is loaded on CPU; load_state_dict then copies the weights onto the GPU-resident model.
    checkpoint = torch.load(args.resume, map_location='cpu')
    start_epoch = checkpoint["epoch"]
    sd = checkpoint["state_dict"]
    if next(iter(sd.items()))[0].startswith('module'):
        sd = {k[len('module.'):]: v for k, v in sd.items() if "bert.pooler" not in k}
    model.load_state_dict(sd)
    print(
        f"=> loaded checkpoint '{args.resume}' (epoch {checkpoint['epoch']} @ {checkpoint['step']} steps)"
    )
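
    # Both extraction loops below write JSON Lines output: one JSON object per line,
    # e.g. {"text_id": <int>, "feature": [<float>, ...]} for texts and
    # {"image_id": <int>, "feature": [<float>, ...]} for images, with features
    # L2-normalized before being written.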

    # Make inference for texts.
    if args.extract_text_feats:
        print('Making inference for texts...')
        if args.text_feat_output_path is None:
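            # [:-6] strips the ".jsonl" suffix; with the default --text-data this resolves to
            # ../Multimodal_Retrieval/test_texts.txt_feat.jsonl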
            args.text_feat_output_path = "{}.txt_feat.jsonl".format(args.text_data[:-6])

        write_cnt = 0
        with open(args.text_feat_output_path, "w") as fout:
            model.eval()
            dataloader = text_data.dataloader
            with torch.no_grad():
                for batch in tqdm(dataloader):
                    text_ids, texts = batch
                    texts = texts.cuda(args.gpu, non_blocking=True)
                    text_features = model(None, texts)
                    text_features /= text_features.norm(dim=-1, keepdim=True)
                    for text_id, text_feature in zip(text_ids.tolist(), text_features.tolist()):
                        fout.write("{}\n".format(json.dumps({"text_id": text_id, "feature": text_feature})))
                        write_cnt += 1
        print('{} text features are stored in {}'.format(write_cnt, args.text_feat_output_path))

    # Make inference for images.
    if args.extract_image_feats:
        print('Making inference for images...')
        if args.image_feat_output_path is None:
            # By default, the image features are stored in the same directory as the text features.
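            # With the default --text-data, this resolves to ../Multimodal_Retrieval/test_imgs.img_feat.jsonl.
            # Note that the path is derived from --text-data even when only image features are extracted.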
            args.image_feat_output_path = "{}.img_feat.jsonl".format(args.text_data.replace("_texts.jsonl", "_imgs"))

        write_cnt = 0
        with open(args.image_feat_output_path, "w") as fout:
            model.eval()
            dataloader = img_data.dataloader
            with torch.no_grad():
                for batch in tqdm(dataloader):
                    image_ids, images = batch
                    images = images.cuda(args.gpu, non_blocking=True)
                    image_features = model(images, None)
                    image_features /= image_features.norm(dim=-1, keepdim=True)
                    for image_id, image_feature in zip(image_ids.tolist(), image_features.tolist()):
                        fout.write("{}\n".format(json.dumps({"image_id": image_id, "feature": image_feature})))
                        write_cnt += 1
        print('{} image features are stored in {}'.format(write_cnt, args.image_feat_output_path))

    print("Done!")