File size: 3,199 Bytes
f76d30f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# -*- coding: utf-8 -*-
'''
This script performs kNN search on inferred image and text features (on a single GPU) and outputs an image-to-text retrieval prediction file for evaluation.
'''
import argparse
import numpy
from tqdm import tqdm
import json
import numpy as np
import torch
def parse_args(argv=None):
    """Parse command-line arguments for the kNN retrieval script.

    Args:
        argv: Optional list of argument strings. Defaults to None, in which
            case ``sys.argv[1:]`` is used (backward compatible with the
            original zero-argument call).

    Returns:
        argparse.Namespace with attributes: image_feats, text_feats,
        top_k, eval_batch_size, output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image-feats',
        type=str,
        required=True,
        help="Specify the path of image features."
    )
    parser.add_argument(
        '--text-feats',
        type=str,
        required=True,
        help="Specify the path of text features."
    )
    parser.add_argument(
        '--top-k',
        type=int,
        default=10,
        help="Specify the k value of top-k predictions."
    )
    parser.add_argument(
        '--eval-batch-size',
        type=int,
        default=32768,
        # NOTE: the original help text said "default to 8192", which
        # contradicted the actual default of 32768; fixed here.
        help="Specify the image-side batch size when computing the inner products, default to 32768"
    )
    parser.add_argument(
        '--output',
        type=str,
        required=True,
        help="Specify the output jsonl prediction filepath."
    )
    return parser.parse_args(argv)
if __name__ == "__main__":
args = parse_args()
# Log params.
print("Params:")
for name in sorted(vars(args)):
val = getattr(args, name)
print(f" {name}: {val}")
print("Begin to load text features...")
text_ids = []
text_feats = []
with open(args.text_feats, "r") as fin:
for line in tqdm(fin):
obj = json.loads(line.strip())
text_ids.append(obj['text_id'])
text_feats.append(obj['feature'])
text_feats_array = np.array(text_feats, dtype=np.float32)
print("Finished loading text features.")
print("Begin to compute top-{} predictions for images...".format(args.top_k))
with open(args.output, "w") as fout:
with open(args.image_feats, "r") as fin:
for line in tqdm(fin):
obj = json.loads(line.strip())
image_id = obj['image_id']
image_feat = obj['feature']
score_tuples = []
image_feat_tensor = torch.tensor([image_feat], dtype=torch.float).cuda() # [1, feature_dim]
idx = 0
while idx < len(text_ids):
text_feats_tensor = torch.from_numpy(text_feats_array[idx : min(idx + args.eval_batch_size, len(text_ids))]).cuda() # [batch_size, feature_dim]
batch_scores = image_feat_tensor @ text_feats_tensor.t() # [1, batch_size]
for text_id, score in zip(text_ids[idx : min(idx + args.eval_batch_size, len(text_ids))], batch_scores.squeeze(0).tolist()):
score_tuples.append((text_id, score))
idx += args.eval_batch_size
top_k_predictions = sorted(score_tuples, key=lambda x:x[1], reverse=True)[:args.top_k]
fout.write("{}\n".format(json.dumps({"image_id": image_id, "text_ids": [entry[0] for entry in top_k_predictions]})))
print("Top-{} predictions are saved in {}".format(args.top_k, args.output))
print("Done!")
|