| |
| ''' |
| This script performs kNN search on inferred image and text features (on a single GPU) and outputs the image-to-text retrieval prediction file for evaluation. |
| ''' |
|
|
| import argparse |
| import numpy |
| from tqdm import tqdm |
| import json |
|
|
| import numpy as np |
| import torch |
|
|
def parse_args(argv=None):
    """Parse the command-line options of the image-to-text retrieval script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what the script did before this parameter existed.

    Returns:
        argparse.Namespace: with the attributes ``image_feats``,
        ``text_feats``, ``top_k``, ``eval_batch_size`` and ``output``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--image-feats',
        type=str,
        required=True,
        help="Specify the path of image features."
    )
    parser.add_argument(
        '--text-feats',
        type=str,
        required=True,
        help="Specify the path of text features."
    )
    parser.add_argument(
        '--top-k',
        type=int,
        default=10,
        help="Specify the k value of top-k predictions."
    )
    parser.add_argument(
        '--eval-batch-size',
        type=int,
        default=32768,
        # The script slices the *text* features by this size, and the old
        # help text advertised a default (8192) that did not match the real
        # one; %(default)s keeps the message in sync with `default=`.
        help="Specify the text-side batch size when computing the inner products (default: %(default)s)."
    )
    parser.add_argument(
        '--output',
        type=str,
        required=True,
        help="Specify the output jsonl prediction filepath."
    )
    return parser.parse_args(argv)
|
|
def iter_jsonl(lines):
    """Yield the decoded JSON object of every non-blank line in *lines*.

    Blank (whitespace-only) lines are skipped so that a trailing newline or
    an empty line in a JSONL file does not make ``json.loads`` fail.
    """
    for raw_line in lines:
        stripped = raw_line.strip()
        if stripped:
            yield json.loads(stripped)


def load_text_features(lines):
    """Collect text ids and features from an iterable of JSONL lines.

    Each non-blank line must decode to an object with the keys ``text_id``
    and ``feature``.

    Returns:
        tuple: ``(text_ids, text_feats_array)`` where ``text_ids`` is a list
        in input order and ``text_feats_array`` is a ``float32`` numpy array
        built from the ``feature`` values in the same order.
    """
    text_ids = []
    text_feats = []
    for obj in iter_jsonl(lines):
        text_ids.append(obj['text_id'])
        text_feats.append(obj['feature'])
    return text_ids, np.array(text_feats, dtype=np.float32)


def top_k_text_ids(image_feat, text_ids, text_feats_array, top_k, batch_size, device=None):
    """Return the ids of the ``top_k`` texts scoring highest against one image.

    The score of a text is the inner product between ``image_feat`` and the
    matching row of ``text_feats_array``. Rows are scored ``batch_size`` at a
    time so that only one slice of the text features is on ``device`` at once.

    Args:
        image_feat: Feature vector of a single image (a list of floats).
        text_ids: Text ids, aligned with the rows of ``text_feats_array``.
        text_feats_array: ``float32`` numpy array of text features.
        top_k: Number of text ids to return.
        batch_size: Number of text rows scored per matrix product.
        device: torch device to compute on. Defaults to CUDA when it is
            available and to the CPU otherwise.

    Returns:
        list: Text ids sorted by descending score. Texts with equal scores
        keep their input order because the sort is stable.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A non-positive step would never advance through the text features.
        raise ValueError("eval batch size must be a positive integer, got {}".format(batch_size))
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    image_feat_tensor = torch.tensor([image_feat], dtype=torch.float, device=device)
    score_tuples = []
    for start in range(0, len(text_ids), batch_size):
        stop = start + batch_size  # slicing clamps to the end of the data
        text_feats_tensor = torch.from_numpy(text_feats_array[start:stop]).to(device)
        batch_scores = image_feat_tensor @ text_feats_tensor.t()
        score_tuples.extend(zip(text_ids[start:stop], batch_scores.squeeze(0).tolist()))

    ranked = sorted(score_tuples, key=lambda pair: pair[1], reverse=True)
    return [text_id for text_id, _ in ranked[:top_k]]


def main():
    """Run the retrieval: load text features, rank them per image, write JSONL."""
    args = parse_args()

    print("Params:")
    for name in sorted(vars(args)):
        val = getattr(args, name)
        print(f" {name}: {val}")

    # Fall back to the CPU instead of crashing when no GPU is present.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print("Begin to load text features...")
    with open(args.text_feats, "r", encoding="utf-8") as fin:
        text_ids, text_feats_array = load_text_features(tqdm(fin))
    print("Finished loading text features.")

    print("Begin to compute top-{} predictions for images...".format(args.top_k))
    with open(args.output, "w", encoding="utf-8") as fout, \
            open(args.image_feats, "r", encoding="utf-8") as fin:
        for obj in iter_jsonl(tqdm(fin)):
            image_id = obj['image_id']
            predicted_ids = top_k_text_ids(
                obj['feature'],
                text_ids,
                text_feats_array,
                args.top_k,
                args.eval_batch_size,
                device,
            )
            fout.write("{}\n".format(json.dumps({"image_id": image_id, "text_ids": predicted_ids})))

    print("Top-{} predictions are saved in {}".format(args.top_k, args.output))
    print("Done!")


if __name__ == "__main__":
    main()
|
|