| import jsonlines |
| import argparse |
| import pandas as pd |
| from tqdm import tqdm |
|
|
# Command-line interface: three required file paths.
# When the module defines no docstring, ``__doc__`` is None and --help would
# show no description at all, so fall back to an explicit one-line summary.
parser = argparse.ArgumentParser(
    description=__doc__ or 'Replace the numeric doc ids of a ColBERT ranking with the ids '
                           'stored in the corpus file.',
    # Widen the help output to 100 columns.
    formatter_class=lambda prog: argparse.HelpFormatter(prog, width=100),
)
parser.add_argument('--corpus', metavar='FILE', type=str, required=True, help='Corpus file in jsonl')
parser.add_argument('--input_ranking', metavar='FILE', type=str, required=True, help='Ranking file from ColBERT in tsv')
parser.add_argument('--output_ranking', metavar='FILE', type=str, required=True, help='Ranking file with robust doc ids in tsv')
args = parser.parse_args()
|
|
|
|
# Collect the 'id' field of every corpus record in file order, so that a
# record's position in this list can later be used to look up its id.
doc_ids = []
with jsonlines.open(args.corpus, 'r') as corpus_reader:
    for record in corpus_reader:
        doc_ids.append(record['id'])
|
|
# Load the header-less, tab-separated ranking: one (query, doc, rank) row per line.
df = pd.read_csv(
    args.input_ranking,
    sep='\t',
    header=None,
    names=['query_id', 'doc_id', 'rank'],
)


def _to_corpus_id(position):
    """Return the corpus id stored at index ``position`` of ``doc_ids``."""
    return doc_ids[int(position)]


# The ranking's doc_id column holds indices into ``doc_ids``; swap each one
# for the corresponding corpus id.
df['doc_id'] = df['doc_id'].apply(_to_corpus_id)
# Reciprocal rank: a smaller rank value yields a larger score.
df['score'] = 1 / df['rank']
|
|
# Order rows best-score-first so that drop_duplicates (which keeps the first
# occurrence) retains the highest-scoring row of each (query, doc) pair,
# then keep at most 1000 rows per query.
df = (
    df.sort_values(by='score', ascending=False)
    .drop_duplicates(subset=['query_id', 'doc_id'])
    .groupby('query_id')
    .head(1000)
)
# Renumber the surviving rows within each query; cumcount starts at 0.
df['rank'] = df.groupby('query_id').cumcount()
# Final layout: grouped by query, ascending by the new rank.
df = df.sort_values(['query_id', 'rank'])
|
|
# Write one tab-separated line per row:
#   query_id, the literal 'Q0', doc_id, rank, score, the literal 'ColBERT'.
#
# The columns are iterated directly instead of using DataFrame.iterrows():
# iterrows() builds one Series per row, which upcasts all values to a common
# dtype, so with numeric doc ids the integer query_id / doc_id / rank would be
# written as floats (e.g. '301.0'). Column iteration keeps each column's own
# type and avoids the per-row Series construction.
with open(args.output_ranking, 'w') as writer:
    columns = (df['query_id'], df['doc_id'], df['rank'], df['score'])
    writer.writelines(
        f'{query_id}\tQ0\t{doc_id}\t{rank}\t{score}\tColBERT\n'
        for query_id, doc_id, rank, score in zip(*columns)
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
| |
|
|
|
|
|
|
| |
| |
| |
| |
|
|