| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import argparse |
|
|
| import numpy as np |
| import pandas as pd |
|
|
|
|
def construct_negatives(input_file, output_file, num_passages, num_negatives):
    """Write random negative passage ids next to every (query, positive) pair.

    Reads a headerless, tab-separated qrels file, taking column 0 as the
    query id and column 2 as the relevant passage id. For every input row one
    tab-separated line is written to ``output_file``: the query id, the
    relevant passage id, then ``num_negatives`` passage ids drawn uniformly
    at random from ``[0, num_passages)``.

    A sampled id that equals the row's relevant passage id is redrawn, so the
    positive never appears among its own negatives. Negatives may still
    repeat within a row.

    Args:
        input_file: path to the qrels TSV file.
        output_file: path of the TSV file to write (overwritten if present).
        num_passages: exclusive upper bound of the sampled passage ids.
        num_negatives: number of negative ids emitted per input row.
    """
    qrels = pd.read_csv(input_file, delimiter="\t", header=None)
    with open(output_file, "w") as f:
        for query_id, rel_passage_id in zip(qrels[0].tolist(), qrels[2].tolist()):
            negatives = np.random.randint(num_passages, size=num_negatives).tolist()
            # A uniform draw can land on the positive itself, which would
            # label the relevant passage as a negative. Redraw only the
            # colliding entries; skip when there is no other id to pick,
            # otherwise the redraw loop could never terminate.
            if num_passages > 1:
                for pos, candidate in enumerate(negatives):
                    while candidate == rel_passage_id:
                        candidate = int(np.random.randint(num_passages))
                    negatives[pos] = candidate
            output_ids = [query_id, rel_passage_id, *negatives]
            print("\t".join(map(str, output_ids)), file=f)
|
|
|
|
def main():
    """Parse command-line options and build negatives for both data splits.

    For each of the ``train`` and ``dev`` splits, reads
    ``<data>/qrels.<split>.tsv`` and writes
    ``<data>/query2passages.<split>.tsv`` via ``construct_negatives``.
    """
    cli = argparse.ArgumentParser(description="Negative passages construction")
    cli.add_argument("--data", type=str, default="msmarco_dataset", help="path to folder with data")
    cli.add_argument("--num_passages", type=int, default=8841823, help="total number of passages")
    cli.add_argument("--num_negatives", type=int, default=10, help="number of negatives per positive")
    options = cli.parse_args()

    data_dir = options.data
    for split in ("train", "dev"):
        qrels_path = f"{data_dir}/qrels.{split}.tsv"
        negatives_path = f"{data_dir}/query2passages.{split}.tsv"
        construct_negatives(
            input_file=qrels_path,
            output_file=negatives_path,
            num_passages=options.num_passages,
            num_negatives=options.num_negatives,
        )
|
|
|
|
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|
|