Spaces:
Sleeping
Sleeping
Commit ·
9339c96
1
Parent(s): 5d0d255
ADD: test log
Browse files
- exp_pipeline/pipeline.py +3 -0
exp_pipeline/pipeline.py
CHANGED
|
@@ -15,6 +15,7 @@ def run_pipeline(split: str = "train"):
|
|
| 15 |
data_loader = DataLoader()
|
| 16 |
raw_data = data_loader.get_passage_dataset(split)
|
| 17 |
logger.info(f"Loaded {len(raw_data)} samples from MS MARCO Passage Ranking [{split}]")
|
|
|
|
| 18 |
|
| 19 |
# 2. 预处理数据
|
| 20 |
preprocessor = Preprocessor()
|
|
@@ -22,6 +23,7 @@ def run_pipeline(split: str = "train"):
|
|
| 22 |
if hasattr(raw_data, "to_dict"):
|
| 23 |
raw_data = raw_data.to_dict()
|
| 24 |
raw_data = [dict(zip(raw_data.keys(), v)) for v in zip(*raw_data.values())]
|
|
|
|
| 25 |
|
| 26 |
# MS MARCO Passage v2.1: 用passages["passage_text"]字段
|
| 27 |
passages = []
|
|
@@ -30,6 +32,7 @@ def run_pipeline(split: str = "train"):
|
|
| 30 |
passages.extend(item["passages"]["passage_text"])
|
| 31 |
processed = preprocessor.preprocess_passages(passages)
|
| 32 |
texts = [p["text"] for p in processed]
|
|
|
|
| 33 |
|
| 34 |
logger.info(f"Processed {len(texts)} passages")
|
| 35 |
|
|
|
|
| 15 |
data_loader = DataLoader()
|
| 16 |
raw_data = data_loader.get_passage_dataset(split)
|
| 17 |
logger.info(f"Loaded {len(raw_data)} samples from MS MARCO Passage Ranking [{split}]")
|
| 18 |
+
print("data_loader\n")
|
| 19 |
|
| 20 |
# 2. 预处理数据
|
| 21 |
preprocessor = Preprocessor()
|
|
|
|
| 23 |
if hasattr(raw_data, "to_dict"):
|
| 24 |
raw_data = raw_data.to_dict()
|
| 25 |
raw_data = [dict(zip(raw_data.keys(), v)) for v in zip(*raw_data.values())]
|
| 26 |
+
print("raw_data\n")
|
| 27 |
|
| 28 |
# MS MARCO Passage v2.1: 用passages["passage_text"]字段
|
| 29 |
passages = []
|
|
|
|
| 32 |
passages.extend(item["passages"]["passage_text"])
|
| 33 |
processed = preprocessor.preprocess_passages(passages)
|
| 34 |
texts = [p["text"] for p in processed]
|
| 35 |
+
print("texts\n")
|
| 36 |
|
| 37 |
logger.info(f"Processed {len(texts)} passages")
|
| 38 |
|