Spaces:
Running
Running
| from __future__ import annotations | |
| from datasets import load_dataset | |
| from dataset_config import DatasetConfig | |
def build_corpus(size: int, ds_cfg: DatasetConfig | None = None) -> list[str]:
    """Build a corpus of exactly ``size`` sentences from the configured dataset.

    The query column and the passage column are concatenated (queries first)
    and that sequence is repeated as often as needed to reach ``size`` entries;
    the result is then truncated to ``size``.

    Args:
        size: Number of entries to return. Values ``<= 0`` yield an empty list.
        ds_cfg: Dataset configuration. When omitted, a default
            ``DatasetConfig()`` is created. If ``ds_cfg.data`` is not ``None``
            it is used directly as the column mapping; otherwise the dataset
            is fetched with ``load_dataset(ds_cfg.name, ds_cfg.config,
            split=ds_cfg.split)``.

    Returns:
        A new list of length ``size`` (empty when ``size <= 0``).

    Raises:
        ValueError: If ``size > 0`` but the query and passage columns contain
            no entries, so the corpus can never be filled.
    """
    if ds_cfg is None:
        ds_cfg = DatasetConfig()

    if ds_cfg.data is not None:
        data = ds_cfg.data
    else:
        dataset = load_dataset(ds_cfg.name, ds_cfg.config, split=ds_cfg.split)
        data = {col: list(dataset[col]) for col in dataset.column_names}

    sentences = list(data[ds_cfg.query_col]) + list(data[ds_cfg.passage_col])

    if size <= 0:
        return []

    # Repeating an empty sequence can never reach ``size``; fail loudly
    # instead of spinning forever.
    if not sentences:
        raise ValueError(
            "cannot build a corpus of size "
            f"{size}: columns {ds_cfg.query_col!r} and "
            f"{ds_cfg.passage_col!r} contain no sentences"
        )

    # Ceiling division: the smallest number of whole repetitions covering ``size``.
    repeats = -(-size // len(sentences))
    return (sentences * repeats)[:size]