# Helpers for loading article/summary pairs from a JSON Lines file.
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Iterator, List


@dataclass
class SummarizationExample:
    """A single (article, summary) training/evaluation pair."""

    # Source text to be summarized.
    article: str
    # Reference summary for ``article``.
    summary: str


def load_jsonl(path: str | Path) -> List[SummarizationExample]:
    """Load summarization examples from a JSON Lines file.

    Each non-blank line is parsed as JSON. The article is read from the
    ``"article"`` key, falling back to ``"text"``; the summary is read from
    ``"summary"``, falling back to ``"label"``. A record is kept only when
    both values are truthy; everything else is skipped silently.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The usable examples, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    items: List[SummarizationExample] = []
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
    # byte-order mark, which would otherwise make json.loads() reject the
    # first line of files written by BOM-emitting editors.
    with Path(path).open("r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            # Valid JSON that is not an object (list, string, number, ...)
            # has no keys to read; skip it like any other unusable record
            # instead of failing with AttributeError on ``.get``.
            if not isinstance(obj, dict):
                continue
            article = obj.get("article") or obj.get("text") or ""
            summary = obj.get("summary") or obj.get("label") or ""
            if article and summary:
                items.append(SummarizationExample(article=article, summary=summary))
    return items


def iter_pairs(
    examples: Iterable[SummarizationExample],
) -> Iterator[tuple[str, str]]:
    """Lazily yield ``(article, summary)`` tuples for each example."""
    for ex in examples:
        yield ex.article, ex.summary