from __future__ import annotations
import json
from pathlib import Path
from typing import Any, Dict, List
from common.paper_package import (
PaperPackage,
_collect_bibliography,
_collect_citation_contexts,
_collect_full_processed_text,
_collect_sections,
_load_json,
_normalize_dict_payload,
)
def _collect_all_cluster_evidence(paper_dir: Path) -> List[Dict[str, Any]]:
    """Load all usage-discovery clusters from *paper_dir* as a flat evidence list.

    Reads ``usage_discovery_from_contributions.json`` (a missing or invalid
    file yields an empty payload via ``_load_json``'s ``{}`` default) and
    projects each cluster dict down to the fields downstream consumers need.

    Args:
        paper_dir: Directory containing the paper's JSON artifacts.

    Returns:
        One dict per cluster. ``representative_claim`` falls back to
        ``cluster_title`` when the claim is absent or falsy.
    """
    discovery = _normalize_dict_payload(
        _load_json(paper_dir / "usage_discovery_from_contributions.json", {})
    )
    # Comprehension replaces the manual append loop; the projected fields are
    # unchanged from the original implementation.
    return [
        {
            "cluster_id": cluster.get("cluster_id"),
            # Fall back to the title when no explicit claim was recorded.
            "representative_claim": cluster.get("representative_claim") or cluster.get("cluster_title"),
            "cluster_title": cluster.get("cluster_title"),
            "count": cluster.get("count"),
            "cluster_key": cluster.get("cluster_key"),
            "claim_indices": cluster.get("claim_indices", []),
            "source_cluster_ids": cluster.get("source_cluster_ids", []),
            "merge_rationale": cluster.get("merge_rationale"),
        }
        for cluster in discovery.get("clusters", [])
    ]
def load_paper_package(paper_dir: str | Path, extracted_claim_override: str | None = None) -> PaperPackage:
    """Assemble a :class:`PaperPackage` from the JSON artifacts under *paper_dir*.

    Args:
        paper_dir: Directory holding the paper's processed artifacts; accepts
            a string or a ``Path``.
        extracted_claim_override: Optional seed claim for the package; an
            empty string is used when not provided.

    Returns:
        A fully populated ``PaperPackage``.
    """
    root = Path(paper_dir)
    metadata = _normalize_dict_payload(_load_json(root / "paper_metadata.json", {}))
    evidence = _collect_all_cluster_evidence(root)
    claim_seed = extracted_claim_override if extracted_claim_override else ""
    return PaperPackage(
        paper_dir=root,
        paper_metadata=metadata,
        extracted_discovery_claim=claim_seed,
        downstream_cluster_evidence=evidence,
        paper_text=_collect_sections(root),
        full_processed_text=_collect_full_processed_text(root),
        bibliography=_collect_bibliography(root),
        citation_contexts=_collect_citation_contexts(root),
    )