| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
| from typing import Any, Dict, List |
|
|
| from common.paper_package import ( |
| PaperPackage, |
| _collect_bibliography, |
| _collect_citation_contexts, |
| _collect_full_processed_text, |
| _collect_sections, |
| _load_json, |
| _normalize_dict_payload, |
| ) |
|
|
|
|
def _collect_all_cluster_evidence(paper_dir: Path) -> List[Dict[str, Any]]:
    """Project every discovered usage cluster onto a fixed set of keys.

    Loads ``usage_discovery_from_contributions.json`` from *paper_dir*, passes
    the result through ``_normalize_dict_payload`` and reads its ``"clusters"``
    entry.

    NOTE(review): ``{}`` is handed to ``_load_json`` as its second argument,
    presumably as the fallback for a missing/unreadable file -- confirm against
    ``common.paper_package``.

    Args:
        paper_dir: Directory that is expected to hold the discovery JSON file.

    Returns:
        One dict per cluster, in the order they appear in the payload, with
        the keys ``cluster_id``, ``representative_claim``, ``cluster_title``,
        ``count``, ``cluster_key``, ``claim_indices``, ``source_cluster_ids``
        and ``merge_rationale``. Fields absent from a cluster come back as
        ``None``, except ``claim_indices`` and ``source_cluster_ids`` which
        default to ``[]``. A missing, null or empty ``"clusters"`` entry
        yields an empty list.
    """
    discovery = _normalize_dict_payload(
        _load_json(paper_dir / "usage_discovery_from_contributions.json", {})
    )
    # ``.get(key, [])`` only covers a *missing* key; ``or []`` also covers an
    # explicit JSON null, which would otherwise make the iteration below raise.
    clusters = discovery.get("clusters") or []
    return [
        {
            "cluster_id": cluster.get("cluster_id"),
            # Fall back to the title when the representative claim is
            # missing or empty.
            "representative_claim": cluster.get("representative_claim") or cluster.get("cluster_title"),
            "cluster_title": cluster.get("cluster_title"),
            "count": cluster.get("count"),
            "cluster_key": cluster.get("cluster_key"),
            "claim_indices": cluster.get("claim_indices", []),
            "source_cluster_ids": cluster.get("source_cluster_ids", []),
            "merge_rationale": cluster.get("merge_rationale"),
        }
        for cluster in clusters
    ]
|
|
|
|
def load_paper_package(paper_dir: str | Path, extracted_claim_override: str | None = None) -> PaperPackage:
    """Build a ``PaperPackage`` from the artifacts found under *paper_dir*.

    Args:
        paper_dir: Location of the paper's artifacts; coerced to ``Path``.
        extracted_claim_override: Value stored as the package's
            ``extracted_discovery_claim``. ``None`` or any other falsy value
            is stored as the empty string.

    Returns:
        A ``PaperPackage`` populated with the normalized
        ``paper_metadata.json`` payload, the cluster evidence, and the
        sections, full processed text, bibliography and citation contexts
        gathered by the ``_collect_*`` helpers.
    """
    root = Path(paper_dir)

    metadata = _normalize_dict_payload(_load_json(root / "paper_metadata.json", {}))
    evidence = _collect_all_cluster_evidence(root)

    if extracted_claim_override:
        claim = extracted_claim_override
    else:
        claim = ""

    # Gathered in the same order the constructor arguments are listed.
    sections = _collect_sections(root)
    processed_text = _collect_full_processed_text(root)
    references = _collect_bibliography(root)
    contexts = _collect_citation_contexts(root)

    return PaperPackage(
        paper_dir=root,
        paper_metadata=metadata,
        extracted_discovery_claim=claim,
        downstream_cluster_evidence=evidence,
        paper_text=sections,
        full_processed_text=processed_text,
        bibliography=references,
        citation_contexts=contexts,
    )
|
|