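"""Assemble per-paper prompt inputs from a processed paper directory.

Loads section text, the full processed LaTeX source, bibliography records,
citation contexts, and downstream-usage cluster evidence, preferring the
structured JSON artifacts and falling back to best-effort BibTeX parsing.
"""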
from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any, Dict, List

from pydantic import BaseModel


SECTION_FILES = [
    "abstract.txt",
    "introduction.tex",
    "related_work.tex",
    "tldr.txt",
]


class PaperPackage(BaseModel):
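    """Everything assembled for one paper, ready to be rendered into a prompt."""
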
    paper_dir: Path
    paper_metadata: Dict[str, Any]
    extracted_discovery_claim: str
    downstream_cluster_evidence: List[Dict[str, Any]]
    paper_text: Dict[str, str]
    full_processed_text: str
    bibliography: List[Dict[str, Any]]
    citation_contexts: List[Dict[str, Any]]

    def to_prompt_payload(self) -> Dict[str, Any]:
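        """Return the JSON-serialisable payload fields (``paper_dir`` is omitted)."""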
        return {
            "paper_metadata": self.paper_metadata,
            "extracted_discovery_claim": self.extracted_discovery_claim,
            "downstream_cluster_evidence": self.downstream_cluster_evidence,
            "paper_text": self.paper_text,
            "full_processed_text": self.full_processed_text,
            "bibliography": self.bibliography,
            "citation_contexts": self.citation_contexts,
        }


def _load_json(path: Path, default: Any) -> Any:
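    """Parse JSON from ``path``, returning ``default`` on any read or parse error."""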
    try:
        return json.loads(path.read_text())
    except Exception:
        return default


def _read_text(path: Path) -> str:
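    """Read ``path`` as text, returning an empty string if it is missing or unreadable."""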
    try:
        return path.read_text()
    except Exception:
        return ""


def _normalize_dict_payload(value: Any) -> Dict[str, Any]:
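    """Coerce a payload to a dict: dicts pass through, lists yield their first dict, anything else becomes {}."""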
    if isinstance(value, dict):
        return value
    if isinstance(value, list):
        for item in value:
            if isinstance(item, dict):
                return item
    return {}


def _collect_sections(paper_dir: Path) -> Dict[str, str]:
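    """Collect the key section files (truncated to 12,000 chars each), falling
    back to a 24,000-char slice of processed_main.tex when no sections exist."""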
    sections_dir = paper_dir / "sections"
    out: Dict[str, str] = {}
    for name in SECTION_FILES:
        text = _read_text(sections_dir / name).strip()
        if text:
            out[name] = text[:12000]
    if not out:
        processed = _read_text(paper_dir / "processed_main.tex").strip()
        if processed:
            out["processed_main.tex"] = processed[:24000]
    return out


def _collect_full_processed_text(paper_dir: Path) -> str:
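    """Return the full processed_main.tex, else stitch together every file under
    sections/, each part tagged with its filename."""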
    processed = _read_text(paper_dir / "processed_main.tex").strip()
    if processed:
        return processed

    sections_dir = paper_dir / "sections"
    parts: List[str] = []
    if sections_dir.exists():
        for path in sorted(sections_dir.iterdir()):
            if not path.is_file():
                continue
            text = _read_text(path).strip()
            if text:
                parts.append(f"[{path.name}]\n{text}")
    return "\n\n".join(parts)


def _extract_year(value: Any) -> Any:
    """Normalise falsy year values (``""``, ``0``, ``None``) to ``None``."""
    return value or None


def _normalise_reference_record(ref: Dict[str, Any]) -> Dict[str, Any]:
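    """Flatten a reference record (optionally nested under ``citedPaper``) into
    a uniform {ref_id, title, authors, year, external_ids} shape."""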
    cited = ref.get("citedPaper")
    source = cited if isinstance(cited, dict) else ref
    external_ids = source.get("external_ids") or source.get("externalIds") or {}
    if not isinstance(external_ids, dict):
        external_ids = {}
    return {
        "ref_id": (
            ref.get("ref_id")
            or ref.get("bib_key")
            or source.get("ref_id")
            or source.get("bib_key")
            or source.get("paperId")
            or source.get("paper_id")
            or external_ids.get("ACL")
            or external_ids.get("ArXiv")
            or external_ids.get("DOI")
        ),
        "title": source.get("title") or source.get("ref_title"),
        "authors": source.get("authors") or source.get("ref_authors"),
        "year": _extract_year(source.get("year") or source.get("ref_year")),
        "external_ids": external_ids,
    }


def _parse_bibtex_entries(text: str, limit: int) -> List[Dict[str, Any]]:
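    """Best-effort regex parse of BibTeX entries; handles one level of brace
    nesting in field values and stops after ``limit`` entries."""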
    entries: List[Dict[str, Any]] = []
    for match in re.finditer(r"@\w+\s*\{\s*([^,]+),(.*?)(?=\n@\w+\s*\{|\Z)", text, re.S):
        key = match.group(1).strip()
        body = match.group(2)
        fields: Dict[str, str] = {}
        for field in ("title", "author", "year", "doi", "url", "eprint"):
            field_match = re.search(
                rf"\b{field}\s*=\s*(\{{(?:[^{{}}]|\{{[^{{}}]*\}})*\}}|\"[^\"]*\"|[^,\n]+)",
                body,
                re.I | re.S,
            )
            if field_match:
                value = field_match.group(1).strip().strip(",")
                if (value.startswith("{") and value.endswith("}")) or (
                    value.startswith('"') and value.endswith('"')
                ):
                    value = value[1:-1]
                fields[field] = re.sub(r"\s+", " ", value).strip()
        if fields:
            external_ids: Dict[str, Any] = {}
            if fields.get("doi"):
                external_ids["DOI"] = fields["doi"]
            if fields.get("eprint"):
                external_ids["ArXiv"] = fields["eprint"]
            entries.append(
                {
                    "ref_id": key,
                    "title": fields.get("title"),
                    "authors": fields.get("author"),
                    "year": fields.get("year"),
                    "external_ids": external_ids,
                }
            )
        if len(entries) >= limit:
            break
    return entries


def _collect_bibtex_citation_contexts(paper_dir: Path, limit: int = 60) -> List[Dict[str, Any]]:
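    """Fallback context extraction: locate \\cite commands for each BibTeX key
    in processed_main.tex and return up to 350 chars of surrounding text on
    each side of every occurrence."""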
    bibtex = _read_text(paper_dir / "references.bib")
    processed = _read_text(paper_dir / "processed_main.tex")
    if not bibtex or not processed:
        return []

    refs = _parse_bibtex_entries(bibtex, limit=500)
    out: List[Dict[str, Any]] = []
    seen: set[tuple[str, int]] = set()
    for ref in refs:
        ref_id = ref.get("ref_id")
        if not ref_id:
            continue
        # Match \cite-style commands (\cite, \citep, \citet, ...) whose brace
        # group contains this key, allowing optional [...] arguments.
        cite_pattern = (
            rf"\\cite\w*\s*(?:\[[^\]]*\]\s*)*"
            rf"\{{[^}}]*\b{re.escape(str(ref_id))}\b[^}}]*\}}"
        )
        for match in re.finditer(cite_pattern, processed):
            key = (str(ref_id), match.start())
            if key in seen:
                continue
            seen.add(key)
            start = max(0, match.start() - 350)
            end = min(len(processed), match.end() + 350)
            snippet = re.sub(r"\s+", " ", processed[start:end]).strip()
            out.append(
                {
                    "ref_id": ref_id,
                    "citation_marker": ref.get("title") or ref_id,
                    "text": snippet,
                    "section": None,
                    "intents": [],
                }
            )
            if len(out) >= limit:
                return out
    return out


def _collect_bibliography(paper_dir: Path, limit: int = 80) -> List[Dict[str, Any]]:
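    """Prefer structured records from references_metadata.json; fall back to parsing references.bib."""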
    refs = _load_json(paper_dir / "references_metadata.json", [])
    if isinstance(refs, list) and refs:
        return [_normalise_reference_record(ref) for ref in refs[:limit] if isinstance(ref, dict)]

    bibtex = _read_text(paper_dir / "references.bib")
    if bibtex:
        return _parse_bibtex_entries(bibtex, limit)
    return []


def _collect_citation_contexts(paper_dir: Path, limit: int = 60) -> List[Dict[str, Any]]:
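    """Gather citation contexts: ``contextsWithIntent`` records first, then
    usage_contexts.json entries, with a raw-BibTeX scan as a last resort."""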
    refs = _load_json(paper_dir / "references_metadata.json", [])
    out: List[Dict[str, Any]] = []
    if isinstance(refs, list):
        for ref in refs:
            if not isinstance(ref, dict):
                continue
            ref_record = _normalise_reference_record(ref)
            for context in ref.get("contextsWithIntent") or []:
                if not isinstance(context, dict):
                    continue
                text = context.get("context") or context.get("text") or ""
                if not text:
                    continue
                out.append(
                    {
                        "ref_id": ref_record.get("ref_id"),
                        "citation_marker": ref_record.get("title"),
                        "text": text,
                        "section": context.get("section"),
                        "intents": context.get("intents", []),
                    }
                )
                if len(out) >= limit:
                    return out
    contexts = _load_json(paper_dir / "usage_contexts.json", [])
    if isinstance(contexts, list):
        for item in contexts:
            if not isinstance(item, dict):
                continue
            entry = {
                "ref_id": item.get("ref_id") or item.get("bib_key"),
                "citation_marker": item.get("citation_marker"),
                "text": item.get("text") or item.get("text_raw") or "",
                "section": item.get("section"),
                # Keep the record shape consistent with the other branches.
                "intents": item.get("intents", []),
            }
            if entry["text"]:
                out.append(entry)
            if len(out) >= limit:
                break
    if not out:
        out = _collect_bibtex_citation_contexts(paper_dir, limit=limit)
    return out


def _collect_downstream_cluster_evidence(paper_dir: Path) -> List[Dict[str, Any]]:
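    """Summarise downstream-usage clusters from usage_discovery_from_contributions.json."""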
    discovery = _normalize_dict_payload(_load_json(paper_dir / "usage_discovery_from_contributions.json", {}))
    clusters = discovery.get("clusters", [])
    out: List[Dict[str, Any]] = []
    for cluster in clusters:
        if not isinstance(cluster, dict):
            continue
        out.append(
            {
                "cluster_id": cluster.get("cluster_id"),
                "representative_claim": cluster.get("representative_claim") or cluster.get("cluster_title"),
                "cluster_title": cluster.get("cluster_title"),
                "count": cluster.get("count"),
                "merge_rationale": cluster.get("merge_rationale"),
            }
        )
    return out


def load_paper_package(paper_dir: str | Path, extracted_claim_override: str | None = None) -> PaperPackage:
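    """Load all artifacts under ``paper_dir`` into a PaperPackage;
    ``extracted_claim_override`` takes precedence over the discovery JSON's claim fields."""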
    paper_dir = Path(paper_dir)
    discovery = _normalize_dict_payload(_load_json(paper_dir / "usage_discovery_from_contributions.json", {}))
    paper_metadata = _normalize_dict_payload(_load_json(paper_dir / "paper_metadata.json", {}))
    claim = extracted_claim_override or (
        discovery.get("most_impactful_contribution_self_contained")
        or discovery.get("most_impactful_contribution")
        or ""
    )
    return PaperPackage(
        paper_dir=paper_dir,
        paper_metadata=paper_metadata,
        extracted_discovery_claim=claim,
        downstream_cluster_evidence=_collect_downstream_cluster_evidence(paper_dir),
        paper_text=_collect_sections(paper_dir),
        full_processed_text=_collect_full_processed_text(paper_dir),
        bibliography=_collect_bibliography(paper_dir),
        citation_contexts=_collect_citation_contexts(paper_dir),
    )
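

if __name__ == "__main__":
    # Minimal usage sketch; "papers/example-paper" is a hypothetical directory
    # laid out the way this module expects (processed_main.tex, sections/,
    # references_metadata.json or references.bib, usage_*.json).
    package = load_paper_package("papers/example-paper")
    payload = package.to_prompt_payload()
    print(payload["extracted_discovery_claim"])
    print(len(payload["bibliography"]), "references;",
          len(payload["citation_contexts"]), "citation contexts")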