import argparse
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
# Marker file: a subdirectory of the root is treated as a paper dir only if it
# contains a file with this name.
PAPER_META_FILE = "paper_metadata.json"
# Per-paper input file; expected to hold a JSON list of citation records.
CITATIONS_FILE = "citations_metadata.json"
# Default name of the per-paper output file (overridable via --out-name).
DEFAULT_OUT_NAME = "usage_contexts.json"
def load_json(path: Path) -> Optional[Any]:
    """Read and decode a UTF-8 JSON file, returning ``None`` on failure.

    Args:
        path: Location of the JSON document.

    Returns:
        The decoded JSON value, or ``None`` when ``path`` does not exist or
        the file cannot be read or parsed. Note that a document whose content
        is the JSON literal ``null`` also yields ``None``.

    A missing file is skipped silently; read and parse failures print a
    ``[WARN]`` line to stdout so a batch run can continue.
    """
    if not path.exists():
        return None
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    # OSError: the path exists but cannot be read (directory, permissions...).
    # ValueError: base class of json.JSONDecodeError and UnicodeDecodeError.
    # RecursionError: raised by the decoder on pathologically deep nesting.
    except (OSError, ValueError, RecursionError) as e:
        print(f"[WARN] could not parse JSON at {path}: {e}")
        return None
def iter_paper_dirs(root: Path) -> List[Path]:
    """Return the immediate subdirectories of ``root`` that look like paper dirs.

    A child qualifies when it is a directory and contains a file named
    ``PAPER_META_FILE``. Results follow the order produced by
    ``Path.iterdir``; no sorting is applied here.
    """
    return [
        entry
        for entry in root.iterdir()
        if entry.is_dir() and (entry / PAPER_META_FILE).exists()
    ]
def _as_clean_text(value: Any) -> str:
    """Return ``value`` stripped of surrounding whitespace, or ``""`` if it is not a string."""
    return value.strip() if isinstance(value, str) else ""


def _extract_contexts(item: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Normalise the citation contexts attached to one citation record.

    Two input layouts are handled:

    * ``item["contextsWithIntent"]``: a list of dicts carrying ``context``,
      optionally ``context_with_marker`` and ``intents``.
    * ``item["contexts"]``: a list of plain strings, used only when the first
      layout produced nothing.

    Args:
        item: A single citation record.

    Returns:
        A list of dicts, each with the keys ``text`` (the marker-annotated
        context when available, otherwise the raw context), ``text_raw`` and
        ``intents``. Entries with no usable text are dropped, so an empty list
        means the record has no contexts.
    """
    contexts: List[Dict[str, Any]] = []

    annotated = item.get("contextsWithIntent")
    if isinstance(annotated, list):
        for entry in annotated:
            if not isinstance(entry, dict):
                continue
            text_raw = _as_clean_text(entry.get("context"))
            # Prefer the marker-annotated variant, but fall back to the raw
            # text when the marker field is missing, blank or not a string.
            text = _as_clean_text(entry.get("context_with_marker")) or text_raw
            if not text:
                # A context without any text would only inflate the counts
                # computed from this list (and block the legacy fallback).
                continue
            contexts.append(
                {
                    "text": text,
                    "text_raw": text_raw,
                    "intents": entry.get("intents") or [],
                }
            )

    # Fallback for older schema that only stores raw context strings.
    if not contexts:
        legacy = item.get("contexts")
        if isinstance(legacy, list):
            for candidate in legacy:
                text = _as_clean_text(candidate)
                if text:
                    contexts.append(
                        {
                            "text": text,
                            # Same keys as the primary path so consumers can
                            # rely on one record shape.
                            "text_raw": text,
                            "intents": [],
                        }
                    )
    return contexts
def build_usage_contexts_for_paper(paper_dir: Path) -> Optional[Dict[str, Any]]:
    """Summarise the citation contexts recorded for one paper directory.

    Loads ``CITATIONS_FILE`` from ``paper_dir`` and converts every dict entry
    into a compact record (citing paper id, title, external ids, influential
    flag, extracted contexts). Non-dict entries are ignored when building
    records but still count towards ``total_citations``.

    Returns:
        The summary payload, or ``None`` when the citations file is missing,
        unparsable, or does not contain a JSON list (the latter prints a
        ``[WARN]`` line).
    """
    records = load_json(paper_dir / CITATIONS_FILE)
    if records is None:
        return None
    if not isinstance(records, list):
        print(f"[WARN] {paper_dir.name}: {CITATIONS_FILE} is not a list")
        return None

    # First pass: one normalised row per well-formed citation record.
    per_citation: List[Dict[str, Any]] = []
    for record in records:
        if not isinstance(record, dict):
            continue
        source = record.get("citingPaper") or {}
        snippets = _extract_contexts(record)
        flagged = bool(record.get("isInfluential", False))
        per_citation.append(
            {
                "citing_paper_id": source.get("paperId"),
                "title": source.get("title"),
                "external_ids": source.get("externalIds") or {},
                "is_influential": flagged,
                "contexts": snippets,
            }
        )

    # Second pass: derive the aggregate views from the rows.
    with_snippets = [row for row in per_citation if row["contexts"]]
    flagged_with_snippets = [row for row in with_snippets if row["is_influential"]]
    # The influential listing carries the same fields minus the flag itself.
    highlights = [
        {key: value for key, value in row.items() if key != "is_influential"}
        for row in flagged_with_snippets
    ]

    return {
        "paper_id": paper_dir.name,
        "total_citations": len(records),
        "num_contexts": sum(len(row["contexts"]) for row in with_snippets),
        "num_citing_with_context": len(with_snippets),
        "num_citing_without_context": len(records) - len(with_snippets),
        "num_influential_citations": sum(
            1 for row in per_citation if row["is_influential"]
        ),
        "num_influential_with_context": len(flagged_with_snippets),
        "influential_contexts": highlights,
        "citing_papers": per_citation,
    }
def run(root: Path, out_name: str, overwrite: bool) -> None:
    """Write a usage-context summary into every paper directory under ``root``.

    Args:
        root: Directory whose immediate subdirectories are paper dirs.
        out_name: File name to write inside each paper dir.
        overwrite: When false, paper dirs that already contain ``out_name``
            are skipped.

    Raises:
        SystemExit: If ``root`` does not exist or is not a directory.

    Paper dirs are processed in name order. A dir for which no payload can be
    built is skipped without writing anything.
    """
    root = root.resolve()
    if not root.exists():
        raise SystemExit(f"Root directory does not exist: {root}")
    if not root.is_dir():
        # Without this guard a regular file passes the existence check and the
        # directory listing below fails with a raw traceback.
        raise SystemExit(f"Root path is not a directory: {root}")

    paper_dirs = sorted(iter_paper_dirs(root), key=lambda p: p.name)
    print(f"[INFO] Found {len(paper_dirs)} paper dirs under {root}")

    for paper_dir in paper_dirs:
        out_path = paper_dir / out_name
        if out_path.exists() and not overwrite:
            print(f"[SKIP] {paper_dir.name}: {out_name} already exists")
            continue

        payload = build_usage_contexts_for_paper(paper_dir)
        if payload is None:
            continue

        out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        print(
            f"[OK] {paper_dir.name}: wrote {out_name} "
            f"({payload['num_contexts']} contexts from {payload['total_citations']} citations)"
        )
def main() -> None:
    """Parse the command-line options and hand them to ``run``.

    Options: ``--root`` (directory of paper dirs), ``--out-name`` (output
    file name, defaulting to ``DEFAULT_OUT_NAME``) and ``--overwrite``
    (replace existing output files).
    """
    cli = argparse.ArgumentParser(
        description="Build usage_contexts.json from citations_metadata.json files."
    )

    # Declarative option table: (flag, keyword arguments for add_argument).
    option_table = (
        (
            "--root",
            {
                "type": str,
                "default": "processed_papers/acl_2024",
                "help": "Root directory containing processed_papers/acl_2024/<paper_id> dirs.",
            },
        ),
        (
            "--out-name",
            {
                "type": str,
                "default": DEFAULT_OUT_NAME,
                "help": "Output filename to write inside each paper dir.",
            },
        ),
        (
            "--overwrite",
            {
                "action": "store_true",
                "help": "Overwrite existing usage_contexts.json files.",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()
    run(Path(options.root), out_name=options.out_name, overwrite=options.overwrite)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|