Eric Chamoun committed on
Commit 0a55f0f · 0 Parent(s):

Initial SciPaths Space release
Files changed (43)
  1. .dockerignore +7 -0
  2. .gitattributes +1 -0
  3. .gitignore +7 -0
  4. Deep-Citation/Data/acl.tsv +0 -0
  5. Deep-Citation/Data/class_def.json +23 -0
  6. Deep-Citation/Model/__init__.py +1 -0
  7. Deep-Citation/Model/model.py +89 -0
  8. Deep-Citation/Workspace/acl_scicite_wksp_trl/args.txt +21 -0
  9. Deep-Citation/Workspace/acl_scicite_wksp_trl/best_model.pt +3 -0
  10. Deep-Citation/data.py +211 -0
  11. Dockerfile +24 -0
  12. README.md +232 -0
  13. app.py +5 -0
  14. hf_space/requirements.txt +17 -0
  15. hf_space/runner.py +333 -0
  16. hf_space/streamlit_app.py +864 -0
  17. hf_space/streamlit_config.py +30 -0
  18. requirements.txt +1 -0
  19. src/common/__init__.py +0 -0
  20. src/common/llm_client.py +49 -0
  21. src/common/model_client.py +143 -0
  22. src/common/paper_package.py +288 -0
  23. src/step_01_fetch/config.py +6 -0
  24. src/step_01_fetch/fetch_metadata.py +440 -0
  25. src/step_01_fetch/process_tex_source.py +203 -0
  26. src/step_01_fetch/semanticscholar_client.py +158 -0
  27. src/step_02_mark_citations/replace_citation_markers.py +440 -0
  28. src/step_03_usage_contexts/build_usage_contexts.py +184 -0
  29. src/step_04_label_citations/label_citation_functions.py +373 -0
  30. src/step_05_verify_uses_extends/prompts.py +115 -0
  31. src/step_05_verify_uses_extends/schemas.py +22 -0
  32. src/step_05_verify_uses_extends/verify_uses_extends.py +296 -0
  33. src/step_06_extract_paragraphs/extract_arxiv_paragraphs.py +488 -0
  34. src/step_07_extract_and_refine/extract_contributions_from_citations.py +329 -0
  35. src/step_07_extract_and_refine/prompts.py +65 -0
  36. src/step_07_extract_and_refine/refine_and_filter_clusters_llm.py +402 -0
  37. src/step_07_extract_and_refine/schemas.py +12 -0
  38. src/step_08_annotation/__init__.py +3 -0
  39. src/step_08_annotation/cli.py +99 -0
  40. src/step_08_annotation/final_prompts.py +0 -0
  41. src/step_08_annotation/paper_package.py +52 -0
  42. src/step_08_annotation/pipeline.py +256 -0
  43. src/step_08_annotation/schemas.py +127 -0
.dockerignore ADDED
@@ -0,0 +1,7 @@
+ .git
+ __pycache__
+ *.pyc
+ hf_space/runs
+ **/__pycache__
+ **/*.pyc
+ *.zip
.gitattributes ADDED
@@ -0,0 +1 @@
+ Deep-Citation/Workspace/acl_scicite_wksp_trl/best_model.pt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
+ __pycache__/
+ *.py[cod]
+ .DS_Store
+ .streamlit/secrets.toml
+ hf_space/runs/
+ runs/
+ *.zip
Deep-Citation/Data/acl.tsv ADDED
The diff for this file is too large to render. See raw diff
 
Deep-Citation/Data/class_def.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "acl":
+     {
+         "BACKGROUND": "The citation provides relevant information for the domain that the present paper discusses.",
+         "MOTIVATION": "The citation illustrates the need for data, goals, methods, etc that is proposed in the present paper.",
+         "USES": "The present paper uses data, methods, etc., from the paper associated with the citation.",
+         "EXTENDS": "The present paper extends the data, methods, etc. from the paper associated with the citation.",
+         "COMPAREORCONTRAST": "The present paper expresses similarity / differences to the citation.",
+         "FUTURE": "The citation is a potential avenue for future work of the present paper."
+     },
+     "kim":
+     {
+         "Used": "The present paper uses at least one method that is proposed in the paper associated with the citation.",
+         "Not used": "The present paper does not use or extend any methods that is proposed in the paper associated with the citation.",
+         "Extended": "The present paper uses an extended / modified version of the method proposed in the paper associated with the citation."
+     },
+     "scicite":
+     {
+         "Background": "The citation states, mentions, or points to the background information giving more context about a problem, concept, approach, topic, or importance of the problem that is discussed in the present paper.",
+         "Method": "The present paper uses a method, tool, approach or dataset that is proposed in the paper associated with the citation.",
+         "Result": "The present paper compares its results/findings with the results/findings of the paper associated with the citation."
+     }
+ }
Deep-Citation/Model/__init__.py ADDED
@@ -0,0 +1 @@
+ from .model import LanguageModel, MultiHeadLanguageModel
Deep-Citation/Model/model.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import torch
+ import torch.nn as nn
+
+ from typing import List
+ from transformers import AutoModel
+
+ def mask_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+ class LanguageModel(nn.Module):
+     def __init__(self,
+                  modelname: str,
+                  device: str,
+                  readout: str
+                  ):
+         super(LanguageModel, self).__init__()
+         self.device = device
+         self.modelname = modelname
+         self.readout_fn = readout
+
+         self.model = AutoModel.from_pretrained(modelname)
+         self.hidden_size = self.model.config.hidden_size
+
+     def readout(self, model_inputs, model_outputs, readout_masks=None):
+         if self.readout_fn == 'cls':
+             if 'bert' in self.modelname or 'deberta' in self.modelname:
+                 text_representations = model_outputs.last_hidden_state[:, 0]
+             elif 'xlnet' in self.modelname:
+                 text_representations = model_outputs.last_hidden_state[:, -1]
+             else:
+                 raise ValueError('Invalid model name {} for the cls readout.'.format(self.modelname))
+         elif self.readout_fn == 'mean':
+             text_representations = mask_pooling(model_outputs, model_inputs['attention_mask'])
+         elif self.readout_fn == 'ch' and readout_masks is not None:
+             text_representations = mask_pooling(model_outputs, readout_masks)
+         else:
+             raise ValueError('Invalid readout function.')
+         return text_representations
+
+     def _lm_forward(self, tokens):
+         tokens = tokens.to(self.device)
+         if 'readout_mask' in tokens:
+             readout_mask = tokens.pop('readout_mask')
+         else:
+             readout_mask = None
+         outputs = self.model(**tokens)
+         return self.readout(tokens, outputs, readout_mask)
+
+     def forward(self):
+         raise NotImplementedError
+
+     def save_pretrained(self, modeldir):
+         model_filename = os.path.join(modeldir, 'checkpoint.pt')
+         torch.save(self.state_dict(), model_filename)
+
+     def load_pretrained(self, modeldir):
+         model_filename = os.path.join(modeldir, 'checkpoint.pt')
+         self.load_state_dict(torch.load(model_filename))
+
+ class MultiHeadLanguageModel(LanguageModel):
+     def __init__(self,
+                  modelname: str,
+                  device: str,
+                  readout: str,
+                  num_classes: List
+                  ):
+         super().__init__(
+             modelname,
+             device,
+             readout
+         )
+
+         self.num_classes = num_classes
+         self.lns = nn.ModuleList([nn.Linear(self.hidden_size, num_class) for num_class in num_classes])
+
+     def forward(self, input_tokens, input_head_indices, class_tokens, class_head_indices):
+         head_indices = torch.unique(input_head_indices)
+         text_representations = self._lm_forward(input_tokens)
+
+         final_preds = {}
+         for i in head_indices:
+             if torch.any(input_head_indices == i):
+                 final_preds[i.item()] = self.lns[i.item()](text_representations[input_head_indices == i])
+             else:
+                 final_preds[i.item()] = torch.tensor([]).to(self.device)
+         return final_preds
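A minimal usage sketch for the multi-head classifier above. It is illustrative only: the SciBERT checkpoint name, the two head sizes, and the `cls` readout are assumptions (the bundled checkpoint in `args.txt` was trained with `readout='ch'`), and it assumes the working directory is `Deep-Citation/`.

```python
import torch
from transformers import AutoTokenizer

from Model.model import MultiHeadLanguageModel  # assumes cwd is Deep-Citation/

# Assumed setup: SciBERT backbone with a 6-way head (ACL) and a 3-way head (SciCite).
device = "cpu"
model = MultiHeadLanguageModel(
    modelname="allenai/scibert_scivocab_uncased",
    device=device,
    readout="cls",  # simplest readout for a demo; the shipped model uses 'ch'
    num_classes=[6, 3],
).to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
tokens = tokenizer(
    ["We adopt the evaluation protocol of <CITED HERE> for all experiments."],
    return_tensors="pt", truncation=True, max_length=512, padding=True,
)
head_indices = torch.zeros(1, dtype=torch.long)  # route this one example to head 0

with torch.no_grad():
    preds = model(tokens, head_indices, class_tokens=None, class_head_indices=None)

print(preds[0].shape)  # -> torch.Size([1, 6]): logits from the first head
```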
Deep-Citation/Workspace/acl_scicite_wksp_trl/args.txt ADDED
@@ -0,0 +1,21 @@
+ Namespace(dataset='acl-scicite',
+           lambdas='1-0.063',
+           data_dir='Data',
+           workspace='Workspace/acl_scicite_wksp_trl',
+           class_definition='Data/class_def.json',
+           batch_size=32,
+           lr=5e-05,
+           decay_rate=0.5,
+           decay_step=5,
+           num_epochs=10,
+           scheduler='slanted',
+           dropout_rate=0.2,
+           l2=0.0,
+           device='cuda',
+           tol=10,
+           inference_only=False,
+           seed=1,
+           lm='scibert',
+           max_length=512,
+           batch_size_factor=2,
+           readout='ch')
Deep-Citation/Workspace/acl_scicite_wksp_trl/best_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e45ab11942439f80a121dad5b2d9da392470e0cedf6a7335991fa0a1f616dcb2
+ size 439784777
Deep-Citation/data.py ADDED
@@ -0,0 +1,211 @@
+ import os
+ import json
+ import copy
+ import torch
+ import scipy
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+ from scipy.special import softmax
+
+ from transformers import AutoTokenizer
+
+ class CollateFn(object):
+     def __init__(self, modelname, class_definitions=None, instance_weights=False):
+         self.instance_weights = instance_weights
+         use_fast = False if 'deberta' in modelname else True
+         self.tokenizer = AutoTokenizer.from_pretrained(modelname, use_fast=use_fast)
+         cited_ids = self.tokenizer.encode('<CITED HERE>', add_special_tokens=False)
+         self.cited_here_tokens = torch.tensor(cited_ids, dtype=torch.long)
+
+         if class_definitions is not None:
+             self.class_definitions = []
+             self.class_head_indices = []
+             for i, defs in enumerate(class_definitions):
+                 self.class_definitions += defs
+                 self.class_head_indices.append(i * torch.ones(len(defs), dtype=torch.long))
+             self.class_head_indices = torch.cat(self.class_head_indices, dim=0)
+             self.class_tokens = self.tokenizer(
+                 self.class_definitions,
+                 return_tensors="pt",
+                 max_length=512,
+                 truncation=True,
+                 padding=True
+             )
+
+     def _get_readout_mask(self, tokens):
+         # cited_here_tokens = torch.tensor([962, 8412, 1530, 1374])
+         readout_mask = torch.zeros_like(tokens['input_ids'], dtype=torch.bool)
+
+         batch_size = tokens['input_ids'].size(0)
+         l = tokens['input_ids'].size(1)
+         ctk_l = self.cited_here_tokens.size(0)
+         for b in range(batch_size):
+             for i in range(1, l - ctk_l):
+                 if torch.equal(tokens['input_ids'][b, i:i+ctk_l], self.cited_here_tokens):
+                     readout_mask[b, i:i+ctk_l] = True
+             if not readout_mask[b].any():
+                 # Fallback to CLS if the citation marker isn't matched.
+                 readout_mask[b, 0] = True
+         return readout_mask
+
+     def _tokenize_context(self, context):
+         tokens = self.tokenizer(
+             context,
+             return_tensors="pt",
+             max_length=512,
+             truncation=True,
+             padding=True
+         )
+         tokens['readout_mask'] = self._get_readout_mask(
+             tokens
+         )
+
+         return tokens
+
+     def __call__(self, samples):
+         if self.instance_weights:
+             text, labels, ds_indices, instance_weights = list(map(list, zip(*samples)))
+             batched_text = self._tokenize_context(text)
+             labels = torch.stack(labels)
+             ds_indices = torch.stack(ds_indices)
+             instance_weights = torch.stack(instance_weights)
+             return batched_text, labels, ds_indices, instance_weights
+         else:
+             text, labels, ds_indices = list(map(list, zip(*samples)))
+             batched_text = self._tokenize_context(text)
+             labels = torch.stack(labels)
+             ds_indices = torch.stack(ds_indices)
+
+             return batched_text, labels, ds_indices, copy.deepcopy(self.class_tokens), self.class_head_indices
+
+ class Dataset(object):
+     def __init__(self, dataframe, class_definitions, lmbd=1.0):
+         self.class_definitions = class_definitions
+         self.lmbd = lmbd
+         self._load_data(dataframe)
+
+     def __len__(self):
+         return len(self.labels)
+
+     def __getitem__(self, idx):
+         '''Get datapoint with index'''
+         return (self.text[idx], self.labels[idx], self.ds_index[idx])
+
+     def _load_data(self, annotated_data):
+         self.labels = torch.LongTensor(annotated_data['label'].tolist())
+         self.original_labels = torch.LongTensor(annotated_data['label'].tolist())
+         self.ds_index = torch.zeros_like(self.original_labels)
+         self.text = annotated_data['context'].tolist()
+
+ class MultiHeadDatasets(object):
+     def __init__(self, datasets, batch_size_factor=2):
+         self.text = []
+         self.ds_index = []
+         self.labels = []
+         self.class_definitions = []
+         self.lambdas = []
+
+         self.dataset_sizes = [len(d.labels) for d in datasets]
+         if len(self.dataset_sizes) > 1:
+             if sum(self.dataset_sizes) / self.dataset_sizes[0] <= batch_size_factor:
+                 self.sample_auxiliary = False
+                 self.adjusted_batch_size_factor = sum(self.dataset_sizes) / self.dataset_sizes[0]
+             else:
+                 self.sample_auxiliary = True
+                 self.sample_distribution = np.array([d.lmbd for d in datasets[1:]]) / sum([d.lmbd for d in datasets[1:]])
+                 self.adjusted_batch_size_factor = batch_size_factor
+         else:
+             self.sample_auxiliary = False
+             self.adjusted_batch_size_factor = 1
+
+         for i, d in enumerate(datasets):
+             self.text += d.text
+             self.ds_index.append(i * torch.ones(len(d.text), dtype=torch.long))
+             self.labels.append(d.labels)
+             self.class_definitions.append(d.class_definitions)
+             self.lambdas.append(d.lmbd)
+         self.labels = torch.cat(self.labels, dim=0)
+         self.ds_index = torch.cat(self.ds_index, dim=0)
+
+     def sample_auxiliary_instace(self):
+         sampled_dataset_idx = np.random.choice(
+             np.arange(1, len(self.dataset_sizes)),
+             p=self.sample_distribution
+         )
+         instance_idx = np.random.choice(
+             self.dataset_sizes[sampled_dataset_idx]
+         ) + sum(self.dataset_sizes[:sampled_dataset_idx])
+         return instance_idx
+
+     def __len__(self):
+         if self.sample_auxiliary:  # if the auxiliary dataset is larger than the main dataset
+             return self.dataset_sizes[0] * self.adjusted_batch_size_factor
+         return len(self.labels)
+
+     def __getitem__(self, idx):
+         '''Get datapoint with index'''
+         if idx < self.dataset_sizes[0] or not self.sample_auxiliary:
+             return (self.text[idx], self.labels[idx], self.ds_index[idx])
+         else:
+             real_idx = self.sample_auxiliary_instace()
+             return (self.text[real_idx], self.labels[real_idx], self.ds_index[real_idx])
+
+ def load_class_definitions(filename):
+     with open(filename, 'r') as f:
+         class_definitions = json.load(f)
+
+     results = {k:{} for k in class_definitions.keys()}
+     for k, v in class_definitions.items():
+         for kk, vv in v.items():
+             results[k][kk.lower()] = vv
+     return results
+
+ def create_data_channels(filename, class_definition_filename, split=None, lmbd=1.0):
+     data = pd.read_csv(filename, sep='\t')
+     data = data.fillna(' ')
+
+     print('Number of data instance: {}'.format(data.shape[0]))
+
+     # map labels to ids
+     unique_labels = data['label'].unique().tolist()
+     label2id = {lb: i for i, lb in enumerate(unique_labels)}
+
+     data['label'] = data['label'].apply(
+         lambda x: label2id[x])
+
+     data_train = data[data['split'] == 'train'].reset_index()
+     data_val = data[data['split'] == 'val'].reset_index()
+     data_test = data[data['split'] == 'test'].reset_index()
+
+     class_definitions = load_class_definitions(class_definition_filename)
+     dataname = filename.split('/')[-1].split('.')[0]
+     data_class_definitions = [class_definitions[dataname][lb.lower()] for lb in unique_labels]
+
+     train_data = Dataset(data_train, data_class_definitions, lmbd=lmbd)
+     val_data = Dataset(data_val, data_class_definitions, lmbd=lmbd)
+     test_data = Dataset(data_test, data_class_definitions, lmbd=lmbd)
+
+     return train_data, val_data, test_data, unique_labels
+
+ def create_single_data_object(filename, class_definition_filename, split=None, lmbd=1.0):
+     data = pd.read_csv(filename, sep='\t')
+     data = data.fillna(' ')
+
+     print('Number of data instance: {}'.format(data.shape[0]))
+
+     # map labels to ids
+     unique_labels = data['label'].unique()
+     label2id = {lb: i for i, lb in enumerate(unique_labels)}
+
+     data['label'] = data['label'].apply(
+         lambda x: label2id[x])
+
+     class_definitions = load_class_definitions(class_definition_filename)
+     dataname = filename.split('/')[-1].split('.')[0]
+     data_class_definitions = [class_definitions[dataname][lb.lower()] for lb in unique_labels]
+
+     if split is None:
+         return Dataset(data, data_class_definitions, lmbd=lmbd), unique_labels
+     else:
+         return Dataset(data[data['split'] == split].reset_index(), data_class_definitions, lmbd=lmbd), unique_labels
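A minimal sketch of wiring the data utilities above into a `DataLoader`. Paths follow the bundled `Deep-Citation/` layout and the SciBERT checkpoint name is an assumption; it presumes `Data/acl.tsv` has the `context`, `label`, and `split` columns these helpers expect.

```python
from torch.utils.data import DataLoader

from data import CollateFn, MultiHeadDatasets, create_data_channels  # assumes cwd is Deep-Citation/

# Build train/val/test splits from the bundled ACL TSV and class definitions.
train_data, val_data, test_data, labels = create_data_channels(
    "Data/acl.tsv", "Data/class_def.json"
)
multi_train = MultiHeadDatasets([train_data])

# The collate function tokenizes contexts and the class definitions in one go.
collate = CollateFn(
    "allenai/scibert_scivocab_uncased",
    class_definitions=multi_train.class_definitions,
)
loader = DataLoader(multi_train, batch_size=32, shuffle=True, collate_fn=collate)

# Each batch: tokenized contexts, labels, per-dataset head indices, and the
# tokenized class definitions consumed by MultiHeadLanguageModel.
tokens, batch_labels, ds_indices, class_tokens, class_head_indices = next(iter(loader))
print(tokens["input_ids"].shape, batch_labels.shape, class_head_indices.shape)
```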
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.11-slim
+
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+     PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     STREAMLIT_SERVER_HEADLESS=true
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     git \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt /app/requirements.txt
+ COPY hf_space/requirements.txt /app/hf_space/requirements.txt
+ RUN python -m pip install --upgrade pip && \
+     pip install -r requirements.txt
+
+ COPY . /app
+
+ EXPOSE 7860
+
+ CMD ["streamlit", "run", "hf_space/streamlit_app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]
README.md ADDED
@@ -0,0 +1,232 @@
+ ---
+ title: SciPaths
+ emoji: 🔬
+ colorFrom: blue
+ colorTo: indigo
+ sdk: docker
+ pinned: false
+ ---
+
+ # SciPaths
+
+ SciPaths runs an end-to-end target-contribution pathway pipeline for arXiv papers. It collects downstream citation evidence, derives target contributions from refined citation clusters, decomposes each target contribution into enabling contributions, and grounds those enabling contributions in prior studies.
+
+ The Hugging Face Space launches the Streamlit app from `hf_space/streamlit_app.py`.
+
+ ## Citation
+
+ If you find this useful, please cite our paper as:
+
+ ```bibtex
+ @misc{chamoun2026scipathsforecastingpathwaysscientific,
+       title={SciPaths: Forecasting Pathways to Scientific Discovery},
+       author={Eric Chamoun and Yizhou Chi and Yulong Chen and Rui Cao and Zifeng Ding and Michalis Korakakis and Andreas Vlachos},
+       year={2026},
+       eprint={2605.14600},
+       archivePrefix={arXiv},
+       primaryClass={cs.CL},
+       url={https://arxiv.org/abs/2605.14600},
+ }
+ ```
+
+ Paper URL: https://arxiv.org/abs/2605.14600
+
+ ## Required Secrets
+
+ Set this in the Space settings before publishing:
+
+ ```text
+ GEMINI_API_KEY=<Google Gemini API key>
+ ```
+
+ Optional, for saving completed run artifacts to a Hugging Face Dataset:
+
+ ```text
+ HF_WRITE_TOKEN=<Hugging Face write token>
+ RUNS_REPO_ID=<owner/dataset-name>
+ RUNS_REPO_TYPE=dataset
+ ```
+
+ Optional, for higher Semantic Scholar limits:
+
+ ```text
+ SEMANTIC_SCHOLAR_API_KEY=<Semantic Scholar API key>
+ ```
+
+ ## Run The Demo Locally
+
+ ```bash
+ pip install -r requirements.txt
+ streamlit run hf_space/streamlit_app.py
+ ```
+
+ Then enter an arXiv URL or ID, for example:
+
+ ```text
+ https://arxiv.org/abs/2211.08788
+ ```
+
+ The app writes each run under:
+
+ ```text
+ hf_space/runs/<job_id>/
+ ```
+
+ ## Run One Example From The Command Line
+
+ This example stores all intermediate files under `runs/example/processed_papers`.
+
+ ```bash
+ mkdir -p runs/example
+ printf '[{"id":"2211.08788","title":"","id_type":"ArXiv"}]\n' > runs/example/input_ids.json
+
+ python src/step_01_fetch/fetch_metadata.py \
+   --ids runs/example/input_ids.json \
+   --outdir runs/example/processed_papers
+
+ python src/step_02_mark_citations/replace_citation_markers.py \
+   --root runs/example/processed_papers
+
+ python src/step_03_usage_contexts/build_usage_contexts.py \
+   --root runs/example/processed_papers \
+   --out-name usage_contexts.json
+
+ python src/step_04_label_citations/label_citation_functions.py \
+   --root runs/example/processed_papers \
+   --model-path Deep-Citation/Workspace/acl_scicite_wksp_trl/best_model.pt \
+   --model-data-dir Deep-Citation/Data \
+   --model-class-def Deep-Citation/Data/class_def.json \
+   --model-lm scibert \
+   --device cpu
+
+ python src/step_05_verify_uses_extends/verify_uses_extends.py \
+   --root runs/example/processed_papers \
+   --k 0 \
+   --batch-size 25
+
+ python src/step_06_extract_paragraphs/extract_arxiv_paragraphs.py \
+   --root runs/example/processed_papers
+
+ python src/step_07_extract_and_refine/extract_contributions_from_citations.py \
+   --root runs/example/processed_papers
+
+ python src/step_07_extract_and_refine/refine_and_filter_clusters_llm.py \
+   --root runs/example/processed_papers \
+   --inplace \
+   --overwrite
+
+ PYTHONPATH=src \
+ python -m step_08_annotation.cli run \
+   --paper-dir runs/example/processed_papers/2211.08788 \
+   --provider gemini \
+   --model gemini/gemini-3.1-pro-preview \
+   --formatter-model gemini/gemini-3.1-pro-preview \
+   --judge-model gemini/gemini-3.1-pro-preview \
+   --candidate-count 3 \
+   --output-root runs/example/two_pass_outputs
+ ```
+
+ The final UI payload is written as `pass_2_ui_payload.json` inside the annotation run directory printed by the last command.
+
+ ## Run Each Step On A Set Of Papers
+
+ Create an ID file with one entry per paper:
+
+ ```json
+ [
+   {"id": "2211.08788", "title": "", "id_type": "ArXiv"},
+   {"id": "2311.14919", "title": "", "id_type": "ArXiv"}
+ ]
+ ```
+
+ Save it as `runs/batch/input_ids.json`, then run:
+
+ ```bash
+ mkdir -p runs/batch
+
+ # 1. Fetch metadata + LaTeX for each input paper.
+ python src/step_01_fetch/fetch_metadata.py \
+   --ids runs/batch/input_ids.json \
+   --outdir runs/batch/processed_papers
+
+ # 2. Add explicit citation markers to the target-paper text.
+ python src/step_02_mark_citations/replace_citation_markers.py \
+   --root runs/batch/processed_papers
+
+ # 3. Build downstream citation usage contexts.
+ python src/step_03_usage_contexts/build_usage_contexts.py \
+   --root runs/batch/processed_papers \
+   --out-name usage_contexts.json
+
+ # 4. Label citation functions with the bundled Deep-Citation classifier.
+ python src/step_04_label_citations/label_citation_functions.py \
+   --root runs/batch/processed_papers \
+   --model-path Deep-Citation/Workspace/acl_scicite_wksp_trl/best_model.pt \
+   --model-data-dir Deep-Citation/Data \
+   --model-class-def Deep-Citation/Data/class_def.json \
+   --model-lm scibert \
+   --device cpu
+
+ # 5. Verify USES/EXTENDS citations with an LLM.
+ python src/step_05_verify_uses_extends/verify_uses_extends.py \
+   --root runs/batch/processed_papers \
+   --k 0 \
+   --batch-size 25
+
+ # 6. Extract arXiv paragraphs from downstream citing papers.
+ python src/step_06_extract_paragraphs/extract_arxiv_paragraphs.py \
+   --root runs/batch/processed_papers
+
+ # 7. Extract downstream contribution clusters, then merge/filter them.
+ python src/step_07_extract_and_refine/extract_contributions_from_citations.py \
+   --root runs/batch/processed_papers
+
+ python src/step_07_extract_and_refine/refine_and_filter_clusters_llm.py \
+   --root runs/batch/processed_papers \
+   --inplace \
+   --overwrite
+
+ # 8. Annotate each ready paper: target contributions, enabling contributions, and groundings.
+ for paper_dir in runs/batch/processed_papers/*; do
+   [ -d "$paper_dir" ] || continue
+   [ -f "$paper_dir/usage_discovery_from_contributions.json" ] || continue
+   PYTHONPATH=src \
+   python -m step_08_annotation.cli run \
+     --paper-dir "$paper_dir" \
+     --provider gemini \
+     --model gemini/gemini-3.1-pro-preview \
+     --formatter-model gemini/gemini-3.1-pro-preview \
+     --judge-model gemini/gemini-3.1-pro-preview \
+     --candidate-count 3 \
+     --output-root runs/batch/two_pass_outputs
+ done
+ ```
+
+ ## Pipeline Steps
+
+ 1. **Fetch metadata + LaTeX.** Downloads target-paper metadata, references, citing-paper metadata, and arXiv source where available.
+ 2. **Add citation markers.** Inserts normalized citation markers into the target paper so downstream citation contexts can be aligned.
+ 3. **Build usage contexts.** Collects text windows around downstream citations to the target paper.
+ 4. **Label citation functions.** Uses the bundled Deep-Citation classifier to label citation contexts as background, use, extension, comparison, and related categories.
+ 5. **Verify USES/EXTENDS.** Uses an LLM to check whether candidate downstream citations genuinely use or extend the target paper.
+ 6. **Extract arXiv paragraphs.** Retrieves fuller paragraphs from citing papers so the system has enough context for contribution extraction.
+ 7. **Extract and refine target-contribution clusters.** Extracts what downstream papers use the target paper for, clusters near-duplicates, and filters weak/non-usage evidence.
+ 8. **Annotate pathways.** Derives target contributions from the refined clusters, decomposes each into enabling contributions, selects primary groundings, and records additional grounding studies.
+
+ ## Important Files
+
+ ```text
+ hf_space/streamlit_app.py           Streamlit UI
+ hf_space/runner.py                  Orchestrates steps 1-7 for the UI
+ hf_space/streamlit_config.py        Example papers and tab names
+ src/common/                         Shared LLM and paper-package utilities
+ src/step_01_fetch/                  Metadata, references, citations, and LaTeX
+ src/step_02_mark_citations/         Citation-marker insertion
+ src/step_03_usage_contexts/         Downstream usage-context construction
+ src/step_04_label_citations/        Deep-Citation citation-function labeling
+ src/step_05_verify_uses_extends/    LLM verification of USES/EXTENDS citations
+ src/step_06_extract_paragraphs/     ArXiv paragraph extraction from citing papers
+ src/step_07_extract_and_refine/     Contribution extraction and cluster refinement
+ src/step_08_annotation/             Target/enabling contribution annotation and grounding
+ Deep-Citation/                      Bundled citation-function classifier assets
+ ```
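A short sketch of inspecting the final payload produced by the command-line example above. The field names (`claims`, `rewritten_claim`, `ingredients`) follow what the Streamlit UI reads; treat them as assumptions if the payload format evolves.

```python
import json
from pathlib import Path

# Locate the payload inside the annotation run directory printed by the last command.
payload_path = next(Path("runs/example/two_pass_outputs").rglob("pass_2_ui_payload.json"))
payload = json.loads(payload_path.read_text(encoding="utf-8"))

for idx, claim in enumerate(payload.get("claims") or [], start=1):
    text = claim.get("rewritten_claim") or claim.get("text") or "(missing text)"
    ingredients = claim.get("ingredients") or []
    print(f"[{idx}] {text}")
    print(f"     enabling contributions: {len(ingredients)}")
```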
app.py ADDED
@@ -0,0 +1,5 @@
+ from hf_space.streamlit_app import main
+
+
+ if __name__ == "__main__":
+     main()
hf_space/requirements.txt ADDED
@@ -0,0 +1,17 @@
+ streamlit>=1.36.0
+ arxiv==2.2.0
+ requests==2.32.5
+ google-generativeai
+ litellm
+ rapidfuzz
+ bibtexparser
+ sentence-transformers
+ transformers
+ torch
+ huggingface_hub
+ typer
+ tqdm
+ pydantic
+ numpy
+ pandas
+ scipy
hf_space/runner.py ADDED
@@ -0,0 +1,333 @@
+ import json
+ import os
+ import re
+ import shutil
+ import subprocess
+ import sys
+ import time
+ import uuid
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Generator, List, Optional, Tuple
+
+
+ @dataclass
+ class PipelineConfig:
+     repo_root: Path
+     source_root: Path
+     paper_input: str
+     llm_provider: str
+     llm_model: str
+     llm_model_step4: str
+     model_path: str
+     model_data_dir: str
+     model_class_def: str
+     model_lm: str
+     device: str
+     embedding_model: str
+
+
+ @dataclass
+ class PipelineResult:
+     job_id: str
+     job_dir: Path
+     paper_dir: Path
+     zip_path: Path
+
+
+ STEP_LABELS = {
+     1: "Fetch metadata + LaTeX for input paper",
+     2: "Add citation markers",
+     3: "Build usage contexts",
+     4: "Label citation functions",
+     5: "Verify USES/EXTENDS",
+     6: "Extract arXiv paragraphs",
+     7: "Extract target contributions and refine clusters",
+ }
+
+ FULL_STEPS = [1, 2, 3, 4, 5, 6, 7]
+ STOP_PREFIX = "Pipeline stopped:"
+
+
+ def parse_arxiv_id(paper_input: str) -> str:
+     s = (paper_input or "").strip()
+     if not s:
+         raise ValueError("paper_input is required")
+     if "arxiv.org" in s:
+         m = re.search(r"arxiv\.org/(abs|pdf)/([^/?#]+)", s)
+         if not m:
+             raise ValueError(f"Could not parse arXiv ID from URL: {s}")
+         s = m.group(2)
+     s = s.replace(".pdf", "")
+     s = re.sub(r"v\d+$", "", s)
+     if not re.match(r"^[0-9]{4}\.[0-9]{4,5}$", s):
+         raise ValueError(f"Invalid arXiv ID format: {s}")
+     return s
+
+
+ def _build_commands(
+     cfg: PipelineConfig,
+     step: int,
+     job_processed_root: Path,
+     paper_id: str,
+     ids_path: Optional[Path],
+ ) -> List[List[str]]:
+     py = sys.executable
+     if step == 1:
+         assert ids_path is not None
+         return [[
+             py,
+             "src/step_01_fetch/fetch_metadata.py",
+             "--ids",
+             str(ids_path),
+             "--outdir",
+             str(job_processed_root),
+         ]]
+     if step == 2:
+         return [[py, "src/step_02_mark_citations/replace_citation_markers.py", "--root", str(job_processed_root)]]
+     if step == 3:
+         return [[py, "src/step_03_usage_contexts/build_usage_contexts.py", "--root", str(job_processed_root), "--out-name", "usage_contexts.json"]]
+     if step == 4:
+         return [[
+             py,
+             "src/step_04_label_citations/label_citation_functions.py",
+             "--root",
+             str(job_processed_root),
+             "--model-path",
+             cfg.model_path,
+             "--model-data-dir",
+             cfg.model_data_dir,
+             "--model-class-def",
+             cfg.model_class_def,
+             "--model-lm",
+             cfg.model_lm,
+             "--device",
+             cfg.device,
+         ]]
+     if step == 5:
+         return [[
+             py,
+             "src/step_05_verify_uses_extends/verify_uses_extends.py",
+             "--root",
+             str(job_processed_root),
+             "--k",
+             "0",
+             "--batch-size",
+             "25",
+         ]]
+     if step == 6:
+         return [[py, "src/step_06_extract_paragraphs/extract_arxiv_paragraphs.py", "--root", str(job_processed_root)]]
+     if step == 7:
+         return [
+             [py, "src/step_07_extract_and_refine/extract_contributions_from_citations.py", "--root", str(job_processed_root)],
+             [py, "src/step_07_extract_and_refine/refine_and_filter_clusters_llm.py", "--root", str(job_processed_root), "--inplace", "--overwrite"],
+         ]
+     raise ValueError(f"Unknown step: {step}")
+
+
+ def _write_single_id_file(job_dir: Path, arxiv_id: str) -> Path:
+     ids_path = job_dir / "input_ids.json"
+     payload = [{"id": arxiv_id, "title": "", "id_type": "ArXiv"}]
+     ids_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+     return ids_path
+
+
+ def _write_run_metadata(cfg: PipelineConfig, job_dir: Path, paper_id: str, arxiv_id: str) -> None:
+     payload = {
+         "paper_input": cfg.paper_input,
+         "paper_id": paper_id,
+         "arxiv_id": arxiv_id,
+         "source_root": str(cfg.source_root),
+         "steps": FULL_STEPS + ["annotation"],
+         "llm_provider": cfg.llm_provider,
+         "llm_model": cfg.llm_model,
+         "llm_model_step4": cfg.llm_model_step4,
+         "device": cfg.device,
+         "embedding_model": cfg.embedding_model,
+         "timestamp": int(time.time()),
+     }
+     (job_dir / "run_config.json").write_text(json.dumps(payload, indent=2), encoding="utf-8")
+
+
+ def _zip_job_dir(job_dir: Path) -> Path:
+     zip_base = job_dir.parent / job_dir.name
+     archive = shutil.make_archive(str(zip_base), "zip", root_dir=str(job_dir))
+     return Path(archive)
+
+
+ def _tail_log(path: Path, max_lines: int = 60) -> str:
+     try:
+         lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
+     except Exception:
+         return ""
+     if not lines:
+         return ""
+     return "\n".join(lines[-max_lines:])
+
+
+ def _load_json(path: Path, default=None):
+     try:
+         return json.loads(path.read_text(encoding="utf-8"))
+     except Exception:
+         return default
+
+
+ def _write_summary_and_zip(job_dir: Path, summary_lines: List[str]) -> Path:
+     (job_dir / "summary.txt").write_text("\n".join(summary_lines), encoding="utf-8")
+     return _zip_job_dir(job_dir)
+
+
+ def _count_verified_uses_extends(payload: dict) -> int:
+     records = payload.get("confirmed") or payload.get("verified_contexts") or payload.get("contexts") or payload.get("items") or []
+     if not isinstance(records, list):
+         return 0
+     accepted = {"USES", "EXTENDS", "Uses", "Extends"}
+     return sum(1 for item in records if isinstance(item, dict) and item.get("label") in accepted)
+
+
+ def _stop_reason_after_step(step: int, paper_dir: Path) -> str | None:
+     if step == 1:
+         if not paper_dir.exists():
+             return "metadata could not be fetched for this paper"
+         if not (paper_dir / "processed_main.tex").exists():
+             return "arXiv source could not be retrieved or converted for this paper"
+         citations = _load_json(paper_dir / "citations_metadata.json", [])
+         if not isinstance(citations, list) or not citations:
+             return "Semantic Scholar returned no citing papers for this target paper"
+
+     if step == 3:
+         usage = _load_json(paper_dir / "usage_contexts.json", {})
+         if not isinstance(usage, dict):
+             return "citation usage contexts could not be built"
+         if int(usage.get("num_contexts") or 0) == 0:
+             return "no citation usage contexts were found"
+
+     if step == 4:
+         labels = _load_json(paper_dir / "usage_context_labels.json", {})
+         contexts = labels.get("labels") if isinstance(labels, dict) else None
+         if not isinstance(contexts, list) or not contexts:
+             return "citation-function labeling produced no labeled contexts"
+
+     if step == 5:
+         verified = _load_json(paper_dir / "usage_uses_extends_verified.json", {})
+         if not isinstance(verified, dict):
+             return "USES/EXTENDS verification did not produce an output file"
+         if _count_verified_uses_extends(verified) == 0:
+             return "no downstream citations were verified as USES or EXTENDS"
+
+     if step == 6:
+         paragraphs = _load_json(paper_dir / "usage_citing_paragraphs.json", {})
+         citing = paragraphs.get("citing_papers") if isinstance(paragraphs, dict) else None
+         if not isinstance(citing, list) or not citing:
+             return "no citing-paper paragraphs could be extracted from arXiv"
+         usable = [
+             item for item in citing
+             if isinstance(item, dict)
+             and not item.get("error")
+             and (item.get("matched_paragraphs") or item.get("target_citing_paragraphs"))
+         ]
+         if not usable:
+             return "arXiv paragraph extraction returned no usable citing-paper text"
+
+     if step == 7:
+         contributions = _load_json(paper_dir / "usage_contributions.json", {})
+         items = contributions.get("contributions") if isinstance(contributions, dict) else None
+         if not isinstance(items, list) or not items:
+             return "no downstream target-contribution evidence could be extracted"
+         refined = _load_json(paper_dir / "usage_discovery_from_contributions.json", {})
+         clusters = refined.get("clusters") if isinstance(refined, dict) else None
+         if not isinstance(clusters, list) or not clusters:
+             return "no valid downstream usage clusters survived refinement"
+
+     return None
+
+
+ def run_pipeline(cfg: PipelineConfig, output_root: Path) -> Generator[Tuple[str, Optional[str]], None, PipelineResult]:
+     output_root.mkdir(parents=True, exist_ok=True)
+     job_id = f"job_{int(time.time())}_{uuid.uuid4().hex[:8]}"
+     job_dir = output_root / job_id
+     job_processed_root = job_dir / "processed_papers"
+     job_logs = job_dir / "logs"
+
+     job_processed_root.mkdir(parents=True, exist_ok=True)
+     job_logs.mkdir(parents=True, exist_ok=True)
+
+     arxiv_id = parse_arxiv_id(cfg.paper_input)
+     paper_id = arxiv_id
+     ids_path = _write_single_id_file(job_dir, arxiv_id)
+     _write_run_metadata(cfg, job_dir, paper_id, arxiv_id)
+
+     base_env = os.environ.copy()
+     base_env["LLM_PROVIDER"] = cfg.llm_provider
+     base_env["LLM_MODEL"] = cfg.llm_model
+
+     summary_lines: List[str] = []
+     paper_dir = job_processed_root / paper_id
+
+     max_step = 8
+     for step in FULL_STEPS:
+         label = STEP_LABELS[step]
+         log_file = job_logs / f"step_{step:02d}.log"
+         summary_lines.append(f"[{step}] {label}")
+         yield (f"Step {step}/{max_step}: {label}", None)
+
+         env = base_env.copy()
+         if step == 5 and cfg.llm_model_step4:
+             env["LLM_MODEL"] = cfg.llm_model_step4
+
+         with log_file.open("w", encoding="utf-8") as lf:
+             return_code = 0
+             failed_cmd: List[str] | None = None
+             for cmd in _build_commands(cfg, step, job_processed_root, paper_id, ids_path):
+                 lf.write(f"$ {' '.join(cmd)}\n\n")
+                 proc = subprocess.Popen(
+                     cmd,
+                     cwd=str(cfg.repo_root),
+                     stdout=subprocess.PIPE,
+                     stderr=subprocess.STDOUT,
+                     text=True,
+                     encoding="utf-8",
+                     errors="ignore",
+                     env=env,
+                 )
+                 assert proc.stdout is not None
+                 for line in proc.stdout:
+                     lf.write(line)
+                 return_code = proc.wait()
+                 if return_code != 0:
+                     failed_cmd = cmd
+                     break
+
+         if return_code != 0:
+             summary_lines.append(f"FAILED at step {step}")
+             zip_path = _write_summary_and_zip(job_dir, summary_lines)
+             tail = _tail_log(log_file)
+             if tail:
+                 yield (
+                     f"Step {step} failed.\n\nCommand: {' '.join(failed_cmd or [])}\n\nLast log lines:\n{tail}",
+                     str(zip_path),
+                 )
+             else:
+                 yield (f"Step {step} failed. Command: {' '.join(failed_cmd or [])}", str(zip_path))
+             return PipelineResult(job_id=job_id, job_dir=job_dir, paper_dir=paper_dir, zip_path=zip_path)
+         else:
+             yield (f"Step {step} complete", None)
+
+         if step == 1 and not paper_dir.exists():
+             summary_lines.append("FAILED: fetch_metadata did not create paper directory")
+             zip_path = _write_summary_and_zip(job_dir, summary_lines)
+             yield (f"Step 1 finished but paper dir missing: {paper_dir}", str(zip_path))
+             return PipelineResult(job_id=job_id, job_dir=job_dir, paper_dir=paper_dir, zip_path=zip_path)
+
+         stop_reason = _stop_reason_after_step(step, paper_dir)
+         if stop_reason:
+             message = f"{STOP_PREFIX} {stop_reason}."
+             summary_lines.append(message)
+             zip_path = _write_summary_and_zip(job_dir, summary_lines)
+             yield (message, str(zip_path))
+             return PipelineResult(job_id=job_id, job_dir=job_dir, paper_dir=paper_dir, zip_path=zip_path)
+
+     summary_lines.append("SUCCESS")
+     zip_path = _write_summary_and_zip(job_dir, summary_lines)
+     yield ("Pipeline completed successfully.", str(zip_path))
+     return PipelineResult(job_id=job_id, job_dir=job_dir, paper_dir=paper_dir, zip_path=zip_path)
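A minimal sketch of driving `run_pipeline` outside Streamlit. The config values mirror defaults seen elsewhere in the repo; the embedding-model name is a placeholder assumption, and `GEMINI_API_KEY` must already be set in the environment.

```python
import sys
from pathlib import Path

repo_root = Path.cwd()  # assumes you launch this from the repository root
sys.path.insert(0, str(repo_root / "hf_space"))  # same trick the Streamlit app uses

from runner import PipelineConfig, run_pipeline

cfg = PipelineConfig(
    repo_root=repo_root,
    source_root=repo_root / "src" / "processed_papers",
    paper_input="https://arxiv.org/abs/2211.08788",
    llm_provider="gemini",
    llm_model="gemini/gemini-3.1-pro-preview",
    llm_model_step4="gemini/gemini-3.1-pro-preview",
    model_path="Deep-Citation/Workspace/acl_scicite_wksp_trl/best_model.pt",
    model_data_dir="Deep-Citation/Data",
    model_class_def="Deep-Citation/Data/class_def.json",
    model_lm="scibert",
    device="cpu",
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # placeholder; the UI supplies its own default
)

# run_pipeline is a generator: it yields (status_message, optional_zip_path) after each
# step and returns a PipelineResult when it finishes or stops early.
gen = run_pipeline(cfg, output_root=repo_root / "hf_space" / "runs")
result = None
while True:
    try:
        message, zip_path = next(gen)
    except StopIteration as stop:
        result = stop.value  # PipelineResult
        break
    print(message if zip_path is None else f"{message} (artifacts: {zip_path})")

if result is not None:
    print("job directory:", result.job_dir)
```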
hf_space/streamlit_app.py ADDED
@@ -0,0 +1,864 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import time
5
+ import html
6
+ from pathlib import Path
7
+ from typing import Any, Optional
8
+
9
+ import streamlit as st
10
+ try:
11
+ from huggingface_hub import HfApi
12
+ except Exception:
13
+ HfApi = None
14
+
15
+ SRC = Path(__file__).resolve().parent
16
+ REPO_ROOT = SRC.parent
17
+ for extra in (SRC, REPO_ROOT / "src"):
18
+ extra_str = str(extra)
19
+ if extra_str not in sys.path:
20
+ sys.path.insert(0, extra_str)
21
+
22
+ import runner as runner_module
23
+ from runner import PipelineConfig
24
+ from common.paper_package import load_paper_package
25
+ from step_08_annotation.pipeline import TwoPassAnnotationPipeline
26
+ from streamlit_config import EXAMPLES, TAB_NAMES
27
+
28
+ DEFAULT_SOURCE_ROOT = str(REPO_ROOT / "src" / "processed_papers")
29
+ DEFAULT_OUTPUT_ROOT = str(REPO_ROOT / "hf_space" / "runs")
30
+
31
+ CUSTOM_CSS = """
32
+ <style>
33
+ .block-container {max-width: 1450px; padding-top: 2rem; padding-bottom: 2rem;}
34
+ [data-testid="stSidebar"] {background: #f5f7fb; border-right: 1px solid #e2e8f0;}
35
+ .hero-title {font-size: 3rem; font-weight: 800; letter-spacing: -0.03em; color: #1f2937; margin-bottom: 0.35rem;}
36
+ .hero-sub {font-size: 1rem; color: #6b7280; max-width: 920px; margin-bottom: 1.25rem;}
37
+ .metric-card {background: #ffffff; border: 1px solid #e5e7eb; border-radius: 16px; padding: 1rem 1.1rem; min-height: 96px;}
38
+ .metric-label {font-size: 0.78rem; font-weight: 700; color: #6b7280; text-transform: uppercase; letter-spacing: 0.04em;}
39
+ .metric-value {font-size: 1.7rem; font-weight: 800; color: #111827; margin-top: 0.35rem;}
40
+ .soft-card {background: #ffffff; border: 1px solid #e5e7eb; border-radius: 16px; padding: 1rem 1.1rem;}
41
+ .claim-card {background: #ffffff; border: 1px solid #e5e7eb; border-radius: 18px; overflow: hidden; margin-bottom: 1rem;}
42
+ .claim-head {padding: 1rem 1.1rem; border-bottom: 1px solid #eef2f7; background: #fcfdff;}
43
+ .claim-kicker {font-size: 0.78rem; font-weight: 800; color: #2563eb; text-transform: uppercase; letter-spacing: 0.04em; margin-bottom: 0.45rem;}
44
+ .claim-text {font-size: 1.05rem; line-height: 1.55; font-weight: 700; color: #111827;}
45
+ .claim-grid {display: grid; grid-template-columns: 1.7fr 1fr;}
46
+ .claim-main, .claim-side {padding: 1rem 1.1rem;}
47
+ .claim-side {border-left: 1px solid #eef2f7; background: #fbfdff;}
48
+ .section-label {font-size: 0.78rem; font-weight: 800; color: #6b7280; text-transform: uppercase; letter-spacing: 0.04em; margin-bottom: 0.7rem;}
49
+ .pill-row {display: flex; flex-wrap: wrap; gap: 0.45rem; margin-top: 0.8rem;}
50
+ .pill {display: inline-block; padding: 0.28rem 0.7rem; border-radius: 999px; border: 1px solid #dbe4f0; background: #f8fbff; color: #1d4ed8; font-size: 0.78rem; font-weight: 700;}
51
+ .ingredient-card {border: 1px solid #e6edf7; border-left: 4px solid #2563eb; border-radius: 12px; background: #ffffff; padding: 0.9rem; margin-bottom: 0.8rem;}
52
+ .ingredient-top {display: flex; justify-content: space-between; gap: 0.7rem; align-items: flex-start; margin-bottom: 0.45rem;}
53
+ .ingredient-name {font-size: 0.98rem; font-weight: 800; color: #111827; line-height: 1.4;}
54
+ .role-pill {display: inline-block; padding: 0.2rem 0.55rem; border-radius: 999px; border: 1px solid #ddd6fe; background: #f5f3ff; color: #6d28d9; font-size: 0.72rem; font-weight: 800; white-space: nowrap;}
55
+ .field {font-size: 0.88rem; line-height: 1.5; color: #374151; margin-top: 0.4rem;}
56
+ .field b {color: #111827;}
57
+ .grounding-block {margin-top: 0.75rem; display: grid; gap: 0.55rem;}
58
+ .grounding-card {border-radius: 10px; padding: 0.65rem 0.75rem; border: 1px solid #bfdbfe; background: #eff6ff;}
59
+ .grounding-card.additional {border-color: #fed7aa; background: #fff7ed;}
60
+ .grounding-label {font-size: 0.7rem; font-weight: 900; text-transform: uppercase; letter-spacing: 0.05em; margin-bottom: 0.25rem;}
61
+ .grounding-label.primary {color: #1d4ed8;}
62
+ .grounding-label.additional {color: #c2410c;}
63
+ .grounding-title {font-size: 0.9rem; font-weight: 800; color: #111827; line-height: 1.35;}
64
+ .grounding-meta {font-size: 0.78rem; color: #64748b; margin-top: 0.2rem;}
65
+ .cluster-card {border: 1px solid #e5e7eb; border-radius: 16px; background: #ffffff; padding: 1rem 1.1rem; margin-bottom: 0.9rem;}
66
+ .cluster-card.additional-study {border-color: #fed7aa; background: #fff7ed;}
67
+ .cluster-title {font-size: 1rem; font-weight: 800; color: #111827; line-height: 1.45; margin-bottom: 0.4rem;}
68
+ .cluster-meta {font-size: 0.86rem; color: #6b7280; margin-bottom: 0.65rem;}
69
+ .empty-card {border: 1px dashed #cbd5e1; border-radius: 14px; padding: 1rem; background: #ffffff; color: #64748b;}
70
+ .example-btn button {border-radius: 999px !important; border: 1px solid #fecaca !important; color: #991b1b !important; background: #fff !important;}
71
+ @media (max-width: 1050px) {.claim-grid {grid-template-columns: 1fr;} .claim-side {border-left: none; border-top: 1px solid #eef2f7;}}
72
+ </style>
73
+ """
74
+
75
+
76
+ def get_secret(name: str, default: str = "") -> str:
77
+ value = os.getenv(name)
78
+ if value:
79
+ return value
80
+ try:
81
+ return st.secrets[name]
82
+ except Exception:
83
+ return default
84
+
85
+
86
+ def run_repo_config() -> tuple[str | None, str, str | None]:
87
+ repo_id = get_secret("RUNS_REPO_ID", "")
88
+ repo_type = get_secret("RUNS_REPO_TYPE", "dataset")
89
+ token = get_secret("HF_WRITE_TOKEN", "") or get_secret("HF_TOKEN", "")
90
+ return repo_id or None, repo_type, token or None
91
+
92
+
93
+ def remote_run_prefix(job_id: str) -> str:
94
+ return f"runs/{job_id}"
95
+
96
+
97
+ def upload_run_artifact(job_dir: Path) -> str:
98
+ repo_id, repo_type, token = run_repo_config()
99
+ if not repo_id or not token:
100
+ return ""
101
+ if HfApi is None:
102
+ return "upload_failed: huggingface_hub is not installed"
103
+
104
+ job_id = job_dir.name
105
+ remote_prefix = remote_run_prefix(job_id)
106
+ uploaded: list[str] = []
107
+ try:
108
+ api = HfApi(token=token)
109
+ for name in ["input_ids.json", "run_config.json", "summary.txt"]:
110
+ path = job_dir / name
111
+ if path.exists():
112
+ api.upload_file(
113
+ path_or_fileobj=str(path),
114
+ path_in_repo=f"{remote_prefix}/{name}",
115
+ repo_id=repo_id,
116
+ repo_type=repo_type,
117
+ commit_message=f"Upload {name} for {job_id}",
118
+ )
119
+ uploaded.append(name)
120
+
121
+ for folder_name in ["logs", "processed_papers", "two_pass_outputs"]:
122
+ folder = job_dir / folder_name
123
+ if not folder.exists():
124
+ continue
125
+ files = [path for path in folder.rglob("*") if path.is_file()]
126
+ if not files:
127
+ continue
128
+ api.upload_folder(
129
+ folder_path=str(folder),
130
+ path_in_repo=f"{remote_prefix}/{folder_name}",
131
+ repo_id=repo_id,
132
+ repo_type=repo_type,
133
+ commit_message=f"Upload {folder_name} for {job_id}",
134
+ ignore_patterns=["__pycache__/*", "*.pyc", "*.zip"],
135
+ )
136
+ uploaded.append(f"{folder_name}[{len(files)} files]")
137
+
138
+ return f"{repo_type}:{repo_id}/{remote_prefix}/ (uploaded: {', '.join(uploaded) or 'nothing'})"
139
+ except Exception as exc:
140
+ return f"upload_failed: {exc}"
141
+
142
+
143
+ def _load_json(path: Path) -> Optional[dict]:
144
+ if not path.exists():
145
+ return None
146
+ try:
147
+ return json.loads(path.read_text(encoding="utf-8"))
148
+ except Exception:
149
+ return None
150
+
151
+
152
+ def _status_from_line(line: str, current: str) -> str:
153
+ text = (line or "").strip()
154
+ text = _display_log_line(text)
155
+ if text.startswith("Pipeline stopped:"):
156
+ return "Stopped"
157
+ if text.startswith("Step "):
158
+ return text
159
+ if "failed" in text.lower():
160
+ return f"Failed: {text}"
161
+ if "completed successfully" in text.lower():
162
+ return "Completed"
163
+ return current
164
+
165
+
166
+ def _display_log_line(line: str) -> str:
167
+ text = (line or "").strip()
168
+ if text.startswith("Step ") and " failed." in text:
169
+ return text.splitlines()[0]
170
+ if text == "[annotation] starting cluster-first two-pass annotation":
171
+ return "Step 8/8: Annotate target contributions and enabling contributions"
172
+ if text.startswith("[annotation] complete:"):
173
+ return "Step 8 complete"
174
+ if text == "Pipeline completed successfully.":
175
+ return text
176
+ return text
177
+
178
+
179
+ def _format_step_event(line: str) -> str:
180
+ text = _display_log_line(line)
181
+ if not text:
182
+ return ""
183
+ if text.startswith("Step ") and "/" in text and ":" in text:
184
+ return f"🛠️ {text}"
185
+ if text.startswith("Step ") and text.endswith(" complete"):
186
+ return f"✅ {text}"
187
+ if text.lower().startswith("stopped after step"):
188
+ return f"⏹️ {text}"
189
+ if text.startswith("Pipeline stopped:"):
190
+ return f"⏹️ {text}"
191
+ if "failed" in text.lower():
192
+ return f"❌ {text}"
193
+ if "completed successfully" in text.lower():
194
+ return f"✅ {text}"
195
+ return f"• {text}"
196
+
197
+
198
+ def _ensure_state():
199
+ defaults = {
200
+ "paper_input": "",
201
+ "run_status": "Idle",
202
+ "run_logs": [],
203
+ "run_events": [],
204
+ "artifact_path": None,
205
+ "run_dir_path": None,
206
+ "paper_dir_path": None,
207
+ "annotation_payload_path": None,
208
+ "run_summary": None,
209
+ "annotation_skipped_reason": None,
210
+ "pipeline_failed_reason": None,
211
+ "remote_artifact_ref": "",
212
+ }
213
+ for key, value in defaults.items():
214
+ st.session_state.setdefault(key, value)
215
+
216
+
217
+ def _metric_card(label: str, value: Any):
218
+ st.markdown(
219
+ f"<div class='metric-card'><div class='metric-label'>{label}</div><div class='metric-value'>{value}</div></div>",
220
+ unsafe_allow_html=True,
221
+ )
222
+
223
+
224
+ def _esc(value: Any) -> str:
225
+ return html.escape("" if value is None else str(value))
226
+
227
+
228
+ def _safe_int(value: Any, default: int = 0) -> int:
229
+ try:
230
+ return int(value)
231
+ except (TypeError, ValueError):
232
+ return default
233
+
234
+
235
+ def _grounding_html(grounding: Optional[dict], label: str, kind: str) -> str:
236
+ if not grounding:
237
+ return ""
238
+ title = (
239
+ grounding.get("ref_title")
240
+ or grounding.get("title")
241
+ or grounding.get("paper_id")
242
+ or grounding.get("ref_id")
243
+ or "__NONE__"
244
+ )
245
+ meta = []
246
+ if grounding.get("paper_id"):
247
+ meta.append(f"paper_id: {grounding.get('paper_id')}")
248
+ elif grounding.get("ref_id"):
249
+ meta.append(f"ref_id: {grounding.get('ref_id')}")
250
+ if grounding.get("ref_year"):
251
+ meta.append(str(grounding.get("ref_year")))
252
+ authors = grounding.get("ref_authors")
253
+ if isinstance(authors, list) and authors:
254
+ meta.append(", ".join(str(author) for author in authors[:3]))
255
+ meta_html = f"<div class='grounding-meta'>{_esc(' · '.join(meta))}</div>" if meta else ""
256
+ extra_class = " additional" if kind == "additional" else ""
257
+ return (
258
+ f"<div class='grounding-card{extra_class}'>"
259
+ f"<div class='grounding-label {kind}'>{_esc(label)}</div>"
260
+ f"<div class='grounding-title'>{_esc(title)}</div>"
261
+ f"{meta_html}"
262
+ "</div>"
263
+ )
264
+
265
+
266
+ def _study_key(item: dict) -> str:
267
+ for key in ["paper_id", "ref_id", "ref_title", "title"]:
268
+ value = item.get(key)
269
+ if value:
270
+ return str(value).lower()
271
+ return ""
272
+
273
+
274
+ def _collect_grounded_studies(discoveries: list[dict], ingredients: list[dict]) -> list[dict]:
275
+ studies: list[dict] = []
276
+ seen: set[str] = set()
277
+ for item in discoveries:
278
+ if not isinstance(item, dict):
279
+ continue
280
+ copied = dict(item)
281
+ copied["_grounding_kind"] = "primary"
282
+ copied["_grounding_label"] = "Primary study"
283
+ key = _study_key(copied)
284
+ if key:
285
+ seen.add(key)
286
+ studies.append(copied)
287
+
288
+ for idx, ingredient in enumerate(ingredients, start=1):
289
+ if not isinstance(ingredient, dict):
290
+ continue
291
+ canonical = ingredient.get("canonical_grounding") or {}
292
+ canonical_key = _study_key(canonical) if isinstance(canonical, dict) else ""
293
+ annotation = ingredient.get("canonical_annotation") or {}
294
+ for ref in ingredient.get("additional_groundings") or []:
295
+ if not isinstance(ref, dict):
296
+ continue
297
+ key = _study_key(ref)
298
+ if key and (key == canonical_key or key in seen):
299
+ continue
300
+ copied = dict(ref)
301
+ copied["_grounding_kind"] = "additional"
302
+ copied["_grounding_label"] = f"Additional study for enabling contribution {idx}"
303
+ copied.setdefault("role", annotation.get("role") or ", ".join(annotation.get("roles") or []))
304
+ copied.setdefault("contribution", annotation.get("contribution"))
305
+ copied.setdefault("rationale", annotation.get("rationale"))
306
+ if key:
307
+ seen.add(key)
308
+ studies.append(copied)
309
+ return studies
310
+
311
+
312
+ def _render_reference_list(discoveries: list[dict], ingredients: Optional[list[dict]] = None):
313
+ studies = _collect_grounded_studies(discoveries, ingredients or [])
314
+ if not studies:
315
+ st.markdown("<div class='empty-card'>No grounded studies listed for this target contribution.</div>", unsafe_allow_html=True)
316
+ return
317
+ for item in studies:
318
+ title = item.get("ref_title") or item.get("title") or item.get("ref_id") or item.get("paper_id") or "Untitled reference"
319
+ is_additional = item.get("_grounding_kind") == "additional"
320
+ meta = []
321
+ if item.get("_grounding_label"):
322
+ meta.append(str(item.get("_grounding_label")))
323
+ if item.get("role"):
324
+ meta.append(str(item.get("role")))
325
+ if item.get("ref_year"):
326
+ meta.append(str(item.get("ref_year")))
327
+ class_name = "cluster-card additional-study" if is_additional else "cluster-card"
328
+ body = [f"<div class='{class_name}'><div class='cluster-title'>{_esc(title)}</div>"]
329
+ if meta:
330
+ body.append(f"<div class='cluster-meta'>{_esc(' · '.join(meta))}</div>")
331
+ if item.get("contribution"):
332
+ body.append(f"<div class='field'><b>Contribution.</b> {_esc(item.get('contribution'))}</div>")
333
+ if item.get("rationale"):
334
+ body.append(f"<div class='field'><b>Rationale.</b> {_esc(item.get('rationale'))}</div>")
335
+ body.append("</div>")
336
+ st.markdown("".join(body), unsafe_allow_html=True)
337
+
338
+
339
+ def _render_claims_tab(payload: Optional[dict]):
340
+ if not payload:
341
+ st.markdown("<div class='empty-card'>No annotation payload is available yet.</div>", unsafe_allow_html=True)
342
+ return
343
+ claims = payload.get("claims") or []
344
+ if not claims:
345
+ st.markdown("<div class='empty-card'>The run completed, but no target contributions were produced.</div>", unsafe_allow_html=True)
346
+ return
347
+
348
+ for idx, claim in enumerate(claims, start=1):
349
+ claim_id = claim.get("claim_id") or f"C{idx}"
350
+ claim_text = claim.get("rewritten_claim") or claim.get("text") or "(missing target contribution text)"
351
+ ingredients = claim.get("ingredients") or []
352
+ discoveries = claim.get("enabling_discoveries") or []
353
+ grounded_studies = _collect_grounded_studies(discoveries, ingredients)
354
+ meta_pills = []
355
+ if claim.get("decision"):
356
+ meta_pills.append(str(claim.get("decision")))
357
+ if claim.get("cluster_id"):
358
+ meta_pills.append(f"cluster {claim.get('cluster_id')}")
359
+ meta_pills.append(f"{len(ingredients)} enabling contribution{'s' if len(ingredients) != 1 else ''}")
360
+ meta_pills.append(f"{len(grounded_studies)} grounded stud{'ies' if len(grounded_studies) != 1 else 'y'}")
361
+
362
+ pills_html = "".join(f"<span class='pill'>{_esc(p)}</span>" for p in meta_pills)
363
+ st.markdown(
364
+ f"""
365
+ <div class='claim-card'>
366
+ <div class='claim-head'>
367
+ <div class='claim-kicker'>Target contribution {idx} · {_esc(claim_id)}</div>
368
+ <div class='claim-text'>{_esc(claim_text)}</div>
369
+ <div class='pill-row'>{pills_html}</div>
370
+ </div>
371
+ </div>
372
+ """,
373
+ unsafe_allow_html=True,
374
+ )
375
+ left, right = st.columns([1.7, 1.0], gap="large")
376
+ with left:
377
+ st.markdown("<div class='section-label'>Decomposition</div>", unsafe_allow_html=True)
378
+ if not ingredients:
379
+ st.markdown("<div class='empty-card'>No enabling contributions for this target contribution.</div>", unsafe_allow_html=True)
380
+ for ingredient_idx, ingredient in enumerate(ingredients, start=1):
381
+ annotation = ingredient.get("canonical_annotation") or {}
382
+ role = annotation.get("role") or ", ".join(annotation.get("roles") or []) or "UNSPECIFIED"
383
+ canonical_grounding = ingredient.get("canonical_grounding") or {}
384
+ extras = ingredient.get("additional_groundings") or []
385
+ grounding_parts = []
386
+ if canonical_grounding:
387
+ grounding_parts.append(
388
+ _grounding_html(canonical_grounding, "Primary grounding", "primary")
389
+ )
390
+ for ref in extras:
391
+ if not isinstance(ref, dict):
392
+ continue
393
+ if canonical_grounding and (
394
+ ref.get("paper_id") == canonical_grounding.get("paper_id")
395
+ or ref.get("ref_id") == canonical_grounding.get("ref_id")
396
+ ):
397
+ continue
398
+ grounding_parts.append(
399
+ _grounding_html(ref, "Additional grounding", "additional")
400
+ )
401
+ if not grounding_parts:
402
+ canonical_ref_id = ingredient.get("canonical_ref_id") or "__NONE__"
403
+ grounding_parts.append(
404
+ "<div class='grounding-card'>"
405
+ "<div class='grounding-label primary'>Grounding</div>"
406
+ f"<div class='grounding-title'>{_esc(canonical_ref_id)}</div>"
407
+ "</div>"
408
+ )
409
+ grounding_block = (
410
+ "<div class='grounding-block'>"
411
+ f"<div class='section-label'>Groundings for enabling contribution {ingredient_idx}</div>"
412
+ + "".join(grounding_parts)
413
+ + "</div>"
414
+ )
415
+ st.markdown(
416
+ f"""
417
+ <div class='ingredient-card'>
418
+ <div class='ingredient-top'>
419
+ <div class='ingredient-name'>{ingredient_idx}. {_esc(ingredient.get('ingredient') or '(missing enabling contribution)')}</div>
420
+ <div class='role-pill'>{_esc(role)}</div>
421
+ </div>
422
+ <div class='field'><b>Contribution.</b> {_esc(annotation.get('contribution') or '')}</div>
423
+ <div class='field'><b>Rationale.</b> {_esc(annotation.get('rationale') or '')}</div>
424
+ <div class='field'><b>Evidence.</b> {_esc(annotation.get('evidence_span') or '')}</div>
425
+ {grounding_block}
426
+ </div>
427
+ """,
428
+ unsafe_allow_html=True,
429
+ )
430
+ with right:
431
+ st.markdown("<div class='section-label'>Grounded and additional studies</div>", unsafe_allow_html=True)
432
+ _render_reference_list(discoveries, ingredients)
433
+
434
+
435
+ def _render_clusters_tab(discovery: Optional[dict], contributions: list[dict]):
436
+ if not discovery:
437
+ st.markdown("<div class='empty-card'>No refined cluster file is available yet.</div>", unsafe_allow_html=True)
438
+ return
439
+ clusters = discovery.get("clusters") or []
440
+ dropped = discovery.get("dropped_clusters") or []
441
+ if not clusters:
442
+ st.markdown("<div class='empty-card'>No valid downstream usage clusters survived refinement and filtering.</div>", unsafe_allow_html=True)
443
+ if dropped:
444
+ with st.expander(f"Dropped clusters ({len(dropped)})", expanded=False):
445
+ st.json(dropped)
446
+ return
447
+
448
+ for cluster in clusters:
449
+ cluster_id = cluster.get("cluster_id", "")
450
+ rep = cluster.get("representative_claim") or cluster.get("cluster_title") or "(missing representative claim)"
451
+ count = _safe_int(cluster.get("count"), len(cluster.get("claim_indices") or []))
452
+ source_ids = cluster.get("source_cluster_ids") or []
453
+ merge_rationale = cluster.get("merge_rationale") or ""
454
+ st.markdown(
455
+ f"""
456
+ <div class='cluster-card'>
457
+ <div class='cluster-title'>{_esc(rep)}</div>
458
+ <div class='cluster-meta'>Cluster {_esc(cluster_id)} · {count} contribution instance{'s' if count != 1 else ''}</div>
459
+ </div>
460
+ """,
461
+ unsafe_allow_html=True,
462
+ )
463
+ meta_cols = st.columns([1.3, 1.3, 1.4])
464
+ with meta_cols[0]:
465
+ st.caption("Cluster ID")
466
+ st.code(str(cluster_id), language="text")
467
+ with meta_cols[1]:
468
+ st.caption("Source clusters")
469
+ st.code(", ".join(str(x) for x in source_ids) if source_ids else "singleton", language="text")
470
+ with meta_cols[2]:
471
+ st.caption("Merge rationale")
472
+ st.write(merge_rationale or "—")
473
+
474
+ claim_indices = cluster.get("claim_indices") or []
475
+ if claim_indices:
476
+ with st.expander(f"Linked contribution instances ({len(claim_indices)})", expanded=False):
477
+ for idx in claim_indices:
478
+ try:
479
+ j = int(idx)
480
+ except Exception:
481
+ continue
482
+ if 0 <= j < len(contributions):
483
+ item = contributions[j] or {}
484
+ title = item.get("citing_title") or item.get("citing_paper_id") or "Unknown citing paper"
485
+ claim = item.get("paper_claim") or item.get("claim") or "(missing claim)"
486
+ rationale = item.get("rationale") or ""
487
+ evidence = item.get("evidence_span") or ""
488
+ st.markdown(f"**{title}**")
489
+ st.write(claim)
490
+ if rationale:
491
+ st.caption(f"Rationale: {rationale}")
492
+ if evidence:
493
+ st.caption(f"Evidence: {evidence}")
494
+ st.divider()
495
+
496
+ if dropped:
497
+ with st.expander(f"Dropped clusters ({len(dropped)})", expanded=False):
498
+ st.json(dropped)
499
+
500
+
501
+ def run_two_pass_annotation(
502
+ paper_dir: Path,
503
+ annotation_output_root: Path,
504
+ llm_provider: str,
505
+ llm_model: str,
506
+ formatter_model: str,
507
+ judge_model: str,
508
+ candidate_count: int,
509
+ ):
510
+ paper = load_paper_package(paper_dir)
511
+ pipeline = TwoPassAnnotationPipeline(
512
+ provider=llm_provider,
513
+ model=llm_model,
514
+ formatter_model=formatter_model or None,
515
+ judge_model=judge_model or None,
516
+ output_root=annotation_output_root,
517
+ annotator_id="streamlit_hf_space",
518
+ candidate_count=max(1, int(candidate_count)),
519
+ formatter_max_attempts=3,
520
+ include_reference_examples=True,
521
+ prompt_profile="full",
522
+ )
523
+ result = pipeline.run(paper)
524
+ return result.result, result.run_dir
525
+
526
+
527
+ def run_pipeline_stream(
528
+ paper_input: str,
529
+ source_root: str,
530
+ output_root: str,
531
+ llm_provider: str,
532
+ llm_model: str,
533
+ llm_model_step4: str,
534
+ formatter_model: str,
535
+ judge_model: str,
536
+ candidate_count: int,
537
+ ):
538
+ gemini_key = get_secret("GEMINI_API_KEY")
539
+ if gemini_key:
540
+ os.environ["GEMINI_API_KEY"] = gemini_key
541
+
542
+ cfg = PipelineConfig(
543
+ repo_root=REPO_ROOT,
544
+ source_root=Path(source_root).expanduser().resolve(),
545
+ paper_input=paper_input.strip(),
546
+ llm_provider=llm_provider.strip() or "gemini",
547
+ llm_model=llm_model.strip() or "gemini-3.1-pro-preview",
548
+ llm_model_step4=llm_model_step4.strip() or "gemini-3-flash-preview",
549
+ model_path="Deep-Citation/Workspace/acl_scicite_wksp_trl/best_model.pt",
550
+ model_data_dir="Deep-Citation/Data",
551
+ model_class_def="Deep-Citation/Data/class_def.json",
552
+ model_lm="scibert",
553
+ device="cpu",
554
+ embedding_model="sentence-transformers/all-mpnet-base-v2",
555
+ )
556
+
557
+ status_placeholder = st.empty()
558
+ activity_placeholder = st.empty()
559
+ status = "Starting"
560
+ logs: list[str] = []
561
+ events: list[str] = []
562
+ seen_events: set[str] = set()
563
+ artifact_path = None
564
+ annotation_payload_path = None
565
+ annotation_skipped_reason = None
566
+ run_summary = None
567
+ pipeline_stopped_reason = None
568
+ pipeline_failed_reason = None
569
+
570
+ def render_activity(items: list[str]):
571
+ if not items:
572
+ activity_placeholder.info("Waiting for first step...")
573
+ return
574
+ activity_placeholder.markdown("### Activity\n" + "\n".join(f"- {item}" for item in items[-20:]))
575
+
576
+ def append_display_line(line: str):
577
+ display_line = _display_log_line(line)
578
+ if not display_line:
579
+ return
580
+ logs.append(display_line)
581
+ event = _format_step_event(display_line)
582
+ if event and event not in seen_events:
583
+ seen_events.add(event)
584
+ events.append(event)
585
+ render_activity(events)
586
+
587
+ for line, maybe_artifact in runner_module.run_pipeline(cfg, Path(output_root).expanduser().resolve()):
588
+ if line:
589
+ if line.strip() == "Pipeline completed successfully.":
590
+ if maybe_artifact:
591
+ artifact_path = maybe_artifact
592
+ continue
593
+ display_line = _display_log_line(line)
594
+ if display_line:
595
+ logs.append(display_line)
596
+ status = _status_from_line(display_line, status)
597
+ if display_line.startswith("Pipeline stopped:"):
598
+ pipeline_stopped_reason = display_line
599
+ if "failed" in display_line.lower():
600
+ pipeline_failed_reason = display_line
601
+ event = _format_step_event(display_line)
602
+ if event and event not in seen_events:
603
+ seen_events.add(event)
604
+ events.append(event)
605
+ if maybe_artifact:
606
+ artifact_path = maybe_artifact
607
+ status_placeholder.info(f"Current status: {status}")
608
+ render_activity(events)
609
+
610
+ run_dir_path = None
611
+ paper_dir_path = None
612
+ remote_artifact_ref = ""
613
+ if artifact_path:
614
+ job_dir = Path(str(artifact_path)).with_suffix("")
615
+ run_dir_path = str(job_dir)
616
+ paper_id = runner_module.parse_arxiv_id(paper_input.strip())
617
+ paper_dir = job_dir / "processed_papers" / paper_id
618
+ paper_dir_path = str(paper_dir)
619
+ if pipeline_failed_reason:
620
+ annotation_skipped_reason = f"{pipeline_failed_reason} Annotation was not run."
621
+ elif pipeline_stopped_reason:
622
+ annotation_skipped_reason = f"{pipeline_stopped_reason} Annotation was not run."
623
+ else:
624
+ discovery = _load_json(paper_dir / "usage_discovery_from_contributions.json") or {}
625
+ refined_clusters = discovery.get("clusters") or []
626
+ if not refined_clusters:
627
+ annotation_skipped_reason = "No valid downstream usage clusters remained after refinement and filtering. Annotation was skipped."
628
+ logs.append("[annotation] skipped: no refined downstream usage clusters")
629
+ else:
630
+ append_display_line("[annotation] starting cluster-first two-pass annotation")
631
+ status_placeholder.info("Current status: Running annotation")
632
+ try:
633
+ run_output, annotation_run_dir = run_two_pass_annotation(
634
+ paper_dir=paper_dir,
635
+ annotation_output_root=job_dir / "two_pass_outputs",
636
+ llm_provider=llm_provider,
637
+ llm_model=llm_model,
638
+ formatter_model=formatter_model,
639
+ judge_model=judge_model,
640
+ candidate_count=candidate_count,
641
+ )
642
+ payload_path = run_output.get("ui_payload_path") if isinstance(run_output, dict) else None
643
+ if payload_path and Path(payload_path).exists():
644
+ annotation_payload_path = str(Path(payload_path))
645
+ append_display_line(f"[annotation] complete: {annotation_run_dir}")
646
+ except Exception as exc:
647
+ pipeline_failed_reason = f"Annotation failed: {exc}"
648
+ annotation_skipped_reason = pipeline_failed_reason
649
+ logs.append(f"[annotation] failed: {exc}")
650
+ logs.append("[upload] uploading run artifact to Hugging Face dataset")
651
+ status_placeholder.info("Current status: Finalizing run")
652
+ remote_artifact_ref = upload_run_artifact(job_dir)
653
+ if remote_artifact_ref:
654
+ logs.append(f"[upload] {remote_artifact_ref}")
655
+ else:
656
+ logs.append("[upload] skipped: RUNS_REPO_ID/HF_WRITE_TOKEN not configured")
657
+ if not pipeline_stopped_reason and not pipeline_failed_reason:
658
+ append_display_line("Pipeline completed successfully.")
659
+
660
+ if pipeline_failed_reason:
661
+ status = "Failed"
662
+ elif artifact_path and pipeline_stopped_reason:
663
+ status = "Stopped"
664
+ else:
665
+ status = "Completed" if artifact_path else "Failed"
666
+ if status == "Completed":
667
+ status_placeholder.success(f"Final status: {status}")
668
+ elif status == "Stopped":
669
+ status_placeholder.warning(f"Final status: {status}")
670
+ else:
671
+ status_placeholder.error("Final status: Failed")
672
+
673
+ st.session_state["run_status"] = status
674
+ st.session_state["run_logs"] = logs
675
+ st.session_state["run_events"] = events
676
+ st.session_state["artifact_path"] = artifact_path
677
+ st.session_state["run_dir_path"] = run_dir_path
678
+ st.session_state["paper_dir_path"] = paper_dir_path
679
+ st.session_state["annotation_payload_path"] = annotation_payload_path
680
+ st.session_state["annotation_skipped_reason"] = annotation_skipped_reason
681
+ st.session_state["pipeline_stopped_reason"] = pipeline_stopped_reason
682
+ st.session_state["pipeline_failed_reason"] = pipeline_failed_reason
683
+ st.session_state["run_summary"] = run_summary
684
+ st.session_state["remote_artifact_ref"] = remote_artifact_ref
685
+
686
+
687
+ def _load_result_bundle():
688
+ paper_dir_path = st.session_state.get("paper_dir_path")
689
+ annotation_payload_path = st.session_state.get("annotation_payload_path")
690
+ paper_dir = Path(paper_dir_path) if paper_dir_path else None
691
+ payload = _load_json(Path(annotation_payload_path)) if annotation_payload_path else None
692
+ discovery = _load_json(paper_dir / "usage_discovery_from_contributions.json") if paper_dir and paper_dir.exists() else None
693
+ contributions_data = _load_json(paper_dir / "usage_contributions.json") if paper_dir and paper_dir.exists() else None
694
+ contributions = (contributions_data or {}).get("contributions") or []
695
+ return paper_dir, discovery, contributions, payload
696
+
697
+
698
+ def _render_overview(payload: Optional[dict], discovery: Optional[dict]):
699
+ claims = (payload or {}).get("claims") or []
700
+ ingredients = sum(len(claim.get("ingredients") or []) for claim in claims)
701
+ studies = sum(
702
+ len(_collect_grounded_studies(claim.get("enabling_discoveries") or [], claim.get("ingredients") or []))
703
+ for claim in claims
704
+ )
705
+ clusters = len((discovery or {}).get("clusters") or [])
706
+
707
+ c1, c2, c3, c4 = st.columns(4)
708
+ with c1:
709
+ _metric_card("Refined clusters", clusters)
710
+ with c2:
711
+ _metric_card("Target contributions", len(claims))
712
+ with c3:
713
+ _metric_card("Enabling contributions", ingredients)
714
+ with c4:
715
+ _metric_card("Grounded studies", studies)
716
+
717
+
718
+ def _build_public_export(discovery: Optional[dict], payload: Optional[dict]) -> dict:
719
+ claims = []
720
+ for claim in (payload or {}).get("claims") or []:
721
+ if not isinstance(claim, dict):
722
+ continue
723
+ ingredients = []
724
+ for ingredient in claim.get("ingredients") or []:
725
+ if not isinstance(ingredient, dict):
726
+ continue
727
+ ingredients.append({
728
+ "ingredient_id": ingredient.get("ingredient_id"),
729
+ "enabling_contribution": ingredient.get("ingredient"),
730
+ "canonical_annotation": ingredient.get("canonical_annotation") or {},
731
+ "primary_grounding": ingredient.get("canonical_grounding") or {},
732
+ "additional_groundings": ingredient.get("additional_groundings") or [],
733
+ })
734
+ claims.append({
735
+ "claim_id": claim.get("claim_id"),
736
+ "target_contribution": claim.get("rewritten_claim") or claim.get("text"),
737
+ "cluster_id": claim.get("cluster_id"),
738
+ "decision": claim.get("decision"),
739
+ "enabling_contributions": ingredients,
740
+ "grounded_studies": _collect_grounded_studies(claim.get("enabling_discoveries") or [], claim.get("ingredients") or []),
741
+ })
742
+
743
+ return {
744
+ "citation_clusters": (discovery or {}).get("clusters") or [],
745
+ "target_contribution_decompositions": claims,
746
+ }
747
+
748
+
749
+ def main():
750
+ llm_provider = os.getenv("LLM_PROVIDER", "gemini")
751
+ llm_model = os.getenv("LLM_MODEL", "gemini-3.1-pro-preview")
752
+ llm_model_step4 = os.getenv("LLM_MODEL_STEP4", "gemini-3-flash-preview")
753
+ formatter_model = os.getenv("ANNOTATION_FORMATTER_MODEL", "gemini/gemini-3.1-pro-preview")
754
+ judge_model = os.getenv("ANNOTATION_JUDGE_MODEL", "gemini/gemini-3.1-pro-preview")
755
+ candidate_count = int(os.getenv("ANNOTATION_CANDIDATE_COUNT", "3"))
756
+ source_root = DEFAULT_SOURCE_ROOT
757
+ output_root = DEFAULT_OUTPUT_ROOT
758
+
759
+ st.set_page_config(page_title="Forecasting Scientific Contribution Pathways", page_icon="📚", layout="wide")
760
+ st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
761
+ _ensure_state()
762
+
763
+ with st.sidebar:
764
+ st.markdown("## SciPaths")
765
+ st.caption("Enter an arXiv paper and run the target-contribution pathway annotation pipeline.")
766
+ st.divider()
767
+ st.markdown("### Citation")
768
+ st.caption("If you find this useful, please cite our paper as:")
769
+ st.code(
770
+ "@misc{chamoun2026scipathsforecastingpathwaysscientific,\n"
771
+ " title={SciPaths: Forecasting Pathways to Scientific Discovery}, \n"
772
+ " author={Eric Chamoun and Yizhou Chi and Yulong Chen and Rui Cao and Zifeng Ding and Michalis Korakakis and Andreas Vlachos},\n"
773
+ " year={2026},\n"
774
+ " eprint={2605.14600},\n"
775
+ " archivePrefix={arXiv},\n"
776
+ " primaryClass={cs.CL},\n"
777
+ " url={https://arxiv.org/abs/2605.14600}, \n"
778
+ "}",
779
+ language="bibtex",
780
+ )
781
+ st.caption("Paper URL: https://arxiv.org/abs/2605.14600")
782
+ st.caption("Questions or feedback: ec806@cam.ac.uk")
783
+ st.divider()
784
+ if st.button("Clear chat / restart", use_container_width=True):
785
+ for key in [
786
+ "paper_input", "run_status", "run_logs", "run_events", "artifact_path",
787
+ "run_dir_path", "paper_dir_path", "annotation_payload_path",
788
+ "run_summary", "annotation_skipped_reason", "pipeline_stopped_reason",
789
+ "pipeline_failed_reason", "remote_artifact_ref",
790
+ ]:
791
+ if key in st.session_state:
792
+ del st.session_state[key]
793
+ st.rerun()
794
+ if not get_secret("GEMINI_API_KEY"):
795
+ st.warning("No GEMINI_API_KEY found in environment or secrets.", icon="🔑")
796
+
797
+ st.markdown("<div class='hero-title'>Forecasting Scientific Contribution Pathways</div>", unsafe_allow_html=True)
798
+ st.markdown(
799
+ "<div class='hero-sub'>Run the SciPaths pipeline through refined downstream citation clusters, then derive target contributions from those clusters and decompose each target contribution into enabling contributions and grounded studies.</div>",
800
+ unsafe_allow_html=True,
801
+ )
802
+
803
+ tabs = st.tabs(TAB_NAMES)
804
+
805
+ with tabs[0]:
806
+ with st.expander("Try an example", expanded=True):
807
+ cols = st.columns(len(EXAMPLES))
808
+ for i, (label, value) in enumerate(EXAMPLES.items()):
809
+ with cols[i]:
810
+ if st.button(label, key=f"example::{label}", use_container_width=True):
811
+ st.session_state["paper_input"] = value
812
+ st.rerun()
813
+
814
+ paper_input = st.text_input(
815
+ "Paper input (arXiv URL or ID)",
816
+ key="paper_input",
817
+ placeholder="https://arxiv.org/abs/2311.14919",
818
+ )
819
+
820
+ if st.button("Run pipeline + annotation", type="primary", use_container_width=True):
821
+ if not paper_input.strip():
822
+ st.error("Paper input is required.")
823
+ else:
824
+ run_pipeline_stream(
825
+ paper_input=paper_input,
826
+ source_root=source_root,
827
+ output_root=output_root,
828
+ llm_provider=llm_provider,
829
+ llm_model=llm_model,
830
+ llm_model_step4=llm_model_step4,
831
+ formatter_model=formatter_model,
832
+ judge_model=judge_model,
833
+ candidate_count=candidate_count,
834
+ )
835
+
836
+ st.markdown("### Latest run")
837
+ st.info(f"Status: {st.session_state.get('run_status', 'Idle')}")
838
+ if st.session_state.get("pipeline_failed_reason"):
839
+ st.error(st.session_state["pipeline_failed_reason"])
840
+ if st.session_state.get("annotation_skipped_reason"):
841
+ st.warning(st.session_state["annotation_skipped_reason"])
842
+
843
+ paper_dir, discovery, contributions, payload = _load_result_bundle()
844
+ public_export = _build_public_export(discovery, payload)
845
+ if public_export["citation_clusters"] or public_export["target_contribution_decompositions"]:
846
+ st.download_button(
847
+ "Download citation clusters and contribution groundings",
848
+ data=json.dumps(public_export, indent=2, ensure_ascii=False),
849
+ file_name="scipaths_run_results.json",
850
+ mime="application/json",
851
+ use_container_width=False,
852
+ )
853
+ _render_overview(payload, discovery)
854
+
855
+ with tabs[1]:
856
+ paper_dir, discovery, contributions, payload = _load_result_bundle()
857
+ _render_clusters_tab(discovery, contributions)
858
+
859
+ with tabs[2]:
860
+ paper_dir, discovery, contributions, payload = _load_result_bundle()
861
+ _render_claims_tab(payload)
862
+
863
+ if __name__ == "__main__":
864
+ main()
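The "Download citation clusters and contribution groundings" button above serializes `_build_public_export(discovery, payload)`, which nests each target contribution with its enabling contributions and a deduplicated list of grounded studies. A minimal sketch of that deduplication on toy data, using a simplified stand-in for `_study_key` (the field names mirror the code above; the records themselves are invented):

```python
# Sketch: how primary and additional groundings collapse into one study list.
# Toy data only; real records come from the annotation payload.

def study_key(item: dict) -> str:
    # simplified mirror of _study_key: first non-empty identifier, lowercased
    for key in ("paper_id", "ref_id", "ref_title", "title"):
        if item.get(key):
            return str(item[key]).lower()
    return ""

discoveries = [{"paper_id": "P1", "ref_title": "Primary study A"}]
ingredients = [{
    "canonical_grounding": {"paper_id": "P1"},  # duplicate of the primary study, so it is skipped
    "additional_groundings": [{"ref_id": "R2", "ref_title": "Additional study B"}],
}]

seen = {study_key(d) for d in discoveries}
extra = [
    ref
    for ing in ingredients
    for ref in ing["additional_groundings"]
    if study_key(ref) not in seen and study_key(ref) != study_key(ing["canonical_grounding"])
]
print([d["ref_title"] for d in discoveries] + [e["ref_title"] for e in extra])
# ['Primary study A', 'Additional study B']
```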
hf_space/streamlit_config.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from runner import STEP_LABELS
2
+
3
+ EXAMPLES = {
4
+ "Confidence-based MBR Decoding": "https://arxiv.org/abs/2311.14919",
5
+ "AVerImaTeC": "https://arxiv.org/abs/2505.17978",
6
+ "CSCD-NS (2022)": "https://arxiv.org/abs/2211.08788",
7
+ }
8
+
9
+ TAB_NAMES = [
10
+ "Pipeline Run",
11
+ "Citation Clusters",
12
+ "Target Contribution Decomposition",
13
+ ]
14
+
15
+ METHOD_NOTES = {
16
+ "Pipeline scope": "Runs steps 0, 1, 2, 3, 4, 5, 6, and 8, then launches cluster-first two-pass annotation.",
17
+ "Input": "Accepts a single arXiv URL or arXiv ID.",
18
+ "Cluster-first annotation": "Uses all refined downstream USES/EXTENDS clusters to derive target contributions, then decomposes each target contribution separately.",
19
+ "Stopping rule": "If no valid downstream usage clusters remain after refinement and filtering, annotation is skipped.",
20
+ }
21
+
22
+ DISPLAY_STEPS = [0, 1, 2, 3, 4, 5, 6, 8]
23
+
24
+
25
+ def pipeline_steps_markdown() -> str:
26
+ lines = []
27
+ for idx in DISPLAY_STEPS:
28
+ lines.append(f"{idx}. {STEP_LABELS[idx]}")
29
+ lines.append("9. Cluster-first target contribution annotation and enabling contribution decomposition")
30
+ return "\n".join(lines)
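As a rough sketch of the string this helper produces, assuming `STEP_LABELS` in `hf_space/runner.py` maps each index in `DISPLAY_STEPS` to a short label (the labels below are placeholders, not the real ones):

```python
# Hypothetical STEP_LABELS stand-in; the real labels live in hf_space/runner.py.
STEP_LABELS = {
    0: "Fetch metadata", 1: "Download LaTeX source", 2: "Mark citations",
    3: "Build usage contexts", 4: "Label citation functions", 5: "Verify USES/EXTENDS",
    6: "Extract citing paragraphs", 8: "Extract and refine contribution clusters",
}
DISPLAY_STEPS = [0, 1, 2, 3, 4, 5, 6, 8]

lines = [f"{idx}. {STEP_LABELS[idx]}" for idx in DISPLAY_STEPS]
lines.append("9. Cluster-first target contribution annotation and enabling contribution decomposition")
print("\n".join(lines))
```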
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ -r hf_space/requirements.txt
src/common/__init__.py ADDED
File without changes
src/common/llm_client.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional
3
+
4
+ try:
+     import google.generativeai as genai
+     from google.generativeai.types import GenerationConfig
+ except ImportError:  # keep the names defined so the None checks below stay meaningful
+     genai = None
+     GenerationConfig = None
6
+
7
+
8
+ class LLMClient:
9
+ def __init__(self):
10
+ self.provider = os.getenv("LLM_PROVIDER", "gemini").lower()
11
+ self.model_name = os.getenv("LLM_MODEL", "gemini-3.1-pro-preview")
12
+
13
+ if self.provider == "gemini":
14
+ if genai is None:
15
+ raise ImportError("google-generativeai not installed.")
16
+ key = os.getenv("GEMINI_API_KEY")
17
+ if not key:
18
+ raise ValueError("GEMINI_API_KEY not set.")
19
+ genai.configure(api_key=key)
20
+ self.model = genai.GenerativeModel(self.model_name)
21
+ else:
22
+ raise NotImplementedError("Only Gemini provider is wired for now.")
23
+
24
+ def call(self, prompt: str, schema: Optional[dict] = None) -> str:
25
+ """
26
+ Call the underlying LLM.
27
+
28
+ If `schema` is provided (as a plain JSON schema dict), and provider is Gemini,
29
+ use it as response_schema with JSON mime type.
30
+ """
31
+ if self.provider == "gemini":
32
+ if schema and GenerationConfig is not None:
33
+ config = GenerationConfig(
34
+ response_schema=schema,
35
+ response_mime_type="application/json",
36
+ )
37
+ response = self.model.generate_content(
38
+ prompt,
39
+ generation_config=config,
40
+ )
41
+ else:
42
+ response = self.model.generate_content(prompt)
43
+
44
+ text = getattr(response, "text", "")
45
+ if not text:
46
+ raise RuntimeError("LLM response did not contain text.")
47
+ return text
48
+
49
+ raise NotImplementedError("Schema-based calls only wired for Gemini right now.")
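A minimal usage sketch of this client (assuming `GEMINI_API_KEY` is set and `src/common` is on the import path; the schema below is illustrative, not one used by the pipeline):

```python
import json

from llm_client import LLMClient  # assuming src/common is importable

client = LLMClient()  # reads LLM_PROVIDER / LLM_MODEL / GEMINI_API_KEY from the environment
schema = {
    "type": "object",
    "properties": {
        "label": {"type": "string"},
        "confidence": {"type": "number"},
    },
    "required": ["label"],
}
raw = client.call(
    "Classify the citation intent of: 'We adopt BERT (Devlin et al., 2019).'",
    schema=schema,
)
result = json.loads(raw)  # JSON mime type is requested when a schema is passed
print(result.get("label"))
```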
src/common/model_client.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ from dataclasses import dataclass
7
+ from typing import Any, Type
8
+
9
+ import litellm
10
+ from litellm import completion
11
+ from pydantic import BaseModel, ValidationError
12
+
13
+
14
+ @dataclass
15
+ class ModelConfig:
16
+ provider: str
17
+ model: str
18
+ temperature: float = 0.2
19
+ max_tokens: int = 12000
20
+
21
+ @property
22
+ def model_name(self) -> str:
23
+ if "/" in self.model:
24
+ return self.model
25
+ if self.provider.lower() == "openai":
26
+ return f"openai/{self.model}"
27
+ if self.provider.lower() == "gemini":
28
+ return f"gemini/{self.model}"
29
+ return self.model
30
+
31
+
32
+ class MultiProviderLLMClient:
33
+ def __init__(self, default_config: ModelConfig, stage_models: dict[str, str] | None = None):
34
+ self.default_config = default_config
35
+ self.stage_models = stage_models or {}
36
+ litellm.drop_params = True
37
+ self._validate_env(default_config.provider)
38
+
39
+ def _validate_env(self, provider: str) -> None:
40
+ provider = provider.lower()
41
+ if provider == "openai" and not os.getenv("OPENAI_API_KEY"):
42
+ raise ValueError("OPENAI_API_KEY is required for provider=openai")
43
+ if provider == "gemini" and not os.getenv("GEMINI_API_KEY"):
44
+ raise ValueError("GEMINI_API_KEY is required for provider=gemini")
45
+
46
+ def config_for_stage(self, stage_name: str) -> ModelConfig:
47
+ model_override = self.stage_models.get(stage_name)
48
+ if not model_override:
49
+ return self.default_config
50
+ provider = self.default_config.provider
51
+ model = model_override
52
+ if "/" in model_override:
53
+ provider, model = model_override.split("/", 1)
54
+ self._validate_env(provider)
55
+ return ModelConfig(
56
+ provider=provider,
57
+ model=model,
58
+ temperature=self.default_config.temperature,
59
+ max_tokens=self.default_config.max_tokens,
60
+ )
61
+
62
+ def generate_structured(
63
+ self,
64
+ *,
65
+ stage_name: str,
66
+ system_prompt: str,
67
+ user_prompt: str,
68
+ response_model: Type[BaseModel],
69
+ ) -> BaseModel:
70
+ config = self.config_for_stage(stage_name)
71
+ completion_kwargs = {
72
+ "model": config.model_name,
73
+ "messages": [
74
+ {"role": "system", "content": system_prompt},
75
+ {"role": "user", "content": user_prompt},
76
+ ],
77
+ "max_tokens": config.max_tokens,
78
+ "response_format": {"type": "json_object"},
79
+ }
80
+ temperature = self._temperature_for_model(config)
81
+ if temperature is not None:
82
+ completion_kwargs["temperature"] = temperature
83
+
84
+ response = completion(
85
+ **completion_kwargs,
86
+ )
87
+ content = response.choices[0].message.content or ""
88
+ payload = self._parse_json(content)
89
+ try:
90
+ return response_model.model_validate(payload)
91
+ except ValidationError as exc:
92
+ if isinstance(payload, list) and len(payload) == 1 and isinstance(payload[0], dict):
93
+ try:
94
+ return response_model.model_validate(payload[0])
95
+ except ValidationError:
96
+ pass
97
+ raise ValueError(
98
+ f"Stage {stage_name} returned invalid JSON for {response_model.__name__}: {exc}\nRaw content:\n{content}"
99
+ ) from exc
100
+
101
+ def generate_text(
102
+ self,
103
+ *,
104
+ stage_name: str,
105
+ system_prompt: str,
106
+ user_prompt: str,
107
+ ) -> str:
108
+ config = self.config_for_stage(stage_name)
109
+ completion_kwargs = {
110
+ "model": config.model_name,
111
+ "messages": [
112
+ {"role": "system", "content": system_prompt},
113
+ {"role": "user", "content": user_prompt},
114
+ ],
115
+ "max_tokens": config.max_tokens,
116
+ }
117
+ temperature = self._temperature_for_model(config)
118
+ if temperature is not None:
119
+ completion_kwargs["temperature"] = temperature
120
+ response = completion(**completion_kwargs)
121
+ return (response.choices[0].message.content or "").strip()
122
+
123
+ @staticmethod
124
+ def _parse_json(text: str) -> Any:
125
+ text = text.strip()
126
+ if text.startswith("```"):
127
+ match = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.S)
128
+ if match:
129
+ text = match.group(1).strip()
130
+ try:
131
+ return json.loads(text)
132
+ except json.JSONDecodeError:
133
+ match = re.search(r"(\{.*\}|\[.*\])", text, flags=re.S)
134
+ if match:
135
+ return json.loads(match.group(1))
136
+ raise
137
+
138
+ @staticmethod
139
+ def _temperature_for_model(config: ModelConfig) -> float | None:
140
+ model_name = config.model_name.lower()
141
+ if "gpt-5" in model_name:
142
+ return None
143
+ return config.temperature
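A usage sketch of the structured-generation path (the stage name and response model are illustrative; `GEMINI_API_KEY` must be set for the default provider, and `src/common` is assumed to be importable):

```python
from pydantic import BaseModel

from model_client import ModelConfig, MultiProviderLLMClient  # assuming src/common is importable


class IntentLabel(BaseModel):
    label: str
    rationale: str


client = MultiProviderLLMClient(
    default_config=ModelConfig(provider="gemini", model="gemini-3-flash-preview"),
    stage_models={"labeling": "gemini/gemini-3.1-pro-preview"},  # per-stage override as "provider/model"
)
result = client.generate_structured(
    stage_name="labeling",
    system_prompt="You label citation intents. Reply with a single JSON object.",
    user_prompt="Sentence: 'We extend the dataset of Smith et al. (2020).'",
    response_model=IntentLabel,
)
print(result.label, "-", result.rationale)
```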
src/common/paper_package.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List
7
+
8
+ from pydantic import BaseModel
9
+
10
+
11
+ SECTION_FILES = [
12
+ "abstract.txt",
13
+ "introduction.tex",
14
+ "related_work.tex",
15
+ "tldr.txt",
16
+ ]
17
+
18
+
19
+ class PaperPackage(BaseModel):
20
+ paper_dir: Path
21
+ paper_metadata: Dict[str, Any]
22
+ extracted_discovery_claim: str
23
+ downstream_cluster_evidence: List[Dict[str, Any]]
24
+ paper_text: Dict[str, str]
25
+ full_processed_text: str
26
+ bibliography: List[Dict[str, Any]]
27
+ citation_contexts: List[Dict[str, Any]]
28
+
29
+ def to_prompt_payload(self) -> Dict[str, Any]:
30
+ return {
31
+ "paper_metadata": self.paper_metadata,
32
+ "extracted_discovery_claim": self.extracted_discovery_claim,
33
+ "downstream_cluster_evidence": self.downstream_cluster_evidence,
34
+ "paper_text": self.paper_text,
35
+ "full_processed_text": self.full_processed_text,
36
+ "bibliography": self.bibliography,
37
+ "citation_contexts": self.citation_contexts,
38
+ }
39
+
40
+
41
+ def _load_json(path: Path, default: Any) -> Any:
42
+ try:
43
+ return json.loads(path.read_text())
44
+ except Exception:
45
+ return default
46
+
47
+
48
+ def _read_text(path: Path) -> str:
49
+ try:
50
+ return path.read_text()
51
+ except Exception:
52
+ return ""
53
+
54
+
55
+ def _normalize_dict_payload(value: Any) -> Dict[str, Any]:
56
+ if isinstance(value, dict):
57
+ return value
58
+ if isinstance(value, list):
59
+ for item in value:
60
+ if isinstance(item, dict):
61
+ return item
62
+ return {}
63
+
64
+
65
+ def _collect_sections(paper_dir: Path) -> Dict[str, str]:
66
+ sections_dir = paper_dir / "sections"
67
+ out: Dict[str, str] = {}
68
+ for name in SECTION_FILES:
69
+ text = _read_text(sections_dir / name).strip()
70
+ if text:
71
+ out[name] = text[:12000]
72
+ if not out:
73
+ processed = _read_text(paper_dir / "processed_main.tex").strip()
74
+ if processed:
75
+ out["processed_main.tex"] = processed[:24000]
76
+ return out
77
+
78
+
79
+ def _collect_full_processed_text(paper_dir: Path) -> str:
80
+ processed = _read_text(paper_dir / "processed_main.tex").strip()
81
+ if processed:
82
+ return processed
83
+
84
+ sections_dir = paper_dir / "sections"
85
+ parts: List[str] = []
86
+ if sections_dir.exists():
87
+ for path in sorted(sections_dir.iterdir()):
88
+ if not path.is_file():
89
+ continue
90
+ text = _read_text(path).strip()
91
+ if text:
92
+ parts.append(f"[{path.name}]\n{text}")
93
+ return "\n\n".join(parts)
94
+
95
+
96
+ def _extract_year(value: Any) -> Any:
97
+ if value:
98
+ return value
99
+ return None
100
+
101
+
102
+ def _normalise_reference_record(ref: Dict[str, Any]) -> Dict[str, Any]:
103
+ cited = ref.get("citedPaper")
104
+ source = cited if isinstance(cited, dict) else ref
105
+ external_ids = source.get("external_ids") or source.get("externalIds") or {}
106
+ return {
107
+ "ref_id": (
108
+ ref.get("ref_id")
109
+ or ref.get("bib_key")
110
+ or source.get("ref_id")
111
+ or source.get("bib_key")
112
+ or source.get("paperId")
113
+ or source.get("paper_id")
114
+ or external_ids.get("ACL")
115
+ or external_ids.get("ArXiv")
116
+ or external_ids.get("DOI")
117
+ ),
118
+ "title": source.get("title") or source.get("ref_title"),
119
+ "authors": source.get("authors") or source.get("ref_authors"),
120
+ "year": _extract_year(source.get("year") or source.get("ref_year")),
121
+ "external_ids": external_ids,
122
+ }
123
+
124
+
125
+ def _parse_bibtex_entries(text: str, limit: int) -> List[Dict[str, Any]]:
126
+ entries: List[Dict[str, Any]] = []
127
+ for match in re.finditer(r"@\w+\s*\{\s*([^,]+),(.*?)(?=\n@\w+\s*\{|\Z)", text, re.S):
128
+ key = match.group(1).strip()
129
+ body = match.group(2)
130
+ fields: Dict[str, str] = {}
131
+ for field in ("title", "author", "year", "doi", "url", "eprint"):
132
+ field_match = re.search(
133
+ rf"\b{field}\s*=\s*(\{{(?:[^{{}}]|\{{[^{{}}]*\}})*\}}|\"[^\"]*\"|[^,\n]+)",
134
+ body,
135
+ re.I | re.S,
136
+ )
137
+ if field_match:
138
+ value = field_match.group(1).strip().strip(",")
139
+ if (value.startswith("{") and value.endswith("}")) or (
140
+ value.startswith('"') and value.endswith('"')
141
+ ):
142
+ value = value[1:-1]
143
+ fields[field] = re.sub(r"\s+", " ", value).strip()
144
+ if fields:
145
+ external_ids: Dict[str, Any] = {}
146
+ if fields.get("doi"):
147
+ external_ids["DOI"] = fields["doi"]
148
+ if fields.get("eprint"):
149
+ external_ids["ArXiv"] = fields["eprint"]
150
+ entries.append(
151
+ {
152
+ "ref_id": key,
153
+ "title": fields.get("title"),
154
+ "authors": fields.get("author"),
155
+ "year": fields.get("year"),
156
+ "external_ids": external_ids,
157
+ }
158
+ )
159
+ if len(entries) >= limit:
160
+ break
161
+ return entries
162
+
163
+
164
+ def _collect_bibtex_citation_contexts(paper_dir: Path, limit: int = 60) -> List[Dict[str, Any]]:
165
+ bibtex = _read_text(paper_dir / "references.bib")
166
+ processed = _read_text(paper_dir / "processed_main.tex")
167
+ if not bibtex or not processed:
168
+ return []
169
+
170
+ refs = _parse_bibtex_entries(bibtex, limit=500)
171
+ out: List[Dict[str, Any]] = []
172
+ seen: set[tuple[str, int]] = set()
173
+ for ref in refs:
174
+ ref_id = ref.get("ref_id")
175
+ if not ref_id:
176
+ continue
177
+ for match in re.finditer(rf"\\cite\w*\s*(?:\[[^\]]*\]\s*)*\{{[^}}]*\b{re.escape(str(ref_id))}\b[^}}]*\}}", processed):
178
+ key = (str(ref_id), match.start())
179
+ if key in seen:
180
+ continue
181
+ seen.add(key)
182
+ start = max(0, match.start() - 350)
183
+ end = min(len(processed), match.end() + 350)
184
+ snippet = re.sub(r"\s+", " ", processed[start:end]).strip()
185
+ out.append(
186
+ {
187
+ "ref_id": ref_id,
188
+ "citation_marker": ref.get("title") or ref_id,
189
+ "text": snippet,
190
+ "section": None,
191
+ "intents": [],
192
+ }
193
+ )
194
+ if len(out) >= limit:
195
+ return out
196
+ return out
197
+
198
+
199
+ def _collect_bibliography(paper_dir: Path, limit: int = 80) -> List[Dict[str, Any]]:
200
+ refs = _load_json(paper_dir / "references_metadata.json", [])
201
+ if isinstance(refs, list) and refs:
202
+ return [_normalise_reference_record(ref) for ref in refs[:limit] if isinstance(ref, dict)]
203
+
204
+ bibtex = _read_text(paper_dir / "references.bib")
205
+ if bibtex:
206
+ return _parse_bibtex_entries(bibtex, limit)
207
+ return []
208
+
209
+
210
+ def _collect_citation_contexts(paper_dir: Path, limit: int = 60) -> List[Dict[str, Any]]:
211
+ refs = _load_json(paper_dir / "references_metadata.json", [])
212
+ out = []
213
+ if isinstance(refs, list):
214
+ for ref in refs:
215
+ if not isinstance(ref, dict):
216
+ continue
217
+ ref_record = _normalise_reference_record(ref)
218
+ for context in ref.get("contextsWithIntent") or []:
219
+ if not isinstance(context, dict):
220
+ continue
221
+ text = context.get("context") or context.get("text") or ""
222
+ if not text:
223
+ continue
224
+ out.append(
225
+ {
226
+ "ref_id": ref_record.get("ref_id"),
227
+ "citation_marker": ref_record.get("title"),
228
+ "text": text,
229
+ "section": context.get("section"),
230
+ "intents": context.get("intents", []),
231
+ }
232
+ )
233
+ if len(out) >= limit:
234
+ return out
235
+ contexts = _load_json(paper_dir / "usage_contexts.json", [])
236
+ if isinstance(contexts, list):
237
+ for item in contexts:
238
+ entry = {
239
+ "ref_id": item.get("ref_id") or item.get("bib_key"),
240
+ "citation_marker": item.get("citation_marker"),
241
+ "text": item.get("text") or item.get("text_raw") or "",
242
+ "section": item.get("section"),
243
+ }
244
+ if entry["text"]:
245
+ out.append(entry)
246
+ if len(out) >= limit:
247
+ break
248
+ if not out:
249
+ out = _collect_bibtex_citation_contexts(paper_dir, limit=limit)
250
+ return out
251
+
252
+
253
+ def _collect_downstream_cluster_evidence(paper_dir: Path) -> List[Dict[str, Any]]:
254
+ discovery = _normalize_dict_payload(_load_json(paper_dir / "usage_discovery_from_contributions.json", {}))
255
+ clusters = discovery.get("clusters", [])
256
+ out = []
257
+ for cluster in clusters:
258
+ out.append(
259
+ {
260
+ "cluster_id": cluster.get("cluster_id"),
261
+ "representative_claim": cluster.get("representative_claim") or cluster.get("cluster_title"),
262
+ "cluster_title": cluster.get("cluster_title"),
263
+ "count": cluster.get("count"),
264
+ "merge_rationale": cluster.get("merge_rationale"),
265
+ }
266
+ )
267
+ return out
268
+
269
+
270
+ def load_paper_package(paper_dir: str | Path, extracted_claim_override: str | None = None) -> PaperPackage:
271
+ paper_dir = Path(paper_dir)
272
+ discovery = _normalize_dict_payload(_load_json(paper_dir / "usage_discovery_from_contributions.json", {}))
273
+ paper_metadata = _normalize_dict_payload(_load_json(paper_dir / "paper_metadata.json", {}))
274
+ claim = extracted_claim_override or (
275
+ discovery.get("most_impactful_contribution_self_contained")
276
+ or discovery.get("most_impactful_contribution")
277
+ or ""
278
+ )
279
+ return PaperPackage(
280
+ paper_dir=paper_dir,
281
+ paper_metadata=paper_metadata,
282
+ extracted_discovery_claim=claim,
283
+ downstream_cluster_evidence=_collect_downstream_cluster_evidence(paper_dir),
284
+ paper_text=_collect_sections(paper_dir),
285
+ full_processed_text=_collect_full_processed_text(paper_dir),
286
+ bibliography=_collect_bibliography(paper_dir),
287
+ citation_contexts=_collect_citation_contexts(paper_dir),
288
+ )
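A short sketch of how a downstream stage might consume this loader (the run directory below is hypothetical; it only needs to contain the JSON/TeX artifacts produced by the earlier pipeline steps):

```python
from pathlib import Path

from paper_package import load_paper_package  # assuming src/common is importable

# Hypothetical run directory; it should hold paper_metadata.json, processed_main.tex, etc.
paper_dir = Path("runs/job_001/processed_papers/2311.14919")
package = load_paper_package(paper_dir)

payload = package.to_prompt_payload()
print(package.extracted_discovery_claim[:120])
print(len(payload["bibliography"]), "references,", len(payload["citation_contexts"]), "citation contexts")
for cluster in package.downstream_cluster_evidence[:3]:
    print(cluster["cluster_id"], "-", cluster["representative_claim"])
```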
src/step_01_fetch/config.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import os
3
+
4
+ ACL_IDS_PATH = Path("input_ids.json")
5
+ PAPERS_DIR = Path("papers")
6
+ SEMANTIC_SCHOLAR_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY", "")
src/step_01_fetch/fetch_metadata.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import argparse
3
+ import json
4
+ import os
5
+ import random
6
+ import re
7
+ import tarfile
8
+ import time
9
+
10
+ import arxiv
11
+ import requests
12
+
13
+ from config import ACL_IDS_PATH
14
+ from process_tex_source import preprocess_tex, extract_introduction_and_related
15
+ from semanticscholar_client import get_paper, get_paper_links, search_by_title
16
+
17
+
18
+ def load_ids(path: Path):
19
+ return json.loads(path.read_text(encoding="utf-8"))
20
+
21
+
22
+ def ensure_dir(path: Path):
23
+ path.mkdir(parents=True, exist_ok=True)
24
+
25
+
26
+ _ARXIV_LAST_TS = 0.0
27
+
28
+
29
+ def _cleanup_partial_source_dir(source_dir: Path) -> None:
30
+ for pattern in ("*.tar.gz", "*.tgz", "*.tar"):
31
+ for path in source_dir.glob(pattern):
32
+ try:
33
+ path.unlink()
34
+ except Exception:
35
+ pass
36
+
37
+
38
+ def _download_arxiv_source_with_retries(paper, source_dir: Path, arxiv_id: str) -> Path | None:
39
+ max_retries = int(os.getenv("ARXIV_SOURCE_MAX_RETRIES", "4"))
40
+ base_sleep = float(os.getenv("ARXIV_SOURCE_BASE_SLEEP", "2.0"))
41
+ max_sleep = float(os.getenv("ARXIV_MAX_BACKOFF", "60"))
42
+ last_exc = None
43
+
44
+ for attempt in range(max_retries):
45
+ _cleanup_partial_source_dir(source_dir)
46
+ try:
47
+ _arxiv_min_interval_sleep()
48
+ tar_path = Path(paper.download_source(dirpath=str(source_dir)))
49
+ if not tar_path.exists():
50
+ raise FileNotFoundError(f"download_source returned {tar_path}, but the file does not exist")
51
+ if tar_path.stat().st_size < 1024:
52
+ raise IOError(f"downloaded source archive is unexpectedly small ({tar_path.stat().st_size} bytes)")
53
+ return tar_path
54
+ except Exception as exc:
55
+ last_exc = exc
56
+ sleep = min(base_sleep * (2**attempt), max_sleep) + random.uniform(0.0, 0.5)
57
+ print(f"[WARN] Failed to download source for {arxiv_id} on attempt {attempt + 1}/{max_retries}: {exc}")
58
+ if attempt + 1 < max_retries:
59
+ print(f"[INFO] Retrying source download in {sleep:.2f}s")
60
+ time.sleep(sleep)
61
+
62
+ print(f"[WARN] Source download failed for {arxiv_id} after {max_retries} attempts: {last_exc}")
63
+ return None
64
+
65
+
66
+ def _arxiv_min_interval_sleep() -> None:
67
+ """Global throttle to avoid arXiv API rate limits."""
68
+ global _ARXIV_LAST_TS
69
+ min_interval = float(os.getenv("ARXIV_MIN_INTERVAL", "1.0"))
70
+ now = time.monotonic()
71
+ elapsed = now - _ARXIV_LAST_TS
72
+ if elapsed < min_interval:
73
+ time.sleep(min_interval - elapsed)
74
+ _ARXIV_LAST_TS = time.monotonic()
75
+
76
+
77
+ def download_arxiv_tex(arxiv_id: str, base_dir: Path) -> Path | None:
78
+ """
79
+ Download LaTeX source from arXiv and return the path to a merged TeX file.
80
+
81
+ - arxiv_id: e.g. "2410.22815"
82
+ - base_dir: paper directory where source should be unpacked
83
+ """
84
+ source_dir = base_dir / f"tex_{arxiv_id}"
85
+ source_dir.mkdir(parents=True, exist_ok=True)
86
+ search = arxiv.Search(id_list=[arxiv_id])
87
+ max_retries = int(os.getenv("ARXIV_MAX_RETRIES", "6"))
88
+ base_sleep = float(os.getenv("ARXIV_BASE_SLEEP", "2.0"))
89
+ max_sleep = float(os.getenv("ARXIV_MAX_BACKOFF", "60"))
90
+ paper = None
91
+
92
+ for attempt in range(max_retries):
93
+ try:
94
+ _arxiv_min_interval_sleep()
95
+ paper = next(search.results())
96
+ break
97
+ except StopIteration:
98
+ print(f"[WARN] No arXiv paper found for ID {arxiv_id}")
99
+ return None
100
+ except arxiv.HTTPError as exc:
101
+ if getattr(exc, "status", None) == 429 or "429" in str(exc):
102
+ sleep = min(base_sleep * (2**attempt), max_sleep) + random.uniform(0.0, 0.5)
103
+ print(f"[WARN] arXiv 429 → retrying in {sleep:.2f}s")
104
+ time.sleep(sleep)
105
+ continue
106
+ print(f"[WARN] arXiv HTTP error for {arxiv_id}: {exc}")
107
+ return None
108
+ except Exception as exc:
109
+ sleep = min(base_sleep * (2**attempt), max_sleep) + random.uniform(0.0, 0.5)
110
+ print(f"[WARN] arXiv error {exc} → retrying in {sleep:.2f}s")
111
+ time.sleep(sleep)
112
+ continue
113
+
114
+ if paper is None:
115
+ print(f"[ERROR] Giving up after {max_retries} attempts for arXiv ID {arxiv_id}")
116
+ return None
117
+
118
+ tar_path = _download_arxiv_source_with_retries(paper, source_dir, arxiv_id)
119
+ if tar_path is None:
120
+ return None
121
+
122
+ try:
123
+ with tarfile.open(tar_path) as tar:
124
+ tar.extractall(path=source_dir)
125
+ os.remove(tar_path)
126
+ except Exception as exc:
127
+ print(f"[WARN] Failed to extract source for {arxiv_id}: {exc}")
128
+ return None
129
+
130
+ processed_tex = preprocess_tex(source_dir)
131
+ if processed_tex:
132
+ extract_introduction_and_related(processed_tex)
133
+
134
+ if not processed_tex or not processed_tex.exists():
135
+ print(f"[WARN] Could not produce merged TeX for {arxiv_id}")
136
+ return None
137
+
138
+ print(f"[INFO] Processed LaTeX for {arxiv_id} at {processed_tex}")
139
+ return processed_tex
140
+
141
+
142
+ def _extract_arxiv_id_from_text(text: str) -> str | None:
143
+ if not text:
144
+ return None
145
+ match = re.search(r"\b(\d{4}\.\d{4,5}(?:v\d+)?)\b", text)
146
+ if match:
147
+ return match.group(1)
148
+ match = re.search(r"arxiv[:\s/]*(\d{4}\.\d{4,5}(?:v\d+)?)", text, re.IGNORECASE)
149
+ if match:
150
+ return match.group(1)
151
+ return None
152
+
153
+
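This helper drives both the OpenReview fallback and the PDF-URL parsing: it accepts anything that looks like a modern arXiv identifier, with or without an "arXiv:" prefix. A small worked example of the two patterns (a standalone mirror of the function above, not an import):

```python
import re

def extract_arxiv_id(text: str) -> str | None:
    # mirrors _extract_arxiv_id_from_text above
    if not text:
        return None
    match = re.search(r"\b(\d{4}\.\d{4,5}(?:v\d+)?)\b", text)
    if match:
        return match.group(1)
    match = re.search(r"arxiv[:\s/]*(\d{4}\.\d{4,5}(?:v\d+)?)", text, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

print(extract_arxiv_id("https://arxiv.org/abs/2311.14919v2"))  # -> 2311.14919v2
print(extract_arxiv_id("See arXiv:2505.17978 for details"))    # -> 2505.17978
print(extract_arxiv_id("no identifier here"))                  # -> None
```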
154
+ def _safe_write_json(path: Path, payload) -> None:
155
+ path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
156
+
157
+
158
+ def _safe_write_text(path: Path, text: str) -> None:
159
+ path.write_text(text, encoding="utf-8")
160
+
161
+
162
+ def _query_openreview_for_paper(openreview_id: str) -> dict | None:
163
+ """Query OpenReview using a real OpenReview note/forum id."""
164
+ if not openreview_id:
165
+ return None
166
+
167
+ try_urls = [
168
+ f"https://api.openreview.net/notes?forum={openreview_id}",
169
+ f"https://api2.openreview.net/notes?forum={openreview_id}",
170
+ f"https://api.openreview.net/notes?id={openreview_id}",
171
+ f"https://api2.openreview.net/notes?id={openreview_id}",
172
+ ]
173
+
174
+ for url in try_urls:
175
+ try:
176
+ response = requests.get(url, timeout=20)
177
+ if response.status_code != 200:
178
+ continue
179
+ payload = response.json()
180
+ except Exception:
181
+ continue
182
+
183
+ notes = None
184
+ if isinstance(payload, dict) and isinstance(payload.get("notes"), list):
185
+ notes = payload["notes"]
186
+ elif isinstance(payload, dict) and payload.get("content"):
187
+ notes = [payload]
188
+ elif isinstance(payload, list):
189
+ notes = payload
190
+
191
+ if not notes:
192
+ continue
193
+
194
+ note = notes[0]
195
+ content = note.get("content") if isinstance(note, dict) else None
196
+ title = None
197
+ arxiv_id = None
198
+ pdf_url = None
199
+
200
+ if isinstance(content, dict):
201
+ raw_title = content.get("title") or content.get("paperTitle")
202
+ title = raw_title.get("value") if isinstance(raw_title, dict) else raw_title
203
+
204
+ raw_pdf = content.get("pdf")
205
+ pdf_url = raw_pdf.get("value") if isinstance(raw_pdf, dict) else raw_pdf
206
+
207
+ for value in content.values():
208
+ if isinstance(value, dict):
209
+ value = value.get("value")
210
+ if isinstance(value, list):
211
+ value = " ".join(str(item) for item in value)
212
+ if isinstance(value, str):
213
+ arxiv_id = _extract_arxiv_id_from_text(value)
214
+ if arxiv_id:
215
+ break
216
+
217
+ if not title and isinstance(note, dict):
218
+ title = note.get("title") or note.get("forumTitle")
219
+
220
+ if not arxiv_id and isinstance(note, dict):
221
+ for value in note.values():
222
+ if isinstance(value, str):
223
+ arxiv_id = _extract_arxiv_id_from_text(value)
224
+ if arxiv_id:
225
+ break
226
+
227
+ return {
228
+ "title": title,
229
+ "arxiv_id": arxiv_id,
230
+ "pdf_url": pdf_url,
231
+ "openreview_id": openreview_id,
232
+ "source_url": url,
233
+ }
234
+
235
+ return None
236
+
237
+
238
+ def _treat_as_openreview(paper: dict) -> bool:
239
+ acl_id = str(paper.get("id", "")).lower()
240
+ id_type = str(paper.get("id_type", "")).lower()
241
+ return (
242
+ id_type == "openreview"
243
+ or bool(paper.get("openreview_id"))
244
+ or acl_id.startswith("neurips-")
245
+ or acl_id.startswith("icml-")
246
+ )
247
+
248
+
249
+ def _fetch_s2_by_title(title: str, acl_id: str) -> tuple[int, dict | None]:
250
+ if not title:
251
+ print(f"[WARN] no title available for {acl_id} → skipping.")
252
+ return 0, None
253
+ hit = search_by_title(title)
254
+ if not hit:
255
+ print(f"[WARN] no S2 match for {acl_id} ({title}) → skipping.")
256
+ return 0, None
257
+ s2_id = hit["paperId"]
258
+ print(f"[DEBUG] title search matched semantic scholar paperId={s2_id}")
259
+ return get_paper(s2_id, id_type="SemanticScholar")
260
+
261
+
262
+ def _best_arxiv_id(*values: str) -> str | None:
263
+ for value in values:
264
+ arxiv_id = _extract_arxiv_id_from_text(value or "")
265
+ if arxiv_id:
266
+ return arxiv_id
267
+ return None
268
+
269
+
270
+ def _write_openreview_snapshot(paper_dir: Path, payload: dict) -> None:
271
+ if payload:
272
+ _safe_write_json(paper_dir / "openreview_metadata.json", payload)
273
+
274
+
275
+ def _write_metadata_outputs(paper_dir: Path, acl_id: str, data: dict) -> None:
276
+ meta_path = paper_dir / "paper_metadata.json"
277
+ _safe_write_json(meta_path, [data])
278
+ print(f"[DEBUG] wrote metadata to {meta_path}")
279
+
280
+ external_ids = data.get("externalIds", {}) or {}
281
+ arxiv_id = external_ids.get("ArXiv")
282
+ if arxiv_id:
283
+ download_arxiv_tex(arxiv_id=arxiv_id, base_dir=paper_dir)
284
+
285
+ sections_dir = paper_dir / "sections"
286
+ sections_dir.mkdir(exist_ok=True)
287
+
288
+ abstract = data.get("abstract")
289
+ if abstract:
290
+ _safe_write_text(sections_dir / "abstract.txt", abstract)
291
+
292
+ tldr_obj = data.get("tldr")
293
+ if isinstance(tldr_obj, dict) and tldr_obj.get("text"):
294
+ _safe_write_text(sections_dir / "tldr.txt", tldr_obj["text"])
295
+
296
+ semantic_id = data.get("paperId")
297
+ if not semantic_id:
298
+ print(f"[WARN] no semantic_id for {acl_id} → skip refs/cites.")
299
+ return
300
+
301
+ citation_count = data.get("citationCount", 0)
302
+ reference_count = data.get("referenceCount", 0)
303
+
304
+ ref_status, refs = get_paper_links(semantic_id, "references", reference_count)
305
+ if ref_status == 200:
306
+ _safe_write_json(paper_dir / "references_metadata.json", refs)
307
+
308
+ cit_status, cits = get_paper_links(semantic_id, "citations", citation_count)
309
+ if cit_status == 200:
310
+ _safe_write_json(paper_dir / "citations_metadata.json", cits)
311
+
312
+ if "ArXiv" not in external_ids:
313
+ _safe_write_text(paper_dir / "no_arxiv.txt", "no arxiv for this paper")
314
+
315
+
316
+ def fetch_one_acl_id(paper: dict, base_dir: Path):
317
+ acl_id = paper["id"]
318
+ title = (paper.get("title") or "").strip()
319
+ id_type = paper.get("id_type", "ACL")
320
+ openreview_id = paper.get("openreview_id", "")
321
+ input_pdf_url = paper.get("pdf_url", "")
322
+ s2_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY", "")
323
+ print(
324
+ f"[DEBUG] fetch_one_acl_id: id={acl_id} id_type={id_type} "
325
+ f"title_len={len(title)} s2_key_present={'yes' if bool(s2_key) else 'no'} "
326
+ f"s2_key_len={len(s2_key)}"
327
+ )
328
+
329
+ paper_dir = base_dir / acl_id
330
+ ensure_dir(paper_dir)
331
+ meta_path = paper_dir / "paper_metadata.json"
332
+
333
+ if meta_path.exists():
334
+ return
335
+
336
+ status, data = 0, None
337
+ fetch_label = f"{id_type}:{acl_id}"
338
+ is_openreview = _treat_as_openreview(paper)
339
+ openreview_meta = None
340
+ attempted_title_search = False
341
+
342
+ if is_openreview:
343
+ try:
344
+ openreview_meta = _query_openreview_for_paper(openreview_id or acl_id)
345
+ except Exception as exc:
346
+ print(f"[WARN] OpenReview lookup failed for {acl_id}: {exc}")
347
+ openreview_meta = None
348
+
349
+ if openreview_meta:
350
+ _write_openreview_snapshot(paper_dir, openreview_meta)
351
+ or_title = (openreview_meta.get("title") or title or "").strip()
352
+ arxiv_id = (
353
+ _best_arxiv_id(
354
+ openreview_meta.get("arxiv_id", ""),
355
+ openreview_meta.get("pdf_url", ""),
356
+ input_pdf_url,
357
+ )
358
+ or ""
359
+ )
360
+ if arxiv_id:
361
+ print(f"[DEBUG] OpenReview -> found ArXiv {arxiv_id} for {acl_id}")
362
+ status, data = get_paper(arxiv_id, id_type="ArXiv")
363
+ fetch_label = f"ArXiv:{arxiv_id}"
364
+ title = or_title or title
365
+ elif or_title:
366
+ print(f"[DEBUG] OpenReview -> no arXiv for {acl_id}, title-searching")
367
+ status, data = _fetch_s2_by_title(or_title, acl_id)
368
+ fetch_label = f"title:{or_title[:80]}"
369
+ title = or_title
370
+ attempted_title_search = True
371
+ else:
372
+ print(f"[WARN] OpenReview metadata for {acl_id} had neither title nor arXiv")
373
+ else:
374
+ print(f"[WARN] no OpenReview metadata for {acl_id} (openreview_id={openreview_id or acl_id})")
375
+
376
+ if data is None and title and not attempted_title_search:
377
+ print(f"[DEBUG] OpenReview fallback -> title-searching extracted title for {acl_id}")
378
+ status, data = _fetch_s2_by_title(title, acl_id)
379
+ fetch_label = f"title:{title[:80]}"
380
+ attempted_title_search = True
381
+
382
+ if data is None and not is_openreview:
383
+ status, data = get_paper(acl_id, id_type=id_type)
384
+ fetch_label = f"{id_type}:{acl_id}"
385
+
386
+ if data is None and not attempted_title_search:
387
+ print(
388
+ f"[WARN] direct fetch failed for {fetch_label} "
389
+ f"(status={status}) → trying title search with title_len={len(title)}"
390
+ )
391
+ status, data = _fetch_s2_by_title(title, acl_id)
392
+
393
+ if status != 200 or data is None:
394
+ print(f"[WARN] still no data for {acl_id} → skipping.")
395
+ return
396
+
397
+ _write_metadata_outputs(paper_dir, acl_id, data)
398
+ print("[SUCCESS]")
399
+
400
+
401
+ def fetch_all_metadata(acl_ids_path: Path, out_dir: Path, start_from: str | None = None, resume: bool = False):
402
+ raw = json.loads(acl_ids_path.read_text(encoding="utf-8"))
403
+ papers = raw if raw and isinstance(raw[0], dict) else [{"id": x, "title": ""} for x in raw]
404
+
405
+ start_seen = start_from is None
406
+ for paper in papers:
407
+ pid = str(paper.get("id", ""))
408
+ if not start_seen:
409
+ if pid == start_from:
410
+ start_seen = True
411
+ else:
412
+ continue
413
+ if resume:
414
+ paper_dir = out_dir / pid
415
+ if (paper_dir / "paper_metadata.json").exists():
416
+ continue
417
+ fetch_one_acl_id(paper, out_dir)
418
+ return "Meta Data Completed"
419
+
420
+
421
+ if __name__ == "__main__":
422
+ parser = argparse.ArgumentParser()
423
+ parser.add_argument("--ids", type=str, required=True, help="Path to JSON file with paper IDs.")
424
+ parser.add_argument("--outdir", type=str, default="papers", help="Output directory for metadata.")
425
+ parser.add_argument("--start-from", type=str, default=None, help="Start from this paper ID.")
426
+ parser.add_argument("--resume", action="store_true", help="Skip papers that already have paper_metadata.json.")
427
+ args = parser.parse_args()
428
+
429
+ ACL_IDS_PATH = Path(args.ids).expanduser().resolve()
430
+ OUTDIR = Path(args.outdir).expanduser().resolve()
431
+
432
+ if not ACL_IDS_PATH.exists():
433
+ raise FileNotFoundError(f"Could not find {ACL_IDS_PATH}")
434
+
435
+ print(f"[INFO] Using ID list from {ACL_IDS_PATH}")
436
+ print(f"[INFO] Output will be saved to {OUTDIR}")
437
+
438
+ start = time.time()
439
+ fetch_all_metadata(acl_ids_path=ACL_IDS_PATH, out_dir=OUTDIR, start_from=args.start_from, resume=args.resume)
440
+ print("done in", time.time() - start, "s")
src/step_01_fetch/process_tex_source.py ADDED
@@ -0,0 +1,203 @@
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+ import shutil
5
+
6
+ def read_tex(path: Path) -> str:
7
+ try:
8
+ return path.read_text(encoding="utf-8", errors="ignore")
9
+ except Exception:
10
+ return ""
11
+
12
+
13
+ def resolve_inputs(tex: str, base_dir: Path, seen=None) -> str:
14
+ """
15
+ Recursively replace \\input{...} and \\include{...} with file contents.
16
+ """
17
+ if seen is None:
18
+ seen = set()
19
+
20
+ pattern = r'\\(?:input|include)\{([^}]+)\}'
21
+
22
+ def repl(match):
23
+ name = match.group(1)
24
+ if not name.endswith(".tex"):
25
+ name += ".tex"
26
+
27
+ full = base_dir / name
28
+
29
+ if full in seen:
30
+ return f"% WARNING: skipped circular input {full}\n"
31
+
32
+ if not full.exists():
33
+ return f"% WARNING: missing file {full}\n"
34
+
35
+ seen.add(full)
36
+ content = read_tex(full)
37
+ return resolve_inputs(content, full.parent, seen)
38
+
39
+ return re.sub(pattern, repl, tex)
40
+
41
+
42
+ def find_main_tex(source_dir: Path) -> Path | None:
43
+ """
44
+ Heuristic to find the main .tex file:
45
+ 1. match .bbl → .tex
46
+ 2. else top-level .tex that contains \\begin{document}
47
+ 3. else first .tex in directory
48
+ """
49
+ bbls = list(source_dir.glob("*.bbl"))
50
+ if bbls:
51
+ main_candidate = source_dir / (bbls[0].stem + ".tex")
52
+ if main_candidate.exists():
53
+ return main_candidate
54
+
55
+ for tex in source_dir.glob("*.tex"):
56
+ if "\\begin{document}" in read_tex(tex):
57
+ return tex
58
+
59
+ tex_files = list(source_dir.glob("*.tex"))
60
+ return tex_files[0] if tex_files else None
61
+
62
+
63
+ def preprocess_tex(source_dir: Path) -> Path | None:
64
+ """
65
+ Given an extracted arXiv source directory, produce:
66
+ - a merged TeX file named 'processed_main.tex'
67
+ - a concatenated BibTeX file named 'references.bib'
68
+ Both are written in the parent directory of source_dir (the paper dir).
69
+
70
+ Then delete the extracted source_dir.
71
+ """
72
+ main_tex = find_main_tex(source_dir)
73
+ if not main_tex:
74
+ print(f"[WARN] No main .tex found in {source_dir}")
75
+ shutil.rmtree(source_dir, ignore_errors=True)
76
+ return None
77
+
78
+ raw = read_tex(main_tex)
79
+ merged = resolve_inputs(raw, main_tex.parent)
80
+
81
+ paper_dir = source_dir.parent
82
+ out_tex_path = paper_dir / "processed_main.tex"
83
+ out_tex_path.write_text(merged, encoding="utf-8")
84
+
85
+ bib_files = list(source_dir.rglob("*.bib"))
86
+ if bib_files:
87
+ bib_texts = []
88
+ for bib in bib_files:
89
+ try:
90
+ bib_texts.append(bib.read_text(encoding="utf-8", errors="ignore"))
91
+ except Exception:
92
+ print(f"[WARN] Could not read bib file {bib}")
93
+ if bib_texts:
94
+ bib_out = paper_dir / "references.bib"
95
+ bib_out.write_text("\n\n".join(bib_texts), encoding="utf-8")
96
+ print(f"[INFO] Wrote combined BibTeX to {bib_out}")
97
+
98
+ shutil.rmtree(source_dir, ignore_errors=True)
99
+
100
+ return out_tex_path
101
+
102
+ def _load_tex(path: Path) -> str:
103
+ return path.read_text(encoding="utf-8", errors="ignore")
104
+
105
+
106
+ SECTION_PATTERN = re.compile(
107
+ r'\\section\*?\{([^}]*)\}',
108
+ flags=re.IGNORECASE
109
+ )
110
+
111
+
112
+ def _split_into_sections(tex: str):
113
+ """
114
+ Returns a list of (section_title, content) in order.
115
+ Title is the raw LaTeX title text (without braces).
116
+ Content is the text from this \\section line up to (but not including)
117
+ the next \\section or end of document.
118
+ """
119
+ sections = []
120
+ matches = list(SECTION_PATTERN.finditer(tex))
121
+
122
+ if not matches:
123
+ return sections
124
+
125
+ for i, m in enumerate(matches):
126
+ title = m.group(1).strip()
127
+ start = m.start()
128
+ end = matches[i + 1].start() if i + 1 < len(matches) else len(tex)
129
+ content = tex[start:end]
130
+ sections.append((title, content))
131
+
132
+ return sections
133
+
134
+
135
+ def _normalize_title(title: str) -> str:
136
+ """Lowercase and strip punctuation-ish stuff for robust matching."""
137
+ t = title.lower()
138
+ t = re.sub(r'[^a-z0-9\s]', ' ', t)
139
+ t = re.sub(r'\s+', ' ', t).strip()
140
+ return t
141
+
142
+
143
+ def _find_best_section(sections, candidates):
144
+ """
145
+ sections: list of (raw_title, content)
146
+ candidates: list of strings to match against normalized title
147
+ Returns the content of the best-matching section or None.
148
+ """
149
+ norm_candidates = [c.lower() for c in candidates]
150
+
151
+ for raw_title, content in sections:
152
+ nt = _normalize_title(raw_title)
153
+ for cand in norm_candidates:
154
+ if nt == cand or cand in nt:
155
+ return content
156
+ return None
157
+
158
+
159
+ def extract_introduction_and_related(
160
+ processed_tex_path: Path,
161
+ out_dir: Path | None = None,
162
+ ) -> dict:
163
+ """
164
+ Given path to processed_main.tex, extract Introduction and Related Work sections
165
+ into separate .tex files.
166
+
167
+ Returns a dict with keys:
168
+ {
169
+ "introduction": Path | None,
170
+ "related_work": Path | None
171
+ }
172
+ """
173
+ if out_dir is None:
174
+ out_dir = processed_tex_path.parent / "sections"
175
+
176
+ out_dir.mkdir(parents=True, exist_ok=True)
177
+
178
+ tex = _load_tex(processed_tex_path)
179
+ sections = _split_into_sections(tex)
180
+
181
+ intro_candidates = ["introduction"]
182
+ related_candidates = ["related work"]
183
+
184
+ intro_content = _find_best_section(sections, intro_candidates)
185
+ related_content = _find_best_section(sections, related_candidates)
186
+
187
+ results = {"introduction": None, "related_work": None}
188
+
189
+ if intro_content:
190
+ intro_path = out_dir / "introduction.tex"
191
+ intro_path.write_text(intro_content, encoding="utf-8")
192
+ results["introduction"] = intro_path
193
+ else:
194
+ print(f"[WARN] No Introduction section found in {processed_tex_path}")
195
+
196
+ if related_content:
197
+ rw_path = out_dir / "related_work.tex"
198
+ rw_path.write_text(related_content, encoding="utf-8")
199
+ results["related_work"] = rw_path
200
+ else:
201
+ print(f"[WARN] No Related Work section found in {processed_tex_path}")
202
+
203
+ return results
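A short sketch of how these helpers chain together for one paper, assuming the extracted arXiv tarball sits in a source/ subdirectory of the paper folder; the path is illustrative.

from pathlib import Path

from process_tex_source import preprocess_tex, extract_introduction_and_related

source_dir = Path("papers/2401.00001/source")  # hypothetical extracted arXiv source

# Merges \input/\include files, writes processed_main.tex and references.bib
# into the paper directory, then removes source/.
processed = preprocess_tex(source_dir)
if processed is not None:
    sections = extract_introduction_and_related(processed)
    print(sections["introduction"], sections["related_work"])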
src/step_01_fetch/semanticscholar_client.py ADDED
@@ -0,0 +1,158 @@
1
+ import time
2
+ import random
3
+ import requests
4
+ from typing import Optional, Tuple, Any
5
+ from config import SEMANTIC_SCHOLAR_API_KEY
6
+ import os
7
+
8
+ BASE_URL = "https://api.semanticscholar.org/graph/v1/paper"
9
+
10
+ _LAST_REQUEST_TS = 0.0
11
+
12
+
13
+ def _min_interval_sleep() -> None:
14
+ """Global throttle to avoid hammering Semantic Scholar."""
15
+ global _LAST_REQUEST_TS
16
+ min_interval = float(os.getenv("S2_MIN_INTERVAL", "1.0"))
17
+ now = time.monotonic()
18
+ elapsed = now - _LAST_REQUEST_TS
19
+ if elapsed < min_interval:
20
+ time.sleep(min_interval - elapsed)
21
+ _LAST_REQUEST_TS = time.monotonic()
22
+
23
+
24
+ def robust_request(url, params=None, headers=None, max_retries=8, base_sleep=2.0):
25
+ """
26
+ Make a GET request with exponential backoff.
27
+ Retries on:
28
+ - connection errors
29
+ - 429 (Too Many Requests)
30
+ - 500–599 server errors
31
+ - invalid JSON
32
+ Returns (status_code, json_or_None).
33
+ """
34
+
35
+ for attempt in range(max_retries):
36
+ try:
37
+ _min_interval_sleep()
38
+ resp = requests.get(url, params=params, headers=headers, timeout=30)
39
+ status = resp.status_code
40
+
41
+ if status == 200:
42
+ try:
43
+ return 200, resp.json()
44
+ except Exception:
45
+ print(f"[WARN] JSON decode failed on attempt {attempt+1}/{max_retries}")
46
+
47
+ if status == 429:
48
+ retry_after = resp.headers.get("Retry-After")
49
+ if retry_after:
50
+ try:
51
+ sleep = float(retry_after)
52
+ except Exception:
53
+ sleep = base_sleep * (2 ** attempt)
54
+ else:
55
+ sleep = base_sleep * (2 ** attempt)
56
+ max_sleep = float(os.getenv("S2_MAX_BACKOFF", "60"))
57
+ sleep = min(sleep, max_sleep)
58
+ sleep += random.uniform(0.0, 0.5)
59
+ print(f"[WARN] 429 Too Many Requests → retrying in {sleep:.2f}s")
60
+ time.sleep(sleep)
61
+ continue
62
+
63
+ if 500 <= status < 600:
64
+ sleep = base_sleep * (2 ** attempt)
65
+ max_sleep = float(os.getenv("S2_MAX_BACKOFF", "60"))
66
+ sleep = min(sleep, max_sleep)
67
+ sleep += random.uniform(0.0, 0.5)
68
+ print(f"[WARN] Server error {status} → retrying in {sleep:.2f}s")
69
+ time.sleep(sleep)
70
+ continue
71
+
72
+ return status, None
73
+
74
+ except requests.exceptions.RequestException as e:
75
+ sleep = base_sleep * (2 ** attempt)
76
+ max_sleep = float(os.getenv("S2_MAX_BACKOFF", "60"))
77
+ sleep = min(sleep, max_sleep)
78
+ sleep += random.uniform(0.0, 0.5)
79
+ print(f"[WARN] Network error {e} → retrying in {sleep:.2f}s")
80
+ time.sleep(sleep)
81
+ continue
82
+
83
+ print(f"[ERROR] Giving up after {max_retries} attempts for URL: {url}")
84
+ return None, None
85
+ def get_paper(paper_id: str, id_type: str = "ACL") -> Tuple[int, Optional[dict]]:
86
+ """
87
+ id_type can be "ACL" or "SemanticScholar" or "ArXiv" etc.
88
+ """
89
+ if id_type == "SemanticScholar":
90
+ full_id = paper_id
91
+ else:
92
+ full_id = f"{id_type}:{paper_id}"
93
+
94
+ url = f"{BASE_URL}/{full_id}"
95
+ params = {
96
+ "fields": (
97
+ "title,year,publicationDate,authors,url,venue,externalIds,"
98
+ "tldr,abstract,citationCount,referenceCount,openAccessPdf"
99
+ )
100
+ }
101
+
102
+ headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
103
+
104
+ status, data = robust_request(url, params=params, headers=headers, max_retries=5, base_sleep=1.0)
105
+ if status == 200 and data is not None:
106
+ return status, data
107
+ else:
108
+ print(f"[WARN] {status} on {full_id}")
109
+ return status or 0, None
110
+
111
+
112
+ def get_paper_links(semantic_id: str, target_type: str, total: int, limit: int = 1000):
113
+ headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
114
+ loops = total // limit + 1 if total else 0
115
+ collected = []
116
+
117
+ for i in range(loops):
118
+ offset = i * limit
119
+ url = f"{BASE_URL}/{semantic_id}/{target_type}"
120
+ params = {
121
+ "offset": offset,
122
+ "limit": limit,
123
+ "fields": "paperId,title,isInfluential,externalIds,contextsWithIntent,openAccessPdf",
124
+ }
125
+
126
+ status, data = robust_request(url, params=params, headers=headers, max_retries=5, base_sleep=1.0)
127
+
128
+ if status != 200 or data is None:
129
+ print(f"[WARN] {target_type} fetch failed for {semantic_id} (status {status})")
130
+ return status or 0, []
131
+
132
+ items = data.get("data")
133
+ if not isinstance(items, list):
134
+ print(f"[WARN] malformed {target_type} response for {semantic_id}")
135
+ return status, []
136
+
137
+ collected.extend(items)
138
+
139
+ return 200, collected
140
+
141
+
142
+ def search_by_title(title: str, limit: int = 1):
143
+ """Search Semantic Scholar by paper title."""
144
+ url = "https://api.semanticscholar.org/graph/v1/paper/search"
145
+ params = {
146
+ "query": title,
147
+ "limit": limit,
148
+ "fields": "paperId,title,year,venue,externalIds",
149
+ }
150
+ headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
151
+
152
+ status, data = robust_request(url, params=params, headers=headers, max_retries=5, base_sleep=1.0)
153
+ if status == 200 and data is not None:
154
+ items = data.get("data", [])
155
+ return items[0] if items else None
156
+ else:
157
+ print(f"[WARN] title search failed for '{title[:60]}...' (status {status})")
158
+ return None
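A hedged sketch of the client's three entry points; it assumes config.SEMANTIC_SCHOLAR_API_KEY resolves (the module imports it at load time), and the IDs and title are illustrative.

from semanticscholar_client import get_paper, get_paper_links, search_by_title

# Direct lookup by external ID; throttling and retries happen inside robust_request.
status, paper = get_paper("2020.acl-main.1", id_type="ACL")
if status == 200 and paper:
    s2_id = paper.get("paperId")
    _, citations = get_paper_links(s2_id, "citations", total=paper.get("citationCount", 0))
    print(len(citations), "citation records fetched")

# Fallback when no stable external ID is available.
hit = search_by_title("Attention Is All You Need")
print(hit.get("paperId") if hit else None)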
src/step_02_mark_citations/replace_citation_markers.py ADDED
@@ -0,0 +1,440 @@
1
+ import argparse
2
+ import json
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Tuple
6
+
7
+
8
+ PAPER_META_FILE = "paper_metadata.json"
9
+ USAGE_CLAIMS_FILE = "usage_claims.json"
10
+ USAGE_CONTEXTS_FILE = "usage_contexts.json"
11
+ CITATIONS_FILE = "citations_metadata.json"
12
+ PROCESSED_MAIN_FILE = "processed_main.tex"
13
+ REFERENCES_META_FILE = "references_metadata.json"
14
+
15
+
16
+ def load_json(path: Path) -> Any | None:
17
+ if not path.exists():
18
+ return None
19
+ try:
20
+ return json.loads(path.read_text(encoding="utf-8"))
21
+ except Exception:
22
+ return None
23
+
24
+
25
+ def save_json(path: Path, data: Any) -> None:
26
+ path.write_text(json.dumps(data, indent=2), encoding="utf-8")
27
+
28
+
29
+ def iter_paper_dirs(root: Path) -> List[Path]:
30
+ out: List[Path] = []
31
+ for child in root.iterdir():
32
+ if child.is_dir() and (child / PAPER_META_FILE).exists():
33
+ out.append(child)
34
+ return out
35
+
36
+
37
+ def load_paper_metadata(paper_dir: Path) -> Dict[str, Any]:
38
+ meta = load_json(paper_dir / PAPER_META_FILE)
39
+ if isinstance(meta, list) and meta:
40
+ return meta[0]
41
+ if isinstance(meta, dict):
42
+ return meta
43
+ return {}
44
+
45
+
46
+ def _is_structurally_complete(paper_dir: Path) -> bool:
47
+ return (
48
+ (paper_dir / PAPER_META_FILE).exists()
49
+ and (paper_dir / PROCESSED_MAIN_FILE).exists()
50
+ and (paper_dir / REFERENCES_META_FILE).exists()
51
+ )
52
+
53
+
54
+ def _author_last_names(authors: List[Any]) -> List[str]:
55
+ last_names: List[str] = []
56
+ for author in authors:
57
+ if isinstance(author, dict):
58
+ name = author.get("name")
59
+ else:
60
+ name = author
61
+ if not isinstance(name, str):
62
+ continue
63
+ parts = [p for p in re.split(r"\s+", name.strip()) if p]
64
+ if not parts:
65
+ continue
66
+ last_names.append(parts[-1])
67
+ return list(dict.fromkeys(last_names))
68
+
69
+
70
+ def _title_aliases(title: str) -> List[str]:
71
+ aliases = [title]
72
+ if ":" in title:
73
+ aliases.append(title.split(":", 1)[0])
74
+ acronym = "".join([c for c in title if c.isupper()])
75
+ if 3 <= len(acronym) <= 10:
76
+ aliases.append(acronym)
77
+ return list(dict.fromkeys([a for a in aliases if a]))
78
+
79
+
80
+ def _artifact_aliases(paper_dir: Path) -> List[str]:
81
+ aliases: List[str] = []
82
+ usage_claims = load_json(paper_dir / USAGE_CLAIMS_FILE)
83
+ if isinstance(usage_claims, dict):
84
+ caps = usage_claims.get("capabilities") or []
85
+ if isinstance(caps, list):
86
+ for cap in caps:
87
+ if not isinstance(cap, dict):
88
+ continue
89
+ name = cap.get("artifact_name")
90
+ if isinstance(name, str) and name.strip():
91
+ aliases.append(name.strip())
92
+ return list(dict.fromkeys(aliases))
93
+
94
+
95
+ def _loose_alias_pattern(alias: str) -> str:
96
+ parts = re.split(r"[^A-Za-z0-9]+", alias)
97
+ parts = [p for p in parts if p]
98
+ if not parts:
99
+ return ""
100
+ return r"\b" + r"[-\s]*".join(map(re.escape, parts)) + r"\b"
101
+
102
+
103
+ def build_patterns(
104
+ meta: Dict[str, Any],
105
+ paper_dir: Path,
106
+ ) -> Tuple[List[re.Pattern], List[re.Pattern], str | None]:
107
+ year = meta.get("year")
108
+ year_str = str(year) if isinstance(year, int) else None
109
+ authors = meta.get("authors") if isinstance(meta.get("authors"), list) else []
110
+ last_names = _author_last_names(authors)
111
+ title = meta.get("title") if isinstance(meta.get("title"), str) else ""
112
+ aliases = _title_aliases(title) + _artifact_aliases(paper_dir)
113
+ aliases = [a for a in aliases if a]
114
+
115
+ author_patterns: List[re.Pattern] = []
116
+ alias_patterns: List[re.Pattern] = []
117
+
118
+ if year_str and last_names:
119
+ year_pat = rf"{re.escape(year_str)}[a-z]?"
120
+ first_last = re.escape(last_names[0])
121
+ author_patterns.append(
122
+ re.compile(
123
+ rf"\b{first_last}\s+et\s+al\.?\s*(?:,\s*|\s*){year_pat}",
124
+ re.IGNORECASE,
125
+ )
126
+ )
127
+
128
+ for alias in aliases:
129
+ pat = _loose_alias_pattern(alias)
130
+ if pat:
131
+ alias_patterns.append(re.compile(pat, re.IGNORECASE))
132
+
133
+ first_last = last_names[0] if last_names else None
134
+ return author_patterns, alias_patterns, first_last
135
+
136
+
137
+ def _replace_author_span(text: str, first_last: str) -> Tuple[str, bool]:
138
+ occurrences = list(re.finditer(rf"\b{re.escape(first_last)}\b", text, re.IGNORECASE))
139
+ if len(occurrences) != 1:
140
+ return text, False
141
+ author_pat = re.compile(
142
+ rf"\(?\b{re.escape(first_last)}\b"
143
+ rf"\s+(?:et\s+al\.?|and|&)\s*"
144
+ rf"(?:,?\s*\(?\d{{4}}[a-z]?\)?)?"
145
+ rf"\)?",
146
+ re.IGNORECASE,
147
+ )
148
+ new_text, count = author_pat.subn("<CITED HERE>", text, count=1)
149
+ return new_text, count > 0
150
+
151
+
152
+ _BRACKET_NUM_RE = re.compile(r"\[[0-9,;\s]+\]")
153
+ _BRACKET_GROUP_RE = re.compile(r"\[([0-9,;\s]+)\]")
154
+
155
+
156
+ def _extract_bracket_numbers(text: str) -> List[str]:
157
+ numbers: List[str] = []
158
+ for match in _BRACKET_GROUP_RE.finditer(text):
159
+ parts = re.split(r"[,\s;]+", match.group(1).strip())
160
+ for part in parts:
161
+ if part.isdigit():
162
+ numbers.append(part)
163
+ return numbers
164
+
165
+
166
+ def _dominant_bracket(contexts: List[Dict[str, Any]]) -> str | None:
167
+ counts: Dict[str, int] = {}
168
+ for ctx in contexts:
169
+ if not isinstance(ctx, dict):
170
+ continue
171
+ text = ctx.get("context") or ctx.get("text")
172
+ if not isinstance(text, str):
173
+ continue
174
+ for num in _extract_bracket_numbers(text):
175
+ counts[num] = counts.get(num, 0) + 1
176
+ if not counts:
177
+ return None
178
+ best = max(counts.values())
179
+ winners = [num for num, count in counts.items() if count == best]
180
+ if len(winners) == 1:
181
+ return winners[0]
182
+ return None
183
+
184
+
185
+ def _single_bracket_candidate(contexts: List[Dict[str, Any]]) -> str | None:
186
+ counts: Dict[str, int] = {}
187
+ for ctx in contexts:
188
+ if not isinstance(ctx, dict):
189
+ continue
190
+ text = ctx.get("context") or ctx.get("text")
191
+ if not isinstance(text, str):
192
+ continue
193
+ matches = list(_BRACKET_GROUP_RE.finditer(text))
194
+ if len(matches) == 1:
195
+ nums = _extract_bracket_numbers(text)
196
+ if len(nums) != 1:
197
+ continue
198
+ num = nums[0]
199
+ counts[num] = counts.get(num, 0) + 1
200
+ if not counts:
201
+ return None
202
+ best = max(counts.values())
203
+ winners = [num for num, count in counts.items() if count == best]
204
+ if len(winners) == 1:
205
+ return winners[0]
206
+ return None
207
+
208
+
209
+ def _replace_single_bracket(text: str, dominant: str | None) -> Tuple[str, bool]:
210
+ matches = list(_BRACKET_GROUP_RE.finditer(text))
211
+ if len(matches) != 1:
212
+ return text, False
213
+ nums = _extract_bracket_numbers(text)
214
+ if len(nums) != 1:
215
+ return text, False
216
+ num = nums[0]
217
+ if dominant is not None and num != dominant:
218
+ return text, False
219
+ start, end = matches[0].span()
220
+ return text[:start] + "<CITED HERE>" + text[end:], True
221
+
222
+
223
+ def replace_with_marker(
224
+ text: str,
225
+ author_patterns: List[re.Pattern],
226
+ alias_patterns: List[re.Pattern],
227
+ dominant_bracket: str | None = None,
228
+ first_author_last: str | None = None,
229
+ ) -> Tuple[str, bool]:
230
+ def _collapse_markers(value: str) -> str:
231
+ value = re.sub(r"(?:<CITED HERE>[\s()\[\],;:]*){2,}", "<CITED HERE> ", value)
232
+ value = re.sub(r"<CITED HERE>(?:\s+<CITED HERE>)+", "<CITED HERE>", value)
233
+ return value.strip()
234
+
235
+ updated = text
236
+ changed = False
237
+
238
+ author_changed = False
239
+ if first_author_last:
240
+ new, author_changed = _replace_author_span(updated, first_author_last)
241
+ if author_changed:
242
+ changed = True
243
+ updated = _collapse_markers(new)
244
+
245
+ if dominant_bracket:
246
+ def _replace_if_contains(match: re.Match) -> str:
247
+ nums = re.split(r"[,\s;]+", match.group(1).strip())
248
+ if any(n == dominant_bracket for n in nums if n.isdigit()):
249
+ return "<CITED HERE>"
250
+ return match.group(0)
251
+
252
+ new = _BRACKET_GROUP_RE.sub(_replace_if_contains, updated)
253
+ if new != updated:
254
+ changed = True
255
+ updated = _collapse_markers(new)
256
+
257
+ for pat in author_patterns:
258
+ new = pat.sub("<CITED HERE>", updated)
259
+ if new != updated:
260
+ changed = True
261
+ updated = _collapse_markers(new)
262
+
263
+ if not author_changed:
264
+ for pat in alias_patterns:
265
+ new = pat.sub("<CITED HERE>", updated)
266
+ if new != updated:
267
+ changed = True
268
+ updated = _collapse_markers(new)
269
+
270
+ new, bracket_changed = _replace_single_bracket(updated, dominant_bracket)
271
+ if bracket_changed:
272
+ changed = True
273
+ updated = _collapse_markers(new)
274
+
275
+ updated = _collapse_markers(updated)
276
+ return updated, changed
277
+
278
+
279
+ def _process_contexts(
280
+ contexts: List[Dict[str, Any]],
281
+ author_patterns: List[re.Pattern],
282
+ alias_patterns: List[re.Pattern],
283
+ dominant_bracket: str | None,
284
+ first_author_last: str | None,
285
+ ) -> Tuple[int, int]:
286
+ updated_count = 0
287
+ total = 0
288
+ for ctx in contexts:
289
+ if not isinstance(ctx, dict):
290
+ continue
291
+ text = ctx.get("context") or ctx.get("text")
292
+ if not isinstance(text, str):
293
+ continue
294
+ total += 1
295
+ new_text, changed = replace_with_marker(
296
+ text,
297
+ author_patterns=author_patterns,
298
+ alias_patterns=alias_patterns,
299
+ dominant_bracket=dominant_bracket,
300
+ first_author_last=first_author_last,
301
+ )
302
+ if changed:
303
+ updated_count += 1
304
+ ctx["context_with_marker"] = new_text
305
+ return updated_count, total
306
+
307
+
308
+ def update_citations_file(
309
+ paper_dir: Path,
310
+ author_patterns: List[re.Pattern],
311
+ alias_patterns: List[re.Pattern],
312
+ first_author_last: str | None,
313
+ ) -> Tuple[int, int]:
314
+ path = paper_dir / CITATIONS_FILE
315
+ data = load_json(path)
316
+ if not isinstance(data, list):
317
+ return 0, 0
318
+ updated = 0
319
+ total = 0
320
+ for entry in data:
321
+ if not isinstance(entry, dict):
322
+ continue
323
+ ctxs = entry.get("contextsWithIntent") or []
324
+ if isinstance(ctxs, list):
325
+ dominant = _dominant_bracket(ctxs)
326
+ if dominant is None:
327
+ dominant = _single_bracket_candidate(ctxs)
328
+ upd, tot = _process_contexts(
329
+ ctxs,
330
+ author_patterns,
331
+ alias_patterns,
332
+ dominant,
333
+ first_author_last,
334
+ )
335
+ updated += upd
336
+ total += tot
337
+ save_json(path, data)
338
+ return updated, total
339
+
340
+
341
+ def update_usage_contexts_file(
342
+ paper_dir: Path,
343
+ author_patterns: List[re.Pattern],
344
+ alias_patterns: List[re.Pattern],
345
+ first_author_last: str | None,
346
+ ) -> Tuple[int, int]:
347
+ path = paper_dir / USAGE_CONTEXTS_FILE
348
+ data = load_json(path)
349
+ if not isinstance(data, dict):
350
+ return 0, 0
351
+ updated = 0
352
+ total = 0
353
+ for entry in data.get("citing_papers", []) or []:
354
+ if not isinstance(entry, dict):
355
+ continue
356
+ ctxs = entry.get("contexts") or []
357
+ if isinstance(ctxs, list):
358
+ dominant = _dominant_bracket(ctxs)
359
+ if dominant is None:
360
+ dominant = _single_bracket_candidate(ctxs)
361
+ upd, tot = _process_contexts(
362
+ ctxs,
363
+ author_patterns,
364
+ alias_patterns,
365
+ dominant,
366
+ first_author_last,
367
+ )
368
+ updated += upd
369
+ total += tot
370
+ save_json(path, data)
371
+ return updated, total
372
+
373
+
374
+ def main() -> None:
375
+ parser = argparse.ArgumentParser(
376
+ description="Replace citation mentions with <CITED HERE> in context fields."
377
+ )
378
+ parser.add_argument(
379
+ "--root",
380
+ type=str,
381
+ default="runs/processed_papers",
382
+ help="Root directory containing processed paper directories.",
383
+ )
384
+ parser.add_argument(
385
+ "--usage-contexts",
386
+ action="store_true",
387
+ help="Also update usage_contexts.json.",
388
+ )
389
+ args = parser.parse_args()
390
+
391
+ root = Path(args.root).expanduser().resolve()
392
+ if not root.exists():
393
+ raise SystemExit(f"Root directory does not exist: {root}")
394
+
395
+ paper_dirs = sorted(iter_paper_dirs(root), key=lambda p: p.name)
396
+ print(f"[INFO] Found {len(paper_dirs)} paper dirs under {root}")
397
+ total_updated = 0
398
+ total_contexts = 0
399
+ skipped_incomplete = 0
400
+
401
+ for paper_dir in paper_dirs:
402
+ if not _is_structurally_complete(paper_dir):
403
+ skipped_incomplete += 1
404
+ continue
405
+ meta = load_paper_metadata(paper_dir)
406
+ if not meta:
407
+ continue
408
+ author_patterns, alias_patterns, first_author_last = build_patterns(meta, paper_dir)
409
+ if not (author_patterns or alias_patterns):
410
+ continue
411
+ updated, total = update_citations_file(
412
+ paper_dir,
413
+ author_patterns,
414
+ alias_patterns,
415
+ first_author_last,
416
+ )
417
+ total_updated += updated
418
+ total_contexts += total
419
+ if args.usage_contexts:
420
+ upd_usage, tot_usage = update_usage_contexts_file(
421
+ paper_dir,
422
+ author_patterns,
423
+ alias_patterns,
424
+ first_author_last,
425
+ )
426
+ updated += upd_usage
427
+ total += tot_usage
428
+ total_updated += upd_usage
429
+ total_contexts += tot_usage
430
+ if total:
431
+ print(f"[OK] {paper_dir.name}: updated {updated} contexts over {total}")
432
+
433
+ print(
434
+ f"[SUMMARY] total_updated={total_updated} over {total_contexts}; "
435
+ f"skipped_incomplete={skipped_incomplete}"
436
+ )
437
+
438
+
439
+ if __name__ == "__main__":
440
+ main()
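A toy sketch of the core replacement helper in isolation; the regexes mirror what build_patterns emits, and the sentence, author name, and alias are made up.

import re

from replace_citation_markers import replace_with_marker

# Hypothetical patterns of the kind build_patterns() would produce for "Smith et al., 2023".
author_patterns = [re.compile(r"\bSmith\s+et\s+al\.?\s*(?:,\s*|\s*)2023[a-z]?", re.IGNORECASE)]
alias_patterns = [re.compile(r"\bSomeToolkit\b", re.IGNORECASE)]

text = "We adopt the SomeToolkit pipeline of Smith et al., 2023 for citation labelling."
marked, changed = replace_with_marker(
    text,
    author_patterns=author_patterns,
    alias_patterns=alias_patterns,
    first_author_last="Smith",
)
print(changed, marked)  # e.g. True, with the author mention collapsed to <CITED HERE>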
src/step_03_usage_contexts/build_usage_contexts.py ADDED
@@ -0,0 +1,184 @@
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List, Optional
5
+
6
+
7
+ PAPER_META_FILE = "paper_metadata.json"
8
+ CITATIONS_FILE = "citations_metadata.json"
9
+ DEFAULT_OUT_NAME = "usage_contexts.json"
10
+
11
+
12
+ def load_json(path: Path) -> Any | None:
13
+ if not path.exists():
14
+ return None
15
+ try:
16
+ return json.loads(path.read_text(encoding="utf-8"))
17
+ except Exception as e:
18
+ print(f"[WARN] could not parse JSON at {path}: {e}")
19
+ return None
20
+
21
+
22
+ def iter_paper_dirs(root: Path) -> List[Path]:
23
+ out: List[Path] = []
24
+ for child in root.iterdir():
25
+ if child.is_dir() and (child / PAPER_META_FILE).exists():
26
+ out.append(child)
27
+ return out
28
+
29
+
30
+ def _extract_contexts(item: Dict[str, Any]) -> List[Dict[str, Any]]:
31
+ contexts: List[Dict[str, Any]] = []
32
+
33
+ raw = item.get("contextsWithIntent") or []
34
+ if isinstance(raw, list) and raw:
35
+ for entry in raw:
36
+ if not isinstance(entry, dict):
37
+ continue
38
+ text_raw = (entry.get("context") or "").strip()
39
+ text = (entry.get("context_with_marker") or text_raw).strip()
40
+ intents = entry.get("intents") or []
41
+ contexts.append(
42
+ {
43
+ "text": text,
44
+ "text_raw": text_raw,
45
+ "intents": intents,
46
+ }
47
+ )
48
+
49
+ # Fallback for older schema that only stores raw context strings.
50
+ if not contexts:
51
+ raw_alt = item.get("contexts") or []
52
+ if isinstance(raw_alt, list):
53
+ for text in raw_alt:
54
+ if not isinstance(text, str):
55
+ continue
56
+ text = text.strip()
57
+ if text:
58
+ contexts.append(
59
+ {
60
+ "text": text,
61
+ "intents": [],
62
+ }
63
+ )
64
+
65
+ return contexts
66
+
67
+
68
+ def build_usage_contexts_for_paper(paper_dir: Path) -> Optional[Dict[str, Any]]:
69
+ citations_path = paper_dir / CITATIONS_FILE
70
+ data = load_json(citations_path)
71
+ if data is None:
72
+ return None
73
+
74
+ if not isinstance(data, list):
75
+ print(f"[WARN] {paper_dir.name}: {CITATIONS_FILE} is not a list")
76
+ return None
77
+
78
+ citing_entries: List[Dict[str, Any]] = []
79
+ total_contexts = 0
80
+ citing_with_context = 0
81
+ influential_citations = 0
82
+ influential_with_context = 0
83
+ influential_contexts: List[Dict[str, Any]] = []
84
+
85
+ for item in data:
86
+ if not isinstance(item, dict):
87
+ continue
88
+ citing = item.get("citingPaper") or {}
89
+
90
+ contexts = _extract_contexts(item)
91
+ is_influential = bool(item.get("isInfluential", False))
92
+ if is_influential:
93
+ influential_citations += 1
94
+ if contexts:
95
+ citing_with_context += 1
96
+ total_contexts += len(contexts)
97
+ if is_influential:
98
+ influential_with_context += 1
99
+
100
+ citing_entries.append(
101
+ {
102
+ "citing_paper_id": citing.get("paperId"),
103
+ "title": citing.get("title"),
104
+ "external_ids": citing.get("externalIds") or {},
105
+ "is_influential": is_influential,
106
+ "contexts": contexts,
107
+ }
108
+ )
109
+ if is_influential and contexts:
110
+ influential_contexts.append(
111
+ {
112
+ "citing_paper_id": citing.get("paperId"),
113
+ "title": citing.get("title"),
114
+ "external_ids": citing.get("externalIds") or {},
115
+ "contexts": contexts,
116
+ }
117
+ )
118
+
119
+ payload = {
120
+ "paper_id": paper_dir.name,
121
+ "total_citations": len(data),
122
+ "num_contexts": total_contexts,
123
+ "num_citing_with_context": citing_with_context,
124
+ "num_citing_without_context": len(data) - citing_with_context,
125
+ "num_influential_citations": influential_citations,
126
+ "num_influential_with_context": influential_with_context,
127
+ "influential_contexts": influential_contexts,
128
+ "citing_papers": citing_entries,
129
+ }
130
+ return payload
131
+
132
+
133
+ def run(root: Path, out_name: str, overwrite: bool) -> None:
134
+ root = root.resolve()
135
+ if not root.exists():
136
+ raise SystemExit(f"Root directory does not exist: {root}")
137
+
138
+ paper_dirs = sorted(iter_paper_dirs(root), key=lambda p: p.name)
139
+ print(f"[INFO] Found {len(paper_dirs)} paper dirs under {root}")
140
+
141
+ for paper_dir in paper_dirs:
142
+ out_path = paper_dir / out_name
143
+ if out_path.exists() and not overwrite:
144
+ print(f"[SKIP] {paper_dir.name}: {out_name} already exists")
145
+ continue
146
+ payload = build_usage_contexts_for_paper(paper_dir)
147
+ if payload is None:
148
+ continue
149
+
150
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
151
+ print(
152
+ f"[OK] {paper_dir.name}: wrote {out_name} "
153
+ f"({payload['num_contexts']} contexts from {payload['total_citations']} citations)"
154
+ )
155
+
156
+
157
+ def main() -> None:
158
+ parser = argparse.ArgumentParser(
159
+ description="Build usage_contexts.json from citations_metadata.json files."
160
+ )
161
+ parser.add_argument(
162
+ "--root",
163
+ type=str,
164
+ default="processed_papers/acl_2024",
165
+ help="Root directory containing processed_papers/acl_2024/<paper_id> dirs.",
166
+ )
167
+ parser.add_argument(
168
+ "--out-name",
169
+ type=str,
170
+ default=DEFAULT_OUT_NAME,
171
+ help="Output filename to write inside each paper dir.",
172
+ )
173
+ parser.add_argument(
174
+ "--overwrite",
175
+ action="store_true",
176
+ help="Overwrite existing usage_contexts.json files.",
177
+ )
178
+ args = parser.parse_args()
179
+
180
+ run(Path(args.root), out_name=args.out_name, overwrite=args.overwrite)
181
+
182
+
183
+ if __name__ == "__main__":
184
+ main()
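A one-call sketch of running this step programmatically instead of via the CLI; the root path is illustrative.

from pathlib import Path

from build_usage_contexts import DEFAULT_OUT_NAME, run

# Writes usage_contexts.json into every paper directory that has citations_metadata.json.
run(Path("runs/processed_papers"), out_name=DEFAULT_OUT_NAME, overwrite=False)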
src/step_04_label_citations/label_citation_functions.py ADDED
@@ -0,0 +1,373 @@
1
+ import argparse
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List
6
+
7
+ DEEP_CITATION_ROOT = Path(__file__).resolve().parents[2] / "Deep-Citation"
8
+ if not DEEP_CITATION_ROOT.exists():
9
+ raise SystemExit(f"Deep-Citation repo not found at {DEEP_CITATION_ROOT}")
10
+
11
+ sys.path.insert(0, str(DEEP_CITATION_ROOT))
12
+
13
+ from data import CollateFn, create_data_channels
14
+ from Model import MultiHeadLanguageModel
15
+ import torch
16
+ from torch.utils.data import DataLoader
17
+
18
+
19
+ PAPER_META_FILE = "paper_metadata.json"
20
+ USAGE_CONTEXTS_FILE = "usage_contexts.json"
21
+ OUT_FILE = "usage_context_labels.json"
22
+
23
+ LABEL_SET = [
24
+ "Background",
25
+ "Uses",
26
+ "Extends",
27
+ "CompareOrContrast",
28
+ "Motivation",
29
+ "Future",
30
+ ]
31
+
32
+
33
+ def load_json(path: Path) -> Any | None:
34
+ if not path.exists():
35
+ return None
36
+ try:
37
+ return json.loads(path.read_text(encoding="utf-8"))
38
+ except Exception:
39
+ return None
40
+
41
+
42
+ def iter_paper_dirs(root: Path) -> List[Path]:
43
+ out: List[Path] = []
44
+ for child in root.iterdir():
45
+ if child.is_dir() and (child / PAPER_META_FILE).exists():
46
+ out.append(child)
47
+ return out
48
+
49
+
50
+ def flatten_contexts(usage: Dict[str, Any]) -> List[Dict[str, Any]]:
51
+ contexts: List[Dict[str, Any]] = []
52
+ idx = 1
53
+ for entry in usage.get("citing_papers", []) or []:
54
+ if not isinstance(entry, dict):
55
+ continue
56
+ citing_title = entry.get("title") or "Unknown citing paper"
57
+ citing_paper_id = entry.get("citing_paper_id") or ""
58
+ for c in entry.get("contexts", []) or []:
59
+ if not isinstance(c, dict):
60
+ continue
61
+ text = (c.get("text") or "").strip()
62
+ if not text:
63
+ continue
64
+ contexts.append(
65
+ {
66
+ "id": idx,
67
+ "text": text,
68
+ "citing_title": citing_title,
69
+ "citing_paper_id": citing_paper_id,
70
+ }
71
+ )
72
+ idx += 1
73
+ return contexts
74
+
75
+
76
+ def _resolve_model_name(lm: str) -> str:
77
+ if lm == "scibert":
78
+ return "allenai/scibert_scivocab_uncased"
79
+ if lm == "bert":
80
+ return "bert-base-uncased"
81
+ if lm == "deberta":
82
+ return "microsoft/deberta-v3-base"
83
+ if lm == "deberta-large":
84
+ return "microsoft/deberta-v3-large"
85
+ return lm
86
+
87
+
88
+ def _infer_head_sizes(state_dict: Dict[str, Any]) -> List[int]:
89
+ head_weights = [
90
+ (k, v) for k, v in state_dict.items() if k.startswith("lns.") and k.endswith(".weight")
91
+ ]
92
+ head_weights.sort(key=lambda x: int(x[0].split(".")[1]))
93
+ return [int(weight.shape[0]) for _, weight in head_weights]
94
+
95
+
96
+ class _ContextDataset:
97
+ def __init__(self, texts: List[str]):
98
+ self.texts = texts
99
+
100
+ def __len__(self) -> int:
101
+ return len(self.texts)
102
+
103
+ def __getitem__(self, idx: int):
104
+ return (self.texts[idx], torch.tensor(0), torch.tensor(0))
105
+
106
+
107
+ def label_with_model(
108
+ contexts: List[Dict[str, Any]],
109
+ model_path: Path,
110
+ data_dir: Path,
111
+ class_definition: Path,
112
+ lm: str,
113
+ device: str,
114
+ batch_size: int,
115
+ ) -> Dict[int, Dict[str, Any]]:
116
+ data_file = data_dir / "acl.tsv"
117
+ train_data, _, _, label_names = create_data_channels(
118
+ str(data_file),
119
+ str(class_definition),
120
+ lmbd=1.0,
121
+ )
122
+ modelname = _resolve_model_name(lm)
123
+ state_dict = torch.load(model_path, map_location=device)
124
+ head_sizes = _infer_head_sizes(state_dict)
125
+ model = MultiHeadLanguageModel(
126
+ modelname=modelname,
127
+ device=device,
128
+ readout="ch",
129
+ num_classes=head_sizes,
130
+ ).to(device)
131
+ model.load_state_dict(state_dict)
132
+ model.eval()
133
+
134
+ collate_fn = CollateFn(
135
+ modelname=modelname,
136
+ class_definitions=train_data.class_definitions,
137
+ instance_weights=False,
138
+ )
139
+ dataset = _ContextDataset([ctx["text"] for ctx in contexts])
140
+ loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
141
+
142
+ outputs: Dict[int, Dict[str, Any]] = {}
143
+ idx_offset = 0
144
+ with torch.no_grad():
145
+ for batched_text, labels, ds_indices, class_tokens, class_ds_indices in loader:
146
+ ds_indices = ds_indices.to(device)
147
+ class_ds_indices = class_ds_indices.to(device)
148
+ logits = model(batched_text, ds_indices, class_tokens, class_ds_indices)[0]
149
+ probs = torch.softmax(logits, dim=1)
150
+ preds = logits.argmax(dim=1).cpu().tolist()
151
+ pred_confidences = probs.max(dim=1).values.cpu().tolist()
152
+ top2 = torch.topk(probs, k=2, dim=1).values.cpu()
153
+ margins = (top2[:, 0] - top2[:, 1]).tolist()
154
+ for i, pred in enumerate(preds):
155
+ raw_label = label_names[pred]
156
+ outputs[idx_offset + i + 1] = {
157
+ "id": idx_offset + i + 1,
158
+ "label": raw_label,
159
+ "confidence": float(pred_confidences[i]),
160
+ "confidence_margin": float(margins[i]),
161
+ "cue_span": "",
162
+ "rationale": "scibert_model",
163
+ }
164
+ idx_offset += len(preds)
165
+ return outputs
166
+
167
+
168
+ def aggregate_citing_labels(labels: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
169
+ by_citing: Dict[str, List[Dict[str, Any]]] = {}
170
+ for item in labels:
171
+ citing_id = item.get("citing_paper_id") or ""
172
+ by_citing.setdefault(citing_id, []).append(item)
173
+
174
+ aggregated: List[Dict[str, Any]] = []
175
+ for citing_id, items in by_citing.items():
176
+ title = items[0].get("citing_title", "")
177
+ labels_set = {it.get("label") for it in items}
178
+
179
+ if "Extends" in labels_set:
180
+ label = "Extends"
181
+ evidence_ids = [it["id"] for it in items if it.get("label") == "Extends"]
182
+ elif "Uses" in labels_set:
183
+ label = "Uses"
184
+ evidence_ids = [it["id"] for it in items if it.get("label") == "Uses"]
185
+ elif "CompareOrContrast" in labels_set:
186
+ label = "CompareOrContrast"
187
+ evidence_ids = [
188
+ it["id"] for it in items if it.get("label") == "CompareOrContrast"
189
+ ]
190
+ else:
191
+ label = "Background"
192
+ evidence_ids = []
193
+
194
+ aggregated.append(
195
+ {
196
+ "citing_paper_id": citing_id,
197
+ "citing_title": title,
198
+ "label": label,
199
+ "evidence_context_ids": evidence_ids,
200
+ }
201
+ )
202
+
203
+ return aggregated
204
+
205
+
206
+ def aggregate_final_label(citing_labels: List[Dict[str, Any]]) -> str:
207
+ labels_set = {item.get("label") for item in citing_labels}
208
+ if "Extends" in labels_set:
209
+ return "Extends"
210
+ if "Uses" in labels_set:
211
+ return "Uses"
212
+ if "CompareOrContrast" in labels_set:
213
+ return "CompareOrContrast"
214
+ return "Background"
215
+
216
+
217
+ def score_for_paper(
218
+ paper_dir: Path,
219
+ batch_size: int,
220
+ overwrite: bool,
221
+ model_path: Path,
222
+ model_data_dir: Path,
223
+ model_class_def: Path,
224
+ model_lm: str,
225
+ device: str,
226
+ ) -> str:
227
+ usage_path = paper_dir / USAGE_CONTEXTS_FILE
228
+ usage = load_json(usage_path)
229
+ if not isinstance(usage, dict):
230
+ return "missing_usage"
231
+
232
+ contexts = flatten_contexts(usage)
233
+ if not contexts:
234
+ return "empty_contexts"
235
+
236
+ out_path = paper_dir / OUT_FILE
237
+ if out_path.exists() and not overwrite:
238
+ return "skipped"
239
+
240
+ labeled = label_with_model(
241
+ contexts=contexts,
242
+ model_path=model_path,
243
+ data_dir=model_data_dir,
244
+ class_definition=model_class_def,
245
+ lm=model_lm,
246
+ device=device,
247
+ batch_size=batch_size,
248
+ )
249
+
250
+ labels_sorted = []
251
+ for context in contexts:
252
+ context_id = context["id"]
253
+ item = labeled.get(context_id)
254
+ if not item:
255
+ item = {
256
+ "id": context_id,
257
+ "label": "Background",
258
+ "confidence": 0.0,
259
+ "cue_span": "",
260
+ "rationale": "missing label",
261
+ }
262
+ item = dict(item)
263
+ item["citing_paper_id"] = context.get("citing_paper_id", "")
264
+ item["citing_title"] = context.get("citing_title", "")
265
+ item["text"] = context.get("text", "")
266
+ labels_sorted.append(item)
267
+
268
+ citing_labels = aggregate_citing_labels(labels_sorted)
269
+ payload = {
270
+ "paper_id": usage.get("paper_id"),
271
+ "num_contexts": len(contexts),
272
+ "label_set": LABEL_SET,
273
+ "labels": labels_sorted,
274
+ "citing_paper_labels": citing_labels,
275
+ "final_label": aggregate_final_label(citing_labels),
276
+ }
277
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
278
+ return "labeled"
279
+
280
+
281
+ def main() -> None:
282
+ parser = argparse.ArgumentParser(
283
+ description="Label citation functions using a Deep-Citation checkpoint."
284
+ )
285
+ parser.add_argument(
286
+ "--root",
287
+ type=str,
288
+ default="runs/processed_papers",
289
+ help="Root directory containing processed paper directories.",
290
+ )
291
+ parser.add_argument(
292
+ "--batch-size",
293
+ type=int,
294
+ default=32,
295
+ help="Batch size for model inference.",
296
+ )
297
+ parser.add_argument(
298
+ "--overwrite",
299
+ action="store_true",
300
+ help="Overwrite existing usage_context_labels.json files.",
301
+ )
302
+ parser.add_argument(
303
+ "--model-path",
304
+ type=str,
305
+ required=True,
306
+ help="Path to Deep-Citation best_model.pt checkpoint.",
307
+ )
308
+ parser.add_argument(
309
+ "--model-data-dir",
310
+ type=str,
311
+ default="Deep-Citation/Data",
312
+ help="Deep-Citation data directory (for label order).",
313
+ )
314
+ parser.add_argument(
315
+ "--model-class-def",
316
+ type=str,
317
+ default="Deep-Citation/Data/class_def.json",
318
+ help="Deep-Citation class_def.json path.",
319
+ )
320
+ parser.add_argument(
321
+ "--model-lm",
322
+ type=str,
323
+ default="scibert",
324
+ help="Model name used for the Deep-Citation checkpoint.",
325
+ )
326
+ parser.add_argument(
327
+ "--device",
328
+ type=str,
329
+ default="cuda",
330
+ help="Device for model inference (cuda/cpu).",
331
+ )
332
+ args = parser.parse_args()
333
+
334
+ model_path = Path(args.model_path).expanduser().resolve()
335
+ if not model_path.exists():
336
+ raise SystemExit(f"Model path does not exist: {model_path}")
337
+
338
+ root = Path(args.root).expanduser().resolve()
339
+ if not root.exists():
340
+ raise SystemExit(f"Root directory does not exist: {root}")
341
+
342
+ paper_dirs = sorted(iter_paper_dirs(root), key=lambda p: p.name)
343
+ print(f"[INFO] Found {len(paper_dirs)} paper dirs under {root}")
344
+
345
+ counts = {
346
+ "labeled": 0,
347
+ "skipped": 0,
348
+ "missing_usage": 0,
349
+ "empty_contexts": 0,
350
+ }
351
+
352
+ for paper_dir in paper_dirs:
353
+ status = score_for_paper(
354
+ paper_dir,
355
+ args.batch_size,
356
+ args.overwrite,
357
+ model_path=model_path,
358
+ model_data_dir=Path(args.model_data_dir).expanduser().resolve(),
359
+ model_class_def=Path(args.model_class_def).expanduser().resolve(),
360
+ model_lm=args.model_lm,
361
+ device=args.device,
362
+ )
363
+ counts[status] = counts.get(status, 0) + 1
364
+ print(f"[{status.upper()}] {paper_dir.name}")
365
+
366
+ print(
367
+ "[SUMMARY] labeled={labeled}, skipped={skipped}, missing_usage={missing_usage}, "
368
+ "empty_contexts={empty_contexts}".format(**counts)
369
+ )
370
+
371
+
372
+ if __name__ == "__main__":
373
+ main()
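A toy sketch of the label-precedence logic (Extends > Uses > CompareOrContrast > Background) used when collapsing per-context labels into per-citing-paper labels; importing this module pulls in torch and the Deep-Citation checkout, so this assumes both are available. The ids and titles are made up.

from label_citation_functions import aggregate_citing_labels, aggregate_final_label

# Toy per-context labels: citing paper p1 has both a Uses and an Extends context.
labels = [
    {"id": 1, "citing_paper_id": "p1", "citing_title": "Paper One", "label": "Uses"},
    {"id": 2, "citing_paper_id": "p1", "citing_title": "Paper One", "label": "Extends"},
    {"id": 3, "citing_paper_id": "p2", "citing_title": "Paper Two", "label": "Background"},
]

per_citing = aggregate_citing_labels(labels)  # p1 -> Extends (evidence id 2), p2 -> Background
print(aggregate_final_label(per_citing))      # "Extends"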
src/step_05_verify_uses_extends/prompts.py ADDED
@@ -0,0 +1,115 @@
1
+ from typing import Dict, List
2
+
3
+
4
+ USES_DEFINITION = (
5
+ "USES: The CITING_PAPER explicitly uses/adopts/evaluates on/includes/relies on "
6
+ "a dataset, benchmark, method, tool, or reported results from TARGET_PAPER "
7
+ "as part of the CITING_PAPER's own methodology or evaluation."
8
+ )
9
+
10
+ EXTENDS_DEFINITION = (
11
+ "EXTENDS: The CITING_PAPER explicitly extends/modifies/adapts/builds upon "
12
+ "TARGET_PAPER's method/dataset/benchmark/tool."
13
+ )
14
+
15
+ NOTES_DEFINITION = (
16
+ "NOT USES/EXTENDS: Merely describing what TARGET_PAPER introduces/offers/proposes "
17
+ "or listing it among related work or benchmarks (without stating adoption). "
18
+ "If no explicit adoption/extension cue is present, label NOT_CONFIRMED."
19
+ )
20
+
21
+
22
+ FEW_SHOT_USES = [
23
+ "We use the same splits as <CITED HERE> .",
24
+ "The Praat tool was used ( <CITED HERE> ) .",
25
+ "CCGBank ( <CITED HERE> ) is used to train the model .",
26
+ "This design idea was adopted from TANKA ( <CITED HERE>b ) .",
27
+ "Our strategy is based on the approach presented by <CITED HERE> .",
28
+ ]
29
+
30
+ FEW_SHOT_EXTENDS = [
31
+ "The features can be easily obtained by modifying the TAT extraction algorithm described in ( <CITED HERE> ) .",
32
+ "Our own work ( <CITED HERE> ) extends the first idea to paraphrase fragment extraction on monolingual parallel and comparable corpora .",
33
+ "This article represents an extension of our previous work on unsupervised event coreference resolution ( Bejan et al. 2009 ; <CITED HERE> ) .",
34
+ "This evaluation set-up is an improvement versus the one we previously reported ( <CITED HERE> ) , in which fixed partitions were used for training , development , and testing .",
35
+ "The computational treatment of lexical rules proposed can be seen as an extension to the principled method discussed by Gotz and <CITED HERE> , 1996 , 1997b ) for encoding the main building block of HPSG grammars -- the implicative constraints -- as a logic program .",
36
+ ]
37
+
38
+ FEW_SHOT_NOT_CONFIRMED = [
39
+ "<CITED HERE> introduced factored SMT .",
40
+ "See ( <CITED HERE> ) for a discussion .",
41
+ "See , among others , ( <CITED HERE> ) .",
42
+ "<CITED HERE> reported a correlation of r = .69 .",
43
+ "See <CITED HERE> for further discussion .",
44
+ ]
45
+
46
+
47
+ def build_uses_extends_verification_prompt(
48
+ target_info: Dict[str, str],
49
+ candidates: List[Dict[str, str]],
50
+ ) -> str:
51
+ header = [
52
+ "You are verifying citation function for a TARGET paper inside a citing sentence.",
53
+ "Be strict: lists of related work or benchmarks are NOT USES/EXTENDS unless there is an explicit action",
54
+ "like \"use\", \"build on\", \"adopt\", \"extend\", \"based on\", \"trained on\", \"evaluate on\", \"implement\".",
55
+ "",
56
+ "Actor test (CRITICAL for USES/EXTENSION):",
57
+ "- Only label USES or EXTENSION if the ACTION is performed by the CITING_PAPER.",
58
+ "- The cue_span for USES/EXTENSION must include an explicit citing-paper actor phrase such as:",
59
+ " \"we\", \"our\", \"in this work\", \"in this paper\", \"we use\", \"we evaluate\",",
60
+ " \"our evaluation includes\", \"we extend\", \"we build on\", \"we adapt\".",
61
+ "- If the context says the TARGET_PAPER (or some other paper/system) uses/extends something",
62
+ " (e.g., \"TARGET_PAPER uses...\", \"TARGET_PAPER extends...\"),",
63
+ " then it is NOT USES/EXTENSION. Label NOT_CONFIRMED.",
64
+ "",
65
+ "Task: Label each sentence as USES, EXTENDS, or NOT_CONFIRMED.",
66
+ "Return JSON only with one entry per input sentence.",
67
+ "",
68
+ "Definitions:",
69
+ f"- {USES_DEFINITION}",
70
+ f"- {EXTENDS_DEFINITION}",
71
+ f"- {NOTES_DEFINITION}",
72
+ "",
73
+ "Output rules:",
74
+ "- label must be one of: USES, EXTENDS, NOT_CONFIRMED",
75
+ "- cue_span: exact substring from the sentence that justifies USES/EXTENDS, else empty",
76
+ "- rationale: one short sentence",
77
+ "- If cue_span is empty => label must be NOT_CONFIRMED",
78
+ "",
79
+ "Few-shot examples:",
80
+ "USES:",
81
+ ]
82
+ for ex in FEW_SHOT_USES:
83
+ header.append(f"- {ex}")
84
+ header.append("EXTENDS:")
85
+ for ex in FEW_SHOT_EXTENDS:
86
+ header.append(f"- {ex}")
87
+ header.append("NOT_CONFIRMED:")
88
+ for ex in FEW_SHOT_NOT_CONFIRMED:
89
+ header.append(f"- {ex}")
90
+
91
+ header.extend(
92
+ [
93
+ "",
94
+ "TARGET_PAPER:",
95
+ f"- title: {target_info.get('title', '')}",
96
+ f"- first_author_last: {target_info.get('first_author_last', '')}",
97
+ f"- year: {target_info.get('year', '')}",
98
+ "",
99
+ "CANDIDATES:",
100
+ ]
101
+ )
102
+
103
+ for item in candidates:
104
+ header.extend(
105
+ [
106
+ f"ID: {item['id']}",
107
+ f"Citing paper: {item.get('citing_title', '')}",
108
+ f"Sentence: {item.get('text', '')}",
109
+ "",
110
+ ]
111
+ )
112
+
113
+ header.append("JSON OUTPUT:")
114
+ header.append("{\"labels\": [{\"id\": 1, \"label\": \"USES\", \"cue_span\": \"...\", \"rationale\": \"...\"}]}")
115
+ return "\n".join(header)
src/step_05_verify_uses_extends/schemas.py ADDED
@@ -0,0 +1,22 @@
1
+ USES_EXTENDS_VERIFICATION_JSON_SCHEMA = {
2
+ "type": "object",
3
+ "properties": {
4
+ "labels": {
5
+ "type": "array",
6
+ "items": {
7
+ "type": "object",
8
+ "properties": {
9
+ "id": {"type": "integer"},
10
+ "label": {
11
+ "type": "string",
12
+ "enum": ["USES", "EXTENDS", "NOT_CONFIRMED"],
13
+ },
14
+ "cue_span": {"type": "string"},
15
+ "rationale": {"type": "string"},
16
+ },
17
+ "required": ["id", "label", "cue_span", "rationale"],
18
+ },
19
+ },
20
+ },
21
+ "required": ["labels"],
22
+ }
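An example response object of the shape this schema describes; jsonschema is not a declared dependency here, so the validation call is optional and only illustrative.

from schemas import USES_EXTENDS_VERIFICATION_JSON_SCHEMA

example = {
    "labels": [
        {"id": 1, "label": "USES", "cue_span": "we use the same splits", "rationale": "Explicit adoption by the citing paper."},
        {"id": 2, "label": "NOT_CONFIRMED", "cue_span": "", "rationale": "Target is only listed among related work."},
    ]
}

try:
    import jsonschema  # optional check of the example against the schema
    jsonschema.validate(example, USES_EXTENDS_VERIFICATION_JSON_SCHEMA)
except ImportError:
    pass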
src/step_05_verify_uses_extends/verify_uses_extends.py ADDED
@@ -0,0 +1,296 @@
1
+ import argparse
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List
6
+
7
+ SRC_ROOT = Path(__file__).resolve().parents[1]
8
+ if str(SRC_ROOT) not in sys.path:
9
+ sys.path.insert(0, str(SRC_ROOT))
10
+
11
+ from common.llm_client import LLMClient
12
+
13
+ from prompts import build_uses_extends_verification_prompt
14
+ from schemas import USES_EXTENDS_VERIFICATION_JSON_SCHEMA
15
+
16
+
17
+ PAPER_META_FILE = "paper_metadata.json"
18
+ USAGE_LABELS_FILE = "usage_context_labels.json"
19
+ OUT_FILE = "usage_uses_extends_verified.json"
20
+
21
+ USE_LABELS = {"Uses", "Extends"}
22
+
23
+
24
+ def load_json(path: Path) -> Any | None:
25
+ if not path.exists():
26
+ return None
27
+ try:
28
+ return json.loads(path.read_text(encoding="utf-8"))
29
+ except Exception:
30
+ return None
31
+
32
+
33
+ def iter_paper_dirs(root: Path) -> List[Path]:
34
+ out: List[Path] = []
35
+ for child in root.iterdir():
36
+ if child.is_dir() and (child / PAPER_META_FILE).exists():
37
+ out.append(child)
38
+ return out
39
+
40
+
41
+ def _normalize_author_last(name: str) -> str:
42
+ parts = [p for p in (name or "").split() if p.strip()]
43
+ return parts[-1] if parts else ""
44
+
45
+
46
+ def extract_target_info(meta: Any) -> Dict[str, str]:
47
+ if isinstance(meta, list) and meta:
48
+ meta = meta[0]
49
+ if not isinstance(meta, dict):
50
+ return {"title": "", "first_author_last": "", "year": ""}
51
+ authors = meta.get("authors") or []
52
+ first_author = authors[0]["name"] if authors else ""
53
+ return {
54
+ "title": meta.get("title", ""),
55
+ "first_author_last": _normalize_author_last(first_author),
56
+ "year": str(meta.get("year", "")),
57
+ }
58
+
59
+
60
+ def verify_candidates(
61
+ client: LLMClient,
62
+ target_info: Dict[str, str],
63
+ candidates: List[Dict[str, Any]],
64
+ ) -> List[Dict[str, Any]]:
65
+ prompt = build_uses_extends_verification_prompt(target_info, candidates)
66
+ try:
67
+ raw = client.call(prompt, schema=USES_EXTENDS_VERIFICATION_JSON_SCHEMA)
68
+ except Exception as exc:
69
+ print(f"[WARN] LLM call failed: {exc}. Marking all candidates NOT_CONFIRMED.")
70
+ return [
71
+ {
72
+ "id": item.get("id"),
73
+ "label": "NOT_CONFIRMED",
74
+ "cue_span": "",
75
+ "rationale": "",
76
+ "text": item.get("text", ""),
77
+ "citing_paper_id": item.get("citing_paper_id", ""),
78
+ "citing_title": item.get("citing_title", ""),
79
+ "original_label": item.get("original_label", ""),
80
+ }
81
+ for item in candidates
82
+ ]
83
+ data = _parse_llm_json(raw)
84
+ if not isinstance(data, dict):
85
+ print("[WARN] Failed to parse LLM JSON response; marking all candidates NOT_CONFIRMED.")
86
+ return [
87
+ {
88
+ "id": item.get("id"),
89
+ "label": "NOT_CONFIRMED",
90
+ "cue_span": "",
91
+ "rationale": "",
92
+ "text": item.get("text", ""),
93
+ "citing_paper_id": item.get("citing_paper_id", ""),
94
+ "citing_title": item.get("citing_title", ""),
95
+ "original_label": item.get("original_label", ""),
96
+ }
97
+ for item in candidates
98
+ ]
99
+ labels = data.get("labels", [])
100
+ by_id = {item.get("id"): item for item in labels if isinstance(item, dict)}
101
+
102
+ verified: List[Dict[str, Any]] = []
103
+ for candidate in candidates:
104
+ item_id = candidate["id"]
105
+ model = by_id.get(item_id, {})
106
+ label = model.get("label", "NOT_CONFIRMED")
107
+ cue_span = model.get("cue_span", "")
108
+ if not cue_span:
109
+ label = "NOT_CONFIRMED"
110
+ verified.append(
111
+ {
112
+ "id": item_id,
113
+ "label": label,
114
+ "cue_span": cue_span,
115
+ "rationale": model.get("rationale", ""),
116
+ "text": candidate.get("text", ""),
117
+ "citing_paper_id": candidate.get("citing_paper_id", ""),
118
+ "citing_title": candidate.get("citing_title", ""),
119
+ "original_label": candidate.get("original_label", ""),
120
+ }
121
+ )
122
+ return verified
123
+
124
+
125
+ def _parse_llm_json(raw: str) -> Any | None:
126
+ try:
127
+ return json.loads(raw)
128
+ except json.JSONDecodeError:
129
+ pass
130
+
131
+ cleaned = raw.strip()
132
+ if cleaned.startswith("```"):
133
+ cleaned = cleaned.strip("`")
134
+ cleaned = cleaned.replace("json", "", 1).strip()
135
+
136
+ start = cleaned.find("{")
137
+ end = cleaned.rfind("}")
138
+ if start == -1 or end == -1 or end <= start:
139
+ return None
140
+
141
+ snippet = cleaned[start : end + 1]
142
+ try:
143
+ return json.loads(snippet)
144
+ except json.JSONDecodeError:
145
+ return None
146
+
147
+
148
+ def process_paper(
149
+ paper_dir: Path,
150
+ client: LLMClient,
151
+ k: int,
152
+ batch_size: int,
153
+ overwrite: bool,
154
+ resume: bool,
155
+ ) -> str:
156
+ labels_path = paper_dir / USAGE_LABELS_FILE
157
+ payload = load_json(labels_path)
158
+ if not isinstance(payload, dict):
159
+ return "missing_labels"
160
+
161
+ out_path = paper_dir / OUT_FILE
162
+ if out_path.exists() and (resume or not overwrite):
163
+ return "skipped"
164
+
165
+ labels = payload.get("labels", [])
166
+ candidates_all = []
167
+ for item in labels:
168
+ if item.get("label") in USE_LABELS:
169
+ candidates_all.append(
170
+ {
171
+ "id": item.get("id"),
172
+ "text": item.get("text", ""),
173
+ "citing_paper_id": item.get("citing_paper_id", ""),
174
+ "citing_title": item.get("citing_title", ""),
175
+ "original_label": item.get("label"),
176
+ "confidence": float(item.get("confidence", 0.0) or 0.0),
177
+ }
178
+ )
179
+
180
+ if not candidates_all:
181
+ result = {
182
+ "paper_id": payload.get("paper_id"),
183
+ "target": {},
184
+ "candidates_total": 0,
185
+ "candidates_considered": 0,
186
+ "verified": [],
187
+ "confirmed": [],
188
+ }
189
+ out_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
190
+ return "no_candidates"
191
+
192
+ # Keep top-k highest-confidence USES/EXTENDS contexts for LLM verification.
193
+ # If k <= 0, verify all candidates.
194
+ candidates_all = sorted(
195
+ candidates_all,
196
+ key=lambda x: x.get("confidence", 0.0),
197
+ reverse=True,
198
+ )
199
+ candidates = candidates_all if k <= 0 else candidates_all[:k]
200
+
201
+ target_info = extract_target_info(load_json(paper_dir / PAPER_META_FILE))
202
+ verified: List[Dict[str, Any]] = []
203
+ if batch_size <= 0:
204
+ batch_size = 25
205
+ for i in range(0, len(candidates), batch_size):
206
+ batch = candidates[i : i + batch_size]
207
+ verified.extend(verify_candidates(client, target_info, batch))
208
+ confirmed = [v for v in verified if v["label"] in {"USES", "EXTENDS"}]
209
+ if any(item["label"] == "EXTENDS" for item in confirmed):
210
+ final_label = "EXTENDS"
211
+ elif confirmed:
212
+ final_label = "USES"
213
+ else:
214
+ final_label = "NOT_CONFIRMED"
215
+
216
+ result = {
217
+ "paper_id": payload.get("paper_id"),
218
+ "target": target_info,
219
+ "candidates_total": len(candidates_all),
220
+ "candidates_considered": len(candidates),
221
+ "verification_batch_size": int(batch_size),
222
+ "verification_num_batches": (len(candidates) + batch_size - 1) // batch_size if candidates else 0,
223
+ "candidates_selected": len(confirmed),
224
+ "verified": verified,
225
+ "confirmed": confirmed,
226
+ "confirmed_extends": sum(1 for x in confirmed if x.get("label") == "EXTENDS"),
227
+ "confirmed_uses": sum(1 for x in confirmed if x.get("label") == "USES"),
228
+ "final_label": final_label,
229
+ }
230
+ out_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
231
+ return "verified"
232
+
233
+
234
+ def main() -> None:
235
+ parser = argparse.ArgumentParser(
236
+ description="Verify USES/EXTENDS candidates via LLM and select top-K."
237
+ )
238
+ parser.add_argument(
239
+ "--root",
240
+ type=str,
241
+ default="runs/processed_papers",
242
+ help="Root directory containing processed paper directories.",
243
+ )
244
+ parser.add_argument(
245
+ "--k",
246
+ type=int,
247
+ default=0,
248
+ help="Verify top-k USES/EXTENDS candidates ranked by classifier confidence (<=0 means all).",
249
+ )
250
+ parser.add_argument(
251
+ "--batch-size",
252
+ type=int,
253
+ default=25,
254
+ help="Number of candidates per LLM verification batch.",
255
+ )
256
+ parser.add_argument(
257
+ "--overwrite",
258
+ action="store_true",
259
+ help="Overwrite existing usage_uses_extends_verified.json files.",
260
+ )
261
+ parser.add_argument(
262
+ "--resume",
263
+ action="store_true",
264
+ help="Skip papers with existing output files (even if --overwrite is set).",
265
+ )
266
+ args = parser.parse_args()
267
+
268
+ root = Path(args.root).expanduser().resolve()
269
+ if not root.exists():
270
+ raise SystemExit(f"Root directory does not exist: {root}")
271
+
272
+ client = LLMClient()
273
+ paper_dirs = sorted(iter_paper_dirs(root), key=lambda p: p.name)
274
+ print(f"[INFO] Found {len(paper_dirs)} paper dirs under {root}")
275
+
276
+ counts = {"verified": 0, "skipped": 0, "missing_labels": 0, "no_candidates": 0}
277
+ for paper_dir in paper_dirs:
278
+ status = process_paper(
279
+ paper_dir,
280
+ client,
281
+ args.k,
282
+ args.batch_size,
283
+ args.overwrite,
284
+ args.resume,
285
+ )
286
+ counts[status] = counts.get(status, 0) + 1
287
+ print(f"[{status.upper()}] {paper_dir.name}")
288
+
289
+ print(
290
+ "[SUMMARY] verified={verified}, skipped={skipped}, missing_labels={missing_labels}, "
291
+ "no_candidates={no_candidates}".format(**counts)
292
+ )
293
+
294
+
295
+ if __name__ == "__main__":
296
+ main()
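
A note on the selection logic above: the script keeps the top-k USES/EXTENDS contexts by classifier confidence, verifies them in fixed-size batches, and then collapses the per-context verdicts into one paper-level label, with EXTENDS taking precedence over USES. The following minimal sketch reproduces just that selection and aggregation step (no LLM call is made; the candidate ids, confidences, and stub verdicts are invented for illustration):

    from typing import Any, Dict, List

    def select_top_k(candidates: List[Dict[str, Any]], k: int) -> List[Dict[str, Any]]:
        # Rank by classifier confidence, descending; k <= 0 means verify everything.
        ranked = sorted(candidates, key=lambda x: x.get("confidence", 0.0), reverse=True)
        return ranked if k <= 0 else ranked[:k]

    def aggregate_final_label(verified: List[Dict[str, Any]]) -> str:
        # EXTENDS dominates USES; anything else means the usage was not confirmed.
        confirmed = [v for v in verified if v["label"] in {"USES", "EXTENDS"}]
        if any(v["label"] == "EXTENDS" for v in confirmed):
            return "EXTENDS"
        return "USES" if confirmed else "NOT_CONFIRMED"

    candidates = [
        {"id": 1, "confidence": 0.91},
        {"id": 2, "confidence": 0.55},
        {"id": 3, "confidence": 0.12},
    ]
    top = select_top_k(candidates, k=2)                          # keeps ids 1 and 2
    batches = [top[i:i + 25] for i in range(0, len(top), 25)]    # one batch of size 2
    stub_verdicts = [{"id": 1, "label": "USES"}, {"id": 2, "label": "NOT_CONFIRMED"}]
    print(aggregate_final_label(stub_verdicts))                  # -> USES
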
src/step_06_extract_paragraphs/extract_arxiv_paragraphs.py ADDED
@@ -0,0 +1,488 @@
1
+ import argparse
2
+ import json
3
+ import random
4
+ import re
5
+ import sys
6
+ import tarfile
7
+ import tempfile
8
+ import time
9
+ import urllib.request
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+ import os
13
+
14
+ SRC_ROOT = Path(__file__).resolve().parents[1]
15
+ if str(SRC_ROOT) not in sys.path:
16
+ sys.path.insert(0, str(SRC_ROOT))
17
+
18
+
19
+ PAPER_META_FILE = "paper_metadata.json"
20
+ USAGE_CONTEXTS_FILE = "usage_contexts.json"
21
+ VERIFIED_FILE = "usage_uses_extends_verified.json"
22
+ OUT_FILE = "usage_citing_paragraphs.json"
23
+
24
+
25
+ def load_json(path: Path) -> Any | None:
26
+ if not path.exists():
27
+ return None
28
+ try:
29
+ return json.loads(path.read_text(encoding="utf-8"))
30
+ except Exception:
31
+ return None
32
+
33
+
34
+ def iter_paper_dirs(root: Path) -> List[Path]:
35
+ out: List[Path] = []
36
+ for child in root.iterdir():
37
+ if child.is_dir() and (child / PAPER_META_FILE).exists():
38
+ out.append(child)
39
+ return out
40
+
41
+
42
+ def safe_extract(tar: tarfile.TarFile, path: Path) -> None:
43
+ for member in tar.getmembers():
44
+ member_path = path / member.name
45
+ if not str(member_path.resolve()).startswith(str(path.resolve())):
46
+ raise RuntimeError(f"Blocked path traversal in tar: {member.name}")
47
+ tar.extractall(path)
48
+
49
+
50
+ _ARXIV_LAST_TS = 0.0
51
+
52
+
53
+ def _arxiv_min_interval_sleep() -> None:
54
+ """Global throttle to avoid arXiv API rate limits."""
55
+ global _ARXIV_LAST_TS
56
+ min_interval = float(os.getenv("ARXIV_MIN_INTERVAL", "1.0"))
57
+ now = time.monotonic()
58
+ elapsed = now - _ARXIV_LAST_TS
59
+ if elapsed < min_interval:
60
+ time.sleep(min_interval - elapsed)
61
+ _ARXIV_LAST_TS = time.monotonic()
62
+
63
+
64
+ def download_arxiv_source(arxiv_id: str, tmpdir: Path) -> Optional[Path]:
65
+ url = f"https://arxiv.org/e-print/{arxiv_id}"
66
+ archive_path = tmpdir / f"{arxiv_id.replace('/', '_')}.tar"
67
+ max_retries = int(os.getenv("ARXIV_MAX_RETRIES", "6"))
68
+ base_sleep = float(os.getenv("ARXIV_BASE_SLEEP", "2.0"))
69
+ max_sleep = float(os.getenv("ARXIV_MAX_BACKOFF", "60"))
70
+
71
+ for attempt in range(max_retries):
72
+ try:
73
+ _arxiv_min_interval_sleep()
74
+ urllib.request.urlretrieve(url, archive_path) # noqa: S310
75
+ try:
76
+ with tarfile.open(archive_path) as tar:
77
+ safe_extract(tar, tmpdir)
78
+ return tmpdir
79
+ except tarfile.ReadError as exc:
80
+ print(f"[WARN] Invalid arXiv archive for {arxiv_id}: {exc}")
81
+ return None
82
+ except Exception as exc:
83
+ # arXiv sometimes returns 429; treat any network error as retryable.
84
+ sleep = min(base_sleep * (2 ** attempt), max_sleep) + random.uniform(0.0, 0.5)
85
+ print(f"[WARN] Failed to download arXiv source for {arxiv_id}: {exc}")
86
+ print(f"[WARN] arXiv download retrying in {sleep:.2f}s")
87
+ time.sleep(sleep)
88
+ continue
89
+
90
+ print(f"[ERROR] Giving up after {max_retries} attempts for arXiv {arxiv_id}")
91
+ return None
92
+
93
+
94
+ def find_main_tex(root: Path) -> Optional[Path]:
95
+ tex_files = list(root.rglob("*.tex"))
96
+ if not tex_files:
97
+ return None
98
+
99
+ candidates: List[Tuple[int, Path]] = []
100
+ for path in tex_files:
101
+ try:
102
+ text = path.read_text(encoding="utf-8", errors="ignore")
103
+ except Exception:
104
+ continue
105
+ score = 0
106
+ if "\\begin{document}" in text:
107
+ score += 3
108
+ if "\\documentclass" in text:
109
+ score += 2
110
+ score += len(text) // 1000
111
+ candidates.append((score, path))
112
+
113
+ candidates.sort(key=lambda x: x[0], reverse=True)
114
+ return candidates[0][1] if candidates else None
115
+
116
+
117
+ def read_bib_files(root: Path) -> Dict[str, str]:
118
+ bibs: Dict[str, str] = {}
119
+ for path in root.rglob("*.bib"):
120
+ try:
121
+ bibs[str(path.relative_to(root))] = path.read_text(encoding="utf-8", errors="ignore")
122
+ except Exception:
123
+ continue
124
+ return bibs
125
+
126
+
127
+ def normalize_text(text: str) -> str:
128
+ text = re.sub(r"[^a-z0-9\s]", " ", text.lower())
129
+ return re.sub(r"\s+", " ", text).strip()
130
+
131
+
132
+ def tokenize(text: str) -> List[str]:
133
+ return [t for t in normalize_text(text).split() if t]
134
+
135
+
136
+ def paragraphize(text: str) -> List[str]:
137
+ text = text.replace("\r\n", "\n")
138
+ text = re.sub(r"\n\s*\n", "\n\n", text)
139
+ paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
140
+ return paragraphs
141
+
142
+
143
+ def strip_latex_comments(text: str) -> str:
144
+ # Remove explicit comment environments first.
145
+ text = re.sub(r"\\begin\{comment\}.*?\\end\{comment\}", "", text, flags=re.DOTALL)
146
+
147
+ cleaned_lines: List[str] = []
148
+ for line in text.splitlines():
149
+ out_chars: List[str] = []
150
+ i = 0
151
+ while i < len(line):
152
+ ch = line[i]
153
+ if ch == "%":
154
+ # Keep escaped percent (\%) and continue parsing.
155
+ if i > 0 and line[i - 1] == "\\":
156
+ out_chars.append(ch)
157
+ i += 1
158
+ continue
159
+ # Unescaped percent starts a LaTeX comment; ignore rest of the line.
160
+ break
161
+ out_chars.append(ch)
162
+ i += 1
163
+ cleaned_lines.append("".join(out_chars))
164
+ return "\n".join(cleaned_lines)
165
+
166
+
167
+ def parse_bib_entries(bib_text: str) -> List[Dict[str, str]]:
168
+ entries: List[Dict[str, str]] = []
169
+ matches = list(re.finditer(r"@[\w]+\s*\{\s*([^,]+),", bib_text))
170
+ for i, match in enumerate(matches):
171
+ key = match.group(1).strip()
172
+ start = match.end()
173
+ end = matches[i + 1].start() if i + 1 < len(matches) else len(bib_text)
174
+ body = bib_text[start:end]
175
+ fields = {}
176
+ for f_match in re.finditer(r"(\w+)\s*=\s*[{\"](.+?)[}\"]\s*,", body, re.DOTALL):
177
+ fields[f_match.group(1).lower()] = f_match.group(2).strip()
178
+ entries.append({"key": key, **fields})
179
+ return entries
180
+
181
+
182
+ def find_target_bib_keys(
183
+ bib_texts: Dict[str, str],
184
+ target_info: Dict[str, str],
185
+ ) -> List[str]:
186
+ target_title = normalize_text(target_info.get("title", ""))
187
+ target_author = normalize_text(target_info.get("first_author_last", ""))
188
+ target_year = target_info.get("year", "")
189
+ if not target_title and not target_author:
190
+ return []
191
+
192
+ keys: List[str] = []
193
+ for bib_text in bib_texts.values():
194
+ for entry in parse_bib_entries(bib_text):
195
+ title = normalize_text(entry.get("title", ""))
196
+ author = normalize_text(entry.get("author", ""))
197
+ year = str(entry.get("year", ""))
198
+ has_title = bool(title)
199
+ title_match = target_title and (target_title in title or title in target_title)
200
+ author_match = target_author and target_author in author
201
+ year_match = target_year and target_year in year
202
+
203
+ if title_match and author_match:
204
+ keys.append(entry["key"])
205
+ elif not has_title and author_match and year_match:
206
+ keys.append(entry["key"])
207
+ elif author_match and year_match:
208
+ keys.append(entry["key"])
209
+ return keys
210
+
211
+
212
+ def replace_target_citations(text: str, target_keys: List[str], target_info: Dict[str, str]) -> str:
213
+ key_set = set(target_keys or [])
214
+ author = target_info.get("first_author_last", "").lower()
215
+ year = target_info.get("year", "")
216
+ alt_years = {year}
217
+ if year.isdigit():
218
+ alt_years.add(str(int(year) - 1))
219
+ alt_years.add(str(int(year) + 1))
220
+
221
+ def repl(match: re.Match) -> str:
222
+ keys = [k.strip() for k in match.group(1).split(",")]
223
+ for key in keys:
224
+ if key in key_set:
225
+ return "<CITED HERE>"
226
+ key_lc = key.lower()
227
+ if author and author in key_lc and any(y in key_lc for y in alt_years if y):
228
+ return "<CITED HERE>"
229
+ return match.group(0)
230
+
231
+ return re.sub(r"\\cite[a-zA-Z]*\s*\{([^}]+)\}", repl, text)
232
+
233
+
234
+ def match_paragraphs(
235
+ paragraphs: List[str],
236
+ contexts: List[Dict[str, str]],
237
+ ) -> List[Dict[str, Any]]:
238
+ results: List[Dict[str, Any]] = []
239
+ para_tokens = [set(tokenize(p)) for p in paragraphs]
240
+
241
+ for idx, ctx in enumerate(contexts, start=1):
242
+ ctx_text = ctx.get("text", "")
243
+ ctx_tokens = set(tokenize(ctx_text))
244
+ if not ctx_tokens:
245
+ continue
246
+ best = None
247
+ best_score = 0.0
248
+ for p_idx, tokens in enumerate(para_tokens):
249
+ if not tokens:
250
+ continue
251
+ overlap = len(ctx_tokens & tokens) / max(1, len(ctx_tokens))
252
+ if overlap > best_score:
253
+ best = p_idx
254
+ best_score = overlap
255
+ if best is not None and best_score >= 0.5:
256
+ paragraph = paragraphs[best]
257
+ results.append(
258
+ {
259
+ "context_id": idx,
260
+ "context": ctx_text,
261
+ "context_with_marker": ctx.get("text_with_marker", ctx_text),
262
+ "paragraph": paragraph,
263
+ "overlap": round(best_score, 3),
264
+ }
265
+ )
266
+ return results
267
+
268
+
269
+ def _normalize_text(text: str) -> str:
270
+ return " ".join(text.split()).strip().lower()
271
+
272
+
273
+ def _normalize_for_match(text: str) -> str:
274
+ text = text.replace("<CITED HERE>", "")
275
+ text = re.sub(r"\[[^\]]+\]", "", text)
276
+ return _normalize_text(text)
277
+
278
+
279
+ def _normalize_author_last(name: str) -> str:
280
+ parts = [p for p in (name or "").split() if p.strip()]
281
+ return parts[-1] if parts else ""
282
+
283
+
284
+ def extract_target_info(meta: Any) -> Dict[str, str]:
285
+ if isinstance(meta, list) and meta:
286
+ meta = meta[0]
287
+ if not isinstance(meta, dict):
288
+ return {"title": "", "first_author_last": "", "year": ""}
289
+ authors = meta.get("authors") or []
290
+ first_author = authors[0]["name"] if authors else ""
291
+ return {
292
+ "title": meta.get("title", ""),
293
+ "first_author_last": _normalize_author_last(first_author),
294
+ "year": str(meta.get("year", "")),
295
+ }
296
+
297
+
298
+ def build_citing_contexts_map(
299
+ usage: Dict[str, Any],
300
+ confirmed_texts_by_citing: Dict[str, set] | None,
301
+ ) -> Dict[str, Dict[str, Any]]:
302
+ citing_map: Dict[str, Dict[str, Any]] = {}
303
+ for entry in usage.get("citing_papers", []) or []:
304
+ if not isinstance(entry, dict):
305
+ continue
306
+ citing_id = entry.get("citing_paper_id") or ""
307
+ allowed_texts = confirmed_texts_by_citing.get(citing_id) if confirmed_texts_by_citing else None
308
+ allowed_norms = (
309
+ {_normalize_for_match(text) for text in allowed_texts} if allowed_texts else None
310
+ )
311
+ contexts = []
312
+ seen = set()
313
+ for c in entry.get("contexts", []) or []:
314
+ if not isinstance(c, dict):
315
+ continue
316
+ text_raw = (c.get("text") or "").strip()
317
+ text_with_marker = (c.get("context_with_marker") or text_raw).strip()
318
+ if not text_raw:
319
+ continue
320
+ norm = _normalize_for_match(text_raw)
321
+ if allowed_norms is not None and norm not in allowed_norms:
322
+ continue
323
+ if norm in seen:
324
+ continue
325
+ seen.add(norm)
326
+ contexts.append({"text": text_raw, "text_with_marker": text_with_marker})
327
+ if allowed_texts is not None and not contexts:
328
+ for text in allowed_texts:
329
+ norm = _normalize_for_match(text)
330
+ if norm in seen:
331
+ continue
332
+ seen.add(norm)
333
+ contexts.append({"text": text, "text_with_marker": text})
334
+ citing_map[citing_id] = {
335
+ "title": entry.get("title", ""),
336
+ "paper_id": citing_id,
337
+ "arxiv_id": (entry.get("external_ids") or {}).get("ArXiv", ""),
338
+ "contexts": contexts,
339
+ }
340
+ return citing_map
341
+
342
+
343
+ def process_citing_paper(citing: Dict[str, Any]) -> Dict[str, Any]:
344
+ target_info = citing.get("target_info", {})
345
+ arxiv_id = citing.get("arxiv_id", "")
346
+ if not arxiv_id:
347
+ return {"error": "missing_arxiv_id", **citing}
348
+
349
+ with tempfile.TemporaryDirectory() as tmp:
350
+ tmpdir = Path(tmp)
351
+ if not download_arxiv_source(arxiv_id, tmpdir):
352
+ return {"error": "bad_arxiv_archive", **citing}
353
+ main_tex = find_main_tex(tmpdir)
354
+ if not main_tex:
355
+ return {"error": "missing_main_tex", **citing}
356
+
357
+ tex_text = main_tex.read_text(encoding="utf-8", errors="ignore")
358
+ tex_text = strip_latex_comments(tex_text)
359
+ bibs = read_bib_files(tmpdir)
360
+ target_keys = find_target_bib_keys(bibs, target_info)
361
+ tex_text = replace_target_citations(tex_text, target_keys, target_info)
362
+ paragraphs = paragraphize(tex_text)
363
+ target_citing_paragraphs = [p for p in paragraphs if "<CITED HERE>" in p]
364
+ matched = match_paragraphs(paragraphs, citing.get("contexts", []))
365
+
366
+ return {
367
+ "citing_paper_id": citing.get("paper_id", ""),
368
+ "citing_title": citing.get("title", ""),
369
+ "arxiv_id": arxiv_id,
370
+ "main_tex_file": str(main_tex.relative_to(tmpdir)),
371
+ "bib_files": list(bibs.keys()),
372
+ "bib_texts": bibs,
373
+ "target_bib_keys": target_keys,
374
+ "contexts": citing.get("contexts", []),
375
+ "target_citing_paragraphs": target_citing_paragraphs,
376
+ "matched_paragraphs": matched,
377
+ }
378
+
379
+
380
+ def process_paper(root: Path, overwrite: bool, include_all: bool, resume: bool) -> str:
381
+ usage = load_json(root / USAGE_CONTEXTS_FILE)
382
+ if not isinstance(usage, dict):
383
+ return "missing_usage"
384
+
385
+ out_path = root / OUT_FILE
386
+ if out_path.exists() and (resume or not overwrite):
387
+ return "skipped"
388
+
389
+ verified = None
390
+ confirmed_texts_by_citing: Dict[str, set] = {}
391
+ if not include_all:
392
+ verified = load_json(root / VERIFIED_FILE)
393
+ if not isinstance(verified, dict):
394
+ return "missing_verified"
395
+ for item in verified.get("confirmed", []) or []:
396
+ citing_id = item.get("citing_paper_id") or ""
397
+ text = item.get("text") or ""
398
+ if not citing_id or not text:
399
+ continue
400
+ confirmed_texts_by_citing.setdefault(citing_id, set()).add(text)
401
+
402
+ target_info = extract_target_info(load_json(root / PAPER_META_FILE))
403
+ citing_map = build_citing_contexts_map(
404
+ usage,
405
+ confirmed_texts_by_citing if confirmed_texts_by_citing else None,
406
+ )
407
+ if not citing_map:
408
+ out_path.write_text(
409
+ json.dumps({"paper_id": usage.get("paper_id"), "citing_papers": []}, indent=2),
410
+ encoding="utf-8",
411
+ )
412
+ return "empty_citing"
413
+
414
+ confirmed_ids: Optional[set] = None
415
+ if not include_all and isinstance(verified, dict):
416
+ confirmed = verified.get("confirmed", [])
417
+ confirmed_ids = {
418
+ item.get("citing_paper_id")
419
+ for item in confirmed
420
+ if item.get("citing_paper_id")
421
+ }
422
+
423
+ citing_papers = []
424
+ for citing_id, citing in citing_map.items():
425
+ if confirmed_ids is not None and citing_id not in confirmed_ids:
426
+ continue
427
+ citing["target_info"] = target_info
428
+ citing_papers.append(process_citing_paper(citing))
429
+
430
+ payload = {"paper_id": usage.get("paper_id"), "citing_papers": citing_papers}
431
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
432
+ return "processed"
433
+
434
+
435
+ def main() -> None:
436
+ parser = argparse.ArgumentParser(
437
+ description="Download arXiv sources and extract citation-local paragraphs."
438
+ )
439
+ parser.add_argument(
440
+ "--root",
441
+ type=str,
442
+ default="runs/processed_papers",
443
+ help="Root directory containing processed paper directories.",
444
+ )
445
+ parser.add_argument(
446
+ "--overwrite",
447
+ action="store_true",
448
+ help="Overwrite existing usage_citing_paragraphs.json files.",
449
+ )
450
+ parser.add_argument(
451
+ "--all",
452
+ action="store_true",
453
+ help="Process all citing papers (not just confirmed USES/EXTENDS).",
454
+ )
455
+ parser.add_argument(
456
+ "--resume",
457
+ action="store_true",
458
+ help="Skip papers with existing output files (even if --overwrite is set).",
459
+ )
460
+ args = parser.parse_args()
461
+
462
+ root = Path(args.root).expanduser().resolve()
463
+ if not root.exists():
464
+ raise SystemExit(f"Root directory does not exist: {root}")
465
+
466
+ paper_dirs = sorted(iter_paper_dirs(root), key=lambda p: p.name)
467
+ print(f"[INFO] Found {len(paper_dirs)} paper dirs under {root}")
468
+
469
+ counts = {
470
+ "processed": 0,
471
+ "skipped": 0,
472
+ "missing_usage": 0,
473
+ "missing_verified": 0,
474
+ "empty_citing": 0,
475
+ }
476
+ for paper_dir in paper_dirs:
477
+ status = process_paper(paper_dir, args.overwrite, args.all, args.resume)
478
+ counts[status] = counts.get(status, 0) + 1
479
+ print(f"[{status.upper()}] {paper_dir.name}")
480
+
481
+ print(
482
+ "[SUMMARY] processed={processed}, skipped={skipped}, missing_usage={missing_usage}, "
483
+ "missing_verified={missing_verified}, empty_citing={empty_citing}".format(**counts)
484
+ )
485
+
486
+
487
+ if __name__ == "__main__":
488
+ main()
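
The arXiv downloader above combines a global minimum interval between requests with capped exponential backoff plus jitter, all tunable through ARXIV_MIN_INTERVAL, ARXIV_MAX_RETRIES, ARXIV_BASE_SLEEP, and ARXIV_MAX_BACKOFF. A standalone sketch of that retry schedule, with the real download swapped for a deliberately flaky placeholder, behaves as follows:

    import os
    import random
    import time

    def with_backoff(fetch):
        # Mirrors the retry schedule used by download_arxiv_source above.
        max_retries = int(os.getenv("ARXIV_MAX_RETRIES", "6"))
        base_sleep = float(os.getenv("ARXIV_BASE_SLEEP", "2.0"))
        max_sleep = float(os.getenv("ARXIV_MAX_BACKOFF", "60"))
        for attempt in range(max_retries):
            try:
                return fetch()
            except Exception as exc:
                sleep = min(base_sleep * (2 ** attempt), max_sleep) + random.uniform(0.0, 0.5)
                print(f"[WARN] attempt {attempt + 1} failed ({exc}); retrying in {sleep:.2f}s")
                time.sleep(sleep)
        return None

    calls = {"n": 0}

    def flaky_fetch():
        # Placeholder that fails twice, then succeeds (illustrative only).
        calls["n"] += 1
        if calls["n"] < 3:
            raise RuntimeError("HTTP 429")
        return "ok"

    print(with_backoff(flaky_fetch))  # -> ok, after two backoff sleeps
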
src/step_07_extract_and_refine/extract_contributions_from_citations.py ADDED
@@ -0,0 +1,329 @@
1
+ import argparse
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List
6
+
7
+ SRC_ROOT = Path(__file__).resolve().parents[1]
8
+ if str(SRC_ROOT) not in sys.path:
9
+ sys.path.insert(0, str(SRC_ROOT))
10
+
11
+ from common.llm_client import LLMClient
12
+
13
+ from prompts import build_contribution_prompt
14
+ from schemas import CONTRIBUTION_JSON_SCHEMA
15
+
16
+
17
+ PAPER_META_FILE = "paper_metadata.json"
18
+ USAGE_CONTEXTS_FILE = "usage_contexts.json"
19
+ ARXIV_PARAGRAPHS_FILE = "usage_citing_paragraphs.json"
20
+ VERIFIED_FILE = "usage_uses_extends_verified.json"
21
+ OUT_FILE = "usage_contributions.json"
22
+
23
+
24
+ def load_json(path: Path) -> Any | None:
25
+ if not path.exists():
26
+ return None
27
+ try:
28
+ return json.loads(path.read_text(encoding="utf-8"))
29
+ except Exception:
30
+ return None
31
+
32
+
33
+ def iter_paper_dirs(root: Path) -> List[Path]:
34
+ out: List[Path] = []
35
+ for child in root.iterdir():
36
+ if child.is_dir() and (child / PAPER_META_FILE).exists():
37
+ out.append(child)
38
+ return out
39
+
40
+
41
+ def _normalize_author_last(name: str) -> str:
42
+ parts = [p for p in (name or "").split() if p.strip()]
43
+ return parts[-1] if parts else ""
44
+
45
+
46
+ def extract_target_info(meta: Any) -> Dict[str, str]:
47
+ if isinstance(meta, list) and meta:
48
+ meta = meta[0]
49
+ if not isinstance(meta, dict):
50
+ return {
51
+ "title": "",
52
+ "first_author_last": "",
53
+ "year": "",
54
+ "tldr": "",
55
+ "abstract": "",
56
+ }
57
+ authors = meta.get("authors") or []
58
+ first_author = authors[0]["name"] if authors else ""
59
+ tldr = ""
60
+ tldr_obj = meta.get("tldr")
61
+ if isinstance(tldr_obj, dict):
62
+ tldr = tldr_obj.get("text", "")
63
+ return {
64
+ "title": meta.get("title", ""),
65
+ "first_author_last": _normalize_author_last(first_author),
66
+ "year": str(meta.get("year", "")),
67
+ "tldr": tldr,
68
+ "abstract": meta.get("abstract", ""),
69
+ }
70
+
71
+
72
+ def build_citing_contexts_map_from_paragraphs(
73
+ arxiv_data: Dict[str, Any],
74
+ ) -> Dict[str, Dict[str, Any]]:
75
+ citing_map: Dict[str, Dict[str, Any]] = {}
76
+ for entry in arxiv_data.get("citing_papers", []) or []:
77
+ if not isinstance(entry, dict):
78
+ continue
79
+ citing_id = entry.get("citing_paper_id") or ""
80
+ contexts = []
81
+ seen = set()
82
+ for paragraph in entry.get("target_citing_paragraphs", []) or []:
83
+ paragraph = (paragraph or "").strip()
84
+ if not paragraph:
85
+ continue
86
+ combined = f"Target-citing paragraph: {paragraph}"
87
+ norm = " ".join(combined.split()).lower()
88
+ if norm in seen:
89
+ continue
90
+ seen.add(norm)
91
+ contexts.append(combined)
92
+ citing_map[citing_id] = {
93
+ "title": entry.get("citing_title", ""),
94
+ "paper_id": citing_id,
95
+ "contexts": contexts,
96
+ "source": "arxiv_paragraphs",
97
+ }
98
+ return citing_map
99
+
100
+
101
+ def build_citing_contexts_map_from_usage(
102
+ usage: Dict[str, Any],
103
+ confirmed_texts_by_citing: Dict[str, set] | None,
104
+ ) -> Dict[str, Dict[str, Any]]:
105
+ citing_map: Dict[str, Dict[str, Any]] = {}
106
+ for entry in usage.get("citing_papers", []) or []:
107
+ if not isinstance(entry, dict):
108
+ continue
109
+ citing_id = entry.get("citing_paper_id") or ""
110
+ allowed_texts = confirmed_texts_by_citing.get(citing_id) if confirmed_texts_by_citing else None
111
+ contexts = []
112
+ seen = set()
113
+ for c in entry.get("contexts", []) or []:
114
+ if not isinstance(c, dict):
115
+ continue
116
+ text = (c.get("context_with_marker") or c.get("text") or "").strip()
117
+ if not text:
118
+ continue
119
+ if allowed_texts is not None and text not in allowed_texts:
120
+ continue
121
+ norm = " ".join(text.split()).lower()
122
+ if norm in seen:
123
+ continue
124
+ seen.add(norm)
125
+ contexts.append(f"Target sentence: {text}")
126
+ citing_map[citing_id] = {
127
+ "title": entry.get("title", ""),
128
+ "paper_id": citing_id,
129
+ "contexts": contexts,
130
+ "source": "usage_contexts_fallback",
131
+ }
132
+ return citing_map
133
+
134
+
135
+ def extract_contribution(
136
+ client: LLMClient,
137
+ target_info: Dict[str, str],
138
+ citing_info: Dict[str, Any],
139
+ ) -> Dict[str, Any]:
140
+ contexts = citing_info.get("contexts", [])
141
+ prompt = build_contribution_prompt(target_info, citing_info, contexts)
142
+ raw = client.call(prompt, schema=CONTRIBUTION_JSON_SCHEMA)
143
+ data = _parse_llm_json(raw)
144
+ if not isinstance(data, dict):
145
+ return {
146
+ "citing_paper_id": citing_info.get("paper_id", ""),
147
+ "citing_title": citing_info.get("title", ""),
148
+ "label": "NOT_CONFIRMED",
149
+ "paper_claim": "",
150
+ "claim": "",
151
+ "cluster_title": "",
152
+ "cluster_key": "",
153
+ "evidence_span": "",
154
+ "rationale": "",
155
+ "contexts": contexts,
156
+ "source": citing_info.get("source", "unknown"),
157
+ }
158
+ label = data.get("label", "NOT_CONFIRMED")
159
+ paper_claim = data.get("paper_claim", "") or data.get("claim", "")
160
+ cluster_title = data.get("cluster_title", "") or data.get("cluster_claim", "")
161
+ cluster_key = data.get("cluster_key", "")
162
+ evidence_span = data.get("evidence_span", "")
163
+ if not evidence_span:
164
+ label = "NOT_CONFIRMED"
165
+ paper_claim = ""
166
+ cluster_title = ""
167
+ cluster_key = ""
168
+ if label in {"USES", "EXTENDS"} and not cluster_title:
169
+ cluster_title = paper_claim
170
+ if label in {"USES", "EXTENDS"} and not cluster_key:
171
+ cluster_key = f"{label}|contribution|unspecified"
172
+ return {
173
+ "citing_paper_id": citing_info.get("paper_id", ""),
174
+ "citing_title": citing_info.get("title", ""),
175
+ "label": label,
176
+ "paper_claim": paper_claim,
177
+ "claim": paper_claim,
178
+ "cluster_title": cluster_title,
179
+ "cluster_key": cluster_key,
180
+ "evidence_span": evidence_span,
181
+ "rationale": data.get("rationale", ""),
182
+ "contexts": contexts,
183
+ "source": citing_info.get("source", "unknown"),
184
+ }
185
+
186
+
187
+ def _parse_llm_json(raw: str) -> Any | None:
188
+ try:
189
+ return json.loads(raw)
190
+ except json.JSONDecodeError:
191
+ pass
192
+
193
+ cleaned = raw.strip()
194
+ if cleaned.startswith("```"):
195
+ cleaned = cleaned.strip("`")
196
+ cleaned = cleaned.replace("json", "", 1).strip()
197
+
198
+ start = cleaned.find("{")
199
+ end = cleaned.rfind("}")
200
+ if start == -1 or end == -1 or end <= start:
201
+ return None
202
+
203
+ snippet = cleaned[start : end + 1]
204
+ try:
205
+ return json.loads(snippet)
206
+ except json.JSONDecodeError:
207
+ return None
208
+
209
+
210
+ def process_paper(
211
+ paper_dir: Path,
212
+ client: LLMClient,
213
+ overwrite: bool,
214
+ resume: bool,
215
+ ) -> str:
216
+ verified = load_json(paper_dir / VERIFIED_FILE)
217
+ if not isinstance(verified, dict):
218
+ return "missing_verified"
219
+ out_path = paper_dir / OUT_FILE
220
+ if out_path.exists() and (resume or not overwrite):
221
+ return "skipped"
222
+
223
+ if verified.get("final_label") == "NOT_CONFIRMED":
224
+ payload = {
225
+ "paper_id": verified.get("paper_id"),
226
+ "final_label": "NOT_CONFIRMED",
227
+ "contributions": [],
228
+ }
229
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
230
+ return "no_confirmed"
231
+
232
+ arxiv_data = load_json(paper_dir / ARXIV_PARAGRAPHS_FILE)
233
+ if not isinstance(arxiv_data, dict):
234
+ return "missing_arxiv_paragraphs"
235
+
236
+ target_info = extract_target_info(load_json(paper_dir / PAPER_META_FILE))
237
+ citing_map = build_citing_contexts_map_from_paragraphs(arxiv_data)
238
+ usage = load_json(paper_dir / USAGE_CONTEXTS_FILE)
239
+ confirmed_texts_by_citing: Dict[str, set] = {}
240
+ for item in verified.get("confirmed", []) or []:
241
+ citing_id = item.get("citing_paper_id") or ""
242
+ text = item.get("text") or ""
243
+ if not citing_id or not text:
244
+ continue
245
+ confirmed_texts_by_citing.setdefault(citing_id, set()).add(text)
246
+ usage_map = (
247
+ build_citing_contexts_map_from_usage(usage, confirmed_texts_by_citing)
248
+ if isinstance(usage, dict)
249
+ else {}
250
+ )
251
+
252
+ confirmed = verified.get("confirmed", [])
253
+ confirmed_ids = {item.get("citing_paper_id") for item in confirmed if item.get("citing_paper_id")}
254
+ contributions: List[Dict[str, Any]] = []
255
+ fallback_citing_ids: List[str] = []
256
+ for citing_id in confirmed_ids:
257
+ citing_info = citing_map.get(citing_id)
258
+ if citing_info and not citing_info.get("contexts"):
259
+ citing_info = None
260
+ if not citing_info:
261
+ fallback = usage_map.get(citing_id)
262
+ if fallback and fallback.get("contexts"):
263
+ citing_info = fallback
264
+ fallback_citing_ids.append(citing_id)
265
+ else:
266
+ continue
267
+ contributions.append(extract_contribution(client, target_info, citing_info))
268
+
269
+ payload = {
270
+ "paper_id": verified.get("paper_id"),
271
+ "final_label": verified.get("final_label"),
272
+ "contributions": contributions,
273
+ "source": "arxiv_paragraphs",
274
+ "fallback_citing_ids": fallback_citing_ids,
275
+ }
276
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
277
+ return "labeled"
278
+
279
+
280
+ def main() -> None:
281
+ parser = argparse.ArgumentParser(
282
+ description="Extract per-citing-paper contribution claims from verified USES/EXTENDS."
283
+ )
284
+ parser.add_argument(
285
+ "--root",
286
+ type=str,
287
+ default="runs/processed_papers",
288
+ help="Root directory containing processed paper directories.",
289
+ )
290
+ parser.add_argument(
291
+ "--overwrite",
292
+ action="store_true",
293
+ help="Overwrite existing usage_contributions.json files.",
294
+ )
295
+ parser.add_argument(
296
+ "--resume",
297
+ action="store_true",
298
+ help="Skip papers with existing output files (even if --overwrite is set).",
299
+ )
300
+ args = parser.parse_args()
301
+
302
+ root = Path(args.root).expanduser().resolve()
303
+ if not root.exists():
304
+ raise SystemExit(f"Root directory does not exist: {root}")
305
+
306
+ client = LLMClient()
307
+ paper_dirs = sorted(iter_paper_dirs(root), key=lambda p: p.name)
308
+ print(f"[INFO] Found {len(paper_dirs)} paper dirs under {root}")
309
+
310
+ counts = {
311
+ "labeled": 0,
312
+ "skipped": 0,
313
+ "missing_verified": 0,
314
+ "missing_arxiv_paragraphs": 0,
315
+ "no_confirmed": 0,
316
+ }
317
+ for paper_dir in paper_dirs:
318
+ status = process_paper(paper_dir, client, args.overwrite, args.resume)
319
+ counts[status] = counts.get(status, 0) + 1
320
+ print(f"[{status.upper()}] {paper_dir.name}")
321
+
322
+ print(
323
+ "[SUMMARY] labeled={labeled}, skipped={skipped}, missing_verified={missing_verified}, "
324
+ "missing_arxiv_paragraphs={missing_arxiv_paragraphs}, no_confirmed={no_confirmed}".format(**counts)
325
+ )
326
+
327
+
328
+ if __name__ == "__main__":
329
+ main()
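
For each confirmed citing paper, the extractor above prefers the citation-local paragraphs recovered from arXiv LaTeX (source arxiv_paragraphs) and only falls back to the sentence-level usage contexts (source usage_contexts_fallback) when no paragraphs were matched. A minimal sketch of that preference order over toy maps (ids and texts are made up for illustration):

    from typing import Any, Dict, Optional

    def pick_contexts(
        citing_id: str,
        paragraph_map: Dict[str, Dict[str, Any]],
        usage_map: Dict[str, Dict[str, Any]],
    ) -> Optional[Dict[str, Any]]:
        # Prefer arXiv paragraphs; fall back to sentence-level contexts; else skip.
        info = paragraph_map.get(citing_id)
        if info and info.get("contexts"):
            return info
        fallback = usage_map.get(citing_id)
        if fallback and fallback.get("contexts"):
            return fallback
        return None

    paragraph_map = {"P1": {"contexts": [], "source": "arxiv_paragraphs"}}
    usage_map = {"P1": {"contexts": ["Target sentence: We use <CITED HERE>."],
                        "source": "usage_contexts_fallback"}}
    print(pick_contexts("P1", paragraph_map, usage_map)["source"])  # -> usage_contexts_fallback
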
src/step_07_extract_and_refine/prompts.py ADDED
@@ -0,0 +1,65 @@
1
+ from typing import Dict, List
2
+
3
+
4
+ def build_contribution_prompt(
5
+ target_info: Dict[str, str],
6
+ citing_info: Dict[str, str],
7
+ contexts: List[str],
8
+ ) -> str:
9
+ header = [
10
+ "You are extracting how a citing paper uses or extends a target paper.",
11
+ "Read the paragraph(s) below and write ONE concise contribution claim.",
12
+ "Focus only on what the citing paper actually does with the target paper.",
13
+ "",
14
+ "Rules:",
15
+ "- If the citing paper explicitly uses/adopts/evaluates on the target paper's method/data/benchmark, label USES.",
16
+ "- If it explicitly extends/modifies/adapts/builds upon the target paper, label EXTENDS.",
17
+ "- If the paragraph is only descriptive/background or only compares/mentions the target paper, return label NOT_CONFIRMED and empty fields.",
18
+ "- Do not output comparison-only claims (e.g., 'compares to <CITED HERE>'); those are NOT_CONFIRMED.",
19
+ "- Output paper_claim: one concise, paper-specific contribution claim.",
20
+ "- Output cluster_title: concise natural-language cluster summary (6-14 words), generic across papers.",
21
+ "- Also output cluster_key in this exact format: RELATION|artifact|purpose",
22
+ "- cluster_key must be generic and reusable across papers.",
23
+ "- artifact and purpose must be short snake_case phrases (e.g., dataset, evaluation_protocol, evaluation).",
24
+ "- cluster_key RELATION must exactly match label.",
25
+ "- Avoid overly specific keys (no paper names, no model/version numbers, no citation keys).",
26
+ "- Prefer stable generic keys such as: USES|dataset|evaluation, EXTENDS|dataset|dataset_creation, USES|evaluation_protocol|evaluation.",
27
+ "- If label is NOT_CONFIRMED, paper_claim, cluster_title, cluster_key, and evidence_span must be empty.",
28
+ "- The evidence_span must be a verbatim substring from the provided contexts.",
29
+ "- The TARGET_PAPER abstract/TLDR is for background only; do not use it as evidence.",
30
+ "",
31
+ "Negative example (NOT_CONFIRMED):",
32
+ "Paragraph: \"We compare our method to <CITED HERE> and other baselines.\"",
33
+ "Output: {\"label\":\"NOT_CONFIRMED\",\"paper_claim\":\"\",\"cluster_title\":\"\",\"cluster_key\":\"\",\"evidence_span\":\"\",\"rationale\":\"Comparison only.\"}",
34
+ "",
35
+ "Return JSON only.",
36
+ "",
37
+ "TARGET_PAPER:",
38
+ f"- title: {target_info.get('title', '')}",
39
+ f"- first_author_last: {target_info.get('first_author_last', '')}",
40
+ f"- year: {target_info.get('year', '')}",
41
+ f"- tldr: {target_info.get('tldr', '')}",
42
+ f"- abstract: {target_info.get('abstract', '')}",
43
+ "",
44
+ "CITING_PAPER:",
45
+ f"- title: {citing_info.get('title', '')}",
46
+ f"- paper_id: {citing_info.get('paper_id', '')}",
47
+ "",
48
+ "CONTEXTS (verbatim, same order as extracted):",
49
+ ]
50
+ for i, text in enumerate(contexts, start=1):
51
+ header.append(f"({i}) {text}")
52
+
53
+ header.append("")
54
+ header.append("JSON OUTPUT:")
55
+ header.append(
56
+ "{"
57
+ "\"label\":\"USES\","
58
+ "\"paper_claim\":\"...\","
59
+ "\"cluster_title\":\"Uses target dataset for evaluation\","
60
+ "\"cluster_key\":\"USES|dataset|evaluation\","
61
+ "\"evidence_span\":\"...\","
62
+ "\"rationale\":\"...\""
63
+ "}"
64
+ )
65
+ return "\n".join(header)
src/step_07_extract_and_refine/refine_and_filter_clusters_llm.py ADDED
@@ -0,0 +1,402 @@
1
+ import argparse
2
+ import json
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Tuple
6
+
7
+ SRC_ROOT = Path(__file__).resolve().parents[1]
8
+ if str(SRC_ROOT) not in sys.path:
9
+ sys.path.insert(0, str(SRC_ROOT))
10
+
11
+ from common.llm_client import LLMClient
12
+
13
+ PAPER_META_FILE = "paper_metadata.json"
14
+ CONTRIB_FILE = "usage_contributions.json"
15
+ DISCOVERY_FILE = "usage_discovery_from_contributions.json"
16
+ OUT_FILE = "usage_discovery_from_contributions_refined.json"
17
+
18
+ REFINE_SCHEMA = {
19
+ "type": "object",
20
+ "properties": {
21
+ "kept_groups": {
22
+ "type": "array",
23
+ "items": {
24
+ "type": "object",
25
+ "properties": {
26
+ "cluster_ids": {"type": "array", "items": {"type": "string"}},
27
+ "merged_title": {"type": "string"},
28
+ "merged_key": {"type": "string"},
29
+ "rationale": {"type": "string"},
30
+ },
31
+ "required": ["cluster_ids", "merged_title", "merged_key", "rationale"],
32
+ },
33
+ },
34
+ "dropped_clusters": {
35
+ "type": "array",
36
+ "items": {
37
+ "type": "object",
38
+ "properties": {
39
+ "cluster_id": {"type": "string"},
40
+ "reason": {"type": "string"},
41
+ },
42
+ "required": ["cluster_id", "reason"],
43
+ },
44
+ },
45
+ },
46
+ "required": ["kept_groups", "dropped_clusters"],
47
+ }
48
+
49
+
50
+ def load_json(path: Path) -> Any | None:
51
+ if not path.exists():
52
+ return None
53
+ try:
54
+ return json.loads(path.read_text(encoding="utf-8"))
55
+ except Exception:
56
+ return None
57
+
58
+
59
+ def iter_paper_dirs(root: Path) -> List[Path]:
60
+ return sorted([p for p in root.iterdir() if p.is_dir() and (p / PAPER_META_FILE).exists()], key=lambda p: p.name)
61
+
62
+
63
+ def _to_int_indices(raw_indices: List[Any]) -> List[int]:
64
+ out: List[int] = []
65
+ for i in raw_indices or []:
66
+ try:
67
+ out.append(int(i))
68
+ except Exception:
69
+ continue
70
+ return out
71
+
72
+
73
+ def _parse_key(key: str) -> Tuple[str, str, str]:
74
+ parts = [p.strip() for p in str(key or "").split("|")]
75
+ if len(parts) >= 3:
76
+ return parts[0].upper(), parts[1], parts[2]
77
+ return "", "", ""
78
+
79
+
80
+ def _dominant_key(member_clusters: List[Dict[str, Any]]) -> str:
81
+ rel_count: Dict[str, int] = {}
82
+ art_count: Dict[str, int] = {}
83
+ pur_count: Dict[str, int] = {}
84
+ for c in member_clusters:
85
+ rel, art, pur = _parse_key(c.get("cluster_key", ""))
86
+ if rel:
87
+ rel_count[rel] = rel_count.get(rel, 0) + 1
88
+ if art:
89
+ art_count[art] = art_count.get(art, 0) + 1
90
+ if pur:
91
+ pur_count[pur] = pur_count.get(pur, 0) + 1
92
+ rel = max(rel_count, key=rel_count.get) if rel_count else "USES"
93
+ art = max(art_count, key=art_count.get) if art_count else "contribution"
94
+ pur = max(pur_count, key=pur_count.get) if pur_count else "unspecified"
95
+ return f"{rel}|{art}|{pur}"
96
+
97
+
98
+ def _extract_title(meta: Any) -> str:
99
+ if isinstance(meta, list) and meta:
100
+ meta = meta[0]
101
+ if not isinstance(meta, dict):
102
+ return ""
103
+ return str(meta.get("title", ""))
104
+
105
+
106
+ def _title_from_cluster_key(cluster_key: str) -> str:
107
+ parts = [p.strip() for p in str(cluster_key or "").split("|")]
108
+ if len(parts) >= 3:
109
+ relation, artifact, purpose = parts[0], parts[1], parts[2]
110
+ relation_txt = "Uses" if relation.upper() == "USES" else "Extends"
111
+ artifact_txt = artifact.replace("_", " ")
112
+ purpose_txt = purpose.replace("_", " ")
113
+ return f"{relation_txt} {artifact_txt} for {purpose_txt}".strip()
114
+ return cluster_key or ""
115
+
116
+
117
+ def _cluster_by_exact_keys(keys: List[str]) -> List[List[int]]:
118
+ groups: Dict[str, List[int]] = {}
119
+ order: List[str] = []
120
+ for i, key in enumerate(keys):
121
+ k = (key or "").strip()
122
+ if not k:
123
+ k = f"__EMPTY__::{i}"
124
+ if k not in groups:
125
+ groups[k] = []
126
+ order.append(k)
127
+ groups[k].append(i)
128
+ return [groups[k] for k in order]
129
+
130
+
131
+ def _build_initial_clusters_from_contributions(contrib: Dict[str, Any]) -> List[Dict[str, Any]]:
132
+ contributions = [
133
+ c for c in contrib.get("contributions", []) or []
134
+ if c.get("label") in {"USES", "EXTENDS"} and (c.get("paper_claim") or c.get("claim"))
135
+ ]
136
+ if not contributions:
137
+ return []
138
+ cluster_keys_all: List[str] = []
139
+ for c in contributions:
140
+ key = (c.get("cluster_key") or "").strip()
141
+ if not key:
142
+ label = str(c.get("label", "USES")).upper()
143
+ if label not in {"USES", "EXTENDS"}:
144
+ label = "USES"
145
+ key = f"{label}|contribution|unspecified"
146
+ cluster_keys_all.append(key)
147
+ clusters = _cluster_by_exact_keys(cluster_keys_all)
148
+ out: List[Dict[str, Any]] = []
149
+ for idx, cluster in enumerate(clusters, start=1):
150
+ first = contributions[cluster[0]]
151
+ key = cluster_keys_all[cluster[0]]
152
+ title = (first.get("cluster_title") or "").strip() or _title_from_cluster_key(key)
153
+ out.append({
154
+ "cluster_id": f"C{idx}",
155
+ "count": str(len(cluster)),
156
+ "representative_claim": title,
157
+ "cluster_key": key,
158
+ "cluster_title": title,
159
+ "claim_indices": [str(i) for i in cluster],
160
+ })
161
+ return out
162
+
163
+
164
+ def _cluster_support_summary(cluster: Dict[str, Any], contributions: List[Dict[str, Any]]) -> Dict[str, Any]:
165
+ indices = _to_int_indices(cluster.get("claim_indices") or [])
166
+ items: List[Dict[str, Any]] = []
167
+ for i in indices:
168
+ if 0 <= i < len(contributions):
169
+ items.append(contributions[i])
170
+ labels = [str(item.get("label", "")).upper() for item in items if item.get("label")]
171
+ examples: List[str] = []
172
+ for item in items:
173
+ text = str(item.get("paper_claim") or item.get("claim") or "").strip()
174
+ if text:
175
+ examples.append(text)
176
+ if len(examples) >= 3:
177
+ break
178
+ rationales = [str(item.get("rationale", "")).strip() for item in items if item.get("rationale")][:2]
179
+ use_count = sum(1 for x in labels if x == "USES")
180
+ ext_count = sum(1 for x in labels if x == "EXTENDS")
181
+ return {
182
+ "examples": examples,
183
+ "rationales": rationales,
184
+ "uses_count": use_count,
185
+ "extends_count": ext_count,
186
+ "member_count": len(items),
187
+ }
188
+
189
+
190
+ def build_prompt(paper_title: str, centroids: List[Dict[str, Any]]) -> str:
191
+ lines: List[str] = [
192
+ "You are refining downstream citation contribution clusters for one target paper.",
193
+ "Input clusters are already built. Your job is to (a) conservatively merge near-duplicate downstream-usage clusters and (b) drop clusters that do not actually show substantive downstream usage of the target contribution.",
194
+ "",
195
+ f"Target paper: {paper_title}",
196
+ "",
197
+ "Rules:",
198
+ "- Operate only at cluster level. Do not invent new instances.",
199
+ "- Prefer conservative merges. If unsure, keep clusters separate.",
200
+ "- You may drop clusters only when they fail to show real downstream use or extension of the target contribution.",
201
+ "- Drop clusters that are clearly mere mention, loose comparison, background citation, noisy extraction, or off-target usage.",
202
+ "- Never merge USES and EXTENDS clusters together.",
203
+ "- Every input cluster_id must either appear in exactly one kept group or in dropped_clusters.",
204
+ "- kept merged_key must be in format RELATION|artifact|purpose.",
205
+ "- merged_title must be a short natural-language summary (5-12 words).",
206
+ "",
207
+ "Input clusters:",
208
+ ]
209
+ for c in centroids:
210
+ lines.append(
211
+ f"- {c['cluster_id']}: key={c.get('cluster_key','')}; title={c.get('cluster_title','')}; count={c.get('count', 0)}; uses={c.get('uses_count',0)}; extends={c.get('extends_count',0)}; examples={' | '.join(c.get('examples',[])[:2])}; rationales={' | '.join(c.get('rationales',[])[:1])}"
212
+ )
213
+ lines += [
214
+ "",
215
+ "Return JSON only with this shape:",
216
+ "{",
217
+ ' "kept_groups": [',
218
+ " {",
219
+ ' "cluster_ids": ["C1","C3"],',
220
+ ' "merged_title": "Uses target dataset for evaluation",',
221
+ ' "merged_key": "USES|dataset|evaluation",',
222
+ ' "rationale": "Both clusters describe the same downstream dataset use."',
223
+ " }",
224
+ " ],",
225
+ ' "dropped_clusters": [',
226
+ ' {"cluster_id": "C7", "reason": "Only background mention; no substantive downstream use."}',
227
+ " ]",
228
+ "}",
229
+ ]
230
+ return "\n".join(lines)
231
+
232
+
233
+ def _normalize_decision(data: Dict[str, Any], original_clusters: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
234
+ valid_ids = [c.get("cluster_id", "") for c in original_clusters if c.get("cluster_id")]
235
+ valid_set = set(valid_ids)
236
+ assigned = set()
237
+ kept: List[Dict[str, Any]] = []
238
+ dropped: List[Dict[str, Any]] = []
239
+
240
+ for item in data.get("dropped_clusters") or []:
241
+ cid = item.get("cluster_id")
242
+ if cid in valid_set and cid not in assigned:
243
+ assigned.add(cid)
244
+ dropped.append({"cluster_id": cid, "reason": str(item.get("reason", "")).strip() or "Dropped by LLM filter."})
245
+
246
+ for g in data.get("kept_groups") or []:
247
+ ids = [cid for cid in (g.get("cluster_ids") or []) if cid in valid_set and cid not in assigned]
248
+ if not ids:
249
+ continue
250
+ for cid in ids:
251
+ assigned.add(cid)
252
+ kept.append({
253
+ "cluster_ids": ids,
254
+ "merged_title": str(g.get("merged_title", "")).strip(),
255
+ "merged_key": str(g.get("merged_key", "")).strip(),
256
+ "rationale": str(g.get("rationale", "")).strip(),
257
+ })
258
+
259
+ for cid in valid_ids:
260
+ if cid not in assigned:
261
+ kept.append({
262
+ "cluster_ids": [cid],
263
+ "merged_title": "",
264
+ "merged_key": "",
265
+ "rationale": "Auto-singleton fallback.",
266
+ })
267
+
268
+ order = {cid: i for i, cid in enumerate(valid_ids)}
269
+ kept.sort(key=lambda g: min(order[cid] for cid in g["cluster_ids"]))
270
+ dropped.sort(key=lambda x: order.get(x["cluster_id"], 10**9))
271
+ return kept, dropped
272
+
273
+
274
+ def refine_paper(paper_dir: Path, overwrite: bool, inplace: bool) -> str:
275
+ disc_path = paper_dir / DISCOVERY_FILE
276
+ contrib_path = paper_dir / CONTRIB_FILE
277
+ meta_path = paper_dir / PAPER_META_FILE
278
+
279
+ disc = load_json(disc_path)
280
+ contrib = load_json(contrib_path)
281
+ meta = load_json(meta_path)
282
+ if not isinstance(contrib, dict):
283
+ return "missing_inputs"
284
+
285
+ if not isinstance(disc, dict):
286
+ disc = {"paper_id": contrib.get("paper_id"), "decision": "", "justification": "", "clusters": []}
287
+
288
+ clusters = disc.get("clusters") or []
289
+ if not clusters:
290
+ clusters = _build_initial_clusters_from_contributions(contrib)
291
+ if not clusters:
292
+ payload = dict(disc)
293
+ payload["clusters"] = []
294
+ payload["dropped_clusters"] = []
295
+ payload["cluster_refine_method"] = "llm_centroid_merge_filter"
296
+ payload["cluster_refine_source"] = CONTRIB_FILE
297
+ out_path = disc_path if inplace else (paper_dir / OUT_FILE)
298
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
299
+ return "empty_clusters"
300
+
301
+ out_path = disc_path if inplace else (paper_dir / OUT_FILE)
302
+ if out_path.exists() and not overwrite:
303
+ return "skipped"
304
+
305
+ contributions = contrib.get("contributions") or []
306
+ centroids: List[Dict[str, Any]] = []
307
+ auto_dropped: List[Dict[str, Any]] = []
308
+ active_clusters: List[Dict[str, Any]] = []
309
+
310
+ for c in clusters:
311
+ cid = c.get("cluster_id", "")
312
+ summary = _cluster_support_summary(c, contributions)
313
+ rel, _, _ = _parse_key(c.get("cluster_key", ""))
314
+ if summary["uses_count"] + summary["extends_count"] == 0 or rel not in {"USES", "EXTENDS"}:
315
+ auto_dropped.append({"cluster_id": cid, "reason": "No verified USES/EXTENDS support in member contributions."})
316
+ continue
317
+ row = {
318
+ "cluster_id": cid,
319
+ "cluster_key": c.get("cluster_key", ""),
320
+ "cluster_title": c.get("cluster_title") or c.get("representative_claim") or "",
321
+ "count": int(c.get("count", summary["member_count"]) or summary["member_count"]),
322
+ **summary,
323
+ }
324
+ centroids.append(row)
325
+ active_clusters.append(c)
326
+
327
+ if not active_clusters:
328
+ payload = dict(disc)
329
+ payload["clusters"] = []
330
+ payload["dropped_clusters"] = auto_dropped
331
+ payload["cluster_refine_method"] = "llm_centroid_merge_filter"
332
+ payload["cluster_refine_source"] = CONTRIB_FILE if not load_json(disc_path) else DISCOVERY_FILE
333
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
334
+ return "refined"
335
+
336
+ prompt = build_prompt(_extract_title(meta), centroids)
337
+ client = LLMClient()
338
+ raw = client.call(prompt, schema=REFINE_SCHEMA)
339
+ data = json.loads(raw)
340
+ kept_groups, llm_dropped = _normalize_decision(data, active_clusters)
341
+
342
+ id_to_cluster = {c.get("cluster_id"): c for c in active_clusters if c.get("cluster_id")}
343
+ merged_clusters: List[Dict[str, Any]] = []
344
+ for idx, g in enumerate(kept_groups, start=1):
345
+ member_ids = g["cluster_ids"]
346
+ members = [id_to_cluster[mid] for mid in member_ids if mid in id_to_cluster]
347
+ merged_indices: List[int] = []
348
+ for m in members:
349
+ for i in _to_int_indices(m.get("claim_indices") or []):
350
+ if i not in merged_indices:
351
+ merged_indices.append(i)
352
+ merged_indices.sort()
353
+ merged_key = g.get("merged_key") or _dominant_key(members)
354
+ rel, _, _ = _parse_key(merged_key)
355
+ if rel not in {"USES", "EXTENDS"}:
356
+ merged_key = _dominant_key(members)
357
+ merged_title = g.get("merged_title") or (members[0].get("cluster_title") if members else "")
358
+ if not merged_title:
359
+ merged_title = members[0].get("representative_claim", "") if members else ""
360
+ merged_clusters.append({
361
+ "cluster_id": f"C{idx}",
362
+ "count": str(len(merged_indices)),
363
+ "representative_claim": merged_title,
364
+ "cluster_key": merged_key,
365
+ "cluster_title": merged_title,
366
+ "claim_indices": [str(i) for i in merged_indices],
367
+ "source_cluster_ids": member_ids,
368
+ "merge_rationale": g.get("rationale", ""),
369
+ })
370
+
371
+ payload = dict(disc)
372
+ payload["clusters"] = merged_clusters
373
+ payload["dropped_clusters"] = auto_dropped + llm_dropped
374
+ payload["cluster_refine_method"] = "llm_centroid_merge_filter"
375
+ payload["cluster_refine_source"] = CONTRIB_FILE if not load_json(disc_path) else DISCOVERY_FILE
376
+ out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
377
+ return "refined"
378
+
379
+
380
+ def main() -> None:
381
+ parser = argparse.ArgumentParser(description="LLM centroid-level merge/filter pass for downstream contribution clusters.")
382
+ parser.add_argument("--root", type=str, default="runs/processed_papers", help="Root directory containing processed paper directories.")
383
+ parser.add_argument("--overwrite", action="store_true", help="Overwrite output file if it exists.")
384
+ parser.add_argument("--inplace", action="store_true", help="Write back to usage_discovery_from_contributions.json.")
385
+ args = parser.parse_args()
386
+
387
+ root = Path(args.root).expanduser().resolve()
388
+ if not root.exists():
389
+ raise SystemExit(f"Root directory does not exist: {root}")
390
+
391
+ paper_dirs = iter_paper_dirs(root)
392
+ print(f"[INFO] Found {len(paper_dirs)} paper dirs under {root}")
393
+ counts = {"refined": 0, "skipped": 0, "missing_inputs": 0, "empty_clusters": 0}
394
+ for paper_dir in paper_dirs:
395
+ status = refine_paper(paper_dir, overwrite=args.overwrite, inplace=args.inplace)
396
+ counts[status] = counts.get(status, 0) + 1
397
+ print(f"[{status.upper()}] {paper_dir.name}")
398
+ print("[SUMMARY] refined={refined}, skipped={skipped}, missing_inputs={missing_inputs}, empty_clusters={empty_clusters}".format(**counts))
399
+
400
+
401
+ if __name__ == "__main__":
402
+ main()
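
One robustness note: refine_paper parses the LLM reply with a bare json.loads, whereas the earlier scripts route replies through _parse_llm_json, which also salvages fenced or prose-wrapped JSON. A hedged sketch of applying the same salvage strategy here (copied from the sibling scripts, not currently wired into this file) would be:

    import json
    from typing import Any, Optional

    def parse_llm_json(raw: str) -> Optional[Any]:
        # Same salvage order as _parse_llm_json in the sibling scripts: strict JSON,
        # then strip a leading code fence, then cut down to the outermost brace pair.
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            pass
        cleaned = raw.strip()
        if cleaned.startswith("```"):
            cleaned = cleaned.strip("`")
            cleaned = cleaned.replace("json", "", 1).strip()
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end <= start:
            return None
        try:
            return json.loads(cleaned[start:end + 1])
        except json.JSONDecodeError:
            return None

    raw = "```json\n{\"kept_groups\": [], \"dropped_clusters\": []}\n```"
    print(parse_llm_json(raw))  # -> {'kept_groups': [], 'dropped_clusters': []}
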
src/step_07_extract_and_refine/schemas.py ADDED
@@ -0,0 +1,12 @@
1
+ CONTRIBUTION_JSON_SCHEMA = {
2
+ "type": "object",
3
+ "properties": {
4
+ "label": {"type": "string", "enum": ["USES", "EXTENDS", "NOT_CONFIRMED"]},
5
+ "paper_claim": {"type": "string"},
6
+ "cluster_title": {"type": "string"},
7
+ "cluster_key": {"type": "string"},
8
+ "evidence_span": {"type": "string"},
9
+ "rationale": {"type": "string"},
10
+ },
11
+ "required": ["label", "paper_claim", "cluster_title", "cluster_key", "evidence_span", "rationale"],
12
+ }
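
The schema above is handed to LLMClient.call as a structured-output hint. For an offline sanity check of a candidate reply, one could also validate it locally; the sketch below uses the third-party jsonschema package, which is an extra assumption here and not something the pipeline itself depends on:

    # Requires `pip install jsonschema`; purely a local sanity check, not pipeline code.
    from jsonschema import ValidationError, validate

    from schemas import CONTRIBUTION_JSON_SCHEMA  # assumes this directory is importable

    reply = {
        "label": "USES",
        "paper_claim": "Trains on the target paper's dataset.",
        "cluster_title": "Uses target dataset for evaluation",
        "cluster_key": "USES|dataset|evaluation",
        "evidence_span": "We train on the dataset of <CITED HERE>.",
        "rationale": "Explicit adoption of the released dataset.",
    }
    try:
        validate(instance=reply, schema=CONTRIBUTION_JSON_SCHEMA)
        print("reply matches CONTRIBUTION_JSON_SCHEMA")
    except ValidationError as exc:
        print("schema violation:", exc.message)
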
src/step_08_annotation/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .pipeline import TwoPassAnnotationPipeline, TwoPassPipelineResult
2
+
3
+ __all__ = ["TwoPassAnnotationPipeline", "TwoPassPipelineResult"]
src/step_08_annotation/cli.py ADDED
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import typer
7
+
8
+ from .paper_package import load_paper_package
9
+
10
+ from .pipeline import TwoPassAnnotationPipeline
11
+
12
+
13
+ app = typer.Typer(help="Run step 8: derive target contributions, enabling contributions, and groundings.")
14
+
15
+
16
+ def _default_output_root() -> Path:
17
+ return Path("runs/two_pass_outputs")
18
+
19
+
20
+ @app.command()
21
+ def run(
22
+ paper_dir: Path = typer.Option(..., exists=True, file_okay=False, dir_okay=True),
23
+ provider: str = typer.Option("openai", help="Provider family: openai or gemini."),
24
+ model: str = typer.Option("openai/gpt-5", help="Reasoning model used for target-contribution derivation and annotation."),
25
+ formatter_model: str | None = typer.Option(
26
+ None,
27
+ help="Optional model override for pass 2 formatting, e.g. openai/gpt-5-mini or openai/gpt-5.4-pro.",
28
+ ),
29
+ judge_model: str | None = typer.Option(
30
+ None,
31
+ help="Optional model override for pass 1 candidate ranking. Ignored when --candidate-count=1.",
32
+ ),
33
+ candidate_count: int = typer.Option(
34
+ 1,
35
+ help="Number of reasoning candidates to generate. If set to 1, no judge call is made.",
36
+ ),
37
+ formatter_max_attempts: int = typer.Option(
38
+ 3,
39
+ help="Formatter-only retry attempts after pass 1 has succeeded.",
40
+ ),
41
+ include_reference_examples: bool = typer.Option(
42
+ True,
43
+ "--include-reference-examples/--no-include-reference-examples",
44
+ help="Include the built-in reference examples in the pass-1 reasoning prompt.",
45
+ ),
46
+ prompt_profile: str = typer.Option(
47
+ "full",
48
+ help="Reasoning prompt profile: full or generic.",
49
+ ),
50
+ output_root: Path = typer.Option(
51
+ _default_output_root(),
52
+ help="Directory to store run outputs.",
53
+ ),
54
+ run_label: str | None = typer.Option(None, help="Optional label to include in the saved run directory name."),
55
+ annotator_id: str = typer.Option("llm", help="Annotator id to embed in the final UI payload."),
56
+ extracted_claim: str | None = typer.Option(None, help="Optional override for the extracted target contribution."),
57
+ ) -> None:
58
+ paper = load_paper_package(paper_dir, extracted_claim_override=extracted_claim)
59
+ pipeline = TwoPassAnnotationPipeline(
60
+ provider=provider,
61
+ model=model,
62
+ formatter_model=formatter_model,
63
+ judge_model=judge_model,
64
+ output_root=output_root,
65
+ run_label=run_label,
66
+ annotator_id=annotator_id,
67
+ candidate_count=candidate_count,
68
+ formatter_max_attempts=formatter_max_attempts,
69
+ include_reference_examples=include_reference_examples,
70
+ prompt_profile=prompt_profile,
71
+ progress_callback=typer.echo,
72
+ )
73
+ result = pipeline.run(paper)
74
+ typer.echo(str(result.run_dir / "run_output.json"))
75
+
76
+
77
+ @app.command()
78
+ def summarize(run_output: Path = typer.Option(..., exists=True, dir_okay=False, file_okay=True)) -> None:
79
+ data = json.loads(run_output.read_text())
80
+ payload = data.get("ui_payload") or {}
81
+ claims = payload.get("claims") or []
82
+ summary = {
83
+ "paper_id": data.get("paper_id"),
84
+ "target_contribution_count": len(claims),
85
+ "target_contributions": [
86
+ {
87
+ "claim_id": claim.get("claim_id"),
88
+ "rewritten_claim": claim.get("rewritten_claim"),
89
+ "decision": claim.get("decision"),
90
+ "enabling_contribution_count": len(claim.get("ingredients") or []),
91
+ }
92
+ for claim in claims
93
+ ],
94
+ }
95
+ typer.echo(json.dumps(summary, indent=2))
96
+
97
+
98
+ if __name__ == "__main__":
99
+ app()
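A minimal sketch of invoking these commands in-process with Typer's bundled test runner; the import path assumes the repository root is on sys.path, and the paper directory is a placeholder rather than part of this commit:

    from typer.testing import CliRunner

    from src.step_08_annotation.cli import app  # import path assumes the repo-root layout of this commit

    runner = CliRunner()

    # Equivalent to: run --paper-dir <dir> --candidate-count 3 --run-label demo
    result = runner.invoke(
        app,
        ["run", "--paper-dir", "runs/processed_papers/example_paper",
         "--candidate-count", "3", "--run-label", "demo"],
    )
    print(result.output)  # the command's last echoed line is the run_output.json path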
src/step_08_annotation/final_prompts.py ADDED
The diff for this file is too large to render. See raw diff
 
src/step_08_annotation/paper_package.py ADDED
@@ -0,0 +1,52 @@
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+ from common.paper_package import (
+     PaperPackage,
+     _collect_bibliography,
+     _collect_citation_contexts,
+     _collect_full_processed_text,
+     _collect_sections,
+     _load_json,
+     _normalize_dict_payload,
+ )
+
+
+ def _collect_all_cluster_evidence(paper_dir: Path) -> List[Dict[str, Any]]:
+     discovery = _normalize_dict_payload(_load_json(paper_dir / "usage_discovery_from_contributions.json", {}))
+     clusters = discovery.get("clusters", [])
+     out = []
+     for cluster in clusters:
+         out.append(
+             {
+                 "cluster_id": cluster.get("cluster_id"),
+                 "representative_claim": cluster.get("representative_claim") or cluster.get("cluster_title"),
+                 "cluster_title": cluster.get("cluster_title"),
+                 "count": cluster.get("count"),
+                 "cluster_key": cluster.get("cluster_key"),
+                 "claim_indices": cluster.get("claim_indices", []),
+                 "source_cluster_ids": cluster.get("source_cluster_ids", []),
+                 "merge_rationale": cluster.get("merge_rationale"),
+             }
+         )
+     return out
+
+
+ def load_paper_package(paper_dir: str | Path, extracted_claim_override: str | None = None) -> PaperPackage:
+     paper_dir = Path(paper_dir)
+     paper_metadata = _normalize_dict_payload(_load_json(paper_dir / "paper_metadata.json", {}))
+     cluster_evidence = _collect_all_cluster_evidence(paper_dir)
+     seed = extracted_claim_override or ""
+     return PaperPackage(
+         paper_dir=paper_dir,
+         paper_metadata=paper_metadata,
+         extracted_discovery_claim=seed,
+         downstream_cluster_evidence=cluster_evidence,
+         paper_text=_collect_sections(paper_dir),
+         full_processed_text=_collect_full_processed_text(paper_dir),
+         bibliography=_collect_bibliography(paper_dir),
+         citation_contexts=_collect_citation_contexts(paper_dir),
+     )
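For orientation, _collect_all_cluster_evidence only forwards the keys it reads from each cluster entry. A made-up discovery file that would satisfy it, written here via Python for illustration only (field values are invented, not taken from this repository):

    import json
    from pathlib import Path

    # Hypothetical minimal usage_discovery_from_contributions.json; only the keys
    # accessed by _collect_all_cluster_evidence are shown.
    example = {
        "clusters": [
            {
                "cluster_id": "c1",
                "cluster_key": "pretrained-checkpoint",
                "cluster_title": "Pretrained checkpoint reuse",
                "representative_claim": "The paper fine-tunes a released checkpoint.",
                "count": 4,
                "claim_indices": [0, 2, 5, 7],
                "source_cluster_ids": ["raw_3", "raw_9"],
                "merge_rationale": "Both raw clusters describe the same checkpoint.",
            }
        ]
    }
    Path("usage_discovery_from_contributions.json").write_text(json.dumps(example, indent=2))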
src/step_08_annotation/pipeline.py ADDED
@@ -0,0 +1,256 @@
+ from __future__ import annotations
+
+ import json
+ import traceback
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Callable, Dict
+
+ from common.model_client import ModelConfig, MultiProviderLLMClient
+ from common.paper_package import PaperPackage
+
+ from .final_prompts import (
+     SYSTEM_TWO_PASS_FORMATTER,
+     SYSTEM_TWO_PASS_JUDGE,
+     SYSTEM_TWO_PASS_REASONING,
+     formatter_prompt,
+     judge_prompt,
+     reasoning_prompt,
+ )
+ from .schemas import JudgeResult, UIPayload
+
+
+ @dataclass
+ class TwoPassPipelineResult:
+     run_dir: Path
+     result: Dict[str, Any]
+
+
+ class FormatterStageError(RuntimeError):
+     def __init__(self, message: str, run_dir: Path):
+         super().__init__(message)
+         self.run_dir = run_dir
+
+
+ class TwoPassAnnotationPipeline:
+     def __init__(
+         self,
+         *,
+         provider: str,
+         model: str,
+         formatter_model: str | None,
+         judge_model: str | None,
+         output_root: Path,
+         run_label: str | None = None,
+         annotator_id: str = "llm",
+         temperature: float = 0.2,
+         max_tokens: int = 16000,
+         candidate_count: int = 1,
+         formatter_max_attempts: int = 3,
+         include_reference_examples: bool = True,
+         prompt_profile: str = "full",
+         progress_callback: Callable[[str], None] | None = None,
+     ):
+         self.output_root = output_root
+         self.annotator_id = annotator_id
+         self.progress_callback = progress_callback
+         self.run_label = run_label
+         self.candidate_count = max(1, candidate_count)
+         self.formatter_max_attempts = max(1, formatter_max_attempts)
+         self.include_reference_examples = include_reference_examples
+         self.prompt_profile = prompt_profile
+         self.use_judge = self.candidate_count > 1
+         stage_models = {}
+         if formatter_model:
+             stage_models["two_pass_formatter"] = formatter_model
+         if judge_model and self.use_judge:
+             stage_models["two_pass_judge"] = judge_model
+         self.client = MultiProviderLLMClient(
+             default_config=ModelConfig(
+                 provider=provider,
+                 model=model,
+                 temperature=temperature,
+                 max_tokens=max_tokens,
+             ),
+             stage_models=stage_models,
+         )
+
+     def run(self, paper: PaperPackage) -> TwoPassPipelineResult:
+         run_dir = self._make_run_dir(paper)
+         payload = {
+             **paper.to_prompt_payload(),
+             "paper_dir": paper.paper_dir,
+             "full_processed_text": self._load_full_processed_text(paper),
+         }
+         formatter_config = self.client.config_for_stage("two_pass_formatter")
+         self._log(
+             f"[run] paper={paper.paper_dir.name} provider={self.client.default_config.provider} model={self.client.default_config.model_name}"
+         )
+         self._log(f"[run] formatter_model={formatter_config.model_name}")
+         self._log(f"[run] include_reference_examples={self.include_reference_examples}")
+         self._log(f"[run] prompt_profile={self.prompt_profile}")
+         if self.use_judge:
+             judge_config = self.client.config_for_stage("two_pass_judge")
+             self._log(f"[run] judge_model={judge_config.model_name}")
+         else:
+             self._log("[run] judge_model=disabled (candidate_count=1)")
+         self._log(f"[run] output={run_dir}")
+
+         reasoning_user_prompt = reasoning_prompt(payload, include_reference_examples=self.include_reference_examples, prompt_profile=self.prompt_profile)
+         self._write_text(run_dir / "pass_1_reasoning.prompt.txt", reasoning_user_prompt)
+         self._log(
+             f"[pass 1] free-form reasoning ({self.candidate_count} candidate{'s' if self.candidate_count != 1 else ''})"
+         )
+
+         candidate_texts: list[str] = []
+         candidate_paths: list[str] = []
+         for index in range(self.candidate_count):
+             reasoning_text = self.client.generate_text(
+                 stage_name="two_pass_reasoning",
+                 system_prompt=SYSTEM_TWO_PASS_REASONING,
+                 user_prompt=reasoning_user_prompt,
+             )
+             candidate_id = f"candidate_{index + 1}"
+             candidate_path = run_dir / f"pass_1_reasoning.output.{candidate_id}.md"
+             self._write_text(candidate_path, reasoning_text)
+             candidate_texts.append(reasoning_text)
+             candidate_paths.append(str(candidate_path))
+
+         selected_candidate_index = 0
+         selected_candidate_id = "candidate_1"
+         selected_reasoning_text = candidate_texts[0]
+         judge_output_path: Path | None = None
+
+         if self.use_judge:
+             judge_user_prompt = judge_prompt(payload, candidate_texts)
+             self._write_text(run_dir / "pass_1_reasoning.judge.prompt.txt", judge_user_prompt)
+             self._log("[pass 1] candidate judging")
+             judge_result = self.client.generate_structured(
+                 stage_name="two_pass_judge",
+                 system_prompt=SYSTEM_TWO_PASS_JUDGE,
+                 user_prompt=judge_user_prompt,
+                 response_model=JudgeResult,
+             )
+             judge_output_path = run_dir / "pass_1_reasoning.judge.output.json"
+             self._write_json(judge_output_path, judge_result.model_dump())
+             selected_candidate_index = judge_result.selected_candidate_index
+             selected_candidate_id = judge_result.selected_candidate_id
+             selected_reasoning_text = candidate_texts[selected_candidate_index]
+
+         selected_reasoning_path = run_dir / "pass_1_reasoning.selected.md"
+         self._write_text(selected_reasoning_path, selected_reasoning_text)
+
+         formatter_user_prompt = formatter_prompt(payload, selected_reasoning_text, self.annotator_id)
+         self._write_text(run_dir / "pass_2_formatter.prompt.txt", formatter_user_prompt)
+         final_payload: UIPayload | None = None
+         formatter_attempts: list[dict[str, Any]] = []
+         for attempt in range(1, self.formatter_max_attempts + 1):
+             self._log(
+                 f"[pass 2] strict ui json formatting (attempt {attempt}/{self.formatter_max_attempts})"
+             )
+             try:
+                 final_payload = self.client.generate_structured(
+                     stage_name="two_pass_formatter",
+                     system_prompt=SYSTEM_TWO_PASS_FORMATTER,
+                     user_prompt=formatter_user_prompt,
+                     response_model=UIPayload,
+                 )
+                 formatter_attempts.append({"attempt": attempt, "status": "success"})
+                 break
+             except Exception as exc:
+                 error_text = "".join(traceback.format_exception(exc)).strip()
+                 error_path = run_dir / f"pass_2_formatter.attempt_{attempt}.error.txt"
+                 self._write_text(error_path, error_text)
+                 formatter_attempts.append(
+                     {
+                         "attempt": attempt,
+                         "status": "failed",
+                         "error": str(exc),
+                         "error_path": str(error_path),
+                     }
+                 )
+                 if attempt < self.formatter_max_attempts:
+                     self._log("[pass 2] formatter failed; retrying formatter only")
+
+         if final_payload is None:
+             self._write_json(run_dir / "formatter_attempts.json", {"attempts": formatter_attempts})
+             raise FormatterStageError(
+                 f"Formatter failed after {self.formatter_max_attempts} attempts; pass 1 outputs kept in {run_dir}",
+                 run_dir,
+             )
+
+         self._write_json(run_dir / "formatter_attempts.json", {"attempts": formatter_attempts})
+         self._write_json(run_dir / "pass_2_ui_payload.json", final_payload.model_dump())
+
+         result = {
+             "paper_id": paper.paper_dir.name,
+             "paper_dir": str(paper.paper_dir),
+             "generated_at": datetime.now(timezone.utc).isoformat(),
+             "reasoner_model": self.client.default_config.model_name,
+             "formatter_model": formatter_config.model_name,
+             "judge_model": judge_config.model_name if self.use_judge else None,
+             "candidate_count": self.candidate_count,
+             "include_reference_examples": self.include_reference_examples,
+             "prompt_profile": self.prompt_profile,
+             "reasoning_candidate_paths": [str(path) for path in candidate_paths],
+             "selected_reasoning_candidate": selected_candidate_id,
+             "selected_candidate_index": selected_candidate_index,
+             "selected_reasoning_path": str(selected_reasoning_path),
+             "judge_output_path": str(judge_output_path) if judge_output_path is not None else None,
+             "formatter_attempts": formatter_attempts,
+             "ui_payload_path": str(run_dir / "pass_2_ui_payload.json"),
+             "ui_payload": final_payload.model_dump(),
+         }
+         self._write_json(run_dir / "run_output.json", result)
+         self._log("[run] complete")
+         return TwoPassPipelineResult(run_dir=run_dir, result=result)
+
+     def _make_run_dir(self, paper: PaperPackage) -> Path:
+         stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+         run_name = stamp
+         if self.run_label:
+             run_name = f"{self._slugify(self.run_label)}__{stamp}"
+         run_dir = self.output_root / paper.paper_dir.name / run_name
+         run_dir.mkdir(parents=True, exist_ok=True)
+         return run_dir
+
+     def _write_json(self, path: Path, payload: Dict[str, Any]) -> None:
+         path.parent.mkdir(parents=True, exist_ok=True)
+         path.write_text(json.dumps(payload, indent=2, ensure_ascii=True) + "\n")
+
+     def _write_text(self, path: Path, text: str) -> None:
+         path.parent.mkdir(parents=True, exist_ok=True)
+         path.write_text(text)
+
+     def _log(self, message: str) -> None:
+         if self.progress_callback:
+             self.progress_callback(message)
+
+     @staticmethod
+     def _slugify(value: str) -> str:
+         slug = "".join(ch if ch.isalnum() or ch in {"-", "_", "."} else "-" for ch in value.strip())
+         slug = "-".join(part for part in slug.split("-") if part)
+         return slug[:160] or "run"
+
+     def _load_full_processed_text(self, paper: PaperPackage) -> str:
+         processed_path = paper.paper_dir / "processed_main.tex"
+         if processed_path.exists():
+             try:
+                 return processed_path.read_text()
+             except Exception:
+                 pass
+
+         sections_dir = paper.paper_dir / "sections"
+         parts: list[str] = []
+         if sections_dir.exists():
+             for path in sorted(sections_dir.iterdir()):
+                 if not path.is_file():
+                     continue
+                 try:
+                     text = path.read_text().strip()
+                 except Exception:
+                     continue
+                 if text:
+                     parts.append(f"[{path.name}]\n{text}")
+         return "\n\n".join(parts)
src/step_08_annotation/schemas.py ADDED
@@ -0,0 +1,127 @@
+ from __future__ import annotations
+
+ from typing import Any, Dict, List, Literal, Optional
+
+ from pydantic import BaseModel
+
+
+ ALLOWED_ARTIFACT_TYPES = ["Resource", "Finding", "Method", "Benchmark", "Dataset", "Tool", "Other"]
+ ALLOWED_ROLES = [
+     "CONCEPTUAL_FRAMEWORK",
+     "CORE_METHOD",
+     "DATA_SOURCE",
+     "MODEL_INITIALIZATION",
+     "EVALUATION_PROTOCOL",
+ ]
+
+
+ class ReasoningCandidate(BaseModel):
+     study: str
+     decision: Literal["accepted_canonical", "accepted_additional", "accepted_none", "rejected_candidate"]
+     why: str
+
+
+ class ReasoningIngredient(BaseModel):
+     ingredient_id: str
+     ingredient: str
+     necessary: bool
+     from_prior_work: bool
+     maps_cleanly_to_one_study: bool
+     notes: str
+     canonical_grounding_decision: Dict[str, Any]
+     additional_groundings: List[Dict[str, Any]]
+     candidate_studies_considered: List[ReasoningCandidate]
+     role: Optional[Literal["CONCEPTUAL_FRAMEWORK", "CORE_METHOD", "DATA_SOURCE", "MODEL_INITIALIZATION", "EVALUATION_PROTOCOL", "IMPLEMENTATION_TOOLING", "TRAINING_DATA"]] = None
+     contribution: str
+     rationale: str
+     evidence_span: str
+
+
+ class ReasoningClaim(BaseModel):
+     claim_id: str
+     artifact_type: Literal["Resource", "Finding", "Method", "Benchmark", "Dataset", "Tool", "Other"]
+     rewritten_claim: str
+     cluster_id: str = ""
+     decision: Literal["YES_SUFFICIENT", "NO_NOT_DISCOVERY"] = "YES_SUFFICIENT"
+     notes: str = ""
+     why_this_is_atomic: str
+     ingredients: List[ReasoningIngredient]
+
+
+ class ReasoningOutput(BaseModel):
+     original_discovery_claim: str
+     claim_split_decision: Dict[str, Any]
+     rewritten_claims: List[ReasoningClaim]
+     paper_level_notes: str = ""
+
+
+ class GroundingRecord(BaseModel):
+     ref_id: Optional[str] = None
+     bib_key: Optional[str] = None
+     paper_id: Optional[str] = None
+     external_ids: Optional[Dict[str, Any]] = None
+     ref_title: Optional[str] = None
+     ref_year: Optional[str] = None
+     ref_authors: Optional[str] = None
+
+
+ class CanonicalAnnotation(BaseModel):
+     role: Optional[Literal["CONCEPTUAL_FRAMEWORK", "CORE_METHOD", "DATA_SOURCE", "MODEL_INITIALIZATION", "EVALUATION_PROTOCOL", "IMPLEMENTATION_TOOLING", "TRAINING_DATA"]] = None
+     roles: List[Literal["CONCEPTUAL_FRAMEWORK", "CORE_METHOD", "DATA_SOURCE", "MODEL_INITIALIZATION", "EVALUATION_PROTOCOL", "IMPLEMENTATION_TOOLING", "TRAINING_DATA"]]
+     contribution: str
+     rationale: str
+     evidence_span: str
+
+
+ class IngredientPayload(BaseModel):
+     ingredient_id: str
+     ingredient: str
+     canonical_ref_id: str
+     canonical_grounding: Optional[GroundingRecord] = None
+     additional_ref_ids: List[str]
+     additional_groundings: List[GroundingRecord]
+     canonical_annotation: CanonicalAnnotation
+
+
+ class EnablingDiscoveryPayload(GroundingRecord):
+     ingredient_id: str
+     ingredient: str
+     role: Optional[Literal["CONCEPTUAL_FRAMEWORK", "CORE_METHOD", "DATA_SOURCE", "MODEL_INITIALIZATION", "EVALUATION_PROTOCOL", "IMPLEMENTATION_TOOLING", "TRAINING_DATA"]] = None
+     roles: List[Literal["CONCEPTUAL_FRAMEWORK", "CORE_METHOD", "DATA_SOURCE", "MODEL_INITIALIZATION", "EVALUATION_PROTOCOL", "IMPLEMENTATION_TOOLING", "TRAINING_DATA"]]
+     contribution: str
+     rationale: str
+     evidence_span: str
+
+
+ class ClaimPayload(BaseModel):
+     claim_id: str
+     text: str
+     rewritten_claim: str
+     cluster_id: str = ""
+     decision: Literal["YES_SUFFICIENT", "NO_NOT_DISCOVERY", "UNCERTAIN"] = "YES_SUFFICIENT"
+     notes: str = ""
+     ingredients: List[IngredientPayload]
+     enabling_discoveries: List[EnablingDiscoveryPayload]
+
+
+ class JudgeCandidateScore(BaseModel):
+     candidate_id: str
+     candidate_index: int
+     score: int
+     assessment: str
+
+
+ class JudgeResult(BaseModel):
+     selected_candidate_index: int
+     selected_candidate_id: str
+     selected_reason: str
+     candidate_scores: List[JudgeCandidateScore]
+
+
+ class UIPayload(BaseModel):
+     target_paper_id: str
+     target_title: Optional[str] = None
+     target_year: Optional[int] = None
+     annotator_id: str
+     active_claim_id: Optional[str] = None
+     claims: List[ClaimPayload]
+ claims: List[ClaimPayload]