File size: 5,593 Bytes
0a55f0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import time
import random
import requests
from typing import Optional, Tuple, Any
from config import SEMANTIC_SCHOLAR_API_KEY
import os

BASE_URL = "https://api.semanticscholar.org/graph/v1/paper"

_LAST_REQUEST_TS = 0.0


def _min_interval_sleep() -> None:
    """Global throttle to avoid hammering Semantic Scholar."""
    global _LAST_REQUEST_TS
    # Minimum spacing between requests, configurable via env var (seconds).
    interval = float(os.getenv("S2_MIN_INTERVAL", "1.0"))
    remaining = interval - (time.monotonic() - _LAST_REQUEST_TS)
    if remaining > 0:
        time.sleep(remaining)
    _LAST_REQUEST_TS = time.monotonic()


def robust_request(url, params=None, headers=None, max_retries=8, base_sleep=2.0):
    """
    Make a GET request with exponential backoff.

    Retries on:
        - connection errors
        - 429 (Too Many Requests), honoring Retry-After when present
        - 500-599 server errors
        - invalid JSON in a 200 response

    Args:
        url: Full request URL.
        params: Optional query-string parameters.
        headers: Optional request headers.
        max_retries: Maximum number of attempts before giving up.
        base_sleep: Base delay in seconds for exponential backoff.

    Returns:
        (status_code, parsed_json) on success,
        (status_code, None) for non-retryable HTTP statuses,
        (None, None) after exhausting all retries.
    """

    def _backoff(attempt: int) -> float:
        # Exponential backoff capped at S2_MAX_BACKOFF seconds, plus jitter
        # to de-synchronize concurrent clients.
        max_sleep = float(os.getenv("S2_MAX_BACKOFF", "60"))
        return min(base_sleep * (2 ** attempt), max_sleep) + random.uniform(0.0, 0.5)

    for attempt in range(max_retries):
        try:
            _min_interval_sleep()
            resp = requests.get(url, params=params, headers=headers, timeout=30)
            status = resp.status_code

            if status == 200:
                try:
                    return 200, resp.json()
                except Exception:
                    # BUG FIX: the original only printed a warning here, then
                    # fell through past the 429/5xx checks and returned
                    # (200, None) on the first bad payload — despite the
                    # docstring's promise to retry on invalid JSON.
                    sleep = _backoff(attempt)
                    print(f"[WARN] JSON decode failed on attempt {attempt+1}/{max_retries}")
                    time.sleep(sleep)
                    continue

            if status == 429:
                retry_after = resp.headers.get("Retry-After")
                if retry_after:
                    try:
                        sleep = float(retry_after)
                    except Exception:
                        sleep = base_sleep * (2 ** attempt)
                else:
                    sleep = base_sleep * (2 ** attempt)
                # Cap even a server-supplied Retry-After, and add jitter.
                max_sleep = float(os.getenv("S2_MAX_BACKOFF", "60"))
                sleep = min(sleep, max_sleep)
                sleep += random.uniform(0.0, 0.5)
                print(f"[WARN] 429 Too Many Requests → retrying in {sleep:.2f}s")
                time.sleep(sleep)
                continue

            if 500 <= status < 600:
                sleep = _backoff(attempt)
                print(f"[WARN] Server error {status} → retrying in {sleep:.2f}s")
                time.sleep(sleep)
                continue

            # Any other status (e.g. 400/403/404) is not retryable.
            return status, None

        except requests.exceptions.RequestException as e:
            # Covers connection errors, timeouts, DNS failures, etc.
            sleep = _backoff(attempt)
            print(f"[WARN] Network error {e} → retrying in {sleep:.2f}s")
            time.sleep(sleep)
            continue

    print(f"[ERROR] Giving up after {max_retries} attempts for URL: {url}")
    return None, None
def get_paper(paper_id: str, id_type: str = "ACL") -> Tuple[int, Optional[dict]]:
    """
    Fetch a single paper record from the Semantic Scholar Graph API.

    id_type can be "ACL" or "SemanticScholar" or "ArXiv" etc. A native
    "SemanticScholar" id is used verbatim; any other type is sent as the
    prefixed form "<id_type>:<paper_id>".

    Returns (status_code, paper_dict) on success, otherwise
    (status_code_or_0, None).
    """
    full_id = paper_id if id_type == "SemanticScholar" else f"{id_type}:{paper_id}"

    requested_fields = (
        "title,year,publicationDate,authors,url,venue,externalIds,"
        "tldr,abstract,citationCount,referenceCount,openAccessPdf"
    )
    auth_headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}

    status, data = robust_request(
        f"{BASE_URL}/{full_id}",
        params={"fields": requested_fields},
        headers=auth_headers,
        max_retries=5,
        base_sleep=1.0,
    )

    if status != 200 or data is None:
        print(f"[WARN] {status} on {full_id}")
        return status or 0, None
    return status, data


def get_paper_links(semantic_id: str, target_type: str, total: int, limit: int = 1000):
    """
    Page through a linked-paper sub-resource of a paper and collect all items.

    Args:
        semantic_id: Native Semantic Scholar paper id.
        target_type: Sub-resource path segment appended to the paper URL
            (presumably "citations" or "references" — confirm with callers).
        total: Expected total number of linked items; drives how many pages
            are requested. 0 (or falsy) means no requests are made.
        limit: Page size per request.

    Returns:
        (200, collected_items) on success, (status_or_0, []) on any failure
        or malformed response.
    """
    headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
    # BUG FIX: ceiling division. The old `total // limit + 1` issued one
    # extra, guaranteed-empty request whenever total was an exact multiple
    # of limit (e.g. total=1000, limit=1000 → 2 requests instead of 1).
    loops = (total + limit - 1) // limit if total else 0
    collected = []

    for i in range(loops):
        offset = i * limit
        url = f"{BASE_URL}/{semantic_id}/{target_type}"
        params = {
            "offset": offset,
            "limit": limit,
            "fields": "paperId,title,isInfluential,externalIds,contextsWithIntent,openAccessPdf",
        }

        status, data = robust_request(url, params=params, headers=headers, max_retries=5, base_sleep=1.0)

        if status != 200 or data is None:
            print(f"[WARN] {target_type} fetch failed for {semantic_id} (status {status})")
            return status or 0, []

        items = data.get("data")
        if not isinstance(items, list):
            # Defensive: the API should always return a list under "data".
            print(f"[WARN] malformed {target_type} response for {semantic_id}")
            return status, []

        collected.extend(items)

    return 200, collected


def search_by_title(title: str, limit: int = 1):
    """
    Search Semantic Scholar by paper title.

    Args:
        title: Title text to search for.
        limit: Maximum number of results to request from the API.

    Returns:
        The first matching result dict, or None when the search fails or
        returns no results.
    """
    # CONSISTENCY FIX: build from the module's BASE_URL constant like every
    # other function here, instead of repeating the hard-coded URL.
    url = f"{BASE_URL}/search"
    params = {
        "query": title,
        "limit": limit,
        "fields": "paperId,title,year,venue,externalIds",
    }
    headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}

    status, data = robust_request(url, params=params, headers=headers, max_retries=5, base_sleep=1.0)
    if status == 200 and data is not None:
        items = data.get("data", [])
        return items[0] if items else None
    else:
        print(f"[WARN] title search failed for '{title[:60]}...' (status {status})")
        return None