import time
import random
import requests
from typing import Optional, Tuple, Any
from config import SEMANTIC_SCHOLAR_API_KEY
import os
BASE_URL = "https://api.semanticscholar.org/graph/v1/paper"
_LAST_REQUEST_TS = 0.0
def _min_interval_sleep() -> None:
    """Pause just long enough to honor the global request spacing.

    Enforces a minimum gap between consecutive Semantic Scholar requests
    (S2_MIN_INTERVAL env var, default 1.0s) using a module-level timestamp
    of the last request.
    """
    global _LAST_REQUEST_TS
    gap = float(os.getenv("S2_MIN_INTERVAL", "1.0"))
    remaining = gap - (time.monotonic() - _LAST_REQUEST_TS)
    if remaining > 0:
        time.sleep(remaining)
    _LAST_REQUEST_TS = time.monotonic()
def robust_request(url, params=None, headers=None, max_retries=8, base_sleep=2.0):
    """
    Make a GET request with exponential backoff.

    Retries on:
    - connection errors
    - 429 (Too Many Requests), honoring a numeric Retry-After header
    - 500–599 server errors
    - invalid JSON in a 200 response
    Non-retryable statuses (other 4xx) return immediately.

    Returns (status_code, json_or_None); (None, None) once retries are exhausted.
    """

    def _backoff(attempt, hint=None):
        # Exponential delay for this attempt (or the server's Retry-After hint),
        # capped by S2_MAX_BACKOFF and jittered to avoid thundering herds.
        delay = hint if hint is not None else base_sleep * (2 ** attempt)
        delay = min(delay, float(os.getenv("S2_MAX_BACKOFF", "60")))
        return delay + random.uniform(0.0, 0.5)

    for attempt in range(max_retries):
        try:
            _min_interval_sleep()
            resp = requests.get(url, params=params, headers=headers, timeout=30)
            status = resp.status_code

            if status == 200:
                try:
                    return 200, resp.json()
                except Exception:
                    # BUGFIX: a 200 with an unparseable body used to fall through
                    # and return (200, None) without retrying; now it backs off
                    # and retries like the docstring promises.
                    print(f"[WARN] JSON decode failed on attempt {attempt+1}/{max_retries}")
                    time.sleep(_backoff(attempt))
                    continue

            if status == 429:
                retry_after = resp.headers.get("Retry-After")
                hint = None
                if retry_after:
                    try:
                        hint = float(retry_after)
                    except ValueError:
                        hint = None  # non-numeric Retry-After: fall back to exponential
                sleep = _backoff(attempt, hint)
                print(f"[WARN] 429 Too Many Requests → retrying in {sleep:.2f}s")
                time.sleep(sleep)
                continue

            if 500 <= status < 600:
                sleep = _backoff(attempt)
                print(f"[WARN] Server error {status} → retrying in {sleep:.2f}s")
                time.sleep(sleep)
                continue

            # Any other status (4xx client errors) is not retryable: give up now.
            return status, None

        except requests.exceptions.RequestException as e:
            # Covers connection errors, timeouts, DNS failures, etc.
            sleep = _backoff(attempt)
            print(f"[WARN] Network error {e} → retrying in {sleep:.2f}s")
            time.sleep(sleep)
            continue

    print(f"[ERROR] Giving up after {max_retries} attempts for URL: {url}")
    return None, None
def get_paper(paper_id: str, id_type: str = "ACL") -> Tuple[int, Optional[dict]]:
    """
    Fetch one paper's metadata from Semantic Scholar.

    id_type can be "ACL" or "SemanticScholar" or "ArXiv" etc. — raw ids are
    used as-is for "SemanticScholar", otherwise prefixed as "<type>:<id>".
    Returns (status, payload); payload is None on any failure.
    """
    full_id = paper_id if id_type == "SemanticScholar" else f"{id_type}:{paper_id}"
    fields = (
        "title,year,publicationDate,authors,url,venue,externalIds,"
        "tldr,abstract,citationCount,referenceCount,openAccessPdf"
    )
    headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
    status, data = robust_request(
        f"{BASE_URL}/{full_id}",
        params={"fields": fields},
        headers=headers,
        max_retries=5,
        base_sleep=1.0,
    )
    if status == 200 and data is not None:
        return status, data
    print(f"[WARN] {status} on {full_id}")
    return status or 0, None
def get_paper_links(semantic_id: str, target_type: str, total: int, limit: int = 1000):
    """
    Page through a paper's links (e.g. "citations" or "references").

    Args:
        semantic_id: Semantic Scholar paper id.
        target_type: endpoint suffix under the paper, e.g. "citations".
        total: expected number of items; drives how many pages are requested.
        limit: page size per request.

    Returns:
        (200, items) on success; (status_or_0, []) on any failed/malformed page.
    """
    headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
    # BUGFIX: ceiling division. The original `total // limit + 1` issued one
    # extra (empty) request whenever total was an exact multiple of limit.
    loops = -(-total // limit) if total else 0
    collected = []
    for page in range(loops):
        url = f"{BASE_URL}/{semantic_id}/{target_type}"
        params = {
            "offset": page * limit,
            "limit": limit,
            "fields": "paperId,title,isInfluential,externalIds,contextsWithIntent,openAccessPdf",
        }
        status, data = robust_request(url, params=params, headers=headers, max_retries=5, base_sleep=1.0)
        if status != 200 or data is None:
            print(f"[WARN] {target_type} fetch failed for {semantic_id} (status {status})")
            return status or 0, []
        items = data.get("data")
        if not isinstance(items, list):
            print(f"[WARN] malformed {target_type} response for {semantic_id}")
            return status, []
        collected.extend(items)
        if len(items) < limit:
            # Short page means the server has no more results; skip the
            # remaining (now pointless) requests.
            break
    return 200, collected
def search_by_title(title: str, limit: int = 1):
    """Search Semantic Scholar by paper title; return the top hit dict or None."""
    search_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    query = {
        "query": title,
        "limit": limit,
        "fields": "paperId,title,year,venue,externalIds",
    }
    auth = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
    status, data = robust_request(search_url, params=query, headers=auth, max_retries=5, base_sleep=1.0)
    if status != 200 or data is None:
        print(f"[WARN] title search failed for '{title[:60]}...' (status {status})")
        return None
    hits = data.get("data", [])
    return hits[0] if hits else None