| import time |
| import random |
| import requests |
| from typing import Optional, Tuple, Any |
| from config import SEMANTIC_SCHOLAR_API_KEY |
| import os |
|
|
| BASE_URL = "https://api.semanticscholar.org/graph/v1/paper" |
|
|
| _LAST_REQUEST_TS = 0.0 |
|
|
|
|
| def _min_interval_sleep() -> None: |
| """Global throttle to avoid hammering Semantic Scholar.""" |
| global _LAST_REQUEST_TS |
| min_interval = float(os.getenv("S2_MIN_INTERVAL", "1.0")) |
| now = time.monotonic() |
| elapsed = now - _LAST_REQUEST_TS |
| if elapsed < min_interval: |
| time.sleep(min_interval - elapsed) |
| _LAST_REQUEST_TS = time.monotonic() |
|
|
|
|
def _backoff(attempt: int, base_sleep: float) -> float:
    """Exponential backoff delay for the given attempt number, capped at
    S2_MAX_BACKOFF seconds and jittered by up to 0.5s to avoid thundering herd."""
    max_sleep = float(os.getenv("S2_MAX_BACKOFF", "60"))
    return min(base_sleep * (2 ** attempt), max_sleep) + random.uniform(0.0, 0.5)


def robust_request(url, params=None, headers=None, max_retries=8, base_sleep=2.0):
    """
    Make a GET request with exponential backoff.

    Retries on:
      - connection errors
      - 429 (Too Many Requests)
      - 500–599 server errors
      - invalid JSON

    Args:
        url: Target URL.
        params: Optional query-string parameters.
        headers: Optional request headers.
        max_retries: Maximum number of attempts before giving up.
        base_sleep: Base delay (seconds) for exponential backoff.

    Returns:
        (status_code, json_or_None). (None, None) when all retries are
        exhausted. Non-retryable statuses (e.g. 4xx other than 429)
        return immediately as (status, None).
    """
    for attempt in range(max_retries):
        try:
            _min_interval_sleep()
            resp = requests.get(url, params=params, headers=headers, timeout=30)
            status = resp.status_code

            if status == 200:
                try:
                    return 200, resp.json()
                except Exception:
                    # BUG FIX: previously a JSON decode failure only printed a
                    # warning and fell through to the final `return status, None`,
                    # so invalid JSON was never retried despite the docstring.
                    print(f"[WARN] JSON decode failed on attempt {attempt+1}/{max_retries}")
                    time.sleep(_backoff(attempt, base_sleep))
                    continue

            if status == 429:
                # Honor the server's Retry-After hint when it is a parsable
                # number of seconds; otherwise use exponential backoff.
                retry_after = resp.headers.get("Retry-After")
                try:
                    sleep = float(retry_after)
                except (TypeError, ValueError):
                    sleep = base_sleep * (2 ** attempt)
                max_sleep = float(os.getenv("S2_MAX_BACKOFF", "60"))
                sleep = min(sleep, max_sleep) + random.uniform(0.0, 0.5)
                print(f"[WARN] 429 Too Many Requests → retrying in {sleep:.2f}s")
                time.sleep(sleep)
                continue

            if 500 <= status < 600:
                sleep = _backoff(attempt, base_sleep)
                print(f"[WARN] Server error {status} → retrying in {sleep:.2f}s")
                time.sleep(sleep)
                continue

            # Any other status (redirects, 4xx client errors) is final.
            return status, None

        except requests.exceptions.RequestException as e:
            sleep = _backoff(attempt, base_sleep)
            print(f"[WARN] Network error {e} → retrying in {sleep:.2f}s")
            time.sleep(sleep)
            continue

    print(f"[ERROR] Giving up after {max_retries} attempts for URL: {url}")
    return None, None
def get_paper(paper_id: str, id_type: str = "ACL") -> Tuple[int, Optional[dict]]:
    """Fetch one paper record from the Semantic Scholar Graph API.

    id_type can be "ACL" or "SemanticScholar" or "ArXiv" etc.

    Returns:
        (status, record_dict) on success, otherwise (status_or_0, None).
    """
    # Native Semantic Scholar ids are used bare; other id schemes need a prefix.
    full_id = paper_id if id_type == "SemanticScholar" else f"{id_type}:{paper_id}"

    request_params = {
        "fields": (
            "title,year,publicationDate,authors,url,venue,externalIds,"
            "tldr,abstract,citationCount,referenceCount,openAccessPdf"
        )
    }
    headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}

    status, data = robust_request(
        f"{BASE_URL}/{full_id}",
        params=request_params,
        headers=headers,
        max_retries=5,
        base_sleep=1.0,
    )
    if status != 200 or data is None:
        print(f"[WARN] {status} on {full_id}")
        return status or 0, None
    return status, data
|
|
|
|
def get_paper_links(semantic_id: str, target_type: str, total: int, limit: int = 1000) -> Tuple[int, list]:
    """
    Page through a paper's linked records (e.g. citations or references).

    Args:
        semantic_id: Semantic Scholar paper id.
        target_type: Endpoint suffix appended to the paper URL — presumably
            "citations" or "references"; confirm against callers.
        total: Expected number of linked items; drives how many pages to fetch.
        limit: Page size per request.

    Returns:
        (200, collected_items) on success; (failing_status_or_0, []) when any
        page fails or returns a malformed payload.
    """
    headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
    # BUG FIX: `total // limit + 1` issued one extra (empty) request whenever
    # total was an exact multiple of limit. Ceiling division fetches exactly
    # the number of pages needed.
    loops = (total + limit - 1) // limit if total else 0
    collected = []

    for page in range(loops):
        url = f"{BASE_URL}/{semantic_id}/{target_type}"
        params = {
            "offset": page * limit,
            "limit": limit,
            "fields": "paperId,title,isInfluential,externalIds,contextsWithIntent,openAccessPdf",
        }

        status, data = robust_request(url, params=params, headers=headers, max_retries=5, base_sleep=1.0)

        if status != 200 or data is None:
            print(f"[WARN] {target_type} fetch failed for {semantic_id} (status {status})")
            return status or 0, []

        items = data.get("data")
        if not isinstance(items, list):
            print(f"[WARN] malformed {target_type} response for {semantic_id}")
            return status, []

        collected.extend(items)

    return 200, collected
|
|
|
|
def search_by_title(title: str, limit: int = 1):
    """Search Semantic Scholar by paper title.

    Returns the first matching record (a dict) or None when the search
    fails or produces no hits.
    """
    headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY} if SEMANTIC_SCHOLAR_API_KEY else {}
    params = {
        "query": title,
        "limit": limit,
        "fields": "paperId,title,year,venue,externalIds",
    }

    status, data = robust_request(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params=params,
        headers=headers,
        max_retries=5,
        base_sleep=1.0,
    )
    if status != 200 or data is None:
        print(f"[WARN] title search failed for '{title[:60]}...' (status {status})")
        return None

    hits = data.get("data", [])
    return hits[0] if hits else None
|
|