# (Removed "Spaces: Running Running" — Hugging Face Spaces page-banner residue
#  from the scrape, not part of the source file.)
| """ | |
| Google Scholar Citation API — backed by SerpAPI, with daily cache. | |
| Deploy to HuggingFace Spaces (Docker SDK). | |
| """ | |
| import os | |
| import time | |
| import json | |
| import hashlib | |
| import logging | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Optional | |
| import httpx | |
| from fastapi import FastAPI, Query, HTTPException | |
| from fastapi.responses import JSONResponse | |
| # --------------------------------------------------------------------------- | |
| # Config | |
| # --------------------------------------------------------------------------- | |
# SerpAPI credentials come from the environment (an HF Space secret in production);
# empty string means "not configured" and is rejected at request time.
SERPAPI_KEY = os.environ.get("SERPAPI_KEY", "")
# On-disk JSON cache — defaults under /tmp, which is ephemeral across Space
# restarts (presumably acceptable for a 24 h cache — confirm).
CACHE_DIR = Path(os.environ.get("CACHE_DIR", "/tmp/scholar_cache"))
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHE_TTL_SECONDS = 86400  # 24 hours — at most 1 SerpAPI call per author per day
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scholar-api")
| # --------------------------------------------------------------------------- | |
| # FastAPI app | |
| # --------------------------------------------------------------------------- | |
# Application object; title/description/version feed the auto-generated
# OpenAPI schema served at /docs.
app = FastAPI(
    title="Google Scholar Citation API",
    description=(
        "Fetches Google Scholar author citation metrics via SerpAPI. "
        "Results are cached for 24 hours to minimize API usage."
    ),
    version="1.0.0",
)
| # --------------------------------------------------------------------------- | |
| # Cache helpers | |
| # --------------------------------------------------------------------------- | |
def _cache_key(author_id: str) -> str:
    """Return a filesystem-safe cache filename stem for *author_id*.

    ``author_id`` arrives from an untrusted query parameter, so it is
    sanitized before being used in a filename — embedding it verbatim would
    let values like ``../../x`` escape CACHE_DIR (path traversal).  A short
    SHA-256 digest of the raw ID is appended so distinct IDs that sanitize
    to the same text still map to distinct cache files.
    """
    # Keep only characters that are safe in a filename on any platform.
    safe = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in author_id)
    digest = hashlib.sha256(author_id.encode()).hexdigest()[:16]
    return f"{safe}_{digest}"
def _read_cache(author_id: str) -> Optional[dict]:
    """Load the cached payload for *author_id*, or ``None`` if absent or stale.

    An entry is fresh while its ``_cached_at`` timestamp is less than
    CACHE_TTL_SECONDS old.  Anything else — missing file, expired entry,
    unreadable JSON — yields ``None`` so the caller falls back to SerpAPI.
    """
    path = CACHE_DIR / f"{_cache_key(author_id)}.json"
    if not path.exists():
        return None
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
        age = time.time() - payload.get("_cached_at", 0)
        if age < CACHE_TTL_SECONDS:
            logger.info("Cache HIT for author_id=%s", author_id)
            return payload
        logger.info("Cache EXPIRED for author_id=%s", author_id)
    except Exception:
        # Corrupt/unreadable entries are treated as misses, not errors.
        logger.warning("Cache read error for author_id=%s", author_id, exc_info=True)
    return None
def _write_cache(author_id: str, data: dict) -> None:
    """Persist *data* for *author_id*, stamping it with the write time.

    NOTE: *data* is deliberately mutated in place — the ``_cached_at`` keys
    are added to the caller's dict too, which keeps fresh responses shaped
    like cache-hit responses (both carry the timestamp fields).

    The JSON is written to a temporary sibling file and atomically renamed
    into place, so a concurrent ``_read_cache`` never observes a
    half-written document.
    """
    data["_cached_at"] = time.time()
    data["_cached_at_human"] = datetime.now(timezone.utc).isoformat()
    cache_file = CACHE_DIR / f"{_cache_key(author_id)}.json"
    tmp_file = cache_file.with_suffix(".json.tmp")
    tmp_file.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    os.replace(tmp_file, cache_file)  # atomic rename; silently replaces any stale entry
    logger.info("Cache WRITE for author_id=%s", author_id)
| # --------------------------------------------------------------------------- | |
| # SerpAPI caller | |
| # --------------------------------------------------------------------------- | |
async def _fetch_from_serpapi(author_id: str) -> dict:
    """Call the SerpAPI Google Scholar Author endpoint and normalize the payload.

    Returns a dict with ``author``, ``citation_stats`` and ``articles`` keys.

    Raises:
        HTTPException(500): if SERPAPI_KEY is not configured.
        HTTPException(502): on a network-level failure, a non-200 SerpAPI
            status, a non-JSON body, or an in-band SerpAPI ``error`` field.
    """
    if not SERPAPI_KEY:
        raise HTTPException(
            status_code=500,
            detail="SERPAPI_KEY is not configured. Set it as an environment variable / HF Space secret.",
        )
    params = {
        "engine": "google_scholar_author",
        "author_id": author_id,
        "api_key": SERPAPI_KEY,
    }
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.get("https://serpapi.com/search.json", params=params)
    except httpx.HTTPError as exc:
        # DNS failure, timeout, TLS error, ... — surface as a gateway error
        # instead of letting it bubble up as an unhandled 500.
        logger.error("SerpAPI request failed for author_id=%s: %s", author_id, exc)
        raise HTTPException(status_code=502, detail=f"SerpAPI request failed: {exc}") from exc
    if resp.status_code != 200:
        logger.error("SerpAPI error: %s %s", resp.status_code, resp.text[:500])
        raise HTTPException(
            status_code=502,
            detail=f"SerpAPI returned status {resp.status_code}: {resp.text[:300]}",
        )
    try:
        raw = resp.json()
    except ValueError as exc:
        raise HTTPException(status_code=502, detail="SerpAPI returned a non-JSON body") from exc
    if "error" in raw:
        # SerpAPI can report failures (e.g. unknown author_id) in-band with HTTP 200.
        raise HTTPException(status_code=502, detail=f"SerpAPI error: {raw['error']}")
    # Extract only the fields we care about; .get() everywhere because the
    # upstream schema is not under our control.
    author_info = raw.get("author", {})
    cited_by = raw.get("cited_by", {})
    articles = raw.get("articles", [])
    result = {
        "author": {
            "name": author_info.get("name"),
            "affiliations": author_info.get("affiliations"),
            "thumbnail": author_info.get("thumbnail"),
            "interests": [
                {"title": i.get("title"), "link": i.get("link")}
                for i in author_info.get("interests", [])
            ],
        },
        "citation_stats": {
            "table": cited_by.get("table", []),
            "graph": cited_by.get("graph", []),
        },
        "articles": [
            {
                "title": a.get("title"),
                "link": a.get("link"),
                "authors": a.get("authors"),
                "publication": a.get("publication"),
                "cited_by_value": a.get("cited_by", {}).get("value"),
                "year": a.get("year"),
            }
            for a in articles
        ],
    }
    return result
| # --------------------------------------------------------------------------- | |
| # Routes | |
| # --------------------------------------------------------------------------- | |
@app.get("/")
async def root():
    """Landing endpoint: a short pointer to the real API and the docs.

    Registered with ``@app.get`` — without the decorator the handler is
    defined but never exposed by FastAPI.
    """
    return {
        "message": "Google Scholar Citation API",
        "usage": "GET /citations?author_id=<GOOGLE_SCHOLAR_AUTHOR_ID>",
        "docs": "/docs",
    }
@app.get("/citations")  # path matches the usage string advertised by the root endpoint
async def get_citations(
    author_id: str = Query(
        ...,
        description="Google Scholar author ID (e.g. 'JicYPdAAAAAJ' for Elon Musk-style profiles).",
        min_length=4,
    ),
):
    """
    Return citation metrics for a Google Scholar author.

    - First checks the on-disk cache (valid for 24 h).
    - Falls back to SerpAPI if no fresh cache entry exists.

    Responses carry a ``_source`` field ("cache" or "serpapi") so callers
    can tell how fresh the data is.
    """
    # 1. Serve a fresh cache entry when one exists.
    cached = _read_cache(author_id)
    if cached is not None:
        cached["_source"] = "cache"
        return JSONResponse(content=cached)
    # 2. No fresh cache — spend one SerpAPI call.
    logger.info("Cache MISS — calling SerpAPI for author_id=%s", author_id)
    data = await _fetch_from_serpapi(author_id)
    # 3. Persist before responding (also stamps _cached_at into `data`).
    _write_cache(author_id, data)
    data["_source"] = "serpapi"
    return JSONResponse(content=data)
@app.get("/cache-status")  # NOTE(review): path inferred from the handler name — confirm against original deployment
async def cache_status(
    author_id: str = Query(..., description="Author ID to check cache for"),
):
    """Report whether a cached entry exists for *author_id* and how old it is.

    Returns ``{"cached": False}`` when there is no (readable) entry;
    otherwise the write timestamp, age in seconds, and seconds until expiry.
    """
    cache_file = CACHE_DIR / f"{_cache_key(author_id)}.json"
    if not cache_file.exists():
        return {"cached": False}
    try:
        data = json.loads(cache_file.read_text(encoding="utf-8"))
        cached_at = data.get("_cached_at", 0)
        age_seconds = time.time() - cached_at
        return {
            "cached": True,
            "cached_at": data.get("_cached_at_human"),
            "age_seconds": round(age_seconds, 1),
            "expires_in_seconds": max(0, round(CACHE_TTL_SECONDS - age_seconds, 1)),
        }
    except Exception:
        # Unreadable entry — report as uncached rather than erroring the probe.
        return {"cached": False, "error": "failed to read cache"}
@app.get("/health")  # conventional liveness-probe path — confirm if the deployment expects another
async def health():
    """Liveness probe; also reports whether the SerpAPI key is configured."""
    return {"status": "ok", "serpapi_key_set": bool(SERPAPI_KEY)}