Subhadip007 commited on
Commit
5c095ca
·
1 Parent(s): 2671aea

feat: FastAPI backend complete

Browse files

- FastAPI application with lifespan startup (models pre-loaded)
- POST /query: full RAG pipeline over HTTP with Pydantic validation
- GET /health: system status with vector DB and BM25 index sizes
- CORS middleware: browser frontends can call the API
- asyncio.to_thread: CPU-bound RAG runs without blocking event loop
- Auto-generated Swagger UI at /docs (OAS 3.1)
- Warm query latency: ~3s after first request warms models

Endpoints:
GET / API info
GET /health System health check
POST /query Research paper Q&A with citations

Files changed (5)
  1. run_api.py +27 -0
  2. src/api/__init__.py +0 -0
  3. src/api/main.py +237 -0
  4. src/api/schemas.py +85 -0
  5. test_api.py +37 -0
run_api.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Start the ResearchPilot FastAPI server.
3
+
4
+ Run from project root:
5
+ python run_api.py
6
+
7
+ Then visit:
8
+ http://localhost:8000/docs <- Interactive API documentation
9
+ http://localhost:8000/health <- Health check
10
+ http://localhost:8000/ <- API info
11
+ """
12
+
13
+ import uvicorn
14
+ from config.settings import API_HOST, API_PORT, API_RELOAD
15
+
16
+ if __name__ == "__main__":
17
+ print("Starting ResearchPilot API...")
18
+ print(f"API docs: http://localhost:{API_PORT}/docs")
19
+ print(f"Health: http://localhost:{API_PORT}/health")
20
+
21
+ uvicorn.run(
22
+ "src.api.main:app",
23
+ host = API_HOST,
24
+ port = API_PORT,
25
+ reload = API_RELOAD, # Auto-restart on code changes (dev only)
26
+ workers = 1, # Single worker for dev (no GPU sharing issues)
27
+ )
src/api/__init__.py ADDED
File without changes
src/api/main.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ResearchPilot FastAPI application.
3
+
4
+ STARTUP BEHAVIOR:
5
+ When the server starts, it loads ALL models into memory:
6
+ - BGE embedding model (~110MB)
7
+ - Cross-encoder re-ranker (~80MB)
8
+ - BM25 index (~40MB)
9
+ - Qdrant connection
10
+
11
+ This takes ~15 seconds once, then every request is fast.
12
+ This is called "warm start" - the model is always ready.
13
+
14
+ Without this, the first request after server restart
15
+ would take 20+ seconds. Unacceptable for production.
16
+
17
+ LIFESPAN PATTERN:
18
+ FastAPI's lifespan context manager runs code at startup
19
+ and shutdown. We use it to initialize the RAG pipeline
20
+ once and store it in app.state for all requests to share.
21
+ """
22
+
23
+ import asyncio
24
+ import time
25
+ from contextlib import asynccontextmanager
26
+
27
+ from fastapi import FastAPI, HTTPException, Request
28
+ from fastapi.middleware.cors import CORSMiddleware
29
+ from fastapi.responses import JSONResponse
30
+
31
+ from src.api.schemas import (
32
+ QueryRequest,
33
+ QueryResponse,
34
+ CitationSchema,
35
+ HealthResponse,
36
+ ErrorResponse,
37
+ )
38
+ from src.rag.pipeline import RAGPipeline
39
+ from src.utils.logger import setup_logger, get_logger
40
+
41
+
42
# Configure project-wide logging once at import time, then obtain a
# module-scoped logger (mirrors the stdlib logging.getLogger(__name__) idiom).
setup_logger()
logger = get_logger(__name__)
44
+
45
+
46
+ # ---------------------------------------------------------
47
+ # LIFESPAN - runs at startup and shutdown
48
+ # ---------------------------------------------------------
49
+
50
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Manage application resources across the server's lifetime.

    Code before ``yield`` runs once at startup; code after ``yield``
    runs once at shutdown. The RAG pipeline is constructed here so a
    single warm instance is shared by every request via ``app.state``.
    """
    # ---- startup: load all models into memory (warm start) ----
    logger.info("ResearchPilot API starting up...")
    t0 = time.time()

    # One shared pipeline per process; handlers read it back from
    # request.app.state.rag_pipeline.
    app.state.rag_pipeline = RAGPipeline()

    elapsed = time.time() - t0
    logger.info(f"API ready in {elapsed:.1f}s")

    yield  # server is live and handling requests while suspended here

    # ---- shutdown ----
    logger.info("ResearchPilot API shutting down...")
75
+
76
+
77
+ # ---------------------------------------------------------
78
+ # APP INITIALIZATION
79
+ # ---------------------------------------------------------
80
+
81
app = FastAPI(
    title="ResearchPilot API",
    description="Production RAG system for ML research paper Q&A",
    version="1.0.0",
    lifespan=lifespan,
    docs_url="/docs",  # Swagger UI at http://localhost:8000/docs
    redoc_url="/redoc",  # ReDoc at http://localhost:8000/redoc
)

# CORS: lets browser frontends on other origins (e.g. localhost:3000)
# call this API — browsers block such cross-origin requests otherwise.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # in production, restrict to your domain
    allow_methods=["*"],
    allow_headers=["*"],
)
98
+
99
+ # ---------------------------------------------------------
100
+ # EXCEPTION HANDLER
101
+ # ---------------------------------------------------------
102
+
103
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """
    Convert any unhandled exception into a clean JSON 500 response.

    Without this handler FastAPI returns a bare 500 with no detail.
    The full traceback is logged server-side so the client payload
    can stay small.
    """
    # Lazy %-args + exc_info: records the complete traceback in the
    # logs, not just the exception's string form (an f-string would
    # discard the traceback entirely).
    logger.error("Unhandled exception on %s: %s", request.url, exc, exc_info=exc)
    return JSONResponse(
        status_code=500,
        content={
            "error": "Internal server error",
            # NOTE(review): str(exc) may leak internals to clients;
            # consider a generic message in production.
            "detail": str(exc),
            "code": 500,
        },
    )
118
+
119
+
120
+ # ---------------------------------------------------------
121
+ # ROUTES
122
+ # ---------------------------------------------------------
123
+
124
+ @app.get(
125
+ "/health",
126
+ response_model = HealthResponse,
127
+ summary = "Health check",
128
+ tags = ["System"],
129
+ )
130
+ async def health_check(request: Request) -> HealthResponse:
131
+ """
132
+ Returns system health status.
133
+ Used by deployment platforms to verify the service is running.
134
+ Also useful for debugging - shows database sizes.
135
+ """
136
+ pipeline = request.app.state.rag_pipeline
137
+
138
+ # Get Qdrant collection size
139
+ qdrant_size = pipeline.retriever.hybrid_retriever.qdrant.get_collection_size()
140
+
141
+ # Get BM25 index size
142
+ bm25_size = len(pipeline.retriever.hybrid_retriever.bm25.chunk_ids)
143
+
144
+ return HealthResponse(
145
+ status = "healthy",
146
+ model = "llama-3.3-70b-versatile",
147
+ vector_db_size = qdrant_size,
148
+ bm25_index_size = bm25_size,
149
+ version = "1.0.0",
150
+ )
151
+
152
+
153
@app.post(
    "/query",
    response_model=QueryResponse,
    summary="Query research papers",
    tags=["RAG"],
)
async def query_papers(
    request: Request,
    query_input: QueryRequest,
) -> QueryResponse:
    """
    Submit a natural language question about ML research.

    The system retrieves relevant paper excerpts and generates
    a grounded answer with citations.

    - **question**: Your research question (3-500 characters)
    - **top_k**: Number of paper chunks to retrieve (1-20, default 5)
    - **filter_category**: Filter by ArXiv category (e.g. cs.LG)
    - **filter_year_gte**: Only include papers from this year onwards

    Raises:
        HTTPException(500): if the RAG pipeline fails for any reason.
    """
    pipeline = request.app.state.rag_pipeline

    logger.info(
        f"Query received: '{query_input.question[:60]}' "
        f"[top_k={query_input.top_k}]"
    )

    # WHY asyncio.to_thread: the RAG pipeline is CPU-bound (not async).
    # Running it directly in this handler would block the entire event
    # loop — no other request could be processed while one query runs.
    # to_thread executes it in a worker thread, keeping the loop free.
    try:
        response = await asyncio.to_thread(
            pipeline.query,
            query_input.question,
            query_input.top_k,
            query_input.filter_category,
            query_input.filter_year_gte,
        )
    except Exception as e:
        # logger.exception captures the full traceback (not just str(e));
        # `from e` keeps the causal chain on the re-raised HTTPException.
        logger.exception("RAG pipeline error: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Map the pipeline's RAGResponse dataclass onto the API schema.
    citations = [
        CitationSchema(
            paper_id=c.get("paper_id", ""),
            title=c.get("title", ""),
            authors=c.get("authors", []),
            published_date=c.get("published_date", ""),
            arxiv_url=c.get("arxiv_url", ""),
        )
        for c in response.citations
    ]

    return QueryResponse(
        answer=response.answer,
        citations=citations,
        query=response.query,
        chunks_used=len(response.retrieved_chunks),
        retrieval_time_ms=response.retrieval_time_ms,
        generation_time_ms=response.generation_time_ms,
        total_time_ms=response.total_time_ms,
        has_context=response.has_context,
    )
223
+
224
+
225
@app.get(
    "/",
    summary="API root",
    tags=["System"],
)
async def root():
    """API root - confirms service is running."""
    # Static metadata payload; handy for quick curl sanity checks.
    info = {
        "service": "ResearchPilot API",
        "version": "1.0.0",
        "docs": "/docs",
        "health": "/health",
    }
    return info
src/api/schemas.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pydantic schemas for API request and response validation.
3
+
4
+ WHY PYDANTIC SCHEMAS IN THE API LAYER:
5
+ FastAPI uses these to:
6
+ 1. Validate incoming requests (wrong types -> automatic 422 error)
7
+ 2. Serialize outgoing responses (Python objects -> JSON)
8
+ 3. Generate automatic API documentation (OpenAPI/Swagger)
9
+
10
+ You get input validation AND documentation for free.
11
+ """
12
+
13
+ from pydantic import BaseModel, Field
14
+ from typing import Optional
15
+
16
+
17
+
18
class QueryRequest(BaseModel):
    """
    Schema for POST /query request body.

    Field() adds validation constraints that FastAPI turns into
    automatic 422 errors plus OpenAPI documentation.
    """
    question: str = Field(
        ...,  # ... means required
        min_length=3,
        max_length=500,
        description="Research question to answer",
        examples=["How does LoRA reduce trainable parameters?"],
    )
    top_k: int = Field(
        default=5,
        ge=1,  # ge = greater than or equal
        le=20,
        description="Number of chunks to retrieve",
    )
    filter_category: Optional[str] = Field(
        default=None,
        description="ArXiv category filter, e.g. 'cs.LG'",
        # Fixed: Pydantic v2's Field takes `examples` (a list); the
        # singular `example` kwarg is not a v2 parameter — now
        # consistent with `question` above.
        examples=["cs.LG"],
    )
    filter_year_gte: Optional[int] = Field(
        default=None,
        ge=2020,
        le=2030,
        description="Only include papers from this year onwards",
        examples=[2024],
    )
49
+
50
+
51
class CitationSchema(BaseModel):
    """A single cited paper."""
    paper_id: str        # ArXiv paper identifier
    title: str           # paper title
    authors: list[str]   # author names
    published_date: str  # publication date (string format set upstream — confirm)
    arxiv_url: str       # link to the paper on arxiv.org
58
+
59
+
60
class QueryResponse(BaseModel):
    """Schema for POST /query response."""
    answer: str                      # generated answer text
    citations: list[CitationSchema]  # papers referenced by the answer
    query: str                       # original question, echoed back
    chunks_used: int                 # count of retrieved chunks used
    retrieval_time_ms: float         # time spent in retrieval
    generation_time_ms: float        # time spent in generation
    total_time_ms: float             # end-to-end latency
    has_context: bool                # whether retrieval produced context (semantics defined by pipeline)
70
+
71
+
72
class HealthResponse(BaseModel):
    """Schema for GET /health response."""
    status: str             # e.g. "healthy"
    model: str              # LLM identifier in use
    vector_db_size: int     # vectors in the Qdrant collection
    bm25_index_size: int    # chunks in the BM25 index
    version: str = "1.0.0"  # API version
79
+
80
+
81
class ErrorResponse(BaseModel):
    """Schema for error responses."""
    error: str   # short error category, e.g. "Internal server error"
    detail: str  # human-readable explanation
    code: int    # HTTP status code
test_api.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Manual smoke test for the API.
# Run this in a SEPARATE terminal while run_api.py is running.
import json

import requests

BASE_URL = "http://localhost:8000"

# Test 1: Health check
print("Testing /health...")
r = requests.get(f"{BASE_URL}/health")
# Fail fast with the real HTTP status instead of a confusing KeyError
# later when the error body lacks the expected fields.
r.raise_for_status()
print(json.dumps(r.json(), indent=2))

# Test 2: Query
print("\nTesting /query...")
payload = {
    "question": "What is LoRA and how does it work?",
    "top_k": 5
}
r = requests.post(f"{BASE_URL}/query", json=payload)
r.raise_for_status()
data = r.json()

print(f"Answer: {data['answer'][:300]}...")
print(f"\nCitations: {len(data['citations'])}")
for c in data['citations']:
    print(f"  - {c['paper_id']}: {c['title'][:50]}...")
print(f"\nTotal time: {data['total_time_ms']:.0f}ms")

# Test 3: Filtered query (year filter presumably beyond the indexed
# data, exercising the low/no-context path — confirm against corpus)
print("\nTesting /query with filter...")
payload = {
    "question": "graph neural network applications",
    "top_k": 3,
    "filter_year_gte": 2026
}
r = requests.post(f"{BASE_URL}/query", json=payload)
r.raise_for_status()
data = r.json()
print(f"Answer: {data['answer'][:200]}...")
print(f"Citations: {len(data['citations'])}")