Spaces:

LazyHuman
/

plexi-api

Running

File size: 6,625 Bytes

3b6130d

"""
main.py — Plexi API (FastAPI service for HuggingFace Spaces)
============================================================
Endpoints:
  POST /retrieve   — embed query + vector search (scope-filtered)
  GET  /manifest   — proxy + cache the materials manifest.json
  GET  /health     — liveness probe (also used by keep-alive cron)

The heavy resources (index + embedding model) are loaded ONCE at startup via
FastAPI's lifespan context manager and shared across all requests.
"""

import os
import time
from contextlib import asynccontextmanager
from functools import lru_cache

import requests
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field

from rag import (
    DEFAULT_TOP_K,
    MATERIALS_REPO,
    MANIFEST_BRANCH,
    format_context,
    load_index,
    retrieve_chunks,
)

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Origins accepted by the CORS middleware, read from the comma-separated
# ALLOWED_ORIGINS environment variable. Each entry is stripped of surrounding
# whitespace and empty entries are dropped, so a value such as
# "https://a.example, https://b.example," still produces exact origin strings
# (an origin with a stray leading space would never match a request).
ALLOWED_ORIGINS = [
    origin.strip()
    for origin in os.getenv(
        "ALLOWED_ORIGINS",
        # Default: allow the Cloudflare Pages domain + localhost for dev
        "https://plexi.lazyhideout.tech,http://localhost:5173,http://localhost:4173",
    ).split(",")
    if origin.strip()
]

# ---------------------------------------------------------------------------
# Startup / Shutdown — load heavy resources once
# ---------------------------------------------------------------------------
# Process-wide startup state read by the route handlers. lifespan() fills it
# with the keys "index", "index_error", "index_loaded" and "startup_ts", and
# empties it again on shutdown.
_state: dict = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the RAG index at startup; release on shutdown.

    Calls ``load_index()`` once and records the outcome in the module-level
    ``_state`` dict:

    * ``index``        — the loaded index, or ``None`` when an error was reported
    * ``index_error``  — the error reported by ``load_index()``, or ``None``
    * ``index_loaded`` — ``True`` exactly when ``_state["index"]`` is not ``None``
    * ``startup_ts``   — ``time.time()`` taken once loading has finished

    A load error is logged but does not abort startup; the app keeps running
    with ``index`` set to ``None``.
    """
    print("Loading RAG index from GitHub…")
    t0 = time.time()
    index, error = load_index()
    elapsed = round(time.time() - t0, 2)

    if error:
        print(f"⚠️  RAG index unavailable: {error}")
        _state["index"] = None
        _state["index_error"] = error
    else:
        print(f"✅ RAG index loaded in {elapsed}s")
        _state["index"] = index
        _state["index_error"] = None

    # Derive the flag from what was actually stored, not from the raw return
    # value: on the error path the stored index is forced to None, so the flag
    # must be False even if load_index() handed back an object with the error.
    _state["index_loaded"] = _state["index"] is not None
    _state["startup_ts"] = time.time()
    try:
        yield
    finally:
        # Cleanup (nothing heavy to clean up here); the finally makes sure the
        # state is dropped even if an exception is thrown in at the yield.
        _state.clear()


# ---------------------------------------------------------------------------
# App
# ---------------------------------------------------------------------------
# ASGI application object; startup and shutdown work is delegated to lifespan().
app = FastAPI(
    lifespan=lifespan,
    title="Plexi API",
    version="1.0.0",
    description=" ".join(
        (
            "RAG retrieval backend for Plexi.",
            "Accepts student queries and returns relevant study material chunks.",
        )
    ),
)

# CORS policy: only the configured origins, no credentials, and just the
# methods and request header listed below.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=False,
    allow_origins=ALLOWED_ORIGINS,
    allow_headers=["Content-Type"],
    allow_methods=["GET", "POST", "OPTIONS"],
)


# ---------------------------------------------------------------------------
# Request / Response models
# ---------------------------------------------------------------------------
class RetrieveRequest(BaseModel):
    # Request body accepted by the POST /retrieve handler (`body` parameter).
    # All string fields are required and must be non-empty.

    # Free-text query; 1 to 2000 characters.
    query: str = Field(..., min_length=1, max_length=2000)
    # Semester passed through to retrieve_chunks(); 1 to 100 characters.
    semester: str = Field(..., min_length=1, max_length=100)
    # Subject passed through to retrieve_chunks(); 1 to 100 characters.
    subject: str = Field(..., min_length=1, max_length=100)
    # Number of chunks requested; defaults to DEFAULT_TOP_K, bounded to 1..20
    # inclusive.
    top_k: int = Field(default=DEFAULT_TOP_K, ge=1, le=20)


class ChunkResult(BaseModel):
    # One entry of RetrieveResponse.chunks.

    # Chunk text.
    text: str
    # Numeric score for the chunk, or None.
    # NOTE(review): presumably the retrieval similarity score — confirm
    # against rag.retrieve_chunks.
    score: float | None
    # Source file name for the chunk, or None.
    filename: str | None
    # Subject associated with the chunk, or None.
    subject: str | None


class RetrieveResponse(BaseModel):
    # Response model declared for POST /retrieve.

    # Chunks returned by retrieve_chunks() for the request.
    chunks: list[ChunkResult]
    # The request's query, semester and subject, echoed back unchanged.
    query: str
    semester: str
    subject: str
    # True when an index was available in the startup state for this request.
    rag_active: bool
    # Result of format_context() applied to the returned chunks.
    context_formatted: str


# ---------------------------------------------------------------------------
# Manifest caching (simple in-memory, 5-minute TTL)
# ---------------------------------------------------------------------------
# In-memory manifest cache. "data" holds the last fetched manifest (None until
# the first successful fetch); "fetched_at" holds the time.time() value taken
# at the start of the call that fetched it.
_manifest_cache: dict = {"data": None, "fetched_at": 0}
MANIFEST_TTL = 300  # seconds


def _get_manifest() -> dict:
    """Return the materials manifest, using the in-memory cache when fresh.

    A cached manifest younger than ``MANIFEST_TTL`` seconds is returned as-is.
    Otherwise ``manifest.json`` is downloaded from the ``MANIFEST_BRANCH``
    branch of ``MATERIALS_REPO`` on raw.githubusercontent.com, stored in the
    cache and returned.

    Returns:
        The parsed manifest JSON.

    Raises:
        requests.HTTPError: if GitHub answers with an error status
            (raised by ``raise_for_status()``).
        Other exceptions from ``requests.get`` or ``resp.json()`` propagate
        unchanged; the cache is left untouched in that case.
    """
    now = time.time()
    cached = _manifest_cache["data"]
    # Compare against None rather than relying on truthiness: an empty
    # manifest is falsy but is still a valid cached value, and must not
    # trigger a refetch on every call.
    if cached is not None and (now - _manifest_cache["fetched_at"]) < MANIFEST_TTL:
        return cached

    url = f"https://raw.githubusercontent.com/{MATERIALS_REPO}/{MANIFEST_BRANCH}/manifest.json"
    resp = requests.get(url, timeout=15)
    resp.raise_for_status()
    data = resp.json()

    _manifest_cache["data"] = data
    _manifest_cache["fetched_at"] = now
    return data


# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------
@app.get("/health")
def health():
    """Liveness probe — also pinged by the GitHub Actions keep-alive cron."""
    # Read the clock first, then resolve the start time. When startup has not
    # recorded "startup_ts", the fallback is a second clock read, so the
    # reported uptime comes out as approximately zero.
    current = time.time()
    started_at = _state.get("startup_ts", time.time())

    payload = {
        "status": "ok",
        "index_loaded": _state.get("index_loaded", False),
        "index_error": _state.get("index_error"),
        "embed_model": "sentence-transformers/all-MiniLM-L6-v2",
        "uptime_seconds": round(current - started_at, 1),
    }
    return payload


@app.get("/manifest")
def get_manifest():
    """
    Proxy and cache the study materials manifest.json from GitHub.
    The Cloudflare Worker also caches this in KV — this is a double layer.
    """
    # Status mapping:
    #   success                       -> JSONResponse with the manifest
    #   any requests-level failure    -> 502 (the upstream fetch failed)
    #   anything else                 -> 500 with the error text as detail
    try:
        data = _get_manifest()
        return JSONResponse(content=data)
    except requests.RequestException as err:
        # RequestException is the base class of the requests errors
        # (HTTPError, ConnectionError, Timeout, ...), so timeouts and
        # connection failures are reported as a bad gateway too, not only
        # error status codes.
        raise HTTPException(
            status_code=502, detail=f"GitHub fetch failed: {err}"
        ) from err
    except Exception as err:
        # Top-level boundary for this route: surface the failure as a 500
        # while keeping the original exception chained for the server logs.
        raise HTTPException(status_code=500, detail=str(err)) from err


@app.post("/retrieve", response_model=RetrieveResponse)
def retrieve(body: RetrieveRequest):
    """
    Core RAG endpoint.

    1. Embeds the query using all-MiniLM-L6-v2 (local, fast ~5-10ms)
    2. Searches the pre-built LlamaIndex vector store
    3. Filters results by semester + subject metadata
    4. Returns top-k chunks + a formatted context string for the LLM prompt
    """
    # The stored index is None when startup could not load it; it is passed
    # on as-is and the response reports that through `rag_active`.
    rag_index = _state.get("index")

    hits = retrieve_chunks(
        index=rag_index,
        query=body.query,
        semester=body.semester,
        subject=body.subject,
        top_k=body.top_k,
    )

    # Assemble the response fields by name, echoing the request scope back.
    response_fields = {
        "chunks": hits,
        "query": body.query,
        "semester": body.semester,
        "subject": body.subject,
        "rag_active": rag_index is not None,
        "context_formatted": format_context(hits),
    }
    return RetrieveResponse(**response_fields)


# ---------------------------------------------------------------------------
# Run (for local development only — HF uses Dockerfile CMD)
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Imported here so uvicorn is only required when this file is run directly.
    import uvicorn

    # Development server settings: listen on all interfaces, port 7860,
    # with auto-reload enabled.
    dev_server_options = {"host": "0.0.0.0", "port": 7860, "reload": True}
    uvicorn.run("main:app", **dev_server_options)