LazyHuman10 committed on
Commit
3b6130d
·
0 Parent(s):

Initial commit for HF Space

Browse files
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# HuggingFace Spaces — Plexi API
# Uses Python 3.11 slim. HF Spaces expects the app on port 7860.

FROM python:3.11-slim

# The app logs with print(); unbuffered output makes those lines show up
# in the Space logs immediately.
ENV PYTHONUNBUFFERED=1

# System deps for sentence-transformers (tokenizers, etc.)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# HF Docker Spaces run the container as UID 1000. Create that user with a
# real, writable home so runtime downloads (e.g. the embedding model cache
# under ~/.cache) do not fail with permission errors.
RUN useradd -m -u 1000 user
ENV HOME=/home/user

WORKDIR /app

# Install dependencies first so this layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=user . .

USER user

# HuggingFace Spaces default port
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Plexi API
2
+
3
+ FastAPI RAG backend deployed on HuggingFace Spaces.
4
+
5
+ ## What It Does
6
+
7
+ - Loads the pre-built LlamaIndex vector store from the `plexi-materials` GitHub repo at startup
8
+ - Exposes three endpoints consumed by the Cloudflare Worker middleman
9
+
10
+ ## Endpoints
11
+
12
+ | Method | Path | Purpose |
13
+ |---|---|---|
14
+ | `GET` | `/health` | Liveness probe — used by keep-alive GitHub Actions |
15
+ | `GET` | `/manifest` | Proxies + caches `manifest.json` from the materials repo |
16
+ | `POST` | `/retrieve` | Embeds query, searches index, returns scoped top-k chunks |
17
+
18
+ ## Local Development
19
+
20
+ ```bash
21
+ pip install -r requirements.txt
22
+ uvicorn main:app --reload --port 7860
23
+ ```
24
+
25
+ Visit `http://localhost:7860/docs` for the interactive API docs.
26
+
27
+ ## Environment Variables
28
+
29
+ | Variable | Default | Purpose |
30
+ |---|---|---|
31
+ | `MATERIALS_REPO` | `KunalGupta25/plexi-materials` | GitHub repo with study materials |
32
+ | `MANIFEST_BRANCH` | `main` | Branch that holds `manifest.json` and `index/` |
33
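+ | `ALLOWED_ORIGINS` | `https://plexi.lazyhideout.tech` plus the local dev origins `http://localhost:5173` and `http://localhost:4173` | Comma-separated list of CORS allowed origins |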
34
+
35
+ ## Deploy to HuggingFace Spaces
36
+
37
+ 1. Create a new Space with **Docker** SDK
38
+ 2. Push this folder as the Space repo
39
+ 3. Set environment variables in the Space settings
40
+ 4. HF will build and run the Dockerfile automatically
__pycache__/main.cpython-313.pyc ADDED
Binary file (7.69 kB). View file
 
__pycache__/rag.cpython-313.pyc ADDED
Binary file (7.94 kB). View file
 
main.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ main.py — Plexi API (FastAPI service for HuggingFace Spaces)
3
+ ============================================================
4
+ Endpoints:
5
+ POST /retrieve — embed query + vector search (scope-filtered)
6
+ GET /manifest — proxy + cache the materials manifest.json
7
+ GET /health — liveness probe (also used by keep-alive cron)
8
+
9
+ The heavy resources (index + embedding model) are loaded ONCE at startup via
10
+ FastAPI's lifespan context manager and shared across all requests.
11
+ """
12
+
13
+ import os
14
+ import time
15
+ from contextlib import asynccontextmanager
16
+ from functools import lru_cache
17
+
18
+ import requests
19
+ from fastapi import FastAPI, HTTPException, Request
20
+ from fastapi.middleware.cors import CORSMiddleware
21
+ from fastapi.responses import JSONResponse
22
+ from pydantic import BaseModel, Field
23
+
24
+ from rag import (
25
+ DEFAULT_TOP_K,
26
+ MATERIALS_REPO,
27
+ MANIFEST_BRANCH,
28
+ format_context,
29
+ load_index,
30
+ retrieve_chunks,
31
+ )
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Config
35
+ # ---------------------------------------------------------------------------
36
# Comma-separated list of origins allowed by CORS.
# Entries are stripped and empty items dropped, so values such as
# "https://a.example, https://b.example," parse as expected instead of
# producing origins with stray whitespace that never match a browser Origin.
ALLOWED_ORIGINS = [
    origin.strip()
    for origin in os.getenv(
        "ALLOWED_ORIGINS",
        # Default: allow the Cloudflare Pages domain + localhost for dev
        "https://plexi.lazyhideout.tech,http://localhost:5173,http://localhost:4173",
    ).split(",")
    if origin.strip()
]
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Startup / Shutdown — load heavy resources once
44
+ # ---------------------------------------------------------------------------
45
# Process-wide holder for the resources loaded at startup.
_state: dict = {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Populate ``_state`` with the RAG index before serving; empty it on shutdown.

    Keys written: ``index``, ``index_error``, ``index_loaded`` and
    ``startup_ts``. A failed load is recorded rather than raised, so the
    service still starts and reports the problem through ``_state``.
    """
    print("Loading RAG index from GitHub…")
    started = time.time()
    index, error = load_index()
    load_seconds = round(time.time() - started, 2)

    failed = bool(error)
    if failed:
        print(f"⚠️ RAG index unavailable: {error}")
    else:
        print(f"✅ RAG index loaded in {load_seconds}s")

    _state.update(
        index=None if failed else index,
        index_error=error if failed else None,
        index_loaded=index is not None,
        startup_ts=time.time(),
    )

    yield

    # Shutdown: drop every reference held in the shared state.
    _state.clear()
70
+
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # App
74
+ # ---------------------------------------------------------------------------
75
# The ASGI application; startup/shutdown work is delegated to ``lifespan``.
app = FastAPI(
    lifespan=lifespan,
    version="1.0.0",
    title="Plexi API",
    description=(
        "RAG retrieval backend for Plexi. Accepts student queries "
        "and returns relevant study material chunks."
    ),
)

# CORS: only the configured origins, no credentials, and just the methods
# and header the API uses.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["Content-Type"],
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_credentials=False,
    allow_origins=ALLOWED_ORIGINS,
)
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Request / Response models
96
+ # ---------------------------------------------------------------------------
97
class RetrieveRequest(BaseModel):
    """Request body for ``POST /retrieve``."""

    # The question to search for; required, 1–2000 characters.
    query: str = Field(min_length=1, max_length=2000)
    # Scope values the retrieved chunks must match; required, 1–100 characters.
    semester: str = Field(min_length=1, max_length=100)
    subject: str = Field(min_length=1, max_length=100)
    # Number of chunks to return, between 1 and 20 inclusive.
    top_k: int = Field(DEFAULT_TOP_K, ge=1, le=20)
102
+
103
+
104
class ChunkResult(BaseModel):
    """A single retrieved chunk as serialized in the ``/retrieve`` response."""

    # Chunk content.
    text: str
    # The three fields below must be present but may be null.
    score: float | None
    filename: str | None
    subject: str | None
109
+
110
+
111
class RetrieveResponse(BaseModel):
    """Response body of ``POST /retrieve``."""

    # Retrieved chunks, in the order produced by the retrieval step.
    chunks: list[ChunkResult]
    # Echo of the request's query and scope.
    query: str
    semester: str
    subject: str
    # True when an index was loaded and could be searched.
    rag_active: bool
    # The chunks rendered as one text block for prompt injection.
    context_formatted: str
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Manifest caching (simple in-memory, 5-minute TTL)
122
+ # ---------------------------------------------------------------------------
123
# In-memory manifest cache: the last payload and the time it was fetched.
_manifest_cache: dict = {"data": None, "fetched_at": 0}
MANIFEST_TTL = 300  # seconds


def _get_manifest() -> dict:
    """Return the materials manifest, re-fetching it at most once per TTL.

    Returns:
        The parsed ``manifest.json`` payload.

    Raises:
        requests.RequestException: if the download fails or returns a
            non-2xx status. The cache is left untouched in that case.
    """
    now = time.time()
    cached = _manifest_cache["data"]
    # Compare against None rather than truthiness so that an empty (but
    # valid) manifest is still served from the cache instead of being
    # re-downloaded on every request.
    if cached is not None and (now - _manifest_cache["fetched_at"]) < MANIFEST_TTL:
        return cached

    url = f"https://raw.githubusercontent.com/{MATERIALS_REPO}/{MANIFEST_BRANCH}/manifest.json"
    resp = requests.get(url, timeout=15)
    resp.raise_for_status()
    data = resp.json()

    _manifest_cache["data"] = data
    _manifest_cache["fetched_at"] = now
    return data
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Routes
144
+ # ---------------------------------------------------------------------------
145
@app.get("/health")
def health():
    """Report service status, index availability and uptime in seconds."""
    now = time.time()
    # If startup has not recorded a timestamp, uptime is effectively zero.
    started_at = _state.get("startup_ts", time.time())
    payload = {
        "status": "ok",
        "index_loaded": _state.get("index_loaded", False),
        "index_error": _state.get("index_error"),
        "embed_model": "sentence-transformers/all-MiniLM-L6-v2",
        "uptime_seconds": round(now - started_at, 1),
    }
    return payload
156
+
157
+
158
@app.get("/manifest")
def get_manifest():
    """Proxy and cache the study materials ``manifest.json`` from GitHub.

    Returns:
        A JSON response with the manifest payload.

    Raises:
        HTTPException: 502 when the upstream request fails (bad status,
            timeout, connection error, invalid JSON); 500 for any other
            unexpected error.
    """
    try:
        data = _get_manifest()
        return JSONResponse(content=data)
    except requests.RequestException as err:
        # Every upstream/network failure is a gateway problem, not only
        # non-2xx responses; timeouts and connection errors belong here too.
        raise HTTPException(
            status_code=502, detail=f"GitHub fetch failed: {err}"
        ) from err
    except Exception as err:
        raise HTTPException(status_code=500, detail=str(err)) from err
171
+
172
+
173
@app.post("/retrieve", response_model=RetrieveResponse)
def retrieve(body: RetrieveRequest):
    """Run a scoped retrieval for the request and return chunks plus context.

    The index loaded at startup is read from ``_state``. When no index is
    available the response has ``rag_active`` set to False, and the chunk
    list and formatted context are whatever the retrieval and formatting
    helpers produce for a missing index.
    """
    index = _state.get("index")

    hits = retrieve_chunks(
        index=index,
        query=body.query,
        semester=body.semester,
        subject=body.subject,
        top_k=body.top_k,
    )
    prompt_context = format_context(hits)

    return RetrieveResponse(
        chunks=hits,
        query=body.query,
        semester=body.semester,
        subject=body.subject,
        rag_active=index is not None,
        context_formatted=prompt_context,
    )
203
+
204
+
205
+ # ---------------------------------------------------------------------------
206
+ # Run (for local development only — HF uses Dockerfile CMD)
207
+ # ---------------------------------------------------------------------------
208
if __name__ == "__main__":
    # Direct execution: start a dev server with auto-reload.
    import uvicorn

    uvicorn.run(app="main:app", port=7860, host="0.0.0.0", reload=True)
rag.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ rag.py — Plexi RAG Engine
3
+ =========================
4
+ Handles everything related to the LlamaIndex vector index:
5
+ - Downloading the pre-built index from GitHub
6
+ - Loading HuggingFace sentence-transformer embeddings
7
+ - Embedding queries and retrieving top-k chunks scoped by semester + subject
8
+ - Extracting text from PDFs for full-context fallback
9
+ - Formatting retrieved chunks for the LLM system prompt
10
+ """
11
+
12
+ import io
13
+ import os
14
+ import tempfile
15
+ from pathlib import Path
16
+
17
+ import requests
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Optional LlamaIndex — graceful degradation if not installed
21
+ # ---------------------------------------------------------------------------
22
+ try:
23
+ from llama_index.core import Settings, StorageContext, load_index_from_storage
24
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
25
+
26
+ LLAMA_INDEX_AVAILABLE = True
27
+ except ImportError:
28
+ LLAMA_INDEX_AVAILABLE = False
29
+
30
+ try:
31
+ import PyPDF2
32
+
33
+ PYPDF2_AVAILABLE = True
34
+ except ImportError:
35
+ PYPDF2_AVAILABLE = False
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Constants
39
+ # ---------------------------------------------------------------------------
40
# GitHub "owner/repo" and branch used to build the download URLs;
# both can be overridden through the environment.
MATERIALS_REPO = os.environ.get("MATERIALS_REPO", "KunalGupta25/plexi-materials")
MANIFEST_BRANCH = os.environ.get("MANIFEST_BRANCH", "main")

# Embedding model identifier passed to HuggingFaceEmbedding.
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"

# Files fetched from the repo's index/ folder when loading the index.
INDEX_FILES = [
    "default__vector_store.json",
    "docstore.json",
    "graph_store.json",
    "image__vector_store.json",
    "index_store.json",
]

# Number of chunks returned when the caller does not specify one.
DEFAULT_TOP_K = 5
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # Index loading (called once at FastAPI startup)
57
+ # ---------------------------------------------------------------------------
58
+
59
def load_index():
    """Download the pre-built index files and load them into an index object.

    The files listed in ``INDEX_FILES`` are fetched from the ``index/``
    folder of the materials repo into a temporary directory, loaded through
    LlamaIndex, and the directory is removed again before returning.

    Returns:
        ``(index, error_msg)``. On success ``error_msg`` is None; on any
        failure ``index`` is None and ``error_msg`` describes the problem.
        This function does not raise, so a broken index cannot prevent the
        service from starting.
    """
    if not LLAMA_INDEX_AVAILABLE:
        return None, "llama-index-core is not installed."

    index_base_url = (
        f"https://raw.githubusercontent.com/{MATERIALS_REPO}/{MANIFEST_BRANCH}/index"
    )

    # A context-managed temp dir is cleaned up on every exit path; the
    # previous mkdtemp() directory was never removed.
    # NOTE(review): assumes the loaded index keeps its data in memory and
    # does not re-read persist_dir later — confirm for the stores in use.
    with tempfile.TemporaryDirectory(prefix="plexi_index_") as index_dir:
        for filename in INDEX_FILES:
            url = f"{index_base_url}/{filename}"
            try:
                resp = requests.get(url, timeout=30)
                resp.raise_for_status()
                with open(os.path.join(index_dir, filename), "wb") as fh:
                    fh.write(resp.content)
            except Exception as err:
                # Deliberately broad: any download or write problem is
                # reported to the caller instead of crashing startup.
                return None, f"Failed to download index file '{filename}': {err}"

        try:
            embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_ID)
            Settings.embed_model = embed_model
            # Only retrieval is performed here; no LLM is configured.
            Settings.llm = None

            storage_ctx = StorageContext.from_defaults(persist_dir=index_dir)
            index = load_index_from_storage(storage_ctx)
            return index, None
        except Exception as err:
            return None, f"Failed to load index from storage: {err}"
94
+
95
+
96
def load_embed_model():
    """Build the HuggingFace embedding model, or return None without LlamaIndex."""
    return (
        HuggingFaceEmbedding(model_name=EMBED_MODEL_ID)
        if LLAMA_INDEX_AVAILABLE
        else None
    )
101
+
102
+
103
+ # ---------------------------------------------------------------------------
104
+ # Retrieval
105
+ # ---------------------------------------------------------------------------
106
+
107
def _matches_scope(node, semester: str, subject: str) -> bool:
    """Tell whether a retrieved node's metadata matches both scope values.

    A node with missing or empty metadata never matches.
    """
    meta = getattr(node.node, "metadata", {}) or {}
    if meta.get("semester") != semester:
        return False
    return meta.get("subject") == subject
114
+
115
+
116
def retrieve_chunks(
    index,
    query: str,
    semester: str,
    subject: str,
    top_k: int = DEFAULT_TOP_K,
) -> list[dict]:
    """Retrieve up to ``top_k`` chunks for ``query`` within a semester + subject.

    Scope filtering happens after retrieval, so the retriever is asked for
    ``max(top_k * 5, 10)`` candidates to leave room for discarded hits.

    Returns:
        A list of ``{text, score, filename, subject}`` dicts. The list is
        empty when ``index`` is None or when retrieval raises; the error is
        printed, not propagated.
    """
    if index is None:
        return []

    try:
        candidate_count = max(top_k * 5, 10)
        hits = index.as_retriever(similarity_top_k=candidate_count).retrieve(query)

        in_scope = list(
            filter(lambda hit: _matches_scope(hit, semester, subject), hits)
        )

        results = []
        for hit in in_scope[:top_k]:
            text = hit.node.get_content()
            score = None if hit.score is None else round(float(hit.score), 4)
            meta = getattr(hit.node, "metadata", {}) or {}
            results.append(
                {
                    "text": text,
                    "score": score,
                    "filename": meta.get("filename"),
                    "subject": meta.get("subject"),
                }
            )
        return results
    except Exception as err:
        print(f"Retrieval error: {err}")
        return []
152
+
153
+
154
+ # ---------------------------------------------------------------------------
155
+ # Context formatting (for system prompt injection)
156
+ # ---------------------------------------------------------------------------
157
+
158
def format_context(chunks: list[dict]) -> str:
    """Format retrieved chunks as a numbered block for the LLM system prompt.

    Args:
        chunks: Dicts with a ``text`` key and optional ``score``,
            ``filename`` and ``subject`` keys.

    Returns:
        One ``--- Chunk N | source [relevance: score] ---`` section per
        chunk, joined by newlines, or a fixed placeholder sentence when
        ``chunks`` is empty.
    """
    if not chunks:
        return "(No relevant context retrieved for this query.)"
    parts = []
    for i, chunk in enumerate(chunks, start=1):
        score = chunk.get("score")
        # Compare against None, not truthiness: a score of 0.0 is a real
        # value and must still be shown.
        score_info = f" [relevance: {score}]" if score is not None else ""
        source = chunk.get("filename") or chunk.get("subject") or "Unknown source"
        parts.append(
            f"--- Chunk {i} | {source}{score_info} ---\n{chunk['text']}\n"
        )
    return "\n".join(parts)
170
+
171
+
172
+ # ---------------------------------------------------------------------------
173
+ # PDF text extraction (used for full-context fallback loading)
174
+ # ---------------------------------------------------------------------------
175
+
176
def read_pdf_text(pdf_bytes: bytes) -> str:
    """Extract the text of every page of a PDF and join pages with newlines.

    Returns:
        - ``""`` when PyPDF2 is not installed;
        - the joined page texts when the document can be read (pages whose
          extraction fails or yields nothing are skipped);
        - when reading the document itself fails, the raw bytes decoded as
          UTF-8 with undecodable bytes dropped (``""`` for empty input).
    """
    if not PYPDF2_AVAILABLE:
        return ""

    collected = []
    try:
        document = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        for page in document.pages:
            try:
                raw = page.extract_text()
                if not raw:
                    continue
                # Round-trip through UTF-16 to drop lone surrogates that
                # some PDFs produce.
                as_utf16 = raw.encode("utf-16", "surrogatepass")
                collected.append(as_utf16.decode("utf-16", "ignore"))
            except Exception:
                # Best effort: one bad page must not discard the others.
                continue
    except Exception:
        if not pdf_bytes:
            return ""
        return pdf_bytes.decode("utf-8", errors="ignore")

    return "\n".join(collected)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.115.0,<1.0.0
2
+ uvicorn[standard]>=0.30.0,<1.0.0
3
+ pydantic>=2.0.0,<3.0.0
4
+ requests>=2.31.0,<3.0.0
5
+ python-dotenv>=1.0.0
6
+ PyPDF2>=3.0.0,<4.0.0
7
+ llama-index-core>=0.11.0,<0.13.0
8
+ llama-index-embeddings-huggingface>=0.3.0,<1.0.0
9
+ sentence-transformers>=3.0.0,<4.0.0