Spaces:

Ghostgim
/

ghostlm

Sleeping

App Files Files Community

Ghostgim committed 15 days ago

Commit

ae0fa76

verified ·

1 Parent(s): 551cb99

feat(rag): wire retrieval-augmented chat into chat_fn

Browse files

RAG (retrieval-augmented generation) is now the default chat mode on the
Space when the index loads. Pulls 83K BGE-embedded passages from the
Ghostgim/GhostLM-v0.9-experimental Models repo at startup; embeds each
user query with BAAI/bge-small-en-v1.5; takes top-4 by cosine similarity;
prepends them as 'Reference passages' in front of the question. The model
is not RAFT-trained yet, so it sees the retrieved context simply as part
of the user message; even without retrieval-aware fine-tuning, supplying
those passages reduces the hallucination the bare 81M model exhibits.

RAG load is wrapped in try/except: if the index file isn't in the Models
repo (e.g. upload still in progress), the Space falls back to bare chat
with an honest 'RAG: OFF' note in the description so the user knows
what they're getting. No silent failure modes.

Adds requirements: numpy >= 1.24, transformers >= 4.38, sentencepiece >=
0.1.99 (BGE deps). All small; no LFS budget impact on the Space.

Index files (rag/index.npy fp16 64MB + rag/chunks.jsonl 57MB + rag/meta.json)
land in the Models repo separately when bandwidth cooperates; once they
do, restart_space() picks them up without a Space code change.

Files changed (2) hide show

app.py +130 -1
requirements.txt +16 -4

app.py CHANGED Viewed

@@ -32,13 +32,15 @@ the default 200-token cap on the 81M model.
 from __future__ import annotations
 import gc
 import os
 import sys
 from dataclasses import fields
 from pathlib import Path
-from typing import List
 import gradio as gr
 import torch
 import torch.nn.functional as F
@@ -245,6 +247,117 @@ TOKENIZER = GhostTokenizer()
 END_ID = TOKENIZER._special_tokens[TOKENIZER.END]
 # ---------------------------------------------------------------------------
 # Chat handler
 # ---------------------------------------------------------------------------
@@ -263,6 +376,20 @@ def chat_fn(message: str, history: list, temperature: float, top_k: int,
     # process (e.g. a buggy plugin flipping training mode).
     MODEL.eval()
     turns: list = []
     for h in history:
         if isinstance(h, dict) and h.get("role") in ("user", "assistant"):
@@ -356,6 +483,8 @@ rented GPU compute, where literature reports factual recall on cybersec
 MCQ starting to emerge. Spec at
 [`docs/ghost_base_spec.md`](https://github.com/joemunene-by/GhostLM/blob/main/docs/ghost_base_spec.md).
 **Loaded checkpoint:** `{LOADED_FROM}`
 """

 from __future__ import annotations
 import gc
+import json
 import os
 import sys
 from dataclasses import fields
 from pathlib import Path
+from typing import List, Optional
 import gradio as gr
+import numpy as np
 import torch
 import torch.nn.functional as F
 END_ID = TOKENIZER._special_tokens[TOKENIZER.END]
+# ---------------------------------------------------------------------------
+# RAG (retrieval-augmented generation)
+#
+# The bare 81M chat model hallucinates badly because at this scale it has
+# learned the *register* of cybersec writing but not the *facts* in any
+# retrievable form. RAG closes that gap without retraining: we embed the
+# user's query with a small BGE bi-encoder, look up the top-K most-similar
+# chunks from the pretrain corpus, and inject them as "Reference passages"
+# in front of the question. The model then conditions on real facts
+# instead of producing register-shaped fiction.
+#
+# The index (83K chunks of NVD / MITRE / CWE / OWASP / CTFtime / arXiv at
+# 384-dim BGE embeddings) lives in the Models repo at rag/, alongside the
+# weights. The Space pulls it on first launch via hf_hub_download.
+# ---------------------------------------------------------------------------
+# Module-level RAG state. Everything starts as None and is filled in by
+# _load_rag() only after the whole load has succeeded.
+# Embedding matrix loaded from rag/index.npy (upcast to float32 on load).
+RAG_INDEX: Optional[np.ndarray] = None
+# One dict per line of rag/chunks.jsonl.
+RAG_CHUNKS: Optional[List[dict]] = None
+# Tokenizer for the BAAI/bge-small-en-v1.5 query embedder.
+RAG_EMBEDDER_TOK = None
+# The BAAI/bge-small-en-v1.5 model itself, in eval mode.
+RAG_EMBEDDER = None
+# "ExceptionType: message" from a failed load; None if no failure was recorded.
+RAG_LOAD_ERROR: Optional[str] = None
+def _load_rag() -> None:
+    """Load the RAG index, its chunk records and the BGE query embedder.
+
+    Downloads rag/index.npy and rag/chunks.jsonl from the HUB_REPO model
+    repo, loads BAAI/bge-small-en-v1.5, and publishes the results through
+    the RAG_* module globals. The globals are assigned only after every
+    step has succeeded, so a failure part-way through leaves them all
+    untouched (None on first call).
+
+    Never raises: any exception is captured as "TypeName: message" in
+    RAG_LOAD_ERROR and printed, so the UI can surface it and the chat
+    handler can fall back to bare generation without retrieval.
+    """
+    global RAG_INDEX, RAG_CHUNKS, RAG_EMBEDDER_TOK, RAG_EMBEDDER, RAG_LOAD_ERROR
+    try:
+        from huggingface_hub import hf_hub_download
+        print(f"Pulling RAG index from {HUB_REPO}...")
+        index_path = hf_hub_download(repo_id=HUB_REPO, filename="rag/index.npy", repo_type="model")
+        chunks_path = hf_hub_download(repo_id=HUB_REPO, filename="rag/chunks.jsonl", repo_type="model")
+        # Index ships as fp16 (halves the upload from 128 MB to 64 MB);
+        # upcast for the matmul against the fp32 query vector. asarray is
+        # a no-op when the file is already float32.
+        idx = np.asarray(np.load(index_path), dtype=np.float32)
+        if idx.ndim != 2:
+            raise ValueError(f"rag/index.npy must be a 2-D matrix, got shape {idx.shape}")
+        # JSONL is read as UTF-8 explicitly so the result does not depend
+        # on the host locale; blank lines (e.g. a trailing newline) are
+        # skipped instead of aborting the whole load.
+        with open(chunks_path, encoding="utf-8") as f:
+            chunks: List[dict] = [json.loads(line) for line in f if line.strip()]
+        # Row i of the index must describe chunk i. A mismatch (partial
+        # upload, stale file) would otherwise only show up later as a
+        # per-query failure or as silently wrong passages.
+        if len(chunks) != idx.shape[0]:
+            raise ValueError(
+                f"RAG index has {idx.shape[0]} rows but chunks.jsonl has {len(chunks)} records"
+            )
+        from transformers import AutoModel, AutoTokenizer
+        e_tok = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
+        e_model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5").eval()
+        if os.environ.get("SPACE_ID"):
+            # Same fp16 cast we apply to GhostLM on the Space's tight CPU
+            # memory budget; halving the embedder keeps the working set
+            # inside the cpu-basic worker ceiling alongside the v0.9
+            # weights and the index matrix.
+            # NOTE(review): the model card lists bge-small-en-v1.5 at
+            # roughly 33M parameters (~130 MB at fp32); the "130M params /
+            # ~260 MB" figure previously quoted here looks doubled - confirm.
+            # NOTE(review): fp16 inference on CPU depends on the installed
+            # torch build having Half kernels for every op in the encoder;
+            # verify on the Space image.
+            e_model = e_model.half()
+        RAG_INDEX = idx
+        RAG_CHUNKS = chunks
+        RAG_EMBEDDER_TOK = e_tok
+        RAG_EMBEDDER = e_model
+        # Clear any error recorded by an earlier failed attempt.
+        RAG_LOAD_ERROR = None
+        print(f"RAG loaded: {len(chunks)} chunks, dim {idx.shape[1]}")
+    except Exception as e:  # noqa: BLE001 - RAG is optional; never block startup
+        RAG_LOAD_ERROR = f"{type(e).__name__}: {e}"
+        print(f"RAG disabled, falling back to bare chat: {RAG_LOAD_ERROR}")
+_load_rag()
+def retrieve(query: str, k: int = 4) -> List[dict]:
+    """Embed the query and return the k best-scoring chunks, best first.
+
+    Scores are the dot product of each index row with the L2-normalised
+    query embedding; this equals cosine similarity provided the index
+    rows were themselves normalised when the index was built (assumed,
+    not checked here).
+
+    Returns an empty list if RAG isn't loaded or k is not positive;
+    the caller handles that. Fewer than k chunks come back when the
+    index holds fewer than k rows.
+    """
+    if (RAG_INDEX is None or RAG_CHUNKS is None
+            or RAG_EMBEDDER is None or RAG_EMBEDDER_TOK is None):
+        return []
+    # A negative k would make the [:k] slice below return almost the
+    # whole corpus instead of nothing.
+    if k <= 0:
+        return []
+    # BGE recommends prefixing queries with this instruction string.
+    text = "Represent this sentence for searching relevant passages: " + query
+    enc = RAG_EMBEDDER_TOK(text, padding=True, truncation=True,
+                           max_length=512, return_tensors="pt")
+    with torch.no_grad():
+        out = RAG_EMBEDDER(**enc)
+    # Pool by taking the hidden state at position 0, then normalise in
+    # fp32 so a half-precision embedder does not round the norm.
+    emb = out.last_hidden_state[:, 0].to(torch.float32)
+    emb = F.normalize(emb, p=2, dim=-1)
+    q_vec = emb.cpu().numpy().reshape(-1)
+    scores = RAG_INDEX @ q_vec
+    # Negate so argsort's ascending order puts the highest score first.
+    top = np.argsort(-scores)[:k]
+    return [RAG_CHUNKS[int(i)] for i in top]
+def format_rag_prompt(query: str, passages: List[dict], max_chars: int = 400) -> str:
+    """Wrap the query with numbered reference passages for the model.
+
+    The model is not RAFT-trained yet, so the result is simply sent as
+    the user message.
+
+    Args:
+        query: the user's original question.
+        passages: retrieved chunk dicts; the "text", "source" and "ref"
+            keys are read, and each may be missing or None.
+        max_chars: longest passage text kept before it is cut back to
+            the previous space and suffixed with "...". Defaults to 400.
+
+    Returns:
+        The query unchanged when there are no passages, otherwise the
+        passages block, the answering instruction and the question.
+    """
+    if not passages:
+        return query
+    refs = []
+    for number, passage in enumerate(passages, start=1):
+        # Tolerate records whose text is missing or explicitly null.
+        text = str(passage.get("text") or "")
+        if len(text) > max_chars:
+            # Cut at the last space inside the window so no word is split.
+            text = text[:max_chars].rsplit(" ", 1)[0] + "..."
+        source = passage.get("source") or "?"
+        ref = passage.get("ref") or ""
+        # rstrip drops the separator when there is no ref, so the
+        # citation reads "(NVD)" rather than "(NVD )".
+        label = f"{source} {ref}".rstrip()
+        refs.append(f"[{number}] ({label}) {text}")
+    refs_block = "\n\n".join(refs)
+    return (
+        "Reference passages from the cybersecurity corpus:\n\n"
+        f"{refs_block}\n\n"
+        "Use the reference passages above to answer the question. If the "
+        "passages don't contain the answer, say so rather than guessing.\n\n"
+        f"Question: {query}"
+    )
 # ---------------------------------------------------------------------------
 # Chat handler
 # ---------------------------------------------------------------------------
     # process (e.g. a buggy plugin flipping training mode).
     MODEL.eval()
+    # If RAG loaded successfully at startup, retrieve top-K relevant
+    # passages from the indexed corpus and prepend them to the user's
+    # message before building the chat prompt. The retrieval cost is
+    # ~1-2 s on cpu-basic; activated by default whenever RAG is
+    # available because it's the difference between "register-shaped
+    # fiction" and "answers grounded in real CVE/MITRE/CWE text".
+    if RAG_INDEX is not None:
+        try:
+            passages = retrieve(message, k=4)
+            if passages:
+                message = format_rag_prompt(message, passages)
+        except Exception as e:  # noqa: BLE001 - never break chat for retrieval issues
+            print(f"RAG retrieve failed for this turn: {type(e).__name__}: {e}")
     turns: list = []
     for h in history:
         if isinstance(h, dict) and h.get("role") in ("user", "assistant"):
 MCQ starting to emerge. Spec at
 [`docs/ghost_base_spec.md`](https://github.com/joemunene-by/GhostLM/blob/main/docs/ghost_base_spec.md).
+**Retrieval-augmented mode:** {("**ON**. Each query is augmented with top-4 passages retrieved from a 83K-chunk index of the cybersec corpus (NVD / MITRE / CWE / OWASP / CTFtime / arXiv). The model conditions on real reference text instead of producing register-shaped fiction. Retrieval adds ~1-2 s per reply." if RAG_INDEX is not None else f"**OFF**. RAG could not load at startup (`{RAG_LOAD_ERROR}`). Generation is bare; expect hallucination on factual questions.")}
 **Loaded checkpoint:** `{LOADED_FROM}`
 """

requirements.txt CHANGED Viewed

@@ -14,12 +14,24 @@ torch>=2.0.0
 # tiktoken is the GPT-2 BPE backend the GhostTokenizer wraps.
 tiktoken>=0.5.0
-# huggingface_hub for hf_hub_download. The v0.9 chat weights are stored
-# in the Models repo Ghostgim/GhostLM-v0.9-experimental (not in the
-# Space's own LFS) so the Space stays within the 1 GB free-tier cap.
-# Pulled and cached on first launch.
 huggingface_hub>=0.20
 # Python 3.13 removed the stdlib audioop module that gradio's transitive
 # pydub dep imports at module-load time. Without this the entire gradio
 # import chain fails with ModuleNotFoundError: No module named

 # tiktoken is the GPT-2 BPE backend the GhostTokenizer wraps.
 tiktoken>=0.5.0
+# huggingface_hub for hf_hub_download. The v0.9 chat weights and the
+# RAG index both live in the Models repo Ghostgim/GhostLM-v0.9-experimental
+# (not in the Space's own LFS) so the Space stays within the 1 GB
+# free-tier cap. Pulled and cached on first launch.
 huggingface_hub>=0.20
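+# transformers + sentencepiece for the BGE-small-en-v1.5 retrieval
+# embedder used by the RAG layer. transformers is already pulled in
+# transitively but listing it explicitly pins the version range we test
+# against. sentencepiece is listed as a tokenizer backend.
+# NOTE(review): bge-small-en-v1.5 is a BERT-style model with a WordPiece
+# vocabulary, so sentencepiece may not actually be needed - confirm.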
+transformers>=4.38
+sentencepiece>=0.1.99
+# numpy for the in-memory retrieval index (83K x 384 float32 matrix).
+# Already a transitive dep of torch but listed explicitly because the
+# RAG code path imports it directly.
+numpy>=1.24
 # Python 3.13 removed the stdlib audioop module that gradio's transitive
 # pydub dep imports at module-load time. Without this the entire gradio
 # import chain fails with ModuleNotFoundError: No module named