junaid17 commited on
Commit
c46b826
·
verified ·
1 Parent(s): fa15a30

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +163 -0
  2. requirements.txt +14 -0
  3. text_engine.py +194 -0
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# main.py
from fastapi import FastAPI, HTTPException, status, File, UploadFile, Form, Query
from fastapi.middleware.cors import CORSMiddleware
from typing import Optional
import logging
import pandas as pd
import io
import os
from text_engine import Text_Search_Engine

logger = logging.getLogger(__name__)

app = FastAPI(title="CortexSearch", version="1.0", description="A flexible text search API with multiple FAISS index types and BM25 support.")

# Choose default index_type here: "flat", "ivf", or "hnsw"
store = Text_Search_Engine(index_type=os.getenv("INDEX_TYPE", "flat"))
try:
    store.load()
except Exception:
    # Best-effort startup: a missing or corrupt persisted index must not stop
    # the service, but silently swallowing the error (the old `pass`) hid real
    # problems such as a truncated pickle. Log it and continue empty.
    logger.exception("Could not load persisted index; starting with an empty store.")

app.add_middleware(
    CORSMiddleware,
    # NOTE(review): browsers reject credentialed requests when the
    # Access-Control-Allow-Origin is "*" — confirm whether credentials are
    # actually needed, or pin explicit origins.
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
26
+
27
+
28
@app.get("/")
async def root():
    """Landing endpoint confirming that the API is up."""
    return {"message": "Welcome to the Flexible Text Intelligence API"}
31
+
32
+
33
# -------------------------
# Column preview endpoint
# -------------------------
@app.post("/list_columns")
async def list_columns(file: UploadFile = File(...)):
    """Return the column names of an uploaded CSV.

    Handy for previewing a file before deciding which columns to index.
    """
    try:
        raw = await file.read()
        frame = pd.read_csv(io.BytesIO(raw))
        return {"available_columns": list(frame.columns)}
    except Exception as exc:
        # Unparseable upload (not CSV, bad encoding, empty file) -> 400.
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(exc))
48
+
49
+
50
# -------------------------
# Health check endpoint
# -------------------------
@app.get("/health")
async def health():
    """Report service liveness plus basic index statistics."""
    return {
        "status": "ok",
        "rows_indexed": len(store.rows),
        "index_type": store.index_type,
    }
56
+
57
+
58
# -------------------------
# Upload CSV (build fresh index)
# -------------------------
_VALID_INDEX_TYPES = {"flat", "ivf", "hnsw"}


@app.post("/upload_csv")
async def upload_csv(file: UploadFile = File(...), columns: str = Form(...), index_type: Optional[str] = Form(None)):
    """Upload a CSV and (re)build the search index from the chosen columns.

    Form fields:
        columns: comma-separated column names combined into the searchable text.
        index_type: optional override — 'flat', 'ivf', or 'hnsw'.

    Returns a status payload; unknown columns are reported with the list of
    available columns so the client can retry.
    """
    try:
        # BUG FIX: validate index_type *before* mutating the shared store.
        # Previously an invalid value was written to store.index_type first,
        # so the ValueError raised later by encode_store surfaced as a 500
        # AND left the engine broken for every subsequent request.
        if index_type and index_type not in _VALID_INDEX_TYPES:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Invalid index_type '{index_type}'. Choose one of {sorted(_VALID_INDEX_TYPES)}.",
            )

        contents = await file.read()
        df = pd.read_csv(io.BytesIO(contents))

        column_list = [c.strip() for c in columns.split(",") if c.strip()]
        # Validate the requested columns against the uploaded file.
        for col in column_list:
            if col not in df.columns:
                return {
                    "status": "error",
                    "detail": f"Column '{col}' not found.",
                    "available_columns": list(df.columns),
                }

        # Drop rows missing any selected column, then build the combined
        # searchable text for each remaining record.
        rows = df.dropna(subset=column_list).to_dict(orient="records")
        for r in rows:
            r["_search_text"] = " ".join(str(r[col]) for col in column_list if r.get(col) is not None)

        texts = [r["_search_text"] for r in rows]

        if index_type:
            store.index_type = index_type

        store.encode_store(rows, texts)
        return {"status": "success", "count": len(rows), "used_columns": column_list, "index_type": store.index_type}
    except HTTPException:
        # Don't let the generic handler below downgrade a deliberate 4xx to a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
92
+
93
+
94
# -------------------------
# Add CSV (append new rows)
# -------------------------
@app.post("/add_csv")
async def add_csv(file: UploadFile = File(...), columns: str = Form(...)):
    """Append rows from a CSV to the existing index without rebuilding it."""
    try:
        payload = await file.read()
        frame = pd.read_csv(io.BytesIO(payload))

        wanted = [name.strip() for name in columns.split(",") if name.strip()]
        missing = [name for name in wanted if name not in frame.columns]
        if missing:
            # Report the first unknown column, same as the upload endpoint.
            return {
                "status": "error",
                "detail": f"Column '{missing[0]}' not found.",
                "available_columns": list(frame.columns),
            }

        fresh_rows = frame.dropna(subset=wanted).to_dict(orient="records")
        for record in fresh_rows:
            pieces = [str(record[name]) for name in wanted if record.get(name) is not None]
            record["_search_text"] = " ".join(pieces)

        fresh_texts = [record["_search_text"] for record in fresh_rows]
        store.add_rows(fresh_rows, fresh_texts)

        return {
            "status": "success",
            "added_count": len(fresh_rows),
            "used_columns": wanted,
            "total_rows": len(store.rows),
        }
    except Exception as e:
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
123
+
124
+
125
# -------------------------
# Search endpoint
# -------------------------
@app.get("/search")
async def search(
    query: str,
    top_k: int = 3,
    mode: str = Query("semantic", enum=["semantic", "lexical", "hybrid"]),
    alpha: float = 0.5,
):
    """Search the index.

    mode: semantic | lexical | hybrid
    alpha: weight of the semantic score in hybrid mode (0..1).
    """
    try:
        if mode == "lexical":
            # Pure BM25 ranking over the stored texts.
            if store.bm25 is None:
                return {"results": []}
            scores = store.bm25.get_scores(query.lower().split())
            top = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:top_k]
            hits = [{**store.rows[idx], "score": float(val)} for idx, val in top]
        elif mode == "semantic":
            hits = store.search(query, top_k=top_k)
        else:
            hits = store.hybrid_search(query, top_k=top_k, alpha=alpha)

        return {"results": hits}
    except Exception as e:
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
152
+
153
+
154
# -------------------------
# Delete all data
# -------------------------
@app.delete("/delete_data")
async def delete_data():
    """Drop every indexed row and remove the persisted files."""
    try:
        store.clear_vdb()
    except Exception as exc:
        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(exc))
    return {"status": "success", "message": "Vector DB cleared"}
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ faiss-cpu
2
+ sentence_transformers
3
+ numpy
4
+ pandas
5
+ scikit-learn
6
+ torch
7
+ transformers
8
+ uvicorn
9
+ fastapi
10
+ python-multipart
11
+ rank_bm25
12
+ torchvision
13
+ pillow
14
+ git+https://github.com/openai/CLIP.git
text_engine.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # text_engine.py
2
+ import os
3
+ import pickle
4
+ import logging
5
+ from typing import List, Optional
6
+ import numpy as np
7
+ from sentence_transformers import SentenceTransformer
8
+ import faiss
9
+ from rank_bm25 import BM25Okapi
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class Text_Search_Engine:
    """Semantic + lexical text search over in-memory row dicts.

    Embeddings come from a SentenceTransformer model and live in a FAISS
    index ('flat', 'ivf', or 'hnsw'); a parallel BM25 index supports lexical
    and hybrid scoring. The FAISS index and the rows are persisted under
    ``base_folder`` and can be restored with :meth:`load`.
    """

    def __init__(
        self,
        base_folder: str = "vector_store",
        model_name: str = "sentence-transformers/LaBSE",
        index_type: str = "flat",
    ):
        self.base_folder = base_folder
        self.embeddings_folder = os.path.join(base_folder, "embeddings")
        self.docs_folder = os.path.join(base_folder, "documents")
        os.makedirs(self.embeddings_folder, exist_ok=True)
        os.makedirs(self.docs_folder, exist_ok=True)

        self.model = SentenceTransformer(model_name)
        self.index: Optional[faiss.Index] = None
        self.rows: List[dict] = []    # original records, each carrying "_search_text"
        self.texts: List[str] = []    # searchable text, parallel to self.rows
        self.bm25: Optional[BM25Okapi] = None
        self.index_type = index_type

    # -------------------------
    # Index creation utilities
    # -------------------------
    def _create_index(self, dimension: int, embeddings: np.ndarray):
        """Instantiate ``self.index`` for the configured ``index_type``.

        Raises:
            ValueError: if ``self.index_type`` is not 'flat', 'ivf' or 'hnsw'.
        """
        if self.index_type == "flat":
            self.index = faiss.IndexFlatL2(dimension)
        elif self.index_type == "ivf":
            # Rule of thumb: ~1 centroid per 10 vectors, capped at 256.
            nlist = max(1, min(256, len(embeddings) // 10))
            quantizer = faiss.IndexFlatL2(dimension)
            self.index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
            self.index.train(np.asarray(embeddings, dtype="float32"))
        elif self.index_type == "hnsw":
            self.index = faiss.IndexHNSWFlat(dimension, 32)  # 32 links per node
        else:
            raise ValueError(f"Unsupported index type: {self.index_type}")

    def _persist(self):
        """Best-effort save of the FAISS index and rows to disk (never raises)."""
        try:
            if self.index is not None:
                faiss.write_index(self.index, os.path.join(self.embeddings_folder, "multilingual.index"))
            with open(os.path.join(self.docs_folder, "rows.pkl"), "wb") as f:
                pickle.dump(self.rows, f)
            logger.info("Persisted index and rows to disk.")
        except Exception as e:
            logger.exception("Failed to persist index/rows: %s", e)

    # -------------------------
    # Core operations
    # -------------------------
    def encode_store(self, rows: List[dict], texts: List[str]):
        """Replace the current index with one built from ``rows``/``texts``.

        Raises:
            Exception: re-raised after logging when embedding or indexing fails.
        """
        try:
            if not rows:
                # BUG FIX: an empty upload used to crash on embeddings.shape[1]
                # and on BM25Okapi([]) (division by zero). Store an explicit
                # empty state instead.
                self.index = None
                self.rows = []
                self.texts = []
                self.bm25 = None
                self._persist()
                logger.info("encode_store called with no rows; store cleared.")
                return

            embeddings = self.model.encode(texts, convert_to_numpy=True)
            dimension = embeddings.shape[1]
            self._create_index(dimension, embeddings)
            self.index.add(np.asarray(embeddings, dtype="float32"))

            self.rows = rows
            self.texts = texts
            # BM25 works on whitespace-tokenized, lowercased text.
            tokenized_corpus = [t.lower().split() for t in texts]
            self.bm25 = BM25Okapi(tokenized_corpus)

            self._persist()
            logger.info("Index built with %d rows (index_type=%s).", len(rows), self.index_type)
        except Exception as e:
            logger.exception("Error in encode_store: %s", e)
            raise

    def load(self):
        """Restore index, rows, texts and BM25 from disk, if both files exist.

        Raises:
            Exception: re-raised after logging when the persisted files are unreadable.
        """
        try:
            index_path = os.path.join(self.embeddings_folder, "multilingual.index")
            rows_path = os.path.join(self.docs_folder, "rows.pkl")
            if os.path.exists(index_path) and os.path.exists(rows_path):
                self.index = faiss.read_index(index_path)
                # NOTE: pickle.load is only safe because we wrote this file
                # ourselves; never point it at untrusted data.
                with open(rows_path, "rb") as f:
                    self.rows = pickle.load(f)
                self.texts = [r["_search_text"] for r in self.rows]
                tokenized_corpus = [t.lower().split() for t in self.texts]
                self.bm25 = BM25Okapi(tokenized_corpus) if tokenized_corpus else None
                logger.info("Loaded index and %d rows from disk.", len(self.rows))
            else:
                logger.info("No persisted index/rows found.")
        except Exception as e:
            logger.exception("Error in load: %s", e)
            raise

    def add_rows(self, new_rows: List[dict], new_texts: List[str]):
        """Append ``new_rows`` to the index, creating it first if needed.

        Raises:
            Exception: re-raised after logging when embedding or indexing fails.
        """
        try:
            if not new_rows:
                return

            new_embeddings = self.model.encode(new_texts, convert_to_numpy=True).astype("float32")
            if self.index is None:
                self._create_index(new_embeddings.shape[1], new_embeddings)
                self.index.add(new_embeddings)
            else:
                # An IVF index loaded untrained must be trained before add();
                # train on everything we have so centroids cover old + new data.
                if isinstance(self.index, faiss.IndexIVFFlat) and not self.index.is_trained:
                    if self.texts:
                        old = self.model.encode(self.texts, convert_to_numpy=True).astype("float32")
                        combined = np.vstack([old, new_embeddings])
                    else:
                        combined = new_embeddings
                    self.index.train(combined)
                self.index.add(new_embeddings)

            self.rows.extend(new_rows)
            self.texts.extend(new_texts)
            # BM25 has no incremental update; rebuild over the full corpus.
            tokenized_corpus = [t.lower().split() for t in self.texts]
            self.bm25 = BM25Okapi(tokenized_corpus)

            self._persist()
            logger.info("Added %d new rows. Total rows: %d", len(new_rows), len(self.rows))
        except Exception as e:
            logger.exception("Error in add_rows: %s", e)
            raise

    # -------------------------
    # Search methods
    # -------------------------
    def search(self, query: str, top_k: int = 3):
        """Return up to ``top_k`` rows nearest to ``query`` (L2 distance, ascending).

        Returns [] on any failure or when the store is empty.
        """
        try:
            if self.index is None or not self.rows:
                return []
            query_emb = self.model.encode([query], convert_to_numpy=True).astype("float32")
            k = min(top_k, len(self.rows))
            if k <= 0:
                # BUG FIX: faiss rejects k == 0 (e.g. top_k <= 0 requests).
                return []
            distances, indices = self.index.search(query_emb, k=k)
            results = [
                {**self.rows[i], "distance": float(distances[0][j])}
                for j, i in enumerate(indices[0])
                # BUG FIX: IVF indexes pad missing hits with -1, which would
                # silently alias self.rows[-1]; skip those slots.
                if 0 <= i < len(self.rows)
            ]
            return sorted(results, key=lambda x: x["distance"])
        except Exception as e:
            logger.exception("Error in search: %s", e)
            return []

    def hybrid_search(self, query: str, top_k: int = 3, alpha: float = 0.5):
        """Blend semantic and BM25 scores: alpha*semantic + (1-alpha)*lexical.

        NOTE(review): the semantic score 1/(1+d) and raw BM25 scores live on
        different scales, so alpha is not a true percentage weight — confirm
        whether min-max normalization is wanted.
        Returns [] on any failure or when either index is missing.
        """
        try:
            if self.index is None or self.bm25 is None or not self.texts:
                return []

            query_emb = self.model.encode([query], convert_to_numpy=True).astype("float32")
            distances, indices = self.index.search(query_emb, k=len(self.texts))
            # Convert L2 distance to a similarity in (0, 1]; skip -1 padding
            # (BUG FIX: -1 used to land in the dict as a bogus key).
            semantic_scores = {
                i: 1 / (1 + distances[0][j])
                for j, i in enumerate(indices[0])
                if 0 <= i < len(self.rows)
            }

            tokenized_query = query.lower().split()
            bm25_scores = self.bm25.get_scores(tokenized_query)
            lexical_scores = {i: bm25_scores[i] for i in range(len(self.texts))}

            combined = []
            for i, row in enumerate(self.rows):
                sem = semantic_scores.get(i, 0.0)
                lex = lexical_scores.get(i, 0.0)
                score = alpha * sem + (1 - alpha) * lex
                combined.append({**row, "score": float(score)})

            combined.sort(key=lambda x: x["score"], reverse=True)
            return combined[:top_k]
        except Exception as e:
            logger.exception("Error in hybrid_search: %s", e)
            return []

    # -------------------------
    # Utilities
    # -------------------------
    def clear_vdb(self):
        """Drop all in-memory state and delete the persisted files.

        Raises:
            Exception: re-raised after logging when file removal fails.
        """
        try:
            if self.index is not None:
                try:
                    self.index.reset()
                except Exception:
                    # Some index types may not support reset(); drop the object.
                    self.index = None
            self.rows = []
            self.texts = []
            self.bm25 = None

            index_path = os.path.join(self.embeddings_folder, "multilingual.index")
            docs_path = os.path.join(self.docs_folder, "rows.pkl")
            if os.path.exists(index_path):
                os.remove(index_path)
            if os.path.exists(docs_path):
                os.remove(docs_path)
            logger.info("Cleared vector DB and persisted files.")
        except Exception as e:
            logger.exception("Error in clear_vdb: %s", e)
            raise