DevelopedBy-Siva committed
Commit b378103
1 Parent(s): 466d417
.dockerignore ADDED
@@ -0,0 +1,16 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .venv/
+ venv/
+ .env
+ .git/
+ .gitignore
+ *.db
+ faiss/
+ uploads/
+ temp_uploads/
+ data/
+ rag_system.db
+
.gitignore ADDED
@@ -0,0 +1,214 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ uploads/
+ temp_uploads/
+ data/
+ rag_system.db
+
+ test_demo.py
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.11-slim
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ COPY requirements.txt /app/requirements.txt
+ RUN pip install --no-cache-dir -r /app/requirements.txt
+
+ COPY . /app
+
+ ENV PYTHONUNBUFFERED=1
+ EXPOSE 7860
+
+ CMD ["uvicorn", "server_app:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ fastapi==0.109.2
+ uvicorn[standard]==0.27.1
+ python-multipart==0.0.9
+
+ sentence-transformers==2.3.1
+ faiss-cpu==1.9.0.post1
+ langchain==0.1.9
+ langchain-community==0.0.21
+ openai==1.12.0
+
+ pypdf==4.0.1
+ python-docx==1.1.0
+ python-magic==0.4.27
+
+ sqlalchemy==2.0.25
+
+ python-dotenv==1.0.1
+ pydantic==2.6.1
+ numpy==1.26.4
+ pandas==2.2.0
+
+ httpx==0.27.2
server_app.py ADDED
@@ -0,0 +1,629 @@
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Form
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
+ from pydantic import BaseModel
+ from typing import Optional
+ import shutil
+ import os
+ from pathlib import Path
+ import sys
+ from openai import OpenAI
+ import json
+
+ sys.path.insert(0, str(Path(__file__).parent))
+
+ from src.rag_system import IncrementalRAGSystem
+ from src.database import get_db_session, DocumentVersion, DocumentChunk
+
+
+ client = OpenAI(
+     api_key=os.getenv("GROQ_API_KEY"), base_url="https://api.groq.com/openai/v1"
+ )
+
+
+ app = FastAPI(
+     title="Incremental RAG API",
+     description="API for document Q&A RAG System",
+     version="1.0.0",
+ )
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[
+         "http://localhost:3000",
+         "https://document-qa-rag-system.vercel.app",
+     ],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ rag_system = None
+
+
+ @app.on_event("startup")
+ def startup():
+     global rag_system
+     rag_system = IncrementalRAGSystem()
+
+
+ TEMP_UPLOAD_DIR = "./temp_uploads"
+ Path(TEMP_UPLOAD_DIR).mkdir(exist_ok=True)
+
+
+ class QueryRequest(BaseModel):
+     question: str
+     version_id: Optional[int] = None
+     k: int = 5
+
+
+ class ComparisonRequest(BaseModel):
+     question: str
+     version_id_1: int
+     version_id_2: int
+     k: int = 3
+
+
+ @app.get("/")
+ async def root():
+     return {
+         "status": "online",
+         "message": "Document Q&A RAG API is running",
+     }
+
+
+ @app.post("/api/documents/upload")
+ async def upload_document(
+     file: UploadFile = File(...), doc_name: Optional[str] = Form(None)
+ ):
+     temp_file_path = None
+     try:
+         allowed_extensions = {".pdf", ".txt", ".docx"}
+         file_ext = Path(file.filename).suffix.lower()
+
+         if file_ext not in allowed_extensions:
+             raise HTTPException(
+                 status_code=400, detail=f"File type {file_ext} not supported"
+             )
+
+         temp_file_path = Path(TEMP_UPLOAD_DIR) / file.filename
+         with open(temp_file_path, "wb") as buffer:
+             shutil.copyfileobj(file.file, buffer)
+
+         if not doc_name:
+             doc_name = Path(file.filename).stem
+
+         result = rag_system.add_document(
+             file_path=str(temp_file_path), doc_name=doc_name
+         )
+
+         temp_file_path.unlink()
+
+         return JSONResponse(
+             content={
+                 "success": True,
+                 "message": f"Document uploaded as version {result['version_number']}",
+                 "data": result,
+             }
+         )
+
+     except Exception as e:
+         if temp_file_path and temp_file_path.exists():
+             temp_file_path.unlink()
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ def build_source_context(results):
+     parts = []
+     for i, r in enumerate(results, start=1):
+         excerpt = r["content"]
+         if len(excerpt) > 2000:
+             excerpt = excerpt[:2000] + "..."
+         parts.append(f"[Source {i}]\n{excerpt}")
+     return "\n\n".join(parts)
+
+
+ def extract_document_topics(chunks: list, max_topics: int = 5) -> list:
+
+     sample_text = "\n".join([c["content"] for c in chunks[:3]])
+
+     try:
+         prompt = f"""
+ Extract the main topics covered in this document.
+
+ Document sample:
+ {sample_text[:1000]}
+
+ Return JSON with main topics/sections:
+ {{
+ "topics": ["Topic 1", "Topic 2", "Topic 3"]
+ }}
+
+ Keep topics concise (2-4 words each). Maximum {max_topics} topics.
+ """
+
+         resp = client.chat.completions.create(
+             model="llama-3.3-70b-versatile",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.3,
+             max_tokens=200,
+             response_format={"type": "json_object"},
+         )
+
+         result = json.loads(resp.choices[0].message.content)
+         return result.get("topics", [])[:max_topics]
+
+     except Exception as e:
+         words = sample_text.lower().split()
+         fallback_topics = []
+         policy_keywords = [
+             "policy",
+             "work",
+             "remote",
+             "vacation",
+             "benefits",
+             "security",
+             "equipment",
+             "eligibility",
+         ]
+         for keyword in policy_keywords:
+             if keyword in words:
+                 fallback_topics.append(keyword.title())
+
+         return (
+             fallback_topics[:max_topics] if fallback_topics else ["General Information"]
+         )
+
+
+ @app.post("/api/query/generate")
+ async def query_with_llm(query_request: QueryRequest):
+     question = query_request.question.strip()
+
+     if len(question) < 3:
+         return {
+             "question": question,
+             "not_found": True,
+             "answer": "",
+             "message": "Question too short (minimum 3 characters)",
+             "sources": [],
+         }
+
+     results = rag_system.query(
+         question=question,
+         version_id=query_request.version_id,
+         k=query_request.k,
+     )
+
+     if not results:
+         return {
+             "question": question,
+             "not_found": True,
+             "answer": "",
+             "message": "No content found in this document version",
+             "suggestion": "Check if you selected the correct version or try searching all versions",
+             "sources": [],
+         }
+
+     top_score = results[0]["similarity_score"]
+
+     if top_score < 0.35:
+         topics = extract_document_topics(results)
+
+         return {
+             "question": question,
+             "not_found": True,
+             "answer": "",
+             "message": "No direct match for your question",
+             "topics": topics,
+             "suggestions": [
+                 "Try asking about specific topics listed above",
+                 "Use keywords from the document",
+                 (
+                     f"Example: 'What is the {topics[0].lower()}?'"
+                     if topics
+                     else "Be more specific"
+                 ),
+             ],
+             "top_score": round(top_score, 3),
+             "sources": [],
+         }
+
+     force_low_confidence = False
+
+     if top_score < 0.4:
+         filtered = results[:3]
+         force_low_confidence = True
+     elif top_score > 0.6:
+         filtered = [r for r in results if r["similarity_score"] > 0.5][:3]
+     elif top_score > 0.45:
+         filtered = [r for r in results if r["similarity_score"] > 0.4][:2]
+     else:
+         filtered = results[:1]
+
+     context = build_source_context(filtered)
+     avg_sim = sum(r["similarity_score"] for r in filtered) / len(filtered)
+
+     system_msg = """You are a helpful document Q&A assistant.
+
+ IMPORTANT RULES:
+ 1. Answer using ONLY the provided context
+ 2. If context is relevant, provide an answer even if partial
+ 3. Only return not_found=true if context is COMPLETELY unrelated
+ 4. For general questions (like "policy" or "document"), summarize key points
+
+ You must return valid JSON in this format:
+ {
+ "not_found": false,
+ "answer": "Your answer here",
+ "confidence": "high|medium|low"
+ }
+
+ Only use not_found=true if truly nothing relevant exists."""
+
+     user_prompt = f"""
+ Context (avg similarity: {avg_sim:.2f}):
+ {context}
+
+ Question: {question}
+
+ Provide a helpful answer based on the context. If the question is general, summarize the main points."""
+
+     try:
+         resp = client.chat.completions.create(
+             model="llama-3.3-70b-versatile",
+             messages=[
+                 {"role": "system", "content": system_msg},
+                 {"role": "user", "content": user_prompt},
+             ],
+             temperature=0.1,
+             max_tokens=800,
+             response_format={"type": "json_object"},
+         )
+
+         text = resp.choices[0].message.content.strip()
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"LLM API error: {str(e)}")
+
+     try:
+         j = json.loads(text)
+     except json.JSONDecodeError:
+         start = text.find("{")
+         end = text.rfind("}")
+         if start != -1 and end != -1:
+             try:
+                 j = json.loads(text[start : end + 1])
+             except json.JSONDecodeError:
+                 j = {
+                     "not_found": False,
+                     "answer": text,
+                     "confidence": "low",
+                     "note": "Response format was non-standard",
+                 }
+         else:
+             raise HTTPException(
+                 status_code=500, detail="Failed to parse LLM response as JSON"
+             )
+
+     j["sources"] = filtered
+     j["question"] = question
+     j["avg_similarity"] = round(avg_sim, 3)
+
+     if "confidence" not in j:
+         if avg_sim > 0.6:
+             j["confidence"] = "high"
+         elif avg_sim > 0.45:
+             j["confidence"] = "medium"
+         else:
+             j["confidence"] = "low"
+
+     if force_low_confidence:
+         j["confidence"] = "low"
+         j["warning"] = "Answer based on limited context relevance"
+
+     return j
+
+
+ @app.get("/api/documents")
+ async def list_documents():
+     try:
+         documents = rag_system.get_all_documents()
+         return documents
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.get("/api/documents/{doc_name}/versions")
+ async def get_document_versions(doc_name: str):
+     try:
+         versions = rag_system.get_document_versions(doc_name)
+         if not versions:
+             raise HTTPException(
+                 status_code=404, detail=f"Document '{doc_name}' not found"
+             )
+         return versions
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.get("/api/documents/{doc_name}/versions/{version_id}/diff")
+ async def get_version_diff(doc_name: str, version_id: int):
+     try:
+         session = get_db_session()
+
+         try:
+             current_version = (
+                 session.query(DocumentVersion).filter_by(id=version_id).first()
+             )
+
+             if not current_version:
+                 raise HTTPException(status_code=404, detail="Version not found")
+
+             prev_version = (
+                 session.query(DocumentVersion)
+                 .filter_by(
+                     document_id=current_version.document_id,
+                     version_number=current_version.version_number - 1,
+                 )
+                 .first()
+             )
+
+             if not prev_version:
+                 return {
+                     "success": True,
+                     "message": "This is the first version",
+                     "is_first_version": True,
+                     "current_version": current_version.version_number,
+                 }
+
+             current_chunks = [chunk.content for chunk in current_version.chunks]
+             prev_chunks = [chunk.content for chunk in prev_version.chunks]
+
+             current_text = "\n\n".join(current_chunks)
+             prev_text = "\n\n".join(prev_chunks)
+
+             stats = {
+                 "chunks_added": len(current_chunks) - len(prev_chunks),
+                 "current_chunks": len(current_chunks),
+                 "previous_chunks": len(prev_chunks),
+                 "current_version": current_version.version_number,
+                 "previous_version": prev_version.version_number,
+             }
+
+             system_msg = """You are analyzing document changes.
+ Identify what changed between two versions.
+ Be specific and concise.
+ You must respond with valid JSON only."""
+
+             user_prompt = f"""
+ Previous Version:
+ {prev_text[:3000]}...
+
+ Current Version:
+ {current_text[:3000]}...
+
+ Analyze the changes and return valid JSON in this format:
+ {{
+ "summary": "Brief overview of changes",
+ "key_changes": [
+ {{"type": "added|modified|removed", "description": "what changed"}},
+ ],
+ "impact": "low|medium|high"
+ }}
+ """
+             try:
+                 resp = client.chat.completions.create(
+                     model="llama-3.3-70b-versatile",
+                     messages=[
+                         {"role": "system", "content": system_msg},
+                         {"role": "user", "content": user_prompt},
+                     ],
+                     temperature=0.1,
+                     max_tokens=500,
+                     response_format={"type": "json_object"},  # ask the model for a JSON object response
+                 )
+
+                 llm_response = resp.choices[0].message.content.strip()
+
+                 try:
+                     diff_analysis = json.loads(llm_response)
+                 except json.JSONDecodeError as e:
+                     print(f"Failed to parse LLM response: {llm_response}")
+                     diff_analysis = {
+                         "summary": f"Version {current_version.version_number} has {len(current_chunks) - len(prev_chunks)} more chunks than version {prev_version.version_number}",
+                         "key_changes": [
+                             {
+                                 "type": "modified",
+                                 "description": f"Content updated with {abs(len(current_chunks) - len(prev_chunks))} chunk difference",
+                             }
+                         ],
+                         "impact": "medium",
+                     }
+
+             except Exception as llm_error:
+                 print(f"LLM API error: {llm_error}")
+                 diff_analysis = {
+                     "summary": "Unable to generate detailed analysis",
+                     "key_changes": [
+                         {
+                             "type": "modified",
+                             "description": f"{len(current_chunks)} chunks in current version vs {len(prev_chunks)} in previous",
+                         }
+                     ],
+                     "impact": "unknown",
+                 }
+
+             return {
+                 "success": True,
+                 "is_first_version": False,
+                 "stats": stats,
+                 "analysis": diff_analysis,
+                 "version_info": {
+                     "current": {
+                         "id": current_version.id,
+                         "number": current_version.version_number,
+                         "date": current_version.upload_date.isoformat(),
+                     },
+                     "previous": {
+                         "id": prev_version.id,
+                         "number": prev_version.version_number,
+                         "date": prev_version.upload_date.isoformat(),
+                     },
+                 },
+             }
+
+         finally:
+             session.close()
+
+     except HTTPException:
+         raise
+     except json.JSONDecodeError as e:
+         raise HTTPException(
+             status_code=500, detail=f"Failed to parse LLM response: {str(e)}"
+         )
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.post("/api/compare/detailed")
+ async def compare_versions_detailed(comparison: ComparisonRequest):
+
+     try:
+         session = get_db_session()
+
+         try:
+             v1 = (
+                 session.query(DocumentVersion)
+                 .filter_by(id=comparison.version_id_1)
+                 .first()
+             )
+             v2 = (
+                 session.query(DocumentVersion)
+                 .filter_by(id=comparison.version_id_2)
+                 .first()
+             )
+
+             if not v1 or not v2:
+                 raise HTTPException(status_code=404, detail="Version not found")
+
+             v1_chunks = [chunk.content for chunk in v1.chunks]
+             v2_chunks = [chunk.content for chunk in v2.chunks]
+
+             v1_text = "\n\n".join(v1_chunks)
+             v2_text = "\n\n".join(v2_chunks)
+
+             if comparison.question:
+                 results_v1 = rag_system.query(
+                     question=comparison.question,
+                     version_id=comparison.version_id_1,
+                     k=comparison.k,
+                 )
+
+                 results_v2 = rag_system.query(
+                     question=comparison.question,
+                     version_id=comparison.version_id_2,
+                     k=comparison.k,
+                 )
+
+                 context_v1 = "\n".join([r["content"] for r in results_v1[:2]])
+                 context_v2 = "\n".join([r["content"] for r in results_v2[:2]])
+
+                 system_msg = """Compare how two document versions answer the same question.
+ Identify specific differences."""
+
+                 user_prompt = f"""
+ Question: {comparison.question}
+
+ Version {v1.version_number} says:
+ {context_v1}
+
+ Version {v2.version_number} says:
+ {context_v2}
+
+ Return JSON:
+ {{
+ "answer_v1": "Answer from version 1",
+ "answer_v2": "Answer from version 2",
+ "changed": true/false,
+ "differences": [
+ {{"aspect": "what changed", "v1": "old value", "v2": "new value"}}
+ ],
+ "summary": "Overall comparison"
+ }}
+ """
+             else:
+                 system_msg = """Compare two document versions.
+ Identify all significant changes."""
+
+                 user_prompt = f"""
+ Version {v1.version_number}:
+ {v1_text[:4000]}...
+
+ Version {v2.version_number}:
+ {v2_text[:4000]}...
+
+ Return JSON:
+ {{
+ "overall_change": "high|medium|low",
+ "summary": "What changed overall",
+ "sections_changed": ["section 1", "section 2"],
+ "key_differences": [
+ {{"category": "category", "description": "what changed", "type": "added|modified|removed"}}
+ ],
+ "recommendations": "Who should review these changes"
+ }}
+ """
+
+             resp = client.chat.completions.create(
+                 model="llama-3.3-70b-versatile",
+                 messages=[
+                     {"role": "system", "content": system_msg},
+                     {"role": "user", "content": user_prompt},
+                 ],
+                 temperature=0.1,
+                 max_tokens=1000,
+             )
+
+             analysis = json.loads(resp.choices[0].message.content)
+
+             return {
+                 "success": True,
+                 "question": comparison.question if comparison.question else None,
+                 "version_info": {
+                     "version_1": {
+                         "id": v1.id,
+                         "number": v1.version_number,
+                         "date": v1.upload_date.isoformat(),
+                         "chunks": len(v1_chunks),
+                     },
+                     "version_2": {
+                         "id": v2.id,
+                         "number": v2.version_number,
+                         "date": v2.upload_date.isoformat(),
+                         "chunks": len(v2_chunks),
+                     },
+                 },
+                 "analysis": analysis,
+                 "stats": {
+                     "chunks_difference": len(v2_chunks) - len(v1_chunks),
+                     "text_length_v1": len(v1_text),
+                     "text_length_v2": len(v2_text),
+                 },
+             }
+
+         finally:
+             session.close()
+
+     except HTTPException:
+         raise
+     except json.JSONDecodeError:
+         raise HTTPException(status_code=500, detail="Failed to parse LLM response")
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run("server_app:app", host="0.0.0.0", port=8000, reload=True)
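
For reference, a minimal client sketch for the endpoints above, assuming the server is running locally on port 8000 (as in the __main__ block) and using a placeholder file name:

import httpx

# upload a document (multipart form: file plus an optional doc_name field)
with open("policy.pdf", "rb") as f:
    r = httpx.post(
        "http://localhost:8000/api/documents/upload",
        files={"file": ("policy.pdf", f, "application/pdf")},
        data={"doc_name": "policy"},
        timeout=120,
    )
print(r.json())

# ask a question; k controls how many chunks are retrieved as context
r = httpx.post(
    "http://localhost:8000/api/query/generate",
    json={"question": "What is the remote work policy?", "k": 5},
    timeout=120,
)
print(r.json().get("answer"))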
src/__init__.py ADDED
@@ -0,0 +1,16 @@
+ """
+ Incremental RAG System - A production-ready RAG with document versioning
+ """
+
+ from .rag_system import IncrementalRAGSystem
+ from .embeddings import EmbeddingGenerator
+ from .vector_store import FAISSVectorStore
+ from .document_processor import DocumentProcessor
+
+ __version__ = "1.0.0"
+ __all__ = [
+     "IncrementalRAGSystem",
+     "EmbeddingGenerator",
+     "FAISSVectorStore",
+     "DocumentProcessor",
+ ]
src/database.py ADDED
@@ -0,0 +1,83 @@
+ from sqlalchemy import (
+     create_engine,
+     Column,
+     Integer,
+     String,
+     DateTime,
+     Text,
+     ForeignKey,
+ )
+ from sqlalchemy.ext.declarative import declarative_base
+ from sqlalchemy.orm import sessionmaker, relationship
+ from datetime import datetime
+ import os
+
+ Base = declarative_base()
+
+
+ class Document(Base):
+
+     __tablename__ = "documents"
+
+     id = Column(Integer, primary_key=True)
+     doc_name = Column(String(255), nullable=False)
+     created_at = Column(DateTime, default=datetime.utcnow)
+
+     versions = relationship(
+         "DocumentVersion", back_populates="document", cascade="all, delete-orphan"
+     )
+
+     def __repr__(self):
+         return f"<Document(id={self.id}, name='{self.doc_name}')>"
+
+
+ class DocumentVersion(Base):
+
+     __tablename__ = "document_versions"
+
+     id = Column(Integer, primary_key=True)
+     document_id = Column(Integer, ForeignKey("documents.id"), nullable=False)
+     version_number = Column(Integer, nullable=False)
+     file_path = Column(String(512), nullable=False)
+     upload_date = Column(DateTime, default=datetime.utcnow)
+     file_hash = Column(String(64))
+     doc_metadata = Column(Text)
+     document = relationship("Document", back_populates="versions")
+     chunks = relationship(
+         "DocumentChunk", back_populates="version", cascade="all, delete-orphan"
+     )
+
+     def __repr__(self):
+         return f"<DocumentVersion(doc_id={self.document_id}, v{self.version_number})>"
+
+
+ class DocumentChunk(Base):
+
+     __tablename__ = "document_chunks"
+
+     id = Column(Integer, primary_key=True)
+     version_id = Column(Integer, ForeignKey("document_versions.id"), nullable=False)
+     chunk_index = Column(Integer, nullable=False)
+     content = Column(Text, nullable=False)
+     faiss_index = Column(Integer)
+
+     version = relationship("DocumentVersion", back_populates="chunks")
+
+     def __repr__(self):
+         return f"<DocumentChunk(id={self.id}, chunk_index={self.chunk_index})>"
+
+
+ def init_db(database_url: str = None):
+     if database_url is None:
+         database_url = os.getenv("DATABASE_URL", "sqlite:///./rag_system.db")
+
+     engine = create_engine(database_url, echo=False)
+     Base.metadata.create_all(engine)
+
+     SessionLocal = sessionmaker(bind=engine)
+     return engine, SessionLocal
+
+
+ def get_db_session(database_url: str = None):
+     _, SessionLocal = init_db(database_url)
+     return SessionLocal()
src/document_processor.py ADDED
@@ -0,0 +1,87 @@
+ import hashlib
+ from typing import List, Tuple
+ from pathlib import Path
+ import pypdf
+
+
+ class DocumentProcessor:
+
+     def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50):
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+
+     def extract_text_from_pdf(self, file_path: str) -> str:
+         text = ""
+         try:
+             with open(file_path, "rb") as file:
+                 pdf_reader = pypdf.PdfReader(file)
+                 for page in pdf_reader.pages:
+                     text += page.extract_text() + "\n"
+         except Exception as e:
+             raise ValueError(f"Error reading PDF: {str(e)}")
+
+         return text.strip()
+
+     def extract_text_from_docx(self, file_path: str) -> str:
+         # .docx is accepted by the upload API and python-docx is already a declared dependency
+         try:
+             from docx import Document as DocxDocument
+             docx_file = DocxDocument(file_path)
+             text = "\n".join(paragraph.text for paragraph in docx_file.paragraphs)
+         except Exception as e:
+             raise ValueError(f"Error reading DOCX: {str(e)}")
+
+         return text.strip()
+
+     def chunk_text(self, text: str) -> List[str]:
+         if not text:
+             return []
+
+         chunks = []
+         start = 0
+         text_length = len(text)
+
+         while start < text_length:
+             end = start + self.chunk_size
+             chunk = text[start:end]
+
+             if end < text_length:
+                 last_period = chunk.rfind(".")
+                 last_newline = chunk.rfind("\n")
+                 break_point = max(last_period, last_newline)
+
+                 if break_point > self.chunk_size * 0.5:
+                     chunk = chunk[: break_point + 1]
+                     end = start + break_point + 1
+
+             chunks.append(chunk.strip())
+
+             start = end - self.chunk_overlap
+
+         return [c for c in chunks if c]
+
+     def process_document(self, file_path: str) -> Tuple[str, List[str]]:
+
+         file_ext = Path(file_path).suffix.lower()
+
+         if file_ext == ".pdf":
+             text = self.extract_text_from_pdf(file_path)
+         elif file_ext == ".docx":
+             text = self.extract_text_from_docx(file_path)
+         elif file_ext == ".txt":
+             with open(file_path, "r", encoding="utf-8") as f:
+                 text = f.read()
+         else:
+             raise ValueError(f"Unsupported file type: {file_ext}")
+
+         chunks = self.chunk_text(text)
+
+         return text, chunks
+
+     @staticmethod
+     def compute_file_hash(file_path: str) -> str:
+         hash_md5 = hashlib.md5()
+         with open(file_path, "rb") as f:
+             for chunk in iter(lambda: f.read(4096), b""):
+                 hash_md5.update(chunk)
+         return hash_md5.hexdigest()
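
To illustrate the sliding-window chunking implemented above, a small sketch run from the repository root (the text and sizes are placeholders chosen only to make the overlap visible; the real defaults are 512/50):

from src.document_processor import DocumentProcessor

# a 20-character window with a 5-character overlap
p = DocumentProcessor(chunk_size=20, chunk_overlap=5)
chunks = p.chunk_text("First sentence here. Second sentence follows. Third one ends it.")
# each window tries to end on the last '.' or newline it contains,
# and the next window starts chunk_overlap characters earlier
print(chunks)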
src/embeddings.py ADDED
@@ -0,0 +1,33 @@
+ from typing import List
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+ import os
+
+
+ class EmbeddingGenerator:
+
+     def __init__(self, model_name: str = None):
+         self.model_name = model_name or os.getenv("EMBEDDING_MODEL", "all-MiniLM-L6-v2")
+         print(f"Loading embedding model: {self.model_name}")
+         self.model = SentenceTransformer(self.model_name)
+         self.embedding_dim = self.model.get_sentence_embedding_dimension()
+         print(f"Model loaded. Embedding dimension: {self.embedding_dim}")
+
+     def embed_text(self, text: str) -> np.ndarray:
+         return self.model.encode(text, convert_to_numpy=True)
+
+     def embed_batch(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
+         if not texts:
+             return np.array([])
+
+         embeddings = self.model.encode(
+             texts,
+             batch_size=batch_size,
+             convert_to_numpy=True,
+             show_progress_bar=len(texts) > 10,
+         )
+
+         return embeddings
+
+     def get_embedding_dim(self) -> int:
+         return self.embedding_dim
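
A quick usage sketch of the generator above (run from the repository root; the model is downloaded on first use and the sentences are placeholders):

from src.embeddings import EmbeddingGenerator

emb = EmbeddingGenerator()  # defaults to all-MiniLM-L6-v2, which is 384-dimensional
vecs = emb.embed_batch(["remote work policy", "vacation days"])
print(vecs.shape, emb.get_embedding_dim())  # expected: (2, 384) 384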
src/rag_system.py ADDED
@@ -0,0 +1,239 @@
+ import os
+ import shutil
+ from pathlib import Path
+ from typing import List, Tuple, Optional
+ from datetime import datetime
+
+ from src.database import (
+     init_db,
+     get_db_session,
+     Document,
+     DocumentVersion,
+     DocumentChunk,
+ )
+ from src.document_processor import DocumentProcessor
+ from src.embeddings import EmbeddingGenerator
+ from src.vector_store import FAISSVectorStore
+
+
+ class IncrementalRAGSystem:
+
+     def __init__(
+         self,
+         database_url: str = None,
+         embedding_model: str = None,
+         index_path: str = None,
+         upload_dir: str = None,
+     ):
+
+         print("Initializing Incremental RAG System...")
+
+         self.database_url = database_url or os.getenv(
+             "DATABASE_URL", "sqlite:///./rag_system.db"
+         )
+         init_db(self.database_url)
+
+         self.processor = DocumentProcessor(chunk_size=512, chunk_overlap=50)
+         self.embedder = EmbeddingGenerator(model_name=embedding_model)
+         self.vector_store = FAISSVectorStore(
+             embedding_dim=self.embedder.get_embedding_dim(),
+             index_path=index_path or "./data/faiss_index",
+         )
+
+         self.upload_dir = upload_dir or "./uploads"
+         Path(self.upload_dir).mkdir(parents=True, exist_ok=True)
+
+         print("RAG System initialized successfully!")
+
+     def add_document(self, file_path: str, doc_name: str = None) -> dict:
+
+         if not Path(file_path).exists():
+             raise FileNotFoundError(f"File not found: {file_path}")
+
+         if doc_name is None:
+             doc_name = Path(file_path).stem
+
+         print(f"\nProcessing document: {doc_name}")
+
+         full_text, chunks = self.processor.process_document(file_path)
+         file_hash = self.processor.compute_file_hash(file_path)
+
+         print(f" - Extracted {len(chunks)} chunks")
+
+         session = get_db_session(self.database_url)
+
+         try:
+             document = session.query(Document).filter_by(doc_name=doc_name).first()
+
+             if document is None:
+                 document = Document(doc_name=doc_name)
+                 session.add(document)
+                 session.flush()
+                 version_number = 1
+                 print(f" - Created new document (ID: {document.id})")
+             else:
+                 max_version = (
+                     session.query(DocumentVersion)
+                     .filter_by(document_id=document.id)
+                     .count()
+                 )
+                 version_number = max_version + 1
+                 print(f" - Adding version {version_number} to existing document")
+
+             dest_path = (
+                 Path(self.upload_dir)
+                 / f"{doc_name}_v{version_number}{Path(file_path).suffix}"
+             )
+             shutil.copy2(file_path, dest_path)
+
+             version = DocumentVersion(
+                 document_id=document.id,
+                 version_number=version_number,
+                 file_path=str(dest_path),
+                 file_hash=file_hash,
+             )
+             session.add(version)
+             session.flush()
+
+             print(f" - Generating embeddings...")
+             embeddings = self.embedder.embed_batch(chunks)
+
+             metadata_list = [
+                 {
+                     "document_id": document.id,
+                     "version_id": version.id,
+                     "chunk_index": i,
+                     "doc_name": doc_name,
+                     "version_number": version_number,
+                     "content": chunk,
+                 }
+                 for i, chunk in enumerate(chunks)
+             ]
+
+             faiss_ids = self.vector_store.add_embeddings(embeddings, metadata_list)
+
+             for i, (chunk, faiss_id) in enumerate(zip(chunks, faiss_ids)):
+                 db_chunk = DocumentChunk(
+                     version_id=version.id,
+                     chunk_index=i,
+                     content=chunk,
+                     faiss_index=faiss_id,
+                 )
+                 session.add(db_chunk)
+
+             session.commit()
+
+             self.vector_store.save()
+
+             print(f"Successfully added {doc_name} v{version_number}")
+
+             return {
+                 "document_id": document.id,
+                 "document_name": doc_name,
+                 "version_id": version.id,
+                 "version_number": version_number,
+                 "num_chunks": len(chunks),
+                 "file_path": str(dest_path),
+             }
+
+         except Exception as e:
+             session.rollback()
+             raise e
+         finally:
+             session.close()
+
+     def query(
+         self, question: str, version_id: Optional[int] = None, k: int = 5
+     ) -> List[dict]:
+         print(f"\nQuerying: '{question}'")
+
+         query_embedding = self.embedder.embed_text(question)
+
+         results = self.vector_store.search(
+             query_embedding, k=k, version_filter=version_id
+         )
+
+         print(f" - Found {len(results)} relevant chunks")
+
+         formatted_results = []
+         for distance, metadata in results:
+             formatted_results.append(
+                 {
+                     "content": metadata.get("content", ""),
+                     "document_name": metadata.get("doc_name", ""),
+                     "version": metadata.get("version_number", ""),
+                     "chunk_index": metadata.get("chunk_index", ""),
+                     "similarity_score": 1 / (1 + distance),
+                 }
+             )
+
+         return formatted_results
+
+     def get_document_versions(self, doc_name: str) -> List[dict]:
+         session = get_db_session(self.database_url)
+
+         try:
+             document = session.query(Document).filter_by(doc_name=doc_name).first()
+
+             if not document:
+                 return []
+
+             versions = (
+                 session.query(DocumentVersion)
+                 .filter_by(document_id=document.id)
+                 .order_by(DocumentVersion.version_number)
+                 .all()
+             )
+
+             return [
+                 {
+                     "version_id": v.id,
+                     "version_number": v.version_number,
+                     "upload_date": v.upload_date.isoformat(),
+                     "file_path": v.file_path,
+                     "num_chunks": len(v.chunks),
+                 }
+                 for v in versions
+             ]
+         finally:
+             session.close()
+
+     def get_all_documents(self) -> List[dict]:
+         session = get_db_session(self.database_url)
+
+         try:
+             documents = session.query(Document).all()
+
+             result = []
+             for doc in documents:
+                 result.append(
+                     {
+                         "document_id": doc.id,
+                         "document_name": doc.doc_name,
+                         "created_at": doc.created_at.isoformat(),
+                         "num_versions": len(doc.versions),
+                     }
+                 )
+
+             return result
+         finally:
+             session.close()
+
+     def get_stats(self) -> dict:
+         session = get_db_session(self.database_url)
+
+         try:
+             num_documents = session.query(Document).count()
+             num_versions = session.query(DocumentVersion).count()
+             num_chunks = session.query(DocumentChunk).count()
+
+             vector_stats = self.vector_store.get_stats()
+
+             return {
+                 "num_documents": num_documents,
+                 "num_versions": num_versions,
+                 "num_chunks": num_chunks,
+                 "vector_store": vector_stats,
+             }
+         finally:
+             session.close()
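
A minimal end-to-end sketch of the class above, used directly rather than through the API (run from the repository root; the file path and question are placeholders):

from src.rag_system import IncrementalRAGSystem

rag = IncrementalRAGSystem()  # initializes the SQLite DB, FAISS index and ./uploads directory
info = rag.add_document("docs/policy_v1.pdf")  # stored as version 1; re-adding the same doc_name creates version 2
hits = rag.query("What is the vacation policy?", version_id=info["version_id"], k=5)
for h in hits:
    print(round(h["similarity_score"], 3), h["content"][:80])
print(rag.get_stats())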
src/vector_store.py ADDED
@@ -0,0 +1,126 @@
+ import faiss
+ import numpy as np
+ import pickle
+ from pathlib import Path
+ from typing import List, Tuple, Optional
+
+
+ class FAISSVectorStore:
+
+     def __init__(self, embedding_dim: int, index_path: str = None):
+
+         self.embedding_dim = embedding_dim
+         self.index_path = index_path or "./data/faiss_index"
+         self.index = None
+         self.id_to_metadata = {}  # Map FAISS ID to metadata
+         self.current_id = 0
+
+         Path(self.index_path).parent.mkdir(parents=True, exist_ok=True)
+
+         if Path(f"{self.index_path}.faiss").exists():
+             self.load()
+         else:
+             self._create_new_index()
+
+     def _create_new_index(self):
+         self.index = faiss.IndexFlatL2(self.embedding_dim)
+         self.id_to_metadata = {}
+         self.current_id = 0
+         print(f"Created new FAISS index with dimension {self.embedding_dim}")
+
+     def add_embeddings(self, embeddings: np.ndarray, metadata: List[dict]) -> List[int]:
+
+         if embeddings.shape[1] != self.embedding_dim:
+             raise ValueError(
+                 f"Embedding dimension mismatch: expected {self.embedding_dim}, "
+                 f"got {embeddings.shape[1]}"
+             )
+
+         embeddings = embeddings.astype("float32")
+
+         num_vectors = embeddings.shape[0]
+         ids = list(range(self.current_id, self.current_id + num_vectors))
+
+         self.index.add(embeddings)
+
+         for i, meta in zip(ids, metadata):
+             self.id_to_metadata[i] = meta
+
+         self.current_id += num_vectors
+
+         print(f"Added {num_vectors} vectors. Total: {self.index.ntotal}")
+         return ids
+
+     def search(
+         self,
+         query_embedding: np.ndarray,
+         k: int = 5,
+         version_filter: Optional[int] = None,
+     ) -> List[Tuple[float, dict]]:
+
+         if self.index.ntotal == 0:
+             return []
+
+         if query_embedding.ndim == 1:
+             query_embedding = query_embedding.reshape(1, -1)
+         query_embedding = query_embedding.astype("float32")
+
+         search_k = k * 10 if version_filter else k
+         distances, indices = self.index.search(
+             query_embedding, min(search_k, self.index.ntotal)
+         )
+
+         results = []
+         for dist, idx in zip(distances[0], indices[0]):
+             if idx == -1:
+                 continue
+
+             metadata = self.id_to_metadata.get(int(idx), {})
+
+             if version_filter is not None:
+                 if metadata.get("version_id") != version_filter:
+                     continue
+
+             results.append((float(dist), metadata))
+
+             if len(results) >= k:
+                 break
+
+         return results
+
+     def save(self):
+         faiss.write_index(self.index, f"{self.index_path}.faiss")
+
+         with open(f"{self.index_path}.meta", "wb") as f:
+             pickle.dump(
+                 {
+                     "id_to_metadata": self.id_to_metadata,
+                     "current_id": self.current_id,
+                     "embedding_dim": self.embedding_dim,
+                 },
+                 f,
+             )
+
+         print(f"Saved index to {self.index_path}")
+
+     def load(self):
+         try:
+             self.index = faiss.read_index(f"{self.index_path}.faiss")
+
+             with open(f"{self.index_path}.meta", "rb") as f:
+                 data = pickle.load(f)
+                 self.id_to_metadata = data["id_to_metadata"]
+                 self.current_id = data["current_id"]
+                 self.embedding_dim = data["embedding_dim"]
+
+             print(f"Loaded index from {self.index_path} ({self.index.ntotal} vectors)")
+         except Exception as e:
+             print(f"Error loading index: {e}")
+             self._create_new_index()
+
+     def get_stats(self) -> dict:
+         return {
+             "total_vectors": self.index.ntotal if self.index else 0,
+             "embedding_dim": self.embedding_dim,
+             "index_path": self.index_path,
+         }
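
A small isolation sketch of the store above (run from the repository root; the 384 dimension matches all-MiniLM-L6-v2 and the index path is a throwaway placeholder):

import numpy as np
from src.vector_store import FAISSVectorStore

store = FAISSVectorStore(embedding_dim=384, index_path="./data/test_index")
vecs = np.random.rand(3, 384).astype("float32")
meta = [{"version_id": 1, "chunk_index": i, "content": f"chunk {i}"} for i in range(3)]
store.add_embeddings(vecs, meta)
# search returns (L2 distance, metadata) pairs; version_filter keeps only rows whose version_id matches
for dist, m in store.search(vecs[0], k=2, version_filter=1):
    print(round(dist, 4), m["chunk_index"])
store.save()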