Phonex commited on
Commit ·
167596f
0
Parent(s):
TheTruthSchool_RAG
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +68 -0
- .env.docker.example +15 -0
- .env.example +51 -0
- .gitignore +92 -0
- Dockerfile +178 -0
- LICENSE +21 -0
- README.md +269 -0
- backend/Dockerfile +44 -0
- backend/README.md +353 -0
- backend/__init__.py +7 -0
- backend/main.py +2078 -0
- backend/requirements.txt +28 -0
- backend/reranker.py +304 -0
- backend/url_fetcher.py +381 -0
- backend/web_search.py +295 -0
- docker-compose.yml +58 -0
- frontend/.env.example +2 -0
- frontend/Dockerfile +33 -0
- frontend/nginx.conf +46 -0
- frontend/package-lock.json +0 -0
- frontend/package.json +43 -0
- frontend/postcss.config.js +6 -0
- frontend/public/index.html +17 -0
- frontend/src/App.js +1268 -0
- frontend/src/index.css +79 -0
- frontend/src/index.js +11 -0
- frontend/tailwind.config.js +27 -0
- rag_anything_smaranika/.github/ISSUE_TEMPLATE/bug_report.yml +61 -0
- rag_anything_smaranika/.github/ISSUE_TEMPLATE/config.yml +1 -0
- rag_anything_smaranika/.github/ISSUE_TEMPLATE/feature_request.yml +26 -0
- rag_anything_smaranika/.github/ISSUE_TEMPLATE/question.yml +26 -0
- rag_anything_smaranika/.github/dependabot.yml +11 -0
- rag_anything_smaranika/.github/pull_request_template.md +32 -0
- rag_anything_smaranika/.github/workflows/linting.yaml +30 -0
- rag_anything_smaranika/.github/workflows/pypi-publish.yml +52 -0
- rag_anything_smaranika/.gitignore +79 -0
- rag_anything_smaranika/.pre-commit-config.yaml +28 -0
- rag_anything_smaranika/LICENSE +21 -0
- rag_anything_smaranika/MANIFEST.in +9 -0
- rag_anything_smaranika/README.md +1260 -0
- rag_anything_smaranika/README_zh.md +1258 -0
- rag_anything_smaranika/docs/batch_processing.md +341 -0
- rag_anything_smaranika/docs/context_aware_processing.md +375 -0
- rag_anything_smaranika/docs/enhanced_markdown.md +552 -0
- rag_anything_smaranika/env.example +192 -0
- rag_anything_smaranika/examples/batch_processing_example.py +561 -0
- rag_anything_smaranika/examples/batch_processing_optimized_example.py +216 -0
- rag_anything_smaranika/examples/enhanced_markdown_example.py +1055 -0
- rag_anything_smaranika/examples/image_format_test.py +234 -0
- rag_anything_smaranika/examples/insert_content_list_example.py +419 -0
.dockerignore
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
*.egg-info/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
*.egg
|
| 11 |
+
.pytest_cache/
|
| 12 |
+
.mypy_cache/
|
| 13 |
+
.ruff_cache/
|
| 14 |
+
|
| 15 |
+
# Virtual Environments
|
| 16 |
+
venv/
|
| 17 |
+
env/
|
| 18 |
+
ENV/
|
| 19 |
+
.venv
|
| 20 |
+
|
| 21 |
+
# IDEs
|
| 22 |
+
.vscode/
|
| 23 |
+
.idea/
|
| 24 |
+
*.swp
|
| 25 |
+
*.swo
|
| 26 |
+
*~
|
| 27 |
+
|
| 28 |
+
# OS
|
| 29 |
+
.DS_Store
|
| 30 |
+
Thumbs.db
|
| 31 |
+
|
| 32 |
+
# Git
|
| 33 |
+
.git/
|
| 34 |
+
.gitignore
|
| 35 |
+
.gitattributes
|
| 36 |
+
|
| 37 |
+
# Documentation
|
| 38 |
+
*.md
|
| 39 |
+
!README.md
|
| 40 |
+
|
| 41 |
+
# Logs
|
| 42 |
+
*.log
|
| 43 |
+
logs/
|
| 44 |
+
|
| 45 |
+
# Environment files (will be passed via docker-compose)
|
| 46 |
+
.env
|
| 47 |
+
.env.*
|
| 48 |
+
|
| 49 |
+
# Storage and uploads (will be mounted as volumes)
|
| 50 |
+
storage/
|
| 51 |
+
uploads/
|
| 52 |
+
backend/output/
|
| 53 |
+
|
| 54 |
+
# Frontend
|
| 55 |
+
frontend/node_modules/
|
| 56 |
+
frontend/build/
|
| 57 |
+
frontend/.env
|
| 58 |
+
frontend/.env.local
|
| 59 |
+
|
| 60 |
+
# Temporary files
|
| 61 |
+
tmp/
|
| 62 |
+
temp/
|
| 63 |
+
*.tmp
|
| 64 |
+
|
| 65 |
+
# Docker
|
| 66 |
+
Dockerfile
|
| 67 |
+
docker-compose*.yml
|
| 68 |
+
.dockerignore
|
.env.docker.example
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
GEMINI_API_KEY=your_gemini_api_key_here
|
| 3 |
+
|
| 4 |
+
GEMINI_TEXT_MODEL=models/gemini-flash-latest
|
| 5 |
+
GEMINI_VERIFIER_MODEL=models/gemini-pro-latest
|
| 6 |
+
GEMINI_VISION_MODEL=models/gemini-flash-latest
|
| 7 |
+
GEMINI_EMBEDDING_MODEL=models/text-embedding-004
|
| 8 |
+
|
| 9 |
+
TAVILY_API_KEY=your_tavily_api_key_here
|
| 10 |
+
|
| 11 |
+
REACT_APP_BACKEND_URL=http://localhost:8000
|
| 12 |
+
|
| 13 |
+
BACKEND_PORT=8000
|
| 14 |
+
|
| 15 |
+
FRONTEND_PORT=3000
|
.env.example
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Agentic RAG System - Environment Variables
|
| 2 |
+
# Copy this file to .env and fill in your actual API keys
|
| 3 |
+
|
| 4 |
+
# ============================================
|
| 5 |
+
# Required: Google Gemini API Key
|
| 6 |
+
# ============================================
|
| 7 |
+
# Get your free API key from: https://makersuite.google.com/app/apikey
|
| 8 |
+
GEMINI_API_KEY=your_gemini_api_key_here
|
| 9 |
+
|
| 10 |
+
# ============================================
|
| 11 |
+
# Gemini Model Configuration (Optional)
|
| 12 |
+
# ============================================
|
| 13 |
+
# Text generation model (fast responses)
|
| 14 |
+
GEMINI_TEXT_MODEL=models/gemini-flash-latest
|
| 15 |
+
|
| 16 |
+
# Verification model (quality checking)
|
| 17 |
+
GEMINI_VERIFIER_MODEL=models/gemini-pro-latest
|
| 18 |
+
|
| 19 |
+
# Vision model (image processing)
|
| 20 |
+
GEMINI_VISION_MODEL=models/gemini-flash-latest
|
| 21 |
+
|
| 22 |
+
# Embedding model (vector embeddings)
|
| 23 |
+
GEMINI_EMBEDDING_MODEL=models/text-embedding-004
|
| 24 |
+
|
| 25 |
+
# ============================================
|
| 26 |
+
# Optional: Tavily API Key (Web Search)
|
| 27 |
+
# ============================================
|
| 28 |
+
# Get your free API key from: https://tavily.com
|
| 29 |
+
# Leave empty to disable web search features
|
| 30 |
+
TAVILY_API_KEY=your_tavily_api_key_here
|
| 31 |
+
|
| 32 |
+
# ============================================
|
| 33 |
+
# Application Configuration
|
| 34 |
+
# ============================================
|
| 35 |
+
# Backend API URL (used by frontend)
|
| 36 |
+
REACT_APP_BACKEND_URL=http://localhost:8000
|
| 37 |
+
|
| 38 |
+
# Backend port
|
| 39 |
+
BACKEND_PORT=8000
|
| 40 |
+
|
| 41 |
+
# Frontend port
|
| 42 |
+
FRONTEND_PORT=3000
|
| 43 |
+
|
| 44 |
+
# ============================================
|
| 45 |
+
# Hugging Face Space Configuration
|
| 46 |
+
# ============================================
|
| 47 |
+
# When deploying to Hugging Face Spaces:
|
| 48 |
+
# 1. Go to your Space settings
|
| 49 |
+
# 2. Add secrets for GEMINI_API_KEY and TAVILY_API_KEY
|
| 50 |
+
# 3. The Dockerfile will use port 7860 automatically
|
| 51 |
+
# 4. REACT_APP_BACKEND_URL will be set to /api automatically
|
.gitignore
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dependencies
|
| 2 |
+
node_modules/
|
| 3 |
+
frontend/node_modules/
|
| 4 |
+
backend/__pycache__/
|
| 5 |
+
venv/
|
| 6 |
+
__pycache__/
|
| 7 |
+
*.pyc
|
| 8 |
+
*.pyo
|
| 9 |
+
*.pyd
|
| 10 |
+
.Python
|
| 11 |
+
pip-log.txt
|
| 12 |
+
pip-delete-this-directory.txt
|
| 13 |
+
|
| 14 |
+
# Build outputs
|
| 15 |
+
frontend/build/
|
| 16 |
+
dist/
|
| 17 |
+
build/
|
| 18 |
+
*.egg-info/
|
| 19 |
+
.eggs/
|
| 20 |
+
|
| 21 |
+
# Cache
|
| 22 |
+
frontend/node_modules/.cache/
|
| 23 |
+
.cache/
|
| 24 |
+
.pytest_cache/
|
| 25 |
+
.mypy_cache/
|
| 26 |
+
|
| 27 |
+
# Environment variables
|
| 28 |
+
.env
|
| 29 |
+
.env.local
|
| 30 |
+
.env.*.local
|
| 31 |
+
*.env
|
| 32 |
+
backend/.env
|
| 33 |
+
frontend/.env
|
| 34 |
+
|
| 35 |
+
# IDE
|
| 36 |
+
.vscode/
|
| 37 |
+
.idea/
|
| 38 |
+
*.swp
|
| 39 |
+
*.swo
|
| 40 |
+
*~
|
| 41 |
+
|
| 42 |
+
# OS
|
| 43 |
+
.DS_Store
|
| 44 |
+
Thumbs.db
|
| 45 |
+
*.log
|
| 46 |
+
|
| 47 |
+
# Output files
|
| 48 |
+
backend/output/
|
| 49 |
+
*.out
|
| 50 |
+
|
| 51 |
+
# Storage (runtime data - don't commit to git)
|
| 52 |
+
storage/
|
| 53 |
+
uploads/
|
| 54 |
+
rag_anything_smaranika/__pycache__/
|
| 55 |
+
|
| 56 |
+
# Logs
|
| 57 |
+
logs/
|
| 58 |
+
*.log
|
| 59 |
+
npm-debug.log*
|
| 60 |
+
yarn-debug.log*
|
| 61 |
+
yarn-error.log*
|
| 62 |
+
|
| 63 |
+
# Test coverage
|
| 64 |
+
htmlcov/
|
| 65 |
+
.coverage
|
| 66 |
+
.coverage.*
|
| 67 |
+
coverage.xml
|
| 68 |
+
*.cover
|
| 69 |
+
|
| 70 |
+
# Jupyter Notebook
|
| 71 |
+
.ipynb_checkpoints
|
| 72 |
+
|
| 73 |
+
# Docker
|
| 74 |
+
*.pid
|
| 75 |
+
.docker/
|
| 76 |
+
|
| 77 |
+
# Temporary files
|
| 78 |
+
tmp/
|
| 79 |
+
temp/
|
| 80 |
+
*.tmp
|
| 81 |
+
|
| 82 |
+
# Binary assets (use Git LFS if needed)
|
| 83 |
+
*.png
|
| 84 |
+
*.jpg
|
| 85 |
+
*.jpeg
|
| 86 |
+
*.gif
|
| 87 |
+
*.svg
|
| 88 |
+
*.ico
|
| 89 |
+
|
| 90 |
+
# Local LightRAG package in vendor directory
|
| 91 |
+
vendor/lightrag/__pycache__/
|
| 92 |
+
vendor/**/__pycache__/
|
Dockerfile
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
FROM python:3.12-slim
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
curl \
|
| 10 |
+
wget \
|
| 11 |
+
git \
|
| 12 |
+
build-essential \
|
| 13 |
+
nginx \
|
| 14 |
+
nodejs \
|
| 15 |
+
npm \
|
| 16 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
COPY backend/requirements.txt /app/backend/requirements.txt
|
| 20 |
+
RUN pip install --no-cache-dir --use-pep517 -r /app/backend/requirements.txt
|
| 21 |
+
|
| 22 |
+
# Copy local modified LightRAG package in vendor directory
|
| 23 |
+
COPY vendor/ /app/vendor/
|
| 24 |
+
|
| 25 |
+
COPY backend/ /app/backend/
|
| 26 |
+
COPY rag_anything_smaranika/ /app/rag_anything_smaranika/
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
COPY frontend/ /app/frontend/
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
WORKDIR /app/frontend
|
| 33 |
+
RUN npm install
|
| 34 |
+
RUN REACT_APP_API_URL=/api npm run build
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
RUN mkdir -p /var/lib/nginx/body /var/lib/nginx/fastcgi \
|
| 39 |
+
/var/lib/nginx/proxy /var/lib/nginx/scgi /var/lib/nginx/uwsgi \
|
| 40 |
+
/var/log/nginx /var/cache/nginx && \
|
| 41 |
+
chmod -R 777 /var/lib/nginx /var/log/nginx /var/cache/nginx && \
|
| 42 |
+
touch /var/run/nginx.pid && chmod 666 /var/run/nginx.pid
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
RUN echo 'pid /tmp/nginx.pid;\n\
|
| 46 |
+
error_log /var/log/nginx/error.log;\n\
|
| 47 |
+
events {\n\
|
| 48 |
+
worker_connections 1024;\n\
|
| 49 |
+
}\n\
|
| 50 |
+
http {\n\
|
| 51 |
+
include /etc/nginx/mime.types;\n\
|
| 52 |
+
default_type application/octet-stream;\n\
|
| 53 |
+
access_log /var/log/nginx/access.log;\n\
|
| 54 |
+
client_body_temp_path /tmp/client_body;\n\
|
| 55 |
+
proxy_temp_path /tmp/proxy;\n\
|
| 56 |
+
fastcgi_temp_path /tmp/fastcgi;\n\
|
| 57 |
+
uwsgi_temp_path /tmp/uwsgi;\n\
|
| 58 |
+
scgi_temp_path /tmp/scgi;\n\
|
| 59 |
+
\n\
|
| 60 |
+
server {\n\
|
| 61 |
+
listen 7860;\n\
|
| 62 |
+
server_name _;\n\
|
| 63 |
+
\n\
|
| 64 |
+
location / {\n\
|
| 65 |
+
root /app/frontend/build;\n\
|
| 66 |
+
try_files $uri $uri/ /index.html;\n\
|
| 67 |
+
}\n\
|
| 68 |
+
\n\
|
| 69 |
+
location /api/ {\n\
|
| 70 |
+
proxy_pass http://127.0.0.1:8000/;\n\
|
| 71 |
+
proxy_http_version 1.1;\n\
|
| 72 |
+
proxy_set_header Upgrade $http_upgrade;\n\
|
| 73 |
+
proxy_set_header Connection "upgrade";\n\
|
| 74 |
+
proxy_set_header Host $host;\n\
|
| 75 |
+
proxy_set_header X-Real-IP $remote_addr;\n\
|
| 76 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n\
|
| 77 |
+
proxy_set_header X-Forwarded-Proto $scheme;\n\
|
| 78 |
+
proxy_buffering off;\n\
|
| 79 |
+
proxy_cache_bypass $http_upgrade;\n\
|
| 80 |
+
}\n\
|
| 81 |
+
\n\
|
| 82 |
+
location /health {\n\
|
| 83 |
+
proxy_pass http://127.0.0.1:8000/health;\n\
|
| 84 |
+
proxy_http_version 1.1;\n\
|
| 85 |
+
proxy_set_header Host $host;\n\
|
| 86 |
+
}\n\
|
| 87 |
+
}\n\
|
| 88 |
+
}' > /etc/nginx/nginx.conf
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
RUN mkdir -p /app/storage /app/uploads /app/backend/output /app/output /app/.cache/huggingface && \
|
| 92 |
+
chmod -R 777 /app/storage /app/uploads /app/backend/output /app/output /app/.cache
|
| 93 |
+
|
| 94 |
+
WORKDIR /app
|
| 95 |
+
|
| 96 |
+
WORKDIR /app/rag_anything_smaranika
|
| 97 |
+
RUN pip install --no-cache-dir -e .
|
| 98 |
+
|
| 99 |
+
WORKDIR /app
|
| 100 |
+
|
| 101 |
+
RUN mkdir -p /app/storage/medical /app/storage/legal /app/storage/financial \
|
| 102 |
+
/app/storage/technical /app/storage/academic && \
|
| 103 |
+
chmod -R 777 /app/storage
|
| 104 |
+
|
| 105 |
+
# Create output directory in the working directory for the parser
|
| 106 |
+
RUN mkdir -p /app/output && chmod -R 777 /app/output
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
RUN echo '#!/bin/bash\n\
|
| 110 |
+
set -e\n\
|
| 111 |
+
\n\
|
| 112 |
+
echo "===== Application Startup at $(date +"%Y-%m-%d %H:%M:%S") ====="\n\
|
| 113 |
+
echo ""\n\
|
| 114 |
+
echo "Starting Agentic RAG System for Hugging Face Space..."\n\
|
| 115 |
+
\n\
|
| 116 |
+
# Check for required environment variables\n\
|
| 117 |
+
if [ -z "$GEMINI_API_KEY" ]; then\n\
|
| 118 |
+
echo "ERROR: GEMINI_API_KEY environment variable is not set!"\n\
|
| 119 |
+
echo "Please set it in your Hugging Face Space settings."\n\
|
| 120 |
+
exit 1\n\
|
| 121 |
+
fi\n\
|
| 122 |
+
\n\
|
| 123 |
+
# Start backend in background\n\
|
| 124 |
+
echo "Starting FastAPI backend on port 8000..."\n\
|
| 125 |
+
cd /app\n\
|
| 126 |
+
export PYTHONPATH=/app:/app/vendor:$PYTHONPATH\n\
|
| 127 |
+
python -m uvicorn backend.main:app --host 127.0.0.1 --port 8000 --log-level info &\n\
|
| 128 |
+
BACKEND_PID=$!\n\
|
| 129 |
+
\n\
|
| 130 |
+
# Wait for backend to be ready\n\
|
| 131 |
+
echo "Waiting for backend to be ready..."\n\
|
| 132 |
+
for i in {1..30}; do\n\
|
| 133 |
+
if curl -s http://127.0.0.1:8000/health > /dev/null 2>&1; then\n\
|
| 134 |
+
echo "Backend is ready!"\n\
|
| 135 |
+
break\n\
|
| 136 |
+
fi\n\
|
| 137 |
+
echo "Waiting for backend... ($i/30)"\n\
|
| 138 |
+
sleep 2\n\
|
| 139 |
+
done\n\
|
| 140 |
+
\n\
|
| 141 |
+
# Start nginx\n\
|
| 142 |
+
echo "Starting nginx on port 7860..."\n\
|
| 143 |
+
nginx -g "daemon off;" &\n\
|
| 144 |
+
NGINX_PID=$!\n\
|
| 145 |
+
\n\
|
| 146 |
+
echo ""\n\
|
| 147 |
+
echo "==========================================="\n\
|
| 148 |
+
echo "Agentic RAG System is running!"\n\
|
| 149 |
+
echo "Backend: http://localhost:8000"\n\
|
| 150 |
+
echo "Frontend: http://localhost:7860"\n\
|
| 151 |
+
echo "API Docs: http://localhost:8000/docs"\n\
|
| 152 |
+
echo "==========================================="\n\
|
| 153 |
+
echo ""\n\
|
| 154 |
+
\n\
|
| 155 |
+
# Wait for both processes\n\
|
| 156 |
+
wait $BACKEND_PID $NGINX_PID\n\
|
| 157 |
+
' > /app/start.sh && chmod +x /app/start.sh
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
EXPOSE 7860
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
| 164 |
+
CMD curl -f http://localhost:7860/health || exit 1
|
| 165 |
+
|
| 166 |
+
ENV PYTHONUNBUFFERED=1
|
| 167 |
+
ENV PYTHONPATH=/app:/app/vendor:$PYTHONPATH
|
| 168 |
+
ENV BACKEND_PORT=8000
|
| 169 |
+
ENV FRONTEND_PORT=7860
|
| 170 |
+
ENV HF_HOME=/app/.cache/huggingface
|
| 171 |
+
ENV TRANSFORMERS_CACHE=/app/.cache/huggingface
|
| 172 |
+
ENV HF_DATASETS_CACHE=/app/.cache/huggingface/datasets
|
| 173 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 174 |
+
ENV PYTHONHASHSEED=0
|
| 175 |
+
ENV PYTHONOPTIMIZE=0
|
| 176 |
+
|
| 177 |
+
# Start the application
|
| 178 |
+
CMD ["/app/start.sh"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Agentic RAG System Contributors
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Agentic RAG System
|
| 3 |
+
emoji: 🤖
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: mit
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# 🤖 Agentic RAG System
|
| 13 |
+
|
| 14 |
+
A production-ready **Retrieval-Augmented Generation (RAG)** system with multi-domain support, advanced AI features, and intelligent web search integration. Built with FastAPI, React, and Google Gemini API.
|
| 15 |
+
|
| 16 |
+
## ✨ Features
|
| 17 |
+
|
| 18 |
+
### 🎯 Multi-Domain Support
|
| 19 |
+
- **Medical & Healthcare**: Medical documents, research papers, clinical guidelines
|
| 20 |
+
- **Legal & Compliance**: Legal documents, contracts, regulations, case law
|
| 21 |
+
- **Financial & Analytics**: Financial reports, analysis, market research
|
| 22 |
+
- **Technical Documentation**: Technical docs, APIs, code, architecture
|
| 23 |
+
- **Academic Research**: Research papers, academic publications, studies
|
| 24 |
+
|
| 25 |
+
### 🚀 Advanced AI Capabilities
|
| 26 |
+
- **Query Improvement**: Automatic query enhancement with abbreviation expansion
|
| 27 |
+
- **Dual-LLM Verification**: Two-stage answer verification using Gemini Pro
|
| 28 |
+
- **Web Search Integration**: Augment answers with real-time web search via Tavily
|
| 29 |
+
- **Conversation Memory**: Context-aware responses with conversation history
|
| 30 |
+
- **Multimodal Processing**: Support for images, tables, and equations (MinerU parser)
|
| 31 |
+
- **Smart Reranking**: Gemini-powered relevance reranking for better results
|
| 32 |
+
- **Streaming Responses**: Real-time token streaming for responsive UX
|
| 33 |
+
|
| 34 |
+
### 🔧 Technical Features
|
| 35 |
+
- **Gemini API Integration**: Free-tier Gemini Flash & Pro models
|
| 36 |
+
- **Async Processing**: Background document processing with status tracking
|
| 37 |
+
- **RESTful API**: Clean, well-documented FastAPI endpoints
|
| 38 |
+
- **Modern React Frontend**: Beautiful, responsive UI with Tailwind CSS
|
| 39 |
+
- **Docker Support**: One-command deployment with docker-compose
|
| 40 |
+
- **Performance Optimized**: Query caching, fast mode (2-3x speedup), batch processing
|
| 41 |
+
|
| 42 |
+
## 🚀 Quick Start (Docker)
|
| 43 |
+
|
| 44 |
+
### Prerequisites
|
| 45 |
+
- Docker and Docker Compose
|
| 46 |
+
- Google Gemini API Key ([Get one free](https://makersuite.google.com/app/apikey))
|
| 47 |
+
- (Optional) Tavily API Key for web search ([Get one free](https://tavily.com))
|
| 48 |
+
|
| 49 |
+
### 1. Clone the Repository
|
| 50 |
+
```bash
|
| 51 |
+
git clone <your-repo-url>
|
| 52 |
+
cd Agentic_RAG
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
### 2. Set Up Environment Variables
|
| 56 |
+
Create a `.env` file in the project root:
|
| 57 |
+
```bash
|
| 58 |
+
GEMINI_API_KEY=your_gemini_api_key_here
|
| 59 |
+
GEMINI_TEXT_MODEL=models/gemini-flash-latest
|
| 60 |
+
GEMINI_VERIFIER_MODEL=models/gemini-pro-latest
|
| 61 |
+
GEMINI_VISION_MODEL=models/gemini-flash-latest
|
| 62 |
+
GEMINI_EMBEDDING_MODEL=models/text-embedding-004
|
| 63 |
+
TAVILY_API_KEY=your_tavily_api_key_here # Optional, for web search
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
### 3. Start the Application
|
| 67 |
+
```bash
|
| 68 |
+
docker-compose up -d
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
### 4. Access the Application
|
| 72 |
+
- **Frontend**: http://localhost:3000
|
| 73 |
+
- **Backend API**: http://localhost:8000
|
| 74 |
+
- **API Docs**: http://localhost:8000/docs
|
| 75 |
+
|
| 76 |
+
## 📖 Usage
|
| 77 |
+
|
| 78 |
+
### Upload Documents
|
| 79 |
+
1. Navigate to the frontend at http://localhost:3000
|
| 80 |
+
2. Select a domain (Medical, Legal, Financial, Technical, or Academic)
|
| 81 |
+
3. Upload PDF, DOCX, TXT, or other supported documents
|
| 82 |
+
4. Wait for processing to complete (tracked with real-time status)
|
| 83 |
+
|
| 84 |
+
### Query Documents
|
| 85 |
+
1. Enter your question in the query interface
|
| 86 |
+
2. Select query mode:
|
| 87 |
+
- **Mix**: Balanced combination of local and global search (recommended)
|
| 88 |
+
- **Local**: Focused chunk-based search
|
| 89 |
+
- **Global**: Knowledge graph entity search
|
| 90 |
+
- **Hybrid**: Advanced combination
|
| 91 |
+
- **Web**: RAG + real-time web search
|
| 92 |
+
3. Toggle advanced features:
|
| 93 |
+
- **Query Improvement**: Enhance your query automatically
|
| 94 |
+
- **Verification**: Dual-LLM quality check
|
| 95 |
+
- **Web Search**: Augment with real-time web results
|
| 96 |
+
- **Fast Mode**: 2-3x faster queries (slightly lower quality)
|
| 97 |
+
4. Get streaming responses with sources and confidence scores
|
| 98 |
+
|
| 99 |
+
### API Usage
|
| 100 |
+
```python
|
| 101 |
+
import requests
|
| 102 |
+
|
| 103 |
+
# Upload document
|
| 104 |
+
files = {"file": open("document.pdf", "rb")}
|
| 105 |
+
data = {"domain": "medical"}
|
| 106 |
+
response = requests.post("http://localhost:8000/upload", files=files, data=data)
|
| 107 |
+
print(response.json())
|
| 108 |
+
|
| 109 |
+
# Query documents
|
| 110 |
+
query_data = {
|
| 111 |
+
"query": "What are the treatment options for hypertension?",
|
| 112 |
+
"domain": "medical",
|
| 113 |
+
"mode": "mix",
|
| 114 |
+
"enable_web_search": False,
|
| 115 |
+
"fast_mode": False,
|
| 116 |
+
"return_metadata": True
|
| 117 |
+
}
|
| 118 |
+
response = requests.post("http://localhost:8000/query", json=query_data)
|
| 119 |
+
print(response.json())
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
## 🏗️ Architecture
|
| 123 |
+
|
| 124 |
+
```
|
| 125 |
+
Agentic_RAG/
|
| 126 |
+
├── backend/ # FastAPI backend
|
| 127 |
+
│ ├── main.py # Main API server
|
| 128 |
+
│ ├── reranker.py # Gemini-powered reranking
|
| 129 |
+
│ ├── web_search.py # Tavily web search integration
|
| 130 |
+
│ ├── url_fetcher.py # URL content fetching
|
| 131 |
+
│ ├── requirements.txt # Python dependencies
|
| 132 |
+
│ └── Dockerfile # Backend container
|
| 133 |
+
├── frontend/ # React frontend
|
| 134 |
+
│ ├── src/ # React components
|
| 135 |
+
│ ├── public/ # Static assets
|
| 136 |
+
│ ├── package.json # Node dependencies
|
| 137 |
+
│ ├── Dockerfile # Frontend container
|
| 138 |
+
│ └── nginx.conf # Nginx configuration
|
| 139 |
+
├── storage/ # RAG storage (created at runtime)
|
| 140 |
+
│ ├── medical/ # Medical domain storage
|
| 141 |
+
│ ├── legal/ # Legal domain storage
|
| 142 |
+
│ └── ... # Other domains
|
| 143 |
+
├── uploads/ # Uploaded documents
|
| 144 |
+
├── docker-compose.yml # Docker orchestration
|
| 145 |
+
├── Dockerfile # Hugging Face Space Dockerfile
|
| 146 |
+
└── README.md # This file
|
| 147 |
+
```
|
| 148 |
+
|
| 149 |
+
## 🔑 Environment Variables
|
| 150 |
+
|
| 151 |
+
| Variable | Description | Required | Default |
|
| 152 |
+
|----------|-------------|----------|---------|
|
| 153 |
+
| `GEMINI_API_KEY` | Google Gemini API key | Yes | - |
|
| 154 |
+
| `GEMINI_TEXT_MODEL` | Text generation model | No | `models/gemini-flash-latest` |
|
| 155 |
+
| `GEMINI_VERIFIER_MODEL` | Verification model | No | `models/gemini-pro-latest` |
|
| 156 |
+
| `GEMINI_VISION_MODEL` | Vision processing model | No | `models/gemini-flash-latest` |
|
| 157 |
+
| `GEMINI_EMBEDDING_MODEL` | Embedding model | No | `models/text-embedding-004` |
|
| 158 |
+
| `TAVILY_API_KEY` | Tavily web search API key | No | - |
|
| 159 |
+
|
| 160 |
+
## 📊 API Endpoints
|
| 161 |
+
|
| 162 |
+
### Health Check
|
| 163 |
+
```bash
|
| 164 |
+
GET /health
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
### List Domains
|
| 168 |
+
```bash
|
| 169 |
+
GET /domains
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
### Upload Document
|
| 173 |
+
```bash
|
| 174 |
+
POST /upload
|
| 175 |
+
Content-Type: multipart/form-data
|
| 176 |
+
|
| 177 |
+
file: <document file>
|
| 178 |
+
domain: medical
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
### Query Documents (Streaming)
|
| 182 |
+
```bash
|
| 183 |
+
POST /query/stream
|
| 184 |
+
Content-Type: application/json
|
| 185 |
+
|
| 186 |
+
{
|
| 187 |
+
"query": "What are the treatment options?",
|
| 188 |
+
"domain": "medical",
|
| 189 |
+
"mode": "mix",
|
| 190 |
+
"enable_web_search": false,
|
| 191 |
+
"fast_mode": false
|
| 192 |
+
}
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
### Query Documents (Standard)
|
| 196 |
+
```bash
|
| 197 |
+
POST /query
|
| 198 |
+
Content-Type: application/json
|
| 199 |
+
|
| 200 |
+
{
|
| 201 |
+
"query": "What are the treatment options?",
|
| 202 |
+
"domain": "medical",
|
| 203 |
+
"mode": "mix"
|
| 204 |
+
}
|
| 205 |
+
```
|
| 206 |
+
|
| 207 |
+
### Check Processing Status
|
| 208 |
+
```bash
|
| 209 |
+
GET /status/{processing_id}
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
### List Documents
|
| 213 |
+
```bash
|
| 214 |
+
GET /documents?domain=medical
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
### Delete Document
|
| 218 |
+
```bash
|
| 219 |
+
DELETE /documents/{doc_id}
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
## 🎯 Performance
|
| 223 |
+
|
| 224 |
+
- **Fast Mode**: 2-3x faster queries with optimized parameters
|
| 225 |
+
- **Query Caching**: 5-minute TTL cache for repeated queries
|
| 226 |
+
- **Batch Processing**: Parallel document processing (up to 10 documents)
|
| 227 |
+
- **Streaming**: Real-time token streaming for responsive UX
|
| 228 |
+
- **Reranking**: Gemini-powered relevance scoring
|
| 229 |
+
|
| 230 |
+
## 🛠️ Development
|
| 231 |
+
|
| 232 |
+
### Backend Development
|
| 233 |
+
```bash
|
| 234 |
+
cd backend
|
| 235 |
+
pip install -r requirements.txt
|
| 236 |
+
python main.py
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
### Frontend Development
|
| 240 |
+
```bash
|
| 241 |
+
cd frontend
|
| 242 |
+
npm install
|
| 243 |
+
npm start
|
| 244 |
+
```
|
| 245 |
+
|
| 246 |
+
## 🤝 Contributing
|
| 247 |
+
|
| 248 |
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
| 249 |
+
|
| 250 |
+
## 📝 License
|
| 251 |
+
|
| 252 |
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
| 253 |
+
|
| 254 |
+
## 🙏 Acknowledgments
|
| 255 |
+
|
| 256 |
+
- [LightRAG](https://github.com/HKUDS/LightRAG) - RAG framework
|
| 257 |
+
- [Google Gemini](https://ai.google.dev/) - LLM and embeddings
|
| 258 |
+
- [Tavily](https://tavily.com/) - Web search API
|
| 259 |
+
- [MinerU](https://github.com/opendatalab/MinerU) - Document parsing
|
| 260 |
+
- [FastAPI](https://fastapi.tiangolo.com/) - Backend framework
|
| 261 |
+
- [React](https://react.dev/) - Frontend framework
|
| 262 |
+
|
| 263 |
+
## 📧 Support
|
| 264 |
+
|
| 265 |
+
For issues and questions, please open an issue on GitHub.
|
| 266 |
+
|
| 267 |
+
---
|
| 268 |
+
|
| 269 |
+
Built with ❤️ using FastAPI, React, and Google Gemini
|
backend/Dockerfile
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
# Set working directory
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
RUN apt-get update && apt-get install -y \
|
| 8 |
+
build-essential \
|
| 9 |
+
curl \
|
| 10 |
+
git \
|
| 11 |
+
wget \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
# Copy backend requirements first for better caching
|
| 15 |
+
COPY backend/requirements.txt /app/backend/requirements.txt
|
| 16 |
+
|
| 17 |
+
# Install Python dependencies
|
| 18 |
+
RUN pip install --no-cache-dir -r /app/backend/requirements.txt
|
| 19 |
+
|
| 20 |
+
# Copy rag_anything_smaranika (contains raganything module)
|
| 21 |
+
COPY rag_anything_smaranika /app/rag_anything_smaranika
|
| 22 |
+
|
| 23 |
+
# Install raganything as an editable package
|
| 24 |
+
RUN pip install -e /app/rag_anything_smaranika
|
| 25 |
+
|
| 26 |
+
# Copy backend application code
|
| 27 |
+
COPY backend /app/backend
|
| 28 |
+
|
| 29 |
+
# Create necessary directories
|
| 30 |
+
RUN mkdir -p /app/storage /app/uploads /app/backend/output
|
| 31 |
+
|
| 32 |
+
# Set environment variables
|
| 33 |
+
ENV PYTHONUNBUFFERED=1
|
| 34 |
+
ENV PYTHONPATH=/app
|
| 35 |
+
|
| 36 |
+
# Expose port
|
| 37 |
+
EXPOSE 8000
|
| 38 |
+
|
| 39 |
+
# Health check
|
| 40 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
|
| 41 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 42 |
+
|
| 43 |
+
# Run the application
|
| 44 |
+
CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
backend/README.md
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Enhanced RAG-Anything Backend API
|
| 2 |
+
|
| 3 |
+
Production-ready FastAPI backend for the RAG-Anything system with multi-domain support and advanced AI features.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
### 🎯 Multi-Domain Support
|
| 8 |
+
- **Medical & Healthcare**: Medical documents, research papers, clinical guidelines
|
| 9 |
+
- **Legal & Compliance**: Legal documents, contracts, regulations, case law
|
| 10 |
+
- **Financial & Analytics**: Financial reports, analysis, market research
|
| 11 |
+
- **Technical Documentation**: Technical docs, APIs, code, architecture
|
| 12 |
+
- **Academic Research**: Research papers, academic publications, studies
|
| 13 |
+
|
| 14 |
+
### 🚀 Advanced AI Capabilities
|
| 15 |
+
- **Query Improvement**: Automatic query enhancement with abbreviation expansion
|
| 16 |
+
- **Dual-LLM Verification**: Two-stage answer verification for quality assurance
|
| 17 |
+
- **Conversation Memory**: Context-aware responses with conversation history
|
| 18 |
+
- **Multimodal Processing**: Support for images, tables, and equations
|
| 19 |
+
- **Domain-Specific Prompts**: Optimized prompts for each domain
|
| 20 |
+
|
| 21 |
+
### 🔧 Technical Features
|
| 22 |
+
- **Gemini API Integration**: Free-tier Gemini 1.5 Flash model
|
| 23 |
+
- **Async Processing**: Background document processing
|
| 24 |
+
- **RESTful API**: Clean, well-documented endpoints
|
| 25 |
+
- **CORS Support**: Cross-origin resource sharing enabled
|
| 26 |
+
- **Error Handling**: Comprehensive error handling and logging
|
| 27 |
+
|
| 28 |
+
## Installation
|
| 29 |
+
|
| 30 |
+
### Prerequisites
|
| 31 |
+
- Python 3.9+
|
| 32 |
+
- Gemini API Key ([Get one here](https://makersuite.google.com/app/apikey))
|
| 33 |
+
|
| 34 |
+
### Setup
|
| 35 |
+
|
| 36 |
+
1. **Navigate to the backend directory** (after cloning the repository)
```bash
cd backend
```
|
| 40 |
+
|
| 41 |
+
2. **Install dependencies**
|
| 42 |
+
```bash
|
| 43 |
+
pip install -r requirements.txt
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
3. **Set up environment variables**
|
| 47 |
+
```bash
|
| 48 |
+
export GEMINI_API_KEY="your-api-key-here"
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
Or create a `.env` file:
|
| 52 |
+
```env
|
| 53 |
+
GEMINI_API_KEY=your-api-key-here
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
4. **Run the server**
|
| 57 |
+
```bash
|
| 58 |
+
python main.py
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
Or using uvicorn directly:
|
| 62 |
+
```bash
|
| 63 |
+
uvicorn main:app --host 0.0.0.0 --port 8000 --reload
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
## API Endpoints
|
| 67 |
+
|
| 68 |
+
### Health Check
|
| 69 |
+
```bash
|
| 70 |
+
GET /health
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
Response:
|
| 74 |
+
```json
|
| 75 |
+
{
|
| 76 |
+
"status": "healthy",
|
| 77 |
+
"timestamp": "2025-01-04T10:00:00",
|
| 78 |
+
"version": "1.0.0",
|
| 79 |
+
"features": {
|
| 80 |
+
"query_improvement": true,
|
| 81 |
+
"dual_llm_verification": true,
|
| 82 |
+
"conversation_memory": true,
|
| 83 |
+
"multi_domain": true,
|
| 84 |
+
"multimodal_processing": true,
|
| 85 |
+
"gemini_integration": true
|
| 86 |
+
},
|
| 87 |
+
"domains": ["medical", "legal", "financial", "technical", "academic"]
|
| 88 |
+
}
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
### List Domains
|
| 92 |
+
```bash
|
| 93 |
+
GET /domains
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
### Upload Document
|
| 97 |
+
```bash
|
| 98 |
+
POST /upload
|
| 99 |
+
Content-Type: multipart/form-data
|
| 100 |
+
|
| 101 |
+
file: <document file>
|
| 102 |
+
domain: medical
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
Response:
|
| 106 |
+
```json
|
| 107 |
+
{
|
| 108 |
+
"success": true,
|
| 109 |
+
"message": "Document uploaded and queued for processing",
|
| 110 |
+
"file_name": "research_paper.pdf",
|
| 111 |
+
"domain": "medical",
|
| 112 |
+
"processing_id": "uuid-here"
|
| 113 |
+
}
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
### Query Documents
|
| 117 |
+
```bash
|
| 118 |
+
POST /query
|
| 119 |
+
Content-Type: application/json
|
| 120 |
+
|
| 121 |
+
{
|
| 122 |
+
"query": "What are the treatment options for hypertension?",
|
| 123 |
+
"domain": "medical",
|
| 124 |
+
"mode": "mix",
|
| 125 |
+
"conversation_id": "conv_123",
|
| 126 |
+
"return_metadata": true
|
| 127 |
+
}
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
Response:
|
| 131 |
+
```json
|
| 132 |
+
{
|
| 133 |
+
"answer": "Hypertension treatment includes lifestyle modifications...",
|
| 134 |
+
"sources": ["medical_guidelines.pdf"],
|
| 135 |
+
"confidence_score": 0.92,
|
| 136 |
+
"query_improved": true,
|
| 137 |
+
"verification_performed": true,
|
| 138 |
+
"conversation_id": "conv_123",
|
| 139 |
+
"metadata": {
|
| 140 |
+
"original_query": "What is HTN treatment?",
|
| 141 |
+
"improved_query": "What are the treatment options for hypertension?",
|
| 142 |
+
"verification_score": 8.5,
|
| 143 |
+
"modification_attempts": 1
|
| 144 |
+
}
|
| 145 |
+
}
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
### Get Conversation History
|
| 149 |
+
```bash
|
| 150 |
+
GET /conversation/{conversation_id}
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
### Clear Conversation
|
| 154 |
+
```bash
|
| 155 |
+
DELETE /conversation/{conversation_id}
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
### Clear Domain Data
|
| 159 |
+
```bash
|
| 160 |
+
DELETE /clear/{domain}
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
## Usage Examples
|
| 164 |
+
|
| 165 |
+
### Using cURL
|
| 166 |
+
|
| 167 |
+
**Upload a document:**
|
| 168 |
+
```bash
|
| 169 |
+
curl -X POST "http://localhost:8000/upload" \
|
| 170 |
+
-F "file=@medical_paper.pdf" \
|
| 171 |
+
-F "domain=medical"
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
**Query documents:**
|
| 175 |
+
```bash
|
| 176 |
+
curl -X POST "http://localhost:8000/query" \
|
| 177 |
+
-H "Content-Type: application/json" \
|
| 178 |
+
-d '{
|
| 179 |
+
"query": "What are the side effects of ACE inhibitors?",
|
| 180 |
+
"domain": "medical",
|
| 181 |
+
"mode": "mix",
|
| 182 |
+
"return_metadata": true
|
| 183 |
+
}'
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
### Using Python
|
| 187 |
+
|
| 188 |
+
```python
|
| 189 |
+
import requests
|
| 190 |
+
|
| 191 |
+
# Upload document
|
| 192 |
+
with open("medical_paper.pdf", "rb") as f:
|
| 193 |
+
files = {"file": f}
|
| 194 |
+
data = {"domain": "medical"}
|
| 195 |
+
response = requests.post("http://localhost:8000/upload", files=files, data=data)
|
| 196 |
+
print(response.json())
|
| 197 |
+
|
| 198 |
+
# Query documents
|
| 199 |
+
query_data = {
|
| 200 |
+
"query": "What are the treatment options for hypertension?",
|
| 201 |
+
"domain": "medical",
|
| 202 |
+
"mode": "mix",
|
| 203 |
+
"return_metadata": True
|
| 204 |
+
}
|
| 205 |
+
response = requests.post("http://localhost:8000/query", json=query_data)
|
| 206 |
+
print(response.json())
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
### Using JavaScript/TypeScript
|
| 210 |
+
|
| 211 |
+
```typescript
|
| 212 |
+
// Upload document
|
| 213 |
+
const formData = new FormData();
|
| 214 |
+
formData.append('file', fileInput.files[0]);
|
| 215 |
+
formData.append('domain', 'medical');
|
| 216 |
+
|
| 217 |
+
const uploadResponse = await fetch('http://localhost:8000/upload', {
|
| 218 |
+
method: 'POST',
|
| 219 |
+
body: formData
|
| 220 |
+
});
|
| 221 |
+
|
| 222 |
+
// Query documents
|
| 223 |
+
const queryResponse = await fetch('http://localhost:8000/query', {
|
| 224 |
+
method: 'POST',
|
| 225 |
+
headers: { 'Content-Type': 'application/json' },
|
| 226 |
+
body: JSON.stringify({
|
| 227 |
+
query: 'What are the treatment options for hypertension?',
|
| 228 |
+
domain: 'medical',
|
| 229 |
+
mode: 'mix',
|
| 230 |
+
return_metadata: true
|
| 231 |
+
})
|
| 232 |
+
});
|
| 233 |
+
|
| 234 |
+
const result = await queryResponse.json();
|
| 235 |
+
console.log(result);
|
| 236 |
+
```
|
| 237 |
+
|
| 238 |
+
## Configuration
|
| 239 |
+
|
| 240 |
+
### Domain-Specific Settings
|
| 241 |
+
|
| 242 |
+
Each domain has customized settings in `DOMAIN_CONFIGS`:
|
| 243 |
+
|
| 244 |
+
```python
|
| 245 |
+
{
|
| 246 |
+
"medical": {
|
| 247 |
+
"enable_query_improvement": True,
|
| 248 |
+
"query_improvement_method": "hybrid",
|
| 249 |
+
"expand_abbreviations": True,
|
| 250 |
+
"verification_threshold": 7.5,
|
| 251 |
+
# ... more settings
|
| 252 |
+
}
|
| 253 |
+
}
|
| 254 |
+
```
|
| 255 |
+
|
| 256 |
+
### Gemini Model Configuration
|
| 257 |
+
|
| 258 |
+
Currently using `gemini-1.5-flash` (free tier). To use a different model:
|
| 259 |
+
|
| 260 |
+
```python
|
| 261 |
+
GEMINI_MODEL = "gemini-1.5-pro" # More capable, paid tier
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
+
## Architecture
|
| 265 |
+
|
| 266 |
+
```
|
| 267 |
+
backend/
|
| 268 |
+
├── main.py # FastAPI application
|
| 269 |
+
├── requirements.txt # Python dependencies
|
| 270 |
+
└── README.md # This file
|
| 271 |
+
|
| 272 |
+
storage/ # Created at runtime
|
| 273 |
+
├── medical/ # Medical domain storage
|
| 274 |
+
├── legal/ # Legal domain storage
|
| 275 |
+
├── financial/ # Financial domain storage
|
| 276 |
+
├── technical/ # Technical domain storage
|
| 277 |
+
└── academic/ # Academic domain storage
|
| 278 |
+
|
| 279 |
+
uploads/ # Uploaded files
|
| 280 |
+
├── medical/
|
| 281 |
+
├── legal/
|
| 282 |
+
└── ...
|
| 283 |
+
```
|
| 284 |
+
|
| 285 |
+
## API Documentation
|
| 286 |
+
|
| 287 |
+
Interactive API documentation is available at:
|
| 288 |
+
- **Swagger UI**: http://localhost:8000/docs
|
| 289 |
+
- **ReDoc**: http://localhost:8000/redoc
|
| 290 |
+
|
| 291 |
+
## Error Handling
|
| 292 |
+
|
| 293 |
+
The API uses standard HTTP status codes:
|
| 294 |
+
|
| 295 |
+
- `200`: Success
|
| 296 |
+
- `400`: Bad Request (invalid parameters)
|
| 297 |
+
- `404`: Not Found
|
| 298 |
+
- `500`: Internal Server Error
|
| 299 |
+
|
| 300 |
+
All errors return JSON:
|
| 301 |
+
```json
|
| 302 |
+
{
|
| 303 |
+
"detail": "Error message here"
|
| 304 |
+
}
|
| 305 |
+
```
|
| 306 |
+
|
| 307 |
+
## Logging
|
| 308 |
+
|
| 309 |
+
Logs are output to console with the format:
|
| 310 |
+
```
|
| 311 |
+
2025-01-04 10:00:00 - main - INFO - Message here
|
| 312 |
+
```
|
| 313 |
+
|
| 314 |
+
## Production Deployment
|
| 315 |
+
|
| 316 |
+
For production deployment:
|
| 317 |
+
|
| 318 |
+
1. **Set proper CORS origins** in `main.py`:
|
| 319 |
+
```python
|
| 320 |
+
allow_origins=["https://your-frontend-domain.com"]
|
| 321 |
+
```
|
| 322 |
+
|
| 323 |
+
2. **Use a production ASGI server**:
|
| 324 |
+
```bash
|
| 325 |
+
gunicorn main:app -w 4 -k uvicorn.workers.UvicornWorker
|
| 326 |
+
```
|
| 327 |
+
|
| 328 |
+
3. **Set up environment variables** securely (don't commit `.env` files)
|
| 329 |
+
|
| 330 |
+
4. **Enable HTTPS** using a reverse proxy (nginx, Caddy, etc.)
|
| 331 |
+
|
| 332 |
+
5. **Set up proper logging** (file-based, log rotation)
|
| 333 |
+
|
| 334 |
+
6. **Monitor** with tools like Prometheus, Grafana
|
| 335 |
+
|
| 336 |
+
## Troubleshooting
|
| 337 |
+
|
| 338 |
+
### "GEMINI_API_KEY not set"
|
| 339 |
+
Set your API key as an environment variable or in a `.env` file.
|
| 340 |
+
|
| 341 |
+
### "Failed to initialize RAG system"
|
| 342 |
+
Check that the storage directories are writable and all dependencies are installed.
|
| 343 |
+
|
| 344 |
+
### "File type not supported"
|
| 345 |
+
Verify the file extension is in the allowed list for the target domain.
|
| 346 |
+
|
| 347 |
+
## License
|
| 348 |
+
|
| 349 |
+
[Your License Here]
|
| 350 |
+
|
| 351 |
+
## Support
|
| 352 |
+
|
| 353 |
+
For issues and questions, please open an issue on GitHub.
|
backend/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Backend package for RAG-Anything API
|
| 3 |
+
|
| 4 |
+
This package contains the FastAPI backend and supporting modules.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
__version__ = "1.1.0"
|
backend/main.py
ADDED
|
@@ -0,0 +1,2078 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI Backend for Enhanced RAG-Anything System (v1.1 - Updated)
|
| 3 |
+
|
| 4 |
+
Production-ready backend with:
|
| 5 |
+
- Multi-domain support (medical, legal, financial, technical, academic)
|
| 6 |
+
- Gemini API integration (LLM, Vision, Embeddings)
|
| 7 |
+
- Query improvement and dual-LLM verification
|
| 8 |
+
- Conversation history management
|
| 9 |
+
- Document processing and querying
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import asyncio
|
| 14 |
+
import logging
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Dict, List, Optional, Any
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
import uuid
|
| 19 |
+
import hashlib
|
| 20 |
+
import time
|
| 21 |
+
import json
|
| 22 |
+
from contextlib import asynccontextmanager
|
| 23 |
+
from dotenv import load_dotenv
|
| 24 |
+
from cachetools import TTLCache
|
| 25 |
+
|
| 26 |
+
from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
|
| 27 |
+
from fastapi.responses import StreamingResponse
|
| 28 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 29 |
+
from pydantic import BaseModel, Field
|
| 30 |
+
import google.generativeai as genai
|
| 31 |
+
|
| 32 |
+
# Add project root to path for imports
|
| 33 |
+
import sys
|
| 34 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 35 |
+
|
| 36 |
+
# Load environment variables from .env file
|
| 37 |
+
load_dotenv(Path(__file__).parent / ".env")
|
| 38 |
+
|
| 39 |
+
from raganything.raganything import RAGAnything, RAGAnythingConfig, create_rag_anything
|
| 40 |
+
from backend.reranker import GeminiReranker
|
| 41 |
+
from backend.web_search import WebSearcher, create_web_searcher
|
| 42 |
+
from backend.url_fetcher import URLFetcher, create_url_fetcher
|
| 43 |
+
|
| 44 |
+
# Configure logging
# Root-logger setup happens once at import time; every module logger
# (including the one below) inherits this level and format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger used by all functions in this file.
logger = logging.getLogger(__name__)
|
| 50 |
+
|
| 51 |
+
# =============================================================================
# Domain Configurations
# =============================================================================

# Per-domain settings, keyed by the domain ids accepted in API requests.
# Each entry carries:
#   - display metadata ("name", "description"),
#   - the system/analysis prompts injected into the generation LLM,
#   - the upload file extensions accepted for that domain,
#   - "config_overrides" passed through to the RAG configuration
#     (query-improvement and dual-LLM verification knobs).
DOMAIN_CONFIGS = {
    "medical": {
        "name": "Medical & Healthcare",
        "description": "Optimized for medical documents, research papers, clinical guidelines",
        "system_prompt": (
            "You are a medical AI assistant with expertise in healthcare, clinical medicine, "
            "and medical research. Provide accurate, evidence-based responses with appropriate "
            "medical terminology. Always cite sources and indicate confidence levels."
        ),
        "analysis_prompt": (
            "Analyze this medical document focusing on: diagnoses, treatments, medications, "
            "clinical findings, patient outcomes, and evidence-based recommendations."
        ),
        "file_extensions": [".pdf", ".doc", ".docx", ".txt", ".md", ".csv", ".xlsx"],
        # Full verification pipeline enabled, including answer modification.
        "config_overrides": {
            "domain": "medical",
            "enable_query_improvement": True,
            "query_improvement_method": "hybrid",
            "expand_abbreviations": True,
            "add_domain_keywords": True,
            "extract_query_entities": True,
            "enable_dual_llm_verification": True,
            "enable_answer_verification": True,
            "enable_answer_modification": True,
            "verification_threshold": 7.5,
            "check_factual_consistency": True,
            "check_completeness": True,
            "check_relevance": True,
        }
    },
    "legal": {
        "name": "Legal & Compliance",
        "description": "Specialized for legal documents, contracts, regulations, case law",
        "system_prompt": (
            "You are a legal AI assistant with expertise in law, regulations, and compliance. "
            "Provide precise legal analysis with proper citations. Note that this is for "
            "informational purposes only and not legal advice."
        ),
        "analysis_prompt": (
            "Analyze this legal document focusing on: key provisions, obligations, rights, "
            "legal precedents, regulatory requirements, and potential implications."
        ),
        "file_extensions": [".pdf", ".doc", ".docx", ".txt", ".csv", ".xlsx"],
        # Higher verification bar (8.0) than medical/financial.
        "config_overrides": {
            "domain": "legal",
            "enable_query_improvement": True,
            "query_improvement_method": "llm",
            "expand_abbreviations": True,
            "extract_query_entities": True,
            "enable_dual_llm_verification": True,
            "enable_answer_verification": True,
            "enable_answer_modification": True,
            "verification_threshold": 8.0,
            "check_factual_consistency": True,
            "check_completeness": True,
        }
    },
    "financial": {
        "name": "Financial & Analytics",
        "description": "Tailored for financial reports, analysis, market research, forecasts",
        "system_prompt": (
            "You are a financial AI assistant with expertise in finance, accounting, and "
            "market analysis. Provide data-driven insights with numerical precision. "
            "Include relevant financial metrics and trends."
        ),
        "analysis_prompt": (
            "Analyze this financial document focusing on: financial metrics, trends, "
            "performance indicators, risk factors, market conditions, and forecasts."
        ),
        "file_extensions": [".pdf", ".xlsx", ".csv", ".doc", ".docx"],
        # Verification without answer modification.
        "config_overrides": {
            "domain": "financial",
            "enable_query_improvement": True,
            "query_improvement_method": "hybrid",
            "expand_abbreviations": True,
            "add_domain_keywords": True,
            "enable_dual_llm_verification": True,
            "enable_answer_verification": True,
            "verification_threshold": 7.5,
            "check_factual_consistency": True,
        }
    },
    "technical": {
        "name": "Technical Documentation",
        "description": "Optimized for technical docs, APIs, code, system architecture",
        "system_prompt": (
            "You are a technical AI assistant with expertise in software development, "
            "system architecture, and technical documentation. Provide clear, precise "
            "technical explanations with code examples when relevant."
        ),
        "analysis_prompt": (
            "Analyze this technical document focusing on: system design, APIs, configurations, "
            "dependencies, implementation details, and best practices."
        ),
        "file_extensions": [".pdf", ".md", ".txt", ".rst", ".doc", ".docx", ".csv", ".xlsx"],
        # Lowest verification threshold (7.0) of all domains.
        "config_overrides": {
            "domain": "technical",
            "enable_query_improvement": True,
            "query_improvement_method": "hybrid",
            "expand_abbreviations": True,
            "extract_query_entities": True,
            "enable_dual_llm_verification": True,
            "enable_answer_verification": True,
            "verification_threshold": 7.0,
        }
    },
    "academic": {
        "name": "Academic Research",
        "description": "Designed for research papers, academic publications, studies",
        "system_prompt": (
            "You are an academic AI assistant with expertise in research methodology, "
            "scholarly analysis, and scientific literature. Provide well-reasoned responses "
            "with proper academic citations and methodology discussion."
        ),
        "analysis_prompt": (
            "Analyze this academic document focusing on: research questions, methodology, "
            "findings, conclusions, citations, and contributions to the field."
        ),
        "file_extensions": [".pdf", ".doc", ".docx", ".txt", ".tex", ".csv", ".xlsx"],
        # High bar (8.0) with answer modification enabled.
        "config_overrides": {
            "domain": "academic",
            "enable_query_improvement": True,
            "query_improvement_method": "llm",
            "expand_abbreviations": True,
            "add_domain_keywords": True,
            "extract_query_entities": True,
            "enable_dual_llm_verification": True,
            "enable_answer_verification": True,
            "enable_answer_modification": True,
            "verification_threshold": 8.0,
            "check_completeness": True,
            "check_relevance": True,
        }
    }
}
|
| 190 |
+
|
| 191 |
+
# =============================================================================
|
| 192 |
+
# Global State & Configuration
|
| 193 |
+
# =============================================================================
|
| 194 |
+
|
| 195 |
+
# RAG instances per domain
|
| 196 |
+
rag_instances: Dict[str, RAGAnything] = {}
|
| 197 |
+
|
| 198 |
+
# Web searcher instance
|
| 199 |
+
web_searcher: Optional[WebSearcher] = None
|
| 200 |
+
|
| 201 |
+
# URL fetcher instance
|
| 202 |
+
url_fetcher: Optional[URLFetcher] = None
|
| 203 |
+
|
| 204 |
+
# Conversation history storage
|
| 205 |
+
conversation_histories: Dict[str, List[Dict[str, str]]] = {}
|
| 206 |
+
|
| 207 |
+
# Processing status tracker
|
| 208 |
+
processing_status: Dict[str, Dict[str, Any]] = {}
|
| 209 |
+
|
| 210 |
+
# Query result cache (TTL: 5 minutes, max 100 entries)
|
| 211 |
+
query_cache: TTLCache = TTLCache(maxsize=100, ttl=300)
|
| 212 |
+
|
| 213 |
+
# Performance metrics storage
|
| 214 |
+
performance_metrics: Dict[str, List[float]] = {
|
| 215 |
+
"query_times": [],
|
| 216 |
+
"processing_times": [],
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
# Base paths
|
| 220 |
+
BASE_DIR = Path(__file__).parent.parent
|
| 221 |
+
STORAGE_DIR = BASE_DIR / "storage"
|
| 222 |
+
UPLOAD_DIR = BASE_DIR / "uploads"
|
| 223 |
+
STATUS_FILE = STORAGE_DIR / "processing_status.json"
|
| 224 |
+
|
| 225 |
+
# --- IMPROVEMENT: Centralized and configurable Gemini model names ---
|
| 226 |
+
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
|
| 227 |
+
GEMINI_TEXT_MODEL = os.getenv("GEMINI_TEXT_MODEL", "models/gemini-flash-latest") # Fast generation (alias to latest Flash)
|
| 228 |
+
GEMINI_VERIFIER_MODEL = os.getenv("GEMINI_VERIFIER_MODEL", "models/gemini-pro-latest") # Quality verification (alias to latest Pro)
|
| 229 |
+
GEMINI_VISION_MODEL = os.getenv("GEMINI_VISION_MODEL", "models/gemini-flash-latest") # Vision model
|
| 230 |
+
GEMINI_EMBEDDING_MODEL = os.getenv("GEMINI_EMBEDDING_MODEL", "models/text-embedding-004") # Embedding model
|
| 231 |
+
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "") # For web search
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
# =============================================================================
# Status Persistence Functions
# =============================================================================

def load_processing_status() -> Dict[str, Dict[str, Any]]:
    """Load the persisted processing-status map from STATUS_FILE.

    Returns:
        The status dict stored on disk, or an empty dict when the file is
        missing, unreadable, or does not contain a JSON object.
    """
    try:
        if STATUS_FILE.exists():
            # Read explicitly as UTF-8 so behavior does not depend on the
            # process locale.
            with open(STATUS_FILE, 'r', encoding='utf-8') as f:
                status_data = json.load(f)
            # Guard against a corrupted or hand-edited file whose top-level
            # JSON value is not an object; merging a non-dict into
            # processing_status would crash startup.
            if not isinstance(status_data, dict):
                logger.error("Processing status file does not contain a JSON object; ignoring it")
                return {}
            logger.info(f"Loaded {len(status_data)} processing status entries from disk")
            return status_data
        return {}
    except Exception as e:
        # Best-effort load: a broken status file must not prevent startup.
        logger.error(f"Error loading processing status: {e}", exc_info=True)
        return {}
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def save_processing_status():
    """Persist the in-memory processing_status map to STATUS_FILE.

    Writes to a temporary sibling file first and atomically replaces the
    target, so a crash mid-write cannot leave a truncated/corrupt status
    file behind. Failures are logged and swallowed (persistence is
    best-effort and must never break request handling).
    """
    try:
        STATUS_FILE.parent.mkdir(parents=True, exist_ok=True)
        # Write the full payload to a temp file in the same directory so the
        # final rename stays on one filesystem (required for atomicity).
        tmp_file = STATUS_FILE.with_name(STATUS_FILE.name + ".tmp")
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(processing_status, f, indent=2)
        # Atomic on POSIX; Path.replace also overwrites an existing target
        # on Windows.
        tmp_file.replace(STATUS_FILE)
        logger.debug(f"Saved {len(processing_status)} processing status entries to disk")
    except Exception as e:
        logger.error(f"Error saving processing status: {e}", exc_info=True)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def update_processing_status(processing_id: str, status_update: Dict[str, Any]) -> None:
    """Update processing status both in-memory and on disk.

    Replaces (does not merge) the entry for ``processing_id`` in the
    module-level ``processing_status`` dict, then immediately persists the
    whole map via ``save_processing_status()``.
    """
    processing_status[processing_id] = status_update
    save_processing_status()
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# =============================================================================
# Lifespan Management (Startup/Shutdown)
# =============================================================================

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Handles application startup and shutdown events.

    Startup: creates storage/upload directories, restores the persisted
    processing-status map, configures the Gemini SDK, and initializes the
    optional web-search (Tavily) and URL-fetcher helpers. External
    integrations are best-effort: a failure logs a warning and disables
    that feature instead of aborting startup.

    Shutdown: finalizes the storages of every initialized RAG instance.
    """
    # --- STARTUP ---
    logger.info("Starting Enhanced RAG-Anything API...")
    STORAGE_DIR.mkdir(parents=True, exist_ok=True)
    UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
    # One storage subdirectory per configured domain.
    for domain in DOMAIN_CONFIGS.keys():
        (STORAGE_DIR / domain).mkdir(parents=True, exist_ok=True)
    logger.info(f"Created storage directories: {STORAGE_DIR}")

    # Load processing status from disk. update() (rather than rebinding)
    # keeps the same dict object other code already references.
    global processing_status
    processing_status.update(load_processing_status())

    if GEMINI_API_KEY:
        try:
            genai.configure(api_key=GEMINI_API_KEY)
            logger.info("Gemini API initialized successfully")
            logger.info(f"Model Configuration:")
            logger.info(f" TEXT_MODEL: {GEMINI_TEXT_MODEL}")
            logger.info(f" VERIFIER_MODEL: {GEMINI_VERIFIER_MODEL}")
            logger.info(f" VISION_MODEL: {GEMINI_VISION_MODEL}")
            logger.info(f" EMBEDDING_MODEL: {GEMINI_EMBEDDING_MODEL}")
        except Exception as e:
            logger.error(f"Failed to initialize Gemini API: {e}", exc_info=True)
            logger.warning("Application will start but Gemini features will not work")
    else:
        logger.warning("GEMINI_API_KEY not set. Set it in environment variables.")

    # Initialize web searcher if Tavily API key is available
    global web_searcher, url_fetcher
    if TAVILY_API_KEY:
        try:
            web_searcher = create_web_searcher(api_key=TAVILY_API_KEY, max_results=5)
            logger.info("Tavily web search initialized successfully")
        except Exception as e:
            logger.warning(f"Failed to initialize Tavily: {e}. Web search will not be available.")
            web_searcher = None
    else:
        logger.info("TAVILY_API_KEY not set. Web search features disabled.")

    # Initialize URL fetcher
    try:
        url_download_dir = UPLOAD_DIR / "url_downloads"
        url_download_dir.mkdir(parents=True, exist_ok=True)
        url_fetcher = create_url_fetcher(download_dir=str(url_download_dir))
        logger.info("URL fetcher initialized successfully")
    except Exception as e:
        logger.warning(f"Failed to initialize URL fetcher: {e}. URL ingestion will not be available.")
        url_fetcher = None

    logger.info("Enhanced RAG-Anything API started successfully!")

    yield  # Application runs here

    # --- SHUTDOWN ---
    logger.info("Shutting down API...")
    for domain, rag_instance in rag_instances.items():
        logger.info(f"Finalizing storages for domain: {domain}")
        try:
            await rag_instance.finalize_storages()
        except Exception as e:
            # FIX: a failure in one domain must not abort the loop and skip
            # finalizing the remaining domains' storages.
            logger.error(f"Error finalizing storages for domain {domain}: {e}", exc_info=True)
    logger.info("API shutdown complete.")
|
| 335 |
+
|
| 336 |
+
# =============================================================================
# FastAPI App Setup
# =============================================================================

app = FastAPI(
    title="Enhanced RAG-Anything API",
    description="Production-ready RAG system with multi-domain support and advanced features",
    version="1.1.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan  # --- FIX: Using modern lifespan event handler ---
)

# CORS Configuration
# NOTE(review): wildcard allow_origins together with allow_credentials=True
# is rejected by browsers (the CORS spec forbids "*" on credentialed
# requests) — lock allow_origins down to the real frontend origin(s) in
# production. TODO confirm intended deployment origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
# =============================================================================
# Request/Response Models
# =============================================================================

class QueryRequest(BaseModel):
    """Request body for querying the RAG system.

    Combines the query text with domain/mode selection, optional web-search
    augmentation flags, conversation context, and performance-tuning knobs
    (fast mode, top_k, caching, query improvement, verification).
    """
    query: str = Field(..., description="User query text", min_length=1)
    domain: str = Field("medical", description="Domain context (medical, legal, etc.)")
    mode: str = Field("mix", description="Query mode (local, global, hybrid, naive, mix, web, hybrid_web)")
    conversation_id: Optional[str] = Field(None, description="Conversation ID for context")
    return_metadata: bool = Field(True, description="Include detailed metadata in response")
    enable_web_search: bool = Field(False, description="Enable web search augmentation")
    web_search_only: bool = Field(False, description="Use only web search (no RAG)")
    enable_verification: bool = Field(True, description="Enable dual-LLM verification")
    # Performance optimization parameters
    fast_mode: bool = Field(False, description="Use optimized parameters for faster queries (2-3x speedup)")
    top_k: Optional[int] = Field(None, description="Number of top results to retrieve (default: 40, fast: 20)")
    enable_cache: bool = Field(True, description="Enable query result caching")
    enable_query_improvement: bool = Field(True, description="Enable query improvement/expansion")
    enable_verification_check: bool = Field(True, description="Enable verification step (separate from enable_verification)")

    class Config:
        # Example payload surfaced in the OpenAPI docs.
        json_schema_extra = {
            "example": {
                "query": "What are the treatment options for hypertension?",
                "domain": "medical",
                "mode": "mix",
                "conversation_id": "conv_123",
                "return_metadata": True,
                "enable_web_search": False,
                "web_search_only": False,
                "enable_verification": True
            }
        }
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
class QueryResponse(BaseModel):
    """Response body for a query: the generated answer plus provenance
    (sources, confidence, improvement/verification flags) and optional
    metadata."""
    answer: str = Field(..., description="Generated answer")
    sources: List[str] = Field(default_factory=list, description="Source documents used")
    confidence_score: float = Field(0.0, description="Confidence score (0-1)")
    query_improved: bool = Field(False, description="Whether query was improved")
    verification_performed: bool = Field(False, description="Whether answer was verified")
    conversation_id: str = Field(..., description="Conversation ID")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Additional metadata")

    class Config:
        # Example payload surfaced in the OpenAPI docs.
        json_schema_extra = {
            "example": {
                "answer": "Hypertension treatment includes lifestyle modifications and medications...",
                "sources": ["medical_guidelines.pdf", "research_paper.pdf"],
                "confidence_score": 0.92,
                "query_improved": True,
                "verification_performed": True,
                "conversation_id": "conv_123",
                "metadata": {
                    "original_query": "What is HTN treatment?",
                    "improved_query": "What are the treatment options for hypertension?",
                    "verification_score": 8.5
                }
            }
        }
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
class UploadResponse(BaseModel):
    """Response for a single-file upload: echoes the file/domain and returns
    the processing_id used to poll processing status."""
    success: bool
    message: str
    file_name: str
    domain: str
    processing_id: str
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
class BatchUploadResponse(BaseModel):
    """Response for a multi-file upload: counts of submitted/accepted files
    and one processing id per accepted file."""
    success: bool
    message: str
    total_files: int
    accepted_files: int
    processing_ids: List[str]
    domain: str
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
class URLUploadRequest(BaseModel):
    """Request body for ingesting a document by URL instead of file upload."""
    url: str = Field(..., description="URL to fetch and process")
    domain: str = Field("medical", description="Domain context")
    convert_to_markdown: bool = Field(True, description="Convert HTML to markdown")

    class Config:
        # Example payload surfaced in the OpenAPI docs.
        json_schema_extra = {
            "example": {
                "url": "https://example.com/medical-article.pdf",
                "domain": "medical",
                "convert_to_markdown": True
            }
        }
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
class DomainInfo(BaseModel):
    """Public description of one configured domain (mirrors a
    DOMAIN_CONFIGS entry, minus the prompts/config overrides)."""
    domain_id: str
    name: str
    description: str
    file_extensions: List[str]
    features: Dict[str, Any]
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
class HealthResponse(BaseModel):
    """Health-check payload: service status, build version, feature flags,
    and the list of available domain ids."""
    status: str
    timestamp: str
    version: str
    features: Dict[str, bool]
    domains: List[str]
|
| 467 |
+
|
| 468 |
+
# =============================================================================
# Gemini Integration Functions
# =============================================================================

async def gemini_llm_func(
    prompt: str,
    system_prompt: Optional[str] = None,
    history_messages: Optional[List[Dict[str, str]]] = None,
    **kwargs,
) -> str:
    """
    Gemini LLM function for text generation (Improved with format validation).

    Enhancements:
    - Increased token limits for entity extraction tasks
    - Better temperature control for structured outputs
    - Response validation and auto-append of completion delimiter

    Args:
        prompt: The user-turn text sent to the model.
        system_prompt: Optional system instruction; also used to heuristically
            detect entity-extraction tasks (see ``is_extraction_task`` below).
        history_messages: Optional prior turns as {"role", "content"} dicts;
            only the last 5 are forwarded.
        **kwargs: ``temperature`` and ``max_tokens`` override the defaults.

    Returns:
        The generated text, or "" when the response was blocked/empty.

    Raises:
        Propagates any SDK error other than a blocked/empty response.
    """
    def _sync_call():
        # Blocking SDK work, executed off the event loop via asyncio.to_thread.
        try:
            from google.generativeai.types import HarmCategory, HarmBlockThreshold

            # Disable all safety blocking: document content (e.g. medical)
            # routinely trips the default filters.
            safety_settings = [
                {
                    "category": HarmCategory.HARM_CATEGORY_HARASSMENT,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
                {
                    "category": HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
                {
                    "category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
                {
                    "category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
            ]
            # --- IMPROVEMENT: Use system_instruction parameter ---
            logger.info(f"Creating GenerativeModel with model_name: {GEMINI_TEXT_MODEL}")
            model = genai.GenerativeModel(
                model_name=GEMINI_TEXT_MODEL,
                system_instruction=system_prompt,
                safety_settings=safety_settings
            )
            config_params = {}

            # Smart temperature control: lower for extraction tasks.
            # Heuristic: the RAG pipeline's entity-extraction prompts mention
            # "entity"/"extraction" in their system prompt.
            is_extraction_task = system_prompt and ("entity" in system_prompt.lower() or "extraction" in system_prompt.lower())
            if "temperature" in kwargs:
                config_params["temperature"] = kwargs["temperature"]
            else:
                # Use lower temperature for structured extraction tasks
                config_params["temperature"] = 0.1 if is_extraction_task else 0.3

            # Increase token limit for extraction tasks to avoid truncation
            if "max_tokens" in kwargs:
                config_params["max_output_tokens"] = kwargs["max_tokens"]
            else:
                # Larger limits for extraction to ensure completion delimiter is included
                config_params["max_output_tokens"] = 16384 if is_extraction_task else 8192

            generation_config = genai.types.GenerationConfig(**config_params)

            # --- IMPROVEMENT: Build structured history for chat model ---
            # Convert the generic {"role","content"} turns into the SDK's
            # {"role","parts"} format; anything not "user" becomes "model".
            history = []
            if history_messages:
                for msg in history_messages[-5:]:
                    role = "user" if msg.get("role") == "user" else "model"
                    content = msg.get("content", "")
                    if content:
                        history.append({"role": role, "parts": [content]})

            chat = model.start_chat(history=history)
            response = chat.send_message(prompt, generation_config=generation_config)
            try:
                # Accessing .text raises ValueError when the response was
                # blocked or contains no candidates.
                result = response.text

                # Post-processing: Ensure completion delimiter is present for extraction tasks
                if is_extraction_task and result:
                    # Check if completion delimiter is missing
                    if "<|COMPLETE|>" not in result and "<|complete|>" not in result:
                        logger.warning("Completion delimiter missing from extraction result, appending it")
                        # Append the delimiter to the end
                        result = result.strip() + "\n<|COMPLETE|>"

                return result
            except ValueError as ve:
                # Blocked/empty responses degrade to "" rather than raising.
                logger.warning(f"Response blocked or empty. Reason: {ve}. Candidates: {response.candidates}")
                if response.prompt_feedback:
                    logger.warning(f"Prompt feedback: {response.prompt_feedback}")
                return ""
        except Exception as e:
            logger.error(f"Gemini LLM error: {e}", exc_info=True)
            raise
    return await asyncio.to_thread(_sync_call)
|
| 566 |
+
|
| 567 |
+
|
| 568 |
+
async def gemini_verifier_llm_func(
    prompt: str,
    system_prompt: Optional[str] = None,
    history_messages: Optional[List[Dict[str, str]]] = None,
    **kwargs,
) -> str:
    """Gemini Pro LLM function for answer verification (more powerful, thorough).

    Same contract as ``gemini_llm_func`` but uses GEMINI_VERIFIER_MODEL and
    no extraction-task heuristics. Returns "" on blocked/empty responses;
    other SDK errors propagate.
    """
    def _sync_call():
        # Blocking SDK work, executed off the event loop via asyncio.to_thread.
        try:
            from google.generativeai.types import HarmCategory, HarmBlockThreshold

            # Disable all safety blocking, matching gemini_llm_func.
            safety_settings = [
                {
                    "category": HarmCategory.HARM_CATEGORY_HARASSMENT,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
                {
                    "category": HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
                {
                    "category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
                {
                    "category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
            ]
            # Use Pro model for better verification
            logger.info(f"Creating Verifier GenerativeModel with model_name: {GEMINI_VERIFIER_MODEL}")
            model = genai.GenerativeModel(
                model_name=GEMINI_VERIFIER_MODEL,
                system_instruction=system_prompt,
                safety_settings=safety_settings
            )
            config_params = {}
            if "temperature" in kwargs:
                config_params["temperature"] = kwargs["temperature"]
            if "max_tokens" in kwargs:
                config_params["max_output_tokens"] = kwargs["max_tokens"]
            else:
                # Default to larger token limit for verification responses
                config_params["max_output_tokens"] = 8192
            generation_config = genai.types.GenerationConfig(**config_params)

            # Build history in the SDK's {"role","parts"} format; only the
            # last 5 turns are forwarded.
            history = []
            if history_messages:
                for msg in history_messages[-5:]:
                    role = "user" if msg.get("role") == "user" else "model"
                    content = msg.get("content", "")
                    if content:
                        history.append({"role": role, "parts": [content]})

            chat = model.start_chat(history=history)
            response = chat.send_message(prompt, generation_config=generation_config)
            try:
                # .text raises ValueError on blocked/empty responses.
                return response.text
            except ValueError as ve:
                logger.warning(f"Response blocked or empty. Reason: {ve}. Candidates: {response.candidates}")
                if response.prompt_feedback:
                    logger.warning(f"Prompt feedback: {response.prompt_feedback}")
                return ""
        except Exception as e:
            logger.error(f"Gemini Verifier LLM error: {e}", exc_info=True)
            raise
    return await asyncio.to_thread(_sync_call)
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
async def gemini_vision_func(
    prompt: str,
    system_prompt: Optional[str] = None,
    image_data: Optional[str] = None,
    **kwargs,
) -> str:
    """Gemini Vision function for image analysis.

    Args:
        prompt: Text prompt describing what to analyze.
        system_prompt: Optional instruction prepended as a plain content part
            (this model is created without system_instruction).
        image_data: Optional base64-encoded image bytes.
        **kwargs: ``temperature`` and ``max_tokens`` override SDK defaults.

    Returns:
        The generated text, or "" when the response was blocked/empty.
    """
    def _sync_call():
        # Blocking SDK work, executed off the event loop via asyncio.to_thread.
        try:
            from google.generativeai.types import HarmCategory, HarmBlockThreshold

            # Disable all safety blocking, matching the other Gemini helpers.
            safety_settings = [
                {
                    "category": HarmCategory.HARM_CATEGORY_HARASSMENT,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
                {
                    "category": HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
                {
                    "category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
                {
                    "category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                    "threshold": HarmBlockThreshold.BLOCK_NONE,
                },
            ]
            # --- FIX: Use dedicated vision model ---
            logger.info(f"Creating Vision GenerativeModel with model_name: {GEMINI_VISION_MODEL}")
            model = genai.GenerativeModel(GEMINI_VISION_MODEL, safety_settings=safety_settings)
            config_params = {}
            if "temperature" in kwargs:
                config_params["temperature"] = kwargs["temperature"]
            if "max_tokens" in kwargs:
                config_params["max_output_tokens"] = kwargs["max_tokens"]
            generation_config = genai.types.GenerationConfig(**config_params)

            # Assemble multimodal content: [system_prompt?, prompt, image?].
            content_parts = []
            if system_prompt:
                content_parts.append(system_prompt)
            content_parts.append(prompt)

            if image_data:
                # Decode the base64 payload into a PIL image the SDK accepts.
                import base64
                import io
                from PIL import Image
                image_bytes = base64.b64decode(image_data)
                image = Image.open(io.BytesIO(image_bytes))
                content_parts.append(image)

            response = model.generate_content(content_parts, generation_config=generation_config)
            try:
                # .text raises ValueError on blocked/empty responses.
                return response.text
            except ValueError as ve:
                logger.warning(f"Vision response blocked or empty. Reason: {ve}. Candidates: {response.candidates}")
                if response.prompt_feedback:
                    logger.warning(f"Vision prompt feedback: {response.prompt_feedback}")
                return ""
        except Exception as e:
            logger.error(f"Gemini Vision error: {e}", exc_info=True)
            raise
    return await asyncio.to_thread(_sync_call)
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
async def gemini_embedding_func(texts: List[str]) -> List[List[float]]:
    """Embed a batch of texts with the configured Gemini embedding model.

    The blocking SDK call runs on a worker thread so the event loop stays
    responsive; one vector is returned per input text. Errors are logged
    and re-raised to the caller.
    """
    def _embed_batch():
        try:
            # Batch call: the SDK accepts a list of strings and returns a
            # matching list of embedding vectors.
            response = genai.embed_content(
                model=GEMINI_EMBEDDING_MODEL,
                content=texts,
                task_type="retrieval_document"
            )
            return response['embedding']
        except Exception as e:
            logger.error(f"Gemini Embedding error: {e}", exc_info=True)
            raise

    return await asyncio.to_thread(_embed_batch)

# Vector dimensionality advertised to the RAG layer for this function.
gemini_embedding_func.embedding_dim = 768
|
| 721 |
+
|
| 722 |
+
|
| 723 |
+
async def synthesize_web_results_with_gemini(
    query: str,
    web_context: str,
    rag_context: Optional[str] = None
) -> str:
    """
    Use Gemini to synthesize web search results into a coherent, direct answer

    Args:
        query: User's original query
        web_context: Formatted web search results
        rag_context: Optional RAG results to incorporate

    Returns:
        Synthesized answer from Gemini. On failure or near-empty output,
        falls back to returning the raw ``web_context`` unchanged.
    """
    try:
        logger.info("Synthesizing web results with Gemini")

        # Build synthesis prompt.  Two variants: one that blends the RAG
        # knowledge base with web results, and a web-only one.
        if rag_context:
            system_prompt = """You are an expert research assistant. Your task is to synthesize information from both
a knowledge base and recent web search results to provide a comprehensive, accurate answer.

Guidelines:
- Provide a direct, clear answer to the user's question
- Combine insights from both the knowledge base and web sources
- Cite sources when making specific claims (use [Source N] notation)
- If there are contradictions, acknowledge them and explain
- Be concise but thorough
- Use a professional, informative tone"""

            prompt = f"""User Question: {query}

Knowledge Base Information:
{rag_context}

Web Search Results:
{web_context}

Based on the above information, provide a comprehensive answer to the user's question. Synthesize the information from both sources and cite your sources appropriately."""

        else:
            system_prompt = """You are an expert research assistant. Your task is to synthesize web search results
into a clear, direct answer to the user's question.

Guidelines:
- Provide a direct, clear answer to the user's question
- Cite sources when making specific claims (use [Source N] notation)
- Be concise but comprehensive
- If information is limited or unclear, acknowledge it
- Use a professional, informative tone
- Include relevant details like dates, statistics, or examples when available"""

            prompt = f"""User Question: {query}

Web Search Results:
{web_context}

Based on the web search results above, provide a clear and comprehensive answer to the user's question. Cite your sources appropriately."""

        # Call Gemini to synthesize the answer
        answer = await gemini_llm_func(
            prompt=prompt,
            system_prompt=system_prompt,
            temperature=0.3,  # Lower temperature for more focused answers
            max_tokens=1500
        )

        # Guard: treat a missing or near-empty (<10 chars) answer as a failed
        # synthesis and fall back to the raw formatted web results.
        if not answer or len(answer.strip()) < 10:
            logger.warning("Gemini synthesis produced minimal output, using fallback")
            return web_context

        return answer

    except Exception as e:
        logger.error(f"Error synthesizing web results with Gemini: {e}", exc_info=True)
        # Fallback to raw web context
        return web_context
+
|
| 803 |
+
|
| 804 |
+
async def gemini_rerank_func(query: str, documents: List[str], top_n: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Gemini-based reranking function for LightRAG

    This follows LightRAG's reranking API signature which expects:
    - documents: List of strings (not dict chunks)
    - top_n: Number of top results (not top_k)
    - Returns: List of {"index": int, "relevance_score": float}

    Args:
        query: Search query
        documents: List of document strings to rerank
        top_n: Number of top documents to return (None = return all, reranked)

    Returns:
        List of {"index": int, "relevance_score": float} in descending score order.
        On any error, the original order is returned with a uniform score of 1.0.
    """
    try:
        # Convert documents (strings) to chunks format for our reranker
        chunks = [{"content": doc} for doc in documents]

        # Initialize reranker with Gemini LLM function
        reranker = GeminiReranker(
            llm_func=gemini_llm_func,
            batch_size=3,  # Process 3 chunks at a time to avoid rate limits
            temperature=0.1
        )

        # Perform reranking
        reranked_chunks = await reranker.rerank(query, chunks, top_n)

        # Map each reranked chunk back to its original position.
        # BUGFIX: the previous implementation used documents.index(content),
        # which always returns the FIRST occurrence — duplicate documents all
        # mapped to the same original index (and each lookup was O(n)).
        # Instead, pre-build a pool of indices per unique content and consume
        # them in order, so duplicates each receive a distinct index.
        index_pool: Dict[str, List[int]] = {}
        for idx, doc in enumerate(documents):
            index_pool.setdefault(doc, []).append(idx)

        results = []
        for i, chunk in enumerate(reranked_chunks):
            original_content = chunk.get("content", "")
            pool = index_pool.get(original_content)
            if pool:
                original_index = pool.pop(0)
            else:
                # Fallback: content not found among the inputs (e.g. the
                # reranker rewrote it); keep the positional index.
                original_index = i

            results.append({
                "index": original_index,
                "relevance_score": chunk.get("relevance_score", 0.0)
            })

        logger.debug(f"Reranked {len(documents)} documents, returning {len(results)} results")
        return results

    except Exception as e:
        logger.error(f"Reranking error: {e}", exc_info=True)
        # Return original order on error - format: List[{"index": int, "relevance_score": float}]
        result_count = top_n if top_n and top_n < len(documents) else len(documents)
        return [{"index": i, "relevance_score": 1.0} for i in range(result_count)]
+
|
| 860 |
+
|
| 861 |
+
# =============================================================================
|
| 862 |
+
# RAG Instance Management
|
| 863 |
+
# =============================================================================
|
| 864 |
+
|
| 865 |
+
async def get_rag_instance(domain: str) -> RAGAnything:
    """Get or create RAG instance for a specific domain.

    Instances are cached per domain in the module-level ``rag_instances``
    dict; creation failures surface as HTTP 500, unknown domains as 400.
    """
    # Unknown domain: fail fast with a client error.
    if domain not in DOMAIN_CONFIGS:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid domain '{domain}'. Valid domains: {list(DOMAIN_CONFIGS.keys())}"
        )

    # Cache hit: reuse the previously built instance.
    if domain in rag_instances:
        logger.debug(f"Using cached RAG instance for domain: {domain}")
        return rag_instances[domain]

    logger.info(f"Creating new RAG instance for domain: {domain}")
    try:
        overrides = DOMAIN_CONFIGS[domain]["config_overrides"]

        # Each domain gets its own working directory under STORAGE_DIR.
        storage_path = STORAGE_DIR / domain
        storage_path.mkdir(parents=True, exist_ok=True)

        rag_config = RAGAnythingConfig(
            working_dir=str(storage_path),
            parser="mineru",
            parse_method="auto",
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
            **overrides
        )

        instance = await create_rag_anything(
            llm_model_func=gemini_llm_func,  # Flash for generation
            vision_model_func=gemini_vision_func,  # Flash for vision
            embedding_func=gemini_embedding_func,  # Embedding model
            verifier_llm_func=gemini_verifier_llm_func,  # Pro for verification
            config=rag_config,
            rerank_model_func=gemini_rerank_func,  # Enable reranking (passed directly)
        )

        rag_instances[domain] = instance
        logger.info(f"RAG instance created successfully for domain: {domain}")
        return instance
    except Exception as e:
        logger.error(f"Failed to create RAG instance for domain {domain}: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to initialize RAG system for domain '{domain}': {str(e)}"
        )
+
|
| 909 |
+
# =============================================================================
|
| 910 |
+
# API Endpoints
|
| 911 |
+
# =============================================================================
|
| 912 |
+
|
| 913 |
+
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint."""
    # Static capability flags plus runtime ones derived from configuration
    # (API key presence, optional web-search / URL-fetch components).
    feature_flags = {
        "query_improvement": True,
        "dual_llm_verification": True,
        "gemini_pro_verifier": True,
        "reranking": True,
        "conversation_memory": True,
        "multi_domain": True,
        "multimodal_processing": True,
        "gemini_integration": bool(GEMINI_API_KEY),
        "web_search": bool(web_searcher),
        "url_ingestion": bool(url_fetcher),
    }
    return HealthResponse(
        status="healthy",
        timestamp=datetime.now().isoformat(),
        version="2.0.0",
        features=feature_flags,
        domains=list(DOMAIN_CONFIGS.keys())
    )
+
|
| 935 |
+
|
| 936 |
+
@app.get("/domains", response_model=List[DomainInfo])
async def list_domains():
    """List all available domains."""
    # One DomainInfo per configured domain; only boolean config overrides
    # are exposed as feature flags.
    return [
        DomainInfo(
            domain_id=domain_id,
            name=config["name"],
            description=config["description"],
            file_extensions=config["file_extensions"],
            features={k: v for k, v in config["config_overrides"].items() if isinstance(v, bool)}
        )
        for domain_id, config in DOMAIN_CONFIGS.items()
    ]
+
|
| 950 |
+
|
| 951 |
+
@app.post("/upload", response_model=UploadResponse)
async def upload_document(
    file: UploadFile = File(...),
    domain: str = Form(...),
    background_tasks: BackgroundTasks = None
):
    """Upload and process a document in the background.

    Validates the target domain and the file extension, persists the upload
    under a freshly generated ``processing_id``, records a "processing"
    status entry, and schedules the actual RAG ingestion as a FastAPI
    background task. Clients poll the returned ``processing_id`` for status.

    Raises:
        HTTPException: 400 for invalid domain or extension, 500 on
            unexpected errors.
    """
    logger.info(f"Upload request: {file.filename} to domain: {domain}")
    try:
        if domain not in DOMAIN_CONFIGS:
            raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")

        # Each domain whitelists its own set of file extensions.
        file_ext = Path(file.filename).suffix.lower()
        allowed_extensions = DOMAIN_CONFIGS[domain]["file_extensions"]
        if file_ext not in allowed_extensions:
            raise HTTPException(400, f"File type {file_ext} not for '{domain}'. Allowed: {allowed_extensions}")

        # Prefix the stored filename with the processing id so concurrent
        # uploads of the same filename cannot collide.
        processing_id = str(uuid.uuid4())
        domain_upload_dir = UPLOAD_DIR / domain
        domain_upload_dir.mkdir(parents=True, exist_ok=True)
        file_path = domain_upload_dir / f"{processing_id}_{file.filename}"

        # NOTE(review): reads the whole upload into memory before writing —
        # presumably acceptable for expected document sizes; confirm limits.
        with open(file_path, "wb") as f:
            f.write(await file.read())
        logger.info(f"File saved: {file_path}")

        # Initialize status and save to disk
        update_processing_status(processing_id, {
            "status": "processing",
            "message": "Processing document...",
            "file_name": file.filename,
            "domain": domain,
            "started_at": datetime.now().isoformat()
        })

        async def process_document_task():
            # Background coroutine: runs after the HTTP response is sent and
            # reports its outcome solely through update_processing_status.
            try:
                logger.info(f"Processing document: {file_path}")
                rag = await get_rag_instance(domain)
                result = await rag.process_document_complete(str(file_path))

                # Check result (process_document_complete returns None on success)
                if result is None or (isinstance(result, dict) and result.get("success") is not False):
                    logger.info(f"Document processed successfully: {file.filename}")
                    update_processing_status(processing_id, {
                        "status": "completed",
                        "message": "Document processed successfully",
                        "file_name": file.filename,
                        "domain": domain,
                        "completed_at": datetime.now().isoformat()
                    })
                else:
                    error_msg = result.get('error', 'Unknown processing error') if isinstance(result, dict) else "Processing error"
                    logger.error(f"Document processing failed: {error_msg}")
                    update_processing_status(processing_id, {
                        "status": "failed",
                        "message": f"Processing failed: {error_msg}",
                        "file_name": file.filename,
                        "domain": domain,
                        "error": error_msg
                    })
            except Exception as e:
                # Never let a background failure propagate; surface it via status.
                logger.error(f"Error in background processing of {file.filename}: {e}", exc_info=True)
                update_processing_status(processing_id, {
                    "status": "failed",
                    "message": f"Error: {str(e)}",
                    "file_name": file.filename,
                    "domain": domain,
                    "error": str(e)
                })

        background_tasks.add_task(process_document_task)

        return UploadResponse(
            success=True,
            message="Document uploaded and queued for processing",
            file_name=file.filename,
            domain=domain,
            processing_id=processing_id
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Upload error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
+
|
| 1037 |
+
|
| 1038 |
+
@app.post("/upload-batch", response_model=BatchUploadResponse)
async def upload_documents_batch(
    files: List[UploadFile] = File(...),
    domain: str = Form(...),
    background_tasks: BackgroundTasks = None
):
    """
    Upload and process multiple documents in batch using optimized processing.

    Uses BatchOptimizer for 2-3x faster processing through:
    - Parallel parsing (up to 4 documents simultaneously)
    - Parallel processing (up to 10 documents simultaneously)
    - Pipeline architecture (parse + process in parallel)

    Files with disallowed extensions are rejected (not fatal) and reported
    in the response message; each accepted file gets its own processing id
    whose status can be polled independently.
    """
    logger.info(f"Batch upload request: {len(files)} files to domain: {domain}")
    try:
        if domain not in DOMAIN_CONFIGS:
            raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")

        allowed_extensions = DOMAIN_CONFIGS[domain]["file_extensions"]
        domain_upload_dir = UPLOAD_DIR / domain
        domain_upload_dir.mkdir(parents=True, exist_ok=True)

        # Process and save all files
        file_paths = []       # saved paths of accepted files (parallel to processing_ids)
        processing_ids = []   # one uuid per accepted file
        rejected_files = []   # filenames rejected by extension check

        for file in files:
            file_ext = Path(file.filename).suffix.lower()
            if file_ext not in allowed_extensions:
                rejected_files.append(file.filename)
                logger.warning(f"Rejected file {file.filename}: extension {file_ext} not allowed")
                continue

            processing_id = str(uuid.uuid4())
            file_path = domain_upload_dir / f"{processing_id}_{file.filename}"

            with open(file_path, "wb") as f:
                f.write(await file.read())

            file_paths.append(str(file_path))
            processing_ids.append(processing_id)

            # Initialize status for each file
            update_processing_status(processing_id, {
                "status": "queued",
                "message": "Queued for batch processing...",
                "file_name": file.filename,
                "domain": domain,
                "started_at": datetime.now().isoformat()
            })

        logger.info(f"Accepted {len(file_paths)}/{len(files)} files, rejected: {rejected_files}")

        if not file_paths:
            raise HTTPException(400, f"No valid files provided. Allowed extensions: {allowed_extensions}")

        # Process documents in batch using optimized processing
        async def process_batch_task():
            # Background coroutine; outcomes are reported per-file through
            # update_processing_status, never raised to the caller.
            start_time = time.time()
            try:
                logger.info(f"Starting optimized batch processing of {len(file_paths)} files")
                rag = await get_rag_instance(domain)

                # Use optimized batch processing if available
                if hasattr(rag, 'process_documents_batch_optimized'):
                    result = await rag.process_documents_batch_optimized(
                        file_paths=file_paths,
                        max_concurrent_parsers=4,  # MinerU optimal
                        max_concurrent_processors=10,  # Higher for I/O-bound tasks
                        enable_progress_tracking=True,
                    )

                    # Update statuses based on results
                    successful_files = result.get('successful_files', [])
                    failed_files = result.get('failed_files', {})

                    for idx, file_path in enumerate(file_paths):
                        processing_id = processing_ids[idx]
                        # Stored name is "<uuid>_<original>"; strip the uuid prefix.
                        filename = Path(file_path).name.split('_', 1)[1] if '_' in Path(file_path).name else Path(file_path).name

                        if file_path in successful_files:
                            update_processing_status(processing_id, {
                                "status": "completed",
                                "message": "Document processed successfully",
                                "file_name": filename,
                                "domain": domain,
                                "completed_at": datetime.now().isoformat()
                            })
                        elif file_path in failed_files:
                            error_msg = failed_files[file_path]
                            update_processing_status(processing_id, {
                                "status": "failed",
                                "message": f"Processing failed: {error_msg}",
                                "file_name": filename,
                                "domain": domain,
                                "error": error_msg
                            })

                    total_time = time.time() - start_time
                    throughput = len(successful_files) / total_time if total_time > 0 else 0
                    logger.info(
                        f"Batch processing complete: {len(successful_files)}/{len(file_paths)} successful "
                        f"in {total_time:.2f}s ({throughput:.2f} docs/sec)"
                    )

                    # Track performance (keep only the most recent 100 timings)
                    performance_metrics["processing_times"].append(total_time)
                    if len(performance_metrics["processing_times"]) > 100:
                        performance_metrics["processing_times"] = performance_metrics["processing_times"][-100:]

                else:
                    # Fallback: process sequentially
                    logger.warning("Optimized batch processing not available, using sequential processing")
                    for idx, file_path in enumerate(file_paths):
                        processing_id = processing_ids[idx]
                        filename = Path(file_path).name.split('_', 1)[1] if '_' in Path(file_path).name else Path(file_path).name

                        # Flip this file's status from "queued" to "processing",
                        # preserving the other fields already recorded.
                        current_status = processing_status[processing_id].copy()
                        current_status["status"] = "processing"
                        current_status["message"] = "Processing document..."
                        update_processing_status(processing_id, current_status)

                        try:
                            result = await rag.process_document_complete(file_path)
                            # process_document_complete returns None on success.
                            if result is None or (isinstance(result, dict) and result.get("success") is not False):
                                update_processing_status(processing_id, {
                                    "status": "completed",
                                    "message": "Document processed successfully",
                                    "file_name": filename,
                                    "domain": domain,
                                    "completed_at": datetime.now().isoformat()
                                })
                            else:
                                error_msg = result.get('error', 'Unknown error') if isinstance(result, dict) else "Processing error"
                                update_processing_status(processing_id, {
                                    "status": "failed",
                                    "message": f"Processing failed: {error_msg}",
                                    "file_name": filename,
                                    "domain": domain,
                                    "error": error_msg
                                })
                        except Exception as e:
                            # Per-file failure: record it and continue with the rest.
                            logger.error(f"Error processing (unknown): {e}", exc_info=True)
                            update_processing_status(processing_id, {
                                "status": "failed",
                                "message": f"Error: {str(e)}",
                                "file_name": filename,
                                "domain": domain,
                                "error": str(e)
                            })

            except Exception as e:
                logger.error(f"Batch processing error: {e}", exc_info=True)
                # Mark all as failed
                for idx, file_path in enumerate(file_paths):
                    processing_id = processing_ids[idx]
                    filename = Path(file_path).name.split('_', 1)[1] if '_' in Path(file_path).name else Path(file_path).name
                    update_processing_status(processing_id, {
                        "status": "failed",
                        "message": f"Batch processing error: {str(e)}",
                        "file_name": filename,
                        "domain": domain,
                        "error": str(e)
                    })

        background_tasks.add_task(process_batch_task)

        return BatchUploadResponse(
            success=True,
            message=f"Batch upload queued: {len(file_paths)} files accepted" + (f", {len(rejected_files)} rejected" if rejected_files else ""),
            total_files=len(files),
            accepted_files=len(file_paths),
            processing_ids=processing_ids,
            domain=domain
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Batch upload error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Batch upload failed: {str(e)}")
+
|
| 1222 |
+
|
| 1223 |
+
@app.post("/upload-url", response_model=UploadResponse)
async def upload_url(
    request: URLUploadRequest,
    background_tasks: BackgroundTasks
):
    """Fetch document from URL and process it.

    Queues a background task that (1) downloads the URL via ``url_fetcher``
    with a 60s timeout, then (2) ingests the result with a 5-minute timeout —
    either through the advanced content-list pathway (HTML with images) or
    standard document processing. Progress is reported under the returned
    ``processing_id``.

    Raises:
        HTTPException: 503 if the URL fetcher is unavailable, 400 for an
            invalid domain, 500 on unexpected errors.
    """
    logger.info(f"URL upload request: {request.url} to domain: {request.domain}")
    try:
        if not url_fetcher:
            raise HTTPException(503, "URL fetcher not available. Check server configuration.")

        if request.domain not in DOMAIN_CONFIGS:
            raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")

        processing_id = str(uuid.uuid4())

        # Initialize status
        update_processing_status(processing_id, {
            "status": "fetching",
            "message": "Fetching URL content...",
            "url": request.url,
            "domain": request.domain,
            "started_at": datetime.now().isoformat()
        })

        async def fetch_and_process_url():
            # Background coroutine: all outcomes (success, fetch failure,
            # processing failure, timeout) are reported via the status store.
            try:
                logger.info(f"[URL UPLOAD] Starting fetch for: {request.url}")

                # Fetch URL content with timeout
                fetch_result = await asyncio.wait_for(
                    url_fetcher.fetch_url(
                        url=request.url,
                        convert_to_markdown=request.convert_to_markdown
                    ),
                    timeout=60.0  # 60 second timeout for URL fetching
                )

                if not fetch_result.get("success"):
                    error_msg = fetch_result.get("error", "Unknown fetch error")
                    logger.error(f"[URL UPLOAD] Fetch failed: {error_msg}")
                    update_processing_status(processing_id, {
                        "status": "failed",
                        "message": f"Failed to fetch URL: {error_msg}",
                        "domain": request.domain,
                        "error": error_msg
                    })
                    return

                file_path = fetch_result.get("file_path")
                if not file_path:
                    logger.error("[URL UPLOAD] No file path returned from URL fetch")
                    update_processing_status(processing_id, {
                        "status": "failed",
                        "message": "No file path returned from URL fetch",
                        "domain": request.domain,
                        "error": "No file path"
                    })
                    return

                logger.info(f"[URL UPLOAD] Content saved to: {file_path}")

                # Update status
                update_processing_status(processing_id, {
                    "status": "processing",
                    "message": "Processing document...",
                    "domain": request.domain,
                    "file_path": file_path
                })

                # Get RAG instance
                rag = await get_rag_instance(request.domain)

                # Check if we have a content list with images (advanced HTML parsing)
                content_list = fetch_result.get("content_list")
                images_count = fetch_result.get("images_count", 0)

                if content_list and len(content_list) > 0 and images_count > 0:
                    # Advanced pathway: Process pre-parsed content list with images
                    logger.info(f"[URL UPLOAD] Using advanced processing: {len(content_list)} blocks, {images_count} images")
                    result = await asyncio.wait_for(
                        rag.process_content_list_direct(
                            content_list=content_list,
                            source_identifier=request.url,
                            enable_image_processing=True
                        ),
                        timeout=300.0  # 5 minute timeout for processing
                    )
                else:
                    # Standard pathway: Process as regular document (PDF or text-only HTML)
                    logger.info("[URL UPLOAD] Using standard document processing")
                    result = await asyncio.wait_for(
                        rag.process_document_complete(file_path),
                        timeout=300.0  # 5 minute timeout for processing
                    )

                # Check result and update status
                # Note: process_document_complete returns None on success (not a dict)
                if result is None or (isinstance(result, dict) and result.get("success") is not False):
                    logger.info(f"[URL UPLOAD] ✓ Successfully processed: {request.url}")
                    update_processing_status(processing_id, {
                        "status": "completed",
                        "message": "Document processed successfully",
                        "domain": request.domain,
                        "file_path": file_path,
                        "completed_at": datetime.now().isoformat()
                    })
                else:
                    error_msg = result.get('error', 'Unknown processing error') if isinstance(result, dict) else "Processing returned error"
                    logger.error(f"[URL UPLOAD] ✗ Processing failed: {error_msg}")
                    update_processing_status(processing_id, {
                        "status": "failed",
                        "message": f"Processing failed: {error_msg}",
                        "domain": request.domain,
                        "error": error_msg
                    })

            except asyncio.TimeoutError:
                # Either the 60s fetch or the 300s processing deadline expired.
                logger.error(f"[URL UPLOAD] ✗ Timeout processing {request.url}")
                update_processing_status(processing_id, {
                    "status": "failed",
                    "message": "Processing timeout",
                    "domain": request.domain,
                    "error": "Timeout"
                })
            except Exception as e:
                logger.error(f"[URL UPLOAD] ✗ Error processing {request.url}: {e}", exc_info=True)
                update_processing_status(processing_id, {
                    "status": "failed",
                    "message": f"Error: {str(e)}",
                    "domain": request.domain,
                    "error": str(e)
                })

        background_tasks.add_task(fetch_and_process_url)

        return UploadResponse(
            success=True,
            message="URL queued for fetching and processing",
            file_name=request.url,
            domain=request.domain,
            processing_id=processing_id
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"URL upload error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"URL upload failed: {str(e)}")
+
|
| 1373 |
+
|
| 1374 |
+
@app.post("/query/stream")
|
| 1375 |
+
async def query_documents_stream(request: QueryRequest):
|
| 1376 |
+
"""
|
| 1377 |
+
Stream query responses with real-time token generation and verification.
|
| 1378 |
+
|
| 1379 |
+
This endpoint provides Server-Sent Events (SSE) streaming for real-time
|
| 1380 |
+
response generation while maintaining dual-LLM verification.
|
| 1381 |
+
Supports web search augmentation when enabled.
|
| 1382 |
+
"""
|
| 1383 |
+
logger.info(f"Streaming query request: '{request.query[:50]}...' in domain: {request.domain}, web_search: {request.enable_web_search}, web_only: {request.web_search_only}")
|
| 1384 |
+
|
| 1385 |
+
async def generate_sse():
|
| 1386 |
+
"""Generate Server-Sent Events stream"""
|
| 1387 |
+
import json
|
| 1388 |
+
|
| 1389 |
+
try:
|
| 1390 |
+
conversation_id = request.conversation_id or f"conv_{uuid.uuid4()}"
|
| 1391 |
+
|
| 1392 |
+
# Handle web search only mode
|
| 1393 |
+
if request.web_search_only:
|
| 1394 |
+
if not web_searcher:
|
| 1395 |
+
error_data = {"type": "error", "content": {"message": "Web search not available. Set TAVILY_API_KEY."}, "done": True}
|
| 1396 |
+
yield f"event: error\ndata: {json.dumps(error_data)}\n\n"
|
| 1397 |
+
return
|
| 1398 |
+
|
| 1399 |
+
logger.info("Using web search only mode (streaming)")
|
| 1400 |
+
try:
|
| 1401 |
+
web_results = await web_searcher.search(request.query, max_results=5)
|
| 1402 |
+
web_context = web_searcher.format_results_for_llm(web_results)
|
| 1403 |
+
|
| 1404 |
+
# Synthesize answer using Gemini (streaming simulation)
|
| 1405 |
+
logger.info("Synthesizing web search results with Gemini (streaming)")
|
| 1406 |
+
answer = await synthesize_web_results_with_gemini(
|
| 1407 |
+
query=request.query,
|
| 1408 |
+
web_context=web_context,
|
| 1409 |
+
rag_context=None
|
| 1410 |
+
)
|
| 1411 |
+
|
| 1412 |
+
# Stream the answer word by word
|
| 1413 |
+
words = answer.split()
|
| 1414 |
+
for i, word in enumerate(words):
|
| 1415 |
+
token = word + " " if i < len(words) - 1 else word
|
| 1416 |
+
data = {"type": "token", "content": token, "done": False}
|
| 1417 |
+
yield f"event: token\ndata: {json.dumps(data)}\n\n"
|
| 1418 |
+
await asyncio.sleep(0.01) # Small delay for streaming effect
|
| 1419 |
+
|
| 1420 |
+
# Send completion event
|
| 1421 |
+
yield f"event: done\ndata: {json.dumps({'message': 'Stream complete', 'conversation_id': conversation_id})}\n\n"
|
| 1422 |
+
return
|
| 1423 |
+
|
| 1424 |
+
except Exception as e:
|
| 1425 |
+
logger.error(f"Web search only error: {e}", exc_info=True)
|
| 1426 |
+
error_data = {"type": "error", "content": {"message": f"Web search failed: {str(e)}"}, "done": True}
|
| 1427 |
+
yield f"event: error\ndata: {json.dumps(error_data)}\n\n"
|
| 1428 |
+
return
|
| 1429 |
+
|
| 1430 |
+
# Get RAG instance
|
| 1431 |
+
rag = await get_rag_instance(request.domain)
|
| 1432 |
+
|
| 1433 |
+
# Determine optimal parameters based on fast_mode
|
| 1434 |
+
if request.fast_mode:
|
| 1435 |
+
# Optimized parameters for 2-3x speedup
|
| 1436 |
+
top_k = request.top_k if request.top_k is not None else 20
|
| 1437 |
+
chunk_top_k = 10
|
| 1438 |
+
max_entity_tokens = 4000
|
| 1439 |
+
max_relation_tokens = 6000
|
| 1440 |
+
max_total_tokens = 20000
|
| 1441 |
+
logger.info(f"⚡ Fast mode enabled for streaming: top_k={top_k}, chunk_top_k={chunk_top_k}")
|
| 1442 |
+
else:
|
| 1443 |
+
# Default parameters (higher quality, slower)
|
| 1444 |
+
top_k = request.top_k if request.top_k is not None else 40
|
| 1445 |
+
chunk_top_k = 20
|
| 1446 |
+
max_entity_tokens = 6000
|
| 1447 |
+
max_relation_tokens = 8000
|
| 1448 |
+
max_total_tokens = 30000
|
| 1449 |
+
|
| 1450 |
+
# Log toggle settings
|
| 1451 |
+
logger.info(f"Query settings - improvement: {request.enable_query_improvement}, verification: {request.enable_verification_check}, web_search: {request.enable_web_search}")
|
| 1452 |
+
|
| 1453 |
+
# If web search augmentation is enabled, we need to collect the RAG answer first
|
| 1454 |
+
# then augment with web search
|
| 1455 |
+
if request.enable_web_search and web_searcher:
|
| 1456 |
+
logger.info("Web search augmentation enabled for streaming")
|
| 1457 |
+
|
| 1458 |
+
# Collect RAG answer first
|
| 1459 |
+
rag_answer_buffer = []
|
| 1460 |
+
async for chunk in rag.aquery_stream(
|
| 1461 |
+
query=request.query,
|
| 1462 |
+
mode=request.mode,
|
| 1463 |
+
enable_verification=request.enable_verification_check,
|
| 1464 |
+
enable_query_improvement=request.enable_query_improvement,
|
| 1465 |
+
top_k=top_k,
|
| 1466 |
+
chunk_top_k=chunk_top_k,
|
| 1467 |
+
max_entity_tokens=max_entity_tokens,
|
| 1468 |
+
max_relation_tokens=max_relation_tokens,
|
| 1469 |
+
max_total_tokens=max_total_tokens
|
| 1470 |
+
):
|
| 1471 |
+
chunk_type = chunk.get("type", "token")
|
| 1472 |
+
content = chunk.get("content", "")
|
| 1473 |
+
done = chunk.get("done", False)
|
| 1474 |
+
|
| 1475 |
+
if chunk_type == "token":
|
| 1476 |
+
# Stream token and collect it
|
| 1477 |
+
rag_answer_buffer.append(content)
|
| 1478 |
+
data = {"type": "token", "content": content, "done": False}
|
| 1479 |
+
yield f"event: token\ndata: {json.dumps(data)}\n\n"
|
| 1480 |
+
|
| 1481 |
+
elif chunk_type == "verification":
|
| 1482 |
+
# Send verification metadata
|
| 1483 |
+
data = {"type": "verification", "content": content, "done": done}
|
| 1484 |
+
yield f"event: verification\ndata: {json.dumps(data)}\n\n"
|
| 1485 |
+
|
| 1486 |
+
elif chunk_type == "error":
|
| 1487 |
+
# Send error
|
| 1488 |
+
data = {"type": "error", "content": content, "done": True}
|
| 1489 |
+
yield f"event: error\ndata: {json.dumps(data)}\n\n"
|
| 1490 |
+
return
|
| 1491 |
+
|
| 1492 |
+
# Now perform web search and synthesis
|
| 1493 |
+
try:
|
| 1494 |
+
rag_answer = "".join(rag_answer_buffer)
|
| 1495 |
+
logger.info("Performing web search to augment RAG answer...")
|
| 1496 |
+
web_results = await web_searcher.search(request.query, max_results=5)
|
| 1497 |
+
|
| 1498 |
+
if web_results.get("results"):
|
| 1499 |
+
web_context = web_searcher.format_results_for_llm(web_results)
|
| 1500 |
+
|
| 1501 |
+
# Synthesize combined answer
|
| 1502 |
+
logger.info("Synthesizing RAG + web results with Gemini")
|
| 1503 |
+
synthesized_answer = await synthesize_web_results_with_gemini(
|
| 1504 |
+
query=request.query,
|
| 1505 |
+
web_context=web_context,
|
| 1506 |
+
rag_context=rag_answer
|
| 1507 |
+
)
|
| 1508 |
+
|
| 1509 |
+
# Clear the previous RAG answer and stream the synthesized one
|
| 1510 |
+
# Send a newline separator
|
| 1511 |
+
data = {"type": "token", "content": "\n\n---\n\n**Enhanced with Web Search:**\n\n", "done": False}
|
| 1512 |
+
yield f"event: token\ndata: {json.dumps(data)}\n\n"
|
| 1513 |
+
|
| 1514 |
+
# Stream synthesized answer
|
| 1515 |
+
words = synthesized_answer.split()
|
| 1516 |
+
for i, word in enumerate(words):
|
| 1517 |
+
token = word + " " if i < len(words) - 1 else word
|
| 1518 |
+
data = {"type": "token", "content": token, "done": False}
|
| 1519 |
+
yield f"event: token\ndata: {json.dumps(data)}\n\n"
|
| 1520 |
+
await asyncio.sleep(0.01)
|
| 1521 |
+
|
| 1522 |
+
except Exception as e:
|
| 1523 |
+
logger.error(f"Web search augmentation error: {e}", exc_info=True)
|
| 1524 |
+
# Continue without web augmentation
|
| 1525 |
+
pass
|
| 1526 |
+
|
| 1527 |
+
# Send completion event
|
| 1528 |
+
yield f"event: done\ndata: {json.dumps({'message': 'Stream complete', 'conversation_id': conversation_id})}\n\n"
|
| 1529 |
+
|
| 1530 |
+
else:
|
| 1531 |
+
# Standard RAG streaming without web search
|
| 1532 |
+
async for chunk in rag.aquery_stream(
|
| 1533 |
+
query=request.query,
|
| 1534 |
+
mode=request.mode,
|
| 1535 |
+
enable_verification=request.enable_verification_check,
|
| 1536 |
+
enable_query_improvement=request.enable_query_improvement,
|
| 1537 |
+
top_k=top_k,
|
| 1538 |
+
chunk_top_k=chunk_top_k,
|
| 1539 |
+
max_entity_tokens=max_entity_tokens,
|
| 1540 |
+
max_relation_tokens=max_relation_tokens,
|
| 1541 |
+
max_total_tokens=max_total_tokens
|
| 1542 |
+
):
|
| 1543 |
+
chunk_type = chunk.get("type", "token")
|
| 1544 |
+
content = chunk.get("content", "")
|
| 1545 |
+
done = chunk.get("done", False)
|
| 1546 |
+
|
| 1547 |
+
if chunk_type == "token":
|
| 1548 |
+
# Stream token
|
| 1549 |
+
data = {"type": "token", "content": content, "done": done}
|
| 1550 |
+
yield f"event: token\ndata: {json.dumps(data)}\n\n"
|
| 1551 |
+
|
| 1552 |
+
elif chunk_type == "verification":
|
| 1553 |
+
# Send verification metadata
|
| 1554 |
+
data = {"type": "verification", "content": content, "done": done}
|
| 1555 |
+
yield f"event: verification\ndata: {json.dumps(data)}\n\n"
|
| 1556 |
+
|
| 1557 |
+
elif chunk_type == "error":
|
| 1558 |
+
# Send error
|
| 1559 |
+
data = {"type": "error", "content": content, "done": True}
|
| 1560 |
+
yield f"event: error\ndata: {json.dumps(data)}\n\n"
|
| 1561 |
+
break
|
| 1562 |
+
|
| 1563 |
+
# Send completion event
|
| 1564 |
+
yield f"event: done\ndata: {json.dumps({'message': 'Stream complete', 'conversation_id': conversation_id})}\n\n"
|
| 1565 |
+
|
| 1566 |
+
except Exception as e:
|
| 1567 |
+
logger.error(f"Streaming error: {e}", exc_info=True)
|
| 1568 |
+
error_data = {"type": "error", "content": {"message": str(e)}, "done": True}
|
| 1569 |
+
yield f"event: error\ndata: {json.dumps(error_data)}\n\n"
|
| 1570 |
+
|
| 1571 |
+
return StreamingResponse(
|
| 1572 |
+
generate_sse(),
|
| 1573 |
+
media_type="text/event-stream",
|
| 1574 |
+
headers={
|
| 1575 |
+
"Cache-Control": "no-cache",
|
| 1576 |
+
"Connection": "keep-alive",
|
| 1577 |
+
"X-Accel-Buffering": "no"
|
| 1578 |
+
}
|
| 1579 |
+
)
|
| 1580 |
+
|
| 1581 |
+
|
| 1582 |
+
@app.post("/query", response_model=QueryResponse)
|
| 1583 |
+
async def query_documents(request: QueryRequest):
|
| 1584 |
+
"""Query documents with enhanced RAG capabilities and optional web search."""
|
| 1585 |
+
start_time = time.time()
|
| 1586 |
+
logger.info(f"Query request: '{request.query[:50]}...' in domain: {request.domain}, mode: {request.mode}, fast_mode: {request.fast_mode}")
|
| 1587 |
+
|
| 1588 |
+
try:
|
| 1589 |
+
conversation_id = request.conversation_id or f"conv_{uuid.uuid4()}"
|
| 1590 |
+
conversation_history = conversation_histories.get(conversation_id, [])
|
| 1591 |
+
|
| 1592 |
+
# Generate cache key for non-web-search queries
|
| 1593 |
+
cache_key = None
|
| 1594 |
+
if request.enable_cache and not request.web_search_only and not request.enable_web_search:
|
| 1595 |
+
cache_data = f"{request.query}:{request.domain}:{request.mode}:{request.fast_mode}:{request.enable_verification}"
|
| 1596 |
+
cache_key = hashlib.md5(cache_data.encode()).hexdigest()
|
| 1597 |
+
|
| 1598 |
+
# Check cache
|
| 1599 |
+
if cache_key in query_cache:
|
| 1600 |
+
cached_response = query_cache[cache_key]
|
| 1601 |
+
logger.info(f"✓ Cache hit for query (saved {time.time() - start_time:.2f}s)")
|
| 1602 |
+
# Update conversation ID in cached response
|
| 1603 |
+
cached_response.conversation_id = conversation_id
|
| 1604 |
+
return cached_response
|
| 1605 |
+
|
| 1606 |
+
# Handle web search only mode
|
| 1607 |
+
if request.web_search_only:
|
| 1608 |
+
if not web_searcher:
|
| 1609 |
+
raise HTTPException(503, "Web search not available. Set TAVILY_API_KEY.")
|
| 1610 |
+
|
| 1611 |
+
logger.info("Using web search only mode")
|
| 1612 |
+
web_results = await web_searcher.search(request.query, max_results=5)
|
| 1613 |
+
|
| 1614 |
+
# Format results for LLM processing
|
| 1615 |
+
web_context = web_searcher.format_results_for_llm(web_results)
|
| 1616 |
+
|
| 1617 |
+
# Synthesize answer using Gemini
|
| 1618 |
+
logger.info("Synthesizing web search results with Gemini")
|
| 1619 |
+
answer = await synthesize_web_results_with_gemini(
|
| 1620 |
+
query=request.query,
|
| 1621 |
+
web_context=web_context,
|
| 1622 |
+
rag_context=None
|
| 1623 |
+
)
|
| 1624 |
+
|
| 1625 |
+
result = {
|
| 1626 |
+
"answer": answer,
|
| 1627 |
+
"original_query": request.query,
|
| 1628 |
+
"improved_query": request.query,
|
| 1629 |
+
"verification_passed": False,
|
| 1630 |
+
"verification_score": 0,
|
| 1631 |
+
"web_search_performed": True,
|
| 1632 |
+
"sources": [{"url": r.get("url"), "title": r.get("title")} for r in web_results.get("results", [])]
|
| 1633 |
+
}
|
| 1634 |
+
else:
|
| 1635 |
+
# Standard RAG query with optimized parameters
|
| 1636 |
+
rag = await get_rag_instance(request.domain)
|
| 1637 |
+
|
| 1638 |
+
# Determine optimal parameters based on fast_mode
|
| 1639 |
+
if request.fast_mode:
|
| 1640 |
+
# Optimized parameters for 2-3x speedup
|
| 1641 |
+
top_k = request.top_k if request.top_k is not None else 20
|
| 1642 |
+
chunk_top_k = 10
|
| 1643 |
+
max_entity_tokens = 4000
|
| 1644 |
+
max_relation_tokens = 6000
|
| 1645 |
+
max_total_tokens = 20000
|
| 1646 |
+
logger.info(f"⚡ Fast mode enabled: top_k={top_k}, chunk_top_k={chunk_top_k}")
|
| 1647 |
+
else:
|
| 1648 |
+
# Default parameters (higher quality, slower)
|
| 1649 |
+
top_k = request.top_k if request.top_k is not None else 40
|
| 1650 |
+
chunk_top_k = 20
|
| 1651 |
+
max_entity_tokens = 6000
|
| 1652 |
+
max_relation_tokens = 8000
|
| 1653 |
+
max_total_tokens = 30000
|
| 1654 |
+
|
| 1655 |
+
# Build query parameters
|
| 1656 |
+
from lightrag import QueryParam
|
| 1657 |
+
query_kwargs = {
|
| 1658 |
+
"top_k": top_k,
|
| 1659 |
+
"chunk_top_k": chunk_top_k,
|
| 1660 |
+
"max_entity_tokens": max_entity_tokens,
|
| 1661 |
+
"max_relation_tokens": max_relation_tokens,
|
| 1662 |
+
"max_total_tokens": max_total_tokens,
|
| 1663 |
+
}
|
| 1664 |
+
|
| 1665 |
+
# Log toggle settings
|
| 1666 |
+
logger.info(f"Query settings - improvement: {request.enable_query_improvement}, verification: {request.enable_verification_check}")
|
| 1667 |
+
|
| 1668 |
+
result = await rag.aquery(
|
| 1669 |
+
query=request.query,
|
| 1670 |
+
mode=request.mode,
|
| 1671 |
+
enable_query_improvement=request.enable_query_improvement, # Use toggle instead of always true
|
| 1672 |
+
enable_verification=request.enable_verification_check, # Use toggle instead of always request.enable_verification
|
| 1673 |
+
return_verification_info=request.return_metadata,
|
| 1674 |
+
**query_kwargs
|
| 1675 |
+
)
|
| 1676 |
+
|
| 1677 |
+
# Augment with web search if requested
|
| 1678 |
+
if request.enable_web_search and web_searcher:
|
| 1679 |
+
logger.info("Augmenting RAG results with web search")
|
| 1680 |
+
try:
|
| 1681 |
+
rag_answer = result.get("answer") if isinstance(result, dict) else str(result)
|
| 1682 |
+
web_results = await web_searcher.search(request.query, max_results=5)
|
| 1683 |
+
|
| 1684 |
+
if web_results.get("results"):
|
| 1685 |
+
# Format web results for LLM
|
| 1686 |
+
web_context = web_searcher.format_results_for_llm(web_results)
|
| 1687 |
+
|
| 1688 |
+
# Synthesize combined answer using Gemini
|
| 1689 |
+
logger.info("Synthesizing RAG + web results with Gemini")
|
| 1690 |
+
synthesized_answer = await synthesize_web_results_with_gemini(
|
| 1691 |
+
query=request.query,
|
| 1692 |
+
web_context=web_context,
|
| 1693 |
+
rag_context=rag_answer
|
| 1694 |
+
)
|
| 1695 |
+
|
| 1696 |
+
if isinstance(result, dict):
|
| 1697 |
+
result["answer"] = synthesized_answer
|
| 1698 |
+
result["web_search_performed"] = True
|
| 1699 |
+
result["web_sources"] = [{"url": r.get("url"), "title": r.get("title")} for r in web_results.get("results", [])]
|
| 1700 |
+
else:
|
| 1701 |
+
result = synthesized_answer
|
| 1702 |
+
except Exception as e:
|
| 1703 |
+
logger.error(f"Web search augmentation error: {e}")
|
| 1704 |
+
# Continue with RAG-only result
|
| 1705 |
+
|
| 1706 |
+
# Handle None result
|
| 1707 |
+
if result is None:
|
| 1708 |
+
answer = "I couldn't find any relevant information in the knowledge base to answer your question. Please try rephrasing your question or ensure that relevant documents have been uploaded."
|
| 1709 |
+
metadata = {
|
| 1710 |
+
"original_query": request.query,
|
| 1711 |
+
"improved_query": request.query,
|
| 1712 |
+
"verification_passed": False,
|
| 1713 |
+
"verification_score": 0,
|
| 1714 |
+
}
|
| 1715 |
+
query_improved = False
|
| 1716 |
+
verification_performed = False
|
| 1717 |
+
confidence = 0.0
|
| 1718 |
+
elif isinstance(result, dict):
|
| 1719 |
+
answer = result.get("answer", "No answer found.")
|
| 1720 |
+
metadata = {
|
| 1721 |
+
"original_query": result.get("original_query", request.query),
|
| 1722 |
+
"improved_query": result.get("improved_query", request.query),
|
| 1723 |
+
"verification_passed": result.get("verification_passed", False),
|
| 1724 |
+
"verification_score": result.get("verification_score", 0),
|
| 1725 |
+
}
|
| 1726 |
+
query_improved = result.get("improved_query") != result.get("original_query")
|
| 1727 |
+
verification_performed = result.get("verification_passed", False)
|
| 1728 |
+
confidence = result.get("verification_score", 0) / 10.0
|
| 1729 |
+
else:
|
| 1730 |
+
answer = str(result) if result else "No answer found."
|
| 1731 |
+
metadata = {}
|
| 1732 |
+
query_improved = False
|
| 1733 |
+
verification_performed = False
|
| 1734 |
+
confidence = 1.0
|
| 1735 |
+
|
| 1736 |
+
conversation_history.extend([
|
| 1737 |
+
{"role": "user", "content": request.query},
|
| 1738 |
+
{"role": "assistant", "content": answer}
|
| 1739 |
+
])
|
| 1740 |
+
conversation_histories[conversation_id] = conversation_history
|
| 1741 |
+
|
| 1742 |
+
response = QueryResponse(
|
| 1743 |
+
answer=answer,
|
| 1744 |
+
sources=[], # TODO: Extract from result if available
|
| 1745 |
+
confidence_score=confidence,
|
| 1746 |
+
query_improved=query_improved,
|
| 1747 |
+
verification_performed=verification_performed,
|
| 1748 |
+
conversation_id=conversation_id,
|
| 1749 |
+
metadata=metadata if request.return_metadata else None
|
| 1750 |
+
)
|
| 1751 |
+
|
| 1752 |
+
# Store in cache if enabled (non-web search queries only)
|
| 1753 |
+
if cache_key and request.enable_cache:
|
| 1754 |
+
query_cache[cache_key] = response
|
| 1755 |
+
logger.info(f"✓ Cached query result (key: {cache_key[:16]}...)")
|
| 1756 |
+
|
| 1757 |
+
# Track performance metrics
|
| 1758 |
+
query_time = time.time() - start_time
|
| 1759 |
+
performance_metrics["query_times"].append(query_time)
|
| 1760 |
+
# Keep only last 100 metrics
|
| 1761 |
+
if len(performance_metrics["query_times"]) > 100:
|
| 1762 |
+
performance_metrics["query_times"] = performance_metrics["query_times"][-100:]
|
| 1763 |
+
|
| 1764 |
+
logger.info(f"Query completed in {query_time:.2f}s (fast_mode: {request.fast_mode}, confidence: {confidence:.2f})")
|
| 1765 |
+
return response
|
| 1766 |
+
except HTTPException:
|
| 1767 |
+
raise
|
| 1768 |
+
except Exception as e:
|
| 1769 |
+
logger.error(f"Query error: {e}", exc_info=True)
|
| 1770 |
+
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
|
| 1771 |
+
|
| 1772 |
+
|
| 1773 |
+
@app.get("/conversation/{conversation_id}")
|
| 1774 |
+
async def get_conversation(conversation_id: str):
|
| 1775 |
+
"""Get conversation history by ID."""
|
| 1776 |
+
if conversation_id not in conversation_histories:
|
| 1777 |
+
raise HTTPException(status_code=404, detail="Conversation not found")
|
| 1778 |
+
return {
|
| 1779 |
+
"conversation_id": conversation_id,
|
| 1780 |
+
"messages": conversation_histories[conversation_id],
|
| 1781 |
+
}
|
| 1782 |
+
|
| 1783 |
+
|
| 1784 |
+
@app.delete("/conversation/{conversation_id}")
|
| 1785 |
+
async def clear_conversation(conversation_id: str):
|
| 1786 |
+
"""Clear conversation history."""
|
| 1787 |
+
if conversation_id in conversation_histories:
|
| 1788 |
+
del conversation_histories[conversation_id]
|
| 1789 |
+
logger.info(f"Cleared conversation: {conversation_id}")
|
| 1790 |
+
return {"success": True, "message": "Conversation cleared"}
|
| 1791 |
+
raise HTTPException(status_code=404, detail="Conversation not found")
|
| 1792 |
+
|
| 1793 |
+
|
| 1794 |
+
@app.delete("/clear/{domain}")
|
| 1795 |
+
async def clear_domain_data(domain: str):
|
| 1796 |
+
"""WARNING: Deletes all processed documents and indices for the domain."""
|
| 1797 |
+
logger.warning(f"Clear domain data request: {domain}")
|
| 1798 |
+
try:
|
| 1799 |
+
if domain not in DOMAIN_CONFIGS:
|
| 1800 |
+
raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")
|
| 1801 |
+
|
| 1802 |
+
if domain in rag_instances:
|
| 1803 |
+
await rag_instances[domain].finalize_storages()
|
| 1804 |
+
del rag_instances[domain]
|
| 1805 |
+
|
| 1806 |
+
domain_storage = STORAGE_DIR / domain
|
| 1807 |
+
if domain_storage.exists():
|
| 1808 |
+
import shutil
|
| 1809 |
+
shutil.rmtree(domain_storage)
|
| 1810 |
+
domain_storage.mkdir(parents=True, exist_ok=True)
|
| 1811 |
+
|
| 1812 |
+
logger.info(f"Domain data cleared: {domain}")
|
| 1813 |
+
return {"success": True, "message": f"All data cleared for domain '{domain}'"}
|
| 1814 |
+
except Exception as e:
|
| 1815 |
+
logger.error(f"Clear domain error: {e}", exc_info=True)
|
| 1816 |
+
raise HTTPException(status_code=500, detail=f"Failed to clear domain: {str(e)}")
|
| 1817 |
+
|
| 1818 |
+
|
| 1819 |
+
@app.get("/documents")
|
| 1820 |
+
async def list_documents(domain: str):
|
| 1821 |
+
"""
|
| 1822 |
+
List all processed documents for a domain.
|
| 1823 |
+
|
| 1824 |
+
Only returns documents with status 'completed'. Documents still being
|
| 1825 |
+
processed are excluded to avoid confusion.
|
| 1826 |
+
"""
|
| 1827 |
+
try:
|
| 1828 |
+
if domain not in DOMAIN_CONFIGS:
|
| 1829 |
+
raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")
|
| 1830 |
+
|
| 1831 |
+
documents = []
|
| 1832 |
+
domain_upload_dir = UPLOAD_DIR / domain
|
| 1833 |
+
|
| 1834 |
+
if domain_upload_dir.exists():
|
| 1835 |
+
for file_path in domain_upload_dir.glob("*"):
|
| 1836 |
+
if file_path.is_file():
|
| 1837 |
+
# Extract processing_id and filename
|
| 1838 |
+
filename = file_path.name
|
| 1839 |
+
parts = filename.split('_', 1)
|
| 1840 |
+
processing_id = parts[0] if len(parts) > 1 else ""
|
| 1841 |
+
display_name = parts[1] if len(parts) > 1 else filename
|
| 1842 |
+
|
| 1843 |
+
# Check if document is actually completed
|
| 1844 |
+
# Skip if still processing, queued, or fetching
|
| 1845 |
+
if processing_id in processing_status:
|
| 1846 |
+
status = processing_status[processing_id].get('status')
|
| 1847 |
+
if status in ['processing', 'queued', 'fetching']:
|
| 1848 |
+
# Document is still being processed, skip it
|
| 1849 |
+
logger.debug(f"Skipping document {processing_id} - status: {status}")
|
| 1850 |
+
continue
|
| 1851 |
+
elif status == 'failed':
|
| 1852 |
+
# Optionally skip failed documents or include them
|
| 1853 |
+
# For now, skip them to only show successfully processed docs
|
| 1854 |
+
continue
|
| 1855 |
+
|
| 1856 |
+
# Only include completed documents or legacy ones without status
|
| 1857 |
+
documents.append({
|
| 1858 |
+
"id": processing_id,
|
| 1859 |
+
"name": display_name,
|
| 1860 |
+
"domain": domain,
|
| 1861 |
+
"status": "processed",
|
| 1862 |
+
"uploadedAt": str(file_path.stat().st_mtime)
|
| 1863 |
+
})
|
| 1864 |
+
|
| 1865 |
+
return {"documents": documents}
|
| 1866 |
+
except HTTPException:
|
| 1867 |
+
raise
|
| 1868 |
+
except Exception as e:
|
| 1869 |
+
logger.error(f"Error listing documents: {e}", exc_info=True)
|
| 1870 |
+
raise HTTPException(status_code=500, detail=f"Failed to list documents: {str(e)}")
|
| 1871 |
+
|
| 1872 |
+
|
| 1873 |
+
@app.get("/performance-metrics")
|
| 1874 |
+
async def get_performance_metrics():
|
| 1875 |
+
"""Get performance metrics for queries and document processing."""
|
| 1876 |
+
try:
|
| 1877 |
+
query_times = performance_metrics.get("query_times", [])
|
| 1878 |
+
processing_times = performance_metrics.get("processing_times", [])
|
| 1879 |
+
|
| 1880 |
+
# Calculate statistics
|
| 1881 |
+
def calc_stats(times):
|
| 1882 |
+
if not times:
|
| 1883 |
+
return {"count": 0, "avg": 0, "min": 0, "max": 0}
|
| 1884 |
+
return {
|
| 1885 |
+
"count": len(times),
|
| 1886 |
+
"avg": sum(times) / len(times),
|
| 1887 |
+
"min": min(times),
|
| 1888 |
+
"max": max(times)
|
| 1889 |
+
}
|
| 1890 |
+
|
| 1891 |
+
return {
|
| 1892 |
+
"query_metrics": calc_stats(query_times),
|
| 1893 |
+
"processing_metrics": calc_stats(processing_times),
|
| 1894 |
+
"cache_stats": {
|
| 1895 |
+
"size": len(query_cache),
|
| 1896 |
+
"max_size": query_cache.maxsize,
|
| 1897 |
+
"ttl_seconds": query_cache.ttl
|
| 1898 |
+
}
|
| 1899 |
+
}
|
| 1900 |
+
except Exception as e:
|
| 1901 |
+
logger.error(f"Error getting performance metrics: {e}", exc_info=True)
|
| 1902 |
+
return {
|
| 1903 |
+
"query_metrics": {"count": 0, "avg": 0, "min": 0, "max": 0},
|
| 1904 |
+
"processing_metrics": {"count": 0, "avg": 0, "min": 0, "max": 0},
|
| 1905 |
+
"cache_stats": {"size": 0, "max_size": 100, "ttl_seconds": 300}
|
| 1906 |
+
}
|
| 1907 |
+
|
| 1908 |
+
|
| 1909 |
+
@app.get("/status/{processing_id}")
|
| 1910 |
+
async def get_processing_status(processing_id: str):
|
| 1911 |
+
"""
|
| 1912 |
+
Get the processing status of a document.
|
| 1913 |
+
|
| 1914 |
+
Now uses persistent status storage that survives backend restarts.
|
| 1915 |
+
The status is loaded from disk on startup and kept in sync.
|
| 1916 |
+
"""
|
| 1917 |
+
try:
|
| 1918 |
+
# Check the persistent status tracker (loaded from disk on startup)
|
| 1919 |
+
if processing_id in processing_status:
|
| 1920 |
+
status_info = processing_status[processing_id]
|
| 1921 |
+
logger.debug(f"Status check for {processing_id}: {status_info.get('status')}")
|
| 1922 |
+
return {
|
| 1923 |
+
"processing_id": processing_id,
|
| 1924 |
+
**status_info
|
| 1925 |
+
}
|
| 1926 |
+
|
| 1927 |
+
# If not in status tracker, check if this is a legacy upload
|
| 1928 |
+
# (uploaded before persistent status was implemented)
|
| 1929 |
+
for domain in DOMAIN_CONFIGS.keys():
|
| 1930 |
+
domain_upload_dir = UPLOAD_DIR / domain
|
| 1931 |
+
if domain_upload_dir.exists():
|
| 1932 |
+
for file_path in domain_upload_dir.glob(f"{processing_id}_*"):
|
| 1933 |
+
if file_path.is_file():
|
| 1934 |
+
# Legacy upload - return completed status
|
| 1935 |
+
# but don't add to persistent status to avoid confusion
|
| 1936 |
+
return {
|
| 1937 |
+
"processing_id": processing_id,
|
| 1938 |
+
"status": "completed",
|
| 1939 |
+
"message": "Document processed successfully (legacy upload)"
|
| 1940 |
+
}
|
| 1941 |
+
|
| 1942 |
+
# If not found anywhere, status is unknown
|
| 1943 |
+
# This typically means the processing_id is invalid
|
| 1944 |
+
return {
|
| 1945 |
+
"processing_id": processing_id,
|
| 1946 |
+
"status": "unknown",
|
| 1947 |
+
"message": "Processing ID not found. It may be invalid or expired."
|
| 1948 |
+
}
|
| 1949 |
+
except Exception as e:
|
| 1950 |
+
logger.error(f"Error checking status: {e}", exc_info=True)
|
| 1951 |
+
return {
|
| 1952 |
+
"processing_id": processing_id,
|
| 1953 |
+
"status": "error",
|
| 1954 |
+
"message": f"Error checking status: {str(e)}",
|
| 1955 |
+
"error": str(e)
|
| 1956 |
+
}
|
| 1957 |
+
|
| 1958 |
+
|
| 1959 |
+
@app.delete("/documents/{doc_id}")
|
| 1960 |
+
async def delete_document(doc_id: str):
|
| 1961 |
+
"""
|
| 1962 |
+
Delete a processed document completely including all RAG data.
|
| 1963 |
+
|
| 1964 |
+
This endpoint performs comprehensive deletion of:
|
| 1965 |
+
- Knowledge graph entities and relationships
|
| 1966 |
+
- Embedding vectors (chunks, entities, relationships)
|
| 1967 |
+
- Text chunks and metadata
|
| 1968 |
+
- Document status records
|
| 1969 |
+
- Physical upload files
|
| 1970 |
+
- Parser output files
|
| 1971 |
+
|
| 1972 |
+
Returns detailed deletion report with verification.
|
| 1973 |
+
"""
|
| 1974 |
+
try:
|
| 1975 |
+
from raganything.deletion_verifier import delete_document_complete
|
| 1976 |
+
|
| 1977 |
+
logger.info(f"Delete document request: {doc_id}")
|
| 1978 |
+
|
| 1979 |
+
# Step 1: Search for the document in all domains
|
| 1980 |
+
found_domain = None
|
| 1981 |
+
for domain in DOMAIN_CONFIGS.keys():
|
| 1982 |
+
domain_upload_dir = UPLOAD_DIR / domain
|
| 1983 |
+
if domain_upload_dir.exists():
|
| 1984 |
+
for file_path in domain_upload_dir.glob(f"{doc_id}_*"):
|
| 1985 |
+
if file_path.is_file():
|
| 1986 |
+
found_domain = domain
|
| 1987 |
+
break
|
| 1988 |
+
if found_domain:
|
| 1989 |
+
break
|
| 1990 |
+
|
| 1991 |
+
if not found_domain:
|
| 1992 |
+
logger.warning(f"Document {doc_id} not found in any domain")
|
| 1993 |
+
raise HTTPException(status_code=404, detail="Document not found")
|
| 1994 |
+
|
| 1995 |
+
logger.info(f"Found document {doc_id} in domain: {found_domain}")
|
| 1996 |
+
|
| 1997 |
+
# Step 2: Get RAG instance and find the actual document ID in storage
|
| 1998 |
+
rag = await get_rag_instance(found_domain)
|
| 1999 |
+
|
| 2000 |
+
# Find document in doc_status by processing_id prefix
|
| 2001 |
+
doc_to_delete = None
|
| 2002 |
+
doc_status_file = STORAGE_DIR / found_domain / "kv_store_doc_status.json"
|
| 2003 |
+
if doc_status_file.exists():
|
| 2004 |
+
import json
|
| 2005 |
+
with open(doc_status_file, 'r') as f:
|
| 2006 |
+
doc_status = json.load(f)
|
| 2007 |
+
|
| 2008 |
+
# Find document by file_path containing doc_id
|
| 2009 |
+
for doc_key, doc_info in doc_status.items():
|
| 2010 |
+
if 'file_path' in doc_info and doc_id in doc_info['file_path']:
|
| 2011 |
+
doc_to_delete = doc_key
|
| 2012 |
+
logger.info(f"Found document in storage: {doc_key}")
|
| 2013 |
+
break
|
| 2014 |
+
|
| 2015 |
+
if not doc_to_delete:
|
| 2016 |
+
logger.warning(f"Document {doc_id} not found in doc_status")
|
| 2017 |
+
# Still try to delete physical files
|
| 2018 |
+
doc_to_delete = doc_id
|
| 2019 |
+
|
| 2020 |
+
# Step 3: Collect files and directories to delete
|
| 2021 |
+
upload_files = list((UPLOAD_DIR / found_domain).glob(f"{doc_id}_*"))
|
| 2022 |
+
output_dir = BASE_DIR / "backend" / "output"
|
| 2023 |
+
output_paths = list(output_dir.glob(f"{doc_id}_*")) if output_dir.exists() else []
|
| 2024 |
+
|
| 2025 |
+
# Step 4: Perform complete deletion with verification
|
| 2026 |
+
deletion_report = await delete_document_complete(
|
| 2027 |
+
rag_instance=rag,
|
| 2028 |
+
doc_id=doc_to_delete,
|
| 2029 |
+
storage_dir=STORAGE_DIR / found_domain,
|
| 2030 |
+
upload_files=upload_files,
|
| 2031 |
+
output_dirs=output_paths
|
| 2032 |
+
)
|
| 2033 |
+
|
| 2034 |
+
# Step 5: Return detailed report
|
| 2035 |
+
if deletion_report.success:
|
| 2036 |
+
logger.info(
|
| 2037 |
+
f"Successfully deleted document {doc_id}: "
|
| 2038 |
+
f"{deletion_report.chunks_deleted} chunks, "
|
| 2039 |
+
f"{deletion_report.entities_deleted} entities, "
|
| 2040 |
+
f"{deletion_report.relationships_deleted} relationships, "
|
| 2041 |
+
f"{len(deletion_report.files_deleted)} files, "
|
| 2042 |
+
f"{len(deletion_report.directories_deleted)} directories"
|
| 2043 |
+
)
|
| 2044 |
+
return {
|
| 2045 |
+
"success": True,
|
| 2046 |
+
"message": "Document deleted completely with verification",
|
| 2047 |
+
"domain": found_domain,
|
| 2048 |
+
"report": deletion_report.to_dict()
|
| 2049 |
+
}
|
| 2050 |
+
else:
|
| 2051 |
+
logger.error(
|
| 2052 |
+
f"Document deletion completed with errors for {doc_id}: "
|
| 2053 |
+
f"{deletion_report.errors}"
|
| 2054 |
+
)
|
| 2055 |
+
return {
|
| 2056 |
+
"success": False,
|
| 2057 |
+
"message": "Document deletion completed with errors",
|
| 2058 |
+
"domain": found_domain,
|
| 2059 |
+
"report": deletion_report.to_dict()
|
| 2060 |
+
}
|
| 2061 |
+
|
| 2062 |
+
except HTTPException:
|
| 2063 |
+
raise
|
| 2064 |
+
except Exception as e:
|
| 2065 |
+
logger.error(f"Error deleting document {doc_id}: {e}", exc_info=True)
|
| 2066 |
+
raise HTTPException(
|
| 2067 |
+
status_code=500,
|
| 2068 |
+
detail=f"Failed to delete document: {str(e)}"
|
| 2069 |
+
)
|
| 2070 |
+
|
| 2071 |
+
|
| 2072 |
+
# =============================================================================
# Main Entry Point
# =============================================================================

if __name__ == "__main__":
    import uvicorn
    # Development entry point: binds all interfaces on port 8000 with
    # auto-reload enabled. Production deployments should run uvicorn
    # externally (reload=True is not suitable for production).
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True, log_level="info")
|
backend/requirements.txt
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FastAPI Backend Requirements - Python 3.12 compatible
|
| 2 |
+
|
| 3 |
+
# Web Framework
|
| 4 |
+
fastapi>=0.104.0
|
| 5 |
+
uvicorn[standard]>=0.24.0
|
| 6 |
+
python-multipart>=0.0.6
|
| 7 |
+
|
| 8 |
+
# Google Gemini API
|
| 9 |
+
google-generativeai>=0.8.0
|
| 10 |
+
|
| 11 |
+
# Image Processing
|
| 12 |
+
Pillow>=10.0.0
|
| 13 |
+
|
| 14 |
+
# Environment Variables
|
| 15 |
+
python-dotenv>=1.0.0
|
| 16 |
+
|
| 17 |
+
# Web Search & URL Fetching
|
| 18 |
+
tavily-python>=0.3.0
|
| 19 |
+
requests>=2.31.0
|
| 20 |
+
beautifulsoup4>=4.12.0
|
| 21 |
+
markdownify>=0.11.0
|
| 22 |
+
|
| 23 |
+
# Additional dependencies
|
| 24 |
+
cachetools>=5.3.0
|
| 25 |
+
aiofiles>=23.0.0
|
| 26 |
+
|
| 27 |
+
# LightRAG - Using local modified version in /lightrag directory
|
| 28 |
+
# lightrag-hku>=1.4.0
|
backend/reranker.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reranking Module for RAG-Anything
|
| 3 |
+
|
| 4 |
+
Provides reranking functionality using:
|
| 5 |
+
1. Gemini-based LLM reranking (free tier compatible)
|
| 6 |
+
2. Cross-encoder style scoring
|
| 7 |
+
3. Relevance-based reordering
|
| 8 |
+
|
| 9 |
+
Reranking is crucial for RAG systems because:
|
| 10 |
+
- Vector search (embeddings) finds semantically similar text but may miss subtle context
|
| 11 |
+
- LLMs can deeply understand query intent and document relevance
|
| 12 |
+
- Reranking improves answer quality by promoting truly relevant chunks to the top
|
| 13 |
+
|
| 14 |
+
Author: RAG-Anything Team
|
| 15 |
+
Version: 1.0.0
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import asyncio
|
| 19 |
+
import logging
|
| 20 |
+
import re
|
| 21 |
+
from typing import List, Dict, Any, Optional, Callable
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class GeminiReranker:
    """
    Reranker using Gemini API for semantic relevance scoring.

    Takes chunks from vector search and re-scores them based on deep
    semantic understanding using an LLM.

    Why reranking matters:
    ---------------------
    Vector embeddings alone can miss:
    - Negations ("not effective" vs "effective")
    - Context dependencies ("aspirin for elderly" vs "aspirin for children")
    - Query intent ("what causes X" vs "how to prevent X")

    LLM reranking provides:
    - Contextual understanding of the query
    - Semantic relevance beyond keyword matching
    - Better handling of complex queries
    """

    def __init__(
        self,
        llm_func: Optional[Callable] = None,
        model_name: str = "models/gemini-2.5-flash",
        batch_size: int = 5,
        temperature: float = 0.1
    ):
        """
        Initialize Gemini-based reranker.

        Args:
            llm_func: Optional async LLM function used for scoring; when None,
                reranking degrades to a neutral score for every chunk.
            model_name: Gemini model to use (default: flash for speed)
            batch_size: Number of chunks to score concurrently per batch
            temperature: Temperature for relevance scoring (low = consistent)
        """
        self.llm_func = llm_func
        self.model_name = model_name
        self.batch_size = batch_size
        self.temperature = temperature

    async def rerank(
        self,
        query: str,
        chunks: List[Dict[str, Any]],
        top_k: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Rerank chunks based on relevance to query.

        Process:
        1. Take top chunks from vector search (e.g., top 50)
        2. Score each chunk's relevance using LLM (0-10 scale)
        3. Re-order by relevance score
        4. Return top_k most relevant chunks

        Args:
            query: Search query
            chunks: List of chunks with 'content' field (mutated in place:
                a 'relevance_score' key is added to each)
            top_k: Return only top K results (None = return all, reranked)

        Returns:
            List of reranked chunks, each with a 'relevance_score' field.
        """
        if not chunks:
            logger.warning("No chunks to rerank")
            return []

        if len(chunks) == 1:
            logger.debug("Only one chunk, skipping reranking")
            chunks[0]['relevance_score'] = 1.0
            return chunks

        logger.info(f"Reranking {len(chunks)} chunks for query: {query[:50]}...")

        try:
            # Score all chunks in batches
            scored_chunks = await self._score_chunks_batch(query, chunks)

            # Sort by relevance score (highest first)
            scored_chunks.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)

            # Return top_k if specified
            if top_k:
                scored_chunks = scored_chunks[:top_k]

            logger.info(
                f"Reranking complete. Top score: {scored_chunks[0].get('relevance_score', 0):.2f}, "
                f"Bottom score: {scored_chunks[-1].get('relevance_score', 0):.2f}"
            )

            return scored_chunks

        except Exception as e:
            logger.error(f"Error during reranking: {e}", exc_info=True)
            # Fall back to the original order, but still honour the documented
            # contract that every returned chunk carries a 'relevance_score'
            # (callers downstream may sort/filter on it).
            fallback = chunks[:top_k] if top_k else chunks
            for chunk in fallback:
                chunk.setdefault('relevance_score', 5.0)
            return fallback

    async def _score_chunks_batch(
        self,
        query: str,
        chunks: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Score chunks in batches for efficiency.

        Batching bounds concurrency (self.batch_size in-flight LLM calls)
        to stay within free-tier rate limits.

        Args:
            query: Search query
            chunks: List of chunks to score

        Returns:
            Chunks with 'relevance_score' added (0.0 for chunks whose
            scoring call failed).
        """
        scored_chunks = []

        # Process in batches to avoid rate limits
        for i in range(0, len(chunks), self.batch_size):
            batch = chunks[i:i + self.batch_size]

            # Score batch concurrently; return_exceptions keeps one failed
            # call from aborting the whole batch.
            tasks = [self._score_chunk(query, chunk) for chunk in batch]
            batch_scores = await asyncio.gather(*tasks, return_exceptions=True)

            # Collect results
            for chunk, score_result in zip(batch, batch_scores):
                if isinstance(score_result, Exception):
                    logger.warning(f"Failed to score chunk: {score_result}")
                    chunk['relevance_score'] = 0.0
                else:
                    chunk['relevance_score'] = score_result

                scored_chunks.append(chunk)

        return scored_chunks

    async def _score_chunk(
        self,
        query: str,
        chunk: Dict[str, Any]
    ) -> float:
        """
        Score a single chunk's relevance to the query using the LLM.

        Prompt engineering approach:
        - Ask LLM to act as a relevance judge
        - Provide clear scoring criteria (0-10 scale)
        - Extract numeric score from the response

        Args:
            query: Search query
            chunk: Chunk dictionary with 'content' field

        Returns:
            Relevance score in [0, 10]; 0.0 for empty content, 5.0 when no
            LLM is configured or the call fails.
        """
        content = chunk.get('content', '')
        if not content:
            return 0.0

        # Truncate very long chunks to avoid token limits
        max_content_length = 1000
        if len(content) > max_content_length:
            content = content[:max_content_length] + "..."

        # Prompt for relevance scoring
        prompt = f"""You are a relevance judge. Score how relevant the following passage is to answering the query.

Query: {query}

Passage:
{content}

Scoring criteria:
10 = Directly answers the query with specific, relevant information
8-9 = Highly relevant, provides useful context
6-7 = Somewhat relevant, contains related information
4-5 = Tangentially related, limited usefulness
2-3 = Barely related, mostly off-topic
0-1 = Completely irrelevant

Respond with ONLY a number from 0-10. No explanation needed."""

        try:
            # Call LLM for scoring
            if self.llm_func:
                response = await self.llm_func(
                    prompt=prompt,
                    temperature=self.temperature,
                    max_tokens=50  # Increased from 10 to allow for complete score responses
                )
            else:
                # Fallback: no reranking (neutral mid-range score)
                return 5.0

            # Extract numeric score from response
            score = self._extract_score(response)
            return score

        except Exception as e:
            logger.error(f"Error scoring chunk: {e}")
            return 5.0  # Default mid-range score on error

    def _extract_score(self, response: str) -> float:
        """
        Extract numeric score from LLM response.

        Handles various response formats:
        - "8.5"
        - "Score: 9"
        - "The relevance is 7/10"
        - "8"

        Args:
            response: LLM response text (coerced to str defensively, since
                some LLM wrappers return response objects rather than text)

        Returns:
            Extracted score clamped to [0, 10]; 5.0 if parsing fails.
        """
        try:
            # Coerce and strip: wrappers sometimes hand back non-str objects.
            text = str(response).strip()

            # Try to find a number (int or float) in the response.
            # Pattern matches: "8", "8.5", "9/10", "Score: 7", etc.
            number_pattern = r'(\d+\.?\d*)'
            matches = re.findall(number_pattern, text)

            if matches:
                # Take the first number found
                score = float(matches[0])

                # Clamp to the 0-10 range the prompt demands
                score = max(0.0, min(10.0, score))

                return score
            else:
                logger.warning(f"Could not extract score from response: {response}")
                return 5.0

        except Exception as e:
            logger.error(f"Error extracting score: {e}")
            return 5.0
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
# Example usage
|
| 271 |
+
async def main():
    """Demonstrate reranking end-to-end with a mocked LLM scorer."""

    async def mock_llm(prompt: str, **kwargs) -> str:
        """Pretend to be Gemini: score by naive keyword matching on the prompt."""
        lowered = prompt.lower()
        if "directly" in lowered:
            return "9"
        if "somewhat" in lowered:
            return "6"
        return "3"

    # Build a reranker wired to the mock scorer
    reranker = GeminiReranker(llm_func=mock_llm)

    # Sample query plus candidate passages of varying relevance
    query = "What are the side effects of aspirin?"
    chunks = [
        {"content": "Aspirin can cause stomach bleeding in some patients..."},
        {"content": "The history of aspirin dates back to ancient times..."},
        {"content": "Common side effects include nausea and heartburn..."},
    ]

    # Keep only the two most relevant passages
    reranked = await reranker.rerank(query, chunks, top_k=2)

    print("Reranked results:")
    for rank, chunk in enumerate(reranked, 1):
        print(f"{rank}. Score: {chunk['relevance_score']:.1f} - {chunk['content'][:50]}...")
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# Allow running this module directly to exercise the demo above.
if __name__ == "__main__":
    asyncio.run(main())
|
backend/url_fetcher.py
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
URL Document Fetcher for RAG-Anything
|
| 3 |
+
|
| 4 |
+
Fetches and processes documents from URLs for ingestion into the RAG system.
|
| 5 |
+
|
| 6 |
+
Features:
|
| 7 |
+
- Web page scraping and parsing
|
| 8 |
+
- PDF download from URLs
|
| 9 |
+
- Markdown conversion
|
| 10 |
+
- Content cleaning and preprocessing
|
| 11 |
+
- Advanced parsing with text and image extraction
|
| 12 |
+
- Integration with RAG pipeline
|
| 13 |
+
|
| 14 |
+
Author: RAG-Anything Team
|
| 15 |
+
Version: 2.0.0
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import os
|
| 19 |
+
import asyncio
|
| 20 |
+
import logging
|
| 21 |
+
import tempfile
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from typing import Optional, Dict, Any, List
|
| 24 |
+
from urllib.parse import urlparse
|
| 25 |
+
import hashlib
|
| 26 |
+
import base64
|
| 27 |
+
|
| 28 |
+
logger = logging.getLogger(__name__)
|
| 29 |
+
|
| 30 |
+
try:
|
| 31 |
+
import requests
|
| 32 |
+
from bs4 import BeautifulSoup
|
| 33 |
+
import markdownify
|
| 34 |
+
from urllib.parse import urljoin
|
| 35 |
+
DEPS_AVAILABLE = True
|
| 36 |
+
except ImportError:
|
| 37 |
+
DEPS_AVAILABLE = False
|
| 38 |
+
logger.warning("URL fetcher dependencies not installed. Install with: pip install requests beautifulsoup4 markdownify")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class URLFetcher:
    """Fetch documents from URLs and prepare them for RAG ingestion.

    Dispatches on the (HEAD-reported) content type into three paths:
      - PDFs: downloaded verbatim to ``download_dir``
      - HTML pages: cleaned, optionally converted to markdown, with up to
        10 images extracted alongside a structured content list
      - anything else: saved raw, with an extension guessed from Content-Type
    """

    def __init__(
        self,
        download_dir: Optional[str] = None,
        timeout: int = 30,
        user_agent: str = "RAG-Anything/1.0"
    ):
        """
        Initialize URL fetcher.

        Args:
            download_dir: Directory to save downloaded files (system temp dir
                when not provided)
            timeout: Request timeout in seconds
            user_agent: User agent string for requests

        Raises:
            ImportError: If requests/beautifulsoup4/markdownify are missing.
        """
        if not DEPS_AVAILABLE:
            raise ImportError("Required dependencies not installed. Run: pip install requests beautifulsoup4 markdownify")

        self.download_dir = download_dir or tempfile.gettempdir()
        self.timeout = timeout
        self.headers = {"User-Agent": user_agent}

        Path(self.download_dir).mkdir(parents=True, exist_ok=True)
        logger.info(f"URLFetcher initialized (download_dir={self.download_dir})")

    def _create_content_list(self, title: str, text_content: str, images: List[Dict]) -> List[Dict[str, Any]]:
        """
        Create a structured content list compatible with the RAG pipeline.

        Args:
            title: Document title (emitted as a markdown H1 text block)
            text_content: Extracted text content, paragraph-split on blank lines
            images: List of extracted images with metadata ('path', optional
                'alt'/'title' captions)

        Returns:
            List of content blocks ({"type": "text"|"image", ...}) for RAG
            processing; paragraphs are capped at 50 and grouped 10-per-"page".
        """
        content_list = []

        # Add title as first text block
        if title:
            content_list.append({
                "type": "text",
                "text": f"# {title}",
                "page_idx": 0
            })

        # Split text into paragraphs and add as text blocks
        paragraphs = [p.strip() for p in text_content.split("\n\n") if p.strip()]
        for idx, paragraph in enumerate(paragraphs[:50]):  # Limit to first 50 paragraphs
            if paragraph:
                content_list.append({
                    "type": "text",
                    "text": paragraph,
                    "page_idx": idx // 10  # Group every 10 paragraphs as a "page"
                })

        # Add images as image blocks
        for idx, img_info in enumerate(images):
            content_list.append({
                "type": "image",
                "img_path": img_info["path"],
                "image_caption": img_info.get("alt", "") or img_info.get("title", ""),
                "page_idx": (len(paragraphs) + idx) // 10
            })

        return content_list

    async def fetch_url(
        self,
        url: str,
        save_as_pdf: bool = False,
        convert_to_markdown: bool = True
    ) -> Dict[str, Any]:
        """
        Fetch and process content from a URL.

        Args:
            url: URL to fetch
            save_as_pdf: Whether to save as PDF (kept for interface
                compatibility; PDF URLs are detected automatically)
            convert_to_markdown: Convert HTML to markdown

        Returns:
            Dictionary with file_path, content, metadata on success;
            {"success": False, "error": ..., "url": ...} on failure
            (this method does not raise).
        """
        try:
            logger.info(f"Fetching URL: {url}")

            # Validate URL
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                raise ValueError(f"Invalid URL: {url}")

            # Determine content type via a cheap HEAD request, run in a worker
            # thread so the event loop is not blocked by network I/O.
            response = await asyncio.to_thread(
                requests.head, url, headers=self.headers, timeout=self.timeout, allow_redirects=True
            )
            content_type = response.headers.get("Content-Type", "").lower()

            # Handle PDF files
            if "pdf" in content_type or url.lower().endswith(".pdf"):
                return await self._fetch_pdf(url)

            # Handle HTML/web pages (also the default when the server did not
            # report a Content-Type)
            elif "html" in content_type or not content_type:
                return await self._fetch_html(url, convert_to_markdown)

            # Handle other file types
            else:
                return await self._fetch_generic(url, content_type)

        except Exception as e:
            logger.error(f"Error fetching URL {url}: {e}", exc_info=True)
            return {
                "success": False,
                "error": str(e),
                "url": url,
            }

    async def _fetch_pdf(self, url: str) -> Dict[str, Any]:
        """Download a PDF from *url* into download_dir; raises on failure."""
        try:
            response = await asyncio.to_thread(
                requests.get, url, headers=self.headers, timeout=self.timeout
            )
            response.raise_for_status()

            # Generate filename from URL (md5 used only for a stable name,
            # not for security)
            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
            filename = f"url_{url_hash}.pdf"
            file_path = Path(self.download_dir) / filename

            # Save PDF
            with open(file_path, "wb") as f:
                f.write(response.content)

            logger.info(f"PDF downloaded: {file_path}")

            return {
                "success": True,
                "file_path": str(file_path),
                "url": url,
                "content_type": "pdf",
                "size_bytes": len(response.content),
            }

        except Exception as e:
            logger.error(f"Error fetching PDF: {e}")
            raise

    async def _fetch_html(self, url: str, convert_to_markdown: bool = True) -> Dict[str, Any]:
        """Fetch and parse an HTML page, extracting text and up to 10 images."""
        try:
            response = await asyncio.to_thread(
                requests.get, url, headers=self.headers, timeout=self.timeout
            )
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.content, "html.parser")

            # Remove unwanted elements
            for tag in soup(["script", "style", "nav", "footer", "header", "aside", "iframe", "noscript"]):
                tag.decompose()

            # Extract title
            title = soup.find("title")
            title_text = title.get_text().strip() if title else "Untitled"

            # Extract main content; fall back to the whole document when the
            # page has no <main>/<article>/<body> (HTML fragments, malformed
            # pages) — otherwise find_all() below crashes on None.
            main_content = soup.find("main") or soup.find("article") or soup.find("body") or soup

            # Extract images before converting to markdown (limit to first 10 images)
            images = []
            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
            images_dir = Path(self.download_dir) / f"url_{url_hash}_images"
            images_dir.mkdir(parents=True, exist_ok=True)

            all_images = main_content.find_all("img")
            max_images = min(10, len(all_images))  # Limit to 10 images
            logger.info(f"Found {len(all_images)} images, downloading first {max_images}")

            for idx, img in enumerate(all_images[:max_images]):
                try:
                    img_url = img.get("src")
                    if not img_url:
                        continue

                    # Skip data URIs and very small images
                    if img_url.startswith("data:"):
                        continue

                    # Handle relative URLs
                    if img_url.startswith("//"):
                        img_url = "https:" + img_url
                    elif img_url.startswith("/"):
                        parsed_base = urlparse(url)
                        img_url = f"{parsed_base.scheme}://{parsed_base.netloc}{img_url}"
                    elif not img_url.startswith("http"):
                        img_url = urljoin(url, img_url)

                    # Download image with a short timeout so a slow CDN cannot
                    # stall the whole page fetch
                    img_response = await asyncio.to_thread(
                        requests.get, img_url, headers=self.headers, timeout=5, stream=True
                    )

                    if img_response.status_code == 200:
                        # Check content size (skip if too large > 10MB)
                        content_length = img_response.headers.get('content-length')
                        if content_length and int(content_length) > 10 * 1024 * 1024:
                            logger.debug(f"Skipping large image {idx}: {content_length} bytes")
                            continue

                        # Determine file extension from the Content-Type,
                        # defaulting to .jpg
                        content_type = img_response.headers.get("Content-Type", "")
                        ext = ".jpg"
                        if "png" in content_type:
                            ext = ".png"
                        elif "gif" in content_type:
                            ext = ".gif"
                        elif "webp" in content_type:
                            ext = ".webp"

                        img_path = images_dir / f"image_{idx}{ext}"
                        with open(img_path, "wb") as f:
                            f.write(img_response.content)

                        images.append({
                            "path": str(img_path),
                            "alt": img.get("alt", ""),
                            "title": img.get("title", ""),
                            "url": img_url
                        })
                        logger.debug(f"Downloaded image {idx+1}/{max_images}: {img_path.name}")
                except Exception as img_error:
                    # Best-effort: a single bad image must not fail the fetch
                    logger.debug(f"Failed to download image {idx}: {img_error}")
                    continue

            if convert_to_markdown:
                # Convert to markdown
                content = markdownify.markdownify(
                    str(main_content),
                    heading_style="ATX",
                    bullets="-"
                )
            else:
                # Extract plain text
                content = main_content.get_text(separator="\n", strip=True)

            # Create content list with structured data
            content_list = self._create_content_list(title_text, content, images)

            # Save to file
            ext = ".md" if convert_to_markdown else ".txt"
            filename = f"url_{url_hash}{ext}"
            file_path = Path(self.download_dir) / filename

            with open(file_path, "w", encoding="utf-8") as f:
                f.write(f"# {title_text}\n\n")
                f.write(f"Source: {url}\n\n")
                f.write(content)

            # Save content list as JSON for RAG processing
            import json
            json_path = Path(self.download_dir) / f"url_{url_hash}_content_list.json"
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(content_list, f, indent=2, ensure_ascii=False)

            logger.info(f"HTML content saved: {file_path}")
            logger.info(f"Extracted {len(images)} images from web page")

            return {
                "success": True,
                "file_path": str(file_path),
                "content_list_path": str(json_path),
                "url": url,
                "content_type": "html",
                "title": title_text,
                "content_preview": content[:500],
                "images_count": len(images),
                "content_list": content_list
            }

        except Exception as e:
            logger.error(f"Error fetching HTML: {e}")
            raise

    async def _fetch_generic(self, url: str, content_type: str) -> Dict[str, Any]:
        """Download any other file type, guessing an extension from *content_type*."""
        try:
            response = await asyncio.to_thread(
                requests.get, url, headers=self.headers, timeout=self.timeout
            )
            response.raise_for_status()

            # Determine extension from content type (unknown types get .bin)
            ext_map = {
                "text/plain": ".txt",
                "text/markdown": ".md",
                "application/msword": ".doc",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
            }
            ext = ext_map.get(content_type, ".bin")

            # Save file
            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
            filename = f"url_{url_hash}{ext}"
            file_path = Path(self.download_dir) / filename

            with open(file_path, "wb") as f:
                f.write(response.content)

            logger.info(f"File downloaded: {file_path}")

            return {
                "success": True,
                "file_path": str(file_path),
                "url": url,
                "content_type": content_type,
                "size_bytes": len(response.content),
            }

        except Exception as e:
            logger.error(f"Error fetching file: {e}")
            raise
|
| 367 |
+
raise
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
def create_url_fetcher(download_dir: Optional[str] = None, **kwargs) -> URLFetcher:
    """
    Factory function to create a URL fetcher.

    Args:
        download_dir: Directory to save downloaded files
        **kwargs: Additional URLFetcher parameters (timeout, user_agent, ...)

    Returns:
        Configured URLFetcher instance
    """
    fetcher = URLFetcher(download_dir=download_dir, **kwargs)
    return fetcher
|
backend/web_search.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Web Search Module for RAG-Anything using Tavily API
|
| 3 |
+
|
| 4 |
+
Provides intelligent web search capabilities to augment RAG with real-time information.
|
| 5 |
+
|
| 6 |
+
Features:
|
| 7 |
+
- Tavily API integration for high-quality search results
|
| 8 |
+
- Context-aware search query generation
|
| 9 |
+
- Result filtering and ranking
|
| 10 |
+
- Hybrid RAG + Web search mode
|
| 11 |
+
|
| 12 |
+
Author: RAG-Anything Team
|
| 13 |
+
Version: 1.0.0
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import asyncio
|
| 18 |
+
import logging
|
| 19 |
+
from typing import List, Dict, Any, Optional
|
| 20 |
+
from datetime import datetime
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
from tavily import TavilyClient, AsyncTavilyClient
|
| 26 |
+
TAVILY_AVAILABLE = True
|
| 27 |
+
except ImportError:
|
| 28 |
+
TAVILY_AVAILABLE = False
|
| 29 |
+
logger.warning("Tavily not installed. Install with: pip install tavily-python")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class WebSearcher:
    """Web search integration using the Tavily API.

    Wraps :class:`tavily.AsyncTavilyClient` and adds helpers that format the
    raw search response for RAG prompts / LLM consumption, and that combine
    web results with knowledge-base (RAG) results.

    Fix vs. previous revision: ``format_results_for_rag`` appended ``"..."``
    to every snippet even when the content was shorter than the 500-char
    limit, falsely implying truncation; the ellipsis is now conditional.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        max_results: int = 5,
        search_depth: str = "advanced",
        include_raw_content: bool = True
    ):
        """
        Initialize web searcher

        Args:
            api_key: Tavily API key (from env if not provided)
            max_results: Maximum number of search results to return
            search_depth: "basic" or "advanced" (advanced is more thorough)
            include_raw_content: Whether to include full page content

        Raises:
            ImportError: If the tavily-python package is not installed.
            ValueError: If no API key is given and TAVILY_API_KEY is unset.
        """
        if not TAVILY_AVAILABLE:
            raise ImportError("Tavily is not installed. Install with: pip install tavily-python")

        self.api_key = api_key or os.getenv("TAVILY_API_KEY")
        if not self.api_key:
            raise ValueError("Tavily API key not found. Set TAVILY_API_KEY environment variable.")

        self.max_results = max_results
        self.search_depth = search_depth
        self.include_raw_content = include_raw_content

        # Initialize async client
        self.client = AsyncTavilyClient(api_key=self.api_key)

        logger.info(f"WebSearcher initialized (max_results={max_results}, depth={search_depth})")

    async def search(
        self,
        query: str,
        max_results: Optional[int] = None,
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        search_depth: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Perform web search

        Args:
            query: Search query
            max_results: Override default max results
            include_domains: Only search these domains
            exclude_domains: Exclude these domains
            search_depth: Override default search depth

        Returns:
            Dictionary with search results and metadata.  On failure a
            same-shaped dictionary is returned with empty results and an
            "error" key, so callers never have to catch here.
        """
        try:
            logger.info(f"Searching web: {query[:100]}...")

            # Build search parameters; per-call overrides win over defaults.
            search_params = {
                "query": query,
                "max_results": max_results or self.max_results,
                "search_depth": search_depth or self.search_depth,
                "include_raw_content": self.include_raw_content,
            }

            if include_domains:
                search_params["include_domains"] = include_domains
            if exclude_domains:
                search_params["exclude_domains"] = exclude_domains

            # Perform search
            response = await self.client.search(**search_params)

            # Process results
            results = {
                "query": query,
                "results": response.get("results", []),
                "answer": response.get("answer", ""),  # Tavily's AI-generated answer
                "search_metadata": {
                    "total_results": len(response.get("results", [])),
                    "search_depth": search_params["search_depth"],
                    "timestamp": datetime.now().isoformat(),
                }
            }

            logger.info(f"Web search complete: {len(results['results'])} results found")
            return results

        except Exception as e:
            logger.error(f"Web search error: {e}", exc_info=True)
            return {
                "query": query,
                "results": [],
                "answer": "",
                "error": str(e),
                "search_metadata": {
                    "total_results": 0,
                    "error": str(e),
                    "timestamp": datetime.now().isoformat(),
                }
            }

    async def search_with_context(
        self,
        query: str,
        context: Optional[str] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Search with additional context to refine query

        Args:
            query: Base search query
            context: Additional context to help refine search
            **kwargs: Additional search parameters

        Returns:
            Search results dictionary
        """
        # If context provided, enhance query by simple concatenation.
        if context:
            enhanced_query = f"{query} {context}"
        else:
            enhanced_query = query

        return await self.search(enhanced_query, **kwargs)

    def format_results_for_rag(self, search_results: Dict[str, Any]) -> str:
        """
        Format web search results for RAG context

        Args:
            search_results: Results from search()

        Returns:
            Formatted string for RAG context
        """
        if not search_results.get("results"):
            return "No web search results found."

        formatted = ["=== Web Search Results ===\n"]

        # Add Tavily's answer if available
        if search_results.get("answer"):
            formatted.append(f"Quick Answer: {search_results['answer']}\n")

        # Add individual results, truncating long snippets to 500 chars.
        for idx, result in enumerate(search_results["results"], 1):
            formatted.append(f"\n[Source {idx}] {result.get('title', 'Untitled')}")
            formatted.append(f"URL: {result.get('url', 'N/A')}")
            content = result.get('content', 'No content')
            # Only show an ellipsis when content was actually truncated.
            snippet = content[:500] + ("..." if len(content) > 500 else "")
            formatted.append(f"Content: {snippet}")
            if result.get("score"):
                formatted.append(f"Relevance: {result['score']:.2f}")

        formatted.append(f"\n=== End of Web Results ({len(search_results['results'])} sources) ===")
        return "\n".join(formatted)

    def format_results_for_llm(self, search_results: Dict[str, Any]) -> str:
        """
        Format web search results optimally for LLM processing

        Args:
            search_results: Results from search()

        Returns:
            Structured string optimized for LLM comprehension
        """
        if not search_results.get("results"):
            return "No relevant web search results were found for this query."

        formatted = []

        # Add Tavily's AI-generated answer first (if available)
        if search_results.get("answer"):
            formatted.append("### AI-Generated Summary:")
            formatted.append(search_results['answer'])
            formatted.append("")

        # Add detailed source information
        formatted.append("### Detailed Sources:")
        formatted.append("")

        for idx, result in enumerate(search_results["results"], 1):
            formatted.append(f"**Source {idx}: {result.get('title', 'Untitled')}**")
            formatted.append(f"- URL: {result.get('url', 'N/A')}")
            formatted.append(f"- Published: {result.get('published_date', 'Unknown date')}")

            # Prefer the (longer) raw page content when available, capped at
            # 2000 chars to keep the prompt bounded.
            content = result.get('content', '')
            if result.get('raw_content') and len(result.get('raw_content', '')) > len(content):
                content = result['raw_content'][:2000]

            formatted.append(f"- Content: {content}")

            if result.get("score"):
                formatted.append(f"- Relevance Score: {result['score']:.2%}")

            formatted.append("")

        formatted.append(f"*Total sources: {len(search_results['results'])}*")
        return "\n".join(formatted)

    async def hybrid_search(
        self,
        query: str,
        rag_results: Optional[str] = None,
        combine_results: bool = True,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Hybrid search: combine RAG results with web search

        Args:
            query: Search query
            rag_results: Results from RAG system
            combine_results: Whether to combine RAG and web results
            **kwargs: Additional search parameters

        Returns:
            Dictionary with combined results (or the bare web results when
            combine_results is False).
        """
        # Perform web search
        web_results = await self.search(query, **kwargs)

        if not combine_results:
            return web_results

        # Combine RAG and web results into a single prompt-ready context.
        combined_context = []

        if rag_results:
            combined_context.append("=== Knowledge Base Results ===")
            combined_context.append(rag_results)
            combined_context.append("")

        combined_context.append(self.format_results_for_rag(web_results))

        return {
            "query": query,
            "combined_context": "\n".join(combined_context),
            "rag_results": rag_results,
            "web_results": web_results,
            "metadata": {
                "has_rag_results": bool(rag_results),
                "has_web_results": len(web_results.get("results", [])) > 0,
                "timestamp": datetime.now().isoformat(),
            }
        }
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def create_web_searcher(api_key: Optional[str] = None, **kwargs) -> WebSearcher:
    """Build a ready-to-use :class:`WebSearcher`.

    Args:
        api_key: Tavily API key; WebSearcher falls back to the
            TAVILY_API_KEY environment variable when omitted.
        **kwargs: Forwarded verbatim to the WebSearcher constructor
            (e.g. max_results, search_depth, include_raw_content).

    Returns:
        A configured WebSearcher instance.
    """
    searcher = WebSearcher(api_key=api_key, **kwargs)
    return searcher
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Docker Compose stack for the Agentic RAG application:
# a FastAPI backend (port 8000) and an nginx-served React frontend (port 3000).
services:
  backend:
    build:
      context: .
      dockerfile: backend/Dockerfile
    container_name: agentic-rag-backend
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      # API keys and model names come from the host environment / .env file;
      # the :- forms provide model defaults when the variable is unset.
      - GEMINI_API_KEY=${GEMINI_API_KEY}
      - GEMINI_TEXT_MODEL=${GEMINI_TEXT_MODEL:-models/gemini-flash-latest}
      - GEMINI_VERIFIER_MODEL=${GEMINI_VERIFIER_MODEL:-models/gemini-pro-latest}
      - GEMINI_VISION_MODEL=${GEMINI_VISION_MODEL:-models/gemini-flash-latest}
      - GEMINI_EMBEDDING_MODEL=${GEMINI_EMBEDDING_MODEL:-models/text-embedding-004}
      - TAVILY_API_KEY=${TAVILY_API_KEY}
      - PYTHONUNBUFFERED=1
    volumes:
      # Host bind mounts keep uploaded documents and RAG storage on the host.
      - ./storage:/app/storage
      - ./uploads:/app/uploads
      - ./backend/output:/app/backend/output
    networks:
      - rag-network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile
    container_name: agentic-rag-frontend
    restart: unless-stopped
    ports:
      - "3000:80"
    depends_on:
      # Wait for the backend's /health check before starting the frontend.
      backend:
        condition: service_healthy
    networks:
      - rag-network
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

networks:
  rag-network:
    driver: bridge

# NOTE(review): these named volumes are declared but never referenced — the
# backend service uses host bind mounts (./storage, ./uploads, ./backend/output)
# instead. Confirm whether they can be removed or whether the service should
# switch to them for portability.
volumes:
  storage:
  uploads:
  output:
frontend/.env.example
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API Configuration
|
| 2 |
+
REACT_APP_API_URL=http://localhost:8000
|
frontend/Dockerfile
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---- Stage 1: build the React app ----
FROM node:20-alpine AS builder

WORKDIR /app

# Copy package manifests first so the dependency layer is cached across
# source-only changes
COPY package*.json ./

# Install dependencies (npm ci = reproducible install from the lockfile)
RUN npm ci --silent

# Copy source code
COPY . .

# Build the application (output lands in /app/build)
RUN npm run build

# ---- Stage 2: serve the static build with nginx ----
FROM nginx:alpine

# Copy custom nginx configuration (SPA routing + /api proxy)
COPY nginx.conf /etc/nginx/conf.d/default.conf

# Copy built application from builder stage
COPY --from=builder /app/build /usr/share/nginx/html

# Expose port
EXPOSE 80

# Health check: probe the nginx root page
HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
    CMD wget --quiet --tries=1 --spider http://localhost/ || exit 1

# Start nginx in the foreground so the container stays alive
CMD ["nginx", "-g", "daemon off;"]
|
frontend/nginx.conf
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server {
    listen 80;
    server_name localhost;
    root /usr/share/nginx/html;
    index index.html;

    # Gzip compression
    gzip on;
    gzip_vary on;
    gzip_min_length 1000;
    gzip_types text/plain text/css text/xml text/javascript application/x-javascript application/xml+rss application/json;

    # Security headers.
    # NOTE: nginx `add_header` is NOT inherited into a location block that
    # declares its own add_header, so these must be repeated in any such
    # location below (see ngx_http_headers_module docs).
    add_header X-Frame-Options "SAMEORIGIN" always;
    add_header X-Content-Type-Options "nosniff" always;
    add_header X-XSS-Protection "1; mode=block" always;

    # Serve static files with cache headers
    location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
        expires 1y;
        add_header Cache-Control "public, immutable";
        # Re-declare security headers: add_header here disables inheritance.
        add_header X-Frame-Options "SAMEORIGIN" always;
        add_header X-Content-Type-Options "nosniff" always;
        add_header X-XSS-Protection "1; mode=block" always;
    }

    # API proxy to backend
    location /api {
        proxy_pass http://backend:8000;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection 'upgrade';
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_cache_bypass $http_upgrade;

        # Increase timeout for long-running queries
        proxy_read_timeout 300s;
        proxy_connect_timeout 75s;
    }

    # Serve React app - all routes go to index.html
    location / {
        try_files $uri $uri/ /index.html;
        add_header Cache-Control "no-cache";
        # Re-declare security headers: add_header here disables inheritance.
        add_header X-Frame-Options "SAMEORIGIN" always;
        add_header X-Content-Type-Options "nosniff" always;
        add_header X-XSS-Protection "1; mode=block" always;
    }
}
|
frontend/package-lock.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/package.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "enhanced-rag-frontend",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"description": "Enhanced RAG-Anything Frontend with Multi-Domain Support",
|
| 5 |
+
"private": true,
|
| 6 |
+
"dependencies": {
|
| 7 |
+
"lucide-react": "^0.294.0",
|
| 8 |
+
"react": "^18.2.0",
|
| 9 |
+
"react-dom": "^18.2.0",
|
| 10 |
+
"react-markdown": "^10.1.0",
|
| 11 |
+
"react-scripts": "5.0.1",
|
| 12 |
+
"rehype-highlight": "^7.0.2",
|
| 13 |
+
"remark-gfm": "^4.0.1"
|
| 14 |
+
},
|
| 15 |
+
"scripts": {
|
| 16 |
+
"start": "react-scripts start",
|
| 17 |
+
"build": "react-scripts build",
|
| 18 |
+
"test": "react-scripts test",
|
| 19 |
+
"eject": "react-scripts eject"
|
| 20 |
+
},
|
| 21 |
+
"eslintConfig": {
|
| 22 |
+
"extends": [
|
| 23 |
+
"react-app"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
"browserslist": {
|
| 27 |
+
"production": [
|
| 28 |
+
">0.2%",
|
| 29 |
+
"not dead",
|
| 30 |
+
"not op_mini all"
|
| 31 |
+
],
|
| 32 |
+
"development": [
|
| 33 |
+
"last 1 chrome version",
|
| 34 |
+
"last 1 firefox version",
|
| 35 |
+
"last 1 safari version"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
"devDependencies": {
|
| 39 |
+
"autoprefixer": "^10.4.16",
|
| 40 |
+
"postcss": "^8.4.31",
|
| 41 |
+
"tailwindcss": "^3.3.5"
|
| 42 |
+
}
|
| 43 |
+
}
|
frontend/postcss.config.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// PostCSS pipeline: Tailwind generates utility classes, then Autoprefixer
// adds vendor prefixes per the browserslist config in package.json.
module.exports = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
}
|
frontend/public/index.html
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 6 |
+
<meta name="theme-color" content="#000000" />
|
| 7 |
+
<meta
|
| 8 |
+
name="description"
|
| 9 |
+
content="Enhanced RAG System with Multi-Domain Support"
|
| 10 |
+
/>
|
| 11 |
+
<title>Enhanced RAG System</title>
|
| 12 |
+
</head>
|
| 13 |
+
<body>
|
| 14 |
+
<noscript>You need to enable JavaScript to run this app.</noscript>
|
| 15 |
+
<div id="root"></div>
|
| 16 |
+
</body>
|
| 17 |
+
</html>
|
frontend/src/App.js
ADDED
|
@@ -0,0 +1,1268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* TheTruthSchool - Professional AI Assistant Interface
|
| 3 |
+
*
|
| 4 |
+
* Features:
|
| 5 |
+
* - Dark mode with elegant theme switching
|
| 6 |
+
* - Claude/ChatGPT-inspired professional design
|
| 7 |
+
* - Multi-domain RAG with TheTruthSchool branding
|
| 8 |
+
* - Smooth animations and modern UI
|
| 9 |
+
*/
|
| 10 |
+
|
| 11 |
+
import React, { useState, useEffect, useRef, useCallback } from 'react';
|
| 12 |
+
import ReactMarkdown from 'react-markdown';
|
| 13 |
+
import remarkGfm from 'remark-gfm';
|
| 14 |
+
import rehypeHighlight from 'rehype-highlight';
|
| 15 |
+
import 'highlight.js/styles/atom-one-dark.css';
|
| 16 |
+
import {
|
| 17 |
+
Send,
|
| 18 |
+
Upload,
|
| 19 |
+
FileText,
|
| 20 |
+
CheckCircle,
|
| 21 |
+
XCircle,
|
| 22 |
+
Menu,
|
| 23 |
+
X,
|
| 24 |
+
Loader2,
|
| 25 |
+
Trash2,
|
| 26 |
+
FolderOpen,
|
| 27 |
+
RefreshCw,
|
| 28 |
+
Moon,
|
| 29 |
+
Sun,
|
| 30 |
+
Sparkles
|
| 31 |
+
} from 'lucide-react';
|
| 32 |
+
|
| 33 |
+
// =============================================================================
|
| 34 |
+
// Domain Configurations
|
| 35 |
+
// =============================================================================
|
| 36 |
+
|
| 37 |
+
/**
 * Per-domain UI configuration, keyed by domain id.
 * Each entry carries: display name, short description, accent color (hex),
 * accepted upload file extensions, and an emoji icon for the picker.
 */
const DOMAIN_CONFIGS = {
  medical: {
    name: 'Medical & Healthcare',
    description: 'Medical documents, research papers, clinical guidelines',
    color: '#3b82f6',
    fileTypes: ['.pdf', '.docx', '.xml', '.txt', '.doc', '.csv', '.xlsx'],
    icon: '🏥'
  },
  legal: {
    name: 'Legal & Compliance',
    description: 'Legal documents, contracts, regulations, case law',
    color: '#8b5cf6',
    fileTypes: ['.pdf', '.docx', '.txt', '.doc', '.csv', '.xlsx'],
    icon: '⚖️'
  },
  financial: {
    name: 'Financial & Analytics',
    description: 'Financial reports, analysis, market research',
    color: '#10b981',
    fileTypes: ['.pdf', '.xlsx', '.csv', '.json', '.xls'],
    icon: '💰'
  },
  technical: {
    name: 'Technical Documentation',
    description: 'Technical docs, APIs, code, system architecture',
    color: '#f97316',
    fileTypes: ['.pdf', '.md', '.docx', '.json', '.txt', '.rst', '.csv', '.xlsx'],
    icon: '⚙️'
  },
  academic: {
    name: 'Academic Research',
    description: 'Research papers, academic publications, studies',
    color: '#6366f1',
    fileTypes: ['.pdf', '.docx', '.tex', '.bib', '.txt', '.csv', '.xlsx'],
    icon: '🎓'
  }
};

// Backend base URL; override at build time with REACT_APP_API_URL.
const API_BASE_URL = process.env.REACT_APP_API_URL || 'http://localhost:8000';
|
| 76 |
+
|
| 77 |
+
// =============================================================================
|
| 78 |
+
// Main Component
|
| 79 |
+
// =============================================================================
|
| 80 |
+
|
| 81 |
+
export default function TheTruthSchoolAI() {
|
| 82 |
+
const getFromLocalStorage = (key, defaultValue) => {
|
| 83 |
+
try {
|
| 84 |
+
const item = window.localStorage.getItem(key);
|
| 85 |
+
return item ? JSON.parse(item) : defaultValue;
|
| 86 |
+
} catch (error) {
|
| 87 |
+
console.error(`Error reading localStorage key "${key}":`, error);
|
| 88 |
+
return defaultValue;
|
| 89 |
+
}
|
| 90 |
+
};
|
| 91 |
+
|
| 92 |
+
// State Management
|
| 93 |
+
const [darkMode, setDarkMode] = useState(() => getFromLocalStorage('darkMode', true));
|
| 94 |
+
const [selectedDomain, setSelectedDomain] = useState(() => getFromLocalStorage('selectedDomain', 'medical'));
|
| 95 |
+
const [currentView, setCurrentView] = useState('app');
|
| 96 |
+
const [processingDocs, setProcessingDocs] = useState(() => getFromLocalStorage('processingDocs', []));
|
| 97 |
+
const [processedDocs, setProcessedDocs] = useState([]);
|
| 98 |
+
const [query, setQuery] = useState('');
|
| 99 |
+
const [messages, setMessages] = useState(() => getFromLocalStorage('chatMessages', []));
|
| 100 |
+
const [isQuerying, setIsQuerying] = useState(false);
|
| 101 |
+
const [error, setError] = useState(null);
|
| 102 |
+
const [showUploadModal, setShowUploadModal] = useState(false);
|
| 103 |
+
const [isDragging, setIsDragging] = useState(false);
|
| 104 |
+
const [showSidebar, setShowSidebar] = useState(true);
|
| 105 |
+
const [enableWebSearch, setEnableWebSearch] = useState(() => getFromLocalStorage('enableWebSearch', false));
|
| 106 |
+
const [webSearchOnly, setWebSearchOnly] = useState(() => getFromLocalStorage('webSearchOnly', false));
|
| 107 |
+
const [urlInput, setUrlInput] = useState('');
|
| 108 |
+
const [uploadMode, setUploadMode] = useState('file');
|
| 109 |
+
const [fastMode, setFastMode] = useState(() => getFromLocalStorage('fastMode', false));
|
| 110 |
+
const [enableCache, setEnableCache] = useState(() => getFromLocalStorage('enableCache', true));
|
| 111 |
+
const [enableQueryImprovement, setEnableQueryImprovement] = useState(() => getFromLocalStorage('enableQueryImprovement', true));
|
| 112 |
+
const [enableVerification, setEnableVerification] = useState(() => getFromLocalStorage('enableVerification', true));
|
| 113 |
+
const [typingSpeed] = useState(0);
|
| 114 |
+
|
| 115 |
+
const messagesEndRef = useRef(null);
|
| 116 |
+
const fileInputRef = useRef(null);
|
| 117 |
+
const typingIntervalRef = useRef(null);
|
| 118 |
+
|
| 119 |
+
// Theme classes based on dark mode
|
| 120 |
+
const theme = {
|
| 121 |
+
bg: darkMode ? 'bg-[#0D0D0D]' : 'bg-white',
|
| 122 |
+
bgSecondary: darkMode ? 'bg-[#171717]' : 'bg-gray-50',
|
| 123 |
+
bgTertiary: darkMode ? 'bg-[#252525]' : 'bg-white',
|
| 124 |
+
text: darkMode ? 'text-gray-100' : 'text-gray-900',
|
| 125 |
+
textSecondary: darkMode ? 'text-gray-400' : 'text-gray-600',
|
| 126 |
+
textMuted: darkMode ? 'text-gray-500' : 'text-gray-500',
|
| 127 |
+
border: darkMode ? 'border-gray-800' : 'border-gray-200',
|
| 128 |
+
borderLight: darkMode ? 'border-gray-700' : 'border-gray-300',
|
| 129 |
+
hover: darkMode ? 'hover:bg-[#252525]' : 'hover:bg-gray-100',
|
| 130 |
+
active: darkMode ? 'bg-[#252525]' : 'bg-blue-50',
|
| 131 |
+
userMessage: darkMode ? 'bg-blue-600' : 'bg-blue-600',
|
| 132 |
+
assistantMessage: darkMode ? 'bg-[#252525]' : 'bg-gray-100',
|
| 133 |
+
input: darkMode ? 'bg-[#171717] border-gray-700 text-gray-100' : 'bg-white border-gray-300 text-gray-900',
|
| 134 |
+
button: darkMode ? 'bg-[#252525] hover:bg-[#2D2D2D]' : 'bg-gray-100 hover:bg-gray-200'
|
| 135 |
+
};
|
| 136 |
+
|
| 137 |
+
const scrollToBottom = () => {
|
| 138 |
+
messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' });
|
| 139 |
+
};
|
| 140 |
+
|
| 141 |
+
useEffect(() => {
|
| 142 |
+
scrollToBottom();
|
| 143 |
+
}, [messages]);
|
| 144 |
+
|
| 145 |
+
// Persist to localStorage
|
| 146 |
+
useEffect(() => {
|
| 147 |
+
try {
|
| 148 |
+
window.localStorage.setItem('darkMode', JSON.stringify(darkMode));
|
| 149 |
+
window.localStorage.setItem('chatMessages', JSON.stringify(messages));
|
| 150 |
+
window.localStorage.setItem('selectedDomain', JSON.stringify(selectedDomain));
|
| 151 |
+
window.localStorage.setItem('processingDocs', JSON.stringify(processingDocs));
|
| 152 |
+
window.localStorage.setItem('enableWebSearch', JSON.stringify(enableWebSearch));
|
| 153 |
+
window.localStorage.setItem('webSearchOnly', JSON.stringify(webSearchOnly));
|
| 154 |
+
window.localStorage.setItem('fastMode', JSON.stringify(fastMode));
|
| 155 |
+
window.localStorage.setItem('enableCache', JSON.stringify(enableCache));
|
| 156 |
+
window.localStorage.setItem('enableQueryImprovement', JSON.stringify(enableQueryImprovement));
|
| 157 |
+
window.localStorage.setItem('enableVerification', JSON.stringify(enableVerification));
|
| 158 |
+
} catch (error) {
|
| 159 |
+
console.error('Error saving to localStorage:', error);
|
| 160 |
+
}
|
| 161 |
+
}, [darkMode, messages, selectedDomain, processingDocs, enableWebSearch, webSearchOnly, fastMode, enableCache, enableQueryImprovement, enableVerification]);
|
| 162 |
+
|
| 163 |
+
// Fetch processed documents
|
| 164 |
+
const fetchProcessedDocuments = useCallback(async () => {
|
| 165 |
+
try {
|
| 166 |
+
const response = await fetch(`${API_BASE_URL}/documents?domain=${selectedDomain}`);
|
| 167 |
+
if (response.ok) {
|
| 168 |
+
const data = await response.json();
|
| 169 |
+
const fetchedDocs = data.documents || [];
|
| 170 |
+
setProcessedDocs(prev => {
|
| 171 |
+
const fetchedIds = new Set(fetchedDocs.map(d => d.id));
|
| 172 |
+
const recentlyAdded = prev.filter(d => d.id && !fetchedIds.has(d.id));
|
| 173 |
+
return [...fetchedDocs, ...recentlyAdded];
|
| 174 |
+
});
|
| 175 |
+
}
|
| 176 |
+
} catch (err) {
|
| 177 |
+
console.error('Error fetching documents:', err);
|
| 178 |
+
}
|
| 179 |
+
}, [selectedDomain]);
|
| 180 |
+
|
| 181 |
+
// Check processing status
|
| 182 |
+
const checkProcessingStatus = useCallback(async () => {
|
| 183 |
+
const updatedProcessing = [];
|
| 184 |
+
for (const doc of processingDocs) {
|
| 185 |
+
try {
|
| 186 |
+
const response = await fetch(`${API_BASE_URL}/status/${doc.processingId}`);
|
| 187 |
+
if (response.ok) {
|
| 188 |
+
const status = await response.json();
|
| 189 |
+
if (status.status === 'completed') {
|
| 190 |
+
setProcessedDocs(prev => [...prev, { ...doc, id: doc.processingId, status: 'completed' }]);
|
| 191 |
+
} else if (status.status === 'failed') {
|
| 192 |
+
setError(`Processing failed for ${doc.name}: ${status.error}`);
|
| 193 |
+
} else {
|
| 194 |
+
updatedProcessing.push({ ...doc, status: status.status });
|
| 195 |
+
}
|
| 196 |
+
}
|
| 197 |
+
} catch (err) {
|
| 198 |
+
console.error('Error checking status:', err);
|
| 199 |
+
}
|
| 200 |
+
}
|
| 201 |
+
setProcessingDocs(updatedProcessing);
|
| 202 |
+
}, [processingDocs]);
|
| 203 |
+
|
| 204 |
+
  // Re-sync the processed-document list whenever the selected domain changes
  // (fetchProcessedDocuments itself depends on selectedDomain).
  useEffect(() => {
    fetchProcessedDocuments();
  }, [selectedDomain, fetchProcessedDocuments]);
|
| 207 |
+
|
| 208 |
+
useEffect(() => {
|
| 209 |
+
const interval = setInterval(() => {
|
| 210 |
+
if (processingDocs.length > 0) {
|
| 211 |
+
checkProcessingStatus();
|
| 212 |
+
}
|
| 213 |
+
}, 3000);
|
| 214 |
+
return () => clearInterval(interval);
|
| 215 |
+
}, [processingDocs, checkProcessingStatus]);
|
| 216 |
+
|
| 217 |
+
  // API Functions

  // Validate each selected file against the current domain's allowed
  // extensions, upload it, and register accepted files for status polling.
  // Uploads run sequentially; a rejected or failed file sets the error banner
  // and the loop continues with the next file.
  const handleFileUpload = async (files) => {
    if (!files || files.length === 0) return;
    setError(null);
    const newProcessingDocs = [];

    for (const file of files) {
      // NOTE(review): for a dotless filename, split('.').pop() returns the
      // whole name, so the check below rejects it with a type-error message —
      // confirm that is the intended behavior.
      const fileExt = '.' + file.name.split('.').pop().toLowerCase();
      const allowedTypes = DOMAIN_CONFIGS[selectedDomain].fileTypes;

      if (!allowedTypes.includes(fileExt)) {
        setError(`File type ${fileExt} not supported for ${selectedDomain} domain.`);
        continue;
      }

      const formData = new FormData();
      formData.append('file', file);
      formData.append('domain', selectedDomain);

      try {
        const response = await fetch(`${API_BASE_URL}/upload`, {
          method: 'POST',
          body: formData
        });

        const data = await response.json();
        if (response.ok) {
          // Track the accepted upload so the polling effect can follow it.
          newProcessingDocs.push({
            name: file.name,
            domain: selectedDomain,
            processingId: data.processing_id,
            status: 'processing',
            uploadedAt: new Date().toISOString()
          });
        } else {
          setError(data.detail || 'Upload failed');
        }
      } catch (err) {
        console.error('Upload error:', err);
        setError(`Failed to upload ${file.name}: ${err.message}`);
      }
    }

    setProcessingDocs(prev => [...prev, ...newProcessingDocs]);
    setShowUploadModal(false);
  };
|
| 263 |
+
|
| 264 |
+
  // Submit a URL for server-side fetching/ingestion into the current domain.
  // On success the URL is tracked like an uploaded file and the modal closes;
  // on failure the error banner is set and the input is left intact.
  const handleUrlUpload = async () => {
    if (!urlInput.trim()) {
      setError('Please enter a valid URL');
      return;
    }

    setError(null);

    try {
      const response = await fetch(`${API_BASE_URL}/upload-url`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          url: urlInput,
          domain: selectedDomain,
          convert_to_markdown: true
        })
      });

      const data = await response.json();
      if (response.ok) {
        // Track by URL; status polling uses the returned processing_id.
        setProcessingDocs(prev => [...prev, {
          name: urlInput,
          domain: selectedDomain,
          processingId: data.processing_id,
          status: 'processing',
          uploadedAt: new Date().toISOString()
        }]);
        setUrlInput('');
        setShowUploadModal(false);
      } else {
        setError(data.detail || 'URL upload failed');
      }
    } catch (err) {
      console.error('URL upload error:', err);
      setError(`Failed to upload URL: ${err.message}`);
    }
  };
|
| 302 |
+
|
| 303 |
+
  // Start the typewriter animation for the assistant message at messageIndex.
  // targetTextRef holds the full text streamed so far; isStreamingRef says
  // whether the SSE stream is still producing tokens. The interval reveals a
  // few more characters each tick and stops only after streaming has ended
  // AND the full buffered text is displayed.
  const startTypingEffect = useCallback((messageIndex, targetTextRef, isStreamingRef) => {
    // At most one typewriter interval at a time.
    if (typingIntervalRef.current) {
      clearInterval(typingIntervalRef.current);
    }

    let displayedLength = 0;

    typingIntervalRef.current = setInterval(() => {
      const targetText = targetTextRef.current || '';
      const isStillStreaming = isStreamingRef.current;

      if (displayedLength < targetText.length) {
        // Reveal speed scales with typingSpeed, at least one char per tick.
        const charsToAdd = Math.max(1, Math.floor(typingSpeed / 10));
        displayedLength = Math.min(displayedLength + charsToAdd, targetText.length);

        setMessages(prev => {
          const newMessages = [...prev];
          if (newMessages[messageIndex]) {
            newMessages[messageIndex] = {
              ...newMessages[messageIndex],
              content: targetText.substring(0, displayedLength)
            };
          }
          return newMessages;
        });
      } else if (!isStillStreaming && displayedLength >= targetText.length) {
        // Caught up and the stream is finished — stop animating.
        clearInterval(typingIntervalRef.current);
        typingIntervalRef.current = null;
      }
    }, 30);
  }, [typingSpeed]);
|
| 334 |
+
|
| 335 |
+
useEffect(() => {
|
| 336 |
+
return () => {
|
| 337 |
+
if (typingIntervalRef.current) {
|
| 338 |
+
clearInterval(typingIntervalRef.current);
|
| 339 |
+
}
|
| 340 |
+
};
|
| 341 |
+
}, []);
|
| 342 |
+
|
| 343 |
+
const handleQuery = async () => {
|
| 344 |
+
if (!query.trim()) return;
|
| 345 |
+
|
| 346 |
+
setError(null);
|
| 347 |
+
setIsQuerying(true);
|
| 348 |
+
|
| 349 |
+
const userMessage = { role: 'user', content: query };
|
| 350 |
+
setMessages(prev => [...prev, userMessage]);
|
| 351 |
+
const currentQuery = query;
|
| 352 |
+
setQuery('');
|
| 353 |
+
|
| 354 |
+
const assistantMessageIndex = messages.length + 1;
|
| 355 |
+
setMessages(prev => [...prev, {
|
| 356 |
+
role: 'assistant',
|
| 357 |
+
content: '',
|
| 358 |
+
streaming: true,
|
| 359 |
+
verification: null
|
| 360 |
+
}]);
|
| 361 |
+
|
| 362 |
+
const fullTextBufferRef = { current: '' };
|
| 363 |
+
const isStreamingRef = { current: true };
|
| 364 |
+
let typingStarted = false;
|
| 365 |
+
|
| 366 |
+
try {
|
| 367 |
+
const response = await fetch(`${API_BASE_URL}/query/stream`, {
|
| 368 |
+
method: 'POST',
|
| 369 |
+
headers: { 'Content-Type': 'application/json' },
|
| 370 |
+
body: JSON.stringify({
|
| 371 |
+
query: currentQuery,
|
| 372 |
+
domain: selectedDomain,
|
| 373 |
+
enable_verification: true,
|
| 374 |
+
enable_web_search: enableWebSearch,
|
| 375 |
+
web_search_only: webSearchOnly,
|
| 376 |
+
fast_mode: fastMode,
|
| 377 |
+
enable_cache: enableCache,
|
| 378 |
+
enable_query_improvement: enableQueryImprovement,
|
| 379 |
+
enable_verification_check: enableVerification
|
| 380 |
+
})
|
| 381 |
+
});
|
| 382 |
+
|
| 383 |
+
if (!response.ok) {
|
| 384 |
+
throw new Error(`HTTP error! status: ${response.status}`);
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
const reader = response.body.getReader();
|
| 388 |
+
const decoder = new TextDecoder();
|
| 389 |
+
let buffer = '';
|
| 390 |
+
|
| 391 |
+
while (true) {
|
| 392 |
+
const { done, value } = await reader.read();
|
| 393 |
+
|
| 394 |
+
if (done) {
|
| 395 |
+
break;
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
buffer += decoder.decode(value, { stream: true });
|
| 399 |
+
|
| 400 |
+
const events = buffer.split('\n\n');
|
| 401 |
+
buffer = events.pop() || '';
|
| 402 |
+
|
| 403 |
+
for (const event of events) {
|
| 404 |
+
if (!event.trim()) continue;
|
| 405 |
+
|
| 406 |
+
const lines = event.split('\n');
|
| 407 |
+
let eventType = 'message';
|
| 408 |
+
let eventData = '';
|
| 409 |
+
|
| 410 |
+
for (const line of lines) {
|
| 411 |
+
if (line.startsWith('event:')) {
|
| 412 |
+
eventType = line.substring(6).trim();
|
| 413 |
+
} else if (line.startsWith('data:')) {
|
| 414 |
+
eventData = line.substring(5).trim();
|
| 415 |
+
}
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
if (eventData) {
|
| 419 |
+
const data = JSON.parse(eventData);
|
| 420 |
+
|
| 421 |
+
if (eventType === 'token') {
|
| 422 |
+
fullTextBufferRef.current += data.content;
|
| 423 |
+
|
| 424 |
+
if (!typingStarted && typingSpeed > 0) {
|
| 425 |
+
typingStarted = true;
|
| 426 |
+
startTypingEffect(assistantMessageIndex, fullTextBufferRef, isStreamingRef);
|
| 427 |
+
} else if (typingSpeed === 0) {
|
| 428 |
+
setMessages(prev => {
|
| 429 |
+
const newMessages = [...prev];
|
| 430 |
+
newMessages[assistantMessageIndex] = {
|
| 431 |
+
...newMessages[assistantMessageIndex],
|
| 432 |
+
content: fullTextBufferRef.current
|
| 433 |
+
};
|
| 434 |
+
return newMessages;
|
| 435 |
+
});
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
} else if (eventType === 'verification') {
|
| 439 |
+
setMessages(prev => {
|
| 440 |
+
const newMessages = [...prev];
|
| 441 |
+
newMessages[assistantMessageIndex] = {
|
| 442 |
+
...newMessages[assistantMessageIndex],
|
| 443 |
+
verification: data.content,
|
| 444 |
+
streaming: false
|
| 445 |
+
};
|
| 446 |
+
return newMessages;
|
| 447 |
+
});
|
| 448 |
+
|
| 449 |
+
} else if (eventType === 'done') {
|
| 450 |
+
isStreamingRef.current = false;
|
| 451 |
+
|
| 452 |
+
setTimeout(() => {
|
| 453 |
+
if (typingIntervalRef.current) {
|
| 454 |
+
clearInterval(typingIntervalRef.current);
|
| 455 |
+
typingIntervalRef.current = null;
|
| 456 |
+
}
|
| 457 |
+
|
| 458 |
+
setMessages(prev => {
|
| 459 |
+
const newMessages = [...prev];
|
| 460 |
+
newMessages[assistantMessageIndex] = {
|
| 461 |
+
...newMessages[assistantMessageIndex],
|
| 462 |
+
streaming: false,
|
| 463 |
+
content: fullTextBufferRef.current
|
| 464 |
+
};
|
| 465 |
+
return newMessages;
|
| 466 |
+
});
|
| 467 |
+
}, typingSpeed === 0 ? 0 : 500);
|
| 468 |
+
|
| 469 |
+
} else if (eventType === 'error') {
|
| 470 |
+
const errorMessage = data.content.message || 'An error occurred';
|
| 471 |
+
const errorSuggestion = data.content.suggestion || '';
|
| 472 |
+
setError(errorSuggestion ? `${errorMessage}\n\n${errorSuggestion}` : errorMessage);
|
| 473 |
+
|
| 474 |
+
isStreamingRef.current = false;
|
| 475 |
+
|
| 476 |
+
if (typingIntervalRef.current) {
|
| 477 |
+
clearInterval(typingIntervalRef.current);
|
| 478 |
+
typingIntervalRef.current = null;
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
setMessages(prev => {
|
| 482 |
+
const newMessages = [...prev];
|
| 483 |
+
newMessages[assistantMessageIndex] = {
|
| 484 |
+
...newMessages[assistantMessageIndex],
|
| 485 |
+
content: fullTextBufferRef.current || errorMessage,
|
| 486 |
+
streaming: false,
|
| 487 |
+
error: true
|
| 488 |
+
};
|
| 489 |
+
return newMessages;
|
| 490 |
+
});
|
| 491 |
+
break;
|
| 492 |
+
}
|
| 493 |
+
}
|
| 494 |
+
}
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
} catch (err) {
|
| 498 |
+
console.error('Query error:', err);
|
| 499 |
+
setError(`Query failed: ${err.message}`);
|
| 500 |
+
|
| 501 |
+
if (typingIntervalRef.current) {
|
| 502 |
+
clearInterval(typingIntervalRef.current);
|
| 503 |
+
typingIntervalRef.current = null;
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
setMessages(prev => {
|
| 507 |
+
const newMessages = [...prev];
|
| 508 |
+
if (newMessages[assistantMessageIndex]) {
|
| 509 |
+
newMessages[assistantMessageIndex] = {
|
| 510 |
+
...newMessages[assistantMessageIndex],
|
| 511 |
+
content: newMessages[assistantMessageIndex].content || '[Error occurred]',
|
| 512 |
+
streaming: false,
|
| 513 |
+
error: true
|
| 514 |
+
};
|
| 515 |
+
}
|
| 516 |
+
return newMessages;
|
| 517 |
+
});
|
| 518 |
+
} finally {
|
| 519 |
+
setIsQuerying(false);
|
| 520 |
+
}
|
| 521 |
+
};
|
| 522 |
+
|
| 523 |
+
const handleKeyPress = (e) => {
|
| 524 |
+
if (e.key === 'Enter' && !e.shiftKey) {
|
| 525 |
+
e.preventDefault();
|
| 526 |
+
handleQuery();
|
| 527 |
+
}
|
| 528 |
+
};
|
| 529 |
+
|
| 530 |
+
const handleDeleteDocument = async (docId, docName) => {
|
| 531 |
+
if (!docId) {
|
| 532 |
+
console.error('Document ID is undefined');
|
| 533 |
+
setError('Cannot delete document: ID is missing');
|
| 534 |
+
return;
|
| 535 |
+
}
|
| 536 |
+
|
| 537 |
+
const confirmed = window.confirm(
|
| 538 |
+
`Are you sure you want to delete "${docName || 'this document'}"?\n\nThis action cannot be undone.`
|
| 539 |
+
);
|
| 540 |
+
|
| 541 |
+
if (!confirmed) {
|
| 542 |
+
return;
|
| 543 |
+
}
|
| 544 |
+
|
| 545 |
+
try {
|
| 546 |
+
const response = await fetch(`${API_BASE_URL}/documents/${docId}`, {
|
| 547 |
+
method: 'DELETE'
|
| 548 |
+
});
|
| 549 |
+
|
| 550 |
+
const data = await response.json();
|
| 551 |
+
|
| 552 |
+
if (response.ok && data.success) {
|
| 553 |
+
setProcessedDocs(prev => prev.filter(doc => doc.id !== docId));
|
| 554 |
+
await fetchProcessedDocuments();
|
| 555 |
+
} else {
|
| 556 |
+
const errorMsg = data.message || data.detail || 'Failed to delete document';
|
| 557 |
+
setError(errorMsg);
|
| 558 |
+
}
|
| 559 |
+
} catch (err) {
|
| 560 |
+
console.error('Error deleting document:', err);
|
| 561 |
+
setError('Failed to delete document: ' + err.message);
|
| 562 |
+
}
|
| 563 |
+
};
|
| 564 |
+
|
| 565 |
+
const handleDragOver = (e) => {
|
| 566 |
+
e.preventDefault();
|
| 567 |
+
setIsDragging(true);
|
| 568 |
+
};
|
| 569 |
+
|
| 570 |
+
const handleDragLeave = (e) => {
|
| 571 |
+
e.preventDefault();
|
| 572 |
+
setIsDragging(false);
|
| 573 |
+
};
|
| 574 |
+
|
| 575 |
+
const handleDrop = (e) => {
|
| 576 |
+
e.preventDefault();
|
| 577 |
+
setIsDragging(false);
|
| 578 |
+
handleFileUpload(e.dataTransfer.files);
|
| 579 |
+
};
|
| 580 |
+
|
| 581 |
+
  // =============================================================================
  // Render Functions
  // =============================================================================

  // Top navigation bar: brand + current domain, view tabs (Chat/Files/Settings),
  // and dark-mode / sidebar toggles.
  const renderNavigation = () => (
    <nav className={`${theme.bgTertiary} ${theme.border} border-b px-6 py-3`}>
      <div className="flex items-center justify-between max-w-7xl mx-auto">
        <div className="flex items-center space-x-8">
          <div className="flex items-center space-x-3">
            <div className="flex items-center space-x-2">
              <div className={`w-8 h-8 ${darkMode ? 'bg-gradient-to-br from-purple-500 to-blue-500' : 'bg-gradient-to-br from-blue-600 to-purple-600'} rounded-lg flex items-center justify-center`}>
                <Sparkles className="w-5 h-5 text-white" />
              </div>
              <h1 className={`text-xl font-bold ${theme.text}`}>TheTruthSchool</h1>
            </div>
            <span className={`text-sm ${theme.textMuted}`}>/ {DOMAIN_CONFIGS[selectedDomain].name}</span>
          </div>

          {/* View switcher tabs; the active tab gets the highlighted style. */}
          <div className="flex items-center space-x-1">
            <button
              onClick={() => setCurrentView('app')}
              className={`px-4 py-2 text-sm font-medium rounded-md transition-colors ${
                currentView === 'app'
                  ? `${darkMode ? 'text-blue-400 bg-blue-900/30' : 'text-blue-600 bg-blue-50'}`
                  : `${theme.textSecondary} ${theme.hover}`
              }`}
            >
              Chat
            </button>
            <button
              onClick={() => setCurrentView('files')}
              className={`px-4 py-2 text-sm font-medium rounded-md transition-colors ${
                currentView === 'files'
                  ? `${darkMode ? 'text-blue-400 bg-blue-900/30' : 'text-blue-600 bg-blue-50'}`
                  : `${theme.textSecondary} ${theme.hover}`
              }`}
            >
              Files
            </button>
            <button
              onClick={() => setCurrentView('settings')}
              className={`px-4 py-2 text-sm font-medium rounded-md transition-colors ${
                currentView === 'settings'
                  ? `${darkMode ? 'text-blue-400 bg-blue-900/30' : 'text-blue-600 bg-blue-50'}`
                  : `${theme.textSecondary} ${theme.hover}`
              }`}
            >
              Settings
            </button>
          </div>
        </div>

        {/* Right-side toggles: theme and sidebar visibility. */}
        <div className="flex items-center space-x-2">
          <button
            onClick={() => setDarkMode(!darkMode)}
            className={`p-2 ${theme.textSecondary} ${theme.hover} rounded-md transition-colors`}
          >
            {darkMode ? <Sun className="w-5 h-5" /> : <Moon className="w-5 h-5" />}
          </button>
          <button
            onClick={() => setShowSidebar(!showSidebar)}
            className={`p-2 ${theme.textSecondary} ${theme.hover} rounded-md transition-colors`}
          >
            {showSidebar ? <X className="w-5 h-5" /> : <Menu className="w-5 h-5" />}
          </button>
        </div>
      </div>
    </nav>
  );
|
| 650 |
+
|
| 651 |
+
  // Collapsible sidebar: domain picker, in-flight processing list, processed
  // documents (with hover-delete), and a clear-chat action. Width animates to
  // zero when hidden.
  const renderSidebar = () => (
    <div className={`${showSidebar ? 'w-64' : 'w-0'} transition-all duration-300 ${theme.bgSecondary} ${theme.border} border-r overflow-hidden`}>
      <div className="p-4 space-y-4">
        {/* Domain selector. */}
        <div>
          <h3 className={`text-xs font-semibold ${theme.textMuted} uppercase mb-3`}>Domains</h3>
          <div className="space-y-1">
            {Object.entries(DOMAIN_CONFIGS).map(([key, config]) => (
              <button
                key={key}
                onClick={() => setSelectedDomain(key)}
                className={`w-full flex items-center space-x-3 px-3 py-2 rounded-lg text-sm transition-colors ${
                  selectedDomain === key
                    ? `${darkMode ? 'bg-blue-900/30 text-blue-400' : 'bg-blue-50 text-blue-700'} font-medium`
                    : `${theme.textSecondary} ${theme.hover}`
                }`}
              >
                <span className="text-lg">{config.icon}</span>
                <span className="flex-1 text-left truncate">{config.name}</span>
              </button>
            ))}
          </div>
        </div>

        {/* Documents currently being processed (spinner per doc). */}
        {processingDocs.length > 0 && (
          <div>
            <h3 className={`text-xs font-semibold ${theme.textMuted} uppercase mb-3`}>Processing</h3>
            <div className="space-y-2">
              {processingDocs.map((doc, idx) => (
                <div key={idx} className={`flex items-center space-x-2 px-3 py-2 ${darkMode ? 'bg-yellow-900/20' : 'bg-yellow-50'} rounded-lg`}>
                  <Loader2 className={`w-4 h-4 ${darkMode ? 'text-yellow-400' : 'text-yellow-600'} animate-spin`} />
                  <span className={`text-xs ${darkMode ? 'text-yellow-300' : 'text-yellow-800'} truncate flex-1`}>{doc.name}</span>
                </div>
              ))}
            </div>
          </div>
        )}

        {/* Processed documents; delete button appears on row hover. */}
        {processedDocs.length > 0 && (
          <div>
            <h3 className={`text-xs font-semibold ${theme.textMuted} uppercase mb-3`}>
              Documents ({processedDocs.length})
            </h3>
            <div className="space-y-1 max-h-64 overflow-y-auto">
              {processedDocs.map((doc, idx) => (
                <div key={idx} className={`flex items-center space-x-2 px-3 py-2 ${theme.bgTertiary} rounded-lg ${theme.border} border group`}>
                  <FileText className={`w-4 h-4 ${theme.textMuted}`} />
                  <span className={`text-xs ${theme.textSecondary} truncate flex-1`}>{doc.name || `Document ${idx + 1}`}</span>
                  <button
                    onClick={() => handleDeleteDocument(doc.id, doc.name)}
                    className="opacity-0 group-hover:opacity-100 transition-opacity"
                  >
                    <Trash2 className={`w-3 h-3 ${theme.textMuted} hover:text-red-600`} />
                  </button>
                </div>
              ))}
            </div>
          </div>
        )}

        {/* Clear chat history (with confirmation) — only shown when non-empty. */}
        {messages.length > 0 && (
          <div className={`pt-4 ${theme.border} border-t`}>
            <button
              onClick={() => {
                if (window.confirm('Clear all chat history? This cannot be undone.')) {
                  setMessages([]);
                  window.localStorage.removeItem('chatMessages');
                }
              }}
              className={`w-full flex items-center justify-center space-x-2 px-3 py-2 text-sm text-red-500 hover:${darkMode ? 'bg-red-900/20' : 'bg-red-50'} rounded-lg transition-colors`}
            >
              <Trash2 className="w-4 h-4" />
              <span>Clear Chat</span>
            </button>
          </div>
        )}
      </div>
    </div>
  );
|
| 729 |
+
|
| 730 |
+
  // Main chat view: an empty-state welcome screen when there are no messages,
  // otherwise the message transcript (markdown-rendered assistant replies),
  // plus the bottom input bar with upload/send controls and web-search toggles.
  const renderAppView = () => (
    <div className={`flex-1 flex flex-col ${theme.bg}`}>
      {messages.length === 0 ? (
        /* Empty state: brand hero plus three feature cards. */
        <div className="flex-1 flex flex-col items-center justify-center px-4">
          <div className="text-center max-w-2xl">
            <div className={`w-20 h-20 ${darkMode ? 'bg-gradient-to-br from-purple-500 to-blue-500' : 'bg-gradient-to-br from-blue-600 to-purple-600'} rounded-2xl flex items-center justify-center mx-auto mb-6 shadow-lg`}>
              <Sparkles className="w-10 h-10 text-white" />
            </div>
            <h2 className={`text-4xl font-bold ${theme.text} mb-3`}>TheTruthSchool AI</h2>
            <p className={`${theme.textSecondary} mb-8 text-lg`}>
              Your intelligent assistant for document analysis and knowledge discovery
            </p>

            <div className="grid grid-cols-3 gap-4 text-left">
              <div className={`p-5 ${theme.bgSecondary} rounded-xl ${theme.border} border`}>
                <div className="text-3xl mb-3">📚</div>
                <h3 className={`font-semibold ${theme.text} mb-2`}>Smart Upload</h3>
                <p className={`text-sm ${theme.textSecondary}`}>Process PDFs, documents, and web content</p>
              </div>
              <div className={`p-5 ${theme.bgSecondary} rounded-xl ${theme.border} border`}>
                <div className="text-3xl mb-3">🧠</div>
                <h3 className={`font-semibold ${theme.text} mb-2`}>Deep Understanding</h3>
                <p className={`text-sm ${theme.textSecondary}`}>Advanced RAG with knowledge graphs</p>
              </div>
              <div className={`p-5 ${theme.bgSecondary} rounded-xl ${theme.border} border`}>
                <div className="text-3xl mb-3">✨</div>
                <h3 className={`font-semibold ${theme.text} mb-2`}>Multi-Domain</h3>
                <p className={`text-sm ${theme.textSecondary}`}>Optimized for healthcare, legal, finance & more</p>
              </div>
            </div>
          </div>
        </div>
      ) : (
        /* Transcript: user bubbles are plain text; assistant bubbles render
           markdown with custom code/link/table components. */
        <div className="flex-1 overflow-y-auto px-4 py-6">
          <div className="max-w-3xl mx-auto space-y-6">
            {messages.map((msg, idx) => (
              <div key={idx} className={`flex ${msg.role === 'user' ? 'justify-end' : 'justify-start'}`}>
                <div className={`max-w-[80%] ${msg.role === 'user' ? 'bg-blue-600 text-white' : `${theme.assistantMessage} ${theme.text}`} rounded-2xl px-5 py-4 shadow-sm`}>
                  {msg.role === 'user' ? (
                    <p className="text-sm whitespace-pre-wrap">{msg.content}</p>
                  ) : (
                    <div className={`text-sm prose prose-sm max-w-none ${darkMode ? 'prose-invert' : ''}`}>
                      <ReactMarkdown
                        remarkPlugins={[remarkGfm]}
                        rehypePlugins={[rehypeHighlight]}
                        components={{
                          /* Inline code gets a pill style; fenced blocks keep
                             the highlighter's className. */
                          code({ node, inline, className, children, ...props }) {
                            return inline ? (
                              <code className={`${darkMode ? 'bg-gray-700 text-gray-100' : 'bg-gray-200 text-gray-800'} px-1.5 py-0.5 rounded text-xs font-mono`} {...props}>
                                {children}
                              </code>
                            ) : (
                              <code className={className} {...props}>
                                {children}
                              </code>
                            );
                          },
                          /* Links open in a new tab with rel hardening. */
                          a({ node, children, ...props }) {
                            return (
                              <a className={`${darkMode ? 'text-blue-400' : 'text-blue-600'} hover:underline`} target="_blank" rel="noopener noreferrer" {...props}>
                                {children}
                              </a>
                            );
                          },
                          /* GFM tables: themed, horizontally scrollable. */
                          table: ({ node, ...props }) => (
                            <div className="overflow-x-auto my-4">
                              <table className={`min-w-full divide-y ${darkMode ? 'divide-gray-700 border-gray-700' : 'divide-gray-300 border-gray-300'} border rounded-lg`} {...props} />
                            </div>
                          ),
                          thead: ({ node, ...props }) => (
                            <thead className={darkMode ? 'bg-gray-800' : 'bg-gray-100'} {...props} />
                          ),
                          tbody: ({ node, ...props }) => (
                            <tbody className={`divide-y ${darkMode ? 'divide-gray-700 bg-gray-900' : 'divide-gray-200 bg-white'}`} {...props} />
                          ),
                          th: ({ node, ...props }) => (
                            <th className={`px-4 py-3 text-left text-xs font-bold uppercase tracking-wider ${darkMode ? 'text-gray-300 border-gray-700' : 'text-gray-700 border-gray-300'} border-r last:border-r-0`} {...props} />
                          ),
                          td: ({ node, ...props }) => (
                            <td className={`px-4 py-3 text-sm ${darkMode ? 'text-gray-300 border-gray-700' : 'text-gray-900 border-gray-200'} border-r last:border-r-0`} {...props} />
                          ),
                          tr: ({ node, ...props }) => (
                            <tr className={darkMode ? 'hover:bg-gray-800' : 'hover:bg-gray-50'} {...props} />
                          ),
                        }}
                      >
                        {msg.content}
                      </ReactMarkdown>
                    </div>
                  )}
                  {msg.streaming && msg.role === 'assistant' && (
                    <div className={`flex items-center space-x-1 ${theme.textMuted} text-sm mt-2`}>
                      <span>Thinking</span>
                      <span className="animate-pulse">...</span>
                    </div>
                  )}
                </div>
              </div>
            ))}
            {/* Sentinel for scrollToBottom. */}
            <div ref={messagesEndRef} />
          </div>
        </div>
      )}

      {/* Bottom Input Bar */}
      <div className={`${theme.border} border-t ${theme.bgTertiary} px-4 py-4`}>
        <div className="max-w-3xl mx-auto">
          <div className="flex items-end space-x-3">
            <button
              onClick={() => setShowUploadModal(true)}
              className={`px-4 py-3 ${darkMode ? 'bg-blue-600 hover:bg-blue-700' : 'bg-blue-600 hover:bg-blue-700'} text-white rounded-xl transition-colors flex items-center space-x-2`}
            >
              <Upload className="w-4 h-4" />
              <span className="text-sm font-medium">Upload</span>
            </button>

            <textarea
              value={query}
              onChange={(e) => setQuery(e.target.value)}
              onKeyDown={handleKeyPress}
              placeholder="Message TheTruthSchool..."
              className={`flex-1 px-4 py-3 ${theme.input} rounded-xl focus:outline-none focus:ring-2 focus:ring-blue-500 resize-none`}
              disabled={isQuerying}
              rows={1}
              style={{ minHeight: '48px', maxHeight: '200px' }}
            />

            <button
              onClick={handleQuery}
              disabled={isQuerying || !query.trim()}
              className={`p-3 ${darkMode ? 'bg-blue-600 hover:bg-blue-700' : 'bg-blue-600 hover:bg-blue-700'} text-white rounded-xl transition-colors disabled:opacity-50 disabled:cursor-not-allowed`}
            >
              <Send className="w-5 h-5" />
            </button>
          </div>

          {/* Mutually exclusive web-search toggles: checking one clears the other. */}
          <div className="flex items-center justify-center space-x-6 mt-3">
            <label className="flex items-center space-x-2 cursor-pointer">
              <input
                type="checkbox"
                checked={enableWebSearch}
                onChange={(e) => {
                  setEnableWebSearch(e.target.checked);
                  if (e.target.checked && webSearchOnly) {
                    setWebSearchOnly(false);
                  }
                }}
                className="w-4 h-4 text-blue-600 rounded focus:ring-blue-500"
              />
              <span className={`text-sm ${theme.textSecondary}`}>Enhance with Web Search</span>
            </label>
            <label className="flex items-center space-x-2 cursor-pointer">
              <input
                type="checkbox"
                checked={webSearchOnly}
                onChange={(e) => {
                  setWebSearchOnly(e.target.checked);
                  if (e.target.checked) {
                    setEnableWebSearch(false);
                  }
                }}
                className="w-4 h-4 text-blue-600 rounded focus:ring-blue-500"
              />
              <span className={`text-sm ${theme.textSecondary}`}>Web Search Only</span>
            </label>
          </div>

          <p className={`text-xs ${theme.textMuted} mt-2 text-center`}>
            Press Enter to send • Shift+Enter for new line
          </p>
        </div>
      </div>
    </div>
  );
|
| 904 |
+
|
| 905 |
+
// Renders the "Files" view: a document-management screen with a header
// (refresh + upload actions), a section for documents currently being
// processed, and a grid of processed documents with per-card delete.
// Pure render helper — all state (processingDocs, processedDocs, theme,
// darkMode, selectedDomain) and handlers come from the enclosing
// component scope.
const renderFilesView = () => (
  <div className={`flex-1 overflow-y-auto p-6 ${theme.bg}`}>
    <div className="max-w-5xl mx-auto">
      {/* Header row: title + refresh / upload buttons */}
      <div className="flex items-center justify-between mb-6">
        <div>
          <h2 className={`text-2xl font-bold ${theme.text}`}>Document Management</h2>
          <p className={theme.textSecondary}>Manage your uploaded and processed documents</p>
        </div>
        <div className="flex space-x-3">
          <button
            onClick={fetchProcessedDocuments}
            className={`flex items-center space-x-2 px-4 py-2 ${theme.button} ${theme.text} rounded-lg transition-colors`}
          >
            <RefreshCw className="w-4 h-4" />
            <span>Refresh</span>
          </button>
          <button
            onClick={() => setShowUploadModal(true)}
            className="flex items-center space-x-2 px-4 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors"
          >
            <Upload className="w-4 h-4" />
            <span>Upload Documents</span>
          </button>
        </div>
      </div>

      {/* In-flight uploads: shown only while at least one doc is processing */}
      {processingDocs.length > 0 && (
        <div className="mb-6">
          <h3 className={`text-lg font-semibold ${theme.text} mb-3`}>Processing Documents</h3>
          <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
            {processingDocs.map((doc, idx) => (
              <div key={idx} className={`flex items-center space-x-4 p-4 ${darkMode ? 'bg-yellow-900/20 border-yellow-800' : 'bg-yellow-50 border-yellow-200'} border rounded-lg`}>
                <Loader2 className={`w-8 h-8 ${darkMode ? 'text-yellow-400' : 'text-yellow-600'} animate-spin`} />
                <div className="flex-1">
                  <p className={`font-medium ${theme.text}`}>{doc.name}</p>
                  <p className={`text-sm ${theme.textSecondary}`}>Processing...</p>
                </div>
              </div>
            ))}
          </div>
        </div>
      )}

      {/* Processed documents: empty-state CTA or responsive card grid */}
      <div>
        <h3 className={`text-lg font-semibold ${theme.text} mb-3`}>
          Processed Documents ({processedDocs.length})
        </h3>
        {processedDocs.length === 0 ? (
          <div className={`text-center py-12 ${theme.bgSecondary} rounded-lg`}>
            <FolderOpen className={`w-16 h-16 ${theme.textMuted} mx-auto mb-4`} />
            <p className={theme.textSecondary}>No documents processed yet</p>
            <button
              onClick={() => setShowUploadModal(true)}
              className="mt-4 px-6 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors"
            >
              Upload Your First Document
            </button>
          </div>
        ) : (
          <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
            {processedDocs.map((doc, idx) => (
              <div key={idx} className={`p-4 ${theme.bgTertiary} ${theme.border} border rounded-lg hover:shadow-lg transition-all group`}>
                <div className="flex items-start justify-between mb-3">
                  <FileText className="w-8 h-8 text-blue-600" />
                  {/* Delete button only becomes visible on card hover */}
                  <button
                    onClick={() => handleDeleteDocument(doc.id, doc.name)}
                    className="opacity-0 group-hover:opacity-100 transition-opacity p-1 hover:bg-gray-700 rounded"
                  >
                    <Trash2 className={`w-4 h-4 ${theme.textMuted} hover:text-red-500`} />
                  </button>
                </div>
                <p className={`font-medium ${theme.text} mb-1 truncate`} title={doc.name}>{doc.name || `Document ${idx + 1}`}</p>
                {/* NOTE(review): fallback shows the raw selectedDomain key, not a
                    display name — presumably intentional when doc.domain is
                    unknown to DOMAIN_CONFIGS; confirm against backend payload. */}
                <p className={`text-sm ${theme.textSecondary} mb-2`}>{DOMAIN_CONFIGS[doc.domain]?.name || selectedDomain}</p>
                <div className="flex items-center space-x-2">
                  <CheckCircle className="w-4 h-4 text-green-500" />
                  <span className={`text-xs ${theme.textSecondary}`}>Processed</span>
                </div>
              </div>
            ))}
          </div>
        )}
      </div>
    </div>
  </div>
);
|
| 990 |
+
|
| 991 |
+
// Renders the "Settings" view: theme toggle, active-domain selector, and
// four performance-related feature flags (fast mode, query caching, query
// improvement, answer verification). Pure render helper — all state and
// setters come from the enclosing component scope.
const renderSettingsView = () => (
  <div className={`flex-1 overflow-y-auto p-6 ${theme.bg}`}>
    <div className="max-w-3xl mx-auto">
      <h2 className={`text-2xl font-bold ${theme.text} mb-6`}>Settings</h2>

      <div className="space-y-6">
        {/* Appearance card: light/dark mode toggle */}
        <div className={`${theme.bgTertiary} ${theme.border} border rounded-lg p-6`}>
          <h3 className={`text-lg font-semibold ${theme.text} mb-4`}>Appearance</h3>
          <div className="flex items-center justify-between">
            <div>
              <label className={`block text-sm font-medium ${theme.text}`}>Theme</label>
              <p className={`text-xs ${theme.textSecondary} mt-1`}>Choose your preferred interface theme</p>
            </div>
            {/* Button label/icon show the mode you would switch TO */}
            <button
              onClick={() => setDarkMode(!darkMode)}
              className={`px-4 py-2 ${theme.button} ${theme.text} rounded-lg transition-colors flex items-center space-x-2`}
            >
              {darkMode ? (
                <>
                  <Sun className="w-4 h-4" />
                  <span>Light Mode</span>
                </>
              ) : (
                <>
                  <Moon className="w-4 h-4" />
                  <span>Dark Mode</span>
                </>
              )}
            </button>
          </div>
        </div>

        {/* Domain card: pick the active domain from DOMAIN_CONFIGS keys */}
        <div className={`${theme.bgTertiary} ${theme.border} border rounded-lg p-6`}>
          <h3 className={`text-lg font-semibold ${theme.text} mb-4`}>Domain Configuration</h3>
          <div className="space-y-3">
            <div>
              <label className={`block text-sm font-medium ${theme.text} mb-2`}>Current Domain</label>
              <select
                value={selectedDomain}
                onChange={(e) => setSelectedDomain(e.target.value)}
                className={`w-full px-4 py-2 ${theme.input} rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500`}
              >
                {Object.entries(DOMAIN_CONFIGS).map(([key, config]) => (
                  <option key={key} value={key}>{config.name}</option>
                ))}
              </select>
            </div>
          </div>
        </div>

        {/* Performance card: four independent boolean flags */}
        <div className={`${theme.bgTertiary} ${theme.border} border rounded-lg p-6`}>
          <h3 className={`text-lg font-semibold ${theme.text} mb-4`}>Performance Settings</h3>
          <div className="space-y-4">
            <div className="flex items-start space-x-3">
              <input
                type="checkbox"
                id="fastMode"
                checked={fastMode}
                onChange={(e) => setFastMode(e.target.checked)}
                className="w-5 h-5 text-blue-600 rounded focus:ring-blue-500 mt-0.5"
              />
              <div className="flex-1">
                <label htmlFor="fastMode" className={`block text-sm font-medium ${theme.text} cursor-pointer`}>
                  Fast Mode
                </label>
                <p className={`text-xs ${theme.textSecondary} mt-1`}>
                  Use optimized parameters for 2-3x faster queries
                </p>
              </div>
            </div>

            <div className="flex items-start space-x-3">
              <input
                type="checkbox"
                id="enableCache"
                checked={enableCache}
                onChange={(e) => setEnableCache(e.target.checked)}
                className="w-5 h-5 text-blue-600 rounded focus:ring-blue-500 mt-0.5"
              />
              <div className="flex-1">
                <label htmlFor="enableCache" className={`block text-sm font-medium ${theme.text} cursor-pointer`}>
                  Enable Query Caching
                </label>
                <p className={`text-xs ${theme.textSecondary} mt-1`}>
                  Cache results for faster repeated queries
                </p>
              </div>
            </div>

            <div className="flex items-start space-x-3">
              <input
                type="checkbox"
                id="enableQueryImprovement"
                checked={enableQueryImprovement}
                onChange={(e) => setEnableQueryImprovement(e.target.checked)}
                className="w-5 h-5 text-blue-600 rounded focus:ring-blue-500 mt-0.5"
              />
              <div className="flex-1">
                <label htmlFor="enableQueryImprovement" className={`block text-sm font-medium ${theme.text} cursor-pointer`}>
                  Enable Query Improvement
                </label>
                <p className={`text-xs ${theme.textSecondary} mt-1`}>
                  Automatically enhance queries for better results
                </p>
              </div>
            </div>

            <div className="flex items-start space-x-3">
              <input
                type="checkbox"
                id="enableVerification"
                checked={enableVerification}
                onChange={(e) => setEnableVerification(e.target.checked)}
                className="w-5 h-5 text-blue-600 rounded focus:ring-blue-500 mt-0.5"
              />
              <div className="flex-1">
                <label htmlFor="enableVerification" className={`block text-sm font-medium ${theme.text} cursor-pointer`}>
                  Enable Answer Verification
                </label>
                <p className={`text-xs ${theme.textSecondary} mt-1`}>
                  Verify answer quality and accuracy with dual-LLM
                </p>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
);
|
| 1121 |
+
|
| 1122 |
+
// Renders the upload modal (file-drop or URL mode). Returns null unless
// showUploadModal is set. Closing the modal also resets uploadMode to
// 'file' and clears the URL field so the next open starts fresh.
const renderUploadModal = () => {
  if (!showUploadModal) return null;

  return (
    <div className="fixed inset-0 bg-black bg-opacity-70 flex items-center justify-center z-50 p-4 backdrop-blur-sm">
      <div className={`${theme.bgTertiary} rounded-2xl max-w-2xl w-full p-6 shadow-2xl`}>
        {/* Title bar with close/reset button */}
        <div className="flex items-center justify-between mb-6">
          <h2 className={`text-2xl font-bold ${theme.text}`}>Upload Documents</h2>
          <button
            onClick={() => {
              setShowUploadModal(false);
              setUploadMode('file');
              setUrlInput('');
            }}
            className={`p-2 ${theme.hover} rounded-lg`}
          >
            <X className={`w-5 h-5 ${theme.textSecondary}`} />
          </button>
        </div>

        {/* Mode tabs: file upload vs. URL fetch */}
        <div className="flex items-center space-x-2 mb-6">
          <button
            onClick={() => setUploadMode('file')}
            className={`flex-1 px-4 py-2 rounded-lg font-medium transition-colors ${
              uploadMode === 'file'
                ? 'bg-blue-600 text-white'
                : `${theme.button} ${theme.text}`
            }`}
          >
            Upload File
          </button>
          <button
            onClick={() => setUploadMode('url')}
            className={`flex-1 px-4 py-2 rounded-lg font-medium transition-colors ${
              uploadMode === 'url'
                ? 'bg-blue-600 text-white'
                : `${theme.button} ${theme.text}`
            }`}
          >
            Upload from URL
          </button>
        </div>

        {uploadMode === 'file' ? (
          // File mode: drag-and-drop zone plus a hidden <input type="file">
          // triggered through the "Select Files" button. Accepted extensions
          // come from the currently selected domain's config.
          <div
            onDragOver={handleDragOver}
            onDragLeave={handleDragLeave}
            onDrop={handleDrop}
            className={`border-2 border-dashed rounded-xl p-12 text-center transition-colors ${
              isDragging
                ? 'border-blue-500 bg-blue-500/10'
                : `${theme.borderLight}`
            }`}
          >
            <Upload className={`w-16 h-16 ${theme.textMuted} mx-auto mb-4`} />
            <h3 className={`text-lg font-semibold ${theme.text} mb-2`}>
              Drop files here or click to browse
            </h3>
            <p className={`${theme.textSecondary} mb-4`}>
              Supported: {DOMAIN_CONFIGS[selectedDomain].fileTypes.join(', ')}
            </p>
            <input
              ref={fileInputRef}
              type="file"
              multiple
              accept={DOMAIN_CONFIGS[selectedDomain].fileTypes.join(',')}
              onChange={(e) => handleFileUpload(e.target.files)}
              className="hidden"
            />
            <button
              onClick={() => fileInputRef.current?.click()}
              className="px-6 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors"
            >
              Select Files
            </button>
          </div>
        ) : (
          // URL mode: single URL field; Enter or the button triggers the fetch
          <div className="space-y-4">
            <div>
              <label className={`block text-sm font-medium ${theme.text} mb-2`}>
                Enter URL to fetch and process
              </label>
              <input
                type="url"
                value={urlInput}
                onChange={(e) => setUrlInput(e.target.value)}
                placeholder="https://example.com/document.pdf"
                className={`w-full px-4 py-3 ${theme.input} rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500`}
                onKeyDown={(e) => {
                  if (e.key === 'Enter') {
                    handleUrlUpload();
                  }
                }}
              />
            </div>
            <button
              onClick={handleUrlUpload}
              disabled={!urlInput.trim()}
              className="w-full px-6 py-3 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
            >
              Fetch and Process URL
            </button>
          </div>
        )}
      </div>
    </div>
  );
};
|
| 1230 |
+
|
| 1231 |
+
const renderError = () => {
|
| 1232 |
+
if (!error) return null;
|
| 1233 |
+
|
| 1234 |
+
return (
|
| 1235 |
+
<div className={`fixed bottom-4 right-4 ${darkMode ? 'bg-red-900/90 border-red-800' : 'bg-red-50 border-red-200'} border rounded-lg p-4 max-w-md shadow-2xl backdrop-blur-sm`}>
|
| 1236 |
+
<div className="flex items-start space-x-3">
|
| 1237 |
+
<XCircle className="w-5 h-5 text-red-500 flex-shrink-0 mt-0.5" />
|
| 1238 |
+
<div className="flex-1">
|
| 1239 |
+
<p className={`text-sm ${darkMode ? 'text-red-200' : 'text-red-800'}`}>{error}</p>
|
| 1240 |
+
</div>
|
| 1241 |
+
<button
|
| 1242 |
+
onClick={() => setError(null)}
|
| 1243 |
+
className="text-red-500 hover:text-red-600"
|
| 1244 |
+
>
|
| 1245 |
+
<X className="w-4 h-4" />
|
| 1246 |
+
</button>
|
| 1247 |
+
</div>
|
| 1248 |
+
</div>
|
| 1249 |
+
);
|
| 1250 |
+
};
|
| 1251 |
+
|
| 1252 |
+
return (
|
| 1253 |
+
<div className={`h-screen flex flex-col ${theme.bg}`}>
|
| 1254 |
+
{renderNavigation()}
|
| 1255 |
+
|
| 1256 |
+
<div className="flex-1 flex overflow-hidden">
|
| 1257 |
+
{renderSidebar()}
|
| 1258 |
+
|
| 1259 |
+
{currentView === 'app' && renderAppView()}
|
| 1260 |
+
{currentView === 'files' && renderFilesView()}
|
| 1261 |
+
{currentView === 'settings' && renderSettingsView()}
|
| 1262 |
+
</div>
|
| 1263 |
+
|
| 1264 |
+
{renderUploadModal()}
|
| 1265 |
+
{renderError()}
|
| 1266 |
+
</div>
|
| 1267 |
+
);
|
| 1268 |
+
}
|
frontend/src/index.css
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@tailwind base;
@tailwind components;
@tailwind utilities;

/* Typing-indicator animation: each dot is invisible for most of the
   1.4s cycle and fades in around the 40% mark. */
@keyframes blink {
  0%, 80%, 100% {
    opacity: 0;
  }
  40% {
    opacity: 1;
  }
}
.animate-blink {
  animation: blink 1.4s infinite;
  animation-fill-mode: both;
}

/* Global reset: zero margins/paddings and border-box sizing everywhere. */
* {
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}

body {
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
    'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
    sans-serif;
  -webkit-font-smoothing: antialiased;
  -moz-osx-font-smoothing: grayscale;
}

code {
  font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
    monospace;
}


/* Custom scrollbar for webkit browsers */
::-webkit-scrollbar {
  width: 8px;
  height: 8px;
}

/* NOTE(review): scrollbar colors are fixed dark-theme values and do not
   change in light mode — confirm whether that is intentional. */
::-webkit-scrollbar-track {
  background: #1f2937;
  border-radius: 4px;
}

::-webkit-scrollbar-thumb {
  background: #4b5563;
  border-radius: 4px;
}

::-webkit-scrollbar-thumb:hover {
  background: #6b7280;
}

/* Smooth transitions */
/* Second universal rule (merges with the reset above): animates color and
   background changes on every element, e.g. during the theme toggle. */
* {
  transition-property: background-color, border-color, color, fill, stroke;
  transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
  transition-duration: 150ms;
}

/* Animations */
/* NOTE: fadeIn is also declared in tailwind.config.js; this plain-CSS copy
   backs the hand-written .animate-fadeIn class below. */
@keyframes fadeIn {
  from {
    opacity: 0;
    transform: translateY(10px);
  }
  to {
    opacity: 1;
    transform: translateY(0);
  }
}

.animate-fadeIn {
  animation: fadeIn 0.3s ease-out;
}
|
frontend/src/index.js
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Application entry point: mounts the root <App /> component into the
// #root element declared in public/index.html. React.StrictMode enables
// extra development-only checks (it does not affect production builds).
import React from 'react';
import ReactDOM from 'react-dom/client';
import './index.css';
import App from './App';

const root = ReactDOM.createRoot(document.getElementById('root'));
root.render(
  <React.StrictMode>
    <App />
  </React.StrictMode>
);
|
frontend/tailwind.config.js
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/** @type {import('tailwindcss').Config} */
// Tailwind configuration for the frontend. Scans every source file under
// src/ for class names, adds three intermediate gray shades, and registers
// the fadeIn animation (a copy also exists in src/index.css).
module.exports = {
  content: [
    "./src/**/*.{js,jsx,ts,tsx}",
  ],
  theme: {
    extend: {
      colors: {
        // Extra gray stops between the default 600/700/800/900 values.
        gray: {
          650: '#4b5563',
          750: '#2d3748',
          850: '#1a202c',
        }
      },
      animation: {
        'fadeIn': 'fadeIn 0.3s ease-out',
      },
      keyframes: {
        fadeIn: {
          '0%': { opacity: '0', transform: 'translateY(10px)' },
          '100%': { opacity: '1', transform: 'translateY(0)' },
        }
      }
    },
  },
  plugins: [],
}
|
rag_anything_smaranika/.github/ISSUE_TEMPLATE/bug_report.yml
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GitHub issue form for bug reports (issue-forms schema). New issues get
# the "bug" and "triage" labels and a "[Bug]:" title prefix.
name: Bug Report
description: File a bug report
title: "[Bug]:"
labels: ["bug", "triage"]

body:
  - type: checkboxes
    id: existingcheck
    attributes:
      label: Do you need to file an issue?
      description: Please help us manage our time by avoiding duplicates and common bugs with the steps below.
      options:
        - label: I have searched the existing issues and this bug is not already filed.
        - label: I believe this is a legitimate bug, not just a question or feature request.
  - type: textarea
    id: description
    attributes:
      label: Describe the bug
      description: A clear and concise description of what the bug is.
      placeholder: What went wrong?
  - type: textarea
    id: reproduce
    attributes:
      label: Steps to reproduce
      description: Steps to reproduce the behavior.
      placeholder: How can we replicate the issue?
  - type: textarea
    id: expected_behavior
    attributes:
      label: Expected Behavior
      description: A clear and concise description of what you expected to happen.
      placeholder: What should have happened?
  - type: textarea
    id: configused
    attributes:
      label: LightRAG Config Used
      description: The LightRAG configuration used for the run.
      placeholder: The settings content or LightRAG configuration
      # value pre-fills the textarea with a template for the reporter.
      value: |
        # Paste your config here
  - type: textarea
    id: screenshotslogs
    attributes:
      label: Logs and screenshots
      description: If applicable, add screenshots and logs to help explain your problem.
      placeholder: Add logs and screenshots here
  - type: textarea
    id: additional_information
    attributes:
      label: Additional Information
      description: |
        - LightRAG Version: e.g., v0.1.1
        - Operating System: e.g., Windows 10, Ubuntu 20.04
        - Python Version: e.g., 3.8
        - Related Issues: e.g., #1
        - Any other relevant information.
      value: |
        - LightRAG Version:
        - Operating System:
        - Python Version:
        - Related Issues:
|
rag_anything_smaranika/.github/ISSUE_TEMPLATE/config.yml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Disable blank (template-free) issues so reporters must use one of the forms.
blank_issues_enabled: false
|
rag_anything_smaranika/.github/ISSUE_TEMPLATE/feature_request.yml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GitHub issue form for feature requests (issue-forms schema). New issues
# get the "enhancement" label and a "[Feature Request]:" title prefix.
# Fix: corrected the ungrammatical user-facing strings in the checkbox
# description/labels and the first textarea placeholder.
name: Feature Request
description: File a feature request
labels: ["enhancement"]
title: "[Feature Request]:"

body:
  - type: checkboxes
    id: existingcheck
    attributes:
      label: Do you need to file a feature request?
      description: Please help us manage our time by avoiding duplicates and common feature requests with the steps below.
      options:
        - label: I have searched the existing feature requests and this feature request is not already filed.
        - label: I believe this is a legitimate feature request, not just a question or bug.
  - type: textarea
    id: feature_request_description
    attributes:
      label: Feature Request Description
      description: A clear and concise description of the feature request you would like.
      placeholder: What would this feature request add or improve?
  - type: textarea
    id: additional_context
    attributes:
      label: Additional Context
      description: Add any other context or screenshots about the feature request here.
      placeholder: Any additional information
|
rag_anything_smaranika/.github/ISSUE_TEMPLATE/question.yml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GitHub issue form for general questions (issue-forms schema). New issues
# get the "question" label and a "[Question]:" title prefix.
name: Question
description: Ask a general question
labels: ["question"]
title: "[Question]:"

body:
  - type: checkboxes
    id: existingcheck
    attributes:
      label: Do you need to ask a question?
      description: Please help us manage our time by avoiding duplicates and common questions with the steps below.
      options:
        - label: I have searched the existing question and discussions and this question is not already answered.
        - label: I believe this is a legitimate question, not just a bug or feature request.
  - type: textarea
    id: question
    attributes:
      label: Your Question
      description: A clear and concise description of your question.
      placeholder: What is your question?
  - type: textarea
    id: context
    attributes:
      label: Additional Context
      description: Provide any additional context or details that might help us understand your question better.
      placeholder: Add any relevant information here
rag_anything_smaranika/.github/dependabot.yml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
# NOTE: only the pip ecosystem is monitored here; GitHub Actions workflow
# dependencies are not covered by this configuration.

version: 2
updates:
  - package-ecosystem: "pip" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
      interval: "weekly"
|
rag_anything_smaranika/.github/pull_request_template.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!--
|
| 2 |
+
Thanks for contributing to RAGAnything!
|
| 3 |
+
|
| 4 |
+
Please ensure your pull request is ready for review before submitting.
|
| 5 |
+
|
| 6 |
+
About this template
|
| 7 |
+
|
| 8 |
+
This template helps contributors provide a clear and concise description of their changes. Feel free to adjust it as needed.
|
| 9 |
+
-->
|
| 10 |
+
|
| 11 |
+
## Description
|
| 12 |
+
|
| 13 |
+
[Briefly describe the changes made in this pull request.]
|
| 14 |
+
|
| 15 |
+
## Related Issues
|
| 16 |
+
|
| 17 |
+
[Reference any related issues or tasks addressed by this pull request.]
|
| 18 |
+
|
| 19 |
+
## Changes Made
|
| 20 |
+
|
| 21 |
+
[List the specific changes made in this pull request.]
|
| 22 |
+
|
| 23 |
+
## Checklist
|
| 24 |
+
|
| 25 |
+
- [ ] Changes tested locally
|
| 26 |
+
- [ ] Code reviewed
|
| 27 |
+
- [ ] Documentation updated (if necessary)
|
| 28 |
+
- [ ] Unit tests added (if applicable)
|
| 29 |
+
|
| 30 |
+
## Additional Notes
|
| 31 |
+
|
| 32 |
+
[Add any additional notes or context for the reviewer(s).]
|
rag_anything_smaranika/.github/workflows/linting.yaml
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CI workflow: runs the repository's pre-commit hooks (whitespace/EOF
# fixers, ruff format + lint — see .pre-commit-config.yaml) on pushes and
# pull requests targeting main.
name: Linting and Formatting

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  lint-and-format:
    runs-on: ubuntu-latest

    steps:
      # checkout@v2 / setup-python@v2 run on the retired Node 12 runtime
      # and are deprecated on current GitHub-hosted runners; use the
      # maintained v4 / v5 releases instead.
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pre-commit

      - name: Run pre-commit
        run: pre-commit run --all-files --show-diff-on-failure
|
rag_anything_smaranika/.github/workflows/pypi-publish.yml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build and publish the RAGAnything package to PyPI when a GitHub release
# is published. Two jobs: build the sdist/wheel, then publish via PyPI
# trusted publishing (OIDC) — the publish job requests an id-token and
# runs in the `pypi` environment, so no API-token secret is required.
name: Upload RAGAnything Package

on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  release-build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: Build release distributions
        run: |
          python -m pip install build
          python -m build

      # Hand the built artifacts over to the publish job via an artifact.
      - name: Upload distributions
        uses: actions/upload-artifact@v4
        with:
          name: release-dists
          path: dist/

  pypi-publish:
    runs-on: ubuntu-latest
    needs:
      - release-build
    permissions:
      # Required for PyPI trusted publishing (OIDC token exchange).
      id-token: write

    # Deployment environment; protection rules configured on GitHub gate
    # the actual publish step.
    environment:
      name: pypi

    steps:
      - name: Retrieve release distributions
        uses: actions/download-artifact@v4
        with:
          name: release-dists
          path: dist/

      - name: Publish release distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/
|
rag_anything_smaranika/.gitignore
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python-related files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.egg-info/
|
| 5 |
+
.eggs/
|
| 6 |
+
*.tgz
|
| 7 |
+
*.tar.gz
|
| 8 |
+
*.ini
|
| 9 |
+
|
| 10 |
+
# Virtual Environment
|
| 11 |
+
.venv/
|
| 12 |
+
env/
|
| 13 |
+
venv/
|
| 14 |
+
|
| 15 |
+
*.env*
|
| 16 |
+
.env_example
|
| 17 |
+
|
| 18 |
+
# Build / Distribution
|
| 19 |
+
dist/
|
| 20 |
+
build/
|
| 21 |
+
site/
|
| 22 |
+
|
| 23 |
+
# Logs / Reports
|
| 24 |
+
*.log
|
| 25 |
+
*.log.*
|
| 26 |
+
*.logfire
|
| 27 |
+
*.coverage/
|
| 28 |
+
log/
|
| 29 |
+
|
| 30 |
+
# Caches
|
| 31 |
+
.cache/
|
| 32 |
+
.mypy_cache/
|
| 33 |
+
.pytest_cache/
|
| 34 |
+
.ruff_cache/
|
| 35 |
+
.gradio/
|
| 36 |
+
.history/
|
| 37 |
+
temp/
|
| 38 |
+
|
| 39 |
+
# IDE / Editor Files
|
| 40 |
+
.idea/
|
| 41 |
+
.vscode/
|
| 42 |
+
.vscode/settings.json
|
| 43 |
+
|
| 44 |
+
# Framework-specific files
|
| 45 |
+
local_neo4jWorkDir/
|
| 46 |
+
neo4jWorkDir/
|
| 47 |
+
|
| 48 |
+
# Data & Storage
|
| 49 |
+
inputs/
|
| 50 |
+
rag_storage*/
|
| 51 |
+
examples/input/
|
| 52 |
+
examples/output/
|
| 53 |
+
output*/
|
| 54 |
+
|
| 55 |
+
# Miscellaneous
|
| 56 |
+
.DS_Store
|
| 57 |
+
TODO.md
|
| 58 |
+
ignore_this.txt
|
| 59 |
+
*.ignore.*
|
| 60 |
+
|
| 61 |
+
# Project-specific files
|
| 62 |
+
dickens*/
|
| 63 |
+
book.txt
|
| 64 |
+
LightRAG.pdf
|
| 65 |
+
LightRAG_2-4.pdf
|
| 66 |
+
download_models_hf.py
|
| 67 |
+
lightrag-dev/
|
| 68 |
+
gui/
|
| 69 |
+
|
| 70 |
+
# unit-test files
|
| 71 |
+
test_*
|
| 72 |
+
|
| 73 |
+
# Cline files
|
| 74 |
+
memory-bank/
|
| 75 |
+
|
| 76 |
+
# AI
|
| 77 |
+
.claude/
|
| 78 |
+
.cursor/
|
| 79 |
+
CLAUDE.md
|
rag_anything_smaranika/.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
repos:
|
| 2 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 3 |
+
rev: v5.0.0
|
| 4 |
+
hooks:
|
| 5 |
+
- id: trailing-whitespace
|
| 6 |
+
exclude: ^lightrag/api/webui/
|
| 7 |
+
- id: end-of-file-fixer
|
| 8 |
+
exclude: ^lightrag/api/webui/
|
| 9 |
+
- id: requirements-txt-fixer
|
| 10 |
+
exclude: ^lightrag/api/webui/
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
| 14 |
+
rev: v0.6.4
|
| 15 |
+
hooks:
|
| 16 |
+
- id: ruff-format
|
| 17 |
+
exclude: ^lightrag/api/webui/
|
| 18 |
+
- id: ruff
|
| 19 |
+
args: [--fix, --ignore=E402]
|
| 20 |
+
exclude: ^lightrag/api/webui/
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
- repo: https://github.com/mgedmin/check-manifest
|
| 24 |
+
rev: "0.49"
|
| 25 |
+
hooks:
|
| 26 |
+
- id: check-manifest
|
| 27 |
+
stages: [manual]
|
| 28 |
+
exclude: ^lightrag/api/webui/
|
rag_anything_smaranika/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 ✨Data Intelligence Lab@HKU✨
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
rag_anything_smaranika/MANIFEST.in
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
include requirements.txt
|
| 2 |
+
include README.md
|
| 3 |
+
include README_zh.md
|
| 4 |
+
include LICENSE
|
| 5 |
+
recursive-include raganything *.py
|
| 6 |
+
recursive-include examples *.py
|
| 7 |
+
global-exclude *.pyc
|
| 8 |
+
global-exclude __pycache__
|
| 9 |
+
global-exclude *.egg-info
|
rag_anything_smaranika/README.md
ADDED
|
@@ -0,0 +1,1260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div align="center">
|
| 2 |
+
|
| 3 |
+
<div style="margin: 20px 0;">
|
| 4 |
+
<img src="./assets/logo.png" width="120" height="120" alt="RAG-Anything Logo" style="border-radius: 20px; box-shadow: 0 8px 32px rgba(0, 217, 255, 0.3);">
|
| 5 |
+
</div>
|
| 6 |
+
|
| 7 |
+
# 🚀 RAG-Anything: All-in-One RAG Framework
|
| 8 |
+
|
| 9 |
+
<a href="https://trendshift.io/repositories/14959" target="_blank"><img src="https://trendshift.io/api/badge/repositories/14959" alt="HKUDS%2FRAG-Anything | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
| 10 |
+
|
| 11 |
+
<div align="center">
|
| 12 |
+
<img src="https://readme-typing-svg.herokuapp.com?font=Orbitron&size=24&duration=3000&pause=1000&color=00D9FF¢er=true&vCenter=true&width=600&lines=Welcome+to+RAG-Anything;Next-Gen+Multimodal+RAG+System;Powered+by+Advanced+AI+Technology" alt="Typing Animation" />
|
| 13 |
+
</div>
|
| 14 |
+
|
| 15 |
+
<div align="center">
|
| 16 |
+
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;">
|
| 17 |
+
<p>
|
| 18 |
+
<a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥Project-Page-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>
|
| 19 |
+
<a href='https://arxiv.org/abs/2410.05779'><img src='https://img.shields.io/badge/📄arXiv-2410.05779-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
|
| 20 |
+
<a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡Based%20on-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>
|
| 21 |
+
</p>
|
| 22 |
+
<p>
|
| 23 |
+
<a href="https://github.com/HKUDS/RAG-Anything/stargazers"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>
|
| 24 |
+
<img src="https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
|
| 25 |
+
<a href="https://pypi.org/project/raganything/"><img src="https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b"></a>
|
| 26 |
+
<a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/badge/⚡uv-Ready-ff6b6b?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e"></a>
|
| 27 |
+
</p>
|
| 28 |
+
<p>
|
| 29 |
+
<a href="https://discord.gg/yF2MmDJyGJ"><img src="https://img.shields.io/badge/💬Discord-Community-7289da?style=for-the-badge&logo=discord&logoColor=white&labelColor=1a1a2e"></a>
|
| 30 |
+
<a href="https://github.com/HKUDS/RAG-Anything/issues/7"><img src="https://img.shields.io/badge/💬WeChat-Group-07c160?style=for-the-badge&logo=wechat&logoColor=white&labelColor=1a1a2e"></a>
|
| 31 |
+
</p>
|
| 32 |
+
<p>
|
| 33 |
+
<a href="README_zh.md"><img src="https://img.shields.io/badge/🇨🇳中文版-1a1a2e?style=for-the-badge"></a>
|
| 34 |
+
<a href="README.md"><img src="https://img.shields.io/badge/🇺🇸English-1a1a2e?style=for-the-badge"></a>
|
| 35 |
+
</p>
|
| 36 |
+
</div>
|
| 37 |
+
</div>
|
| 38 |
+
|
| 39 |
+
</div>
|
| 40 |
+
|
| 41 |
+
<div align="center">
|
| 42 |
+
<div style="width: 100%; height: 2px; margin: 20px 0; background: linear-gradient(90deg, transparent, #00d9ff, transparent);"></div>
|
| 43 |
+
</div>
|
| 44 |
+
|
| 45 |
+
<div align="center">
|
| 46 |
+
<a href="#-quick-start" style="text-decoration: none;">
|
| 47 |
+
<img src="https://img.shields.io/badge/Quick%20Start-Get%20Started%20Now-00d9ff?style=for-the-badge&logo=rocket&logoColor=white&labelColor=1a1a2e">
|
| 48 |
+
</a>
|
| 49 |
+
</div>
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## 🎉 News
|
| 54 |
+
- [X] [2025.08.12]🎯📢 🔍 RAG-Anything now features **VLM-Enhanced Query** mode! When documents include images, the system seamlessly integrates them into VLM for advanced multimodal analysis, combining visual and textual context for deeper insights.
|
| 55 |
+
- [X] [2025.07.05]🎯📢 RAG-Anything now features a [context configuration module](docs/context_aware_processing.md), enabling intelligent integration of relevant contextual information to enhance multimodal content processing.
|
| 56 |
+
- [X] [2025.07.04]🎯📢 🚀 RAG-Anything now supports multimodal query capabilities, enabling enhanced RAG with seamless processing of text, images, tables, and equations.
|
| 57 |
+
- [X] [2025.07.03]🎯📢 🎉 RAG-Anything has reached 1k🌟 stars on GitHub! Thank you for your incredible support and valuable contributions to the project.
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## 🌟 System Overview
|
| 62 |
+
|
| 63 |
+
*Next-Generation Multimodal Intelligence*
|
| 64 |
+
|
| 65 |
+
<div style="background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border: 2px solid #00d9ff; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);">
|
| 66 |
+
|
| 67 |
+
Modern documents increasingly contain diverse multimodal content—text, images, tables, equations, charts, and multimedia—that traditional text-focused RAG systems cannot effectively process. **RAG-Anything** addresses this challenge as a comprehensive **All-in-One Multimodal Document Processing RAG system** built on [LightRAG](https://github.com/HKUDS/LightRAG).
|
| 68 |
+
|
| 69 |
+
As a unified solution, RAG-Anything **eliminates the need for multiple specialized tools**. It provides **seamless processing and querying across all content modalities** within a single integrated framework. Unlike conventional RAG approaches that struggle with non-textual elements, our all-in-one system delivers **comprehensive multimodal retrieval capabilities**.
|
| 70 |
+
|
| 71 |
+
Users can query documents containing **interleaved text**, **visual diagrams**, **structured tables**, and **mathematical formulations** through **one cohesive interface**. This consolidated approach makes RAG-Anything particularly valuable for academic research, technical documentation, financial reports, and enterprise knowledge management where rich, mixed-content documents demand a **unified processing framework**.
|
| 72 |
+
|
| 73 |
+
<img src="assets/rag_anything_framework.png" alt="RAG-Anything" />
|
| 74 |
+
|
| 75 |
+
</div>
|
| 76 |
+
|
| 77 |
+
### 🎯 Key Features
|
| 78 |
+
|
| 79 |
+
<div style="background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 15px; padding: 25px; margin: 20px 0;">
|
| 80 |
+
|
| 81 |
+
- **🔄 End-to-End Multimodal Pipeline** - Complete workflow from document ingestion and parsing to intelligent multimodal query answering
|
| 82 |
+
- **📄 Universal Document Support** - Seamless processing of PDFs, Office documents, images, and diverse file formats
|
| 83 |
+
- **🧠 Specialized Content Analysis** - Dedicated processors for images, tables, mathematical equations, and heterogeneous content types
|
| 84 |
+
- **🔗 Multimodal Knowledge Graph** - Automatic entity extraction and cross-modal relationship discovery for enhanced understanding
|
| 85 |
+
- **⚡ Adaptive Processing Modes** - Flexible MinerU-based parsing or direct multimodal content injection workflows
|
| 86 |
+
- **📋 Direct Content List Insertion** - Bypass document parsing by directly inserting pre-parsed content lists from external sources
|
| 87 |
+
- **🎯 Hybrid Intelligent Retrieval** - Advanced search capabilities spanning textual and multimodal content with contextual understanding
|
| 88 |
+
|
| 89 |
+
</div>
|
| 90 |
+
|
| 91 |
+
---
|
| 92 |
+
|
| 93 |
+
## 🏗️ Algorithm & Architecture
|
| 94 |
+
|
| 95 |
+
<div style="background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border-left: 5px solid #00d9ff;">
|
| 96 |
+
|
| 97 |
+
### Core Algorithm
|
| 98 |
+
|
| 99 |
+
**RAG-Anything** implements an effective **multi-stage multimodal pipeline** that fundamentally extends traditional RAG architectures to seamlessly handle diverse content modalities through intelligent orchestration and cross-modal understanding.
|
| 100 |
+
|
| 101 |
+
</div>
|
| 102 |
+
|
| 103 |
+
<div align="center">
|
| 104 |
+
<div style="width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);">
|
| 105 |
+
<div style="display: flex; justify-content: space-around; align-items: center; flex-wrap: wrap; gap: 20px;">
|
| 106 |
+
<div style="text-align: center;">
|
| 107 |
+
<div style="font-size: 24px; margin-bottom: 10px;">📄</div>
|
| 108 |
+
<div style="font-size: 14px; color: #00d9ff;">Document Parsing</div>
|
| 109 |
+
</div>
|
| 110 |
+
<div style="font-size: 20px; color: #00d9ff;">→</div>
|
| 111 |
+
<div style="text-align: center;">
|
| 112 |
+
<div style="font-size: 24px; margin-bottom: 10px;">🧠</div>
|
| 113 |
+
<div style="font-size: 14px; color: #00d9ff;">Content Analysis</div>
|
| 114 |
+
</div>
|
| 115 |
+
<div style="font-size: 20px; color: #00d9ff;">→</div>
|
| 116 |
+
<div style="text-align: center;">
|
| 117 |
+
<div style="font-size: 24px; margin-bottom: 10px;">🔍</div>
|
| 118 |
+
<div style="font-size: 14px; color: #00d9ff;">Knowledge Graph</div>
|
| 119 |
+
</div>
|
| 120 |
+
<div style="font-size: 20px; color: #00d9ff;">→</div>
|
| 121 |
+
<div style="text-align: center;">
|
| 122 |
+
<div style="font-size: 24px; margin-bottom: 10px;">🎯</div>
|
| 123 |
+
<div style="font-size: 14px; color: #00d9ff;">Intelligent Retrieval</div>
|
| 124 |
+
</div>
|
| 125 |
+
</div>
|
| 126 |
+
</div>
|
| 127 |
+
</div>
|
| 128 |
+
|
| 129 |
+
### 1. Document Parsing Stage
|
| 130 |
+
|
| 131 |
+
<div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
|
| 132 |
+
|
| 133 |
+
The system provides high-fidelity document extraction through adaptive content decomposition. It intelligently segments heterogeneous elements while preserving contextual relationships. Universal format compatibility is achieved via specialized optimized parsers.
|
| 134 |
+
|
| 135 |
+
**Key Components:**
|
| 136 |
+
|
| 137 |
+
- **⚙️ MinerU Integration**: Leverages [MinerU](https://github.com/opendatalab/MinerU) for high-fidelity document structure extraction and semantic preservation across complex layouts.
|
| 138 |
+
|
| 139 |
+
- **🧩 Adaptive Content Decomposition**: Automatically segments documents into coherent text blocks, visual elements, structured tables, mathematical equations, and specialized content types while preserving contextual relationships.
|
| 140 |
+
|
| 141 |
+
- **📁 Universal Format Support**: Provides comprehensive handling of PDFs, Office documents (DOC/DOCX/PPT/PPTX/XLS/XLSX), images, and emerging formats through specialized parsers with format-specific optimization.
|
| 142 |
+
|
| 143 |
+
</div>
|
| 144 |
+
|
| 145 |
+
### 2. Multi-Modal Content Understanding & Processing
|
| 146 |
+
|
| 147 |
+
<div style="background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;">
|
| 148 |
+
|
| 149 |
+
The system automatically categorizes and routes content through optimized channels. It uses concurrent pipelines for parallel text and multimodal processing. Document hierarchy and relationships are preserved during transformation.
|
| 150 |
+
|
| 151 |
+
**Key Components:**
|
| 152 |
+
|
| 153 |
+
- **🎯 Autonomous Content Categorization and Routing**: Automatically identify, categorize, and route different content types through optimized execution channels.
|
| 154 |
+
|
| 155 |
+
- **⚡ Concurrent Multi-Pipeline Architecture**: Implements concurrent execution of textual and multimodal content through dedicated processing pipelines. This approach maximizes throughput efficiency while preserving content integrity.
|
| 156 |
+
|
| 157 |
+
- **🏗️ Document Hierarchy Extraction**: Extracts and preserves original document hierarchy and inter-element relationships during content transformation.
|
| 158 |
+
|
| 159 |
+
</div>
|
| 160 |
+
|
| 161 |
+
### 3. Multimodal Analysis Engine
|
| 162 |
+
|
| 163 |
+
<div style="background: linear-gradient(90deg, #0f3460 0%, #1a1a2e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #00d9ff;">
|
| 164 |
+
|
| 165 |
+
The system deploys modality-aware processing units for heterogeneous data modalities:
|
| 166 |
+
|
| 167 |
+
**Specialized Analyzers:**
|
| 168 |
+
|
| 169 |
+
- **🔍 Visual Content Analyzer**:
|
| 170 |
+
- Integrate vision model for image analysis.
|
| 171 |
+
- Generates context-aware descriptive captions based on visual semantics.
|
| 172 |
+
- Extracts spatial relationships and hierarchical structures between visual elements.
|
| 173 |
+
|
| 174 |
+
- **📊 Structured Data Interpreter**:
|
| 175 |
+
- Performs systematic interpretation of tabular and structured data formats.
|
| 176 |
+
- Implements statistical pattern recognition algorithms for data trend analysis.
|
| 177 |
+
- Identifies semantic relationships and dependencies across multiple tabular datasets.
|
| 178 |
+
|
| 179 |
+
- **📐 Mathematical Expression Parser**:
|
| 180 |
+
- Parses complex mathematical expressions and formulas with high accuracy.
|
| 181 |
+
- Provides native LaTeX format support for seamless integration with academic workflows.
|
| 182 |
+
- Establishes conceptual mappings between mathematical equations and domain-specific knowledge bases.
|
| 183 |
+
|
| 184 |
+
- **🔧 Extensible Modality Handler**:
|
| 185 |
+
- Provides configurable processing framework for custom and emerging content types.
|
| 186 |
+
- Enables dynamic integration of new modality processors through plugin architecture.
|
| 187 |
+
- Supports runtime configuration of processing pipelines for specialized use cases.
|
| 188 |
+
|
| 189 |
+
</div>
|
| 190 |
+
|
| 191 |
+
### 4. Multimodal Knowledge Graph Index
|
| 192 |
+
|
| 193 |
+
<div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
|
| 194 |
+
|
| 195 |
+
The multi-modal knowledge graph construction module transforms document content into structured semantic representations. It extracts multimodal entities, establishes cross-modal relationships, and preserves hierarchical organization. The system applies weighted relevance scoring for optimized knowledge retrieval.
|
| 196 |
+
|
| 197 |
+
**Core Functions:**
|
| 198 |
+
|
| 199 |
+
- **🔍 Multi-Modal Entity Extraction**: Transforms significant multimodal elements into structured knowledge graph entities. The process includes semantic annotations and metadata preservation.
|
| 200 |
+
|
| 201 |
+
- **🔗 Cross-Modal Relationship Mapping**: Establishes semantic connections and dependencies between textual entities and multimodal components. This is achieved through automated relationship inference algorithms.
|
| 202 |
+
|
| 203 |
+
- **🏗️ Hierarchical Structure Preservation**: Maintains original document organization through "belongs_to" relationship chains. These chains preserve logical content hierarchy and sectional dependencies.
|
| 204 |
+
|
| 205 |
+
- **⚖️ Weighted Relationship Scoring**: Assigns quantitative relevance scores to relationship types. Scoring is based on semantic proximity and contextual significance within the document structure.
|
| 206 |
+
|
| 207 |
+
</div>
|
| 208 |
+
|
| 209 |
+
### 5. Modality-Aware Retrieval
|
| 210 |
+
|
| 211 |
+
<div style="background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;">
|
| 212 |
+
|
| 213 |
+
The hybrid retrieval system combines vector similarity search with graph traversal algorithms for comprehensive content retrieval. It implements modality-aware ranking mechanisms and maintains relational coherence between retrieved elements to ensure contextually integrated information delivery.
|
| 214 |
+
|
| 215 |
+
**Retrieval Mechanisms:**
|
| 216 |
+
|
| 217 |
+
- **🔀 Vector-Graph Fusion**: Integrates vector similarity search with graph traversal algorithms. This approach leverages both semantic embeddings and structural relationships for comprehensive content retrieval.
|
| 218 |
+
|
| 219 |
+
- **📊 Modality-Aware Ranking**: Implements adaptive scoring mechanisms that weight retrieval results based on content type relevance. The system adjusts rankings according to query-specific modality preferences.
|
| 220 |
+
|
| 221 |
+
- **🔗 Relational Coherence Maintenance**: Maintains semantic and structural relationships between retrieved elements. This ensures coherent information delivery and contextual integrity.
|
| 222 |
+
|
| 223 |
+
</div>
|
| 224 |
+
|
| 225 |
+
---
|
| 226 |
+
|
| 227 |
+
## 🚀 Quick Start
|
| 228 |
+
|
| 229 |
+
*Initialize Your AI Journey*
|
| 230 |
+
|
| 231 |
+
<div align="center">
|
| 232 |
+
<img src="https://user-images.githubusercontent.com/74038190/212284158-e840e285-664b-44d7-b79b-e264b5e54825.gif" width="400">
|
| 233 |
+
</div>
|
| 234 |
+
|
| 235 |
+
### Installation
|
| 236 |
+
|
| 237 |
+
#### Option 1: Install from PyPI (Recommended)
|
| 238 |
+
|
| 239 |
+
```bash
|
| 240 |
+
# Basic installation
|
| 241 |
+
pip install raganything
|
| 242 |
+
|
| 243 |
+
# With optional dependencies for extended format support:
|
| 244 |
+
pip install 'raganything[all]' # All optional features
|
| 245 |
+
pip install 'raganything[image]' # Image format conversion (BMP, TIFF, GIF, WebP)
|
| 246 |
+
pip install 'raganything[text]' # Text file processing (TXT, MD)
|
| 247 |
+
pip install 'raganything[image,text]' # Multiple features
|
| 248 |
+
```
|
| 249 |
+
|
| 250 |
+
#### Option 2: Install from Source
|
| 251 |
+
```bash
|
| 252 |
+
# Install uv (if not already installed)
|
| 253 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 254 |
+
|
| 255 |
+
# Clone and setup the project with uv
|
| 256 |
+
git clone https://github.com/HKUDS/RAG-Anything.git
|
| 257 |
+
cd RAG-Anything
|
| 258 |
+
|
| 259 |
+
# Install the package and dependencies in a virtual environment
|
| 260 |
+
uv sync
|
| 261 |
+
|
| 262 |
+
# If you encounter network timeouts (especially for opencv packages):
|
| 263 |
+
# UV_HTTP_TIMEOUT=120 uv sync
|
| 264 |
+
|
| 265 |
+
# Run commands directly with uv (recommended approach)
|
| 266 |
+
uv run python examples/raganything_example.py --help
|
| 267 |
+
|
| 268 |
+
# Install with optional dependencies
|
| 269 |
+
uv sync --extra image --extra text # Specific extras
|
| 270 |
+
uv sync --all-extras # All optional features
|
| 271 |
+
```
|
| 272 |
+
|
| 273 |
+
#### Optional Dependencies
|
| 274 |
+
|
| 275 |
+
- **`[image]`** - Enables processing of BMP, TIFF, GIF, WebP image formats (requires Pillow)
|
| 276 |
+
- **`[text]`** - Enables processing of TXT and MD files (requires ReportLab)
|
| 277 |
+
- **`[all]`** - Includes all Python optional dependencies
|
| 278 |
+
|
| 279 |
+
> **⚠️ Office Document Processing Requirements:**
|
| 280 |
+
> - Office documents (.doc, .docx, .ppt, .pptx, .xls, .xlsx) require **LibreOffice** installation
|
| 281 |
+
> - Download from [LibreOffice official website](https://www.libreoffice.org/download/download/)
|
| 282 |
+
> - **Windows**: Download installer from official website
|
| 283 |
+
> - **macOS**: `brew install --cask libreoffice`
|
| 284 |
+
> - **Ubuntu/Debian**: `sudo apt-get install libreoffice`
|
| 285 |
+
> - **CentOS/RHEL**: `sudo yum install libreoffice`
|
| 286 |
+
|
| 287 |
+
**Check MinerU installation:**
|
| 288 |
+
|
| 289 |
+
```bash
|
| 290 |
+
# Verify installation
|
| 291 |
+
mineru --version
|
| 292 |
+
|
| 293 |
+
# Check if properly configured
|
| 294 |
+
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU installed properly' if rag.check_parser_installation() else '❌ MinerU installation issue')"
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
Models are downloaded automatically on first use. For manual download, refer to [MinerU Model Source Configuration](https://github.com/opendatalab/MinerU/blob/master/README.md#22-model-source-configuration).
|
| 298 |
+
|
| 299 |
+
### Usage Examples
|
| 300 |
+
|
| 301 |
+
#### 1. End-to-End Document Processing
|
| 302 |
+
|
| 303 |
+
```python
|
| 304 |
+
import asyncio
|
| 305 |
+
from raganything import RAGAnything, RAGAnythingConfig
|
| 306 |
+
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 307 |
+
from lightrag.utils import EmbeddingFunc
|
| 308 |
+
|
| 309 |
+
async def main():
|
| 310 |
+
# Set up API configuration
|
| 311 |
+
api_key = "your-api-key"
|
| 312 |
+
base_url = "your-base-url" # Optional
|
| 313 |
+
|
| 314 |
+
# Create RAGAnything configuration
|
| 315 |
+
config = RAGAnythingConfig(
|
| 316 |
+
working_dir="./rag_storage",
|
| 317 |
+
parser="mineru", # Parser selection: mineru or docling
|
| 318 |
+
parse_method="auto", # Parse method: auto, ocr, or txt
|
| 319 |
+
enable_image_processing=True,
|
| 320 |
+
enable_table_processing=True,
|
| 321 |
+
enable_equation_processing=True,
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
# Define LLM model function
|
| 325 |
+
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
|
| 326 |
+
return openai_complete_if_cache(
|
| 327 |
+
"gpt-4o-mini",
|
| 328 |
+
prompt,
|
| 329 |
+
system_prompt=system_prompt,
|
| 330 |
+
history_messages=history_messages,
|
| 331 |
+
api_key=api_key,
|
| 332 |
+
base_url=base_url,
|
| 333 |
+
**kwargs,
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
# Define vision model function for image processing
|
| 337 |
+
def vision_model_func(
|
| 338 |
+
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
|
| 339 |
+
):
|
| 340 |
+
# If messages format is provided (for multimodal VLM enhanced query), use it directly
|
| 341 |
+
if messages:
|
| 342 |
+
return openai_complete_if_cache(
|
| 343 |
+
"gpt-4o",
|
| 344 |
+
"",
|
| 345 |
+
system_prompt=None,
|
| 346 |
+
history_messages=[],
|
| 347 |
+
messages=messages,
|
| 348 |
+
api_key=api_key,
|
| 349 |
+
base_url=base_url,
|
| 350 |
+
**kwargs,
|
| 351 |
+
)
|
| 352 |
+
# Traditional single image format
|
| 353 |
+
elif image_data:
|
| 354 |
+
return openai_complete_if_cache(
|
| 355 |
+
"gpt-4o",
|
| 356 |
+
"",
|
| 357 |
+
system_prompt=None,
|
| 358 |
+
history_messages=[],
|
| 359 |
+
messages=[
|
| 360 |
+
{"role": "system", "content": system_prompt}
|
| 361 |
+
if system_prompt
|
| 362 |
+
else None,
|
| 363 |
+
{
|
| 364 |
+
"role": "user",
|
| 365 |
+
"content": [
|
| 366 |
+
{"type": "text", "text": prompt},
|
| 367 |
+
{
|
| 368 |
+
"type": "image_url",
|
| 369 |
+
"image_url": {
|
| 370 |
+
"url": f"data:image/jpeg;base64,{image_data}"
|
| 371 |
+
},
|
| 372 |
+
},
|
| 373 |
+
],
|
| 374 |
+
}
|
| 375 |
+
if image_data
|
| 376 |
+
else {"role": "user", "content": prompt},
|
| 377 |
+
],
|
| 378 |
+
api_key=api_key,
|
| 379 |
+
base_url=base_url,
|
| 380 |
+
**kwargs,
|
| 381 |
+
)
|
| 382 |
+
# Pure text format
|
| 383 |
+
else:
|
| 384 |
+
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
|
| 385 |
+
|
| 386 |
+
# Define embedding function
|
| 387 |
+
embedding_func = EmbeddingFunc(
|
| 388 |
+
embedding_dim=3072,
|
| 389 |
+
max_token_size=8192,
|
| 390 |
+
func=lambda texts: openai_embed(
|
| 391 |
+
texts,
|
| 392 |
+
model="text-embedding-3-large",
|
| 393 |
+
api_key=api_key,
|
| 394 |
+
base_url=base_url,
|
| 395 |
+
),
|
| 396 |
+
)
|
| 397 |
+
|
| 398 |
+
# Initialize RAGAnything
|
| 399 |
+
rag = RAGAnything(
|
| 400 |
+
config=config,
|
| 401 |
+
llm_model_func=llm_model_func,
|
| 402 |
+
vision_model_func=vision_model_func,
|
| 403 |
+
embedding_func=embedding_func,
|
| 404 |
+
)
|
| 405 |
+
|
| 406 |
+
# Process a document
|
| 407 |
+
await rag.process_document_complete(
|
| 408 |
+
file_path="path/to/your/document.pdf",
|
| 409 |
+
output_dir="./output",
|
| 410 |
+
parse_method="auto"
|
| 411 |
+
)
|
| 412 |
+
|
| 413 |
+
# Query the processed content
|
| 414 |
+
# Pure text query - for basic knowledge base search
|
| 415 |
+
text_result = await rag.aquery(
|
| 416 |
+
"What are the main findings shown in the figures and tables?",
|
| 417 |
+
mode="hybrid"
|
| 418 |
+
)
|
| 419 |
+
print("Text query result:", text_result)
|
| 420 |
+
|
| 421 |
+
# Multimodal query with specific multimodal content
|
| 422 |
+
multimodal_result = await rag.aquery_with_multimodal(
|
| 423 |
+
"Explain this formula and its relevance to the document content",
|
| 424 |
+
multimodal_content=[{
|
| 425 |
+
"type": "equation",
|
| 426 |
+
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
|
| 427 |
+
"equation_caption": "Document relevance probability"
|
| 428 |
+
}],
|
| 429 |
+
mode="hybrid"
|
| 430 |
+
)
|
| 431 |
+
print("Multimodal query result:", multimodal_result)
|
| 432 |
+
|
| 433 |
+
if __name__ == "__main__":
|
| 434 |
+
asyncio.run(main())
|
| 435 |
+
```
|
| 436 |
+
|
| 437 |
+
#### 2. Direct Multimodal Content Processing
|
| 438 |
+
|
| 439 |
+
```python
|
| 440 |
+
import asyncio
|
| 441 |
+
from lightrag import LightRAG
|
| 442 |
+
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 443 |
+
from lightrag.utils import EmbeddingFunc
|
| 444 |
+
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor
|
| 445 |
+
|
| 446 |
+
async def process_multimodal_content():
|
| 447 |
+
# Set up API configuration
|
| 448 |
+
api_key = "your-api-key"
|
| 449 |
+
base_url = "your-base-url" # Optional
|
| 450 |
+
|
| 451 |
+
# Initialize LightRAG
|
| 452 |
+
rag = LightRAG(
|
| 453 |
+
working_dir="./rag_storage",
|
| 454 |
+
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
|
| 455 |
+
"gpt-4o-mini",
|
| 456 |
+
prompt,
|
| 457 |
+
system_prompt=system_prompt,
|
| 458 |
+
history_messages=history_messages,
|
| 459 |
+
api_key=api_key,
|
| 460 |
+
base_url=base_url,
|
| 461 |
+
**kwargs,
|
| 462 |
+
),
|
| 463 |
+
embedding_func=EmbeddingFunc(
|
| 464 |
+
embedding_dim=3072,
|
| 465 |
+
max_token_size=8192,
|
| 466 |
+
func=lambda texts: openai_embed(
|
| 467 |
+
texts,
|
| 468 |
+
model="text-embedding-3-large",
|
| 469 |
+
api_key=api_key,
|
| 470 |
+
base_url=base_url,
|
| 471 |
+
),
|
| 472 |
+
)
|
| 473 |
+
)
|
| 474 |
+
await rag.initialize_storages()
|
| 475 |
+
|
| 476 |
+
# Process an image
|
| 477 |
+
image_processor = ImageModalProcessor(
|
| 478 |
+
lightrag=rag,
|
| 479 |
+
modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
|
| 480 |
+
"gpt-4o",
|
| 481 |
+
"",
|
| 482 |
+
system_prompt=None,
|
| 483 |
+
history_messages=[],
|
| 484 |
+
messages=[
|
| 485 |
+
{"role": "system", "content": system_prompt} if system_prompt else None,
|
| 486 |
+
{"role": "user", "content": [
|
| 487 |
+
{"type": "text", "text": prompt},
|
| 488 |
+
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
|
| 489 |
+
]} if image_data else {"role": "user", "content": prompt}
|
| 490 |
+
],
|
| 491 |
+
api_key=api_key,
|
| 492 |
+
base_url=base_url,
|
| 493 |
+
**kwargs,
|
| 494 |
+
) if image_data else openai_complete_if_cache(
|
| 495 |
+
"gpt-4o-mini",
|
| 496 |
+
prompt,
|
| 497 |
+
system_prompt=system_prompt,
|
| 498 |
+
history_messages=history_messages,
|
| 499 |
+
api_key=api_key,
|
| 500 |
+
base_url=base_url,
|
| 501 |
+
**kwargs,
|
| 502 |
+
)
|
| 503 |
+
)
|
| 504 |
+
|
| 505 |
+
image_content = {
|
| 506 |
+
"img_path": "path/to/image.jpg",
|
| 507 |
+
"image_caption": ["Figure 1: Experimental results"],
|
| 508 |
+
"image_footnote": ["Data collected in 2024"]
|
| 509 |
+
}
|
| 510 |
+
|
| 511 |
+
description, entity_info = await image_processor.process_multimodal_content(
|
| 512 |
+
modal_content=image_content,
|
| 513 |
+
content_type="image",
|
| 514 |
+
file_path="research_paper.pdf",
|
| 515 |
+
entity_name="Experimental Results Figure"
|
| 516 |
+
)
|
| 517 |
+
|
| 518 |
+
# Process a table
|
| 519 |
+
table_processor = TableModalProcessor(
|
| 520 |
+
lightrag=rag,
|
| 521 |
+
modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
|
| 522 |
+
"gpt-4o-mini",
|
| 523 |
+
prompt,
|
| 524 |
+
system_prompt=system_prompt,
|
| 525 |
+
history_messages=history_messages,
|
| 526 |
+
api_key=api_key,
|
| 527 |
+
base_url=base_url,
|
| 528 |
+
**kwargs,
|
| 529 |
+
)
|
| 530 |
+
)
|
| 531 |
+
|
| 532 |
+
table_content = {
|
| 533 |
+
"table_body": """
|
| 534 |
+
| Method | Accuracy | F1-Score |
|
| 535 |
+
|--------|----------|----------|
|
| 536 |
+
| RAGAnything | 95.2% | 0.94 |
|
| 537 |
+
| Baseline | 87.3% | 0.85 |
|
| 538 |
+
""",
|
| 539 |
+
"table_caption": ["Performance Comparison"],
|
| 540 |
+
"table_footnote": ["Results on test dataset"]
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
description, entity_info = await table_processor.process_multimodal_content(
|
| 544 |
+
modal_content=table_content,
|
| 545 |
+
content_type="table",
|
| 546 |
+
file_path="research_paper.pdf",
|
| 547 |
+
entity_name="Performance Results Table"
|
| 548 |
+
)
|
| 549 |
+
|
| 550 |
+
if __name__ == "__main__":
|
| 551 |
+
asyncio.run(process_multimodal_content())
|
| 552 |
+
```
|
| 553 |
+
|
| 554 |
+
#### 3. Batch Processing
|
| 555 |
+
|
| 556 |
+
```python
|
| 557 |
+
# Process multiple documents
|
| 558 |
+
await rag.process_folder_complete(
|
| 559 |
+
folder_path="./documents",
|
| 560 |
+
output_dir="./output",
|
| 561 |
+
file_extensions=[".pdf", ".docx", ".pptx"],
|
| 562 |
+
recursive=True,
|
| 563 |
+
max_workers=4
|
| 564 |
+
)
|
| 565 |
+
```
|
| 566 |
+
|
| 567 |
+
#### 4. Custom Modal Processors
|
| 568 |
+
|
| 569 |
+
```python
|
| 570 |
+
from raganything.modalprocessors import GenericModalProcessor
|
| 571 |
+
|
| 572 |
+
class CustomModalProcessor(GenericModalProcessor):
|
| 573 |
+
async def process_multimodal_content(self, modal_content, content_type, file_path, entity_name):
|
| 574 |
+
# Your custom processing logic
|
| 575 |
+
enhanced_description = await self.analyze_custom_content(modal_content)
|
| 576 |
+
entity_info = self.create_custom_entity(enhanced_description, entity_name)
|
| 577 |
+
return await self._create_entity_and_chunk(enhanced_description, entity_info, file_path)
|
| 578 |
+
```
|
| 579 |
+
|
| 580 |
+
#### 5. Query Options
|
| 581 |
+
|
| 582 |
+
RAG-Anything provides three types of query methods:
|
| 583 |
+
|
| 584 |
+
**Pure Text Queries** - Direct knowledge base search using LightRAG:
|
| 585 |
+
```python
|
| 586 |
+
# Different query modes for text queries
|
| 587 |
+
text_result_hybrid = await rag.aquery("Your question", mode="hybrid")
|
| 588 |
+
text_result_local = await rag.aquery("Your question", mode="local")
|
| 589 |
+
text_result_global = await rag.aquery("Your question", mode="global")
|
| 590 |
+
text_result_naive = await rag.aquery("Your question", mode="naive")
|
| 591 |
+
|
| 592 |
+
# Synchronous version
|
| 593 |
+
sync_text_result = rag.query("Your question", mode="hybrid")
|
| 594 |
+
```
|
| 595 |
+
|
| 596 |
+
**VLM Enhanced Queries** - Automatically analyze images in retrieved context using VLM:
|
| 597 |
+
```python
|
| 598 |
+
# VLM enhanced query (automatically enabled when vision_model_func is provided)
|
| 599 |
+
vlm_result = await rag.aquery(
|
| 600 |
+
"Analyze the charts and figures in the document",
|
| 601 |
+
mode="hybrid"
|
| 602 |
+
# vlm_enhanced=True is automatically set when vision_model_func is available
|
| 603 |
+
)
|
| 604 |
+
|
| 605 |
+
# Manually control VLM enhancement
|
| 606 |
+
vlm_enabled = await rag.aquery(
|
| 607 |
+
"What do the images show in this document?",
|
| 608 |
+
mode="hybrid",
|
| 609 |
+
vlm_enhanced=True # Force enable VLM enhancement
|
| 610 |
+
)
|
| 611 |
+
|
| 612 |
+
vlm_disabled = await rag.aquery(
|
| 613 |
+
"What do the images show in this document?",
|
| 614 |
+
mode="hybrid",
|
| 615 |
+
vlm_enhanced=False # Force disable VLM enhancement
|
| 616 |
+
)
|
| 617 |
+
|
| 618 |
+
# When documents contain images, VLM can see and analyze them directly
|
| 619 |
+
# The system will automatically:
|
| 620 |
+
# 1. Retrieve relevant context containing image paths
|
| 621 |
+
# 2. Load and encode images as base64
|
| 622 |
+
# 3. Send both text context and images to VLM for comprehensive analysis
|
| 623 |
+
```
|
| 624 |
+
|
| 625 |
+
**Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
|
| 626 |
+
```python
|
| 627 |
+
# Query with table data
|
| 628 |
+
table_result = await rag.aquery_with_multimodal(
|
| 629 |
+
"Compare these performance metrics with the document content",
|
| 630 |
+
multimodal_content=[{
|
| 631 |
+
"type": "table",
|
| 632 |
+
"table_data": """Method,Accuracy,Speed
|
| 633 |
+
RAGAnything,95.2%,120ms
|
| 634 |
+
Traditional,87.3%,180ms""",
|
| 635 |
+
"table_caption": "Performance comparison"
|
| 636 |
+
}],
|
| 637 |
+
mode="hybrid"
|
| 638 |
+
)
|
| 639 |
+
|
| 640 |
+
# Query with equation content
|
| 641 |
+
equation_result = await rag.aquery_with_multimodal(
|
| 642 |
+
"Explain this formula and its relevance to the document content",
|
| 643 |
+
multimodal_content=[{
|
| 644 |
+
"type": "equation",
|
| 645 |
+
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
|
| 646 |
+
"equation_caption": "Document relevance probability"
|
| 647 |
+
}],
|
| 648 |
+
mode="hybrid"
|
| 649 |
+
)
|
| 650 |
+
```
|
| 651 |
+
|
| 652 |
+
#### 6. Loading Existing LightRAG Instance
|
| 653 |
+
|
| 654 |
+
```python
|
| 655 |
+
import asyncio
|
| 656 |
+
from raganything import RAGAnything, RAGAnythingConfig
|
| 657 |
+
from lightrag import LightRAG
|
| 658 |
+
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 659 |
+
from lightrag.kg.shared_storage import initialize_pipeline_status
|
| 660 |
+
from lightrag.utils import EmbeddingFunc
|
| 661 |
+
import os
|
| 662 |
+
|
| 663 |
+
async def load_existing_lightrag():
|
| 664 |
+
# Set up API configuration
|
| 665 |
+
api_key = "your-api-key"
|
| 666 |
+
base_url = "your-base-url" # Optional
|
| 667 |
+
|
| 668 |
+
# First, create or load existing LightRAG instance
|
| 669 |
+
lightrag_working_dir = "./existing_lightrag_storage"
|
| 670 |
+
|
| 671 |
+
# Check if previous LightRAG instance exists
|
| 672 |
+
if os.path.exists(lightrag_working_dir) and os.listdir(lightrag_working_dir):
|
| 673 |
+
print("✅ Found existing LightRAG instance, loading...")
|
| 674 |
+
else:
|
| 675 |
+
print("❌ No existing LightRAG instance found, will create new one")
|
| 676 |
+
|
| 677 |
+
# Create/load LightRAG instance with your configuration
|
| 678 |
+
lightrag_instance = LightRAG(
|
| 679 |
+
working_dir=lightrag_working_dir,
|
| 680 |
+
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
|
| 681 |
+
"gpt-4o-mini",
|
| 682 |
+
prompt,
|
| 683 |
+
system_prompt=system_prompt,
|
| 684 |
+
history_messages=history_messages,
|
| 685 |
+
api_key=api_key,
|
| 686 |
+
base_url=base_url,
|
| 687 |
+
**kwargs,
|
| 688 |
+
),
|
| 689 |
+
embedding_func=EmbeddingFunc(
|
| 690 |
+
embedding_dim=3072,
|
| 691 |
+
max_token_size=8192,
|
| 692 |
+
func=lambda texts: openai_embed(
|
| 693 |
+
texts,
|
| 694 |
+
model="text-embedding-3-large",
|
| 695 |
+
api_key=api_key,
|
| 696 |
+
base_url=base_url,
|
| 697 |
+
),
|
| 698 |
+
)
|
| 699 |
+
)
|
| 700 |
+
|
| 701 |
+
# Initialize storage (this will load existing data if available)
|
| 702 |
+
await lightrag_instance.initialize_storages()
|
| 703 |
+
await initialize_pipeline_status()
|
| 704 |
+
|
| 705 |
+
# Define vision model function for image processing
|
| 706 |
+
def vision_model_func(
|
| 707 |
+
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
|
| 708 |
+
):
|
| 709 |
+
# If messages format is provided (for multimodal VLM enhanced query), use it directly
|
| 710 |
+
if messages:
|
| 711 |
+
return openai_complete_if_cache(
|
| 712 |
+
"gpt-4o",
|
| 713 |
+
"",
|
| 714 |
+
system_prompt=None,
|
| 715 |
+
history_messages=[],
|
| 716 |
+
messages=messages,
|
| 717 |
+
api_key=api_key,
|
| 718 |
+
base_url=base_url,
|
| 719 |
+
**kwargs,
|
| 720 |
+
)
|
| 721 |
+
# Traditional single image format
|
| 722 |
+
elif image_data:
|
| 723 |
+
return openai_complete_if_cache(
|
| 724 |
+
"gpt-4o",
|
| 725 |
+
"",
|
| 726 |
+
system_prompt=None,
|
| 727 |
+
history_messages=[],
|
| 728 |
+
messages=[
|
| 729 |
+
{"role": "system", "content": system_prompt}
|
| 730 |
+
if system_prompt
|
| 731 |
+
else None,
|
| 732 |
+
{
|
| 733 |
+
"role": "user",
|
| 734 |
+
"content": [
|
| 735 |
+
{"type": "text", "text": prompt},
|
| 736 |
+
{
|
| 737 |
+
"type": "image_url",
|
| 738 |
+
"image_url": {
|
| 739 |
+
"url": f"data:image/jpeg;base64,{image_data}"
|
| 740 |
+
},
|
| 741 |
+
},
|
| 742 |
+
],
|
| 743 |
+
}
|
| 744 |
+
if image_data
|
| 745 |
+
else {"role": "user", "content": prompt},
|
| 746 |
+
],
|
| 747 |
+
api_key=api_key,
|
| 748 |
+
base_url=base_url,
|
| 749 |
+
**kwargs,
|
| 750 |
+
)
|
| 751 |
+
# Pure text format
|
| 752 |
+
else:
|
| 753 |
+
return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
|
| 754 |
+
|
| 755 |
+
# Now use existing LightRAG instance to initialize RAGAnything
|
| 756 |
+
rag = RAGAnything(
|
| 757 |
+
lightrag=lightrag_instance, # Pass existing LightRAG instance
|
| 758 |
+
vision_model_func=vision_model_func,
|
| 759 |
+
# Note: working_dir, llm_model_func, embedding_func, etc. are inherited from lightrag_instance
|
| 760 |
+
)
|
| 761 |
+
|
| 762 |
+
# Query existing knowledge base
|
| 763 |
+
result = await rag.aquery(
|
| 764 |
+
"What data has been processed in this LightRAG instance?",
|
| 765 |
+
mode="hybrid"
|
| 766 |
+
)
|
| 767 |
+
print("Query result:", result)
|
| 768 |
+
|
| 769 |
+
# Add new multimodal document to existing LightRAG instance
|
| 770 |
+
await rag.process_document_complete(
|
| 771 |
+
file_path="path/to/new/multimodal_document.pdf",
|
| 772 |
+
output_dir="./output"
|
| 773 |
+
)
|
| 774 |
+
|
| 775 |
+
if __name__ == "__main__":
|
| 776 |
+
asyncio.run(load_existing_lightrag())
|
| 777 |
+
```
|
| 778 |
+
|
| 779 |
+
#### 7. Direct Content List Insertion
|
| 780 |
+
|
| 781 |
+
For scenarios where you already have a pre-parsed content list (e.g., from external parsers or previous processing), you can directly insert it into RAGAnything without document parsing:
|
| 782 |
+
|
| 783 |
+
```python
|
| 784 |
+
import asyncio
|
| 785 |
+
from raganything import RAGAnything, RAGAnythingConfig
|
| 786 |
+
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 787 |
+
from lightrag.utils import EmbeddingFunc
|
| 788 |
+
|
| 789 |
+
async def insert_content_list_example():
|
| 790 |
+
# Set up API configuration
|
| 791 |
+
api_key = "your-api-key"
|
| 792 |
+
base_url = "your-base-url" # Optional
|
| 793 |
+
|
| 794 |
+
# Create RAGAnything configuration
|
| 795 |
+
config = RAGAnythingConfig(
|
| 796 |
+
working_dir="./rag_storage",
|
| 797 |
+
enable_image_processing=True,
|
| 798 |
+
enable_table_processing=True,
|
| 799 |
+
enable_equation_processing=True,
|
| 800 |
+
)
|
| 801 |
+
|
| 802 |
+
# Define model functions
|
| 803 |
+
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
|
| 804 |
+
return openai_complete_if_cache(
|
| 805 |
+
"gpt-4o-mini",
|
| 806 |
+
prompt,
|
| 807 |
+
system_prompt=system_prompt,
|
| 808 |
+
history_messages=history_messages,
|
| 809 |
+
api_key=api_key,
|
| 810 |
+
base_url=base_url,
|
| 811 |
+
**kwargs,
|
| 812 |
+
)
|
| 813 |
+
|
| 814 |
+
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
|
| 815 |
+
# If messages format is provided (for multimodal VLM enhanced query), use it directly
|
| 816 |
+
if messages:
|
| 817 |
+
return openai_complete_if_cache(
|
| 818 |
+
"gpt-4o",
|
| 819 |
+
"",
|
| 820 |
+
system_prompt=None,
|
| 821 |
+
history_messages=[],
|
| 822 |
+
messages=messages,
|
| 823 |
+
api_key=api_key,
|
| 824 |
+
base_url=base_url,
|
| 825 |
+
**kwargs,
|
| 826 |
+
)
|
| 827 |
+
# Traditional single image format
|
| 828 |
+
elif image_data:
|
| 829 |
+
return openai_complete_if_cache(
|
| 830 |
+
"gpt-4o",
|
| 831 |
+
"",
|
| 832 |
+
system_prompt=None,
|
| 833 |
+
history_messages=[],
|
| 834 |
+
messages=[
|
| 835 |
+
{"role": "system", "content": system_prompt} if system_prompt else None,
|
| 836 |
+
{
|
| 837 |
+
"role": "user",
|
| 838 |
+
"content": [
|
| 839 |
+
{"type": "text", "text": prompt},
|
| 840 |
+
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
|
| 841 |
+
],
|
| 842 |
+
} if image_data else {"role": "user", "content": prompt},
|
| 843 |
+
],
|
| 844 |
+
api_key=api_key,
|
| 845 |
+
base_url=base_url,
|
| 846 |
+
**kwargs,
|
| 847 |
+
)
|
| 848 |
+
# Pure text format
|
| 849 |
+
else:
|
| 850 |
+
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
|
| 851 |
+
|
| 852 |
+
embedding_func = EmbeddingFunc(
|
| 853 |
+
embedding_dim=3072,
|
| 854 |
+
max_token_size=8192,
|
| 855 |
+
func=lambda texts: openai_embed(
|
| 856 |
+
texts,
|
| 857 |
+
model="text-embedding-3-large",
|
| 858 |
+
api_key=api_key,
|
| 859 |
+
base_url=base_url,
|
| 860 |
+
),
|
| 861 |
+
)
|
| 862 |
+
|
| 863 |
+
# Initialize RAGAnything
|
| 864 |
+
rag = RAGAnything(
|
| 865 |
+
config=config,
|
| 866 |
+
llm_model_func=llm_model_func,
|
| 867 |
+
vision_model_func=vision_model_func,
|
| 868 |
+
embedding_func=embedding_func,
|
| 869 |
+
)
|
| 870 |
+
|
| 871 |
+
# Example: Pre-parsed content list from external source
|
| 872 |
+
content_list = [
|
| 873 |
+
{
|
| 874 |
+
"type": "text",
|
| 875 |
+
"text": "This is the introduction section of our research paper.",
|
| 876 |
+
"page_idx": 0 # Page number where this content appears
|
| 877 |
+
},
|
| 878 |
+
{
|
| 879 |
+
"type": "image",
|
| 880 |
+
"img_path": "/absolute/path/to/figure1.jpg", # IMPORTANT: Use absolute path
|
| 881 |
+
"image_caption": ["Figure 1: System Architecture"],
|
| 882 |
+
"image_footnote": ["Source: Authors' original design"],
|
| 883 |
+
"page_idx": 1 # Page number where this image appears
|
| 884 |
+
},
|
| 885 |
+
{
|
| 886 |
+
"type": "table",
|
| 887 |
+
"table_body": "| Method | Accuracy | F1-Score |\n|--------|----------|----------|\n| Ours | 95.2% | 0.94 |\n| Baseline | 87.3% | 0.85 |",
|
| 888 |
+
"table_caption": ["Table 1: Performance Comparison"],
|
| 889 |
+
"table_footnote": ["Results on test dataset"],
|
| 890 |
+
"page_idx": 2 # Page number where this table appears
|
| 891 |
+
},
|
| 892 |
+
{
|
| 893 |
+
"type": "equation",
|
| 894 |
+
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
|
| 895 |
+
"text": "Document relevance probability formula",
|
| 896 |
+
"page_idx": 3 # Page number where this equation appears
|
| 897 |
+
},
|
| 898 |
+
{
|
| 899 |
+
"type": "text",
|
| 900 |
+
"text": "In conclusion, our method demonstrates superior performance across all metrics.",
|
| 901 |
+
"page_idx": 4 # Page number where this content appears
|
| 902 |
+
}
|
| 903 |
+
]
|
| 904 |
+
|
| 905 |
+
# Insert the content list directly
|
| 906 |
+
await rag.insert_content_list(
|
| 907 |
+
content_list=content_list,
|
| 908 |
+
file_path="research_paper.pdf", # Reference file name for citation
|
| 909 |
+
split_by_character=None, # Optional text splitting
|
| 910 |
+
split_by_character_only=False, # Optional text splitting mode
|
| 911 |
+
doc_id=None, # Optional custom document ID (will be auto-generated if not provided)
|
| 912 |
+
display_stats=True # Show content statistics
|
| 913 |
+
)
|
| 914 |
+
|
| 915 |
+
# Query the inserted content
|
| 916 |
+
result = await rag.aquery(
|
| 917 |
+
"What are the key findings and performance metrics mentioned in the research?",
|
| 918 |
+
mode="hybrid"
|
| 919 |
+
)
|
| 920 |
+
print("Query result:", result)
|
| 921 |
+
|
| 922 |
+
# You can also insert multiple content lists with different document IDs
|
| 923 |
+
another_content_list = [
|
| 924 |
+
{
|
| 925 |
+
"type": "text",
|
| 926 |
+
"text": "This is content from another document.",
|
| 927 |
+
"page_idx": 0 # Page number where this content appears
|
| 928 |
+
},
|
| 929 |
+
{
|
| 930 |
+
"type": "table",
|
| 931 |
+
"table_body": "| Feature | Value |\n|---------|-------|\n| Speed | Fast |\n| Accuracy | High |",
|
| 932 |
+
"table_caption": ["Feature Comparison"],
|
| 933 |
+
"page_idx": 1 # Page number where this table appears
|
| 934 |
+
}
|
| 935 |
+
]
|
| 936 |
+
|
| 937 |
+
await rag.insert_content_list(
|
| 938 |
+
content_list=another_content_list,
|
| 939 |
+
file_path="another_document.pdf",
|
| 940 |
+
doc_id="custom-doc-id-123" # Custom document ID
|
| 941 |
+
)
|
| 942 |
+
|
| 943 |
+
if __name__ == "__main__":
|
| 944 |
+
asyncio.run(insert_content_list_example())
|
| 945 |
+
```
|
| 946 |
+
|
| 947 |
+
**Content List Format:**
|
| 948 |
+
|
| 949 |
+
The `content_list` should follow the standard format: each item is a dictionary containing:
|
| 950 |
+
|
| 951 |
+
- **Text content**: `{"type": "text", "text": "content text", "page_idx": 0}`
|
| 952 |
+
- **Image content**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "image_caption": ["caption"], "image_footnote": ["note"], "page_idx": 1}`
|
| 953 |
+
- **Table content**: `{"type": "table", "table_body": "markdown table", "table_caption": ["caption"], "table_footnote": ["note"], "page_idx": 2}`
|
| 954 |
+
- **Equation content**: `{"type": "equation", "latex": "LaTeX formula", "text": "description", "page_idx": 3}`
|
| 955 |
+
- **Generic content**: `{"type": "custom_type", "content": "any content", "page_idx": 4}`
|
| 956 |
+
|
| 957 |
+
**Important Notes:**
|
| 958 |
+
- **`img_path`**: Must be an absolute path to the image file (e.g., `/home/user/images/chart.jpg` or `C:\Users\user\images\chart.jpg`)
|
| 959 |
+
- **`page_idx`**: Represents the page number where the content appears in the original document (0-based indexing)
|
| 960 |
+
- **Content ordering**: Items are processed in the order they appear in the list
|
| 961 |
+
|
| 962 |
+
This method is particularly useful when:
|
| 963 |
+
- You have content from external parsers (non-MinerU/Docling)
|
| 964 |
+
- You want to process programmatically generated content
|
| 965 |
+
- You need to insert content from multiple sources into a single knowledge base
|
| 966 |
+
- You have cached parsing results that you want to reuse
|
| 967 |
+
|
| 968 |
+
---
|
| 969 |
+
|
| 970 |
+
## 🛠️ Examples
|
| 971 |
+
|
| 972 |
+
*Practical Implementation Demos*
|
| 973 |
+
|
| 974 |
+
<div align="center">
|
| 975 |
+
<img src="https://user-images.githubusercontent.com/74038190/212257455-13e3e01e-d6a6-45dc-bb92-3ab87b12dfc1.gif" width="300">
|
| 976 |
+
</div>
|
| 977 |
+
|
| 978 |
+
The `examples/` directory contains comprehensive usage examples:
|
| 979 |
+
|
| 980 |
+
- **`raganything_example.py`**: End-to-end document processing with MinerU
|
| 981 |
+
- **`modalprocessors_example.py`**: Direct multimodal content processing
|
| 982 |
+
- **`office_document_test.py`**: Office document parsing test with MinerU (no API key required)
|
| 983 |
+
- **`image_format_test.py`**: Image format parsing test with MinerU (no API key required)
|
| 984 |
+
- **`text_format_test.py`**: Text format parsing test with MinerU (no API key required)
|
| 985 |
+
|
| 986 |
+
**Run examples:**
|
| 987 |
+
|
| 988 |
+
```bash
|
| 989 |
+
# End-to-end processing with parser selection
|
| 990 |
+
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
|
| 991 |
+
|
| 992 |
+
# Direct modal processing
|
| 993 |
+
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
|
| 994 |
+
|
| 995 |
+
# Office document parsing test (MinerU only)
|
| 996 |
+
python examples/office_document_test.py --file path/to/document.docx
|
| 997 |
+
|
| 998 |
+
# Image format parsing test (MinerU only)
|
| 999 |
+
python examples/image_format_test.py --file path/to/image.bmp
|
| 1000 |
+
|
| 1001 |
+
# Text format parsing test (MinerU only)
|
| 1002 |
+
python examples/text_format_test.py --file path/to/document.md
|
| 1003 |
+
|
| 1004 |
+
# Check LibreOffice installation
|
| 1005 |
+
python examples/office_document_test.py --check-libreoffice --file dummy
|
| 1006 |
+
|
| 1007 |
+
# Check PIL/Pillow installation
|
| 1008 |
+
python examples/image_format_test.py --check-pillow --file dummy
|
| 1009 |
+
|
| 1010 |
+
# Check ReportLab installation
|
| 1011 |
+
python examples/text_format_test.py --check-reportlab --file dummy
|
| 1012 |
+
```
|
| 1013 |
+
|
| 1014 |
+
---
|
| 1015 |
+
|
| 1016 |
+
## 🔧 Configuration
|
| 1017 |
+
|
| 1018 |
+
*System Optimization Parameters*
|
| 1019 |
+
|
| 1020 |
+
### Environment Variables
|
| 1021 |
+
|
| 1022 |
+
Create a `.env` file (refer to `.env.example`):
|
| 1023 |
+
|
| 1024 |
+
```bash
|
| 1025 |
+
OPENAI_API_KEY=your_openai_api_key
|
| 1026 |
+
OPENAI_BASE_URL=your_base_url # Optional
|
| 1027 |
+
OUTPUT_DIR=./output # Default output directory for parsed documents
|
| 1028 |
+
PARSER=mineru # Parser selection: mineru or docling
|
| 1029 |
+
PARSE_METHOD=auto # Parse method: auto, ocr, or txt
|
| 1030 |
+
```
|
| 1031 |
+
|
| 1032 |
+
**Note:** For backward compatibility, legacy environment variable names are still supported:
|
| 1033 |
+
- `MINERU_PARSE_METHOD` is deprecated; please use `PARSE_METHOD` instead
|
| 1034 |
+
|
| 1035 |
+
> **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test parser functionality and do not require API keys.
|
| 1036 |
+
|
| 1037 |
+
### Parser Configuration
|
| 1038 |
+
|
| 1039 |
+
RAGAnything now supports multiple parsers, each with specific advantages:
|
| 1040 |
+
|
| 1041 |
+
#### MinerU Parser
|
| 1042 |
+
- Supports PDF, images, Office documents, and more formats
|
| 1043 |
+
- Powerful OCR and table extraction capabilities
|
| 1044 |
+
- GPU acceleration support
|
| 1045 |
+
|
| 1046 |
+
#### Docling Parser
|
| 1047 |
+
- Optimized for Office documents and HTML files
|
| 1048 |
+
- Better document structure preservation
|
| 1049 |
+
- Native support for multiple Office formats
|
| 1050 |
+
|
| 1051 |
+
### MinerU Configuration
|
| 1052 |
+
|
| 1053 |
+
```bash
|
| 1054 |
+
# MinerU 2.0 uses command-line parameters instead of config files
|
| 1055 |
+
# Check available options:
|
| 1056 |
+
mineru --help
|
| 1057 |
+
|
| 1058 |
+
# Common configurations:
|
| 1059 |
+
mineru -p input.pdf -o output_dir -m auto # Automatic parsing mode
|
| 1060 |
+
mineru -p input.pdf -o output_dir -m ocr # OCR-focused parsing
|
| 1061 |
+
mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU acceleration
|
| 1062 |
+
```
|
| 1063 |
+
|
| 1064 |
+
You can also configure parsing through RAGAnything parameters:
|
| 1065 |
+
|
| 1066 |
+
```python
|
| 1067 |
+
# Basic parsing configuration with parser selection
|
| 1068 |
+
await rag.process_document_complete(
|
| 1069 |
+
file_path="document.pdf",
|
| 1070 |
+
output_dir="./output/",
|
| 1071 |
+
parse_method="auto", # or "ocr", "txt"
|
| 1072 |
+
parser="mineru" # Optional: "mineru" or "docling"
|
| 1073 |
+
)
|
| 1074 |
+
|
| 1075 |
+
# Advanced parsing configuration with special parameters
|
| 1076 |
+
await rag.process_document_complete(
|
| 1077 |
+
file_path="document.pdf",
|
| 1078 |
+
output_dir="./output/",
|
| 1079 |
+
parse_method="auto", # Parsing method: "auto", "ocr", "txt"
|
| 1080 |
+
parser="mineru", # Parser selection: "mineru" or "docling"
|
| 1081 |
+
|
| 1082 |
+
# MinerU special parameters - all supported kwargs:
|
| 1083 |
+
lang="ch", # Document language for OCR optimization (e.g., "ch", "en", "ja")
|
| 1084 |
+
device="cuda:0", # Inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
|
| 1085 |
+
start_page=0, # Starting page number (0-based, for PDF)
|
| 1086 |
+
end_page=10, # Ending page number (0-based, for PDF)
|
| 1087 |
+
formula=True, # Enable formula parsing
|
| 1088 |
+
table=True, # Enable table parsing
|
| 1089 |
+
backend="pipeline", # Parsing backend: pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client.
|
| 1090 |
+
source="huggingface", # Model source: "huggingface", "modelscope", "local"
|
| 1091 |
+
# vlm_url="http://127.0.0.1:3000" # Service address when using backend=vlm-sglang-client
|
| 1092 |
+
|
| 1093 |
+
# Standard RAGAnything parameters
|
| 1094 |
+
display_stats=True, # Display content statistics
|
| 1095 |
+
split_by_character=None, # Optional character to split text by
|
| 1096 |
+
doc_id=None # Optional document ID
|
| 1097 |
+
)
|
| 1098 |
+
```
|
| 1099 |
+
|
| 1100 |
+
> **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling based on your needs.
|
| 1101 |
+
|
| 1102 |
+
### Processing Requirements
|
| 1103 |
+
|
| 1104 |
+
Different content types require specific optional dependencies:
|
| 1105 |
+
|
| 1106 |
+
- **Office Documents** (.doc, .docx, .ppt, .pptx, .xls, .xlsx): Install [LibreOffice](https://www.libreoffice.org/download/download/)
|
| 1107 |
+
- **Extended Image Formats** (.bmp, .tiff, .gif, .webp): Install with `pip install raganything[image]`
|
| 1108 |
+
- **Text Files** (.txt, .md): Install with `pip install raganything[text]`
|
| 1109 |
+
|
| 1110 |
+
> **📋 Quick Install**: Use `pip install raganything[all]` to enable all format support (Python dependencies only - LibreOffice still needs separate installation)
|
| 1111 |
+
|
| 1112 |
+
---
|
| 1113 |
+
|
| 1114 |
+
## 🧪 Supported Content Types
|
| 1115 |
+
|
| 1116 |
+
### Document Formats
|
| 1117 |
+
|
| 1118 |
+
- **PDFs** - Research papers, reports, presentations
|
| 1119 |
+
- **Office Documents** - DOC, DOCX, PPT, PPTX, XLS, XLSX
|
| 1120 |
+
- **Images** - JPG, PNG, BMP, TIFF, GIF, WebP
|
| 1121 |
+
- **Text Files** - TXT, MD
|
| 1122 |
+
|
| 1123 |
+
### Multimodal Elements
|
| 1124 |
+
|
| 1125 |
+
- **Images** - Photographs, diagrams, charts, screenshots
|
| 1126 |
+
- **Tables** - Data tables, comparison charts, statistical summaries
|
| 1127 |
+
- **Equations** - Mathematical formulas in LaTeX format
|
| 1128 |
+
- **Generic Content** - Custom content types via extensible processors
|
| 1129 |
+
|
| 1130 |
+
*For installation of format-specific dependencies, see the [Configuration](#-configuration) section.*
|
| 1131 |
+
|
| 1132 |
+
---
|
| 1133 |
+
|
| 1134 |
+
## 📖 Citation
|
| 1135 |
+
|
| 1136 |
+
*Academic Reference*
|
| 1137 |
+
|
| 1138 |
+
<div align="center">
|
| 1139 |
+
<div style="width: 60px; height: 60px; margin: 20px auto; position: relative;">
|
| 1140 |
+
<div style="width: 100%; height: 100%; border: 2px solid #00d9ff; border-radius: 50%; position: relative;">
|
| 1141 |
+
<div style="position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); font-size: 24px; color: #00d9ff;">📖</div>
|
| 1142 |
+
</div>
|
| 1143 |
+
<div style="position: absolute; bottom: -5px; left: 50%; transform: translateX(-50%); width: 20px; height: 20px; background: white; border-right: 2px solid #00d9ff; border-bottom: 2px solid #00d9ff; transform: rotate(45deg);"></div>
|
| 1144 |
+
</div>
|
| 1145 |
+
</div>
|
| 1146 |
+
|
| 1147 |
+
If you find RAG-Anything useful in your research, please cite our paper:
|
| 1148 |
+
|
| 1149 |
+
```bibtex
|
| 1150 |
+
@article{guo2024lightrag,
|
| 1151 |
+
title={LightRAG: Simple and Fast Retrieval-Augmented Generation},
|
| 1152 |
+
author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang},
|
| 1153 |
+
year={2024},
|
| 1154 |
+
eprint={2410.05779},
|
| 1155 |
+
archivePrefix={arXiv},
|
| 1156 |
+
primaryClass={cs.IR}
|
| 1157 |
+
}
|
| 1158 |
+
```
|
| 1159 |
+
|
| 1160 |
+
---
|
| 1161 |
+
|
| 1162 |
+
## 🔗 Related Projects
|
| 1163 |
+
|
| 1164 |
+
*Ecosystem & Extensions*
|
| 1165 |
+
|
| 1166 |
+
<div align="center">
|
| 1167 |
+
<table>
|
| 1168 |
+
<tr>
|
| 1169 |
+
<td align="center">
|
| 1170 |
+
<a href="https://github.com/HKUDS/LightRAG">
|
| 1171 |
+
<div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
|
| 1172 |
+
<span style="font-size: 32px;">⚡</span>
|
| 1173 |
+
</div>
|
| 1174 |
+
<b>LightRAG</b><br>
|
| 1175 |
+
<sub>Simple and Fast RAG</sub>
|
| 1176 |
+
</a>
|
| 1177 |
+
</td>
|
| 1178 |
+
<td align="center">
|
| 1179 |
+
<a href="https://github.com/HKUDS/VideoRAG">
|
| 1180 |
+
<div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
|
| 1181 |
+
<span style="font-size: 32px;">🎥</span>
|
| 1182 |
+
</div>
|
| 1183 |
+
<b>VideoRAG</b><br>
|
| 1184 |
+
<sub>Extreme Long-Context Video RAG</sub>
|
| 1185 |
+
</a>
|
| 1186 |
+
</td>
|
| 1187 |
+
<td align="center">
|
| 1188 |
+
<a href="https://github.com/HKUDS/MiniRAG">
|
| 1189 |
+
<div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
|
| 1190 |
+
<span style="font-size: 32px;">✨</span>
|
| 1191 |
+
</div>
|
| 1192 |
+
<b>MiniRAG</b><br>
|
| 1193 |
+
<sub>Extremely Simple RAG</sub>
|
| 1194 |
+
</a>
|
| 1195 |
+
</td>
|
| 1196 |
+
</tr>
|
| 1197 |
+
</table>
|
| 1198 |
+
</div>
|
| 1199 |
+
|
| 1200 |
+
---
|
| 1201 |
+
|
| 1202 |
+
## ⭐ Star History
|
| 1203 |
+
|
| 1204 |
+
*Community Growth Trajectory*
|
| 1205 |
+
|
| 1206 |
+
<div align="center">
|
| 1207 |
+
<a href="https://star-history.com/#HKUDS/RAG-Anything&Date">
|
| 1208 |
+
<picture>
|
| 1209 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date&theme=dark" />
|
| 1210 |
+
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date" />
|
| 1211 |
+
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date" style="border-radius: 15px; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);" />
|
| 1212 |
+
</picture>
|
| 1213 |
+
</a>
|
| 1214 |
+
</div>
|
| 1215 |
+
|
| 1216 |
+
---
|
| 1217 |
+
|
| 1218 |
+
## 🤝 Contribution
|
| 1219 |
+
|
| 1220 |
+
*Join the Innovation*
|
| 1221 |
+
|
| 1222 |
+
<div align="center">
|
| 1223 |
+
We thank all our contributors for their valuable contributions.
|
| 1224 |
+
</div>
|
| 1225 |
+
|
| 1226 |
+
<div align="center">
|
| 1227 |
+
<a href="https://github.com/HKUDS/RAG-Anything/graphs/contributors">
|
| 1228 |
+
<img src="https://contrib.rocks/image?repo=HKUDS/RAG-Anything" style="border-radius: 15px; box-shadow: 0 0 20px rgba(0, 217, 255, 0.3);" />
|
| 1229 |
+
</a>
|
| 1230 |
+
</div>
|
| 1231 |
+
|
| 1232 |
+
---
|
| 1233 |
+
|
| 1234 |
+
<div align="center" style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 30px; margin: 30px 0;">
|
| 1235 |
+
<div>
|
| 1236 |
+
<img src="https://user-images.githubusercontent.com/74038190/212284100-561aa473-3905-4a80-b561-0d28506553ee.gif" width="500">
|
| 1237 |
+
</div>
|
| 1238 |
+
<div style="margin-top: 20px;">
|
| 1239 |
+
<a href="https://github.com/HKUDS/RAG-Anything" style="text-decoration: none;">
|
| 1240 |
+
<img src="https://img.shields.io/badge/⭐%20Star%20us%20on%20GitHub-1a1a2e?style=for-the-badge&logo=github&logoColor=white">
|
| 1241 |
+
</a>
|
| 1242 |
+
<a href="https://github.com/HKUDS/RAG-Anything/issues" style="text-decoration: none;">
|
| 1243 |
+
<img src="https://img.shields.io/badge/🐛%20Report%20Issues-ff6b6b?style=for-the-badge&logo=github&logoColor=white">
|
| 1244 |
+
</a>
|
| 1245 |
+
<a href="https://github.com/HKUDS/RAG-Anything/discussions" style="text-decoration: none;">
|
| 1246 |
+
<img src="https://img.shields.io/badge/💬%20Discussions-4ecdc4?style=for-the-badge&logo=github&logoColor=white">
|
| 1247 |
+
</a>
|
| 1248 |
+
</div>
|
| 1249 |
+
</div>
|
| 1250 |
+
|
| 1251 |
+
<div align="center">
|
| 1252 |
+
<div style="width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);">
|
| 1253 |
+
<div style="display: flex; justify-content: center; align-items: center; gap: 15px;">
|
| 1254 |
+
<span style="font-size: 24px;">⭐</span>
|
| 1255 |
+
<span style="color: #00d9ff; font-size: 18px;">Thank you for visiting RAG-Anything!</span>
|
| 1256 |
+
<span style="font-size: 24px;">⭐</span>
|
| 1257 |
+
</div>
|
| 1258 |
+
<div style="margin-top: 10px; color: #00d9ff; font-size: 16px;">Building the Future of Multimodal AI</div>
|
| 1259 |
+
</div>
|
| 1260 |
+
</div>
|
rag_anything_smaranika/README_zh.md
ADDED
|
@@ -0,0 +1,1258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<div align="center">
|
| 2 |
+
|
| 3 |
+
<div style="margin: 20px 0;">
|
| 4 |
+
<img src="./assets/logo.png" width="120" height="120" alt="RAG-Anything Logo" style="border-radius: 20px; box-shadow: 0 8px 32px rgba(0, 217, 255, 0.3);">
|
| 5 |
+
</div>
|
| 6 |
+
|
| 7 |
+
# 🚀 RAG-Anything: All-in-One RAG System
|
| 8 |
+
|
| 9 |
+
<div align="center">
|
| 10 |
+
<div style="width: 100%; height: 2px; margin: 20px 0; background: linear-gradient(90deg, transparent, #00d9ff, transparent);"></div>
|
| 11 |
+
</div>
|
| 12 |
+
|
| 13 |
+
<div align="center">
|
| 14 |
+
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;">
|
| 15 |
+
<p>
|
| 16 |
+
<a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥项目-主页-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>
|
| 17 |
+
<a href='https://arxiv.org/abs/2410.05779'><img src='https://img.shields.io/badge/📄arXiv-2410.05779-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
|
| 18 |
+
<a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡基于-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>
|
| 19 |
+
</p>
|
| 20 |
+
<p>
|
| 21 |
+
<a href="https://github.com/HKUDS/RAG-Anything/stargazers"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>
|
| 22 |
+
<img src="https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
|
| 23 |
+
<a href="https://pypi.org/project/raganything/"><img src="https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b"></a>
|
| 24 |
+
</p>
|
| 25 |
+
<p>
|
| 26 |
+
<a href="https://discord.gg/yF2MmDJyGJ"><img src="https://img.shields.io/badge/💬Discord-社区-7289da?style=for-the-badge&logo=discord&logoColor=white&labelColor=1a1a2e"></a>
|
| 27 |
+
<a href="https://github.com/HKUDS/RAG-Anything/issues/7"><img src="https://img.shields.io/badge/💬微信群-交流-07c160?style=for-the-badge&logo=wechat&logoColor=white&labelColor=1a1a2e"></a>
|
| 28 |
+
</p>
|
| 29 |
+
<p>
|
| 30 |
+
<a href="README_zh.md"><img src="https://img.shields.io/badge/🇨🇳中文版-1a1a2e?style=for-the-badge"></a>
|
| 31 |
+
<a href="README.md"><img src="https://img.shields.io/badge/🇺🇸English-1a1a2e?style=for-the-badge"></a>
|
| 32 |
+
</p>
|
| 33 |
+
</div>
|
| 34 |
+
</div>
|
| 35 |
+
|
| 36 |
+
</div>
|
| 37 |
+
|
| 38 |
+
<div align="center" style="margin: 30px 0;">
|
| 39 |
+
<img src="https://user-images.githubusercontent.com/74038190/212284100-561aa473-3905-4a80-b561-0d28506553ee.gif" width="800">
|
| 40 |
+
</div>
|
| 41 |
+
|
| 42 |
+
<div align="center">
|
| 43 |
+
<a href="#-快速开始" style="text-decoration: none;">
|
| 44 |
+
<img src="https://img.shields.io/badge/快速开始-立即开始使用-00d9ff?style=for-the-badge&logo=rocket&logoColor=white&labelColor=1a1a2e">
|
| 45 |
+
</a>
|
| 46 |
+
</div>
|
| 47 |
+
|
| 48 |
+
---
|
| 49 |
+
|
| 50 |
+
## 🎉 新闻
|
| 51 |
+
- [X] [2025.08.12]🎯📢 🔍 RAGAnything 现在支持 **VLM增强查询** 模式!当文档包含图片时,系统可以自动将图片与文本上下文一起直接传递给VLM进行综合多模态分析。
|
| 52 |
+
- [X] [2025.07.05]🎯📢 RAGAnything 新增[上下文配置模块](docs/context_aware_processing.md),支持为多模态内容处理添加相关上下文信息。
|
| 53 |
+
- [X] [2025.07.04]🎯📢 RAGAnything 现在支持多模态内容查询,实现了集成文本、图像、表格和公式处理的增强检索生成功能。
|
| 54 |
+
- [X] [2025.07.03]🎯📢 RAGAnything 在GitHub上达到了1K星标🌟!感谢您的支持和贡献。
|
| 55 |
+
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
## 🌟 系统概述
|
| 59 |
+
|
| 60 |
+
*下一代多模态智能*
|
| 61 |
+
|
| 62 |
+
<div style="background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border: 2px solid #00d9ff; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);">
|
| 63 |
+
|
| 64 |
+
**RAG-Anything**是一个综合性多模态文档处理RAG系统。该系统能够无缝处理和查询包含文本、图像、表格、公式等多模态内容的复杂文档,提供完整的检索增强(RAG)生成解决方案。
|
| 65 |
+
|
| 66 |
+
<img src="assets/rag_anything_framework.png" alt="RAG-Anything" />
|
| 67 |
+
|
| 68 |
+
</div>
|
| 69 |
+
|
| 70 |
+
### 🎯 核心特性
|
| 71 |
+
|
| 72 |
+
<div style="background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 15px; padding: 25px; margin: 20px 0;">
|
| 73 |
+
|
| 74 |
+
- **🔄 端到端多模态处理流水线** - 提供从文档解析到多模态查询响应的完整处理链路,确保系统的一体化运行
|
| 75 |
+
- **📄 多格式文档支持** - 支持PDF、Office文档(DOC/DOCX/PPT/PPTX/XLS/XLSX)、图像等主流文档格式的统一处理和解析
|
| 76 |
+
- **🧠 多模态内容分析引擎** - 针对图像、表格、公式和通用文本内容部署专门的处理器,确保各类内容的精准解析
|
| 77 |
+
- **🔗 基于知识图谱索引** - 实现自动化实体提取和关系构建,建立跨模态的语义连接网络
|
| 78 |
+
- **⚡ 灵活的处理架构** - 支持基于MinerU的智能解析模式和直接多模态内容插入模式,满足不同应用场景需求
|
| 79 |
+
- **📋 直接内容列表插入** - 跳过文档解析,直接插入来自外部源的预解析内容列表,支持多种数据来源整合
|
| 80 |
+
- **🎯 跨模态检索机制** - 实现跨文本和多模态内容的智能检索,提供精准的信息定位和匹配能力
|
| 81 |
+
|
| 82 |
+
</div>
|
| 83 |
+
|
| 84 |
+
---
|
| 85 |
+
|
| 86 |
+
## 🏗️ 算法原理与架构
|
| 87 |
+
|
| 88 |
+
<div style="background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border-left: 5px solid #00d9ff;">
|
| 89 |
+
|
| 90 |
+
### 核心算法
|
| 91 |
+
|
| 92 |
+
**RAG-Anything** 采用灵活的分层架构设计,实现多阶段多模态处理流水线,将传统RAG系统扩展为支持异构内容类型的综合处理平台。
|
| 93 |
+
|
| 94 |
+
</div>
|
| 95 |
+
|
| 96 |
+
<div align="center">
|
| 97 |
+
<div style="width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);">
|
| 98 |
+
<div style="display: flex; justify-content: space-around; align-items: center; flex-wrap: wrap; gap: 20px;">
|
| 99 |
+
<div style="text-align: center;">
|
| 100 |
+
<div style="font-size: 24px; margin-bottom: 10px;">📄</div>
|
| 101 |
+
<div style="font-size: 14px; color: #00d9ff;">文档解析</div>
|
| 102 |
+
</div>
|
| 103 |
+
<div style="font-size: 20px; color: #00d9ff;">→</div>
|
| 104 |
+
<div style="text-align: center;">
|
| 105 |
+
<div style="font-size: 24px; margin-bottom: 10px;">🧠</div>
|
| 106 |
+
<div style="font-size: 14px; color: #00d9ff;">内容分析</div>
|
| 107 |
+
</div>
|
| 108 |
+
<div style="font-size: 20px; color: #00d9ff;">→</div>
|
| 109 |
+
<div style="text-align: center;">
|
| 110 |
+
<div style="font-size: 24px; margin-bottom: 10px;">🔍</div>
|
| 111 |
+
<div style="font-size: 14px; color: #00d9ff;">知识图谱</div>
|
| 112 |
+
</div>
|
| 113 |
+
<div style="font-size: 20px; color: #00d9ff;">→</div>
|
| 114 |
+
<div style="text-align: center;">
|
| 115 |
+
<div style="font-size: 24px; margin-bottom: 10px;">🎯</div>
|
| 116 |
+
<div style="font-size: 14px; color: #00d9ff;">智能检索</div>
|
| 117 |
+
</div>
|
| 118 |
+
</div>
|
| 119 |
+
</div>
|
| 120 |
+
</div>
|
| 121 |
+
|
| 122 |
+
### 1. 文档解析阶段
|
| 123 |
+
|
| 124 |
+
<div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
|
| 125 |
+
|
| 126 |
+
该系统构建了高精度文档解析平台,通过结构化提取引擎实现多模态元素的完整识别与提取。系统采用自适应内容分解机制,智能分离文档中的文本、图像、表格、公式等异构内容,并保持其语义关联性。同时支持PDF、Office文档、图像等主流格式的统一处理,提供标准化的多模态内容输出。
|
| 127 |
+
|
| 128 |
+
**核心组件:**
|
| 129 |
+
|
| 130 |
+
- **⚙️ 结构化提取引擎**:集成 [MinerU](https://github.com/opendatalab/MinerU) 文档解析框架,实现精确的文档结构识别与内容提取,确保多模态元素的完整性和准确性。
|
| 131 |
+
|
| 132 |
+
- **🧩 自适应内容分解机制**:建立智能内容分离系统,自动识别并提取文档中的文本块、图像、表格、公式等异构元素,保持元素间的语义关联关系。
|
| 133 |
+
|
| 134 |
+
- **📁 多格式兼容处理**:部署专业化解析器矩阵,支持PDF、Office文档系列(DOC/DOCX/PPT/PPTX/XLS/XLSX)、图像等主流格式的统一处理与标准化输出。
|
| 135 |
+
|
| 136 |
+
</div>
|
| 137 |
+
|
| 138 |
+
### 2. 多模态内容理解与处理
|
| 139 |
+
|
| 140 |
+
<div style="background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;">
|
| 141 |
+
|
| 142 |
+
该多模态内容处理系统通过自主分类路由机制实现异构内容的智能识别与优化分发。系统采用并发多流水线架构,确保文本和多模态内容的高效并行处理,在最大化吞吐量的同时保持内容完整性,并能完整提取和保持原始文档的层次结构与元素关联关系。
|
| 143 |
+
|
| 144 |
+
**核心组件:**
|
| 145 |
+
|
| 146 |
+
- **🎯 自主内容分类与路由**:自动识别、分类并将不同内容类型路由至优化的执行通道。
|
| 147 |
+
|
| 148 |
+
- **⚡ 并发多流水线架构**:通过专用处理流水线实现文本和多模态内容的并发执行。这种方法在保持内容完整性的同时最大化吞吐效率。
|
| 149 |
+
|
| 150 |
+
- **🏗️ 文档层次结构提取**:在内容转换过程中提取并保持原始文档的层次结构和元素间关系。
|
| 151 |
+
|
| 152 |
+
</div>
|
| 153 |
+
|
| 154 |
+
### 3. 多模态分析引擎
|
| 155 |
+
|
| 156 |
+
<div style="background: linear-gradient(90deg, #0f3460 0%, #1a1a2e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #00d9ff;">
|
| 157 |
+
|
| 158 |
+
系统部署了面向异构数据模态的模态感知处理单元:
|
| 159 |
+
|
| 160 |
+
**专用分析器:**
|
| 161 |
+
|
| 162 |
+
- **🔍 视觉内容分析器**:
|
| 163 |
+
- 集成视觉模型进行图像分析和内容识别
|
| 164 |
+
- 基于视觉语义生成上下文感知的描述性标题
|
| 165 |
+
- 提取视觉元素间的空间关系和层次结构
|
| 166 |
+
|
| 167 |
+
- **📊 结构化数据解释器**:
|
| 168 |
+
- 对表格和结构化数据格式进行系统性解释
|
| 169 |
+
- 实现数据趋势分析的统计模式识别算法
|
| 170 |
+
- 识别多个表格数据集间的语义关系和依赖性
|
| 171 |
+
|
| 172 |
+
- **📐 数学表达式解析器**:
|
| 173 |
+
- 高精度解析复杂数学表达式和公式
|
| 174 |
+
- 提供原生LaTeX格式支持以实现与学术工作流的无缝集成
|
| 175 |
+
- 建立数学方程与领域特定知识库间的概念映射
|
| 176 |
+
|
| 177 |
+
- **🔧 可扩展模态处理器**:
|
| 178 |
+
- 为自定义和新兴内容类型提供可配置的处理框架
|
| 179 |
+
- 通过插件架构实现新模态处理器的动态集成
|
| 180 |
+
- 支持专用场景下处理流水线的运行时配置
|
| 181 |
+
|
| 182 |
+
</div>
|
| 183 |
+
|
| 184 |
+
### 4. 多模态知识图谱索引
|
| 185 |
+
|
| 186 |
+
<div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
|
| 187 |
+
|
| 188 |
+
多模态知识图谱构建模块将文档内容转换为结构化语义表示。系统提取多模态实体,建立跨模态关系,并保持层次化组织结构。通过加权相关性评分实现优化的知识检索。
|
| 189 |
+
|
| 190 |
+
**核心功能:**
|
| 191 |
+
|
| 192 |
+
- **🔍 多模态实体提取**:将重要的多模态元素转换为结构化知识图谱实体。该过程包括语义标注和元数据保存。
|
| 193 |
+
|
| 194 |
+
- **🔗 跨模态关系映射**:在文本实体和多模态组件之间建立语义连接和依赖关系。通过自动化关系推理算法实现这一功能。
|
| 195 |
+
|
| 196 |
+
- **🏗️ 层次结构保持**:通过"归属于"关系链维护原始文档组织结构。这些关系链保持逻辑内容层次和章节依赖关系。
|
| 197 |
+
|
| 198 |
+
- **⚖️ 加权关系评分**:为关系类型分配定量相关性分数。评分基于语义邻近性和文档结构内的上下文重要性。
|
| 199 |
+
|
| 200 |
+
</div>
|
| 201 |
+
|
| 202 |
+
### 5. 模态感知检索
|
| 203 |
+
|
| 204 |
+
<div style="background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;">
|
| 205 |
+
|
| 206 |
+
混合检索系统结合向量相似性搜索与图遍历算法,实现全面的内容检索。系统实现模态感知排序机制,并维护检索元素间的关系一致性,确保上下文集成的信息传递。
|
| 207 |
+
|
| 208 |
+
**检索机制:**
|
| 209 |
+
|
| 210 |
+
- **🔀 向量-图谱融合**:集成向量相似性搜索与图遍历算法。该方法同时利用语义嵌入和结构关系实现全面的内容检索。
|
| 211 |
+
|
| 212 |
+
- **📊 模态感知排序**:实现基于内容类型相关性的自适应评分机制。系统根据查询特定的模态偏好调整排序结果。
|
| 213 |
+
|
| 214 |
+
- **🔗 关系一致性维护**:维护检索元素间的语义和结构关系。确保信息传递的连贯性和上下文完整性。
|
| 215 |
+
|
| 216 |
+
</div>
|
| 217 |
+
|
| 218 |
+
---
|
| 219 |
+
|
| 220 |
+
## 🚀 快速开始
|
| 221 |
+
|
| 222 |
+
*启动您的AI之旅*
|
| 223 |
+
|
| 224 |
+
<div align="center">
|
| 225 |
+
<img src="https://user-images.githubusercontent.com/74038190/212284158-e840e285-664b-44d7-b79b-e264b5e54825.gif" width="400">
|
| 226 |
+
</div>
|
| 227 |
+
|
| 228 |
+
### 安装
|
| 229 |
+
|
| 230 |
+
#### 选项1:从PyPI安装(推荐)
|
| 231 |
+
|
| 232 |
+
```bash
|
| 233 |
+
# 基础安装
|
| 234 |
+
pip install raganything
|
| 235 |
+
|
| 236 |
+
# 安装包含扩展格式支持的可选依赖:
|
| 237 |
+
pip install 'raganything[all]' # 所有可选功能
|
| 238 |
+
pip install 'raganything[image]' # 图像格式转换 (BMP, TIFF, GIF, WebP)
|
| 239 |
+
pip install 'raganything[text]' # 文本文件处理 (TXT, MD)
|
| 240 |
+
pip install 'raganything[image,text]' # 多个功能组合
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
#### 选项2:从源码安装
|
| 244 |
+
|
| 245 |
+
```bash
|
| 246 |
+
git clone https://github.com/HKUDS/RAG-Anything.git
|
| 247 |
+
cd RAG-Anything
|
| 248 |
+
pip install -e .
|
| 249 |
+
|
| 250 |
+
# 安装可选依赖
|
| 251 |
+
pip install -e '.[all]'
|
| 252 |
+
```
|
| 253 |
+
|
| 254 |
+
#### 可选依赖
|
| 255 |
+
|
| 256 |
+
- **`[image]`** - 启用BMP、TIFF、GIF、WebP图像格式处理(需要Pillow)
|
| 257 |
+
- **`[text]`** - 启用TXT和MD文件处理(需要ReportLab)
|
| 258 |
+
- **`[all]`** - 包含所有Python可选依赖
|
| 259 |
+
|
| 260 |
+
> **⚠️ Office文档处理配置要求:**
|
| 261 |
+
> - Office文档 (.doc, .docx, .ppt, .pptx, .xls, .xlsx) 需要安装 **LibreOffice**
|
| 262 |
+
> - 从[LibreOffice官网](https://www.libreoffice.org/download/download/)下载安装
|
| 263 |
+
> - **Windows**:从官网下载安装包
|
| 264 |
+
> - **macOS**:`brew install --cask libreoffice`
|
| 265 |
+
> - **Ubuntu/Debian**:`sudo apt-get install libreoffice`
|
| 266 |
+
> - **CentOS/RHEL**:`sudo yum install libreoffice`
|
| 267 |
+
|
| 268 |
+
**检查MinerU安装:**
|
| 269 |
+
|
| 270 |
+
```bash
|
| 271 |
+
# 验证安装
|
| 272 |
+
mineru --version
|
| 273 |
+
|
| 274 |
+
# 检查是否正确配置
|
| 275 |
+
python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU安装正常' if rag.check_parser_installation() else '❌ MinerU安装有问题')"
|
| 276 |
+
```
|
| 277 |
+
|
| 278 |
+
模型在首次使用时自动下载。手动下载参考[MinerU模型源配置](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#22-%E6%A8%A1%E5%9E%8B%E6%BA%90%E9%85%8D%E7%BD%AE):
|
| 279 |
+
|
| 280 |
+
### 使用示例
|
| 281 |
+
|
| 282 |
+
#### 1. 端到端文档处理
|
| 283 |
+
|
| 284 |
+
```python
|
| 285 |
+
import asyncio
|
| 286 |
+
from raganything import RAGAnything, RAGAnythingConfig
|
| 287 |
+
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 288 |
+
from lightrag.utils import EmbeddingFunc
|
| 289 |
+
|
| 290 |
+
async def main():
|
| 291 |
+
# 设置 API 配置
|
| 292 |
+
api_key = "your-api-key"
|
| 293 |
+
base_url = "your-base-url" # 可选
|
| 294 |
+
|
| 295 |
+
# 创建 RAGAnything 配置
|
| 296 |
+
config = RAGAnythingConfig(
|
| 297 |
+
working_dir="./rag_storage",
|
| 298 |
+
parser="mineru", # 选择解析器:mineru 或 docling
|
| 299 |
+
parse_method="auto", # 解析方法:auto, ocr 或 txt
|
| 300 |
+
enable_image_processing=True,
|
| 301 |
+
enable_table_processing=True,
|
| 302 |
+
enable_equation_processing=True,
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
# 定义 LLM 模型函数
|
| 306 |
+
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
|
| 307 |
+
return openai_complete_if_cache(
|
| 308 |
+
"gpt-4o-mini",
|
| 309 |
+
prompt,
|
| 310 |
+
system_prompt=system_prompt,
|
| 311 |
+
history_messages=history_messages,
|
| 312 |
+
api_key=api_key,
|
| 313 |
+
base_url=base_url,
|
| 314 |
+
**kwargs,
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
# 定义视觉模型函数用于图像处理
|
| 318 |
+
def vision_model_func(
|
| 319 |
+
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
|
| 320 |
+
):
|
| 321 |
+
# 如果提供了messages格式(用于多模态VLM增强查询),直接使用
|
| 322 |
+
if messages:
|
| 323 |
+
return openai_complete_if_cache(
|
| 324 |
+
"gpt-4o",
|
| 325 |
+
"",
|
| 326 |
+
system_prompt=None,
|
| 327 |
+
history_messages=[],
|
| 328 |
+
messages=messages,
|
| 329 |
+
api_key=api_key,
|
| 330 |
+
base_url=base_url,
|
| 331 |
+
**kwargs,
|
| 332 |
+
)
|
| 333 |
+
# 传统单图片格式
|
| 334 |
+
elif image_data:
|
| 335 |
+
return openai_complete_if_cache(
|
| 336 |
+
"gpt-4o",
|
| 337 |
+
"",
|
| 338 |
+
system_prompt=None,
|
| 339 |
+
history_messages=[],
|
| 340 |
+
messages=[
|
| 341 |
+
{"role": "system", "content": system_prompt}
|
| 342 |
+
if system_prompt
|
| 343 |
+
else None,
|
| 344 |
+
{
|
| 345 |
+
"role": "user",
|
| 346 |
+
"content": [
|
| 347 |
+
{"type": "text", "text": prompt},
|
| 348 |
+
{
|
| 349 |
+
"type": "image_url",
|
| 350 |
+
"image_url": {
|
| 351 |
+
"url": f"data:image/jpeg;base64,{image_data}"
|
| 352 |
+
},
|
| 353 |
+
},
|
| 354 |
+
],
|
| 355 |
+
}
|
| 356 |
+
if image_data
|
| 357 |
+
else {"role": "user", "content": prompt},
|
| 358 |
+
],
|
| 359 |
+
api_key=api_key,
|
| 360 |
+
base_url=base_url,
|
| 361 |
+
**kwargs,
|
| 362 |
+
)
|
| 363 |
+
# 纯文本格式
|
| 364 |
+
else:
|
| 365 |
+
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
|
| 366 |
+
|
| 367 |
+
# 定义嵌入函数
|
| 368 |
+
embedding_func = EmbeddingFunc(
|
| 369 |
+
embedding_dim=3072,
|
| 370 |
+
max_token_size=8192,
|
| 371 |
+
func=lambda texts: openai_embed(
|
| 372 |
+
texts,
|
| 373 |
+
model="text-embedding-3-large",
|
| 374 |
+
api_key=api_key,
|
| 375 |
+
base_url=base_url,
|
| 376 |
+
),
|
| 377 |
+
)
|
| 378 |
+
|
| 379 |
+
# 初始化 RAGAnything
|
| 380 |
+
rag = RAGAnything(
|
| 381 |
+
config=config,
|
| 382 |
+
llm_model_func=llm_model_func,
|
| 383 |
+
vision_model_func=vision_model_func,
|
| 384 |
+
embedding_func=embedding_func,
|
| 385 |
+
)
|
| 386 |
+
|
| 387 |
+
# 处理文档
|
| 388 |
+
await rag.process_document_complete(
|
| 389 |
+
file_path="path/to/your/document.pdf",
|
| 390 |
+
output_dir="./output",
|
| 391 |
+
parse_method="auto"
|
| 392 |
+
)
|
| 393 |
+
|
| 394 |
+
# 查询处理后的内容
|
| 395 |
+
# 纯文本查询 - 基本知识库搜索
|
| 396 |
+
text_result = await rag.aquery(
|
| 397 |
+
"文档的主要内容是什么?",
|
| 398 |
+
mode="hybrid"
|
| 399 |
+
)
|
| 400 |
+
print("文本查询结果:", text_result)
|
| 401 |
+
|
| 402 |
+
# 多模态查询 - 包含具体多模态内容的查询
|
| 403 |
+
multimodal_result = await rag.aquery_with_multimodal(
|
| 404 |
+
"分析这个性能数据并解释与现有文档内容的关系",
|
| 405 |
+
multimodal_content=[{
|
| 406 |
+
"type": "table",
|
| 407 |
+
"table_data": """系统,准确率,F1分数
|
| 408 |
+
RAGAnything,95.2%,0.94
|
| 409 |
+
基准方法,87.3%,0.85""",
|
| 410 |
+
"table_caption": "性能对比结果"
|
| 411 |
+
}],
|
| 412 |
+
mode="hybrid"
|
| 413 |
+
)
|
| 414 |
+
print("多模态查询结果:", multimodal_result)
|
| 415 |
+
|
| 416 |
+
if __name__ == "__main__":
|
| 417 |
+
asyncio.run(main())
|
| 418 |
+
```
|
| 419 |
+
|
| 420 |
+
#### 2. 直接多模态内容处理
|
| 421 |
+
|
| 422 |
+
```python
|
| 423 |
+
import asyncio
|
| 424 |
+
from lightrag import LightRAG
|
| 425 |
+
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 426 |
+
from lightrag.utils import EmbeddingFunc
|
| 427 |
+
from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor
|
| 428 |
+
|
| 429 |
+
async def process_multimodal_content():
|
| 430 |
+
# 设置 API 配置
|
| 431 |
+
api_key = "your-api-key"
|
| 432 |
+
base_url = "your-base-url" # 可选
|
| 433 |
+
|
| 434 |
+
# 初始化 LightRAG
|
| 435 |
+
rag = LightRAG(
|
| 436 |
+
working_dir="./rag_storage",
|
| 437 |
+
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
|
| 438 |
+
"gpt-4o-mini",
|
| 439 |
+
prompt,
|
| 440 |
+
system_prompt=system_prompt,
|
| 441 |
+
history_messages=history_messages,
|
| 442 |
+
api_key=api_key,
|
| 443 |
+
base_url=base_url,
|
| 444 |
+
**kwargs,
|
| 445 |
+
),
|
| 446 |
+
embedding_func=EmbeddingFunc(
|
| 447 |
+
embedding_dim=3072,
|
| 448 |
+
max_token_size=8192,
|
| 449 |
+
func=lambda texts: openai_embed(
|
| 450 |
+
texts,
|
| 451 |
+
model="text-embedding-3-large",
|
| 452 |
+
api_key=api_key,
|
| 453 |
+
base_url=base_url,
|
| 454 |
+
),
|
| 455 |
+
)
|
| 456 |
+
)
|
| 457 |
+
await rag.initialize_storages()
|
| 458 |
+
|
| 459 |
+
# 处理图像
|
| 460 |
+
image_processor = ImageModalProcessor(
|
| 461 |
+
lightrag=rag,
|
| 462 |
+
modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
|
| 463 |
+
"gpt-4o",
|
| 464 |
+
"",
|
| 465 |
+
system_prompt=None,
|
| 466 |
+
history_messages=[],
|
| 467 |
+
messages=[
|
| 468 |
+
{"role": "system", "content": system_prompt} if system_prompt else None,
|
| 469 |
+
{"role": "user", "content": [
|
| 470 |
+
{"type": "text", "text": prompt},
|
| 471 |
+
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
|
| 472 |
+
]} if image_data else {"role": "user", "content": prompt}
|
| 473 |
+
],
|
| 474 |
+
api_key=api_key,
|
| 475 |
+
base_url=base_url,
|
| 476 |
+
**kwargs,
|
| 477 |
+
) if image_data else openai_complete_if_cache(
|
| 478 |
+
"gpt-4o-mini",
|
| 479 |
+
prompt,
|
| 480 |
+
system_prompt=system_prompt,
|
| 481 |
+
history_messages=history_messages,
|
| 482 |
+
api_key=api_key,
|
| 483 |
+
base_url=base_url,
|
| 484 |
+
**kwargs,
|
| 485 |
+
)
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
+
image_content = {
|
| 489 |
+
"img_path": "path/to/image.jpg",
|
| 490 |
+
"image_caption": ["图1:实验结果"],
|
| 491 |
+
"image_footnote": ["数据收集于2024年"]
|
| 492 |
+
}
|
| 493 |
+
|
| 494 |
+
description, entity_info = await image_processor.process_multimodal_content(
|
| 495 |
+
modal_content=image_content,
|
| 496 |
+
content_type="image",
|
| 497 |
+
file_path="research_paper.pdf",
|
| 498 |
+
entity_name="实验结果图表"
|
| 499 |
+
)
|
| 500 |
+
|
| 501 |
+
# 处理表格
|
| 502 |
+
table_processor = TableModalProcessor(
|
| 503 |
+
lightrag=rag,
|
| 504 |
+
modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
|
| 505 |
+
"gpt-4o-mini",
|
| 506 |
+
prompt,
|
| 507 |
+
system_prompt=system_prompt,
|
| 508 |
+
history_messages=history_messages,
|
| 509 |
+
api_key=api_key,
|
| 510 |
+
base_url=base_url,
|
| 511 |
+
**kwargs,
|
| 512 |
+
)
|
| 513 |
+
)
|
| 514 |
+
|
| 515 |
+
table_content = {
|
| 516 |
+
"table_body": """
|
| 517 |
+
| 方法 | 准确率 | F1分数 |
|
| 518 |
+
|------|--------|--------|
|
| 519 |
+
| RAGAnything | 95.2% | 0.94 |
|
| 520 |
+
| 基准方法 | 87.3% | 0.85 |
|
| 521 |
+
""",
|
| 522 |
+
"table_caption": ["性能对比"],
|
| 523 |
+
"table_footnote": ["测试数据集结果"]
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
description, entity_info = await table_processor.process_multimodal_content(
|
| 527 |
+
modal_content=table_content,
|
| 528 |
+
content_type="table",
|
| 529 |
+
file_path="research_paper.pdf",
|
| 530 |
+
entity_name="性能结果表格"
|
| 531 |
+
)
|
| 532 |
+
|
| 533 |
+
if __name__ == "__main__":
|
| 534 |
+
asyncio.run(process_multimodal_content())
|
| 535 |
+
```
|
| 536 |
+
|
| 537 |
+
#### 3. 批量处理
|
| 538 |
+
|
| 539 |
+
```python
|
| 540 |
+
# 处理多个文档
|
| 541 |
+
await rag.process_folder_complete(
|
| 542 |
+
folder_path="./documents",
|
| 543 |
+
output_dir="./output",
|
| 544 |
+
file_extensions=[".pdf", ".docx", ".pptx"],
|
| 545 |
+
recursive=True,
|
| 546 |
+
max_workers=4
|
| 547 |
+
)
|
| 548 |
+
```
|
| 549 |
+
|
| 550 |
+
#### 4. 自定义模态处理器
|
| 551 |
+
|
| 552 |
+
```python
|
| 553 |
+
from raganything.modalprocessors import GenericModalProcessor
|
| 554 |
+
|
| 555 |
+
class CustomModalProcessor(GenericModalProcessor):
|
| 556 |
+
async def process_multimodal_content(self, modal_content, content_type, file_path, entity_name):
|
| 557 |
+
# 你的自定义处理逻辑
|
| 558 |
+
enhanced_description = await self.analyze_custom_content(modal_content)
|
| 559 |
+
entity_info = self.create_custom_entity(enhanced_description, entity_name)
|
| 560 |
+
return await self._create_entity_and_chunk(enhanced_description, entity_info, file_path)
|
| 561 |
+
```
|
| 562 |
+
|
| 563 |
+
#### 5. 查询选项
|
| 564 |
+
|
| 565 |
+
RAG-Anything 提供三种类型的查询方法:
|
| 566 |
+
|
| 567 |
+
**纯文本查询** - 使用LightRAG直接进行知识库搜索:
|
| 568 |
+
```python
|
| 569 |
+
# 文本查询的不同模式
|
| 570 |
+
text_result_hybrid = await rag.aquery("你的问题", mode="hybrid")
|
| 571 |
+
text_result_local = await rag.aquery("你的问题", mode="local")
|
| 572 |
+
text_result_global = await rag.aquery("你的问题", mode="global")
|
| 573 |
+
text_result_naive = await rag.aquery("你的问题", mode="naive")
|
| 574 |
+
|
| 575 |
+
# 同步版本
|
| 576 |
+
sync_text_result = rag.query("你的问题", mode="hybrid")
|
| 577 |
+
```
|
| 578 |
+
|
| 579 |
+
**VLM增强查询** - 使用VLM自动分析检索上下文中的图像:
|
| 580 |
+
```python
|
| 581 |
+
# VLM增强查询(当提供vision_model_func时自动启用)
|
| 582 |
+
vlm_result = await rag.aquery(
|
| 583 |
+
"分析文档中的图表和数据",
|
| 584 |
+
mode="hybrid"
|
| 585 |
+
# vlm_enhanced=True 当vision_model_func可用时自动设置
|
| 586 |
+
)
|
| 587 |
+
|
| 588 |
+
# 手动控制VLM增强
|
| 589 |
+
vlm_enabled = await rag.aquery(
|
| 590 |
+
"这个文档中的图片显示了什么内容?",
|
| 591 |
+
mode="hybrid",
|
| 592 |
+
vlm_enhanced=True # 强制启用VLM增强
|
| 593 |
+
)
|
| 594 |
+
|
| 595 |
+
vlm_disabled = await rag.aquery(
|
| 596 |
+
"这个文档中的图片显示了什么内容?",
|
| 597 |
+
mode="hybrid",
|
| 598 |
+
vlm_enhanced=False # 强制禁用VLM增强
|
| 599 |
+
)
|
| 600 |
+
|
| 601 |
+
# 当文档包含图片时,VLM可以直接查看和分析图片
|
| 602 |
+
# 系统将自动:
|
| 603 |
+
# 1. 检索包含图片路径的相关上下文
|
| 604 |
+
# 2. 加载图片并编码为base64格式
|
| 605 |
+
# 3. 将文本上下文和图片一起发送给VLM进行综合分析
|
| 606 |
+
```
|
| 607 |
+
|
| 608 |
+
**多模态查询** - 包含特定多模态内容分析的增强查询:
|
| 609 |
+
```python
|
| 610 |
+
# 包含表格数据的查询
|
| 611 |
+
table_result = await rag.aquery_with_multimodal(
|
| 612 |
+
"比较这些性能指标与文档内容",
|
| 613 |
+
multimodal_content=[{
|
| 614 |
+
"type": "table",
|
| 615 |
+
"table_data": """方法,准确率,速度
|
| 616 |
+
LightRAG,95.2%,120ms
|
| 617 |
+
传统方法,87.3%,180ms""",
|
| 618 |
+
"table_caption": "性能对比"
|
| 619 |
+
}],
|
| 620 |
+
mode="hybrid"
|
| 621 |
+
)
|
| 622 |
+
|
| 623 |
+
# 包含公式内容的查询
|
| 624 |
+
equation_result = await rag.aquery_with_multimodal(
|
| 625 |
+
"解释这个公式及其与文档内容的相关性",
|
| 626 |
+
multimodal_content=[{
|
| 627 |
+
"type": "equation",
|
| 628 |
+
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
|
| 629 |
+
"equation_caption": "文档相关性概率"
|
| 630 |
+
}],
|
| 631 |
+
mode="hybrid"
|
| 632 |
+
)
|
| 633 |
+
```
|
| 634 |
+
|
| 635 |
+
#### 6. 加载已存在的LightRAG实例
|
| 636 |
+
|
| 637 |
+
```python
|
| 638 |
+
import asyncio
|
| 639 |
+
from raganything import RAGAnything
|
| 640 |
+
from lightrag import LightRAG
|
| 641 |
+
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 642 |
+
from lightrag.utils import EmbeddingFunc
|
| 643 |
+
import os
from lightrag.kg.shared_storage import initialize_pipeline_status
|
| 644 |
+
|
| 645 |
+
async def load_existing_lightrag():
|
| 646 |
+
# 设置 API 配置
|
| 647 |
+
api_key = "your-api-key"
|
| 648 |
+
base_url = "your-base-url" # 可选
|
| 649 |
+
|
| 650 |
+
# 首先,创建或加载已存在的 LightRAG 实例
|
| 651 |
+
lightrag_working_dir = "./existing_lightrag_storage"
|
| 652 |
+
|
| 653 |
+
# 检查是否存在之前的 LightRAG 实例
|
| 654 |
+
if os.path.exists(lightrag_working_dir) and os.listdir(lightrag_working_dir):
|
| 655 |
+
print("✅ 发现已存在的 LightRAG 实例,正在加载...")
|
| 656 |
+
else:
|
| 657 |
+
print("❌ 未找到已存在的 LightRAG 实例,将创建新实例")
|
| 658 |
+
|
| 659 |
+
# 使用您的配置创建/加载 LightRAG 实例
|
| 660 |
+
lightrag_instance = LightRAG(
|
| 661 |
+
working_dir=lightrag_working_dir,
|
| 662 |
+
llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
|
| 663 |
+
"gpt-4o-mini",
|
| 664 |
+
prompt,
|
| 665 |
+
system_prompt=system_prompt,
|
| 666 |
+
history_messages=history_messages,
|
| 667 |
+
api_key=api_key,
|
| 668 |
+
base_url=base_url,
|
| 669 |
+
**kwargs,
|
| 670 |
+
),
|
| 671 |
+
embedding_func=EmbeddingFunc(
|
| 672 |
+
embedding_dim=3072,
|
| 673 |
+
max_token_size=8192,
|
| 674 |
+
func=lambda texts: openai_embed(
|
| 675 |
+
texts,
|
| 676 |
+
model="text-embedding-3-large",
|
| 677 |
+
api_key=api_key,
|
| 678 |
+
base_url=base_url,
|
| 679 |
+
),
|
| 680 |
+
)
|
| 681 |
+
)
|
| 682 |
+
|
| 683 |
+
# 初始化存储(如果有现有数据,这将加载它们)
|
| 684 |
+
await lightrag_instance.initialize_storages()
|
| 685 |
+
await initialize_pipeline_status()
|
| 686 |
+
|
| 687 |
+
# 定义视觉模型函数用于图像处理
|
| 688 |
+
def vision_model_func(
|
| 689 |
+
prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
|
| 690 |
+
):
|
| 691 |
+
# 如果提供了messages格式(用于多模态VLM增强查询),直接使用
|
| 692 |
+
if messages:
|
| 693 |
+
return openai_complete_if_cache(
|
| 694 |
+
"gpt-4o",
|
| 695 |
+
"",
|
| 696 |
+
system_prompt=None,
|
| 697 |
+
history_messages=[],
|
| 698 |
+
messages=messages,
|
| 699 |
+
api_key=api_key,
|
| 700 |
+
base_url=base_url,
|
| 701 |
+
**kwargs,
|
| 702 |
+
)
|
| 703 |
+
# 传统单图片格式
|
| 704 |
+
elif image_data:
|
| 705 |
+
return openai_complete_if_cache(
|
| 706 |
+
"gpt-4o",
|
| 707 |
+
"",
|
| 708 |
+
system_prompt=None,
|
| 709 |
+
history_messages=[],
|
| 710 |
+
messages=[
|
| 711 |
+
{"role": "system", "content": system_prompt}
|
| 712 |
+
if system_prompt
|
| 713 |
+
else None,
|
| 714 |
+
{
|
| 715 |
+
"role": "user",
|
| 716 |
+
"content": [
|
| 717 |
+
{"type": "text", "text": prompt},
|
| 718 |
+
{
|
| 719 |
+
"type": "image_url",
|
| 720 |
+
"image_url": {
|
| 721 |
+
"url": f"data:image/jpeg;base64,{image_data}"
|
| 722 |
+
},
|
| 723 |
+
},
|
| 724 |
+
],
|
| 725 |
+
}
|
| 726 |
+
if image_data
|
| 727 |
+
else {"role": "user", "content": prompt},
|
| 728 |
+
],
|
| 729 |
+
api_key=api_key,
|
| 730 |
+
base_url=base_url,
|
| 731 |
+
**kwargs,
|
| 732 |
+
)
|
| 733 |
+
# 纯文本格式
|
| 734 |
+
else:
|
| 735 |
+
return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
|
| 736 |
+
|
| 737 |
+
# 现在使用已存在的 LightRAG 实例初始化 RAGAnything
|
| 738 |
+
rag = RAGAnything(
|
| 739 |
+
lightrag=lightrag_instance, # 传入已存在的 LightRAG 实例
|
| 740 |
+
vision_model_func=vision_model_func,
|
| 741 |
+
# 注意:working_dir、llm_model_func、embedding_func 等都从 lightrag_instance 继承
|
| 742 |
+
)
|
| 743 |
+
|
| 744 |
+
# 查询已存在的知识库
|
| 745 |
+
result = await rag.aquery(
|
| 746 |
+
"这个 LightRAG 实例中处理了哪些数据?",
|
| 747 |
+
mode="hybrid"
|
| 748 |
+
)
|
| 749 |
+
print("查询结果:", result)
|
| 750 |
+
|
| 751 |
+
# 向已存在的 LightRAG 实例添加新的多模态文档
|
| 752 |
+
await rag.process_document_complete(
|
| 753 |
+
file_path="path/to/new/multimodal_document.pdf",
|
| 754 |
+
output_dir="./output"
|
| 755 |
+
)
|
| 756 |
+
|
| 757 |
+
if __name__ == "__main__":
|
| 758 |
+
asyncio.run(load_existing_lightrag())
|
| 759 |
+
```
|
| 760 |
+
|
| 761 |
+
#### 7. 直接插入内容列表
|
| 762 |
+
|
| 763 |
+
当您已经有预解析的内容列表(例如,来自外部解析器或之前的处理结果)时,可以直接插入到 RAGAnything 中而无需文档解析:
|
| 764 |
+
|
| 765 |
+
```python
|
| 766 |
+
import asyncio
|
| 767 |
+
from raganything import RAGAnything, RAGAnythingConfig
|
| 768 |
+
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 769 |
+
from lightrag.utils import EmbeddingFunc
|
| 770 |
+
|
| 771 |
+
async def insert_content_list_example():
|
| 772 |
+
# 设置 API 配置
|
| 773 |
+
api_key = "your-api-key"
|
| 774 |
+
base_url = "your-base-url" # 可选
|
| 775 |
+
|
| 776 |
+
# 创建 RAGAnything 配置
|
| 777 |
+
config = RAGAnythingConfig(
|
| 778 |
+
working_dir="./rag_storage",
|
| 779 |
+
enable_image_processing=True,
|
| 780 |
+
enable_table_processing=True,
|
| 781 |
+
enable_equation_processing=True,
|
| 782 |
+
)
|
| 783 |
+
|
| 784 |
+
# 定义模型函数
|
| 785 |
+
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
|
| 786 |
+
return openai_complete_if_cache(
|
| 787 |
+
"gpt-4o-mini",
|
| 788 |
+
prompt,
|
| 789 |
+
system_prompt=system_prompt,
|
| 790 |
+
history_messages=history_messages,
|
| 791 |
+
api_key=api_key,
|
| 792 |
+
base_url=base_url,
|
| 793 |
+
**kwargs,
|
| 794 |
+
)
|
| 795 |
+
|
| 796 |
+
def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
|
| 797 |
+
# 如果提供了messages格式(用于多模态VLM增强查询),直接使用
|
| 798 |
+
if messages:
|
| 799 |
+
return openai_complete_if_cache(
|
| 800 |
+
"gpt-4o",
|
| 801 |
+
"",
|
| 802 |
+
system_prompt=None,
|
| 803 |
+
history_messages=[],
|
| 804 |
+
messages=messages,
|
| 805 |
+
api_key=api_key,
|
| 806 |
+
base_url=base_url,
|
| 807 |
+
**kwargs,
|
| 808 |
+
)
|
| 809 |
+
# 传统单图片格式
|
| 810 |
+
elif image_data:
|
| 811 |
+
return openai_complete_if_cache(
|
| 812 |
+
"gpt-4o",
|
| 813 |
+
"",
|
| 814 |
+
system_prompt=None,
|
| 815 |
+
history_messages=[],
|
| 816 |
+
messages=[
|
| 817 |
+
{"role": "system", "content": system_prompt} if system_prompt else None,
|
| 818 |
+
{
|
| 819 |
+
"role": "user",
|
| 820 |
+
"content": [
|
| 821 |
+
{"type": "text", "text": prompt},
|
| 822 |
+
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
|
| 823 |
+
],
|
| 824 |
+
} if image_data else {"role": "user", "content": prompt},
|
| 825 |
+
],
|
| 826 |
+
api_key=api_key,
|
| 827 |
+
base_url=base_url,
|
| 828 |
+
**kwargs,
|
| 829 |
+
)
|
| 830 |
+
# 纯文本格式
|
| 831 |
+
else:
|
| 832 |
+
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
|
| 833 |
+
|
| 834 |
+
embedding_func = EmbeddingFunc(
|
| 835 |
+
embedding_dim=3072,
|
| 836 |
+
max_token_size=8192,
|
| 837 |
+
func=lambda texts: openai_embed(
|
| 838 |
+
texts,
|
| 839 |
+
model="text-embedding-3-large",
|
| 840 |
+
api_key=api_key,
|
| 841 |
+
base_url=base_url,
|
| 842 |
+
),
|
| 843 |
+
)
|
| 844 |
+
|
| 845 |
+
# 初始化 RAGAnything
|
| 846 |
+
rag = RAGAnything(
|
| 847 |
+
config=config,
|
| 848 |
+
llm_model_func=llm_model_func,
|
| 849 |
+
vision_model_func=vision_model_func,
|
| 850 |
+
embedding_func=embedding_func,
|
| 851 |
+
)
|
| 852 |
+
|
| 853 |
+
# 示例:来自外部源的预解析内容列表
|
| 854 |
+
content_list = [
|
| 855 |
+
{
|
| 856 |
+
"type": "text",
|
| 857 |
+
"text": "这是我们研究论文的引言部分。",
|
| 858 |
+
"page_idx": 0 # 此内容出现的页码
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"type": "image",
|
| 862 |
+
"img_path": "/absolute/path/to/figure1.jpg", # 重要:使用绝对路径
|
| 863 |
+
"image_caption": ["图1:系统架构"],
|
| 864 |
+
"image_footnote": ["来源:作者原创设计"],
|
| 865 |
+
"page_idx": 1 # 此图像出现的页码
|
| 866 |
+
},
|
| 867 |
+
{
|
| 868 |
+
"type": "table",
|
| 869 |
+
"table_body": "| 方法 | 准确率 | F1分数 |\n|------|--------|--------|\n| 我们的方法 | 95.2% | 0.94 |\n| 基准方法 | 87.3% | 0.85 |",
|
| 870 |
+
"table_caption": ["表1:性能对比"],
|
| 871 |
+
"table_footnote": ["测试数据集结果"],
|
| 872 |
+
"page_idx": 2 # 此表格出现的页码
|
| 873 |
+
},
|
| 874 |
+
{
|
| 875 |
+
"type": "equation",
|
| 876 |
+
"latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
|
| 877 |
+
"text": "文档相关性概率公式",
|
| 878 |
+
"page_idx": 3 # 此公式出现的页码
|
| 879 |
+
},
|
| 880 |
+
{
|
| 881 |
+
"type": "text",
|
| 882 |
+
"text": "总之,我们的方法在所有指标上都表现出优越的性能。",
|
| 883 |
+
"page_idx": 4 # 此内容出现的页码
|
| 884 |
+
}
|
| 885 |
+
]
|
| 886 |
+
|
| 887 |
+
# 直接插入内容列表
|
| 888 |
+
await rag.insert_content_list(
|
| 889 |
+
content_list=content_list,
|
| 890 |
+
file_path="research_paper.pdf", # 用于引用的参考文件名
|
| 891 |
+
split_by_character=None, # 可选的文本分割
|
| 892 |
+
split_by_character_only=False, # 可选的文本分割模式
|
| 893 |
+
doc_id=None, # 可选的自定义文档ID(如果未提供将自动生成)
|
| 894 |
+
display_stats=True # 显示内容统计信息
|
| 895 |
+
)
|
| 896 |
+
|
| 897 |
+
# 查询插入的内容
|
| 898 |
+
result = await rag.aquery(
|
| 899 |
+
"研究中提到的主要发现和性能指标是什么?",
|
| 900 |
+
mode="hybrid"
|
| 901 |
+
)
|
| 902 |
+
print("查询结果:", result)
|
| 903 |
+
|
| 904 |
+
# 您也可以使用不同的文档ID插入多个内容列表
|
| 905 |
+
another_content_list = [
|
| 906 |
+
{
|
| 907 |
+
"type": "text",
|
| 908 |
+
"text": "这是来自另一个文档的内容。",
|
| 909 |
+
"page_idx": 0 # 此内容出现的页码
|
| 910 |
+
},
|
| 911 |
+
{
|
| 912 |
+
"type": "table",
|
| 913 |
+
"table_body": "| 特性 | 值 |\n|------|----|\n| 速度 | 快速 |\n| 准确性 | 高 |",
|
| 914 |
+
"table_caption": ["特性对比"],
|
| 915 |
+
"page_idx": 1 # 此表格出现的页码
|
| 916 |
+
}
|
| 917 |
+
]
|
| 918 |
+
|
| 919 |
+
await rag.insert_content_list(
|
| 920 |
+
content_list=another_content_list,
|
| 921 |
+
file_path="another_document.pdf",
|
| 922 |
+
doc_id="custom-doc-id-123" # 自定义文档ID
|
| 923 |
+
)
|
| 924 |
+
|
| 925 |
+
if __name__ == "__main__":
|
| 926 |
+
asyncio.run(insert_content_list_example())
|
| 927 |
+
```
|
| 928 |
+
|
| 929 |
+
**内容列表格式:**
|
| 930 |
+
|
| 931 |
+
`content_list` 应遵循标准格式,每个项目都是包含以下内容的字典:
|
| 932 |
+
|
| 933 |
+
- **文本内容**: `{"type": "text", "text": "内容文本", "page_idx": 0}`
|
| 934 |
+
- **图像内容**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "image_caption": ["标题"], "image_footnote": ["注释"], "page_idx": 1}`
|
| 935 |
+
- **表格内容**: `{"type": "table", "table_body": "markdown表格", "table_caption": ["标题"], "table_footnote": ["注释"], "page_idx": 2}`
|
| 936 |
+
- **公式内容**: `{"type": "equation", "latex": "LaTeX公式", "text": "描述", "page_idx": 3}`
|
| 937 |
+
- **通用内容**: `{"type": "custom_type", "content": "任何内容", "page_idx": 4}`
|
| 938 |
+
|
| 939 |
+
**重要说明:**
|
| 940 |
+
- **`img_path`**: 必须是图像文件的绝对路径(例如:`/home/user/images/chart.jpg` 或 `C:\Users\user\images\chart.jpg`)
|
| 941 |
+
- **`page_idx`**: 表示内容在原始文档中出现的页码(从0开始的索引)
|
| 942 |
+
- **内容顺序**: 项目按照在列表中出现的顺序进行处理
|
| 943 |
+
|
| 944 |
+
此方法在以下情况下特别有用:
|
| 945 |
+
- 您有来自外部解析器的内容(非MinerU/Docling)
|
| 946 |
+
- 您想要处理程序化生成的内容
|
| 947 |
+
- 您需要将来自多个源的内容插入到单个知识库中
|
| 948 |
+
- 您有想要重用的缓存解析结果
|
| 949 |
+
|
| 950 |
+
---
|
| 951 |
+
|
| 952 |
+
## 🛠️ 示例
|
| 953 |
+
|
| 954 |
+
*实际应用演示*
|
| 955 |
+
|
| 956 |
+
<div align="center">
|
| 957 |
+
<img src="https://user-images.githubusercontent.com/74038190/212257455-13e3e01e-d6a6-45dc-bb92-3ab87b12dfc1.gif" width="300">
|
| 958 |
+
</div>
|
| 959 |
+
|
| 960 |
+
`examples/` 目录包含完整的使用示例:
|
| 961 |
+
|
| 962 |
+
- **`raganything_example.py`**:基于MinerU的端到端文档处理
|
| 963 |
+
- **`modalprocessors_example.py`**:直接多模态内容处理
|
| 964 |
+
- **`office_document_test.py`**:Office文档解析测试(无需API密钥)
|
| 965 |
+
- **`image_format_test.py`**:图像格式解析测试(无需API密钥)
|
| 966 |
+
- **`text_format_test.py`**:文本格式解析测试(无需API密钥)
|
| 967 |
+
|
| 968 |
+
**运行示例:**
|
| 969 |
+
|
| 970 |
+
```bash
|
| 971 |
+
# 端到端处理(包含解析器选择)
|
| 972 |
+
python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
|
| 973 |
+
|
| 974 |
+
# 直接模态处理
|
| 975 |
+
python examples/modalprocessors_example.py --api-key YOUR_API_KEY
|
| 976 |
+
|
| 977 |
+
# Office文档解析测试(仅MinerU功能)
|
| 978 |
+
python examples/office_document_test.py --file path/to/document.docx
|
| 979 |
+
|
| 980 |
+
# 图像格式解析测试(仅MinerU功能)
|
| 981 |
+
python examples/image_format_test.py --file path/to/image.bmp
|
| 982 |
+
|
| 983 |
+
# 文本格式解析测试(仅MinerU功能)
|
| 984 |
+
python examples/text_format_test.py --file path/to/document.md
|
| 985 |
+
|
| 986 |
+
# 检查LibreOffice安装
|
| 987 |
+
python examples/office_document_test.py --check-libreoffice --file dummy
|
| 988 |
+
|
| 989 |
+
# 检查PIL/Pillow安装
|
| 990 |
+
python examples/image_format_test.py --check-pillow --file dummy
|
| 991 |
+
|
| 992 |
+
# 检查ReportLab安装
|
| 993 |
+
python examples/text_format_test.py --check-reportlab --file dummy
|
| 994 |
+
```
|
| 995 |
+
|
| 996 |
+
> **注意**:API密钥仅在完整RAG处理和LLM集成时需要。解析测试文件(`office_document_test.py`、`image_format_test.py` 和 `text_format_test.py`)仅测试MinerU功能,无需API密钥。
|
| 997 |
+
|
| 998 |
+
---
|
| 999 |
+
|
| 1000 |
+
## 🔧 配置
|
| 1001 |
+
|
| 1002 |
+
*系统优化参数*
|
| 1003 |
+
|
| 1004 |
+
### 环境变量
|
| 1005 |
+
|
| 1006 |
+
创建 `.env` 文件(参考 `.env.example`):
|
| 1007 |
+
|
| 1008 |
+
```bash
|
| 1009 |
+
OPENAI_API_KEY=your_openai_api_key
|
| 1010 |
+
OPENAI_BASE_URL=your_base_url # 可选
|
| 1011 |
+
OUTPUT_DIR=./output # 解析文档的默认输出目录
|
| 1012 |
+
PARSER=mineru # 解析器选择:mineru 或 docling
|
| 1013 |
+
PARSE_METHOD=auto # 解析方法:auto, ocr 或 txt
|
| 1014 |
+
```
|
| 1015 |
+
|
| 1016 |
+
**注意:** 为了向后兼容,旧的环境变量名称仍然有效:
|
| 1017 |
+
- `MINERU_PARSE_METHOD` 已弃用,请使用 `PARSE_METHOD`
|
| 1018 |
+
|
| 1019 |
+
### 解析器配置
|
| 1020 |
+
|
| 1021 |
+
RAGAnything 现在支持多种解析器,每种解析器都有其特定的优势:
|
| 1022 |
+
|
| 1023 |
+
#### MinerU 解析器
|
| 1024 |
+
- 支持PDF、图像、Office文档等多种格式
|
| 1025 |
+
- 强大的OCR和表格提取能力
|
| 1026 |
+
- 支持GPU加速
|
| 1027 |
+
|
| 1028 |
+
#### Docling 解析器
|
| 1029 |
+
- 专门优化Office文档和HTML文件的解析
|
| 1030 |
+
- 更好的文档结构保持
|
| 1031 |
+
- 原生支持多种Office格式
|
| 1032 |
+
|
| 1033 |
+
### MinerU配置
|
| 1034 |
+
|
| 1035 |
+
```bash
|
| 1036 |
+
# MinerU 2.0使用命令行参数而不是配置文件
|
| 1037 |
+
# 查看可用选项:
|
| 1038 |
+
mineru --help
|
| 1039 |
+
|
| 1040 |
+
# 常用配置:
|
| 1041 |
+
mineru -p input.pdf -o output_dir -m auto # 自动解析模式
|
| 1042 |
+
mineru -p input.pdf -o output_dir -m ocr # OCR重点解析
|
| 1043 |
+
mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU加速
|
| 1044 |
+
```
|
| 1045 |
+
|
| 1046 |
+
你也可以通过RAGAnything参数配置解析:
|
| 1047 |
+
|
| 1048 |
+
```python
|
| 1049 |
+
# 基础解析配置和解析器选择
|
| 1050 |
+
await rag.process_document_complete(
|
| 1051 |
+
file_path="document.pdf",
|
| 1052 |
+
output_dir="./output/",
|
| 1053 |
+
parse_method="auto", # 或 "ocr", "txt"
|
| 1054 |
+
parser="mineru" # 可选:"mineru" 或 "docling"
|
| 1055 |
+
)
|
| 1056 |
+
|
| 1057 |
+
# 高级解析配置(包含特殊参数)
|
| 1058 |
+
await rag.process_document_complete(
|
| 1059 |
+
file_path="document.pdf",
|
| 1060 |
+
output_dir="./output/",
|
| 1061 |
+
parse_method="auto", # 解析方法:"auto", "ocr", "txt"
|
| 1062 |
+
parser="mineru", # 解析器选择:"mineru" 或 "docling"
|
| 1063 |
+
|
| 1064 |
+
# MinerU特殊参数 - 支持的所有kwargs:
|
| 1065 |
+
lang="ch", # 文档语言优化(如:"ch", "en", "ja")
|
| 1066 |
+
device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps"
|
| 1067 |
+
start_page=0, # 起始页码(0为基准,适用于PDF)
|
| 1068 |
+
end_page=10, # 结束页码(0为基准,适用于PDF)
|
| 1069 |
+
formula=True, # 启用公式解析
|
| 1070 |
+
table=True, # 启用表格解析
|
| 1071 |
+
backend="pipeline", # 解析后端:pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client
|
| 1072 |
+
source="huggingface", # 模型源:"huggingface", "modelscope", "local"
|
| 1073 |
+
# vlm_url="http://127.0.0.1:3000" # 当backend=vlm-sglang-client时,需指定服务地址
|
| 1074 |
+
|
| 1075 |
+
# RAGAnything标准参数
|
| 1076 |
+
display_stats=True, # 显示内容统计信息
|
| 1077 |
+
split_by_character=None, # 可选的文本分割字符
|
| 1078 |
+
doc_id=None # 可选的文档ID
|
| 1079 |
+
)
|
| 1080 |
+
```
|
| 1081 |
+
|
| 1082 |
+
> **注意**:MinerU 2.0不再使用 `magic-pdf.json` 配置文件。所有设置现在通过命令行参数或函数参数传递。RAG-Anything现在支持多种文档解析器 - 你可以根据需要在MinerU和Docling之间选择。
|
| 1083 |
+
|
| 1084 |
+
### 处理要求
|
| 1085 |
+
|
| 1086 |
+
不同内容类型需要特定的可选依赖:
|
| 1087 |
+
|
| 1088 |
+
- **Office文档** (.doc, .docx, .ppt, .pptx, .xls, .xlsx): 安装并配置 [LibreOffice](https://www.libreoffice.org/download/download/)
|
| 1089 |
+
- **扩展图像格式** (.bmp, .tiff, .gif, .webp): 使用 `pip install raganything[image]` 安装
|
| 1090 |
+
- **文本文件** (.txt, .md): 使用 `pip install raganything[text]` 安装
|
| 1091 |
+
|
| 1092 |
+
> **📋 快速安装**: 使用 `pip install raganything[all]` 启用所有格式支持(仅Python依赖 - LibreOffice仍需单独安装)
|
| 1093 |
+
|
| 1094 |
+
---
|
| 1095 |
+
|
| 1096 |
+
## 🧪 支持的内容类型
|
| 1097 |
+
|
| 1098 |
+
### 文档格式
|
| 1099 |
+
|
| 1100 |
+
- **PDF** - 研究论文、报告、演示文稿
|
| 1101 |
+
- **Office文档** - DOC、DOCX、PPT、PPTX、XLS、XLSX
|
| 1102 |
+
- **图像** - JPG、PNG、BMP、TIFF、GIF、WebP
|
| 1103 |
+
- **文本文件** - TXT、MD
|
| 1104 |
+
|
| 1105 |
+
### 多模态元素
|
| 1106 |
+
|
| 1107 |
+
- **图像** - 照片、图表、示意图、截图
|
| 1108 |
+
- **表格** - 数据表、对比图、统计摘要
|
| 1109 |
+
- **公式** - LaTeX格式的数学公式
|
| 1110 |
+
- **通用内容** - 通过可扩展处理器支持的自定义内容类型
|
| 1111 |
+
|
| 1112 |
+
*格式特定依赖的安装说明请参见[配置](#-配置)部分。*
|
| 1113 |
+
|
| 1114 |
+
---
|
| 1115 |
+
|
| 1116 |
+
## 📖 引用
|
| 1117 |
+
|
| 1118 |
+
*学术参考*
|
| 1119 |
+
|
| 1120 |
+
<div align="center">
|
| 1121 |
+
<div style="width: 60px; height: 60px; margin: 20px auto; position: relative;">
|
| 1122 |
+
<div style="width: 100%; height: 100%; border: 2px solid #00d9ff; border-radius: 50%; position: relative;">
|
| 1123 |
+
<div style="position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); font-size: 24px; color: #00d9ff;">📖</div>
|
| 1124 |
+
</div>
|
| 1125 |
+
<div style="position: absolute; bottom: -5px; left: 50%; transform: translateX(-50%); width: 20px; height: 20px; background: white; border-right: 2px solid #00d9ff; border-bottom: 2px solid #00d9ff; transform: rotate(45deg);"></div>
|
| 1126 |
+
</div>
|
| 1127 |
+
</div>
|
| 1128 |
+
|
| 1129 |
+
```bibtex
|
| 1130 |
+
@article{guo2024lightrag,
|
| 1131 |
+
title={LightRAG: Simple and Fast Retrieval-Augmented Generation},
|
| 1132 |
+
author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang},
|
| 1133 |
+
year={2024},
|
| 1134 |
+
eprint={2410.05779},
|
| 1135 |
+
archivePrefix={arXiv},
|
| 1136 |
+
primaryClass={cs.IR}
|
| 1137 |
+
}
|
| 1138 |
+
```
|
| 1139 |
+
|
| 1140 |
+
---
|
| 1141 |
+
|
| 1142 |
+
## 🔗 相关项目
|
| 1143 |
+
|
| 1144 |
+
*生态系统与扩展*
|
| 1145 |
+
|
| 1146 |
+
<div align="center">
|
| 1147 |
+
<table>
|
| 1148 |
+
<tr>
|
| 1149 |
+
<td align="center">
|
| 1150 |
+
<a href="https://github.com/HKUDS/LightRAG">
|
| 1151 |
+
<div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
|
| 1152 |
+
<span style="font-size: 32px;">⚡</span>
|
| 1153 |
+
</div>
|
| 1154 |
+
<b>LightRAG</b><br>
|
| 1155 |
+
<sub>简单快速的RAG系统</sub>
|
| 1156 |
+
</a>
|
| 1157 |
+
</td>
|
| 1158 |
+
<td align="center">
|
| 1159 |
+
<a href="https://github.com/HKUDS/VideoRAG">
|
| 1160 |
+
<div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
|
| 1161 |
+
<span style="font-size: 32px;">🎥</span>
|
| 1162 |
+
</div>
|
| 1163 |
+
<b>VideoRAG</b><br>
|
| 1164 |
+
<sub>超长上下文视频RAG系统</sub>
|
| 1165 |
+
</a>
|
| 1166 |
+
</td>
|
| 1167 |
+
<td align="center">
|
| 1168 |
+
<a href="https://github.com/HKUDS/MiniRAG">
|
| 1169 |
+
<div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
|
| 1170 |
+
<span style="font-size: 32px;">✨</span>
|
| 1171 |
+
</div>
|
| 1172 |
+
<b>MiniRAG</b><br>
|
| 1173 |
+
<sub>极简RAG系统</sub>
|
| 1174 |
+
</a>
|
| 1175 |
+
</td>
|
| 1176 |
+
</tr>
|
| 1177 |
+
</table>
|
| 1178 |
+
</div>
|
| 1179 |
+
|
| 1180 |
+
---
|
| 1181 |
+
|
| 1182 |
+
## ⭐ Star History
|
| 1183 |
+
|
| 1184 |
+
*社区增长轨迹*
|
| 1185 |
+
|
| 1186 |
+
<div align="center">
|
| 1187 |
+
<a href="https://star-history.com/#HKUDS/RAG-Anything&Date">
|
| 1188 |
+
<picture>
|
| 1189 |
+
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date&theme=dark" />
|
| 1190 |
+
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date" />
|
| 1191 |
+
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date" style="border-radius: 15px; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);" />
|
| 1192 |
+
</picture>
|
| 1193 |
+
</a>
|
| 1194 |
+
</div>
|
| 1195 |
+
|
| 1196 |
+
---
|
| 1197 |
+
|
| 1198 |
+
## 🤝 贡献者
|
| 1199 |
+
|
| 1200 |
+
*加入创新*
|
| 1201 |
+
|
| 1202 |
+
<div align="center">
|
| 1203 |
+
感谢所有贡献者!
|
| 1204 |
+
</div>
|
| 1205 |
+
|
| 1206 |
+
<div align="center">
|
| 1207 |
+
<a href="https://github.com/HKUDS/RAG-Anything/graphs/contributors">
|
| 1208 |
+
<img src="https://contrib.rocks/image?repo=HKUDS/RAG-Anything" style="border-radius: 15px; box-shadow: 0 0 20px rgba(0, 217, 255, 0.3);" />
|
| 1209 |
+
</a>
|
| 1210 |
+
</div>
|
| 1211 |
+
|
| 1212 |
+
---
|
| 1213 |
+
|
| 1214 |
+
<div align="center" style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 30px; margin: 30px 0;">
|
| 1215 |
+
<div>
|
| 1216 |
+
<img src="https://user-images.githubusercontent.com/74038190/212284100-561aa473-3905-4a80-b561-0d28506553ee.gif" width="500">
|
| 1217 |
+
</div>
|
| 1218 |
+
<div style="margin-top: 20px;">
|
| 1219 |
+
<a href="https://github.com/HKUDS/RAG-Anything" style="text-decoration: none;">
|
| 1220 |
+
<img src="https://img.shields.io/badge/⭐%20在GitHub上为我们点星-1a1a2e?style=for-the-badge&logo=github&logoColor=white">
|
| 1221 |
+
</a>
|
| 1222 |
+
<a href="https://github.com/HKUDS/RAG-Anything/issues" style="text-decoration: none;">
|
| 1223 |
+
<img src="https://img.shields.io/badge/🐛%20报告问题-ff6b6b?style=for-the-badge&logo=github&logoColor=white">
|
| 1224 |
+
</a>
|
| 1225 |
+
<a href="https://github.com/HKUDS/RAG-Anything/discussions" style="text-decoration: none;">
|
| 1226 |
+
<img src="https://img.shields.io/badge/💬%20讨论交流-4ecdc4?style=for-the-badge&logo=github&logoColor=white">
|
| 1227 |
+
</a>
|
| 1228 |
+
</div>
|
| 1229 |
+
</div>
|
| 1230 |
+
|
| 1231 |
+
<div align="center">
|
| 1232 |
+
<div style="width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);">
|
| 1233 |
+
<div style="display: flex; justify-content: center; align-items: center; gap: 15px;">
|
| 1234 |
+
<span style="font-size: 24px;">⭐</span>
|
| 1235 |
+
<span style="color: #00d9ff; font-size: 18px;">感谢您访问RAG-Anything!</span>
|
| 1236 |
+
<span style="font-size: 24px;">⭐</span>
|
| 1237 |
+
</div>
|
| 1238 |
+
<div style="margin-top: 10px; color: #00d9ff; font-size: 16px;">构建多模态AI的未来</div>
|
| 1239 |
+
</div>
|
| 1240 |
+
</div>
|
| 1241 |
+
|
| 1242 |
+
<div align="center">
|
| 1243 |
+
<img src="https://readme-typing-svg.herokuapp.com?font=Orbitron&size=20&duration=3000&pause=1000&color=00D9FF&center=true&vCenter=true&width=600&lines=感谢您访问RAG-Anything!;构建多模态AI的未来;如果觉得有用请点星⭐!" alt="Closing Animation" />
|
| 1244 |
+
</div>
|
| 1245 |
+
|
| 1246 |
+
<style>
|
| 1247 |
+
@keyframes pulse {
|
| 1248 |
+
0% { transform: scale(1); }
|
| 1249 |
+
50% { transform: scale(1.05); }
|
| 1250 |
+
100% { transform: scale(1); }
|
| 1251 |
+
}
|
| 1252 |
+
|
| 1253 |
+
@keyframes glow {
|
| 1254 |
+
0% { box-shadow: 0 0 5px rgba(0, 217, 255, 0.5); }
|
| 1255 |
+
50% { box-shadow: 0 0 20px rgba(0, 217, 255, 0.8); }
|
| 1256 |
+
100% { box-shadow: 0 0 5px rgba(0, 217, 255, 0.5); }
|
| 1257 |
+
}
|
| 1258 |
+
</style>
|
rag_anything_smaranika/docs/batch_processing.md
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Batch Processing
|
| 2 |
+
|
| 3 |
+
This document describes the batch processing feature for RAG-Anything, which allows you to process multiple documents in parallel for improved throughput.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
The batch processing feature allows you to process multiple documents concurrently, significantly improving throughput for large document collections. It provides parallel processing, progress tracking, error handling, and flexible configuration options.
|
| 8 |
+
|
| 9 |
+
## Key Features
|
| 10 |
+
|
| 11 |
+
- **Parallel Processing**: Process multiple files concurrently using thread pools
|
| 12 |
+
- **Progress Tracking**: Real-time progress bars with `tqdm`
|
| 13 |
+
- **Error Handling**: Comprehensive error reporting and recovery
|
| 14 |
+
- **Flexible Input**: Support for files, directories, and recursive search
|
| 15 |
+
- **Configurable Workers**: Adjustable number of parallel workers
|
| 16 |
+
- **Installation Check Bypass**: Optional skip for environments with package conflicts
|
| 17 |
+
|
| 18 |
+
## Installation
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
# Basic installation
|
| 22 |
+
pip install raganything[all]
|
| 23 |
+
|
| 24 |
+
# Required for batch processing
|
| 25 |
+
pip install tqdm
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Usage
|
| 29 |
+
|
| 30 |
+
### Basic Batch Processing
|
| 31 |
+
|
| 32 |
+
```python
|
| 33 |
+
from raganything.batch_parser import BatchParser
|
| 34 |
+
|
| 35 |
+
# Create batch parser
|
| 36 |
+
batch_parser = BatchParser(
|
| 37 |
+
parser_type="mineru", # or "docling"
|
| 38 |
+
max_workers=4,
|
| 39 |
+
show_progress=True,
|
| 40 |
+
timeout_per_file=300,
|
| 41 |
+
skip_installation_check=False # Set to True if having parser installation issues
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# Process multiple files
|
| 45 |
+
result = batch_parser.process_batch(
|
| 46 |
+
file_paths=["doc1.pdf", "doc2.docx", "folder/"],
|
| 47 |
+
output_dir="./batch_output",
|
| 48 |
+
parse_method="auto",
|
| 49 |
+
recursive=True
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
# Check results
|
| 53 |
+
print(result.summary())
|
| 54 |
+
print(f"Success rate: {result.success_rate:.1f}%")
|
| 55 |
+
print(f"Processing time: {result.processing_time:.2f} seconds")
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### Asynchronous Batch Processing
|
| 59 |
+
|
| 60 |
+
```python
|
| 61 |
+
import asyncio
|
| 62 |
+
from raganything.batch_parser import BatchParser
|
| 63 |
+
|
| 64 |
+
async def async_batch_processing():
|
| 65 |
+
batch_parser = BatchParser(
|
| 66 |
+
parser_type="mineru",
|
| 67 |
+
max_workers=4,
|
| 68 |
+
show_progress=True
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
# Process files asynchronously
|
| 72 |
+
result = await batch_parser.process_batch_async(
|
| 73 |
+
file_paths=["doc1.pdf", "doc2.docx"],
|
| 74 |
+
output_dir="./output",
|
| 75 |
+
parse_method="auto"
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
return result
|
| 79 |
+
|
| 80 |
+
# Run async processing
|
| 81 |
+
result = asyncio.run(async_batch_processing())
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
### Integration with RAG-Anything
|
| 85 |
+
|
| 86 |
+
```python
|
| 87 |
+
from raganything import RAGAnything
|
| 88 |
+
|
| 89 |
+
rag = RAGAnything()
|
| 90 |
+
|
| 91 |
+
# Process documents with batch functionality
|
| 92 |
+
result = rag.process_documents_batch(
|
| 93 |
+
file_paths=["doc1.pdf", "doc2.docx"],
|
| 94 |
+
output_dir="./output",
|
| 95 |
+
max_workers=4,
|
| 96 |
+
show_progress=True
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
print(f"Processed {len(result.successful_files)} files successfully")
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
### Process Documents with RAG Integration
|
| 103 |
+
|
| 104 |
+
```python
|
| 105 |
+
# Process documents in batch and then add them to RAG
|
| 106 |
+
result = await rag.process_documents_with_rag_batch(
|
| 107 |
+
file_paths=["doc1.pdf", "doc2.docx"],
|
| 108 |
+
output_dir="./output",
|
| 109 |
+
max_workers=4,
|
| 110 |
+
show_progress=True
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
print(f"Processed {result['successful_rag_files']} files with RAG")
|
| 114 |
+
print(f"Total processing time: {result['total_processing_time']:.2f} seconds")
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
### Command Line Interface
|
| 118 |
+
|
| 119 |
+
```bash
|
| 120 |
+
# Basic batch processing
|
| 121 |
+
python -m raganything.batch_parser path/to/docs/ --output ./output --workers 4
|
| 122 |
+
|
| 123 |
+
# With specific parser
|
| 124 |
+
python -m raganything.batch_parser path/to/docs/ --parser mineru --method auto
|
| 125 |
+
|
| 126 |
+
# Without progress bar
|
| 127 |
+
python -m raganything.batch_parser path/to/docs/ --output ./output --no-progress
|
| 128 |
+
|
| 129 |
+
# Help
|
| 130 |
+
python -m raganything.batch_parser --help
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
## Configuration
|
| 134 |
+
|
| 135 |
+
### Environment Variables
|
| 136 |
+
|
| 137 |
+
```env
|
| 138 |
+
# Batch processing configuration
|
| 139 |
+
MAX_CONCURRENT_FILES=4
|
| 140 |
+
SUPPORTED_FILE_EXTENSIONS=.pdf,.docx,.doc,.pptx,.ppt,.xlsx,.xls,.txt,.md
|
| 141 |
+
RECURSIVE_FOLDER_PROCESSING=true
|
| 142 |
+
PARSER_OUTPUT_DIR=./parsed_output
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
### BatchParser Parameters
|
| 146 |
+
|
| 147 |
+
- **parser_type**: `"mineru"` or `"docling"` (default: `"mineru"`)
|
| 148 |
+
- **max_workers**: Number of parallel workers (default: `4`)
|
| 149 |
+
- **show_progress**: Show progress bar (default: `True`)
|
| 150 |
+
- **timeout_per_file**: Timeout per file in seconds (default: `300`)
|
| 151 |
+
- **skip_installation_check**: Skip parser installation check (default: `False`)
|
| 152 |
+
|
| 153 |
+
## Supported File Types
|
| 154 |
+
|
| 155 |
+
- **PDF files**: `.pdf`
|
| 156 |
+
- **Office documents**: `.doc`, `.docx`, `.ppt`, `.pptx`, `.xls`, `.xlsx`
|
| 157 |
+
- **Images**: `.png`, `.jpg`, `.jpeg`, `.bmp`, `.tiff`, `.tif`, `.gif`, `.webp`
|
| 158 |
+
- **Text files**: `.txt`, `.md`
|
| 159 |
+
|
| 160 |
+
## API Reference
|
| 161 |
+
|
| 162 |
+
### BatchProcessingResult
|
| 163 |
+
|
| 164 |
+
```python
|
| 165 |
+
@dataclass
|
| 166 |
+
class BatchProcessingResult:
|
| 167 |
+
successful_files: List[str] # Successfully processed files
|
| 168 |
+
failed_files: List[str] # Failed files
|
| 169 |
+
total_files: int # Total number of files
|
| 170 |
+
processing_time: float # Total processing time in seconds
|
| 171 |
+
errors: Dict[str, str] # Error messages for failed files
|
| 172 |
+
output_dir: str # Output directory used
|
| 173 |
+
|
| 174 |
+
def summary(self) -> str: # Human-readable summary
|
| 175 |
+
def success_rate(self) -> float: # Success rate as percentage
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
### BatchParser Methods
|
| 179 |
+
|
| 180 |
+
```python
|
| 181 |
+
class BatchParser:
|
| 182 |
+
def __init__(self, parser_type: str = "mineru", max_workers: int = 4, ...):
|
| 183 |
+
"""Initialize batch parser"""
|
| 184 |
+
|
| 185 |
+
def get_supported_extensions(self) -> List[str]:
|
| 186 |
+
"""Get list of supported file extensions"""
|
| 187 |
+
|
| 188 |
+
def filter_supported_files(self, file_paths: List[str], recursive: bool = True) -> List[str]:
|
| 189 |
+
"""Filter files to only supported types"""
|
| 190 |
+
|
| 191 |
+
def process_batch(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
|
| 192 |
+
"""Process files in batch"""
|
| 193 |
+
|
| 194 |
+
async def process_batch_async(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
|
| 195 |
+
"""Process files in batch asynchronously"""
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
## Performance Considerations
|
| 199 |
+
|
| 200 |
+
### Memory Usage
|
| 201 |
+
- Each worker uses additional memory
|
| 202 |
+
- Recommended: 2-4 workers for most systems
|
| 203 |
+
- Monitor memory usage with large files
|
| 204 |
+
|
| 205 |
+
### CPU Usage
|
| 206 |
+
- Parallel processing utilizes multiple cores
|
| 207 |
+
- Optimal worker count depends on CPU cores and file sizes
|
| 208 |
+
- I/O may become bottleneck with many small files
|
| 209 |
+
|
| 210 |
+
### Recommended Settings
|
| 211 |
+
- **Small files** (< 1MB): Higher worker count (6-8)
|
| 212 |
+
- **Large files** (> 100MB): Lower worker count (2-3)
|
| 213 |
+
- **Mixed sizes**: Start with 4 workers and adjust
|
| 214 |
+
|
| 215 |
+
## Troubleshooting
|
| 216 |
+
|
| 217 |
+
### Common Issues
|
| 218 |
+
|
| 219 |
+
#### Memory Errors
|
| 220 |
+
```python
|
| 221 |
+
# Solution: Reduce max_workers
|
| 222 |
+
batch_parser = BatchParser(max_workers=2)
|
| 223 |
+
```
|
| 224 |
+
|
| 225 |
+
#### Timeout Errors
|
| 226 |
+
```python
|
| 227 |
+
# Solution: Increase timeout_per_file
|
| 228 |
+
batch_parser = BatchParser(timeout_per_file=600) # 10 minutes
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
#### Parser Installation Issues
|
| 232 |
+
```python
|
| 233 |
+
# Solution: Skip installation check
|
| 234 |
+
batch_parser = BatchParser(skip_installation_check=True)
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
#### File Not Found Errors
|
| 238 |
+
- Check file paths and permissions
|
| 239 |
+
- Ensure input files exist
|
| 240 |
+
- Verify directory access rights
|
| 241 |
+
|
| 242 |
+
### Debug Mode
|
| 243 |
+
|
| 244 |
+
Enable debug logging for detailed information:
|
| 245 |
+
|
| 246 |
+
```python
|
| 247 |
+
import logging
|
| 248 |
+
logging.basicConfig(level=logging.DEBUG)
|
| 249 |
+
|
| 250 |
+
# Create batch parser with debug logging
|
| 251 |
+
batch_parser = BatchParser(parser_type="mineru", max_workers=2)
|
| 252 |
+
```
|
| 253 |
+
|
| 254 |
+
### Error Handling
|
| 255 |
+
|
| 256 |
+
The batch processor provides comprehensive error handling:
|
| 257 |
+
|
| 258 |
+
```python
|
| 259 |
+
result = batch_parser.process_batch(file_paths=["doc1.pdf", "doc2.docx"])
|
| 260 |
+
|
| 261 |
+
# Check for errors
|
| 262 |
+
if result.failed_files:
|
| 263 |
+
print("Failed files:")
|
| 264 |
+
for file_path in result.failed_files:
|
| 265 |
+
error_message = result.errors.get(file_path, "Unknown error")
|
| 266 |
+
print(f" - {file_path}: {error_message}")
|
| 267 |
+
|
| 268 |
+
# Process only successful files
|
| 269 |
+
for file_path in result.successful_files:
|
| 270 |
+
print(f"Successfully processed: {file_path}")
|
| 271 |
+
```
|
| 272 |
+
|
| 273 |
+
## Examples
|
| 274 |
+
|
| 275 |
+
### Process Entire Directory
|
| 276 |
+
|
| 277 |
+
```python
|
| 278 |
+
from pathlib import Path
|
| 279 |
+
|
| 280 |
+
# Process all supported files in a directory
|
| 281 |
+
batch_parser = BatchParser(max_workers=4)
|
| 282 |
+
directory_path = Path("./documents")
|
| 283 |
+
|
| 284 |
+
result = batch_parser.process_batch(
|
| 285 |
+
file_paths=[str(directory_path)],
|
| 286 |
+
output_dir="./processed",
|
| 287 |
+
recursive=True # Include subdirectories
|
| 288 |
+
)
|
| 289 |
+
|
| 290 |
+
print(f"Processed {len(result.successful_files)} out of {result.total_files} files")
|
| 291 |
+
```
|
| 292 |
+
|
| 293 |
+
### Filter Files Before Processing
|
| 294 |
+
|
| 295 |
+
```python
|
| 296 |
+
# Get all files in directory
|
| 297 |
+
all_files = ["doc1.pdf", "image.png", "spreadsheet.xlsx", "unsupported.xyz"]
|
| 298 |
+
|
| 299 |
+
# Filter to supported files only
|
| 300 |
+
supported_files = batch_parser.filter_supported_files(all_files)
|
| 301 |
+
print(f"Will process {len(supported_files)} out of {len(all_files)} files")
|
| 302 |
+
|
| 303 |
+
# Process only supported files
|
| 304 |
+
result = batch_parser.process_batch(
|
| 305 |
+
file_paths=supported_files,
|
| 306 |
+
output_dir="./output"
|
| 307 |
+
)
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
### Custom Error Handling
|
| 311 |
+
|
| 312 |
+
```python
|
| 313 |
+
def process_with_retry(file_paths, max_retries=3):
|
| 314 |
+
"""Process files with retry logic"""
|
| 315 |
+
|
| 316 |
+
for attempt in range(max_retries):
|
| 317 |
+
result = batch_parser.process_batch(file_paths, "./output")
|
| 318 |
+
|
| 319 |
+
if not result.failed_files:
|
| 320 |
+
break # All files processed successfully
|
| 321 |
+
|
| 322 |
+
print(f"Attempt {attempt + 1}: {len(result.failed_files)} files failed")
|
| 323 |
+
file_paths = result.failed_files # Retry failed files
|
| 324 |
+
|
| 325 |
+
return result
|
| 326 |
+
```
|
| 327 |
+
|
| 328 |
+
## Best Practices
|
| 329 |
+
|
| 330 |
+
1. **Start with default settings** and adjust based on performance
|
| 331 |
+
2. **Monitor system resources** during batch processing
|
| 332 |
+
3. **Use appropriate worker counts** for your hardware
|
| 333 |
+
4. **Handle errors gracefully** with retry logic
|
| 334 |
+
5. **Test with small batches** before processing large collections
|
| 335 |
+
6. **Use skip_installation_check** if facing parser installation issues
|
| 336 |
+
7. **Enable progress tracking** for long-running operations
|
| 337 |
+
8. **Set appropriate timeouts** based on expected file processing times
|
| 338 |
+
|
| 339 |
+
## Conclusion
|
| 340 |
+
|
| 341 |
+
The batch processing feature significantly improves RAG-Anything's throughput for large document collections. It provides flexible configuration options, comprehensive error handling, and seamless integration with the existing RAG-Anything pipeline.
|
rag_anything_smaranika/docs/context_aware_processing.md
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Context-Aware Multimodal Processing in RAGAnything
|
| 2 |
+
|
| 3 |
+
This document describes the context-aware multimodal processing feature in RAGAnything, which provides surrounding content information to LLMs when analyzing images, tables, equations, and other multimodal content for enhanced accuracy and relevance.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
The context-aware feature enables RAGAnything to automatically extract and provide surrounding text content as context when processing multimodal content. This leads to more accurate and contextually relevant analysis by giving AI models additional information about where the content appears in the document structure.
|
| 8 |
+
|
| 9 |
+
### Key Benefits
|
| 10 |
+
|
| 11 |
+
- **Enhanced Accuracy**: Context helps AI understand the purpose and meaning of multimodal content
|
| 12 |
+
- **Semantic Coherence**: Generated descriptions align with document context and terminology
|
| 13 |
+
- **Automated Integration**: Context extraction is automatically enabled during document processing
|
| 14 |
+
- **Flexible Configuration**: Multiple extraction modes and filtering options
|
| 15 |
+
|
| 16 |
+
## Key Features
|
| 17 |
+
|
| 18 |
+
### 1. Configuration Support
|
| 19 |
+
- **Integrated Configuration**: Complete context options in `RAGAnythingConfig`
|
| 20 |
+
- **Environment Variables**: Configure all context parameters via environment variables
|
| 21 |
+
- **Dynamic Updates**: Runtime configuration updates supported
|
| 22 |
+
- **Content Format Control**: Configurable content source format detection
|
| 23 |
+
|
| 24 |
+
### 2. Automated Integration
|
| 25 |
+
- **Auto-Initialization**: Modal processors automatically receive tokenizer and context configuration
|
| 26 |
+
- **Content Source Setup**: Document processing automatically sets content sources for context extraction
|
| 27 |
+
- **Position Information**: Automatic position info (page_idx, index) passed to processors
|
| 28 |
+
- **Batch Processing**: Context-aware batch processing for efficient document handling
|
| 29 |
+
|
| 30 |
+
### 3. Advanced Token Management
|
| 31 |
+
- **Accurate Token Counting**: Uses LightRAG's tokenizer for precise token calculation
|
| 32 |
+
- **Smart Boundary Preservation**: Truncates at sentence/paragraph boundaries
|
| 33 |
+
- **Backward Compatibility**: Fallback to character truncation when tokenizer unavailable
|
| 34 |
+
|
| 35 |
+
### 4. Universal Context Extraction
|
| 36 |
+
- **Multiple Formats**: Support for MinerU, plain text, custom formats
|
| 37 |
+
- **Flexible Modes**: Page-based and chunk-based context extraction
|
| 38 |
+
- **Content Filtering**: Configurable content type filtering
|
| 39 |
+
- **Header Support**: Optional inclusion of document headers and structure
|
| 40 |
+
|
| 41 |
+
## Configuration
|
| 42 |
+
|
| 43 |
+
### RAGAnythingConfig Parameters
|
| 44 |
+
|
| 45 |
+
```python
|
| 46 |
+
# Context Extraction Configuration
|
| 47 |
+
context_window: int = 1 # Context window size (pages/chunks)
|
| 48 |
+
context_mode: str = "page" # Context mode ("page" or "chunk")
|
| 49 |
+
max_context_tokens: int = 2000 # Maximum context tokens
|
| 50 |
+
include_headers: bool = True # Include document headers
|
| 51 |
+
include_captions: bool = True # Include image/table captions
|
| 52 |
+
context_filter_content_types: List[str] = ["text"] # Content types to include
|
| 53 |
+
content_format: str = "minerU" # Default content format for context extraction
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
### Environment Variables
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
# Context extraction settings
|
| 60 |
+
CONTEXT_WINDOW=2
|
| 61 |
+
CONTEXT_MODE=page
|
| 62 |
+
MAX_CONTEXT_TOKENS=3000
|
| 63 |
+
INCLUDE_HEADERS=true
|
| 64 |
+
INCLUDE_CAPTIONS=true
|
| 65 |
+
CONTEXT_FILTER_CONTENT_TYPES=text,image
|
| 66 |
+
CONTENT_FORMAT=minerU
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
## Usage Guide
|
| 70 |
+
|
| 71 |
+
### 1. Basic Configuration
|
| 72 |
+
|
| 73 |
+
```python
|
| 74 |
+
from raganything import RAGAnything, RAGAnythingConfig
|
| 75 |
+
|
| 76 |
+
# Create configuration with context settings
|
| 77 |
+
config = RAGAnythingConfig(
|
| 78 |
+
context_window=2,
|
| 79 |
+
context_mode="page",
|
| 80 |
+
max_context_tokens=3000,
|
| 81 |
+
include_headers=True,
|
| 82 |
+
include_captions=True,
|
| 83 |
+
context_filter_content_types=["text", "image"],
|
| 84 |
+
content_format="minerU"
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
# Create RAGAnything instance
|
| 88 |
+
rag_anything = RAGAnything(
|
| 89 |
+
config=config,
|
| 90 |
+
llm_model_func=your_llm_function,
|
| 91 |
+
embedding_func=your_embedding_function
|
| 92 |
+
)
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
### 2. Automatic Document Processing
|
| 96 |
+
|
| 97 |
+
```python
|
| 98 |
+
# Context is automatically enabled during document processing
|
| 99 |
+
await rag_anything.process_document_complete("document.pdf")
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
### 3. Manual Content Source Configuration
|
| 103 |
+
|
| 104 |
+
```python
|
| 105 |
+
# Set content source for specific content lists
|
| 106 |
+
rag_anything.set_content_source_for_context(content_list, "minerU")
|
| 107 |
+
|
| 108 |
+
# Update context configuration at runtime
|
| 109 |
+
rag_anything.update_context_config(
|
| 110 |
+
context_window=1,
|
| 111 |
+
max_context_tokens=1500,
|
| 112 |
+
include_captions=False
|
| 113 |
+
)
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
### 4. Direct Modal Processor Usage
|
| 117 |
+
|
| 118 |
+
```python
|
| 119 |
+
from raganything.modalprocessors import (
|
| 120 |
+
ContextExtractor,
|
| 121 |
+
ContextConfig,
|
| 122 |
+
ImageModalProcessor
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
# Configure context extraction
|
| 126 |
+
config = ContextConfig(
|
| 127 |
+
context_window=1,
|
| 128 |
+
context_mode="page",
|
| 129 |
+
max_context_tokens=2000,
|
| 130 |
+
include_headers=True,
|
| 131 |
+
include_captions=True,
|
| 132 |
+
filter_content_types=["text"]
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
# Initialize context extractor
|
| 136 |
+
context_extractor = ContextExtractor(config)
|
| 137 |
+
|
| 138 |
+
# Initialize modal processor with context support
|
| 139 |
+
processor = ImageModalProcessor(lightrag, caption_func, context_extractor)
|
| 140 |
+
|
| 141 |
+
# Set content source
|
| 142 |
+
processor.set_content_source(content_list, "minerU")
|
| 143 |
+
|
| 144 |
+
# Process with context
|
| 145 |
+
item_info = {
|
| 146 |
+
"page_idx": 2,
|
| 147 |
+
"index": 5,
|
| 148 |
+
"type": "image"
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
result = await processor.process_multimodal_content(
|
| 152 |
+
modal_content=image_data,
|
| 153 |
+
content_type="image",
|
| 154 |
+
file_path="document.pdf",
|
| 155 |
+
entity_name="Architecture Diagram",
|
| 156 |
+
item_info=item_info
|
| 157 |
+
)
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
## Context Modes
|
| 161 |
+
|
| 162 |
+
### Page-Based Context (`context_mode="page"`)
|
| 163 |
+
- Extracts context based on page boundaries
|
| 164 |
+
- Uses `page_idx` field from content items
|
| 165 |
+
- Suitable for document-structured content
|
| 166 |
+
- Example: Include text from 2 pages before and after current image
|
| 167 |
+
|
| 168 |
+
### Chunk-Based Context (`context_mode="chunk"`)
|
| 169 |
+
- Extracts context based on content item positions
|
| 170 |
+
- Uses sequential position in content list
|
| 171 |
+
- Suitable for fine-grained control
|
| 172 |
+
- Example: Include 5 content items before and after current table
|
| 173 |
+
|
| 174 |
+
## Processing Workflow
|
| 175 |
+
|
| 176 |
+
### 1. Document Parsing
|
| 177 |
+
```
|
| 178 |
+
Document Input → MinerU Parsing → content_list Generation
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
### 2. Context Setup
|
| 182 |
+
```
|
| 183 |
+
content_list → Set as Context Source → All Modal Processors Gain Context Capability
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
### 3. Multimodal Processing
|
| 187 |
+
```
|
| 188 |
+
Multimodal Content → Extract Surrounding Context → Enhanced LLM Analysis → More Accurate Results
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
## Content Source Formats
|
| 192 |
+
|
| 193 |
+
### MinerU Format
|
| 194 |
+
```json
|
| 195 |
+
[
|
| 196 |
+
{
|
| 197 |
+
"type": "text",
|
| 198 |
+
"text": "Document content here...",
|
| 199 |
+
"text_level": 1,
|
| 200 |
+
"page_idx": 0
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"type": "image",
|
| 204 |
+
"img_path": "images/figure1.jpg",
|
| 205 |
+
"image_caption": ["Figure 1: Architecture"],
|
| 206 |
+
"image_footnote": [],
|
| 207 |
+
"page_idx": 1
|
| 208 |
+
}
|
| 209 |
+
]
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
### Custom Text Chunks
|
| 213 |
+
```python
|
| 214 |
+
text_chunks = [
|
| 215 |
+
"First chunk of text content...",
|
| 216 |
+
"Second chunk of text content...",
|
| 217 |
+
"Third chunk of text content..."
|
| 218 |
+
]
|
| 219 |
+
```
|
| 220 |
+
|
| 221 |
+
### Plain Text
|
| 222 |
+
```python
|
| 223 |
+
full_document = "Complete document text with all content..."
|
| 224 |
+
```
|
| 225 |
+
|
| 226 |
+
## Configuration Examples
|
| 227 |
+
|
| 228 |
+
### High-Precision Context
|
| 229 |
+
For focused analysis with minimal context:
|
| 230 |
+
```python
|
| 231 |
+
config = RAGAnythingConfig(
|
| 232 |
+
context_window=1,
|
| 233 |
+
context_mode="page",
|
| 234 |
+
max_context_tokens=1000,
|
| 235 |
+
include_headers=True,
|
| 236 |
+
include_captions=False,
|
| 237 |
+
context_filter_content_types=["text"]
|
| 238 |
+
)
|
| 239 |
+
```
|
| 240 |
+
|
| 241 |
+
### Comprehensive Context
|
| 242 |
+
For broad analysis with rich context:
|
| 243 |
+
```python
|
| 244 |
+
config = RAGAnythingConfig(
|
| 245 |
+
context_window=2,
|
| 246 |
+
context_mode="page",
|
| 247 |
+
max_context_tokens=3000,
|
| 248 |
+
include_headers=True,
|
| 249 |
+
include_captions=True,
|
| 250 |
+
context_filter_content_types=["text", "image", "table"]
|
| 251 |
+
)
|
| 252 |
+
```
|
| 253 |
+
|
| 254 |
+
### Chunk-Based Analysis
|
| 255 |
+
For fine-grained sequential context:
|
| 256 |
+
```python
|
| 257 |
+
config = RAGAnythingConfig(
|
| 258 |
+
context_window=5,
|
| 259 |
+
context_mode="chunk",
|
| 260 |
+
max_context_tokens=2000,
|
| 261 |
+
include_headers=False,
|
| 262 |
+
include_captions=False,
|
| 263 |
+
context_filter_content_types=["text"]
|
| 264 |
+
)
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
## Performance Optimization
|
| 268 |
+
|
| 269 |
+
### 1. Accurate Token Control
|
| 270 |
+
- Uses real tokenizer for precise token counting
|
| 271 |
+
- Avoids exceeding LLM token limits
|
| 272 |
+
- Provides consistent performance
|
| 273 |
+
|
| 274 |
+
### 2. Smart Truncation
|
| 275 |
+
- Truncates at sentence boundaries
|
| 276 |
+
- Maintains semantic integrity
|
| 277 |
+
- Adds truncation indicators
|
| 278 |
+
|
| 279 |
+
### 3. Caching Optimization
|
| 280 |
+
- Context extraction results can be reused
|
| 281 |
+
- Reduces redundant computation overhead
|
| 282 |
+
|
| 283 |
+
## Advanced Features
|
| 284 |
+
|
| 285 |
+
### Context Truncation
|
| 286 |
+
The system automatically truncates context to fit within token limits:
|
| 287 |
+
- Uses actual tokenizer for accurate token counting
|
| 288 |
+
- Attempts to end at sentence boundaries (periods)
|
| 289 |
+
- Falls back to line boundaries if needed
|
| 290 |
+
- Adds "..." indicator for truncated content
|
| 291 |
+
|
| 292 |
+
### Header Formatting
|
| 293 |
+
When `include_headers=True`, headers are formatted with markdown-style prefixes:
|
| 294 |
+
```
|
| 295 |
+
# Level 1 Header
|
| 296 |
+
## Level 2 Header
|
| 297 |
+
### Level 3 Header
|
| 298 |
+
```
|
| 299 |
+
|
| 300 |
+
### Caption Integration
|
| 301 |
+
When `include_captions=True`, image and table captions are included as:
|
| 302 |
+
```
|
| 303 |
+
[Image: Figure 1 caption text]
|
| 304 |
+
[Table: Table 1 caption text]
|
| 305 |
+
```
|
| 306 |
+
|
| 307 |
+
## Integration with RAGAnything
|
| 308 |
+
|
| 309 |
+
The context-aware feature is seamlessly integrated into RAGAnything's workflow:
|
| 310 |
+
|
| 311 |
+
1. **Automatic Setup**: Context extractors are automatically created and configured
|
| 312 |
+
2. **Content Source Management**: Document processing automatically sets content sources
|
| 313 |
+
3. **Processor Integration**: All modal processors receive context capabilities
|
| 314 |
+
4. **Configuration Consistency**: Single configuration system for all context settings
|
| 315 |
+
|
| 316 |
+
## Error Handling
|
| 317 |
+
|
| 318 |
+
The system includes robust error handling:
|
| 319 |
+
- Gracefully handles missing or invalid content sources
|
| 320 |
+
- Returns empty context for unsupported formats
|
| 321 |
+
- Logs warnings for configuration issues
|
| 322 |
+
- Continues processing even if context extraction fails
|
| 323 |
+
|
| 324 |
+
## Compatibility
|
| 325 |
+
|
| 326 |
+
- **Backward Compatible**: Existing code works without modification
|
| 327 |
+
- **Optional Feature**: Context can be selectively enabled/disabled
|
| 328 |
+
- **Flexible Configuration**: Supports multiple configuration combinations
|
| 329 |
+
|
| 330 |
+
## Best Practices
|
| 331 |
+
|
| 332 |
+
1. **Token Limits**: Ensure `max_context_tokens` doesn't exceed LLM context limits
|
| 333 |
+
2. **Performance Impact**: Larger context windows increase processing time
|
| 334 |
+
3. **Content Quality**: Context quality directly affects analysis accuracy
|
| 335 |
+
4. **Window Size**: Match window size to content structure (documents vs articles)
|
| 336 |
+
5. **Content Filtering**: Use `context_filter_content_types` to reduce noise
|
| 337 |
+
|
| 338 |
+
## Troubleshooting
|
| 339 |
+
|
| 340 |
+
### Common Issues
|
| 341 |
+
|
| 342 |
+
**Context Not Extracted**
|
| 343 |
+
- Check if `set_content_source_for_context()` was called
|
| 344 |
+
- Verify `item_info` contains required fields (`page_idx`, `index`)
|
| 345 |
+
- Confirm content source format is correct
|
| 346 |
+
|
| 347 |
+
**Context Too Long/Short**
|
| 348 |
+
- Adjust `max_context_tokens` setting
|
| 349 |
+
- Modify `context_window` size
|
| 350 |
+
- Check `context_filter_content_types` configuration
|
| 351 |
+
|
| 352 |
+
**Irrelevant Context**
|
| 353 |
+
- Refine `context_filter_content_types` to exclude noise
|
| 354 |
+
- Reduce `context_window` size
|
| 355 |
+
- Set `include_captions=False` if captions are not helpful
|
| 356 |
+
|
| 357 |
+
**Configuration Issues**
|
| 358 |
+
- Verify environment variables are set correctly
|
| 359 |
+
- Check RAGAnythingConfig parameter names
|
| 360 |
+
- Ensure content_format matches your data source
|
| 361 |
+
|
| 362 |
+
## Examples
|
| 363 |
+
|
| 364 |
+
Check out these example files for complete usage demonstrations:
|
| 365 |
+
|
| 366 |
+
- **Configuration Examples**: See how to set up different context configurations
|
| 367 |
+
- **Integration Examples**: Learn how to integrate context-aware processing into your workflow
|
| 368 |
+
- **Custom Processors**: Examples of creating custom modal processors with context support
|
| 369 |
+
|
| 370 |
+
## API Reference
|
| 371 |
+
|
| 372 |
+
For detailed API documentation, see the docstrings in:
|
| 373 |
+
- `raganything/modalprocessors.py` - Context extraction and modal processors
|
| 374 |
+
- `raganything/config.py` - Configuration options
|
| 375 |
+
- `raganything/raganything.py` - Main RAGAnything class integration
|
rag_anything_smaranika/docs/enhanced_markdown.md
ADDED
|
@@ -0,0 +1,552 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Enhanced Markdown Conversion
|
| 2 |
+
|
| 3 |
+
This document describes the enhanced markdown conversion feature for RAG-Anything, which provides high-quality PDF generation from markdown files with multiple backend options and advanced styling.
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
The enhanced markdown conversion feature provides professional-quality PDF generation from markdown files. It supports multiple conversion backends, advanced styling options, syntax highlighting, and seamless integration with RAG-Anything's document processing pipeline.
|
| 8 |
+
|
| 9 |
+
## Key Features
|
| 10 |
+
|
| 11 |
+
- **Multiple Backends**: WeasyPrint, Pandoc, and automatic backend selection
|
| 12 |
+
- **Advanced Styling**: Custom CSS, syntax highlighting, and professional layouts
|
| 13 |
+
- **Image Support**: Embedded images with proper scaling and positioning
|
| 14 |
+
- **Table Support**: Formatted tables with borders and professional styling
|
| 15 |
+
- **Code Highlighting**: Syntax highlighting for code blocks using Pygments
|
| 16 |
+
- **Custom Templates**: Support for custom CSS and document templates
|
| 17 |
+
- **Table of Contents**: Automatic TOC generation with navigation links
|
| 18 |
+
- **Professional Typography**: High-quality fonts and spacing
|
| 19 |
+
|
| 20 |
+
## Installation
|
| 21 |
+
|
| 22 |
+
### Required Dependencies
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
# Basic installation
|
| 26 |
+
pip install raganything[all]
|
| 27 |
+
|
| 28 |
+
# Required for enhanced markdown conversion
|
| 29 |
+
pip install markdown weasyprint pygments
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### Optional Dependencies
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
# For Pandoc backend (system installation required)
|
| 36 |
+
# Ubuntu/Debian:
|
| 37 |
+
sudo apt-get install pandoc wkhtmltopdf
|
| 38 |
+
|
| 39 |
+
# macOS:
|
| 40 |
+
brew install pandoc wkhtmltopdf
|
| 41 |
+
|
| 42 |
+
# Or using conda:
|
| 43 |
+
conda install -c conda-forge pandoc wkhtmltopdf
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### Backend-Specific Installation
|
| 47 |
+
|
| 48 |
+
#### WeasyPrint (Recommended)
|
| 49 |
+
```bash
|
| 50 |
+
# Install WeasyPrint with system dependencies
|
| 51 |
+
pip install weasyprint
|
| 52 |
+
|
| 53 |
+
# Ubuntu/Debian system dependencies:
|
| 54 |
+
sudo apt-get install -y build-essential python3-dev python3-pip \
|
| 55 |
+
python3-setuptools python3-wheel python3-cffi libcairo2 \
|
| 56 |
+
libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \
|
| 57 |
+
libffi-dev shared-mime-info
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
#### Pandoc
|
| 61 |
+
- Download from: https://pandoc.org/installing.html
|
| 62 |
+
- Requires system-wide installation
|
| 63 |
+
- Used for complex document structures and LaTeX-quality output
|
| 64 |
+
|
| 65 |
+
## Usage
|
| 66 |
+
|
| 67 |
+
### Basic Conversion
|
| 68 |
+
|
| 69 |
+
```python
|
| 70 |
+
from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig
|
| 71 |
+
|
| 72 |
+
# Create converter with default settings
|
| 73 |
+
converter = EnhancedMarkdownConverter()
|
| 74 |
+
|
| 75 |
+
# Convert markdown file to PDF
|
| 76 |
+
success = converter.convert_file_to_pdf(
|
| 77 |
+
input_path="document.md",
|
| 78 |
+
output_path="document.pdf",
|
| 79 |
+
method="auto" # Automatically select best available backend
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
if success:
|
| 83 |
+
print("✅ Conversion successful!")
|
| 84 |
+
else:
|
| 85 |
+
print("❌ Conversion failed")
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### Advanced Configuration
|
| 89 |
+
|
| 90 |
+
```python
|
| 91 |
+
# Create custom configuration
|
| 92 |
+
config = MarkdownConfig(
|
| 93 |
+
page_size="A4", # A4, Letter, Legal, etc.
|
| 94 |
+
margin="1in", # CSS-style margins
|
| 95 |
+
font_size="12pt", # Base font size
|
| 96 |
+
line_height="1.5", # Line spacing
|
| 97 |
+
include_toc=True, # Generate table of contents
|
| 98 |
+
syntax_highlighting=True, # Enable code syntax highlighting
|
| 99 |
+
|
| 100 |
+
# Custom CSS styling
|
| 101 |
+
custom_css="""
|
| 102 |
+
body {
|
| 103 |
+
font-family: 'Georgia', serif;
|
| 104 |
+
color: #333;
|
| 105 |
+
}
|
| 106 |
+
h1 {
|
| 107 |
+
color: #2c3e50;
|
| 108 |
+
border-bottom: 2px solid #3498db;
|
| 109 |
+
padding-bottom: 0.3em;
|
| 110 |
+
}
|
| 111 |
+
code {
|
| 112 |
+
background-color: #f8f9fa;
|
| 113 |
+
padding: 2px 4px;
|
| 114 |
+
border-radius: 3px;
|
| 115 |
+
}
|
| 116 |
+
pre {
|
| 117 |
+
background-color: #f8f9fa;
|
| 118 |
+
border-left: 4px solid #3498db;
|
| 119 |
+
padding: 15px;
|
| 120 |
+
border-radius: 5px;
|
| 121 |
+
}
|
| 122 |
+
table {
|
| 123 |
+
border-collapse: collapse;
|
| 124 |
+
width: 100%;
|
| 125 |
+
margin: 1em 0;
|
| 126 |
+
}
|
| 127 |
+
th, td {
|
| 128 |
+
border: 1px solid #ddd;
|
| 129 |
+
padding: 8px 12px;
|
| 130 |
+
text-align: left;
|
| 131 |
+
}
|
| 132 |
+
th {
|
| 133 |
+
background-color: #f2f2f2;
|
| 134 |
+
font-weight: bold;
|
| 135 |
+
}
|
| 136 |
+
"""
|
| 137 |
+
)
|
| 138 |
+
|
| 139 |
+
converter = EnhancedMarkdownConverter(config)
|
| 140 |
+
```
|
| 141 |
+
|
| 142 |
+
### Backend Selection
|
| 143 |
+
|
| 144 |
+
```python
|
| 145 |
+
# Check available backends
|
| 146 |
+
converter = EnhancedMarkdownConverter()
|
| 147 |
+
backend_info = converter.get_backend_info()
|
| 148 |
+
|
| 149 |
+
print("Available backends:")
|
| 150 |
+
for backend, available in backend_info["available_backends"].items():
|
| 151 |
+
status = "✅" if available else "❌"
|
| 152 |
+
print(f" {status} {backend}")
|
| 153 |
+
|
| 154 |
+
print(f"Recommended backend: {backend_info['recommended_backend']}")
|
| 155 |
+
|
| 156 |
+
# Use specific backend
|
| 157 |
+
converter.convert_file_to_pdf(
|
| 158 |
+
input_path="document.md",
|
| 159 |
+
output_path="document.pdf",
|
| 160 |
+
method="weasyprint" # or "pandoc", "pandoc_system", "auto"
|
| 161 |
+
)
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
### Content Conversion
|
| 165 |
+
|
| 166 |
+
```python
|
| 167 |
+
# Convert markdown content directly (not from file)
|
| 168 |
+
markdown_content = """
|
| 169 |
+
# Sample Document
|
| 170 |
+
|
| 171 |
+
## Introduction
|
| 172 |
+
This is a **bold** statement with *italic* text.
|
| 173 |
+
|
| 174 |
+
## Code Example
|
| 175 |
+
```python
|
| 176 |
+
def hello_world():
|
| 177 |
+
print("Hello, World!")
|
| 178 |
+
return "Success"
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
## Table
|
| 182 |
+
| Feature | Status | Notes |
|
| 183 |
+
|---------|--------|-------|
|
| 184 |
+
| PDF Generation | ✅ | Working |
|
| 185 |
+
| Syntax Highlighting | ✅ | Pygments |
|
| 186 |
+
| Custom CSS | ✅ | Full support |
|
| 187 |
+
"""
|
| 188 |
+
|
| 189 |
+
success = converter.convert_markdown_to_pdf(
|
| 190 |
+
markdown_content=markdown_content,
|
| 191 |
+
output_path="sample.pdf",
|
| 192 |
+
method="auto"
|
| 193 |
+
)
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
### Command Line Interface
|
| 197 |
+
|
| 198 |
+
```bash
|
| 199 |
+
# Basic conversion
|
| 200 |
+
python -m raganything.enhanced_markdown document.md --output document.pdf
|
| 201 |
+
|
| 202 |
+
# With specific backend
|
| 203 |
+
python -m raganything.enhanced_markdown document.md --method weasyprint
|
| 204 |
+
|
| 205 |
+
# With custom CSS file
|
| 206 |
+
python -m raganything.enhanced_markdown document.md --css custom_style.css
|
| 207 |
+
|
| 208 |
+
# Show backend information
|
| 209 |
+
python -m raganything.enhanced_markdown --info
|
| 210 |
+
|
| 211 |
+
# Help
|
| 212 |
+
python -m raganything.enhanced_markdown --help
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
## Backend Comparison
|
| 216 |
+
|
| 217 |
+
| Backend | Pros | Cons | Best For | Quality |
|
| 218 |
+
|---------|------|------|----------|---------|
|
| 219 |
+
| **WeasyPrint** | • Excellent CSS support<br>• Fast rendering<br>• Great web-style layouts<br>• Python-based | • Limited LaTeX features<br>• Requires system deps | • Web-style documents<br>• Custom styling<br>• Fast conversion | ⭐⭐⭐⭐ |
|
| 220 |
+
| **Pandoc** | • Extensive features<br>• LaTeX-quality output<br>• Academic formatting<br>• Many input/output formats | • Slower conversion<br>• System installation<br>• Complex setup | • Academic papers<br>• Complex documents<br>• Publication quality | ⭐⭐⭐⭐⭐ |
|
| 221 |
+
| **Auto** | • Automatic selection<br>• Fallback support<br>• User-friendly | • May not use optimal backend | • General use<br>• Quick setup<br>• Development | ⭐⭐⭐⭐ |
|
| 222 |
+
|
| 223 |
+
## Configuration Options
|
| 224 |
+
|
| 225 |
+
### MarkdownConfig Parameters
|
| 226 |
+
|
| 227 |
+
```python
|
| 228 |
+
@dataclass
|
| 229 |
+
class MarkdownConfig:
|
| 230 |
+
# Page layout
|
| 231 |
+
page_size: str = "A4" # A4, Letter, Legal, A3, etc.
|
| 232 |
+
margin: str = "1in" # CSS margin format
|
| 233 |
+
font_size: str = "12pt" # Base font size
|
| 234 |
+
line_height: str = "1.5" # Line spacing multiplier
|
| 235 |
+
|
| 236 |
+
# Content options
|
| 237 |
+
include_toc: bool = True # Generate table of contents
|
| 238 |
+
syntax_highlighting: bool = True # Enable code highlighting
|
| 239 |
+
image_max_width: str = "100%" # Maximum image width
|
| 240 |
+
table_style: str = "..." # Default table CSS
|
| 241 |
+
|
| 242 |
+
# Styling
|
| 243 |
+
css_file: Optional[str] = None # External CSS file path
|
| 244 |
+
custom_css: Optional[str] = None # Inline CSS content
|
| 245 |
+
template_file: Optional[str] = None # Custom HTML template
|
| 246 |
+
|
| 247 |
+
# Output options
|
| 248 |
+
output_format: str = "pdf" # Currently only PDF supported
|
| 249 |
+
output_dir: Optional[str] = None # Output directory
|
| 250 |
+
|
| 251 |
+
# Metadata
|
| 252 |
+
metadata: Optional[Dict[str, str]] = None # Document metadata
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
### Supported Markdown Features
|
| 256 |
+
|
| 257 |
+
#### Basic Formatting
|
| 258 |
+
- **Headers**: `# ## ### #### ##### ######`
|
| 259 |
+
- **Emphasis**: `*italic*`, `**bold**`, `***bold italic***`
|
| 260 |
+
- **Links**: `[text](url)`, `[text][ref]`
|
| 261 |
+
- **Images**: `![alt](url)`, `![alt][ref]`
|
| 262 |
+
- **Lists**: Ordered and unordered, nested
|
| 263 |
+
- **Blockquotes**: `> quote`
|
| 264 |
+
- **Line breaks**: Double space or `\n\n`
|
| 265 |
+
|
| 266 |
+
#### Advanced Features
|
| 267 |
+
- **Tables**: GitHub-style tables with alignment
|
| 268 |
+
- **Code blocks**: Fenced code blocks with language specification
|
| 269 |
+
- **Inline code**: `backtick code`
|
| 270 |
+
- **Horizontal rules**: `---` or `***`
|
| 271 |
+
- **Footnotes**: `[^1]` references
|
| 272 |
+
- **Definition lists**: Term and definition pairs
|
| 273 |
+
- **Attributes**: `{#id .class key=value}`
|
| 274 |
+
|
| 275 |
+
#### Code Highlighting
|
| 276 |
+
|
| 277 |
+
```markdown
|
| 278 |
+
```python
|
| 279 |
+
def example_function():
|
| 280 |
+
"""This will be syntax highlighted"""
|
| 281 |
+
return "Hello, World!"
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
+
```javascript
|
| 285 |
+
function exampleFunction() {
|
| 286 |
+
// This will also be highlighted
|
| 287 |
+
return "Hello, World!";
|
| 288 |
+
}
|
| 289 |
+
```
|
| 290 |
+
```
|
| 291 |
+
|
| 292 |
+
## Integration with RAG-Anything
|
| 293 |
+
|
| 294 |
+
The enhanced markdown conversion integrates seamlessly with RAG-Anything:
|
| 295 |
+
|
| 296 |
+
```python
|
| 297 |
+
from raganything import RAGAnything
|
| 298 |
+
|
| 299 |
+
# Initialize RAG-Anything
|
| 300 |
+
rag = RAGAnything()
|
| 301 |
+
|
| 302 |
+
# Process markdown files - enhanced conversion is used automatically
|
| 303 |
+
await rag.process_document_complete("document.md")
|
| 304 |
+
|
| 305 |
+
# Batch processing with enhanced markdown conversion
|
| 306 |
+
result = rag.process_documents_batch(
|
| 307 |
+
file_paths=["doc1.md", "doc2.md", "doc3.md"],
|
| 308 |
+
output_dir="./output"
|
| 309 |
+
)
|
| 310 |
+
|
| 311 |
+
# The .md files will be converted to PDF using enhanced conversion
|
| 312 |
+
# before being processed by the RAG system
|
| 313 |
+
```
|
| 314 |
+
|
| 315 |
+
## Performance Considerations
|
| 316 |
+
|
| 317 |
+
### Conversion Speed
|
| 318 |
+
- **WeasyPrint**: ~1-3 seconds for typical documents
|
| 319 |
+
- **Pandoc**: ~3-10 seconds for typical documents
|
| 320 |
+
- **Large documents**: Time scales roughly linearly with content
|
| 321 |
+
|
| 322 |
+
### Memory Usage
|
| 323 |
+
- **WeasyPrint**: ~50-100MB per conversion
|
| 324 |
+
- **Pandoc**: ~100-200MB per conversion
|
| 325 |
+
- **Images**: Large images increase memory usage significantly
|
| 326 |
+
|
| 327 |
+
### Optimization Tips
|
| 328 |
+
1. **Resize large images** before embedding
|
| 329 |
+
2. **Use compressed images** (JPEG for photos, PNG for graphics)
|
| 330 |
+
3. **Limit concurrent conversions** to avoid memory issues
|
| 331 |
+
4. **Cache converted content** when processing multiple times
|
| 332 |
+
|
| 333 |
+
## Examples
|
| 334 |
+
|
| 335 |
+
### Sample Markdown Document
|
| 336 |
+
|
| 337 |
+
```markdown
|
| 338 |
+
# Technical Documentation
|
| 339 |
+
|
| 340 |
+
## Table of Contents
|
| 341 |
+
[TOC]
|
| 342 |
+
|
| 343 |
+
## Overview
|
| 344 |
+
This document provides comprehensive technical specifications.
|
| 345 |
+
|
| 346 |
+
## Architecture
|
| 347 |
+
|
| 348 |
+
### System Components
|
| 349 |
+
1. **Parser Engine**: Handles document processing
|
| 350 |
+
2. **Storage Layer**: Manages data persistence
|
| 351 |
+
3. **Query Interface**: Provides search capabilities
|
| 352 |
+
|
| 353 |
+
### Code Implementation
|
| 354 |
+
```python
|
| 355 |
+
from raganything import RAGAnything
|
| 356 |
+
|
| 357 |
+
# Initialize system
|
| 358 |
+
rag = RAGAnything(config={
|
| 359 |
+
"working_dir": "./storage",
|
| 360 |
+
"enable_image_processing": True
|
| 361 |
+
})
|
| 362 |
+
|
| 363 |
+
# Process document
|
| 364 |
+
await rag.process_document_complete("document.pdf")
|
| 365 |
+
```
|
| 366 |
+
|
| 367 |
+
### Performance Metrics
|
| 368 |
+
|
| 369 |
+
| Component | Throughput | Latency | Memory |
|
| 370 |
+
|-----------|------------|---------|--------|
|
| 371 |
+
| Parser | 100 docs/hour | 36s avg | 2.5 GB |
|
| 372 |
+
| Storage | 1000 ops/sec | 1ms avg | 512 MB |
|
| 373 |
+
| Query | 50 queries/sec | 20ms avg | 1 GB |
|
| 374 |
+
|
| 375 |
+
## Integration Notes
|
| 376 |
+
|
| 377 |
+
> **Important**: Always validate input before processing.
|
| 378 |
+
|
| 379 |
+
## Conclusion
|
| 380 |
+
The enhanced system provides excellent performance for document processing workflows.
|
| 381 |
+
```
|
| 382 |
+
|
| 383 |
+
### Generated PDF Features
|
| 384 |
+
|
| 385 |
+
The enhanced markdown converter produces PDFs with:
|
| 386 |
+
|
| 387 |
+
- **Professional typography** with proper font selection and spacing
|
| 388 |
+
- **Syntax-highlighted code blocks** using Pygments
|
| 389 |
+
- **Formatted tables** with borders and alternating row colors
|
| 390 |
+
- **Clickable table of contents** with navigation links
|
| 391 |
+
- **Responsive images** that scale appropriately
|
| 392 |
+
- **Custom styling** through CSS
|
| 393 |
+
- **Proper page breaks** and margins
|
| 394 |
+
- **Document metadata** and properties
|
| 395 |
+
|
| 396 |
+
## Troubleshooting
|
| 397 |
+
|
| 398 |
+
### Common Issues
|
| 399 |
+
|
| 400 |
+
#### WeasyPrint Installation Problems
|
| 401 |
+
```bash
|
| 402 |
+
# Ubuntu/Debian: Install system dependencies
|
| 403 |
+
sudo apt-get update
|
| 404 |
+
sudo apt-get install -y build-essential python3-dev libcairo2 \
|
| 405 |
+
libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \
|
| 406 |
+
libffi-dev shared-mime-info
|
| 407 |
+
|
| 408 |
+
# Then reinstall WeasyPrint
|
| 409 |
+
pip install --force-reinstall weasyprint
|
| 410 |
+
```
|
| 411 |
+
|
| 412 |
+
#### Pandoc Not Found
|
| 413 |
+
```bash
|
| 414 |
+
# Check if Pandoc is installed
|
| 415 |
+
pandoc --version
|
| 416 |
+
|
| 417 |
+
# Install Pandoc (Ubuntu/Debian)
|
| 418 |
+
sudo apt-get install pandoc wkhtmltopdf
|
| 419 |
+
|
| 420 |
+
# Or download from: https://pandoc.org/installing.html
|
| 421 |
+
```
|
| 422 |
+
|
| 423 |
+
#### CSS Issues
|
| 424 |
+
- Check CSS syntax in custom_css
|
| 425 |
+
- Verify CSS file paths exist
|
| 426 |
+
- Test CSS with simple HTML first
|
| 427 |
+
- Use browser developer tools to debug styling
|
| 428 |
+
|
| 429 |
+
#### Image Problems
|
| 430 |
+
- Ensure images are accessible (correct paths)
|
| 431 |
+
- Check image file formats (PNG, JPEG, GIF supported)
|
| 432 |
+
- Verify image file permissions
|
| 433 |
+
- Consider image size and format optimization
|
| 434 |
+
|
| 435 |
+
#### Font Issues
|
| 436 |
+
```python
|
| 437 |
+
# Use web-safe fonts
|
| 438 |
+
config = MarkdownConfig(
|
| 439 |
+
custom_css="""
|
| 440 |
+
body {
|
| 441 |
+
font-family: 'Arial', 'Helvetica', sans-serif;
|
| 442 |
+
}
|
| 443 |
+
"""
|
| 444 |
+
)
|
| 445 |
+
```
|
| 446 |
+
|
| 447 |
+
### Debug Mode
|
| 448 |
+
|
| 449 |
+
Enable detailed logging for troubleshooting:
|
| 450 |
+
|
| 451 |
+
```python
|
| 452 |
+
import logging
|
| 453 |
+
|
| 454 |
+
# Enable debug logging
|
| 455 |
+
logging.basicConfig(
|
| 456 |
+
level=logging.DEBUG,
|
| 457 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 458 |
+
)
|
| 459 |
+
|
| 460 |
+
# Create converter with debug logging
|
| 461 |
+
converter = EnhancedMarkdownConverter()
|
| 462 |
+
result = converter.convert_file_to_pdf("test.md", "test.pdf")
|
| 463 |
+
```
|
| 464 |
+
|
| 465 |
+
### Error Handling
|
| 466 |
+
|
| 467 |
+
```python
|
| 468 |
+
def robust_conversion(input_path, output_path):
|
| 469 |
+
"""Convert with fallback backends"""
|
| 470 |
+
converter = EnhancedMarkdownConverter()
|
| 471 |
+
|
| 472 |
+
# Try backends in order of preference
|
| 473 |
+
backends = ["weasyprint", "pandoc", "auto"]
|
| 474 |
+
|
| 475 |
+
for backend in backends:
|
| 476 |
+
try:
|
| 477 |
+
success = converter.convert_file_to_pdf(
|
| 478 |
+
input_path=input_path,
|
| 479 |
+
output_path=output_path,
|
| 480 |
+
method=backend
|
| 481 |
+
)
|
| 482 |
+
if success:
|
| 483 |
+
print(f"✅ Conversion successful with {backend}")
|
| 484 |
+
return True
|
| 485 |
+
except Exception as e:
|
| 486 |
+
print(f"❌ {backend} failed: {str(e)}")
|
| 487 |
+
continue
|
| 488 |
+
|
| 489 |
+
print("❌ All backends failed")
|
| 490 |
+
return False
|
| 491 |
+
```
|
| 492 |
+
|
| 493 |
+
## API Reference
|
| 494 |
+
|
| 495 |
+
### EnhancedMarkdownConverter
|
| 496 |
+
|
| 497 |
+
```python
|
| 498 |
+
class EnhancedMarkdownConverter:
|
| 499 |
+
def __init__(self, config: Optional[MarkdownConfig] = None):
|
| 500 |
+
"""Initialize converter with optional configuration"""
|
| 501 |
+
|
| 502 |
+
def convert_file_to_pdf(self, input_path: str, output_path: str, method: str = "auto") -> bool:
|
| 503 |
+
"""Convert markdown file to PDF"""
|
| 504 |
+
|
| 505 |
+
def convert_markdown_to_pdf(self, markdown_content: str, output_path: str, method: str = "auto") -> bool:
|
| 506 |
+
"""Convert markdown content to PDF"""
|
| 507 |
+
|
| 508 |
+
def get_backend_info(self) -> Dict[str, Any]:
|
| 509 |
+
"""Get information about available backends"""
|
| 510 |
+
|
| 511 |
+
def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:
|
| 512 |
+
"""Convert using WeasyPrint backend"""
|
| 513 |
+
|
| 514 |
+
def convert_with_pandoc(self, markdown_content: str, output_path: str) -> bool:
|
| 515 |
+
"""Convert using Pandoc backend"""
|
| 516 |
+
```
|
| 517 |
+
|
| 518 |
+
## Best Practices
|
| 519 |
+
|
| 520 |
+
1. **Choose the right backend** for your use case:
|
| 521 |
+
- **WeasyPrint** for web-style documents and custom CSS
|
| 522 |
+
- **Pandoc** for academic papers and complex formatting
|
| 523 |
+
- **Auto** for general use and development
|
| 524 |
+
|
| 525 |
+
2. **Optimize images** before embedding:
|
| 526 |
+
- Use appropriate formats (JPEG for photos, PNG for graphics)
|
| 527 |
+
- Compress images to reduce file size
|
| 528 |
+
- Set reasonable maximum widths
|
| 529 |
+
|
| 530 |
+
3. **Design responsive layouts**:
|
| 531 |
+
- Use relative units (%, em) instead of absolute (px)
|
| 532 |
+
- Test with different page sizes
|
| 533 |
+
- Consider print-specific CSS
|
| 534 |
+
|
| 535 |
+
4. **Test your styling**:
|
| 536 |
+
- Start with default styling and incrementally customize
|
| 537 |
+
- Test with sample content before production use
|
| 538 |
+
- Validate CSS syntax
|
| 539 |
+
|
| 540 |
+
5. **Handle errors gracefully**:
|
| 541 |
+
- Implement fallback backends
|
| 542 |
+
- Provide meaningful error messages
|
| 543 |
+
- Log conversion attempts for debugging
|
| 544 |
+
|
| 545 |
+
6. **Performance optimization**:
|
| 546 |
+
- Cache converted content when possible
|
| 547 |
+
- Process large batches with appropriate worker counts
|
| 548 |
+
- Monitor memory usage with large documents
|
| 549 |
+
|
| 550 |
+
## Conclusion
|
| 551 |
+
|
| 552 |
+
The enhanced markdown conversion feature provides professional-quality PDF generation with flexible styling options and multiple backend support. It seamlessly integrates with RAG-Anything's document processing pipeline while offering standalone functionality for markdown-to-PDF conversion needs.
|
rag_anything_smaranika/env.example
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
### This is sample file of .env
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
### Server Configuration
|
| 5 |
+
HOST=0.0.0.0
|
| 6 |
+
PORT=9621
|
| 7 |
+
WEBUI_TITLE='My Graph KB'
|
| 8 |
+
WEBUI_DESCRIPTION="Simple and Fast Graph Based RAG System"
|
| 9 |
+
OLLAMA_EMULATING_MODEL_TAG=latest
|
| 10 |
+
# WORKERS=2
|
| 11 |
+
# CORS_ORIGINS=http://localhost:3000,http://localhost:8080
|
| 12 |
+
|
| 13 |
+
### Login Configuration
|
| 14 |
+
# AUTH_ACCOUNTS='admin:admin123,user1:pass456'
|
| 15 |
+
# TOKEN_SECRET=Your-Key-For-LightRAG-API-Server
|
| 16 |
+
# TOKEN_EXPIRE_HOURS=48
|
| 17 |
+
# GUEST_TOKEN_EXPIRE_HOURS=24
|
| 18 |
+
# JWT_ALGORITHM=HS256
|
| 19 |
+
|
| 20 |
+
### API-Key to access LightRAG Server API
|
| 21 |
+
# LIGHTRAG_API_KEY=your-secure-api-key-here
|
| 22 |
+
# WHITELIST_PATHS=/health,/api/*
|
| 23 |
+
|
| 24 |
+
### Optional SSL Configuration
|
| 25 |
+
# SSL=true
|
| 26 |
+
# SSL_CERTFILE=/path/to/cert.pem
|
| 27 |
+
# SSL_KEYFILE=/path/to/key.pem
|
| 28 |
+
|
| 29 |
+
### Directory Configuration (defaults to current working directory)
|
| 30 |
+
### Should not be set if deploy by docker (Set by Dockerfile instead of .env)
|
| 31 |
+
### Default value is ./inputs and ./rag_storage
|
| 32 |
+
# INPUT_DIR=<absolute_path_for_doc_input_dir>
|
| 33 |
+
|
| 34 |
+
### RAGAnything Configuration (Multimodal Document Processing)
|
| 35 |
+
### ---
|
| 36 |
+
### Parser Configuration
|
| 37 |
+
# PARSE_METHOD=auto
|
| 38 |
+
# OUTPUT_DIR=./output
|
| 39 |
+
# PARSER=mineru
|
| 40 |
+
# DISPLAY_CONTENT_STATS=true
|
| 41 |
+
|
| 42 |
+
### Multimodal Processing Configuration
|
| 43 |
+
# ENABLE_IMAGE_PROCESSING=true
|
| 44 |
+
# ENABLE_TABLE_PROCESSING=true
|
| 45 |
+
# ENABLE_EQUATION_PROCESSING=true
|
| 46 |
+
|
| 47 |
+
### Batch Processing Configuration
|
| 48 |
+
# MAX_CONCURRENT_FILES=1
|
| 49 |
+
# SUPPORTED_FILE_EXTENSIONS=.pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md
|
| 50 |
+
# RECURSIVE_FOLDER_PROCESSING=true
|
| 51 |
+
|
| 52 |
+
### Context Extraction Configuration
|
| 53 |
+
# CONTEXT_WINDOW=1
|
| 54 |
+
# CONTEXT_MODE=page
|
| 55 |
+
# MAX_CONTEXT_TOKENS=2000
|
| 56 |
+
# INCLUDE_HEADERS=true
|
| 57 |
+
# INCLUDE_CAPTIONS=true
|
| 58 |
+
# CONTEXT_FILTER_CONTENT_TYPES=text
|
| 59 |
+
# CONTENT_FORMAT=minerU
|
| 60 |
+
|
| 61 |
+
### Max nodes returned from graph retrieval
|
| 62 |
+
# MAX_GRAPH_NODES=1000
|
| 63 |
+
|
| 64 |
+
### Logging level
|
| 65 |
+
# LOG_LEVEL=INFO
|
| 66 |
+
# VERBOSE=False
|
| 67 |
+
# LOG_MAX_BYTES=10485760
|
| 68 |
+
# LOG_BACKUP_COUNT=5
|
| 69 |
+
### Logfile location (defaults to current working directory)
|
| 70 |
+
# LOG_DIR=/path/to/log/directory
|
| 71 |
+
|
| 72 |
+
### Settings for RAG query
|
| 73 |
+
# HISTORY_TURNS=3
|
| 74 |
+
# COSINE_THRESHOLD=0.2
|
| 75 |
+
# TOP_K=60
|
| 76 |
+
# MAX_TOKEN_TEXT_CHUNK=4000
|
| 77 |
+
# MAX_TOKEN_RELATION_DESC=4000
|
| 78 |
+
# MAX_TOKEN_ENTITY_DESC=4000
|
| 79 |
+
|
| 80 |
+
### Entity and relation summarization configuration
|
| 81 |
+
### Language: English, Chinese, French, German ...
|
| 82 |
+
SUMMARY_LANGUAGE=English
|
| 83 |
+
### Number of duplicated entities/edges to trigger LLM re-summary on merge (at least 3 is recommended)
|
| 84 |
+
# FORCE_LLM_SUMMARY_ON_MERGE=6
|
| 85 |
+
### Max tokens for entity/relations description after merge
|
| 86 |
+
# MAX_TOKEN_SUMMARY=500
|
| 87 |
+
|
| 88 |
+
### Number of parallel processing documents(Less than MAX_ASYNC/2 is recommended)
|
| 89 |
+
# MAX_PARALLEL_INSERT=2
|
| 90 |
+
### Chunk size for document splitting, 500~1500 is recommended
|
| 91 |
+
# CHUNK_SIZE=1200
|
| 92 |
+
# CHUNK_OVERLAP_SIZE=100
|
| 93 |
+
|
| 94 |
+
### LLM Configuration
|
| 95 |
+
ENABLE_LLM_CACHE=true
|
| 96 |
+
ENABLE_LLM_CACHE_FOR_EXTRACT=true
|
| 97 |
+
### Time out in seconds for LLM, None for infinite timeout
|
| 98 |
+
TIMEOUT=240
|
| 99 |
+
### Some models like o1-mini require temperature to be set to 1
|
| 100 |
+
TEMPERATURE=0
|
| 101 |
+
### Max concurrency requests of LLM
|
| 102 |
+
MAX_ASYNC=4
|
| 103 |
+
### MAX_TOKENS: max tokens send to LLM for entity relation summaries (less than context size of the model)
|
| 104 |
+
### MAX_TOKENS: set as num_ctx option for Ollama by API Server
|
| 105 |
+
MAX_TOKENS=32768
|
| 106 |
+
### LLM Binding type: openai, ollama, lollms, azure_openai, lmstudio
|
| 107 |
+
LLM_BINDING=openai
|
| 108 |
+
LLM_MODEL=gpt-4o
|
| 109 |
+
LLM_BINDING_HOST=https://api.openai.com/v1
|
| 110 |
+
LLM_BINDING_API_KEY=your_api_key
|
| 111 |
+
### Optional for Azure
|
| 112 |
+
# AZURE_OPENAI_API_VERSION=2024-08-01-preview
|
| 113 |
+
# AZURE_OPENAI_DEPLOYMENT=gpt-4o
|
| 114 |
+
|
| 115 |
+
### Embedding Configuration
|
| 116 |
+
### Embedding Binding type: openai, ollama, lollms, azure_openai, lmstudio
|
| 117 |
+
EMBEDDING_BINDING=ollama
|
| 118 |
+
EMBEDDING_MODEL=bge-m3:latest
|
| 119 |
+
EMBEDDING_DIM=1024
|
| 120 |
+
EMBEDDING_BINDING_API_KEY=your_api_key
|
| 121 |
+
# If the embedding service is deployed within the same Docker stack, use host.docker.internal instead of localhost
|
| 122 |
+
EMBEDDING_BINDING_HOST=http://localhost:11434
|
| 123 |
+
### Num of chunks send to Embedding in single request
|
| 124 |
+
# EMBEDDING_BATCH_NUM=32
|
| 125 |
+
### Max concurrency requests for Embedding
|
| 126 |
+
# EMBEDDING_FUNC_MAX_ASYNC=16
|
| 127 |
+
### Maximum tokens sent to Embedding for each chunk (no longer in use?)
|
| 128 |
+
# MAX_EMBED_TOKENS=8192
|
| 129 |
+
### Optional for Azure
|
| 130 |
+
# AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-large
|
| 131 |
+
# AZURE_EMBEDDING_API_VERSION=2023-05-15
|
| 132 |
+
|
| 133 |
+
### Data storage selection
|
| 134 |
+
# LIGHTRAG_KV_STORAGE=PGKVStorage
|
| 135 |
+
# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
|
| 136 |
+
# LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
|
| 137 |
+
# LIGHTRAG_GRAPH_STORAGE=Neo4JStorage
|
| 138 |
+
|
| 139 |
+
### TiDB Configuration (Deprecated)
|
| 140 |
+
# TIDB_HOST=localhost
|
| 141 |
+
# TIDB_PORT=4000
|
| 142 |
+
# TIDB_USER=your_username
|
| 143 |
+
# TIDB_PASSWORD='your_password'
|
| 144 |
+
# TIDB_DATABASE=your_database
|
| 145 |
+
### Separates all data from different LightRAG instances (deprecated)
|
| 146 |
+
# TIDB_WORKSPACE=default
|
| 147 |
+
|
| 148 |
+
### PostgreSQL Configuration
|
| 149 |
+
POSTGRES_HOST=localhost
|
| 150 |
+
POSTGRES_PORT=5432
|
| 151 |
+
POSTGRES_USER=your_username
|
| 152 |
+
POSTGRES_PASSWORD='your_password'
|
| 153 |
+
POSTGRES_DATABASE=your_database
|
| 154 |
+
POSTGRES_MAX_CONNECTIONS=12
|
| 155 |
+
### Separates all data from different LightRAG instances (deprecated)
|
| 156 |
+
# POSTGRES_WORKSPACE=default
|
| 157 |
+
|
| 158 |
+
### Neo4j Configuration
|
| 159 |
+
NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io
|
| 160 |
+
NEO4J_USERNAME=neo4j
|
| 161 |
+
NEO4J_PASSWORD='your_password'
|
| 162 |
+
|
| 163 |
+
### Independent AGE Configuration (not for AGE embedded in PostgreSQL)
|
| 164 |
+
# AGE_POSTGRES_DB=
|
| 165 |
+
# AGE_POSTGRES_USER=
|
| 166 |
+
# AGE_POSTGRES_PASSWORD=
|
| 167 |
+
# AGE_POSTGRES_HOST=
|
| 168 |
+
# AGE_POSTGRES_PORT=8529
|
| 169 |
+
|
| 170 |
+
# AGE Graph Name(apply to PostgreSQL and independent AGM)
|
| 171 |
+
### AGE_GRAPH_NAME is deprecated
|
| 172 |
+
# AGE_GRAPH_NAME=lightrag
|
| 173 |
+
|
| 174 |
+
### MongoDB Configuration
|
| 175 |
+
MONGO_URI=mongodb://root:root@localhost:27017/
|
| 176 |
+
MONGO_DATABASE=LightRAG
|
| 177 |
+
### Separates all data from different LightRAG instances (deprecated)
|
| 178 |
+
# MONGODB_GRAPH=false
|
| 179 |
+
|
| 180 |
+
### Milvus Configuration
|
| 181 |
+
MILVUS_URI=http://localhost:19530
|
| 182 |
+
MILVUS_DB_NAME=lightrag
|
| 183 |
+
# MILVUS_USER=root
|
| 184 |
+
# MILVUS_PASSWORD=your_password
|
| 185 |
+
# MILVUS_TOKEN=your_token
|
| 186 |
+
|
| 187 |
+
### Qdrant
|
| 188 |
+
QDRANT_URL=http://localhost:16333
|
| 189 |
+
# QDRANT_API_KEY=your-api-key
|
| 190 |
+
|
| 191 |
+
### Redis
|
| 192 |
+
REDIS_URI=redis://localhost:6379
|
rag_anything_smaranika/examples/batch_processing_example.py
ADDED
|
@@ -0,0 +1,561 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
"""
|
| 3 |
+
Batch Processing Example for RAG-Anything
|
| 4 |
+
|
| 5 |
+
This example demonstrates how to use the batch processing capabilities
|
| 6 |
+
to process multiple documents in parallel for improved throughput.
|
| 7 |
+
|
| 8 |
+
Features demonstrated:
|
| 9 |
+
- Basic batch processing with BatchParser
|
| 10 |
+
- Asynchronous batch processing
|
| 11 |
+
- Integration with RAG-Anything
|
| 12 |
+
- Error handling and progress tracking
|
| 13 |
+
- File filtering and directory processing
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import asyncio
|
| 17 |
+
import logging
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
import tempfile
|
| 20 |
+
import time
|
| 21 |
+
|
| 22 |
+
# Add project root directory to Python path
|
| 23 |
+
import sys
|
| 24 |
+
|
| 25 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 26 |
+
|
| 27 |
+
from raganything import RAGAnything, RAGAnythingConfig
|
| 28 |
+
from raganything.batch_parser import BatchParser
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def create_sample_documents():
    """Create a throwaway directory holding sample documents for the demos.

    Returns:
        tuple: ``(file_paths, temp_dir)`` where ``file_paths`` is a list of
        string paths to the files just written and ``temp_dir`` is the
        ``Path`` of the temporary directory containing them. The caller is
        responsible for cleaning the directory up.
    """
    staging_dir = Path(tempfile.mkdtemp())

    # Filename -> content: a small mix of plain-text and markdown documents.
    documents = {
        "document1.txt": "This is a simple text document for testing batch processing.",
        "document2.txt": "Another text document with different content.",
        "document3.md": """# Markdown Document

## Introduction
This is a markdown document for testing.

### Features
- Markdown formatting
- Code blocks
- Lists

```python
def example():
    return "Hello from markdown"
```
""",
        "report.txt": """Business Report

Executive Summary:
This report demonstrates batch processing capabilities.

Key Findings:
1. Parallel processing improves throughput
2. Progress tracking enhances user experience
3. Error handling ensures reliability

Conclusion:
Batch processing is essential for large-scale document processing.
""",
        "notes.md": """# Meeting Notes

## Date: 2024-01-15

### Attendees
- Alice Johnson
- Bob Smith
- Carol Williams

### Discussion Topics
1. **Batch Processing Implementation**
   - Parallel document processing
   - Progress tracking
   - Error handling strategies

2. **Performance Metrics**
   - Target: 100 documents/hour
   - Memory usage: < 4GB
   - Success rate: > 95%

### Action Items
- [ ] Implement batch processing
- [ ] Add progress bars
- [ ] Test with large document sets
- [ ] Optimize memory usage

### Next Steps
Continue development and testing of batch processing features.
""",
    }

    # Materialize every document on disk and remember its path.
    created_paths = []
    for filename, body in documents.items():
        destination = staging_dir / filename
        destination.write_text(body, encoding="utf-8")
        created_paths.append(str(destination))

    return created_paths, staging_dir
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def demonstrate_basic_batch_processing():
    """Walk through the simplest synchronous BatchParser workflow.

    Creates sample documents, configures a parser, filters inputs down to
    supported types, runs one synchronous batch, and prints a per-file
    success/failure report.

    Returns:
        The batch result object on success, or ``None`` if the demo failed.
    """
    print("\n" + "=" * 60)
    print("BASIC BATCH PROCESSING DEMONSTRATION")
    print("=" * 60)

    sample_files, temp_dir = create_sample_documents()

    try:
        print(f"Created {len(sample_files)} sample documents in: {temp_dir}")
        for sample_path in sample_files:
            print(f" - {Path(sample_path).name}")

        # Installation checks are skipped so the demo runs even where the
        # MinerU backend is not fully installed.
        parser = BatchParser(
            parser_type="mineru",
            max_workers=3,
            show_progress=True,
            timeout_per_file=60,
            skip_installation_check=True,  # Skip installation check for demo
        )

        print("\nBatch parser configured:")
        print(" - Parser type: mineru")
        print(" - Max workers: 3")
        print(" - Progress tracking: enabled")
        print(" - Timeout per file: 60 seconds")

        extensions = parser.get_supported_extensions()
        print(f" - Supported extensions: {extensions}")

        # Drop any file whose extension the parser cannot handle.
        usable_files = parser.filter_supported_files(sample_files)
        print("\nFile filtering results:")
        print(f" - Total files: {len(sample_files)}")
        print(f" - Supported files: {len(usable_files)}")

        output_dir = temp_dir / "batch_output"
        print("\nStarting batch processing...")
        print(f"Output directory: {output_dir}")

        started_at = time.time()
        batch_result = parser.process_batch(
            file_paths=usable_files,
            output_dir=str(output_dir),
            parse_method="auto",
            recursive=False,
        )
        elapsed = time.time() - started_at

        print("\n" + "-" * 40)
        print("BATCH PROCESSING RESULTS")
        print("-" * 40)
        print(batch_result.summary())
        print(f"Total processing time: {elapsed:.2f} seconds")
        print(f"Success rate: {batch_result.success_rate:.1f}%")

        if batch_result.successful_files:
            print("\nSuccessfully processed files:")
            for ok_path in batch_result.successful_files:
                print(f" ✅ {Path(ok_path).name}")

        if batch_result.failed_files:
            print("\nFailed files:")
            for bad_path in batch_result.failed_files:
                reason = batch_result.errors.get(bad_path, "Unknown error")
                print(f" ❌ {Path(bad_path).name}: {reason}")

        return batch_result

    except Exception as e:
        print(f"❌ Batch processing demonstration failed: {str(e)}")
        return None
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
async def demonstrate_async_batch_processing():
    """Demonstrate the awaitable batch API (``process_batch_async``).

    Same flow as the basic demo, but the batch call is awaited so other
    coroutines could run while parsing is in flight.

    Returns:
        The batch result object on success, or ``None`` if the demo failed.
    """
    print("\n" + "=" * 60)
    print("ASYNCHRONOUS BATCH PROCESSING DEMONSTRATION")
    print("=" * 60)

    sample_files, temp_dir = create_sample_documents()

    try:
        print(f"Processing {len(sample_files)} documents asynchronously...")

        parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            skip_installation_check=True,
        )

        async_output = temp_dir / "async_output"

        started_at = time.time()
        batch_result = await parser.process_batch_async(
            file_paths=sample_files,
            output_dir=str(async_output),
            parse_method="auto",
            recursive=False,
        )
        elapsed = time.time() - started_at

        print("\n" + "-" * 40)
        print("ASYNC BATCH PROCESSING RESULTS")
        print("-" * 40)
        print(batch_result.summary())
        print(f"Async processing time: {elapsed:.2f} seconds")
        print(f"Success rate: {batch_result.success_rate:.1f}%")

        return batch_result

    except Exception as e:
        print(f"❌ Async batch processing demonstration failed: {str(e)}")
        return None
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
async def demonstrate_rag_integration():
    """Demonstrate batch processing integration with RAG-Anything.

    Initializes a RAGAnything instance backed by temporary storage, lists its
    batch-capable methods, runs a plain batch parse via
    ``process_documents_batch``, then a full parse+index pass via
    ``process_documents_with_rag_batch`` on a two-file subset.

    Returns:
        The dict returned by ``process_documents_with_rag_batch`` on success,
        or None when either stage fails (e.g. no LLM/API configured).
    """
    print("\n" + "=" * 60)
    print("RAG-ANYTHING BATCH INTEGRATION DEMONSTRATION")
    print("=" * 60)

    # Create sample documents
    sample_files, temp_dir = create_sample_documents()

    try:
        # Initialize RAG-Anything with temporary storage so the demo leaves
        # no state behind in the user's working directory.
        config = RAGAnythingConfig(
            working_dir=str(temp_dir / "rag_storage"),
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
            max_concurrent_files=2,
        )

        rag = RAGAnything(config=config)

        print("RAG-Anything initialized with batch processing capabilities")

        # Show available batch methods (discovered by name, via dir()).
        batch_methods = [method for method in dir(rag) if "batch" in method.lower()]
        print(f"Available batch methods: {batch_methods}")

        # Demonstrate batch processing with RAG integration
        print(f"\nProcessing {len(sample_files)} documents with RAG integration...")

        # Inner try: RAG calls are expected to fail in environments without
        # API credentials, and that is reported as a soft limitation rather
        # than a demo failure.
        try:
            # Process documents in batch (parse only; no indexing here).
            result = rag.process_documents_batch(
                file_paths=sample_files,
                output_dir=str(temp_dir / "rag_batch_output"),
                max_workers=2,
                show_progress=True,
            )

            print("\n" + "-" * 40)
            print("RAG BATCH PROCESSING RESULTS")
            print("-" * 40)
            print(result.summary())
            print(f"Success rate: {result.success_rate:.1f}%")

            # Demonstrate batch processing with full RAG integration
            # (parsing plus knowledge-base insertion).
            print("\nProcessing documents with full RAG integration...")

            rag_result = await rag.process_documents_with_rag_batch(
                file_paths=sample_files[:2],  # Process subset for demo
                output_dir=str(temp_dir / "rag_full_output"),
                max_workers=1,
                show_progress=True,
            )

            # NOTE(review): rag_result is assumed to be a dict with keys
            # 'parse_result', 'total_processing_time', 'successful_rag_files'
            # and 'failed_rag_files' — confirm against the RAGAnything API.
            print("\n" + "-" * 40)
            print("FULL RAG INTEGRATION RESULTS")
            print("-" * 40)
            print(f"Parse result: {rag_result['parse_result'].summary()}")
            print(
                f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds"
            )
            print(
                f"Successfully processed with RAG: {rag_result['successful_rag_files']}"
            )
            print(f"Failed RAG processing: {rag_result['failed_rag_files']}")

            return rag_result

        except Exception as e:
            print(f"⚠️ RAG integration demo completed with limitations: {str(e)}")
            print(
                "Note: This is expected in environments without full API configuration"
            )
            return None

    except Exception as e:
        print(f"❌ RAG integration demonstration failed: {str(e)}")
        return None
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def demonstrate_directory_processing():
    """Demonstrate recursive discovery and processing of a directory tree.

    Builds a two-level temp directory (two files at the root, two in a
    subdirectory), hands the *directory* path to ``process_batch`` with
    ``recursive=True``, and prints which files were discovered and parsed.

    Returns:
        The batch result object on success, or ``None`` if the demo failed.
    """
    print("\n" + "=" * 60)
    print("DIRECTORY PROCESSING DEMONSTRATION")
    print("=" * 60)

    root_dir = Path(tempfile.mkdtemp())

    main_files = {
        "overview.txt": "Main directory overview document",
        "readme.md": "# Project README\n\nThis is the main project documentation.",
    }

    nested_dir = root_dir / "subdirectory"
    nested_dir.mkdir()

    sub_files = {
        "details.txt": "Detailed information in subdirectory",
        "notes.md": "# Notes\n\nAdditional notes and information.",
    }

    # Materialize both levels of the tree on disk in a single pass.
    written_paths = []
    for target_dir, mapping in ((root_dir, main_files), (nested_dir, sub_files)):
        for filename, content in mapping.items():
            destination = target_dir / filename
            destination.write_text(content, encoding="utf-8")
            written_paths.append(str(destination))

    try:
        print("Created directory structure:")
        print(f" Main directory: {root_dir}")
        print(f" Files in main: {list(main_files.keys())}")
        print(f" Subdirectory: {nested_dir}")
        print(f" Files in sub: {list(sub_files.keys())}")

        parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            skip_installation_check=True,
        )

        print("\nProcessing entire directory recursively...")

        batch_result = parser.process_batch(
            file_paths=[str(root_dir)],  # Pass directory path
            output_dir=str(root_dir / "directory_output"),
            parse_method="auto",
            recursive=True,  # Include subdirectories
        )

        print("\n" + "-" * 40)
        print("DIRECTORY PROCESSING RESULTS")
        print("-" * 40)
        print(batch_result.summary())
        print(f"Total files found and processed: {batch_result.total_files}")
        print(f"Success rate: {batch_result.success_rate:.1f}%")

        if batch_result.successful_files:
            print("\nSuccessfully processed:")
            for ok_path in batch_result.successful_files:
                print(f" ✅ {Path(ok_path).relative_to(root_dir)}")

        return batch_result

    except Exception as e:
        print(f"❌ Directory processing demonstration failed: {str(e)}")
        return None
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
def demonstrate_error_handling():
    """Demonstrate error handling and recovery.

    Deliberately feeds the parser a mix of inputs — a normal file, an empty
    file, a 1 MB file, and a path that does not exist — then reports
    per-file outcomes and retries only the files that failed.

    Returns:
        The batch result from the first pass, or None if the demo failed.
    """
    print("\n" + "=" * 60)
    print("ERROR HANDLING DEMONSTRATION")
    print("=" * 60)

    temp_dir = Path(tempfile.mkdtemp())

    # Create files with various issues
    files_with_issues = {
        "valid_file.txt": "This is a valid file that should process successfully.",
        "empty_file.txt": "",  # Empty file
        "large_file.txt": "x" * 1000000,  # Large file (1MB of 'x')
    }

    created_files = []
    for filename, content in files_with_issues.items():
        file_path = temp_dir / filename
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        created_files.append(str(file_path))

    # Add a non-existent file to the list (never written, so the parser
    # must surface a missing-file error for it).
    created_files.append(str(temp_dir / "non_existent_file.txt"))

    try:
        print(f"Testing error handling with {len(created_files)} files:")
        for file_path in created_files:
            name = Path(file_path).name
            exists = Path(file_path).exists()
            # stat() would raise on the missing file, so guard on `exists`.
            size = Path(file_path).stat().st_size if exists else 0
            print(f" - {name}: {'exists' if exists else 'missing'}, {size} bytes")

        # Create batch parser with short timeout for demonstration
        batch_parser = BatchParser(
            parser_type="mineru",
            max_workers=2,
            show_progress=True,
            timeout_per_file=30,  # Short timeout for demo
            skip_installation_check=True,
        )

        # Process files and handle errors
        result = batch_parser.process_batch(
            file_paths=created_files,
            output_dir=str(temp_dir / "error_test_output"),
            parse_method="auto",
        )

        print("\n" + "-" * 40)
        print("ERROR HANDLING RESULTS")
        print("-" * 40)
        print(result.summary())

        if result.successful_files:
            print("\nSuccessful files:")
            for file_path in result.successful_files:
                print(f" ✅ {Path(file_path).name}")

        if result.failed_files:
            print("\nFailed files with error details:")
            for file_path in result.failed_files:
                error = result.errors.get(file_path, "Unknown error")
                print(f" ❌ {Path(file_path).name}: {error}")

        # Demonstrate retry logic: re-run the batch over only the files
        # that failed the first pass.
        if result.failed_files:
            print(
                f"\nDemonstrating retry logic for {len(result.failed_files)} failed files..."
            )

            # Retry only the failed files
            retry_result = batch_parser.process_batch(
                file_paths=result.failed_files,
                output_dir=str(temp_dir / "retry_output"),
                parse_method="auto",
            )

            print(f"Retry results: {retry_result.summary()}")

        return result

    except Exception as e:
        print(f"❌ Error handling demonstration failed: {str(e)}")
        return None
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
async def main():
    """Run every batch-processing demonstration in sequence and summarize.

    Configures logging, executes the five demos (basic, async, RAG
    integration, directory, error handling), collects their return values,
    and prints a pass/fail summary plus a recap of features and best
    practices. Demos that return None are reported as failed/limited.
    """
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    print("RAG-Anything Batch Processing Demonstration")
    print("=" * 70)
    print("This example demonstrates various batch processing capabilities:")
    print(" - Basic batch processing with progress tracking")
    print(" - Asynchronous processing for improved performance")
    print(" - Integration with RAG-Anything pipeline")
    print(" - Directory processing with recursive file discovery")
    print(" - Comprehensive error handling and recovery")

    # demo name -> return value (a batch result, a dict, or None on failure)
    results = {}

    # Run demonstrations
    print("\n🚀 Starting demonstrations...")

    # Basic batch processing (synchronous)
    results["basic"] = demonstrate_basic_batch_processing()

    # Asynchronous processing
    results["async"] = await demonstrate_async_batch_processing()

    # RAG integration (may report limitations without API credentials)
    results["rag"] = await demonstrate_rag_integration()

    # Directory processing
    results["directory"] = demonstrate_directory_processing()

    # Error handling
    results["error_handling"] = demonstrate_error_handling()

    # Summary
    print("\n" + "=" * 70)
    print("DEMONSTRATION SUMMARY")
    print("=" * 70)

    for demo_name, result in results.items():
        if result:
            # Batch results expose success_rate; other demos (e.g. the RAG
            # dict) just report completion.
            if hasattr(result, "success_rate"):
                print(
                    f"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate"
                )
            else:
                print(f"✅ {demo_name.upper()}: Completed successfully")
        else:
            print(f"❌ {demo_name.upper()}: Failed or had limitations")

    print("\n📊 Key Features Demonstrated:")
    print(" - Parallel document processing with configurable worker counts")
    print(" - Real-time progress tracking with tqdm progress bars")
    print(" - Comprehensive error handling and reporting")
    print(" - File filtering based on supported document types")
    print(" - Directory processing with recursive file discovery")
    print(" - Asynchronous processing for improved performance")
    print(" - Integration with RAG-Anything document pipeline")
    print(" - Retry logic for failed documents")
    print(" - Detailed processing statistics and timing")

    print("\n💡 Best Practices Highlighted:")
    print(" - Use appropriate worker counts for your system")
    print(" - Enable progress tracking for long-running operations")
    print(" - Handle errors gracefully with retry mechanisms")
    print(" - Filter files to supported types before processing")
    print(" - Set reasonable timeouts for document processing")
    print(" - Use skip_installation_check for environments with conflicts")
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
if __name__ == "__main__":
    # Entry point: run all demonstrations inside one asyncio event loop.
    asyncio.run(main())
|
rag_anything_smaranika/examples/batch_processing_optimized_example.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example: Optimized Batch Processing for RAGAnything
|
| 3 |
+
|
| 4 |
+
This example demonstrates the new optimized batch processing capabilities
|
| 5 |
+
that provide 2-3x faster processing for large document collections.
|
| 6 |
+
|
| 7 |
+
Features demonstrated:
|
| 8 |
+
- Concurrent document parsing with prefetching
|
| 9 |
+
- Pipeline architecture (parse + process simultaneously)
|
| 10 |
+
- Progress tracking with ETA estimation
|
| 11 |
+
- Adaptive rate limiting
|
| 12 |
+
- Performance statistics
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import asyncio
|
| 16 |
+
import time
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from raganything import RAGAnything
|
| 19 |
+
|
| 20 |
+
async def progress_callback(progress_data):
    """Render a single in-place progress line on stdout.

    Args:
        progress_data: Dict containing:
            - processed: Number of processed documents
            - total: Total number of documents
            - failed: Number of failed documents
            - percentage: Completion percentage
            - eta_seconds: Estimated time remaining
            - rate_docs_per_sec: Processing rate
    """
    status_line = (
        f"\rProgress: {progress_data['processed']}/{progress_data['total']} "
        f"({progress_data['percentage']:.1f}%) | "
        f"Rate: {progress_data['rate_docs_per_sec']:.2f} docs/s | "
        f"ETA: {progress_data['eta_seconds']:.1f}s"
    )
    # Leading \r plus end='' keeps the cursor on the same terminal line so
    # each update overwrites the previous one.
    print(status_line, end='', flush=True)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
async def main():
|
| 40 |
+
# Initialize RAGAnything
|
| 41 |
+
rag = RAGAnything(
|
| 42 |
+
working_dir="./rag_storage",
|
| 43 |
+
rag_dir="./rag_index",
|
| 44 |
+
parser="mineru", # or "docling"
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Example 1: Process a list of documents with optimization
|
| 48 |
+
print("=" * 60)
|
| 49 |
+
print("Example 1: Optimized Batch Processing")
|
| 50 |
+
print("=" * 60)
|
| 51 |
+
|
| 52 |
+
documents = [
|
| 53 |
+
"./data/report1.pdf",
|
| 54 |
+
"./data/report2.pdf",
|
| 55 |
+
"./data/research_paper.pdf",
|
| 56 |
+
"./data/technical_spec.docx",
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
start_time = time.time()
|
| 60 |
+
|
| 61 |
+
result = await rag.process_documents_batch_optimized(
|
| 62 |
+
file_paths=documents,
|
| 63 |
+
max_concurrent_parsers=4, # Parse up to 4 documents at once
|
| 64 |
+
max_concurrent_processors=10, # Process up to 10 chunks concurrently
|
| 65 |
+
enable_progress_tracking=True,
|
| 66 |
+
progress_callback=progress_callback,
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
print() # New line after progress bar
|
| 70 |
+
|
| 71 |
+
elapsed_time = time.time() - start_time
|
| 72 |
+
|
| 73 |
+
# Display results
|
| 74 |
+
print(f"\n📊 Processing Results:")
|
| 75 |
+
print(f" ✅ Successful: {len(result['successful_files'])} documents")
|
| 76 |
+
print(f" ❌ Failed: {len(result['failed_files'])} documents")
|
| 77 |
+
print(f" ⏱️ Total time: {elapsed_time:.2f}s")
|
| 78 |
+
|
| 79 |
+
# Display detailed statistics
|
| 80 |
+
stats = result['statistics']
|
| 81 |
+
print(f"\n📈 Performance Statistics:")
|
| 82 |
+
print(f" Processing rate: {stats['processing_rate_docs_per_sec']:.2f} docs/sec")
|
| 83 |
+
print(f" Parsing time: {stats['parsing_time']:.2f}s")
|
| 84 |
+
print(f" Text processing: {stats['text_processing_time']:.2f}s")
|
| 85 |
+
print(f" Multimodal processing: {stats['multimodal_processing_time']:.2f}s")
|
| 86 |
+
print(f" Cache hit rate: {stats['cache_hit_rate']:.1f}%")
|
| 87 |
+
|
| 88 |
+
# Show per-document results
|
| 89 |
+
if result['successful_files']:
|
| 90 |
+
print(f"\n✅ Successfully processed files:")
|
| 91 |
+
for file_info in result['successful_files'][:5]: # Show first 5
|
| 92 |
+
print(f" - {Path(file_info['file_path']).name} "
|
| 93 |
+
f"(processing: {file_info['processing_time']:.2f}s, "
|
| 94 |
+
f"parsing: {file_info['parse_time']:.2f}s)")
|
| 95 |
+
|
| 96 |
+
if result['failed_files']:
|
| 97 |
+
print(f"\n❌ Failed files:")
|
| 98 |
+
for file_info in result['failed_files']:
|
| 99 |
+
print(f" - {Path(file_info['file_path']).name}: {file_info['error']}")
|
| 100 |
+
|
| 101 |
+
# Example 2: Process an entire folder with optimization
|
| 102 |
+
print("\n" + "=" * 60)
|
| 103 |
+
print("Example 2: Optimized Folder Processing")
|
| 104 |
+
print("=" * 60)
|
| 105 |
+
|
| 106 |
+
folder_result = await rag.process_folder_optimized(
|
| 107 |
+
folder_path="./data/documents",
|
| 108 |
+
file_extensions=['.pdf', '.docx', '.pptx'],
|
| 109 |
+
recursive=True,
|
| 110 |
+
max_concurrent_parsers=6,
|
| 111 |
+
max_concurrent_processors=12,
|
| 112 |
+
progress_callback=progress_callback,
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
print() # New line after progress bar
|
| 116 |
+
|
| 117 |
+
print(f"\n📁 Folder Processing Complete:")
|
| 118 |
+
print(f" Successful: {len(folder_result['successful_files'])} files")
|
| 119 |
+
print(f" Failed: {len(folder_result['failed_files'])} files")
|
| 120 |
+
print(f" Rate: {folder_result['statistics']['processing_rate_docs_per_sec']:.2f} docs/sec")
|
| 121 |
+
|
| 122 |
+
# Example 3: Compare standard vs optimized processing
|
| 123 |
+
print("\n" + "=" * 60)
|
| 124 |
+
print("Example 3: Performance Comparison")
|
| 125 |
+
print("=" * 60)
|
| 126 |
+
|
| 127 |
+
test_docs = ["./data/test1.pdf", "./data/test2.pdf", "./data/test3.pdf"]
|
| 128 |
+
|
| 129 |
+
# Standard processing
|
| 130 |
+
print("\n🐢 Standard batch processing...")
|
| 131 |
+
standard_start = time.time()
|
| 132 |
+
await rag.process_folder_complete(
|
| 133 |
+
folder_path="./data/test",
|
| 134 |
+
max_workers=4,
|
| 135 |
+
display_stats=False
|
| 136 |
+
)
|
| 137 |
+
standard_time = time.time() - standard_start
|
| 138 |
+
|
| 139 |
+
# Optimized processing (on different set to avoid cache)
|
| 140 |
+
print("🚀 Optimized batch processing...")
|
| 141 |
+
optimized_start = time.time()
|
| 142 |
+
await rag.process_documents_batch_optimized(
|
| 143 |
+
file_paths=test_docs,
|
| 144 |
+
max_concurrent_parsers=4,
|
| 145 |
+
max_concurrent_processors=10,
|
| 146 |
+
enable_progress_tracking=False,
|
| 147 |
+
)
|
| 148 |
+
optimized_time = time.time() - optimized_start
|
| 149 |
+
|
| 150 |
+
print(f"\n⚡ Performance Improvement:")
|
| 151 |
+
print(f" Standard: {standard_time:.2f}s")
|
| 152 |
+
print(f" Optimized: {optimized_time:.2f}s")
|
| 153 |
+
if standard_time > 0:
|
| 154 |
+
speedup = (standard_time / optimized_time)
|
| 155 |
+
print(f" Speedup: {speedup:.2f}x faster")
|
| 156 |
+
|
| 157 |
+
# Example 4: Custom progress tracking
|
| 158 |
+
print("\n" + "=" * 60)
|
| 159 |
+
print("Example 4: Custom Progress Tracking")
|
| 160 |
+
print("=" * 60)
|
| 161 |
+
|
| 162 |
+
class CustomProgressTracker:
|
| 163 |
+
def __init__(self):
|
| 164 |
+
self.start_time = time.time()
|
| 165 |
+
self.logs = []
|
| 166 |
+
|
| 167 |
+
def __call__(self, progress):
|
| 168 |
+
"""Progress callback"""
|
| 169 |
+
elapsed = time.time() - self.start_time
|
| 170 |
+
log_entry = {
|
| 171 |
+
"timestamp": elapsed,
|
| 172 |
+
"processed": progress['processed'],
|
| 173 |
+
"total": progress['total'],
|
| 174 |
+
"percentage": progress['percentage'],
|
| 175 |
+
"rate": progress['rate_docs_per_sec'],
|
| 176 |
+
}
|
| 177 |
+
self.logs.append(log_entry)
|
| 178 |
+
|
| 179 |
+
# Print formatted progress
|
| 180 |
+
bar_length = 40
|
| 181 |
+
filled_length = int(bar_length * progress['percentage'] / 100)
|
| 182 |
+
bar = '█' * filled_length + '-' * (bar_length - filled_length)
|
| 183 |
+
|
| 184 |
+
print(f"\r|{bar}| {progress['percentage']:.1f}% "
|
| 185 |
+
f"[{progress['processed']}/{progress['total']}] "
|
| 186 |
+
f"ETA: {progress['eta_seconds']:.0f}s", end='', flush=True)
|
| 187 |
+
|
| 188 |
+
def save_log(self, filename="processing_log.txt"):
|
| 189 |
+
"""Save progress log to file"""
|
| 190 |
+
with open(filename, 'w') as f:
|
| 191 |
+
f.write("Batch Processing Log\n")
|
| 192 |
+
f.write("=" * 50 + "\n")
|
| 193 |
+
for entry in self.logs:
|
| 194 |
+
f.write(f"Time: {entry['timestamp']:.2f}s | "
|
| 195 |
+
f"Progress: {entry['processed']}/{entry['total']} "
|
| 196 |
+
f"({entry['percentage']:.1f}%) | "
|
| 197 |
+
f"Rate: {entry['rate']:.2f} docs/s\n")
|
| 198 |
+
|
| 199 |
+
tracker = CustomProgressTracker()
|
| 200 |
+
|
| 201 |
+
await rag.process_documents_batch_optimized(
|
| 202 |
+
file_paths=documents,
|
| 203 |
+
progress_callback=tracker,
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
print() # New line
|
| 207 |
+
tracker.save_log("./batch_processing_log.txt")
|
| 208 |
+
print("📝 Progress log saved to batch_processing_log.txt")
|
| 209 |
+
|
| 210 |
+
print("\n" + "=" * 60)
|
| 211 |
+
print("All examples completed!")
|
| 212 |
+
print("=" * 60)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
if __name__ == "__main__":
|
| 216 |
+
asyncio.run(main())
|
rag_anything_smaranika/examples/enhanced_markdown_example.py
ADDED
|
@@ -0,0 +1,1055 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
"""
|
| 3 |
+
Enhanced Markdown Conversion Example for RAG-Anything
|
| 4 |
+
|
| 5 |
+
This example demonstrates the enhanced markdown to PDF conversion capabilities
|
| 6 |
+
with multiple backends, advanced styling, and professional formatting.
|
| 7 |
+
|
| 8 |
+
Features demonstrated:
|
| 9 |
+
- Basic markdown to PDF conversion
|
| 10 |
+
- Multiple conversion backends (WeasyPrint, Pandoc)
|
| 11 |
+
- Custom CSS styling and configuration
|
| 12 |
+
- Backend detection and selection
|
| 13 |
+
- Error handling and fallback mechanisms
|
| 14 |
+
- Command-line interface usage
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import logging
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
import tempfile
|
| 20 |
+
|
| 21 |
+
# Add project root directory to Python path
|
| 22 |
+
import sys
|
| 23 |
+
|
| 24 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 25 |
+
|
| 26 |
+
from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def create_sample_markdown_content():
    """Create comprehensive sample markdown content for testing.

    Builds three self-contained markdown documents of increasing complexity
    that the demonstration functions in this example convert to PDF.

    Returns:
        dict: markdown source strings keyed by sample name:
            - "basic": headings, lists, blockquote and a single code fence
            - "technical": TOC, mermaid diagram, python/yaml/css fences, tables
            - "academic": paper-style document with tables and references

    NOTE(review): the literal text below is the fixture data itself — do not
    reflow or re-indent it, as that changes the rendered PDF output.
    """

    # Basic sample: minimal document exercising core markdown syntax.
    basic_content = """# Basic Markdown Sample

## Introduction
This is a simple markdown document demonstrating basic formatting.

### Text Formatting
- **Bold text** and *italic text*
- `Inline code` examples
- [Links to external sites](https://github.com)

### Lists
1. First ordered item
2. Second ordered item
3. Third ordered item

- Unordered item
- Another unordered item
  - Nested item
  - Another nested item

### Blockquotes
> This is a blockquote with important information.
> It can span multiple lines.

### Code Block
```python
def hello_world():
    print("Hello, World!")
    return "Success"
```
"""

    # Technical documentation sample: exercises TOC links, a mermaid
    # diagram, multiple fenced languages, and pipe tables.
    technical_content = """# Technical Documentation

## Table of Contents
- [Overview](#overview)
- [Architecture](#architecture)
- [Implementation](#implementation)
- [Performance](#performance)

## Overview
This document provides comprehensive technical specifications for the enhanced markdown conversion system.

## Architecture

### Core Components
1. **Markdown Parser**: Processes markdown syntax
2. **CSS Engine**: Applies styling and layout
3. **PDF Generator**: Creates final PDF output
4. **Backend Manager**: Handles multiple conversion engines

### Data Flow
```mermaid
graph LR
    A[Markdown Input] --> B[Parser]
    B --> C[CSS Processor]
    C --> D[PDF Generator]
    D --> E[PDF Output]
```

## Implementation

### Python Code Example
```python
from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig

# Configure converter
config = MarkdownConfig(
    page_size="A4",
    margin="1in",
    include_toc=True,
    syntax_highlighting=True
)

# Create converter
converter = EnhancedMarkdownConverter(config)

# Convert to PDF
success = converter.convert_file_to_pdf(
    input_path="document.md",
    output_path="output.pdf",
    method="weasyprint"
)
```

### Configuration Options
```yaml
converter:
  page_size: A4
  margin: 1in
  font_size: 12pt
  include_toc: true
  syntax_highlighting: true
  backend: weasyprint
```

## Performance

### Benchmark Results
| Backend | Speed | Quality | Features |
|---------|-------|---------|----------|
| WeasyPrint | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ |
| Pandoc | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |

### Processing Times
- **Small documents** (< 10 pages): 1-3 seconds
- **Medium documents** (10-50 pages): 3-10 seconds
- **Large documents** (> 50 pages): 10-30 seconds

## Advanced Features

### Custom CSS Styling
The system supports advanced CSS customization:

```css
body {
    font-family: 'Georgia', serif;
    line-height: 1.6;
    color: #333;
}

h1 {
    color: #2c3e50;
    border-bottom: 2px solid #3498db;
    padding-bottom: 0.3em;
}

code {
    background-color: #f8f9fa;
    padding: 2px 4px;
    border-radius: 3px;
    font-family: 'Courier New', monospace;
}

pre {
    background-color: #f8f9fa;
    border-left: 4px solid #3498db;
    padding: 15px;
    border-radius: 5px;
    overflow-x: auto;
}

table {
    border-collapse: collapse;
    width: 100%;
    margin: 1em 0;
}

th, td {
    border: 1px solid #ddd;
    padding: 8px 12px;
    text-align: left;
}

th {
    background-color: #f2f2f2;
    font-weight: bold;
}
```

### Image Support
![]()

Images are automatically scaled and positioned appropriately in the PDF output.

## Conclusion
The enhanced markdown conversion system provides professional-quality PDF generation with extensive customization options and multiple backend support.

---

*Generated on: 2024-01-15*
*Version: 1.0.0*
"""

    # Academic paper sample: exercises numbered sections, wide tables,
    # unicode superscripts, and reference formatting.
    academic_content = """# Research Paper: Advanced Document Processing

**Authors:** Alice Johnson¹, Bob Smith², Carol Williams¹
**Affiliations:**
¹ University of Technology
² Research Institute

## Abstract

This paper presents a comprehensive analysis of advanced document processing techniques using enhanced markdown conversion. Our research demonstrates significant improvements in processing speed and output quality through optimized backend selection and custom styling approaches.

**Keywords:** document processing, markdown conversion, PDF generation, performance optimization

## 1. Introduction

Document processing has become increasingly important in modern information systems. The ability to convert markdown documents to high-quality PDF outputs with professional formatting is crucial for academic, technical, and business applications.

### 1.1 Research Objectives

1. Evaluate different markdown conversion backends
2. Analyze performance characteristics of each approach
3. Develop optimization strategies for large-scale processing
4. Design flexible configuration systems for diverse use cases

### 1.2 Contributions

This work makes the following contributions:
- Comprehensive comparison of markdown conversion backends
- Performance optimization techniques for large documents
- Flexible configuration framework for customization
- Integration patterns for document processing pipelines

## 2. Methodology

### 2.1 Experimental Setup

We conducted experiments using the following configuration:

```python
# Experimental configuration
config = MarkdownConfig(
    page_size="A4",
    margin="1in",
    font_size="11pt",
    line_height="1.4",
    include_toc=True,
    syntax_highlighting=True
)
```

### 2.2 Test Documents

| Category | Count | Avg Size | Complexity |
|----------|-------|----------|------------|
| Simple | 100 | 2 pages | Low |
| Medium | 50 | 10 pages | Medium |
| Complex | 25 | 25 pages | High |

### 2.3 Metrics

We evaluated performance using the following metrics:
- **Conversion Speed**: Time to generate PDF (seconds)
- **Memory Usage**: Peak memory consumption (MB)
- **Output Quality**: Visual assessment score (1-10)
- **Feature Support**: Number of supported markdown features

## 3. Results

### 3.1 Performance Comparison

The following table summarizes our performance results:

| Backend | Speed (s) | Memory (MB) | Quality | Features |
|---------|-----------|-------------|---------|----------|
| WeasyPrint | 2.3 ± 0.5 | 85 ± 15 | 8.5 | 85% |
| Pandoc | 4.7 ± 1.2 | 120 ± 25 | 9.2 | 95% |

### 3.2 Quality Analysis

#### 3.2.1 Typography
WeasyPrint excels in web-style typography with excellent CSS support, while Pandoc provides superior academic formatting with LaTeX-quality output.

#### 3.2.2 Code Highlighting
Both backends support syntax highlighting through Pygments:

```python
def analyze_performance(backend, documents):
    '''Analyze conversion performance for given backend'''
    results = []

    for doc in documents:
        start_time = time.time()
        success = backend.convert(doc)
        end_time = time.time()

        results.append({
            'document': doc,
            'time': end_time - start_time,
            'success': success
        })

    return results
```

### 3.3 Scalability

Our scalability analysis shows:
- Linear scaling with document size for both backends
- Memory usage proportional to content complexity
- Optimal batch sizes of 10-20 documents for parallel processing

## 4. Discussion

### 4.1 Backend Selection Guidelines

Choose **WeasyPrint** for:
- Web-style documents with custom CSS
- Fast conversion requirements
- Simple to medium complexity documents

Choose **Pandoc** for:
- Academic papers and publications
- Complex document structures
- Maximum feature support requirements

### 4.2 Optimization Strategies

1. **Image Optimization**: Compress images before embedding
2. **CSS Minimization**: Use efficient CSS selectors
3. **Content Chunking**: Process large documents in sections
4. **Caching**: Cache converted content for repeated use

## 5. Conclusion

This research demonstrates that enhanced markdown conversion provides significant benefits for document processing workflows. The choice between WeasyPrint and Pandoc depends on specific requirements for speed, quality, and features.

### 5.1 Future Work

- Integration with cloud processing services
- Real-time collaborative editing support
- Advanced template systems
- Performance optimization for very large documents

## References

1. Johnson, A. et al. (2024). "Advanced Document Processing Techniques." *Journal of Information Systems*, 15(3), 45-62.
2. Smith, B. (2023). "PDF Generation Optimization." *Technical Computing Review*, 8(2), 12-28.
3. Williams, C. (2024). "Markdown Processing Frameworks." *Software Engineering Quarterly*, 22(1), 78-95.

---

**Manuscript received:** January 10, 2024
**Accepted for publication:** January 15, 2024
**Published online:** January 20, 2024
"""

    return {
        "basic": basic_content,
        "technical": technical_content,
        "academic": academic_content,
    }
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
def demonstrate_basic_conversion():
    """Demonstrate basic markdown to PDF conversion.

    Reports which conversion backends are installed, writes the "basic"
    sample document into a fresh temporary directory, and converts it to
    PDF letting the converter pick the best available backend.

    Returns:
        tuple: (success flag, temp directory Path), or (False, None) if
        the demonstration itself raised.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("BASIC MARKDOWN CONVERSION DEMONSTRATION")
    print(separator)

    try:
        # Default configuration: the converter probes its own backends.
        converter = EnhancedMarkdownConverter()

        # Report backend availability before attempting any conversion.
        backend_info = converter.get_backend_info()
        print("Available conversion backends:")
        for backend, available in backend_info["available_backends"].items():
            marker = "✅" if available else "❌"
            print(f" {marker} {backend}")
        print(f"Recommended backend: {backend_info['recommended_backend']}")

        # Materialize the sample markdown on disk for file-based conversion.
        samples = create_sample_markdown_content()
        temp_dir = Path(tempfile.mkdtemp())
        basic_md_path = temp_dir / "basic_sample.md"
        basic_md_path.write_text(samples["basic"], encoding="utf-8")

        print(f"\nConverting basic sample: {basic_md_path}")

        pdf_path = temp_dir / "basic_sample.pdf"
        success = converter.convert_file_to_pdf(
            input_path=str(basic_md_path),
            output_path=str(pdf_path),
            method="auto",  # let the system choose the best backend
        )

        if success:
            print("✅ Basic conversion successful!")
            print(f" Output: {temp_dir / 'basic_sample.pdf'}")
        else:
            print("❌ Basic conversion failed")

        return success, temp_dir

    except Exception as e:
        print(f"❌ Basic conversion demonstration failed: {str(e)}")
        return False, None
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
def demonstrate_backend_comparison():
    """Demonstrate different conversion backends.

    Converts the "technical" sample document once per backend
    ("auto", "weasyprint", "pandoc"), timing each attempt, then prints a
    summary naming the fastest successful backend.

    Returns:
        tuple: (results dict keyed by backend name, temp directory Path),
        or (None, None) if the demonstration itself raised. Each results
        entry has "success", plus "time"/"size"/"output" on success or
        "error" when the backend raised.
    """
    # Hoisted out of the per-backend loop: the original re-executed
    # `import time` on every iteration.
    import time

    print("\n" + "=" * 60)
    print("BACKEND COMPARISON DEMONSTRATION")
    print("=" * 60)

    try:
        samples = create_sample_markdown_content()
        temp_dir = Path(tempfile.mkdtemp())

        # Create technical document
        tech_md_path = temp_dir / "technical.md"
        with open(tech_md_path, "w", encoding="utf-8") as f:
            f.write(samples["technical"])

        print("Testing different backends with technical document...")

        # Test each backend independently; a failure in one must not
        # prevent the others from being measured.
        backends = ["auto", "weasyprint", "pandoc"]
        results = {}

        for backend in backends:
            try:
                print(f"\nTesting {backend} backend...")

                converter = EnhancedMarkdownConverter()
                output_path = temp_dir / f"technical_{backend}.pdf"

                # perf_counter is monotonic, so the measurement is immune
                # to wall-clock adjustments (time.time() is not).
                start_time = time.perf_counter()
                success = converter.convert_file_to_pdf(
                    input_path=str(tech_md_path),
                    output_path=str(output_path),
                    method=backend,
                )
                conversion_time = time.perf_counter() - start_time

                if success:
                    # Guard stat(): a backend may report success without
                    # actually producing a file.
                    file_size = (
                        output_path.stat().st_size if output_path.exists() else 0
                    )
                    print(
                        f" ✅ {backend}: Success in {conversion_time:.2f}s, {file_size} bytes"
                    )
                    results[backend] = {
                        "success": True,
                        "time": conversion_time,
                        "size": file_size,
                        "output": str(output_path),
                    }
                else:
                    print(f" ❌ {backend}: Failed")
                    results[backend] = {"success": False, "time": conversion_time}

            except Exception as e:
                # Record the failure and keep testing the remaining backends.
                print(f" ❌ {backend}: Error - {str(e)}")
                results[backend] = {"success": False, "error": str(e)}

        # Summary
        print("\n" + "-" * 40)
        print("BACKEND COMPARISON SUMMARY")
        print("-" * 40)
        successful_backends = [b for b, r in results.items() if r.get("success", False)]
        print(f"Successful backends: {successful_backends}")

        if successful_backends:
            fastest = min(successful_backends, key=lambda b: results[b]["time"])
            print(f"Fastest backend: {fastest} ({results[fastest]['time']:.2f}s)")

        return results, temp_dir

    except Exception as e:
        print(f"❌ Backend comparison demonstration failed: {str(e)}")
        return None, None
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def demonstrate_custom_styling():
    """Demonstrate custom CSS styling and configuration.

    Converts the "academic" sample with a bespoke stylesheet through the
    WeasyPrint backend, then renders a default-styled version of the same
    document so the two PDFs can be compared side by side.

    Returns:
        tuple: (success flag of the styled conversion, temp directory Path),
        or (False, None) if the demonstration itself raised.
    """
    print("\n" + "=" * 60)
    print("CUSTOM STYLING DEMONSTRATION")
    print("=" * 60)

    try:
        samples = create_sample_markdown_content()
        temp_dir = Path(tempfile.mkdtemp())

        # Create custom CSS.
        # NOTE(review): the :hover rules below have no effect in static PDF
        # output — presumably carried over from a web stylesheet.
        custom_css = """
        body {
            font-family: 'Times New Roman', serif;
            font-size: 11pt;
            line-height: 1.4;
            color: #2c3e50;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }

        h1 {
            color: #c0392b;
            font-size: 2.2em;
            border-bottom: 3px solid #e74c3c;
            padding-bottom: 0.5em;
            margin-top: 2em;
        }

        h2 {
            color: #8e44ad;
            font-size: 1.6em;
            border-bottom: 2px solid #9b59b6;
            padding-bottom: 0.3em;
            margin-top: 1.5em;
        }

        h3 {
            color: #2980b9;
            font-size: 1.3em;
            margin-top: 1.2em;
        }

        code {
            background-color: #ecf0f1;
            color: #e74c3c;
            padding: 3px 6px;
            border-radius: 4px;
            font-family: 'Courier New', monospace;
            font-size: 0.9em;
        }

        pre {
            background-color: #2c3e50;
            color: #ecf0f1;
            padding: 20px;
            border-radius: 8px;
            border-left: 5px solid #3498db;
            overflow-x: auto;
            font-size: 0.9em;
        }

        pre code {
            background-color: transparent;
            color: inherit;
            padding: 0;
        }

        blockquote {
            background-color: #f8f9fa;
            border-left: 5px solid #3498db;
            margin: 1em 0;
            padding: 15px 20px;
            font-style: italic;
            color: #555;
        }

        table {
            border-collapse: collapse;
            width: 100%;
            margin: 1.5em 0;
            background-color: white;
            border-radius: 8px;
            overflow: hidden;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }

        th {
            background-color: #3498db;
            color: white;
            padding: 12px 15px;
            text-align: left;
            font-weight: bold;
        }

        td {
            padding: 10px 15px;
            border-bottom: 1px solid #ecf0f1;
        }

        tr:nth-child(even) {
            background-color: #f8f9fa;
        }

        tr:hover {
            background-color: #e8f4fd;
        }

        ul, ol {
            margin-bottom: 1em;
            padding-left: 2em;
        }

        li {
            margin-bottom: 0.5em;
            line-height: 1.6;
        }

        a {
            color: #3498db;
            text-decoration: none;
            border-bottom: 1px dotted #3498db;
        }

        a:hover {
            color: #2980b9;
            border-bottom: 1px solid #2980b9;
        }

        .toc {
            background-color: #f8f9fa;
            border: 2px solid #e9ecef;
            border-radius: 8px;
            padding: 20px;
            margin: 2em 0;
        }

        .toc h2 {
            color: #2c3e50;
            margin-top: 0;
            border-bottom: none;
        }

        .toc ul {
            list-style-type: none;
            padding-left: 0;
        }

        .toc li {
            margin-bottom: 0.8em;
        }

        .toc a {
            color: #2c3e50;
            font-weight: 500;
            border-bottom: none;
        }
        """

        # Create custom configuration embedding the stylesheet above.
        config = MarkdownConfig(
            page_size="A4",
            margin="0.8in",
            font_size="11pt",
            line_height="1.4",
            include_toc=True,
            syntax_highlighting=True,
            custom_css=custom_css,
        )

        converter = EnhancedMarkdownConverter(config)

        # Convert academic sample with custom styling
        academic_md_path = temp_dir / "academic_styled.md"
        with open(academic_md_path, "w", encoding="utf-8") as f:
            f.write(samples["academic"])

        print("Converting academic paper with custom styling...")
        print("Custom styling features:")
        print(" - Custom color scheme (reds, purples, blues)")
        print(" - Times New Roman serif font")
        print(" - Enhanced table styling with hover effects")
        print(" - Styled code blocks with dark theme")
        print(" - Custom blockquote styling")
        print(" - Professional header styling")

        success = converter.convert_file_to_pdf(
            input_path=str(academic_md_path),
            output_path=str(temp_dir / "academic_styled.pdf"),
            method="weasyprint",  # WeasyPrint is best for custom CSS
        )

        if success:
            print("✅ Custom styling conversion successful!")
            print(f" Output: {temp_dir / 'academic_styled.pdf'}")

            # Also create a default version for comparison
            default_converter = EnhancedMarkdownConverter()
            default_success = default_converter.convert_file_to_pdf(
                input_path=str(academic_md_path),
                output_path=str(temp_dir / "academic_default.pdf"),
                method="weasyprint",
            )

            if default_success:
                print(f" Comparison (default): {temp_dir / 'academic_default.pdf'}")
        else:
            print("❌ Custom styling conversion failed")

        return success, temp_dir

    except Exception as e:
        print(f"❌ Custom styling demonstration failed: {str(e)}")
        return False, None
|
| 715 |
+
|
| 716 |
+
|
| 717 |
+
def demonstrate_content_conversion():
    """Demonstrate converting markdown content directly (not from file).

    Builds a markdown document entirely in memory (interpolating live
    interpreter details via an f-string), converts it straight to PDF with
    EnhancedMarkdownConverter, and reports the outcome.

    Returns:
        tuple: (success flag, temp output directory Path) — the directory is
        None when the demonstration itself raised.
    """
    print("\n" + "=" * 60)
    print("CONTENT CONVERSION DEMONSTRATION")
    print("=" * 60)

    try:
        # Assemble the document in memory; interpolations capture runtime
        # values (script name, interpreter path, platform) at build time.
        report_md = f"""# Dynamic Content Example

## Generated Information
This document was generated programmatically on {Path(__file__).name}.

## System Information
- **Python Path**: {sys.executable}
- **Script Location**: {Path(__file__).absolute()}
- **Working Directory**: {Path.cwd()}

## Dynamic Table
| Property | Value |
|----------|-------|
| Script Name | {Path(__file__).name} |
| Python Version | {sys.version.split()[0]} |
| Platform | {sys.platform} |

## Code Example
```python
# This content was generated dynamically
import sys
from pathlib import Path

def generate_report():
    return f"Report generated from {{Path(__file__).name}}"

print(generate_report())
```

## Features Demonstrated
This example shows how to:
1. Generate markdown content programmatically
2. Convert content directly without saving to file first
3. Include dynamic information in documents
4. Use different conversion methods

> **Note**: This content was created in memory and converted directly to PDF
> without intermediate file storage.

## Conclusion
Direct content conversion is useful for:
- Dynamic report generation
- Programmatic document creation
- API-based document services
- Real-time content processing
"""

        work_dir = Path(tempfile.mkdtemp())
        converter = EnhancedMarkdownConverter()

        print("Converting dynamically generated markdown content...")
        print("Content includes:")
        for feature_line in (
            " - System information",
            " - Dynamic tables with current values",
            " - Generated timestamps",
            " - Programmatic examples",
        ):
            print(feature_line)

        # Convert the in-memory markdown straight to PDF — no intermediate
        # .md file is written for the conversion itself.
        pdf_path = work_dir / "dynamic_content.pdf"

        converted = converter.convert_markdown_to_pdf(
            markdown_content=report_md,
            output_path=str(pdf_path),
            method="auto",
        )

        if not converted:
            print("❌ Content conversion failed")
            return converted, work_dir

        print("✅ Content conversion successful!")
        print(f" Output: {pdf_path}")

        # Show file size
        print(f" Generated PDF size: {pdf_path.stat().st_size} bytes")

        return converted, work_dir

    except Exception as err:
        print(f"❌ Content conversion demonstration failed: {str(err)}")
        return False, None
|
| 806 |
+
|
| 807 |
+
|
| 808 |
+
def demonstrate_error_handling():
    """Demonstrate error handling and fallback mechanisms.

    Runs a matrix of deliberately problematic markdown samples against
    several conversion backends, records per-(sample, backend) outcomes,
    then shows a hand-rolled fallback wrapper that tries backends in order.

    Returns:
        tuple: (results dict keyed by "<test>_<backend>", temp dir Path),
        or (None, None) if the demonstration itself raised.
    """
    print("\n" + "=" * 60)
    print("ERROR HANDLING DEMONSTRATION")
    print("=" * 60)

    try:
        temp_dir = Path(tempfile.mkdtemp())

        # Test cases with various issues — each value is raw markdown meant
        # to stress a different failure mode (bad syntax, heavy unicode/math,
        # empty input, minimal input).
        test_cases = {
            "invalid_markdown": """# Invalid Markdown

This markdown has some {{invalid}} syntax and [broken links](http://nonexistent.invalid).

```unknown_language
This code block uses an unknown language
```

![Missing image](nonexistent_image.png)
""",
            "complex_content": """# Complex Content Test

## Mathematical Expressions
This tests content that might be challenging for some backends:

$$ E = mc^2 $$

$$\\sum_{i=1}^{n} x_i = \\frac{n(n+1)}{2}$$

## Complex Tables
| A | B | C | D | E | F | G |
|---|---|---|---|---|---|---|
| Very long content that might wrap | Short | Medium length content | X | Y | Z | End |
| Another row with different lengths | A | B | C | D | E | F |

## Special Characters
Unicode: α, β, γ, δ, ε, ζ, η, θ, ι, κ, λ, μ, ν, ξ, ο, π, ρ, σ, τ, υ, φ, χ, ψ, ω
Symbols: ♠ ♣ ♥ ♦ ☀ ☁ ☂ ☃ ☄ ★ ☆ ☉ ☊ ☋ ☌ ☍ ☎ ☏
Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
""",
            "empty_content": "",
            "minimal_content": "# Just a title",
        }

        print("Testing error handling with various content types...")

        # Accumulates one entry per (test case, backend) attempt.
        results = {}

        for test_name, content in test_cases.items():
            print(f"\nTesting: {test_name}")

            try:
                # Try multiple backends for each test case; a fresh converter
                # per attempt keeps any backend-internal state isolated.
                for backend in ["auto", "weasyprint", "pandoc"]:
                    try:
                        converter = EnhancedMarkdownConverter()
                        output_path = temp_dir / f"{test_name}_{backend}.pdf"

                        success = converter.convert_markdown_to_pdf(
                            markdown_content=content,
                            output_path=str(output_path),
                            method=backend,
                        )

                        if success:
                            # A backend may report success without writing a
                            # file — guard the stat() call.
                            file_size = (
                                output_path.stat().st_size
                                if output_path.exists()
                                else 0
                            )
                            print(f" ✅ {backend}: Success ({file_size} bytes)")
                            results[f"{test_name}_{backend}"] = {
                                "success": True,
                                "size": file_size,
                            }
                        else:
                            print(f" ❌ {backend}: Failed")
                            results[f"{test_name}_{backend}"] = {"success": False}

                    except Exception as e:
                        # Per-backend failure: record it and continue with
                        # the next backend rather than aborting the matrix.
                        print(f" ❌ {backend}: Error - {str(e)[:60]}...")
                        results[f"{test_name}_{backend}"] = {
                            "success": False,
                            "error": str(e),
                        }

            except Exception as e:
                print(f" ❌ Test case failed: {str(e)}")

        # Demonstrate robust conversion with fallbacks
        print("\nDemonstrating robust conversion with fallback logic...")

        def robust_convert(content, output_path):
            """Convert with multiple backend fallbacks.

            Tries each backend in preference order and returns
            (backend_name, True) on the first success, (None, False) if
            every backend fails or raises.
            """
            backends = ["weasyprint", "pandoc", "auto"]

            for backend in backends:
                try:
                    converter = EnhancedMarkdownConverter()
                    success = converter.convert_markdown_to_pdf(
                        markdown_content=content,
                        output_path=output_path,
                        method=backend,
                    )
                    if success:
                        return backend, True
                except Exception:
                    # Swallow and fall through to the next backend — this is
                    # the deliberate best-effort behavior being demonstrated.
                    continue

            return None, False

        # Test robust conversion
        test_content = test_cases["complex_content"]
        robust_output = temp_dir / "robust_conversion.pdf"

        successful_backend, success = robust_convert(test_content, str(robust_output))

        if success:
            print(f"✅ Robust conversion successful using {successful_backend}")
            print(f" Output: {robust_output}")
        else:
            print("❌ All backends failed for robust conversion")

        # Summary
        print("\n" + "-" * 40)
        print("ERROR HANDLING SUMMARY")
        print("-" * 40)
        successful_conversions = sum(
            1 for r in results.values() if r.get("success", False)
        )
        total_attempts = len(results)
        # Guard against division by zero when no attempt was recorded.
        success_rate = (
            (successful_conversions / total_attempts * 100) if total_attempts > 0 else 0
        )

        print(f"Total conversion attempts: {total_attempts}")
        print(f"Successful conversions: {successful_conversions}")
        print(f"Success rate: {success_rate:.1f}%")

        return results, temp_dir

    except Exception as e:
        print(f"❌ Error handling demonstration failed: {str(e)}")
        return None, None
|
| 953 |
+
|
| 954 |
+
|
| 955 |
+
def main():
    """Run every enhanced-markdown demonstration and print a summary.

    Executes the basic, backend-comparison, custom-styling, content and
    error-handling demos in order, collects their results, then prints a
    capability / best-practice / integration recap.
    """
    # Configure root logging for all demonstrations.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    print("RAG-Anything Enhanced Markdown Conversion Demonstration")
    print("=" * 70)
    print(
        "This example demonstrates various enhanced markdown conversion capabilities:"
    )
    for intro_line in (
        " - Basic markdown to PDF conversion",
        " - Multiple backend comparison (WeasyPrint vs Pandoc)",
        " - Custom CSS styling and professional formatting",
        " - Direct content conversion without file I/O",
        " - Comprehensive error handling and fallback mechanisms",
    ):
        print(intro_line)

    outcomes = {}

    print("\n🚀 Starting demonstrations...")

    # Each demo returns (result, temp_dir); only the result is kept here.
    basic_ok, temp_dir = demonstrate_basic_conversion()
    outcomes["basic"] = basic_ok

    backend_stats, _ = demonstrate_backend_comparison()
    outcomes["backends"] = backend_stats

    styling_ok, _ = demonstrate_custom_styling()
    outcomes["styling"] = styling_ok

    content_ok, _ = demonstrate_content_conversion()
    outcomes["content"] = content_ok

    error_stats, _ = demonstrate_error_handling()
    outcomes["error_handling"] = error_stats

    print("\n" + "=" * 70)
    print("DEMONSTRATION SUMMARY")
    print("=" * 70)

    print("✅ Features Successfully Demonstrated:")
    if outcomes["basic"]:
        print(" - Basic markdown to PDF conversion")
    if outcomes["backends"]:
        successful_backends = [
            b for b, r in outcomes["backends"].items() if r.get("success", False)
        ]
        print(f" - Multiple backends: {successful_backends}")
    if outcomes["styling"]:
        print(" - Custom CSS styling and professional formatting")
    if outcomes["content"]:
        print(" - Direct content conversion without file I/O")
    if outcomes["error_handling"]:
        # Truthiness of the dict guards the division below.
        successes = sum(
            1 for r in outcomes["error_handling"].values() if r.get("success", False)
        )
        success_rate = successes / len(outcomes["error_handling"]) * 100
        print(f" - Error handling with {success_rate:.1f}% overall success rate")

    print("\n📊 Key Capabilities Highlighted:")
    for capability in (
        " - Professional PDF generation with high-quality typography",
        " - Multiple conversion backends with automatic selection",
        " - Extensive CSS customization for branded documents",
        " - Syntax highlighting for code blocks using Pygments",
        " - Table formatting with professional styling",
        " - Image embedding with proper scaling",
        " - Table of contents generation with navigation",
        " - Comprehensive error handling and fallback mechanisms",
    ):
        print(capability)

    print("\n💡 Best Practices Demonstrated:")
    for practice in (
        " - Choose WeasyPrint for web-style documents and custom CSS",
        " - Choose Pandoc for academic papers and complex formatting",
        " - Use 'auto' method for general-purpose conversion",
        " - Implement fallback logic for robust conversion",
        " - Optimize images before embedding in documents",
        " - Test custom CSS with simple content first",
        " - Handle errors gracefully with multiple backend attempts",
        " - Use appropriate page sizes and margins for target use case",
    ):
        print(practice)

    print("\n🎯 Integration Patterns:")
    for pattern in (
        " - Standalone conversion for document generation",
        " - Integration with RAG-Anything document pipeline",
        " - API-based document services",
        " - Batch processing for multiple documents",
        " - Dynamic content generation from templates",
    ):
        print(pattern)
|
| 1052 |
+
|
| 1053 |
+
|
| 1054 |
+
# Script entry point: run the full demonstration suite when executed directly.
if __name__ == "__main__":
    main()
|
rag_anything_smaranika/examples/image_format_test.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Image Format Parsing Test Script for RAG-Anything
|
| 4 |
+
|
| 5 |
+
This script demonstrates how to parse various image formats
|
| 6 |
+
using MinerU, including JPG, PNG, BMP, TIFF, GIF, and WebP files.
|
| 7 |
+
|
| 8 |
+
Requirements:
|
| 9 |
+
- PIL/Pillow library for format conversion
|
| 10 |
+
- RAG-Anything package
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
python image_format_test.py --file path/to/image.bmp
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import asyncio
|
| 18 |
+
import sys
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from raganything import RAGAnything
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def check_pillow_installation():
    """Check if PIL/Pillow is installed and report its version.

    Returns:
        bool: True when Pillow imports successfully; False otherwise (an
        installation hint is printed in that case).
    """
    try:
        import PIL
        from PIL import Image  # noqa: F401 — confirm the imaging module itself loads

        # BUGFIX: Pillow exposes its version as PIL.__version__; the legacy
        # Image.__version__ attribute was removed in Pillow 9, so the previous
        # hasattr(Image, '__version__') lookup always printed "Unknown" on
        # modern installs.
        version = getattr(PIL, "__version__", "Unknown")
        print(f"✅ PIL/Pillow found: PIL version {version}")
        return True
    except ImportError:
        print("❌ PIL/Pillow not found. Please install Pillow:")
        print(" pip install Pillow")
        return False
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def get_image_info(image_path: Path):
    """Return basic metadata for an image file.

    Opens the file with Pillow and reports its format, color mode, pixel
    dimensions and whether it carries transparency. Any failure (missing
    file, unreadable image, Pillow not installed) is reported as a dict
    with a single "error" key instead of raising.
    """
    try:
        from PIL import Image

        with Image.open(image_path) as img:
            # Transparency comes either from an alpha-capable mode or from
            # an explicit palette transparency entry.
            transparent = img.mode in ("RGBA", "LA") or "transparency" in img.info
            details = {
                "format": img.format,
                "mode": img.mode,
                "size": img.size,
                "has_transparency": transparent,
            }
        return details
    except Exception as exc:
        return {"error": str(exc)}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
async def test_image_format_parsing(file_path: str):
    """Test image format parsing with MinerU.

    Validates the input path and extension, prints image metadata, then
    runs RAGAnything's document parser in OCR mode over the image and
    summarizes the extracted content (text, images, tables).

    Args:
        file_path: Path to the image file to test.

    Returns:
        bool: True when parsing completed without raising, False otherwise.
    """

    print(f"🧪 Testing image format parsing: {file_path}")

    # Check if file exists and is a supported image format
    file_path = Path(file_path)
    if not file_path.exists():
        print(f"❌ File does not exist: {file_path}")
        return False

    supported_extensions = {
        ".jpg",
        ".jpeg",
        ".png",
        ".bmp",
        ".tiff",
        ".tif",
        ".gif",
        ".webp",
    }
    if file_path.suffix.lower() not in supported_extensions:
        print(f"❌ Unsupported file format: {file_path.suffix}")
        print(f" Supported formats: {', '.join(supported_extensions)}")
        return False

    print(f"📸 File format: {file_path.suffix.upper()}")
    print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")

    # Get detailed image information (silently skipped if Pillow failed —
    # get_image_info reports errors via an "error" key).
    img_info = get_image_info(file_path)
    if "error" not in img_info:
        print("🖼️ Image info:")
        print(f" • Format: {img_info['format']}")
        print(f" • Mode: {img_info['mode']}")
        print(f" • Size: {img_info['size'][0]}x{img_info['size'][1]}")
        print(f" • Has transparency: {img_info['has_transparency']}")

    # Check format compatibility with MinerU: only JPEG/PNG are consumed
    # natively; other formats get converted to PNG first.
    mineru_native_formats = {".jpg", ".jpeg", ".png"}
    needs_conversion = file_path.suffix.lower() not in mineru_native_formats

    if needs_conversion:
        print(
            f"ℹ️ Format {file_path.suffix.upper()} will be converted to PNG for MinerU compatibility"
        )
    else:
        print(f"✅ Format {file_path.suffix.upper()} is natively supported by MinerU")

    # Initialize RAGAnything (only for parsing functionality — no LLM/query
    # configuration is needed here).
    rag = RAGAnything()

    try:
        # Test image parsing with MinerU
        print("\n🔄 Testing image parsing with MinerU...")
        # NOTE(review): assumes parse_document returns
        # (content_list, markdown_string) — confirm against RAGAnything API.
        content_list, md_content = await rag.parse_document(
            file_path=str(file_path),
            output_dir="./test_output",
            parse_method="ocr",  # Images use OCR method
            display_stats=True,
        )

        print("✅ Parsing successful!")
        print(f" 📊 Content blocks: {len(content_list)}")
        print(f" 📝 Markdown length: {len(md_content)} characters")

        # Analyze content types: histogram of the "type" field across blocks.
        content_types = {}
        for item in content_list:
            if isinstance(item, dict):
                content_type = item.get("type", "unknown")
                content_types[content_type] = content_types.get(content_type, 0) + 1

        if content_types:
            print(" 📋 Content distribution:")
            for content_type, count in sorted(content_types.items()):
                print(f" • {content_type}: {count}")

        # Display extracted text (if any), truncated to 500 characters.
        if md_content.strip():
            print("\n📄 Extracted text preview (first 500 characters):")
            preview = md_content.strip()[:500]
            print(f" {preview}{'...' if len(md_content) > 500 else ''}")
        else:
            print("\n📄 No text extracted from the image")

        # Display image processing results
        image_items = [
            item
            for item in content_list
            if isinstance(item, dict) and item.get("type") == "image"
        ]
        if image_items:
            print(f"\n🖼️ Found {len(image_items)} processed image(s):")
            for i, item in enumerate(image_items, 1):
                print(f" {i}. Image path: {item.get('img_path', 'N/A')}")
                # Caption key differs across parser versions — try both.
                caption = item.get("image_caption", item.get("img_caption", []))
                if caption:
                    print(f" Caption: {caption[0] if caption else 'N/A'}")

        # Display text blocks (OCR results), each truncated to 200 characters.
        text_items = [
            item
            for item in content_list
            if isinstance(item, dict) and item.get("type") == "text"
        ]
        if text_items:
            print("\n📝 OCR text blocks found:")
            for i, item in enumerate(text_items, 1):
                text_content = item.get("text", "")
                if text_content.strip():
                    preview = text_content.strip()[:200]
                    print(
                        f" {i}. {preview}{'...' if len(text_content) > 200 else ''}"
                    )

        # Check for any tables detected in the image
        table_items = [
            item
            for item in content_list
            if isinstance(item, dict) and item.get("type") == "table"
        ]
        if table_items:
            print(f"\n📊 Found {len(table_items)} table(s) in image:")
            for i, item in enumerate(table_items, 1):
                print(f" {i}. Table detected with content")

        print("\n🎉 Image format parsing test completed successfully!")
        print("📁 Output files saved to: ./test_output")
        return True

    except Exception as e:
        print(f"\n❌ Image format parsing failed: {str(e)}")
        import traceback

        print(f" Full error: {traceback.format_exc()}")
        return False
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def main():
    """CLI entry point for the image-format parsing test.

    Parses command-line options, verifies the Pillow dependency, then runs
    the async parsing test. Returns a process exit code (0 = success).
    """
    arg_parser = argparse.ArgumentParser(
        description="Test image format parsing with MinerU"
    )
    arg_parser.add_argument("--file", help="Path to the image file to test")
    arg_parser.add_argument(
        "--check-pillow", action="store_true", help="Only check PIL/Pillow installation"
    )
    options = arg_parser.parse_args()

    # Pillow is needed for image inspection/conversion, so verify it first.
    print("🔧 Checking PIL/Pillow installation...")
    if not check_pillow_installation():
        return 1

    if options.check_pillow:
        print("✅ PIL/Pillow installation check passed!")
        return 0

    # Without --check-pillow a target image file is mandatory.
    if not options.file:
        print("❌ Error: --file argument is required when not using --check-pillow")
        arg_parser.print_help()
        return 1

    # Run the async parsing test, mapping its boolean result (and any
    # interruption or unexpected error) onto an exit code.
    try:
        passed = asyncio.run(test_image_format_parsing(options.file))
    except KeyboardInterrupt:
        print("\n⏹️ Test interrupted by user")
        return 1
    except Exception as exc:
        print(f"\n❌ Unexpected error: {str(exc)}")
        return 1
    return 0 if passed else 1
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# Script entry point: propagate the CLI's exit code to the shell.
if __name__ == "__main__":
    sys.exit(main())
|
rag_anything_smaranika/examples/insert_content_list_example.py
ADDED
|
@@ -0,0 +1,419 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
"""
|
| 3 |
+
Example script demonstrating direct content list insertion with RAGAnything
|
| 4 |
+
|
| 5 |
+
This example shows how to:
|
| 6 |
+
1. Create a simple content list with different content types
|
| 7 |
+
2. Insert content list directly without document parsing using insert_content_list() method
|
| 8 |
+
3. Perform pure text queries using aquery() method
|
| 9 |
+
4. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method
|
| 10 |
+
5. Handle different types of multimodal content in the inserted knowledge base
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import argparse
|
| 15 |
+
import asyncio
|
| 16 |
+
import logging
|
| 17 |
+
import logging.config
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
# Add project root directory to Python path
|
| 21 |
+
import sys
|
| 22 |
+
|
| 23 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
| 24 |
+
|
| 25 |
+
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
| 26 |
+
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
|
| 27 |
+
from raganything import RAGAnything, RAGAnythingConfig
|
| 28 |
+
|
| 29 |
+
from dotenv import load_dotenv
|
| 30 |
+
|
| 31 |
+
load_dotenv(dotenv_path=".env", override=False)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def configure_logging():
    """Configure console and rotating-file logging for the application.

    Reads LOG_DIR, LOG_MAX_BYTES, LOG_BACKUP_COUNT and VERBOSE from the
    environment, installs a dictConfig with a stderr console handler and a
    RotatingFileHandler for the "lightrag" logger, and sets its level.

    Side effects: prints the log file location, creates the log directory
    if needed, and mutates the global logging configuration.
    """
    # Get log directory path from environment variable or use current directory
    log_dir = os.getenv("LOG_DIR", os.getcwd())
    log_file_path = os.path.abspath(
        os.path.join(log_dir, "insert_content_list_example.log")
    )

    print(f"\nInsert Content List example log file: {log_file_path}\n")
    # BUGFIX: this previously created os.path.dirname(log_dir) — the *parent*
    # of the log directory — so a non-existent LOG_DIR made the
    # RotatingFileHandler below fail to open its file. Create the directory
    # that actually contains the log file instead.
    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

    # Get log file max size and backup count from environment variables
    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups

    logging.config.dictConfig(
        {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                # Terse format for the console, full context for the file.
                "default": {
                    "format": "%(levelname)s: %(message)s",
                },
                "detailed": {
                    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                },
            },
            "handlers": {
                "console": {
                    "formatter": "default",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
                "file": {
                    "formatter": "detailed",
                    "class": "logging.handlers.RotatingFileHandler",
                    "filename": log_file_path,
                    "maxBytes": log_max_bytes,
                    "backupCount": log_backup_count,
                    "encoding": "utf-8",
                },
            },
            "loggers": {
                # Only the lightrag logger is routed; propagation is off so
                # records are not duplicated by the root logger.
                "lightrag": {
                    "handlers": ["console", "file"],
                    "level": "INFO",
                    "propagate": False,
                },
            },
        }
    )

    # Set the (lightrag) logger level to INFO
    logger.setLevel(logging.INFO)
    # Enable verbose debug if needed
    set_verbose_debug(os.getenv("VERBOSE", "false").lower() == "true")
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def create_sample_content_list():
    """Build a hand-crafted content list for insert_content_list() testing.

    Returns:
        List[Dict]: Seven content blocks spanning text, image, table and
        equation types, ordered by page.

    Note:
        - img_path must be an *absolute* path to the image file
        - page_idx is the 0-based page number where the block appears
    """
    intro = {
        "type": "text",
        "text": "Welcome to the RAGAnything System Documentation. This guide covers the advanced multimodal document processing capabilities and features of our comprehensive RAG system.",
        "page_idx": 0,
    }

    # System architecture image — path must be absolute for the processor.
    architecture_figure = {
        "type": "image",
        "img_path": "/absolute/path/to/system_architecture.jpg",
        "image_caption": ["Figure 1: RAGAnything System Architecture"],
        "image_footnote": [
            "The architecture shows the complete pipeline from document parsing to multimodal query processing"
        ],
        "page_idx": 1,
    }

    # Performance comparison table (markdown body).
    performance_table = {
        "type": "table",
        "table_body": """| System | Accuracy | Processing Speed | Memory Usage |
|--------|----------|------------------|--------------|
| RAGAnything | 95.2% | 120ms | 2.1GB |
| Traditional RAG | 87.3% | 180ms | 3.2GB |
| Baseline System | 82.1% | 220ms | 4.1GB |
| Simple Retrieval | 76.5% | 95ms | 1.8GB |""",
        "table_caption": [
            "Table 1: Performance Comparison of Different RAG Systems"
        ],
        "table_footnote": [
            "All tests conducted on the same hardware with identical test datasets"
        ],
        "page_idx": 2,
    }

    # Document relevance scoring formula.
    relevance_equation = {
        "type": "equation",
        "latex": "Relevance(d, q) = \\sum_{i=1}^{n} w_i \\cdot sim(t_i^d, t_i^q) \\cdot \\alpha_i",
        "text": "Document relevance scoring formula where w_i are term weights, sim() is similarity function, and α_i are modality importance factors",
        "page_idx": 3,
    }

    modality_overview = {
        "type": "text",
        "text": "The system supports multiple content modalities including text, images, tables, and mathematical equations. Each modality is processed using specialized processors optimized for that content type.",
        "page_idx": 4,
    }

    # Technical specifications table (markdown body).
    spec_table = {
        "type": "table",
        "table_body": """| Feature | Specification |
|---------|---------------|
| Supported Formats | PDF, DOCX, PPTX, XLSX, Images |
| Max Document Size | 100MB |
| Concurrent Processing | Up to 8 documents |
| Query Response Time | <200ms average |
| Knowledge Graph Nodes | Up to 1M entities |""",
        "table_caption": ["Table 2: Technical Specifications"],
        "table_footnote": [
            "Specifications may vary based on hardware configuration"
        ],
        "page_idx": 5,
    }

    conclusion = {
        "type": "text",
        "text": "RAGAnything represents a significant advancement in multimodal document processing, providing comprehensive solutions for complex knowledge extraction and retrieval tasks.",
        "page_idx": 6,
    }

    return [
        intro,
        architecture_figure,
        performance_table,
        relevance_equation,
        modality_overview,
        spec_table,
        conclusion,
    ]
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
async def demo_insert_content_list(
    api_key: str,
    base_url: str = None,
    working_dir: str = None,
):
    """
    Demonstrate content list insertion and querying with RAGAnything

    Args:
        api_key: OpenAI API key
        base_url: Optional base URL for API
        working_dir: Working directory for RAG storage
    """
    try:
        # Create RAGAnything configuration
        config = RAGAnythingConfig(
            working_dir=working_dir or "./rag_storage",
            enable_image_processing=True,
            enable_table_processing=True,
            enable_equation_processing=True,
            display_content_stats=True,  # Show content statistics
        )

        # Define LLM model function.
        # NOTE: history_messages defaults to None (not []) to avoid the
        # shared-mutable-default-argument pitfall.
        def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
            return openai_complete_if_cache(
                "gpt-4o-mini",
                prompt,
                system_prompt=system_prompt,
                history_messages=history_messages or [],
                api_key=api_key,
                base_url=base_url,
                **kwargs,
            )

        # Define vision model function for image processing
        def vision_model_func(
            prompt, system_prompt=None, history_messages=None, image_data=None, **kwargs
        ):
            if image_data:
                # Build the message list explicitly. The previous version used a
                # conditional expression that inserted a literal None entry into
                # `messages` when no system prompt was provided, which the chat
                # completions API rejects.
                messages = []
                if system_prompt:
                    messages.append({"role": "system", "content": system_prompt})
                messages.append(
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{image_data}"
                                },
                            },
                        ],
                    }
                )
                return openai_complete_if_cache(
                    "gpt-4o",
                    "",
                    system_prompt=None,
                    history_messages=[],
                    messages=messages,
                    api_key=api_key,
                    base_url=base_url,
                    **kwargs,
                )
            else:
                # No image supplied: fall back to the plain text LLM
                return llm_model_func(prompt, system_prompt, history_messages, **kwargs)

        # Define embedding function
        embedding_func = EmbeddingFunc(
            embedding_dim=3072,
            max_token_size=8192,
            func=lambda texts: openai_embed(
                texts,
                model="text-embedding-3-large",
                api_key=api_key,
                base_url=base_url,
            ),
        )

        # Initialize RAGAnything
        rag = RAGAnything(
            config=config,
            llm_model_func=llm_model_func,
            vision_model_func=vision_model_func,
            embedding_func=embedding_func,
        )

        # Create sample content list
        logger.info("Creating sample content list...")
        content_list = create_sample_content_list()
        logger.info(f"Created content list with {len(content_list)} items")

        # Insert content list directly
        logger.info("\nInserting content list into RAGAnything...")
        await rag.insert_content_list(
            content_list=content_list,
            file_path="raganything_documentation.pdf",  # Reference file name for citation
            split_by_character=None,  # Optional text splitting
            split_by_character_only=False,  # Optional text splitting mode
            doc_id="demo-doc-001",  # Custom document ID
            display_stats=True,  # Show content statistics
        )
        logger.info("Content list insertion completed!")

        # Example queries - demonstrating different query approaches
        logger.info("\nQuerying inserted content:")

        # 1. Pure text queries using aquery()
        text_queries = [
            "What is RAGAnything and what are its main features?",
            "How does RAGAnything compare to traditional RAG systems?",
            "What are the technical specifications of the system?",
        ]

        for query in text_queries:
            logger.info(f"\n[Text Query]: {query}")
            result = await rag.aquery(query, mode="hybrid")
            logger.info(f"Answer: {result}")

        # 2. Multimodal query with specific multimodal content using aquery_with_multimodal()
        logger.info(
            "\n[Multimodal Query]: Analyzing new performance data against existing benchmarks"
        )
        multimodal_result = await rag.aquery_with_multimodal(
            "Compare this new performance data with the existing benchmark results in the documentation",
            multimodal_content=[
                {
                    "type": "table",
                    "table_data": """Method,Accuracy,Speed,Memory
New_Approach,97.1%,110ms,1.9GB
Enhanced_RAG,91.4%,140ms,2.5GB""",
                    "table_caption": "Latest experimental results",
                }
            ],
            mode="hybrid",
        )
        logger.info(f"Answer: {multimodal_result}")

        # 3. Another multimodal query with equation content
        logger.info("\n[Multimodal Query]: Mathematical formula analysis")
        equation_result = await rag.aquery_with_multimodal(
            "How does this similarity formula relate to the relevance scoring mentioned in the documentation?",
            multimodal_content=[
                {
                    "type": "equation",
                    "latex": "sim(a, b) = \\frac{a \\cdot b}{||a|| \\times ||b||} + \\beta \\cdot context\\_weight",
                    "equation_caption": "Enhanced cosine similarity with context weighting",
                }
            ],
            mode="hybrid",
        )
        logger.info(f"Answer: {equation_result}")

        # 4. Insert another content list with different document ID
        logger.info("\nInserting additional content list...")
        additional_content = [
            {
                "type": "text",
                "text": "This is additional documentation about advanced features and configuration options.",
                "page_idx": 0,  # Page number where this content appears
            },
            {
                "type": "table",
                "table_body": """| Configuration | Default Value | Range |
|---------------|---------------|-------|
| Chunk Size | 512 tokens | 128-2048 |
| Context Window | 4096 tokens | 1024-8192 |
| Batch Size | 32 | 1-128 |""",
                "table_caption": ["Advanced Configuration Parameters"],
                "page_idx": 1,  # Page number where this table appears
            },
        ]

        await rag.insert_content_list(
            content_list=additional_content,
            file_path="advanced_configuration.pdf",
            doc_id="demo-doc-002",  # Different document ID
        )

        # Query combined knowledge base
        logger.info("\n[Combined Query]: What configuration options are available?")
        combined_result = await rag.aquery(
            "What configuration options are available and what are their default values?",
            mode="hybrid",
        )
        logger.info(f"Answer: {combined_result}")

    except Exception as e:
        logger.error(f"Error in content list insertion demo: {str(e)}")
        import traceback

        logger.error(traceback.format_exc())
| 374 |
+
|
| 375 |
+
def main():
    """Parse CLI arguments and launch the content-list insertion demo."""
    arg_parser = argparse.ArgumentParser(description="Insert Content List Example")
    arg_parser.add_argument(
        "--working_dir", "-w", default="./rag_storage", help="Working directory path"
    )
    arg_parser.add_argument(
        "--api-key",
        default=os.getenv("LLM_BINDING_API_KEY"),
        help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
    )
    arg_parser.add_argument(
        "--base-url",
        default=os.getenv("LLM_BINDING_HOST"),
        help="Optional base URL for API",
    )

    parsed = arg_parser.parse_args()

    # Guard clause: bail out early when no API key could be resolved
    # from either the environment or the command line.
    if not parsed.api_key:
        logger.error("Error: OpenAI API key is required")
        logger.error("Set api key environment variable or use --api-key option")
        return

    # Drive the async demo to completion.
    asyncio.run(
        demo_insert_content_list(parsed.api_key, parsed.base_url, parsed.working_dir)
    )
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
if __name__ == "__main__":
    # Set up logging before any output is produced.
    configure_logging()

    # Print a banner around the example title.
    separator = "=" * 45
    print("RAGAnything Insert Content List Example")
    print(separator)
    print("Demonstrating direct content list insertion without document parsing")
    print(separator)

    main()
|