Phonex commited on
Commit
167596f
·
0 Parent(s):

TheTruthSchool_RAG

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +68 -0
  2. .env.docker.example +15 -0
  3. .env.example +51 -0
  4. .gitignore +92 -0
  5. Dockerfile +178 -0
  6. LICENSE +21 -0
  7. README.md +269 -0
  8. backend/Dockerfile +44 -0
  9. backend/README.md +353 -0
  10. backend/__init__.py +7 -0
  11. backend/main.py +2078 -0
  12. backend/requirements.txt +28 -0
  13. backend/reranker.py +304 -0
  14. backend/url_fetcher.py +381 -0
  15. backend/web_search.py +295 -0
  16. docker-compose.yml +58 -0
  17. frontend/.env.example +2 -0
  18. frontend/Dockerfile +33 -0
  19. frontend/nginx.conf +46 -0
  20. frontend/package-lock.json +0 -0
  21. frontend/package.json +43 -0
  22. frontend/postcss.config.js +6 -0
  23. frontend/public/index.html +17 -0
  24. frontend/src/App.js +1268 -0
  25. frontend/src/index.css +79 -0
  26. frontend/src/index.js +11 -0
  27. frontend/tailwind.config.js +27 -0
  28. rag_anything_smaranika/.github/ISSUE_TEMPLATE/bug_report.yml +61 -0
  29. rag_anything_smaranika/.github/ISSUE_TEMPLATE/config.yml +1 -0
  30. rag_anything_smaranika/.github/ISSUE_TEMPLATE/feature_request.yml +26 -0
  31. rag_anything_smaranika/.github/ISSUE_TEMPLATE/question.yml +26 -0
  32. rag_anything_smaranika/.github/dependabot.yml +11 -0
  33. rag_anything_smaranika/.github/pull_request_template.md +32 -0
  34. rag_anything_smaranika/.github/workflows/linting.yaml +30 -0
  35. rag_anything_smaranika/.github/workflows/pypi-publish.yml +52 -0
  36. rag_anything_smaranika/.gitignore +79 -0
  37. rag_anything_smaranika/.pre-commit-config.yaml +28 -0
  38. rag_anything_smaranika/LICENSE +21 -0
  39. rag_anything_smaranika/MANIFEST.in +9 -0
  40. rag_anything_smaranika/README.md +1260 -0
  41. rag_anything_smaranika/README_zh.md +1258 -0
  42. rag_anything_smaranika/docs/batch_processing.md +341 -0
  43. rag_anything_smaranika/docs/context_aware_processing.md +375 -0
  44. rag_anything_smaranika/docs/enhanced_markdown.md +552 -0
  45. rag_anything_smaranika/env.example +192 -0
  46. rag_anything_smaranika/examples/batch_processing_example.py +561 -0
  47. rag_anything_smaranika/examples/batch_processing_optimized_example.py +216 -0
  48. rag_anything_smaranika/examples/enhanced_markdown_example.py +1055 -0
  49. rag_anything_smaranika/examples/image_format_test.py +234 -0
  50. rag_anything_smaranika/examples/insert_content_list_example.py +419 -0
.dockerignore ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ *.egg
11
+ .pytest_cache/
12
+ .mypy_cache/
13
+ .ruff_cache/
14
+
15
+ # Virtual Environments
16
+ venv/
17
+ env/
18
+ ENV/
19
+ .venv
20
+
21
+ # IDEs
22
+ .vscode/
23
+ .idea/
24
+ *.swp
25
+ *.swo
26
+ *~
27
+
28
+ # OS
29
+ .DS_Store
30
+ Thumbs.db
31
+
32
+ # Git
33
+ .git/
34
+ .gitignore
35
+ .gitattributes
36
+
37
+ # Documentation
38
+ *.md
39
+ !README.md
40
+
41
+ # Logs
42
+ *.log
43
+ logs/
44
+
45
+ # Environment files (will be passed via docker-compose)
46
+ .env
47
+ .env.*
48
+
49
+ # Storage and uploads (will be mounted as volumes)
50
+ storage/
51
+ uploads/
52
+ backend/output/
53
+
54
+ # Frontend
55
+ frontend/node_modules/
56
+ frontend/build/
57
+ frontend/.env
58
+ frontend/.env.local
59
+
60
+ # Temporary files
61
+ tmp/
62
+ temp/
63
+ *.tmp
64
+
65
+ # Docker
66
+ Dockerfile
67
+ docker-compose*.yml
68
+ .dockerignore
.env.docker.example ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ GEMINI_API_KEY=your_gemini_api_key_here
3
+
4
+ GEMINI_TEXT_MODEL=models/gemini-flash-latest
5
+ GEMINI_VERIFIER_MODEL=models/gemini-pro-latest
6
+ GEMINI_VISION_MODEL=models/gemini-flash-latest
7
+ GEMINI_EMBEDDING_MODEL=models/text-embedding-004
8
+
9
+ TAVILY_API_KEY=your_tavily_api_key_here
10
+
11
+ REACT_APP_BACKEND_URL=http://localhost:8000
12
+
13
+ BACKEND_PORT=8000
14
+
15
+ FRONTEND_PORT=3000
.env.example ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Agentic RAG System - Environment Variables
2
+ # Copy this file to .env and fill in your actual API keys
3
+
4
+ # ============================================
5
+ # Required: Google Gemini API Key
6
+ # ============================================
7
+ # Get your free API key from: https://makersuite.google.com/app/apikey
8
+ GEMINI_API_KEY=your_gemini_api_key_here
9
+
10
+ # ============================================
11
+ # Gemini Model Configuration (Optional)
12
+ # ============================================
13
+ # Text generation model (fast responses)
14
+ GEMINI_TEXT_MODEL=models/gemini-flash-latest
15
+
16
+ # Verification model (quality checking)
17
+ GEMINI_VERIFIER_MODEL=models/gemini-pro-latest
18
+
19
+ # Vision model (image processing)
20
+ GEMINI_VISION_MODEL=models/gemini-flash-latest
21
+
22
+ # Embedding model (vector embeddings)
23
+ GEMINI_EMBEDDING_MODEL=models/text-embedding-004
24
+
25
+ # ============================================
26
+ # Optional: Tavily API Key (Web Search)
27
+ # ============================================
28
+ # Get your free API key from: https://tavily.com
29
+ # Leave empty to disable web search features
30
+ TAVILY_API_KEY=your_tavily_api_key_here
31
+
32
+ # ============================================
33
+ # Application Configuration
34
+ # ============================================
35
+ # Backend API URL (used by frontend)
36
+ REACT_APP_BACKEND_URL=http://localhost:8000
37
+
38
+ # Backend port
39
+ BACKEND_PORT=8000
40
+
41
+ # Frontend port
42
+ FRONTEND_PORT=3000
43
+
44
+ # ============================================
45
+ # Hugging Face Space Configuration
46
+ # ============================================
47
+ # When deploying to Hugging Face Spaces:
48
+ # 1. Go to your Space settings
49
+ # 2. Add secrets for GEMINI_API_KEY and TAVILY_API_KEY
50
+ # 3. The Dockerfile will use port 7860 automatically
51
+ # 4. REACT_APP_BACKEND_URL will be set to /api automatically
.gitignore ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dependencies
2
+ node_modules/
3
+ frontend/node_modules/
4
+ backend/__pycache__/
5
+ venv/
6
+ __pycache__/
7
+ *.pyc
8
+ *.pyo
9
+ *.pyd
10
+ .Python
11
+ pip-log.txt
12
+ pip-delete-this-directory.txt
13
+
14
+ # Build outputs
15
+ frontend/build/
16
+ dist/
17
+ build/
18
+ *.egg-info/
19
+ .eggs/
20
+
21
+ # Cache
22
+ frontend/node_modules/.cache/
23
+ .cache/
24
+ .pytest_cache/
25
+ .mypy_cache/
26
+
27
+ # Environment variables
28
+ .env
29
+ .env.local
30
+ .env.*.local
31
+ *.env
32
+ backend/.env
33
+ frontend/.env
34
+
35
+ # IDE
36
+ .vscode/
37
+ .idea/
38
+ *.swp
39
+ *.swo
40
+ *~
41
+
42
+ # OS
43
+ .DS_Store
44
+ Thumbs.db
45
+ *.log
46
+
47
+ # Output files
48
+ backend/output/
49
+ *.out
50
+
51
+ # Storage (runtime data - don't commit to git)
52
+ storage/
53
+ uploads/
54
+ rag_anything_smaranika/__pycache__/
55
+
56
+ # Logs
57
+ logs/
58
+ *.log
59
+ npm-debug.log*
60
+ yarn-debug.log*
61
+ yarn-error.log*
62
+
63
+ # Test coverage
64
+ htmlcov/
65
+ .coverage
66
+ .coverage.*
67
+ coverage.xml
68
+ *.cover
69
+
70
+ # Jupyter Notebook
71
+ .ipynb_checkpoints
72
+
73
+ # Docker
74
+ *.pid
75
+ .docker/
76
+
77
+ # Temporary files
78
+ tmp/
79
+ temp/
80
+ *.tmp
81
+
82
+ # Binary assets (use Git LFS if needed)
83
+ *.png
84
+ *.jpg
85
+ *.jpeg
86
+ *.gif
87
+ *.svg
88
+ *.ico
89
+
90
+ # Local LightRAG package in vendor directory
91
+ vendor/lightrag/__pycache__/
92
+ vendor/**/__pycache__/
Dockerfile ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ FROM python:3.12-slim
3
+
4
+
5
+ WORKDIR /app
6
+
7
+
8
+ RUN apt-get update && apt-get install -y \
9
+ curl \
10
+ wget \
11
+ git \
12
+ build-essential \
13
+ nginx \
14
+ nodejs \
15
+ npm \
16
+ && rm -rf /var/lib/apt/lists/*
17
+
18
+
19
+ COPY backend/requirements.txt /app/backend/requirements.txt
20
+ RUN pip install --no-cache-dir --use-pep517 -r /app/backend/requirements.txt
21
+
22
+ # Copy local modified LightRAG package in vendor directory
23
+ COPY vendor/ /app/vendor/
24
+
25
+ COPY backend/ /app/backend/
26
+ COPY rag_anything_smaranika/ /app/rag_anything_smaranika/
27
+
28
+
29
+ COPY frontend/ /app/frontend/
30
+
31
+
32
+ WORKDIR /app/frontend
33
+ RUN npm install
34
+ RUN REACT_APP_API_URL=/api npm run build
35
+
36
+
37
+
38
+ RUN mkdir -p /var/lib/nginx/body /var/lib/nginx/fastcgi \
39
+ /var/lib/nginx/proxy /var/lib/nginx/scgi /var/lib/nginx/uwsgi \
40
+ /var/log/nginx /var/cache/nginx && \
41
+ chmod -R 777 /var/lib/nginx /var/log/nginx /var/cache/nginx && \
42
+ touch /var/run/nginx.pid && chmod 666 /var/run/nginx.pid
43
+
44
+
45
+ RUN echo 'pid /tmp/nginx.pid;\n\
46
+ error_log /var/log/nginx/error.log;\n\
47
+ events {\n\
48
+ worker_connections 1024;\n\
49
+ }\n\
50
+ http {\n\
51
+ include /etc/nginx/mime.types;\n\
52
+ default_type application/octet-stream;\n\
53
+ access_log /var/log/nginx/access.log;\n\
54
+ client_body_temp_path /tmp/client_body;\n\
55
+ proxy_temp_path /tmp/proxy;\n\
56
+ fastcgi_temp_path /tmp/fastcgi;\n\
57
+ uwsgi_temp_path /tmp/uwsgi;\n\
58
+ scgi_temp_path /tmp/scgi;\n\
59
+ \n\
60
+ server {\n\
61
+ listen 7860;\n\
62
+ server_name _;\n\
63
+ \n\
64
+ location / {\n\
65
+ root /app/frontend/build;\n\
66
+ try_files $uri $uri/ /index.html;\n\
67
+ }\n\
68
+ \n\
69
+ location /api/ {\n\
70
+ proxy_pass http://127.0.0.1:8000/;\n\
71
+ proxy_http_version 1.1;\n\
72
+ proxy_set_header Upgrade $http_upgrade;\n\
73
+ proxy_set_header Connection "upgrade";\n\
74
+ proxy_set_header Host $host;\n\
75
+ proxy_set_header X-Real-IP $remote_addr;\n\
76
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n\
77
+ proxy_set_header X-Forwarded-Proto $scheme;\n\
78
+ proxy_buffering off;\n\
79
+ proxy_cache_bypass $http_upgrade;\n\
80
+ }\n\
81
+ \n\
82
+ location /health {\n\
83
+ proxy_pass http://127.0.0.1:8000/health;\n\
84
+ proxy_http_version 1.1;\n\
85
+ proxy_set_header Host $host;\n\
86
+ }\n\
87
+ }\n\
88
+ }' > /etc/nginx/nginx.conf
89
+
90
+
91
+ RUN mkdir -p /app/storage /app/uploads /app/backend/output /app/output /app/.cache/huggingface && \
92
+ chmod -R 777 /app/storage /app/uploads /app/backend/output /app/output /app/.cache
93
+
94
+ WORKDIR /app
95
+
96
+ WORKDIR /app/rag_anything_smaranika
97
+ RUN pip install --no-cache-dir -e .
98
+
99
+ WORKDIR /app
100
+
101
+ RUN mkdir -p /app/storage/medical /app/storage/legal /app/storage/financial \
102
+ /app/storage/technical /app/storage/academic && \
103
+ chmod -R 777 /app/storage
104
+
105
+ # Create output directory in the working directory for the parser
106
+ RUN mkdir -p /app/output && chmod -R 777 /app/output
107
+
108
+
109
+ RUN echo '#!/bin/bash\n\
110
+ set -e\n\
111
+ \n\
112
+ echo "===== Application Startup at $(date +"%Y-%m-%d %H:%M:%S") ====="\n\
113
+ echo ""\n\
114
+ echo "Starting Agentic RAG System for Hugging Face Space..."\n\
115
+ \n\
116
+ # Check for required environment variables\n\
117
+ if [ -z "$GEMINI_API_KEY" ]; then\n\
118
+ echo "ERROR: GEMINI_API_KEY environment variable is not set!"\n\
119
+ echo "Please set it in your Hugging Face Space settings."\n\
120
+ exit 1\n\
121
+ fi\n\
122
+ \n\
123
+ # Start backend in background\n\
124
+ echo "Starting FastAPI backend on port 8000..."\n\
125
+ cd /app\n\
126
+ export PYTHONPATH=/app:/app/vendor:$PYTHONPATH\n\
127
+ python -m uvicorn backend.main:app --host 127.0.0.1 --port 8000 --log-level info &\n\
128
+ BACKEND_PID=$!\n\
129
+ \n\
130
+ # Wait for backend to be ready\n\
131
+ echo "Waiting for backend to be ready..."\n\
132
+ for i in {1..30}; do\n\
133
+ if curl -s http://127.0.0.1:8000/health > /dev/null 2>&1; then\n\
134
+ echo "Backend is ready!"\n\
135
+ break\n\
136
+ fi\n\
137
+ echo "Waiting for backend... ($i/30)"\n\
138
+ sleep 2\n\
139
+ done\n\
140
+ \n\
141
+ # Start nginx\n\
142
+ echo "Starting nginx on port 7860..."\n\
143
+ nginx -g "daemon off;" &\n\
144
+ NGINX_PID=$!\n\
145
+ \n\
146
+ echo ""\n\
147
+ echo "==========================================="\n\
148
+ echo "Agentic RAG System is running!"\n\
149
+ echo "Backend: http://localhost:8000"\n\
150
+ echo "Frontend: http://localhost:7860"\n\
151
+ echo "API Docs: http://localhost:8000/docs"\n\
152
+ echo "==========================================="\n\
153
+ echo ""\n\
154
+ \n\
155
+ # Wait for both processes\n\
156
+ wait $BACKEND_PID $NGINX_PID\n\
157
+ ' > /app/start.sh && chmod +x /app/start.sh
158
+
159
+
160
+ EXPOSE 7860
161
+
162
+
163
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
164
+ CMD curl -f http://localhost:7860/health || exit 1
165
+
166
+ ENV PYTHONUNBUFFERED=1
167
+ ENV PYTHONPATH=/app:/app/vendor:$PYTHONPATH
168
+ ENV BACKEND_PORT=8000
169
+ ENV FRONTEND_PORT=7860
170
+ ENV HF_HOME=/app/.cache/huggingface
171
+ ENV TRANSFORMERS_CACHE=/app/.cache/huggingface
172
+ ENV HF_DATASETS_CACHE=/app/.cache/huggingface/datasets
173
+ ENV PYTHONDONTWRITEBYTECODE=1
174
+ ENV PYTHONHASHSEED=0
175
+ ENV PYTHONOPTIMIZE=0
176
+
177
+ # Start the application
178
+ CMD ["/app/start.sh"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Agentic RAG System Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Agentic RAG System
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: mit
10
+ ---
11
+
12
+ # 🤖 Agentic RAG System
13
+
14
+ A production-ready **Retrieval-Augmented Generation (RAG)** system with multi-domain support, advanced AI features, and intelligent web search integration. Built with FastAPI, React, and Google Gemini API.
15
+
16
+ ## ✨ Features
17
+
18
+ ### 🎯 Multi-Domain Support
19
+ - **Medical & Healthcare**: Medical documents, research papers, clinical guidelines
20
+ - **Legal & Compliance**: Legal documents, contracts, regulations, case law
21
+ - **Financial & Analytics**: Financial reports, analysis, market research
22
+ - **Technical Documentation**: Technical docs, APIs, code, architecture
23
+ - **Academic Research**: Research papers, academic publications, studies
24
+
25
+ ### 🚀 Advanced AI Capabilities
26
+ - **Query Improvement**: Automatic query enhancement with abbreviation expansion
27
+ - **Dual-LLM Verification**: Two-stage answer verification using Gemini Pro
28
+ - **Web Search Integration**: Augment answers with real-time web search via Tavily
29
+ - **Conversation Memory**: Context-aware responses with conversation history
30
+ - **Multimodal Processing**: Support for images, tables, and equations (MinerU parser)
31
+ - **Smart Reranking**: Gemini-powered relevance reranking for better results
32
+ - **Streaming Responses**: Real-time token streaming for responsive UX
33
+
34
+ ### 🔧 Technical Features
35
+ - **Gemini API Integration**: Free-tier Gemini Flash & Pro models
36
+ - **Async Processing**: Background document processing with status tracking
37
+ - **RESTful API**: Clean, well-documented FastAPI endpoints
38
+ - **Modern React Frontend**: Beautiful, responsive UI with Tailwind CSS
39
+ - **Docker Support**: One-command deployment with docker-compose
40
+ - **Performance Optimized**: Query caching, fast mode (2-3x speedup), batch processing
41
+
42
+ ## 🚀 Quick Start (Docker)
43
+
44
+ ### Prerequisites
45
+ - Docker and Docker Compose
46
+ - Google Gemini API Key ([Get one free](https://makersuite.google.com/app/apikey))
47
+ - (Optional) Tavily API Key for web search ([Get one free](https://tavily.com))
48
+
49
+ ### 1. Clone the Repository
50
+ ```bash
51
+ git clone <your-repo-url>
52
+ cd Agentic_RAG
53
+ ```
54
+
55
+ ### 2. Set Up Environment Variables
56
+ Create a `.env` file in the project root:
57
+ ```bash
58
+ GEMINI_API_KEY=your_gemini_api_key_here
59
+ GEMINI_TEXT_MODEL=models/gemini-flash-latest
60
+ GEMINI_VERIFIER_MODEL=models/gemini-pro-latest
61
+ GEMINI_VISION_MODEL=models/gemini-flash-latest
62
+ GEMINI_EMBEDDING_MODEL=models/text-embedding-004
63
+ TAVILY_API_KEY=your_tavily_api_key_here # Optional, for web search
64
+ ```
65
+
66
+ ### 3. Start the Application
67
+ ```bash
68
+ docker-compose up -d
69
+ ```
70
+
71
+ ### 4. Access the Application
72
+ - **Frontend**: http://localhost:3000
73
+ - **Backend API**: http://localhost:8000
74
+ - **API Docs**: http://localhost:8000/docs
75
+
76
+ ## 📖 Usage
77
+
78
+ ### Upload Documents
79
+ 1. Navigate to the frontend at http://localhost:3000
80
+ 2. Select a domain (Medical, Legal, Financial, Technical, or Academic)
81
+ 3. Upload PDF, DOCX, TXT, or other supported documents
82
+ 4. Wait for processing to complete (tracked with real-time status)
83
+
84
+ ### Query Documents
85
+ 1. Enter your question in the query interface
86
+ 2. Select query mode:
87
+ - **Mix**: Balanced combination of local and global search (recommended)
88
+ - **Local**: Focused chunk-based search
89
+ - **Global**: Knowledge graph entity search
90
+ - **Hybrid**: Advanced combination
91
+ - **Web**: RAG + real-time web search
92
+ 3. Toggle advanced features:
93
+ - **Query Improvement**: Enhance your query automatically
94
+ - **Verification**: Dual-LLM quality check
95
+ - **Web Search**: Augment with real-time web results
96
+ - **Fast Mode**: 2-3x faster queries (slightly lower quality)
97
+ 4. Get streaming responses with sources and confidence scores
98
+
99
+ ### API Usage
100
+ ```python
101
+ import requests
102
+
103
+ # Upload document
104
+ files = {"file": open("document.pdf", "rb")}
105
+ data = {"domain": "medical"}
106
+ response = requests.post("http://localhost:8000/upload", files=files, data=data)
107
+ print(response.json())
108
+
109
+ # Query documents
110
+ query_data = {
111
+ "query": "What are the treatment options for hypertension?",
112
+ "domain": "medical",
113
+ "mode": "mix",
114
+ "enable_web_search": False,
115
+ "fast_mode": False,
116
+ "return_metadata": True
117
+ }
118
+ response = requests.post("http://localhost:8000/query", json=query_data)
119
+ print(response.json())
120
+ ```
121
+
122
+ ## 🏗️ Architecture
123
+
124
+ ```
125
+ Agentic_RAG/
126
+ ├── backend/ # FastAPI backend
127
+ │ ├── main.py # Main API server
128
+ │ ├── reranker.py # Gemini-powered reranking
129
+ │ ├── web_search.py # Tavily web search integration
130
+ │ ├── url_fetcher.py # URL content fetching
131
+ │ ├── requirements.txt # Python dependencies
132
+ │ └── Dockerfile # Backend container
133
+ ├── frontend/ # React frontend
134
+ │ ├── src/ # React components
135
+ │ ├── public/ # Static assets
136
+ │ ├── package.json # Node dependencies
137
+ │ ├── Dockerfile # Frontend container
138
+ │ └── nginx.conf # Nginx configuration
139
+ ├── storage/ # RAG storage (created at runtime)
140
+ │ ├── medical/ # Medical domain storage
141
+ │ ├── legal/ # Legal domain storage
142
+ │ └── ... # Other domains
143
+ ├── uploads/ # Uploaded documents
144
+ ├── docker-compose.yml # Docker orchestration
145
+ ├── Dockerfile # Hugging Face Space Dockerfile
146
+ └── README.md # This file
147
+ ```
148
+
149
+ ## 🔑 Environment Variables
150
+
151
+ | Variable | Description | Required | Default |
152
+ |----------|-------------|----------|---------|
153
+ | `GEMINI_API_KEY` | Google Gemini API key | Yes | - |
154
+ | `GEMINI_TEXT_MODEL` | Text generation model | No | `models/gemini-flash-latest` |
155
+ | `GEMINI_VERIFIER_MODEL` | Verification model | No | `models/gemini-pro-latest` |
156
+ | `GEMINI_VISION_MODEL` | Vision processing model | No | `models/gemini-flash-latest` |
157
+ | `GEMINI_EMBEDDING_MODEL` | Embedding model | No | `models/text-embedding-004` |
158
+ | `TAVILY_API_KEY` | Tavily web search API key | No | - |
159
+
160
+ ## 📊 API Endpoints
161
+
162
+ ### Health Check
163
+ ```bash
164
+ GET /health
165
+ ```
166
+
167
+ ### List Domains
168
+ ```bash
169
+ GET /domains
170
+ ```
171
+
172
+ ### Upload Document
173
+ ```bash
174
+ POST /upload
175
+ Content-Type: multipart/form-data
176
+
177
+ file: <document file>
178
+ domain: medical
179
+ ```
180
+
181
+ ### Query Documents (Streaming)
182
+ ```bash
183
+ POST /query/stream
184
+ Content-Type: application/json
185
+
186
+ {
187
+ "query": "What are the treatment options?",
188
+ "domain": "medical",
189
+ "mode": "mix",
190
+ "enable_web_search": false,
191
+ "fast_mode": false
192
+ }
193
+ ```
194
+
195
+ ### Query Documents (Standard)
196
+ ```bash
197
+ POST /query
198
+ Content-Type: application/json
199
+
200
+ {
201
+ "query": "What are the treatment options?",
202
+ "domain": "medical",
203
+ "mode": "mix"
204
+ }
205
+ ```
206
+
207
+ ### Check Processing Status
208
+ ```bash
209
+ GET /status/{processing_id}
210
+ ```
211
+
212
+ ### List Documents
213
+ ```bash
214
+ GET /documents?domain=medical
215
+ ```
216
+
217
+ ### Delete Document
218
+ ```bash
219
+ DELETE /documents/{doc_id}
220
+ ```
221
+
222
+ ## 🎯 Performance
223
+
224
+ - **Fast Mode**: 2-3x faster queries with optimized parameters
225
+ - **Query Caching**: 5-minute TTL cache for repeated queries
226
+ - **Batch Processing**: Parallel document processing (up to 10 documents)
227
+ - **Streaming**: Real-time token streaming for responsive UX
228
+ - **Reranking**: Gemini-powered relevance scoring
229
+
230
+ ## 🛠️ Development
231
+
232
+ ### Backend Development
233
+ ```bash
234
+ cd backend
235
+ pip install -r requirements.txt
236
+ python main.py
237
+ ```
238
+
239
+ ### Frontend Development
240
+ ```bash
241
+ cd frontend
242
+ npm install
243
+ npm start
244
+ ```
245
+
246
+ ## 🤝 Contributing
247
+
248
+ Contributions are welcome! Please feel free to submit a Pull Request.
249
+
250
+ ## 📝 License
251
+
252
+ This project is licensed under the MIT License - see the LICENSE file for details.
253
+
254
+ ## 🙏 Acknowledgments
255
+
256
+ - [LightRAG](https://github.com/HKUDS/LightRAG) - RAG framework
257
+ - [Google Gemini](https://ai.google.dev/) - LLM and embeddings
258
+ - [Tavily](https://tavily.com/) - Web search API
259
+ - [MinerU](https://github.com/opendatalab/MinerU) - Document parsing
260
+ - [FastAPI](https://fastapi.tiangolo.com/) - Backend framework
261
+ - [React](https://react.dev/) - Frontend framework
262
+
263
+ ## 📧 Support
264
+
265
+ For issues and questions, please open an issue on GitHub.
266
+
267
+ ---
268
+
269
+ Built with ❤️ using FastAPI, React, and Google Gemini
backend/Dockerfile ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ # Set working directory
4
+ WORKDIR /app
5
+
6
+ # Install system dependencies
7
+ RUN apt-get update && apt-get install -y \
8
+ build-essential \
9
+ curl \
10
+ git \
11
+ wget \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Copy backend requirements first for better caching
15
+ COPY backend/requirements.txt /app/backend/requirements.txt
16
+
17
+ # Install Python dependencies
18
+ RUN pip install --no-cache-dir -r /app/backend/requirements.txt
19
+
20
+ # Copy rag_anything_smaranika (contains raganything module)
21
+ COPY rag_anything_smaranika /app/rag_anything_smaranika
22
+
23
+ # Install raganything as an editable package
24
+ RUN pip install -e /app/rag_anything_smaranika
25
+
26
+ # Copy backend application code
27
+ COPY backend /app/backend
28
+
29
+ # Create necessary directories
30
+ RUN mkdir -p /app/storage /app/uploads /app/backend/output
31
+
32
+ # Set environment variables
33
+ ENV PYTHONUNBUFFERED=1
34
+ ENV PYTHONPATH=/app
35
+
36
+ # Expose port
37
+ EXPOSE 8000
38
+
39
+ # Health check
40
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
41
+ CMD curl -f http://localhost:8000/health || exit 1
42
+
43
+ # Run the application
44
+ CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]
backend/README.md ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Enhanced RAG-Anything Backend API
2
+
3
+ Production-ready FastAPI backend for the RAG-Anything system with multi-domain support and advanced AI features.
4
+
5
+ ## Features
6
+
7
+ ### 🎯 Multi-Domain Support
8
+ - **Medical & Healthcare**: Medical documents, research papers, clinical guidelines
9
+ - **Legal & Compliance**: Legal documents, contracts, regulations, case law
10
+ - **Financial & Analytics**: Financial reports, analysis, market research
11
+ - **Technical Documentation**: Technical docs, APIs, code, architecture
12
+ - **Academic Research**: Research papers, academic publications, studies
13
+
14
+ ### 🚀 Advanced AI Capabilities
15
+ - **Query Improvement**: Automatic query enhancement with abbreviation expansion
16
+ - **Dual-LLM Verification**: Two-stage answer verification for quality assurance
17
+ - **Conversation Memory**: Context-aware responses with conversation history
18
+ - **Multimodal Processing**: Support for images, tables, and equations
19
+ - **Domain-Specific Prompts**: Optimized prompts for each domain
20
+
21
+ ### 🔧 Technical Features
22
+ - **Gemini API Integration**: Free-tier Gemini 1.5 Flash model
23
+ - **Async Processing**: Background document processing
24
+ - **RESTful API**: Clean, well-documented endpoints
25
+ - **CORS Support**: Cross-origin resource sharing enabled
26
+ - **Error Handling**: Comprehensive error handling and logging
27
+
28
+ ## Installation
29
+
30
+ ### Prerequisites
31
+ - Python 3.9+
32
+ - Gemini API Key ([Get one here](https://makersuite.google.com/app/apikey))
33
+
34
+ ### Setup
35
+
36
+ 1. **Clone the repository**
37
+ ```bash
38
+ cd /mnt/data/Agentic_RAG/backend
39
+ ```
40
+
41
+ 2. **Install dependencies**
42
+ ```bash
43
+ pip install -r requirements.txt
44
+ ```
45
+
46
+ 3. **Set up environment variables**
47
+ ```bash
48
+ export GEMINI_API_KEY="your-api-key-here"
49
+ ```
50
+
51
+ Or create a `.env` file:
52
+ ```env
53
+ GEMINI_API_KEY=your-api-key-here
54
+ ```
55
+
56
+ 4. **Run the server**
57
+ ```bash
58
+ python main.py
59
+ ```
60
+
61
+ Or using uvicorn directly:
62
+ ```bash
63
+ uvicorn main:app --host 0.0.0.0 --port 8000 --reload
64
+ ```
65
+
66
+ ## API Endpoints
67
+
68
+ ### Health Check
69
+ ```bash
70
+ GET /health
71
+ ```
72
+
73
+ Response:
74
+ ```json
75
+ {
76
+ "status": "healthy",
77
+ "timestamp": "2025-01-04T10:00:00",
78
+ "version": "1.0.0",
79
+ "features": {
80
+ "query_improvement": true,
81
+ "dual_llm_verification": true,
82
+ "conversation_memory": true,
83
+ "multi_domain": true,
84
+ "multimodal_processing": true,
85
+ "gemini_integration": true
86
+ },
87
+ "domains": ["medical", "legal", "financial", "technical", "academic"]
88
+ }
89
+ ```
90
+
91
+ ### List Domains
92
+ ```bash
93
+ GET /domains
94
+ ```
95
+
96
+ ### Upload Document
97
+ ```bash
98
+ POST /upload
99
+ Content-Type: multipart/form-data
100
+
101
+ file: <document file>
102
+ domain: medical
103
+ ```
104
+
105
+ Response:
106
+ ```json
107
+ {
108
+ "success": true,
109
+ "message": "Document uploaded and queued for processing",
110
+ "file_name": "research_paper.pdf",
111
+ "domain": "medical",
112
+ "processing_id": "uuid-here"
113
+ }
114
+ ```
115
+
116
+ ### Query Documents
117
+ ```bash
118
+ POST /query
119
+ Content-Type: application/json
120
+
121
+ {
122
+ "query": "What are the treatment options for hypertension?",
123
+ "domain": "medical",
124
+ "mode": "mix",
125
+ "conversation_id": "conv_123",
126
+ "return_metadata": true
127
+ }
128
+ ```
129
+
130
+ Response:
131
+ ```json
132
+ {
133
+ "answer": "Hypertension treatment includes lifestyle modifications...",
134
+ "sources": ["medical_guidelines.pdf"],
135
+ "confidence_score": 0.92,
136
+ "query_improved": true,
137
+ "verification_performed": true,
138
+ "conversation_id": "conv_123",
139
+ "metadata": {
140
+ "original_query": "What is HTN treatment?",
141
+ "improved_query": "What are the treatment options for hypertension?",
142
+ "verification_score": 8.5,
143
+ "modification_attempts": 1
144
+ }
145
+ }
146
+ ```
147
+
148
+ ### Get Conversation History
149
+ ```bash
150
+ GET /conversation/{conversation_id}
151
+ ```
152
+
153
+ ### Clear Conversation
154
+ ```bash
155
+ DELETE /conversation/{conversation_id}
156
+ ```
157
+
158
+ ### Clear Domain Data
159
+ ```bash
160
+ DELETE /clear/{domain}
161
+ ```
162
+
163
+ ## Usage Examples
164
+
165
+ ### Using cURL
166
+
167
+ **Upload a document:**
168
+ ```bash
169
+ curl -X POST "http://localhost:8000/upload" \
170
+ -F "file=@medical_paper.pdf" \
171
+ -F "domain=medical"
172
+ ```
173
+
174
+ **Query documents:**
175
+ ```bash
176
+ curl -X POST "http://localhost:8000/query" \
177
+ -H "Content-Type: application/json" \
178
+ -d '{
179
+ "query": "What are the side effects of ACE inhibitors?",
180
+ "domain": "medical",
181
+ "mode": "mix",
182
+ "return_metadata": true
183
+ }'
184
+ ```
185
+
186
+ ### Using Python
187
+
188
+ ```python
189
+ import requests
190
+
191
+ # Upload document
192
+ with open("medical_paper.pdf", "rb") as f:
193
+ files = {"file": f}
194
+ data = {"domain": "medical"}
195
+ response = requests.post("http://localhost:8000/upload", files=files, data=data)
196
+ print(response.json())
197
+
198
+ # Query documents
199
+ query_data = {
200
+ "query": "What are the treatment options for hypertension?",
201
+ "domain": "medical",
202
+ "mode": "mix",
203
+ "return_metadata": True
204
+ }
205
+ response = requests.post("http://localhost:8000/query", json=query_data)
206
+ print(response.json())
207
+ ```
208
+
209
+ ### Using JavaScript/TypeScript
210
+
211
+ ```typescript
212
+ // Upload document
213
+ const formData = new FormData();
214
+ formData.append('file', fileInput.files[0]);
215
+ formData.append('domain', 'medical');
216
+
217
+ const uploadResponse = await fetch('http://localhost:8000/upload', {
218
+ method: 'POST',
219
+ body: formData
220
+ });
221
+
222
+ // Query documents
223
+ const queryResponse = await fetch('http://localhost:8000/query', {
224
+ method: 'POST',
225
+ headers: { 'Content-Type': 'application/json' },
226
+ body: JSON.stringify({
227
+ query: 'What are the treatment options for hypertension?',
228
+ domain: 'medical',
229
+ mode: 'mix',
230
+ return_metadata: true
231
+ })
232
+ });
233
+
234
+ const result = await queryResponse.json();
235
+ console.log(result);
236
+ ```
237
+
238
+ ## Configuration
239
+
240
+ ### Domain-Specific Settings
241
+
242
+ Each domain has customized settings in `DOMAIN_CONFIGS`:
243
+
244
+ ```python
245
+ {
246
+ "medical": {
247
+ "enable_query_improvement": True,
248
+ "query_improvement_method": "hybrid",
249
+ "expand_abbreviations": True,
250
+ "verification_threshold": 7.5,
251
+ # ... more settings
252
+ }
253
+ }
254
+ ```
255
+
256
+ ### Gemini Model Configuration
257
+
258
+ Currently using `gemini-1.5-flash` (free tier). To use a different model:
259
+
260
+ ```python
261
+ GEMINI_MODEL = "gemini-1.5-pro" # More capable, paid tier
262
+ ```
263
+
264
+ ## Architecture
265
+
266
+ ```
267
+ backend/
268
+ ├── main.py # FastAPI application
269
+ ├── requirements.txt # Python dependencies
270
+ └── README.md # This file
271
+
272
+ storage/ # Created at runtime
273
+ ├── medical/ # Medical domain storage
274
+ ├── legal/ # Legal domain storage
275
+ ├── financial/ # Financial domain storage
276
+ ├── technical/ # Technical domain storage
277
+ └── academic/ # Academic domain storage
278
+
279
+ uploads/ # Uploaded files
280
+ ├── medical/
281
+ ├── legal/
282
+ └── ...
283
+ ```
284
+
285
+ ## API Documentation
286
+
287
+ Interactive API documentation is available at:
288
+ - **Swagger UI**: http://localhost:8000/docs
289
+ - **ReDoc**: http://localhost:8000/redoc
290
+
291
+ ## Error Handling
292
+
293
+ The API uses standard HTTP status codes:
294
+
295
+ - `200`: Success
296
+ - `400`: Bad Request (invalid parameters)
297
+ - `404`: Not Found
298
+ - `500`: Internal Server Error
299
+
300
+ All errors return JSON:
301
+ ```json
302
+ {
303
+ "detail": "Error message here"
304
+ }
305
+ ```
306
+
307
+ ## Logging
308
+
309
+ Logs are output to console with the format:
310
+ ```
311
+ 2025-01-04 10:00:00 - main - INFO - Message here
312
+ ```
313
+
314
+ ## Production Deployment
315
+
316
+ For production deployment:
317
+
318
+ 1. **Set proper CORS origins** in `main.py`:
319
+ ```python
320
+ allow_origins=["https://your-frontend-domain.com"]
321
+ ```
322
+
323
+ 2. **Use a production ASGI server**:
324
+ ```bash
325
+ gunicorn main:app -w 4 -k uvicorn.workers.UvicornWorker
326
+ ```
327
+
328
+ 3. **Set up environment variables** securely (don't commit `.env` files)
329
+
330
+ 4. **Enable HTTPS** using a reverse proxy (nginx, Caddy, etc.)
331
+
332
+ 5. **Set up proper logging** (file-based, log rotation)
333
+
334
+ 6. **Monitor** with tools like Prometheus, Grafana
335
+
336
+ ## Troubleshooting
337
+
338
+ ### "GEMINI_API_KEY not set"
339
+ Set your API key as an environment variable or in a `.env` file.
340
+
341
+ ### "Failed to initialize RAG system"
342
+ Check that the storage directories are writable and all dependencies are installed.
343
+
344
+ ### "File type not supported"
345
+ Verify the file extension is in the allowed list for the target domain.
346
+
347
+ ## License
348
+
349
+ [Your License Here]
350
+
351
+ ## Support
352
+
353
+ For issues and questions, please open an issue on GitHub.
backend/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ Backend package for RAG-Anything API
3
+
4
+ This package contains the FastAPI backend and supporting modules.
5
+ """
6
+
7
+ __version__ = "1.1.0"
backend/main.py ADDED
@@ -0,0 +1,2078 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI Backend for Enhanced RAG-Anything System (v1.1 - Updated)
3
+
4
+ Production-ready backend with:
5
+ - Multi-domain support (medical, legal, financial, technical, academic)
6
+ - Gemini API integration (LLM, Vision, Embeddings)
7
+ - Query improvement and dual-LLM verification
8
+ - Conversation history management
9
+ - Document processing and querying
10
+ """
11
+
12
+ import os
13
+ import asyncio
14
+ import logging
15
+ from pathlib import Path
16
+ from typing import Dict, List, Optional, Any
17
+ from datetime import datetime
18
+ import uuid
19
+ import hashlib
20
+ import time
21
+ import json
22
+ from contextlib import asynccontextmanager
23
+ from dotenv import load_dotenv
24
+ from cachetools import TTLCache
25
+
26
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
27
+ from fastapi.responses import StreamingResponse
28
+ from fastapi.middleware.cors import CORSMiddleware
29
+ from pydantic import BaseModel, Field
30
+ import google.generativeai as genai
31
+
32
+ # Add project root to path for imports
33
+ import sys
34
+ sys.path.insert(0, str(Path(__file__).parent.parent))
35
+
36
+ # Load environment variables from .env file
37
+ load_dotenv(Path(__file__).parent / ".env")
38
+
39
+ from raganything.raganything import RAGAnything, RAGAnythingConfig, create_rag_anything
40
+ from backend.reranker import GeminiReranker
41
+ from backend.web_search import WebSearcher, create_web_searcher
42
+ from backend.url_fetcher import URLFetcher, create_url_fetcher
43
+
44
+ # Configure logging
45
+ logging.basicConfig(
46
+ level=logging.INFO,
47
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
48
+ )
49
+ logger = logging.getLogger(__name__)
50
+
51
+ # =============================================================================
52
+ # Domain Configurations
53
+ # =============================================================================
54
+
55
+ DOMAIN_CONFIGS = {
56
+ "medical": {
57
+ "name": "Medical & Healthcare",
58
+ "description": "Optimized for medical documents, research papers, clinical guidelines",
59
+ "system_prompt": (
60
+ "You are a medical AI assistant with expertise in healthcare, clinical medicine, "
61
+ "and medical research. Provide accurate, evidence-based responses with appropriate "
62
+ "medical terminology. Always cite sources and indicate confidence levels."
63
+ ),
64
+ "analysis_prompt": (
65
+ "Analyze this medical document focusing on: diagnoses, treatments, medications, "
66
+ "clinical findings, patient outcomes, and evidence-based recommendations."
67
+ ),
68
+ "file_extensions": [".pdf", ".doc", ".docx", ".txt", ".md", ".csv", ".xlsx"],
69
+ "config_overrides": {
70
+ "domain": "medical",
71
+ "enable_query_improvement": True,
72
+ "query_improvement_method": "hybrid",
73
+ "expand_abbreviations": True,
74
+ "add_domain_keywords": True,
75
+ "extract_query_entities": True,
76
+ "enable_dual_llm_verification": True,
77
+ "enable_answer_verification": True,
78
+ "enable_answer_modification": True,
79
+ "verification_threshold": 7.5,
80
+ "check_factual_consistency": True,
81
+ "check_completeness": True,
82
+ "check_relevance": True,
83
+ }
84
+ },
85
+ "legal": {
86
+ "name": "Legal & Compliance",
87
+ "description": "Specialized for legal documents, contracts, regulations, case law",
88
+ "system_prompt": (
89
+ "You are a legal AI assistant with expertise in law, regulations, and compliance. "
90
+ "Provide precise legal analysis with proper citations. Note that this is for "
91
+ "informational purposes only and not legal advice."
92
+ ),
93
+ "analysis_prompt": (
94
+ "Analyze this legal document focusing on: key provisions, obligations, rights, "
95
+ "legal precedents, regulatory requirements, and potential implications."
96
+ ),
97
+ "file_extensions": [".pdf", ".doc", ".docx", ".txt", ".csv", ".xlsx"],
98
+ "config_overrides": {
99
+ "domain": "legal",
100
+ "enable_query_improvement": True,
101
+ "query_improvement_method": "llm",
102
+ "expand_abbreviations": True,
103
+ "extract_query_entities": True,
104
+ "enable_dual_llm_verification": True,
105
+ "enable_answer_verification": True,
106
+ "enable_answer_modification": True,
107
+ "verification_threshold": 8.0,
108
+ "check_factual_consistency": True,
109
+ "check_completeness": True,
110
+ }
111
+ },
112
+ "financial": {
113
+ "name": "Financial & Analytics",
114
+ "description": "Tailored for financial reports, analysis, market research, forecasts",
115
+ "system_prompt": (
116
+ "You are a financial AI assistant with expertise in finance, accounting, and "
117
+ "market analysis. Provide data-driven insights with numerical precision. "
118
+ "Include relevant financial metrics and trends."
119
+ ),
120
+ "analysis_prompt": (
121
+ "Analyze this financial document focusing on: financial metrics, trends, "
122
+ "performance indicators, risk factors, market conditions, and forecasts."
123
+ ),
124
+ "file_extensions": [".pdf", ".xlsx", ".csv", ".doc", ".docx"],
125
+ "config_overrides": {
126
+ "domain": "financial",
127
+ "enable_query_improvement": True,
128
+ "query_improvement_method": "hybrid",
129
+ "expand_abbreviations": True,
130
+ "add_domain_keywords": True,
131
+ "enable_dual_llm_verification": True,
132
+ "enable_answer_verification": True,
133
+ "verification_threshold": 7.5,
134
+ "check_factual_consistency": True,
135
+ }
136
+ },
137
+ "technical": {
138
+ "name": "Technical Documentation",
139
+ "description": "Optimized for technical docs, APIs, code, system architecture",
140
+ "system_prompt": (
141
+ "You are a technical AI assistant with expertise in software development, "
142
+ "system architecture, and technical documentation. Provide clear, precise "
143
+ "technical explanations with code examples when relevant."
144
+ ),
145
+ "analysis_prompt": (
146
+ "Analyze this technical document focusing on: system design, APIs, configurations, "
147
+ "dependencies, implementation details, and best practices."
148
+ ),
149
+ "file_extensions": [".pdf", ".md", ".txt", ".rst", ".doc", ".docx", ".csv", ".xlsx"],
150
+ "config_overrides": {
151
+ "domain": "technical",
152
+ "enable_query_improvement": True,
153
+ "query_improvement_method": "hybrid",
154
+ "expand_abbreviations": True,
155
+ "extract_query_entities": True,
156
+ "enable_dual_llm_verification": True,
157
+ "enable_answer_verification": True,
158
+ "verification_threshold": 7.0,
159
+ }
160
+ },
161
+ "academic": {
162
+ "name": "Academic Research",
163
+ "description": "Designed for research papers, academic publications, studies",
164
+ "system_prompt": (
165
+ "You are an academic AI assistant with expertise in research methodology, "
166
+ "scholarly analysis, and scientific literature. Provide well-reasoned responses "
167
+ "with proper academic citations and methodology discussion."
168
+ ),
169
+ "analysis_prompt": (
170
+ "Analyze this academic document focusing on: research questions, methodology, "
171
+ "findings, conclusions, citations, and contributions to the field."
172
+ ),
173
+ "file_extensions": [".pdf", ".doc", ".docx", ".txt", ".tex", ".csv", ".xlsx"],
174
+ "config_overrides": {
175
+ "domain": "academic",
176
+ "enable_query_improvement": True,
177
+ "query_improvement_method": "llm",
178
+ "expand_abbreviations": True,
179
+ "add_domain_keywords": True,
180
+ "extract_query_entities": True,
181
+ "enable_dual_llm_verification": True,
182
+ "enable_answer_verification": True,
183
+ "enable_answer_modification": True,
184
+ "verification_threshold": 8.0,
185
+ "check_completeness": True,
186
+ "check_relevance": True,
187
+ }
188
+ }
189
+ }
190
+
191
+ # =============================================================================
192
+ # Global State & Configuration
193
+ # =============================================================================
194
+
195
+ # RAG instances per domain
196
+ rag_instances: Dict[str, RAGAnything] = {}
197
+
198
+ # Web searcher instance
199
+ web_searcher: Optional[WebSearcher] = None
200
+
201
+ # URL fetcher instance
202
+ url_fetcher: Optional[URLFetcher] = None
203
+
204
+ # Conversation history storage
205
+ conversation_histories: Dict[str, List[Dict[str, str]]] = {}
206
+
207
+ # Processing status tracker
208
+ processing_status: Dict[str, Dict[str, Any]] = {}
209
+
210
+ # Query result cache (TTL: 5 minutes, max 100 entries)
211
+ query_cache: TTLCache = TTLCache(maxsize=100, ttl=300)
212
+
213
+ # Performance metrics storage
214
+ performance_metrics: Dict[str, List[float]] = {
215
+ "query_times": [],
216
+ "processing_times": [],
217
+ }
218
+
219
+ # Base paths
220
+ BASE_DIR = Path(__file__).parent.parent
221
+ STORAGE_DIR = BASE_DIR / "storage"
222
+ UPLOAD_DIR = BASE_DIR / "uploads"
223
+ STATUS_FILE = STORAGE_DIR / "processing_status.json"
224
+
225
+ # --- IMPROVEMENT: Centralized and configurable Gemini model names ---
226
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
227
+ GEMINI_TEXT_MODEL = os.getenv("GEMINI_TEXT_MODEL", "models/gemini-flash-latest") # Fast generation (alias to latest Flash)
228
+ GEMINI_VERIFIER_MODEL = os.getenv("GEMINI_VERIFIER_MODEL", "models/gemini-pro-latest") # Quality verification (alias to latest Pro)
229
+ GEMINI_VISION_MODEL = os.getenv("GEMINI_VISION_MODEL", "models/gemini-flash-latest") # Vision model
230
+ GEMINI_EMBEDDING_MODEL = os.getenv("GEMINI_EMBEDDING_MODEL", "models/text-embedding-004") # Embedding model
231
+ TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "") # For web search
232
+
233
+
234
+ # =============================================================================
235
+ # Status Persistence Functions
236
+ # =============================================================================
237
+
238
+ def load_processing_status() -> Dict[str, Dict[str, Any]]:
239
+ """Load processing status from disk."""
240
+ try:
241
+ if STATUS_FILE.exists():
242
+ with open(STATUS_FILE, 'r') as f:
243
+ status_data = json.load(f)
244
+ logger.info(f"Loaded {len(status_data)} processing status entries from disk")
245
+ return status_data
246
+ return {}
247
+ except Exception as e:
248
+ logger.error(f"Error loading processing status: {e}", exc_info=True)
249
+ return {}
250
+
251
+
252
+ def save_processing_status():
253
+ """Save processing status to disk."""
254
+ try:
255
+ STATUS_FILE.parent.mkdir(parents=True, exist_ok=True)
256
+ with open(STATUS_FILE, 'w') as f:
257
+ json.dump(processing_status, f, indent=2)
258
+ logger.debug(f"Saved {len(processing_status)} processing status entries to disk")
259
+ except Exception as e:
260
+ logger.error(f"Error saving processing status: {e}", exc_info=True)
261
+
262
+
263
+ def update_processing_status(processing_id: str, status_update: Dict[str, Any]):
264
+ """Update processing status both in-memory and on disk."""
265
+ processing_status[processing_id] = status_update
266
+ save_processing_status()
267
+
268
+
269
+ # =============================================================================
270
+ # Lifespan Management (Startup/Shutdown)
271
+ # =============================================================================
272
+
273
+ @asynccontextmanager
274
+ async def lifespan(app: FastAPI):
275
+ """Handles application startup and shutdown events."""
276
+ # --- STARTUP ---
277
+ logger.info("Starting Enhanced RAG-Anything API...")
278
+ STORAGE_DIR.mkdir(parents=True, exist_ok=True)
279
+ UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
280
+ for domain in DOMAIN_CONFIGS.keys():
281
+ (STORAGE_DIR / domain).mkdir(parents=True, exist_ok=True)
282
+ logger.info(f"Created storage directories: {STORAGE_DIR}")
283
+
284
+ # Load processing status from disk
285
+ global processing_status
286
+ processing_status.update(load_processing_status())
287
+
288
+ if GEMINI_API_KEY:
289
+ try:
290
+ genai.configure(api_key=GEMINI_API_KEY)
291
+ logger.info("Gemini API initialized successfully")
292
+ logger.info(f"Model Configuration:")
293
+ logger.info(f" TEXT_MODEL: {GEMINI_TEXT_MODEL}")
294
+ logger.info(f" VERIFIER_MODEL: {GEMINI_VERIFIER_MODEL}")
295
+ logger.info(f" VISION_MODEL: {GEMINI_VISION_MODEL}")
296
+ logger.info(f" EMBEDDING_MODEL: {GEMINI_EMBEDDING_MODEL}")
297
+ except Exception as e:
298
+ logger.error(f"Failed to initialize Gemini API: {e}", exc_info=True)
299
+ logger.warning("Application will start but Gemini features will not work")
300
+ else:
301
+ logger.warning("GEMINI_API_KEY not set. Set it in environment variables.")
302
+
303
+ # Initialize web searcher if Tavily API key is available
304
+ global web_searcher, url_fetcher
305
+ if TAVILY_API_KEY:
306
+ try:
307
+ web_searcher = create_web_searcher(api_key=TAVILY_API_KEY, max_results=5)
308
+ logger.info("Tavily web search initialized successfully")
309
+ except Exception as e:
310
+ logger.warning(f"Failed to initialize Tavily: {e}. Web search will not be available.")
311
+ web_searcher = None
312
+ else:
313
+ logger.info("TAVILY_API_KEY not set. Web search features disabled.")
314
+
315
+ # Initialize URL fetcher
316
+ try:
317
+ url_download_dir = UPLOAD_DIR / "url_downloads"
318
+ url_download_dir.mkdir(parents=True, exist_ok=True)
319
+ url_fetcher = create_url_fetcher(download_dir=str(url_download_dir))
320
+ logger.info("URL fetcher initialized successfully")
321
+ except Exception as e:
322
+ logger.warning(f"Failed to initialize URL fetcher: {e}. URL ingestion will not be available.")
323
+ url_fetcher = None
324
+
325
+ logger.info("Enhanced RAG-Anything API started successfully!")
326
+
327
+ yield # Application runs here
328
+
329
+ # --- SHUTDOWN ---
330
+ logger.info("Shutting down API...")
331
+ for domain, rag_instance in rag_instances.items():
332
+ logger.info(f"Finalizing storages for domain: {domain}")
333
+ await rag_instance.finalize_storages()
334
+ logger.info("API shutdown complete.")
335
+
336
+ # =============================================================================
337
+ # FastAPI App Setup
338
+ # =============================================================================
339
+
340
+ app = FastAPI(
341
+ title="Enhanced RAG-Anything API",
342
+ description="Production-ready RAG system with multi-domain support and advanced features",
343
+ version="1.1.0",
344
+ docs_url="/docs",
345
+ redoc_url="/redoc",
346
+ lifespan=lifespan # --- FIX: Using modern lifespan event handler ---
347
+ )
348
+
349
+ # CORS Configuration
350
+ app.add_middleware(
351
+ CORSMiddleware,
352
+ allow_origins=["*"],
353
+ allow_credentials=True,
354
+ allow_methods=["*"],
355
+ allow_headers=["*"],
356
+ )
357
+
358
+
359
+ # =============================================================================
360
+ # Request/Response Models
361
+ # =============================================================================
362
+
363
+ class QueryRequest(BaseModel):
364
+ query: str = Field(..., description="User query text", min_length=1)
365
+ domain: str = Field("medical", description="Domain context (medical, legal, etc.)")
366
+ mode: str = Field("mix", description="Query mode (local, global, hybrid, naive, mix, web, hybrid_web)")
367
+ conversation_id: Optional[str] = Field(None, description="Conversation ID for context")
368
+ return_metadata: bool = Field(True, description="Include detailed metadata in response")
369
+ enable_web_search: bool = Field(False, description="Enable web search augmentation")
370
+ web_search_only: bool = Field(False, description="Use only web search (no RAG)")
371
+ enable_verification: bool = Field(True, description="Enable dual-LLM verification")
372
+ # Performance optimization parameters
373
+ fast_mode: bool = Field(False, description="Use optimized parameters for faster queries (2-3x speedup)")
374
+ top_k: Optional[int] = Field(None, description="Number of top results to retrieve (default: 40, fast: 20)")
375
+ enable_cache: bool = Field(True, description="Enable query result caching")
376
+ enable_query_improvement: bool = Field(True, description="Enable query improvement/expansion")
377
+ enable_verification_check: bool = Field(True, description="Enable verification step (separate from enable_verification)")
378
+
379
+ class Config:
380
+ json_schema_extra = {
381
+ "example": {
382
+ "query": "What are the treatment options for hypertension?",
383
+ "domain": "medical",
384
+ "mode": "mix",
385
+ "conversation_id": "conv_123",
386
+ "return_metadata": True,
387
+ "enable_web_search": False,
388
+ "web_search_only": False,
389
+ "enable_verification": True
390
+ }
391
+ }
392
+
393
+
394
+ class QueryResponse(BaseModel):
395
+ answer: str = Field(..., description="Generated answer")
396
+ sources: List[str] = Field(default_factory=list, description="Source documents used")
397
+ confidence_score: float = Field(0.0, description="Confidence score (0-1)")
398
+ query_improved: bool = Field(False, description="Whether query was improved")
399
+ verification_performed: bool = Field(False, description="Whether answer was verified")
400
+ conversation_id: str = Field(..., description="Conversation ID")
401
+ metadata: Optional[Dict[str, Any]] = Field(None, description="Additional metadata")
402
+
403
+ class Config:
404
+ json_schema_extra = {
405
+ "example": {
406
+ "answer": "Hypertension treatment includes lifestyle modifications and medications...",
407
+ "sources": ["medical_guidelines.pdf", "research_paper.pdf"],
408
+ "confidence_score": 0.92,
409
+ "query_improved": True,
410
+ "verification_performed": True,
411
+ "conversation_id": "conv_123",
412
+ "metadata": {
413
+ "original_query": "What is HTN treatment?",
414
+ "improved_query": "What are the treatment options for hypertension?",
415
+ "verification_score": 8.5
416
+ }
417
+ }
418
+ }
419
+
420
+
421
+ class UploadResponse(BaseModel):
422
+ success: bool
423
+ message: str
424
+ file_name: str
425
+ domain: str
426
+ processing_id: str
427
+
428
+
429
+ class BatchUploadResponse(BaseModel):
430
+ success: bool
431
+ message: str
432
+ total_files: int
433
+ accepted_files: int
434
+ processing_ids: List[str]
435
+ domain: str
436
+
437
+
438
+ class URLUploadRequest(BaseModel):
439
+ url: str = Field(..., description="URL to fetch and process")
440
+ domain: str = Field("medical", description="Domain context")
441
+ convert_to_markdown: bool = Field(True, description="Convert HTML to markdown")
442
+
443
+ class Config:
444
+ json_schema_extra = {
445
+ "example": {
446
+ "url": "https://example.com/medical-article.pdf",
447
+ "domain": "medical",
448
+ "convert_to_markdown": True
449
+ }
450
+ }
451
+
452
+
453
+ class DomainInfo(BaseModel):
454
+ domain_id: str
455
+ name: str
456
+ description: str
457
+ file_extensions: List[str]
458
+ features: Dict[str, Any]
459
+
460
+
461
+ class HealthResponse(BaseModel):
462
+ status: str
463
+ timestamp: str
464
+ version: str
465
+ features: Dict[str, bool]
466
+ domains: List[str]
467
+
468
+ # =============================================================================
469
+ # Gemini Integration Functions
470
+ # =============================================================================
471
+
472
+ async def gemini_llm_func(
473
+ prompt: str,
474
+ system_prompt: Optional[str] = None,
475
+ history_messages: Optional[List[Dict[str, str]]] = None,
476
+ **kwargs,
477
+ ) -> str:
478
+ """
479
+ Gemini LLM function for text generation (Improved with format validation).
480
+
481
+ Enhancements:
482
+ - Increased token limits for entity extraction tasks
483
+ - Better temperature control for structured outputs
484
+ - Response validation and auto-append of completion delimiter
485
+ """
486
+ def _sync_call():
487
+ try:
488
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
489
+
490
+ safety_settings = [
491
+ {
492
+ "category": HarmCategory.HARM_CATEGORY_HARASSMENT,
493
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
494
+ },
495
+ {
496
+ "category": HarmCategory.HARM_CATEGORY_HATE_SPEECH,
497
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
498
+ },
499
+ {
500
+ "category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
501
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
502
+ },
503
+ {
504
+ "category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
505
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
506
+ },
507
+ ]
508
+ # --- IMPROVEMENT: Use system_instruction parameter ---
509
+ logger.info(f"Creating GenerativeModel with model_name: {GEMINI_TEXT_MODEL}")
510
+ model = genai.GenerativeModel(
511
+ model_name=GEMINI_TEXT_MODEL,
512
+ system_instruction=system_prompt,
513
+ safety_settings=safety_settings
514
+ )
515
+ config_params = {}
516
+
517
+ # Smart temperature control: lower for extraction tasks
518
+ is_extraction_task = system_prompt and ("entity" in system_prompt.lower() or "extraction" in system_prompt.lower())
519
+ if "temperature" in kwargs:
520
+ config_params["temperature"] = kwargs["temperature"]
521
+ else:
522
+ # Use lower temperature for structured extraction tasks
523
+ config_params["temperature"] = 0.1 if is_extraction_task else 0.3
524
+
525
+ # Increase token limit for extraction tasks to avoid truncation
526
+ if "max_tokens" in kwargs:
527
+ config_params["max_output_tokens"] = kwargs["max_tokens"]
528
+ else:
529
+ # Larger limits for extraction to ensure completion delimiter is included
530
+ config_params["max_output_tokens"] = 16384 if is_extraction_task else 8192
531
+
532
+ generation_config = genai.types.GenerationConfig(**config_params)
533
+
534
+ # --- IMPROVEMENT: Build structured history for chat model ---
535
+ history = []
536
+ if history_messages:
537
+ for msg in history_messages[-5:]:
538
+ role = "user" if msg.get("role") == "user" else "model"
539
+ content = msg.get("content", "")
540
+ if content:
541
+ history.append({"role": role, "parts": [content]})
542
+
543
+ chat = model.start_chat(history=history)
544
+ response = chat.send_message(prompt, generation_config=generation_config)
545
+ try:
546
+ result = response.text
547
+
548
+ # Post-processing: Ensure completion delimiter is present for extraction tasks
549
+ if is_extraction_task and result:
550
+ # Check if completion delimiter is missing
551
+ if "<|COMPLETE|>" not in result and "<|complete|>" not in result:
552
+ logger.warning("Completion delimiter missing from extraction result, appending it")
553
+ # Append the delimiter to the end
554
+ result = result.strip() + "\n<|COMPLETE|>"
555
+
556
+ return result
557
+ except ValueError as ve:
558
+ logger.warning(f"Response blocked or empty. Reason: {ve}. Candidates: {response.candidates}")
559
+ if response.prompt_feedback:
560
+ logger.warning(f"Prompt feedback: {response.prompt_feedback}")
561
+ return ""
562
+ except Exception as e:
563
+ logger.error(f"Gemini LLM error: {e}", exc_info=True)
564
+ raise
565
+ return await asyncio.to_thread(_sync_call)
566
+
567
+
568
+ async def gemini_verifier_llm_func(
569
+ prompt: str,
570
+ system_prompt: Optional[str] = None,
571
+ history_messages: Optional[List[Dict[str, str]]] = None,
572
+ **kwargs,
573
+ ) -> str:
574
+ """Gemini Pro LLM function for answer verification (more powerful, thorough)."""
575
+ def _sync_call():
576
+ try:
577
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
578
+
579
+ safety_settings = [
580
+ {
581
+ "category": HarmCategory.HARM_CATEGORY_HARASSMENT,
582
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
583
+ },
584
+ {
585
+ "category": HarmCategory.HARM_CATEGORY_HATE_SPEECH,
586
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
587
+ },
588
+ {
589
+ "category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
590
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
591
+ },
592
+ {
593
+ "category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
594
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
595
+ },
596
+ ]
597
+ # Use Pro model for better verification
598
+ logger.info(f"Creating Verifier GenerativeModel with model_name: {GEMINI_VERIFIER_MODEL}")
599
+ model = genai.GenerativeModel(
600
+ model_name=GEMINI_VERIFIER_MODEL,
601
+ system_instruction=system_prompt,
602
+ safety_settings=safety_settings
603
+ )
604
+ config_params = {}
605
+ if "temperature" in kwargs:
606
+ config_params["temperature"] = kwargs["temperature"]
607
+ if "max_tokens" in kwargs:
608
+ config_params["max_output_tokens"] = kwargs["max_tokens"]
609
+ else:
610
+ # Default to larger token limit for verification responses
611
+ config_params["max_output_tokens"] = 8192
612
+ generation_config = genai.types.GenerationConfig(**config_params)
613
+
614
+ # Build history
615
+ history = []
616
+ if history_messages:
617
+ for msg in history_messages[-5:]:
618
+ role = "user" if msg.get("role") == "user" else "model"
619
+ content = msg.get("content", "")
620
+ if content:
621
+ history.append({"role": role, "parts": [content]})
622
+
623
+ chat = model.start_chat(history=history)
624
+ response = chat.send_message(prompt, generation_config=generation_config)
625
+ try:
626
+ return response.text
627
+ except ValueError as ve:
628
+ logger.warning(f"Response blocked or empty. Reason: {ve}. Candidates: {response.candidates}")
629
+ if response.prompt_feedback:
630
+ logger.warning(f"Prompt feedback: {response.prompt_feedback}")
631
+ return ""
632
+ except Exception as e:
633
+ logger.error(f"Gemini Verifier LLM error: {e}", exc_info=True)
634
+ raise
635
+ return await asyncio.to_thread(_sync_call)
636
+
637
+
638
+ async def gemini_vision_func(
639
+ prompt: str,
640
+ system_prompt: Optional[str] = None,
641
+ image_data: Optional[str] = None,
642
+ **kwargs,
643
+ ) -> str:
644
+ """Gemini Vision function for image analysis."""
645
+ def _sync_call():
646
+ try:
647
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
648
+
649
+ safety_settings = [
650
+ {
651
+ "category": HarmCategory.HARM_CATEGORY_HARASSMENT,
652
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
653
+ },
654
+ {
655
+ "category": HarmCategory.HARM_CATEGORY_HATE_SPEECH,
656
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
657
+ },
658
+ {
659
+ "category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
660
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
661
+ },
662
+ {
663
+ "category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
664
+ "threshold": HarmBlockThreshold.BLOCK_NONE,
665
+ },
666
+ ]
667
+ # --- FIX: Use dedicated vision model ---
668
+ logger.info(f"Creating Vision GenerativeModel with model_name: {GEMINI_VISION_MODEL}")
669
+ model = genai.GenerativeModel(GEMINI_VISION_MODEL, safety_settings=safety_settings)
670
+ config_params = {}
671
+ if "temperature" in kwargs:
672
+ config_params["temperature"] = kwargs["temperature"]
673
+ if "max_tokens" in kwargs:
674
+ config_params["max_output_tokens"] = kwargs["max_tokens"]
675
+ generation_config = genai.types.GenerationConfig(**config_params)
676
+
677
+ content_parts = []
678
+ if system_prompt:
679
+ content_parts.append(system_prompt)
680
+ content_parts.append(prompt)
681
+
682
+ if image_data:
683
+ import base64
684
+ import io
685
+ from PIL import Image
686
+ image_bytes = base64.b64decode(image_data)
687
+ image = Image.open(io.BytesIO(image_bytes))
688
+ content_parts.append(image)
689
+
690
+ response = model.generate_content(content_parts, generation_config=generation_config)
691
+ try:
692
+ return response.text
693
+ except ValueError as ve:
694
+ logger.warning(f"Vision response blocked or empty. Reason: {ve}. Candidates: {response.candidates}")
695
+ if response.prompt_feedback:
696
+ logger.warning(f"Vision prompt feedback: {response.prompt_feedback}")
697
+ return ""
698
+ except Exception as e:
699
+ logger.error(f"Gemini Vision error: {e}", exc_info=True)
700
+ raise
701
+ return await asyncio.to_thread(_sync_call)
702
+
703
+
704
+ async def gemini_embedding_func(texts: List[str]) -> List[List[float]]:
705
+ """Gemini Embedding function for text vectorization."""
706
+ def _sync_call():
707
+ try:
708
+ # --- IMPROVEMENT: Use newer embedding model ---
709
+ result = genai.embed_content(
710
+ model=GEMINI_EMBEDDING_MODEL,
711
+ content=texts,
712
+ task_type="retrieval_document"
713
+ )
714
+ return result['embedding']
715
+ except Exception as e:
716
+ logger.error(f"Gemini Embedding error: {e}", exc_info=True)
717
+ raise
718
+ return await asyncio.to_thread(_sync_call)
719
+
720
+ gemini_embedding_func.embedding_dim = 768
721
+
722
+
723
+ async def synthesize_web_results_with_gemini(
724
+ query: str,
725
+ web_context: str,
726
+ rag_context: Optional[str] = None
727
+ ) -> str:
728
+ """
729
+ Use Gemini to synthesize web search results into a coherent, direct answer
730
+
731
+ Args:
732
+ query: User's original query
733
+ web_context: Formatted web search results
734
+ rag_context: Optional RAG results to incorporate
735
+
736
+ Returns:
737
+ Synthesized answer from Gemini
738
+ """
739
+ try:
740
+ logger.info("Synthesizing web results with Gemini")
741
+
742
+ # Build synthesis prompt
743
+ if rag_context:
744
+ system_prompt = """You are an expert research assistant. Your task is to synthesize information from both
745
+ a knowledge base and recent web search results to provide a comprehensive, accurate answer.
746
+
747
+ Guidelines:
748
+ - Provide a direct, clear answer to the user's question
749
+ - Combine insights from both the knowledge base and web sources
750
+ - Cite sources when making specific claims (use [Source N] notation)
751
+ - If there are contradictions, acknowledge them and explain
752
+ - Be concise but thorough
753
+ - Use a professional, informative tone"""
754
+
755
+ prompt = f"""User Question: {query}
756
+
757
+ Knowledge Base Information:
758
+ {rag_context}
759
+
760
+ Web Search Results:
761
+ {web_context}
762
+
763
+ Based on the above information, provide a comprehensive answer to the user's question. Synthesize the information from both sources and cite your sources appropriately."""
764
+
765
+ else:
766
+ system_prompt = """You are an expert research assistant. Your task is to synthesize web search results
767
+ into a clear, direct answer to the user's question.
768
+
769
+ Guidelines:
770
+ - Provide a direct, clear answer to the user's question
771
+ - Cite sources when making specific claims (use [Source N] notation)
772
+ - Be concise but comprehensive
773
+ - If information is limited or unclear, acknowledge it
774
+ - Use a professional, informative tone
775
+ - Include relevant details like dates, statistics, or examples when available"""
776
+
777
+ prompt = f"""User Question: {query}
778
+
779
+ Web Search Results:
780
+ {web_context}
781
+
782
+ Based on the web search results above, provide a clear and comprehensive answer to the user's question. Cite your sources appropriately."""
783
+
784
+ # Call Gemini to synthesize the answer
785
+ answer = await gemini_llm_func(
786
+ prompt=prompt,
787
+ system_prompt=system_prompt,
788
+ temperature=0.3, # Lower temperature for more focused answers
789
+ max_tokens=1500
790
+ )
791
+
792
+ if not answer or len(answer.strip()) < 10:
793
+ logger.warning("Gemini synthesis produced minimal output, using fallback")
794
+ return web_context
795
+
796
+ return answer
797
+
798
+ except Exception as e:
799
+ logger.error(f"Error synthesizing web results with Gemini: {e}", exc_info=True)
800
+ # Fallback to raw web context
801
+ return web_context
802
+
803
+
804
+ async def gemini_rerank_func(query: str, documents: List[str], top_n: Optional[int] = None) -> List[Dict[str, Any]]:
805
+ """
806
+ Gemini-based reranking function for LightRAG
807
+
808
+ This follows LightRAG's reranking API signature which expects:
809
+ - documents: List of strings (not dict chunks)
810
+ - top_n: Number of top results (not top_k)
811
+ - Returns: List of {"index": int, "relevance_score": float}
812
+
813
+ Args:
814
+ query: Search query
815
+ documents: List of document strings to rerank
816
+ top_n: Number of top documents to return (None = return all, reranked)
817
+
818
+ Returns:
819
+ List of {"index": int, "relevance_score": float} in descending score order
820
+ """
821
+ try:
822
+ # Convert documents (strings) to chunks format for our reranker
823
+ chunks = [{"content": doc} for doc in documents]
824
+
825
+ # Initialize reranker with Gemini LLM function
826
+ reranker = GeminiReranker(
827
+ llm_func=gemini_llm_func,
828
+ batch_size=3, # Process 3 chunks at a time to avoid rate limits
829
+ temperature=0.1
830
+ )
831
+
832
+ # Perform reranking
833
+ reranked_chunks = await reranker.rerank(query, chunks, top_n)
834
+
835
+ # Convert back to LightRAG format: List[{"index": int, "relevance_score": float}]
836
+ results = []
837
+ for i, chunk in enumerate(reranked_chunks):
838
+ # Find original index of this chunk
839
+ original_content = chunk.get("content", "")
840
+ try:
841
+ original_index = documents.index(original_content)
842
+ except ValueError:
843
+ # Fallback: use current index if not found
844
+ original_index = i
845
+
846
+ results.append({
847
+ "index": original_index,
848
+ "relevance_score": chunk.get("relevance_score", 0.0)
849
+ })
850
+
851
+ logger.debug(f"Reranked {len(documents)} documents, returning {len(results)} results")
852
+ return results
853
+
854
+ except Exception as e:
855
+ logger.error(f"Reranking error: {e}", exc_info=True)
856
+ # Return original order on error - format: List[{"index": int, "relevance_score": float}]
857
+ result_count = top_n if top_n and top_n < len(documents) else len(documents)
858
+ return [{"index": i, "relevance_score": 1.0} for i in range(result_count)]
859
+
860
+
861
+ # =============================================================================
862
+ # RAG Instance Management
863
+ # =============================================================================
864
+
865
+ async def get_rag_instance(domain: str) -> RAGAnything:
866
+ """Get or create RAG instance for a specific domain."""
867
+ if domain not in DOMAIN_CONFIGS:
868
+ raise HTTPException(
869
+ status_code=400,
870
+ detail=f"Invalid domain '{domain}'. Valid domains: {list(DOMAIN_CONFIGS.keys())}"
871
+ )
872
+ if domain in rag_instances:
873
+ logger.debug(f"Using cached RAG instance for domain: {domain}")
874
+ return rag_instances[domain]
875
+
876
+ logger.info(f"Creating new RAG instance for domain: {domain}")
877
+ try:
878
+ domain_config = DOMAIN_CONFIGS[domain]
879
+ domain_storage = STORAGE_DIR / domain
880
+ domain_storage.mkdir(parents=True, exist_ok=True)
881
+
882
+ config = RAGAnythingConfig(
883
+ working_dir=str(domain_storage),
884
+ parser="mineru",
885
+ parse_method="auto",
886
+ enable_image_processing=True,
887
+ enable_table_processing=True,
888
+ enable_equation_processing=True,
889
+ **domain_config["config_overrides"]
890
+ )
891
+ rag = await create_rag_anything(
892
+ llm_model_func=gemini_llm_func, # Flash for generation
893
+ vision_model_func=gemini_vision_func, # Flash for vision
894
+ embedding_func=gemini_embedding_func, # Embedding model
895
+ verifier_llm_func=gemini_verifier_llm_func, # Pro for verification
896
+ config=config,
897
+ rerank_model_func=gemini_rerank_func, # Enable reranking (passed directly)
898
+ )
899
+ rag_instances[domain] = rag
900
+ logger.info(f"RAG instance created successfully for domain: {domain}")
901
+ return rag
902
+ except Exception as e:
903
+ logger.error(f"Failed to create RAG instance for domain {domain}: {e}", exc_info=True)
904
+ raise HTTPException(
905
+ status_code=500,
906
+ detail=f"Failed to initialize RAG system for domain '{domain}': {str(e)}"
907
+ )
908
+
909
+ # =============================================================================
910
+ # API Endpoints
911
+ # =============================================================================
912
+
913
+ @app.get("/health", response_model=HealthResponse)
914
+ async def health_check():
915
+ """Health check endpoint."""
916
+ return HealthResponse(
917
+ status="healthy",
918
+ timestamp=datetime.now().isoformat(),
919
+ version="2.0.0",
920
+ features={
921
+ "query_improvement": True,
922
+ "dual_llm_verification": True,
923
+ "gemini_pro_verifier": True,
924
+ "reranking": True,
925
+ "conversation_memory": True,
926
+ "multi_domain": True,
927
+ "multimodal_processing": True,
928
+ "gemini_integration": bool(GEMINI_API_KEY),
929
+ "web_search": bool(web_searcher),
930
+ "url_ingestion": bool(url_fetcher),
931
+ },
932
+ domains=list(DOMAIN_CONFIGS.keys())
933
+ )
934
+
935
+
936
+ @app.get("/domains", response_model=List[DomainInfo])
937
+ async def list_domains():
938
+ """List all available domains."""
939
+ domains = []
940
+ for domain_id, config in DOMAIN_CONFIGS.items():
941
+ domains.append(DomainInfo(
942
+ domain_id=domain_id,
943
+ name=config["name"],
944
+ description=config["description"],
945
+ file_extensions=config["file_extensions"],
946
+ features={k: v for k, v in config["config_overrides"].items() if isinstance(v, bool)}
947
+ ))
948
+ return domains
949
+
950
+
951
+ @app.post("/upload", response_model=UploadResponse)
952
+ async def upload_document(
953
+ file: UploadFile = File(...),
954
+ domain: str = Form(...),
955
+ background_tasks: BackgroundTasks = None
956
+ ):
957
+ """Upload and process a document in the background."""
958
+ logger.info(f"Upload request: {file.filename} to domain: {domain}")
959
+ try:
960
+ if domain not in DOMAIN_CONFIGS:
961
+ raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")
962
+
963
+ file_ext = Path(file.filename).suffix.lower()
964
+ allowed_extensions = DOMAIN_CONFIGS[domain]["file_extensions"]
965
+ if file_ext not in allowed_extensions:
966
+ raise HTTPException(400, f"File type {file_ext} not for '{domain}'. Allowed: {allowed_extensions}")
967
+
968
+ processing_id = str(uuid.uuid4())
969
+ domain_upload_dir = UPLOAD_DIR / domain
970
+ domain_upload_dir.mkdir(parents=True, exist_ok=True)
971
+ file_path = domain_upload_dir / f"{processing_id}_{file.filename}"
972
+
973
+ with open(file_path, "wb") as f:
974
+ f.write(await file.read())
975
+ logger.info(f"File saved: {file_path}")
976
+
977
+ # Initialize status and save to disk
978
+ update_processing_status(processing_id, {
979
+ "status": "processing",
980
+ "message": "Processing document...",
981
+ "file_name": file.filename,
982
+ "domain": domain,
983
+ "started_at": datetime.now().isoformat()
984
+ })
985
+
986
+ async def process_document_task():
987
+ try:
988
+ logger.info(f"Processing document: {file_path}")
989
+ rag = await get_rag_instance(domain)
990
+ result = await rag.process_document_complete(str(file_path))
991
+
992
+ # Check result (process_document_complete returns None on success)
993
+ if result is None or (isinstance(result, dict) and result.get("success") is not False):
994
+ logger.info(f"Document processed successfully: {file.filename}")
995
+ update_processing_status(processing_id, {
996
+ "status": "completed",
997
+ "message": "Document processed successfully",
998
+ "file_name": file.filename,
999
+ "domain": domain,
1000
+ "completed_at": datetime.now().isoformat()
1001
+ })
1002
+ else:
1003
+ error_msg = result.get('error', 'Unknown processing error') if isinstance(result, dict) else "Processing error"
1004
+ logger.error(f"Document processing failed: {error_msg}")
1005
+ update_processing_status(processing_id, {
1006
+ "status": "failed",
1007
+ "message": f"Processing failed: {error_msg}",
1008
+ "file_name": file.filename,
1009
+ "domain": domain,
1010
+ "error": error_msg
1011
+ })
1012
+ except Exception as e:
1013
+ logger.error(f"Error in background processing of {file.filename}: {e}", exc_info=True)
1014
+ update_processing_status(processing_id, {
1015
+ "status": "failed",
1016
+ "message": f"Error: {str(e)}",
1017
+ "file_name": file.filename,
1018
+ "domain": domain,
1019
+ "error": str(e)
1020
+ })
1021
+
1022
+ background_tasks.add_task(process_document_task)
1023
+
1024
+ return UploadResponse(
1025
+ success=True,
1026
+ message="Document uploaded and queued for processing",
1027
+ file_name=file.filename,
1028
+ domain=domain,
1029
+ processing_id=processing_id
1030
+ )
1031
+ except HTTPException:
1032
+ raise
1033
+ except Exception as e:
1034
+ logger.error(f"Upload error: {e}", exc_info=True)
1035
+ raise HTTPException(status_code=500, detail=f"Upload failed: {str(e)}")
1036
+
1037
+
1038
+ @app.post("/upload-batch", response_model=BatchUploadResponse)
1039
+ async def upload_documents_batch(
1040
+ files: List[UploadFile] = File(...),
1041
+ domain: str = Form(...),
1042
+ background_tasks: BackgroundTasks = None
1043
+ ):
1044
+ """
1045
+ Upload and process multiple documents in batch using optimized processing.
1046
+
1047
+ Uses BatchOptimizer for 2-3x faster processing through:
1048
+ - Parallel parsing (up to 4 documents simultaneously)
1049
+ - Parallel processing (up to 10 documents simultaneously)
1050
+ - Pipeline architecture (parse + process in parallel)
1051
+ """
1052
+ logger.info(f"Batch upload request: {len(files)} files to domain: {domain}")
1053
+ try:
1054
+ if domain not in DOMAIN_CONFIGS:
1055
+ raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")
1056
+
1057
+ allowed_extensions = DOMAIN_CONFIGS[domain]["file_extensions"]
1058
+ domain_upload_dir = UPLOAD_DIR / domain
1059
+ domain_upload_dir.mkdir(parents=True, exist_ok=True)
1060
+
1061
+ # Process and save all files
1062
+ file_paths = []
1063
+ processing_ids = []
1064
+ rejected_files = []
1065
+
1066
+ for file in files:
1067
+ file_ext = Path(file.filename).suffix.lower()
1068
+ if file_ext not in allowed_extensions:
1069
+ rejected_files.append(file.filename)
1070
+ logger.warning(f"Rejected file {file.filename}: extension {file_ext} not allowed")
1071
+ continue
1072
+
1073
+ processing_id = str(uuid.uuid4())
1074
+ file_path = domain_upload_dir / f"{processing_id}_{file.filename}"
1075
+
1076
+ with open(file_path, "wb") as f:
1077
+ f.write(await file.read())
1078
+
1079
+ file_paths.append(str(file_path))
1080
+ processing_ids.append(processing_id)
1081
+
1082
+ # Initialize status for each file
1083
+ update_processing_status(processing_id, {
1084
+ "status": "queued",
1085
+ "message": "Queued for batch processing...",
1086
+ "file_name": file.filename,
1087
+ "domain": domain,
1088
+ "started_at": datetime.now().isoformat()
1089
+ })
1090
+
1091
+ logger.info(f"Accepted {len(file_paths)}/{len(files)} files, rejected: {rejected_files}")
1092
+
1093
+ if not file_paths:
1094
+ raise HTTPException(400, f"No valid files provided. Allowed extensions: {allowed_extensions}")
1095
+
1096
+ # Process documents in batch using optimized processing
1097
+ async def process_batch_task():
1098
+ start_time = time.time()
1099
+ try:
1100
+ logger.info(f"Starting optimized batch processing of {len(file_paths)} files")
1101
+ rag = await get_rag_instance(domain)
1102
+
1103
+ # Use optimized batch processing if available
1104
+ if hasattr(rag, 'process_documents_batch_optimized'):
1105
+ result = await rag.process_documents_batch_optimized(
1106
+ file_paths=file_paths,
1107
+ max_concurrent_parsers=4, # MinerU optimal
1108
+ max_concurrent_processors=10, # Higher for I/O-bound tasks
1109
+ enable_progress_tracking=True,
1110
+ )
1111
+
1112
+ # Update statuses based on results
1113
+ successful_files = result.get('successful_files', [])
1114
+ failed_files = result.get('failed_files', {})
1115
+
1116
+ for idx, file_path in enumerate(file_paths):
1117
+ processing_id = processing_ids[idx]
1118
+ filename = Path(file_path).name.split('_', 1)[1] if '_' in Path(file_path).name else Path(file_path).name
1119
+
1120
+ if file_path in successful_files:
1121
+ update_processing_status(processing_id, {
1122
+ "status": "completed",
1123
+ "message": "Document processed successfully",
1124
+ "file_name": filename,
1125
+ "domain": domain,
1126
+ "completed_at": datetime.now().isoformat()
1127
+ })
1128
+ elif file_path in failed_files:
1129
+ error_msg = failed_files[file_path]
1130
+ update_processing_status(processing_id, {
1131
+ "status": "failed",
1132
+ "message": f"Processing failed: {error_msg}",
1133
+ "file_name": filename,
1134
+ "domain": domain,
1135
+ "error": error_msg
1136
+ })
1137
+
1138
+ total_time = time.time() - start_time
1139
+ throughput = len(successful_files) / total_time if total_time > 0 else 0
1140
+ logger.info(
1141
+ f"Batch processing complete: {len(successful_files)}/{len(file_paths)} successful "
1142
+ f"in {total_time:.2f}s ({throughput:.2f} docs/sec)"
1143
+ )
1144
+
1145
+ # Track performance
1146
+ performance_metrics["processing_times"].append(total_time)
1147
+ if len(performance_metrics["processing_times"]) > 100:
1148
+ performance_metrics["processing_times"] = performance_metrics["processing_times"][-100:]
1149
+
1150
+ else:
1151
+ # Fallback: process sequentially
1152
+ logger.warning("Optimized batch processing not available, using sequential processing")
1153
+ for idx, file_path in enumerate(file_paths):
1154
+ processing_id = processing_ids[idx]
1155
+ filename = Path(file_path).name.split('_', 1)[1] if '_' in Path(file_path).name else Path(file_path).name
1156
+
1157
+ current_status = processing_status[processing_id].copy()
1158
+ current_status["status"] = "processing"
1159
+ current_status["message"] = "Processing document..."
1160
+ update_processing_status(processing_id, current_status)
1161
+
1162
+ try:
1163
+ result = await rag.process_document_complete(file_path)
1164
+ if result is None or (isinstance(result, dict) and result.get("success") is not False):
1165
+ update_processing_status(processing_id, {
1166
+ "status": "completed",
1167
+ "message": "Document processed successfully",
1168
+ "file_name": filename,
1169
+ "domain": domain,
1170
+ "completed_at": datetime.now().isoformat()
1171
+ })
1172
+ else:
1173
+ error_msg = result.get('error', 'Unknown error') if isinstance(result, dict) else "Processing error"
1174
+ update_processing_status(processing_id, {
1175
+ "status": "failed",
1176
+ "message": f"Processing failed: {error_msg}",
1177
+ "file_name": filename,
1178
+ "domain": domain,
1179
+ "error": error_msg
1180
+ })
1181
+ except Exception as e:
1182
+ logger.error(f"Error processing {filename}: {e}", exc_info=True)
1183
+ update_processing_status(processing_id, {
1184
+ "status": "failed",
1185
+ "message": f"Error: {str(e)}",
1186
+ "file_name": filename,
1187
+ "domain": domain,
1188
+ "error": str(e)
1189
+ })
1190
+
1191
+ except Exception as e:
1192
+ logger.error(f"Batch processing error: {e}", exc_info=True)
1193
+ # Mark all as failed
1194
+ for idx, file_path in enumerate(file_paths):
1195
+ processing_id = processing_ids[idx]
1196
+ filename = Path(file_path).name.split('_', 1)[1] if '_' in Path(file_path).name else Path(file_path).name
1197
+ update_processing_status(processing_id, {
1198
+ "status": "failed",
1199
+ "message": f"Batch processing error: {str(e)}",
1200
+ "file_name": filename,
1201
+ "domain": domain,
1202
+ "error": str(e)
1203
+ })
1204
+
1205
+ background_tasks.add_task(process_batch_task)
1206
+
1207
+ return BatchUploadResponse(
1208
+ success=True,
1209
+ message=f"Batch upload queued: {len(file_paths)} files accepted" + (f", {len(rejected_files)} rejected" if rejected_files else ""),
1210
+ total_files=len(files),
1211
+ accepted_files=len(file_paths),
1212
+ processing_ids=processing_ids,
1213
+ domain=domain
1214
+ )
1215
+
1216
+ except HTTPException:
1217
+ raise
1218
+ except Exception as e:
1219
+ logger.error(f"Batch upload error: {e}", exc_info=True)
1220
+ raise HTTPException(status_code=500, detail=f"Batch upload failed: {str(e)}")
1221
+
1222
+
1223
+ @app.post("/upload-url", response_model=UploadResponse)
1224
+ async def upload_url(
1225
+ request: URLUploadRequest,
1226
+ background_tasks: BackgroundTasks
1227
+ ):
1228
+ """Fetch document from URL and process it."""
1229
+ logger.info(f"URL upload request: {request.url} to domain: {request.domain}")
1230
+ try:
1231
+ if not url_fetcher:
1232
+ raise HTTPException(503, "URL fetcher not available. Check server configuration.")
1233
+
1234
+ if request.domain not in DOMAIN_CONFIGS:
1235
+ raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")
1236
+
1237
+ processing_id = str(uuid.uuid4())
1238
+
1239
+ # Initialize status
1240
+ update_processing_status(processing_id, {
1241
+ "status": "fetching",
1242
+ "message": "Fetching URL content...",
1243
+ "url": request.url,
1244
+ "domain": request.domain,
1245
+ "started_at": datetime.now().isoformat()
1246
+ })
1247
+
1248
+ async def fetch_and_process_url():
1249
+ try:
1250
+ logger.info(f"[URL UPLOAD] Starting fetch for: {request.url}")
1251
+
1252
+ # Fetch URL content with timeout
1253
+ fetch_result = await asyncio.wait_for(
1254
+ url_fetcher.fetch_url(
1255
+ url=request.url,
1256
+ convert_to_markdown=request.convert_to_markdown
1257
+ ),
1258
+ timeout=60.0 # 60 second timeout for URL fetching
1259
+ )
1260
+
1261
+ if not fetch_result.get("success"):
1262
+ error_msg = fetch_result.get("error", "Unknown fetch error")
1263
+ logger.error(f"[URL UPLOAD] Fetch failed: {error_msg}")
1264
+ update_processing_status(processing_id, {
1265
+ "status": "failed",
1266
+ "message": f"Failed to fetch URL: {error_msg}",
1267
+ "domain": request.domain,
1268
+ "error": error_msg
1269
+ })
1270
+ return
1271
+
1272
+ file_path = fetch_result.get("file_path")
1273
+ if not file_path:
1274
+ logger.error("[URL UPLOAD] No file path returned from URL fetch")
1275
+ update_processing_status(processing_id, {
1276
+ "status": "failed",
1277
+ "message": "No file path returned from URL fetch",
1278
+ "domain": request.domain,
1279
+ "error": "No file path"
1280
+ })
1281
+ return
1282
+
1283
+ logger.info(f"[URL UPLOAD] Content saved to: {file_path}")
1284
+
1285
+ # Update status
1286
+ update_processing_status(processing_id, {
1287
+ "status": "processing",
1288
+ "message": "Processing document...",
1289
+ "domain": request.domain,
1290
+ "file_path": file_path
1291
+ })
1292
+
1293
+ # Get RAG instance
1294
+ rag = await get_rag_instance(request.domain)
1295
+
1296
+ # Check if we have a content list with images (advanced HTML parsing)
1297
+ content_list = fetch_result.get("content_list")
1298
+ images_count = fetch_result.get("images_count", 0)
1299
+
1300
+ if content_list and len(content_list) > 0 and images_count > 0:
1301
+ # Advanced pathway: Process pre-parsed content list with images
1302
+ logger.info(f"[URL UPLOAD] Using advanced processing: {len(content_list)} blocks, {images_count} images")
1303
+ result = await asyncio.wait_for(
1304
+ rag.process_content_list_direct(
1305
+ content_list=content_list,
1306
+ source_identifier=request.url,
1307
+ enable_image_processing=True
1308
+ ),
1309
+ timeout=300.0 # 5 minute timeout for processing
1310
+ )
1311
+ else:
1312
+ # Standard pathway: Process as regular document (PDF or text-only HTML)
1313
+ logger.info("[URL UPLOAD] Using standard document processing")
1314
+ result = await asyncio.wait_for(
1315
+ rag.process_document_complete(file_path),
1316
+ timeout=300.0 # 5 minute timeout for processing
1317
+ )
1318
+
1319
+ # Check result and update status
1320
+ # Note: process_document_complete returns None on success (not a dict)
1321
+ if result is None or (isinstance(result, dict) and result.get("success") is not False):
1322
+ logger.info(f"[URL UPLOAD] ✓ Successfully processed: {request.url}")
1323
+ update_processing_status(processing_id, {
1324
+ "status": "completed",
1325
+ "message": "Document processed successfully",
1326
+ "domain": request.domain,
1327
+ "file_path": file_path,
1328
+ "completed_at": datetime.now().isoformat()
1329
+ })
1330
+ else:
1331
+ error_msg = result.get('error', 'Unknown processing error') if isinstance(result, dict) else "Processing returned error"
1332
+ logger.error(f"[URL UPLOAD] ✗ Processing failed: {error_msg}")
1333
+ update_processing_status(processing_id, {
1334
+ "status": "failed",
1335
+ "message": f"Processing failed: {error_msg}",
1336
+ "domain": request.domain,
1337
+ "error": error_msg
1338
+ })
1339
+
1340
+ except asyncio.TimeoutError:
1341
+ logger.error(f"[URL UPLOAD] ✗ Timeout processing {request.url}")
1342
+ update_processing_status(processing_id, {
1343
+ "status": "failed",
1344
+ "message": "Processing timeout",
1345
+ "domain": request.domain,
1346
+ "error": "Timeout"
1347
+ })
1348
+ except Exception as e:
1349
+ logger.error(f"[URL UPLOAD] ✗ Error processing {request.url}: {e}", exc_info=True)
1350
+ update_processing_status(processing_id, {
1351
+ "status": "failed",
1352
+ "message": f"Error: {str(e)}",
1353
+ "domain": request.domain,
1354
+ "error": str(e)
1355
+ })
1356
+
1357
+ background_tasks.add_task(fetch_and_process_url)
1358
+
1359
+ return UploadResponse(
1360
+ success=True,
1361
+ message="URL queued for fetching and processing",
1362
+ file_name=request.url,
1363
+ domain=request.domain,
1364
+ processing_id=processing_id
1365
+ )
1366
+
1367
+ except HTTPException:
1368
+ raise
1369
+ except Exception as e:
1370
+ logger.error(f"URL upload error: {e}", exc_info=True)
1371
+ raise HTTPException(status_code=500, detail=f"URL upload failed: {str(e)}")
1372
+
1373
+
1374
+ @app.post("/query/stream")
1375
+ async def query_documents_stream(request: QueryRequest):
1376
+ """
1377
+ Stream query responses with real-time token generation and verification.
1378
+
1379
+ This endpoint provides Server-Sent Events (SSE) streaming for real-time
1380
+ response generation while maintaining dual-LLM verification.
1381
+ Supports web search augmentation when enabled.
1382
+ """
1383
+ logger.info(f"Streaming query request: '{request.query[:50]}...' in domain: {request.domain}, web_search: {request.enable_web_search}, web_only: {request.web_search_only}")
1384
+
1385
+ async def generate_sse():
1386
+ """Generate Server-Sent Events stream"""
1387
+ import json
1388
+
1389
+ try:
1390
+ conversation_id = request.conversation_id or f"conv_{uuid.uuid4()}"
1391
+
1392
+ # Handle web search only mode
1393
+ if request.web_search_only:
1394
+ if not web_searcher:
1395
+ error_data = {"type": "error", "content": {"message": "Web search not available. Set TAVILY_API_KEY."}, "done": True}
1396
+ yield f"event: error\ndata: {json.dumps(error_data)}\n\n"
1397
+ return
1398
+
1399
+ logger.info("Using web search only mode (streaming)")
1400
+ try:
1401
+ web_results = await web_searcher.search(request.query, max_results=5)
1402
+ web_context = web_searcher.format_results_for_llm(web_results)
1403
+
1404
+ # Synthesize answer using Gemini (streaming simulation)
1405
+ logger.info("Synthesizing web search results with Gemini (streaming)")
1406
+ answer = await synthesize_web_results_with_gemini(
1407
+ query=request.query,
1408
+ web_context=web_context,
1409
+ rag_context=None
1410
+ )
1411
+
1412
+ # Stream the answer word by word
1413
+ words = answer.split()
1414
+ for i, word in enumerate(words):
1415
+ token = word + " " if i < len(words) - 1 else word
1416
+ data = {"type": "token", "content": token, "done": False}
1417
+ yield f"event: token\ndata: {json.dumps(data)}\n\n"
1418
+ await asyncio.sleep(0.01) # Small delay for streaming effect
1419
+
1420
+ # Send completion event
1421
+ yield f"event: done\ndata: {json.dumps({'message': 'Stream complete', 'conversation_id': conversation_id})}\n\n"
1422
+ return
1423
+
1424
+ except Exception as e:
1425
+ logger.error(f"Web search only error: {e}", exc_info=True)
1426
+ error_data = {"type": "error", "content": {"message": f"Web search failed: {str(e)}"}, "done": True}
1427
+ yield f"event: error\ndata: {json.dumps(error_data)}\n\n"
1428
+ return
1429
+
1430
+ # Get RAG instance
1431
+ rag = await get_rag_instance(request.domain)
1432
+
1433
+ # Determine optimal parameters based on fast_mode
1434
+ if request.fast_mode:
1435
+ # Optimized parameters for 2-3x speedup
1436
+ top_k = request.top_k if request.top_k is not None else 20
1437
+ chunk_top_k = 10
1438
+ max_entity_tokens = 4000
1439
+ max_relation_tokens = 6000
1440
+ max_total_tokens = 20000
1441
+ logger.info(f"⚡ Fast mode enabled for streaming: top_k={top_k}, chunk_top_k={chunk_top_k}")
1442
+ else:
1443
+ # Default parameters (higher quality, slower)
1444
+ top_k = request.top_k if request.top_k is not None else 40
1445
+ chunk_top_k = 20
1446
+ max_entity_tokens = 6000
1447
+ max_relation_tokens = 8000
1448
+ max_total_tokens = 30000
1449
+
1450
+ # Log toggle settings
1451
+ logger.info(f"Query settings - improvement: {request.enable_query_improvement}, verification: {request.enable_verification_check}, web_search: {request.enable_web_search}")
1452
+
1453
+ # If web search augmentation is enabled, we need to collect the RAG answer first
1454
+ # then augment with web search
1455
+ if request.enable_web_search and web_searcher:
1456
+ logger.info("Web search augmentation enabled for streaming")
1457
+
1458
+ # Collect RAG answer first
1459
+ rag_answer_buffer = []
1460
+ async for chunk in rag.aquery_stream(
1461
+ query=request.query,
1462
+ mode=request.mode,
1463
+ enable_verification=request.enable_verification_check,
1464
+ enable_query_improvement=request.enable_query_improvement,
1465
+ top_k=top_k,
1466
+ chunk_top_k=chunk_top_k,
1467
+ max_entity_tokens=max_entity_tokens,
1468
+ max_relation_tokens=max_relation_tokens,
1469
+ max_total_tokens=max_total_tokens
1470
+ ):
1471
+ chunk_type = chunk.get("type", "token")
1472
+ content = chunk.get("content", "")
1473
+ done = chunk.get("done", False)
1474
+
1475
+ if chunk_type == "token":
1476
+ # Stream token and collect it
1477
+ rag_answer_buffer.append(content)
1478
+ data = {"type": "token", "content": content, "done": False}
1479
+ yield f"event: token\ndata: {json.dumps(data)}\n\n"
1480
+
1481
+ elif chunk_type == "verification":
1482
+ # Send verification metadata
1483
+ data = {"type": "verification", "content": content, "done": done}
1484
+ yield f"event: verification\ndata: {json.dumps(data)}\n\n"
1485
+
1486
+ elif chunk_type == "error":
1487
+ # Send error
1488
+ data = {"type": "error", "content": content, "done": True}
1489
+ yield f"event: error\ndata: {json.dumps(data)}\n\n"
1490
+ return
1491
+
1492
+ # Now perform web search and synthesis
1493
+ try:
1494
+ rag_answer = "".join(rag_answer_buffer)
1495
+ logger.info("Performing web search to augment RAG answer...")
1496
+ web_results = await web_searcher.search(request.query, max_results=5)
1497
+
1498
+ if web_results.get("results"):
1499
+ web_context = web_searcher.format_results_for_llm(web_results)
1500
+
1501
+ # Synthesize combined answer
1502
+ logger.info("Synthesizing RAG + web results with Gemini")
1503
+ synthesized_answer = await synthesize_web_results_with_gemini(
1504
+ query=request.query,
1505
+ web_context=web_context,
1506
+ rag_context=rag_answer
1507
+ )
1508
+
1509
+ # Clear the previous RAG answer and stream the synthesized one
1510
+ # Send a newline separator
1511
+ data = {"type": "token", "content": "\n\n---\n\n**Enhanced with Web Search:**\n\n", "done": False}
1512
+ yield f"event: token\ndata: {json.dumps(data)}\n\n"
1513
+
1514
+ # Stream synthesized answer
1515
+ words = synthesized_answer.split()
1516
+ for i, word in enumerate(words):
1517
+ token = word + " " if i < len(words) - 1 else word
1518
+ data = {"type": "token", "content": token, "done": False}
1519
+ yield f"event: token\ndata: {json.dumps(data)}\n\n"
1520
+ await asyncio.sleep(0.01)
1521
+
1522
+ except Exception as e:
1523
+ logger.error(f"Web search augmentation error: {e}", exc_info=True)
1524
+ # Continue without web augmentation
1525
+ pass
1526
+
1527
+ # Send completion event
1528
+ yield f"event: done\ndata: {json.dumps({'message': 'Stream complete', 'conversation_id': conversation_id})}\n\n"
1529
+
1530
+ else:
1531
+ # Standard RAG streaming without web search
1532
+ async for chunk in rag.aquery_stream(
1533
+ query=request.query,
1534
+ mode=request.mode,
1535
+ enable_verification=request.enable_verification_check,
1536
+ enable_query_improvement=request.enable_query_improvement,
1537
+ top_k=top_k,
1538
+ chunk_top_k=chunk_top_k,
1539
+ max_entity_tokens=max_entity_tokens,
1540
+ max_relation_tokens=max_relation_tokens,
1541
+ max_total_tokens=max_total_tokens
1542
+ ):
1543
+ chunk_type = chunk.get("type", "token")
1544
+ content = chunk.get("content", "")
1545
+ done = chunk.get("done", False)
1546
+
1547
+ if chunk_type == "token":
1548
+ # Stream token
1549
+ data = {"type": "token", "content": content, "done": done}
1550
+ yield f"event: token\ndata: {json.dumps(data)}\n\n"
1551
+
1552
+ elif chunk_type == "verification":
1553
+ # Send verification metadata
1554
+ data = {"type": "verification", "content": content, "done": done}
1555
+ yield f"event: verification\ndata: {json.dumps(data)}\n\n"
1556
+
1557
+ elif chunk_type == "error":
1558
+ # Send error
1559
+ data = {"type": "error", "content": content, "done": True}
1560
+ yield f"event: error\ndata: {json.dumps(data)}\n\n"
1561
+ break
1562
+
1563
+ # Send completion event
1564
+ yield f"event: done\ndata: {json.dumps({'message': 'Stream complete', 'conversation_id': conversation_id})}\n\n"
1565
+
1566
+ except Exception as e:
1567
+ logger.error(f"Streaming error: {e}", exc_info=True)
1568
+ error_data = {"type": "error", "content": {"message": str(e)}, "done": True}
1569
+ yield f"event: error\ndata: {json.dumps(error_data)}\n\n"
1570
+
1571
+ return StreamingResponse(
1572
+ generate_sse(),
1573
+ media_type="text/event-stream",
1574
+ headers={
1575
+ "Cache-Control": "no-cache",
1576
+ "Connection": "keep-alive",
1577
+ "X-Accel-Buffering": "no"
1578
+ }
1579
+ )
1580
+
1581
+
1582
+ @app.post("/query", response_model=QueryResponse)
1583
+ async def query_documents(request: QueryRequest):
1584
+ """Query documents with enhanced RAG capabilities and optional web search."""
1585
+ start_time = time.time()
1586
+ logger.info(f"Query request: '{request.query[:50]}...' in domain: {request.domain}, mode: {request.mode}, fast_mode: {request.fast_mode}")
1587
+
1588
+ try:
1589
+ conversation_id = request.conversation_id or f"conv_{uuid.uuid4()}"
1590
+ conversation_history = conversation_histories.get(conversation_id, [])
1591
+
1592
+ # Generate cache key for non-web-search queries
1593
+ cache_key = None
1594
+ if request.enable_cache and not request.web_search_only and not request.enable_web_search:
1595
+ cache_data = f"{request.query}:{request.domain}:{request.mode}:{request.fast_mode}:{request.enable_verification}"
1596
+ cache_key = hashlib.md5(cache_data.encode()).hexdigest()
1597
+
1598
+ # Check cache
1599
+ if cache_key in query_cache:
1600
+ cached_response = query_cache[cache_key]
1601
+ logger.info(f"✓ Cache hit for query (saved {time.time() - start_time:.2f}s)")
1602
+ # Update conversation ID in cached response
1603
+ cached_response.conversation_id = conversation_id
1604
+ return cached_response
1605
+
1606
+ # Handle web search only mode
1607
+ if request.web_search_only:
1608
+ if not web_searcher:
1609
+ raise HTTPException(503, "Web search not available. Set TAVILY_API_KEY.")
1610
+
1611
+ logger.info("Using web search only mode")
1612
+ web_results = await web_searcher.search(request.query, max_results=5)
1613
+
1614
+ # Format results for LLM processing
1615
+ web_context = web_searcher.format_results_for_llm(web_results)
1616
+
1617
+ # Synthesize answer using Gemini
1618
+ logger.info("Synthesizing web search results with Gemini")
1619
+ answer = await synthesize_web_results_with_gemini(
1620
+ query=request.query,
1621
+ web_context=web_context,
1622
+ rag_context=None
1623
+ )
1624
+
1625
+ result = {
1626
+ "answer": answer,
1627
+ "original_query": request.query,
1628
+ "improved_query": request.query,
1629
+ "verification_passed": False,
1630
+ "verification_score": 0,
1631
+ "web_search_performed": True,
1632
+ "sources": [{"url": r.get("url"), "title": r.get("title")} for r in web_results.get("results", [])]
1633
+ }
1634
+ else:
1635
+ # Standard RAG query with optimized parameters
1636
+ rag = await get_rag_instance(request.domain)
1637
+
1638
+ # Determine optimal parameters based on fast_mode
1639
+ if request.fast_mode:
1640
+ # Optimized parameters for 2-3x speedup
1641
+ top_k = request.top_k if request.top_k is not None else 20
1642
+ chunk_top_k = 10
1643
+ max_entity_tokens = 4000
1644
+ max_relation_tokens = 6000
1645
+ max_total_tokens = 20000
1646
+ logger.info(f"⚡ Fast mode enabled: top_k={top_k}, chunk_top_k={chunk_top_k}")
1647
+ else:
1648
+ # Default parameters (higher quality, slower)
1649
+ top_k = request.top_k if request.top_k is not None else 40
1650
+ chunk_top_k = 20
1651
+ max_entity_tokens = 6000
1652
+ max_relation_tokens = 8000
1653
+ max_total_tokens = 30000
1654
+
1655
+ # Build query parameters
1656
+ from lightrag import QueryParam
1657
+ query_kwargs = {
1658
+ "top_k": top_k,
1659
+ "chunk_top_k": chunk_top_k,
1660
+ "max_entity_tokens": max_entity_tokens,
1661
+ "max_relation_tokens": max_relation_tokens,
1662
+ "max_total_tokens": max_total_tokens,
1663
+ }
1664
+
1665
+ # Log toggle settings
1666
+ logger.info(f"Query settings - improvement: {request.enable_query_improvement}, verification: {request.enable_verification_check}")
1667
+
1668
+ result = await rag.aquery(
1669
+ query=request.query,
1670
+ mode=request.mode,
1671
+ enable_query_improvement=request.enable_query_improvement, # Use toggle instead of always true
1672
+ enable_verification=request.enable_verification_check, # Use toggle instead of always request.enable_verification
1673
+ return_verification_info=request.return_metadata,
1674
+ **query_kwargs
1675
+ )
1676
+
1677
+ # Augment with web search if requested
1678
+ if request.enable_web_search and web_searcher:
1679
+ logger.info("Augmenting RAG results with web search")
1680
+ try:
1681
+ rag_answer = result.get("answer") if isinstance(result, dict) else str(result)
1682
+ web_results = await web_searcher.search(request.query, max_results=5)
1683
+
1684
+ if web_results.get("results"):
1685
+ # Format web results for LLM
1686
+ web_context = web_searcher.format_results_for_llm(web_results)
1687
+
1688
+ # Synthesize combined answer using Gemini
1689
+ logger.info("Synthesizing RAG + web results with Gemini")
1690
+ synthesized_answer = await synthesize_web_results_with_gemini(
1691
+ query=request.query,
1692
+ web_context=web_context,
1693
+ rag_context=rag_answer
1694
+ )
1695
+
1696
+ if isinstance(result, dict):
1697
+ result["answer"] = synthesized_answer
1698
+ result["web_search_performed"] = True
1699
+ result["web_sources"] = [{"url": r.get("url"), "title": r.get("title")} for r in web_results.get("results", [])]
1700
+ else:
1701
+ result = synthesized_answer
1702
+ except Exception as e:
1703
+ logger.error(f"Web search augmentation error: {e}")
1704
+ # Continue with RAG-only result
1705
+
1706
+ # Handle None result
1707
+ if result is None:
1708
+ answer = "I couldn't find any relevant information in the knowledge base to answer your question. Please try rephrasing your question or ensure that relevant documents have been uploaded."
1709
+ metadata = {
1710
+ "original_query": request.query,
1711
+ "improved_query": request.query,
1712
+ "verification_passed": False,
1713
+ "verification_score": 0,
1714
+ }
1715
+ query_improved = False
1716
+ verification_performed = False
1717
+ confidence = 0.0
1718
+ elif isinstance(result, dict):
1719
+ answer = result.get("answer", "No answer found.")
1720
+ metadata = {
1721
+ "original_query": result.get("original_query", request.query),
1722
+ "improved_query": result.get("improved_query", request.query),
1723
+ "verification_passed": result.get("verification_passed", False),
1724
+ "verification_score": result.get("verification_score", 0),
1725
+ }
1726
+ query_improved = result.get("improved_query") != result.get("original_query")
1727
+ verification_performed = result.get("verification_passed", False)
1728
+ confidence = result.get("verification_score", 0) / 10.0
1729
+ else:
1730
+ answer = str(result) if result else "No answer found."
1731
+ metadata = {}
1732
+ query_improved = False
1733
+ verification_performed = False
1734
+ confidence = 1.0
1735
+
1736
+ conversation_history.extend([
1737
+ {"role": "user", "content": request.query},
1738
+ {"role": "assistant", "content": answer}
1739
+ ])
1740
+ conversation_histories[conversation_id] = conversation_history
1741
+
1742
+ response = QueryResponse(
1743
+ answer=answer,
1744
+ sources=[], # TODO: Extract from result if available
1745
+ confidence_score=confidence,
1746
+ query_improved=query_improved,
1747
+ verification_performed=verification_performed,
1748
+ conversation_id=conversation_id,
1749
+ metadata=metadata if request.return_metadata else None
1750
+ )
1751
+
1752
+ # Store in cache if enabled (non-web search queries only)
1753
+ if cache_key and request.enable_cache:
1754
+ query_cache[cache_key] = response
1755
+ logger.info(f"✓ Cached query result (key: {cache_key[:16]}...)")
1756
+
1757
+ # Track performance metrics
1758
+ query_time = time.time() - start_time
1759
+ performance_metrics["query_times"].append(query_time)
1760
+ # Keep only last 100 metrics
1761
+ if len(performance_metrics["query_times"]) > 100:
1762
+ performance_metrics["query_times"] = performance_metrics["query_times"][-100:]
1763
+
1764
+ logger.info(f"Query completed in {query_time:.2f}s (fast_mode: {request.fast_mode}, confidence: {confidence:.2f})")
1765
+ return response
1766
+ except HTTPException:
1767
+ raise
1768
+ except Exception as e:
1769
+ logger.error(f"Query error: {e}", exc_info=True)
1770
+ raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
1771
+
1772
+
1773
+ @app.get("/conversation/{conversation_id}")
1774
+ async def get_conversation(conversation_id: str):
1775
+ """Get conversation history by ID."""
1776
+ if conversation_id not in conversation_histories:
1777
+ raise HTTPException(status_code=404, detail="Conversation not found")
1778
+ return {
1779
+ "conversation_id": conversation_id,
1780
+ "messages": conversation_histories[conversation_id],
1781
+ }
1782
+
1783
+
1784
+ @app.delete("/conversation/{conversation_id}")
1785
+ async def clear_conversation(conversation_id: str):
1786
+ """Clear conversation history."""
1787
+ if conversation_id in conversation_histories:
1788
+ del conversation_histories[conversation_id]
1789
+ logger.info(f"Cleared conversation: {conversation_id}")
1790
+ return {"success": True, "message": "Conversation cleared"}
1791
+ raise HTTPException(status_code=404, detail="Conversation not found")
1792
+
1793
+
1794
+ @app.delete("/clear/{domain}")
1795
+ async def clear_domain_data(domain: str):
1796
+ """WARNING: Deletes all processed documents and indices for the domain."""
1797
+ logger.warning(f"Clear domain data request: {domain}")
1798
+ try:
1799
+ if domain not in DOMAIN_CONFIGS:
1800
+ raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")
1801
+
1802
+ if domain in rag_instances:
1803
+ await rag_instances[domain].finalize_storages()
1804
+ del rag_instances[domain]
1805
+
1806
+ domain_storage = STORAGE_DIR / domain
1807
+ if domain_storage.exists():
1808
+ import shutil
1809
+ shutil.rmtree(domain_storage)
1810
+ domain_storage.mkdir(parents=True, exist_ok=True)
1811
+
1812
+ logger.info(f"Domain data cleared: {domain}")
1813
+ return {"success": True, "message": f"All data cleared for domain '{domain}'"}
1814
+ except Exception as e:
1815
+ logger.error(f"Clear domain error: {e}", exc_info=True)
1816
+ raise HTTPException(status_code=500, detail=f"Failed to clear domain: {str(e)}")
1817
+
1818
+
1819
+ @app.get("/documents")
1820
+ async def list_documents(domain: str):
1821
+ """
1822
+ List all processed documents for a domain.
1823
+
1824
+ Only returns documents with status 'completed'. Documents still being
1825
+ processed are excluded to avoid confusion.
1826
+ """
1827
+ try:
1828
+ if domain not in DOMAIN_CONFIGS:
1829
+ raise HTTPException(400, f"Invalid domain. Valid: {list(DOMAIN_CONFIGS.keys())}")
1830
+
1831
+ documents = []
1832
+ domain_upload_dir = UPLOAD_DIR / domain
1833
+
1834
+ if domain_upload_dir.exists():
1835
+ for file_path in domain_upload_dir.glob("*"):
1836
+ if file_path.is_file():
1837
+ # Extract processing_id and filename
1838
+ filename = file_path.name
1839
+ parts = filename.split('_', 1)
1840
+ processing_id = parts[0] if len(parts) > 1 else ""
1841
+ display_name = parts[1] if len(parts) > 1 else filename
1842
+
1843
+ # Check if document is actually completed
1844
+ # Skip if still processing, queued, or fetching
1845
+ if processing_id in processing_status:
1846
+ status = processing_status[processing_id].get('status')
1847
+ if status in ['processing', 'queued', 'fetching']:
1848
+ # Document is still being processed, skip it
1849
+ logger.debug(f"Skipping document {processing_id} - status: {status}")
1850
+ continue
1851
+ elif status == 'failed':
1852
+ # Optionally skip failed documents or include them
1853
+ # For now, skip them to only show successfully processed docs
1854
+ continue
1855
+
1856
+ # Only include completed documents or legacy ones without status
1857
+ documents.append({
1858
+ "id": processing_id,
1859
+ "name": display_name,
1860
+ "domain": domain,
1861
+ "status": "processed",
1862
+ "uploadedAt": str(file_path.stat().st_mtime)
1863
+ })
1864
+
1865
+ return {"documents": documents}
1866
+ except HTTPException:
1867
+ raise
1868
+ except Exception as e:
1869
+ logger.error(f"Error listing documents: {e}", exc_info=True)
1870
+ raise HTTPException(status_code=500, detail=f"Failed to list documents: {str(e)}")
1871
+
1872
+
1873
+ @app.get("/performance-metrics")
1874
+ async def get_performance_metrics():
1875
+ """Get performance metrics for queries and document processing."""
1876
+ try:
1877
+ query_times = performance_metrics.get("query_times", [])
1878
+ processing_times = performance_metrics.get("processing_times", [])
1879
+
1880
+ # Calculate statistics
1881
+ def calc_stats(times):
1882
+ if not times:
1883
+ return {"count": 0, "avg": 0, "min": 0, "max": 0}
1884
+ return {
1885
+ "count": len(times),
1886
+ "avg": sum(times) / len(times),
1887
+ "min": min(times),
1888
+ "max": max(times)
1889
+ }
1890
+
1891
+ return {
1892
+ "query_metrics": calc_stats(query_times),
1893
+ "processing_metrics": calc_stats(processing_times),
1894
+ "cache_stats": {
1895
+ "size": len(query_cache),
1896
+ "max_size": query_cache.maxsize,
1897
+ "ttl_seconds": query_cache.ttl
1898
+ }
1899
+ }
1900
+ except Exception as e:
1901
+ logger.error(f"Error getting performance metrics: {e}", exc_info=True)
1902
+ return {
1903
+ "query_metrics": {"count": 0, "avg": 0, "min": 0, "max": 0},
1904
+ "processing_metrics": {"count": 0, "avg": 0, "min": 0, "max": 0},
1905
+ "cache_stats": {"size": 0, "max_size": 100, "ttl_seconds": 300}
1906
+ }
1907
+
1908
+
1909
+ @app.get("/status/{processing_id}")
1910
+ async def get_processing_status(processing_id: str):
1911
+ """
1912
+ Get the processing status of a document.
1913
+
1914
+ Now uses persistent status storage that survives backend restarts.
1915
+ The status is loaded from disk on startup and kept in sync.
1916
+ """
1917
+ try:
1918
+ # Check the persistent status tracker (loaded from disk on startup)
1919
+ if processing_id in processing_status:
1920
+ status_info = processing_status[processing_id]
1921
+ logger.debug(f"Status check for {processing_id}: {status_info.get('status')}")
1922
+ return {
1923
+ "processing_id": processing_id,
1924
+ **status_info
1925
+ }
1926
+
1927
+ # If not in status tracker, check if this is a legacy upload
1928
+ # (uploaded before persistent status was implemented)
1929
+ for domain in DOMAIN_CONFIGS.keys():
1930
+ domain_upload_dir = UPLOAD_DIR / domain
1931
+ if domain_upload_dir.exists():
1932
+ for file_path in domain_upload_dir.glob(f"{processing_id}_*"):
1933
+ if file_path.is_file():
1934
+ # Legacy upload - return completed status
1935
+ # but don't add to persistent status to avoid confusion
1936
+ return {
1937
+ "processing_id": processing_id,
1938
+ "status": "completed",
1939
+ "message": "Document processed successfully (legacy upload)"
1940
+ }
1941
+
1942
+ # If not found anywhere, status is unknown
1943
+ # This typically means the processing_id is invalid
1944
+ return {
1945
+ "processing_id": processing_id,
1946
+ "status": "unknown",
1947
+ "message": "Processing ID not found. It may be invalid or expired."
1948
+ }
1949
+ except Exception as e:
1950
+ logger.error(f"Error checking status: {e}", exc_info=True)
1951
+ return {
1952
+ "processing_id": processing_id,
1953
+ "status": "error",
1954
+ "message": f"Error checking status: {str(e)}",
1955
+ "error": str(e)
1956
+ }
1957
+
1958
+
1959
+ @app.delete("/documents/{doc_id}")
1960
+ async def delete_document(doc_id: str):
1961
+ """
1962
+ Delete a processed document completely including all RAG data.
1963
+
1964
+ This endpoint performs comprehensive deletion of:
1965
+ - Knowledge graph entities and relationships
1966
+ - Embedding vectors (chunks, entities, relationships)
1967
+ - Text chunks and metadata
1968
+ - Document status records
1969
+ - Physical upload files
1970
+ - Parser output files
1971
+
1972
+ Returns detailed deletion report with verification.
1973
+ """
1974
+ try:
1975
+ from raganything.deletion_verifier import delete_document_complete
1976
+
1977
+ logger.info(f"Delete document request: {doc_id}")
1978
+
1979
+ # Step 1: Search for the document in all domains
1980
+ found_domain = None
1981
+ for domain in DOMAIN_CONFIGS.keys():
1982
+ domain_upload_dir = UPLOAD_DIR / domain
1983
+ if domain_upload_dir.exists():
1984
+ for file_path in domain_upload_dir.glob(f"{doc_id}_*"):
1985
+ if file_path.is_file():
1986
+ found_domain = domain
1987
+ break
1988
+ if found_domain:
1989
+ break
1990
+
1991
+ if not found_domain:
1992
+ logger.warning(f"Document {doc_id} not found in any domain")
1993
+ raise HTTPException(status_code=404, detail="Document not found")
1994
+
1995
+ logger.info(f"Found document {doc_id} in domain: {found_domain}")
1996
+
1997
+ # Step 2: Get RAG instance and find the actual document ID in storage
1998
+ rag = await get_rag_instance(found_domain)
1999
+
2000
+ # Find document in doc_status by processing_id prefix
2001
+ doc_to_delete = None
2002
+ doc_status_file = STORAGE_DIR / found_domain / "kv_store_doc_status.json"
2003
+ if doc_status_file.exists():
2004
+ import json
2005
+ with open(doc_status_file, 'r') as f:
2006
+ doc_status = json.load(f)
2007
+
2008
+ # Find document by file_path containing doc_id
2009
+ for doc_key, doc_info in doc_status.items():
2010
+ if 'file_path' in doc_info and doc_id in doc_info['file_path']:
2011
+ doc_to_delete = doc_key
2012
+ logger.info(f"Found document in storage: {doc_key}")
2013
+ break
2014
+
2015
+ if not doc_to_delete:
2016
+ logger.warning(f"Document {doc_id} not found in doc_status")
2017
+ # Still try to delete physical files
2018
+ doc_to_delete = doc_id
2019
+
2020
+ # Step 3: Collect files and directories to delete
2021
+ upload_files = list((UPLOAD_DIR / found_domain).glob(f"{doc_id}_*"))
2022
+ output_dir = BASE_DIR / "backend" / "output"
2023
+ output_paths = list(output_dir.glob(f"{doc_id}_*")) if output_dir.exists() else []
2024
+
2025
+ # Step 4: Perform complete deletion with verification
2026
+ deletion_report = await delete_document_complete(
2027
+ rag_instance=rag,
2028
+ doc_id=doc_to_delete,
2029
+ storage_dir=STORAGE_DIR / found_domain,
2030
+ upload_files=upload_files,
2031
+ output_dirs=output_paths
2032
+ )
2033
+
2034
+ # Step 5: Return detailed report
2035
+ if deletion_report.success:
2036
+ logger.info(
2037
+ f"Successfully deleted document {doc_id}: "
2038
+ f"{deletion_report.chunks_deleted} chunks, "
2039
+ f"{deletion_report.entities_deleted} entities, "
2040
+ f"{deletion_report.relationships_deleted} relationships, "
2041
+ f"{len(deletion_report.files_deleted)} files, "
2042
+ f"{len(deletion_report.directories_deleted)} directories"
2043
+ )
2044
+ return {
2045
+ "success": True,
2046
+ "message": "Document deleted completely with verification",
2047
+ "domain": found_domain,
2048
+ "report": deletion_report.to_dict()
2049
+ }
2050
+ else:
2051
+ logger.error(
2052
+ f"Document deletion completed with errors for {doc_id}: "
2053
+ f"{deletion_report.errors}"
2054
+ )
2055
+ return {
2056
+ "success": False,
2057
+ "message": "Document deletion completed with errors",
2058
+ "domain": found_domain,
2059
+ "report": deletion_report.to_dict()
2060
+ }
2061
+
2062
+ except HTTPException:
2063
+ raise
2064
+ except Exception as e:
2065
+ logger.error(f"Error deleting document {doc_id}: {e}", exc_info=True)
2066
+ raise HTTPException(
2067
+ status_code=500,
2068
+ detail=f"Failed to delete document: {str(e)}"
2069
+ )
2070
+
2071
+
2072
+ # =============================================================================
2073
+ # Main Entry Point
2074
+ # =============================================================================
2075
+
2076
+ if __name__ == "__main__":
2077
+ import uvicorn
2078
+ uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True, log_level="info")
backend/requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FastAPI Backend Requirements - Python 3.12 compatible
2
+
3
+ # Web Framework
4
+ fastapi>=0.104.0
5
+ uvicorn[standard]>=0.24.0
6
+ python-multipart>=0.0.6
7
+
8
+ # Google Gemini API
9
+ google-generativeai>=0.8.0
10
+
11
+ # Image Processing
12
+ Pillow>=10.0.0
13
+
14
+ # Environment Variables
15
+ python-dotenv>=1.0.0
16
+
17
+ # Web Search & URL Fetching
18
+ tavily-python>=0.3.0
19
+ requests>=2.31.0
20
+ beautifulsoup4>=4.12.0
21
+ markdownify>=0.11.0
22
+
23
+ # Additional dependencies
24
+ cachetools>=5.3.0
25
+ aiofiles>=23.0.0
26
+
27
+ # LightRAG - Using local modified version in /lightrag directory
28
+ # lightrag-hku>=1.4.0
backend/reranker.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reranking Module for RAG-Anything
3
+
4
+ Provides reranking functionality using:
5
+ 1. Gemini-based LLM reranking (free tier compatible)
6
+ 2. Cross-encoder style scoring
7
+ 3. Relevance-based reordering
8
+
9
+ Reranking is crucial for RAG systems because:
10
+ - Vector search (embeddings) finds semantically similar text but may miss subtle context
11
+ - LLMs can deeply understand query intent and document relevance
12
+ - Reranking improves answer quality by promoting truly relevant chunks to the top
13
+
14
+ Author: RAG-Anything Team
15
+ Version: 1.0.0
16
+ """
17
+
18
+ import asyncio
19
+ import logging
20
+ import re
21
+ from typing import List, Dict, Any, Optional, Callable
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class GeminiReranker:
27
+ """
28
+ Reranker using Gemini API for semantic relevance scoring
29
+
30
+ This reranker takes chunks from vector search and re-scores them
31
+ based on deep semantic understanding using an LLM.
32
+
33
+ Why reranking matters:
34
+ ---------------------
35
+ Vector embeddings alone can miss:
36
+ - Negations ("not effective" vs "effective")
37
+ - Context dependencies ("aspirin for elderly" vs "aspirin for children")
38
+ - Query intent ("what causes X" vs "how to prevent X")
39
+
40
+ LLM reranking provides:
41
+ - Contextual understanding of the query
42
+ - Semantic relevance beyond keyword matching
43
+ - Better handling of complex queries
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ llm_func: Optional[Callable] = None,
49
+ model_name: str = "models/gemini-2.5-flash",
50
+ batch_size: int = 5,
51
+ temperature: float = 0.1
52
+ ):
53
+ """
54
+ Initialize Gemini-based reranker
55
+
56
+ Args:
57
+ llm_func: Optional LLM function to use for reranking
58
+ model_name: Gemini model to use (default: flash for speed)
59
+ batch_size: Number of chunks to process in parallel
60
+ temperature: Temperature for relevance scoring (low=consistent)
61
+ """
62
+ self.llm_func = llm_func
63
+ self.model_name = model_name
64
+ self.batch_size = batch_size
65
+ self.temperature = temperature
66
+
67
+ async def rerank(
68
+ self,
69
+ query: str,
70
+ chunks: List[Dict[str, Any]],
71
+ top_k: Optional[int] = None
72
+ ) -> List[Dict[str, Any]]:
73
+ """
74
+ Rerank chunks based on relevance to query
75
+
76
+ Process:
77
+ 1. Take top chunks from vector search (e.g., top 50)
78
+ 2. Score each chunk's relevance using LLM (0-10 scale)
79
+ 3. Re-order by relevance score
80
+ 4. Return top_k most relevant chunks
81
+
82
+ Args:
83
+ query: Search query
84
+ chunks: List of chunks with 'content' field
85
+ top_k: Return only top K results (None = return all, reranked)
86
+
87
+ Returns:
88
+ List of reranked chunks with 'relevance_score' field added
89
+ """
90
+ if not chunks:
91
+ logger.warning("No chunks to rerank")
92
+ return []
93
+
94
+ if len(chunks) == 1:
95
+ logger.debug("Only one chunk, skipping reranking")
96
+ chunks[0]['relevance_score'] = 1.0
97
+ return chunks
98
+
99
+ logger.info(f"Reranking {len(chunks)} chunks for query: {query[:50]}...")
100
+
101
+ try:
102
+ # Score all chunks in batches
103
+ scored_chunks = await self._score_chunks_batch(query, chunks)
104
+
105
+ # Sort by relevance score (highest first)
106
+ scored_chunks.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
107
+
108
+ # Return top_k if specified
109
+ if top_k:
110
+ scored_chunks = scored_chunks[:top_k]
111
+
112
+ logger.info(
113
+ f"Reranking complete. Top score: {scored_chunks[0].get('relevance_score', 0):.2f}, "
114
+ f"Bottom score: {scored_chunks[-1].get('relevance_score', 0):.2f}"
115
+ )
116
+
117
+ return scored_chunks
118
+
119
+ except Exception as e:
120
+ logger.error(f"Error during reranking: {e}", exc_info=True)
121
+ # Return original order on error
122
+ return chunks[:top_k] if top_k else chunks
123
+
124
+ async def _score_chunks_batch(
125
+ self,
126
+ query: str,
127
+ chunks: List[Dict[str, Any]]
128
+ ) -> List[Dict[str, Any]]:
129
+ """
130
+ Score chunks in batches for efficiency
131
+
132
+ Args:
133
+ query: Search query
134
+ chunks: List of chunks to score
135
+
136
+ Returns:
137
+ Chunks with relevance_score added
138
+ """
139
+ scored_chunks = []
140
+
141
+ # Process in batches to avoid rate limits
142
+ for i in range(0, len(chunks), self.batch_size):
143
+ batch = chunks[i:i + self.batch_size]
144
+
145
+ # Score batch concurrently
146
+ tasks = [self._score_chunk(query, chunk) for chunk in batch]
147
+ batch_scores = await asyncio.gather(*tasks, return_exceptions=True)
148
+
149
+ # Collect results
150
+ for chunk, score_result in zip(batch, batch_scores):
151
+ if isinstance(score_result, Exception):
152
+ logger.warning(f"Failed to score chunk: {score_result}")
153
+ chunk['relevance_score'] = 0.0
154
+ else:
155
+ chunk['relevance_score'] = score_result
156
+
157
+ scored_chunks.append(chunk)
158
+
159
+ return scored_chunks
160
+
161
+ async def _score_chunk(
162
+ self,
163
+ query: str,
164
+ chunk: Dict[str, Any]
165
+ ) -> float:
166
+ """
167
+ Score a single chunk's relevance to the query using LLM
168
+
169
+ Prompt engineering approach:
170
+ - Ask LLM to act as a relevance judge
171
+ - Provide clear scoring criteria (0-10 scale)
172
+ - Extract numeric score from response
173
+
174
+ Args:
175
+ query: Search query
176
+ chunk: Chunk dictionary with 'content' field
177
+
178
+ Returns:
179
+ Relevance score (0-10)
180
+ """
181
+ content = chunk.get('content', '')
182
+ if not content:
183
+ return 0.0
184
+
185
+ # Truncate very long chunks to avoid token limits
186
+ max_content_length = 1000
187
+ if len(content) > max_content_length:
188
+ content = content[:max_content_length] + "..."
189
+
190
+ # Prompt for relevance scoring
191
+ prompt = f"""You are a relevance judge. Score how relevant the following passage is to answering the query.
192
+
193
+ Query: {query}
194
+
195
+ Passage:
196
+ {content}
197
+
198
+ Scoring criteria:
199
+ 10 = Directly answers the query with specific, relevant information
200
+ 8-9 = Highly relevant, provides useful context
201
+ 6-7 = Somewhat relevant, contains related information
202
+ 4-5 = Tangentially related, limited usefulness
203
+ 2-3 = Barely related, mostly off-topic
204
+ 0-1 = Completely irrelevant
205
+
206
+ Respond with ONLY a number from 0-10. No explanation needed."""
207
+
208
+ try:
209
+ # Call LLM for scoring
210
+ if self.llm_func:
211
+ response = await self.llm_func(
212
+ prompt=prompt,
213
+ temperature=self.temperature,
214
+ max_tokens=50 # Increased from 10 to allow for complete score responses
215
+ )
216
+ else:
217
+ # Fallback: no reranking
218
+ return 5.0
219
+
220
+ # Extract numeric score from response
221
+ score = self._extract_score(response)
222
+ return score
223
+
224
+ except Exception as e:
225
+ logger.error(f"Error scoring chunk: {e}")
226
+ return 5.0 # Default mid-range score on error
227
+
228
+ def _extract_score(self, response: str) -> float:
229
+ """
230
+ Extract numeric score from LLM response
231
+
232
+ Handles various response formats:
233
+ - "8.5"
234
+ - "Score: 9"
235
+ - "The relevance is 7/10"
236
+ - "8"
237
+
238
+ Args:
239
+ response: LLM response text
240
+
241
+ Returns:
242
+ Extracted score (0-10), defaults to 5.0 if parsing fails
243
+ """
244
+ try:
245
+ # Remove whitespace
246
+ response = response.strip()
247
+
248
+ # Try to find a number (int or float) in the response
249
+ # Pattern matches: "8", "8.5", "9/10", "Score: 7", etc.
250
+ number_pattern = r'(\d+\.?\d*)'
251
+ matches = re.findall(number_pattern, response)
252
+
253
+ if matches:
254
+ # Take the first number found
255
+ score = float(matches[0])
256
+
257
+ # Normalize to 0-10 range
258
+ score = max(0.0, min(10.0, score))
259
+
260
+ return score
261
+ else:
262
+ logger.warning(f"Could not extract score from response: {response}")
263
+ return 5.0
264
+
265
+ except Exception as e:
266
+ logger.error(f"Error extracting score: {e}")
267
+ return 5.0
268
+
269
+
270
+ # Example usage
271
+ async def main():
272
+ """Example demonstrating reranking"""
273
+ # Mock LLM function for testing
274
+ async def mock_llm(prompt: str, **kwargs) -> str:
275
+ # Simulate scoring based on keyword matching
276
+ if "directly" in prompt.lower():
277
+ return "9"
278
+ elif "somewhat" in prompt.lower():
279
+ return "6"
280
+ else:
281
+ return "3"
282
+
283
+ # Create reranker
284
+ reranker = GeminiReranker(llm_func=mock_llm)
285
+
286
+ # Example query and chunks
287
+ query = "What are the side effects of aspirin?"
288
+
289
+ chunks = [
290
+ {"content": "Aspirin can cause stomach bleeding in some patients..."},
291
+ {"content": "The history of aspirin dates back to ancient times..."},
292
+ {"content": "Common side effects include nausea and heartburn..."},
293
+ ]
294
+
295
+ # Rerank
296
+ reranked = await reranker.rerank(query, chunks, top_k=2)
297
+
298
+ print("Reranked results:")
299
+ for i, chunk in enumerate(reranked, 1):
300
+ print(f"{i}. Score: {chunk['relevance_score']:.1f} - {chunk['content'][:50]}...")
301
+
302
+
303
+ if __name__ == "__main__":
304
+ asyncio.run(main())
backend/url_fetcher.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ URL Document Fetcher for RAG-Anything
3
+
4
+ Fetches and processes documents from URLs for ingestion into the RAG system.
5
+
6
+ Features:
7
+ - Web page scraping and parsing
8
+ - PDF download from URLs
9
+ - Markdown conversion
10
+ - Content cleaning and preprocessing
11
+ - Advanced parsing with text and image extraction
12
+ - Integration with RAG pipeline
13
+
14
+ Author: RAG-Anything Team
15
+ Version: 2.0.0
16
+ """
17
+
18
+ import os
19
+ import asyncio
20
+ import logging
21
+ import tempfile
22
+ from pathlib import Path
23
+ from typing import Optional, Dict, Any, List
24
+ from urllib.parse import urlparse
25
+ import hashlib
26
+ import base64
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ try:
31
+ import requests
32
+ from bs4 import BeautifulSoup
33
+ import markdownify
34
+ from urllib.parse import urljoin
35
+ DEPS_AVAILABLE = True
36
+ except ImportError:
37
+ DEPS_AVAILABLE = False
38
+ logger.warning("URL fetcher dependencies not installed. Install with: pip install requests beautifulsoup4 markdownify")
39
+
40
+
41
+ class URLFetcher:
42
+ """Fetch and process documents from URLs"""
43
+
44
+ def __init__(
45
+ self,
46
+ download_dir: Optional[str] = None,
47
+ timeout: int = 30,
48
+ user_agent: str = "RAG-Anything/1.0"
49
+ ):
50
+ """
51
+ Initialize URL fetcher
52
+
53
+ Args:
54
+ download_dir: Directory to save downloaded files
55
+ timeout: Request timeout in seconds
56
+ user_agent: User agent string for requests
57
+ """
58
+ if not DEPS_AVAILABLE:
59
+ raise ImportError("Required dependencies not installed. Run: pip install requests beautifulsoup4 markdownify")
60
+
61
+ self.download_dir = download_dir or tempfile.gettempdir()
62
+ self.timeout = timeout
63
+ self.headers = {"User-Agent": user_agent}
64
+
65
+ Path(self.download_dir).mkdir(parents=True, exist_ok=True)
66
+ logger.info(f"URLFetcher initialized (download_dir={self.download_dir})")
67
+
68
+ def _create_content_list(self, title: str, text_content: str, images: List[Dict]) -> List[Dict[str, Any]]:
69
+ """
70
+ Create a structured content list compatible with RAG pipeline
71
+
72
+ Args:
73
+ title: Document title
74
+ text_content: Extracted text content
75
+ images: List of extracted images with metadata
76
+
77
+ Returns:
78
+ List of content blocks for RAG processing
79
+ """
80
+ content_list = []
81
+
82
+ # Add title as first text block
83
+ if title:
84
+ content_list.append({
85
+ "type": "text",
86
+ "text": f"# {title}",
87
+ "page_idx": 0
88
+ })
89
+
90
+ # Split text into paragraphs and add as text blocks
91
+ paragraphs = [p.strip() for p in text_content.split("\n\n") if p.strip()]
92
+ for idx, paragraph in enumerate(paragraphs[:50]): # Limit to first 50 paragraphs
93
+ if paragraph:
94
+ content_list.append({
95
+ "type": "text",
96
+ "text": paragraph,
97
+ "page_idx": idx // 10 # Group every 10 paragraphs as a "page"
98
+ })
99
+
100
+ # Add images as image blocks
101
+ for idx, img_info in enumerate(images):
102
+ content_list.append({
103
+ "type": "image",
104
+ "img_path": img_info["path"],
105
+ "image_caption": img_info.get("alt", "") or img_info.get("title", ""),
106
+ "page_idx": (len(paragraphs) + idx) // 10
107
+ })
108
+
109
+ return content_list
110
+
111
+ async def fetch_url(
112
+ self,
113
+ url: str,
114
+ save_as_pdf: bool = False,
115
+ convert_to_markdown: bool = True
116
+ ) -> Dict[str, Any]:
117
+ """
118
+ Fetch and process content from URL
119
+
120
+ Args:
121
+ url: URL to fetch
122
+ save_as_pdf: Whether to save as PDF (for PDF URLs)
123
+ convert_to_markdown: Convert HTML to markdown
124
+
125
+ Returns:
126
+ Dictionary with file_path, content, metadata
127
+ """
128
+ try:
129
+ logger.info(f"Fetching URL: {url}")
130
+
131
+ # Validate URL
132
+ parsed = urlparse(url)
133
+ if not parsed.scheme or not parsed.netloc:
134
+ raise ValueError(f"Invalid URL: {url}")
135
+
136
+ # Determine content type
137
+ response = await asyncio.to_thread(
138
+ requests.head, url, headers=self.headers, timeout=self.timeout, allow_redirects=True
139
+ )
140
+ content_type = response.headers.get("Content-Type", "").lower()
141
+
142
+ # Handle PDF files
143
+ if "pdf" in content_type or url.lower().endswith(".pdf"):
144
+ return await self._fetch_pdf(url)
145
+
146
+ # Handle HTML/web pages
147
+ elif "html" in content_type or not content_type:
148
+ return await self._fetch_html(url, convert_to_markdown)
149
+
150
+ # Handle other file types
151
+ else:
152
+ return await self._fetch_generic(url, content_type)
153
+
154
+ except Exception as e:
155
+ logger.error(f"Error fetching URL {url}: {e}", exc_info=True)
156
+ return {
157
+ "success": False,
158
+ "error": str(e),
159
+ "url": url,
160
+ }
161
+
162
+ async def _fetch_pdf(self, url: str) -> Dict[str, Any]:
163
+ """Fetch PDF from URL"""
164
+ try:
165
+ response = await asyncio.to_thread(
166
+ requests.get, url, headers=self.headers, timeout=self.timeout
167
+ )
168
+ response.raise_for_status()
169
+
170
+ # Generate filename from URL
171
+ url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
172
+ filename = f"url_{url_hash}.pdf"
173
+ file_path = Path(self.download_dir) / filename
174
+
175
+ # Save PDF
176
+ with open(file_path, "wb") as f:
177
+ f.write(response.content)
178
+
179
+ logger.info(f"PDF downloaded: {file_path}")
180
+
181
+ return {
182
+ "success": True,
183
+ "file_path": str(file_path),
184
+ "url": url,
185
+ "content_type": "pdf",
186
+ "size_bytes": len(response.content),
187
+ }
188
+
189
+ except Exception as e:
190
+ logger.error(f"Error fetching PDF: {e}")
191
+ raise
192
+
193
+ async def _fetch_html(self, url: str, convert_to_markdown: bool = True) -> Dict[str, Any]:
194
+ """Fetch and parse HTML page with advanced content extraction"""
195
+ try:
196
+ response = await asyncio.to_thread(
197
+ requests.get, url, headers=self.headers, timeout=self.timeout
198
+ )
199
+ response.raise_for_status()
200
+
201
+ # Parse HTML
202
+ soup = BeautifulSoup(response.content, "html.parser")
203
+
204
+ # Remove unwanted elements
205
+ for tag in soup(["script", "style", "nav", "footer", "header", "aside", "iframe", "noscript"]):
206
+ tag.decompose()
207
+
208
+ # Extract title
209
+ title = soup.find("title")
210
+ title_text = title.get_text().strip() if title else "Untitled"
211
+
212
+ # Extract main content
213
+ main_content = soup.find("main") or soup.find("article") or soup.find("body")
214
+
215
+ # Extract images before converting to markdown (limit to first 10 images)
216
+ images = []
217
+ url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
218
+ images_dir = Path(self.download_dir) / f"url_{url_hash}_images"
219
+ images_dir.mkdir(parents=True, exist_ok=True)
220
+
221
+ all_images = main_content.find_all("img")
222
+ max_images = min(10, len(all_images)) # Limit to 10 images
223
+ logger.info(f"Found {len(all_images)} images, downloading first {max_images}")
224
+
225
+ for idx, img in enumerate(all_images[:max_images]):
226
+ try:
227
+ img_url = img.get("src")
228
+ if not img_url:
229
+ continue
230
+
231
+ # Skip data URIs and very small images
232
+ if img_url.startswith("data:"):
233
+ continue
234
+
235
+ # Handle relative URLs
236
+ if img_url.startswith("//"):
237
+ img_url = "https:" + img_url
238
+ elif img_url.startswith("/"):
239
+ parsed_base = urlparse(url)
240
+ img_url = f"{parsed_base.scheme}://{parsed_base.netloc}{img_url}"
241
+ elif not img_url.startswith("http"):
242
+ img_url = urljoin(url, img_url)
243
+
244
+ # Download image with timeout
245
+ img_response = await asyncio.to_thread(
246
+ requests.get, img_url, headers=self.headers, timeout=5, stream=True
247
+ )
248
+
249
+ if img_response.status_code == 200:
250
+ # Check content size (skip if too large > 10MB)
251
+ content_length = img_response.headers.get('content-length')
252
+ if content_length and int(content_length) > 10 * 1024 * 1024:
253
+ logger.debug(f"Skipping large image {idx}: {content_length} bytes")
254
+ continue
255
+
256
+ # Determine file extension
257
+ content_type = img_response.headers.get("Content-Type", "")
258
+ ext = ".jpg"
259
+ if "png" in content_type:
260
+ ext = ".png"
261
+ elif "gif" in content_type:
262
+ ext = ".gif"
263
+ elif "webp" in content_type:
264
+ ext = ".webp"
265
+
266
+ img_path = images_dir / f"image_{idx}{ext}"
267
+ with open(img_path, "wb") as f:
268
+ f.write(img_response.content)
269
+
270
+ images.append({
271
+ "path": str(img_path),
272
+ "alt": img.get("alt", ""),
273
+ "title": img.get("title", ""),
274
+ "url": img_url
275
+ })
276
+ logger.debug(f"Downloaded image {idx+1}/{max_images}: {img_path.name}")
277
+ except Exception as img_error:
278
+ logger.debug(f"Failed to download image {idx}: {img_error}")
279
+ continue
280
+
281
+ if convert_to_markdown:
282
+ # Convert to markdown
283
+ content = markdownify.markdownify(
284
+ str(main_content),
285
+ heading_style="ATX",
286
+ bullets="-"
287
+ )
288
+ else:
289
+ # Extract plain text
290
+ content = main_content.get_text(separator="\n", strip=True)
291
+
292
+ # Create content list with structured data
293
+ content_list = self._create_content_list(title_text, content, images)
294
+
295
+ # Save to file
296
+ ext = ".md" if convert_to_markdown else ".txt"
297
+ filename = f"url_{url_hash}{ext}"
298
+ file_path = Path(self.download_dir) / filename
299
+
300
+ with open(file_path, "w", encoding="utf-8") as f:
301
+ f.write(f"# {title_text}\n\n")
302
+ f.write(f"Source: {url}\n\n")
303
+ f.write(content)
304
+
305
+ # Save content list as JSON for RAG processing
306
+ import json
307
+ json_path = Path(self.download_dir) / f"url_{url_hash}_content_list.json"
308
+ with open(json_path, "w", encoding="utf-8") as f:
309
+ json.dump(content_list, f, indent=2, ensure_ascii=False)
310
+
311
+ logger.info(f"HTML content saved: {file_path}")
312
+ logger.info(f"Extracted {len(images)} images from web page")
313
+
314
+ return {
315
+ "success": True,
316
+ "file_path": str(file_path),
317
+ "content_list_path": str(json_path),
318
+ "url": url,
319
+ "content_type": "html",
320
+ "title": title_text,
321
+ "content_preview": content[:500],
322
+ "images_count": len(images),
323
+ "content_list": content_list
324
+ }
325
+
326
+ except Exception as e:
327
+ logger.error(f"Error fetching HTML: {e}")
328
+ raise
329
+
330
+ async def _fetch_generic(self, url: str, content_type: str) -> Dict[str, Any]:
331
+ """Fetch generic file"""
332
+ try:
333
+ response = await asyncio.to_thread(
334
+ requests.get, url, headers=self.headers, timeout=self.timeout
335
+ )
336
+ response.raise_for_status()
337
+
338
+ # Determine extension from content type
339
+ ext_map = {
340
+ "text/plain": ".txt",
341
+ "text/markdown": ".md",
342
+ "application/msword": ".doc",
343
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
344
+ }
345
+ ext = ext_map.get(content_type, ".bin")
346
+
347
+ # Save file
348
+ url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
349
+ filename = f"url_{url_hash}{ext}"
350
+ file_path = Path(self.download_dir) / filename
351
+
352
+ with open(file_path, "wb") as f:
353
+ f.write(response.content)
354
+
355
+ logger.info(f"File downloaded: {file_path}")
356
+
357
+ return {
358
+ "success": True,
359
+ "file_path": str(file_path),
360
+ "url": url,
361
+ "content_type": content_type,
362
+ "size_bytes": len(response.content),
363
+ }
364
+
365
+ except Exception as e:
366
+ logger.error(f"Error fetching file: {e}")
367
+ raise
368
+
369
+
370
+ def create_url_fetcher(download_dir: Optional[str] = None, **kwargs) -> URLFetcher:
371
+ """
372
+ Factory function to create a URL fetcher
373
+
374
+ Args:
375
+ download_dir: Directory to save downloaded files
376
+ **kwargs: Additional URLFetcher parameters
377
+
378
+ Returns:
379
+ Configured URLFetcher instance
380
+ """
381
+ return URLFetcher(download_dir=download_dir, **kwargs)
backend/web_search.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web Search Module for RAG-Anything using Tavily API
3
+
4
+ Provides intelligent web search capabilities to augment RAG with real-time information.
5
+
6
+ Features:
7
+ - Tavily API integration for high-quality search results
8
+ - Context-aware search query generation
9
+ - Result filtering and ranking
10
+ - Hybrid RAG + Web search mode
11
+
12
+ Author: RAG-Anything Team
13
+ Version: 1.0.0
14
+ """
15
+
16
+ import os
17
+ import asyncio
18
+ import logging
19
+ from typing import List, Dict, Any, Optional
20
+ from datetime import datetime
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ try:
25
+ from tavily import TavilyClient, AsyncTavilyClient
26
+ TAVILY_AVAILABLE = True
27
+ except ImportError:
28
+ TAVILY_AVAILABLE = False
29
+ logger.warning("Tavily not installed. Install with: pip install tavily-python")
30
+
31
+
32
+ class WebSearcher:
33
+ """Web search integration using Tavily API"""
34
+
35
+ def __init__(
36
+ self,
37
+ api_key: Optional[str] = None,
38
+ max_results: int = 5,
39
+ search_depth: str = "advanced",
40
+ include_raw_content: bool = True
41
+ ):
42
+ """
43
+ Initialize web searcher
44
+
45
+ Args:
46
+ api_key: Tavily API key (from env if not provided)
47
+ max_results: Maximum number of search results to return
48
+ search_depth: "basic" or "advanced" (advanced is more thorough)
49
+ include_raw_content: Whether to include full page content
50
+ """
51
+ if not TAVILY_AVAILABLE:
52
+ raise ImportError("Tavily is not installed. Install with: pip install tavily-python")
53
+
54
+ self.api_key = api_key or os.getenv("TAVILY_API_KEY")
55
+ if not self.api_key:
56
+ raise ValueError("Tavily API key not found. Set TAVILY_API_KEY environment variable.")
57
+
58
+ self.max_results = max_results
59
+ self.search_depth = search_depth
60
+ self.include_raw_content = include_raw_content
61
+
62
+ # Initialize async client
63
+ self.client = AsyncTavilyClient(api_key=self.api_key)
64
+
65
+ logger.info(f"WebSearcher initialized (max_results={max_results}, depth={search_depth})")
66
+
67
+ async def search(
68
+ self,
69
+ query: str,
70
+ max_results: Optional[int] = None,
71
+ include_domains: Optional[List[str]] = None,
72
+ exclude_domains: Optional[List[str]] = None,
73
+ search_depth: Optional[str] = None
74
+ ) -> Dict[str, Any]:
75
+ """
76
+ Perform web search
77
+
78
+ Args:
79
+ query: Search query
80
+ max_results: Override default max results
81
+ include_domains: Only search these domains
82
+ exclude_domains: Exclude these domains
83
+ search_depth: Override default search depth
84
+
85
+ Returns:
86
+ Dictionary with search results and metadata
87
+ """
88
+ try:
89
+ logger.info(f"Searching web: {query[:100]}...")
90
+
91
+ # Build search parameters
92
+ search_params = {
93
+ "query": query,
94
+ "max_results": max_results or self.max_results,
95
+ "search_depth": search_depth or self.search_depth,
96
+ "include_raw_content": self.include_raw_content,
97
+ }
98
+
99
+ if include_domains:
100
+ search_params["include_domains"] = include_domains
101
+ if exclude_domains:
102
+ search_params["exclude_domains"] = exclude_domains
103
+
104
+ # Perform search
105
+ response = await self.client.search(**search_params)
106
+
107
+ # Process results
108
+ results = {
109
+ "query": query,
110
+ "results": response.get("results", []),
111
+ "answer": response.get("answer", ""), # Tavily's AI-generated answer
112
+ "search_metadata": {
113
+ "total_results": len(response.get("results", [])),
114
+ "search_depth": search_params["search_depth"],
115
+ "timestamp": datetime.now().isoformat(),
116
+ }
117
+ }
118
+
119
+ logger.info(f"Web search complete: {len(results['results'])} results found")
120
+ return results
121
+
122
+ except Exception as e:
123
+ logger.error(f"Web search error: {e}", exc_info=True)
124
+ return {
125
+ "query": query,
126
+ "results": [],
127
+ "answer": "",
128
+ "error": str(e),
129
+ "search_metadata": {
130
+ "total_results": 0,
131
+ "error": str(e),
132
+ "timestamp": datetime.now().isoformat(),
133
+ }
134
+ }
135
+
136
+ async def search_with_context(
137
+ self,
138
+ query: str,
139
+ context: Optional[str] = None,
140
+ **kwargs
141
+ ) -> Dict[str, Any]:
142
+ """
143
+ Search with additional context to refine query
144
+
145
+ Args:
146
+ query: Base search query
147
+ context: Additional context to help refine search
148
+ **kwargs: Additional search parameters
149
+
150
+ Returns:
151
+ Search results dictionary
152
+ """
153
+ # If context provided, enhance query
154
+ if context:
155
+ enhanced_query = f"{query} {context}"
156
+ else:
157
+ enhanced_query = query
158
+
159
+ return await self.search(enhanced_query, **kwargs)
160
+
161
+ def format_results_for_rag(self, search_results: Dict[str, Any]) -> str:
162
+ """
163
+ Format web search results for RAG context
164
+
165
+ Args:
166
+ search_results: Results from search()
167
+
168
+ Returns:
169
+ Formatted string for RAG context
170
+ """
171
+ if not search_results.get("results"):
172
+ return "No web search results found."
173
+
174
+ formatted = ["=== Web Search Results ===\n"]
175
+
176
+ # Add Tavily's answer if available
177
+ if search_results.get("answer"):
178
+ formatted.append(f"Quick Answer: {search_results['answer']}\n")
179
+
180
+ # Add individual results
181
+ for idx, result in enumerate(search_results["results"], 1):
182
+ formatted.append(f"\n[Source {idx}] {result.get('title', 'Untitled')}")
183
+ formatted.append(f"URL: {result.get('url', 'N/A')}")
184
+ formatted.append(f"Content: {result.get('content', 'No content')[:500]}...")
185
+ if result.get("score"):
186
+ formatted.append(f"Relevance: {result['score']:.2f}")
187
+
188
+ formatted.append(f"\n=== End of Web Results ({len(search_results['results'])} sources) ===")
189
+ return "\n".join(formatted)
190
+
191
+ def format_results_for_llm(self, search_results: Dict[str, Any]) -> str:
192
+ """
193
+ Format web search results optimally for LLM processing
194
+
195
+ Args:
196
+ search_results: Results from search()
197
+
198
+ Returns:
199
+ Structured string optimized for LLM comprehension
200
+ """
201
+ if not search_results.get("results"):
202
+ return "No relevant web search results were found for this query."
203
+
204
+ formatted = []
205
+
206
+ # Add Tavily's AI-generated answer first (if available)
207
+ if search_results.get("answer"):
208
+ formatted.append("### AI-Generated Summary:")
209
+ formatted.append(search_results['answer'])
210
+ formatted.append("")
211
+
212
+ # Add detailed source information
213
+ formatted.append("### Detailed Sources:")
214
+ formatted.append("")
215
+
216
+ for idx, result in enumerate(search_results["results"], 1):
217
+ formatted.append(f"**Source {idx}: {result.get('title', 'Untitled')}**")
218
+ formatted.append(f"- URL: {result.get('url', 'N/A')}")
219
+ formatted.append(f"- Published: {result.get('published_date', 'Unknown date')}")
220
+
221
+ # Get content (full or truncated based on availability)
222
+ content = result.get('content', '')
223
+ if result.get('raw_content') and len(result.get('raw_content', '')) > len(content):
224
+ content = result['raw_content'][:2000] # Use more detailed content
225
+
226
+ formatted.append(f"- Content: {content}")
227
+
228
+ if result.get("score"):
229
+ formatted.append(f"- Relevance Score: {result['score']:.2%}")
230
+
231
+ formatted.append("")
232
+
233
+ formatted.append(f"*Total sources: {len(search_results['results'])}*")
234
+ return "\n".join(formatted)
235
+
236
+ async def hybrid_search(
237
+ self,
238
+ query: str,
239
+ rag_results: Optional[str] = None,
240
+ combine_results: bool = True,
241
+ **kwargs
242
+ ) -> Dict[str, Any]:
243
+ """
244
+ Hybrid search: combine RAG results with web search
245
+
246
+ Args:
247
+ query: Search query
248
+ rag_results: Results from RAG system
249
+ combine_results: Whether to combine RAG and web results
250
+ **kwargs: Additional search parameters
251
+
252
+ Returns:
253
+ Dictionary with combined results
254
+ """
255
+ # Perform web search
256
+ web_results = await self.search(query, **kwargs)
257
+
258
+ if not combine_results:
259
+ return web_results
260
+
261
+ # Combine RAG and web results
262
+ combined_context = []
263
+
264
+ if rag_results:
265
+ combined_context.append("=== Knowledge Base Results ===")
266
+ combined_context.append(rag_results)
267
+ combined_context.append("")
268
+
269
+ combined_context.append(self.format_results_for_rag(web_results))
270
+
271
+ return {
272
+ "query": query,
273
+ "combined_context": "\n".join(combined_context),
274
+ "rag_results": rag_results,
275
+ "web_results": web_results,
276
+ "metadata": {
277
+ "has_rag_results": bool(rag_results),
278
+ "has_web_results": len(web_results.get("results", [])) > 0,
279
+ "timestamp": datetime.now().isoformat(),
280
+ }
281
+ }
282
+
283
+
284
+ def create_web_searcher(api_key: Optional[str] = None, **kwargs) -> WebSearcher:
285
+ """
286
+ Factory function to create a web searcher
287
+
288
+ Args:
289
+ api_key: Tavily API key
290
+ **kwargs: Additional WebSearcher parameters
291
+
292
+ Returns:
293
+ Configured WebSearcher instance
294
+ """
295
+ return WebSearcher(api_key=api_key, **kwargs)
docker-compose.yml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ backend:
3
+ build:
4
+ context: .
5
+ dockerfile: backend/Dockerfile
6
+ container_name: agentic-rag-backend
7
+ restart: unless-stopped
8
+ ports:
9
+ - "8000:8000"
10
+ environment:
11
+ - GEMINI_API_KEY=${GEMINI_API_KEY}
12
+ - GEMINI_TEXT_MODEL=${GEMINI_TEXT_MODEL:-models/gemini-flash-latest}
13
+ - GEMINI_VERIFIER_MODEL=${GEMINI_VERIFIER_MODEL:-models/gemini-pro-latest}
14
+ - GEMINI_VISION_MODEL=${GEMINI_VISION_MODEL:-models/gemini-flash-latest}
15
+ - GEMINI_EMBEDDING_MODEL=${GEMINI_EMBEDDING_MODEL:-models/text-embedding-004}
16
+ - TAVILY_API_KEY=${TAVILY_API_KEY}
17
+ - PYTHONUNBUFFERED=1
18
+ volumes:
19
+ - ./storage:/app/storage
20
+ - ./uploads:/app/uploads
21
+ - ./backend/output:/app/backend/output
22
+ networks:
23
+ - rag-network
24
+ healthcheck:
25
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
26
+ interval: 30s
27
+ timeout: 10s
28
+ retries: 3
29
+ start_period: 40s
30
+
31
+ frontend:
32
+ build:
33
+ context: ./frontend
34
+ dockerfile: Dockerfile
35
+ container_name: agentic-rag-frontend
36
+ restart: unless-stopped
37
+ ports:
38
+ - "3000:80"
39
+ depends_on:
40
+ backend:
41
+ condition: service_healthy
42
+ networks:
43
+ - rag-network
44
+ healthcheck:
45
+ test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/"]
46
+ interval: 30s
47
+ timeout: 10s
48
+ retries: 3
49
+ start_period: 10s
50
+
51
+ networks:
52
+ rag-network:
53
+ driver: bridge
54
+
55
+ volumes:
56
+ storage:
57
+ uploads:
58
+ output:
frontend/.env.example ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # API Configuration
2
+ REACT_APP_API_URL=http://localhost:8000
frontend/Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM node:20-alpine AS builder
2
+
3
+ WORKDIR /app
4
+
5
+ # Copy package files
6
+ COPY package*.json ./
7
+
8
+ # Install dependencies
9
+ RUN npm ci --silent
10
+
11
+ # Copy source code
12
+ COPY . .
13
+
14
+ # Build the application
15
+ RUN npm run build
16
+
17
+ FROM nginx:alpine
18
+
19
+ # Copy custom nginx configuration
20
+ COPY nginx.conf /etc/nginx/conf.d/default.conf
21
+
22
+ # Copy built application from builder stage
23
+ COPY --from=builder /app/build /usr/share/nginx/html
24
+
25
+ # Expose port
26
+ EXPOSE 80
27
+
28
+ # Health check
29
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
30
+ CMD wget --quiet --tries=1 --spider http://localhost/ || exit 1
31
+
32
+ # Start nginx
33
+ CMD ["nginx", "-g", "daemon off;"]
frontend/nginx.conf ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ server {
2
+ listen 80;
3
+ server_name localhost;
4
+ root /usr/share/nginx/html;
5
+ index index.html;
6
+
7
+ # Gzip compression
8
+ gzip on;
9
+ gzip_vary on;
10
+ gzip_min_length 1000;
11
+ gzip_types text/plain text/css text/xml text/javascript application/x-javascript application/xml+rss application/json;
12
+
13
+ # Serve static files with cache headers
14
+ location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
15
+ expires 1y;
16
+ add_header Cache-Control "public, immutable";
17
+ }
18
+
19
+ # API proxy to backend
20
+ location /api {
21
+ proxy_pass http://backend:8000;
22
+ proxy_http_version 1.1;
23
+ proxy_set_header Upgrade $http_upgrade;
24
+ proxy_set_header Connection 'upgrade';
25
+ proxy_set_header Host $host;
26
+ proxy_set_header X-Real-IP $remote_addr;
27
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
28
+ proxy_set_header X-Forwarded-Proto $scheme;
29
+ proxy_cache_bypass $http_upgrade;
30
+
31
+ # Increase timeout for long-running queries
32
+ proxy_read_timeout 300s;
33
+ proxy_connect_timeout 75s;
34
+ }
35
+
36
+ # Serve React app - all routes go to index.html
37
+ location / {
38
+ try_files $uri $uri/ /index.html;
39
+ add_header Cache-Control "no-cache";
40
+ }
41
+
42
+ # Security headers
43
+ add_header X-Frame-Options "SAMEORIGIN" always;
44
+ add_header X-Content-Type-Options "nosniff" always;
45
+ add_header X-XSS-Protection "1; mode=block" always;
46
+ }
frontend/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
frontend/package.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "enhanced-rag-frontend",
3
+ "version": "1.0.0",
4
+ "description": "Enhanced RAG-Anything Frontend with Multi-Domain Support",
5
+ "private": true,
6
+ "dependencies": {
7
+ "lucide-react": "^0.294.0",
8
+ "react": "^18.2.0",
9
+ "react-dom": "^18.2.0",
10
+ "react-markdown": "^10.1.0",
11
+ "react-scripts": "5.0.1",
12
+ "rehype-highlight": "^7.0.2",
13
+ "remark-gfm": "^4.0.1"
14
+ },
15
+ "scripts": {
16
+ "start": "react-scripts start",
17
+ "build": "react-scripts build",
18
+ "test": "react-scripts test",
19
+ "eject": "react-scripts eject"
20
+ },
21
+ "eslintConfig": {
22
+ "extends": [
23
+ "react-app"
24
+ ]
25
+ },
26
+ "browserslist": {
27
+ "production": [
28
+ ">0.2%",
29
+ "not dead",
30
+ "not op_mini all"
31
+ ],
32
+ "development": [
33
+ "last 1 chrome version",
34
+ "last 1 firefox version",
35
+ "last 1 safari version"
36
+ ]
37
+ },
38
+ "devDependencies": {
39
+ "autoprefixer": "^10.4.16",
40
+ "postcss": "^8.4.31",
41
+ "tailwindcss": "^3.3.5"
42
+ }
43
+ }
frontend/postcss.config.js ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ module.exports = {
2
+ plugins: {
3
+ tailwindcss: {},
4
+ autoprefixer: {},
5
+ },
6
+ }
frontend/public/index.html ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
6
+ <meta name="theme-color" content="#000000" />
7
+ <meta
8
+ name="description"
9
+ content="Enhanced RAG System with Multi-Domain Support"
10
+ />
11
+ <title>Enhanced RAG System</title>
12
+ </head>
13
+ <body>
14
+ <noscript>You need to enable JavaScript to run this app.</noscript>
15
+ <div id="root"></div>
16
+ </body>
17
+ </html>
frontend/src/App.js ADDED
@@ -0,0 +1,1268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * TheTruthSchool - Professional AI Assistant Interface
3
+ *
4
+ * Features:
5
+ * - Dark mode with elegant theme switching
6
+ * - Claude/ChatGPT-inspired professional design
7
+ * - Multi-domain RAG with TheTruthSchool branding
8
+ * - Smooth animations and modern UI
9
+ */
10
+
11
+ import React, { useState, useEffect, useRef, useCallback } from 'react';
12
+ import ReactMarkdown from 'react-markdown';
13
+ import remarkGfm from 'remark-gfm';
14
+ import rehypeHighlight from 'rehype-highlight';
15
+ import 'highlight.js/styles/atom-one-dark.css';
16
+ import {
17
+ Send,
18
+ Upload,
19
+ FileText,
20
+ CheckCircle,
21
+ XCircle,
22
+ Menu,
23
+ X,
24
+ Loader2,
25
+ Trash2,
26
+ FolderOpen,
27
+ RefreshCw,
28
+ Moon,
29
+ Sun,
30
+ Sparkles
31
+ } from 'lucide-react';
32
+
33
+ // =============================================================================
34
+ // Domain Configurations
35
+ // =============================================================================
36
+
37
+ const DOMAIN_CONFIGS = {
38
+ medical: {
39
+ name: 'Medical & Healthcare',
40
+ description: 'Medical documents, research papers, clinical guidelines',
41
+ color: '#3b82f6',
42
+ fileTypes: ['.pdf', '.docx', '.xml', '.txt', '.doc', '.csv', '.xlsx'],
43
+ icon: '🏥'
44
+ },
45
+ legal: {
46
+ name: 'Legal & Compliance',
47
+ description: 'Legal documents, contracts, regulations, case law',
48
+ color: '#8b5cf6',
49
+ fileTypes: ['.pdf', '.docx', '.txt', '.doc', '.csv', '.xlsx'],
50
+ icon: '⚖️'
51
+ },
52
+ financial: {
53
+ name: 'Financial & Analytics',
54
+ description: 'Financial reports, analysis, market research',
55
+ color: '#10b981',
56
+ fileTypes: ['.pdf', '.xlsx', '.csv', '.json', '.xls'],
57
+ icon: '💰'
58
+ },
59
+ technical: {
60
+ name: 'Technical Documentation',
61
+ description: 'Technical docs, APIs, code, system architecture',
62
+ color: '#f97316',
63
+ fileTypes: ['.pdf', '.md', '.docx', '.json', '.txt', '.rst', '.csv', '.xlsx'],
64
+ icon: '⚙️'
65
+ },
66
+ academic: {
67
+ name: 'Academic Research',
68
+ description: 'Research papers, academic publications, studies',
69
+ color: '#6366f1',
70
+ fileTypes: ['.pdf', '.docx', '.tex', '.bib', '.txt', '.csv', '.xlsx'],
71
+ icon: '🎓'
72
+ }
73
+ };
74
+
75
+ const API_BASE_URL = process.env.REACT_APP_API_URL || 'http://localhost:8000';
76
+
77
+ // =============================================================================
78
+ // Main Component
79
+ // =============================================================================
80
+
81
+ export default function TheTruthSchoolAI() {
82
+ const getFromLocalStorage = (key, defaultValue) => {
83
+ try {
84
+ const item = window.localStorage.getItem(key);
85
+ return item ? JSON.parse(item) : defaultValue;
86
+ } catch (error) {
87
+ console.error(`Error reading localStorage key "${key}":`, error);
88
+ return defaultValue;
89
+ }
90
+ };
91
+
92
+ // State Management
93
+ const [darkMode, setDarkMode] = useState(() => getFromLocalStorage('darkMode', true));
94
+ const [selectedDomain, setSelectedDomain] = useState(() => getFromLocalStorage('selectedDomain', 'medical'));
95
+ const [currentView, setCurrentView] = useState('app');
96
+ const [processingDocs, setProcessingDocs] = useState(() => getFromLocalStorage('processingDocs', []));
97
+ const [processedDocs, setProcessedDocs] = useState([]);
98
+ const [query, setQuery] = useState('');
99
+ const [messages, setMessages] = useState(() => getFromLocalStorage('chatMessages', []));
100
+ const [isQuerying, setIsQuerying] = useState(false);
101
+ const [error, setError] = useState(null);
102
+ const [showUploadModal, setShowUploadModal] = useState(false);
103
+ const [isDragging, setIsDragging] = useState(false);
104
+ const [showSidebar, setShowSidebar] = useState(true);
105
+ const [enableWebSearch, setEnableWebSearch] = useState(() => getFromLocalStorage('enableWebSearch', false));
106
+ const [webSearchOnly, setWebSearchOnly] = useState(() => getFromLocalStorage('webSearchOnly', false));
107
+ const [urlInput, setUrlInput] = useState('');
108
+ const [uploadMode, setUploadMode] = useState('file');
109
+ const [fastMode, setFastMode] = useState(() => getFromLocalStorage('fastMode', false));
110
+ const [enableCache, setEnableCache] = useState(() => getFromLocalStorage('enableCache', true));
111
+ const [enableQueryImprovement, setEnableQueryImprovement] = useState(() => getFromLocalStorage('enableQueryImprovement', true));
112
+ const [enableVerification, setEnableVerification] = useState(() => getFromLocalStorage('enableVerification', true));
113
+ const [typingSpeed] = useState(0);
114
+
115
+ const messagesEndRef = useRef(null);
116
+ const fileInputRef = useRef(null);
117
+ const typingIntervalRef = useRef(null);
118
+
119
+ // Theme classes based on dark mode
120
+ const theme = {
121
+ bg: darkMode ? 'bg-[#0D0D0D]' : 'bg-white',
122
+ bgSecondary: darkMode ? 'bg-[#171717]' : 'bg-gray-50',
123
+ bgTertiary: darkMode ? 'bg-[#252525]' : 'bg-white',
124
+ text: darkMode ? 'text-gray-100' : 'text-gray-900',
125
+ textSecondary: darkMode ? 'text-gray-400' : 'text-gray-600',
126
+ textMuted: darkMode ? 'text-gray-500' : 'text-gray-500',
127
+ border: darkMode ? 'border-gray-800' : 'border-gray-200',
128
+ borderLight: darkMode ? 'border-gray-700' : 'border-gray-300',
129
+ hover: darkMode ? 'hover:bg-[#252525]' : 'hover:bg-gray-100',
130
+ active: darkMode ? 'bg-[#252525]' : 'bg-blue-50',
131
+ userMessage: darkMode ? 'bg-blue-600' : 'bg-blue-600',
132
+ assistantMessage: darkMode ? 'bg-[#252525]' : 'bg-gray-100',
133
+ input: darkMode ? 'bg-[#171717] border-gray-700 text-gray-100' : 'bg-white border-gray-300 text-gray-900',
134
+ button: darkMode ? 'bg-[#252525] hover:bg-[#2D2D2D]' : 'bg-gray-100 hover:bg-gray-200'
135
+ };
136
+
137
+ const scrollToBottom = () => {
138
+ messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' });
139
+ };
140
+
141
+ useEffect(() => {
142
+ scrollToBottom();
143
+ }, [messages]);
144
+
145
+ // Persist to localStorage
146
+ useEffect(() => {
147
+ try {
148
+ window.localStorage.setItem('darkMode', JSON.stringify(darkMode));
149
+ window.localStorage.setItem('chatMessages', JSON.stringify(messages));
150
+ window.localStorage.setItem('selectedDomain', JSON.stringify(selectedDomain));
151
+ window.localStorage.setItem('processingDocs', JSON.stringify(processingDocs));
152
+ window.localStorage.setItem('enableWebSearch', JSON.stringify(enableWebSearch));
153
+ window.localStorage.setItem('webSearchOnly', JSON.stringify(webSearchOnly));
154
+ window.localStorage.setItem('fastMode', JSON.stringify(fastMode));
155
+ window.localStorage.setItem('enableCache', JSON.stringify(enableCache));
156
+ window.localStorage.setItem('enableQueryImprovement', JSON.stringify(enableQueryImprovement));
157
+ window.localStorage.setItem('enableVerification', JSON.stringify(enableVerification));
158
+ } catch (error) {
159
+ console.error('Error saving to localStorage:', error);
160
+ }
161
+ }, [darkMode, messages, selectedDomain, processingDocs, enableWebSearch, webSearchOnly, fastMode, enableCache, enableQueryImprovement, enableVerification]);
162
+
163
+ // Fetch processed documents
164
+ const fetchProcessedDocuments = useCallback(async () => {
165
+ try {
166
+ const response = await fetch(`${API_BASE_URL}/documents?domain=${selectedDomain}`);
167
+ if (response.ok) {
168
+ const data = await response.json();
169
+ const fetchedDocs = data.documents || [];
170
+ setProcessedDocs(prev => {
171
+ const fetchedIds = new Set(fetchedDocs.map(d => d.id));
172
+ const recentlyAdded = prev.filter(d => d.id && !fetchedIds.has(d.id));
173
+ return [...fetchedDocs, ...recentlyAdded];
174
+ });
175
+ }
176
+ } catch (err) {
177
+ console.error('Error fetching documents:', err);
178
+ }
179
+ }, [selectedDomain]);
180
+
181
+ // Check processing status
182
+ const checkProcessingStatus = useCallback(async () => {
183
+ const updatedProcessing = [];
184
+ for (const doc of processingDocs) {
185
+ try {
186
+ const response = await fetch(`${API_BASE_URL}/status/${doc.processingId}`);
187
+ if (response.ok) {
188
+ const status = await response.json();
189
+ if (status.status === 'completed') {
190
+ setProcessedDocs(prev => [...prev, { ...doc, id: doc.processingId, status: 'completed' }]);
191
+ } else if (status.status === 'failed') {
192
+ setError(`Processing failed for ${doc.name}: ${status.error}`);
193
+ } else {
194
+ updatedProcessing.push({ ...doc, status: status.status });
195
+ }
196
+ }
197
+ } catch (err) {
198
+ console.error('Error checking status:', err);
199
+ }
200
+ }
201
+ setProcessingDocs(updatedProcessing);
202
+ }, [processingDocs]);
203
+
204
+ useEffect(() => {
205
+ fetchProcessedDocuments();
206
+ }, [selectedDomain, fetchProcessedDocuments]);
207
+
208
+ useEffect(() => {
209
+ const interval = setInterval(() => {
210
+ if (processingDocs.length > 0) {
211
+ checkProcessingStatus();
212
+ }
213
+ }, 3000);
214
+ return () => clearInterval(interval);
215
+ }, [processingDocs, checkProcessingStatus]);
216
+
217
+ // API Functions
218
+ const handleFileUpload = async (files) => {
219
+ if (!files || files.length === 0) return;
220
+ setError(null);
221
+ const newProcessingDocs = [];
222
+
223
+ for (const file of files) {
224
+ const fileExt = '.' + file.name.split('.').pop().toLowerCase();
225
+ const allowedTypes = DOMAIN_CONFIGS[selectedDomain].fileTypes;
226
+
227
+ if (!allowedTypes.includes(fileExt)) {
228
+ setError(`File type ${fileExt} not supported for ${selectedDomain} domain.`);
229
+ continue;
230
+ }
231
+
232
+ const formData = new FormData();
233
+ formData.append('file', file);
234
+ formData.append('domain', selectedDomain);
235
+
236
+ try {
237
+ const response = await fetch(`${API_BASE_URL}/upload`, {
238
+ method: 'POST',
239
+ body: formData
240
+ });
241
+
242
+ const data = await response.json();
243
+ if (response.ok) {
244
+ newProcessingDocs.push({
245
+ name: file.name,
246
+ domain: selectedDomain,
247
+ processingId: data.processing_id,
248
+ status: 'processing',
249
+ uploadedAt: new Date().toISOString()
250
+ });
251
+ } else {
252
+ setError(data.detail || 'Upload failed');
253
+ }
254
+ } catch (err) {
255
+ console.error('Upload error:', err);
256
+ setError(`Failed to upload ${file.name}: ${err.message}`);
257
+ }
258
+ }
259
+
260
+ setProcessingDocs(prev => [...prev, ...newProcessingDocs]);
261
+ setShowUploadModal(false);
262
+ };
263
+
264
+ const handleUrlUpload = async () => {
265
+ if (!urlInput.trim()) {
266
+ setError('Please enter a valid URL');
267
+ return;
268
+ }
269
+
270
+ setError(null);
271
+
272
+ try {
273
+ const response = await fetch(`${API_BASE_URL}/upload-url`, {
274
+ method: 'POST',
275
+ headers: { 'Content-Type': 'application/json' },
276
+ body: JSON.stringify({
277
+ url: urlInput,
278
+ domain: selectedDomain,
279
+ convert_to_markdown: true
280
+ })
281
+ });
282
+
283
+ const data = await response.json();
284
+ if (response.ok) {
285
+ setProcessingDocs(prev => [...prev, {
286
+ name: urlInput,
287
+ domain: selectedDomain,
288
+ processingId: data.processing_id,
289
+ status: 'processing',
290
+ uploadedAt: new Date().toISOString()
291
+ }]);
292
+ setUrlInput('');
293
+ setShowUploadModal(false);
294
+ } else {
295
+ setError(data.detail || 'URL upload failed');
296
+ }
297
+ } catch (err) {
298
+ console.error('URL upload error:', err);
299
+ setError(`Failed to upload URL: ${err.message}`);
300
+ }
301
+ };
302
+
303
+ const startTypingEffect = useCallback((messageIndex, targetTextRef, isStreamingRef) => {
304
+ if (typingIntervalRef.current) {
305
+ clearInterval(typingIntervalRef.current);
306
+ }
307
+
308
+ let displayedLength = 0;
309
+
310
+ typingIntervalRef.current = setInterval(() => {
311
+ const targetText = targetTextRef.current || '';
312
+ const isStillStreaming = isStreamingRef.current;
313
+
314
+ if (displayedLength < targetText.length) {
315
+ const charsToAdd = Math.max(1, Math.floor(typingSpeed / 10));
316
+ displayedLength = Math.min(displayedLength + charsToAdd, targetText.length);
317
+
318
+ setMessages(prev => {
319
+ const newMessages = [...prev];
320
+ if (newMessages[messageIndex]) {
321
+ newMessages[messageIndex] = {
322
+ ...newMessages[messageIndex],
323
+ content: targetText.substring(0, displayedLength)
324
+ };
325
+ }
326
+ return newMessages;
327
+ });
328
+ } else if (!isStillStreaming && displayedLength >= targetText.length) {
329
+ clearInterval(typingIntervalRef.current);
330
+ typingIntervalRef.current = null;
331
+ }
332
+ }, 30);
333
+ }, [typingSpeed]);
334
+
335
+ useEffect(() => {
336
+ return () => {
337
+ if (typingIntervalRef.current) {
338
+ clearInterval(typingIntervalRef.current);
339
+ }
340
+ };
341
+ }, []);
342
+
343
+ const handleQuery = async () => {
344
+ if (!query.trim()) return;
345
+
346
+ setError(null);
347
+ setIsQuerying(true);
348
+
349
+ const userMessage = { role: 'user', content: query };
350
+ setMessages(prev => [...prev, userMessage]);
351
+ const currentQuery = query;
352
+ setQuery('');
353
+
354
+ const assistantMessageIndex = messages.length + 1;
355
+ setMessages(prev => [...prev, {
356
+ role: 'assistant',
357
+ content: '',
358
+ streaming: true,
359
+ verification: null
360
+ }]);
361
+
362
+ const fullTextBufferRef = { current: '' };
363
+ const isStreamingRef = { current: true };
364
+ let typingStarted = false;
365
+
366
+ try {
367
+ const response = await fetch(`${API_BASE_URL}/query/stream`, {
368
+ method: 'POST',
369
+ headers: { 'Content-Type': 'application/json' },
370
+ body: JSON.stringify({
371
+ query: currentQuery,
372
+ domain: selectedDomain,
373
+ enable_verification: true,
374
+ enable_web_search: enableWebSearch,
375
+ web_search_only: webSearchOnly,
376
+ fast_mode: fastMode,
377
+ enable_cache: enableCache,
378
+ enable_query_improvement: enableQueryImprovement,
379
+ enable_verification_check: enableVerification
380
+ })
381
+ });
382
+
383
+ if (!response.ok) {
384
+ throw new Error(`HTTP error! status: ${response.status}`);
385
+ }
386
+
387
+ const reader = response.body.getReader();
388
+ const decoder = new TextDecoder();
389
+ let buffer = '';
390
+
391
+ while (true) {
392
+ const { done, value } = await reader.read();
393
+
394
+ if (done) {
395
+ break;
396
+ }
397
+
398
+ buffer += decoder.decode(value, { stream: true });
399
+
400
+ const events = buffer.split('\n\n');
401
+ buffer = events.pop() || '';
402
+
403
+ for (const event of events) {
404
+ if (!event.trim()) continue;
405
+
406
+ const lines = event.split('\n');
407
+ let eventType = 'message';
408
+ let eventData = '';
409
+
410
+ for (const line of lines) {
411
+ if (line.startsWith('event:')) {
412
+ eventType = line.substring(6).trim();
413
+ } else if (line.startsWith('data:')) {
414
+ eventData = line.substring(5).trim();
415
+ }
416
+ }
417
+
418
+ if (eventData) {
419
+ const data = JSON.parse(eventData);
420
+
421
+ if (eventType === 'token') {
422
+ fullTextBufferRef.current += data.content;
423
+
424
+ if (!typingStarted && typingSpeed > 0) {
425
+ typingStarted = true;
426
+ startTypingEffect(assistantMessageIndex, fullTextBufferRef, isStreamingRef);
427
+ } else if (typingSpeed === 0) {
428
+ setMessages(prev => {
429
+ const newMessages = [...prev];
430
+ newMessages[assistantMessageIndex] = {
431
+ ...newMessages[assistantMessageIndex],
432
+ content: fullTextBufferRef.current
433
+ };
434
+ return newMessages;
435
+ });
436
+ }
437
+
438
+ } else if (eventType === 'verification') {
439
+ setMessages(prev => {
440
+ const newMessages = [...prev];
441
+ newMessages[assistantMessageIndex] = {
442
+ ...newMessages[assistantMessageIndex],
443
+ verification: data.content,
444
+ streaming: false
445
+ };
446
+ return newMessages;
447
+ });
448
+
449
+ } else if (eventType === 'done') {
450
+ isStreamingRef.current = false;
451
+
452
+ setTimeout(() => {
453
+ if (typingIntervalRef.current) {
454
+ clearInterval(typingIntervalRef.current);
455
+ typingIntervalRef.current = null;
456
+ }
457
+
458
+ setMessages(prev => {
459
+ const newMessages = [...prev];
460
+ newMessages[assistantMessageIndex] = {
461
+ ...newMessages[assistantMessageIndex],
462
+ streaming: false,
463
+ content: fullTextBufferRef.current
464
+ };
465
+ return newMessages;
466
+ });
467
+ }, typingSpeed === 0 ? 0 : 500);
468
+
469
+ } else if (eventType === 'error') {
470
+ const errorMessage = data.content.message || 'An error occurred';
471
+ const errorSuggestion = data.content.suggestion || '';
472
+ setError(errorSuggestion ? `${errorMessage}\n\n${errorSuggestion}` : errorMessage);
473
+
474
+ isStreamingRef.current = false;
475
+
476
+ if (typingIntervalRef.current) {
477
+ clearInterval(typingIntervalRef.current);
478
+ typingIntervalRef.current = null;
479
+ }
480
+
481
+ setMessages(prev => {
482
+ const newMessages = [...prev];
483
+ newMessages[assistantMessageIndex] = {
484
+ ...newMessages[assistantMessageIndex],
485
+ content: fullTextBufferRef.current || errorMessage,
486
+ streaming: false,
487
+ error: true
488
+ };
489
+ return newMessages;
490
+ });
491
+ break;
492
+ }
493
+ }
494
+ }
495
+ }
496
+
497
+ } catch (err) {
498
+ console.error('Query error:', err);
499
+ setError(`Query failed: ${err.message}`);
500
+
501
+ if (typingIntervalRef.current) {
502
+ clearInterval(typingIntervalRef.current);
503
+ typingIntervalRef.current = null;
504
+ }
505
+
506
+ setMessages(prev => {
507
+ const newMessages = [...prev];
508
+ if (newMessages[assistantMessageIndex]) {
509
+ newMessages[assistantMessageIndex] = {
510
+ ...newMessages[assistantMessageIndex],
511
+ content: newMessages[assistantMessageIndex].content || '[Error occurred]',
512
+ streaming: false,
513
+ error: true
514
+ };
515
+ }
516
+ return newMessages;
517
+ });
518
+ } finally {
519
+ setIsQuerying(false);
520
+ }
521
+ };
522
+
523
+ const handleKeyPress = (e) => {
524
+ if (e.key === 'Enter' && !e.shiftKey) {
525
+ e.preventDefault();
526
+ handleQuery();
527
+ }
528
+ };
529
+
530
+ const handleDeleteDocument = async (docId, docName) => {
531
+ if (!docId) {
532
+ console.error('Document ID is undefined');
533
+ setError('Cannot delete document: ID is missing');
534
+ return;
535
+ }
536
+
537
+ const confirmed = window.confirm(
538
+ `Are you sure you want to delete "${docName || 'this document'}"?\n\nThis action cannot be undone.`
539
+ );
540
+
541
+ if (!confirmed) {
542
+ return;
543
+ }
544
+
545
+ try {
546
+ const response = await fetch(`${API_BASE_URL}/documents/${docId}`, {
547
+ method: 'DELETE'
548
+ });
549
+
550
+ const data = await response.json();
551
+
552
+ if (response.ok && data.success) {
553
+ setProcessedDocs(prev => prev.filter(doc => doc.id !== docId));
554
+ await fetchProcessedDocuments();
555
+ } else {
556
+ const errorMsg = data.message || data.detail || 'Failed to delete document';
557
+ setError(errorMsg);
558
+ }
559
+ } catch (err) {
560
+ console.error('Error deleting document:', err);
561
+ setError('Failed to delete document: ' + err.message);
562
+ }
563
+ };
564
+
565
+ const handleDragOver = (e) => {
566
+ e.preventDefault();
567
+ setIsDragging(true);
568
+ };
569
+
570
+ const handleDragLeave = (e) => {
571
+ e.preventDefault();
572
+ setIsDragging(false);
573
+ };
574
+
575
+ const handleDrop = (e) => {
576
+ e.preventDefault();
577
+ setIsDragging(false);
578
+ handleFileUpload(e.dataTransfer.files);
579
+ };
580
+
581
+ // =============================================================================
582
+ // Render Functions
583
+ // =============================================================================
584
+
585
+ const renderNavigation = () => (
586
+ <nav className={`${theme.bgTertiary} ${theme.border} border-b px-6 py-3`}>
587
+ <div className="flex items-center justify-between max-w-7xl mx-auto">
588
+ <div className="flex items-center space-x-8">
589
+ <div className="flex items-center space-x-3">
590
+ <div className="flex items-center space-x-2">
591
+ <div className={`w-8 h-8 ${darkMode ? 'bg-gradient-to-br from-purple-500 to-blue-500' : 'bg-gradient-to-br from-blue-600 to-purple-600'} rounded-lg flex items-center justify-center`}>
592
+ <Sparkles className="w-5 h-5 text-white" />
593
+ </div>
594
+ <h1 className={`text-xl font-bold ${theme.text}`}>TheTruthSchool</h1>
595
+ </div>
596
+ <span className={`text-sm ${theme.textMuted}`}>/ {DOMAIN_CONFIGS[selectedDomain].name}</span>
597
+ </div>
598
+
599
+ <div className="flex items-center space-x-1">
600
+ <button
601
+ onClick={() => setCurrentView('app')}
602
+ className={`px-4 py-2 text-sm font-medium rounded-md transition-colors ${
603
+ currentView === 'app'
604
+ ? `${darkMode ? 'text-blue-400 bg-blue-900/30' : 'text-blue-600 bg-blue-50'}`
605
+ : `${theme.textSecondary} ${theme.hover}`
606
+ }`}
607
+ >
608
+ Chat
609
+ </button>
610
+ <button
611
+ onClick={() => setCurrentView('files')}
612
+ className={`px-4 py-2 text-sm font-medium rounded-md transition-colors ${
613
+ currentView === 'files'
614
+ ? `${darkMode ? 'text-blue-400 bg-blue-900/30' : 'text-blue-600 bg-blue-50'}`
615
+ : `${theme.textSecondary} ${theme.hover}`
616
+ }`}
617
+ >
618
+ Files
619
+ </button>
620
+ <button
621
+ onClick={() => setCurrentView('settings')}
622
+ className={`px-4 py-2 text-sm font-medium rounded-md transition-colors ${
623
+ currentView === 'settings'
624
+ ? `${darkMode ? 'text-blue-400 bg-blue-900/30' : 'text-blue-600 bg-blue-50'}`
625
+ : `${theme.textSecondary} ${theme.hover}`
626
+ }`}
627
+ >
628
+ Settings
629
+ </button>
630
+ </div>
631
+ </div>
632
+
633
+ <div className="flex items-center space-x-2">
634
+ <button
635
+ onClick={() => setDarkMode(!darkMode)}
636
+ className={`p-2 ${theme.textSecondary} ${theme.hover} rounded-md transition-colors`}
637
+ >
638
+ {darkMode ? <Sun className="w-5 h-5" /> : <Moon className="w-5 h-5" />}
639
+ </button>
640
+ <button
641
+ onClick={() => setShowSidebar(!showSidebar)}
642
+ className={`p-2 ${theme.textSecondary} ${theme.hover} rounded-md transition-colors`}
643
+ >
644
+ {showSidebar ? <X className="w-5 h-5" /> : <Menu className="w-5 h-5" />}
645
+ </button>
646
+ </div>
647
+ </div>
648
+ </nav>
649
+ );
650
+
651
+ const renderSidebar = () => (
652
+ <div className={`${showSidebar ? 'w-64' : 'w-0'} transition-all duration-300 ${theme.bgSecondary} ${theme.border} border-r overflow-hidden`}>
653
+ <div className="p-4 space-y-4">
654
+ <div>
655
+ <h3 className={`text-xs font-semibold ${theme.textMuted} uppercase mb-3`}>Domains</h3>
656
+ <div className="space-y-1">
657
+ {Object.entries(DOMAIN_CONFIGS).map(([key, config]) => (
658
+ <button
659
+ key={key}
660
+ onClick={() => setSelectedDomain(key)}
661
+ className={`w-full flex items-center space-x-3 px-3 py-2 rounded-lg text-sm transition-colors ${
662
+ selectedDomain === key
663
+ ? `${darkMode ? 'bg-blue-900/30 text-blue-400' : 'bg-blue-50 text-blue-700'} font-medium`
664
+ : `${theme.textSecondary} ${theme.hover}`
665
+ }`}
666
+ >
667
+ <span className="text-lg">{config.icon}</span>
668
+ <span className="flex-1 text-left truncate">{config.name}</span>
669
+ </button>
670
+ ))}
671
+ </div>
672
+ </div>
673
+
674
+ {processingDocs.length > 0 && (
675
+ <div>
676
+ <h3 className={`text-xs font-semibold ${theme.textMuted} uppercase mb-3`}>Processing</h3>
677
+ <div className="space-y-2">
678
+ {processingDocs.map((doc, idx) => (
679
+ <div key={idx} className={`flex items-center space-x-2 px-3 py-2 ${darkMode ? 'bg-yellow-900/20' : 'bg-yellow-50'} rounded-lg`}>
680
+ <Loader2 className={`w-4 h-4 ${darkMode ? 'text-yellow-400' : 'text-yellow-600'} animate-spin`} />
681
+ <span className={`text-xs ${darkMode ? 'text-yellow-300' : 'text-yellow-800'} truncate flex-1`}>{doc.name}</span>
682
+ </div>
683
+ ))}
684
+ </div>
685
+ </div>
686
+ )}
687
+
688
+ {processedDocs.length > 0 && (
689
+ <div>
690
+ <h3 className={`text-xs font-semibold ${theme.textMuted} uppercase mb-3`}>
691
+ Documents ({processedDocs.length})
692
+ </h3>
693
+ <div className="space-y-1 max-h-64 overflow-y-auto">
694
+ {processedDocs.map((doc, idx) => (
695
+ <div key={idx} className={`flex items-center space-x-2 px-3 py-2 ${theme.bgTertiary} rounded-lg ${theme.border} border group`}>
696
+ <FileText className={`w-4 h-4 ${theme.textMuted}`} />
697
+ <span className={`text-xs ${theme.textSecondary} truncate flex-1`}>{doc.name || `Document ${idx + 1}`}</span>
698
+ <button
699
+ onClick={() => handleDeleteDocument(doc.id, doc.name)}
700
+ className="opacity-0 group-hover:opacity-100 transition-opacity"
701
+ >
702
+ <Trash2 className={`w-3 h-3 ${theme.textMuted} hover:text-red-600`} />
703
+ </button>
704
+ </div>
705
+ ))}
706
+ </div>
707
+ </div>
708
+ )}
709
+
710
+ {messages.length > 0 && (
711
+ <div className={`pt-4 ${theme.border} border-t`}>
712
+ <button
713
+ onClick={() => {
714
+ if (window.confirm('Clear all chat history? This cannot be undone.')) {
715
+ setMessages([]);
716
+ window.localStorage.removeItem('chatMessages');
717
+ }
718
+ }}
719
+ className={`w-full flex items-center justify-center space-x-2 px-3 py-2 text-sm text-red-500 hover:${darkMode ? 'bg-red-900/20' : 'bg-red-50'} rounded-lg transition-colors`}
720
+ >
721
+ <Trash2 className="w-4 h-4" />
722
+ <span>Clear Chat</span>
723
+ </button>
724
+ </div>
725
+ )}
726
+ </div>
727
+ </div>
728
+ );
729
+
730
+ const renderAppView = () => (
731
+ <div className={`flex-1 flex flex-col ${theme.bg}`}>
732
+ {messages.length === 0 ? (
733
+ <div className="flex-1 flex flex-col items-center justify-center px-4">
734
+ <div className="text-center max-w-2xl">
735
+ <div className={`w-20 h-20 ${darkMode ? 'bg-gradient-to-br from-purple-500 to-blue-500' : 'bg-gradient-to-br from-blue-600 to-purple-600'} rounded-2xl flex items-center justify-center mx-auto mb-6 shadow-lg`}>
736
+ <Sparkles className="w-10 h-10 text-white" />
737
+ </div>
738
+ <h2 className={`text-4xl font-bold ${theme.text} mb-3`}>TheTruthSchool AI</h2>
739
+ <p className={`${theme.textSecondary} mb-8 text-lg`}>
740
+ Your intelligent assistant for document analysis and knowledge discovery
741
+ </p>
742
+
743
+ <div className="grid grid-cols-3 gap-4 text-left">
744
+ <div className={`p-5 ${theme.bgSecondary} rounded-xl ${theme.border} border`}>
745
+ <div className="text-3xl mb-3">📚</div>
746
+ <h3 className={`font-semibold ${theme.text} mb-2`}>Smart Upload</h3>
747
+ <p className={`text-sm ${theme.textSecondary}`}>Process PDFs, documents, and web content</p>
748
+ </div>
749
+ <div className={`p-5 ${theme.bgSecondary} rounded-xl ${theme.border} border`}>
750
+ <div className="text-3xl mb-3">🧠</div>
751
+ <h3 className={`font-semibold ${theme.text} mb-2`}>Deep Understanding</h3>
752
+ <p className={`text-sm ${theme.textSecondary}`}>Advanced RAG with knowledge graphs</p>
753
+ </div>
754
+ <div className={`p-5 ${theme.bgSecondary} rounded-xl ${theme.border} border`}>
755
+ <div className="text-3xl mb-3">✨</div>
756
+ <h3 className={`font-semibold ${theme.text} mb-2`}>Multi-Domain</h3>
757
+ <p className={`text-sm ${theme.textSecondary}`}>Optimized for healthcare, legal, finance & more</p>
758
+ </div>
759
+ </div>
760
+ </div>
761
+ </div>
762
+ ) : (
763
+ <div className="flex-1 overflow-y-auto px-4 py-6">
764
+ <div className="max-w-3xl mx-auto space-y-6">
765
+ {messages.map((msg, idx) => (
766
+ <div key={idx} className={`flex ${msg.role === 'user' ? 'justify-end' : 'justify-start'}`}>
767
+ <div className={`max-w-[80%] ${msg.role === 'user' ? 'bg-blue-600 text-white' : `${theme.assistantMessage} ${theme.text}`} rounded-2xl px-5 py-4 shadow-sm`}>
768
+ {msg.role === 'user' ? (
769
+ <p className="text-sm whitespace-pre-wrap">{msg.content}</p>
770
+ ) : (
771
+ <div className={`text-sm prose prose-sm max-w-none ${darkMode ? 'prose-invert' : ''}`}>
772
+ <ReactMarkdown
773
+ remarkPlugins={[remarkGfm]}
774
+ rehypePlugins={[rehypeHighlight]}
775
+ components={{
776
+ code({ node, inline, className, children, ...props }) {
777
+ return inline ? (
778
+ <code className={`${darkMode ? 'bg-gray-700 text-gray-100' : 'bg-gray-200 text-gray-800'} px-1.5 py-0.5 rounded text-xs font-mono`} {...props}>
779
+ {children}
780
+ </code>
781
+ ) : (
782
+ <code className={className} {...props}>
783
+ {children}
784
+ </code>
785
+ );
786
+ },
787
+ a({ node, children, ...props }) {
788
+ return (
789
+ <a className={`${darkMode ? 'text-blue-400' : 'text-blue-600'} hover:underline`} target="_blank" rel="noopener noreferrer" {...props}>
790
+ {children}
791
+ </a>
792
+ );
793
+ },
794
+ table: ({ node, ...props }) => (
795
+ <div className="overflow-x-auto my-4">
796
+ <table className={`min-w-full divide-y ${darkMode ? 'divide-gray-700 border-gray-700' : 'divide-gray-300 border-gray-300'} border rounded-lg`} {...props} />
797
+ </div>
798
+ ),
799
+ thead: ({ node, ...props }) => (
800
+ <thead className={darkMode ? 'bg-gray-800' : 'bg-gray-100'} {...props} />
801
+ ),
802
+ tbody: ({ node, ...props }) => (
803
+ <tbody className={`divide-y ${darkMode ? 'divide-gray-700 bg-gray-900' : 'divide-gray-200 bg-white'}`} {...props} />
804
+ ),
805
+ th: ({ node, ...props }) => (
806
+ <th className={`px-4 py-3 text-left text-xs font-bold uppercase tracking-wider ${darkMode ? 'text-gray-300 border-gray-700' : 'text-gray-700 border-gray-300'} border-r last:border-r-0`} {...props} />
807
+ ),
808
+ td: ({ node, ...props }) => (
809
+ <td className={`px-4 py-3 text-sm ${darkMode ? 'text-gray-300 border-gray-700' : 'text-gray-900 border-gray-200'} border-r last:border-r-0`} {...props} />
810
+ ),
811
+ tr: ({ node, ...props }) => (
812
+ <tr className={darkMode ? 'hover:bg-gray-800' : 'hover:bg-gray-50'} {...props} />
813
+ ),
814
+ }}
815
+ >
816
+ {msg.content}
817
+ </ReactMarkdown>
818
+ </div>
819
+ )}
820
+ {msg.streaming && msg.role === 'assistant' && (
821
+ <div className={`flex items-center space-x-1 ${theme.textMuted} text-sm mt-2`}>
822
+ <span>Thinking</span>
823
+ <span className="animate-pulse">...</span>
824
+ </div>
825
+ )}
826
+ </div>
827
+ </div>
828
+ ))}
829
+ <div ref={messagesEndRef} />
830
+ </div>
831
+ </div>
832
+ )}
833
+
834
+ {/* Bottom Input Bar */}
835
+ <div className={`${theme.border} border-t ${theme.bgTertiary} px-4 py-4`}>
836
+ <div className="max-w-3xl mx-auto">
837
+ <div className="flex items-end space-x-3">
838
+ <button
839
+ onClick={() => setShowUploadModal(true)}
840
+ className={`px-4 py-3 ${darkMode ? 'bg-blue-600 hover:bg-blue-700' : 'bg-blue-600 hover:bg-blue-700'} text-white rounded-xl transition-colors flex items-center space-x-2`}
841
+ >
842
+ <Upload className="w-4 h-4" />
843
+ <span className="text-sm font-medium">Upload</span>
844
+ </button>
845
+
846
+ <textarea
847
+ value={query}
848
+ onChange={(e) => setQuery(e.target.value)}
849
+ onKeyDown={handleKeyPress}
850
+ placeholder="Message TheTruthSchool..."
851
+ className={`flex-1 px-4 py-3 ${theme.input} rounded-xl focus:outline-none focus:ring-2 focus:ring-blue-500 resize-none`}
852
+ disabled={isQuerying}
853
+ rows={1}
854
+ style={{ minHeight: '48px', maxHeight: '200px' }}
855
+ />
856
+
857
+ <button
858
+ onClick={handleQuery}
859
+ disabled={isQuerying || !query.trim()}
860
+ className={`p-3 ${darkMode ? 'bg-blue-600 hover:bg-blue-700' : 'bg-blue-600 hover:bg-blue-700'} text-white rounded-xl transition-colors disabled:opacity-50 disabled:cursor-not-allowed`}
861
+ >
862
+ <Send className="w-5 h-5" />
863
+ </button>
864
+ </div>
865
+
866
+ <div className="flex items-center justify-center space-x-6 mt-3">
867
+ <label className="flex items-center space-x-2 cursor-pointer">
868
+ <input
869
+ type="checkbox"
870
+ checked={enableWebSearch}
871
+ onChange={(e) => {
872
+ setEnableWebSearch(e.target.checked);
873
+ if (e.target.checked && webSearchOnly) {
874
+ setWebSearchOnly(false);
875
+ }
876
+ }}
877
+ className="w-4 h-4 text-blue-600 rounded focus:ring-blue-500"
878
+ />
879
+ <span className={`text-sm ${theme.textSecondary}`}>Enhance with Web Search</span>
880
+ </label>
881
+ <label className="flex items-center space-x-2 cursor-pointer">
882
+ <input
883
+ type="checkbox"
884
+ checked={webSearchOnly}
885
+ onChange={(e) => {
886
+ setWebSearchOnly(e.target.checked);
887
+ if (e.target.checked) {
888
+ setEnableWebSearch(false);
889
+ }
890
+ }}
891
+ className="w-4 h-4 text-blue-600 rounded focus:ring-blue-500"
892
+ />
893
+ <span className={`text-sm ${theme.textSecondary}`}>Web Search Only</span>
894
+ </label>
895
+ </div>
896
+
897
+ <p className={`text-xs ${theme.textMuted} mt-2 text-center`}>
898
+ Press Enter to send • Shift+Enter for new line
899
+ </p>
900
+ </div>
901
+ </div>
902
+ </div>
903
+ );
904
+
905
+ const renderFilesView = () => (
906
+ <div className={`flex-1 overflow-y-auto p-6 ${theme.bg}`}>
907
+ <div className="max-w-5xl mx-auto">
908
+ <div className="flex items-center justify-between mb-6">
909
+ <div>
910
+ <h2 className={`text-2xl font-bold ${theme.text}`}>Document Management</h2>
911
+ <p className={theme.textSecondary}>Manage your uploaded and processed documents</p>
912
+ </div>
913
+ <div className="flex space-x-3">
914
+ <button
915
+ onClick={fetchProcessedDocuments}
916
+ className={`flex items-center space-x-2 px-4 py-2 ${theme.button} ${theme.text} rounded-lg transition-colors`}
917
+ >
918
+ <RefreshCw className="w-4 h-4" />
919
+ <span>Refresh</span>
920
+ </button>
921
+ <button
922
+ onClick={() => setShowUploadModal(true)}
923
+ className="flex items-center space-x-2 px-4 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors"
924
+ >
925
+ <Upload className="w-4 h-4" />
926
+ <span>Upload Documents</span>
927
+ </button>
928
+ </div>
929
+ </div>
930
+
931
+ {processingDocs.length > 0 && (
932
+ <div className="mb-6">
933
+ <h3 className={`text-lg font-semibold ${theme.text} mb-3`}>Processing Documents</h3>
934
+ <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
935
+ {processingDocs.map((doc, idx) => (
936
+ <div key={idx} className={`flex items-center space-x-4 p-4 ${darkMode ? 'bg-yellow-900/20 border-yellow-800' : 'bg-yellow-50 border-yellow-200'} border rounded-lg`}>
937
+ <Loader2 className={`w-8 h-8 ${darkMode ? 'text-yellow-400' : 'text-yellow-600'} animate-spin`} />
938
+ <div className="flex-1">
939
+ <p className={`font-medium ${theme.text}`}>{doc.name}</p>
940
+ <p className={`text-sm ${theme.textSecondary}`}>Processing...</p>
941
+ </div>
942
+ </div>
943
+ ))}
944
+ </div>
945
+ </div>
946
+ )}
947
+
948
+ <div>
949
+ <h3 className={`text-lg font-semibold ${theme.text} mb-3`}>
950
+ Processed Documents ({processedDocs.length})
951
+ </h3>
952
+ {processedDocs.length === 0 ? (
953
+ <div className={`text-center py-12 ${theme.bgSecondary} rounded-lg`}>
954
+ <FolderOpen className={`w-16 h-16 ${theme.textMuted} mx-auto mb-4`} />
955
+ <p className={theme.textSecondary}>No documents processed yet</p>
956
+ <button
957
+ onClick={() => setShowUploadModal(true)}
958
+ className="mt-4 px-6 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors"
959
+ >
960
+ Upload Your First Document
961
+ </button>
962
+ </div>
963
+ ) : (
964
+ <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
965
+ {processedDocs.map((doc, idx) => (
966
+ <div key={idx} className={`p-4 ${theme.bgTertiary} ${theme.border} border rounded-lg hover:shadow-lg transition-all group`}>
967
+ <div className="flex items-start justify-between mb-3">
968
+ <FileText className="w-8 h-8 text-blue-600" />
969
+ <button
970
+ onClick={() => handleDeleteDocument(doc.id, doc.name)}
971
+ className="opacity-0 group-hover:opacity-100 transition-opacity p-1 hover:bg-gray-700 rounded"
972
+ >
973
+ <Trash2 className={`w-4 h-4 ${theme.textMuted} hover:text-red-500`} />
974
+ </button>
975
+ </div>
976
+ <p className={`font-medium ${theme.text} mb-1 truncate`} title={doc.name}>{doc.name || `Document ${idx + 1}`}</p>
977
+ <p className={`text-sm ${theme.textSecondary} mb-2`}>{DOMAIN_CONFIGS[doc.domain]?.name || selectedDomain}</p>
978
+ <div className="flex items-center space-x-2">
979
+ <CheckCircle className="w-4 h-4 text-green-500" />
980
+ <span className={`text-xs ${theme.textSecondary}`}>Processed</span>
981
+ </div>
982
+ </div>
983
+ ))}
984
+ </div>
985
+ )}
986
+ </div>
987
+ </div>
988
+ </div>
989
+ );
990
+
991
+ const renderSettingsView = () => (
992
+ <div className={`flex-1 overflow-y-auto p-6 ${theme.bg}`}>
993
+ <div className="max-w-3xl mx-auto">
994
+ <h2 className={`text-2xl font-bold ${theme.text} mb-6`}>Settings</h2>
995
+
996
+ <div className="space-y-6">
997
+ <div className={`${theme.bgTertiary} ${theme.border} border rounded-lg p-6`}>
998
+ <h3 className={`text-lg font-semibold ${theme.text} mb-4`}>Appearance</h3>
999
+ <div className="flex items-center justify-between">
1000
+ <div>
1001
+ <label className={`block text-sm font-medium ${theme.text}`}>Theme</label>
1002
+ <p className={`text-xs ${theme.textSecondary} mt-1`}>Choose your preferred interface theme</p>
1003
+ </div>
1004
+ <button
1005
+ onClick={() => setDarkMode(!darkMode)}
1006
+ className={`px-4 py-2 ${theme.button} ${theme.text} rounded-lg transition-colors flex items-center space-x-2`}
1007
+ >
1008
+ {darkMode ? (
1009
+ <>
1010
+ <Sun className="w-4 h-4" />
1011
+ <span>Light Mode</span>
1012
+ </>
1013
+ ) : (
1014
+ <>
1015
+ <Moon className="w-4 h-4" />
1016
+ <span>Dark Mode</span>
1017
+ </>
1018
+ )}
1019
+ </button>
1020
+ </div>
1021
+ </div>
1022
+
1023
+ <div className={`${theme.bgTertiary} ${theme.border} border rounded-lg p-6`}>
1024
+ <h3 className={`text-lg font-semibold ${theme.text} mb-4`}>Domain Configuration</h3>
1025
+ <div className="space-y-3">
1026
+ <div>
1027
+ <label className={`block text-sm font-medium ${theme.text} mb-2`}>Current Domain</label>
1028
+ <select
1029
+ value={selectedDomain}
1030
+ onChange={(e) => setSelectedDomain(e.target.value)}
1031
+ className={`w-full px-4 py-2 ${theme.input} rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500`}
1032
+ >
1033
+ {Object.entries(DOMAIN_CONFIGS).map(([key, config]) => (
1034
+ <option key={key} value={key}>{config.name}</option>
1035
+ ))}
1036
+ </select>
1037
+ </div>
1038
+ </div>
1039
+ </div>
1040
+
1041
+ <div className={`${theme.bgTertiary} ${theme.border} border rounded-lg p-6`}>
1042
+ <h3 className={`text-lg font-semibold ${theme.text} mb-4`}>Performance Settings</h3>
1043
+ <div className="space-y-4">
1044
+ <div className="flex items-start space-x-3">
1045
+ <input
1046
+ type="checkbox"
1047
+ id="fastMode"
1048
+ checked={fastMode}
1049
+ onChange={(e) => setFastMode(e.target.checked)}
1050
+ className="w-5 h-5 text-blue-600 rounded focus:ring-blue-500 mt-0.5"
1051
+ />
1052
+ <div className="flex-1">
1053
+ <label htmlFor="fastMode" className={`block text-sm font-medium ${theme.text} cursor-pointer`}>
1054
+ Fast Mode
1055
+ </label>
1056
+ <p className={`text-xs ${theme.textSecondary} mt-1`}>
1057
+ Use optimized parameters for 2-3x faster queries
1058
+ </p>
1059
+ </div>
1060
+ </div>
1061
+
1062
+ <div className="flex items-start space-x-3">
1063
+ <input
1064
+ type="checkbox"
1065
+ id="enableCache"
1066
+ checked={enableCache}
1067
+ onChange={(e) => setEnableCache(e.target.checked)}
1068
+ className="w-5 h-5 text-blue-600 rounded focus:ring-blue-500 mt-0.5"
1069
+ />
1070
+ <div className="flex-1">
1071
+ <label htmlFor="enableCache" className={`block text-sm font-medium ${theme.text} cursor-pointer`}>
1072
+ Enable Query Caching
1073
+ </label>
1074
+ <p className={`text-xs ${theme.textSecondary} mt-1`}>
1075
+ Cache results for faster repeated queries
1076
+ </p>
1077
+ </div>
1078
+ </div>
1079
+
1080
+ <div className="flex items-start space-x-3">
1081
+ <input
1082
+ type="checkbox"
1083
+ id="enableQueryImprovement"
1084
+ checked={enableQueryImprovement}
1085
+ onChange={(e) => setEnableQueryImprovement(e.target.checked)}
1086
+ className="w-5 h-5 text-blue-600 rounded focus:ring-blue-500 mt-0.5"
1087
+ />
1088
+ <div className="flex-1">
1089
+ <label htmlFor="enableQueryImprovement" className={`block text-sm font-medium ${theme.text} cursor-pointer`}>
1090
+ Enable Query Improvement
1091
+ </label>
1092
+ <p className={`text-xs ${theme.textSecondary} mt-1`}>
1093
+ Automatically enhance queries for better results
1094
+ </p>
1095
+ </div>
1096
+ </div>
1097
+
1098
+ <div className="flex items-start space-x-3">
1099
+ <input
1100
+ type="checkbox"
1101
+ id="enableVerification"
1102
+ checked={enableVerification}
1103
+ onChange={(e) => setEnableVerification(e.target.checked)}
1104
+ className="w-5 h-5 text-blue-600 rounded focus:ring-blue-500 mt-0.5"
1105
+ />
1106
+ <div className="flex-1">
1107
+ <label htmlFor="enableVerification" className={`block text-sm font-medium ${theme.text} cursor-pointer`}>
1108
+ Enable Answer Verification
1109
+ </label>
1110
+ <p className={`text-xs ${theme.textSecondary} mt-1`}>
1111
+ Verify answer quality and accuracy with dual-LLM
1112
+ </p>
1113
+ </div>
1114
+ </div>
1115
+ </div>
1116
+ </div>
1117
+ </div>
1118
+ </div>
1119
+ </div>
1120
+ );
1121
+
1122
+ const renderUploadModal = () => {
1123
+ if (!showUploadModal) return null;
1124
+
1125
+ return (
1126
+ <div className="fixed inset-0 bg-black bg-opacity-70 flex items-center justify-center z-50 p-4 backdrop-blur-sm">
1127
+ <div className={`${theme.bgTertiary} rounded-2xl max-w-2xl w-full p-6 shadow-2xl`}>
1128
+ <div className="flex items-center justify-between mb-6">
1129
+ <h2 className={`text-2xl font-bold ${theme.text}`}>Upload Documents</h2>
1130
+ <button
1131
+ onClick={() => {
1132
+ setShowUploadModal(false);
1133
+ setUploadMode('file');
1134
+ setUrlInput('');
1135
+ }}
1136
+ className={`p-2 ${theme.hover} rounded-lg`}
1137
+ >
1138
+ <X className={`w-5 h-5 ${theme.textSecondary}`} />
1139
+ </button>
1140
+ </div>
1141
+
1142
+ <div className="flex items-center space-x-2 mb-6">
1143
+ <button
1144
+ onClick={() => setUploadMode('file')}
1145
+ className={`flex-1 px-4 py-2 rounded-lg font-medium transition-colors ${
1146
+ uploadMode === 'file'
1147
+ ? 'bg-blue-600 text-white'
1148
+ : `${theme.button} ${theme.text}`
1149
+ }`}
1150
+ >
1151
+ Upload File
1152
+ </button>
1153
+ <button
1154
+ onClick={() => setUploadMode('url')}
1155
+ className={`flex-1 px-4 py-2 rounded-lg font-medium transition-colors ${
1156
+ uploadMode === 'url'
1157
+ ? 'bg-blue-600 text-white'
1158
+ : `${theme.button} ${theme.text}`
1159
+ }`}
1160
+ >
1161
+ Upload from URL
1162
+ </button>
1163
+ </div>
1164
+
1165
+ {uploadMode === 'file' ? (
1166
+ <div
1167
+ onDragOver={handleDragOver}
1168
+ onDragLeave={handleDragLeave}
1169
+ onDrop={handleDrop}
1170
+ className={`border-2 border-dashed rounded-xl p-12 text-center transition-colors ${
1171
+ isDragging
1172
+ ? 'border-blue-500 bg-blue-500/10'
1173
+ : `${theme.borderLight}`
1174
+ }`}
1175
+ >
1176
+ <Upload className={`w-16 h-16 ${theme.textMuted} mx-auto mb-4`} />
1177
+ <h3 className={`text-lg font-semibold ${theme.text} mb-2`}>
1178
+ Drop files here or click to browse
1179
+ </h3>
1180
+ <p className={`${theme.textSecondary} mb-4`}>
1181
+ Supported: {DOMAIN_CONFIGS[selectedDomain].fileTypes.join(', ')}
1182
+ </p>
1183
+ <input
1184
+ ref={fileInputRef}
1185
+ type="file"
1186
+ multiple
1187
+ accept={DOMAIN_CONFIGS[selectedDomain].fileTypes.join(',')}
1188
+ onChange={(e) => handleFileUpload(e.target.files)}
1189
+ className="hidden"
1190
+ />
1191
+ <button
1192
+ onClick={() => fileInputRef.current?.click()}
1193
+ className="px-6 py-2 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors"
1194
+ >
1195
+ Select Files
1196
+ </button>
1197
+ </div>
1198
+ ) : (
1199
+ <div className="space-y-4">
1200
+ <div>
1201
+ <label className={`block text-sm font-medium ${theme.text} mb-2`}>
1202
+ Enter URL to fetch and process
1203
+ </label>
1204
+ <input
1205
+ type="url"
1206
+ value={urlInput}
1207
+ onChange={(e) => setUrlInput(e.target.value)}
1208
+ placeholder="https://example.com/document.pdf"
1209
+ className={`w-full px-4 py-3 ${theme.input} rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500`}
1210
+ onKeyDown={(e) => {
1211
+ if (e.key === 'Enter') {
1212
+ handleUrlUpload();
1213
+ }
1214
+ }}
1215
+ />
1216
+ </div>
1217
+ <button
1218
+ onClick={handleUrlUpload}
1219
+ disabled={!urlInput.trim()}
1220
+ className="w-full px-6 py-3 bg-blue-600 text-white rounded-lg hover:bg-blue-700 transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
1221
+ >
1222
+ Fetch and Process URL
1223
+ </button>
1224
+ </div>
1225
+ )}
1226
+ </div>
1227
+ </div>
1228
+ );
1229
+ };
1230
+
1231
+ const renderError = () => {
1232
+ if (!error) return null;
1233
+
1234
+ return (
1235
+ <div className={`fixed bottom-4 right-4 ${darkMode ? 'bg-red-900/90 border-red-800' : 'bg-red-50 border-red-200'} border rounded-lg p-4 max-w-md shadow-2xl backdrop-blur-sm`}>
1236
+ <div className="flex items-start space-x-3">
1237
+ <XCircle className="w-5 h-5 text-red-500 flex-shrink-0 mt-0.5" />
1238
+ <div className="flex-1">
1239
+ <p className={`text-sm ${darkMode ? 'text-red-200' : 'text-red-800'}`}>{error}</p>
1240
+ </div>
1241
+ <button
1242
+ onClick={() => setError(null)}
1243
+ className="text-red-500 hover:text-red-600"
1244
+ >
1245
+ <X className="w-4 h-4" />
1246
+ </button>
1247
+ </div>
1248
+ </div>
1249
+ );
1250
+ };
1251
+
1252
+ return (
1253
+ <div className={`h-screen flex flex-col ${theme.bg}`}>
1254
+ {renderNavigation()}
1255
+
1256
+ <div className="flex-1 flex overflow-hidden">
1257
+ {renderSidebar()}
1258
+
1259
+ {currentView === 'app' && renderAppView()}
1260
+ {currentView === 'files' && renderFilesView()}
1261
+ {currentView === 'settings' && renderSettingsView()}
1262
+ </div>
1263
+
1264
+ {renderUploadModal()}
1265
+ {renderError()}
1266
+ </div>
1267
+ );
1268
+ }
frontend/src/index.css ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @tailwind base;
2
+ @tailwind components;
3
+ @tailwind utilities;
4
+
5
+ @keyframes blink {
6
+ 0%, 80%, 100% {
7
+ opacity: 0;
8
+ }
9
+ 40% {
10
+ opacity: 1;
11
+ }
12
+ }
13
+ .animate-blink {
14
+ animation: blink 1.4s infinite;
15
+ animation-fill-mode: both;
16
+ }
17
+
18
+ * {
19
+ margin: 0;
20
+ padding: 0;
21
+ box-sizing: border-box;
22
+ }
23
+
24
+ body {
25
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
26
+ 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
27
+ sans-serif;
28
+ -webkit-font-smoothing: antialiased;
29
+ -moz-osx-font-smoothing: grayscale;
30
+ }
31
+
32
+ code {
33
+ font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
34
+ monospace;
35
+ }
36
+
37
+
38
+ /* Custom scrollbar for webkit browsers */
39
+ ::-webkit-scrollbar {
40
+ width: 8px;
41
+ height: 8px;
42
+ }
43
+
44
+ ::-webkit-scrollbar-track {
45
+ background: #1f2937;
46
+ border-radius: 4px;
47
+ }
48
+
49
+ ::-webkit-scrollbar-thumb {
50
+ background: #4b5563;
51
+ border-radius: 4px;
52
+ }
53
+
54
+ ::-webkit-scrollbar-thumb:hover {
55
+ background: #6b7280;
56
+ }
57
+
58
+ /* Smooth transitions */
59
+ * {
60
+ transition-property: background-color, border-color, color, fill, stroke;
61
+ transition-timing-function: cubic-bezier(0.4, 0, 0.2, 1);
62
+ transition-duration: 150ms;
63
+ }
64
+
65
+ /* Animations */
66
+ @keyframes fadeIn {
67
+ from {
68
+ opacity: 0;
69
+ transform: translateY(10px);
70
+ }
71
+ to {
72
+ opacity: 1;
73
+ transform: translateY(0);
74
+ }
75
+ }
76
+
77
+ .animate-fadeIn {
78
+ animation: fadeIn 0.3s ease-out;
79
+ }
frontend/src/index.js ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from 'react';
2
+ import ReactDOM from 'react-dom/client';
3
+ import './index.css';
4
+ import App from './App';
5
+
6
+ const root = ReactDOM.createRoot(document.getElementById('root'));
7
+ root.render(
8
+ <React.StrictMode>
9
+ <App />
10
+ </React.StrictMode>
11
+ );
frontend/tailwind.config.js ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /** @type {import('tailwindcss').Config} */
2
+ module.exports = {
3
+ content: [
4
+ "./src/**/*.{js,jsx,ts,tsx}",
5
+ ],
6
+ theme: {
7
+ extend: {
8
+ colors: {
9
+ gray: {
10
+ 650: '#4b5563',
11
+ 750: '#2d3748',
12
+ 850: '#1a202c',
13
+ }
14
+ },
15
+ animation: {
16
+ 'fadeIn': 'fadeIn 0.3s ease-out',
17
+ },
18
+ keyframes: {
19
+ fadeIn: {
20
+ '0%': { opacity: '0', transform: 'translateY(10px)' },
21
+ '100%': { opacity: '1', transform: 'translateY(0)' },
22
+ }
23
+ }
24
+ },
25
+ },
26
+ plugins: [],
27
+ }
rag_anything_smaranika/.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Bug Report
2
+ description: File a bug report
3
+ title: "[Bug]:"
4
+ labels: ["bug", "triage"]
5
+
6
+ body:
7
+ - type: checkboxes
8
+ id: existingcheck
9
+ attributes:
10
+ label: Do you need to file an issue?
11
+ description: Please help us manage our time by avoiding duplicates and common bugs with the steps below.
12
+ options:
13
+ - label: I have searched the existing issues and this bug is not already filed.
14
+ - label: I believe this is a legitimate bug, not just a question or feature request.
15
+ - type: textarea
16
+ id: description
17
+ attributes:
18
+ label: Describe the bug
19
+ description: A clear and concise description of what the bug is.
20
+ placeholder: What went wrong?
21
+ - type: textarea
22
+ id: reproduce
23
+ attributes:
24
+ label: Steps to reproduce
25
+ description: Steps to reproduce the behavior.
26
+ placeholder: How can we replicate the issue?
27
+ - type: textarea
28
+ id: expected_behavior
29
+ attributes:
30
+ label: Expected Behavior
31
+ description: A clear and concise description of what you expected to happen.
32
+ placeholder: What should have happened?
33
+ - type: textarea
34
+ id: configused
35
+ attributes:
36
+ label: LightRAG Config Used
37
+ description: The LightRAG configuration used for the run.
38
+ placeholder: The settings content or LightRAG configuration
39
+ value: |
40
+ # Paste your config here
41
+ - type: textarea
42
+ id: screenshotslogs
43
+ attributes:
44
+ label: Logs and screenshots
45
+ description: If applicable, add screenshots and logs to help explain your problem.
46
+ placeholder: Add logs and screenshots here
47
+ - type: textarea
48
+ id: additional_information
49
+ attributes:
50
+ label: Additional Information
51
+ description: |
52
+ - LightRAG Version: e.g., v0.1.1
53
+ - Operating System: e.g., Windows 10, Ubuntu 20.04
54
+ - Python Version: e.g., 3.8
55
+ - Related Issues: e.g., #1
56
+ - Any other relevant information.
57
+ value: |
58
+ - LightRAG Version:
59
+ - Operating System:
60
+ - Python Version:
61
+ - Related Issues:
rag_anything_smaranika/.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1 @@
 
 
1
+ blank_issues_enabled: false
rag_anything_smaranika/.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Feature Request
2
+ description: File a feature request
3
+ labels: ["enhancement"]
4
+ title: "[Feature Request]:"
5
+
6
+ body:
7
+ - type: checkboxes
8
+ id: existingcheck
9
+ attributes:
10
+ label: Do you need to file a feature request?
11
+ description: Please help us manage our time by avoiding duplicates and common feature request with the steps below.
12
+ options:
13
+ - label: I have searched the existing feature request and this feature request is not already filed.
14
+ - label: I believe this is a legitimate feature request, not just a question or bug.
15
+ - type: textarea
16
+ id: feature_request_description
17
+ attributes:
18
+ label: Feature Request Description
19
+ description: A clear and concise description of the feature request you would like.
20
+ placeholder: What this feature request add more or improve?
21
+ - type: textarea
22
+ id: additional_context
23
+ attributes:
24
+ label: Additional Context
25
+ description: Add any other context or screenshots about the feature request here.
26
+ placeholder: Any additional information
rag_anything_smaranika/.github/ISSUE_TEMPLATE/question.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Question
2
+ description: Ask a general question
3
+ labels: ["question"]
4
+ title: "[Question]:"
5
+
6
+ body:
7
+ - type: checkboxes
8
+ id: existingcheck
9
+ attributes:
10
+ label: Do you need to ask a question?
11
+ description: Please help us manage our time by avoiding duplicates and common questions with the steps below.
12
+ options:
13
+ - label: I have searched the existing question and discussions and this question is not already answered.
14
+ - label: I believe this is a legitimate question, not just a bug or feature request.
15
+ - type: textarea
16
+ id: question
17
+ attributes:
18
+ label: Your Question
19
+ description: A clear and concise description of your question.
20
+ placeholder: What is your question?
21
+ - type: textarea
22
+ id: context
23
+ attributes:
24
+ label: Additional Context
25
+ description: Provide any additional context or details that might help us understand your question better.
26
+ placeholder: Add any relevant information here
rag_anything_smaranika/.github/dependabot.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for all configuration options:
4
+ # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
5
+
6
+ version: 2
7
+ updates:
8
+ - package-ecosystem: "pip" # See documentation for possible values
9
+ directory: "/" # Location of package manifests
10
+ schedule:
11
+ interval: "weekly"
rag_anything_smaranika/.github/pull_request_template.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--
2
+ Thanks for contributing to RAGAnything!
3
+
4
+ Please ensure your pull request is ready for review before submitting.
5
+
6
+ About this template
7
+
8
+ This template helps contributors provide a clear and concise description of their changes. Feel free to adjust it as needed.
9
+ -->
10
+
11
+ ## Description
12
+
13
+ [Briefly describe the changes made in this pull request.]
14
+
15
+ ## Related Issues
16
+
17
+ [Reference any related issues or tasks addressed by this pull request.]
18
+
19
+ ## Changes Made
20
+
21
+ [List the specific changes made in this pull request.]
22
+
23
+ ## Checklist
24
+
25
+ - [ ] Changes tested locally
26
+ - [ ] Code reviewed
27
+ - [ ] Documentation updated (if necessary)
28
+ - [ ] Unit tests added (if applicable)
29
+
30
+ ## Additional Notes
31
+
32
+ [Add any additional notes or context for the reviewer(s).]
rag_anything_smaranika/.github/workflows/linting.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Linting and Formatting
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ pull_request:
8
+ branches:
9
+ - main
10
+
11
+ jobs:
12
+ lint-and-format:
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - name: Checkout code
17
+ uses: actions/checkout@v2
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v2
21
+ with:
22
+ python-version: '3.x'
23
+
24
+ - name: Install dependencies
25
+ run: |
26
+ python -m pip install --upgrade pip
27
+ pip install pre-commit
28
+
29
+ - name: Run pre-commit
30
+ run: pre-commit run --all-files --show-diff-on-failure
rag_anything_smaranika/.github/workflows/pypi-publish.yml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Upload RAGAnything Package
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ release-build:
12
+ runs-on: ubuntu-latest
13
+
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.x"
20
+
21
+ - name: Build release distributions
22
+ run: |
23
+ python -m pip install build
24
+ python -m build
25
+
26
+ - name: Upload distributions
27
+ uses: actions/upload-artifact@v4
28
+ with:
29
+ name: release-dists
30
+ path: dist/
31
+
32
+ pypi-publish:
33
+ runs-on: ubuntu-latest
34
+ needs:
35
+ - release-build
36
+ permissions:
37
+ id-token: write
38
+
39
+ environment:
40
+ name: pypi
41
+
42
+ steps:
43
+ - name: Retrieve release distributions
44
+ uses: actions/download-artifact@v4
45
+ with:
46
+ name: release-dists
47
+ path: dist/
48
+
49
+ - name: Publish release distributions to PyPI
50
+ uses: pypa/gh-action-pypi-publish@release/v1
51
+ with:
52
+ packages-dir: dist/
rag_anything_smaranika/.gitignore ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-related files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ *.tgz
7
+ *.tar.gz
8
+ *.ini
9
+
10
+ # Virtual Environment
11
+ .venv/
12
+ env/
13
+ venv/
14
+
15
+ *.env*
16
+ .env_example
17
+
18
+ # Build / Distribution
19
+ dist/
20
+ build/
21
+ site/
22
+
23
+ # Logs / Reports
24
+ *.log
25
+ *.log.*
26
+ *.logfire
27
+ *.coverage/
28
+ log/
29
+
30
+ # Caches
31
+ .cache/
32
+ .mypy_cache/
33
+ .pytest_cache/
34
+ .ruff_cache/
35
+ .gradio/
36
+ .history/
37
+ temp/
38
+
39
+ # IDE / Editor Files
40
+ .idea/
41
+ .vscode/
42
+ .vscode/settings.json
43
+
44
+ # Framework-specific files
45
+ local_neo4jWorkDir/
46
+ neo4jWorkDir/
47
+
48
+ # Data & Storage
49
+ inputs/
50
+ rag_storage*/
51
+ examples/input/
52
+ examples/output/
53
+ output*/
54
+
55
+ # Miscellaneous
56
+ .DS_Store
57
+ TODO.md
58
+ ignore_this.txt
59
+ *.ignore.*
60
+
61
+ # Project-specific files
62
+ dickens*/
63
+ book.txt
64
+ LightRAG.pdf
65
+ LightRAG_2-4.pdf
66
+ download_models_hf.py
67
+ lightrag-dev/
68
+ gui/
69
+
70
+ # unit-test files
71
+ test_*
72
+
73
+ # Cline files
74
+ memory-bank/
75
+
76
+ # AI
77
+ .claude/
78
+ .cursor/
79
+ CLAUDE.md
rag_anything_smaranika/.pre-commit-config.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ exclude: ^lightrag/api/webui/
7
+ - id: end-of-file-fixer
8
+ exclude: ^lightrag/api/webui/
9
+ - id: requirements-txt-fixer
10
+ exclude: ^lightrag/api/webui/
11
+
12
+
13
+ - repo: https://github.com/astral-sh/ruff-pre-commit
14
+ rev: v0.6.4
15
+ hooks:
16
+ - id: ruff-format
17
+ exclude: ^lightrag/api/webui/
18
+ - id: ruff
19
+ args: [--fix, --ignore=E402]
20
+ exclude: ^lightrag/api/webui/
21
+
22
+
23
+ - repo: https://github.com/mgedmin/check-manifest
24
+ rev: "0.49"
25
+ hooks:
26
+ - id: check-manifest
27
+ stages: [manual]
28
+ exclude: ^lightrag/api/webui/
rag_anything_smaranika/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 ✨Data Intelligence Lab@HKU✨
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
rag_anything_smaranika/MANIFEST.in ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ include requirements.txt
2
+ include README.md
3
+ include README_zh.md
4
+ include LICENSE
5
+ recursive-include raganything *.py
6
+ recursive-include examples *.py
7
+ global-exclude *.pyc
8
+ global-exclude __pycache__
9
+ global-exclude *.egg-info
rag_anything_smaranika/README.md ADDED
@@ -0,0 +1,1260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ <div style="margin: 20px 0;">
4
+ <img src="./assets/logo.png" width="120" height="120" alt="RAG-Anything Logo" style="border-radius: 20px; box-shadow: 0 8px 32px rgba(0, 217, 255, 0.3);">
5
+ </div>
6
+
7
+ # 🚀 RAG-Anything: All-in-One RAG Framework
8
+
9
+ <a href="https://trendshift.io/repositories/14959" target="_blank"><img src="https://trendshift.io/api/badge/repositories/14959" alt="HKUDS%2FRAG-Anything | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
10
+
11
+ <div align="center">
12
+ <img src="https://readme-typing-svg.herokuapp.com?font=Orbitron&size=24&duration=3000&pause=1000&color=00D9FF&center=true&vCenter=true&width=600&lines=Welcome+to+RAG-Anything;Next-Gen+Multimodal+RAG+System;Powered+by+Advanced+AI+Technology" alt="Typing Animation" />
13
+ </div>
14
+
15
+ <div align="center">
16
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;">
17
+ <p>
18
+ <a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥Project-Page-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>
19
+ <a href='https://arxiv.org/abs/2410.05779'><img src='https://img.shields.io/badge/📄arXiv-2410.05779-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
20
+ <a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡Based%20on-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>
21
+ </p>
22
+ <p>
23
+ <a href="https://github.com/HKUDS/RAG-Anything/stargazers"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>
24
+ <img src="https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
25
+ <a href="https://pypi.org/project/raganything/"><img src="https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b"></a>
26
+ <a href="https://github.com/astral-sh/uv"><img src="https://img.shields.io/badge/⚡uv-Ready-ff6b6b?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e"></a>
27
+ </p>
28
+ <p>
29
+ <a href="https://discord.gg/yF2MmDJyGJ"><img src="https://img.shields.io/badge/💬Discord-Community-7289da?style=for-the-badge&logo=discord&logoColor=white&labelColor=1a1a2e"></a>
30
+ <a href="https://github.com/HKUDS/RAG-Anything/issues/7"><img src="https://img.shields.io/badge/💬WeChat-Group-07c160?style=for-the-badge&logo=wechat&logoColor=white&labelColor=1a1a2e"></a>
31
+ </p>
32
+ <p>
33
+ <a href="README_zh.md"><img src="https://img.shields.io/badge/🇨🇳中文版-1a1a2e?style=for-the-badge"></a>
34
+ <a href="README.md"><img src="https://img.shields.io/badge/🇺🇸English-1a1a2e?style=for-the-badge"></a>
35
+ </p>
36
+ </div>
37
+ </div>
38
+
39
+ </div>
40
+
41
+ <div align="center">
42
+ <div style="width: 100%; height: 2px; margin: 20px 0; background: linear-gradient(90deg, transparent, #00d9ff, transparent);"></div>
43
+ </div>
44
+
45
+ <div align="center">
46
+ <a href="#-quick-start" style="text-decoration: none;">
47
+ <img src="https://img.shields.io/badge/Quick%20Start-Get%20Started%20Now-00d9ff?style=for-the-badge&logo=rocket&logoColor=white&labelColor=1a1a2e">
48
+ </a>
49
+ </div>
50
+
51
+ ---
52
+
53
+ ## 🎉 News
54
+ - [X] [2025.08.12]🎯📢 🔍 RAG-Anything now features **VLM-Enhanced Query** mode! When documents include images, the system seamlessly integrates them into VLM for advanced multimodal analysis, combining visual and textual context for deeper insights.
55
+ - [X] [2025.07.05]🎯📢 RAG-Anything now features a [context configuration module](docs/context_aware_processing.md), enabling intelligent integration of relevant contextual information to enhance multimodal content processing.
56
+ - [X] [2025.07.04]🎯📢 🚀 RAG-Anything now supports multimodal query capabilities, enabling enhanced RAG with seamless processing of text, images, tables, and equations.
57
+ - [X] [2025.07.03]🎯📢 🎉 RAG-Anything has reached 1k🌟 stars on GitHub! Thank you for your incredible support and valuable contributions to the project.
58
+
59
+ ---
60
+
61
+ ## 🌟 System Overview
62
+
63
+ *Next-Generation Multimodal Intelligence*
64
+
65
+ <div style="background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border: 2px solid #00d9ff; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);">
66
+
67
+ Modern documents increasingly contain diverse multimodal content—text, images, tables, equations, charts, and multimedia—that traditional text-focused RAG systems cannot effectively process. **RAG-Anything** addresses this challenge as a comprehensive **All-in-One Multimodal Document Processing RAG system** built on [LightRAG](https://github.com/HKUDS/LightRAG).
68
+
69
+ As a unified solution, RAG-Anything **eliminates the need for multiple specialized tools**. It provides **seamless processing and querying across all content modalities** within a single integrated framework. Unlike conventional RAG approaches that struggle with non-textual elements, our all-in-one system delivers **comprehensive multimodal retrieval capabilities**.
70
+
71
+ Users can query documents containing **interleaved text**, **visual diagrams**, **structured tables**, and **mathematical formulations** through **one cohesive interface**. This consolidated approach makes RAG-Anything particularly valuable for academic research, technical documentation, financial reports, and enterprise knowledge management where rich, mixed-content documents demand a **unified processing framework**.
72
+
73
+ <img src="assets/rag_anything_framework.png" alt="RAG-Anything" />
74
+
75
+ </div>
76
+
77
+ ### 🎯 Key Features
78
+
79
+ <div style="background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 15px; padding: 25px; margin: 20px 0;">
80
+
81
+ - **🔄 End-to-End Multimodal Pipeline** - Complete workflow from document ingestion and parsing to intelligent multimodal query answering
82
+ - **📄 Universal Document Support** - Seamless processing of PDFs, Office documents, images, and diverse file formats
83
+ - **🧠 Specialized Content Analysis** - Dedicated processors for images, tables, mathematical equations, and heterogeneous content types
84
+ - **🔗 Multimodal Knowledge Graph** - Automatic entity extraction and cross-modal relationship discovery for enhanced understanding
85
+ - **⚡ Adaptive Processing Modes** - Flexible MinerU-based parsing or direct multimodal content injection workflows
86
+ - **📋 Direct Content List Insertion** - Bypass document parsing by directly inserting pre-parsed content lists from external sources
87
+ - **🎯 Hybrid Intelligent Retrieval** - Advanced search capabilities spanning textual and multimodal content with contextual understanding
88
+
89
+ </div>
90
+
91
+ ---
92
+
93
+ ## 🏗️ Algorithm & Architecture
94
+
95
+ <div style="background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border-left: 5px solid #00d9ff;">
96
+
97
+ ### Core Algorithm
98
+
99
+ **RAG-Anything** implements an effective **multi-stage multimodal pipeline** that fundamentally extends traditional RAG architectures to seamlessly handle diverse content modalities through intelligent orchestration and cross-modal understanding.
100
+
101
+ </div>
102
+
103
+ <div align="center">
104
+ <div style="width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);">
105
+ <div style="display: flex; justify-content: space-around; align-items: center; flex-wrap: wrap; gap: 20px;">
106
+ <div style="text-align: center;">
107
+ <div style="font-size: 24px; margin-bottom: 10px;">📄</div>
108
+ <div style="font-size: 14px; color: #00d9ff;">Document Parsing</div>
109
+ </div>
110
+ <div style="font-size: 20px; color: #00d9ff;">→</div>
111
+ <div style="text-align: center;">
112
+ <div style="font-size: 24px; margin-bottom: 10px;">🧠</div>
113
+ <div style="font-size: 14px; color: #00d9ff;">Content Analysis</div>
114
+ </div>
115
+ <div style="font-size: 20px; color: #00d9ff;">→</div>
116
+ <div style="text-align: center;">
117
+ <div style="font-size: 24px; margin-bottom: 10px;">🔍</div>
118
+ <div style="font-size: 14px; color: #00d9ff;">Knowledge Graph</div>
119
+ </div>
120
+ <div style="font-size: 20px; color: #00d9ff;">→</div>
121
+ <div style="text-align: center;">
122
+ <div style="font-size: 24px; margin-bottom: 10px;">🎯</div>
123
+ <div style="font-size: 14px; color: #00d9ff;">Intelligent Retrieval</div>
124
+ </div>
125
+ </div>
126
+ </div>
127
+ </div>
128
+
129
+ ### 1. Document Parsing Stage
130
+
131
+ <div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
132
+
133
+ The system provides high-fidelity document extraction through adaptive content decomposition. It intelligently segments heterogeneous elements while preserving contextual relationships. Universal format compatibility is achieved via specialized optimized parsers.
134
+
135
+ **Key Components:**
136
+
137
+ - **⚙️ MinerU Integration**: Leverages [MinerU](https://github.com/opendatalab/MinerU) for high-fidelity document structure extraction and semantic preservation across complex layouts.
138
+
139
+ - **🧩 Adaptive Content Decomposition**: Automatically segments documents into coherent text blocks, visual elements, structured tables, mathematical equations, and specialized content types while preserving contextual relationships.
140
+
141
+ - **📁 Universal Format Support**: Provides comprehensive handling of PDFs, Office documents (DOC/DOCX/PPT/PPTX/XLS/XLSX), images, and emerging formats through specialized parsers with format-specific optimization.
142
+
143
+ </div>
144
+
145
+ ### 2. Multi-Modal Content Understanding & Processing
146
+
147
+ <div style="background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;">
148
+
149
+ The system automatically categorizes and routes content through optimized channels. It uses concurrent pipelines for parallel text and multimodal processing. Document hierarchy and relationships are preserved during transformation.
150
+
151
+ **Key Components:**
152
+
153
+ - **🎯 Autonomous Content Categorization and Routing**: Automatically identify, categorize, and route different content types through optimized execution channels.
154
+
155
+ - **⚡ Concurrent Multi-Pipeline Architecture**: Implements concurrent execution of textual and multimodal content through dedicated processing pipelines. This approach maximizes throughput efficiency while preserving content integrity.
156
+
157
+ - **🏗️ Document Hierarchy Extraction**: Extracts and preserves original document hierarchy and inter-element relationships during content transformation.
158
+
159
+ </div>
160
+
161
+ ### 3. Multimodal Analysis Engine
162
+
163
+ <div style="background: linear-gradient(90deg, #0f3460 0%, #1a1a2e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #00d9ff;">
164
+
165
+ The system deploys modality-aware processing units for heterogeneous data modalities:
166
+
167
+ **Specialized Analyzers:**
168
+
169
+ - **🔍 Visual Content Analyzer**:
170
+ - Integrate vision model for image analysis.
171
+ - Generates context-aware descriptive captions based on visual semantics.
172
+ - Extracts spatial relationships and hierarchical structures between visual elements.
173
+
174
+ - **📊 Structured Data Interpreter**:
175
+ - Performs systematic interpretation of tabular and structured data formats.
176
+ - Implements statistical pattern recognition algorithms for data trend analysis.
177
+ - Identifies semantic relationships and dependencies across multiple tabular datasets.
178
+
179
+ - **📐 Mathematical Expression Parser**:
180
+ - Parses complex mathematical expressions and formulas with high accuracy.
181
+ - Provides native LaTeX format support for seamless integration with academic workflows.
182
+ - Establishes conceptual mappings between mathematical equations and domain-specific knowledge bases.
183
+
184
+ - **🔧 Extensible Modality Handler**:
185
+ - Provides configurable processing framework for custom and emerging content types.
186
+ - Enables dynamic integration of new modality processors through plugin architecture.
187
+ - Supports runtime configuration of processing pipelines for specialized use cases.
188
+
189
+ </div>
190
+
191
+ ### 4. Multimodal Knowledge Graph Index
192
+
193
+ <div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
194
+
195
+ The multi-modal knowledge graph construction module transforms document content into structured semantic representations. It extracts multimodal entities, establishes cross-modal relationships, and preserves hierarchical organization. The system applies weighted relevance scoring for optimized knowledge retrieval.
196
+
197
+ **Core Functions:**
198
+
199
+ - **🔍 Multi-Modal Entity Extraction**: Transforms significant multimodal elements into structured knowledge graph entities. The process includes semantic annotations and metadata preservation.
200
+
201
+ - **🔗 Cross-Modal Relationship Mapping**: Establishes semantic connections and dependencies between textual entities and multimodal components. This is achieved through automated relationship inference algorithms.
202
+
203
+ - **🏗️ Hierarchical Structure Preservation**: Maintains original document organization through "belongs_to" relationship chains. These chains preserve logical content hierarchy and sectional dependencies.
204
+
205
+ - **⚖️ Weighted Relationship Scoring**: Assigns quantitative relevance scores to relationship types. Scoring is based on semantic proximity and contextual significance within the document structure.
206
+
207
+ </div>
208
+
209
+ ### 5. Modality-Aware Retrieval
210
+
211
+ <div style="background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;">
212
+
213
+ The hybrid retrieval system combines vector similarity search with graph traversal algorithms for comprehensive content retrieval. It implements modality-aware ranking mechanisms and maintains relational coherence between retrieved elements to ensure contextually integrated information delivery.
214
+
215
+ **Retrieval Mechanisms:**
216
+
217
+ - **🔀 Vector-Graph Fusion**: Integrates vector similarity search with graph traversal algorithms. This approach leverages both semantic embeddings and structural relationships for comprehensive content retrieval.
218
+
219
+ - **📊 Modality-Aware Ranking**: Implements adaptive scoring mechanisms that weight retrieval results based on content type relevance. The system adjusts rankings according to query-specific modality preferences.
220
+
221
+ - **🔗 Relational Coherence Maintenance**: Maintains semantic and structural relationships between retrieved elements. This ensures coherent information delivery and contextual integrity.
222
+
223
+ </div>
224
+
225
+ ---
226
+
227
+ ## 🚀 Quick Start
228
+
229
+ *Initialize Your AI Journey*
230
+
231
+ <div align="center">
232
+ <img src="https://user-images.githubusercontent.com/74038190/212284158-e840e285-664b-44d7-b79b-e264b5e54825.gif" width="400">
233
+ </div>
234
+
235
+ ### Installation
236
+
237
+ #### Option 1: Install from PyPI (Recommended)
238
+
239
+ ```bash
240
+ # Basic installation
241
+ pip install raganything
242
+
243
+ # With optional dependencies for extended format support:
244
+ pip install 'raganything[all]' # All optional features
245
+ pip install 'raganything[image]' # Image format conversion (BMP, TIFF, GIF, WebP)
246
+ pip install 'raganything[text]' # Text file processing (TXT, MD)
247
+ pip install 'raganything[image,text]' # Multiple features
248
+ ```
249
+
250
+ #### Option 2: Install from Source
251
+ ```bash
252
+ # Install uv (if not already installed)
253
+ curl -LsSf https://astral.sh/uv/install.sh | sh
254
+
255
+ # Clone and setup the project with uv
256
+ git clone https://github.com/HKUDS/RAG-Anything.git
257
+ cd RAG-Anything
258
+
259
+ # Install the package and dependencies in a virtual environment
260
+ uv sync
261
+
262
+ # If you encounter network timeouts (especially for opencv packages):
263
+ # UV_HTTP_TIMEOUT=120 uv sync
264
+
265
+ # Run commands directly with uv (recommended approach)
266
+ uv run python examples/raganything_example.py --help
267
+
268
+ # Install with optional dependencies
269
+ uv sync --extra image --extra text # Specific extras
270
+ uv sync --all-extras # All optional features
271
+ ```
272
+
273
+ #### Optional Dependencies
274
+
275
+ - **`[image]`** - Enables processing of BMP, TIFF, GIF, WebP image formats (requires Pillow)
276
+ - **`[text]`** - Enables processing of TXT and MD files (requires ReportLab)
277
+ - **`[all]`** - Includes all Python optional dependencies
278
+
279
+ > **⚠️ Office Document Processing Requirements:**
280
+ > - Office documents (.doc, .docx, .ppt, .pptx, .xls, .xlsx) require **LibreOffice** installation
281
+ > - Download from [LibreOffice official website](https://www.libreoffice.org/download/download/)
282
+ > - **Windows**: Download installer from official website
283
+ > - **macOS**: `brew install --cask libreoffice`
284
+ > - **Ubuntu/Debian**: `sudo apt-get install libreoffice`
285
+ > - **CentOS/RHEL**: `sudo yum install libreoffice`
286
+
287
+ **Check MinerU installation:**
288
+
289
+ ```bash
290
+ # Verify installation
291
+ mineru --version
292
+
293
+ # Check if properly configured
294
+ python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU installed properly' if rag.check_parser_installation() else '❌ MinerU installation issue')"
295
+ ```
296
+
297
+ Models are downloaded automatically on first use. For manual download, refer to [MinerU Model Source Configuration](https://github.com/opendatalab/MinerU/blob/master/README.md#22-model-source-configuration).
298
+
299
+ ### Usage Examples
300
+
301
+ #### 1. End-to-End Document Processing
302
+
303
+ ```python
304
+ import asyncio
305
+ from raganything import RAGAnything, RAGAnythingConfig
306
+ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
307
+ from lightrag.utils import EmbeddingFunc
308
+
309
+ async def main():
310
+ # Set up API configuration
311
+ api_key = "your-api-key"
312
+ base_url = "your-base-url" # Optional
313
+
314
+ # Create RAGAnything configuration
315
+ config = RAGAnythingConfig(
316
+ working_dir="./rag_storage",
317
+ parser="mineru", # Parser selection: mineru or docling
318
+ parse_method="auto", # Parse method: auto, ocr, or txt
319
+ enable_image_processing=True,
320
+ enable_table_processing=True,
321
+ enable_equation_processing=True,
322
+ )
323
+
324
+ # Define LLM model function
325
+ def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
326
+ return openai_complete_if_cache(
327
+ "gpt-4o-mini",
328
+ prompt,
329
+ system_prompt=system_prompt,
330
+ history_messages=history_messages,
331
+ api_key=api_key,
332
+ base_url=base_url,
333
+ **kwargs,
334
+ )
335
+
336
+ # Define vision model function for image processing
337
+ def vision_model_func(
338
+ prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
339
+ ):
340
+ # If messages format is provided (for multimodal VLM enhanced query), use it directly
341
+ if messages:
342
+ return openai_complete_if_cache(
343
+ "gpt-4o",
344
+ "",
345
+ system_prompt=None,
346
+ history_messages=[],
347
+ messages=messages,
348
+ api_key=api_key,
349
+ base_url=base_url,
350
+ **kwargs,
351
+ )
352
+ # Traditional single image format
353
+ elif image_data:
354
+ return openai_complete_if_cache(
355
+ "gpt-4o",
356
+ "",
357
+ system_prompt=None,
358
+ history_messages=[],
359
+ messages=[
360
+ {"role": "system", "content": system_prompt}
361
+ if system_prompt
362
+ else None,
363
+ {
364
+ "role": "user",
365
+ "content": [
366
+ {"type": "text", "text": prompt},
367
+ {
368
+ "type": "image_url",
369
+ "image_url": {
370
+ "url": f"data:image/jpeg;base64,{image_data}"
371
+ },
372
+ },
373
+ ],
374
+ }
375
+ if image_data
376
+ else {"role": "user", "content": prompt},
377
+ ],
378
+ api_key=api_key,
379
+ base_url=base_url,
380
+ **kwargs,
381
+ )
382
+ # Pure text format
383
+ else:
384
+ return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
385
+
386
+ # Define embedding function
387
+ embedding_func = EmbeddingFunc(
388
+ embedding_dim=3072,
389
+ max_token_size=8192,
390
+ func=lambda texts: openai_embed(
391
+ texts,
392
+ model="text-embedding-3-large",
393
+ api_key=api_key,
394
+ base_url=base_url,
395
+ ),
396
+ )
397
+
398
+ # Initialize RAGAnything
399
+ rag = RAGAnything(
400
+ config=config,
401
+ llm_model_func=llm_model_func,
402
+ vision_model_func=vision_model_func,
403
+ embedding_func=embedding_func,
404
+ )
405
+
406
+ # Process a document
407
+ await rag.process_document_complete(
408
+ file_path="path/to/your/document.pdf",
409
+ output_dir="./output",
410
+ parse_method="auto"
411
+ )
412
+
413
+ # Query the processed content
414
+ # Pure text query - for basic knowledge base search
415
+ text_result = await rag.aquery(
416
+ "What are the main findings shown in the figures and tables?",
417
+ mode="hybrid"
418
+ )
419
+ print("Text query result:", text_result)
420
+
421
+ # Multimodal query with specific multimodal content
422
+ multimodal_result = await rag.aquery_with_multimodal(
423
+ "Explain this formula and its relevance to the document content",
424
+ multimodal_content=[{
425
+ "type": "equation",
426
+ "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
427
+ "equation_caption": "Document relevance probability"
428
+ }],
429
+ mode="hybrid"
430
+ )
431
+ print("Multimodal query result:", multimodal_result)
432
+
433
+ if __name__ == "__main__":
434
+ asyncio.run(main())
435
+ ```
436
+
437
+ #### 2. Direct Multimodal Content Processing
438
+
439
+ ```python
440
+ import asyncio
441
+ from lightrag import LightRAG
442
+ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
443
+ from lightrag.utils import EmbeddingFunc
444
+ from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor
445
+
446
+ async def process_multimodal_content():
447
+ # Set up API configuration
448
+ api_key = "your-api-key"
449
+ base_url = "your-base-url" # Optional
450
+
451
+ # Initialize LightRAG
452
+ rag = LightRAG(
453
+ working_dir="./rag_storage",
454
+ llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
455
+ "gpt-4o-mini",
456
+ prompt,
457
+ system_prompt=system_prompt,
458
+ history_messages=history_messages,
459
+ api_key=api_key,
460
+ base_url=base_url,
461
+ **kwargs,
462
+ ),
463
+ embedding_func=EmbeddingFunc(
464
+ embedding_dim=3072,
465
+ max_token_size=8192,
466
+ func=lambda texts: openai_embed(
467
+ texts,
468
+ model="text-embedding-3-large",
469
+ api_key=api_key,
470
+ base_url=base_url,
471
+ ),
472
+ )
473
+ )
474
+ await rag.initialize_storages()
475
+
476
+ # Process an image
477
+ image_processor = ImageModalProcessor(
478
+ lightrag=rag,
479
+ modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
480
+ "gpt-4o",
481
+ "",
482
+ system_prompt=None,
483
+ history_messages=[],
484
+ messages=[
485
+ {"role": "system", "content": system_prompt} if system_prompt else None,
486
+ {"role": "user", "content": [
487
+ {"type": "text", "text": prompt},
488
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
489
+ ]} if image_data else {"role": "user", "content": prompt}
490
+ ],
491
+ api_key=api_key,
492
+ base_url=base_url,
493
+ **kwargs,
494
+ ) if image_data else openai_complete_if_cache(
495
+ "gpt-4o-mini",
496
+ prompt,
497
+ system_prompt=system_prompt,
498
+ history_messages=history_messages,
499
+ api_key=api_key,
500
+ base_url=base_url,
501
+ **kwargs,
502
+ )
503
+ )
504
+
505
+ image_content = {
506
+ "img_path": "path/to/image.jpg",
507
+ "image_caption": ["Figure 1: Experimental results"],
508
+ "image_footnote": ["Data collected in 2024"]
509
+ }
510
+
511
+ description, entity_info = await image_processor.process_multimodal_content(
512
+ modal_content=image_content,
513
+ content_type="image",
514
+ file_path="research_paper.pdf",
515
+ entity_name="Experimental Results Figure"
516
+ )
517
+
518
+ # Process a table
519
+ table_processor = TableModalProcessor(
520
+ lightrag=rag,
521
+ modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
522
+ "gpt-4o-mini",
523
+ prompt,
524
+ system_prompt=system_prompt,
525
+ history_messages=history_messages,
526
+ api_key=api_key,
527
+ base_url=base_url,
528
+ **kwargs,
529
+ )
530
+ )
531
+
532
+ table_content = {
533
+ "table_body": """
534
+ | Method | Accuracy | F1-Score |
535
+ |--------|----------|----------|
536
+ | RAGAnything | 95.2% | 0.94 |
537
+ | Baseline | 87.3% | 0.85 |
538
+ """,
539
+ "table_caption": ["Performance Comparison"],
540
+ "table_footnote": ["Results on test dataset"]
541
+ }
542
+
543
+ description, entity_info = await table_processor.process_multimodal_content(
544
+ modal_content=table_content,
545
+ content_type="table",
546
+ file_path="research_paper.pdf",
547
+ entity_name="Performance Results Table"
548
+ )
549
+
550
+ if __name__ == "__main__":
551
+ asyncio.run(process_multimodal_content())
552
+ ```
553
+
554
+ #### 3. Batch Processing
555
+
556
+ ```python
557
+ # Process multiple documents
558
+ await rag.process_folder_complete(
559
+ folder_path="./documents",
560
+ output_dir="./output",
561
+ file_extensions=[".pdf", ".docx", ".pptx"],
562
+ recursive=True,
563
+ max_workers=4
564
+ )
565
+ ```
566
+
567
+ #### 4. Custom Modal Processors
568
+
569
+ ```python
570
+ from raganything.modalprocessors import GenericModalProcessor
571
+
572
+ class CustomModalProcessor(GenericModalProcessor):
573
+ async def process_multimodal_content(self, modal_content, content_type, file_path, entity_name):
574
+ # Your custom processing logic
575
+ enhanced_description = await self.analyze_custom_content(modal_content)
576
+ entity_info = self.create_custom_entity(enhanced_description, entity_name)
577
+ return await self._create_entity_and_chunk(enhanced_description, entity_info, file_path)
578
+ ```
579
+
580
+ #### 5. Query Options
581
+
582
+ RAG-Anything provides three types of query methods:
583
+
584
+ **Pure Text Queries** - Direct knowledge base search using LightRAG:
585
+ ```python
586
+ # Different query modes for text queries
587
+ text_result_hybrid = await rag.aquery("Your question", mode="hybrid")
588
+ text_result_local = await rag.aquery("Your question", mode="local")
589
+ text_result_global = await rag.aquery("Your question", mode="global")
590
+ text_result_naive = await rag.aquery("Your question", mode="naive")
591
+
592
+ # Synchronous version
593
+ sync_text_result = rag.query("Your question", mode="hybrid")
594
+ ```
595
+
596
+ **VLM Enhanced Queries** - Automatically analyze images in retrieved context using VLM:
597
+ ```python
598
+ # VLM enhanced query (automatically enabled when vision_model_func is provided)
599
+ vlm_result = await rag.aquery(
600
+ "Analyze the charts and figures in the document",
601
+ mode="hybrid"
602
+ # vlm_enhanced=True is automatically set when vision_model_func is available
603
+ )
604
+
605
+ # Manually control VLM enhancement
606
+ vlm_enabled = await rag.aquery(
607
+ "What do the images show in this document?",
608
+ mode="hybrid",
609
+ vlm_enhanced=True # Force enable VLM enhancement
610
+ )
611
+
612
+ vlm_disabled = await rag.aquery(
613
+ "What do the images show in this document?",
614
+ mode="hybrid",
615
+ vlm_enhanced=False # Force disable VLM enhancement
616
+ )
617
+
618
+ # When documents contain images, VLM can see and analyze them directly
619
+ # The system will automatically:
620
+ # 1. Retrieve relevant context containing image paths
621
+ # 2. Load and encode images as base64
622
+ # 3. Send both text context and images to VLM for comprehensive analysis
623
+ ```
624
+
625
+ **Multimodal Queries** - Enhanced queries with specific multimodal content analysis:
626
+ ```python
627
+ # Query with table data
628
+ table_result = await rag.aquery_with_multimodal(
629
+ "Compare these performance metrics with the document content",
630
+ multimodal_content=[{
631
+ "type": "table",
632
+ "table_data": """Method,Accuracy,Speed
633
+ RAGAnything,95.2%,120ms
634
+ Traditional,87.3%,180ms""",
635
+ "table_caption": "Performance comparison"
636
+ }],
637
+ mode="hybrid"
638
+ )
639
+
640
+ # Query with equation content
641
+ equation_result = await rag.aquery_with_multimodal(
642
+ "Explain this formula and its relevance to the document content",
643
+ multimodal_content=[{
644
+ "type": "equation",
645
+ "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
646
+ "equation_caption": "Document relevance probability"
647
+ }],
648
+ mode="hybrid"
649
+ )
650
+ ```
651
+
652
+ #### 6. Loading Existing LightRAG Instance
653
+
654
+ ```python
655
+ import asyncio
656
+ from raganything import RAGAnything, RAGAnythingConfig
657
+ from lightrag import LightRAG
658
+ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
659
+ from lightrag.kg.shared_storage import initialize_pipeline_status
660
+ from lightrag.utils import EmbeddingFunc
661
+ import os
662
+
663
+ async def load_existing_lightrag():
664
+ # Set up API configuration
665
+ api_key = "your-api-key"
666
+ base_url = "your-base-url" # Optional
667
+
668
+ # First, create or load existing LightRAG instance
669
+ lightrag_working_dir = "./existing_lightrag_storage"
670
+
671
+ # Check if previous LightRAG instance exists
672
+ if os.path.exists(lightrag_working_dir) and os.listdir(lightrag_working_dir):
673
+ print("✅ Found existing LightRAG instance, loading...")
674
+ else:
675
+ print("❌ No existing LightRAG instance found, will create new one")
676
+
677
+ # Create/load LightRAG instance with your configuration
678
+ lightrag_instance = LightRAG(
679
+ working_dir=lightrag_working_dir,
680
+ llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
681
+ "gpt-4o-mini",
682
+ prompt,
683
+ system_prompt=system_prompt,
684
+ history_messages=history_messages,
685
+ api_key=api_key,
686
+ base_url=base_url,
687
+ **kwargs,
688
+ ),
689
+ embedding_func=EmbeddingFunc(
690
+ embedding_dim=3072,
691
+ max_token_size=8192,
692
+ func=lambda texts: openai_embed(
693
+ texts,
694
+ model="text-embedding-3-large",
695
+ api_key=api_key,
696
+ base_url=base_url,
697
+ ),
698
+ )
699
+ )
700
+
701
+ # Initialize storage (this will load existing data if available)
702
+ await lightrag_instance.initialize_storages()
703
+ await initialize_pipeline_status()
704
+
705
+ # Define vision model function for image processing
706
+ def vision_model_func(
707
+ prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
708
+ ):
709
+ # If messages format is provided (for multimodal VLM enhanced query), use it directly
710
+ if messages:
711
+ return openai_complete_if_cache(
712
+ "gpt-4o",
713
+ "",
714
+ system_prompt=None,
715
+ history_messages=[],
716
+ messages=messages,
717
+ api_key=api_key,
718
+ base_url=base_url,
719
+ **kwargs,
720
+ )
721
+ # Traditional single image format
722
+ elif image_data:
723
+ return openai_complete_if_cache(
724
+ "gpt-4o",
725
+ "",
726
+ system_prompt=None,
727
+ history_messages=[],
728
+ messages=[
729
+ {"role": "system", "content": system_prompt}
730
+ if system_prompt
731
+ else None,
732
+ {
733
+ "role": "user",
734
+ "content": [
735
+ {"type": "text", "text": prompt},
736
+ {
737
+ "type": "image_url",
738
+ "image_url": {
739
+ "url": f"data:image/jpeg;base64,{image_data}"
740
+ },
741
+ },
742
+ ],
743
+ }
744
+ if image_data
745
+ else {"role": "user", "content": prompt},
746
+ ],
747
+ api_key=api_key,
748
+ base_url=base_url,
749
+ **kwargs,
750
+ )
751
+ # Pure text format
752
+ else:
753
+ return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
754
+
755
+ # Now use existing LightRAG instance to initialize RAGAnything
756
+ rag = RAGAnything(
757
+ lightrag=lightrag_instance, # Pass existing LightRAG instance
758
+ vision_model_func=vision_model_func,
759
+ # Note: working_dir, llm_model_func, embedding_func, etc. are inherited from lightrag_instance
760
+ )
761
+
762
+ # Query existing knowledge base
763
+ result = await rag.aquery(
764
+ "What data has been processed in this LightRAG instance?",
765
+ mode="hybrid"
766
+ )
767
+ print("Query result:", result)
768
+
769
+ # Add new multimodal document to existing LightRAG instance
770
+ await rag.process_document_complete(
771
+ file_path="path/to/new/multimodal_document.pdf",
772
+ output_dir="./output"
773
+ )
774
+
775
+ if __name__ == "__main__":
776
+ asyncio.run(load_existing_lightrag())
777
+ ```
778
+
779
+ #### 7. Direct Content List Insertion
780
+
781
+ For scenarios where you already have a pre-parsed content list (e.g., from external parsers or previous processing), you can directly insert it into RAGAnything without document parsing:
782
+
783
+ ```python
784
+ import asyncio
785
+ from raganything import RAGAnything, RAGAnythingConfig
786
+ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
787
+ from lightrag.utils import EmbeddingFunc
788
+
789
+ async def insert_content_list_example():
790
+ # Set up API configuration
791
+ api_key = "your-api-key"
792
+ base_url = "your-base-url" # Optional
793
+
794
+ # Create RAGAnything configuration
795
+ config = RAGAnythingConfig(
796
+ working_dir="./rag_storage",
797
+ enable_image_processing=True,
798
+ enable_table_processing=True,
799
+ enable_equation_processing=True,
800
+ )
801
+
802
+ # Define model functions
803
+ def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
804
+ return openai_complete_if_cache(
805
+ "gpt-4o-mini",
806
+ prompt,
807
+ system_prompt=system_prompt,
808
+ history_messages=history_messages,
809
+ api_key=api_key,
810
+ base_url=base_url,
811
+ **kwargs,
812
+ )
813
+
814
+ def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
815
+ # If messages format is provided (for multimodal VLM enhanced query), use it directly
816
+ if messages:
817
+ return openai_complete_if_cache(
818
+ "gpt-4o",
819
+ "",
820
+ system_prompt=None,
821
+ history_messages=[],
822
+ messages=messages,
823
+ api_key=api_key,
824
+ base_url=base_url,
825
+ **kwargs,
826
+ )
827
+ # Traditional single image format
828
+ elif image_data:
829
+ return openai_complete_if_cache(
830
+ "gpt-4o",
831
+ "",
832
+ system_prompt=None,
833
+ history_messages=[],
834
+ messages=[
835
+ {"role": "system", "content": system_prompt} if system_prompt else None,
836
+ {
837
+ "role": "user",
838
+ "content": [
839
+ {"type": "text", "text": prompt},
840
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
841
+ ],
842
+ } if image_data else {"role": "user", "content": prompt},
843
+ ],
844
+ api_key=api_key,
845
+ base_url=base_url,
846
+ **kwargs,
847
+ )
848
+ # Pure text format
849
+ else:
850
+ return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
851
+
852
+ embedding_func = EmbeddingFunc(
853
+ embedding_dim=3072,
854
+ max_token_size=8192,
855
+ func=lambda texts: openai_embed(
856
+ texts,
857
+ model="text-embedding-3-large",
858
+ api_key=api_key,
859
+ base_url=base_url,
860
+ ),
861
+ )
862
+
863
+ # Initialize RAGAnything
864
+ rag = RAGAnything(
865
+ config=config,
866
+ llm_model_func=llm_model_func,
867
+ vision_model_func=vision_model_func,
868
+ embedding_func=embedding_func,
869
+ )
870
+
871
+ # Example: Pre-parsed content list from external source
872
+ content_list = [
873
+ {
874
+ "type": "text",
875
+ "text": "This is the introduction section of our research paper.",
876
+ "page_idx": 0 # Page number where this content appears
877
+ },
878
+ {
879
+ "type": "image",
880
+ "img_path": "/absolute/path/to/figure1.jpg", # IMPORTANT: Use absolute path
881
+ "image_caption": ["Figure 1: System Architecture"],
882
+ "image_footnote": ["Source: Authors' original design"],
883
+ "page_idx": 1 # Page number where this image appears
884
+ },
885
+ {
886
+ "type": "table",
887
+ "table_body": "| Method | Accuracy | F1-Score |\n|--------|----------|----------|\n| Ours | 95.2% | 0.94 |\n| Baseline | 87.3% | 0.85 |",
888
+ "table_caption": ["Table 1: Performance Comparison"],
889
+ "table_footnote": ["Results on test dataset"],
890
+ "page_idx": 2 # Page number where this table appears
891
+ },
892
+ {
893
+ "type": "equation",
894
+ "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
895
+ "text": "Document relevance probability formula",
896
+ "page_idx": 3 # Page number where this equation appears
897
+ },
898
+ {
899
+ "type": "text",
900
+ "text": "In conclusion, our method demonstrates superior performance across all metrics.",
901
+ "page_idx": 4 # Page number where this content appears
902
+ }
903
+ ]
904
+
905
+ # Insert the content list directly
906
+ await rag.insert_content_list(
907
+ content_list=content_list,
908
+ file_path="research_paper.pdf", # Reference file name for citation
909
+ split_by_character=None, # Optional text splitting
910
+ split_by_character_only=False, # Optional text splitting mode
911
+ doc_id=None, # Optional custom document ID (will be auto-generated if not provided)
912
+ display_stats=True # Show content statistics
913
+ )
914
+
915
+ # Query the inserted content
916
+ result = await rag.aquery(
917
+ "What are the key findings and performance metrics mentioned in the research?",
918
+ mode="hybrid"
919
+ )
920
+ print("Query result:", result)
921
+
922
+ # You can also insert multiple content lists with different document IDs
923
+ another_content_list = [
924
+ {
925
+ "type": "text",
926
+ "text": "This is content from another document.",
927
+ "page_idx": 0 # Page number where this content appears
928
+ },
929
+ {
930
+ "type": "table",
931
+ "table_body": "| Feature | Value |\n|---------|-------|\n| Speed | Fast |\n| Accuracy | High |",
932
+ "table_caption": ["Feature Comparison"],
933
+ "page_idx": 1 # Page number where this table appears
934
+ }
935
+ ]
936
+
937
+ await rag.insert_content_list(
938
+ content_list=another_content_list,
939
+ file_path="another_document.pdf",
940
+ doc_id="custom-doc-id-123" # Custom document ID
941
+ )
942
+
943
+ if __name__ == "__main__":
944
+ asyncio.run(insert_content_list_example())
945
+ ```
946
+
947
+ **Content List Format:**
948
+
949
+ The `content_list` should follow the standard format with each item being a dictionary containing:
950
+
951
+ - **Text content**: `{"type": "text", "text": "content text", "page_idx": 0}`
952
+ - **Image content**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "image_caption": ["caption"], "image_footnote": ["note"], "page_idx": 1}`
953
+ - **Table content**: `{"type": "table", "table_body": "markdown table", "table_caption": ["caption"], "table_footnote": ["note"], "page_idx": 2}`
954
+ - **Equation content**: `{"type": "equation", "latex": "LaTeX formula", "text": "description", "page_idx": 3}`
955
+ - **Generic content**: `{"type": "custom_type", "content": "any content", "page_idx": 4}`
956
+
957
+ **Important Notes:**
958
+ - **`img_path`**: Must be an absolute path to the image file (e.g., `/home/user/images/chart.jpg` or `C:\Users\user\images\chart.jpg`)
959
+ - **`page_idx`**: Represents the page number where the content appears in the original document (0-based indexing)
960
+ - **Content ordering**: Items are processed in the order they appear in the list
961
+
962
+ This method is particularly useful when:
963
+ - You have content from external parsers (non-MinerU/Docling)
964
+ - You want to process programmatically generated content
965
+ - You need to insert content from multiple sources into a single knowledge base
966
+ - You have cached parsing results that you want to reuse
967
+
968
+ ---
969
+
970
+ ## 🛠️ Examples
971
+
972
+ *Practical Implementation Demos*
973
+
974
+ <div align="center">
975
+ <img src="https://user-images.githubusercontent.com/74038190/212257455-13e3e01e-d6a6-45dc-bb92-3ab87b12dfc1.gif" width="300">
976
+ </div>
977
+
978
+ The `examples/` directory contains comprehensive usage examples:
979
+
980
+ - **`raganything_example.py`**: End-to-end document processing with MinerU
981
+ - **`modalprocessors_example.py`**: Direct multimodal content processing
982
+ - **`office_document_test.py`**: Office document parsing test with MinerU (no API key required)
983
+ - **`image_format_test.py`**: Image format parsing test with MinerU (no API key required)
984
+ - **`text_format_test.py`**: Text format parsing test with MinerU (no API key required)
985
+
986
+ **Run examples:**
987
+
988
+ ```bash
989
+ # End-to-end processing with parser selection
990
+ python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
991
+
992
+ # Direct modal processing
993
+ python examples/modalprocessors_example.py --api-key YOUR_API_KEY
994
+
995
+ # Office document parsing test (MinerU only)
996
+ python examples/office_document_test.py --file path/to/document.docx
997
+
998
+ # Image format parsing test (MinerU only)
999
+ python examples/image_format_test.py --file path/to/image.bmp
1000
+
1001
+ # Text format parsing test (MinerU only)
1002
+ python examples/text_format_test.py --file path/to/document.md
1003
+
1004
+ # Check LibreOffice installation
1005
+ python examples/office_document_test.py --check-libreoffice --file dummy
1006
+
1007
+ # Check PIL/Pillow installation
1008
+ python examples/image_format_test.py --check-pillow --file dummy
1009
+
1010
+ # Check ReportLab installation
1011
+ python examples/text_format_test.py --check-reportlab --file dummy
1012
+ ```
1013
+
1014
+ ---
1015
+
1016
+ ## 🔧 Configuration
1017
+
1018
+ *System Optimization Parameters*
1019
+
1020
+ ### Environment Variables
1021
+
1022
+ Create a `.env` file (refer to `.env.example`):
1023
+
1024
+ ```bash
1025
+ OPENAI_API_KEY=your_openai_api_key
1026
+ OPENAI_BASE_URL=your_base_url # Optional
1027
+ OUTPUT_DIR=./output # Default output directory for parsed documents
1028
+ PARSER=mineru # Parser selection: mineru or docling
1029
+ PARSE_METHOD=auto # Parse method: auto, ocr, or txt
1030
+ ```
1031
+
1032
+ **Note:** For backward compatibility, legacy environment variable names are still supported:
1033
+ - `MINERU_PARSE_METHOD` is deprecated, please use `PARSE_METHOD`
1034
+
1035
+ > **Note**: API keys are only required for full RAG processing with LLM integration. The parsing test files (`office_document_test.py` and `image_format_test.py`) only test parser functionality and do not require API keys.
1036
+
1037
+ ### Parser Configuration
1038
+
1039
+ RAGAnything now supports multiple parsers, each with specific advantages:
1040
+
1041
+ #### MinerU Parser
1042
+ - Supports PDF, images, Office documents, and more formats
1043
+ - Powerful OCR and table extraction capabilities
1044
+ - GPU acceleration support
1045
+
1046
+ #### Docling Parser
1047
+ - Optimized for Office documents and HTML files
1048
+ - Better document structure preservation
1049
+ - Native support for multiple Office formats
1050
+
1051
+ ### MinerU Configuration
1052
+
1053
+ ```bash
1054
+ # MinerU 2.0 uses command-line parameters instead of config files
1055
+ # Check available options:
1056
+ mineru --help
1057
+
1058
+ # Common configurations:
1059
+ mineru -p input.pdf -o output_dir -m auto # Automatic parsing mode
1060
+ mineru -p input.pdf -o output_dir -m ocr # OCR-focused parsing
1061
+ mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU acceleration
1062
+ ```
1063
+
1064
+ You can also configure parsing through RAGAnything parameters:
1065
+
1066
+ ```python
1067
+ # Basic parsing configuration with parser selection
1068
+ await rag.process_document_complete(
1069
+ file_path="document.pdf",
1070
+ output_dir="./output/",
1071
+ parse_method="auto", # or "ocr", "txt"
1072
+ parser="mineru" # Optional: "mineru" or "docling"
1073
+ )
1074
+
1075
+ # Advanced parsing configuration with special parameters
1076
+ await rag.process_document_complete(
1077
+ file_path="document.pdf",
1078
+ output_dir="./output/",
1079
+ parse_method="auto", # Parsing method: "auto", "ocr", "txt"
1080
+ parser="mineru", # Parser selection: "mineru" or "docling"
1081
+
1082
+ # MinerU special parameters - all supported kwargs:
1083
+ lang="ch", # Document language for OCR optimization (e.g., "ch", "en", "ja")
1084
+ device="cuda:0", # Inference device: "cpu", "cuda", "cuda:0", "npu", "mps"
1085
+ start_page=0, # Starting page number (0-based, for PDF)
1086
+ end_page=10, # Ending page number (0-based, for PDF)
1087
+ formula=True, # Enable formula parsing
1088
+ table=True, # Enable table parsing
1089
+ backend="pipeline", # Parsing backend: pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client.
1090
+ source="huggingface", # Model source: "huggingface", "modelscope", "local"
1091
+ # vlm_url="http://127.0.0.1:3000" # Service address when using backend=vlm-sglang-client
1092
+
1093
+ # Standard RAGAnything parameters
1094
+ display_stats=True, # Display content statistics
1095
+ split_by_character=None, # Optional character to split text by
1096
+ doc_id=None # Optional document ID
1097
+ )
1098
+ ```
1099
+
1100
+ > **Note**: MinerU 2.0 no longer uses the `magic-pdf.json` configuration file. All settings are now passed as command-line parameters or function arguments. RAG-Anything now supports multiple document parsers - you can choose between MinerU and Docling based on your needs.
1101
+
1102
+ ### Processing Requirements
1103
+
1104
+ Different content types require specific optional dependencies:
1105
+
1106
+ - **Office Documents** (.doc, .docx, .ppt, .pptx, .xls, .xlsx): Install [LibreOffice](https://www.libreoffice.org/download/download/)
1107
+ - **Extended Image Formats** (.bmp, .tiff, .gif, .webp): Install with `pip install raganything[image]`
1108
+ - **Text Files** (.txt, .md): Install with `pip install raganything[text]`
1109
+
1110
+ > **📋 Quick Install**: Use `pip install raganything[all]` to enable all format support (Python dependencies only - LibreOffice still needs separate installation)
1111
+
1112
+ ---
1113
+
1114
+ ## 🧪 Supported Content Types
1115
+
1116
+ ### Document Formats
1117
+
1118
+ - **PDFs** - Research papers, reports, presentations
1119
+ - **Office Documents** - DOC, DOCX, PPT, PPTX, XLS, XLSX
1120
+ - **Images** - JPG, PNG, BMP, TIFF, GIF, WebP
1121
+ - **Text Files** - TXT, MD
1122
+
1123
+ ### Multimodal Elements
1124
+
1125
+ - **Images** - Photographs, diagrams, charts, screenshots
1126
+ - **Tables** - Data tables, comparison charts, statistical summaries
1127
+ - **Equations** - Mathematical formulas in LaTeX format
1128
+ - **Generic Content** - Custom content types via extensible processors
1129
+
1130
+ *For installation of format-specific dependencies, see the [Configuration](#-configuration) section.*
1131
+
1132
+ ---
1133
+
1134
+ ## 📖 Citation
1135
+
1136
+ *Academic Reference*
1137
+
1138
+ <div align="center">
1139
+ <div style="width: 60px; height: 60px; margin: 20px auto; position: relative;">
1140
+ <div style="width: 100%; height: 100%; border: 2px solid #00d9ff; border-radius: 50%; position: relative;">
1141
+ <div style="position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); font-size: 24px; color: #00d9ff;">📖</div>
1142
+ </div>
1143
+ <div style="position: absolute; bottom: -5px; left: 50%; transform: translateX(-50%); width: 20px; height: 20px; background: white; border-right: 2px solid #00d9ff; border-bottom: 2px solid #00d9ff; transform: rotate(45deg);"></div>
1144
+ </div>
1145
+ </div>
1146
+
1147
+ If you find RAG-Anything useful in your research, please cite our paper:
1148
+
1149
+ ```bibtex
1150
+ @article{guo2024lightrag,
1151
+ title={LightRAG: Simple and Fast Retrieval-Augmented Generation},
1152
+ author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang},
1153
+ year={2024},
1154
+ eprint={2410.05779},
1155
+ archivePrefix={arXiv},
1156
+ primaryClass={cs.IR}
1157
+ }
1158
+ ```
1159
+
1160
+ ---
1161
+
1162
+ ## 🔗 Related Projects
1163
+
1164
+ *Ecosystem & Extensions*
1165
+
1166
+ <div align="center">
1167
+ <table>
1168
+ <tr>
1169
+ <td align="center">
1170
+ <a href="https://github.com/HKUDS/LightRAG">
1171
+ <div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
1172
+ <span style="font-size: 32px;">⚡</span>
1173
+ </div>
1174
+ <b>LightRAG</b><br>
1175
+ <sub>Simple and Fast RAG</sub>
1176
+ </a>
1177
+ </td>
1178
+ <td align="center">
1179
+ <a href="https://github.com/HKUDS/VideoRAG">
1180
+ <div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
1181
+ <span style="font-size: 32px;">🎥</span>
1182
+ </div>
1183
+ <b>VideoRAG</b><br>
1184
+ <sub>Extreme Long-Context Video RAG</sub>
1185
+ </a>
1186
+ </td>
1187
+ <td align="center">
1188
+ <a href="https://github.com/HKUDS/MiniRAG">
1189
+ <div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
1190
+ <span style="font-size: 32px;">✨</span>
1191
+ </div>
1192
+ <b>MiniRAG</b><br>
1193
+ <sub>Extremely Simple RAG</sub>
1194
+ </a>
1195
+ </td>
1196
+ </tr>
1197
+ </table>
1198
+ </div>
1199
+
1200
+ ---
1201
+
1202
+ ## ⭐ Star History
1203
+
1204
+ *Community Growth Trajectory*
1205
+
1206
+ <div align="center">
1207
+ <a href="https://star-history.com/#HKUDS/RAG-Anything&Date">
1208
+ <picture>
1209
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date&theme=dark" />
1210
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date" />
1211
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date" style="border-radius: 15px; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);" />
1212
+ </picture>
1213
+ </a>
1214
+ </div>
1215
+
1216
+ ---
1217
+
1218
+ ## 🤝 Contribution
1219
+
1220
+ *Join the Innovation*
1221
+
1222
+ <div align="center">
1223
+ We thank all our contributors for their valuable contributions.
1224
+ </div>
1225
+
1226
+ <div align="center">
1227
+ <a href="https://github.com/HKUDS/RAG-Anything/graphs/contributors">
1228
+ <img src="https://contrib.rocks/image?repo=HKUDS/RAG-Anything" style="border-radius: 15px; box-shadow: 0 0 20px rgba(0, 217, 255, 0.3);" />
1229
+ </a>
1230
+ </div>
1231
+
1232
+ ---
1233
+
1234
+ <div align="center" style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 30px; margin: 30px 0;">
1235
+ <div>
1236
+ <img src="https://user-images.githubusercontent.com/74038190/212284100-561aa473-3905-4a80-b561-0d28506553ee.gif" width="500">
1237
+ </div>
1238
+ <div style="margin-top: 20px;">
1239
+ <a href="https://github.com/HKUDS/RAG-Anything" style="text-decoration: none;">
1240
+ <img src="https://img.shields.io/badge/⭐%20Star%20us%20on%20GitHub-1a1a2e?style=for-the-badge&logo=github&logoColor=white">
1241
+ </a>
1242
+ <a href="https://github.com/HKUDS/RAG-Anything/issues" style="text-decoration: none;">
1243
+ <img src="https://img.shields.io/badge/🐛%20Report%20Issues-ff6b6b?style=for-the-badge&logo=github&logoColor=white">
1244
+ </a>
1245
+ <a href="https://github.com/HKUDS/RAG-Anything/discussions" style="text-decoration: none;">
1246
+ <img src="https://img.shields.io/badge/💬%20Discussions-4ecdc4?style=for-the-badge&logo=github&logoColor=white">
1247
+ </a>
1248
+ </div>
1249
+ </div>
1250
+
1251
+ <div align="center">
1252
+ <div style="width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);">
1253
+ <div style="display: flex; justify-content: center; align-items: center; gap: 15px;">
1254
+ <span style="font-size: 24px;">⭐</span>
1255
+ <span style="color: #00d9ff; font-size: 18px;">Thank you for visiting RAG-Anything!</span>
1256
+ <span style="font-size: 24px;">⭐</span>
1257
+ </div>
1258
+ <div style="margin-top: 10px; color: #00d9ff; font-size: 16px;">Building the Future of Multimodal AI</div>
1259
+ </div>
1260
+ </div>
rag_anything_smaranika/README_zh.md ADDED
@@ -0,0 +1,1258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ <div style="margin: 20px 0;">
4
+ <img src="./assets/logo.png" width="120" height="120" alt="RAG-Anything Logo" style="border-radius: 20px; box-shadow: 0 8px 32px rgba(0, 217, 255, 0.3);">
5
+ </div>
6
+
7
+ # 🚀 RAG-Anything: All-in-One RAG System
8
+
9
+ <div align="center">
10
+ <div style="width: 100%; height: 2px; margin: 20px 0; background: linear-gradient(90deg, transparent, #00d9ff, transparent);"></div>
11
+ </div>
12
+
13
+ <div align="center">
14
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 25px; text-align: center;">
15
+ <p>
16
+ <a href='https://github.com/HKUDS/RAG-Anything'><img src='https://img.shields.io/badge/🔥项目-主页-00d9ff?style=for-the-badge&logo=github&logoColor=white&labelColor=1a1a2e'></a>
17
+ <a href='https://arxiv.org/abs/2410.05779'><img src='https://img.shields.io/badge/📄arXiv-2410.05779-ff6b6b?style=for-the-badge&logo=arxiv&logoColor=white&labelColor=1a1a2e'></a>
18
+ <a href='https://github.com/HKUDS/LightRAG'><img src='https://img.shields.io/badge/⚡基于-LightRAG-4ecdc4?style=for-the-badge&logo=lightning&logoColor=white&labelColor=1a1a2e'></a>
19
+ </p>
20
+ <p>
21
+ <a href="https://github.com/HKUDS/RAG-Anything/stargazers"><img src='https://img.shields.io/github/stars/HKUDS/RAG-Anything?color=00d9ff&style=for-the-badge&logo=star&logoColor=white&labelColor=1a1a2e' /></a>
22
+ <img src="https://img.shields.io/badge/🐍Python-3.10-4ecdc4?style=for-the-badge&logo=python&logoColor=white&labelColor=1a1a2e">
23
+ <a href="https://pypi.org/project/raganything/"><img src="https://img.shields.io/pypi/v/raganything.svg?style=for-the-badge&logo=pypi&logoColor=white&labelColor=1a1a2e&color=ff6b6b"></a>
24
+ </p>
25
+ <p>
26
+ <a href="https://discord.gg/yF2MmDJyGJ"><img src="https://img.shields.io/badge/💬Discord-社区-7289da?style=for-the-badge&logo=discord&logoColor=white&labelColor=1a1a2e"></a>
27
+ <a href="https://github.com/HKUDS/RAG-Anything/issues/7"><img src="https://img.shields.io/badge/💬微信群-交流-07c160?style=for-the-badge&logo=wechat&logoColor=white&labelColor=1a1a2e"></a>
28
+ </p>
29
+ <p>
30
+ <a href="README_zh.md"><img src="https://img.shields.io/badge/🇨🇳中文版-1a1a2e?style=for-the-badge"></a>
31
+ <a href="README.md"><img src="https://img.shields.io/badge/🇺🇸English-1a1a2e?style=for-the-badge"></a>
32
+ </p>
33
+ </div>
34
+ </div>
35
+
36
+ </div>
37
+
38
+ <div align="center" style="margin: 30px 0;">
39
+ <img src="https://user-images.githubusercontent.com/74038190/212284100-561aa473-3905-4a80-b561-0d28506553ee.gif" width="800">
40
+ </div>
41
+
42
+ <div align="center">
43
+ <a href="#-快速开始" style="text-decoration: none;">
44
+ <img src="https://img.shields.io/badge/快速开始-立即开始使用-00d9ff?style=for-the-badge&logo=rocket&logoColor=white&labelColor=1a1a2e">
45
+ </a>
46
+ </div>
47
+
48
+ ---
49
+
50
+ ## 🎉 新闻
51
+ - [X] [2025.08.12]🎯📢 🔍 RAGAnything 现在支持 **VLM增强查询** 模式!当文档包含图片时,系统可以自动将图片与文本上下文一起直接传递给VLM进行综合多模态分析。
52
+ - [X] [2025.07.05]🎯📢 RAGAnything 新增[上下文配置模块](docs/context_aware_processing.md),支持为多模态内容处理添加相关上下文信息。
53
+ - [X] [2025.07.04]🎯📢 RAGAnything 现在支持多模态内容查询,实现了集成文本、图像、表格和公式处理的增强检索生成功能。
54
+ - [X] [2025.07.03]🎯📢 RAGAnything 在GitHub上达到了1K星标🌟!感谢您的支持和贡献。
55
+
56
+ ---
57
+
58
+ ## 🌟 系统概述
59
+
60
+ *下一代多模态智能*
61
+
62
+ <div style="background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border: 2px solid #00d9ff; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);">
63
+
64
+ **RAG-Anything**是一个综合性多模态文档处理RAG系统。该系统能够无缝处理和查询包含文本、图像、表格、公式等多模态内容的复杂文档,提供完整的检索增强(RAG)生成解决方案。
65
+
66
+ <img src="assets/rag_anything_framework.png" alt="RAG-Anything" />
67
+
68
+ </div>
69
+
70
+ ### 🎯 核心特性
71
+
72
+ <div style="background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); border-radius: 15px; padding: 25px; margin: 20px 0;">
73
+
74
+ - **🔄 端到端多模态处理流水线** - 提供从文档解析到多模态查询响应的完整处理链路,确保系统的一体化运行
75
+ - **📄 多格式文档支持** - 支持PDF、Office文档(DOC/DOCX/PPT/PPTX/XLS/XLSX)、图像等主流文档格式的统一处理和解析
76
+ - **🧠 多模态内容分析引擎** - 针对图像、表格、公式和通用文本内容部署专门的处理器,确保各类内容的精准解析
77
+ - **🔗 基于知识图谱索引** - 实现自动化实体提取和关系构建,建立跨模态的语义连接网络
78
+ - **⚡ 灵活的处理架构** - 支持基于MinerU的智能解析模式和直接多模态内容插入模式,满足不同应用场景需求
79
+ - **📋 直接内容列表插入** - 跳过文档解析,直接插入来自外部��的预解析内容列表,支持多种数据来源整合
80
+ - **🎯 跨模态检索机制** - 实现跨文本和多模态内容的智能检索,提供精准的信息定位和匹配能力
81
+
82
+ </div>
83
+
84
+ ---
85
+
86
+ ## 🏗️ 算法原理与架构
87
+
88
+ <div style="background: linear-gradient(135deg, #0f0f23 0%, #1a1a2e 100%); border-radius: 15px; padding: 25px; margin: 20px 0; border-left: 5px solid #00d9ff;">
89
+
90
+ ### 核心算法
91
+
92
+ **RAG-Anything** 采用灵活的分层架构设计,实现多阶段多模态处理流水线,将传统RAG系统扩展为支持异构内容类型的综合处理平台。
93
+
94
+ </div>
95
+
96
+ <div align="center">
97
+ <div style="width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);">
98
+ <div style="display: flex; justify-content: space-around; align-items: center; flex-wrap: wrap; gap: 20px;">
99
+ <div style="text-align: center;">
100
+ <div style="font-size: 24px; margin-bottom: 10px;">📄</div>
101
+ <div style="font-size: 14px; color: #00d9ff;">文档解析</div>
102
+ </div>
103
+ <div style="font-size: 20px; color: #00d9ff;">→</div>
104
+ <div style="text-align: center;">
105
+ <div style="font-size: 24px; margin-bottom: 10px;">🧠</div>
106
+ <div style="font-size: 14px; color: #00d9ff;">内容分析</div>
107
+ </div>
108
+ <div style="font-size: 20px; color: #00d9ff;">→</div>
109
+ <div style="text-align: center;">
110
+ <div style="font-size: 24px; margin-bottom: 10px;">🔍</div>
111
+ <div style="font-size: 14px; color: #00d9ff;">知识图谱</div>
112
+ </div>
113
+ <div style="font-size: 20px; color: #00d9ff;">→</div>
114
+ <div style="text-align: center;">
115
+ <div style="font-size: 24px; margin-bottom: 10px;">🎯</div>
116
+ <div style="font-size: 14px; color: #00d9ff;">智能检索</div>
117
+ </div>
118
+ </div>
119
+ </div>
120
+ </div>
121
+
122
+ ### 1. 文档解析阶段
123
+
124
+ <div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
125
+
126
+ 该系统构建了高精度文档解析平台,通过结构化提取引擎实现多模态元素的完整识别与提取。系统采用自适应内容分解机制,智能分离文档中的文本、图像、表格、公式等异构内容,并保持其语义关联性。同时支持PDF、Office文档、图像等主流格式的统一处理,提供标准化的多模态内容输出。
127
+
128
+ **核心组件:**
129
+
130
+ - **⚙️ 结构化提取引擎**:集成 [MinerU](https://github.com/opendatalab/MinerU) 文档解析框架,实现精确的文档结构识别与内容提取,确保多模态元素的完整性和准确性。
131
+
132
+ - **🧩 自适应内容分解机制**:建立智能内容分离系统,自动识别并提取文档中的文本块、图像、表格、公式等异构元素,保持元素间的语义关联关系。
133
+
134
+ - **📁 多格式兼容处理**:部署专业化解析器矩阵,支持PDF、Office文档系列(DOC/DOCX/PPT/PPTX/XLS/XLSX)、图像等主流格式的统一处理与标准化输出。
135
+
136
+ </div>
137
+
138
+ ### 2. 多模态内容理解与处理
139
+
140
+ <div style="background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;">
141
+
142
+ 该多模态内容处理系统通过自主分类路由机制实现异构内容的智能识别与优化分发。系统采用并发多流水线架构,确保文本和多模态内容的高效并行处理,在最大化吞吐量的同时保持内容完整性,并能完整提取和保持原始文档的层次结构与元素关联关系。
143
+
144
+ **核心组件:**
145
+
146
+ - **🎯 自主内容分类与路由**:自动识别、分类并将不同内容类型路由至优化的执行通道。
147
+
148
+ - **⚡ 并发多流水线架构**:通过专用处理流水线实现文本和多模态内容的并发执行。这种方法在保持内容完整性的同时最大化吞吐效率。
149
+
150
+ - **🏗️ 文档层次结构提取**:在内容转换过程中提取并保持原始文档的层次结构和元素间关系。
151
+
152
+ </div>
153
+
154
+ ### 3. 多模态分析引擎
155
+
156
+ <div style="background: linear-gradient(90deg, #0f3460 0%, #1a1a2e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #00d9ff;">
157
+
158
+ 系统部署了面向异构数据模态的模态感知处理单元:
159
+
160
+ **专用分析器:**
161
+
162
+ - **🔍 视觉内容分析器**:
163
+ - 集成视觉模型进行图像分析和内容识别
164
+ - 基于视觉语义生成上下文感知的描述性标题
165
+ - 提取视觉元素间的空间关系和层次结构
166
+
167
+ - **📊 结构化数据解释器**:
168
+ - 对表格和结构化数据格式进行系统性解释
169
+ - 实现数据趋势分析的统计模式识别算法
170
+ - 识别多个表格数据集间的语义关系和依赖性
171
+
172
+ - **📐 数学表达式解析器**:
173
+ - 高精度解析复杂数学表达式和公式
174
+ - 提供原生LaTeX格式支持以实现��学术工作流的无缝集成
175
+ - 建立数学方程与领域特定知识库间的概念映射
176
+
177
+ - **🔧 可扩展模态处理器**:
178
+ - 为自定义和新兴内容类型提供可配置的处理框架
179
+ - 通过插件架构实现新模态处理器的动态集成
180
+ - 支持专用场景下处理流水线的运行时配置
181
+
182
+ </div>
183
+
184
+ ### 4. 多模态知识图谱索引
185
+
186
+ <div style="background: linear-gradient(90deg, #1a1a2e 0%, #16213e 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #4ecdc4;">
187
+
188
+ 多模态知识图谱构建模块将文档内容转换为结构化语义表示。系统提取多模态实体,建立跨模态关系,并保持层次化组织结构。通过加权相关性评分实现优化的知识检索。
189
+
190
+ **核心功能:**
191
+
192
+ - **🔍 多模态实体提取**:将重要的多模态元素转换为结构化知识图谱实体。该过程包括语义标注和元数据保存。
193
+
194
+ - **🔗 跨模态关系映射**:在文本实体和多模态组件之间建立语义连接和依赖关系。通过自动化关系推理算法实现这一功能。
195
+
196
+ - **🏗️ 层次结构保持**:通过"归属于"关系链维护原始文档组织结构。这些关系链保持逻辑内容层次和章节依赖关系。
197
+
198
+ - **⚖️ 加权关系评分**:为关系类型分配定量相关性分数。评分基于语义邻近性和文档结构内的上下文重要性。
199
+
200
+ </div>
201
+
202
+ ### 5. 模态感知检索
203
+
204
+ <div style="background: linear-gradient(90deg, #16213e 0%, #0f3460 100%); border-radius: 10px; padding: 20px; margin: 15px 0; border-left: 4px solid #ff6b6b;">
205
+
206
+ 混合检索系统结合向量相似性搜索与图遍历算法,实现全面的内容检索。系统实现模态感知排序机制,并维护检索元素间的关系一致性,确保上下文集成的信息传递。
207
+
208
+ **检索机制:**
209
+
210
+ - **🔀 向量-图谱融合**:集成向量相似性搜索与图遍历算法。该方法同时利用语义嵌入和结构关系实现全面的内容检索。
211
+
212
+ - **📊 模态感知排序**:实现基于内容类型相关性的自适应评分机制。系统根据查询特定的模态偏好调整排序结果。
213
+
214
+ - **🔗 关系一致性维护**:维护检索元素间的语义和结构关系。确保信息传递的连贯性和上下文完整性。
215
+
216
+ </div>
217
+
218
+ ---
219
+
220
+ ## 🚀 快速开始
221
+
222
+ *启动您的AI之旅*
223
+
224
+ <div align="center">
225
+ <img src="https://user-images.githubusercontent.com/74038190/212284158-e840e285-664b-44d7-b79b-e264b5e54825.gif" width="400">
226
+ </div>
227
+
228
+ ### 安装
229
+
230
+ #### 选项1:从PyPI安装(推荐)
231
+
232
+ ```bash
233
+ # 基础安装
234
+ pip install raganything
235
+
236
+ # 安装包含扩展格式支持的可选依赖:
237
+ pip install 'raganything[all]' # 所有可选功能
238
+ pip install 'raganything[image]' # 图像格式转换 (BMP, TIFF, GIF, WebP)
239
+ pip install 'raganything[text]' # 文本文件处理 (TXT, MD)
240
+ pip install 'raganything[image,text]' # 多个功能组合
241
+ ```
242
+
243
+ #### 选项2:从源码安装
244
+
245
+ ```bash
246
+ git clone https://github.com/HKUDS/RAG-Anything.git
247
+ cd RAG-Anything
248
+ pip install -e .
249
+
250
+ # 安装可选依赖
251
+ pip install -e '.[all]'
252
+ ```
253
+
254
+ #### 可选依赖
255
+
256
+ - **`[image]`** - 启用BMP、TIFF、GIF、WebP图像格式处理(需要Pillow)
257
+ - **`[text]`** - 启用TXT和MD文件处理(需要ReportLab)
258
+ - **`[all]`** - 包含所有Python可选依赖
259
+
260
+ > **⚠️ Office文档处理配置要求:**
261
+ > - Office文档 (.doc, .docx, .ppt, .pptx, .xls, .xlsx) 需要安装 **LibreOffice**
262
+ > - 从[LibreOffice官网](https://www.libreoffice.org/download/download/)下载安装
263
+ > - **Windows**:从官网下载安装包
264
+ > - **macOS**:`brew install --cask libreoffice`
265
+ > - **Ubuntu/Debian**:`sudo apt-get install libreoffice`
266
+ > - **CentOS/RHEL**:`sudo yum install libreoffice`
267
+
268
+ **检查MinerU安装:**
269
+
270
+ ```bash
271
+ # 验证安装
272
+ mineru --version
273
+
274
+ # 检查是否正确配置
275
+ python -c "from raganything import RAGAnything; rag = RAGAnything(); print('✅ MinerU安装正常' if rag.check_parser_installation() else '❌ MinerU安装有问题')"
276
+ ```
277
+
278
+ 模型在首次使用时自动下载。手动下载参考[MinerU模型源配置](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#22-%E6%A8%A1%E5%9E%8B%E6%BA%90%E9%85%8D%E7%BD%AE):
279
+
280
+ ### 使用示例
281
+
282
+ #### 1. 端到端文档处理
283
+
284
+ ```python
285
+ import asyncio
286
+ from raganything import RAGAnything, RAGAnythingConfig
287
+ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
288
+ from lightrag.utils import EmbeddingFunc
289
+
290
+ async def main():
291
+ # 设置 API 配置
292
+ api_key = "your-api-key"
293
+ base_url = "your-base-url" # 可选
294
+
295
+ # 创建 RAGAnything 配置
296
+ config = RAGAnythingConfig(
297
+ working_dir="./rag_storage",
298
+ parser="mineru", # 选择解析器:mineru 或 docling
299
+ parse_method="auto", # 解析方法:auto, ocr 或 txt
300
+ enable_image_processing=True,
301
+ enable_table_processing=True,
302
+ enable_equation_processing=True,
303
+ )
304
+
305
+ # 定义 LLM 模型��数
306
+ def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
307
+ return openai_complete_if_cache(
308
+ "gpt-4o-mini",
309
+ prompt,
310
+ system_prompt=system_prompt,
311
+ history_messages=history_messages,
312
+ api_key=api_key,
313
+ base_url=base_url,
314
+ **kwargs,
315
+ )
316
+
317
+ # 定义视觉模型函数用于图像处理
318
+ def vision_model_func(
319
+ prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
320
+ ):
321
+ # 如果提供了messages格式(用于多模态VLM增强查询),直接使用
322
+ if messages:
323
+ return openai_complete_if_cache(
324
+ "gpt-4o",
325
+ "",
326
+ system_prompt=None,
327
+ history_messages=[],
328
+ messages=messages,
329
+ api_key=api_key,
330
+ base_url=base_url,
331
+ **kwargs,
332
+ )
333
+ # 传统单图片格式
334
+ elif image_data:
335
+ return openai_complete_if_cache(
336
+ "gpt-4o",
337
+ "",
338
+ system_prompt=None,
339
+ history_messages=[],
340
+ messages=[
341
+ {"role": "system", "content": system_prompt}
342
+ if system_prompt
343
+ else None,
344
+ {
345
+ "role": "user",
346
+ "content": [
347
+ {"type": "text", "text": prompt},
348
+ {
349
+ "type": "image_url",
350
+ "image_url": {
351
+ "url": f"data:image/jpeg;base64,{image_data}"
352
+ },
353
+ },
354
+ ],
355
+ }
356
+ if image_data
357
+ else {"role": "user", "content": prompt},
358
+ ],
359
+ api_key=api_key,
360
+ base_url=base_url,
361
+ **kwargs,
362
+ )
363
+ # 纯文本格式
364
+ else:
365
+ return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
366
+
367
+ # 定义嵌入函数
368
+ embedding_func = EmbeddingFunc(
369
+ embedding_dim=3072,
370
+ max_token_size=8192,
371
+ func=lambda texts: openai_embed(
372
+ texts,
373
+ model="text-embedding-3-large",
374
+ api_key=api_key,
375
+ base_url=base_url,
376
+ ),
377
+ )
378
+
379
+ # 初始化 RAGAnything
380
+ rag = RAGAnything(
381
+ config=config,
382
+ llm_model_func=llm_model_func,
383
+ vision_model_func=vision_model_func,
384
+ embedding_func=embedding_func,
385
+ )
386
+
387
+ # 处理文档
388
+ await rag.process_document_complete(
389
+ file_path="path/to/your/document.pdf",
390
+ output_dir="./output",
391
+ parse_method="auto"
392
+ )
393
+
394
+ # 查询处理后的内容
395
+ # 纯文本查询 - 基本知识库搜索
396
+ text_result = await rag.aquery(
397
+ "文档的主要内容是什么?",
398
+ mode="hybrid"
399
+ )
400
+ print("文本查询结果:", text_result)
401
+
402
+ # 多模态查询 - 包含具体多模态内容的查询
403
+ multimodal_result = await rag.aquery_with_multimodal(
404
+ "分析这个性能数据并解释与现有文档内容的关系",
405
+ multimodal_content=[{
406
+ "type": "table",
407
+ "table_data": """系统,准确率,F1分数
408
+ RAGAnything,95.2%,0.94
409
+ 基准方法,87.3%,0.85""",
410
+ "table_caption": "性能对比结果"
411
+ }],
412
+ mode="hybrid"
413
+ )
414
+ print("多模态查询结果:", multimodal_result)
415
+
416
+ if __name__ == "__main__":
417
+ asyncio.run(main())
418
+ ```
419
+
420
+ #### 2. 直接多模态内容处理
421
+
422
+ ```python
423
+ import asyncio
424
+ from lightrag import LightRAG
425
+ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
426
+ from lightrag.utils import EmbeddingFunc
427
+ from raganything.modalprocessors import ImageModalProcessor, TableModalProcessor
428
+
429
+ async def process_multimodal_content():
430
+ # 设置 API 配置
431
+ api_key = "your-api-key"
432
+ base_url = "your-base-url" # 可选
433
+
434
+ # 初始化 LightRAG
435
+ rag = LightRAG(
436
+ working_dir="./rag_storage",
437
+ llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
438
+ "gpt-4o-mini",
439
+ prompt,
440
+ system_prompt=system_prompt,
441
+ history_messages=history_messages,
442
+ api_key=api_key,
443
+ base_url=base_url,
444
+ **kwargs,
445
+ ),
446
+ embedding_func=EmbeddingFunc(
447
+ embedding_dim=3072,
448
+ max_token_size=8192,
449
+ func=lambda texts: openai_embed(
450
+ texts,
451
+ model="text-embedding-3-large",
452
+ api_key=api_key,
453
+ base_url=base_url,
454
+ ),
455
+ )
456
+ )
457
+ await rag.initialize_storages()
458
+
459
+ # 处理图像
460
+ image_processor = ImageModalProcessor(
461
+ lightrag=rag,
462
+ modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs: openai_complete_if_cache(
463
+ "gpt-4o",
464
+ "",
465
+ system_prompt=None,
466
+ history_messages=[],
467
+ messages=[
468
+ {"role": "system", "content": system_prompt} if system_prompt else None,
469
+ {"role": "user", "content": [
470
+ {"type": "text", "text": prompt},
471
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
472
+ ]} if image_data else {"role": "user", "content": prompt}
473
+ ],
474
+ api_key=api_key,
475
+ base_url=base_url,
476
+ **kwargs,
477
+ ) if image_data else openai_complete_if_cache(
478
+ "gpt-4o-mini",
479
+ prompt,
480
+ system_prompt=system_prompt,
481
+ history_messages=history_messages,
482
+ api_key=api_key,
483
+ base_url=base_url,
484
+ **kwargs,
485
+ )
486
+ )
487
+
488
+ image_content = {
489
+ "img_path": "path/to/image.jpg",
490
+ "image_caption": ["图1:实验结果"],
491
+ "image_footnote": ["数据收集于2024年"]
492
+ }
493
+
494
+ description, entity_info = await image_processor.process_multimodal_content(
495
+ modal_content=image_content,
496
+ content_type="image",
497
+ file_path="research_paper.pdf",
498
+ entity_name="实验结果图表"
499
+ )
500
+
501
+ # 处理表格
502
+ table_processor = TableModalProcessor(
503
+ lightrag=rag,
504
+ modal_caption_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
505
+ "gpt-4o-mini",
506
+ prompt,
507
+ system_prompt=system_prompt,
508
+ history_messages=history_messages,
509
+ api_key=api_key,
510
+ base_url=base_url,
511
+ **kwargs,
512
+ )
513
+ )
514
+
515
+ table_content = {
516
+ "table_body": """
517
+ | 方法 | 准确率 | F1分数 |
518
+ |------|--------|--------|
519
+ | RAGAnything | 95.2% | 0.94 |
520
+ | 基准方法 | 87.3% | 0.85 |
521
+ """,
522
+ "table_caption": ["性能对比"],
523
+ "table_footnote": ["测试数据集结果"]
524
+ }
525
+
526
+ description, entity_info = await table_processor.process_multimodal_content(
527
+ modal_content=table_content,
528
+ content_type="table",
529
+ file_path="research_paper.pdf",
530
+ entity_name="性能结果表格"
531
+ )
532
+
533
+ if __name__ == "__main__":
534
+ asyncio.run(process_multimodal_content())
535
+ ```
536
+
537
+ #### 3. 批量处理
538
+
539
+ ```python
540
+ # 处理多个文档
541
+ await rag.process_folder_complete(
542
+ folder_path="./documents",
543
+ output_dir="./output",
544
+ file_extensions=[".pdf", ".docx", ".pptx"],
545
+ recursive=True,
546
+ max_workers=4
547
+ )
548
+ ```
549
+
550
+ #### 4. 自定义模态处理器
551
+
552
+ ```python
553
+ from raganything.modalprocessors import GenericModalProcessor
554
+
555
+ class CustomModalProcessor(GenericModalProcessor):
556
+ async def process_multimodal_content(self, modal_content, content_type, file_path, entity_name):
557
+ # 你的自定义处理逻辑
558
+ enhanced_description = await self.analyze_custom_content(modal_content)
559
+ entity_info = self.create_custom_entity(enhanced_description, entity_name)
560
+ return await self._create_entity_and_chunk(enhanced_description, entity_info, file_path)
561
+ ```
562
+
563
+ #### 5. 查询选项
564
+
565
+ RAG-Anything 提供三种类型的查询方法:
566
+
567
+ **纯文本查询** - 使用LightRAG直接进行知识库搜索:
568
+ ```python
569
+ # 文本查询的不同模式
570
+ text_result_hybrid = await rag.aquery("你的问题", mode="hybrid")
571
+ text_result_local = await rag.aquery("你的问题", mode="local")
572
+ text_result_global = await rag.aquery("你的问题", mode="global")
573
+ text_result_naive = await rag.aquery("你的问题", mode="naive")
574
+
575
+ # 同步版本
576
+ sync_text_result = rag.query("你的问题", mode="hybrid")
577
+ ```
578
+
579
+ **VLM增强查询** - 使用VLM自动分析检索上下文中的图像:
580
+ ```python
581
+ # VLM增强查询(当提供vision_model_func时自动启用)
582
+ vlm_result = await rag.aquery(
583
+ "分析文档中的图表和数据",
584
+ mode="hybrid"
585
+ # vlm_enhanced=True 当vision_model_func可用时自动设置
586
+ )
587
+
588
+ # 手动控制VLM增强
589
+ vlm_enabled = await rag.aquery(
590
+ "这个文档中的图片显示了什么内容?",
591
+ mode="hybrid",
592
+ vlm_enhanced=True # 强制启用VLM增强
593
+ )
594
+
595
+ vlm_disabled = await rag.aquery(
596
+ "这个文档中的图片显示了什么内容?",
597
+ mode="hybrid",
598
+ vlm_enhanced=False # 强制禁用VLM增强
599
+ )
600
+
601
+ # 当文档包含图片时,VLM可以直接查看和分析图片
602
+ # 系统将自动:
603
+ # 1. 检索包含图片路径的相关上下文
604
+ # 2. 加载图片并编码为base64格式
605
+ # 3. 将文本上下文和图片一起发送给VLM进行综合分析
606
+ ```
607
+
608
+ **多模态查询** - 包含特定多模态内容分析的增强查询:
609
+ ```python
610
+ # 包含表格数据的查询
611
+ table_result = await rag.aquery_with_multimodal(
612
+ "比较这些性能指标与文档内容",
613
+ multimodal_content=[{
614
+ "type": "table",
615
+ "table_data": """方法,准确率,速度
616
+ LightRAG,95.2%,120ms
617
+ 传统方法,87.3%,180ms""",
618
+ "table_caption": "性能对比"
619
+ }],
620
+ mode="hybrid"
621
+ )
622
+
623
+ # 包含公式内容的查询
624
+ equation_result = await rag.aquery_with_multimodal(
625
+ "解释这个公式及其与文档内容的相关性",
626
+ multimodal_content=[{
627
+ "type": "equation",
628
+ "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
629
+ "equation_caption": "文档相关性概率"
630
+ }],
631
+ mode="hybrid"
632
+ )
633
+ ```
634
+
635
+ #### 6. 加载已存在的LightRAG实例
636
+
637
+ ```python
638
+ import asyncio
639
+ from raganything import RAGAnything
640
+ from lightrag import LightRAG
641
+ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
642
+ from lightrag.utils import EmbeddingFunc
643
+ import os
644
+
645
+ async def load_existing_lightrag():
646
+ # 设置 API 配置
647
+ api_key = "your-api-key"
648
+ base_url = "your-base-url" # 可选
649
+
650
+ # 首先,创建或加载已存在的 LightRAG 实例
651
+ lightrag_working_dir = "./existing_lightrag_storage"
652
+
653
+ # 检查是否存在之前的 LightRAG 实例
654
+ if os.path.exists(lightrag_working_dir) and os.listdir(lightrag_working_dir):
655
+ print("✅ 发现已存在的 LightRAG 实例,正在加载...")
656
+ else:
657
+ print("❌ 未找到已存在的 LightRAG 实例,将创建新实例")
658
+
659
+ # 使用您的配置创建/加载 LightRAG 实例
660
+ lightrag_instance = LightRAG(
661
+ working_dir=lightrag_working_dir,
662
+ llm_model_func=lambda prompt, system_prompt=None, history_messages=[], **kwargs: openai_complete_if_cache(
663
+ "gpt-4o-mini",
664
+ prompt,
665
+ system_prompt=system_prompt,
666
+ history_messages=history_messages,
667
+ api_key=api_key,
668
+ base_url=base_url,
669
+ **kwargs,
670
+ ),
671
+ embedding_func=EmbeddingFunc(
672
+ embedding_dim=3072,
673
+ max_token_size=8192,
674
+ func=lambda texts: openai_embed(
675
+ texts,
676
+ model="text-embedding-3-large",
677
+ api_key=api_key,
678
+ base_url=base_url,
679
+ ),
680
+ )
681
+ )
682
+
683
+ # 初始化存储(如果有现有数据,这将加载它们)
684
+ await lightrag_instance.initialize_storages()
685
+ await initialize_pipeline_status()
686
+
687
+ # 定义视觉模型函数用于图像处理
688
+ def vision_model_func(
689
+ prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs
690
+ ):
691
+ # 如果提供了messages格式(用于多模态VLM增强查询),直接使用
692
+ if messages:
693
+ return openai_complete_if_cache(
694
+ "gpt-4o",
695
+ "",
696
+ system_prompt=None,
697
+ history_messages=[],
698
+ messages=messages,
699
+ api_key=api_key,
700
+ base_url=base_url,
701
+ **kwargs,
702
+ )
703
+ # 传统单图片格式
704
+ elif image_data:
705
+ return openai_complete_if_cache(
706
+ "gpt-4o",
707
+ "",
708
+ system_prompt=None,
709
+ history_messages=[],
710
+ messages=[
711
+ {"role": "system", "content": system_prompt}
712
+ if system_prompt
713
+ else None,
714
+ {
715
+ "role": "user",
716
+ "content": [
717
+ {"type": "text", "text": prompt},
718
+ {
719
+ "type": "image_url",
720
+ "image_url": {
721
+ "url": f"data:image/jpeg;base64,{image_data}"
722
+ },
723
+ },
724
+ ],
725
+ }
726
+ if image_data
727
+ else {"role": "user", "content": prompt},
728
+ ],
729
+ api_key=api_key,
730
+ base_url=base_url,
731
+ **kwargs,
732
+ )
733
+ # 纯文本格式
734
+ else:
735
+ return lightrag_instance.llm_model_func(prompt, system_prompt, history_messages, **kwargs)
736
+
737
+ # 现在使用已存在的 LightRAG 实例初始化 RAGAnything
738
+ rag = RAGAnything(
739
+ lightrag=lightrag_instance, # 传入已存在的 LightRAG 实例
740
+ vision_model_func=vision_model_func,
741
+ # 注意:working_dir、llm_model_func、embedding_func 等都从 lightrag_instance 继承
742
+ )
743
+
744
+ # 查询已存在的知识库
745
+ result = await rag.aquery(
746
+ "这个 LightRAG 实例中处理了哪些数据?",
747
+ mode="hybrid"
748
+ )
749
+ print("查询结果:", result)
750
+
751
+ # 向已存在的 LightRAG 实例添加新的多模态文档
752
+ await rag.process_document_complete(
753
+ file_path="path/to/new/multimodal_document.pdf",
754
+ output_dir="./output"
755
+ )
756
+
757
+ if __name__ == "__main__":
758
+ asyncio.run(load_existing_lightrag())
759
+ ```
760
+
761
+ #### 7. 直接插入内容列表
762
+
763
+ 当您已经有预解析的内容列表(例如,来自外部解析器或之前的处理结果)时,可以直接插入到 RAGAnything 中而无需文档解析:
764
+
765
+ ```python
766
+ import asyncio
767
+ from raganything import RAGAnything, RAGAnythingConfig
768
+ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
769
+ from lightrag.utils import EmbeddingFunc
770
+
771
+ async def insert_content_list_example():
772
+ # 设置 API 配置
773
+ api_key = "your-api-key"
774
+ base_url = "your-base-url" # 可选
775
+
776
+ # 创建 RAGAnything 配置
777
+ config = RAGAnythingConfig(
778
+ working_dir="./rag_storage",
779
+ enable_image_processing=True,
780
+ enable_table_processing=True,
781
+ enable_equation_processing=True,
782
+ )
783
+
784
+ # 定义模型函数
785
+ def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
786
+ return openai_complete_if_cache(
787
+ "gpt-4o-mini",
788
+ prompt,
789
+ system_prompt=system_prompt,
790
+ history_messages=history_messages,
791
+ api_key=api_key,
792
+ base_url=base_url,
793
+ **kwargs,
794
+ )
795
+
796
+ def vision_model_func(prompt, system_prompt=None, history_messages=[], image_data=None, messages=None, **kwargs):
797
+ # 如果提供了messages格式(用于多模态VLM增强查询),直接使用
798
+ if messages:
799
+ return openai_complete_if_cache(
800
+ "gpt-4o",
801
+ "",
802
+ system_prompt=None,
803
+ history_messages=[],
804
+ messages=messages,
805
+ api_key=api_key,
806
+ base_url=base_url,
807
+ **kwargs,
808
+ )
809
+ # 传统单图片格式
810
+ elif image_data:
811
+ return openai_complete_if_cache(
812
+ "gpt-4o",
813
+ "",
814
+ system_prompt=None,
815
+ history_messages=[],
816
+ messages=[
817
+ {"role": "system", "content": system_prompt} if system_prompt else None,
818
+ {
819
+ "role": "user",
820
+ "content": [
821
+ {"type": "text", "text": prompt},
822
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
823
+ ],
824
+ } if image_data else {"role": "user", "content": prompt},
825
+ ],
826
+ api_key=api_key,
827
+ base_url=base_url,
828
+ **kwargs,
829
+ )
830
+ # 纯文本格式
831
+ else:
832
+ return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
833
+
834
+ embedding_func = EmbeddingFunc(
835
+ embedding_dim=3072,
836
+ max_token_size=8192,
837
+ func=lambda texts: openai_embed(
838
+ texts,
839
+ model="text-embedding-3-large",
840
+ api_key=api_key,
841
+ base_url=base_url,
842
+ ),
843
+ )
844
+
845
+ # 初始化 RAGAnything
846
+ rag = RAGAnything(
847
+ config=config,
848
+ llm_model_func=llm_model_func,
849
+ vision_model_func=vision_model_func,
850
+ embedding_func=embedding_func,
851
+ )
852
+
853
+ # 示例:来自外部源的预解析内容列表
854
+ content_list = [
855
+ {
856
+ "type": "text",
857
+ "text": "这是我们研究论文的引言部分。",
858
+ "page_idx": 0 # 此内容出现的页码
859
+ },
860
+ {
861
+ "type": "image",
862
+ "img_path": "/absolute/path/to/figure1.jpg", # 重要:使用绝对路径
863
+ "image_caption": ["图1:系统架构"],
864
+ "image_footnote": ["来源:作者原创设计"],
865
+ "page_idx": 1 # 此图像出现的页码
866
+ },
867
+ {
868
+ "type": "table",
869
+ "table_body": "| 方法 | 准确率 | F1分数 |\n|------|--------|--------|\n| 我们的方法 | 95.2% | 0.94 |\n| 基准方法 | 87.3% | 0.85 |",
870
+ "table_caption": ["表1:性能对比"],
871
+ "table_footnote": ["测试数据集结果"],
872
+ "page_idx": 2 # 此表格出现的页码
873
+ },
874
+ {
875
+ "type": "equation",
876
+ "latex": "P(d|q) = \\frac{P(q|d) \\cdot P(d)}{P(q)}",
877
+ "text": "文档相关性概率公式",
878
+ "page_idx": 3 # 此公式出现的页码
879
+ },
880
+ {
881
+ "type": "text",
882
+ "text": "总之,我们的方法在所有指标上都表现出优越的性能。",
883
+ "page_idx": 4 # 此内容出现的页码
884
+ }
885
+ ]
886
+
887
+ # 直接插入内容列表
888
+ await rag.insert_content_list(
889
+ content_list=content_list,
890
+ file_path="research_paper.pdf", # 用于引用的参考文件名
891
+ split_by_character=None, # 可选的文本分割
892
+ split_by_character_only=False, # 可选的文本分割模式
893
+ doc_id=None, # 可选的自定义文档ID(如果未提供将自动生成)
894
+ display_stats=True # 显示内容统计信息
895
+ )
896
+
897
+ # 查询��入的内容
898
+ result = await rag.aquery(
899
+ "研究中提到的主要发现和性能指标是什么?",
900
+ mode="hybrid"
901
+ )
902
+ print("查询结果:", result)
903
+
904
+ # 您也可以使用不同的文档ID插入多个内容列表
905
+ another_content_list = [
906
+ {
907
+ "type": "text",
908
+ "text": "这是来自另一个文档的内容。",
909
+ "page_idx": 0 # 此内容出现的页码
910
+ },
911
+ {
912
+ "type": "table",
913
+ "table_body": "| 特性 | 值 |\n|------|----|\n| 速度 | 快速 |\n| 准确性 | 高 |",
914
+ "table_caption": ["特性对比"],
915
+ "page_idx": 1 # 此表格出现的页码
916
+ }
917
+ ]
918
+
919
+ await rag.insert_content_list(
920
+ content_list=another_content_list,
921
+ file_path="another_document.pdf",
922
+ doc_id="custom-doc-id-123" # 自定义文档ID
923
+ )
924
+
925
+ if __name__ == "__main__":
926
+ asyncio.run(insert_content_list_example())
927
+ ```
928
+
929
+ **内容列表格式:**
930
+
931
+ `content_list` 应遵循标准格式,每个项目都是包含以下内容的字典:
932
+
933
+ - **文本内容**: `{"type": "text", "text": "内容文本", "page_idx": 0}`
934
+ - **图像内容**: `{"type": "image", "img_path": "/absolute/path/to/image.jpg", "image_caption": ["标题"], "image_footnote": ["注释"], "page_idx": 1}`
935
+ - **表格内容**: `{"type": "table", "table_body": "markdown表格", "table_caption": ["标题"], "table_footnote": ["注释"], "page_idx": 2}`
936
+ - **公式内容**: `{"type": "equation", "latex": "LaTeX公式", "text": "描述", "page_idx": 3}`
937
+ - **通用内容**: `{"type": "custom_type", "content": "任何内容", "page_idx": 4}`
938
+
939
+ **重要说明:**
940
+ - **`img_path`**: 必须是图像文件的绝对路径(例如:`/home/user/images/chart.jpg` 或 `C:\Users\user\images\chart.jpg`)
941
+ - **`page_idx`**: 表示内容在原始文档中出现的页码(从0开始的索引)
942
+ - **内容顺序**: 项目按照在列表中出现的顺序进行处理
943
+
944
+ 此方法在以下情况下特别有用:
945
+ - 您有来自外部解析器的内容(非MinerU/Docling)
946
+ - 您想要处理程序化生成的内容
947
+ - 您需要将来自多个源的内容插入到单个知识库中
948
+ - 您有想要重用的缓存解析结果
949
+
950
+ ---
951
+
952
+ ## 🛠️ 示例
953
+
954
+ *实际应用演示*
955
+
956
+ <div align="center">
957
+ <img src="https://user-images.githubusercontent.com/74038190/212257455-13e3e01e-d6a6-45dc-bb92-3ab87b12dfc1.gif" width="300">
958
+ </div>
959
+
960
+ `examples/` 目录包含完整的使用示例:
961
+
962
+ - **`raganything_example.py`**:基于MinerU的端到端文档处理
963
+ - **`modalprocessors_example.py`**:直接多模态内容处理
964
+ - **`office_document_test.py`**:Office文档解析测试(无需API密钥)
965
+ - **`image_format_test.py`**:图像格式解析测试(无需API密钥)
966
+ - **`text_format_test.py`**:文本格式解析测试(无需API密钥)
967
+
968
+ **运行示例:**
969
+
970
+ ```bash
971
+ # 端到端处理(包含解析器选择)
972
+ python examples/raganything_example.py path/to/document.pdf --api-key YOUR_API_KEY --parser mineru
973
+
974
+ # 直接模态处理
975
+ python examples/modalprocessors_example.py --api-key YOUR_API_KEY
976
+
977
+ # Office文档解析测试(仅MinerU功能)
978
+ python examples/office_document_test.py --file path/to/document.docx
979
+
980
+ # 图像格式解析测试(仅MinerU功能)
981
+ python examples/image_format_test.py --file path/to/image.bmp
982
+
983
+ # 文本格式解析测试(仅MinerU功能)
984
+ python examples/text_format_test.py --file path/to/document.md
985
+
986
+ # 检查LibreOffice安装
987
+ python examples/office_document_test.py --check-libreoffice --file dummy
988
+
989
+ # 检查PIL/Pillow安装
990
+ python examples/image_format_test.py --check-pillow --file dummy
991
+
992
+ # 检查ReportLab安装
993
+ python examples/text_format_test.py --check-reportlab --file dummy
994
+ ```
995
+
996
+ > **注意**:API密钥仅在完整RAG处理和LLM集成时需要。解析测试文件(`office_document_test.py`、`image_format_test.py` 和 `text_format_test.py`)仅测试MinerU功能,无需API密钥。
997
+
998
+ ---
999
+
1000
+ ## 🔧 配置
1001
+
1002
+ *系统优化参数*
1003
+
1004
+ ### 环境变量
1005
+
1006
+ 创建 `.env` 文件(参考 `.env.example`):
1007
+
1008
+ ```bash
1009
+ OPENAI_API_KEY=your_openai_api_key
1010
+ OPENAI_BASE_URL=your_base_url # 可选
1011
+ OUTPUT_DIR=./output # 解析文档的默认输出目录
1012
+ PARSER=mineru # 解析器选择:mineru 或 docling
1013
+ PARSE_METHOD=auto # 解析方法:auto, ocr 或 txt
1014
+ ```
1015
+
1016
+ **注意:** 为了向后兼容,旧的环境变量名称仍然有效:
1017
+ - `MINERU_PARSE_METHOD` 已弃用,请使用 `PARSE_METHOD`
1018
+
1019
+ ### 解析器配置
1020
+
1021
+ RAGAnything 现在支持多种解析器,每种解析器都有其特定的优势:
1022
+
1023
+ #### MinerU 解析器
1024
+ - 支持PDF、图像、Office文档等多种格式
1025
+ - 强大的OCR和表格提取能力
1026
+ - 支持GPU加速
1027
+
1028
+ #### Docling 解析器
1029
+ - 专门优化Office文档和HTML文件的解析
1030
+ - 更好的文档结构保持
1031
+ - 原生支持多种Office格式
1032
+
1033
+ ### MinerU配置
1034
+
1035
+ ```bash
1036
+ # MinerU 2.0使用命令行参数而不是配置文件
1037
+ # 查看可用选项:
1038
+ mineru --help
1039
+
1040
+ # 常用配置:
1041
+ mineru -p input.pdf -o output_dir -m auto # 自动解析模式
1042
+ mineru -p input.pdf -o output_dir -m ocr # OCR重点解析
1043
+ mineru -p input.pdf -o output_dir -b pipeline --device cuda # GPU加速
1044
+ ```
1045
+
1046
+ 你也可以通过RAGAnything参数配置解析:
1047
+
1048
+ ```python
1049
+ # 基础解析配置和解析器选择
1050
+ await rag.process_document_complete(
1051
+ file_path="document.pdf",
1052
+ output_dir="./output/",
1053
+ parse_method="auto", # 或 "ocr", "txt"
1054
+ parser="mineru" # 可选:"mineru" 或 "docling"
1055
+ )
1056
+
1057
+ # 高级解析配置(包含特殊参数)
1058
+ await rag.process_document_complete(
1059
+ file_path="document.pdf",
1060
+ output_dir="./output/",
1061
+ parse_method="auto", # 解析方法:"auto", "ocr", "txt"
1062
+ parser="mineru", # 解析器选择:"mineru" 或 "docling"
1063
+
1064
+ # MinerU特殊参数 - 支持的所有kwargs:
1065
+ lang="ch", # 文档语言优化(如:"ch", "en", "ja")
1066
+ device="cuda:0", # 推理设备:"cpu", "cuda", "cuda:0", "npu", "mps"
1067
+ start_page=0, # 起始页码(0为基准,适用于PDF)
1068
+ end_page=10, # 结束页码(0为基准,适用于PDF)
1069
+ formula=True, # 启用公式解析
1070
+ table=True, # 启用表格解析
1071
+ backend="pipeline", # 解析后端:pipeline|vlm-transformers|vlm-sglang-engine|vlm-sglang-client
1072
+ source="huggingface", # 模型源:"huggingface", "modelscope", "local"
1073
+ # vlm_url="http://127.0.0.1:3000" # 当backend=vlm-sglang-client时,需指定服务地址
1074
+
1075
+ # RAGAnything标准参数
1076
+ display_stats=True, # 显示内容统计信息
1077
+ split_by_character=None, # 可选的文本分割字符
1078
+ doc_id=None # 可选的文档ID
1079
+ )
1080
+ ```
1081
+
1082
+ > **注意**:MinerU 2.0不再使用 `magic-pdf.json` 配置文件。所有设置现在通过命令行参数或函数参数传递。RAG-Anything现在支持多种文档解析器 - 你可以根据需要在MinerU和Docling之间选择。
1083
+
1084
+ ### 处理要求
1085
+
1086
+ 不同内容类型需要特定的可选依赖:
1087
+
1088
+ - **Office文档** (.doc, .docx, .ppt, .pptx, .xls, .xlsx): 安装并配置 [LibreOffice](https://www.libreoffice.org/download/download/)
1089
+ - **扩展图像格式** (.bmp, .tiff, .gif, .webp): 使用 `pip install raganything[image]` 安装
1090
+ - **文本文件** (.txt, .md): 使用 `pip install raganything[text]` 安装
1091
+
1092
+ > **📋 快速安装**: 使用 `pip install raganything[all]` 启用所有格式支持(仅Python依赖 - LibreOffice仍需单独安装)
1093
+
1094
+ ---
1095
+
1096
+ ## 🧪 支持的内容类型
1097
+
1098
+ ### 文档格式
1099
+
1100
+ - **PDF** - 研究论文、报告、演示文稿
1101
+ - **Office文档** - DOC、DOCX、PPT、PPTX、XLS、XLSX
1102
+ - **图像** - JPG、PNG、BMP、TIFF、GIF、WebP
1103
+ - **文本文件** - TXT、MD
1104
+
1105
+ ### 多模态元素
1106
+
1107
+ - **图像** - 照片、图表、示意图、截图
1108
+ - **表格** - 数据表、对比图、统计摘要
1109
+ - **公式** - LaTeX格式的数学公式
1110
+ - **通用内容** - 通过可扩展处理器支持的自定义内容类型
1111
+
1112
+ *格式特定依赖的安装说明请参见[配置](#-配置)部分。*
1113
+
1114
+ ---
1115
+
1116
+ ## 📖 引用
1117
+
1118
+ *学术参考*
1119
+
1120
+ <div align="center">
1121
+ <div style="width: 60px; height: 60px; margin: 20px auto; position: relative;">
1122
+ <div style="width: 100%; height: 100%; border: 2px solid #00d9ff; border-radius: 50%; position: relative;">
1123
+ <div style="position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); font-size: 24px; color: #00d9ff;">📖</div>
1124
+ </div>
1125
+ <div style="position: absolute; bottom: -5px; left: 50%; transform: translateX(-50%); width: 20px; height: 20px; background: white; border-right: 2px solid #00d9ff; border-bottom: 2px solid #00d9ff; transform: rotate(45deg);"></div>
1126
+ </div>
1127
+ </div>
1128
+
1129
+ ```bibtex
1130
+ @article{guo2024lightrag,
1131
+ title={LightRAG: Simple and Fast Retrieval-Augmented Generation},
1132
+ author={Zirui Guo and Lianghao Xia and Yanhua Yu and Tu Ao and Chao Huang},
1133
+ year={2024},
1134
+ eprint={2410.05779},
1135
+ archivePrefix={arXiv},
1136
+ primaryClass={cs.IR}
1137
+ }
1138
+ ```
1139
+
1140
+ ---
1141
+
1142
+ ## 🔗 相关项目
1143
+
1144
+ *生态系统与扩展*
1145
+
1146
+ <div align="center">
1147
+ <table>
1148
+ <tr>
1149
+ <td align="center">
1150
+ <a href="https://github.com/HKUDS/LightRAG">
1151
+ <div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
1152
+ <span style="font-size: 32px;">⚡</span>
1153
+ </div>
1154
+ <b>LightRAG</b><br>
1155
+ <sub>简单快速的RAG系统</sub>
1156
+ </a>
1157
+ </td>
1158
+ <td align="center">
1159
+ <a href="https://github.com/HKUDS/VideoRAG">
1160
+ <div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
1161
+ <span style="font-size: 32px;">🎥</span>
1162
+ </div>
1163
+ <b>VideoRAG</b><br>
1164
+ <sub>超长上下文视频RAG系统</sub>
1165
+ </a>
1166
+ </td>
1167
+ <td align="center">
1168
+ <a href="https://github.com/HKUDS/MiniRAG">
1169
+ <div style="width: 100px; height: 100px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2); display: flex; align-items: center; justify-content: center; margin-bottom: 10px;">
1170
+ <span style="font-size: 32px;">✨</span>
1171
+ </div>
1172
+ <b>MiniRAG</b><br>
1173
+ <sub>极简RAG系统</sub>
1174
+ </a>
1175
+ </td>
1176
+ </tr>
1177
+ </table>
1178
+ </div>
1179
+
1180
+ ---
1181
+
1182
+ ## ⭐ Star History
1183
+
1184
+ *社区增长轨迹*
1185
+
1186
+ <div align="center">
1187
+ <a href="https://star-history.com/#HKUDS/RAG-Anything&Date">
1188
+ <picture>
1189
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date&theme=dark" />
1190
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date" />
1191
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=HKUDS/RAG-Anything&type=Date" style="border-radius: 15px; box-shadow: 0 0 30px rgba(0, 217, 255, 0.3);" />
1192
+ </picture>
1193
+ </a>
1194
+ </div>
1195
+
1196
+ ---
1197
+
1198
+ ## 🤝 贡献者
1199
+
1200
+ *加入创新*
1201
+
1202
+ <div align="center">
1203
+ 感谢所有贡献者!
1204
+ </div>
1205
+
1206
+ <div align="center">
1207
+ <a href="https://github.com/HKUDS/RAG-Anything/graphs/contributors">
1208
+ <img src="https://contrib.rocks/image?repo=HKUDS/RAG-Anything" style="border-radius: 15px; box-shadow: 0 0 20px rgba(0, 217, 255, 0.3);" />
1209
+ </a>
1210
+ </div>
1211
+
1212
+ ---
1213
+
1214
+ <div align="center" style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; padding: 30px; margin: 30px 0;">
1215
+ <div>
1216
+ <img src="https://user-images.githubusercontent.com/74038190/212284100-561aa473-3905-4a80-b561-0d28506553ee.gif" width="500">
1217
+ </div>
1218
+ <div style="margin-top: 20px;">
1219
+ <a href="https://github.com/HKUDS/RAG-Anything" style="text-decoration: none;">
1220
+ <img src="https://img.shields.io/badge/⭐%20在GitHub上为我们点星-1a1a2e?style=for-the-badge&logo=github&logoColor=white">
1221
+ </a>
1222
+ <a href="https://github.com/HKUDS/RAG-Anything/issues" style="text-decoration: none;">
1223
+ <img src="https://img.shields.io/badge/🐛%20报告问题-ff6b6b?style=for-the-badge&logo=github&logoColor=white">
1224
+ </a>
1225
+ <a href="https://github.com/HKUDS/RAG-Anything/discussions" style="text-decoration: none;">
1226
+ <img src="https://img.shields.io/badge/💬%20讨论交流-4ecdc4?style=for-the-badge&logo=github&logoColor=white">
1227
+ </a>
1228
+ </div>
1229
+ </div>
1230
+
1231
+ <div align="center">
1232
+ <div style="width: 100%; max-width: 600px; margin: 20px auto; padding: 20px; background: linear-gradient(135deg, rgba(0, 217, 255, 0.1) 0%, rgba(0, 217, 255, 0.05) 100%); border-radius: 15px; border: 1px solid rgba(0, 217, 255, 0.2);">
1233
+ <div style="display: flex; justify-content: center; align-items: center; gap: 15px;">
1234
+ <span style="font-size: 24px;">⭐</span>
1235
+ <span style="color: #00d9ff; font-size: 18px;">感谢您访问RAG-Anything!</span>
1236
+ <span style="font-size: 24px;">⭐</span>
1237
+ </div>
1238
+ <div style="margin-top: 10px; color: #00d9ff; font-size: 16px;">构建多模态AI的未来</div>
1239
+ </div>
1240
+ </div>
1241
+
1242
+ <div align="center">
1243
+ <img src="https://readme-typing-svg.herokuapp.com?font=Orbitron&size=20&duration=3000&pause=1000&color=00D9FF&center=true&vCenter=true&width=600&lines=感谢您访问RAG-Anything!;构建多模态AI的未来;如果觉得有用请点星⭐!" alt="Closing Animation" />
1244
+ </div>
1245
+
1246
+ <style>
1247
+ @keyframes pulse {
1248
+ 0% { transform: scale(1); }
1249
+ 50% { transform: scale(1.05); }
1250
+ 100% { transform: scale(1); }
1251
+ }
1252
+
1253
+ @keyframes glow {
1254
+ 0% { box-shadow: 0 0 5px rgba(0, 217, 255, 0.5); }
1255
+ 50% { box-shadow: 0 0 20px rgba(0, 217, 255, 0.8); }
1256
+ 100% { box-shadow: 0 0 5px rgba(0, 217, 255, 0.5); }
1257
+ }
1258
+ </style>
rag_anything_smaranika/docs/batch_processing.md ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Batch Processing
2
+
3
+ This document describes the batch processing feature for RAG-Anything, which allows you to process multiple documents in parallel for improved throughput.
4
+
5
+ ## Overview
6
+
7
+ The batch processing feature allows you to process multiple documents concurrently, significantly improving throughput for large document collections. It provides parallel processing, progress tracking, error handling, and flexible configuration options.
8
+
9
+ ## Key Features
10
+
11
+ - **Parallel Processing**: Process multiple files concurrently using thread pools
12
+ - **Progress Tracking**: Real-time progress bars with `tqdm`
13
+ - **Error Handling**: Comprehensive error reporting and recovery
14
+ - **Flexible Input**: Support for files, directories, and recursive search
15
+ - **Configurable Workers**: Adjustable number of parallel workers
16
+ - **Installation Check Bypass**: Optional skip for environments with package conflicts
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ # Basic installation
22
+ pip install raganything[all]
23
+
24
+ # Required for batch processing
25
+ pip install tqdm
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ ### Basic Batch Processing
31
+
32
+ ```python
33
+ from raganything.batch_parser import BatchParser
34
+
35
+ # Create batch parser
36
+ batch_parser = BatchParser(
37
+ parser_type="mineru", # or "docling"
38
+ max_workers=4,
39
+ show_progress=True,
40
+ timeout_per_file=300,
41
+ skip_installation_check=False # Set to True if having parser installation issues
42
+ )
43
+
44
+ # Process multiple files
45
+ result = batch_parser.process_batch(
46
+ file_paths=["doc1.pdf", "doc2.docx", "folder/"],
47
+ output_dir="./batch_output",
48
+ parse_method="auto",
49
+ recursive=True
50
+ )
51
+
52
+ # Check results
53
+ print(result.summary())
54
+ print(f"Success rate: {result.success_rate:.1f}%")
55
+ print(f"Processing time: {result.processing_time:.2f} seconds")
56
+ ```
57
+
58
+ ### Asynchronous Batch Processing
59
+
60
+ ```python
61
+ import asyncio
62
+ from raganything.batch_parser import BatchParser
63
+
64
+ async def async_batch_processing():
65
+ batch_parser = BatchParser(
66
+ parser_type="mineru",
67
+ max_workers=4,
68
+ show_progress=True
69
+ )
70
+
71
+ # Process files asynchronously
72
+ result = await batch_parser.process_batch_async(
73
+ file_paths=["doc1.pdf", "doc2.docx"],
74
+ output_dir="./output",
75
+ parse_method="auto"
76
+ )
77
+
78
+ return result
79
+
80
+ # Run async processing
81
+ result = asyncio.run(async_batch_processing())
82
+ ```
83
+
84
+ ### Integration with RAG-Anything
85
+
86
+ ```python
87
+ from raganything import RAGAnything
88
+
89
+ rag = RAGAnything()
90
+
91
+ # Process documents with batch functionality
92
+ result = rag.process_documents_batch(
93
+ file_paths=["doc1.pdf", "doc2.docx"],
94
+ output_dir="./output",
95
+ max_workers=4,
96
+ show_progress=True
97
+ )
98
+
99
+ print(f"Processed {len(result.successful_files)} files successfully")
100
+ ```
101
+
102
+ ### Process Documents with RAG Integration
103
+
104
+ ```python
105
+ # Process documents in batch and then add them to RAG
106
+ result = await rag.process_documents_with_rag_batch(
107
+ file_paths=["doc1.pdf", "doc2.docx"],
108
+ output_dir="./output",
109
+ max_workers=4,
110
+ show_progress=True
111
+ )
112
+
113
+ print(f"Processed {result['successful_rag_files']} files with RAG")
114
+ print(f"Total processing time: {result['total_processing_time']:.2f} seconds")
115
+ ```
116
+
117
+ ### Command Line Interface
118
+
119
+ ```bash
120
+ # Basic batch processing
121
+ python -m raganything.batch_parser path/to/docs/ --output ./output --workers 4
122
+
123
+ # With specific parser
124
+ python -m raganything.batch_parser path/to/docs/ --parser mineru --method auto
125
+
126
+ # Without progress bar
127
+ python -m raganything.batch_parser path/to/docs/ --output ./output --no-progress
128
+
129
+ # Help
130
+ python -m raganything.batch_parser --help
131
+ ```
132
+
133
+ ## Configuration
134
+
135
+ ### Environment Variables
136
+
137
+ ```env
138
+ # Batch processing configuration
139
+ MAX_CONCURRENT_FILES=4
140
+ SUPPORTED_FILE_EXTENSIONS=.pdf,.docx,.doc,.pptx,.ppt,.xlsx,.xls,.txt,.md
141
+ RECURSIVE_FOLDER_PROCESSING=true
142
+ PARSER_OUTPUT_DIR=./parsed_output
143
+ ```
144
+
145
+ ### BatchParser Parameters
146
+
147
+ - **parser_type**: `"mineru"` or `"docling"` (default: `"mineru"`)
148
+ - **max_workers**: Number of parallel workers (default: `4`)
149
+ - **show_progress**: Show progress bar (default: `True`)
150
+ - **timeout_per_file**: Timeout per file in seconds (default: `300`)
151
+ - **skip_installation_check**: Skip parser installation check (default: `False`)
152
+
153
+ ## Supported File Types
154
+
155
+ - **PDF files**: `.pdf`
156
+ - **Office documents**: `.doc`, `.docx`, `.ppt`, `.pptx`, `.xls`, `.xlsx`
157
+ - **Images**: `.png`, `.jpg`, `.jpeg`, `.bmp`, `.tiff`, `.tif`, `.gif`, `.webp`
158
+ - **Text files**: `.txt`, `.md`
159
+
160
+ ## API Reference
161
+
162
+ ### BatchProcessingResult
163
+
164
+ ```python
165
+ @dataclass
166
+ class BatchProcessingResult:
167
+ successful_files: List[str] # Successfully processed files
168
+ failed_files: List[str] # Failed files
169
+ total_files: int # Total number of files
170
+ processing_time: float # Total processing time in seconds
171
+ errors: Dict[str, str] # Error messages for failed files
172
+ output_dir: str # Output directory used
173
+
174
+ def summary(self) -> str: # Human-readable summary
175
+ def success_rate(self) -> float: # Success rate as percentage
176
+ ```
177
+
178
+ ### BatchParser Methods
179
+
180
+ ```python
181
+ class BatchParser:
182
+ def __init__(self, parser_type: str = "mineru", max_workers: int = 4, ...):
183
+ """Initialize batch parser"""
184
+
185
+ def get_supported_extensions(self) -> List[str]:
186
+ """Get list of supported file extensions"""
187
+
188
+ def filter_supported_files(self, file_paths: List[str], recursive: bool = True) -> List[str]:
189
+ """Filter files to only supported types"""
190
+
191
+ def process_batch(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
192
+ """Process files in batch"""
193
+
194
+ async def process_batch_async(self, file_paths: List[str], output_dir: str, ...) -> BatchProcessingResult:
195
+ """Process files in batch asynchronously"""
196
+ ```
197
+
198
+ ## Performance Considerations
199
+
200
+ ### Memory Usage
201
+ - Each worker uses additional memory
202
+ - Recommended: 2-4 workers for most systems
203
+ - Monitor memory usage with large files
204
+
205
+ ### CPU Usage
206
+ - Parallel processing utilizes multiple cores
207
+ - Optimal worker count depends on CPU cores and file sizes
208
+ - I/O may become bottleneck with many small files
209
+
210
+ ### Recommended Settings
211
+ - **Small files** (< 1MB): Higher worker count (6-8)
212
+ - **Large files** (> 100MB): Lower worker count (2-3)
213
+ - **Mixed sizes**: Start with 4 workers and adjust
214
+
215
+ ## Troubleshooting
216
+
217
+ ### Common Issues
218
+
219
+ #### Memory Errors
220
+ ```python
221
+ # Solution: Reduce max_workers
222
+ batch_parser = BatchParser(max_workers=2)
223
+ ```
224
+
225
+ #### Timeout Errors
226
+ ```python
227
+ # Solution: Increase timeout_per_file
228
+ batch_parser = BatchParser(timeout_per_file=600) # 10 minutes
229
+ ```
230
+
231
+ #### Parser Installation Issues
232
+ ```python
233
+ # Solution: Skip installation check
234
+ batch_parser = BatchParser(skip_installation_check=True)
235
+ ```
236
+
237
+ #### File Not Found Errors
238
+ - Check file paths and permissions
239
+ - Ensure input files exist
240
+ - Verify directory access rights
241
+
242
+ ### Debug Mode
243
+
244
+ Enable debug logging for detailed information:
245
+
246
+ ```python
247
+ import logging
248
+ logging.basicConfig(level=logging.DEBUG)
249
+
250
+ # Create batch parser with debug logging
251
+ batch_parser = BatchParser(parser_type="mineru", max_workers=2)
252
+ ```
253
+
254
+ ### Error Handling
255
+
256
+ The batch processor provides comprehensive error handling:
257
+
258
+ ```python
259
+ result = batch_parser.process_batch(file_paths=["doc1.pdf", "doc2.docx"])
260
+
261
+ # Check for errors
262
+ if result.failed_files:
263
+ print("Failed files:")
264
+ for file_path in result.failed_files:
265
+ error_message = result.errors.get(file_path, "Unknown error")
266
+ print(f" - {file_path}: {error_message}")
267
+
268
+ # Process only successful files
269
+ for file_path in result.successful_files:
270
+ print(f"Successfully processed: {file_path}")
271
+ ```
272
+
273
+ ## Examples
274
+
275
+ ### Process Entire Directory
276
+
277
+ ```python
278
+ from pathlib import Path
279
+
280
+ # Process all supported files in a directory
281
+ batch_parser = BatchParser(max_workers=4)
282
+ directory_path = Path("./documents")
283
+
284
+ result = batch_parser.process_batch(
285
+ file_paths=[str(directory_path)],
286
+ output_dir="./processed",
287
+ recursive=True # Include subdirectories
288
+ )
289
+
290
+ print(f"Processed {len(result.successful_files)} out of {result.total_files} files")
291
+ ```
292
+
293
+ ### Filter Files Before Processing
294
+
295
+ ```python
296
+ # Get all files in directory
297
+ all_files = ["doc1.pdf", "image.png", "spreadsheet.xlsx", "unsupported.xyz"]
298
+
299
+ # Filter to supported files only
300
+ supported_files = batch_parser.filter_supported_files(all_files)
301
+ print(f"Will process {len(supported_files)} out of {len(all_files)} files")
302
+
303
+ # Process only supported files
304
+ result = batch_parser.process_batch(
305
+ file_paths=supported_files,
306
+ output_dir="./output"
307
+ )
308
+ ```
309
+
310
+ ### Custom Error Handling
311
+
312
+ ```python
313
+ def process_with_retry(file_paths, max_retries=3):
314
+ """Process files with retry logic"""
315
+
316
+ for attempt in range(max_retries):
317
+ result = batch_parser.process_batch(file_paths, "./output")
318
+
319
+ if not result.failed_files:
320
+ break # All files processed successfully
321
+
322
+ print(f"Attempt {attempt + 1}: {len(result.failed_files)} files failed")
323
+ file_paths = result.failed_files # Retry failed files
324
+
325
+ return result
326
+ ```
327
+
328
+ ## Best Practices
329
+
330
+ 1. **Start with default settings** and adjust based on performance
331
+ 2. **Monitor system resources** during batch processing
332
+ 3. **Use appropriate worker counts** for your hardware
333
+ 4. **Handle errors gracefully** with retry logic
334
+ 5. **Test with small batches** before processing large collections
335
+ 6. **Use skip_installation_check** if facing parser installation issues
336
+ 7. **Enable progress tracking** for long-running operations
337
+ 8. **Set appropriate timeouts** based on expected file processing times
338
+
339
+ ## Conclusion
340
+
341
+ The batch processing feature significantly improves RAG-Anything's throughput for large document collections. It provides flexible configuration options, comprehensive error handling, and seamless integration with the existing RAG-Anything pipeline.
rag_anything_smaranika/docs/context_aware_processing.md ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Context-Aware Multimodal Processing in RAGAnything
2
+
3
+ This document describes the context-aware multimodal processing feature in RAGAnything, which provides surrounding content information to LLMs when analyzing images, tables, equations, and other multimodal content for enhanced accuracy and relevance.
4
+
5
+ ## Overview
6
+
7
+ The context-aware feature enables RAGAnything to automatically extract and provide surrounding text content as context when processing multimodal content. This leads to more accurate and contextually relevant analysis by giving AI models additional information about where the content appears in the document structure.
8
+
9
+ ### Key Benefits
10
+
11
+ - **Enhanced Accuracy**: Context helps AI understand the purpose and meaning of multimodal content
12
+ - **Semantic Coherence**: Generated descriptions align with document context and terminology
13
+ - **Automated Integration**: Context extraction is automatically enabled during document processing
14
+ - **Flexible Configuration**: Multiple extraction modes and filtering options
15
+
16
+ ## Key Features
17
+
18
+ ### 1. Configuration Support
19
+ - **Integrated Configuration**: Complete context options in `RAGAnythingConfig`
20
+ - **Environment Variables**: Configure all context parameters via environment variables
21
+ - **Dynamic Updates**: Runtime configuration updates supported
22
+ - **Content Format Control**: Configurable content source format detection
23
+
24
+ ### 2. Automated Integration
25
+ - **Auto-Initialization**: Modal processors automatically receive tokenizer and context configuration
26
+ - **Content Source Setup**: Document processing automatically sets content sources for context extraction
27
+ - **Position Information**: Automatic position info (page_idx, index) passed to processors
28
+ - **Batch Processing**: Context-aware batch processing for efficient document handling
29
+
30
+ ### 3. Advanced Token Management
31
+ - **Accurate Token Counting**: Uses LightRAG's tokenizer for precise token calculation
32
+ - **Smart Boundary Preservation**: Truncates at sentence/paragraph boundaries
33
+ - **Backward Compatibility**: Fallback to character truncation when tokenizer unavailable
34
+
35
+ ### 4. Universal Context Extraction
36
+ - **Multiple Formats**: Support for MinerU, plain text, custom formats
37
+ - **Flexible Modes**: Page-based and chunk-based context extraction
38
+ - **Content Filtering**: Configurable content type filtering
39
+ - **Header Support**: Optional inclusion of document headers and structure
40
+
41
+ ## Configuration
42
+
43
+ ### RAGAnythingConfig Parameters
44
+
45
+ ```python
46
+ # Context Extraction Configuration
47
+ context_window: int = 1 # Context window size (pages/chunks)
48
+ context_mode: str = "page" # Context mode ("page" or "chunk")
49
+ max_context_tokens: int = 2000 # Maximum context tokens
50
+ include_headers: bool = True # Include document headers
51
+ include_captions: bool = True # Include image/table captions
52
+ context_filter_content_types: List[str] = ["text"] # Content types to include
53
+ content_format: str = "minerU" # Default content format for context extraction
54
+ ```
55
+
56
+ ### Environment Variables
57
+
58
+ ```bash
59
+ # Context extraction settings
60
+ CONTEXT_WINDOW=2
61
+ CONTEXT_MODE=page
62
+ MAX_CONTEXT_TOKENS=3000
63
+ INCLUDE_HEADERS=true
64
+ INCLUDE_CAPTIONS=true
65
+ CONTEXT_FILTER_CONTENT_TYPES=text,image
66
+ CONTENT_FORMAT=minerU
67
+ ```
68
+
69
+ ## Usage Guide
70
+
71
+ ### 1. Basic Configuration
72
+
73
+ ```python
74
+ from raganything import RAGAnything, RAGAnythingConfig
75
+
76
+ # Create configuration with context settings
77
+ config = RAGAnythingConfig(
78
+ context_window=2,
79
+ context_mode="page",
80
+ max_context_tokens=3000,
81
+ include_headers=True,
82
+ include_captions=True,
83
+ context_filter_content_types=["text", "image"],
84
+ content_format="minerU"
85
+ )
86
+
87
+ # Create RAGAnything instance
88
+ rag_anything = RAGAnything(
89
+ config=config,
90
+ llm_model_func=your_llm_function,
91
+ embedding_func=your_embedding_function
92
+ )
93
+ ```
94
+
95
+ ### 2. Automatic Document Processing
96
+
97
+ ```python
98
+ # Context is automatically enabled during document processing
99
+ await rag_anything.process_document_complete("document.pdf")
100
+ ```
101
+
102
+ ### 3. Manual Content Source Configuration
103
+
104
+ ```python
105
+ # Set content source for specific content lists
106
+ rag_anything.set_content_source_for_context(content_list, "minerU")
107
+
108
+ # Update context configuration at runtime
109
+ rag_anything.update_context_config(
110
+ context_window=1,
111
+ max_context_tokens=1500,
112
+ include_captions=False
113
+ )
114
+ ```
115
+
116
+ ### 4. Direct Modal Processor Usage
117
+
118
+ ```python
119
+ from raganything.modalprocessors import (
120
+ ContextExtractor,
121
+ ContextConfig,
122
+ ImageModalProcessor
123
+ )
124
+
125
+ # Configure context extraction
126
+ config = ContextConfig(
127
+ context_window=1,
128
+ context_mode="page",
129
+ max_context_tokens=2000,
130
+ include_headers=True,
131
+ include_captions=True,
132
+ filter_content_types=["text"]
133
+ )
134
+
135
+ # Initialize context extractor
136
+ context_extractor = ContextExtractor(config)
137
+
138
+ # Initialize modal processor with context support
139
+ processor = ImageModalProcessor(lightrag, caption_func, context_extractor)
140
+
141
+ # Set content source
142
+ processor.set_content_source(content_list, "minerU")
143
+
144
+ # Process with context
145
+ item_info = {
146
+ "page_idx": 2,
147
+ "index": 5,
148
+ "type": "image"
149
+ }
150
+
151
+ result = await processor.process_multimodal_content(
152
+ modal_content=image_data,
153
+ content_type="image",
154
+ file_path="document.pdf",
155
+ entity_name="Architecture Diagram",
156
+ item_info=item_info
157
+ )
158
+ ```
159
+
160
+ ## Context Modes
161
+
162
+ ### Page-Based Context (`context_mode="page"`)
163
+ - Extracts context based on page boundaries
164
+ - Uses `page_idx` field from content items
165
+ - Suitable for document-structured content
166
+ - Example: Include text from 2 pages before and after current image
167
+
168
+ ### Chunk-Based Context (`context_mode="chunk"`)
169
+ - Extracts context based on content item positions
170
+ - Uses sequential position in content list
171
+ - Suitable for fine-grained control
172
+ - Example: Include 5 content items before and after current table
173
+
174
+ ## Processing Workflow
175
+
176
+ ### 1. Document Parsing
177
+ ```
178
+ Document Input → MinerU Parsing → content_list Generation
179
+ ```
180
+
181
+ ### 2. Context Setup
182
+ ```
183
+ content_list → Set as Context Source → All Modal Processors Gain Context Capability
184
+ ```
185
+
186
+ ### 3. Multimodal Processing
187
+ ```
188
+ Multimodal Content → Extract Surrounding Context → Enhanced LLM Analysis → More Accurate Results
189
+ ```
190
+
191
+ ## Content Source Formats
192
+
193
+ ### MinerU Format
194
+ ```json
195
+ [
196
+ {
197
+ "type": "text",
198
+ "text": "Document content here...",
199
+ "text_level": 1,
200
+ "page_idx": 0
201
+ },
202
+ {
203
+ "type": "image",
204
+ "img_path": "images/figure1.jpg",
205
+ "image_caption": ["Figure 1: Architecture"],
206
+ "image_footnote": [],
207
+ "page_idx": 1
208
+ }
209
+ ]
210
+ ```
211
+
212
+ ### Custom Text Chunks
213
+ ```python
214
+ text_chunks = [
215
+ "First chunk of text content...",
216
+ "Second chunk of text content...",
217
+ "Third chunk of text content..."
218
+ ]
219
+ ```
220
+
221
+ ### Plain Text
222
+ ```python
223
+ full_document = "Complete document text with all content..."
224
+ ```
225
+
226
+ ## Configuration Examples
227
+
228
+ ### High-Precision Context
229
+ For focused analysis with minimal context:
230
+ ```python
231
+ config = RAGAnythingConfig(
232
+ context_window=1,
233
+ context_mode="page",
234
+ max_context_tokens=1000,
235
+ include_headers=True,
236
+ include_captions=False,
237
+ context_filter_content_types=["text"]
238
+ )
239
+ ```
240
+
241
+ ### Comprehensive Context
242
+ For broad analysis with rich context:
243
+ ```python
244
+ config = RAGAnythingConfig(
245
+ context_window=2,
246
+ context_mode="page",
247
+ max_context_tokens=3000,
248
+ include_headers=True,
249
+ include_captions=True,
250
+ context_filter_content_types=["text", "image", "table"]
251
+ )
252
+ ```
253
+
254
+ ### Chunk-Based Analysis
255
+ For fine-grained sequential context:
256
+ ```python
257
+ config = RAGAnythingConfig(
258
+ context_window=5,
259
+ context_mode="chunk",
260
+ max_context_tokens=2000,
261
+ include_headers=False,
262
+ include_captions=False,
263
+ context_filter_content_types=["text"]
264
+ )
265
+ ```
266
+
267
+ ## Performance Optimization
268
+
269
+ ### 1. Accurate Token Control
270
+ - Uses real tokenizer for precise token counting
271
+ - Avoids exceeding LLM token limits
272
+ - Provides consistent performance
273
+
274
+ ### 2. Smart Truncation
275
+ - Truncates at sentence boundaries
276
+ - Maintains semantic integrity
277
+ - Adds truncation indicators
278
+
279
+ ### 3. Caching Optimization
280
+ - Context extraction results can be reused
281
+ - Reduces redundant computation overhead
282
+
283
+ ## Advanced Features
284
+
285
+ ### Context Truncation
286
+ The system automatically truncates context to fit within token limits:
287
+ - Uses actual tokenizer for accurate token counting
288
+ - Attempts to end at sentence boundaries (periods)
289
+ - Falls back to line boundaries if needed
290
+ - Adds "..." indicator for truncated content
291
+
292
+ ### Header Formatting
293
+ When `include_headers=True`, headers are formatted with markdown-style prefixes:
294
+ ```
295
+ # Level 1 Header
296
+ ## Level 2 Header
297
+ ### Level 3 Header
298
+ ```
299
+
300
+ ### Caption Integration
301
+ When `include_captions=True`, image and table captions are included as:
302
+ ```
303
+ [Image: Figure 1 caption text]
304
+ [Table: Table 1 caption text]
305
+ ```
306
+
307
+ ## Integration with RAGAnything
308
+
309
+ The context-aware feature is seamlessly integrated into RAGAnything's workflow:
310
+
311
+ 1. **Automatic Setup**: Context extractors are automatically created and configured
312
+ 2. **Content Source Management**: Document processing automatically sets content sources
313
+ 3. **Processor Integration**: All modal processors receive context capabilities
314
+ 4. **Configuration Consistency**: Single configuration system for all context settings
315
+
316
+ ## Error Handling
317
+
318
+ The system includes robust error handling:
319
+ - Gracefully handles missing or invalid content sources
320
+ - Returns empty context for unsupported formats
321
+ - Logs warnings for configuration issues
322
+ - Continues processing even if context extraction fails
323
+
324
+ ## Compatibility
325
+
326
+ - **Backward Compatible**: Existing code works without modification
327
+ - **Optional Feature**: Context can be selectively enabled/disabled
328
+ - **Flexible Configuration**: Supports multiple configuration combinations
329
+
330
+ ## Best Practices
331
+
332
+ 1. **Token Limits**: Ensure `max_context_tokens` doesn't exceed LLM context limits
333
+ 2. **Performance Impact**: Larger context windows increase processing time
334
+ 3. **Content Quality**: Context quality directly affects analysis accuracy
335
+ 4. **Window Size**: Match window size to content structure (documents vs articles)
336
+ 5. **Content Filtering**: Use `context_filter_content_types` to reduce noise
337
+
338
+ ## Troubleshooting
339
+
340
+ ### Common Issues
341
+
342
+ **Context Not Extracted**
343
+ - Check if `set_content_source_for_context()` was called
344
+ - Verify `item_info` contains required fields (`page_idx`, `index`)
345
+ - Confirm content source format is correct
346
+
347
+ **Context Too Long/Short**
348
+ - Adjust `max_context_tokens` setting
349
+ - Modify `context_window` size
350
+ - Check `context_filter_content_types` configuration
351
+
352
+ **Irrelevant Context**
353
+ - Refine `context_filter_content_types` to exclude noise
354
+ - Reduce `context_window` size
355
+ - Set `include_captions=False` if captions are not helpful
356
+
357
+ **Configuration Issues**
358
+ - Verify environment variables are set correctly
359
+ - Check RAGAnythingConfig parameter names
360
+ - Ensure content_format matches your data source
361
+
362
+ ## Examples
363
+
364
+ Check out these example files for complete usage demonstrations:
365
+
366
+ - **Configuration Examples**: See how to set up different context configurations
367
+ - **Integration Examples**: Learn how to integrate context-aware processing into your workflow
368
+ - **Custom Processors**: Examples of creating custom modal processors with context support
369
+
370
+ ## API Reference
371
+
372
+ For detailed API documentation, see the docstrings in:
373
+ - `raganything/modalprocessors.py` - Context extraction and modal processors
374
+ - `raganything/config.py` - Configuration options
375
+ - `raganything/raganything.py` - Main RAGAnything class integration
rag_anything_smaranika/docs/enhanced_markdown.md ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Enhanced Markdown Conversion
2
+
3
+ This document describes the enhanced markdown conversion feature for RAG-Anything, which provides high-quality PDF generation from markdown files with multiple backend options and advanced styling.
4
+
5
+ ## Overview
6
+
7
+ The enhanced markdown conversion feature provides professional-quality PDF generation from markdown files. It supports multiple conversion backends, advanced styling options, syntax highlighting, and seamless integration with RAG-Anything's document processing pipeline.
8
+
9
+ ## Key Features
10
+
11
+ - **Multiple Backends**: WeasyPrint, Pandoc, and automatic backend selection
12
+ - **Advanced Styling**: Custom CSS, syntax highlighting, and professional layouts
13
+ - **Image Support**: Embedded images with proper scaling and positioning
14
+ - **Table Support**: Formatted tables with borders and professional styling
15
+ - **Code Highlighting**: Syntax highlighting for code blocks using Pygments
16
+ - **Custom Templates**: Support for custom CSS and document templates
17
+ - **Table of Contents**: Automatic TOC generation with navigation links
18
+ - **Professional Typography**: High-quality fonts and spacing
19
+
20
+ ## Installation
21
+
22
+ ### Required Dependencies
23
+
24
+ ```bash
25
+ # Basic installation
26
+ pip install raganything[all]
27
+
28
+ # Required for enhanced markdown conversion
29
+ pip install markdown weasyprint pygments
30
+ ```
31
+
32
+ ### Optional Dependencies
33
+
34
+ ```bash
35
+ # For Pandoc backend (system installation required)
36
+ # Ubuntu/Debian:
37
+ sudo apt-get install pandoc wkhtmltopdf
38
+
39
+ # macOS:
40
+ brew install pandoc wkhtmltopdf
41
+
42
+ # Or using conda:
43
+ conda install -c conda-forge pandoc wkhtmltopdf
44
+ ```
45
+
46
+ ### Backend-Specific Installation
47
+
48
+ #### WeasyPrint (Recommended)
49
+ ```bash
50
+ # Install WeasyPrint with system dependencies
51
+ pip install weasyprint
52
+
53
+ # Ubuntu/Debian system dependencies:
54
+ sudo apt-get install -y build-essential python3-dev python3-pip \
55
+ python3-setuptools python3-wheel python3-cffi libcairo2 \
56
+ libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \
57
+ libffi-dev shared-mime-info
58
+ ```
59
+
60
+ #### Pandoc
61
+ - Download from: https://pandoc.org/installing.html
62
+ - Requires system-wide installation
63
+ - Used for complex document structures and LaTeX-quality output
64
+
65
+ ## Usage
66
+
67
+ ### Basic Conversion
68
+
69
+ ```python
70
+ from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig
71
+
72
+ # Create converter with default settings
73
+ converter = EnhancedMarkdownConverter()
74
+
75
+ # Convert markdown file to PDF
76
+ success = converter.convert_file_to_pdf(
77
+ input_path="document.md",
78
+ output_path="document.pdf",
79
+ method="auto" # Automatically select best available backend
80
+ )
81
+
82
+ if success:
83
+ print("✅ Conversion successful!")
84
+ else:
85
+ print("❌ Conversion failed")
86
+ ```
87
+
88
+ ### Advanced Configuration
89
+
90
+ ```python
91
+ # Create custom configuration
92
+ config = MarkdownConfig(
93
+ page_size="A4", # A4, Letter, Legal, etc.
94
+ margin="1in", # CSS-style margins
95
+ font_size="12pt", # Base font size
96
+ line_height="1.5", # Line spacing
97
+ include_toc=True, # Generate table of contents
98
+ syntax_highlighting=True, # Enable code syntax highlighting
99
+
100
+ # Custom CSS styling
101
+ custom_css="""
102
+ body {
103
+ font-family: 'Georgia', serif;
104
+ color: #333;
105
+ }
106
+ h1 {
107
+ color: #2c3e50;
108
+ border-bottom: 2px solid #3498db;
109
+ padding-bottom: 0.3em;
110
+ }
111
+ code {
112
+ background-color: #f8f9fa;
113
+ padding: 2px 4px;
114
+ border-radius: 3px;
115
+ }
116
+ pre {
117
+ background-color: #f8f9fa;
118
+ border-left: 4px solid #3498db;
119
+ padding: 15px;
120
+ border-radius: 5px;
121
+ }
122
+ table {
123
+ border-collapse: collapse;
124
+ width: 100%;
125
+ margin: 1em 0;
126
+ }
127
+ th, td {
128
+ border: 1px solid #ddd;
129
+ padding: 8px 12px;
130
+ text-align: left;
131
+ }
132
+ th {
133
+ background-color: #f2f2f2;
134
+ font-weight: bold;
135
+ }
136
+ """
137
+ )
138
+
139
+ converter = EnhancedMarkdownConverter(config)
140
+ ```
141
+
142
+ ### Backend Selection
143
+
144
+ ```python
145
+ # Check available backends
146
+ converter = EnhancedMarkdownConverter()
147
+ backend_info = converter.get_backend_info()
148
+
149
+ print("Available backends:")
150
+ for backend, available in backend_info["available_backends"].items():
151
+ status = "✅" if available else "❌"
152
+ print(f" {status} {backend}")
153
+
154
+ print(f"Recommended backend: {backend_info['recommended_backend']}")
155
+
156
+ # Use specific backend
157
+ converter.convert_file_to_pdf(
158
+ input_path="document.md",
159
+ output_path="document.pdf",
160
+ method="weasyprint" # or "pandoc", "pandoc_system", "auto"
161
+ )
162
+ ```
163
+
164
+ ### Content Conversion
165
+
166
+ ```python
167
+ # Convert markdown content directly (not from file)
168
+ markdown_content = """
169
+ # Sample Document
170
+
171
+ ## Introduction
172
+ This is a **bold** statement with *italic* text.
173
+
174
+ ## Code Example
175
+ ```python
176
+ def hello_world():
177
+ print("Hello, World!")
178
+ return "Success"
179
+ ```
180
+
181
+ ## Table
182
+ | Feature | Status | Notes |
183
+ |---------|--------|-------|
184
+ | PDF Generation | ✅ | Working |
185
+ | Syntax Highlighting | ✅ | Pygments |
186
+ | Custom CSS | ✅ | Full support |
187
+ """
188
+
189
+ success = converter.convert_markdown_to_pdf(
190
+ markdown_content=markdown_content,
191
+ output_path="sample.pdf",
192
+ method="auto"
193
+ )
194
+ ```
195
+
196
+ ### Command Line Interface
197
+
198
+ ```bash
199
+ # Basic conversion
200
+ python -m raganything.enhanced_markdown document.md --output document.pdf
201
+
202
+ # With specific backend
203
+ python -m raganything.enhanced_markdown document.md --method weasyprint
204
+
205
+ # With custom CSS file
206
+ python -m raganything.enhanced_markdown document.md --css custom_style.css
207
+
208
+ # Show backend information
209
+ python -m raganything.enhanced_markdown --info
210
+
211
+ # Help
212
+ python -m raganything.enhanced_markdown --help
213
+ ```
214
+
215
+ ## Backend Comparison
216
+
217
+ | Backend | Pros | Cons | Best For | Quality |
218
+ |---------|------|------|----------|---------|
219
+ | **WeasyPrint** | • Excellent CSS support<br>• Fast rendering<br>• Great web-style layouts<br>• Python-based | • Limited LaTeX features<br>• Requires system deps | • Web-style documents<br>• Custom styling<br>• Fast conversion | ⭐⭐⭐⭐ |
220
+ | **Pandoc** | • Extensive features<br>• LaTeX-quality output<br>• Academic formatting<br>• Many input/output formats | • Slower conversion<br>• System installation<br>• Complex setup | • Academic papers<br>• Complex documents<br>• Publication quality | ⭐⭐⭐⭐⭐ |
221
+ | **Auto** | • Automatic selection<br>• Fallback support<br>• User-friendly | • May not use optimal backend | • General use<br>• Quick setup<br>• Development | ⭐⭐⭐⭐ |
222
+
223
+ ## Configuration Options
224
+
225
+ ### MarkdownConfig Parameters
226
+
227
+ ```python
228
+ @dataclass
229
+ class MarkdownConfig:
230
+ # Page layout
231
+ page_size: str = "A4" # A4, Letter, Legal, A3, etc.
232
+ margin: str = "1in" # CSS margin format
233
+ font_size: str = "12pt" # Base font size
234
+ line_height: str = "1.5" # Line spacing multiplier
235
+
236
+ # Content options
237
+ include_toc: bool = True # Generate table of contents
238
+ syntax_highlighting: bool = True # Enable code highlighting
239
+ image_max_width: str = "100%" # Maximum image width
240
+ table_style: str = "..." # Default table CSS
241
+
242
+ # Styling
243
+ css_file: Optional[str] = None # External CSS file path
244
+ custom_css: Optional[str] = None # Inline CSS content
245
+ template_file: Optional[str] = None # Custom HTML template
246
+
247
+ # Output options
248
+ output_format: str = "pdf" # Currently only PDF supported
249
+ output_dir: Optional[str] = None # Output directory
250
+
251
+ # Metadata
252
+ metadata: Optional[Dict[str, str]] = None # Document metadata
253
+ ```
254
+
255
+ ### Supported Markdown Features
256
+
257
+ #### Basic Formatting
258
+ - **Headers**: `# ## ### #### ##### ######`
259
+ - **Emphasis**: `*italic*`, `**bold**`, `***bold italic***`
260
+ - **Links**: `[text](url)`, `[text][ref]`
261
+ - **Images**: `![alt](url)`, `![alt][ref]`
262
+ - **Lists**: Ordered and unordered, nested
263
+ - **Blockquotes**: `> quote`
264
+ - **Line breaks**: Double space or `\n\n`
265
+
266
+ #### Advanced Features
267
+ - **Tables**: GitHub-style tables with alignment
268
+ - **Code blocks**: Fenced code blocks with language specification
269
+ - **Inline code**: `backtick code`
270
+ - **Horizontal rules**: `---` or `***`
271
+ - **Footnotes**: `[^1]` references
272
+ - **Definition lists**: Term and definition pairs
273
+ - **Attributes**: `{#id .class key=value}`
274
+
275
+ #### Code Highlighting
276
+
277
+ ```markdown
278
+ ```python
279
+ def example_function():
280
+ """This will be syntax highlighted"""
281
+ return "Hello, World!"
282
+ ```
283
+
284
+ ```javascript
285
+ function exampleFunction() {
286
+ // This will also be highlighted
287
+ return "Hello, World!";
288
+ }
289
+ ```
290
+ ```
291
+
292
+ ## Integration with RAG-Anything
293
+
294
+ The enhanced markdown conversion integrates seamlessly with RAG-Anything:
295
+
296
+ ```python
297
+ from raganything import RAGAnything
298
+
299
+ # Initialize RAG-Anything
300
+ rag = RAGAnything()
301
+
302
+ # Process markdown files - enhanced conversion is used automatically
303
+ await rag.process_document_complete("document.md")
304
+
305
+ # Batch processing with enhanced markdown conversion
306
+ result = rag.process_documents_batch(
307
+ file_paths=["doc1.md", "doc2.md", "doc3.md"],
308
+ output_dir="./output"
309
+ )
310
+
311
+ # The .md files will be converted to PDF using enhanced conversion
312
+ # before being processed by the RAG system
313
+ ```
314
+
315
+ ## Performance Considerations
316
+
317
+ ### Conversion Speed
318
+ - **WeasyPrint**: ~1-3 seconds for typical documents
319
+ - **Pandoc**: ~3-10 seconds for typical documents
320
+ - **Large documents**: Time scales roughly linearly with content
321
+
322
+ ### Memory Usage
323
+ - **WeasyPrint**: ~50-100MB per conversion
324
+ - **Pandoc**: ~100-200MB per conversion
325
+ - **Images**: Large images increase memory usage significantly
326
+
327
+ ### Optimization Tips
328
+ 1. **Resize large images** before embedding
329
+ 2. **Use compressed images** (JPEG for photos, PNG for graphics)
330
+ 3. **Limit concurrent conversions** to avoid memory issues
331
+ 4. **Cache converted content** when processing multiple times
332
+
333
+ ## Examples
334
+
335
+ ### Sample Markdown Document
336
+
337
+ ```markdown
338
+ # Technical Documentation
339
+
340
+ ## Table of Contents
341
+ [TOC]
342
+
343
+ ## Overview
344
+ This document provides comprehensive technical specifications.
345
+
346
+ ## Architecture
347
+
348
+ ### System Components
349
+ 1. **Parser Engine**: Handles document processing
350
+ 2. **Storage Layer**: Manages data persistence
351
+ 3. **Query Interface**: Provides search capabilities
352
+
353
+ ### Code Implementation
354
+ ```python
355
+ from raganything import RAGAnything
356
+
357
+ # Initialize system
358
+ rag = RAGAnything(config={
359
+ "working_dir": "./storage",
360
+ "enable_image_processing": True
361
+ })
362
+
363
+ # Process document
364
+ await rag.process_document_complete("document.pdf")
365
+ ```
366
+
367
+ ### Performance Metrics
368
+
369
+ | Component | Throughput | Latency | Memory |
370
+ |-----------|------------|---------|--------|
371
+ | Parser | 100 docs/hour | 36s avg | 2.5 GB |
372
+ | Storage | 1000 ops/sec | 1ms avg | 512 MB |
373
+ | Query | 50 queries/sec | 20ms avg | 1 GB |
374
+
375
+ ## Integration Notes
376
+
377
+ > **Important**: Always validate input before processing.
378
+
379
+ ## Conclusion
380
+ The enhanced system provides excellent performance for document processing workflows.
381
+ ```
382
+
383
+ ### Generated PDF Features
384
+
385
+ The enhanced markdown converter produces PDFs with:
386
+
387
+ - **Professional typography** with proper font selection and spacing
388
+ - **Syntax-highlighted code blocks** using Pygments
389
+ - **Formatted tables** with borders and alternating row colors
390
+ - **Clickable table of contents** with navigation links
391
+ - **Responsive images** that scale appropriately
392
+ - **Custom styling** through CSS
393
+ - **Proper page breaks** and margins
394
+ - **Document metadata** and properties
395
+
396
+ ## Troubleshooting
397
+
398
+ ### Common Issues
399
+
400
+ #### WeasyPrint Installation Problems
401
+ ```bash
402
+ # Ubuntu/Debian: Install system dependencies
403
+ sudo apt-get update
404
+ sudo apt-get install -y build-essential python3-dev libcairo2 \
405
+ libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 \
406
+ libffi-dev shared-mime-info
407
+
408
+ # Then reinstall WeasyPrint
409
+ pip install --force-reinstall weasyprint
410
+ ```
411
+
412
+ #### Pandoc Not Found
413
+ ```bash
414
+ # Check if Pandoc is installed
415
+ pandoc --version
416
+
417
+ # Install Pandoc (Ubuntu/Debian)
418
+ sudo apt-get install pandoc wkhtmltopdf
419
+
420
+ # Or download from: https://pandoc.org/installing.html
421
+ ```
422
+
423
+ #### CSS Issues
424
+ - Check CSS syntax in custom_css
425
+ - Verify CSS file paths exist
426
+ - Test CSS with simple HTML first
427
+ - Use browser developer tools to debug styling
428
+
429
+ #### Image Problems
430
+ - Ensure images are accessible (correct paths)
431
+ - Check image file formats (PNG, JPEG, GIF supported)
432
+ - Verify image file permissions
433
+ - Consider image size and format optimization
434
+
435
+ #### Font Issues
436
+ ```python
437
+ # Use web-safe fonts
438
+ config = MarkdownConfig(
439
+ custom_css="""
440
+ body {
441
+ font-family: 'Arial', 'Helvetica', sans-serif;
442
+ }
443
+ """
444
+ )
445
+ ```
446
+
447
+ ### Debug Mode
448
+
449
+ Enable detailed logging for troubleshooting:
450
+
451
+ ```python
452
+ import logging
453
+
454
+ # Enable debug logging
455
+ logging.basicConfig(
456
+ level=logging.DEBUG,
457
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
458
+ )
459
+
460
+ # Create converter with debug logging
461
+ converter = EnhancedMarkdownConverter()
462
+ result = converter.convert_file_to_pdf("test.md", "test.pdf")
463
+ ```
464
+
465
+ ### Error Handling
466
+
467
+ ```python
468
+ def robust_conversion(input_path, output_path):
469
+ """Convert with fallback backends"""
470
+ converter = EnhancedMarkdownConverter()
471
+
472
+ # Try backends in order of preference
473
+ backends = ["weasyprint", "pandoc", "auto"]
474
+
475
+ for backend in backends:
476
+ try:
477
+ success = converter.convert_file_to_pdf(
478
+ input_path=input_path,
479
+ output_path=output_path,
480
+ method=backend
481
+ )
482
+ if success:
483
+ print(f"✅ Conversion successful with {backend}")
484
+ return True
485
+ except Exception as e:
486
+ print(f"❌ {backend} failed: {str(e)}")
487
+ continue
488
+
489
+ print("❌ All backends failed")
490
+ return False
491
+ ```
492
+
493
+ ## API Reference
494
+
495
+ ### EnhancedMarkdownConverter
496
+
497
+ ```python
498
+ class EnhancedMarkdownConverter:
499
+ def __init__(self, config: Optional[MarkdownConfig] = None):
500
+ """Initialize converter with optional configuration"""
501
+
502
+ def convert_file_to_pdf(self, input_path: str, output_path: str, method: str = "auto") -> bool:
503
+ """Convert markdown file to PDF"""
504
+
505
+ def convert_markdown_to_pdf(self, markdown_content: str, output_path: str, method: str = "auto") -> bool:
506
+ """Convert markdown content to PDF"""
507
+
508
+ def get_backend_info(self) -> Dict[str, Any]:
509
+ """Get information about available backends"""
510
+
511
+ def convert_with_weasyprint(self, markdown_content: str, output_path: str) -> bool:
512
+ """Convert using WeasyPrint backend"""
513
+
514
+ def convert_with_pandoc(self, markdown_content: str, output_path: str) -> bool:
515
+ """Convert using Pandoc backend"""
516
+ ```
517
+
518
+ ## Best Practices
519
+
520
+ 1. **Choose the right backend** for your use case:
521
+ - **WeasyPrint** for web-style documents and custom CSS
522
+ - **Pandoc** for academic papers and complex formatting
523
+ - **Auto** for general use and development
524
+
525
+ 2. **Optimize images** before embedding:
526
+ - Use appropriate formats (JPEG for photos, PNG for graphics)
527
+ - Compress images to reduce file size
528
+ - Set reasonable maximum widths
529
+
530
+ 3. **Design responsive layouts**:
531
+ - Use relative units (%, em) instead of absolute (px)
532
+ - Test with different page sizes
533
+ - Consider print-specific CSS
534
+
535
+ 4. **Test your styling**:
536
+ - Start with default styling and incrementally customize
537
+ - Test with sample content before production use
538
+ - Validate CSS syntax
539
+
540
+ 5. **Handle errors gracefully**:
541
+ - Implement fallback backends
542
+ - Provide meaningful error messages
543
+ - Log conversion attempts for debugging
544
+
545
+ 6. **Performance optimization**:
546
+ - Cache converted content when possible
547
+ - Process large batches with appropriate worker counts
548
+ - Monitor memory usage with large documents
549
+
550
+ ## Conclusion
551
+
552
+ The enhanced markdown conversion feature provides professional-quality PDF generation with flexible styling options and multiple backend support. It seamlessly integrates with RAG-Anything's document processing pipeline while offering standalone functionality for markdown-to-PDF conversion needs.
rag_anything_smaranika/env.example ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### This is sample file of .env
2
+
3
+
4
+ ### Server Configuration
5
+ HOST=0.0.0.0
6
+ PORT=9621
7
+ WEBUI_TITLE='My Graph KB'
8
+ WEBUI_DESCRIPTION="Simple and Fast Graph Based RAG System"
9
+ OLLAMA_EMULATING_MODEL_TAG=latest
10
+ # WORKERS=2
11
+ # CORS_ORIGINS=http://localhost:3000,http://localhost:8080
12
+
13
+ ### Login Configuration
14
+ # AUTH_ACCOUNTS='admin:admin123,user1:pass456'
15
+ # TOKEN_SECRET=Your-Key-For-LightRAG-API-Server
16
+ # TOKEN_EXPIRE_HOURS=48
17
+ # GUEST_TOKEN_EXPIRE_HOURS=24
18
+ # JWT_ALGORITHM=HS256
19
+
20
+ ### API-Key to access LightRAG Server API
21
+ # LIGHTRAG_API_KEY=your-secure-api-key-here
22
+ # WHITELIST_PATHS=/health,/api/*
23
+
24
+ ### Optional SSL Configuration
25
+ # SSL=true
26
+ # SSL_CERTFILE=/path/to/cert.pem
27
+ # SSL_KEYFILE=/path/to/key.pem
28
+
29
+ ### Directory Configuration (defaults to current working directory)
30
+ ### Should not be set if deploy by docker (Set by Dockerfile instead of .env)
31
+ ### Default value is ./inputs and ./rag_storage
32
+ # INPUT_DIR=<absolute_path_for_doc_input_dir>
33
+
34
+ ### RAGAnything Configuration (Multimodal Document Processing)
35
+ ### ---
36
+ ### Parser Configuration
37
+ # PARSE_METHOD=auto
38
+ # OUTPUT_DIR=./output
39
+ # PARSER=mineru
40
+ # DISPLAY_CONTENT_STATS=true
41
+
42
+ ### Multimodal Processing Configuration
43
+ # ENABLE_IMAGE_PROCESSING=true
44
+ # ENABLE_TABLE_PROCESSING=true
45
+ # ENABLE_EQUATION_PROCESSING=true
46
+
47
+ ### Batch Processing Configuration
48
+ # MAX_CONCURRENT_FILES=1
49
+ # SUPPORTED_FILE_EXTENSIONS=.pdf,.jpg,.jpeg,.png,.bmp,.tiff,.tif,.gif,.webp,.doc,.docx,.ppt,.pptx,.xls,.xlsx,.txt,.md
50
+ # RECURSIVE_FOLDER_PROCESSING=true
51
+
52
+ ### Context Extraction Configuration
53
+ # CONTEXT_WINDOW=1
54
+ # CONTEXT_MODE=page
55
+ # MAX_CONTEXT_TOKENS=2000
56
+ # INCLUDE_HEADERS=true
57
+ # INCLUDE_CAPTIONS=true
58
+ # CONTEXT_FILTER_CONTENT_TYPES=text
59
+ # CONTENT_FORMAT=minerU
60
+
61
+ ### Max nodes return from grap retrieval
62
+ # MAX_GRAPH_NODES=1000
63
+
64
+ ### Logging level
65
+ # LOG_LEVEL=INFO
66
+ # VERBOSE=False
67
+ # LOG_MAX_BYTES=10485760
68
+ # LOG_BACKUP_COUNT=5
69
+ ### Logfile location (defaults to current working directory)
70
+ # LOG_DIR=/path/to/log/directory
71
+
72
+ ### Settings for RAG query
73
+ # HISTORY_TURNS=3
74
+ # COSINE_THRESHOLD=0.2
75
+ # TOP_K=60
76
+ # MAX_TOKEN_TEXT_CHUNK=4000
77
+ # MAX_TOKEN_RELATION_DESC=4000
78
+ # MAX_TOKEN_ENTITY_DESC=4000
79
+
80
+ ### Entity and ralation summarization configuration
81
+ ### Language: English, Chinese, French, German ...
82
+ SUMMARY_LANGUAGE=English
83
+ ### Number of duplicated entities/edges to trigger LLM re-summary on merge ( at least 3 is recommented)
84
+ # FORCE_LLM_SUMMARY_ON_MERGE=6
85
+ ### Max tokens for entity/relations description after merge
86
+ # MAX_TOKEN_SUMMARY=500
87
+
88
+ ### Number of parallel processing documents(Less than MAX_ASYNC/2 is recommended)
89
+ # MAX_PARALLEL_INSERT=2
90
+ ### Chunk size for document splitting, 500~1500 is recommended
91
+ # CHUNK_SIZE=1200
92
+ # CHUNK_OVERLAP_SIZE=100
93
+
94
+ ### LLM Configuration
95
+ ENABLE_LLM_CACHE=true
96
+ ENABLE_LLM_CACHE_FOR_EXTRACT=true
97
+ ### Time out in seconds for LLM, None for infinite timeout
98
+ TIMEOUT=240
99
+ ### Some models like o1-mini require temperature to be set to 1
100
+ TEMPERATURE=0
101
+ ### Max concurrency requests of LLM
102
+ MAX_ASYNC=4
103
+ ### MAX_TOKENS: max tokens send to LLM for entity relation summaries (less than context size of the model)
104
+ ### MAX_TOKENS: set as num_ctx option for Ollama by API Server
105
+ MAX_TOKENS=32768
106
+ ### LLM Binding type: openai, ollama, lollms, azure_openai, lmstudio
107
+ LLM_BINDING=openai
108
+ LLM_MODEL=gpt-4o
109
+ LLM_BINDING_HOST=https://api.openai.com/v1
110
+ LLM_BINDING_API_KEY=your_api_key
111
+ ### Optional for Azure
112
+ # AZURE_OPENAI_API_VERSION=2024-08-01-preview
113
+ # AZURE_OPENAI_DEPLOYMENT=gpt-4o
114
+
115
+ ### Embedding Configuration
116
+ ### Embedding Binding type: openai, ollama, lollms, azure_openai, lmstudio
117
+ EMBEDDING_BINDING=ollama
118
+ EMBEDDING_MODEL=bge-m3:latest
119
+ EMBEDDING_DIM=1024
120
+ EMBEDDING_BINDING_API_KEY=your_api_key
121
+ # If the embedding service is deployed within the same Docker stack, use host.docker.internal instead of localhost
122
+ EMBEDDING_BINDING_HOST=http://localhost:11434
123
+ ### Num of chunks send to Embedding in single request
124
+ # EMBEDDING_BATCH_NUM=32
125
+ ### Max concurrency requests for Embedding
126
+ # EMBEDDING_FUNC_MAX_ASYNC=16
127
+ ### Maximum tokens sent to Embedding for each chunk (no longer in use?)
128
+ # MAX_EMBED_TOKENS=8192
129
+ ### Optional for Azure
130
+ # AZURE_EMBEDDING_DEPLOYMENT=text-embedding-3-large
131
+ # AZURE_EMBEDDING_API_VERSION=2023-05-15
132
+
133
+ ### Data storage selection
134
+ # LIGHTRAG_KV_STORAGE=PGKVStorage
135
+ # LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
136
+ # LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
137
+ # LIGHTRAG_GRAPH_STORAGE=Neo4JStorage
138
+
139
+ ### TiDB Configuration (Deprecated)
140
+ # TIDB_HOST=localhost
141
+ # TIDB_PORT=4000
142
+ # TIDB_USER=your_username
143
+ # TIDB_PASSWORD='your_password'
144
+ # TIDB_DATABASE=your_database
145
+ ### separating all data from difference Lightrag instances(deprecating)
146
+ # TIDB_WORKSPACE=default
147
+
148
+ ### PostgreSQL Configuration
149
+ POSTGRES_HOST=localhost
150
+ POSTGRES_PORT=5432
151
+ POSTGRES_USER=your_username
152
+ POSTGRES_PASSWORD='your_password'
153
+ POSTGRES_DATABASE=your_database
154
+ POSTGRES_MAX_CONNECTIONS=12
155
+ ### separating all data from difference Lightrag instances(deprecating)
156
+ # POSTGRES_WORKSPACE=default
157
+
158
+ ### Neo4j Configuration
159
+ NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io
160
+ NEO4J_USERNAME=neo4j
161
+ NEO4J_PASSWORD='your_password'
162
+
163
+ ### Independent AGM Configuration(not for AMG embedded in PostreSQL)
164
+ # AGE_POSTGRES_DB=
165
+ # AGE_POSTGRES_USER=
166
+ # AGE_POSTGRES_PASSWORD=
167
+ # AGE_POSTGRES_HOST=
168
+ # AGE_POSTGRES_PORT=8529
169
+
170
+ # AGE Graph Name(apply to PostgreSQL and independent AGM)
171
+ ### AGE_GRAPH_NAME is precated
172
+ # AGE_GRAPH_NAME=lightrag
173
+
174
+ ### MongoDB Configuration
175
+ MONGO_URI=mongodb://root:root@localhost:27017/
176
+ MONGO_DATABASE=LightRAG
177
+ ### separating all data from difference Lightrag instances(deprecating)
178
+ # MONGODB_GRAPH=false
179
+
180
+ ### Milvus Configuration
181
+ MILVUS_URI=http://localhost:19530
182
+ MILVUS_DB_NAME=lightrag
183
+ # MILVUS_USER=root
184
+ # MILVUS_PASSWORD=your_password
185
+ # MILVUS_TOKEN=your_token
186
+
187
+ ### Qdrant
188
+ QDRANT_URL=http://localhost:16333
189
+ # QDRANT_API_KEY=your-api-key
190
+
191
+ ### Redis
192
+ REDIS_URI=redis://localhost:6379
rag_anything_smaranika/examples/batch_processing_example.py ADDED
@@ -0,0 +1,561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Batch Processing Example for RAG-Anything
4
+
5
+ This example demonstrates how to use the batch processing capabilities
6
+ to process multiple documents in parallel for improved throughput.
7
+
8
+ Features demonstrated:
9
+ - Basic batch processing with BatchParser
10
+ - Asynchronous batch processing
11
+ - Integration with RAG-Anything
12
+ - Error handling and progress tracking
13
+ - File filtering and directory processing
14
+ """
15
+
16
+ import asyncio
17
+ import logging
18
+ from pathlib import Path
19
+ import tempfile
20
+ import time
21
+
22
+ # Add project root directory to Python path
23
+ import sys
24
+
25
+ sys.path.append(str(Path(__file__).parent.parent))
26
+
27
+ from raganything import RAGAnything, RAGAnythingConfig
28
+ from raganything.batch_parser import BatchParser
29
+
30
+
31
+ def create_sample_documents():
32
+ """Create sample documents for batch processing testing"""
33
+ temp_dir = Path(tempfile.mkdtemp())
34
+ sample_files = []
35
+
36
+ # Create various document types
37
+ documents = {
38
+ "document1.txt": "This is a simple text document for testing batch processing.",
39
+ "document2.txt": "Another text document with different content.",
40
+ "document3.md": """# Markdown Document
41
+
42
+ ## Introduction
43
+ This is a markdown document for testing.
44
+
45
+ ### Features
46
+ - Markdown formatting
47
+ - Code blocks
48
+ - Lists
49
+
50
+ ```python
51
+ def example():
52
+ return "Hello from markdown"
53
+ ```
54
+ """,
55
+ "report.txt": """Business Report
56
+
57
+ Executive Summary:
58
+ This report demonstrates batch processing capabilities.
59
+
60
+ Key Findings:
61
+ 1. Parallel processing improves throughput
62
+ 2. Progress tracking enhances user experience
63
+ 3. Error handling ensures reliability
64
+
65
+ Conclusion:
66
+ Batch processing is essential for large-scale document processing.
67
+ """,
68
+ "notes.md": """# Meeting Notes
69
+
70
+ ## Date: 2024-01-15
71
+
72
+ ### Attendees
73
+ - Alice Johnson
74
+ - Bob Smith
75
+ - Carol Williams
76
+
77
+ ### Discussion Topics
78
+ 1. **Batch Processing Implementation**
79
+ - Parallel document processing
80
+ - Progress tracking
81
+ - Error handling strategies
82
+
83
+ 2. **Performance Metrics**
84
+ - Target: 100 documents/hour
85
+ - Memory usage: < 4GB
86
+ - Success rate: > 95%
87
+
88
+ ### Action Items
89
+ - [ ] Implement batch processing
90
+ - [ ] Add progress bars
91
+ - [ ] Test with large document sets
92
+ - [ ] Optimize memory usage
93
+
94
+ ### Next Steps
95
+ Continue development and testing of batch processing features.
96
+ """,
97
+ }
98
+
99
+ # Create files
100
+ for filename, content in documents.items():
101
+ file_path = temp_dir / filename
102
+ with open(file_path, "w", encoding="utf-8") as f:
103
+ f.write(content)
104
+ sample_files.append(str(file_path))
105
+
106
+ return sample_files, temp_dir
107
+
108
+
109
+ def demonstrate_basic_batch_processing():
110
+ """Demonstrate basic batch processing functionality"""
111
+ print("\n" + "=" * 60)
112
+ print("BASIC BATCH PROCESSING DEMONSTRATION")
113
+ print("=" * 60)
114
+
115
+ # Create sample documents
116
+ sample_files, temp_dir = create_sample_documents()
117
+
118
+ try:
119
+ print(f"Created {len(sample_files)} sample documents in: {temp_dir}")
120
+ for file_path in sample_files:
121
+ print(f" - {Path(file_path).name}")
122
+
123
+ # Create batch parser
124
+ batch_parser = BatchParser(
125
+ parser_type="mineru",
126
+ max_workers=3,
127
+ show_progress=True,
128
+ timeout_per_file=60,
129
+ skip_installation_check=True, # Skip installation check for demo
130
+ )
131
+
132
+ print("\nBatch parser configured:")
133
+ print(" - Parser type: mineru")
134
+ print(" - Max workers: 3")
135
+ print(" - Progress tracking: enabled")
136
+ print(" - Timeout per file: 60 seconds")
137
+
138
+ # Check supported extensions
139
+ supported_extensions = batch_parser.get_supported_extensions()
140
+ print(f" - Supported extensions: {supported_extensions}")
141
+
142
+ # Filter files to supported types
143
+ supported_files = batch_parser.filter_supported_files(sample_files)
144
+ print("\nFile filtering results:")
145
+ print(f" - Total files: {len(sample_files)}")
146
+ print(f" - Supported files: {len(supported_files)}")
147
+
148
+ # Process batch
149
+ output_dir = temp_dir / "batch_output"
150
+ print("\nStarting batch processing...")
151
+ print(f"Output directory: {output_dir}")
152
+
153
+ start_time = time.time()
154
+ result = batch_parser.process_batch(
155
+ file_paths=supported_files,
156
+ output_dir=str(output_dir),
157
+ parse_method="auto",
158
+ recursive=False,
159
+ )
160
+ processing_time = time.time() - start_time
161
+
162
+ # Display results
163
+ print("\n" + "-" * 40)
164
+ print("BATCH PROCESSING RESULTS")
165
+ print("-" * 40)
166
+ print(result.summary())
167
+ print(f"Total processing time: {processing_time:.2f} seconds")
168
+ print(f"Success rate: {result.success_rate:.1f}%")
169
+
170
+ if result.successful_files:
171
+ print("\nSuccessfully processed files:")
172
+ for file_path in result.successful_files:
173
+ print(f" ✅ {Path(file_path).name}")
174
+
175
+ if result.failed_files:
176
+ print("\nFailed files:")
177
+ for file_path in result.failed_files:
178
+ error = result.errors.get(file_path, "Unknown error")
179
+ print(f" ❌ {Path(file_path).name}: {error}")
180
+
181
+ return result
182
+
183
+ except Exception as e:
184
+ print(f"❌ Batch processing demonstration failed: {str(e)}")
185
+ return None
186
+
187
+
188
+ async def demonstrate_async_batch_processing():
189
+ """Demonstrate asynchronous batch processing"""
190
+ print("\n" + "=" * 60)
191
+ print("ASYNCHRONOUS BATCH PROCESSING DEMONSTRATION")
192
+ print("=" * 60)
193
+
194
+ # Create sample documents
195
+ sample_files, temp_dir = create_sample_documents()
196
+
197
+ try:
198
+ print(f"Processing {len(sample_files)} documents asynchronously...")
199
+
200
+ # Create batch parser
201
+ batch_parser = BatchParser(
202
+ parser_type="mineru",
203
+ max_workers=2,
204
+ show_progress=True,
205
+ skip_installation_check=True,
206
+ )
207
+
208
+ # Process batch asynchronously
209
+ output_dir = temp_dir / "async_output"
210
+
211
+ start_time = time.time()
212
+ result = await batch_parser.process_batch_async(
213
+ file_paths=sample_files,
214
+ output_dir=str(output_dir),
215
+ parse_method="auto",
216
+ recursive=False,
217
+ )
218
+ processing_time = time.time() - start_time
219
+
220
+ # Display results
221
+ print("\n" + "-" * 40)
222
+ print("ASYNC BATCH PROCESSING RESULTS")
223
+ print("-" * 40)
224
+ print(result.summary())
225
+ print(f"Async processing time: {processing_time:.2f} seconds")
226
+ print(f"Success rate: {result.success_rate:.1f}%")
227
+
228
+ return result
229
+
230
+ except Exception as e:
231
+ print(f"❌ Async batch processing demonstration failed: {str(e)}")
232
+ return None
233
+
234
+
235
+ async def demonstrate_rag_integration():
236
+ """Demonstrate batch processing integration with RAG-Anything"""
237
+ print("\n" + "=" * 60)
238
+ print("RAG-ANYTHING BATCH INTEGRATION DEMONSTRATION")
239
+ print("=" * 60)
240
+
241
+ # Create sample documents
242
+ sample_files, temp_dir = create_sample_documents()
243
+
244
+ try:
245
+ # Initialize RAG-Anything with temporary storage
246
+ config = RAGAnythingConfig(
247
+ working_dir=str(temp_dir / "rag_storage"),
248
+ enable_image_processing=True,
249
+ enable_table_processing=True,
250
+ enable_equation_processing=True,
251
+ max_concurrent_files=2,
252
+ )
253
+
254
+ rag = RAGAnything(config=config)
255
+
256
+ print("RAG-Anything initialized with batch processing capabilities")
257
+
258
+ # Show available batch methods
259
+ batch_methods = [method for method in dir(rag) if "batch" in method.lower()]
260
+ print(f"Available batch methods: {batch_methods}")
261
+
262
+ # Demonstrate batch processing with RAG integration
263
+ print(f"\nProcessing {len(sample_files)} documents with RAG integration...")
264
+
265
+ # Use the RAG-integrated batch processing
266
+ try:
267
+ # Process documents in batch
268
+ result = rag.process_documents_batch(
269
+ file_paths=sample_files,
270
+ output_dir=str(temp_dir / "rag_batch_output"),
271
+ max_workers=2,
272
+ show_progress=True,
273
+ )
274
+
275
+ print("\n" + "-" * 40)
276
+ print("RAG BATCH PROCESSING RESULTS")
277
+ print("-" * 40)
278
+ print(result.summary())
279
+ print(f"Success rate: {result.success_rate:.1f}%")
280
+
281
+ # Demonstrate batch processing with full RAG integration
282
+ print("\nProcessing documents with full RAG integration...")
283
+
284
+ rag_result = await rag.process_documents_with_rag_batch(
285
+ file_paths=sample_files[:2], # Process subset for demo
286
+ output_dir=str(temp_dir / "rag_full_output"),
287
+ max_workers=1,
288
+ show_progress=True,
289
+ )
290
+
291
+ print("\n" + "-" * 40)
292
+ print("FULL RAG INTEGRATION RESULTS")
293
+ print("-" * 40)
294
+ print(f"Parse result: {rag_result['parse_result'].summary()}")
295
+ print(
296
+ f"RAG processing time: {rag_result['total_processing_time']:.2f} seconds"
297
+ )
298
+ print(
299
+ f"Successfully processed with RAG: {rag_result['successful_rag_files']}"
300
+ )
301
+ print(f"Failed RAG processing: {rag_result['failed_rag_files']}")
302
+
303
+ return rag_result
304
+
305
+ except Exception as e:
306
+ print(f"⚠️ RAG integration demo completed with limitations: {str(e)}")
307
+ print(
308
+ "Note: This is expected in environments without full API configuration"
309
+ )
310
+ return None
311
+
312
+ except Exception as e:
313
+ print(f"❌ RAG integration demonstration failed: {str(e)}")
314
+ return None
315
+
316
+
317
+ def demonstrate_directory_processing():
318
+ """Demonstrate processing entire directories"""
319
+ print("\n" + "=" * 60)
320
+ print("DIRECTORY PROCESSING DEMONSTRATION")
321
+ print("=" * 60)
322
+
323
+ # Create a directory structure with nested files
324
+ temp_dir = Path(tempfile.mkdtemp())
325
+
326
+ # Create main directory files
327
+ main_files = {
328
+ "overview.txt": "Main directory overview document",
329
+ "readme.md": "# Project README\n\nThis is the main project documentation.",
330
+ }
331
+
332
+ # Create subdirectory
333
+ sub_dir = temp_dir / "subdirectory"
334
+ sub_dir.mkdir()
335
+
336
+ sub_files = {
337
+ "details.txt": "Detailed information in subdirectory",
338
+ "notes.md": "# Notes\n\nAdditional notes and information.",
339
+ }
340
+
341
+ # Write all files
342
+ all_files = []
343
+ for filename, content in main_files.items():
344
+ file_path = temp_dir / filename
345
+ with open(file_path, "w", encoding="utf-8") as f:
346
+ f.write(content)
347
+ all_files.append(str(file_path))
348
+
349
+ for filename, content in sub_files.items():
350
+ file_path = sub_dir / filename
351
+ with open(file_path, "w", encoding="utf-8") as f:
352
+ f.write(content)
353
+ all_files.append(str(file_path))
354
+
355
+ try:
356
+ print("Created directory structure:")
357
+ print(f" Main directory: {temp_dir}")
358
+ print(f" Files in main: {list(main_files.keys())}")
359
+ print(f" Subdirectory: {sub_dir}")
360
+ print(f" Files in sub: {list(sub_files.keys())}")
361
+
362
+ # Create batch parser
363
+ batch_parser = BatchParser(
364
+ parser_type="mineru",
365
+ max_workers=2,
366
+ show_progress=True,
367
+ skip_installation_check=True,
368
+ )
369
+
370
+ # Process entire directory recursively
371
+ print("\nProcessing entire directory recursively...")
372
+
373
+ result = batch_parser.process_batch(
374
+ file_paths=[str(temp_dir)], # Pass directory path
375
+ output_dir=str(temp_dir / "directory_output"),
376
+ parse_method="auto",
377
+ recursive=True, # Include subdirectories
378
+ )
379
+
380
+ print("\n" + "-" * 40)
381
+ print("DIRECTORY PROCESSING RESULTS")
382
+ print("-" * 40)
383
+ print(result.summary())
384
+ print(f"Total files found and processed: {result.total_files}")
385
+ print(f"Success rate: {result.success_rate:.1f}%")
386
+
387
+ if result.successful_files:
388
+ print("\nSuccessfully processed:")
389
+ for file_path in result.successful_files:
390
+ relative_path = Path(file_path).relative_to(temp_dir)
391
+ print(f" ✅ {relative_path}")
392
+
393
+ return result
394
+
395
+ except Exception as e:
396
+ print(f"❌ Directory processing demonstration failed: {str(e)}")
397
+ return None
398
+
399
+
400
+ def demonstrate_error_handling():
401
+ """Demonstrate error handling and recovery"""
402
+ print("\n" + "=" * 60)
403
+ print("ERROR HANDLING DEMONSTRATION")
404
+ print("=" * 60)
405
+
406
+ temp_dir = Path(tempfile.mkdtemp())
407
+
408
+ # Create files with various issues
409
+ files_with_issues = {
410
+ "valid_file.txt": "This is a valid file that should process successfully.",
411
+ "empty_file.txt": "", # Empty file
412
+ "large_file.txt": "x" * 1000000, # Large file (1MB of 'x')
413
+ }
414
+
415
+ created_files = []
416
+ for filename, content in files_with_issues.items():
417
+ file_path = temp_dir / filename
418
+ with open(file_path, "w", encoding="utf-8") as f:
419
+ f.write(content)
420
+ created_files.append(str(file_path))
421
+
422
+ # Add a non-existent file to the list
423
+ created_files.append(str(temp_dir / "non_existent_file.txt"))
424
+
425
+ try:
426
+ print(f"Testing error handling with {len(created_files)} files:")
427
+ for file_path in created_files:
428
+ name = Path(file_path).name
429
+ exists = Path(file_path).exists()
430
+ size = Path(file_path).stat().st_size if exists else 0
431
+ print(f" - {name}: {'exists' if exists else 'missing'}, {size} bytes")
432
+
433
+ # Create batch parser with short timeout for demonstration
434
+ batch_parser = BatchParser(
435
+ parser_type="mineru",
436
+ max_workers=2,
437
+ show_progress=True,
438
+ timeout_per_file=30, # Short timeout for demo
439
+ skip_installation_check=True,
440
+ )
441
+
442
+ # Process files and handle errors
443
+ result = batch_parser.process_batch(
444
+ file_paths=created_files,
445
+ output_dir=str(temp_dir / "error_test_output"),
446
+ parse_method="auto",
447
+ )
448
+
449
+ print("\n" + "-" * 40)
450
+ print("ERROR HANDLING RESULTS")
451
+ print("-" * 40)
452
+ print(result.summary())
453
+
454
+ if result.successful_files:
455
+ print("\nSuccessful files:")
456
+ for file_path in result.successful_files:
457
+ print(f" ✅ {Path(file_path).name}")
458
+
459
+ if result.failed_files:
460
+ print("\nFailed files with error details:")
461
+ for file_path in result.failed_files:
462
+ error = result.errors.get(file_path, "Unknown error")
463
+ print(f" ❌ {Path(file_path).name}: {error}")
464
+
465
+ # Demonstrate retry logic
466
+ if result.failed_files:
467
+ print(
468
+ f"\nDemonstrating retry logic for {len(result.failed_files)} failed files..."
469
+ )
470
+
471
+ # Retry only the failed files
472
+ retry_result = batch_parser.process_batch(
473
+ file_paths=result.failed_files,
474
+ output_dir=str(temp_dir / "retry_output"),
475
+ parse_method="auto",
476
+ )
477
+
478
+ print(f"Retry results: {retry_result.summary()}")
479
+
480
+ return result
481
+
482
+ except Exception as e:
483
+ print(f"❌ Error handling demonstration failed: {str(e)}")
484
+ return None
485
+
486
+
487
+ async def main():
488
+ """Main demonstration function"""
489
+ # Configure logging
490
+ logging.basicConfig(
491
+ level=logging.INFO,
492
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
493
+ )
494
+
495
+ print("RAG-Anything Batch Processing Demonstration")
496
+ print("=" * 70)
497
+ print("This example demonstrates various batch processing capabilities:")
498
+ print(" - Basic batch processing with progress tracking")
499
+ print(" - Asynchronous processing for improved performance")
500
+ print(" - Integration with RAG-Anything pipeline")
501
+ print(" - Directory processing with recursive file discovery")
502
+ print(" - Comprehensive error handling and recovery")
503
+
504
+ results = {}
505
+
506
+ # Run demonstrations
507
+ print("\n🚀 Starting demonstrations...")
508
+
509
+ # Basic batch processing
510
+ results["basic"] = demonstrate_basic_batch_processing()
511
+
512
+ # Asynchronous processing
513
+ results["async"] = await demonstrate_async_batch_processing()
514
+
515
+ # RAG integration
516
+ results["rag"] = await demonstrate_rag_integration()
517
+
518
+ # Directory processing
519
+ results["directory"] = demonstrate_directory_processing()
520
+
521
+ # Error handling
522
+ results["error_handling"] = demonstrate_error_handling()
523
+
524
+ # Summary
525
+ print("\n" + "=" * 70)
526
+ print("DEMONSTRATION SUMMARY")
527
+ print("=" * 70)
528
+
529
+ for demo_name, result in results.items():
530
+ if result:
531
+ if hasattr(result, "success_rate"):
532
+ print(
533
+ f"✅ {demo_name.upper()}: {result.success_rate:.1f}% success rate"
534
+ )
535
+ else:
536
+ print(f"✅ {demo_name.upper()}: Completed successfully")
537
+ else:
538
+ print(f"❌ {demo_name.upper()}: Failed or had limitations")
539
+
540
+ print("\n📊 Key Features Demonstrated:")
541
+ print(" - Parallel document processing with configurable worker counts")
542
+ print(" - Real-time progress tracking with tqdm progress bars")
543
+ print(" - Comprehensive error handling and reporting")
544
+ print(" - File filtering based on supported document types")
545
+ print(" - Directory processing with recursive file discovery")
546
+ print(" - Asynchronous processing for improved performance")
547
+ print(" - Integration with RAG-Anything document pipeline")
548
+ print(" - Retry logic for failed documents")
549
+ print(" - Detailed processing statistics and timing")
550
+
551
+ print("\n💡 Best Practices Highlighted:")
552
+ print(" - Use appropriate worker counts for your system")
553
+ print(" - Enable progress tracking for long-running operations")
554
+ print(" - Handle errors gracefully with retry mechanisms")
555
+ print(" - Filter files to supported types before processing")
556
+ print(" - Set reasonable timeouts for document processing")
557
+ print(" - Use skip_installation_check for environments with conflicts")
558
+
559
+
560
+ if __name__ == "__main__":
561
+ asyncio.run(main())
rag_anything_smaranika/examples/batch_processing_optimized_example.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example: Optimized Batch Processing for RAGAnything
3
+
4
+ This example demonstrates the new optimized batch processing capabilities
5
+ that provide 2-3x faster processing for large document collections.
6
+
7
+ Features demonstrated:
8
+ - Concurrent document parsing with prefetching
9
+ - Pipeline architecture (parse + process simultaneously)
10
+ - Progress tracking with ETA estimation
11
+ - Adaptive rate limiting
12
+ - Performance statistics
13
+ """
14
+
15
+ import asyncio
16
+ import time
17
+ from pathlib import Path
18
+ from raganything import RAGAnything
19
+
20
+ async def progress_callback(progress_data):
21
+ """
22
+ Callback function to handle progress updates
23
+
24
+ Args:
25
+ progress_data: Dict containing:
26
+ - processed: Number of processed documents
27
+ - total: Total number of documents
28
+ - failed: Number of failed documents
29
+ - percentage: Completion percentage
30
+ - eta_seconds: Estimated time remaining
31
+ - rate_docs_per_sec: Processing rate
32
+ """
33
+ print(f"\rProgress: {progress_data['processed']}/{progress_data['total']} "
34
+ f"({progress_data['percentage']:.1f}%) | "
35
+ f"Rate: {progress_data['rate_docs_per_sec']:.2f} docs/s | "
36
+ f"ETA: {progress_data['eta_seconds']:.1f}s", end='', flush=True)
37
+
38
+
39
+ async def main():
40
+ # Initialize RAGAnything
41
+ rag = RAGAnything(
42
+ working_dir="./rag_storage",
43
+ rag_dir="./rag_index",
44
+ parser="mineru", # or "docling"
45
+ )
46
+
47
+ # Example 1: Process a list of documents with optimization
48
+ print("=" * 60)
49
+ print("Example 1: Optimized Batch Processing")
50
+ print("=" * 60)
51
+
52
+ documents = [
53
+ "./data/report1.pdf",
54
+ "./data/report2.pdf",
55
+ "./data/research_paper.pdf",
56
+ "./data/technical_spec.docx",
57
+ ]
58
+
59
+ start_time = time.time()
60
+
61
+ result = await rag.process_documents_batch_optimized(
62
+ file_paths=documents,
63
+ max_concurrent_parsers=4, # Parse up to 4 documents at once
64
+ max_concurrent_processors=10, # Process up to 10 chunks concurrently
65
+ enable_progress_tracking=True,
66
+ progress_callback=progress_callback,
67
+ )
68
+
69
+ print() # New line after progress bar
70
+
71
+ elapsed_time = time.time() - start_time
72
+
73
+ # Display results
74
+ print(f"\n📊 Processing Results:")
75
+ print(f" ✅ Successful: {len(result['successful_files'])} documents")
76
+ print(f" ❌ Failed: {len(result['failed_files'])} documents")
77
+ print(f" ⏱️ Total time: {elapsed_time:.2f}s")
78
+
79
+ # Display detailed statistics
80
+ stats = result['statistics']
81
+ print(f"\n📈 Performance Statistics:")
82
+ print(f" Processing rate: {stats['processing_rate_docs_per_sec']:.2f} docs/sec")
83
+ print(f" Parsing time: {stats['parsing_time']:.2f}s")
84
+ print(f" Text processing: {stats['text_processing_time']:.2f}s")
85
+ print(f" Multimodal processing: {stats['multimodal_processing_time']:.2f}s")
86
+ print(f" Cache hit rate: {stats['cache_hit_rate']:.1f}%")
87
+
88
+ # Show per-document results
89
+ if result['successful_files']:
90
+ print(f"\n✅ Successfully processed files:")
91
+ for file_info in result['successful_files'][:5]: # Show first 5
92
+ print(f" - {Path(file_info['file_path']).name} "
93
+ f"(processing: {file_info['processing_time']:.2f}s, "
94
+ f"parsing: {file_info['parse_time']:.2f}s)")
95
+
96
+ if result['failed_files']:
97
+ print(f"\n❌ Failed files:")
98
+ for file_info in result['failed_files']:
99
+ print(f" - {Path(file_info['file_path']).name}: {file_info['error']}")
100
+
101
+ # Example 2: Process an entire folder with optimization
102
+ print("\n" + "=" * 60)
103
+ print("Example 2: Optimized Folder Processing")
104
+ print("=" * 60)
105
+
106
+ folder_result = await rag.process_folder_optimized(
107
+ folder_path="./data/documents",
108
+ file_extensions=['.pdf', '.docx', '.pptx'],
109
+ recursive=True,
110
+ max_concurrent_parsers=6,
111
+ max_concurrent_processors=12,
112
+ progress_callback=progress_callback,
113
+ )
114
+
115
+ print() # New line after progress bar
116
+
117
+ print(f"\n📁 Folder Processing Complete:")
118
+ print(f" Successful: {len(folder_result['successful_files'])} files")
119
+ print(f" Failed: {len(folder_result['failed_files'])} files")
120
+ print(f" Rate: {folder_result['statistics']['processing_rate_docs_per_sec']:.2f} docs/sec")
121
+
122
+ # Example 3: Compare standard vs optimized processing
123
+ print("\n" + "=" * 60)
124
+ print("Example 3: Performance Comparison")
125
+ print("=" * 60)
126
+
127
+ test_docs = ["./data/test1.pdf", "./data/test2.pdf", "./data/test3.pdf"]
128
+
129
+ # Standard processing
130
+ print("\n🐢 Standard batch processing...")
131
+ standard_start = time.time()
132
+ await rag.process_folder_complete(
133
+ folder_path="./data/test",
134
+ max_workers=4,
135
+ display_stats=False
136
+ )
137
+ standard_time = time.time() - standard_start
138
+
139
+ # Optimized processing (on different set to avoid cache)
140
+ print("🚀 Optimized batch processing...")
141
+ optimized_start = time.time()
142
+ await rag.process_documents_batch_optimized(
143
+ file_paths=test_docs,
144
+ max_concurrent_parsers=4,
145
+ max_concurrent_processors=10,
146
+ enable_progress_tracking=False,
147
+ )
148
+ optimized_time = time.time() - optimized_start
149
+
150
+ print(f"\n⚡ Performance Improvement:")
151
+ print(f" Standard: {standard_time:.2f}s")
152
+ print(f" Optimized: {optimized_time:.2f}s")
153
+ if standard_time > 0:
154
+ speedup = (standard_time / optimized_time)
155
+ print(f" Speedup: {speedup:.2f}x faster")
156
+
157
+ # Example 4: Custom progress tracking
158
+ print("\n" + "=" * 60)
159
+ print("Example 4: Custom Progress Tracking")
160
+ print("=" * 60)
161
+
162
+ class CustomProgressTracker:
163
+ def __init__(self):
164
+ self.start_time = time.time()
165
+ self.logs = []
166
+
167
+ def __call__(self, progress):
168
+ """Progress callback"""
169
+ elapsed = time.time() - self.start_time
170
+ log_entry = {
171
+ "timestamp": elapsed,
172
+ "processed": progress['processed'],
173
+ "total": progress['total'],
174
+ "percentage": progress['percentage'],
175
+ "rate": progress['rate_docs_per_sec'],
176
+ }
177
+ self.logs.append(log_entry)
178
+
179
+ # Print formatted progress
180
+ bar_length = 40
181
+ filled_length = int(bar_length * progress['percentage'] / 100)
182
+ bar = '█' * filled_length + '-' * (bar_length - filled_length)
183
+
184
+ print(f"\r|{bar}| {progress['percentage']:.1f}% "
185
+ f"[{progress['processed']}/{progress['total']}] "
186
+ f"ETA: {progress['eta_seconds']:.0f}s", end='', flush=True)
187
+
188
+ def save_log(self, filename="processing_log.txt"):
189
+ """Save progress log to file"""
190
+ with open(filename, 'w') as f:
191
+ f.write("Batch Processing Log\n")
192
+ f.write("=" * 50 + "\n")
193
+ for entry in self.logs:
194
+ f.write(f"Time: {entry['timestamp']:.2f}s | "
195
+ f"Progress: {entry['processed']}/{entry['total']} "
196
+ f"({entry['percentage']:.1f}%) | "
197
+ f"Rate: {entry['rate']:.2f} docs/s\n")
198
+
199
+ tracker = CustomProgressTracker()
200
+
201
+ await rag.process_documents_batch_optimized(
202
+ file_paths=documents,
203
+ progress_callback=tracker,
204
+ )
205
+
206
+ print() # New line
207
+ tracker.save_log("./batch_processing_log.txt")
208
+ print("📝 Progress log saved to batch_processing_log.txt")
209
+
210
+ print("\n" + "=" * 60)
211
+ print("All examples completed!")
212
+ print("=" * 60)
213
+
214
+
215
+ if __name__ == "__main__":
216
+ asyncio.run(main())
rag_anything_smaranika/examples/enhanced_markdown_example.py ADDED
@@ -0,0 +1,1055 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Enhanced Markdown Conversion Example for RAG-Anything
4
+
5
+ This example demonstrates the enhanced markdown to PDF conversion capabilities
6
+ with multiple backends, advanced styling, and professional formatting.
7
+
8
+ Features demonstrated:
9
+ - Basic markdown to PDF conversion
10
+ - Multiple conversion backends (WeasyPrint, Pandoc)
11
+ - Custom CSS styling and configuration
12
+ - Backend detection and selection
13
+ - Error handling and fallback mechanisms
14
+ - Command-line interface usage
15
+ """
16
+
17
+ import logging
18
+ from pathlib import Path
19
+ import tempfile
20
+
21
+ # Add project root directory to Python path
22
+ import sys
23
+
24
+ sys.path.append(str(Path(__file__).parent.parent))
25
+
26
+ from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig
27
+
28
+
29
+ def create_sample_markdown_content():
30
+ """Create comprehensive sample markdown content for testing"""
31
+
32
+ # Basic sample
33
+ basic_content = """# Basic Markdown Sample
34
+
35
+ ## Introduction
36
+ This is a simple markdown document demonstrating basic formatting.
37
+
38
+ ### Text Formatting
39
+ - **Bold text** and *italic text*
40
+ - `Inline code` examples
41
+ - [Links to external sites](https://github.com)
42
+
43
+ ### Lists
44
+ 1. First ordered item
45
+ 2. Second ordered item
46
+ 3. Third ordered item
47
+
48
+ - Unordered item
49
+ - Another unordered item
50
+ - Nested item
51
+ - Another nested item
52
+
53
+ ### Blockquotes
54
+ > This is a blockquote with important information.
55
+ > It can span multiple lines.
56
+
57
+ ### Code Block
58
+ ```python
59
+ def hello_world():
60
+ print("Hello, World!")
61
+ return "Success"
62
+ ```
63
+ """
64
+
65
+ # Technical documentation sample
66
+ technical_content = """# Technical Documentation
67
+
68
+ ## Table of Contents
69
+ - [Overview](#overview)
70
+ - [Architecture](#architecture)
71
+ - [Implementation](#implementation)
72
+ - [Performance](#performance)
73
+
74
+ ## Overview
75
+ This document provides comprehensive technical specifications for the enhanced markdown conversion system.
76
+
77
+ ## Architecture
78
+
79
+ ### Core Components
80
+ 1. **Markdown Parser**: Processes markdown syntax
81
+ 2. **CSS Engine**: Applies styling and layout
82
+ 3. **PDF Generator**: Creates final PDF output
83
+ 4. **Backend Manager**: Handles multiple conversion engines
84
+
85
+ ### Data Flow
86
+ ```mermaid
87
+ graph LR
88
+ A[Markdown Input] --> B[Parser]
89
+ B --> C[CSS Processor]
90
+ C --> D[PDF Generator]
91
+ D --> E[PDF Output]
92
+ ```
93
+
94
+ ## Implementation
95
+
96
+ ### Python Code Example
97
+ ```python
98
+ from raganything.enhanced_markdown import EnhancedMarkdownConverter, MarkdownConfig
99
+
100
+ # Configure converter
101
+ config = MarkdownConfig(
102
+ page_size="A4",
103
+ margin="1in",
104
+ include_toc=True,
105
+ syntax_highlighting=True
106
+ )
107
+
108
+ # Create converter
109
+ converter = EnhancedMarkdownConverter(config)
110
+
111
+ # Convert to PDF
112
+ success = converter.convert_file_to_pdf(
113
+ input_path="document.md",
114
+ output_path="output.pdf",
115
+ method="weasyprint"
116
+ )
117
+ ```
118
+
119
+ ### Configuration Options
120
+ ```yaml
121
+ converter:
122
+ page_size: A4
123
+ margin: 1in
124
+ font_size: 12pt
125
+ include_toc: true
126
+ syntax_highlighting: true
127
+ backend: weasyprint
128
+ ```
129
+
130
+ ## Performance
131
+
132
+ ### Benchmark Results
133
+ | Backend | Speed | Quality | Features |
134
+ |---------|-------|---------|----------|
135
+ | WeasyPrint | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ |
136
+ | Pandoc | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
137
+
138
+ ### Processing Times
139
+ - **Small documents** (< 10 pages): 1-3 seconds
140
+ - **Medium documents** (10-50 pages): 3-10 seconds
141
+ - **Large documents** (> 50 pages): 10-30 seconds
142
+
143
+ ## Advanced Features
144
+
145
+ ### Custom CSS Styling
146
+ The system supports advanced CSS customization:
147
+
148
+ ```css
149
+ body {
150
+ font-family: 'Georgia', serif;
151
+ line-height: 1.6;
152
+ color: #333;
153
+ }
154
+
155
+ h1 {
156
+ color: #2c3e50;
157
+ border-bottom: 2px solid #3498db;
158
+ padding-bottom: 0.3em;
159
+ }
160
+
161
+ code {
162
+ background-color: #f8f9fa;
163
+ padding: 2px 4px;
164
+ border-radius: 3px;
165
+ font-family: 'Courier New', monospace;
166
+ }
167
+
168
+ pre {
169
+ background-color: #f8f9fa;
170
+ border-left: 4px solid #3498db;
171
+ padding: 15px;
172
+ border-radius: 5px;
173
+ overflow-x: auto;
174
+ }
175
+
176
+ table {
177
+ border-collapse: collapse;
178
+ width: 100%;
179
+ margin: 1em 0;
180
+ }
181
+
182
+ th, td {
183
+ border: 1px solid #ddd;
184
+ padding: 8px 12px;
185
+ text-align: left;
186
+ }
187
+
188
+ th {
189
+ background-color: #f2f2f2;
190
+ font-weight: bold;
191
+ }
192
+ ```
193
+
194
+ ### Image Support
195
+ ![Sample Image](https://via.placeholder.com/400x200/3498db/ffffff?text=Sample+Image)
196
+
197
+ Images are automatically scaled and positioned appropriately in the PDF output.
198
+
199
+ ## Conclusion
200
+ The enhanced markdown conversion system provides professional-quality PDF generation with extensive customization options and multiple backend support.
201
+
202
+ ---
203
+
204
+ *Generated on: 2024-01-15*
205
+ *Version: 1.0.0*
206
+ """
207
+
208
+ # Academic paper sample
209
+ academic_content = """# Research Paper: Advanced Document Processing
210
+
211
+ **Authors:** Alice Johnson¹, Bob Smith², Carol Williams¹
212
+ **Affiliations:**
213
+ ¹ University of Technology
214
+ ² Research Institute
215
+
216
+ ## Abstract
217
+
218
+ This paper presents a comprehensive analysis of advanced document processing techniques using enhanced markdown conversion. Our research demonstrates significant improvements in processing speed and output quality through optimized backend selection and custom styling approaches.
219
+
220
+ **Keywords:** document processing, markdown conversion, PDF generation, performance optimization
221
+
222
+ ## 1. Introduction
223
+
224
+ Document processing has become increasingly important in modern information systems. The ability to convert markdown documents to high-quality PDF outputs with professional formatting is crucial for academic, technical, and business applications.
225
+
226
+ ### 1.1 Research Objectives
227
+
228
+ 1. Evaluate different markdown conversion backends
229
+ 2. Analyze performance characteristics of each approach
230
+ 3. Develop optimization strategies for large-scale processing
231
+ 4. Design flexible configuration systems for diverse use cases
232
+
233
+ ### 1.2 Contributions
234
+
235
+ This work makes the following contributions:
236
+ - Comprehensive comparison of markdown conversion backends
237
+ - Performance optimization techniques for large documents
238
+ - Flexible configuration framework for customization
239
+ - Integration patterns for document processing pipelines
240
+
241
+ ## 2. Methodology
242
+
243
+ ### 2.1 Experimental Setup
244
+
245
+ We conducted experiments using the following configuration:
246
+
247
+ ```python
248
+ # Experimental configuration
249
+ config = MarkdownConfig(
250
+ page_size="A4",
251
+ margin="1in",
252
+ font_size="11pt",
253
+ line_height="1.4",
254
+ include_toc=True,
255
+ syntax_highlighting=True
256
+ )
257
+ ```
258
+
259
+ ### 2.2 Test Documents
260
+
261
+ | Category | Count | Avg Size | Complexity |
262
+ |----------|-------|----------|------------|
263
+ | Simple | 100 | 2 pages | Low |
264
+ | Medium | 50 | 10 pages | Medium |
265
+ | Complex | 25 | 25 pages | High |
266
+
267
+ ### 2.3 Metrics
268
+
269
+ We evaluated performance using the following metrics:
270
+ - **Conversion Speed**: Time to generate PDF (seconds)
271
+ - **Memory Usage**: Peak memory consumption (MB)
272
+ - **Output Quality**: Visual assessment score (1-10)
273
+ - **Feature Support**: Number of supported markdown features
274
+
275
+ ## 3. Results
276
+
277
+ ### 3.1 Performance Comparison
278
+
279
+ The following table summarizes our performance results:
280
+
281
+ | Backend | Speed (s) | Memory (MB) | Quality | Features |
282
+ |---------|-----------|-------------|---------|----------|
283
+ | WeasyPrint | 2.3 ± 0.5 | 85 ± 15 | 8.5 | 85% |
284
+ | Pandoc | 4.7 ± 1.2 | 120 ± 25 | 9.2 | 95% |
285
+
286
+ ### 3.2 Quality Analysis
287
+
288
+ #### 3.2.1 Typography
289
+ WeasyPrint excels in web-style typography with excellent CSS support, while Pandoc provides superior academic formatting with LaTeX-quality output.
290
+
291
+ #### 3.2.2 Code Highlighting
292
+ Both backends support syntax highlighting through Pygments:
293
+
294
+ ```python
295
+ def analyze_performance(backend, documents):
296
+ '''Analyze conversion performance for given backend'''
297
+ results = []
298
+
299
+ for doc in documents:
300
+ start_time = time.time()
301
+ success = backend.convert(doc)
302
+ end_time = time.time()
303
+
304
+ results.append({
305
+ 'document': doc,
306
+ 'time': end_time - start_time,
307
+ 'success': success
308
+ })
309
+
310
+ return results
311
+ ```
312
+
313
+ ### 3.3 Scalability
314
+
315
+ Our scalability analysis shows:
316
+ - Linear scaling with document size for both backends
317
+ - Memory usage proportional to content complexity
318
+ - Optimal batch sizes of 10-20 documents for parallel processing
319
+
320
+ ## 4. Discussion
321
+
322
+ ### 4.1 Backend Selection Guidelines
323
+
324
+ Choose **WeasyPrint** for:
325
+ - Web-style documents with custom CSS
326
+ - Fast conversion requirements
327
+ - Simple to medium complexity documents
328
+
329
+ Choose **Pandoc** for:
330
+ - Academic papers and publications
331
+ - Complex document structures
332
+ - Maximum feature support requirements
333
+
334
+ ### 4.2 Optimization Strategies
335
+
336
+ 1. **Image Optimization**: Compress images before embedding
337
+ 2. **CSS Minimization**: Use efficient CSS selectors
338
+ 3. **Content Chunking**: Process large documents in sections
339
+ 4. **Caching**: Cache converted content for repeated use
340
+
341
+ ## 5. Conclusion
342
+
343
+ This research demonstrates that enhanced markdown conversion provides significant benefits for document processing workflows. The choice between WeasyPrint and Pandoc depends on specific requirements for speed, quality, and features.
344
+
345
+ ### 5.1 Future Work
346
+
347
+ - Integration with cloud processing services
348
+ - Real-time collaborative editing support
349
+ - Advanced template systems
350
+ - Performance optimization for very large documents
351
+
352
+ ## References
353
+
354
+ 1. Johnson, A. et al. (2024). "Advanced Document Processing Techniques." *Journal of Information Systems*, 15(3), 45-62.
355
+ 2. Smith, B. (2023). "PDF Generation Optimization." *Technical Computing Review*, 8(2), 12-28.
356
+ 3. Williams, C. (2024). "Markdown Processing Frameworks." *Software Engineering Quarterly*, 22(1), 78-95.
357
+
358
+ ---
359
+
360
+ **Manuscript received:** January 10, 2024
361
+ **Accepted for publication:** January 15, 2024
362
+ **Published online:** January 20, 2024
363
+ """
364
+
365
+ return {
366
+ "basic": basic_content,
367
+ "technical": technical_content,
368
+ "academic": academic_content,
369
+ }
370
+
371
+
372
+ def demonstrate_basic_conversion():
373
+ """Demonstrate basic markdown to PDF conversion"""
374
+ print("\n" + "=" * 60)
375
+ print("BASIC MARKDOWN CONVERSION DEMONSTRATION")
376
+ print("=" * 60)
377
+
378
+ try:
379
+ # Create converter with default settings
380
+ converter = EnhancedMarkdownConverter()
381
+
382
+ # Show backend information
383
+ backend_info = converter.get_backend_info()
384
+ print("Available conversion backends:")
385
+ for backend, available in backend_info["available_backends"].items():
386
+ status = "✅" if available else "❌"
387
+ print(f" {status} {backend}")
388
+ print(f"Recommended backend: {backend_info['recommended_backend']}")
389
+
390
+ # Get sample content
391
+ samples = create_sample_markdown_content()
392
+ temp_dir = Path(tempfile.mkdtemp())
393
+
394
+ # Convert basic sample
395
+ basic_md_path = temp_dir / "basic_sample.md"
396
+ with open(basic_md_path, "w", encoding="utf-8") as f:
397
+ f.write(samples["basic"])
398
+
399
+ print(f"\nConverting basic sample: {basic_md_path}")
400
+
401
+ success = converter.convert_file_to_pdf(
402
+ input_path=str(basic_md_path),
403
+ output_path=str(temp_dir / "basic_sample.pdf"),
404
+ method="auto", # Let the system choose the best backend
405
+ )
406
+
407
+ if success:
408
+ print("✅ Basic conversion successful!")
409
+ print(f" Output: {temp_dir / 'basic_sample.pdf'}")
410
+ else:
411
+ print("❌ Basic conversion failed")
412
+
413
+ return success, temp_dir
414
+
415
+ except Exception as e:
416
+ print(f"❌ Basic conversion demonstration failed: {str(e)}")
417
+ return False, None
418
+
419
+
420
+ def demonstrate_backend_comparison():
421
+ """Demonstrate different conversion backends"""
422
+ print("\n" + "=" * 60)
423
+ print("BACKEND COMPARISON DEMONSTRATION")
424
+ print("=" * 60)
425
+
426
+ try:
427
+ samples = create_sample_markdown_content()
428
+ temp_dir = Path(tempfile.mkdtemp())
429
+
430
+ # Create technical document
431
+ tech_md_path = temp_dir / "technical.md"
432
+ with open(tech_md_path, "w", encoding="utf-8") as f:
433
+ f.write(samples["technical"])
434
+
435
+ print("Testing different backends with technical document...")
436
+
437
+ # Test different backends
438
+ backends = ["auto", "weasyprint", "pandoc"]
439
+ results = {}
440
+
441
+ for backend in backends:
442
+ try:
443
+ print(f"\nTesting {backend} backend...")
444
+
445
+ converter = EnhancedMarkdownConverter()
446
+ output_path = temp_dir / f"technical_{backend}.pdf"
447
+
448
+ import time
449
+
450
+ start_time = time.time()
451
+
452
+ success = converter.convert_file_to_pdf(
453
+ input_path=str(tech_md_path),
454
+ output_path=str(output_path),
455
+ method=backend,
456
+ )
457
+
458
+ end_time = time.time()
459
+ conversion_time = end_time - start_time
460
+
461
+ if success:
462
+ file_size = (
463
+ output_path.stat().st_size if output_path.exists() else 0
464
+ )
465
+ print(
466
+ f" ✅ {backend}: Success in {conversion_time:.2f}s, {file_size} bytes"
467
+ )
468
+ results[backend] = {
469
+ "success": True,
470
+ "time": conversion_time,
471
+ "size": file_size,
472
+ "output": str(output_path),
473
+ }
474
+ else:
475
+ print(f" ❌ {backend}: Failed")
476
+ results[backend] = {"success": False, "time": conversion_time}
477
+
478
+ except Exception as e:
479
+ print(f" ❌ {backend}: Error - {str(e)}")
480
+ results[backend] = {"success": False, "error": str(e)}
481
+
482
+ # Summary
483
+ print("\n" + "-" * 40)
484
+ print("BACKEND COMPARISON SUMMARY")
485
+ print("-" * 40)
486
+ successful_backends = [b for b, r in results.items() if r.get("success", False)]
487
+ print(f"Successful backends: {successful_backends}")
488
+
489
+ if successful_backends:
490
+ fastest = min(successful_backends, key=lambda b: results[b]["time"])
491
+ print(f"Fastest backend: {fastest} ({results[fastest]['time']:.2f}s)")
492
+
493
+ return results, temp_dir
494
+
495
+ except Exception as e:
496
+ print(f"❌ Backend comparison demonstration failed: {str(e)}")
497
+ return None, None
498
+
499
+
500
+ def demonstrate_custom_styling():
501
+ """Demonstrate custom CSS styling and configuration"""
502
+ print("\n" + "=" * 60)
503
+ print("CUSTOM STYLING DEMONSTRATION")
504
+ print("=" * 60)
505
+
506
+ try:
507
+ samples = create_sample_markdown_content()
508
+ temp_dir = Path(tempfile.mkdtemp())
509
+
510
+ # Create custom CSS
511
+ custom_css = """
512
+ body {
513
+ font-family: 'Times New Roman', serif;
514
+ font-size: 11pt;
515
+ line-height: 1.4;
516
+ color: #2c3e50;
517
+ max-width: 800px;
518
+ margin: 0 auto;
519
+ padding: 20px;
520
+ }
521
+
522
+ h1 {
523
+ color: #c0392b;
524
+ font-size: 2.2em;
525
+ border-bottom: 3px solid #e74c3c;
526
+ padding-bottom: 0.5em;
527
+ margin-top: 2em;
528
+ }
529
+
530
+ h2 {
531
+ color: #8e44ad;
532
+ font-size: 1.6em;
533
+ border-bottom: 2px solid #9b59b6;
534
+ padding-bottom: 0.3em;
535
+ margin-top: 1.5em;
536
+ }
537
+
538
+ h3 {
539
+ color: #2980b9;
540
+ font-size: 1.3em;
541
+ margin-top: 1.2em;
542
+ }
543
+
544
+ code {
545
+ background-color: #ecf0f1;
546
+ color: #e74c3c;
547
+ padding: 3px 6px;
548
+ border-radius: 4px;
549
+ font-family: 'Courier New', monospace;
550
+ font-size: 0.9em;
551
+ }
552
+
553
+ pre {
554
+ background-color: #2c3e50;
555
+ color: #ecf0f1;
556
+ padding: 20px;
557
+ border-radius: 8px;
558
+ border-left: 5px solid #3498db;
559
+ overflow-x: auto;
560
+ font-size: 0.9em;
561
+ }
562
+
563
+ pre code {
564
+ background-color: transparent;
565
+ color: inherit;
566
+ padding: 0;
567
+ }
568
+
569
+ blockquote {
570
+ background-color: #f8f9fa;
571
+ border-left: 5px solid #3498db;
572
+ margin: 1em 0;
573
+ padding: 15px 20px;
574
+ font-style: italic;
575
+ color: #555;
576
+ }
577
+
578
+ table {
579
+ border-collapse: collapse;
580
+ width: 100%;
581
+ margin: 1.5em 0;
582
+ background-color: white;
583
+ border-radius: 8px;
584
+ overflow: hidden;
585
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
586
+ }
587
+
588
+ th {
589
+ background-color: #3498db;
590
+ color: white;
591
+ padding: 12px 15px;
592
+ text-align: left;
593
+ font-weight: bold;
594
+ }
595
+
596
+ td {
597
+ padding: 10px 15px;
598
+ border-bottom: 1px solid #ecf0f1;
599
+ }
600
+
601
+ tr:nth-child(even) {
602
+ background-color: #f8f9fa;
603
+ }
604
+
605
+ tr:hover {
606
+ background-color: #e8f4fd;
607
+ }
608
+
609
+ ul, ol {
610
+ margin-bottom: 1em;
611
+ padding-left: 2em;
612
+ }
613
+
614
+ li {
615
+ margin-bottom: 0.5em;
616
+ line-height: 1.6;
617
+ }
618
+
619
+ a {
620
+ color: #3498db;
621
+ text-decoration: none;
622
+ border-bottom: 1px dotted #3498db;
623
+ }
624
+
625
+ a:hover {
626
+ color: #2980b9;
627
+ border-bottom: 1px solid #2980b9;
628
+ }
629
+
630
+ .toc {
631
+ background-color: #f8f9fa;
632
+ border: 2px solid #e9ecef;
633
+ border-radius: 8px;
634
+ padding: 20px;
635
+ margin: 2em 0;
636
+ }
637
+
638
+ .toc h2 {
639
+ color: #2c3e50;
640
+ margin-top: 0;
641
+ border-bottom: none;
642
+ }
643
+
644
+ .toc ul {
645
+ list-style-type: none;
646
+ padding-left: 0;
647
+ }
648
+
649
+ .toc li {
650
+ margin-bottom: 0.8em;
651
+ }
652
+
653
+ .toc a {
654
+ color: #2c3e50;
655
+ font-weight: 500;
656
+ border-bottom: none;
657
+ }
658
+ """
659
+
660
+ # Create custom configuration
661
+ config = MarkdownConfig(
662
+ page_size="A4",
663
+ margin="0.8in",
664
+ font_size="11pt",
665
+ line_height="1.4",
666
+ include_toc=True,
667
+ syntax_highlighting=True,
668
+ custom_css=custom_css,
669
+ )
670
+
671
+ converter = EnhancedMarkdownConverter(config)
672
+
673
+ # Convert academic sample with custom styling
674
+ academic_md_path = temp_dir / "academic_styled.md"
675
+ with open(academic_md_path, "w", encoding="utf-8") as f:
676
+ f.write(samples["academic"])
677
+
678
+ print("Converting academic paper with custom styling...")
679
+ print("Custom styling features:")
680
+ print(" - Custom color scheme (reds, purples, blues)")
681
+ print(" - Times New Roman serif font")
682
+ print(" - Enhanced table styling with hover effects")
683
+ print(" - Styled code blocks with dark theme")
684
+ print(" - Custom blockquote styling")
685
+ print(" - Professional header styling")
686
+
687
+ success = converter.convert_file_to_pdf(
688
+ input_path=str(academic_md_path),
689
+ output_path=str(temp_dir / "academic_styled.pdf"),
690
+ method="weasyprint", # WeasyPrint is best for custom CSS
691
+ )
692
+
693
+ if success:
694
+ print("✅ Custom styling conversion successful!")
695
+ print(f" Output: {temp_dir / 'academic_styled.pdf'}")
696
+
697
+ # Also create a default version for comparison
698
+ default_converter = EnhancedMarkdownConverter()
699
+ default_success = default_converter.convert_file_to_pdf(
700
+ input_path=str(academic_md_path),
701
+ output_path=str(temp_dir / "academic_default.pdf"),
702
+ method="weasyprint",
703
+ )
704
+
705
+ if default_success:
706
+ print(f" Comparison (default): {temp_dir / 'academic_default.pdf'}")
707
+ else:
708
+ print("❌ Custom styling conversion failed")
709
+
710
+ return success, temp_dir
711
+
712
+ except Exception as e:
713
+ print(f"❌ Custom styling demonstration failed: {str(e)}")
714
+ return False, None
715
+
716
+
717
+ def demonstrate_content_conversion():
718
+ """Demonstrate converting markdown content directly (not from file)"""
719
+ print("\n" + "=" * 60)
720
+ print("CONTENT CONVERSION DEMONSTRATION")
721
+ print("=" * 60)
722
+
723
+ try:
724
+ # Create markdown content programmatically
725
+ dynamic_content = f"""# Dynamic Content Example
726
+
727
+ ## Generated Information
728
+ This document was generated programmatically on {Path(__file__).name}.
729
+
730
+ ## System Information
731
+ - **Python Path**: {sys.executable}
732
+ - **Script Location**: {Path(__file__).absolute()}
733
+ - **Working Directory**: {Path.cwd()}
734
+
735
+ ## Dynamic Table
736
+ | Property | Value |
737
+ |----------|-------|
738
+ | Script Name | {Path(__file__).name} |
739
+ | Python Version | {sys.version.split()[0]} |
740
+ | Platform | {sys.platform} |
741
+
742
+ ## Code Example
743
+ ```python
744
+ # This content was generated dynamically
745
+ import sys
746
+ from pathlib import Path
747
+
748
+ def generate_report():
749
+ return f"Report generated from {{Path(__file__).name}}"
750
+
751
+ print(generate_report())
752
+ ```
753
+
754
+ ## Features Demonstrated
755
+ This example shows how to:
756
+ 1. Generate markdown content programmatically
757
+ 2. Convert content directly without saving to file first
758
+ 3. Include dynamic information in documents
759
+ 4. Use different conversion methods
760
+
761
+ > **Note**: This content was created in memory and converted directly to PDF
762
+ > without intermediate file storage.
763
+
764
+ ## Conclusion
765
+ Direct content conversion is useful for:
766
+ - Dynamic report generation
767
+ - Programmatic document creation
768
+ - API-based document services
769
+ - Real-time content processing
770
+ """
771
+
772
+ temp_dir = Path(tempfile.mkdtemp())
773
+ converter = EnhancedMarkdownConverter()
774
+
775
+ print("Converting dynamically generated markdown content...")
776
+ print("Content includes:")
777
+ print(" - System information")
778
+ print(" - Dynamic tables with current values")
779
+ print(" - Generated timestamps")
780
+ print(" - Programmatic examples")
781
+
782
+ # Convert content directly to PDF
783
+ output_path = temp_dir / "dynamic_content.pdf"
784
+
785
+ success = converter.convert_markdown_to_pdf(
786
+ markdown_content=dynamic_content,
787
+ output_path=str(output_path),
788
+ method="auto",
789
+ )
790
+
791
+ if success:
792
+ print("✅ Content conversion successful!")
793
+ print(f" Output: {output_path}")
794
+
795
+ # Show file size
796
+ file_size = output_path.stat().st_size
797
+ print(f" Generated PDF size: {file_size} bytes")
798
+ else:
799
+ print("❌ Content conversion failed")
800
+
801
+ return success, temp_dir
802
+
803
+ except Exception as e:
804
+ print(f"❌ Content conversion demonstration failed: {str(e)}")
805
+ return False, None
806
+
807
+
808
+ def demonstrate_error_handling():
809
+ """Demonstrate error handling and fallback mechanisms"""
810
+ print("\n" + "=" * 60)
811
+ print("ERROR HANDLING DEMONSTRATION")
812
+ print("=" * 60)
813
+
814
+ try:
815
+ temp_dir = Path(tempfile.mkdtemp())
816
+
817
+ # Test cases with various issues
818
+ test_cases = {
819
+ "invalid_markdown": """# Invalid Markdown
820
+
821
+ This markdown has some {{invalid}} syntax and [broken links](http://nonexistent.invalid).
822
+
823
+ ```unknown_language
824
+ This code block uses an unknown language
825
+ ```
826
+
827
+ ![Missing Image](nonexistent_image.png)
828
+ """,
829
+ "complex_content": """# Complex Content Test
830
+
831
+ ## Mathematical Expressions
832
+ This tests content that might be challenging for some backends:
833
+
834
+ $$ E = mc^2 $$
835
+
836
+ $$\\sum_{i=1}^{n} x_i = \\frac{n(n+1)}{2}$$
837
+
838
+ ## Complex Tables
839
+ | A | B | C | D | E | F | G |
840
+ |---|---|---|---|---|---|---|
841
+ | Very long content that might wrap | Short | Medium length content | X | Y | Z | End |
842
+ | Another row with different lengths | A | B | C | D | E | F |
843
+
844
+ ## Special Characters
845
+ Unicode: α, β, γ, δ, ε, ζ, η, θ, ι, κ, λ, μ, ν, ξ, ο, π, ρ, σ, τ, υ, φ, χ, ψ, ω
846
+ Symbols: ♠ ♣ ♥ ♦ ☀ ☁ ☂ ☃ ☄ ★ ☆ ☉ ☊ ☋ ☌ ☍ ☎ ☏
847
+ Arrows: ← ↑ → ↓ ↔ ↕ ↖ ↗ ↘ ↙
848
+ """,
849
+ "empty_content": "",
850
+ "minimal_content": "# Just a title",
851
+ }
852
+
853
+ print("Testing error handling with various content types...")
854
+
855
+ results = {}
856
+
857
+ for test_name, content in test_cases.items():
858
+ print(f"\nTesting: {test_name}")
859
+
860
+ try:
861
+ # Try multiple backends for each test case
862
+ for backend in ["auto", "weasyprint", "pandoc"]:
863
+ try:
864
+ converter = EnhancedMarkdownConverter()
865
+ output_path = temp_dir / f"{test_name}_{backend}.pdf"
866
+
867
+ success = converter.convert_markdown_to_pdf(
868
+ markdown_content=content,
869
+ output_path=str(output_path),
870
+ method=backend,
871
+ )
872
+
873
+ if success:
874
+ file_size = (
875
+ output_path.stat().st_size
876
+ if output_path.exists()
877
+ else 0
878
+ )
879
+ print(f" ✅ {backend}: Success ({file_size} bytes)")
880
+ results[f"{test_name}_{backend}"] = {
881
+ "success": True,
882
+ "size": file_size,
883
+ }
884
+ else:
885
+ print(f" ❌ {backend}: Failed")
886
+ results[f"{test_name}_{backend}"] = {"success": False}
887
+
888
+ except Exception as e:
889
+ print(f" ❌ {backend}: Error - {str(e)[:60]}...")
890
+ results[f"{test_name}_{backend}"] = {
891
+ "success": False,
892
+ "error": str(e),
893
+ }
894
+
895
+ except Exception as e:
896
+ print(f" ❌ Test case failed: {str(e)}")
897
+
898
+ # Demonstrate robust conversion with fallbacks
899
+ print("\nDemonstrating robust conversion with fallback logic...")
900
+
901
+ def robust_convert(content, output_path):
902
+ """Convert with multiple backend fallbacks"""
903
+ backends = ["weasyprint", "pandoc", "auto"]
904
+
905
+ for backend in backends:
906
+ try:
907
+ converter = EnhancedMarkdownConverter()
908
+ success = converter.convert_markdown_to_pdf(
909
+ markdown_content=content,
910
+ output_path=output_path,
911
+ method=backend,
912
+ )
913
+ if success:
914
+ return backend, True
915
+ except Exception:
916
+ continue
917
+
918
+ return None, False
919
+
920
+ # Test robust conversion
921
+ test_content = test_cases["complex_content"]
922
+ robust_output = temp_dir / "robust_conversion.pdf"
923
+
924
+ successful_backend, success = robust_convert(test_content, str(robust_output))
925
+
926
+ if success:
927
+ print(f"✅ Robust conversion successful using {successful_backend}")
928
+ print(f" Output: {robust_output}")
929
+ else:
930
+ print("❌ All backends failed for robust conversion")
931
+
932
+ # Summary
933
+ print("\n" + "-" * 40)
934
+ print("ERROR HANDLING SUMMARY")
935
+ print("-" * 40)
936
+ successful_conversions = sum(
937
+ 1 for r in results.values() if r.get("success", False)
938
+ )
939
+ total_attempts = len(results)
940
+ success_rate = (
941
+ (successful_conversions / total_attempts * 100) if total_attempts > 0 else 0
942
+ )
943
+
944
+ print(f"Total conversion attempts: {total_attempts}")
945
+ print(f"Successful conversions: {successful_conversions}")
946
+ print(f"Success rate: {success_rate:.1f}%")
947
+
948
+ return results, temp_dir
949
+
950
+ except Exception as e:
951
+ print(f"❌ Error handling demonstration failed: {str(e)}")
952
+ return None, None
953
+
954
+
955
+ def main():
956
+ """Main demonstration function"""
957
+ # Configure logging
958
+ logging.basicConfig(
959
+ level=logging.INFO,
960
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
961
+ )
962
+
963
+ print("RAG-Anything Enhanced Markdown Conversion Demonstration")
964
+ print("=" * 70)
965
+ print(
966
+ "This example demonstrates various enhanced markdown conversion capabilities:"
967
+ )
968
+ print(" - Basic markdown to PDF conversion")
969
+ print(" - Multiple backend comparison (WeasyPrint vs Pandoc)")
970
+ print(" - Custom CSS styling and professional formatting")
971
+ print(" - Direct content conversion without file I/O")
972
+ print(" - Comprehensive error handling and fallback mechanisms")
973
+
974
+ results = {}
975
+
976
+ # Run demonstrations
977
+ print("\n🚀 Starting demonstrations...")
978
+
979
+ # Basic conversion
980
+ success, temp_dir = demonstrate_basic_conversion()
981
+ results["basic"] = success
982
+
983
+ # Backend comparison
984
+ backend_results, _ = demonstrate_backend_comparison()
985
+ results["backends"] = backend_results
986
+
987
+ # Custom styling
988
+ styling_success, _ = demonstrate_custom_styling()
989
+ results["styling"] = styling_success
990
+
991
+ # Content conversion
992
+ content_success, _ = demonstrate_content_conversion()
993
+ results["content"] = content_success
994
+
995
+ # Error handling
996
+ error_results, _ = demonstrate_error_handling()
997
+ results["error_handling"] = error_results
998
+
999
+ # Summary
1000
+ print("\n" + "=" * 70)
1001
+ print("DEMONSTRATION SUMMARY")
1002
+ print("=" * 70)
1003
+
1004
+ print("✅ Features Successfully Demonstrated:")
1005
+ if results["basic"]:
1006
+ print(" - Basic markdown to PDF conversion")
1007
+ if results["backends"]:
1008
+ successful_backends = [
1009
+ b for b, r in results["backends"].items() if r.get("success", False)
1010
+ ]
1011
+ print(f" - Multiple backends: {successful_backends}")
1012
+ if results["styling"]:
1013
+ print(" - Custom CSS styling and professional formatting")
1014
+ if results["content"]:
1015
+ print(" - Direct content conversion without file I/O")
1016
+ if results["error_handling"]:
1017
+ success_rate = (
1018
+ sum(
1019
+ 1 for r in results["error_handling"].values() if r.get("success", False)
1020
+ )
1021
+ / len(results["error_handling"])
1022
+ * 100
1023
+ )
1024
+ print(f" - Error handling with {success_rate:.1f}% overall success rate")
1025
+
1026
+ print("\n📊 Key Capabilities Highlighted:")
1027
+ print(" - Professional PDF generation with high-quality typography")
1028
+ print(" - Multiple conversion backends with automatic selection")
1029
+ print(" - Extensive CSS customization for branded documents")
1030
+ print(" - Syntax highlighting for code blocks using Pygments")
1031
+ print(" - Table formatting with professional styling")
1032
+ print(" - Image embedding with proper scaling")
1033
+ print(" - Table of contents generation with navigation")
1034
+ print(" - Comprehensive error handling and fallback mechanisms")
1035
+
1036
+ print("\n💡 Best Practices Demonstrated:")
1037
+ print(" - Choose WeasyPrint for web-style documents and custom CSS")
1038
+ print(" - Choose Pandoc for academic papers and complex formatting")
1039
+ print(" - Use 'auto' method for general-purpose conversion")
1040
+ print(" - Implement fallback logic for robust conversion")
1041
+ print(" - Optimize images before embedding in documents")
1042
+ print(" - Test custom CSS with simple content first")
1043
+ print(" - Handle errors gracefully with multiple backend attempts")
1044
+ print(" - Use appropriate page sizes and margins for target use case")
1045
+
1046
+ print("\n🎯 Integration Patterns:")
1047
+ print(" - Standalone conversion for document generation")
1048
+ print(" - Integration with RAG-Anything document pipeline")
1049
+ print(" - API-based document services")
1050
+ print(" - Batch processing for multiple documents")
1051
+ print(" - Dynamic content generation from templates")
1052
+
1053
+
1054
+ if __name__ == "__main__":
1055
+ main()
rag_anything_smaranika/examples/image_format_test.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Image Format Parsing Test Script for RAG-Anything
4
+
5
+ This script demonstrates how to parse various image formats
6
+ using MinerU, including JPG, PNG, BMP, TIFF, GIF, and WebP files.
7
+
8
+ Requirements:
9
+ - PIL/Pillow library for format conversion
10
+ - RAG-Anything package
11
+
12
+ Usage:
13
+ python image_format_test.py --file path/to/image.bmp
14
+ """
15
+
16
+ import argparse
17
+ import asyncio
18
+ import sys
19
+ from pathlib import Path
20
+ from raganything import RAGAnything
21
+
22
+
23
+ def check_pillow_installation():
24
+ """Check if PIL/Pillow is installed and available"""
25
+ try:
26
+ from PIL import Image
27
+
28
+ print(
29
+ f"✅ PIL/Pillow found: PIL version {Image.__version__ if hasattr(Image, '__version__') else 'Unknown'}"
30
+ )
31
+ return True
32
+ except ImportError:
33
+ print("❌ PIL/Pillow not found. Please install Pillow:")
34
+ print(" pip install Pillow")
35
+ return False
36
+
37
+
38
+ def get_image_info(image_path: Path):
39
+ """Get detailed image information"""
40
+ try:
41
+ from PIL import Image
42
+
43
+ with Image.open(image_path) as img:
44
+ return {
45
+ "format": img.format,
46
+ "mode": img.mode,
47
+ "size": img.size,
48
+ "has_transparency": img.mode in ("RGBA", "LA")
49
+ or "transparency" in img.info,
50
+ }
51
+ except Exception as e:
52
+ return {"error": str(e)}
53
+
54
+
55
+ async def test_image_format_parsing(file_path: str):
56
+ """Test image format parsing with MinerU"""
57
+
58
+ print(f"🧪 Testing image format parsing: {file_path}")
59
+
60
+ # Check if file exists and is a supported image format
61
+ file_path = Path(file_path)
62
+ if not file_path.exists():
63
+ print(f"❌ File does not exist: {file_path}")
64
+ return False
65
+
66
+ supported_extensions = {
67
+ ".jpg",
68
+ ".jpeg",
69
+ ".png",
70
+ ".bmp",
71
+ ".tiff",
72
+ ".tif",
73
+ ".gif",
74
+ ".webp",
75
+ }
76
+ if file_path.suffix.lower() not in supported_extensions:
77
+ print(f"❌ Unsupported file format: {file_path.suffix}")
78
+ print(f" Supported formats: {', '.join(supported_extensions)}")
79
+ return False
80
+
81
+ print(f"📸 File format: {file_path.suffix.upper()}")
82
+ print(f"📏 File size: {file_path.stat().st_size / 1024:.1f} KB")
83
+
84
+ # Get detailed image information
85
+ img_info = get_image_info(file_path)
86
+ if "error" not in img_info:
87
+ print("🖼️ Image info:")
88
+ print(f" • Format: {img_info['format']}")
89
+ print(f" • Mode: {img_info['mode']}")
90
+ print(f" • Size: {img_info['size'][0]}x{img_info['size'][1]}")
91
+ print(f" • Has transparency: {img_info['has_transparency']}")
92
+
93
+ # Check format compatibility with MinerU
94
+ mineru_native_formats = {".jpg", ".jpeg", ".png"}
95
+ needs_conversion = file_path.suffix.lower() not in mineru_native_formats
96
+
97
+ if needs_conversion:
98
+ print(
99
+ f"ℹ️ Format {file_path.suffix.upper()} will be converted to PNG for MinerU compatibility"
100
+ )
101
+ else:
102
+ print(f"✅ Format {file_path.suffix.upper()} is natively supported by MinerU")
103
+
104
+ # Initialize RAGAnything (only for parsing functionality)
105
+ rag = RAGAnything()
106
+
107
+ try:
108
+ # Test image parsing with MinerU
109
+ print("\n🔄 Testing image parsing with MinerU...")
110
+ content_list, md_content = await rag.parse_document(
111
+ file_path=str(file_path),
112
+ output_dir="./test_output",
113
+ parse_method="ocr", # Images use OCR method
114
+ display_stats=True,
115
+ )
116
+
117
+ print("✅ Parsing successful!")
118
+ print(f" 📊 Content blocks: {len(content_list)}")
119
+ print(f" 📝 Markdown length: {len(md_content)} characters")
120
+
121
+ # Analyze content types
122
+ content_types = {}
123
+ for item in content_list:
124
+ if isinstance(item, dict):
125
+ content_type = item.get("type", "unknown")
126
+ content_types[content_type] = content_types.get(content_type, 0) + 1
127
+
128
+ if content_types:
129
+ print(" 📋 Content distribution:")
130
+ for content_type, count in sorted(content_types.items()):
131
+ print(f" • {content_type}: {count}")
132
+
133
+ # Display extracted text (if any)
134
+ if md_content.strip():
135
+ print("\n📄 Extracted text preview (first 500 characters):")
136
+ preview = md_content.strip()[:500]
137
+ print(f" {preview}{'...' if len(md_content) > 500 else ''}")
138
+ else:
139
+ print("\n📄 No text extracted from the image")
140
+
141
+ # Display image processing results
142
+ image_items = [
143
+ item
144
+ for item in content_list
145
+ if isinstance(item, dict) and item.get("type") == "image"
146
+ ]
147
+ if image_items:
148
+ print(f"\n🖼️ Found {len(image_items)} processed image(s):")
149
+ for i, item in enumerate(image_items, 1):
150
+ print(f" {i}. Image path: {item.get('img_path', 'N/A')}")
151
+ caption = item.get("image_caption", item.get("img_caption", []))
152
+ if caption:
153
+ print(f" Caption: {caption[0] if caption else 'N/A'}")
154
+
155
+ # Display text blocks (OCR results)
156
+ text_items = [
157
+ item
158
+ for item in content_list
159
+ if isinstance(item, dict) and item.get("type") == "text"
160
+ ]
161
+ if text_items:
162
+ print("\n📝 OCR text blocks found:")
163
+ for i, item in enumerate(text_items, 1):
164
+ text_content = item.get("text", "")
165
+ if text_content.strip():
166
+ preview = text_content.strip()[:200]
167
+ print(
168
+ f" {i}. {preview}{'...' if len(text_content) > 200 else ''}"
169
+ )
170
+
171
+ # Check for any tables detected in the image
172
+ table_items = [
173
+ item
174
+ for item in content_list
175
+ if isinstance(item, dict) and item.get("type") == "table"
176
+ ]
177
+ if table_items:
178
+ print(f"\n📊 Found {len(table_items)} table(s) in image:")
179
+ for i, item in enumerate(table_items, 1):
180
+ print(f" {i}. Table detected with content")
181
+
182
+ print("\n🎉 Image format parsing test completed successfully!")
183
+ print("📁 Output files saved to: ./test_output")
184
+ return True
185
+
186
+ except Exception as e:
187
+ print(f"\n❌ Image format parsing failed: {str(e)}")
188
+ import traceback
189
+
190
+ print(f" Full error: {traceback.format_exc()}")
191
+ return False
192
+
193
+
194
+ def main():
195
+ """Main function"""
196
+ parser = argparse.ArgumentParser(
197
+ description="Test image format parsing with MinerU"
198
+ )
199
+ parser.add_argument("--file", help="Path to the image file to test")
200
+ parser.add_argument(
201
+ "--check-pillow", action="store_true", help="Only check PIL/Pillow installation"
202
+ )
203
+
204
+ args = parser.parse_args()
205
+
206
+ # Check PIL/Pillow installation
207
+ print("🔧 Checking PIL/Pillow installation...")
208
+ if not check_pillow_installation():
209
+ return 1
210
+
211
+ if args.check_pillow:
212
+ print("✅ PIL/Pillow installation check passed!")
213
+ return 0
214
+
215
+ # If not just checking dependencies, file argument is required
216
+ if not args.file:
217
+ print("❌ Error: --file argument is required when not using --check-pillow")
218
+ parser.print_help()
219
+ return 1
220
+
221
+ # Run the parsing test
222
+ try:
223
+ success = asyncio.run(test_image_format_parsing(args.file))
224
+ return 0 if success else 1
225
+ except KeyboardInterrupt:
226
+ print("\n⏹️ Test interrupted by user")
227
+ return 1
228
+ except Exception as e:
229
+ print(f"\n❌ Unexpected error: {str(e)}")
230
+ return 1
231
+
232
+
233
+ if __name__ == "__main__":
234
+ sys.exit(main())
rag_anything_smaranika/examples/insert_content_list_example.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Example script demonstrating direct content list insertion with RAGAnything
4
+
5
+ This example shows how to:
6
+ 1. Create a simple content list with different content types
7
+ 2. Insert content list directly without document parsing using insert_content_list() method
8
+ 3. Perform pure text queries using aquery() method
9
+ 4. Perform multimodal queries with specific multimodal content using aquery_with_multimodal() method
10
+ 5. Handle different types of multimodal content in the inserted knowledge base
11
+ """
12
+
13
+ import os
14
+ import argparse
15
+ import asyncio
16
+ import logging
17
+ import logging.config
18
+ from pathlib import Path
19
+
20
+ # Add project root directory to Python path
21
+ import sys
22
+
23
+ sys.path.append(str(Path(__file__).parent.parent))
24
+
25
+ from lightrag.llm.openai import openai_complete_if_cache, openai_embed
26
+ from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
27
+ from raganything import RAGAnything, RAGAnythingConfig
28
+
29
+ from dotenv import load_dotenv
30
+
31
+ load_dotenv(dotenv_path=".env", override=False)
32
+
33
+
34
+ def configure_logging():
35
+ """Configure logging for the application"""
36
+ # Get log directory path from environment variable or use current directory
37
+ log_dir = os.getenv("LOG_DIR", os.getcwd())
38
+ log_file_path = os.path.abspath(
39
+ os.path.join(log_dir, "insert_content_list_example.log")
40
+ )
41
+
42
+ print(f"\nInsert Content List example log file: {log_file_path}\n")
43
+ os.makedirs(os.path.dirname(log_dir), exist_ok=True)
44
+
45
+ # Get log file max size and backup count from environment variables
46
+ log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760)) # Default 10MB
47
+ log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5)) # Default 5 backups
48
+
49
+ logging.config.dictConfig(
50
+ {
51
+ "version": 1,
52
+ "disable_existing_loggers": False,
53
+ "formatters": {
54
+ "default": {
55
+ "format": "%(levelname)s: %(message)s",
56
+ },
57
+ "detailed": {
58
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
59
+ },
60
+ },
61
+ "handlers": {
62
+ "console": {
63
+ "formatter": "default",
64
+ "class": "logging.StreamHandler",
65
+ "stream": "ext://sys.stderr",
66
+ },
67
+ "file": {
68
+ "formatter": "detailed",
69
+ "class": "logging.handlers.RotatingFileHandler",
70
+ "filename": log_file_path,
71
+ "maxBytes": log_max_bytes,
72
+ "backupCount": log_backup_count,
73
+ "encoding": "utf-8",
74
+ },
75
+ },
76
+ "loggers": {
77
+ "lightrag": {
78
+ "handlers": ["console", "file"],
79
+ "level": "INFO",
80
+ "propagate": False,
81
+ },
82
+ },
83
+ }
84
+ )
85
+
86
+ # Set the logger level to INFO
87
+ logger.setLevel(logging.INFO)
88
+ # Enable verbose debug if needed
89
+ set_verbose_debug(os.getenv("VERBOSE", "false").lower() == "true")
90
+
91
+
92
+ def create_sample_content_list():
93
+ """
94
+ Create a simple content list for testing insert_content_list functionality
95
+
96
+ Returns:
97
+ List[Dict]: Sample content list with various content types
98
+
99
+ Note:
100
+ - img_path should be absolute path to the image file
101
+ - page_idx represents the page number where the content appears (0-based)
102
+ """
103
+ content_list = [
104
+ # Introduction text
105
+ {
106
+ "type": "text",
107
+ "text": "Welcome to the RAGAnything System Documentation. This guide covers the advanced multimodal document processing capabilities and features of our comprehensive RAG system.",
108
+ "page_idx": 0, # Page number where this content appears
109
+ },
110
+ # System architecture image
111
+ {
112
+ "type": "image",
113
+ "img_path": "/absolute/path/to/system_architecture.jpg", # IMPORTANT: Use absolute path to image file
114
+ "image_caption": ["Figure 1: RAGAnything System Architecture"],
115
+ "image_footnote": [
116
+ "The architecture shows the complete pipeline from document parsing to multimodal query processing"
117
+ ],
118
+ "page_idx": 1, # Page number where this image appears
119
+ },
120
+ # Performance comparison table
121
+ {
122
+ "type": "table",
123
+ "table_body": """| System | Accuracy | Processing Speed | Memory Usage |
124
+ |--------|----------|------------------|--------------|
125
+ | RAGAnything | 95.2% | 120ms | 2.1GB |
126
+ | Traditional RAG | 87.3% | 180ms | 3.2GB |
127
+ | Baseline System | 82.1% | 220ms | 4.1GB |
128
+ | Simple Retrieval | 76.5% | 95ms | 1.8GB |""",
129
+ "table_caption": [
130
+ "Table 1: Performance Comparison of Different RAG Systems"
131
+ ],
132
+ "table_footnote": [
133
+ "All tests conducted on the same hardware with identical test datasets"
134
+ ],
135
+ "page_idx": 2, # Page number where this table appears
136
+ },
137
+ # Mathematical formula
138
+ {
139
+ "type": "equation",
140
+ "latex": "Relevance(d, q) = \\sum_{i=1}^{n} w_i \\cdot sim(t_i^d, t_i^q) \\cdot \\alpha_i",
141
+ "text": "Document relevance scoring formula where w_i are term weights, sim() is similarity function, and α_i are modality importance factors",
142
+ "page_idx": 3, # Page number where this equation appears
143
+ },
144
+ # Feature description
145
+ {
146
+ "type": "text",
147
+ "text": "The system supports multiple content modalities including text, images, tables, and mathematical equations. Each modality is processed using specialized processors optimized for that content type.",
148
+ "page_idx": 4, # Page number where this content appears
149
+ },
150
+ # Technical specifications table
151
+ {
152
+ "type": "table",
153
+ "table_body": """| Feature | Specification |
154
+ |---------|---------------|
155
+ | Supported Formats | PDF, DOCX, PPTX, XLSX, Images |
156
+ | Max Document Size | 100MB |
157
+ | Concurrent Processing | Up to 8 documents |
158
+ | Query Response Time | <200ms average |
159
+ | Knowledge Graph Nodes | Up to 1M entities |""",
160
+ "table_caption": ["Table 2: Technical Specifications"],
161
+ "table_footnote": [
162
+ "Specifications may vary based on hardware configuration"
163
+ ],
164
+ "page_idx": 5, # Page number where this table appears
165
+ },
166
+ # Conclusion
167
+ {
168
+ "type": "text",
169
+ "text": "RAGAnything represents a significant advancement in multimodal document processing, providing comprehensive solutions for complex knowledge extraction and retrieval tasks.",
170
+ "page_idx": 6, # Page number where this content appears
171
+ },
172
+ ]
173
+
174
+ return content_list
175
+
176
+
177
+ async def demo_insert_content_list(
178
+ api_key: str,
179
+ base_url: str = None,
180
+ working_dir: str = None,
181
+ ):
182
+ """
183
+ Demonstrate content list insertion and querying with RAGAnything
184
+
185
+ Args:
186
+ api_key: OpenAI API key
187
+ base_url: Optional base URL for API
188
+ working_dir: Working directory for RAG storage
189
+ """
190
+ try:
191
+ # Create RAGAnything configuration
192
+ config = RAGAnythingConfig(
193
+ working_dir=working_dir or "./rag_storage",
194
+ enable_image_processing=True,
195
+ enable_table_processing=True,
196
+ enable_equation_processing=True,
197
+ display_content_stats=True, # Show content statistics
198
+ )
199
+
200
+ # Define LLM model function
201
+ def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
202
+ return openai_complete_if_cache(
203
+ "gpt-4o-mini",
204
+ prompt,
205
+ system_prompt=system_prompt,
206
+ history_messages=history_messages,
207
+ api_key=api_key,
208
+ base_url=base_url,
209
+ **kwargs,
210
+ )
211
+
212
+ # Define vision model function for image processing
213
+ def vision_model_func(
214
+ prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
215
+ ):
216
+ if image_data:
217
+ return openai_complete_if_cache(
218
+ "gpt-4o",
219
+ "",
220
+ system_prompt=None,
221
+ history_messages=[],
222
+ messages=[
223
+ {"role": "system", "content": system_prompt}
224
+ if system_prompt
225
+ else None,
226
+ {
227
+ "role": "user",
228
+ "content": [
229
+ {"type": "text", "text": prompt},
230
+ {
231
+ "type": "image_url",
232
+ "image_url": {
233
+ "url": f"data:image/jpeg;base64,{image_data}"
234
+ },
235
+ },
236
+ ],
237
+ }
238
+ if image_data
239
+ else {"role": "user", "content": prompt},
240
+ ],
241
+ api_key=api_key,
242
+ base_url=base_url,
243
+ **kwargs,
244
+ )
245
+ else:
246
+ return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
247
+
248
+ # Define embedding function
249
+ embedding_func = EmbeddingFunc(
250
+ embedding_dim=3072,
251
+ max_token_size=8192,
252
+ func=lambda texts: openai_embed(
253
+ texts,
254
+ model="text-embedding-3-large",
255
+ api_key=api_key,
256
+ base_url=base_url,
257
+ ),
258
+ )
259
+
260
+ # Initialize RAGAnything
261
+ rag = RAGAnything(
262
+ config=config,
263
+ llm_model_func=llm_model_func,
264
+ vision_model_func=vision_model_func,
265
+ embedding_func=embedding_func,
266
+ )
267
+
268
+ # Create sample content list
269
+ logger.info("Creating sample content list...")
270
+ content_list = create_sample_content_list()
271
+ logger.info(f"Created content list with {len(content_list)} items")
272
+
273
+ # Insert content list directly
274
+ logger.info("\nInserting content list into RAGAnything...")
275
+ await rag.insert_content_list(
276
+ content_list=content_list,
277
+ file_path="raganything_documentation.pdf", # Reference file name for citation
278
+ split_by_character=None, # Optional text splitting
279
+ split_by_character_only=False, # Optional text splitting mode
280
+ doc_id="demo-doc-001", # Custom document ID
281
+ display_stats=True, # Show content statistics
282
+ )
283
+ logger.info("Content list insertion completed!")
284
+
285
+ # Example queries - demonstrating different query approaches
286
+ logger.info("\nQuerying inserted content:")
287
+
288
+ # 1. Pure text queries using aquery()
289
+ text_queries = [
290
+ "What is RAGAnything and what are its main features?",
291
+ "How does RAGAnything compare to traditional RAG systems?",
292
+ "What are the technical specifications of the system?",
293
+ ]
294
+
295
+ for query in text_queries:
296
+ logger.info(f"\n[Text Query]: {query}")
297
+ result = await rag.aquery(query, mode="hybrid")
298
+ logger.info(f"Answer: {result}")
299
+
300
+ # 2. Multimodal query with specific multimodal content using aquery_with_multimodal()
301
+ logger.info(
302
+ "\n[Multimodal Query]: Analyzing new performance data against existing benchmarks"
303
+ )
304
+ multimodal_result = await rag.aquery_with_multimodal(
305
+ "Compare this new performance data with the existing benchmark results in the documentation",
306
+ multimodal_content=[
307
+ {
308
+ "type": "table",
309
+ "table_data": """Method,Accuracy,Speed,Memory
310
+ New_Approach,97.1%,110ms,1.9GB
311
+ Enhanced_RAG,91.4%,140ms,2.5GB""",
312
+ "table_caption": "Latest experimental results",
313
+ }
314
+ ],
315
+ mode="hybrid",
316
+ )
317
+ logger.info(f"Answer: {multimodal_result}")
318
+
319
+ # 3. Another multimodal query with equation content
320
+ logger.info("\n[Multimodal Query]: Mathematical formula analysis")
321
+ equation_result = await rag.aquery_with_multimodal(
322
+ "How does this similarity formula relate to the relevance scoring mentioned in the documentation?",
323
+ multimodal_content=[
324
+ {
325
+ "type": "equation",
326
+ "latex": "sim(a, b) = \\frac{a \\cdot b}{||a|| \\times ||b||} + \\beta \\cdot context\\_weight",
327
+ "equation_caption": "Enhanced cosine similarity with context weighting",
328
+ }
329
+ ],
330
+ mode="hybrid",
331
+ )
332
+ logger.info(f"Answer: {equation_result}")
333
+
334
+ # 4. Insert another content list with different document ID
335
+ logger.info("\nInserting additional content list...")
336
+ additional_content = [
337
+ {
338
+ "type": "text",
339
+ "text": "This is additional documentation about advanced features and configuration options.",
340
+ "page_idx": 0, # Page number where this content appears
341
+ },
342
+ {
343
+ "type": "table",
344
+ "table_body": """| Configuration | Default Value | Range |
345
+ |---------------|---------------|-------|
346
+ | Chunk Size | 512 tokens | 128-2048 |
347
+ | Context Window | 4096 tokens | 1024-8192 |
348
+ | Batch Size | 32 | 1-128 |""",
349
+ "table_caption": ["Advanced Configuration Parameters"],
350
+ "page_idx": 1, # Page number where this table appears
351
+ },
352
+ ]
353
+
354
+ await rag.insert_content_list(
355
+ content_list=additional_content,
356
+ file_path="advanced_configuration.pdf",
357
+ doc_id="demo-doc-002", # Different document ID
358
+ )
359
+
360
+ # Query combined knowledge base
361
+ logger.info("\n[Combined Query]: What configuration options are available?")
362
+ combined_result = await rag.aquery(
363
+ "What configuration options are available and what are their default values?",
364
+ mode="hybrid",
365
+ )
366
+ logger.info(f"Answer: {combined_result}")
367
+
368
+ except Exception as e:
369
+ logger.error(f"Error in content list insertion demo: {str(e)}")
370
+ import traceback
371
+
372
+ logger.error(traceback.format_exc())
373
+
374
+
375
+ def main():
376
+ """Main function to run the example"""
377
+ parser = argparse.ArgumentParser(description="Insert Content List Example")
378
+ parser.add_argument(
379
+ "--working_dir", "-w", default="./rag_storage", help="Working directory path"
380
+ )
381
+ parser.add_argument(
382
+ "--api-key",
383
+ default=os.getenv("LLM_BINDING_API_KEY"),
384
+ help="OpenAI API key (defaults to LLM_BINDING_API_KEY env var)",
385
+ )
386
+ parser.add_argument(
387
+ "--base-url",
388
+ default=os.getenv("LLM_BINDING_HOST"),
389
+ help="Optional base URL for API",
390
+ )
391
+
392
+ args = parser.parse_args()
393
+
394
+ # Check if API key is provided
395
+ if not args.api_key:
396
+ logger.error("Error: OpenAI API key is required")
397
+ logger.error("Set api key environment variable or use --api-key option")
398
+ return
399
+
400
+ # Run the demo
401
+ asyncio.run(
402
+ demo_insert_content_list(
403
+ args.api_key,
404
+ args.base_url,
405
+ args.working_dir,
406
+ )
407
+ )
408
+
409
+
410
+ if __name__ == "__main__":
411
+ # Configure logging first
412
+ configure_logging()
413
+
414
+ print("RAGAnything Insert Content List Example")
415
+ print("=" * 45)
416
+ print("Demonstrating direct content list insertion without document parsing")
417
+ print("=" * 45)
418
+
419
+ main()