Commit ·
a00e2ad
1
Parent(s): 4353929
[NOTICKET]: adjusted pyproject.toml for OCR PDF
Browse files
- .gitignore +2 -1
- pyproject.toml +4 -0
- src/knowledge/processing_service.py +0 -2
.gitignore
CHANGED
|
@@ -35,4 +35,5 @@ playground_chat.py
|
|
| 35 |
playground_flush_cache.py
|
| 36 |
playground_create_user.py
|
| 37 |
API_CONTRACT.md
|
| 38 |
-
context_engineering/
|
|
|
|
|
|
| 35 |
playground_flush_cache.py
|
| 36 |
playground_create_user.py
|
| 37 |
API_CONTRACT.md
|
| 38 |
+
context_engineering/
|
| 39 |
+
sample_file/
|
pyproject.toml
CHANGED
|
@@ -84,6 +84,10 @@ dependencies = [
|
|
| 84 |
"pymssql>=2.3.0",
|
| 85 |
"sqlalchemy-bigquery>=1.11.0",
|
| 86 |
"snowflake-sqlalchemy>=1.7.0",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
]
|
| 88 |
|
| 89 |
[project.optional-dependencies]
|
|
|
|
| 84 |
"pymssql>=2.3.0",
|
| 85 |
"sqlalchemy-bigquery>=1.11.0",
|
| 86 |
"snowflake-sqlalchemy>=1.7.0",
|
| 87 |
+
# --- OCR (pdf processing) ---
|
| 88 |
+
"pdf2image>=1.17.0",
|
| 89 |
+
"pytesseract>=0.3.13",
|
| 90 |
+
"pypdf2>=3.0.1",
|
| 91 |
]
|
| 92 |
|
| 93 |
[project.optional-dependencies]
|
src/knowledge/processing_service.py
CHANGED
|
@@ -5,12 +5,10 @@ from langchain_core.documents import Document as LangChainDocument
|
|
| 5 |
from src.db.postgres.vector_store import get_vector_store
|
| 6 |
from src.storage.az_blob.az_blob import blob_storage
|
| 7 |
from src.db.postgres.models import Document as DBDocument
|
| 8 |
-
from src.config.settings import settings
|
| 9 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 10 |
from src.middlewares.logging import get_logger
|
| 11 |
from typing import List
|
| 12 |
import sys
|
| 13 |
-
import pypdf
|
| 14 |
import docx
|
| 15 |
import pandas as pd
|
| 16 |
import pytesseract
|
|
|
|
| 5 |
from src.db.postgres.vector_store import get_vector_store
|
| 6 |
from src.storage.az_blob.az_blob import blob_storage
|
| 7 |
from src.db.postgres.models import Document as DBDocument
|
|
|
|
| 8 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 9 |
from src.middlewares.logging import get_logger
|
| 10 |
from typing import List
|
| 11 |
import sys
|
|
|
|
| 12 |
import docx
|
| 13 |
import pandas as pd
|
| 14 |
import pytesseract
|