sofhiaazzhr committed on
Commit
a00e2ad
·
1 Parent(s): 4353929

[NOTICKET]: adjusted pyproject.toml for OCR PDF

Browse files
.gitignore CHANGED
@@ -35,4 +35,5 @@ playground_chat.py
35
  playground_flush_cache.py
36
  playground_create_user.py
37
  API_CONTRACT.md
38
- context_engineering/
 
 
35
  playground_flush_cache.py
36
  playground_create_user.py
37
  API_CONTRACT.md
38
+ context_engineering/
39
+ sample_file/
pyproject.toml CHANGED
@@ -84,6 +84,10 @@ dependencies = [
84
  "pymssql>=2.3.0",
85
  "sqlalchemy-bigquery>=1.11.0",
86
  "snowflake-sqlalchemy>=1.7.0",
 
 
 
 
87
  ]
88
 
89
  [project.optional-dependencies]
 
84
  "pymssql>=2.3.0",
85
  "sqlalchemy-bigquery>=1.11.0",
86
  "snowflake-sqlalchemy>=1.7.0",
87
+ # --- OCR (pdf processing) ---
88
+ "pdf2image>=1.17.0",
89
+ "pytesseract>=0.3.13",
90
+ "pypdf2>=3.0.1",
91
  ]
92
 
93
  [project.optional-dependencies]
src/knowledge/processing_service.py CHANGED
@@ -5,12 +5,10 @@ from langchain_core.documents import Document as LangChainDocument
5
  from src.db.postgres.vector_store import get_vector_store
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.db.postgres.models import Document as DBDocument
8
- from src.config.settings import settings
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from src.middlewares.logging import get_logger
11
  from typing import List
12
  import sys
13
- import pypdf
14
  import docx
15
  import pandas as pd
16
  import pytesseract
 
5
  from src.db.postgres.vector_store import get_vector_store
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.db.postgres.models import Document as DBDocument
 
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
  from src.middlewares.logging import get_logger
10
  from typing import List
11
  import sys
 
12
  import docx
13
  import pandas as pd
14
  import pytesseract