sofhiaazzhr commited on
Commit
6b9a13d
·
1 Parent(s): 347a73a

[NOTICKET]: use Tesseract OCR to extract PDF text

Browse files
Files changed (1) hide show
  1. src/knowledge/processing_service.py +31 -63
src/knowledge/processing_service.py CHANGED
@@ -8,12 +8,13 @@ from src.db.postgres.models import Document as DBDocument
8
  from src.config.settings import settings
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from src.middlewares.logging import get_logger
11
- from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
12
- from azure.core.credentials import AzureKeyCredential
13
  from typing import List
 
14
  import pypdf
15
  import docx
16
  import pandas as pd
 
 
17
  from io import BytesIO
18
 
19
  logger = get_logger("knowledge_processing")
@@ -83,69 +84,36 @@ class KnowledgeProcessingService:
83
  async def _build_pdf_documents(
84
  self, content: bytes, db_doc: DBDocument
85
  ) -> List[LangChainDocument]:
86
- """Build LangChain documents from PDF with page_label metadata.
87
-
88
- Uses Azure Document Intelligence (per-page) when credentials are present,
89
- falls back to pypdf (also per-page) otherwise.
90
- """
91
  documents: List[LangChainDocument] = []
92
 
93
- if settings.azureai_docintel_endpoint and settings.azureai_docintel_key:
94
- async with DocumentIntelligenceClient(
95
- endpoint=settings.azureai_docintel_endpoint,
96
- credential=AzureKeyCredential(settings.azureai_docintel_key),
97
- ) as client:
98
- poller = await client.begin_analyze_document(
99
- model_id="prebuilt-read",
100
- body=BytesIO(content),
101
- content_type="application/pdf",
102
- )
103
- result = await poller.result()
104
- logger.info(f"Azure DI extracted {len(result.pages or [])} pages")
105
-
106
- for page in result.pages or []:
107
- page_text = "\n".join(
108
- line.content for line in (page.lines or [])
109
- )
110
- if not page_text.strip():
111
- continue
112
- for chunk in self.text_splitter.split_text(page_text):
113
- documents.append(LangChainDocument(
114
- page_content=chunk,
115
- metadata={
116
- "user_id": db_doc.user_id,
117
- "source_type": "document",
118
- "data": {
119
- "document_id": db_doc.id,
120
- "filename": db_doc.filename,
121
- "file_type": db_doc.file_type,
122
- "chunk_index": len(documents),
123
- "page_label": page.page_number,
124
- },
125
- }
126
- ))
127
- else:
128
- logger.warning("Azure DI not configured, using pypdf")
129
- pdf_reader = pypdf.PdfReader(BytesIO(content))
130
- for page_num, page in enumerate(pdf_reader.pages, start=1):
131
- page_text = page.extract_text() or ""
132
- if not page_text.strip():
133
- continue
134
- for chunk in self.text_splitter.split_text(page_text):
135
- documents.append(LangChainDocument(
136
- page_content=chunk,
137
- metadata={
138
- "user_id": db_doc.user_id,
139
- "source_type": "document",
140
- "data": {
141
- "document_id": db_doc.id,
142
- "filename": db_doc.filename,
143
- "file_type": db_doc.file_type,
144
- "chunk_index": len(documents),
145
- "page_label": page_num,
146
- },
147
- }
148
- ))
149
 
150
  return documents
151
 
 
8
  from src.config.settings import settings
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from src.middlewares.logging import get_logger
 
 
11
  from typing import List
12
+ import sys
13
  import pypdf
14
  import docx
15
  import pandas as pd
16
+ import pytesseract
17
+ from pdf2image import convert_from_bytes
18
  from io import BytesIO
19
 
20
  logger = get_logger("knowledge_processing")
 
84
  async def _build_pdf_documents(
85
  self, content: bytes, db_doc: DBDocument
86
  ) -> List[LangChainDocument]:
87
+ """Build LangChain documents from PDF with page_label metadata using Tesseract OCR."""
 
 
 
 
88
  documents: List[LangChainDocument] = []
89
 
90
+ poppler_path = None
91
+ if sys.platform == "win32":
92
+ pytesseract.pytesseract.tesseract_cmd = r"./software/Tesseract-OCR/tesseract.exe"
93
+ poppler_path = "./software/poppler-24.08.0/Library/bin"
94
+
95
+ images = convert_from_bytes(content, poppler_path=poppler_path)
96
+ logger.info(f"Tesseract OCR: converting {len(images)} pages")
97
+
98
+ for page_num, image in enumerate(images, start=1):
99
+ page_text = pytesseract.image_to_string(image)
100
+ if not page_text.strip():
101
+ continue
102
+ for chunk in self.text_splitter.split_text(page_text):
103
+ documents.append(LangChainDocument(
104
+ page_content=chunk,
105
+ metadata={
106
+ "user_id": db_doc.user_id,
107
+ "source_type": "document",
108
+ "data": {
109
+ "document_id": db_doc.id,
110
+ "filename": db_doc.filename,
111
+ "file_type": db_doc.file_type,
112
+ "chunk_index": len(documents),
113
+ "page_label": page_num,
114
+ },
115
+ }
116
+ ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  return documents
119