sofhiaazzhr commited on
Commit
6b9a13d
·
1 Parent(s): 347a73a

[NOTICKET]: use Tesseract OCR to extract PDF text

Browse files
Files changed (1) hide show
  1. src/knowledge/processing_service.py +31 -63
src/knowledge/processing_service.py CHANGED
@@ -8,12 +8,13 @@ from src.db.postgres.models import Document as DBDocument
8
  from src.config.settings import settings
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from src.middlewares.logging import get_logger
11
- from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
12
- from azure.core.credentials import AzureKeyCredential
13
  from typing import List
 
14
  import pypdf
15
  import docx
16
  import pandas as pd
 
 
17
  from io import BytesIO
18
 
19
  logger = get_logger("knowledge_processing")
@@ -83,69 +84,36 @@ class KnowledgeProcessingService:
83
  async def _build_pdf_documents(
84
  self, content: bytes, db_doc: DBDocument
85
  ) -> List[LangChainDocument]:
86
- """Build LangChain documents from PDF with page_label metadata.
87
-
88
- Uses Azure Document Intelligence (per-page) when credentials are present,
89
- falls back to pypdf (also per-page) otherwise.
90
- """
91
  documents: List[LangChainDocument] = []
92
 
93
- if settings.azureai_docintel_endpoint and settings.azureai_docintel_key:
94
- async with DocumentIntelligenceClient(
95
- endpoint=settings.azureai_docintel_endpoint,
96
- credential=AzureKeyCredential(settings.azureai_docintel_key),
97
- ) as client:
98
- poller = await client.begin_analyze_document(
99
- model_id="prebuilt-read",
100
- body=BytesIO(content),
101
- content_type="application/pdf",
102
- )
103
- result = await poller.result()
104
- logger.info(f"Azure DI extracted {len(result.pages or [])} pages")
105
-
106
- for page in result.pages or []:
107
- page_text = "\n".join(
108
- line.content for line in (page.lines or [])
109
- )
110
- if not page_text.strip():
111
- continue
112
- for chunk in self.text_splitter.split_text(page_text):
113
- documents.append(LangChainDocument(
114
- page_content=chunk,
115
- metadata={
116
- "user_id": db_doc.user_id,
117
- "source_type": "document",
118
- "data": {
119
- "document_id": db_doc.id,
120
- "filename": db_doc.filename,
121
- "file_type": db_doc.file_type,
122
- "chunk_index": len(documents),
123
- "page_label": page.page_number,
124
- },
125
- }
126
- ))
127
- else:
128
- logger.warning("Azure DI not configured, using pypdf")
129
- pdf_reader = pypdf.PdfReader(BytesIO(content))
130
- for page_num, page in enumerate(pdf_reader.pages, start=1):
131
- page_text = page.extract_text() or ""
132
- if not page_text.strip():
133
- continue
134
- for chunk in self.text_splitter.split_text(page_text):
135
- documents.append(LangChainDocument(
136
- page_content=chunk,
137
- metadata={
138
- "user_id": db_doc.user_id,
139
- "source_type": "document",
140
- "data": {
141
- "document_id": db_doc.id,
142
- "filename": db_doc.filename,
143
- "file_type": db_doc.file_type,
144
- "chunk_index": len(documents),
145
- "page_label": page_num,
146
- },
147
- }
148
- ))
149
 
150
  return documents
151
 
 
8
  from src.config.settings import settings
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from src.middlewares.logging import get_logger
 
 
11
  from typing import List
12
+ import sys
13
  import pypdf
14
  import docx
15
  import pandas as pd
16
+ import pytesseract
17
+ from pdf2image import convert_from_bytes
18
  from io import BytesIO
19
 
20
  logger = get_logger("knowledge_processing")
 
84
  async def _build_pdf_documents(
85
  self, content: bytes, db_doc: DBDocument
86
  ) -> List[LangChainDocument]:
87
+ """Build LangChain documents from PDF with page_label metadata using Tesseract OCR."""
 
 
 
 
88
  documents: List[LangChainDocument] = []
89
 
90
+ poppler_path = None
91
+ if sys.platform == "win32":
92
+ pytesseract.pytesseract.tesseract_cmd = r"./software/Tesseract-OCR/tesseract.exe"
93
+ poppler_path = "./software/poppler-24.08.0/Library/bin"
94
+
95
+ images = convert_from_bytes(content, poppler_path=poppler_path)
96
+ logger.info(f"Tesseract OCR: converting {len(images)} pages")
97
+
98
+ for page_num, image in enumerate(images, start=1):
99
+ page_text = pytesseract.image_to_string(image)
100
+ if not page_text.strip():
101
+ continue
102
+ for chunk in self.text_splitter.split_text(page_text):
103
+ documents.append(LangChainDocument(
104
+ page_content=chunk,
105
+ metadata={
106
+ "user_id": db_doc.user_id,
107
+ "source_type": "document",
108
+ "data": {
109
+ "document_id": db_doc.id,
110
+ "filename": db_doc.filename,
111
+ "file_type": db_doc.file_type,
112
+ "chunk_index": len(documents),
113
+ "page_label": page_num,
114
+ },
115
+ }
116
+ ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  return documents
119