| """Loader that loads image files.""" |
| from typing import List |
|
|
| from langchain.document_loaders.unstructured import UnstructuredFileLoader |
| from paddleocr import PaddleOCR |
| import os |
| import nltk |
| from configs.model_config import NLTK_DATA_PATH |
|
|
| nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path |
|
|
| class UnstructuredPaddleImageLoader(UnstructuredFileLoader): |
| """Loader that uses unstructured to load image files, such as PNGs and JPGs.""" |
|
|
| def _get_elements(self) -> List: |
| def image_ocr_txt(filepath, dir_path="tmp_files"): |
| full_dir_path = os.path.join(os.path.dirname(filepath), dir_path) |
| if not os.path.exists(full_dir_path): |
| os.makedirs(full_dir_path) |
| filename = os.path.split(filepath)[-1] |
| ocr = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=False, show_log=False) |
| result = ocr.ocr(img=filepath) |
|
|
| ocr_result = [i[1][0] for line in result for i in line] |
| txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename)) |
| with open(txt_file_path, 'w', encoding='utf-8') as fout: |
| fout.write("\n".join(ocr_result)) |
| return txt_file_path |
|
|
| txt_file_path = image_ocr_txt(self.file_path) |
| from unstructured.partition.text import partition_text |
| return partition_text(filename=txt_file_path, **self.unstructured_kwargs) |
| |
| |
| if __name__ == "__main__": |
| import sys |
| sys.path.append(os.path.dirname(os.path.dirname(__file__))) |
| filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), "knowledge_base", "samples", "content", "test.jpg") |
| loader = UnstructuredPaddleImageLoader(filepath, mode="elements") |
| docs = loader.load() |
| for doc in docs: |
| print(doc) |
|
|