| from typing import Dict, Any, List |
| import os |
| import base64 |
# Point the Hugging Face cache directory (HF_HOME) at the process's current
# working directory and set PAGINATE_OUTPUT to the string 'True'.
# NOTE(review): presumably these must be set before the marker imports below
# so those modules see them at import time -- keep this ordering; confirm.
current_dir = os.getcwd()
os.environ['HF_HOME'] = current_dir
os.environ['PAGINATE_OUTPUT'] = 'True'
| from marker.convert import convert_single_pdf |
| from marker.logger import configure_logging |
| from marker.models import load_all_models |
| from marker.output import save_markdown |
| from io import BytesIO |
class EndpointHandler:
    """Request handler that converts a base64-encoded PDF into extracted text.

    The marker models are loaded once in ``__init__`` and reused for every
    call. Each request's PDF is written to ``self.file_location`` and then
    passed to ``convert_single_pdf``.
    """

    def __init__(self, path=""):
        """Load the marker models and make sure the upload directory exists.

        Args:
            path: Unused by this handler.
        """
        self.models = load_all_models()
        self.file_location = "input/temp.pdf"
        # Derive the directory from file_location so the two cannot drift apart.
        os.makedirs(os.path.dirname(self.file_location), exist_ok=True)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Convert the PDF carried in the request payload and return its text.

        data args:
            inputs (:obj: dict): A dictionary containing the inputs.
                max_pages (:obj: int): The maximum number of pages to process.
                    Optional; ``None`` is forwarded when absent.
                file (:obj: str): The base64-encoded PDF file content. Required.
        Return:
            A single-element list of the form ``[{"extracted_text": ...}]``.
        Raises:
            ValueError: If ``inputs`` is not a dictionary or ``file`` is
                missing or empty. Malformed base64 surfaces as
                ``binascii.Error`` (a ``ValueError`` subclass) from
                ``base64.b64decode``.
        """
        # Treat a missing or null "inputs" the same way: as an empty mapping.
        inputs = data.get("inputs") or {}
        if not isinstance(inputs, dict):
            raise ValueError(
                "'inputs' must be a dictionary containing a base64-encoded 'file'."
            )

        file_content = inputs.get("file")
        max_pages = inputs.get("max_pages", None)

        # Fail early with a clear message instead of letting b64decode raise
        # an opaque TypeError on None (or writing an empty PDF for "").
        if not file_content:
            raise ValueError(
                "Missing 'file' in inputs: expected the base64-encoded PDF content."
            )

        file_bytes = base64.b64decode(file_content)
        self.upload_file(BytesIO(file_bytes))

        pdf_path = self.file_location
        # The language list passed to the converter is fixed to ["vi"].
        extracted_text, _, _ = convert_single_pdf(
            pdf_path, self.models, max_pages=max_pages, langs=["vi"]
        )

        return [{"extracted_text": extracted_text}]

    def upload_file(self, file: BytesIO, max_pages: int = None):
        """Write the contents of ``file`` to ``self.file_location``.

        Args:
            file: Binary stream; it is read from its current position to the
                end and the bytes overwrite any existing file at the target.
            max_pages: Unused; kept so existing callers keep working.

        Returns:
            ``True`` once the bytes have been written.
        """
        with open(self.file_location, "wb") as f:
            f.write(file.read())
        return True