| from starlette.applications import Starlette |
| from starlette.responses import JSONResponse, PlainTextResponse |
| from starlette.routing import Route |
| from starlette.requests import Request |
| from starlette.middleware import Middleware |
| from starlette.middleware.cors import CORSMiddleware |
| import tempfile |
| import shutil |
| import os |
|
|
| import pymupdf4llm |
| from unstructured.partition.auto import partition |
| from unstructured.cleaners.core import clean |
| from chonkie import RecursiveChunker, RecursiveRules |
|
|
# Recipe specification laid out like a Chonkie hub recipe document: descriptive
# metadata at the top level, the sentence-level settings and the recursive
# chunking levels under "recipe".
_RECIPE_SPEC = {
    "name": "default",
    "schema": "v1",
    "description": "Default recipe for plaintext documents in Korean",
    "language": "kr",
    "metadata": {
        "version": "0.1.0",
        "author": "Chonkie Team",
    },
    "recipe": {
        # ASCII sentence terminators plus their full-width CJK counterparts.
        "delimiters": [".", "。", "!", "！", "?", "？", "\n"],
        "include_delim": "prev",
        "recursive_rules": {
            "levels": [
                # Level 1: paragraph breaks.
                # NOTE(review): "\n\r" is unusual; the Windows line ending is
                # "\r\n" -- confirm which one is intended.
                {
                    "delimiters": ["\n\n", "\n\r"],
                    "whitespace": False,
                    "include_delim": "next",
                },
                # Level 2: single line breaks.
                {
                    "delimiters": ["\n", "\r"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Level 3: sentence terminators (ASCII and full-width).
                {
                    "delimiters": [".", "。", "!", "！", "?", "？"],
                    "whitespace": False,
                    "include_delim": "prev",
                },
                # Level 4: split on whitespace.
                {
                    "delimiters": None,
                    "whitespace": True,
                    "include_delim": "prev",
                },
                # Level 5: no delimiters and no whitespace splitting; the
                # last-resort level.
                {
                    "delimiters": None,
                    "whitespace": False,
                    "include_delim": "prev",
                },
            ],
        },
    },
}

# RecursiveRules.from_dict reads the "levels" key from the mapping it is given,
# so only the "recursive_rules" sub-mapping is passed (the full recipe document
# has no top-level "levels"). A shallow copy is passed so that _RECIPE_SPEC is
# left intact if from_dict consumes keys from its argument.
recipe = RecursiveRules.from_dict(dict(_RECIPE_SPEC["recipe"]["recursive_rules"]))

# Shared chunker instance used by the upload handler.
chunker = RecursiveChunker(rules=recipe)
|
|
| |
def _extract_text(path: str, ext: str) -> str:
    """Return the text of the document stored at *path*.

    PDFs (``ext == ".pdf"``) are converted to Markdown with
    ``pymupdf4llm.to_markdown``. Any other extension is parsed with
    ``unstructured``'s ``partition``; the elements are joined with newlines
    and passed through ``clean`` with ``dashes`` and ``trailing_punctuation``
    enabled.
    """
    if ext == ".pdf":
        return pymupdf4llm.to_markdown(path)

    elements = partition(path)
    return clean(
        "\n".join(str(el) for el in elements),
        dashes=True,
        trailing_punctuation=True,
    )


async def handle_file_upload(request: Request) -> JSONResponse:
    """Chunk an uploaded document and return the chunk texts as JSON.

    Reads the ``file`` field of the submitted form, spools it to a temporary
    file (the parsers take a filesystem path), extracts its text, and runs
    the module-level ``chunker`` over it.

    Returns:
        * 200 with a JSON list of chunk strings on success.
        * 400 with ``{"error": ...}`` when no file with a filename was sent.
        * 500 with ``{"error": ...}`` when spooling, parsing, or chunking
          raises.
    """
    form = await request.form()
    upload = form.get("file")

    # A text form field arrives as a plain ``str`` with no ``filename``
    # attribute; treat that the same as a missing upload instead of crashing.
    filename = getattr(upload, "filename", None)
    if not filename:
        return JSONResponse({"error": "파일을 업로드해주세요."}, status_code=400)

    # The extension selects the parser and is kept as the temp-file suffix.
    ext = os.path.splitext(filename)[1].lower()

    tmp_path = None
    try:
        # delete=False: the file must outlive the ``with`` block so the
        # parsers can reopen it by path; it is removed in ``finally``.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
            shutil.copyfileobj(upload.file, tmp)

        text = _extract_text(tmp_path, ext)
        return JSONResponse([chunk.text for chunk in chunker(text)])
    except Exception as e:
        # Top-level boundary: report any failure to the client as a 500.
        # NOTE(review): the exception text is sent to the client verbatim --
        # confirm this is acceptable outside development.
        return JSONResponse({"error": f"문서 처리 실패: {str(e)}"}, status_code=500)
    finally:
        # tmp_path stays None only if the temp file could not be created.
        if tmp_path is not None:
            os.unlink(tmp_path)
|
|
| |
# Routing table: one POST endpoint that accepts a document upload.
routes = [Route("/upload", handle_file_upload, methods=["POST"])]

# CORS middleware configured with wildcard origins, methods, and headers.
# NOTE(review): fully open CORS -- confirm this is intended for deployment.
middleware = [
    Middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_methods=["*"],
        allow_headers=["*"],
    ),
]

# ASGI application object.
# NOTE(review): debug=True is hard-coded -- confirm it should stay enabled
# outside development.
app = Starlette(routes=routes, middleware=middleware, debug=True)