Spaces:
Sleeping
Sleeping
| """ | |
| FastAPI — Universal Order Confirmation PDF Extractor | |
| Uses OpenAI GPT-4o to extract data from ANY supplier PDF format. | |
| Run: | |
| pip install -r requirements.txt | |
| export OPENAI_API_KEY="sk-..." | |
| uvicorn main:app --reload --port 8000 | |
| """ | |
| import io | |
| import json | |
| import os | |
| import re | |
| from typing import Any, List, Optional | |
| from dotenv import load_dotenv | |
| import pdfplumber | |
| from openai import OpenAI, AuthenticationError as OpenAIAuthError | |
| from fastapi import FastAPI, File, HTTPException, UploadFile | |
| from pydantic import BaseModel, Field | |
| load_dotenv() | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Config | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# Read once at import time. An empty string means "not configured"; the
# upload handler checks for that and answers with an HTTP 500.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Pydantic models | |
| # ───────────────────────────────────────────────────────────────────────────── | |
class VoucherLine(BaseModel):
    """A single article position of a voucher.

    The ``type`` attribute carries the alias ``$type``. Because
    ``populate_by_name`` is enabled, the value may be supplied under either
    ``type`` or ``$type`` when constructing the model.
    """

    # Allow construction by attribute name as well as by alias.
    model_config = {"populate_by_name": True}

    type: str = Field(
        "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
        alias="$type",
    )
    # Supplier article number.
    Number: str
    Quantity: float
    # Purchase unit price (from PDF).
    Price: float
    Description: str = ""
    VatCode: str = "01"
    DeliveryDate: Optional[str] = None
class VoucherResponse(BaseModel):
    """Header data of one extracted voucher together with its article lines.

    ``DeliveryDate``, ``CustomerNumber`` and ``VoucherDate`` accept ``None``
    but have no default, so they must always be passed explicitly.
    """

    Supplier: str
    # Buyer-side order number, e.g. 2600385.
    OurOrderNumber: str
    DeliveryDate: Optional[str]
    CustomerNumber: Optional[str]
    VoucherDate: Optional[str]
    Currency: str
    AdditionalFields: List[Any] = []
    VoucherLines: List[VoucherLine]
| # ───────────────────────────────────────────────────────────────────────────── | |
| # PDF text extraction | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def extract_text(pdf_bytes: bytes) -> str:
    """Return the text of every page in the PDF, joined by newlines.

    Pages for which pdfplumber yields no text (``None`` or an empty string)
    are left out, so the result is an empty string when nothing could be
    extracted.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        page_texts = (page.extract_text() for page in document.pages)
        # Consume the generator while the document is still open.
        return "\n".join(text for text in page_texts if text)
| # ───────────────────────────────────────────────────────────────────────────── | |
| # LLM prompts | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# System message sent with every extraction request (used as the "system"
# role content in llm_extract). It fixes the extraction rules: which labels
# identify each field, the DD.MM.YYYY date format, calendar-week conversion,
# which lines count as article lines, and the JSON-only response requirement.
SYSTEM_PROMPT = """You are a precise data extraction assistant for a Swiss hose service company (Schlauchservice Baumann GmbH).
You receive raw text extracted from supplier order confirmation PDFs (in German) and must extract structured order data.
IMPORTANT RULES:
- "OurOrderNumber" is the BUYER's order number — look for patterns like 2600xxx, BEST. 2600xxx, "Ihre Bestellung", "I/Bestellung", "Ihre Bestellnr.", "Ihr Auftrag", "Bestellreferenz"
- "CustomerNumber" is the supplier's customer number for Schlauchservice Baumann — look for "Kunden-Nr.", "Debitorennr.", "Kundennr.", "Kunden NR.", "Ihre Kunden-Nr.", "Kundennummer"
- All dates in the output must use the format DD.MM.YYYY (e.g. 18.03.2026). Convert ALL date formats to this.
- "DeliveryDate": the confirmed delivery/dispatch date — look for "Lieferung/Termin", "Auslieferdatum", "Versandtermin", "Lieferung", "Termin best.", "Warenausgangsdatum", "Versand-Datum", "Liefertermin"
* If the delivery date is given as a calendar week like "KW 11" or "KW11", convert it to the WEDNESDAY of that ISO week in the document year. Example: "KW 11" in year 2026 → Wednesday of week 11, 2026 = 11.03.2026. Derive the year from the document/voucher date.
* If no delivery date is mentioned at all, set DeliveryDate to null.
- "VoucherDate": the document/order confirmation date — look for "Datum", "Belegdatum", date next to "Auftragsbestätigung". Format as DD.MM.YYYY.
- For VoucherLines: extract ONLY real product/article lines. Skip shipping costs, surcharge lines, freight lines, and packaging lines UNLESS they have a real article number.
- "Number": the supplier's article/item number. If there are two numbers (our nr / their nr), use the SUPPLIER'S number (first one listed)
- "Price": unit purchase price. Use the net/discounted price if available. If only gross price with a discount %, calculate: price × (1 - discount/100)
- "Quantity": number of units ordered
- "Description": product description text
- "VatCode": always "01"
- "Currency": CHF or EUR
- If a delivery date is per-line, use the earliest confirmed date as the overall DeliveryDate too
- For delivery notes (Lieferschein): treat as order confirmation, extract what's available. Price may be 0 if not shown.
You must respond ONLY with a valid JSON object — no markdown, no explanation, no extra text."""
# User-message template for the extraction request. It is rendered with
# ``str.format(pdf_text=...)``, so the literal JSON braces are doubled and
# ``{pdf_text}`` is the only placeholder.
#
# Every date placeholder asks for DD.MM.YYYY. The header-level DeliveryDate
# and VoucherDate previously asked for YYYY-MM-DD, which contradicted both the
# system prompt and the line-level DeliveryDate in this same template.
EXTRACTION_PROMPT = """Extract the order data from this supplier PDF text and return JSON matching this exact structure:
{{
"Supplier": "supplier company name",
"OurOrderNumber": "2600xxx",
"DeliveryDate": "DD.MM.YYYY or null",
"CustomerNumber": "customer number or null",
"VoucherDate": "DD.MM.YYYY or null",
"Currency": "CHF or EUR",
"AdditionalFields": [],
"VoucherLines": [
{{
"$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
"Number": "article number",
"Quantity": 1.0,
"Price": 0.0,
"Description": "product description",
"VatCode": "01",
"DeliveryDate": "DD.MM.YYYY or null"
}}
]
}}
PDF TEXT:
---
{pdf_text}
---
Return ONLY the JSON object."""
| # ───────────────────────────────────────────────────────────────────────────── | |
| # LLM extraction via OpenAI API | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def llm_extract(pdf_text: str, api_key: str) -> dict:
    """Call OpenAI GPT-4o to extract structured order data from PDF text.

    Args:
        pdf_text: Raw text previously extracted from the supplier PDF.
        api_key: OpenAI API key used for this request.

    Returns:
        The JSON object produced by the model, decoded into a ``dict``.

    Raises:
        ValueError: If the model returns no text, if its output was cut off
            by the token limit, or if the decoded JSON is not an object.
        json.JSONDecodeError: If the returned text is not valid JSON
            (a ``ValueError`` subclass).
    """
    client = OpenAI(api_key=api_key)
    completion = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=4096,
        temperature=0,
        response_format={"type": "json_object"},  # enforces valid JSON output
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": EXTRACTION_PROMPT.format(pdf_text=pdf_text)},
        ],
    )
    choice = completion.choices[0]

    # A response that hit max_tokens is truncated JSON; report that directly
    # instead of letting it surface as an opaque decode error.
    if choice.finish_reason == "length":
        raise ValueError("LLM response was truncated before the JSON was complete.")

    # ``content`` is None when the model produced no text (e.g. a refusal).
    content = choice.message.content
    if not content:
        raise ValueError("LLM returned an empty response.")

    # Strip any accidental markdown fences (safety net).
    raw = content.strip()
    raw = re.sub(r"^```(?:json)?\s*", "", raw)
    raw = re.sub(r"\s*```$", "", raw)

    data = json.loads(raw)
    if not isinstance(data, dict):
        raise ValueError("LLM returned JSON that is not an object.")
    return data
# Default ``$type`` discriminator for article lines.
_ARTICLE_LINE_TYPE = (
    "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, "
    "E3k.Web.Objects.DataTransfer"
)


def _as_text(value: Any, default: Optional[str] = None) -> Optional[str]:
    """Return *value* as ``str``; JSON null (``None``) maps to *default*."""
    if value is None:
        return default
    return value if isinstance(value, str) else str(value)


def _as_number(value: Any) -> float:
    """Return *value* as ``float``; JSON null or an empty string counts as 0."""
    if value is None or value == "":
        return 0.0
    return float(value)


def parse_pdf(pdf_bytes: bytes, api_key: str) -> VoucherResponse:
    """Extract the text of a PDF and turn it into a ``VoucherResponse``.

    The text is extracted with ``extract_text`` and sent to the LLM through
    ``llm_extract``. Values in the LLM's answer are normalized before the
    models are built: a JSON ``null`` is treated like a missing key (so it
    never becomes the string ``"None"`` or crashes ``float()``), and
    non-string scalars such as a numeric customer number are converted to
    strings.

    Args:
        pdf_bytes: Content of the uploaded PDF file.
        api_key: OpenAI API key passed on to ``llm_extract``.

    Returns:
        The populated ``VoucherResponse``.

    Raises:
        ValueError: If no text could be extracted, if a voucher line is not a
            JSON object, or if a quantity/price cannot be converted to a
            number. Errors raised by ``llm_extract`` propagate unchanged.
    """
    # 1. Extract text from PDF
    pdf_text = extract_text(pdf_bytes)
    if not pdf_text.strip():
        raise ValueError("Could not extract any text from this PDF (may be image-based).")

    # 2. Send to LLM for extraction
    raw_data = llm_extract(pdf_text, api_key)

    # 3. Build VoucherLines ("VoucherLines": null is treated as no lines)
    voucher_lines = []
    for line in raw_data.get("VoucherLines") or []:
        if not isinstance(line, dict):
            raise ValueError("LLM returned a voucher line that is not a JSON object.")
        voucher_lines.append(
            VoucherLine(**{
                "$type": _as_text(line.get("$type"), _ARTICLE_LINE_TYPE),
                "Number": _as_text(line.get("Number"), ""),
                "Quantity": _as_number(line.get("Quantity")),
                "Price": _as_number(line.get("Price")),
                "Description": _as_text(line.get("Description"), ""),
                "VatCode": _as_text(line.get("VatCode"), "01"),
                "DeliveryDate": _as_text(line.get("DeliveryDate")),
            })
        )

    # 4. Assemble the voucher header
    return VoucherResponse(
        Supplier=_as_text(raw_data.get("Supplier"), "Unknown"),
        OurOrderNumber=_as_text(raw_data.get("OurOrderNumber"), ""),
        DeliveryDate=_as_text(raw_data.get("DeliveryDate")),
        CustomerNumber=_as_text(raw_data.get("CustomerNumber")),
        VoucherDate=_as_text(raw_data.get("VoucherDate")),
        Currency=_as_text(raw_data.get("Currency"), "CHF"),
        AdditionalFields=[],
        VoucherLines=voucher_lines,
    )
| # ───────────────────────────────────────────────────────────────────────────── | |
| # FastAPI | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# ASGI application object; the module docstring starts it with
# ``uvicorn main:app``.
app = FastAPI(
    title="Universal Order Confirmation PDF Extractor",
    description=" ".join(
        (
            "Upload any supplier order-confirmation PDF and receive ERP-ready JSON.",
            "Powered by GPT-4o — works with any supplier format.",
        )
    ),
    version="4.0.0",
)
@app.post("/extract", response_model=VoucherResponse)
async def extract_order(
    file: UploadFile = File(..., description="Supplier order-confirmation PDF")
):
    """
    Upload a PDF order confirmation from any supplier.
    GPT-4o extracts the order data and returns ERP-ready JSON.

    The OpenAI API key is taken from the OPENAI_API_KEY environment variable.

    Error responses:
    - 400: the upload is not a ``.pdf`` file or is empty.
    - 401: OpenAI rejected the configured API key.
    - 422: no text could be extracted, or the LLM output was unusable.
    - 500: the API key is not configured, or any other extraction failure.
    """
    # Imported locally so this handler does not depend on a change to the
    # module's import block.
    from fastapi.concurrency import run_in_threadpool

    api_key: str = OPENAI_API_KEY
    if not api_key:
        raise HTTPException(
            status_code=500,
            detail="OpenAI API key not configured. Set the OPENAI_API_KEY env var.",
        )

    # ``filename`` may be None when the client sends no file name.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")

    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    try:
        # parse_pdf is synchronous (PDF parsing plus a blocking HTTP call to
        # OpenAI); run it in a worker thread so the event loop stays free.
        result = await run_in_threadpool(parse_pdf, content, api_key)
    except json.JSONDecodeError as e:
        # Must precede ValueError: JSONDecodeError is a subclass of it.
        raise HTTPException(status_code=422, detail=f"LLM returned invalid JSON: {e}") from e
    except ValueError as e:
        raise HTTPException(status_code=422, detail=str(e)) from e
    except OpenAIAuthError as e:
        raise HTTPException(status_code=401, detail="Invalid OpenAI API key.") from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Extraction error: {e}") from e
    return result
@app.get("/health")
def health():
    """Liveness check: report that the service is up and which version runs."""
    # Read the version from the app so it cannot drift from the FastAPI metadata.
    return {"status": "ok", "version": app.version}