""" FastAPI — Universal Order Confirmation PDF Extractor Uses OpenAI GPT-4o to extract data from ANY supplier PDF format. Run: pip install -r requirements.txt export OPENAI_API_KEY="sk-..." uvicorn main:app --reload --port 8000 """ import io import json import os import re from typing import Any, List, Optional from dotenv import load_dotenv import pdfplumber from openai import OpenAI, AuthenticationError as OpenAIAuthError from fastapi import FastAPI, File, HTTPException, UploadFile from pydantic import BaseModel, Field load_dotenv() # ───────────────────────────────────────────────────────────────────────────── # Config # ───────────────────────────────────────────────────────────────────────────── OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "") # ───────────────────────────────────────────────────────────────────────────── # Pydantic models # ───────────────────────────────────────────────────────────────────────────── class VoucherLine(BaseModel): type: str = Field( default="E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer", alias="$type", ) Number: str # Supplier article number Quantity: float Price: float # Purchase unit price (from PDF) Description: str = "" VatCode: str = "01" DeliveryDate: Optional[str] = None model_config = {"populate_by_name": True} class VoucherResponse(BaseModel): Supplier: str OurOrderNumber: str # e.g. 2600385 DeliveryDate: Optional[str] CustomerNumber: Optional[str] VoucherDate: Optional[str] Currency: str AdditionalFields: List[Any] = [] VoucherLines: List[VoucherLine] # ───────────────────────────────────────────────────────────────────────────── # PDF text extraction # ───────────────────────────────────────────────────────────────────────────── def extract_text(pdf_bytes: bytes) -> str: parts = [] with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: for page in pdf.pages: t = page.extract_text() if t: parts.append(t) return "\n".join(parts) # ───────────────────────────────────────────────────────────────────────────── # LLM prompts # ───────────────────────────────────────────────────────────────────────────── SYSTEM_PROMPT = """You are a precise data extraction assistant for a Swiss hose service company (Schlauchservice Baumann GmbH). You receive raw text extracted from supplier order confirmation PDFs (in German) and must extract structured order data. IMPORTANT RULES: - "OurOrderNumber" is the BUYER's order number — look for patterns like 2600xxx, BEST. 2600xxx, "Ihre Bestellung", "I/Bestellung", "Ihre Bestellnr.", "Ihr Auftrag", "Bestellreferenz" - "CustomerNumber" is the supplier's customer number for Schlauchservice Baumann — look for "Kunden-Nr.", "Debitorennr.", "Kundennr.", "Kunden NR.", "Ihre Kunden-Nr.", "Kundennummer" - All dates in the output must use the format DD.MM.YYYY (e.g. 18.03.2026). Convert ALL date formats to this. - "DeliveryDate": the confirmed delivery/dispatch date — look for "Lieferung/Termin", "Auslieferdatum", "Versandtermin", "Lieferung", "Termin best.", "Warenausgangsdatum", "Versand-Datum", "Liefertermin" * If the delivery date is given as a calendar week like "KW 11" or "KW11", convert it to the WEDNESDAY of that ISO week in the document year. Example: "KW 11" in year 2026 → Wednesday of week 11, 2026 = 11.03.2026. Derive the year from the document/voucher date. * If no delivery date is mentioned at all, set DeliveryDate to null. - "VoucherDate": the document/order confirmation date — look for "Datum", "Belegdatum", date next to "Auftragsbestätigung". Format as DD.MM.YYYY. - For VoucherLines: extract ONLY real product/article lines. Skip shipping costs, surcharge lines, freight lines, and packaging lines UNLESS they have a real article number. - "Number": the supplier's article/item number. If there are two numbers (our nr / their nr), use the SUPPLIER'S number (first one listed) - "Price": unit purchase price. Use the net/discounted price if available. If only gross price with a discount %, calculate: price × (1 - discount/100) - "Quantity": number of units ordered - "Description": product description text - "VatCode": always "01" - "Currency": CHF or EUR - If a delivery date is per-line, use the earliest confirmed date as the overall DeliveryDate too - For delivery notes (Lieferschein): treat as order confirmation, extract what's available. Price may be 0 if not shown. You must respond ONLY with a valid JSON object — no markdown, no explanation, no extra text.""" EXTRACTION_PROMPT = """Extract the order data from this supplier PDF text and return JSON matching this exact structure: {{ "Supplier": "supplier company name", "OurOrderNumber": "2600xxx", "DeliveryDate": "YYYY-MM-DD or null", "CustomerNumber": "customer number or null", "VoucherDate": "YYYY-MM-DD or null", "Currency": "CHF or EUR", "AdditionalFields": [], "VoucherLines": [ {{ "$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer", "Number": "article number", "Quantity": 1.0, "Price": 0.0, "Description": "product description", "VatCode": "01", "DeliveryDate": "DD.MM.YYYY or null" }} ] }} PDF TEXT: --- {pdf_text} --- Return ONLY the JSON object.""" # ───────────────────────────────────────────────────────────────────────────── # LLM extraction via OpenAI API # ───────────────────────────────────────────────────────────────────────────── def llm_extract(pdf_text: str, api_key: str) -> dict: """Call OpenAI GPT-4o to extract structured order data from PDF text.""" client = OpenAI(api_key=api_key) response = client.chat.completions.create( model="gpt-4o", max_tokens=4096, temperature=0, response_format={"type": "json_object"}, # enforces valid JSON output messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": EXTRACTION_PROMPT.format(pdf_text=pdf_text)}, ], ) raw = response.choices[0].message.content.strip() # Strip any accidental markdown fences (safety net) raw = re.sub(r"^```(?:json)?\s*", "", raw) raw = re.sub(r"\s*```$", "", raw) return json.loads(raw) def parse_pdf(pdf_bytes: bytes, api_key: str) -> VoucherResponse: # 1. Extract text from PDF pdf_text = extract_text(pdf_bytes) if not pdf_text.strip(): raise ValueError("Could not extract any text from this PDF (may be image-based).") # 2. Send to LLM for extraction raw_data = llm_extract(pdf_text, api_key) # 3. Build VoucherLines voucher_lines = [] for line in raw_data.get("VoucherLines", []): vl = VoucherLine(**{ "$type": line.get( "$type", "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer", ), "Number": str(line.get("Number", "")), "Quantity": float(line.get("Quantity", 0)), "Price": float(line.get("Price", 0)), "Description": line.get("Description", ""), "VatCode": line.get("VatCode", "01"), "DeliveryDate": line.get("DeliveryDate"), }) voucher_lines.append(vl) response = VoucherResponse( Supplier=raw_data.get("Supplier", "Unknown"), OurOrderNumber=str(raw_data.get("OurOrderNumber", "")), DeliveryDate=raw_data.get("DeliveryDate"), CustomerNumber=raw_data.get("CustomerNumber"), VoucherDate=raw_data.get("VoucherDate"), Currency=raw_data.get("Currency", "CHF"), AdditionalFields=[], VoucherLines=voucher_lines, ) return response # ───────────────────────────────────────────────────────────────────────────── # FastAPI # ───────────────────────────────────────────────────────────────────────────── app = FastAPI( title="Universal Order Confirmation PDF Extractor", description=( "Upload any supplier order-confirmation PDF and receive ERP-ready JSON. " "Powered by GPT-4o — works with any supplier format." ), version="4.0.0", ) @app.post( "/extract", response_model=VoucherResponse, summary="Extract order data from any supplier PDF", ) async def extract_order( file: UploadFile = File(..., description="Supplier order-confirmation PDF") ): """ Upload a PDF order confirmation from any supplier. GPT-4o extracts the order data and returns ERP-ready JSON. The `api_key` parameter defaults to the OPENAI_API_KEY environment variable. """ api_key: str = OPENAI_API_KEY if not api_key: raise HTTPException( status_code=500, detail="OpenAI API key not configured. Set OPENAI_API_KEY env var or pass api_key param.", ) if not file.filename.lower().endswith(".pdf"): raise HTTPException(status_code=400, detail="Only PDF files are accepted.") content = await file.read() try: result = parse_pdf(content, api_key) except json.JSONDecodeError as e: raise HTTPException(status_code=422, detail=f"LLM returned invalid JSON: {e}") except ValueError as e: raise HTTPException(status_code=422, detail=str(e)) except OpenAIAuthError: raise HTTPException(status_code=401, detail="Invalid OpenAI API key.") except Exception as e: raise HTTPException(status_code=500, detail=f"Extraction error: {e}") return result @app.get("/health", summary="Health check") def health(): return {"status": "ok", "version": "4.0.0"}