E3K / main.py
AiDeveloper1's picture
Update main.py
080df18 verified
"""
FastAPI — Universal Order Confirmation PDF Extractor
Uses OpenAI GPT-4o to extract data from ANY supplier PDF format.
Run:
pip install -r requirements.txt
export OPENAI_API_KEY="sk-..."
uvicorn main:app --reload --port 8000
"""
import io
import json
import os
import re
from typing import Any, List, Optional
from dotenv import load_dotenv
import pdfplumber
from openai import OpenAI, AuthenticationError as OpenAIAuthError
from fastapi import FastAPI, File, HTTPException, UploadFile
from pydantic import BaseModel, Field
load_dotenv()
# ─────────────────────────────────────────────────────────────────────────────
# Config
# ─────────────────────────────────────────────────────────────────────────────
# Read once at import time; an unset variable yields "" so the endpoint can
# detect the missing key with a plain truthiness check.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
# ─────────────────────────────────────────────────────────────────────────────
# Pydantic models
# ─────────────────────────────────────────────────────────────────────────────
class VoucherLine(BaseModel):
    """One article line of a voucher.

    The ``type`` attribute is exposed under the alias ``$type``;
    ``populate_by_name`` allows it to be supplied by either the attribute
    name or the alias when constructing the model.
    """

    # Type-name string emitted as "$type"; defaults to the E3k
    # ArticleVoucherLine name.
    # NOTE(review): presumably consumed by the ERP as a type discriminator —
    # confirm against the receiving system.
    type: str = Field(
        default="E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
        alias="$type",
    )
    Number: str  # Supplier article number
    Quantity: float
    Price: float  # Purchase unit price (from PDF)
    Description: str = ""
    VatCode: str = "01"
    # Per-line delivery date as a plain string; not validated by this model.
    DeliveryDate: Optional[str] = None
    model_config = {"populate_by_name": True}
class VoucherResponse(BaseModel):
    """Response body of the extraction endpoint: header data plus article lines.

    The three ``Optional[str]`` fields have no default, so they must always be
    passed explicitly, but ``None`` is an accepted value.
    """

    Supplier: str
    OurOrderNumber: str  # e.g. 2600385
    # Dates are carried as plain strings; no format validation happens here.
    DeliveryDate: Optional[str]
    CustomerNumber: Optional[str]
    VoucherDate: Optional[str]
    Currency: str
    AdditionalFields: List[Any] = []
    VoucherLines: List[VoucherLine]
# ─────────────────────────────────────────────────────────────────────────────
# PDF text extraction
# ─────────────────────────────────────────────────────────────────────────────
def extract_text(pdf_bytes: bytes) -> str:
    """Return the text of all pages of a PDF, joined with newlines.

    Pages for which pdfplumber yields no text (``None`` or an empty string)
    are left out, so an image-only PDF produces an empty string.
    """
    stream = io.BytesIO(pdf_bytes)
    with pdfplumber.open(stream) as document:
        page_texts = [page.extract_text() for page in document.pages]
    # filter(None, ...) drops the falsy entries (pages without text).
    return "\n".join(filter(None, page_texts))
# ─────────────────────────────────────────────────────────────────────────────
# LLM prompts
# ─────────────────────────────────────────────────────────────────────────────
# System message sent with every extraction request. It defines the field
# semantics, the German labels to look for, and the output rules (dates as
# DD.MM.YYYY, calendar weeks mapped to the Wednesday of that ISO week,
# JSON-only answer). The text is part of the program's behavior: editing it
# changes what the model returns.
SYSTEM_PROMPT = """You are a precise data extraction assistant for a Swiss hose service company (Schlauchservice Baumann GmbH).
You receive raw text extracted from supplier order confirmation PDFs (in German) and must extract structured order data.
IMPORTANT RULES:
- "OurOrderNumber" is the BUYER's order number — look for patterns like 2600xxx, BEST. 2600xxx, "Ihre Bestellung", "I/Bestellung", "Ihre Bestellnr.", "Ihr Auftrag", "Bestellreferenz"
- "CustomerNumber" is the supplier's customer number for Schlauchservice Baumann — look for "Kunden-Nr.", "Debitorennr.", "Kundennr.", "Kunden NR.", "Ihre Kunden-Nr.", "Kundennummer"
- All dates in the output must use the format DD.MM.YYYY (e.g. 18.03.2026). Convert ALL date formats to this.
- "DeliveryDate": the confirmed delivery/dispatch date — look for "Lieferung/Termin", "Auslieferdatum", "Versandtermin", "Lieferung", "Termin best.", "Warenausgangsdatum", "Versand-Datum", "Liefertermin"
* If the delivery date is given as a calendar week like "KW 11" or "KW11", convert it to the WEDNESDAY of that ISO week in the document year. Example: "KW 11" in year 2026 → Wednesday of week 11, 2026 = 11.03.2026. Derive the year from the document/voucher date.
* If no delivery date is mentioned at all, set DeliveryDate to null.
- "VoucherDate": the document/order confirmation date — look for "Datum", "Belegdatum", date next to "Auftragsbestätigung". Format as DD.MM.YYYY.
- For VoucherLines: extract ONLY real product/article lines. Skip shipping costs, surcharge lines, freight lines, and packaging lines UNLESS they have a real article number.
- "Number": the supplier's article/item number. If there are two numbers (our nr / their nr), use the SUPPLIER'S number (first one listed)
- "Price": unit purchase price. Use the net/discounted price if available. If only gross price with a discount %, calculate: price × (1 - discount/100)
- "Quantity": number of units ordered
- "Description": product description text
- "VatCode": always "01"
- "Currency": CHF or EUR
- If a delivery date is per-line, use the earliest confirmed date as the overall DeliveryDate too
- For delivery notes (Lieferschein): treat as order confirmation, extract what's available. Price may be 0 if not shown.
You must respond ONLY with a valid JSON object — no markdown, no explanation, no extra text."""
# User-message template; ``{pdf_text}`` is filled in via ``str.format``, so the
# literal braces of the JSON skeleton are doubled ("{{" / "}}").
# The date placeholders say DD.MM.YYYY everywhere: the header-level dates used
# to say YYYY-MM-DD, which contradicted both the per-line DeliveryDate below
# and the "all dates are DD.MM.YYYY" rule of the system prompt, leaving the
# model with conflicting format instructions.
EXTRACTION_PROMPT = """Extract the order data from this supplier PDF text and return JSON matching this exact structure:
{{
"Supplier": "supplier company name",
"OurOrderNumber": "2600xxx",
"DeliveryDate": "DD.MM.YYYY or null",
"CustomerNumber": "customer number or null",
"VoucherDate": "DD.MM.YYYY or null",
"Currency": "CHF or EUR",
"AdditionalFields": [],
"VoucherLines": [
{{
"$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
"Number": "article number",
"Quantity": 1.0,
"Price": 0.0,
"Description": "product description",
"VatCode": "01",
"DeliveryDate": "DD.MM.YYYY or null"
}}
]
}}
PDF TEXT:
---
{pdf_text}
---
Return ONLY the JSON object."""
# ─────────────────────────────────────────────────────────────────────────────
# LLM extraction via OpenAI API
# ─────────────────────────────────────────────────────────────────────────────
def llm_extract(pdf_text: str, api_key: str) -> dict:
    """Call OpenAI GPT-4o to extract structured order data from PDF text.

    Args:
        pdf_text: Plain text previously extracted from the supplier PDF.
        api_key: OpenAI API key used for this request.

    Returns:
        The JSON object produced by the model, decoded into a dict.

    Raises:
        ValueError: If the model returns no content, or JSON that is not an
            object.
        json.JSONDecodeError: If the model output is not valid JSON
            (a ``ValueError`` subclass).
    """
    client = OpenAI(api_key=api_key)
    completion = client.chat.completions.create(
        model="gpt-4o",
        max_tokens=4096,
        temperature=0,
        response_format={"type": "json_object"},  # enforces valid JSON output
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": EXTRACTION_PROMPT.format(pdf_text=pdf_text)},
        ],
    )

    # ``content`` is nullable in the API (e.g. on a refusal); calling
    # ``.strip()`` on it unguarded would raise AttributeError.
    content = completion.choices[0].message.content
    if content is None or not content.strip():
        raise ValueError("LLM returned an empty response.")

    raw = content.strip()
    # Strip any accidental markdown fences (safety net)
    raw = re.sub(r"^```(?:json)?\s*", "", raw)
    raw = re.sub(r"\s*```$", "", raw)

    data = json.loads(raw)
    # Callers use ``.get`` on the result, so anything but an object is unusable.
    if not isinstance(data, dict):
        raise ValueError("LLM returned JSON that is not an object.")
    return data
def _coalesce(value, default):
    """Return *default* when *value* is None, otherwise *value* unchanged."""
    return default if value is None else value


def parse_pdf(pdf_bytes: bytes, api_key: str) -> VoucherResponse:
    """Extract the text of a supplier PDF and turn it into a VoucherResponse.

    The LLM is allowed to answer ``null`` for fields it cannot find. A plain
    ``dict.get(key, default)`` returns that ``None`` instead of the default,
    so every value is coalesced explicitly before conversion.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.
        api_key: OpenAI API key forwarded to ``llm_extract``.

    Returns:
        The validated voucher with one VoucherLine per extracted article line.

    Raises:
        ValueError: If no text can be extracted from the PDF, or a quantity or
            price cannot be converted to a number. Errors raised by
            ``llm_extract`` and by model validation propagate unchanged.
    """
    # 1. Extract text from PDF
    pdf_text = extract_text(pdf_bytes)
    if not pdf_text.strip():
        raise ValueError("Could not extract any text from this PDF (may be image-based).")

    # 2. Send to LLM for extraction
    raw_data = llm_extract(pdf_text, api_key)

    # 3. Build VoucherLines ("VoucherLines": null is treated as no lines)
    default_line_type = (
        "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer"
    )
    voucher_lines = [
        VoucherLine(**{
            "$type": _coalesce(line.get("$type"), default_line_type),
            "Number": str(_coalesce(line.get("Number"), "")),
            "Quantity": float(_coalesce(line.get("Quantity"), 0)),
            "Price": float(_coalesce(line.get("Price"), 0)),
            "Description": _coalesce(line.get("Description"), ""),
            "VatCode": _coalesce(line.get("VatCode"), "01"),
            "DeliveryDate": line.get("DeliveryDate"),
        })
        for line in _coalesce(raw_data.get("VoucherLines"), [])
    ]

    return VoucherResponse(
        Supplier=_coalesce(raw_data.get("Supplier"), "Unknown"),
        OurOrderNumber=str(_coalesce(raw_data.get("OurOrderNumber"), "")),
        DeliveryDate=raw_data.get("DeliveryDate"),
        CustomerNumber=raw_data.get("CustomerNumber"),
        VoucherDate=raw_data.get("VoucherDate"),
        Currency=_coalesce(raw_data.get("Currency"), "CHF"),
        AdditionalFields=[],
        VoucherLines=voucher_lines,
    )
# ─────────────────────────────────────────────────────────────────────────────
# FastAPI
# ─────────────────────────────────────────────────────────────────────────────
# Application object; title, description and version are the metadata passed
# to the FastAPI constructor.
app = FastAPI(
    version="4.0.0",
    title="Universal Order Confirmation PDF Extractor",
    description=(
        "Upload any supplier order-confirmation PDF "
        "and receive ERP-ready JSON. "
        "Powered by GPT-4o — works with any supplier format."
    ),
)
@app.post(
    "/extract",
    response_model=VoucherResponse,
    summary="Extract order data from any supplier PDF",
)
async def extract_order(
    file: UploadFile = File(..., description="Supplier order-confirmation PDF")
):
    """
    Upload a PDF order confirmation from any supplier.
    GPT-4o extracts the order data and returns ERP-ready JSON.
    The OpenAI API key is read from the OPENAI_API_KEY environment variable.

    Error responses: 400 for a non-PDF or empty upload, 401 for an invalid
    OpenAI API key, 422 when no usable data can be extracted, 500 for a
    missing API key or any other failure.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    api_key: str = OPENAI_API_KEY
    if not api_key:
        raise HTTPException(
            status_code=500,
            detail="OpenAI API key not configured. Set the OPENAI_API_KEY environment variable.",
        )

    # ``filename`` is optional on an upload; treat a missing name as non-PDF
    # instead of crashing on ``None.lower()``.
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")

    content = await file.read()
    if not content:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    try:
        # parse_pdf is synchronous (PDF parsing plus a blocking HTTP call);
        # running it in a worker thread keeps the event loop responsive.
        result = await run_in_threadpool(parse_pdf, content, api_key)
    except json.JSONDecodeError as e:
        # Must precede ValueError: JSONDecodeError is a subclass of it.
        raise HTTPException(status_code=422, detail=f"LLM returned invalid JSON: {e}") from e
    except ValueError as e:
        raise HTTPException(status_code=422, detail=str(e)) from e
    except OpenAIAuthError as e:
        raise HTTPException(status_code=401, detail="Invalid OpenAI API key.") from e
    except Exception as e:
        # Top-level boundary: convert anything unexpected into a 500 response.
        raise HTTPException(status_code=500, detail=f"Extraction error: {e}") from e
    return result
@app.get("/health", summary="Health check")
def health():
    # Static payload; nothing beyond the process answering is checked.
    # (A comment rather than a docstring: FastAPI would publish a docstring
    # as the endpoint description.)
    payload = {"status": "ok", "version": "4.0.0"}
    return payload