Spaces:

AiDeveloper1
/

E3K

Sleeping

App Files Files Community

E3K / main.py

AiDeveloper1

Update main.py

080df18 verified about 1 month ago

raw

history blame contribute delete

11.7 kB

	"""
	FastAPI — Universal Order Confirmation PDF Extractor
	Uses OpenAI GPT-4o to extract data from ANY supplier PDF format.

	Run:
	pip install -r requirements.txt
	export OPENAI_API_KEY="sk-..."
	uvicorn main:app --reload --port 8000
	"""

	import io
	import json
	import os
	import re
	from typing import Any, List, Optional
	from dotenv import load_dotenv
	import pdfplumber
	from openai import OpenAI, AuthenticationError as OpenAIAuthError
	from fastapi import FastAPI, File, HTTPException, UploadFile
	from pydantic import BaseModel, Field

	# Load variables from a local .env file (if one exists) before any
	# configuration value is read from the environment.
	load_dotenv()

	# ─────────────────────────────────────────────────────────────────────────────
	# Config
	# ─────────────────────────────────────────────────────────────────────────────

	# API key handed to the OpenAI client; falls back to an empty string when the
	# environment variable is not set.
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")



	# ─────────────────────────────────────────────────────────────────────────────
	# Pydantic models
	# ─────────────────────────────────────────────────────────────────────────────

	# Default value of the "$type" discriminator carried by every voucher line.
	_ARTICLE_VOUCHER_LINE_TYPE = (
	    "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, "
	    "E3k.Web.Objects.DataTransfer"
	)


	class VoucherLine(BaseModel):
	    """A single article line of a voucher.

	    The ``type`` field is exposed under the alias ``$type`` and defaults to
	    the E3k ``ArticleVoucherLine`` data-transfer type name.
	    """

	    # Allow the model to be populated through the field name as well as
	    # through the "$type" alias.
	    model_config = {"populate_by_name": True}

	    type: str = Field(
	        alias="$type",
	        default=_ARTICLE_VOUCHER_LINE_TYPE,
	    )
	    Number: str  # Supplier article number
	    Quantity: float
	    Price: float  # Purchase unit price (from PDF)
	    Description: str = ""
	    VatCode: str = "01"
	    DeliveryDate: Optional[str] = None


	class VoucherResponse(BaseModel):
	    """Response payload of the extraction endpoint: header data plus lines.

	    The nullable header fields default to ``None`` so the model can be built
	    when the document does not provide them, in line with
	    ``VoucherLine.DeliveryDate``.
	    """

	    Supplier: str
	    OurOrderNumber: str  # e.g. 2600385
	    DeliveryDate: Optional[str] = None
	    CustomerNumber: Optional[str] = None
	    VoucherDate: Optional[str] = None
	    Currency: str
	    # default_factory gives every instance its own list instead of relying on
	    # a shared mutable literal as the declared default.
	    AdditionalFields: List[Any] = Field(default_factory=list)
	    VoucherLines: List[VoucherLine]


	# ─────────────────────────────────────────────────────────────────────────────
	# PDF text extraction
	# ─────────────────────────────────────────────────────────────────────────────

	def extract_text(pdf_bytes: bytes) -> str:
	    """Return the text of all pages of a PDF, joined with newlines.

	    Pages for which pdfplumber yields no text (``None`` or an empty string)
	    are left out, so a PDF without extractable text gives an empty string.
	    """
	    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
	        # The generator is consumed by join() while the PDF is still open.
	        page_texts = (page.extract_text() for page in document.pages)
	        return "\n".join(text for text in page_texts if text)




	# ─────────────────────────────────────────────────────────────────────────────
	# LLM prompts
	# ─────────────────────────────────────────────────────────────────────────────

	# System message sent with every extraction request (see llm_extract). It
	# defines the extraction rules: which German labels identify each output
	# field, that all dates are emitted as DD.MM.YYYY, how a calendar week ("KW")
	# is turned into a date, which lines count as article lines, and that the
	# reply must be a bare JSON object.
	SYSTEM_PROMPT = """You are a precise data extraction assistant for a Swiss hose service company (Schlauchservice Baumann GmbH).
	You receive raw text extracted from supplier order confirmation PDFs (in German) and must extract structured order data.

	IMPORTANT RULES:
	- "OurOrderNumber" is the BUYER's order number — look for patterns like 2600xxx, BEST. 2600xxx, "Ihre Bestellung", "I/Bestellung", "Ihre Bestellnr.", "Ihr Auftrag", "Bestellreferenz"
	- "CustomerNumber" is the supplier's customer number for Schlauchservice Baumann — look for "Kunden-Nr.", "Debitorennr.", "Kundennr.", "Kunden NR.", "Ihre Kunden-Nr.", "Kundennummer"
	- All dates in the output must use the format DD.MM.YYYY (e.g. 18.03.2026). Convert ALL date formats to this.
	- "DeliveryDate": the confirmed delivery/dispatch date — look for "Lieferung/Termin", "Auslieferdatum", "Versandtermin", "Lieferung", "Termin best.", "Warenausgangsdatum", "Versand-Datum", "Liefertermin"
	* If the delivery date is given as a calendar week like "KW 11" or "KW11", convert it to the WEDNESDAY of that ISO week in the document year. Example: "KW 11" in year 2026 → Wednesday of week 11, 2026 = 11.03.2026. Derive the year from the document/voucher date.
	* If no delivery date is mentioned at all, set DeliveryDate to null.
	- "VoucherDate": the document/order confirmation date — look for "Datum", "Belegdatum", date next to "Auftragsbestätigung". Format as DD.MM.YYYY.
	- For VoucherLines: extract ONLY real product/article lines. Skip shipping costs, surcharge lines, freight lines, and packaging lines UNLESS they have a real article number.
	- "Number": the supplier's article/item number. If there are two numbers (our nr / their nr), use the SUPPLIER'S number (first one listed)
	- "Price": unit purchase price. Use the net/discounted price if available. If only gross price with a discount %, calculate: price × (1 - discount/100)
	- "Quantity": number of units ordered
	- "Description": product description text
	- "VatCode": always "01"
	- "Currency": CHF or EUR
	- If a delivery date is per-line, use the earliest confirmed date as the overall DeliveryDate too
	- For delivery notes (Lieferschein): treat as order confirmation, extract what's available. Price may be 0 if not shown.

	You must respond ONLY with a valid JSON object — no markdown, no explanation, no extra text."""

	# User-message template for the extraction request. It is rendered with
	# str.format: the doubled braces are literal JSON braces and {pdf_text} is the
	# only placeholder. The date hints use DD.MM.YYYY throughout so the example
	# structure agrees with the date rule stated in SYSTEM_PROMPT.
	EXTRACTION_PROMPT = """Extract the order data from this supplier PDF text and return JSON matching this exact structure:

	{{
	"Supplier": "supplier company name",
	"OurOrderNumber": "2600xxx",
	"DeliveryDate": "DD.MM.YYYY or null",
	"CustomerNumber": "customer number or null",
	"VoucherDate": "DD.MM.YYYY or null",
	"Currency": "CHF or EUR",
	"AdditionalFields": [],
	"VoucherLines": [
	{{
	"$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
	"Number": "article number",
	"Quantity": 1.0,
	"Price": 0.0,
	"Description": "product description",
	"VatCode": "01",
	"DeliveryDate": "DD.MM.YYYY or null"
	}}
	]
	}}

	PDF TEXT:
	---
	{pdf_text}
	---

	Return ONLY the JSON object."""


	# ─────────────────────────────────────────────────────────────────────────────
	# LLM extraction via OpenAI API
	# ─────────────────────────────────────────────────────────────────────────────

	def llm_extract(pdf_text: str, api_key: str) -> dict:
	    """Call OpenAI GPT-4o to extract structured order data from PDF text.

	    Args:
	        pdf_text: Text previously extracted from the supplier PDF.
	        api_key: OpenAI API key used to create the client for this call.

	    Returns:
	        The JSON object returned by the model, decoded into a ``dict``.

	    Raises:
	        ValueError: If the completion carries no choices or no message
	            content, or if the decoded JSON is not an object.
	        json.JSONDecodeError: If the reply is not valid JSON (this is a
	            ``ValueError`` subclass).
	    """
	    client = OpenAI(api_key=api_key)

	    completion = client.chat.completions.create(
	        model="gpt-4o",
	        max_tokens=4096,
	        temperature=0,
	        response_format={"type": "json_object"},  # enforces valid JSON output
	        messages=[
	            {"role": "system", "content": SYSTEM_PROMPT},
	            {"role": "user", "content": EXTRACTION_PROMPT.format(pdf_text=pdf_text)},
	        ],
	    )

	    if not completion.choices:
	        raise ValueError("LLM returned no completion choices.")

	    # The message content is nullable; calling .strip() on None would raise an
	    # AttributeError, so report it as an explicit extraction failure instead.
	    content = completion.choices[0].message.content
	    if content is None:
	        raise ValueError("LLM returned an empty response (no message content).")
	    raw = content.strip()

	    # Strip any accidental markdown fences (safety net)
	    raw = re.sub(r"^```(?:json)?\s*", "", raw)
	    raw = re.sub(r"\s*```$", "", raw)

	    data = json.loads(raw)
	    # Callers use dict access on the result, so reject arrays and scalars here.
	    if not isinstance(data, dict):
	        raise ValueError(
	            f"LLM returned JSON of type {type(data).__name__}, expected an object."
	        )
	    return data


	def _text_or_none(value: Any) -> Optional[str]:
	    """Return *value* as a string, or None when it is missing, blank or the text "null"."""
	    if value is None:
	        return None
	    text = value if isinstance(value, str) else str(value)
	    if text.strip().lower() in ("", "null"):
	        return None
	    return text


	def _number_or_zero(value: Any) -> float:
	    """Return *value* as a float, treating a missing (None or empty) value as 0."""
	    if value is None or value == "":
	        return 0.0
	    return float(value)


	def parse_pdf(pdf_bytes: bytes, api_key: str) -> VoucherResponse:
	    """Extract the text of a PDF, have the LLM structure it, and build the response model.

	    Values the model reports as JSON ``null`` are replaced by the same
	    defaults that are used when the key is absent, and text fields delivered
	    as numbers are converted to strings before they reach the models.

	    Args:
	        pdf_bytes: Raw content of the uploaded PDF.
	        api_key: OpenAI API key forwarded to ``llm_extract``.

	    Returns:
	        The populated ``VoucherResponse``.

	    Raises:
	        ValueError: If no text can be extracted from the PDF, if the LLM
	            result is not a JSON object, or if a quantity/price string
	            cannot be converted to a number.
	    """
	    # 1. Extract text from PDF
	    pdf_text = extract_text(pdf_bytes)
	    if not pdf_text.strip():
	        raise ValueError("Could not extract any text from this PDF (may be image-based).")

	    # 2. Send to LLM for extraction
	    raw_data = llm_extract(pdf_text, api_key)
	    if not isinstance(raw_data, dict):
	        raise ValueError("LLM result is not a JSON object.")

	    # 3. Build VoucherLines
	    default_line_type = (
	        "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, "
	        "E3k.Web.Objects.DataTransfer"
	    )
	    voucher_lines = []
	    # "or []" also covers an explicit null for the whole list.
	    for line in raw_data.get("VoucherLines") or []:
	        voucher_lines.append(
	            VoucherLine(**{
	                "$type": _text_or_none(line.get("$type")) or default_line_type,
	                # A null value must not turn into the text "None".
	                "Number": _text_or_none(line.get("Number")) or "",
	                # float(None) would raise a TypeError for null values.
	                "Quantity": _number_or_zero(line.get("Quantity")),
	                "Price": _number_or_zero(line.get("Price")),
	                "Description": _text_or_none(line.get("Description")) or "",
	                "VatCode": _text_or_none(line.get("VatCode")) or "01",
	                "DeliveryDate": _text_or_none(line.get("DeliveryDate")),
	            })
	        )

	    # 4. Assemble the header; optional fields stay None when not provided.
	    return VoucherResponse(
	        Supplier=_text_or_none(raw_data.get("Supplier")) or "Unknown",
	        OurOrderNumber=_text_or_none(raw_data.get("OurOrderNumber")) or "",
	        DeliveryDate=_text_or_none(raw_data.get("DeliveryDate")),
	        CustomerNumber=_text_or_none(raw_data.get("CustomerNumber")),
	        VoucherDate=_text_or_none(raw_data.get("VoucherDate")),
	        Currency=_text_or_none(raw_data.get("Currency")) or "CHF",
	        AdditionalFields=[],
	        VoucherLines=voucher_lines,
	    )


	# ─────────────────────────────────────────────────────────────────────────────
	# FastAPI
	# ─────────────────────────────────────────────────────────────────────────────

	# Metadata passed to the FastAPI application (title, description, version).
	_APP_METADATA = dict(
	    title="Universal Order Confirmation PDF Extractor",
	    description=(
	        "Upload any supplier order-confirmation PDF and receive ERP-ready JSON. "
	        "Powered by GPT-4o — works with any supplier format."
	    ),
	    version="4.0.0",
	)

	# The application object that the route decorators below register on.
	app = FastAPI(**_APP_METADATA)


	@app.post(
	    "/extract",
	    response_model=VoucherResponse,
	    summary="Extract order data from any supplier PDF",
	)
	async def extract_order(
	    file: UploadFile = File(..., description="Supplier order-confirmation PDF")
	):
	    """
	    Upload a PDF order confirmation from any supplier.
	    GPT-4o extracts the order data and returns ERP-ready JSON.

	    The OpenAI API key is read from the OPENAI_API_KEY environment variable.

	    Error responses:
	    - 400: the upload is not named *.pdf or is empty.
	    - 401: the OpenAI API key was rejected.
	    - 422: no text could be extracted or the LLM output was unusable.
	    - 500: the API key is not configured, or any other extraction error.
	    """
	    # Imported here so the handler brings its own dependency into scope.
	    from fastapi.concurrency import run_in_threadpool

	    api_key: str = OPENAI_API_KEY

	    if not api_key:
	        raise HTTPException(
	            status_code=500,
	            detail="OpenAI API key not configured. Set the OPENAI_API_KEY env var.",
	        )

	    # The filename of an upload may be missing; treat that like a non-PDF name
	    # instead of failing on None.lower().
	    if not (file.filename or "").lower().endswith(".pdf"):
	        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")

	    content = await file.read()
	    if not content:
	        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

	    try:
	        # parse_pdf is synchronous (PDF parsing plus a blocking OpenAI call);
	        # running it in a worker thread keeps the event loop responsive.
	        result = await run_in_threadpool(parse_pdf, content, api_key)
	    except json.JSONDecodeError as e:
	        # Must precede ValueError: JSONDecodeError is a ValueError subclass.
	        raise HTTPException(status_code=422, detail=f"LLM returned invalid JSON: {e}") from e
	    except ValueError as e:
	        raise HTTPException(status_code=422, detail=str(e)) from e
	    except OpenAIAuthError as e:
	        raise HTTPException(status_code=401, detail="Invalid OpenAI API key.") from e
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Extraction error: {e}") from e

	    return result


	@app.get("/health", summary="Health check")
	def health():
	    """Return a fixed status payload together with the service version."""
	    return dict(status="ok", version="4.0.0")