# TenderIQ/core/criteria_extractor.py
# JaydeepR — Step 6: criteria extractor, audit, fallback, and Tab 2 wiring
# Commit 61e2cc7
import json
from pathlib import Path
import streamlit as st
from core import audit, fallback
from core.config import MODEL_VERSION
from core.llm_client import LLM, LLMUnavailable
from core.pdf_utils import extract_pages
from core.prompts import EXTRACT_CRITERIA_PROMPT_SYSTEM
from core.schemas import Criterion
@st.cache_resource
def _get_llm() -> LLM:
    """Build the LLM client used by this module.

    The function is wrapped in ``st.cache_resource``, so Streamlit caches
    the returned client and hands the same object back on later calls
    instead of constructing a new one each time.

    Returns:
        An ``LLM`` instance created with its default constructor arguments.
    """
    client = LLM()
    return client
def extract_criteria(tender_pdf_path: Path) -> list[Criterion]:
    """Extract the evaluation criteria of a tender PDF via the LLM.

    The PDF is split into pages with ``extract_pages``; every page is
    prefixed with a ``--- PAGE n ---`` marker and the pages are joined into
    one prompt, followed by the JSON format the model must answer in. Each
    entry of the ``"criteria"`` list in the reply is turned into a
    ``Criterion`` and the extraction is recorded in the audit log.

    Args:
        tender_pdf_path: Location of the tender PDF. Only its ``name`` is
            written to the audit log.

    Returns:
        The criteria built from the LLM reply (an empty list when the reply
        has no ``"criteria"`` entry or it is null). When the LLM is
        unavailable, the result of ``fallback.load_criteria()`` is returned
        instead -- presumably a precomputed list of criteria; confirm
        against ``core.fallback``.

    Raises:
        Whatever ``Criterion(**item)`` raises for a malformed entry; such
        errors are deliberately not swallowed here.
    """
    pages = extract_pages(tender_pdf_path)
    tender_text = "\n\n".join(
        f"--- PAGE {p['page']} ---\n{p['text']}" for p in pages
    )

    # NOTE: the continuation lines of this literal are part of the prompt
    # sent to the model, so they must stay flush-left exactly as written.
    user_prompt = f"""{tender_text}
---
Return JSON in this exact format:
{{"criteria": [
{{"id": "C1", "title": "...", "category": "financial|technical|compliance",
"mandatory": true, "description": "...",
"rule": {{"type": "numeric_threshold|count_threshold|certification_present|document_present",
"field": "...", "operator": ">=|<=|==|exists", "value": null, "unit": null}},
"query_hints": ["...", "...", "..."],
"source_page": 1, "source_clause": "3.2(a)"}},
...
]}}
Each criterion must have all fields. Assign sequential IDs C1, C2, ...
"""

    # Only acquiring and calling the LLM is guarded: these are the steps the
    # LLMUnavailable fallback exists for. Parsing and audit logging happen
    # outside the try so their errors are never mistaken for an outage.
    try:
        llm = _get_llm()
        result = llm.chat_json(EXTRACT_CRITERIA_PROMPT_SYSTEM, user_prompt)
    except LLMUnavailable:
        audit.log("precomputed_fallback_used", reason="LLMUnavailable in extract_criteria")
        # Flag the session so other parts of the app can see that fallback
        # data is in use.
        st.session_state["fallback_active"] = True
        return fallback.load_criteria()

    # dict.get only applies its default when the key is absent; a reply of
    # {"criteria": null} would hand back None and break the iteration below,
    # so treat null the same as a missing key.
    raw_list = result.get("criteria") or []
    criteria = [Criterion(**item) for item in raw_list]

    audit.log(
        "criteria_extracted",
        model_version=MODEL_VERSION,
        count=len(criteria),
        source=str(tender_pdf_path.name),
    )
    return criteria