JaydeepR Claude Sonnet 4.6 committed on
Commit
61e2cc7
·
1 Parent(s): f42bfb0

Step 6: criteria extractor, audit, fallback, and Tab 2 wiring

Browse files

Implements specs/07_criteria_extractor.md. extract_criteria calls DeepSeek
with the full tender text and parses Criterion objects; falls back to
hardcoded precomputed criteria on LLMUnavailable. audit.py writes to SQLite;
fallback.py loads precomputed JSON or hardcoded defaults. Tab 2 renders
criteria cards with category/mandatory badges and rule details.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

core/audit.py CHANGED
@@ -1,6 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  def log(action: str, actor: str = "system", **fields) -> int:
2
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  def query(filters: dict | None = None) -> list[dict]:
6
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sqlite3
3
+ from datetime import datetime, timezone
4
+ from pathlib import Path
5
+
6
+ from core.config import AUDIT_DB, MODEL_VERSION
7
+
8
# DDL run on every new connection; IF NOT EXISTS makes repeated execution harmless.
_SCHEMA = """
CREATE TABLE IF NOT EXISTS audit_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
ts TEXT NOT NULL,
action TEXT NOT NULL,
actor TEXT NOT NULL,
model_version TEXT,
bidder_id TEXT,
criterion_id TEXT,
payload_json TEXT
);
"""


def _conn() -> sqlite3.Connection:
    """Open the audit database, creating its directory and table if needed.

    Returns:
        An open connection whose rows are ``sqlite3.Row`` objects. The caller
        is responsible for closing it.

    Raises:
        sqlite3.Error: If the schema cannot be created. The half-initialised
            connection is closed before the error propagates.
    """
    Path(AUDIT_DB).parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(AUDIT_DB)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute(_SCHEMA)
        conn.commit()
    except sqlite3.Error:
        # Nobody else holds a reference yet, so close here or the handle leaks.
        conn.close()
        raise
    return conn
29
+
30
+
31
  def log(action: str, actor: str = "system", **fields) -> int:
32
+ ts = datetime.now(timezone.utc).isoformat()
33
+ model_version = fields.pop("model_version", MODEL_VERSION)
34
+ bidder_id = fields.pop("bidder_id", None)
35
+ criterion_id = fields.pop("criterion_id", None)
36
+ payload_json = json.dumps(fields) if fields else None
37
+
38
+ conn = _conn()
39
+ cur = conn.execute(
40
+ "INSERT INTO audit_log (ts, action, actor, model_version, bidder_id, criterion_id, payload_json) "
41
+ "VALUES (?, ?, ?, ?, ?, ?, ?)",
42
+ (ts, action, actor, model_version, bidder_id, criterion_id, payload_json),
43
+ )
44
+ conn.commit()
45
+ row_id = cur.lastrowid
46
+ conn.close()
47
+ return row_id
48
 
49
 
50
  def query(filters: dict | None = None) -> list[dict]:
51
+ conn = _conn()
52
+ sql = "SELECT * FROM audit_log"
53
+ params: list = []
54
+ if filters:
55
+ clauses = []
56
+ if "bidder_id" in filters:
57
+ clauses.append("bidder_id = ?")
58
+ params.append(filters["bidder_id"])
59
+ if "action" in filters:
60
+ clauses.append("action = ?")
61
+ params.append(filters["action"])
62
+ if "date_from" in filters:
63
+ clauses.append("ts >= ?")
64
+ params.append(filters["date_from"])
65
+ if "date_to" in filters:
66
+ clauses.append("ts <= ?")
67
+ params.append(filters["date_to"])
68
+ if clauses:
69
+ sql += " WHERE " + " AND ".join(clauses)
70
+ sql += " ORDER BY id DESC"
71
+ rows = conn.execute(sql, params).fetchall()
72
+ conn.close()
73
+ return [dict(r) for r in rows]
core/criteria_extractor.py CHANGED
@@ -1,6 +1,56 @@
 
1
  from pathlib import Path
 
 
 
 
 
 
 
 
2
  from core.schemas import Criterion
3
 
4
 
 
 
 
 
 
5
  def extract_criteria(tender_pdf_path: Path) -> list[Criterion]:
6
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
  from pathlib import Path
3
+
4
+ import streamlit as st
5
+
6
+ from core import audit, fallback
7
+ from core.config import MODEL_VERSION
8
+ from core.llm_client import LLM, LLMUnavailable
9
+ from core.pdf_utils import extract_pages
10
+ from core.prompts import EXTRACT_CRITERIA_PROMPT_SYSTEM
11
  from core.schemas import Criterion
12
 
13
 
14
@st.cache_resource
def _get_llm() -> LLM:
    """Return the LLM client, constructed through Streamlit's resource cache."""
    client = LLM()
    return client
17
+
18
+
19
  def extract_criteria(tender_pdf_path: Path) -> list[Criterion]:
20
+ pages = extract_pages(tender_pdf_path)
21
+ tender_text = "\n\n".join(
22
+ f"--- PAGE {p['page']} ---\n{p['text']}" for p in pages
23
+ )
24
+
25
+ user_prompt = f"""{tender_text}
26
+
27
+ ---
28
+ Return JSON in this exact format:
29
+ {{"criteria": [
30
+ {{"id": "C1", "title": "...", "category": "financial|technical|compliance",
31
+ "mandatory": true, "description": "...",
32
+ "rule": {{"type": "numeric_threshold|count_threshold|certification_present|document_present",
33
+ "field": "...", "operator": ">=|<=|==|exists", "value": null, "unit": null}},
34
+ "query_hints": ["...", "...", "..."],
35
+ "source_page": 1, "source_clause": "3.2(a)"}},
36
+ ...
37
+ ]}}
38
+ Each criterion must have all fields. Assign sequential IDs C1, C2, ...
39
+ """
40
+
41
+ try:
42
+ llm = _get_llm()
43
+ result = llm.chat_json(EXTRACT_CRITERIA_PROMPT_SYSTEM, user_prompt)
44
+ raw_list = result.get("criteria", [])
45
+ criteria = [Criterion(**c) for c in raw_list]
46
+ audit.log(
47
+ "criteria_extracted",
48
+ model_version=MODEL_VERSION,
49
+ count=len(criteria),
50
+ source=str(tender_pdf_path.name),
51
+ )
52
+ return criteria
53
+ except LLMUnavailable:
54
+ audit.log("precomputed_fallback_used", reason="LLMUnavailable in extract_criteria")
55
+ st.session_state["fallback_active"] = True
56
+ return fallback.load_criteria()
core/fallback.py CHANGED
@@ -1,9 +1,82 @@
 
 
 
1
  from core.schemas import Criterion, Verdict
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def load_criteria() -> list[Criterion]:
5
- raise NotImplementedError
 
 
 
 
6
 
7
 
8
  def load_evaluation(bidder_id: str, criterion_id: str) -> Verdict:
9
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from core.config import PRECOMPUTED_DIR
4
  from core.schemas import Criterion, Verdict
5
 
6
# Built-in criteria used by load_criteria() when no precomputed criteria.json exists.
_HARDCODED_CRITERIA = [
    {
        "id": "C1",
        "title": "Minimum Annual Turnover",
        "category": "financial",
        "mandatory": True,
        "description": "The bidder shall have a minimum average annual turnover of INR 5 Crore during the last three financial years (2022-23, 2023-24, 2024-25).",
        "rule": {
            "type": "numeric_threshold",
            "field": "annual_turnover_inr",
            "operator": ">=",
            "value": 50000000,
            "unit": "INR",
        },
        "query_hints": [
            "annual turnover",
            "total revenue",
            "INR crore",
            "audited financials",
            "CA certificate",
        ],
        "source_page": 2,
        "source_clause": "3.2(a)",
    },
    {
        "id": "C2",
        "title": "Completed Construction Projects",
        "category": "technical",
        "mandatory": True,
        "description": "The bidder must have successfully completed at least three (3) similar construction projects of value not less than INR 1 Crore each in the last five financial years.",
        "rule": {
            "type": "count_threshold",
            "field": "completed_projects",
            "operator": ">=",
            "value": 3,
            "unit": None,
        },
        "query_hints": [
            "completed projects",
            "construction experience",
            "work order",
            "completion certificate",
            "similar projects",
        ],
        "source_page": 2,
        "source_clause": "3.2(b)",
    },
    {
        "id": "C3",
        "title": "GST Registration",
        "category": "compliance",
        "mandatory": True,
        "description": "The bidder shall possess a valid Goods and Services Tax (GST) registration certificate. The GSTIN must be active as on the date of submission.",
        "rule": {
            "type": "certification_present",
            "field": "gstin",
            "operator": "exists",
            "value": None,
            "unit": None,
        },
        "query_hints": [
            "GSTIN",
            "GST certificate",
            "GST registration",
            "tax registration",
        ],
        "source_page": 2,
        "source_clause": "3.2(c)",
    },
    {
        "id": "C4",
        "title": "ISO 9001:2015 Certification",
        "category": "compliance",
        "mandatory": True,
        "description": "The bidder shall hold a valid ISO 9001:2015 Quality Management System certification issued by an accredited certification body.",
        "rule": {
            "type": "certification_present",
            "field": "iso_9001",
            "operator": "exists",
            "value": None,
            "unit": None,
        },
        "query_hints": [
            "ISO 9001",
            "quality management",
            "ISO certificate",
            "QMS certification",
        ],
        "source_page": 2,
        "source_clause": "3.2(d)",
    },
    {
        "id": "C5",
        "title": "Paramilitary Infrastructure Experience",
        "category": "technical",
        "mandatory": False,
        "description": "Preferably, the bidder may have prior experience with construction or maintenance of paramilitary or defence infrastructure.",
        "rule": {
            "type": "document_present",
            "field": "paramilitary_experience",
            "operator": "exists",
            "value": None,
            "unit": None,
        },
        "query_hints": [
            "paramilitary",
            "defence infrastructure",
            "CRPF",
            "BSF",
            "security forces",
        ],
        "source_page": 2,
        "source_clause": "3.2(e)",
    },
]
53
+
54
 
55
  def load_criteria() -> list[Criterion]:
56
+ criteria_file = PRECOMPUTED_DIR / "criteria.json"
57
+ if criteria_file.exists():
58
+ data = json.loads(criteria_file.read_text(encoding="utf-8"))
59
+ return [Criterion(**c) for c in data.get("criteria", data)]
60
+ return [Criterion(**c) for c in _HARDCODED_CRITERIA]
61
 
62
 
63
  def load_evaluation(bidder_id: str, criterion_id: str) -> Verdict:
64
+ eval_file = PRECOMPUTED_DIR / f"eval_{bidder_id}.json"
65
+ if eval_file.exists():
66
+ data = json.loads(eval_file.read_text(encoding="utf-8"))
67
+ verdicts = data if isinstance(data, list) else data.get("verdicts", [])
68
+ for v in verdicts:
69
+ if v.get("criterion_id") == criterion_id:
70
+ return Verdict(**v)
71
+ # Return a needs_review verdict as safe default
72
+ from core.config import MODEL_VERSION
73
+ from datetime import datetime, timezone
74
+ return Verdict(
75
+ bidder_id=bidder_id,
76
+ criterion_id=criterion_id,
77
+ verdict="needs_review",
78
+ reason="Pre-computed evaluation not available. Manual review required.",
79
+ model_version=MODEL_VERSION,
80
+ timestamp=datetime.now(timezone.utc).isoformat(),
81
+ combined_confidence=0.0,
82
+ )
specs/07_criteria_extractor.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spec 07 — Criteria Extractor
2
+
3
+ **Step:** 6 of 15
4
+ **Time budget:** ~30 min
5
+ **Checkpoint:** Tab 2 in the running app shows 5 criteria extracted from the mock tender.
6
+
7
+ ---
8
+
9
+ ## Goal
10
+
11
+ Implement `core/criteria_extractor.py` and wire up `ui/tab_tender.py` to call it. On `LLMUnavailable`, fall back to `fallback.load_criteria()`. Cache result in `st.session_state["criteria"]`.
12
+
13
+ ---
14
+
15
+ ## `core/criteria_extractor.py`
16
+
17
+ ### `extract_criteria(tender_pdf_path: Path) -> list[Criterion]`
18
+
19
+ 1. Call `pdf_utils.extract_pages(tender_pdf_path)` → list of `{"page": int, "text": str}`.
20
+ 2. Join pages, prefixing each page's text with its page marker: `tender_text = "\n\n".join(f"--- PAGE {p['page']} ---\n{p['text']}" for p in pages)`.
21
+ 3. Build user prompt:
22
+ ```
23
+ {tender_text}
24
+
25
+ ---
26
+ Return JSON in this exact format:
27
+ {"criteria": [
28
+ {"id": "C1", "title": "...", "category": "financial|technical|compliance",
29
+ "mandatory": true|false, "description": "...",
30
+ "rule": {"type": "numeric_threshold|count_threshold|certification_present|document_present",
31
+ "field": "...", "operator": ">=|<=|==|exists", "value": null_or_number, "unit": null_or_string},
32
+ "query_hints": ["...", "..."],
33
+ "source_page": <int>, "source_clause": "..."},
34
+ ...
35
+ ]}
36
+ ```
37
+ 4. Call `llm.chat_json(EXTRACT_CRITERIA_PROMPT_SYSTEM, user_prompt)`.
38
+ 5. Parse `result["criteria"]` → validate each item as `Criterion(**item)`.
39
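+ 6. Log `criteria_extracted` to audit, passing the count as a keyword field: `audit.log("criteria_extracted", count=len(criteria))`. `audit.log` serialises extra keyword fields into `payload_json` itself, so do not pre-encode them with `json.dumps`.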
40
+ 7. Return `list[Criterion]`.
41
+
42
+ On `LLMUnavailable`:
43
+ - Log `precomputed_fallback_used` to audit.
44
+ - Set `st.session_state["fallback_active"] = True`.
45
+ - Return `fallback.load_criteria()`.
46
+
47
+ LLM singleton: use `@st.cache_resource` on a getter `_get_llm()` so the client is created once and the same instance is reused across reruns and sessions.
48
+
49
+ ---
50
+
51
+ ## `ui/tab_tender.py`
52
+
53
+ Renders the Tender Analysis tab. Replaces the stub.
54
+
55
+ Layout:
56
+ 1. `st.header("Tender Analysis")`
57
+ 2. File uploader: `uploaded = st.file_uploader("Upload tender PDF", type=["pdf"])`. If nothing uploaded, use the preloaded mock: `data/tender/crpf_construction_tender.pdf`.
58
+ 3. Show the filename being used.
59
+ 4. Button **"Extract Criteria (Live LLM)"**:
60
+ - Save uploaded bytes to a temp file (or use the mock path directly).
61
+ - Call `criteria_extractor.extract_criteria(path)`.
62
+ - Store in `st.session_state["criteria"]`.
63
+ 5. If `st.session_state.get("criteria")`:
64
+ - Show `st.success(f"Extracted {len(criteria)} criteria")`.
65
+ - For each criterion, render a card using `st.expander`:
66
+ - Title + mandatory/optional badge (🔴 Mandatory / 🟡 Optional).
67
+ - Category badge (color-coded: financial=blue, technical=green, compliance=orange).
68
+ - Description text.
69
+ - Source: page + clause.
70
+ - Rule details (type, operator, value, unit).
71
+
72
+ ---
73
+
74
+ ## Acceptance Criteria
75
+
76
+ 1. `extract_criteria(Path("data/tender/crpf_construction_tender.pdf"))` returns a list of 5 `Criterion` objects (when LLM is available) or the precomputed fallback (when not).
77
+ 2. Tab 2 renders without error in both modes.
78
+ 3. Each extracted criterion shows title, mandatory status, category, and source clause.
79
+ 4. `st.session_state["criteria"]` is populated after the button is clicked.
ui/tab_tender.py CHANGED
@@ -1,5 +1,69 @@
 
 
 
1
  import streamlit as st
2
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  def render() -> None:
4
  st.header("Tender Analysis")
5
- st.info("Coming soon — upload tender and extract eligibility criteria.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ from pathlib import Path
3
+
4
  import streamlit as st
5
 
6
+ from core import criteria_extractor
7
+ from core.config import DATA_DIR
8
+
9
# Bundled sample tender, used when the user uploads nothing.
_MOCK_TENDER = DATA_DIR / "tender" / "crpf_construction_tender.pdf"

# Category name -> coloured marker shown in each criterion's expander label.
_CATEGORY_COLORS = dict(
    financial="🔵",
    technical="🟢",
    compliance="🟠",
)
16
+
17
+
18
  def render() -> None:
19
  st.header("Tender Analysis")
20
+
21
+ uploaded = st.file_uploader("Upload tender PDF (leave blank to use pre-loaded mock)", type=["pdf"])
22
+
23
+ if uploaded:
24
+ tender_bytes = uploaded.read()
25
+ tender_name = uploaded.name
26
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
27
+ tmp.write(tender_bytes)
28
+ tender_path = Path(tmp.name)
29
+ else:
30
+ tender_path = _MOCK_TENDER
31
+ tender_name = _MOCK_TENDER.name
32
+
33
+ st.caption(f"Using: **{tender_name}**")
34
+
35
+ if st.button("Extract Criteria (Live LLM)", type="primary"):
36
+ with st.spinner("Calling DeepSeek to extract eligibility criteria…"):
37
+ criteria = criteria_extractor.extract_criteria(tender_path)
38
+ st.session_state["criteria"] = [c.model_dump() for c in criteria]
39
+ st.session_state["tender_path"] = str(tender_path)
40
+
41
+ criteria_data = st.session_state.get("criteria")
42
+ if criteria_data:
43
+ st.success(f"Extracted **{len(criteria_data)}** criteria")
44
+
45
+ if st.session_state.get("fallback_active"):
46
+ st.warning("⚠ Live API unavailable — showing pre-computed criteria.")
47
+
48
+ for c in criteria_data:
49
+ mandatory_badge = "🔴 Mandatory" if c["mandatory"] else "🟡 Optional"
50
+ cat_icon = _CATEGORY_COLORS.get(c["category"], "⚪")
51
+ label = f"{cat_icon} **{c['id']}** — {c['title']} {mandatory_badge}"
52
+ with st.expander(label, expanded=False):
53
+ col1, col2 = st.columns([2, 1])
54
+ with col1:
55
+ st.markdown(f"**Description:** {c['description']}")
56
+ rule = c["rule"]
57
+ rule_parts = [f"Type: `{rule['type']}`", f"Field: `{rule['field']}`",
58
+ f"Operator: `{rule['operator']}`"]
59
+ if rule.get("value") is not None:
60
+ rule_parts.append(f"Value: `{rule['value']}`")
61
+ if rule.get("unit"):
62
+ rule_parts.append(f"Unit: `{rule['unit']}`")
63
+ st.markdown(" · ".join(rule_parts))
64
+ with col2:
65
+ st.markdown(f"**Category:** {c['category'].capitalize()}")
66
+ st.markdown(f"**Source:** Page {c['source_page']}, Clause {c['source_clause']}")
67
+ if c.get("query_hints"):
68
+ hints = ", ".join(f"`{h}`" for h in c["query_hints"])
69
+ st.markdown(f"**Query hints:** {hints}")