# Page header captured along with this file (not part of the module):
#   Spaces: Boka73 / Tender-data-automation
#   Running
#   File size: 3,791 Bytes
#   dd6303a

"""
Structured extraction model for tender PDF parsing.

Every extracted value keeps its source, confidence, and review status.
This allows the cross-check report to show what is complete, duplicated,
conflicting, or waiting for approval.
"""

from dataclasses import dataclass, field, asdict
from typing import Any, Dict, List, Optional


@dataclass
class FieldEvidence:
    """One piece of supporting evidence for an extracted field value.

    Records where a value was seen and how confident the extraction was.
    Nothing is validated here; the class is a plain data holder.
    """

    # Name of the document the value came from; ExtractedField.best_source()
    # uses it as the leading part of its result.
    source_file: str
    # Kind of source. NOTE(review): the allowed values are not defined in the
    # visible part of this module - confirm with the callers.
    source_type: str
    # Page reference inside the source, or None when no page is known.
    # NOTE(review): whether pages are 0- or 1-based is not established here.
    page: Optional[int]
    # Text excerpt backing the value; ExtractionBundle.set_field() stores at
    # most the first 500 characters.
    snippet: str
    # Name of the extraction technique that produced the value (free-form
    # string as far as this module is concerned).
    method: str
    # Confidence score of this evidence. ExtractionBundle.set_field() compares
    # it against 0.85, so it is presumably on a 0.0-1.0 scale - confirm.
    confidence: float


@dataclass
class ExtractedField:
    """A single extracted field with its value, review status and evidence.

    A freshly created field has no value, confidence ``0.0`` and status
    ``"missing"``. Evidence is accumulated through :meth:`add_evidence`, which
    also keeps ``confidence`` at the highest confidence seen so far.
    """

    # Identifier of the field and its human-readable label.
    key: str
    label: str
    # Value as extracted, and the form used when comparing values from
    # different sources (see ExtractionBundle.set_field()).
    value: Any = None
    normalized_value: Any = None
    # Highest confidence among the evidence added via add_evidence().
    confidence: float = 0.0
    status: str = "missing"  # matched, conflict, duplicate, missing, review, approved
    evidence: List[FieldEvidence] = field(default_factory=list)
    # Free-text remarks; ExtractionBundle.set_field() appends conflict notes.
    notes: List[str] = field(default_factory=list)

    def add_evidence(self, evidence: FieldEvidence) -> None:
        """Append *evidence* and raise ``confidence`` if the evidence is stronger.

        The field confidence never decreases: it is only replaced when the new
        evidence has a strictly higher confidence.
        """
        self.evidence.append(evidence)
        if evidence.confidence > self.confidence:
            self.confidence = evidence.confidence

    def best_source(self) -> str:
        """Return a short reference to the most confident piece of evidence.

        Returns:
            ``"<source_file> p.<page>"`` for the evidence with the highest
            confidence (the earliest one wins on ties), just ``"<source_file>"``
            when that evidence has no page, or ``""`` when there is no
            evidence at all.
        """
        if not self.evidence:
            return ""
        best = max(self.evidence, key=lambda e: e.confidence)
        # Test against None rather than truthiness: only None means "no page",
        # while 0 is a legitimate page reference that must not be dropped.
        page = f" p.{best.page}" if best.page is not None else ""
        return f"{best.source_file}{page}"


@dataclass
class DuplicateFinding:
    """Report entry for a field whose value was found in several sources.

    Plain data holder. Nothing in the visible part of this module creates
    instances; ExtractionBundle only provides the ``duplicates`` list that
    holds them.
    """

    # Key of the affected field (presumably an ExtractedField.key - confirm).
    field_key: str
    # The value that was found more than once.
    value: Any
    # References to the sources containing the value; the exact format is not
    # defined in this module.
    sources: List[str]
    # Defaults mark the finding as a duplicate with a generic explanation.
    status: str = "duplicate"
    note: str = "Same value found in multiple documents."


@dataclass
class ConflictFinding:
    """Report entry for a field that has different values in different sources.

    Plain data holder. Nothing in the visible part of this module creates
    instances; ExtractionBundle only provides the ``conflicts`` list that
    holds them.
    """

    # Key of the affected field (presumably an ExtractedField.key - confirm).
    field_key: str
    # The competing values. NOTE(review): presumably keyed by source name, but
    # the key schema is not established in this module.
    values: Dict[str, Any]
    # Defaults mark the finding as a conflict with a generic explanation.
    status: str = "conflict"
    note: str = "Different values found for the same field."


@dataclass
class ExtractionBundle:
    """Container for everything extracted from a set of documents.

    ``fields`` maps a field key to its :class:`ExtractedField`; it is filled
    through :meth:`set_field`, which also derives each field's review status.
    ``duplicates``, ``conflicts`` and ``raw_text_by_file`` are plain storage:
    no method visible in this class writes to them.
    """

    fields: Dict[str, ExtractedField] = field(default_factory=dict)
    duplicates: List[DuplicateFinding] = field(default_factory=list)
    conflicts: List[ConflictFinding] = field(default_factory=list)
    raw_text_by_file: Dict[str, str] = field(default_factory=dict)

    def set_field(
        self,
        key: str,
        label: str,
        value: Any,
        source_file: str,
        source_type: str,
        page: Optional[int],
        snippet: str,
        method: str,
        confidence: float,
        normalized_value: Any = None,
    ) -> None:
        """Record one extracted value for *key* and update the field status.

        The first value seen for a field is stored and the status becomes
        ``"matched"`` when ``confidence >= 0.85``, otherwise ``"review"``.
        Every later value is compared with the stored one (stringified,
        stripped, case-insensitive): an equal value marks the field
        ``"duplicate"``, a different one marks it ``"conflict"`` and adds a
        note. A field that is already in conflict stays in conflict even if a
        later source agrees with the stored value. The evidence is always
        appended, whatever the outcome.

        Args:
            key: Field identifier used as the key in ``fields``.
            label: Label used only when the field is created by this call.
            value: Value as extracted.
            source_file: Document the value came from.
            source_type: Kind of source, stored on the evidence as given.
            page: Page reference, or ``None``.
            snippet: Supporting text; only the first 500 characters are kept.
                ``None`` is treated as an empty snippet.
            method: Extraction technique, stored on the evidence as given.
            confidence: Confidence of this extraction.
            normalized_value: Comparison form of ``value``; defaults to
                ``value`` itself when ``None``.
        """
        entry = self.fields.get(key)
        if entry is None:
            entry = ExtractedField(key=key, label=label)

        evidence = FieldEvidence(
            source_file=source_file,
            source_type=source_type,
            page=page,
            # Guard against a missing snippet so slicing cannot raise.
            snippet=(snippet or "")[:500],
            method=method,
            confidence=confidence,
        )

        candidate_norm = normalized_value if normalized_value is not None else value
        if entry.value is None:
            # NOTE(review): a None `value` still sets the status here, leaving
            # a "matched"/"review" field without a value - confirm whether
            # callers ever pass None.
            entry.value = value
            entry.normalized_value = candidate_norm
            entry.status = "matched" if confidence >= 0.85 else "review"
        else:
            # A field created outside set_field() may carry a value without a
            # normalized form; compare against the raw value in that case
            # instead of the string "None".
            stored = (
                entry.normalized_value
                if entry.normalized_value is not None
                else entry.value
            )
            existing = str(stored).strip().lower()
            incoming = str(candidate_norm).strip().lower()
            if existing == incoming:
                # Do not let an agreeing source hide an earlier disagreement.
                if entry.status != "conflict":
                    entry.status = "duplicate"
            else:
                entry.status = "conflict"
                entry.notes.append(f"Conflicting value from {source_file}: {value}")

        entry.add_evidence(evidence)
        self.fields[key] = entry

    def get(self, key: str, default: Any = "") -> Any:
        """Return the stored value for *key*, or *default* if absent or ``None``."""
        entry = self.fields.get(key)
        return entry.value if entry and entry.value is not None else default

    def to_dict(self) -> dict:
        """Return the whole bundle as nested plain dicts/lists via ``asdict``."""
        return asdict(self)

    def summarize_quality(self) -> Dict[str, int]:
        """Count fields per status.

        The six known statuses are always present in the result (with ``0``
        when unused); any other status found on a field gets its own entry.
        """
        counts = {"matched": 0, "duplicate": 0, "conflict": 0, "missing": 0, "review": 0, "approved": 0}
        for entry in self.fields.values():
            counts[entry.status] = counts.get(entry.status, 0) + 1
        return counts