Boka73's picture
Deploy Gradio app
dd6303a verified
"""Local duplicate detection and side-by-side tender comparison."""
from __future__ import annotations
import difflib
import json
import pathlib
# Directory two levels above this file (the parent of the folder containing it).
BASE = pathlib.Path(__file__).resolve().parents[1]
# Folder under BASE that the functions below read per-tender data from.
OUTPUT_DIR = BASE.joinpath("output")
def _load_data(tender_id: str) -> dict:
    """Return the extracted data stored for *tender_id*, or ``{}`` if unavailable.

    Reads ``OUTPUT_DIR / tender_id / "extracted_data.json"`` as UTF-8 JSON.

    An empty dict is returned when the file is missing or cannot be read,
    when its content is not valid UTF-8 JSON, or when the top-level JSON
    value is not an object. Callers can therefore always call ``.get`` on
    the result, and one damaged file cannot abort a scan over many tenders.
    """
    path = OUTPUT_DIR / tender_id / "extracted_data.json"
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing file, a directory at that path, and
        # permission problems. json.JSONDecodeError and UnicodeDecodeError
        # are both ValueError subclasses.
        return {}
    # Honour the declared return type even if the file holds e.g. a JSON list.
    return payload if isinstance(payload, dict) else {}
def compare_tenders(left_id: str, right_id: str) -> dict:
    """Build a field-by-field comparison of two tenders' extracted data.

    Both tenders are loaded with ``_load_data``. Every key present in either
    one is compared, in sorted key order, except the keys ``boq_items``,
    ``equipment``, ``manpower``, ``work_activities`` and ``jv_partners``,
    which are left out. A key missing on one side is compared as ``""``.

    Returns:
        A dict with ``left_id``, ``right_id`` and ``fields``; ``fields`` is a
        list of ``{"field", "left", "right", "same"}`` rows.
    """
    excluded = {"boq_items", "equipment", "manpower", "work_activities", "jv_partners"}
    left_data = _load_data(left_id)
    right_data = _load_data(right_id)

    def build_row(name):
        # Absent keys fall back to "" so they compare equal to empty strings.
        left_value = left_data.get(name, "")
        right_value = right_data.get(name, "")
        return {
            "field": name,
            "left": left_value,
            "right": right_value,
            "same": left_value == right_value,
        }

    compared_names = sorted({*left_data, *right_data} - excluded)
    return {
        "left_id": left_id,
        "right_id": right_id,
        "fields": [build_row(name) for name in compared_names],
    }
def detect_duplicates(target_id: str, threshold: float = 0.82) -> list[dict]:
    """Find other tenders whose identifying fields closely match *target_id*.

    The ``work_name``, ``package_no``, ``location`` and ``procuring_entity``
    values of each tender are joined with spaces, lower-cased, and compared
    to the target's text with ``difflib.SequenceMatcher``.

    Args:
        target_id: Name of the tender folder under ``OUTPUT_DIR`` to check.
        threshold: Minimum similarity ratio for a tender to be reported.

    Returns:
        A list of ``{"tender_id", "similarity", "work_name"}`` dicts, sorted
        by similarity (highest first) and then by ``tender_id`` so that equal
        scores come back in a stable order. ``similarity`` is rounded to
        three decimals. The list is empty when the target has no identifying
        text to compare.
    """
    identity_fields = ("work_name", "package_no", "location", "procuring_entity")

    def identity_text(data: dict) -> str:
        # Missing keys and JSON nulls both become "", so a null is not
        # compared as the literal text "None".
        values = (data.get(field) for field in identity_fields)
        return " ".join("" if v is None else str(v) for v in values).lower()

    target_text = identity_text(_load_data(target_id))
    if not target_text.strip():
        # With nothing to compare, every other empty tender would otherwise
        # score 1.0 and be reported as a duplicate.
        return []

    matches = []
    for folder in OUTPUT_DIR.glob("*"):
        if not folder.is_dir() or folder.name == target_id:
            continue
        other = _load_data(folder.name)
        other_text = identity_text(other)
        if not other_text.strip():
            # Folder with no usable extracted data; it cannot be a duplicate.
            continue
        # Argument order (target first) is kept: ratio() is not guaranteed
        # to be symmetric.
        score = difflib.SequenceMatcher(None, target_text, other_text).ratio()
        if score >= threshold:
            matches.append(
                {
                    "tender_id": folder.name,
                    "similarity": round(score, 3),
                    "work_name": other.get("work_name", ""),
                }
            )
    return sorted(matches, key=lambda m: (-m["similarity"], m["tender_id"]))