Spaces:
Sleeping
Sleeping
| """Local duplicate detection and side-by-side tender comparison.""" | |
| from __future__ import annotations | |
| import difflib | |
| import json | |
| import pathlib | |
# Project root: two levels up from this file (the parent of the directory
# that contains this module).
BASE = pathlib.Path(__file__).resolve().parents[1]

# Root folder holding one sub-folder per tender; each tender's
# ``extracted_data.json`` is looked up beneath it.
OUTPUT_DIR = BASE.joinpath("output")
def _load_data(tender_id: str) -> dict:
    """Return the extracted data for *tender_id*, or ``{}`` when unavailable.

    The data is read from ``OUTPUT_DIR / tender_id / "extracted_data.json"``.
    "Unavailable" covers every case in which no usable JSON object can be
    obtained: the file is missing, *tender_id* names a regular file rather
    than a folder, the file is not valid UTF-8 / JSON, or the top-level JSON
    value is not an object.

    Args:
        tender_id: Name of the tender's folder under ``OUTPUT_DIR``.

    Returns:
        The decoded JSON object, or an empty dict when unavailable.
    """
    # NOTE(review): tender_id is joined into the path unvalidated; if it can
    # come from user input, confirm that callers reject values such as "..".
    path = OUTPUT_DIR / tender_id / "extracted_data.json"
    try:
        # Read directly instead of checking exists() first, so a file that
        # disappears between the check and the read cannot raise.
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (FileNotFoundError, NotADirectoryError):
        return {}
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A truncated or corrupt file is treated like a missing one so that
        # a single bad folder cannot break callers that scan many tenders.
        return {}
    # Callers rely on .keys()/.get(); a JSON list or scalar is not usable.
    return payload if isinstance(payload, dict) else {}
def compare_tenders(left_id: str, right_id: str) -> dict:
    """Build a field-by-field, side-by-side comparison of two tenders.

    Every key present in either tender's extracted data yields one row,
    except the keys in the skip set below, which are left out of the
    comparison. Rows are ordered by field name. A field missing on one side
    is shown as ``""`` on that side.

    Args:
        left_id: Tender id shown in the ``left`` column.
        right_id: Tender id shown in the ``right`` column.

    Returns:
        ``{"left_id": ..., "right_id": ..., "fields": [...]}`` where each
        entry of ``fields`` is
        ``{"field": name, "left": value, "right": value, "same": bool}``.
    """
    skipped = {"boq_items", "equipment", "manpower", "work_activities", "jv_partners"}
    left_data = _load_data(left_id)
    right_data = _load_data(right_id)

    def build_row(name):
        left_value = left_data.get(name, "")
        right_value = right_data.get(name, "")
        return {
            "field": name,
            "left": left_value,
            "right": right_value,
            "same": left_value == right_value,
        }

    field_names = sorted(left_data.keys() | right_data.keys())
    return {
        "left_id": left_id,
        "right_id": right_id,
        "fields": [build_row(name) for name in field_names if name not in skipped],
    }
# Fields whose combined text identifies a tender for duplicate detection.
_IDENTITY_FIELDS = ("work_name", "package_no", "location", "procuring_entity")


def _identity_text(data: dict) -> str:
    """Join the identity fields of *data* into one lowercase string."""
    parts = []
    for field in _IDENTITY_FIELDS:
        value = data.get(field)
        # A JSON null must not contribute the literal word "None".
        parts.append("" if value is None else str(value))
    return " ".join(parts).lower()


def detect_duplicates(target_id: str, threshold: float = 0.82) -> list[dict]:
    """Find other tenders whose identifying text closely matches the target's.

    The identifying text of a tender is its ``work_name``, ``package_no``,
    ``location`` and ``procuring_entity`` values joined with spaces and
    lowercased. Each other folder under ``OUTPUT_DIR`` is scored against the
    target with ``difflib.SequenceMatcher(...).ratio()``.

    Args:
        target_id: Folder name of the tender to look for duplicates of.
        threshold: Minimum similarity ratio (0.0-1.0) for a folder to be
            reported.

    Returns:
        A list of ``{"tender_id", "similarity", "work_name"}`` dicts, highest
        similarity first; ties are ordered by ``tender_id`` so the result does
        not depend on filesystem listing order. The list is empty when the
        target has no identifying text.
    """
    target_text = _identity_text(_load_data(target_id))
    if not target_text.strip():
        # With nothing to compare, every other empty folder would score 1.0
        # against the blank text and be reported as a false duplicate.
        return []

    matches = []
    for folder in OUTPUT_DIR.glob("*"):
        if not folder.is_dir() or folder.name == target_id:
            continue
        try:
            other = _load_data(folder.name)
        except (OSError, ValueError):
            # One unreadable or corrupt folder must not abort the whole scan
            # (json.JSONDecodeError is a ValueError subclass).
            continue
        if not isinstance(other, dict):
            continue
        other_text = _identity_text(other)
        if not other_text.strip():
            continue  # no identifying data to compare against
        # Argument order is kept (target first): ratio() can differ when the
        # two sequences are swapped.
        score = difflib.SequenceMatcher(None, target_text, other_text).ratio()
        if score >= threshold:
            matches.append(
                {
                    "tender_id": folder.name,
                    "similarity": round(score, 3),
                    "work_name": other.get("work_name", ""),
                }
            )
    return sorted(matches, key=lambda match: (-match["similarity"], match["tender_id"]))