UrbanLens / Backend /utils /fuzzy_match.py
0xarchit's picture
Initial Hugging Face Space - Backend deployment
42d88ae
from difflib import SequenceMatcher
from typing import Optional
# Keyword table used to check whether a free-text description agrees with a
# detected category. Keys are the category labels looked up by
# match_description_to_category(); values are the lowercase keywords/phrases
# associated with that category.
#
# Multi-word entries (e.g. "no parking") can never equal a single description
# word, because descriptions are split on whitespace before matching; they can
# only match through the partial/fuzzy comparisons in fuzzy_match_word().
#
# Some keywords ("broken", "crack", "blocking", "sidewalk") appear under more
# than one category, so a single description word can support several
# categories.
CATEGORY_KEYWORDS: dict[str, list[str]] = {
"Damaged Road Issues": [
"road", "damage", "damaged", "broken", "crack", "cracked", "pavement",
"asphalt", "street", "highway", "lane", "surface", "rough", "uneven"
],
"Pothole Issues": [
"pothole", "hole", "pit", "crater", "dip", "depression", "bump",
"cavity", "hollow", "gap"
],
"Illegal Parking Issues": [
"parking", "parked", "car", "vehicle", "illegal", "wrong", "blocking",
"obstruction", "no parking", "double park", "sidewalk"
],
"Broken Road Sign Issues": [
"sign", "signboard", "traffic sign", "road sign", "broken sign",
"fallen sign", "damaged sign", "missing sign", "bent"
],
"Fallen Trees": [
"tree", "fallen", "branch", "trunk", "uprooted", "collapsed",
"blocking", "storm", "wind", "timber"
],
"Littering/Garbage on Public Places": [
"garbage", "trash", "litter", "waste", "rubbish", "dump", "dirty",
"filth", "debris", "plastic", "pile", "mess", "junk", "disposal"
],
"Vandalism Issues": [
"vandal", "graffiti", "spray", "paint", "defaced", "broken",
"smashed", "destroyed", "damaged property", "torn"
],
"Dead Animal Pollution": [
"dead", "animal", "carcass", "body", "corpse", "rotting", "smell",
"stink", "dog", "cat", "bird", "cow", "roadkill"
],
"Damaged Concrete Structures": [
"concrete", "structure", "wall", "pillar", "bridge", "flyover",
"footpath", "sidewalk", "curb", "crack", "broken"
],
"Damaged Electric Wires and Poles": [
"electric", "wire", "pole", "cable", "power", "electricity",
"hanging", "exposed", "sparking", "transformer", "light pole"
],
}
def normalize_text(text: str) -> str:
    """Return *text* lower-cased with leading/trailing whitespace removed."""
    lowered = text.lower()
    return lowered.strip()
def calculate_similarity(s1: str, s2: str) -> float:
    """Return the case-insensitive ``SequenceMatcher`` ratio of two strings.

    Both inputs are lower-cased before comparison; no junk filter is used.
    The result is the value of ``SequenceMatcher.ratio()`` for the pair.
    """
    first, second = s1.lower(), s2.lower()
    matcher = SequenceMatcher(a=first, b=second)
    return matcher.ratio()
def fuzzy_match_word(
    word: str,
    keywords: list[str],
    threshold: float = 0.7,
    *,
    min_partial_length: int = 4,
) -> bool:
    """Return True if *word* matches any entry of *keywords*.

    A keyword matches when, after normalising both sides, any of these hold:

    * the word equals the keyword;
    * the keyword is contained in the word (e.g. "pothole" in "potholes");
    * the word has at least ``min_partial_length`` characters and is either
      contained in the keyword or has a similarity ratio of at least
      ``threshold`` with it.

    Args:
        word: A single token taken from a description.
        keywords: Candidate keywords or phrases to compare against.
        threshold: Minimum ``calculate_similarity`` ratio for a fuzzy match.
        min_partial_length: Minimum word length before the word may match as
            a fragment of a keyword or by similarity. Shorter words only
            match exactly or when a keyword is contained in them.

    Returns:
        True if at least one keyword matches, otherwise False.
    """
    word = normalize_text(word)
    if not word:
        # "" is a substring of every string, so an empty word would otherwise
        # match any non-empty keyword list.
        return False

    # Short words such as "and" or "for" must not match just because they
    # happen to occur inside a keyword ("vandal", "transformer"); apply the
    # same length guard to fragment matching that fuzzy similarity uses.
    allow_partial = len(word) >= min_partial_length

    for keyword in keywords:
        # Normalise so equality/containment agree with the case-insensitive
        # similarity check.
        keyword = normalize_text(keyword)
        if not keyword:
            # An empty keyword is contained in every word; ignore it.
            continue
        if word == keyword or keyword in word:
            return True
        if allow_partial and (
            word in keyword
            or calculate_similarity(word, keyword) >= threshold
        ):
            return True
    return False
def match_description_to_category(
    description: Optional[str],
    detected_category: str,
    threshold: float = 0.6
) -> tuple[bool, float, list[str]]:
    """Check whether a description mentions keywords of a detected category.

    The description is normalised, commas and periods are turned into
    spaces, and the text is split on whitespace. Words shorter than three
    characters are never tested, but they still count towards the total
    used as the score denominator.

    Args:
        description: Free-text description; ``None`` or empty yields no match.
        detected_category: Key into ``CATEGORY_KEYWORDS``; an unknown category
            (or one with no keywords) yields no match.
        threshold: Score at or above which the result counts as a match even
            without matched words. Because any single matched word already
            counts as a match, this only changes the outcome when it is <= 0.

    Returns:
        ``(is_match, match_score, matched_words)`` where ``match_score`` is
        the number of matched words divided by the number of words in the
        description, and ``matched_words`` lists the matching words in order.
    """
    if not description:
        return False, 0.0, []

    category_keywords = CATEGORY_KEYWORDS.get(detected_category, [])
    if not category_keywords:
        return False, 0.0, []

    # Replace the two handled punctuation marks with spaces in one pass.
    cleaned = normalize_text(description).translate(str.maketrans(",.", "  "))
    tokens = cleaned.split()
    if not tokens:
        return False, 0.0, []

    hits = [
        token
        for token in tokens
        if len(token) >= 3 and fuzzy_match_word(token, category_keywords)
    ]
    score = len(hits) / len(tokens)
    matched = bool(hits) or score >= threshold
    return matched, score, hits
def auto_validate_issue(
    description: Optional[str],
    detected_categories: list[str],
    confidence_threshold: float = 0.5
) -> tuple[bool, str]:
    """Decide whether a report can be validated without manual review.

    Each detected category is checked in order with
    ``match_description_to_category``; the first category whose keywords
    match the description validates the issue.

    Args:
        description: Free-text description supplied with the report.
        detected_categories: Category labels to compare the description with.
        confidence_threshold: Accepted for interface compatibility; the
            current implementation does not use it.

    Returns:
        ``(validated, message)``. ``validated`` is True on the first matching
        category; ``message`` explains the outcome.
    """
    if not (description and detected_categories):
        return False, "No description or no detections for auto-validation"

    for category in detected_categories:
        outcome = match_description_to_category(description, category)
        is_match, matched_words = outcome[0], outcome[2]
        if not is_match:
            continue
        return True, f"Auto-validated: '{category}' matched with keywords: {matched_words}"

    return False, f"Manual verification required: no match between description and detected categories {detected_categories}"