Spaces:

0xarchit
/

UrbanLens

Sleeping

App Files Files Community

UrbanLens / Backend /utils /fuzzy_match.py

0xarchit

Initial Hugging Face Space - Backend deployment

42d88ae 3 months ago

raw

history blame contribute delete

3.95 kB

	from difflib import SequenceMatcher
	from typing import Optional

	# Keyword vocabulary for each issue category, keyed by the category label.
	# Values are lower-case keywords; some entries are multi-word phrases
	# (e.g. "no parking"). A few keywords ("broken", "crack", "blocking",
	# "sidewalk") appear under more than one category.
	CATEGORY_KEYWORDS: dict[str, list[str]] = {
	    "Damaged Road Issues": [
	        "road", "damage", "damaged", "broken", "crack",
	        "cracked", "pavement", "asphalt", "street", "highway",
	        "lane", "surface", "rough", "uneven",
	    ],
	    "Pothole Issues": [
	        "pothole", "hole", "pit", "crater", "dip",
	        "depression", "bump", "cavity", "hollow", "gap",
	    ],
	    "Illegal Parking Issues": [
	        "parking", "parked", "car", "vehicle", "illegal",
	        "wrong", "blocking", "obstruction", "no parking", "double park",
	        "sidewalk",
	    ],
	    "Broken Road Sign Issues": [
	        "sign", "signboard", "traffic sign", "road sign", "broken sign",
	        "fallen sign", "damaged sign", "missing sign", "bent",
	    ],
	    "Fallen Trees": [
	        "tree", "fallen", "branch", "trunk", "uprooted",
	        "collapsed", "blocking", "storm", "wind", "timber",
	    ],
	    "Littering/Garbage on Public Places": [
	        "garbage", "trash", "litter", "waste", "rubbish",
	        "dump", "dirty", "filth", "debris", "plastic",
	        "pile", "mess", "junk", "disposal",
	    ],
	    "Vandalism Issues": [
	        "vandal", "graffiti", "spray", "paint", "defaced",
	        "broken", "smashed", "destroyed", "damaged property", "torn",
	    ],
	    "Dead Animal Pollution": [
	        "dead", "animal", "carcass", "body", "corpse",
	        "rotting", "smell", "stink", "dog", "cat",
	        "bird", "cow", "roadkill",
	    ],
	    "Damaged Concrete Structures": [
	        "concrete", "structure", "wall", "pillar", "bridge",
	        "flyover", "footpath", "sidewalk", "curb", "crack",
	        "broken",
	    ],
	    "Damaged Electric Wires and Poles": [
	        "electric", "wire", "pole", "cable", "power",
	        "electricity", "hanging", "exposed", "sparking", "transformer",
	        "light pole",
	    ],
	}


	def normalize_text(text: str) -> str:
	    """Return *text* lower-cased with surrounding whitespace removed.

	    Args:
	        text: The string to normalize.

	    Returns:
	        The lower-cased string, stripped of leading and trailing whitespace.
	    """
	    lowered = text.lower()
	    return lowered.strip()


	def calculate_similarity(s1: str, s2: str) -> float:
	    """Return the case-insensitive similarity ratio of two strings.

	    Both inputs are lower-cased and compared with
	    ``difflib.SequenceMatcher`` (no junk function).

	    Args:
	        s1: First string.
	        s2: Second string.

	    Returns:
	        The value of ``SequenceMatcher.ratio()`` for the lower-cased inputs.
	    """
	    left, right = s1.lower(), s2.lower()
	    matcher = SequenceMatcher(None, left, right)
	    return matcher.ratio()


	def fuzzy_match_word(word: str, keywords: list[str], threshold: float = 0.7) -> bool:
	    """Return True if *word* matches any entry of *keywords*.

	    The word is normalized with ``normalize_text`` and then counts as a
	    match for a keyword when any of the following holds:

	    * the keyword is contained in the word, or the word is contained in
	      the keyword (this includes exact equality);
	    * the word has at least 4 characters and its ``calculate_similarity``
	      score against the keyword is at least *threshold*.

	    Args:
	        word: Candidate word.
	        keywords: Keywords to compare against.
	        threshold: Minimum similarity score for a fuzzy match.

	    Returns:
	        True on the first matching keyword, otherwise False. An empty or
	        whitespace-only word never matches.
	    """
	    word = normalize_text(word)
	    if not word:
	        # "" is a substring of every string, so without this guard an empty
	        # word would "match" any non-empty keyword list.
	        return False

	    # Similarity scoring is only applied to words of 4+ characters; the
	    # length does not change inside the loop, so test it once.
	    allow_fuzzy = len(word) >= 4

	    for keyword in keywords:
	        # Cheap containment tests first (equal strings contain each other).
	        # NOTE: containment is deliberately loose in both directions, so a
	        # short word such as "and" matches a keyword such as "vandal".
	        if keyword in word or word in keyword:
	            return True
	        if allow_fuzzy and calculate_similarity(word, keyword) >= threshold:
	            return True
	    return False


	def match_description_to_category(
	    description: Optional[str],
	    detected_category: str,
	    threshold: float = 0.6
	) -> tuple[bool, float, list[str]]:
	    """Check whether a free-text description mentions a category's keywords.

	    The description is normalized, commas and periods are replaced by
	    spaces, and the text is split on whitespace. Every word of at least 3
	    characters is tested with ``fuzzy_match_word`` (using that function's
	    default threshold) against the keywords registered for
	    *detected_category* in ``CATEGORY_KEYWORDS``.

	    Args:
	        description: Description text; may be None or empty.
	        detected_category: Category label used to look up keywords.
	        threshold: Score at or above which the result is a match. Because
	            a single matched word already makes the result a match, this
	            only changes the outcome when it is <= 0.

	    Returns:
	        ``(is_match, match_score, matched_words)``, where ``match_score`` is
	        the number of matched words divided by the total number of words
	        (short words included in the total). ``(False, 0.0, [])`` is
	        returned when the description is empty, the category has no
	        keywords, or the description contains no words.
	    """
	    if not description:
	        return False, 0.0, []

	    keywords = CATEGORY_KEYWORDS.get(detected_category, [])
	    if not keywords:
	        return False, 0.0, []

	    cleaned = normalize_text(description)
	    for separator in (",", "."):
	        cleaned = cleaned.replace(separator, " ")
	    tokens = cleaned.split()
	    if not tokens:
	        return False, 0.0, []

	    # Words shorter than 3 characters are never tested, but they still
	    # count towards the denominator of the score below.
	    hits = [
	        token
	        for token in tokens
	        if len(token) >= 3 and fuzzy_match_word(token, keywords)
	    ]

	    score = len(hits) / len(tokens)
	    matched = bool(hits) or score >= threshold
	    return matched, score, hits


	def auto_validate_issue(
	    description: Optional[str],
	    detected_categories: list[str],
	    confidence_threshold: float = 0.5
	) -> tuple[bool, str]:
	    """Decide whether a description agrees with any detected category.

	    Categories are tried in the given order with
	    ``match_description_to_category``; the first one that matches wins.

	    Args:
	        description: Description text; may be None or empty.
	        detected_categories: Category labels to check.
	        confidence_threshold: Accepted for interface compatibility; this
	            function does not use it.

	    Returns:
	        ``(validated, message)``. ``validated`` is True when some category
	        matched, and ``message`` then names that category and the matched
	        words. Otherwise ``validated`` is False and ``message`` explains
	        why (missing input, or no category matched).
	    """
	    if not (description and detected_categories):
	        return False, "No description or no detections for auto-validation"

	    for category in detected_categories:
	        is_match, _score, matched_words = match_description_to_category(
	            description, category
	        )
	        if not is_match:
	            continue
	        return True, f"Auto-validated: '{category}' matched with keywords: {matched_words}"

	    return False, f"Manual verification required: no match between description and detected categories {detected_categories}"