File size: 3,949 Bytes
42d88ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from difflib import SequenceMatcher
from typing import Optional

# Keyword vocabulary for each issue category.
#
# Keys are the category labels used for lookup in
# match_description_to_category(); values are the lowercase keywords that the
# words of a user-supplied description are compared against.
#
# Things to keep in mind when editing this table:
# - Descriptions are split into single whitespace-separated words before
#   matching, so multi-word entries (e.g. "no parking", "light pole") are never
#   compared as a whole phrase against the description.
# - Some keywords appear under several categories (e.g. "broken", "crack",
#   "blocking", "sidewalk"), so one description word can support more than
#   one category.
CATEGORY_KEYWORDS: dict[str, list[str]] = {
    "Damaged Road Issues": [
        "road", "damage", "damaged", "broken", "crack", "cracked", "pavement",
        "asphalt", "street", "highway", "lane", "surface", "rough", "uneven"
    ],
    "Pothole Issues": [
        "pothole", "hole", "pit", "crater", "dip", "depression", "bump",
        "cavity", "hollow", "gap"
    ],
    "Illegal Parking Issues": [
        "parking", "parked", "car", "vehicle", "illegal", "wrong", "blocking",
        "obstruction", "no parking", "double park", "sidewalk"
    ],
    "Broken Road Sign Issues": [
        "sign", "signboard", "traffic sign", "road sign", "broken sign",
        "fallen sign", "damaged sign", "missing sign", "bent"
    ],
    "Fallen Trees": [
        "tree", "fallen", "branch", "trunk", "uprooted", "collapsed",
        "blocking", "storm", "wind", "timber"
    ],
    "Littering/Garbage on Public Places": [
        "garbage", "trash", "litter", "waste", "rubbish", "dump", "dirty",
        "filth", "debris", "plastic", "pile", "mess", "junk", "disposal"
    ],
    "Vandalism Issues": [
        "vandal", "graffiti", "spray", "paint", "defaced", "broken",
        "smashed", "destroyed", "damaged property", "torn"
    ],
    "Dead Animal Pollution": [
        "dead", "animal", "carcass", "body", "corpse", "rotting", "smell",
        "stink", "dog", "cat", "bird", "cow", "roadkill"
    ],
    "Damaged Concrete Structures": [
        "concrete", "structure", "wall", "pillar", "bridge", "flyover",
        "footpath", "sidewalk", "curb", "crack", "broken"
    ],
    "Damaged Electric Wires and Poles": [
        "electric", "wire", "pole", "cable", "power", "electricity",
        "hanging", "exposed", "sparking", "transformer", "light pole"
    ],
}


def normalize_text(text: str) -> str:
    """Return *text* lower-cased with surrounding whitespace removed."""
    lowered = text.lower()
    return lowered.strip()


def calculate_similarity(s1: str, s2: str) -> float:
    """Return the case-insensitive similarity ratio of two strings.

    Both inputs are lower-cased and then compared with
    ``difflib.SequenceMatcher`` (no junk filter). The result is the matcher's
    ``ratio()``: a float between 0.0 (nothing in common) and 1.0 (identical).
    """
    first, second = s1.lower(), s2.lower()
    matcher = SequenceMatcher(isjunk=None, a=first, b=second)
    return matcher.ratio()


def fuzzy_match_word(word: str, keywords: list[str], threshold: float = 0.7) -> bool:
    """Report whether *word* plausibly refers to any entry of *keywords*.

    The word is normalized first and then tested against each keyword. It
    matches when any of the following holds:

    * it equals the keyword;
    * the keyword occurs inside the word (e.g. "pothole" in "potholes");
    * the word has at least 4 characters and either occurs inside the keyword
      (e.g. "park" in "parking") or its similarity ratio to the keyword is at
      least *threshold*.

    Args:
        word: A single word taken from a description.
        keywords: Candidate keywords to compare against.
        threshold: Minimum similarity ratio (0.0-1.0) for an inexact match.

    Returns:
        True if the word matches at least one keyword, otherwise False.
    """
    word = normalize_text(word)
    if not word:
        # "" is a substring of every string, so without this guard an empty
        # or whitespace-only word would match any keyword.
        return False

    # Inexact matching is only trusted for words of 4+ characters. Shorter
    # words are too often accidental fragments of unrelated keywords
    # ("and" in "vandal", "was" in "waste", "for" in "transformer").
    allow_inexact = len(word) >= 4

    for keyword in keywords:
        if not keyword:
            # An empty keyword would be "contained" in every word.
            continue
        if word == keyword or keyword in word:
            return True
        if allow_inexact and (
            word in keyword or calculate_similarity(word, keyword) >= threshold
        ):
            return True
    return False


def match_description_to_category(
    description: Optional[str],
    detected_category: str,
    threshold: float = 0.6
) -> tuple[bool, float, list[str]]:
    """Check a free-text description against one category's keywords.

    The description is normalized, commas and periods are turned into spaces,
    and the text is split on whitespace. Every word of at least 3 characters
    is tested with ``fuzzy_match_word`` against the keywords registered for
    *detected_category* in ``CATEGORY_KEYWORDS``.

    Args:
        description: Text to inspect; ``None`` or empty yields no match.
        detected_category: Key into ``CATEGORY_KEYWORDS``; an unknown category
            (or one with no keywords) yields no match.
        threshold: Score that also counts as a match. Because a single matched
            word is already sufficient, this only changes the outcome when no
            word matched (score 0.0), i.e. for thresholds <= 0.

    Returns:
        ``(is_match, match_score, matched_words)`` where ``match_score`` is the
        number of matched words divided by the total number of words (short
        words included), and ``matched_words`` lists the matching words in
        order of appearance.
    """
    if not description:
        return False, 0.0, []

    category_keywords = CATEGORY_KEYWORDS.get(detected_category, [])
    if not category_keywords:
        return False, 0.0, []

    # Treat commas and periods as word separators before tokenizing.
    cleaned = normalize_text(description)
    for separator in (",", "."):
        cleaned = cleaned.replace(separator, " ")
    tokens = cleaned.split()
    if not tokens:
        return False, 0.0, []

    # Words shorter than 3 characters are never tested, but they still count
    # towards the denominator of the score below.
    hits = [
        token
        for token in tokens
        if len(token) >= 3 and fuzzy_match_word(token, category_keywords)
    ]

    score = len(hits) / len(tokens)
    matched = bool(hits) or score >= threshold
    return matched, score, hits


def auto_validate_issue(
    description: Optional[str],
    detected_categories: list[str],
    confidence_threshold: float = 0.5
) -> tuple[bool, str]:
    """Decide whether a reported issue can be validated automatically.

    Each detected category is checked in order with
    ``match_description_to_category``; the first category whose keywords match
    the description validates the issue.

    Args:
        description: Free-text description supplied with the report.
        detected_categories: Category labels to check the description against.
        confidence_threshold: Accepted for interface compatibility; the
            function body does not read it.

    Returns:
        ``(validated, message)``. ``validated`` is True when some category
        matched; ``message`` explains the outcome either way.
    """
    if not (description and detected_categories):
        return False, "No description or no detections for auto-validation"

    for category in detected_categories:
        outcome = match_description_to_category(description, category)
        is_match, matched_words = outcome[0], outcome[2]
        if not is_match:
            continue
        # First matching category wins; remaining categories are not checked.
        return True, f"Auto-validated: '{category}' matched with keywords: {matched_words}"

    return False, f"Manual verification required: no match between description and detected categories {detected_categories}"