Upload agents/semantic_agent.py with huggingface_hub
Browse files- agents/semantic_agent.py +99 -17
agents/semantic_agent.py
CHANGED
|
@@ -226,7 +226,12 @@ Your 8 analysis domains:
|
|
| 226 |
|
| 227 |
2. TIME OF DAY: Sky color/brightness must match shadow lengths and lighting direction. A bright blue sky requires short shadows (midday) or long shadows from a specific direction. Stars visible + brightly lit ground is contradictory.
|
| 228 |
|
| 229 |
-
3. ERA / TECHNOLOGY ANACHRONISM: Visible technology
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
4. GEOGRAPHIC COHERENCE: Architecture style must match vegetation and climate. Tropical palm trees next to Northern European half-timbered houses is impossible. Road markings should match the apparent country (right-hand vs left-hand traffic, line styles). Visible text/signs should be in the expected language for the geography.
|
| 232 |
|
|
@@ -286,17 +291,22 @@ def _calibrate_vlm_confidence(raw_conf: float) -> float:
|
|
| 286 |
def run_semantic_agent(img):
|
| 287 |
findings, scores = [], []
|
| 288 |
vlm_ok = True
|
|
|
|
|
|
|
| 289 |
|
| 290 |
-
for sys_p, usr_p, name, features in [
|
| 291 |
(SYS_LIGHTING, USR_LIGHTING, "Lighting Physics",
|
| 292 |
["Shadow Direction","Shadow Quality","Specular Consistency","Ambient Occlusion",
|
| 293 |
-
"Color Temperature","Subsurface Scattering","Caustics","Inter-reflections"]
|
|
|
|
| 294 |
(SYS_ANATOMY, USR_ANATOMY, "Anatomical Analysis",
|
| 295 |
["Hand Anatomy","Facial Symmetry","Body Proportions","Skin Texture",
|
| 296 |
-
"Hair Consistency","Eye Details","Clothing Physics"]
|
|
|
|
| 297 |
(SYS_PHYSICS, USR_PHYSICS, "Physical Plausibility",
|
| 298 |
["Material Appearance","Perspective Geometry","Gravity & Structure",
|
| 299 |
-
"Scale & Proportion","Transparency","Contact Physics","Motion Coherence","Depth & Occlusion"]
|
|
|
|
| 300 |
]:
|
| 301 |
try:
|
| 302 |
resp = _vlm(img, sys_p, usr_p)
|
|
@@ -304,18 +314,56 @@ def run_semantic_agent(img):
|
|
| 304 |
parsed = _parse(resp)
|
| 305 |
sc = _score(parsed)
|
| 306 |
|
| 307 |
-
# Calibrate VLM confidence before storing
|
| 308 |
raw_conf = parsed.get("confidence", 0.5)
|
| 309 |
cal_conf = _calibrate_vlm_confidence(raw_conf)
|
| 310 |
|
|
|
|
| 311 |
if name == "Anatomical Analysis" and not parsed.get("contains_people", True):
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
|
| 314 |
anomalies = parsed.get("anomalies", [])
|
|
|
|
|
|
|
|
|
|
| 315 |
for feat in features:
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
|
| 320 |
findings.append({"test": name, "vlm_analysis": parsed, "anomalies": anomalies,
|
| 321 |
"score": sc, "confidence": cal_conf,
|
|
@@ -323,15 +371,18 @@ def run_semantic_agent(img):
|
|
| 323 |
"calibrated_confidence": cal_conf,
|
| 324 |
"note": parsed.get("explanation", "")[:200]})
|
| 325 |
scores.append(sc)
|
|
|
|
|
|
|
| 326 |
else:
|
| 327 |
vlm_ok = False
|
| 328 |
for feat in features:
|
| 329 |
findings.append({"test": feat, "score": 0.0, "note": "VLM unavailable", "vlm_error": True})
|
| 330 |
-
|
| 331 |
except Exception as e:
|
| 332 |
findings.append({"test": name, "error": str(e), "score": 0})
|
|
|
|
| 333 |
|
| 334 |
-
# Context plausibility
|
| 335 |
try:
|
| 336 |
resp = _vlm(img, SYS_CONTEXT, USR_CONTEXT)
|
| 337 |
if resp and not resp.startswith("VLM_ERROR"):
|
|
@@ -347,27 +398,58 @@ def run_semantic_agent(img):
|
|
| 347 |
findings.append({"test": feat, "score": sc / len(context_features),
|
| 348 |
"note": parsed.get("explanation", "")[:100], "parent": "Context"})
|
| 349 |
scores.append(sc / len(context_features))
|
|
|
|
|
|
|
| 350 |
|
| 351 |
findings.append({"test": "Context Plausibility", "vlm_analysis": parsed,
|
| 352 |
"score": sc, "confidence": cal_conf,
|
| 353 |
"note": parsed.get("explanation", "")[:200]})
|
| 354 |
scores.append(sc)
|
|
|
|
|
|
|
| 355 |
else:
|
| 356 |
vlm_ok = False
|
| 357 |
except:
|
| 358 |
pass
|
| 359 |
|
| 360 |
-
|
| 361 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
if not vlm_ok:
|
| 363 |
conf *= 0.3
|
| 364 |
|
| 365 |
-
viol = [f["test"] for f in findings if f.get("score", 0) > 0.15
|
| 366 |
-
|
|
|
|
|
|
|
| 367 |
rat = f"Semantic violations: {', '.join(viol[:5])}." if viol else \
|
| 368 |
f"Semantically consistent: {', '.join(comp[:5])}." if comp else "Semantic inconclusive."
|
| 369 |
for f in findings:
|
| 370 |
-
if f.get("note") and "parent" not in f:
|
| 371 |
rat += f" [{f['test']}]: {f['note'][:100]}."
|
| 372 |
|
| 373 |
return AgentEvidence("Semantic Consistency Agent", np.clip(avg, -1, 1), conf,
|
|
|
|
| 226 |
|
| 227 |
2. TIME OF DAY: Sky color/brightness must match shadow lengths and lighting direction. A bright blue sky requires short shadows (midday) or long shadows from a specific direction. Stars visible + brightly lit ground is contradictory.
|
| 228 |
|
| 229 |
+
3. ERA / TECHNOLOGY ANACHRONISM: Visible technology must match the apparent era of other objects in the scene. Use these concrete anchors:
|
| 230 |
+
- Pre-1990: No flat-screen TVs, no smartphones, no LED lighting, no modern car designs (rounded headlights, DRLs). CRT monitors only. Wired phones only.
|
| 231 |
+
- 1990-2005: Flip phones and early Nokias OK, but no touchscreen smartphones. Boxy CRT monitors, not flat panels. Boxy car designs.
|
| 232 |
+
- 2005-2015: Early smartphones OK, but no notched/hole-punch screens. Flat panels exist but bezels are thick.
|
| 233 |
+
- Post-2015: Thin-bezel phones, wireless earbuds, USB-C cables, modern LED strip lighting.
|
| 234 |
+
If the scene mixes eras (1950s architecture + a person holding a modern iPhone), flag it. Fashion should match the era of other visible technology.
|
| 235 |
|
| 236 |
4. GEOGRAPHIC COHERENCE: Architecture style must match vegetation and climate. Tropical palm trees next to Northern European half-timbered houses is impossible. Road markings should match the apparent country (right-hand vs left-hand traffic, line styles). Visible text/signs should be in the expected language for the geography.
|
| 237 |
|
|
|
|
| 291 |
def run_semantic_agent(img):
|
| 292 |
findings, scores = [], []
|
| 293 |
vlm_ok = True
|
| 294 |
+
n_applicable = 0 # Track how many sub-features were actually applicable
|
| 295 |
+
n_total = 0 # Track total sub-features attempted
|
| 296 |
|
| 297 |
+
for sys_p, usr_p, name, features, null_fields in [
|
| 298 |
(SYS_LIGHTING, USR_LIGHTING, "Lighting Physics",
|
| 299 |
["Shadow Direction","Shadow Quality","Specular Consistency","Ambient Occlusion",
|
| 300 |
+
"Color Temperature","Subsurface Scattering","Caustics","Inter-reflections"],
|
| 301 |
+
{"sss_correct", "caustics_correct", "interreflections_ok"}),
|
| 302 |
(SYS_ANATOMY, USR_ANATOMY, "Anatomical Analysis",
|
| 303 |
["Hand Anatomy","Facial Symmetry","Body Proportions","Skin Texture",
|
| 304 |
+
"Hair Consistency","Eye Details","Clothing Physics"],
|
| 305 |
+
set()),
|
| 306 |
(SYS_PHYSICS, USR_PHYSICS, "Physical Plausibility",
|
| 307 |
["Material Appearance","Perspective Geometry","Gravity & Structure",
|
| 308 |
+
"Scale & Proportion","Transparency","Contact Physics","Motion Coherence","Depth & Occlusion"],
|
| 309 |
+
{"transparency_ok", "motion_ok"}),
|
| 310 |
]:
|
| 311 |
try:
|
| 312 |
resp = _vlm(img, sys_p, usr_p)
|
|
|
|
| 314 |
parsed = _parse(resp)
|
| 315 |
sc = _score(parsed)
|
| 316 |
|
|
|
|
| 317 |
raw_conf = parsed.get("confidence", 0.5)
|
| 318 |
cal_conf = _calibrate_vlm_confidence(raw_conf)
|
| 319 |
|
| 320 |
+
# Fix 3: Anatomy on non-human images — tag as not_applicable
|
| 321 |
if name == "Anatomical Analysis" and not parsed.get("contains_people", True):
|
| 322 |
+
for feat in features:
|
| 323 |
+
findings.append({"test": feat, "score": 0.0,
|
| 324 |
+
"note": "No people in image — not applicable",
|
| 325 |
+
"not_applicable": True, "parent": name})
|
| 326 |
+
# NOT added to scores — these should not dilute the posterior
|
| 327 |
+
n_total += len(features)
|
| 328 |
+
findings.append({"test": name, "vlm_analysis": parsed,
|
| 329 |
+
"score": 0.0, "confidence": cal_conf,
|
| 330 |
+
"not_applicable": True,
|
| 331 |
+
"note": "No people detected — anatomy analysis skipped"})
|
| 332 |
+
continue
|
| 333 |
|
| 334 |
anomalies = parsed.get("anomalies", [])
|
| 335 |
+
|
| 336 |
+
# Fix 2: Count applicable sub-features (exclude nulls)
|
| 337 |
+
applicable_features = []
|
| 338 |
for feat in features:
|
| 339 |
+
# Check if VLM returned null for the corresponding field
|
| 340 |
+
field_map = {f: k for f, k in zip(features, parsed.keys()) if k in null_fields}
|
| 341 |
+
is_null = False
|
| 342 |
+
for nf in null_fields:
|
| 343 |
+
if parsed.get(nf) is None:
|
| 344 |
+
# Map null field back to feature name (approximate)
|
| 345 |
+
if any(nf_word in feat.lower() for nf_word in nf.replace("_ok","").replace("_correct","").split("_")):
|
| 346 |
+
is_null = True
|
| 347 |
+
break
|
| 348 |
+
|
| 349 |
+
if is_null:
|
| 350 |
+
findings.append({"test": feat, "score": 0.0,
|
| 351 |
+
"note": "Not applicable to this image",
|
| 352 |
+
"not_applicable": True, "parent": name})
|
| 353 |
+
n_total += 1
|
| 354 |
+
else:
|
| 355 |
+
applicable_features.append(feat)
|
| 356 |
+
|
| 357 |
+
# Distribute score only across applicable features
|
| 358 |
+
n_applicable_here = len(applicable_features)
|
| 359 |
+
if n_applicable_here > 0:
|
| 360 |
+
per_feat_score = sc / n_applicable_here
|
| 361 |
+
for feat in applicable_features:
|
| 362 |
+
findings.append({"test": feat, "score": per_feat_score,
|
| 363 |
+
"note": parsed.get("explanation", "")[:100], "parent": name})
|
| 364 |
+
scores.append(per_feat_score)
|
| 365 |
+
n_applicable += 1
|
| 366 |
+
n_total += 1
|
| 367 |
|
| 368 |
findings.append({"test": name, "vlm_analysis": parsed, "anomalies": anomalies,
|
| 369 |
"score": sc, "confidence": cal_conf,
|
|
|
|
| 371 |
"calibrated_confidence": cal_conf,
|
| 372 |
"note": parsed.get("explanation", "")[:200]})
|
| 373 |
scores.append(sc)
|
| 374 |
+
n_applicable += 1
|
| 375 |
+
n_total += 1
|
| 376 |
else:
|
| 377 |
vlm_ok = False
|
| 378 |
for feat in features:
|
| 379 |
findings.append({"test": feat, "score": 0.0, "note": "VLM unavailable", "vlm_error": True})
|
| 380 |
+
n_total += len(features)
|
| 381 |
except Exception as e:
|
| 382 |
findings.append({"test": name, "error": str(e), "score": 0})
|
| 383 |
+
n_total += 1
|
| 384 |
|
| 385 |
+
# Context plausibility
|
| 386 |
try:
|
| 387 |
resp = _vlm(img, SYS_CONTEXT, USR_CONTEXT)
|
| 388 |
if resp and not resp.startswith("VLM_ERROR"):
|
|
|
|
| 398 |
findings.append({"test": feat, "score": sc / len(context_features),
|
| 399 |
"note": parsed.get("explanation", "")[:100], "parent": "Context"})
|
| 400 |
scores.append(sc / len(context_features))
|
| 401 |
+
n_applicable += 1
|
| 402 |
+
n_total += 1
|
| 403 |
|
| 404 |
findings.append({"test": "Context Plausibility", "vlm_analysis": parsed,
|
| 405 |
"score": sc, "confidence": cal_conf,
|
| 406 |
"note": parsed.get("explanation", "")[:200]})
|
| 407 |
scores.append(sc)
|
| 408 |
+
n_applicable += 1
|
| 409 |
+
n_total += 1
|
| 410 |
else:
|
| 411 |
vlm_ok = False
|
| 412 |
except:
|
| 413 |
pass
|
| 414 |
|
| 415 |
+
# Fix 1: Confidence floor — distinguish genuinely neutral from cancelled-out
|
| 416 |
+
if scores:
|
| 417 |
+
avg = float(np.mean(scores))
|
| 418 |
+
# Check if scores genuinely agree on neutral vs. cancelling each other out
|
| 419 |
+
score_signs = [1 if s > 0.05 else (-1 if s < -0.05 else 0) for s in scores]
|
| 420 |
+
n_positive = sum(1 for s in score_signs if s > 0)
|
| 421 |
+
n_negative = sum(1 for s in score_signs if s < 0)
|
| 422 |
+
n_neutral = sum(1 for s in score_signs if s == 0)
|
| 423 |
+
|
| 424 |
+
if n_positive > 0 and n_negative > 0:
|
| 425 |
+
# Scores cancelled out — LOW confidence, not 0.4
|
| 426 |
+
agreement = max(n_positive, n_negative) / (n_positive + n_negative)
|
| 427 |
+
conf = min(1.0, 0.15 + 0.5 * abs(avg) * agreement)
|
| 428 |
+
elif n_neutral == len(score_signs):
|
| 429 |
+
# Everything genuinely neutral (VLM said 0 for everything) — low confidence
|
| 430 |
+
conf = 0.2
|
| 431 |
+
else:
|
| 432 |
+
# Scores agree in direction β confidence scales with magnitude
|
| 433 |
+
conf = min(1.0, 0.3 + 0.6 * abs(avg))
|
| 434 |
+
|
| 435 |
+
# Scale by coverage: fewer applicable features = lower confidence
|
| 436 |
+
coverage = n_applicable / max(n_total, 1)
|
| 437 |
+
conf *= max(0.3, coverage)
|
| 438 |
+
else:
|
| 439 |
+
avg = 0.0
|
| 440 |
+
conf = 0.1
|
| 441 |
+
|
| 442 |
if not vlm_ok:
|
| 443 |
conf *= 0.3
|
| 444 |
|
| 445 |
+
viol = [f["test"] for f in findings if f.get("score", 0) > 0.15
|
| 446 |
+
and "parent" not in f and not f.get("not_applicable")]
|
| 447 |
+
comp = [f["test"] for f in findings if f.get("score", 0) < -0.1
|
| 448 |
+
and "parent" not in f and not f.get("not_applicable")]
|
| 449 |
rat = f"Semantic violations: {', '.join(viol[:5])}." if viol else \
|
| 450 |
f"Semantically consistent: {', '.join(comp[:5])}." if comp else "Semantic inconclusive."
|
| 451 |
for f in findings:
|
| 452 |
+
if f.get("note") and "parent" not in f and not f.get("not_applicable"):
|
| 453 |
rat += f" [{f['test']}]: {f['note'][:100]}."
|
| 454 |
|
| 455 |
return AgentEvidence("Semantic Consistency Agent", np.clip(avg, -1, 1), conf,
|