anky2002 committed on
Commit
390f4c5
·
verified ·
1 Parent(s): 4d66672

Upload agents/semantic_agent.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. agents/semantic_agent.py +99 -17
agents/semantic_agent.py CHANGED
@@ -226,7 +226,12 @@ Your 8 analysis domains:
226
 
227
  2. TIME OF DAY: Sky color/brightness must match shadow lengths and lighting direction. A bright blue sky requires short shadows (midday) or long shadows from a specific direction. Stars visible + brightly lit ground is contradictory.
228
 
229
- 3. ERA / TECHNOLOGY ANACHRONISM: Visible technology (phones, cars, screens, signage style) should match the apparent era. A scene with 1950s architecture containing modern smartphones is suspicious. Fashion should match the apparent era of other objects.
 
 
 
 
 
230
 
231
  4. GEOGRAPHIC COHERENCE: Architecture style must match vegetation and climate. Tropical palm trees next to Northern European half-timbered houses is impossible. Road markings should match the apparent country (right-hand vs left-hand traffic, line styles). Visible text/signs should be in the expected language for the geography.
232
 
@@ -286,17 +291,22 @@ def _calibrate_vlm_confidence(raw_conf: float) -> float:
286
  def run_semantic_agent(img):
287
  findings, scores = [], []
288
  vlm_ok = True
 
 
289
 
290
- for sys_p, usr_p, name, features in [
291
  (SYS_LIGHTING, USR_LIGHTING, "Lighting Physics",
292
  ["Shadow Direction","Shadow Quality","Specular Consistency","Ambient Occlusion",
293
- "Color Temperature","Subsurface Scattering","Caustics","Inter-reflections"]),
 
294
  (SYS_ANATOMY, USR_ANATOMY, "Anatomical Analysis",
295
  ["Hand Anatomy","Facial Symmetry","Body Proportions","Skin Texture",
296
- "Hair Consistency","Eye Details","Clothing Physics"]),
 
297
  (SYS_PHYSICS, USR_PHYSICS, "Physical Plausibility",
298
  ["Material Appearance","Perspective Geometry","Gravity & Structure",
299
- "Scale & Proportion","Transparency","Contact Physics","Motion Coherence","Depth & Occlusion"]),
 
300
  ]:
301
  try:
302
  resp = _vlm(img, sys_p, usr_p)
@@ -304,18 +314,56 @@ def run_semantic_agent(img):
304
  parsed = _parse(resp)
305
  sc = _score(parsed)
306
 
307
- # Calibrate VLM confidence before storing
308
  raw_conf = parsed.get("confidence", 0.5)
309
  cal_conf = _calibrate_vlm_confidence(raw_conf)
310
 
 
311
  if name == "Anatomical Analysis" and not parsed.get("contains_people", True):
312
- sc = 0.0
 
 
 
 
 
 
 
 
 
 
313
 
314
  anomalies = parsed.get("anomalies", [])
 
 
 
315
  for feat in features:
316
- findings.append({"test": feat, "score": sc / len(features),
317
- "note": parsed.get("explanation", "")[:100], "parent": name})
318
- scores.append(sc / len(features))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  findings.append({"test": name, "vlm_analysis": parsed, "anomalies": anomalies,
321
  "score": sc, "confidence": cal_conf,
@@ -323,15 +371,18 @@ def run_semantic_agent(img):
323
  "calibrated_confidence": cal_conf,
324
  "note": parsed.get("explanation", "")[:200]})
325
  scores.append(sc)
 
 
326
  else:
327
  vlm_ok = False
328
  for feat in features:
329
  findings.append({"test": feat, "score": 0.0, "note": "VLM unavailable", "vlm_error": True})
330
- scores.append(0.0)
331
  except Exception as e:
332
  findings.append({"test": name, "error": str(e), "score": 0})
 
333
 
334
- # Context plausibility (expanded to 8 sub-features)
335
  try:
336
  resp = _vlm(img, SYS_CONTEXT, USR_CONTEXT)
337
  if resp and not resp.startswith("VLM_ERROR"):
@@ -347,27 +398,58 @@ def run_semantic_agent(img):
347
  findings.append({"test": feat, "score": sc / len(context_features),
348
  "note": parsed.get("explanation", "")[:100], "parent": "Context"})
349
  scores.append(sc / len(context_features))
 
 
350
 
351
  findings.append({"test": "Context Plausibility", "vlm_analysis": parsed,
352
  "score": sc, "confidence": cal_conf,
353
  "note": parsed.get("explanation", "")[:200]})
354
  scores.append(sc)
 
 
355
  else:
356
  vlm_ok = False
357
  except:
358
  pass
359
 
360
- avg = float(np.mean(scores)) if scores else 0.0
361
- conf = min(1.0, 0.4 + 0.5 * abs(avg))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  if not vlm_ok:
363
  conf *= 0.3
364
 
365
- viol = [f["test"] for f in findings if f.get("score", 0) > 0.15 and "parent" not in f]
366
- comp = [f["test"] for f in findings if f.get("score", 0) < -0.1 and "parent" not in f]
 
 
367
  rat = f"Semantic violations: {', '.join(viol[:5])}." if viol else \
368
  f"Semantically consistent: {', '.join(comp[:5])}." if comp else "Semantic inconclusive."
369
  for f in findings:
370
- if f.get("note") and "parent" not in f:
371
  rat += f" [{f['test']}]: {f['note'][:100]}."
372
 
373
  return AgentEvidence("Semantic Consistency Agent", np.clip(avg, -1, 1), conf,
 
226
 
227
  2. TIME OF DAY: Sky color/brightness must match shadow lengths and lighting direction. A bright blue sky requires short shadows (midday) or long shadows from a specific direction. Stars visible + brightly lit ground is contradictory.
228
 
229
+ 3. ERA / TECHNOLOGY ANACHRONISM: Visible technology must match the apparent era of other objects in the scene. Use these concrete anchors:
230
+ - Pre-1990: No flat-screen TVs, no smartphones, no LED lighting, no modern car designs (rounded headlights, DRLs). CRT monitors only. Wired phones only.
231
+ - 1990-2005: Flip phones and early Nokias OK, but no touchscreen smartphones. Boxy CRT monitors, not flat panels. Boxy car designs.
232
+ - 2005-2015: Early smartphones OK, but no notched/hole-punch screens. Flat panels exist but bezels are thick.
233
+ - Post-2015: Thin-bezel phones, wireless earbuds, USB-C cables, modern LED strip lighting.
234
+ If the scene mixes eras (1950s architecture + a person holding a modern iPhone), flag it. Fashion should match the era of other visible technology.
235
 
236
  4. GEOGRAPHIC COHERENCE: Architecture style must match vegetation and climate. Tropical palm trees next to Northern European half-timbered houses is impossible. Road markings should match the apparent country (right-hand vs left-hand traffic, line styles). Visible text/signs should be in the expected language for the geography.
237
 
 
291
  def run_semantic_agent(img):
292
  findings, scores = [], []
293
  vlm_ok = True
294
+ n_applicable = 0 # Track how many sub-features were actually applicable
295
+ n_total = 0 # Track total sub-features attempted
296
 
297
+ for sys_p, usr_p, name, features, null_fields in [
298
  (SYS_LIGHTING, USR_LIGHTING, "Lighting Physics",
299
  ["Shadow Direction","Shadow Quality","Specular Consistency","Ambient Occlusion",
300
+ "Color Temperature","Subsurface Scattering","Caustics","Inter-reflections"],
301
+ {"sss_correct", "caustics_correct", "interreflections_ok"}),
302
  (SYS_ANATOMY, USR_ANATOMY, "Anatomical Analysis",
303
  ["Hand Anatomy","Facial Symmetry","Body Proportions","Skin Texture",
304
+ "Hair Consistency","Eye Details","Clothing Physics"],
305
+ set()),
306
  (SYS_PHYSICS, USR_PHYSICS, "Physical Plausibility",
307
  ["Material Appearance","Perspective Geometry","Gravity & Structure",
308
+ "Scale & Proportion","Transparency","Contact Physics","Motion Coherence","Depth & Occlusion"],
309
+ {"transparency_ok", "motion_ok"}),
310
  ]:
311
  try:
312
  resp = _vlm(img, sys_p, usr_p)
 
314
  parsed = _parse(resp)
315
  sc = _score(parsed)
316
 
 
317
  raw_conf = parsed.get("confidence", 0.5)
318
  cal_conf = _calibrate_vlm_confidence(raw_conf)
319
 
320
+ # Fix 3: Anatomy on non-human images β†’ tag as not_applicable
321
  if name == "Anatomical Analysis" and not parsed.get("contains_people", True):
322
+ for feat in features:
323
+ findings.append({"test": feat, "score": 0.0,
324
+ "note": "No people in image β€” not applicable",
325
+ "not_applicable": True, "parent": name})
326
+ # NOT added to scores — these should not dilute the posterior
327
+ n_total += len(features)
328
+ findings.append({"test": name, "vlm_analysis": parsed,
329
+ "score": 0.0, "confidence": cal_conf,
330
+ "not_applicable": True,
331
+ "note": "No people detected β€” anatomy analysis skipped"})
332
+ continue
333
 
334
  anomalies = parsed.get("anomalies", [])
335
+
336
+ # Fix 2: Count applicable sub-features (exclude nulls)
337
+ applicable_features = []
338
  for feat in features:
339
+ # Check if VLM returned null for the corresponding field
340
+ field_map = {f: k for f, k in zip(features, parsed.keys()) if k in null_fields}
341
+ is_null = False
342
+ for nf in null_fields:
343
+ if parsed.get(nf) is None:
344
+ # Map null field back to feature name (approximate)
345
+ if any(nf_word in feat.lower() for nf_word in nf.replace("_ok","").replace("_correct","").split("_")):
346
+ is_null = True
347
+ break
348
+
349
+ if is_null:
350
+ findings.append({"test": feat, "score": 0.0,
351
+ "note": "Not applicable to this image",
352
+ "not_applicable": True, "parent": name})
353
+ n_total += 1
354
+ else:
355
+ applicable_features.append(feat)
356
+
357
+ # Distribute score only across applicable features
358
+ n_applicable_here = len(applicable_features)
359
+ if n_applicable_here > 0:
360
+ per_feat_score = sc / n_applicable_here
361
+ for feat in applicable_features:
362
+ findings.append({"test": feat, "score": per_feat_score,
363
+ "note": parsed.get("explanation", "")[:100], "parent": name})
364
+ scores.append(per_feat_score)
365
+ n_applicable += 1
366
+ n_total += 1
367
 
368
  findings.append({"test": name, "vlm_analysis": parsed, "anomalies": anomalies,
369
  "score": sc, "confidence": cal_conf,
 
371
  "calibrated_confidence": cal_conf,
372
  "note": parsed.get("explanation", "")[:200]})
373
  scores.append(sc)
374
+ n_applicable += 1
375
+ n_total += 1
376
  else:
377
  vlm_ok = False
378
  for feat in features:
379
  findings.append({"test": feat, "score": 0.0, "note": "VLM unavailable", "vlm_error": True})
380
+ n_total += len(features)
381
  except Exception as e:
382
  findings.append({"test": name, "error": str(e), "score": 0})
383
+ n_total += 1
384
 
385
+ # Context plausibility
386
  try:
387
  resp = _vlm(img, SYS_CONTEXT, USR_CONTEXT)
388
  if resp and not resp.startswith("VLM_ERROR"):
 
398
  findings.append({"test": feat, "score": sc / len(context_features),
399
  "note": parsed.get("explanation", "")[:100], "parent": "Context"})
400
  scores.append(sc / len(context_features))
401
+ n_applicable += 1
402
+ n_total += 1
403
 
404
  findings.append({"test": "Context Plausibility", "vlm_analysis": parsed,
405
  "score": sc, "confidence": cal_conf,
406
  "note": parsed.get("explanation", "")[:200]})
407
  scores.append(sc)
408
+ n_applicable += 1
409
+ n_total += 1
410
  else:
411
  vlm_ok = False
412
  except:
413
  pass
414
 
415
+ # Fix 1: Confidence floor — distinguish genuinely neutral from cancelled-out
416
+ if scores:
417
+ avg = float(np.mean(scores))
418
+ # Check if scores genuinely agree on neutral vs. cancelling each other out
419
+ score_signs = [1 if s > 0.05 else (-1 if s < -0.05 else 0) for s in scores]
420
+ n_positive = sum(1 for s in score_signs if s > 0)
421
+ n_negative = sum(1 for s in score_signs if s < 0)
422
+ n_neutral = sum(1 for s in score_signs if s == 0)
423
+
424
+ if n_positive > 0 and n_negative > 0:
425
+ # Scores cancelled out — LOW confidence, not 0.4
426
+ agreement = max(n_positive, n_negative) / (n_positive + n_negative)
427
+ conf = min(1.0, 0.15 + 0.5 * abs(avg) * agreement)
428
+ elif n_neutral == len(score_signs):
429
+ # Everything genuinely neutral (VLM said 0 for everything) — low confidence
430
+ conf = 0.2
431
+ else:
432
+ # Scores agree in direction — confidence scales with magnitude
433
+ conf = min(1.0, 0.3 + 0.6 * abs(avg))
434
+
435
+ # Scale by coverage: fewer applicable features = lower confidence
436
+ coverage = n_applicable / max(n_total, 1)
437
+ conf *= max(0.3, coverage)
438
+ else:
439
+ avg = 0.0
440
+ conf = 0.1
441
+
442
  if not vlm_ok:
443
  conf *= 0.3
444
 
445
+ viol = [f["test"] for f in findings if f.get("score", 0) > 0.15
446
+ and "parent" not in f and not f.get("not_applicable")]
447
+ comp = [f["test"] for f in findings if f.get("score", 0) < -0.1
448
+ and "parent" not in f and not f.get("not_applicable")]
449
  rat = f"Semantic violations: {', '.join(viol[:5])}." if viol else \
450
  f"Semantically consistent: {', '.join(comp[:5])}." if comp else "Semantic inconclusive."
451
  for f in findings:
452
+ if f.get("note") and "parent" not in f and not f.get("not_applicable"):
453
  rat += f" [{f['test']}]: {f['note'][:100]}."
454
 
455
  return AgentEvidence("Semantic Consistency Agent", np.clip(avg, -1, 1), conf,