File size: 33,577 Bytes
0d484f2
 
 
 
 
 
 
 
ada1738
8b4b288
ada1738
 
 
 
8b4b288
 
 
 
ada1738
8b4b288
6845e7a
ada1738
 
8b4b288
 
 
4883725
 
 
 
6845e7a
4883725
 
 
 
 
 
 
 
 
 
6845e7a
 
 
4883725
 
 
 
 
 
 
 
 
6845e7a
 
5388980
 
 
 
6845e7a
5388980
6845e7a
4883725
6845e7a
4883725
8b4b288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d484f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b4b288
0d484f2
8b4b288
0d484f2
8b4b288
0d484f2
8b4b288
0d484f2
8b4b288
0d484f2
 
 
 
 
 
a8cb6af
 
 
 
 
 
 
 
0d484f2
 
 
 
8b4b288
 
ada1738
0d484f2
 
8b4b288
0d484f2
8b4b288
 
 
 
0d484f2
ada1738
 
0d484f2
ada1738
 
 
0d484f2
 
 
 
 
ada1738
0d484f2
ada1738
0d484f2
 
 
 
 
 
 
ada1738
617ed10
 
 
 
 
 
0d484f2
ada1738
0d484f2
 
 
 
 
 
 
 
 
 
 
 
 
 
617ed10
 
0d484f2
ada1738
8b4b288
ada1738
 
8b4b288
0d484f2
8b4b288
 
 
 
 
 
0d484f2
ada1738
 
0d484f2
ada1738
 
0d484f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b4b288
 
ada1738
0d484f2
ada1738
8b4b288
ada1738
8b4b288
0d484f2
8b4b288
 
0d484f2
ada1738
 
0d484f2
ada1738
 
 
0d484f2
 
 
 
 
 
 
 
 
 
 
 
390f4c5
 
 
 
 
 
0d484f2
 
ada1738
0d484f2
 
 
 
 
 
6a1eeda
 
 
 
 
 
 
 
 
 
 
 
0d484f2
 
 
 
 
 
 
 
6a1eeda
ada1738
8b4b288
 
0d484f2
 
 
8b4b288
 
0d484f2
 
8b4b288
6a1eeda
0d484f2
8b4b288
 
0d484f2
8b4b288
ada1738
0d484f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b4b288
0d484f2
 
390f4c5
 
8b4b288
390f4c5
0d484f2
 
390f4c5
 
0d484f2
 
390f4c5
 
0d484f2
 
390f4c5
 
8b4b288
ada1738
0d484f2
8b4b288
0d484f2
 
 
 
 
 
390f4c5
0d484f2
390f4c5
 
 
 
 
 
 
 
 
 
 
0d484f2
 
390f4c5
 
 
8b4b288
390f4c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d484f2
 
 
 
 
 
8b4b288
390f4c5
 
8b4b288
0d484f2
8b4b288
0d484f2
390f4c5
ada1738
0d484f2
390f4c5
8b4b288
390f4c5
8b4b288
0d484f2
8b4b288
0d484f2
 
 
 
 
 
 
6a1eeda
 
0d484f2
 
 
 
390f4c5
 
0d484f2
 
 
 
 
390f4c5
 
0d484f2
 
 
 
 
390f4c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d484f2
 
 
390f4c5
 
 
 
0d484f2
 
ada1738
390f4c5
0d484f2
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
"""FORENSIQ β€” Semantic Consistency Agent (31 features via VLM)
Uses Qwen2.5-VL-72B with calibrated forensic prompts.

Design principles applied from review:
- Qualitative inconsistency detection, NOT metric estimation from 2D images
- Explicit phenomenon ownership: Lighting owns illumination, Physics owns geometry/materials
- Confidence calibration instructions in every prompt
- Expanded Context prompt (5β†’8 sub-features)
"""
import os, base64, io, json, re, numpy as np
from PIL import Image
from typing import Dict, Any, Optional
from agents.optical_agent import AgentEvidence

def _b64(img, mx=1024):
    """Encode *img* as a base64 JPEG string, downscaling so its longest side is <= *mx* px."""
    width, height = img.size
    longest = max(width, height)
    if longest > mx:
        scale = mx / longest
        img = img.resize((int(width * scale), int(height * scale)), Image.LANCZOS)
    buf = io.BytesIO()
    # Force RGB: JPEG cannot carry an alpha channel.
    img.convert("RGB").save(buf, "JPEG", quality=90)
    return base64.b64encode(buf.getvalue()).decode()

def _vlm(img, sys_prompt, user_prompt):
    """Call VLM with generous timeout and retry for cold-start."""
    try:
        from openai import OpenAI
    except ImportError: return None
    token=os.environ.get("HF_TOKEN","")
    if not token: return None
    
    client=OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=token,
        timeout=90.0,  # 90s β€” 72B model needs time for cold start
    )
    b64=_b64(img)
    messages=[
        {"role":"system","content":sys_prompt},
        {"role":"user","content":[
            {"type":"image_url","image_url":{"url":f"data:image/jpeg;base64,{b64}"}},
            {"type":"text","text":user_prompt}
        ]}
    ]
    
    # Try up to 3 times with exponential backoff (cold start can take 30s+)
    last_error = None
    for attempt in range(3):
        try:
            resp=client.chat.completions.create(
                model="Qwen/Qwen2.5-VL-72B-Instruct",
                messages=messages,
                max_tokens=2000,
                temperature=0.1,
            )
            return resp.choices[0].message.content
        except Exception as e:
            last_error = e
            if attempt < 2:
                err_str = str(last_error)
                # Don't retry on payment/quota errors β€” it won't help
                if '402' in err_str or 'credit' in err_str.lower() or 'quota' in err_str.lower():
                    return f"VLM_ERROR: Inference credits depleted. Add HF Pro subscription or purchase credits at huggingface.co/settings/billing"
                import time
                wait = 3 * (attempt + 1)
                time.sleep(wait)
                continue
    return f"VLM_ERROR: {last_error}"
    return "VLM_ERROR: exhausted retries"

def _parse(text):
    if not text: return {}
    for pattern in [r'```(?:json)?\s*(\{.*?\})\s*```', r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})']:
        m=re.search(pattern,text,re.DOTALL)
        if m:
            try: return json.loads(m.group(1))
            except: pass
    try: return json.loads(text)
    except: return {"raw":text}

def _score(parsed):
    v=parsed.get("verdict","UNKNOWN")
    if v=="MANIPULATED": return 0.7
    if v=="SUSPICIOUS": return 0.4
    if v=="AUTHENTIC": return -0.4
    return 0.0

# ── Shared calibration instruction appended to every prompt ──────────
# NOTE: this text is concatenated onto each SYS_* prompt below, so any edit
# here changes the instructions sent to the VLM for every analysis pass.
# The leading blank line is intentional: it separates this section from the
# prompt body it is appended to.
CONFIDENCE_CALIBRATION = """

CONFIDENCE CALIBRATION β€” CRITICAL:
Your confidence score MUST follow these rules:
- Default to 0.5 if you are uncertain or the evidence is ambiguous.
- Only use 0.7+ if you observe an UNAMBIGUOUS, SPECIFIC violation (e.g., a hand with 6 clearly countable fingers, shadows pointing in opposite directions from same light source).
- Only use 0.3 or below if the image is clearly, unambiguously consistent with reality and you can articulate exactly why.
- Use 0.4-0.6 for most images. Most images are ambiguous. Do NOT inflate confidence.
- If a sub-analysis is not applicable (no people, no text, no transparent objects), set that field to null and do NOT let it affect your overall confidence.
VLMs systematically overstate confidence. Resist this bias. When in doubt, stay near 0.5."""


# ═══════════════════════════════════════════════════════════════════════
# PROMPT 1: LIGHTING (8 features)
# Owns: ALL illumination phenomena β€” shadows, highlights, light color,
#        light transport (SSS, caustics, inter-reflections)
# Does NOT own: material reflectance (that's Physics), geometry (Physics)
# ═══════════════════════════════════════════════════════════════════════

SYS_LIGHTING = """You are a forensic lighting analyst. You detect QUALITATIVE inconsistencies in illumination that indicate AI generation or manipulation. You work from visual appearance, not metric measurement.

IMPORTANT: You are analyzing a 2D image. You CANNOT compute exact distances, angles, or irradiance values. Instead, you look for VISIBLE INCONSISTENCIES that would be obvious to a trained observer:

Your 8 analysis domains (you OWN these β€” no other agent covers them):

1. SHADOW DIRECTION: Do shadows from different objects in the scene appear to point toward consistent light source position(s)? Look for shadows that diverge when they should converge, or shadows pointing in incompatible directions. You do NOT need to compute exact angles β€” just assess whether the overall shadow pattern is self-consistent.

2. SHADOW QUALITY: Are shadow edges (penumbra) consistent with the apparent light source? A small bright light produces hard shadows; overcast sky produces soft shadows. Do ALL shadows in the scene share the same hardness/softness? Mixed hard and soft shadows without explanation (e.g., multiple lights) is suspicious.

3. SPECULAR HIGHLIGHTS: Bright reflections on shiny surfaces encode the light direction. If multiple shiny objects are visible, do their highlights appear to come from the same direction? If a person has catchlights in their eyes, do both eyes show highlights in the same position?

4. AMBIENT OCCLUSION: Where objects meet surfaces (feet on floor, cup on table, book on shelf), there should be subtle darkening at the contact line. AI images frequently omit contact shadows or place them incorrectly. Check: are contact shadows present where objects touch?

5. COLOR TEMPERATURE: Light from a single source should tint all surfaces the same hue. Look for: one side of a face warm-toned while the other is cool-toned without a motivating second light source. Indoor scenes with mixed warm/cool illumination should have visible light sources to explain it.

6. SUBSURFACE SCATTERING: If you can see thin body parts (ears, nostrils, fingers between a light) backlit by a strong source, they should glow warm/red from blood beneath the skin. If present, is it consistent with the light direction? If absent when expected, flag it.

7. CAUSTICS: If glass, water, or transparent objects are present near a surface, look for projected light patterns. Their absence in a brightly lit scene with transparent objects is mildly suspicious. If caustics ARE visible, do they match the shape and position of the transparent object?

8. INTER-REFLECTIONS: Strongly colored surfaces near neutral surfaces should tint them. A red blanket next to a white wall should cast a subtle red tint. Look for color bleeding that's present OR suspiciously absent.

CRITICAL β€” AI LIGHTING TELLS:
AI-generated images frequently produce physically impossible lighting:
- A bright WINDOW as primary light source MUST create strong directional shadows on nearby subjects. If a person is next to a bright window but their face is evenly lit with no harsh shadows, this is IMPOSSIBLE without a visible fill light or reflector. Flag this.
- Indoor scenes with perfectly even illumination and no dark corners are suspicious β€” real rooms have lighting falloff.
- Multiple light sources should create multiple shadow directions. A single shadow direction with omnidirectional illumination is contradictory.
Flag any of these as SUSPICIOUS with high confidence.""" + CONFIDENCE_CALIBRATION

USR_LIGHTING = """Analyze this image for lighting inconsistencies across all 8 domains.

For each, give a QUALITATIVE assessment based on what you can visually observe β€” do NOT attempt to compute metric values like exact angles or irradiance.

Respond in JSON:
{
    "shadow_direction_consistent": true/false,
    "shadow_quality_consistent": true/false,
    "specular_consistent": true/false,
    "ambient_occlusion_present": true/false,
    "color_temp_consistent": true/false,
    "sss_correct": true/false/null,
    "caustics_correct": true/false/null,
    "interreflections_ok": true/false/null,
    "anomalies": ["specific anomaly descriptions with image region references"],
    "confidence": 0.0-1.0,
    "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
    "explanation": "detailed reasoning citing what you observe, not what you compute"
}"""


# ═══════════════════════════════════════════════════════════════════════
# PROMPT 2: ANATOMY (7 features)
# ═══════════════════════════════════════════════════════════════════════

SYS_ANATOMY = """You are a forensic anatomist. You detect anatomical errors in images that indicate AI generation.

DETECTION PROTOCOL:

1. HANDS β€” This is your highest-priority check. Procedure:
   a) Locate every visible hand in the image.
   b) For each hand, COUNT fingers individually: thumb, index, middle, ring, pinky. State the count explicitly.
   c) Verify each finger has correct joint count (thumb: 2 joints, others: 3 joints).
   d) Check that joints bend only in anatomically possible directions.
   e) Verify nails are on the correct (dorsal) side of each finger.
   f) If hands are partially occluded, note what's visible vs. hidden.

   CRITICAL FINGER COUNTING RULES β€” READ CAREFULLY:
   - If a hand is HOLDING AN OBJECT (glass, cup, phone, bag, food, steering wheel, tool), fingers will be wrapped around it and partially hidden. DO NOT count this as having extra fingers. Instead, note "hand partially occluded by held object" and set hands_correct=null.
   - If a hand is seen at an ANGLE (foreshortened, from the side, curled into a fist), some fingers will overlap or be hidden behind others. DO NOT count overlapping fingers as extra. Set hands_correct=null and note the occlusion.
   - Only flag extra/missing fingers if you can see a FULLY OPEN, SPREAD hand with MORE than 5 or FEWER than 4 clearly distinct, individually identifiable fingers.
   - When in doubt about finger count due to occlusion, angle, or low resolution: set hands_correct=null, NOT false. False positives on finger counting are worse than missed detections.

2. FACIAL SYMMETRY β€” Flag asymmetry ONLY if it would be noticeable to a casual observer at normal viewing distance. Natural faces have subtle asymmetry; AI faces often have GROSS asymmetry (one ear significantly higher/larger, one eye noticeably different shape, jawline shifted). Do NOT flag sub-pixel or barely perceptible differences.

3. BODY PROPORTIONS β€” Check against standard human ratios: head β‰ˆ 1/7.5 of height, elbow at waist, fingertips at mid-thigh. Flag only OBVIOUS violations (forearm twice the length of upper arm, head clearly too large).

4. SKIN TEXTURE β€” Look for abrupt texture changes: one patch of skin with visible pores adjacent to a smooth patch. Check for texture that transitions unnaturally between face regions.

5. HAIR β€” Look for: strands that float disconnected from the scalp, hairline that dissolves into skin without natural transition, inconsistent hair direction (some strands defy gravity without explanation).

6. EYE DETAILS β€” Catchlight reflections must appear in the same relative position in both eyes (same light source). Both irises should have the same color. Eyelashes should radiate outward from the lid margin.

7. CLOTHING β€” Fabric must drape under gravity. Seams must be continuous (not disappearing/reappearing). Buttons must have buttonholes. Jewelry must connect to the body.""" + CONFIDENCE_CALIBRATION

USR_ANATOMY = """Perform anatomical forensic analysis.

MANDATORY: If hands are visible, explicitly count each finger on each hand. State your count clearly (e.g., "Left hand: thumb, index, middle, ring, pinky = 5 fingers").

CRITICAL: If a hand is holding something, seen at an angle, or partially hidden, set hands_correct=null and note the occlusion. Do NOT report extra fingers on occluded or foreshortened hands β€” this is the #1 source of false positives.

If NO people are visible, set contains_people=false and skip all other fields.

Respond in JSON:
{
    "contains_people": true/false,
    "hands_correct": true/false/null,
    "finger_count": "explicit count per hand, e.g. 'Left: 5 (thumb,index,middle,ring,pinky), Right: not visible'",
    "face_symmetric": true/false/null,
    "proportions_ok": true/false/null,
    "skin_natural": true/false/null,
    "hair_natural": true/false/null,
    "eyes_consistent": true/false/null,
    "clothing_ok": true/false/null,
    "anomalies": ["specific anatomical errors with locations"],
    "confidence": 0.0-1.0,
    "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
    "explanation": "reasoning with specific observations β€” for hands, cite your finger count"
}"""


# ═══════════════════════════════════════════════════════════════════════
# PROMPT 3: PHYSICAL PLAUSIBILITY (8 features)
# Owns: geometry, material appearance, structural mechanics, object interaction
# Does NOT own: illumination/shadows (that's Lighting), anatomy (that's Anatomy)
# Explicit partition from Lighting: this agent checks materials, perspective, and
#   structural physics. It does NOT re-analyze shadows, highlights, or light color.
# ═══════════════════════════════════════════════════════════════════════

SYS_PHYSICS = """You are a forensic physicist. You detect violations of geometry, material properties, and structural mechanics in images.

SCOPE β€” You analyze these 8 domains. You do NOT analyze lighting/shadows/specular highlights (a separate Lighting Agent handles those). Focus ONLY on:

1. MATERIAL APPEARANCE: Does each material look like what it claims to be? Metals should show environment reflections. Wood should have grain. Fabric should have texture. The SAME material across an image should have consistent appearance. Look for: a "metal" railing that looks like plastic, or glass that doesn't distort the background.

2. PERSPECTIVE GEOMETRY: Parallel lines in the real world (edges of buildings, railroad tracks, road markings) must converge to consistent vanishing points. Check for: lines that should be parallel but converge to different points, vertical lines that lean inconsistently.

3. GRAVITY & STRUCTURE: Everything must obey gravity. Objects rest on surfaces, don't float. Liquids have flat surfaces. Cantilevered structures need support. Fabric hangs down. Hair falls down (unless wind/motion is depicted). Look for: floating objects, impossible structural loads, upward-flowing fabric.

4. SCALE & PROPORTION: Objects with known real-world sizes (people ~1.7m, doors ~2m, cars ~4.5m, chairs ~0.45m seat height) should be proportional to each other. Check for: a person who would be 3m tall next to a door, or a cup the size of a head.

5. TRANSPARENCY: Glass transmits and distorts. Water refracts. Transparent objects should show what's behind them, distorted appropriately. Frosted glass blurs. Thick glass distorts more. Check for: glass that's perfectly clear with no distortion, or opaque "glass."

6. CONTACT PHYSICS: Where objects rest on soft surfaces, there should be deformation (cushion under person, mattress under object). Where heavy objects rest on surfaces, the surface should show appropriate response.

7. MOTION COHERENCE: If motion blur is present, its direction and magnitude should be consistent with the depicted motion. A moving car should have horizontal blur. A falling object should have vertical blur. An image with one object blurred and everything else sharp needs a fast-moving object OR selective focus.

8. DEPTH & OCCLUSION: Nearer objects must occlude farther ones consistently. No object should appear to be simultaneously in front of AND behind another object. Occlusion boundaries should be clean (no "melting" edges).""" + CONFIDENCE_CALIBRATION

USR_PHYSICS = """Analyze this image for physics violations.

SCOPE REMINDER: Do NOT analyze lighting, shadows, or specular highlights β€” that is handled by a separate agent. Focus on materials, geometry, gravity, scale, transparency, contact, motion, and depth.

Respond in JSON:
{
    "material_consistent": true/false,
    "perspective_correct": true/false,
    "gravity_ok": true/false,
    "scale_consistent": true/false,
    "transparency_ok": true/false/null,
    "contact_ok": true/false,
    "motion_ok": true/false/null,
    "depth_ordering_ok": true/false,
    "anomalies": ["specific physics violations β€” not lighting"],
    "confidence": 0.0-1.0,
    "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
    "explanation": "reasoning focused on geometry and material physics"
}"""


# ═══════════════════════════════════════════════════════════════════════
# PROMPT 4: CONTEXT PLAUSIBILITY (9 features β€” expanded from 5)
# ═══════════════════════════════════════════════════════════════════════

# Fix: prompt previously claimed "8 analysis domains" while listing 9
# (USR_CONTEXT already says "all 9 domains") β€” the mismatch could confuse
# the model about whether domain 9 is in scope.
SYS_CONTEXT = """You are a forensic scene analyst specializing in contextual coherence. AI-generated images often combine elements that could not physically coexist in the same real photograph.

Your 9 analysis domains:

1. TEMPORAL SEASON: Vegetation, foliage color, and flower blooming must match. Snow on the ground requires bare or evergreen trees. Green deciduous leaves + snow is a contradiction. Clothing should match the apparent season.

2. TIME OF DAY: Sky color/brightness must match shadow lengths and lighting direction. A bright blue sky requires short shadows (midday) or long shadows from a specific direction. Stars visible + brightly lit ground is contradictory.

3. ERA / TECHNOLOGY ANACHRONISM: Visible technology must match the apparent era of other objects in the scene. Use these concrete anchors:
   - Pre-1990: No flat-screen TVs, no smartphones, no LED lighting, no modern car designs (rounded headlights, DRLs). CRT monitors only. Wired phones only.
   - 1990-2005: Flip phones and early Nokias OK, but no touchscreen smartphones. Boxy CRT monitors, not flat panels. Boxy car designs.
   - 2005-2015: Early smartphones OK, but no notched/hole-punch screens. Flat panels exist but bezels are thick.
   - Post-2015: Thin-bezel phones, wireless earbuds, USB-C cables, modern LED strip lighting.
   If the scene mixes eras (1950s architecture + a person holding a modern iPhone), flag it. Fashion should match the era of other visible technology.

4. GEOGRAPHIC COHERENCE: Architecture style must match vegetation and climate. Tropical palm trees next to Northern European half-timbered houses is impossible. Road markings should match the apparent country (right-hand vs left-hand traffic, line styles). Visible text/signs should be in the expected language for the geography.

5. WEATHER COHERENCE: Sky conditions must match ground conditions. Wet pavement requires recent rain or overcast sky. Dry dust in the air contradicts standing water. Snow requires freezing conditions (visible breath, winter clothing). Fog obscures distant objects.

6. ATTIRE-SETTING MATCH: Beach clothing at a business meeting is impossible (unless clearly a party/casual scene). Winter coats in a tropical setting. Formal wear in a construction zone. Analyze whether clothing choices are plausible for the depicted location and activity.

7. SIGN & LABEL COHERENCE: Visible signs, labels, and text should be appropriate for the scene type. A restaurant should show food-related signage. A hospital should show medical signage. Signs in a residential area should show house numbers, street names. Complete absence of expected signage in a commercial area is mildly suspicious.

8. OBJECT FUNCTION & ARRANGEMENT: Furniture should be arranged for use (chairs face tables). Appliances should be connected (lamps plugged in, or at least near outlets). Tools should be held or stored correctly. Kitchen items should be in kitchens. Check for: objects that serve no function, impossible arrangements, or items placed where they'd be impractical.

9. AI STOCK PHOTO AESTHETICS β€” CRITICAL CHECK: AI-generated professional/office/lifestyle images have distinctive tells:
   - UNNATURALLY CLEAN environments: offices with zero clutter, kitchens with no crumbs, desks with nothing out of place. Real offices have cable tangles, personal items, slight mess.
   - IMPOSSIBLY PERFECT LIGHTING: perfectly even illumination with no harsh shadows, especially in indoor scenes where windows should create directional light and dark corners.
   - REPEATED IDENTICAL ELEMENTS: multiple sticky notes that are exactly the same size/color/angle, identical books on a shelf, repeated patterns that a human would vary.
   - WHITEBOARD/SCREEN CONTENT: Text on whiteboards that looks coherent from a distance but contains repeated phrases, nonsensical diagrams, or text that doesn't quite form real words. Look for duplicated headers, flowcharts that loop impossibly, and bullet points that repeat.
   - STOCK PHOTO POSES: People in unnaturally perfect poses, smiling too evenly, gesturing in ways that look like stock photography templates rather than candid moments.
   - SKIN PERFECTION: Completely poreless, airbrushed-looking skin with no visible texture, freckles, or imperfections. Real people have skin texture visible at any reasonable resolution.
   Flag ANY of these patterns β€” they are strong AI-generation indicators.""" + CONFIDENCE_CALIBRATION

USR_CONTEXT = """Analyze contextual plausibility across all 9 domains:
1. Temporal/Season β€” vegetation vs clothing vs weather
2. Time of Day β€” sky vs shadows vs lighting
3. Era/Technology β€” anachronistic objects
4. Geographic β€” architecture vs vegetation vs signage language
5. Weather β€” sky vs ground conditions vs attire
6. Attire-Setting β€” clothing appropriate for location/activity
7. Sign/Label Coherence β€” signage matches scene type
8. Object Arrangement β€” functional, plausible placement
9. AI Stock Photo Aesthetics β€” unnaturally clean, perfect lighting, repeated elements, whiteboard gibberish, stock poses, poreless skin

Respond in JSON:
{
    "season_consistent": true/false,
    "time_of_day_consistent": true/false,
    "era_consistent": true/false,
    "geographic_consistent": true/false,
    "weather_consistent": true/false,
    "attire_setting_match": true/false,
    "signage_coherent": true/false,
    "objects_functional": true/false,
    "ai_stock_aesthetics": true/false,
    "anomalies": ["specific contextual violations with reasoning"],
    "confidence": 0.0-1.0,
    "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
    "explanation": "detailed reasoning per domain"
}"""


# ═══════════════════════════════════════════════════════════════════════
# AGENT RUNNER
# ═══════════════════════════════════════════════════════════════════════

# VLM confidence temperature β€” applied before feeding into Bayesian Eq.1
# VLMs systematically overstate confidence; this compresses toward 0.5
VLM_CONFIDENCE_TEMPERATURE = 2.0

def _calibrate_vlm_confidence(raw_conf: float) -> float:
    """Post-process VLM confidence with temperature scaling.
    Compresses extreme values toward 0.5 to counter VLM overconfidence."""
    if raw_conf <= 0 or raw_conf >= 1:
        return 0.5
    logit = np.log(raw_conf / (1 - raw_conf))
    scaled = logit / VLM_CONFIDENCE_TEMPERATURE
    return float(1.0 / (1.0 + np.exp(-scaled)))


def run_semantic_agent(img):
    """Run the VLM-based semantic-consistency analysis on an image.

    Executes four VLM passes (lighting physics, anatomy, physical
    plausibility, context plausibility), parses each JSON response into
    per-feature findings, and aggregates the per-feature scores into one
    signed score with a calibrated confidence.

    Args:
        img: Image handle accepted by the module-level `_vlm` helper.

    Returns:
        AgentEvidence with score clipped to [-1, 1] (positive values
        indicate manipulation evidence per the viol/comp thresholds
        below), a confidence scaled by score agreement, magnitude and
        feature coverage, and the top-level (non-child) findings.
    """
    findings, scores = [], []
    vlm_ok = True
    n_applicable = 0  # Track how many sub-features were actually applicable
    n_total = 0       # Track total sub-features attempted

    # Explicit mapping from nullable JSON fields to the feature label they
    # gate. (The previous word-overlap heuristic could never match
    # "Subsurface Scattering" or "Inter-reflections" - their field names do
    # not occur in the labels - so nulls there were treated as applicable.)
    null_field_feature = {
        "sss_correct": "Subsurface Scattering",
        "caustics_correct": "Caustics",
        "interreflections_ok": "Inter-reflections",
        "transparency_ok": "Transparency",
        "motion_ok": "Motion Coherence",
    }

    for sys_p, usr_p, name, features, null_fields in [
        (SYS_LIGHTING, USR_LIGHTING, "Lighting Physics",
         ["Shadow Direction","Shadow Quality","Specular Consistency","Ambient Occlusion",
          "Color Temperature","Subsurface Scattering","Caustics","Inter-reflections"],
         {"sss_correct", "caustics_correct", "interreflections_ok"}),
        (SYS_ANATOMY, USR_ANATOMY, "Anatomical Analysis",
         ["Hand Anatomy","Facial Symmetry","Body Proportions","Skin Texture",
          "Hair Consistency","Eye Details","Clothing Physics"],
         set()),
        (SYS_PHYSICS, USR_PHYSICS, "Physical Plausibility",
         ["Material Appearance","Perspective Geometry","Gravity & Structure",
          "Scale & Proportion","Transparency","Contact Physics","Motion Coherence","Depth & Occlusion"],
         {"transparency_ok", "motion_ok"}),
    ]:
        try:
            resp = _vlm(img, sys_p, usr_p)
            if resp and not resp.startswith("VLM_ERROR"):
                parsed = _parse(resp)
                sc = _score(parsed)

                raw_conf = parsed.get("confidence", 0.5)
                cal_conf = _calibrate_vlm_confidence(raw_conf)

                # Fix 3: Anatomy on non-human images -> tag as not_applicable
                if name == "Anatomical Analysis" and not parsed.get("contains_people", True):
                    for feat in features:
                        findings.append({"test": feat, "score": 0.0,
                                       "note": "No people in image β€” not applicable",
                                       "not_applicable": True, "parent": name})
                        # NOT added to scores - these should not dilute the posterior
                    n_total += len(features)
                    findings.append({"test": name, "vlm_analysis": parsed,
                                   "score": 0.0, "confidence": cal_conf,
                                   "not_applicable": True,
                                   "note": "No people detected β€” anatomy analysis skipped"})
                    continue

                anomalies = parsed.get("anomalies", [])

                # Fix 2: Count applicable sub-features (exclude VLM nulls)
                applicable_features = []
                for feat in features:
                    # A feature is inapplicable when the VLM returned null
                    # for its corresponding nullable field.
                    is_null = any(parsed.get(nf) is None
                                  and null_field_feature.get(nf) == feat
                                  for nf in null_fields)
                    if is_null:
                        findings.append({"test": feat, "score": 0.0,
                                       "note": "Not applicable to this image",
                                       "not_applicable": True, "parent": name})
                        n_total += 1
                    else:
                        applicable_features.append(feat)

                # Distribute score only across applicable features
                n_applicable_here = len(applicable_features)
                if n_applicable_here > 0:
                    per_feat_score = sc / n_applicable_here
                    for feat in applicable_features:
                        findings.append({"test": feat, "score": per_feat_score,
                                       "note": parsed.get("explanation", "")[:100], "parent": name})
                        scores.append(per_feat_score)
                        n_applicable += 1
                        n_total += 1

                findings.append({"test": name, "vlm_analysis": parsed, "anomalies": anomalies,
                               "score": sc, "confidence": cal_conf,
                               "raw_vlm_confidence": raw_conf,
                               "calibrated_confidence": cal_conf,
                               "note": parsed.get("explanation", "")[:200]})
                scores.append(sc)
                n_applicable += 1
                n_total += 1
            else:
                vlm_ok = False
                for feat in features:
                    findings.append({"test": feat, "score": 0.0, "note": "VLM unavailable", "vlm_error": True})
                n_total += len(features)
        except Exception as e:
            findings.append({"test": name, "error": str(e), "score": 0})
            # Count the whole feature group as attempted so coverage
            # (n_applicable / n_total) is penalized consistently with the
            # VLM-unavailable branch above.
            n_total += len(features)

    # Context plausibility (best-effort: failures only reduce confidence)
    try:
        resp = _vlm(img, SYS_CONTEXT, USR_CONTEXT)
        if resp and not resp.startswith("VLM_ERROR"):
            parsed = _parse(resp)
            sc = _score(parsed)
            raw_conf = parsed.get("confidence", 0.5)
            cal_conf = _calibrate_vlm_confidence(raw_conf)

            context_features = ["Season Consistency","Time-of-Day","Era/Technology",
                              "Geographic Coherence","Weather Coherence",
                              "Attire-Setting Match","Sign/Label Coherence",
                              "Object Arrangement","AI Stock Photo Aesthetics"]
            for feat in context_features:
                findings.append({"test": feat, "score": sc / len(context_features),
                               "note": parsed.get("explanation", "")[:100], "parent": "Context"})
                scores.append(sc / len(context_features))
                n_applicable += 1
                n_total += 1

            findings.append({"test": "Context Plausibility", "vlm_analysis": parsed,
                           "score": sc, "confidence": cal_conf,
                           "note": parsed.get("explanation", "")[:200]})
            scores.append(sc)
            n_applicable += 1
            n_total += 1
        else:
            vlm_ok = False
    except Exception:
        # Deliberate best-effort: swallow failures of this optional pass,
        # but never SystemExit/KeyboardInterrupt (the old bare except did).
        pass

    # Fix 1: Confidence floor - distinguish genuinely neutral from cancelled-out
    if scores:
        avg = float(np.mean(scores))
        # Check if scores genuinely agree on neutral vs. cancelling each other out
        score_signs = [1 if s > 0.05 else (-1 if s < -0.05 else 0) for s in scores]
        n_positive = sum(1 for s in score_signs if s > 0)
        n_negative = sum(1 for s in score_signs if s < 0)
        n_neutral = sum(1 for s in score_signs if s == 0)

        if n_positive > 0 and n_negative > 0:
            # Scores cancelled out - LOW confidence, not 0.4
            agreement = max(n_positive, n_negative) / (n_positive + n_negative)
            conf = min(1.0, 0.15 + 0.5 * abs(avg) * agreement)
        elif n_neutral == len(score_signs):
            # Everything genuinely neutral (VLM said 0 for everything) - low confidence
            conf = 0.2
        else:
            # Scores agree in direction - confidence scales with magnitude
            conf = min(1.0, 0.3 + 0.6 * abs(avg))

        # Scale by coverage: fewer applicable features = lower confidence
        coverage = n_applicable / max(n_total, 1)
        conf *= max(0.3, coverage)
    else:
        avg = 0.0
        conf = 0.1

    if not vlm_ok:
        conf *= 0.3

    # Positive scores are manipulation evidence; negative are authenticity.
    viol = [f["test"] for f in findings if f.get("score", 0) > 0.15
            and "parent" not in f and not f.get("not_applicable")]
    comp = [f["test"] for f in findings if f.get("score", 0) < -0.1
            and "parent" not in f and not f.get("not_applicable")]
    rat = f"Semantic violations: {', '.join(viol[:5])}." if viol else \
          f"Semantically consistent: {', '.join(comp[:5])}." if comp else "Semantic inconclusive."
    for f in findings:
        if f.get("note") and "parent" not in f and not f.get("not_applicable"):
            rat += f" [{f['test']}]: {f['note'][:100]}."

    return AgentEvidence("Semantic Consistency Agent", np.clip(avg, -1, 1), conf,
                         0.0 if vlm_ok else 0.8, rat,
                         [f for f in findings if "parent" not in f])