"""FORENSIQ — Semantic Consistency Agent (31 features via VLM)
Uses Qwen2.5-VL-72B with calibrated forensic prompts.

Design principles applied from review:
- Qualitative inconsistency detection, NOT metric estimation from 2D images
- Explicit phenomenon ownership: Lighting owns illumination, Physics owns geometry/materials
- Confidence calibration instructions in every prompt
- Expanded Context prompt (5→9 sub-features, including AI stock-photo aesthetics)
"""
import os, base64, io, json, re, numpy as np
from PIL import Image
from typing import Dict, Any, Optional
from agents.optical_agent import AgentEvidence

def _b64(img, mx=1024):
    w,h=img.size
    if max(w,h)>mx: r=mx/max(w,h); img=img.resize((int(w*r),int(h*r)),Image.LANCZOS)
    buf=io.BytesIO(); img.convert("RGB").save(buf,"JPEG",quality=90); return base64.b64encode(buf.getvalue()).decode()

def _vlm(img, sys_prompt, user_prompt):
    """Call VLM with generous timeout and retry for cold-start."""
    try:
        from openai import OpenAI
    except ImportError: return None
    token=os.environ.get("HF_TOKEN","")
    if not token: return None
    
    client=OpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=token,
        timeout=90.0,  # 90s — 72B model needs time for cold start
    )
    b64=_b64(img)
    messages=[
        {"role":"system","content":sys_prompt},
        {"role":"user","content":[
            {"type":"image_url","image_url":{"url":f"data:image/jpeg;base64,{b64}"}},
            {"type":"text","text":user_prompt}
        ]}
    ]
    
    # Try up to 3 times with exponential backoff (cold start can take 30s+)
    last_error = None
    for attempt in range(3):
        try:
            resp=client.chat.completions.create(
                model="Qwen/Qwen2.5-VL-72B-Instruct",
                messages=messages,
                max_tokens=2000,
                temperature=0.1,
            )
            return resp.choices[0].message.content
        except Exception as e:
            last_error = e
            if attempt < 2:
                err_str = str(last_error)
                # Don't retry on payment/quota errors — it won't help
                if '402' in err_str or 'credit' in err_str.lower() or 'quota' in err_str.lower():
                    return f"VLM_ERROR: Inference credits depleted. Add HF Pro subscription or purchase credits at huggingface.co/settings/billing"
                import time
                wait = 3 * (attempt + 1)
                time.sleep(wait)
                continue
    return f"VLM_ERROR: {last_error}"
    return "VLM_ERROR: exhausted retries"

def _parse(text):
    if not text: return {}
    for pattern in [r'```(?:json)?\s*(\{.*?\})\s*```', r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})']:
        m=re.search(pattern,text,re.DOTALL)
        if m:
            try: return json.loads(m.group(1))
            except: pass
    try: return json.loads(text)
    except: return {"raw":text}

def _score(parsed):
    v=parsed.get("verdict","UNKNOWN")
    if v=="MANIPULATED": return 0.7
    if v=="SUSPICIOUS": return 0.4
    if v=="AUTHENTIC": return -0.4
    return 0.0

# ── Shared calibration instruction appended to every prompt ──────────
CONFIDENCE_CALIBRATION = """

CONFIDENCE CALIBRATION — CRITICAL:
Your confidence score MUST follow these rules:
- Default to 0.5 if you are uncertain or the evidence is ambiguous.
- Only use 0.7+ if you observe an UNAMBIGUOUS, SPECIFIC violation (e.g., a hand with 6 clearly countable fingers, shadows pointing in opposite directions from same light source).
- Only use 0.3 or below if the image is clearly, unambiguously consistent with reality and you can articulate exactly why.
- Use 0.4-0.6 for most images. Most images are ambiguous. Do NOT inflate confidence.
- If a sub-analysis is not applicable (no people, no text, no transparent objects), set that field to null and do NOT let it affect your overall confidence.
VLMs systematically overstate confidence. Resist this bias. When in doubt, stay near 0.5."""


# ═══════════════════════════════════════════════════════════════════════
# PROMPT 1: LIGHTING (8 features)
# Owns: ALL illumination phenomena — shadows, highlights, light color,
#        light transport (SSS, caustics, inter-reflections)
# Does NOT own: material reflectance (that's Physics), geometry (Physics)
# ═══════════════════════════════════════════════════════════════════════

SYS_LIGHTING = """You are a forensic lighting analyst. You detect QUALITATIVE inconsistencies in illumination that indicate AI generation or manipulation. You work from visual appearance, not metric measurement.

IMPORTANT: You are analyzing a 2D image. You CANNOT compute exact distances, angles, or irradiance values. Instead, you look for VISIBLE INCONSISTENCIES that would be obvious to a trained observer:

Your 8 analysis domains (you OWN these — no other agent covers them):

1. SHADOW DIRECTION: Do shadows from different objects in the scene appear to point toward consistent light source position(s)? Look for shadows that diverge when they should converge, or shadows pointing in incompatible directions. You do NOT need to compute exact angles — just assess whether the overall shadow pattern is self-consistent.

2. SHADOW QUALITY: Are shadow edges (penumbra) consistent with the apparent light source? A small bright light produces hard shadows; overcast sky produces soft shadows. Do ALL shadows in the scene share the same hardness/softness? Mixed hard and soft shadows without explanation (e.g., multiple lights) is suspicious.

3. SPECULAR HIGHLIGHTS: Bright reflections on shiny surfaces encode the light direction. If multiple shiny objects are visible, do their highlights appear to come from the same direction? If a person has catchlights in their eyes, do both eyes show highlights in the same position?

4. AMBIENT OCCLUSION: Where objects meet surfaces (feet on floor, cup on table, book on shelf), there should be subtle darkening at the contact line. AI images frequently omit contact shadows or place them incorrectly. Check: are contact shadows present where objects touch?

5. COLOR TEMPERATURE: Light from a single source should tint all surfaces the same hue. Look for: one side of a face warm-toned while the other is cool-toned without a motivating second light source. Indoor scenes with mixed warm/cool illumination should have visible light sources to explain it.

6. SUBSURFACE SCATTERING: If you can see thin body parts (ears, nostrils, fingers between a light) backlit by a strong source, they should glow warm/red from blood beneath the skin. If present, is it consistent with the light direction? If absent when expected, flag it.

7. CAUSTICS: If glass, water, or transparent objects are present near a surface, look for projected light patterns. Their absence in a brightly lit scene with transparent objects is mildly suspicious. If caustics ARE visible, do they match the shape and position of the transparent object?

8. INTER-REFLECTIONS: Strongly colored surfaces near neutral surfaces should tint them. A red blanket next to a white wall should cast a subtle red tint. Look for color bleeding that's present OR suspiciously absent.

CRITICAL — AI LIGHTING TELLS:
AI-generated images frequently produce physically impossible lighting:
- A bright WINDOW as primary light source MUST create strong directional shadows on nearby subjects. If a person is next to a bright window but their face is evenly lit with no harsh shadows, this is IMPOSSIBLE without a visible fill light or reflector. Flag this.
- Indoor scenes with perfectly even illumination and no dark corners are suspicious — real rooms have lighting falloff.
- Multiple light sources should create multiple shadow directions. A single shadow direction with omnidirectional illumination is contradictory.
Flag any of these as SUSPICIOUS with high confidence.""" + CONFIDENCE_CALIBRATION

USR_LIGHTING = """Analyze this image for lighting inconsistencies across all 8 domains.

For each, give a QUALITATIVE assessment based on what you can visually observe — do NOT attempt to compute metric values like exact angles or irradiance.

Respond in JSON:
{
    "shadow_direction_consistent": true/false,
    "shadow_quality_consistent": true/false,
    "specular_consistent": true/false,
    "ambient_occlusion_present": true/false,
    "color_temp_consistent": true/false,
    "sss_correct": true/false/null,
    "caustics_correct": true/false/null,
    "interreflections_ok": true/false/null,
    "anomalies": ["specific anomaly descriptions with image region references"],
    "confidence": 0.0-1.0,
    "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
    "explanation": "detailed reasoning citing what you observe, not what you compute"
}"""


# ═══════════════════════════════════════════════════════════════════════
# PROMPT 2: ANATOMY (7 features)
# ═══════════════════════════════════════════════════════════════════════

SYS_ANATOMY = """You are a forensic anatomist. You detect anatomical errors in images that indicate AI generation.

DETECTION PROTOCOL:

1. HANDS — This is your highest-priority check. Procedure:
   a) Locate every visible hand in the image.
   b) For each hand, COUNT fingers individually: thumb, index, middle, ring, pinky. State the count explicitly.
   c) Verify each finger has correct joint count (thumb: 2 joints, others: 3 joints).
   d) Check that joints bend only in anatomically possible directions.
   e) Verify nails are on the correct (dorsal) side of each finger.
   f) If hands are partially occluded, note what's visible vs. hidden.

   CRITICAL FINGER COUNTING RULES — READ CAREFULLY:
   - If a hand is HOLDING AN OBJECT (glass, cup, phone, bag, food, steering wheel, tool), fingers will be wrapped around it and partially hidden. DO NOT count this as having extra fingers. Instead, note "hand partially occluded by held object" and set hands_correct=null.
   - If a hand is seen at an ANGLE (foreshortened, from the side, curled into a fist), some fingers will overlap or be hidden behind others. DO NOT count overlapping fingers as extra. Set hands_correct=null and note the occlusion.
   - Only flag extra/missing fingers if you can see a FULLY OPEN, SPREAD hand with MORE than 5 or FEWER than 4 clearly distinct, individually identifiable fingers.
   - When in doubt about finger count due to occlusion, angle, or low resolution: set hands_correct=null, NOT false. False positives on finger counting are worse than missed detections.

2. FACIAL SYMMETRY — Flag asymmetry ONLY if it would be noticeable to a casual observer at normal viewing distance. Natural faces have subtle asymmetry; AI faces often have GROSS asymmetry (one ear significantly higher/larger, one eye noticeably different shape, jawline shifted). Do NOT flag sub-pixel or barely perceptible differences.

3. BODY PROPORTIONS — Check against standard human ratios: head ≈ 1/7.5 of height, elbow at waist, fingertips at mid-thigh. Flag only OBVIOUS violations (forearm twice the length of upper arm, head clearly too large).

4. SKIN TEXTURE — Look for abrupt texture changes: one patch of skin with visible pores adjacent to a smooth patch. Check for texture that transitions unnaturally between face regions.

5. HAIR — Look for: strands that float disconnected from the scalp, hairline that dissolves into skin without natural transition, inconsistent hair direction (some strands defy gravity without explanation).

6. EYE DETAILS — Catchlight reflections must appear in the same relative position in both eyes (same light source). Both irises should have the same color. Eyelashes should radiate outward from the lid margin.

7. CLOTHING — Fabric must drape under gravity. Seams must be continuous (not disappearing/reappearing). Buttons must have buttonholes. Jewelry must connect to the body.""" + CONFIDENCE_CALIBRATION

USR_ANATOMY = """Perform anatomical forensic analysis.

MANDATORY: If hands are visible, explicitly count each finger on each hand. State your count clearly (e.g., "Left hand: thumb, index, middle, ring, pinky = 5 fingers").

CRITICAL: If a hand is holding something, seen at an angle, or partially hidden, set hands_correct=null and note the occlusion. Do NOT report extra fingers on occluded or foreshortened hands — this is the #1 source of false positives.

If NO people are visible, set contains_people=false and skip all other fields.

Respond in JSON:
{
    "contains_people": true/false,
    "hands_correct": true/false/null,
    "finger_count": "explicit count per hand, e.g. 'Left: 5 (thumb,index,middle,ring,pinky), Right: not visible'",
    "face_symmetric": true/false/null,
    "proportions_ok": true/false/null,
    "skin_natural": true/false/null,
    "hair_natural": true/false/null,
    "eyes_consistent": true/false/null,
    "clothing_ok": true/false/null,
    "anomalies": ["specific anatomical errors with locations"],
    "confidence": 0.0-1.0,
    "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
    "explanation": "reasoning with specific observations — for hands, cite your finger count"
}"""


# ═══════════════════════════════════════════════════════════════════════
# PROMPT 3: PHYSICAL PLAUSIBILITY (8 features)
# Owns: geometry, material appearance, structural mechanics, object interaction
# Does NOT own: illumination/shadows (that's Lighting), anatomy (that's Anatomy)
# Explicit partition from Lighting: this agent checks materials, perspective, and
#   structural physics. It does NOT re-analyze shadows, highlights, or light color.
# ═══════════════════════════════════════════════════════════════════════

SYS_PHYSICS = """You are a forensic physicist. You detect violations of geometry, material properties, and structural mechanics in images.

SCOPE — You analyze these 8 domains. You do NOT analyze lighting/shadows/specular highlights (a separate Lighting Agent handles those). Focus ONLY on:

1. MATERIAL APPEARANCE: Does each material look like what it claims to be? Metals should show environment reflections. Wood should have grain. Fabric should have texture. The SAME material across an image should have consistent appearance. Look for: a "metal" railing that looks like plastic, or glass that doesn't distort the background.

2. PERSPECTIVE GEOMETRY: Parallel lines in the real world (edges of buildings, railroad tracks, road markings) must converge to consistent vanishing points. Check for: lines that should be parallel but converge to different points, vertical lines that lean inconsistently.

3. GRAVITY & STRUCTURE: Everything must obey gravity. Objects rest on surfaces, don't float. Liquids have flat surfaces. Cantilevered structures need support. Fabric hangs down. Hair falls down (unless wind/motion is depicted). Look for: floating objects, impossible structural loads, upward-flowing fabric.

4. SCALE & PROPORTION: Objects with known real-world sizes (people ~1.7m, doors ~2m, cars ~4.5m, chairs ~0.45m seat height) should be proportional to each other. Check for: a person who would be 3m tall next to a door, or a cup the size of a head.

5. TRANSPARENCY: Glass transmits and distorts. Water refracts. Transparent objects should show what's behind them, distorted appropriately. Frosted glass blurs. Thick glass distorts more. Check for: glass that's perfectly clear with no distortion, or opaque "glass."

6. CONTACT PHYSICS: Where objects rest on soft surfaces, there should be deformation (cushion under person, mattress under object). Where heavy objects rest on surfaces, the surface should show appropriate response.

7. MOTION COHERENCE: If motion blur is present, its direction and magnitude should be consistent with the depicted motion. A moving car should have horizontal blur. A falling object should have vertical blur. An image with one object blurred and everything else sharp needs a fast-moving object OR selective focus.

8. DEPTH & OCCLUSION: Nearer objects must occlude farther ones consistently. No object should appear to be simultaneously in front of AND behind another object. Occlusion boundaries should be clean (no "melting" edges).""" + CONFIDENCE_CALIBRATION

USR_PHYSICS = """Analyze this image for physics violations.

SCOPE REMINDER: Do NOT analyze lighting, shadows, or specular highlights — that is handled by a separate agent. Focus on materials, geometry, gravity, scale, transparency, contact, motion, and depth.

Respond in JSON:
{
    "material_consistent": true/false,
    "perspective_correct": true/false,
    "gravity_ok": true/false,
    "scale_consistent": true/false,
    "transparency_ok": true/false/null,
    "contact_ok": true/false,
    "motion_ok": true/false/null,
    "depth_ordering_ok": true/false,
    "anomalies": ["specific physics violations — not lighting"],
    "confidence": 0.0-1.0,
    "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
    "explanation": "reasoning focused on geometry and material physics"
}"""


# ═══════════════════════════════════════════════════════════════════════
# PROMPT 4: CONTEXT PLAUSIBILITY (9 features — expanded from 5)
# ═══════════════════════════════════════════════════════════════════════

SYS_CONTEXT = """You are a forensic scene analyst specializing in contextual coherence. AI-generated images often combine elements that could not physically coexist in the same real photograph.

Your 8 analysis domains:

1. TEMPORAL SEASON: Vegetation, foliage color, and flower blooming must match. Snow on the ground requires bare or evergreen trees. Green deciduous leaves + snow is a contradiction. Clothing should match the apparent season.

2. TIME OF DAY: Sky color/brightness must match shadow lengths and lighting direction. A bright blue sky requires short shadows (midday) or long shadows from a specific direction. Stars visible + brightly lit ground is contradictory.

3. ERA / TECHNOLOGY ANACHRONISM: Visible technology must match the apparent era of other objects in the scene. Use these concrete anchors:
   - Pre-1990: No flat-screen TVs, no smartphones, no LED lighting, no modern car designs (rounded headlights, DRLs). CRT monitors only. Wired phones only.
   - 1990-2005: Flip phones and early Nokias OK, but no touchscreen smartphones. Boxy CRT monitors, not flat panels. Boxy car designs.
   - 2005-2015: Early smartphones OK, but no notched/hole-punch screens. Flat panels exist but bezels are thick.
   - Post-2015: Thin-bezel phones, wireless earbuds, USB-C cables, modern LED strip lighting.
   If the scene mixes eras (1950s architecture + a person holding a modern iPhone), flag it. Fashion should match the era of other visible technology.

4. GEOGRAPHIC COHERENCE: Architecture style must match vegetation and climate. Tropical palm trees next to Northern European half-timbered houses is impossible. Road markings should match the apparent country (right-hand vs left-hand traffic, line styles). Visible text/signs should be in the expected language for the geography.

5. WEATHER COHERENCE: Sky conditions must match ground conditions. Wet pavement requires recent rain or overcast sky. Dry dust in the air contradicts standing water. Snow requires freezing conditions (visible breath, winter clothing). Fog obscures distant objects.

6. ATTIRE-SETTING MATCH: Beach clothing at a business meeting is impossible (unless clearly a party/casual scene). Winter coats in a tropical setting. Formal wear in a construction zone. Analyze whether clothing choices are plausible for the depicted location and activity.

7. SIGN & LABEL COHERENCE: Visible signs, labels, and text should be appropriate for the scene type. A restaurant should show food-related signage. A hospital should show medical signage. Signs in a residential area should show house numbers, street names. Complete absence of expected signage in a commercial area is mildly suspicious.

8. OBJECT FUNCTION & ARRANGEMENT: Furniture should be arranged for use (chairs face tables). Appliances should be connected (lamps plugged in, or at least near outlets). Tools should be held or stored correctly. Kitchen items should be in kitchens. Check for: objects that serve no function, impossible arrangements, or items placed where they'd be impractical.

9. AI STOCK PHOTO AESTHETICS — CRITICAL CHECK: AI-generated professional/office/lifestyle images have distinctive tells:
   - UNNATURALLY CLEAN environments: offices with zero clutter, kitchens with no crumbs, desks with nothing out of place. Real offices have cable tangles, personal items, slight mess.
   - IMPOSSIBLY PERFECT LIGHTING: perfectly even illumination with no harsh shadows, especially in indoor scenes where windows should create directional light and dark corners.
   - REPEATED IDENTICAL ELEMENTS: multiple sticky notes that are exactly the same size/color/angle, identical books on a shelf, repeated patterns that a human would vary.
   - WHITEBOARD/SCREEN CONTENT: Text on whiteboards that looks coherent from a distance but contains repeated phrases, nonsensical diagrams, or text that doesn't quite form real words. Look for duplicated headers, flowcharts that loop impossibly, and bullet points that repeat.
   - STOCK PHOTO POSES: People in unnaturally perfect poses, smiling too evenly, gesturing in ways that look like stock photography templates rather than candid moments.
   - SKIN PERFECTION: Completely poreless, airbrushed-looking skin with no visible texture, freckles, or imperfections. Real people have skin texture visible at any reasonable resolution.
   Flag ANY of these patterns — they are strong AI-generation indicators.""" + CONFIDENCE_CALIBRATION

USR_CONTEXT = """Analyze contextual plausibility across all 9 domains:
1. Temporal/Season — vegetation vs clothing vs weather
2. Time of Day — sky vs shadows vs lighting
3. Era/Technology — anachronistic objects
4. Geographic — architecture vs vegetation vs signage language
5. Weather — sky vs ground conditions vs attire
6. Attire-Setting — clothing appropriate for location/activity
7. Sign/Label Coherence — signage matches scene type
8. Object Arrangement — functional, plausible placement
9. AI Stock Photo Aesthetics — unnaturally clean, perfect lighting, repeated elements, whiteboard gibberish, stock poses, poreless skin

Respond in JSON:
{
    "season_consistent": true/false,
    "time_of_day_consistent": true/false,
    "era_consistent": true/false,
    "geographic_consistent": true/false,
    "weather_consistent": true/false,
    "attire_setting_match": true/false,
    "signage_coherent": true/false,
    "objects_functional": true/false,
    "ai_stock_aesthetics": true/false,
    "anomalies": ["specific contextual violations with reasoning"],
    "confidence": 0.0-1.0,
    "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
    "explanation": "detailed reasoning per domain"
}"""


# ═══════════════════════════════════════════════════════════════════════
# AGENT RUNNER
# ═══════════════════════════════════════════════════════════════════════

# VLM confidence temperature — applied before feeding into Bayesian Eq.1
# VLMs systematically overstate confidence; this compresses toward 0.5
VLM_CONFIDENCE_TEMPERATURE = 2.0

def _calibrate_vlm_confidence(raw_conf: float) -> float:
    """Post-process VLM confidence with temperature scaling.
    Compresses extreme values toward 0.5 to counter VLM overconfidence."""
    if raw_conf <= 0 or raw_conf >= 1:
        return 0.5
    logit = np.log(raw_conf / (1 - raw_conf))
    scaled = logit / VLM_CONFIDENCE_TEMPERATURE
    return float(1.0 / (1.0 + np.exp(-scaled)))


def _analysis_specs():
    """Return the four prompt specifications driven by ``run_semantic_agent``.

    Each spec names the analysis, the ``parent`` label stamped on its
    sub-feature findings, the prompt pair, and its features. A feature is
    ``(display label, JSON field)``: the field is given for checks whose
    prompt schema allows ``null`` (not applicable) and is ``None`` for checks
    that are always scored.
    """
    return (
        {
            "name": "Lighting Physics",
            "parent": "Lighting Physics",
            "system": SYS_LIGHTING,
            "user": USR_LIGHTING,
            "requires_people": False,
            "features": (
                ("Shadow Direction", None),
                ("Shadow Quality", None),
                ("Specular Consistency", None),
                ("Ambient Occlusion", None),
                ("Color Temperature", None),
                ("Subsurface Scattering", "sss_correct"),
                ("Caustics", "caustics_correct"),
                ("Inter-reflections", "interreflections_ok"),
            ),
        },
        {
            "name": "Anatomical Analysis",
            "parent": "Anatomical Analysis",
            "system": SYS_ANATOMY,
            "user": USR_ANATOMY,
            "requires_people": True,
            # Every anatomy field is declared true/false/null in USR_ANATOMY.
            "features": (
                ("Hand Anatomy", "hands_correct"),
                ("Facial Symmetry", "face_symmetric"),
                ("Body Proportions", "proportions_ok"),
                ("Skin Texture", "skin_natural"),
                ("Hair Consistency", "hair_natural"),
                ("Eye Details", "eyes_consistent"),
                ("Clothing Physics", "clothing_ok"),
            ),
        },
        {
            "name": "Physical Plausibility",
            "parent": "Physical Plausibility",
            "system": SYS_PHYSICS,
            "user": USR_PHYSICS,
            "requires_people": False,
            "features": (
                ("Material Appearance", None),
                ("Perspective Geometry", None),
                ("Gravity & Structure", None),
                ("Scale & Proportion", None),
                ("Transparency", "transparency_ok"),
                ("Contact Physics", None),
                ("Motion Coherence", "motion_ok"),
                ("Depth & Occlusion", None),
            ),
        },
        {
            "name": "Context Plausibility",
            "parent": "Context",
            "system": SYS_CONTEXT,
            "user": USR_CONTEXT,
            "requires_people": False,
            "features": (
                ("Season Consistency", None),
                ("Time-of-Day", None),
                ("Era/Technology", None),
                ("Geographic Coherence", None),
                ("Weather Coherence", None),
                ("Attire-Setting Match", None),
                ("Sign/Label Coherence", None),
                ("Object Arrangement", None),
                ("AI Stock Photo Aesthetics", None),
            ),
        },
    )


def _run_analysis(img, spec):
    """Run one prompt and convert the reply into findings and scores.

    Returns:
        ``(findings, scores, n_applicable, n_total, vlm_ok)`` where
        ``vlm_ok`` is False only when the VLM returned nothing or an error
        string. Any exception is converted into a single error finding.
    """
    name = spec["name"]
    parent = spec["parent"]
    features = spec["features"]
    try:
        resp = _vlm(img, spec["system"], spec["user"])
        if not resp or resp.startswith("VLM_ERROR"):
            unavailable = [{"test": label, "score": 0.0, "note": "VLM unavailable",
                            "vlm_error": True} for label, _field in features]
            return unavailable, [], 0, len(features), False

        parsed = _parse(resp)
        if not isinstance(parsed, dict):
            parsed = {"raw": resp}
        sc = _score(parsed)

        # The reply is model-generated: a null/string confidence or a
        # non-string explanation must not discard the whole analysis.
        raw_conf = parsed.get("confidence", 0.5)
        if isinstance(raw_conf, bool) or not isinstance(raw_conf, (int, float)):
            raw_conf = 0.5
        cal_conf = _calibrate_vlm_confidence(raw_conf)
        explanation = str(parsed.get("explanation") or "")

        findings, scores = [], []

        # Anatomy on non-human images → tag everything as not applicable.
        if spec["requires_people"] and not parsed.get("contains_people", True):
            for label, _field in features:
                findings.append({"test": label, "score": 0.0,
                                 "note": "No people in image — not applicable",
                                 "not_applicable": True, "parent": parent})
            findings.append({"test": name, "vlm_analysis": parsed,
                             "score": 0.0, "confidence": cal_conf,
                             "not_applicable": True,
                             "note": "No people detected — anatomy analysis skipped"})
            # Nothing is added to scores: these must not dilute the posterior.
            return findings, scores, 0, len(features), True

        # Split features into not-applicable (nullable field is null or
        # absent) and applicable ones, using the explicit label→field map.
        applicable = []
        n_not_applicable = 0
        for label, field in features:
            if field is not None and parsed.get(field) is None:
                findings.append({"test": label, "score": 0.0,
                                 "note": "Not applicable to this image",
                                 "not_applicable": True, "parent": parent})
                n_not_applicable += 1
            else:
                applicable.append(label)

        # Distribute the verdict score only across applicable features.
        if applicable:
            per_feat_score = sc / len(applicable)
            for label in applicable:
                findings.append({"test": label, "score": per_feat_score,
                                 "note": explanation[:100], "parent": parent})
                scores.append(per_feat_score)

        findings.append({"test": name, "vlm_analysis": parsed,
                         "anomalies": parsed.get("anomalies") or [],
                         "score": sc, "confidence": cal_conf,
                         "raw_vlm_confidence": raw_conf,
                         "calibrated_confidence": cal_conf,
                         "note": explanation[:200]})
        scores.append(sc)

        # "+ 1" accounts for the per-analysis summary finding.
        n_applicable = len(applicable) + 1
        return findings, scores, n_applicable, n_applicable + n_not_applicable, True
    except Exception as e:  # keep the remaining analyses running
        return [{"test": name, "error": str(e), "score": 0}], [], 0, 1, True


def _aggregate_confidence(scores, n_applicable, n_total, vlm_ok):
    """Combine all scores into ``(mean score, agent confidence)``."""
    if scores:
        avg = float(np.mean(scores))
        # Distinguish "genuinely neutral" from "signals cancelled out".
        n_positive = sum(1 for s in scores if s > 0.05)
        n_negative = sum(1 for s in scores if s < -0.05)
        if n_positive > 0 and n_negative > 0:
            # Conflicting directions — low confidence.
            agreement = max(n_positive, n_negative) / (n_positive + n_negative)
            conf = min(1.0, 0.15 + 0.5 * abs(avg) * agreement)
        elif n_positive == 0 and n_negative == 0:
            # Everything within the neutral band — low confidence.
            conf = 0.2
        else:
            # Scores agree in direction — confidence scales with magnitude.
            conf = min(1.0, 0.3 + 0.6 * abs(avg))
        # Scale by coverage: fewer applicable features = lower confidence.
        conf *= max(0.3, n_applicable / max(n_total, 1))
    else:
        avg = 0.0
        conf = 0.1
    if not vlm_ok:
        conf *= 0.3
    return avg, conf


def _build_rationale(findings):
    """Build the rationale text from top-level (non-sub-feature) findings."""
    top_level = [f for f in findings
                 if "parent" not in f and not f.get("not_applicable")]
    viol = [f["test"] for f in top_level if f.get("score", 0) > 0.15]
    comp = [f["test"] for f in top_level if f.get("score", 0) < -0.1]
    if viol:
        rat = f"Semantic violations: {', '.join(viol[:5])}."
    elif comp:
        rat = f"Semantically consistent: {', '.join(comp[:5])}."
    else:
        rat = "Semantic inconclusive."
    for f in top_level:
        if f.get("note"):
            rat += f" [{f['test']}]: {f['note'][:100]}."
    return rat


def run_semantic_agent(img):
    """Run the lighting, anatomy, physics and context prompts on one image.

    Each prompt's verdict is turned into a signed score, spread over the
    sub-features that were applicable, and all scores are averaged. The
    confidence reflects agreement between scores, the share of applicable
    sub-features, and whether every VLM call succeeded.

    Args:
        img: Image passed through to ``_vlm``.

    Returns:
        ``AgentEvidence`` built positionally from: the agent name, the mean
        score clipped to [-1, 1], the confidence, 0.0 (or 0.8 when any VLM
        call was unavailable), the rationale text, and the findings that are
        not per-sub-feature entries (those without a ``"parent"`` key).
    """
    findings, scores = [], []
    vlm_ok = True
    n_applicable = 0  # sub-features (plus summaries) that actually contributed
    n_total = 0       # sub-features (plus summaries) attempted

    for spec in _analysis_specs():
        part_findings, part_scores, part_applicable, part_total, ok = _run_analysis(img, spec)
        findings.extend(part_findings)
        scores.extend(part_scores)
        n_applicable += part_applicable
        n_total += part_total
        vlm_ok = vlm_ok and ok

    avg, conf = _aggregate_confidence(scores, n_applicable, n_total, vlm_ok)
    rat = _build_rationale(findings)

    return AgentEvidence("Semantic Consistency Agent", np.clip(avg, -1, 1), conf,
                         0.0 if vlm_ok else 0.8, rat,
                         [f for f in findings if "parent" not in f])