Upload agents/semantic_agent.py with huggingface_hub
Browse files- agents/semantic_agent.py +254 -133
agents/semantic_agent.py
CHANGED
|
@@ -1,5 +1,11 @@
|
|
| 1 |
-
"""FORENSIQ — Semantic Consistency Agent (
|
| 2 |
-
Uses Qwen2.5-VL-72B with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
"""
|
| 4 |
import os, base64, io, json, re, numpy as np
|
| 5 |
from PIL import Image
|
|
@@ -44,211 +50,326 @@ def _score(parsed):
|
|
| 44 |
if v=="AUTHENTIC": return -0.4
|
| 45 |
return 0.0
|
| 46 |
|
| 47 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
| 52 |
-
1. SHADOW GEOMETRY: Trace every shadow to its casting object. All shadow vectors must converge to consistent light source position(s). Shadow length encodes sun elevation via tan(θ) = object_height/shadow_length. Penumbra width encodes light source angular size.
|
| 53 |
-
2. INVERSE SQUARE LAW: Light intensity I = P/(4πr²). Surfaces equidistant from a point light must have equal irradiance. Check illumination falloff on flat surfaces (walls, floors, tables).
|
| 54 |
-
3. SPECULAR HIGHLIGHTS: Each specular reflection encodes light source direction via the reflection law (angle of incidence = angle of reflection). Check that specular highlights across different objects in the scene are consistent with the same light source(s).
|
| 55 |
-
4. AMBIENT OCCLUSION: Contact shadows and ambient occlusion should be darkest in concavities and where objects touch surfaces. AI often forgets these subtle cues.
|
| 56 |
-
5. COLOR TEMPERATURE: All illuminated surfaces under the same light should share its color temperature. Mixed lighting (warm/cool) must be physically motivated (window + lamp).
|
| 57 |
-
6. SUBSURFACE SCATTERING: Thin objects (ears, fingers, leaves) backlit by strong light should show red/warm translucency. AI rarely gets this right.
|
| 58 |
-
7. CAUSTICS: Light through transparent objects (glass, water) creates caustic patterns. If present, they must match the refracting geometry.
|
| 59 |
-
8. INTER-REFLECTIONS: Colored surfaces bounce colored light onto nearby surfaces. A red wall should tint nearby white objects slightly red.
|
| 60 |
|
| 61 |
-
|
| 62 |
|
| 63 |
-
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
| 74 |
|
| 75 |
Respond in JSON:
|
| 76 |
{
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
"specular_consistent": true/false,
|
| 80 |
-
"
|
| 81 |
"color_temp_consistent": true/false,
|
| 82 |
"sss_correct": true/false/null,
|
| 83 |
"caustics_correct": true/false/null,
|
| 84 |
"interreflections_ok": true/false/null,
|
| 85 |
-
"anomalies": ["specific anomaly descriptions"],
|
| 86 |
"confidence": 0.0-1.0,
|
| 87 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
|
| 88 |
-
"explanation": "detailed reasoning citing
|
| 89 |
}"""
|
| 90 |
|
| 91 |
-
SYS_ANATOMY = """You are a forensic anatomist and medical illustrator with encyclopedic knowledge of human body structure. AI-generated images violate anatomy in specific, detectable ways.
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
5. HAIR: Consistent direction of growth. No floating strands disconnected from scalp. Hairline follows natural patterns.
|
| 99 |
-
6. EYES: Catchlight reflections should match between eyes and match the lighting direction. Iris has consistent color and pattern. Sclera is white with subtle veins.
|
| 100 |
-
7. CLOTHING/ACCESSORIES: Fabric drapes under gravity. Seams are continuous. Buttons/zippers are physically connected. Jewelry doesn't float.
|
| 101 |
|
| 102 |
-
|
| 103 |
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
-
|
| 107 |
-
1. HAND ANATOMY — Count fingers on each visible hand. Check joint angles, nail placement, proportions.
|
| 108 |
-
2. FACIAL SYMMETRY — Check eye alignment, ear symmetry, nose/mouth centering, teeth.
|
| 109 |
-
3. BODY PROPORTIONS — Check limb ratios, joint positions, head-to-body ratio.
|
| 110 |
-
4. SKIN & TEXTURE — Check pore consistency, wrinkle patterns, texture continuity.
|
| 111 |
-
5. HAIR — Check growth direction, hairline, strand connectivity.
|
| 112 |
-
6. EYE DETAILS — Check catchlights, iris consistency, sclera, eyelash direction.
|
| 113 |
-
7. CLOTHING PHYSICS — Check fabric draping, seam continuity, accessory placement.
|
| 114 |
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
Respond in JSON:
|
| 118 |
{
|
| 119 |
"contains_people": true/false,
|
| 120 |
"hands_correct": true/false/null,
|
| 121 |
-
"finger_count": "e.g. 'Left: 5, Right:
|
| 122 |
"face_symmetric": true/false/null,
|
| 123 |
"proportions_ok": true/false/null,
|
| 124 |
"skin_natural": true/false/null,
|
| 125 |
"hair_natural": true/false/null,
|
| 126 |
"eyes_consistent": true/false/null,
|
| 127 |
"clothing_ok": true/false/null,
|
| 128 |
-
"anomalies": ["specific anatomical errors"],
|
| 129 |
"confidence": 0.0-1.0,
|
| 130 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
|
| 131 |
-
"explanation": "
|
| 132 |
}"""
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
1.
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
Respond in JSON:
|
| 157 |
{
|
| 158 |
-
"
|
| 159 |
"perspective_correct": true/false,
|
| 160 |
"gravity_ok": true/false,
|
| 161 |
"scale_consistent": true/false,
|
| 162 |
"transparency_ok": true/false/null,
|
| 163 |
-
"
|
| 164 |
"motion_ok": true/false/null,
|
| 165 |
"depth_ordering_ok": true/false,
|
| 166 |
-
"anomalies": ["specific physics violations"],
|
| 167 |
"confidence": 0.0-1.0,
|
| 168 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
|
| 169 |
-
"explanation": "
|
| 170 |
}"""
|
| 171 |
|
| 172 |
-
SYS_CONTEXT = """You are a forensic scene analyst who evaluates whether an image's content is contextually plausible. AI-generated images often combine elements that shouldn't coexist.
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
Respond in JSON:
|
| 189 |
{
|
| 190 |
-
"
|
|
|
|
|
|
|
| 191 |
"geographic_consistent": true/false,
|
| 192 |
"weather_consistent": true/false,
|
| 193 |
-
"
|
|
|
|
| 194 |
"objects_functional": true/false,
|
| 195 |
-
"anomalies": ["specific contextual violations"],
|
| 196 |
"confidence": 0.0-1.0,
|
| 197 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
|
| 198 |
-
"explanation": "reasoning"
|
| 199 |
}"""
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
def run_semantic_agent(img):
|
| 202 |
-
findings,scores=[],[]
|
| 203 |
-
vlm_ok=True
|
| 204 |
|
| 205 |
-
for sys_p,usr_p,name,features in [
|
| 206 |
-
(SYS_LIGHTING, USR_LIGHTING, "Lighting Physics",
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
]:
|
| 210 |
try:
|
| 211 |
-
resp=_vlm(img,sys_p,usr_p)
|
| 212 |
if resp and not resp.startswith("VLM_ERROR"):
|
| 213 |
-
parsed=_parse(resp)
|
| 214 |
-
sc=_score(parsed)
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
for feat in features:
|
| 220 |
-
findings.append({"test":feat,"score":sc/len(features),
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
scores.append(sc)
|
| 226 |
else:
|
| 227 |
-
vlm_ok=False
|
| 228 |
for feat in features:
|
| 229 |
-
findings.append({"test":feat,"score":0.0,"note":"VLM unavailable","vlm_error":True})
|
| 230 |
scores.append(0.0)
|
| 231 |
except Exception as e:
|
| 232 |
-
findings.append({"test":name,"error":str(e),"score":0})
|
| 233 |
|
| 234 |
-
# Context plausibility (
|
| 235 |
try:
|
| 236 |
-
resp=_vlm(img,SYS_CONTEXT,USR_CONTEXT)
|
| 237 |
if resp and not resp.startswith("VLM_ERROR"):
|
| 238 |
-
parsed=_parse(resp)
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
for f in findings:
|
| 252 |
-
if f.get("note") and "parent" not in f:
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FORENSIQ — Semantic Consistency Agent (31 features via VLM)
|
| 2 |
+
Uses Qwen2.5-VL-72B with calibrated forensic prompts.
|
| 3 |
+
|
| 4 |
+
Design principles applied from review:
|
| 5 |
+
- Qualitative inconsistency detection, NOT metric estimation from 2D images
|
| 6 |
+
- Explicit phenomenon ownership: Lighting owns illumination, Physics owns geometry/materials
|
| 7 |
+
- Confidence calibration instructions in every prompt
|
| 8 |
+
- Expanded Context prompt (5→8 sub-features)
|
| 9 |
"""
|
| 10 |
import os, base64, io, json, re, numpy as np
|
| 11 |
from PIL import Image
|
|
|
|
| 50 |
if v=="AUTHENTIC": return -0.4
|
| 51 |
return 0.0
|
| 52 |
|
| 53 |
+
# ── Shared calibration instruction appended to every prompt ──────────
|
| 54 |
+
CONFIDENCE_CALIBRATION = """
|
| 55 |
+
|
| 56 |
+
CONFIDENCE CALIBRATION — CRITICAL:
|
| 57 |
+
Your confidence score MUST follow these rules:
|
| 58 |
+
- Default to 0.5 if you are uncertain or the evidence is ambiguous.
|
| 59 |
+
- Only use 0.7+ if you observe an UNAMBIGUOUS, SPECIFIC violation (e.g., a hand with 6 clearly countable fingers, shadows pointing in opposite directions from same light source).
|
| 60 |
+
- Only use 0.3 or below if the image is clearly, unambiguously consistent with reality and you can articulate exactly why.
|
| 61 |
+
- Use 0.4-0.6 for most images. Most images are ambiguous. Do NOT inflate confidence.
|
| 62 |
+
- If a sub-analysis is not applicable (no people, no text, no transparent objects), set that field to null and do NOT let it affect your overall confidence.
|
| 63 |
+
VLMs systematically overstate confidence. Resist this bias. When in doubt, stay near 0.5."""
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
# PROMPT 1: LIGHTING (8 features)
|
| 68 |
+
# Owns: ALL illumination phenomena — shadows, highlights, light color,
|
| 69 |
+
# light transport (SSS, caustics, inter-reflections)
|
| 70 |
+
# Does NOT own: material reflectance (that's Physics), geometry (Physics)
|
| 71 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 72 |
+
|
| 73 |
+
SYS_LIGHTING = """You are a forensic lighting analyst. You detect QUALITATIVE inconsistencies in illumination that indicate AI generation or manipulation. You work from visual appearance, not metric measurement.
|
| 74 |
+
|
| 75 |
+
IMPORTANT: You are analyzing a 2D image. You CANNOT compute exact distances, angles, or irradiance values. Instead, you look for VISIBLE INCONSISTENCIES that would be obvious to a trained observer:
|
| 76 |
+
|
| 77 |
+
Your 8 analysis domains (you OWN these — no other agent covers them):
|
| 78 |
|
| 79 |
+
1. SHADOW DIRECTION: Do shadows from different objects in the scene appear to point toward consistent light source position(s)? Look for shadows that diverge when they should converge, or shadows pointing in incompatible directions. You do NOT need to compute exact angles — just assess whether the overall shadow pattern is self-consistent.
|
| 80 |
|
| 81 |
+
2. SHADOW QUALITY: Are shadow edges (penumbra) consistent with the apparent light source? A small bright light produces hard shadows; overcast sky produces soft shadows. Do ALL shadows in the scene share the same hardness/softness? Mixed hard and soft shadows without explanation (e.g., multiple lights) is suspicious.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
+
3. SPECULAR HIGHLIGHTS: Bright reflections on shiny surfaces encode the light direction. If multiple shiny objects are visible, do their highlights appear to come from the same direction? If a person has catchlights in their eyes, do both eyes show highlights in the same position?
|
| 84 |
|
| 85 |
+
4. AMBIENT OCCLUSION: Where objects meet surfaces (feet on floor, cup on table, book on shelf), there should be subtle darkening at the contact line. AI images frequently omit contact shadows or place them incorrectly. Check: are contact shadows present where objects touch?
|
| 86 |
|
| 87 |
+
5. COLOR TEMPERATURE: Light from a single source should tint all surfaces the same hue. Look for: one side of a face warm-toned while the other is cool-toned without a motivating second light source. Indoor scenes with mixed warm/cool illumination should have visible light sources to explain it.
|
| 88 |
+
|
| 89 |
+
6. SUBSURFACE SCATTERING: If you can see thin body parts (ears, nostrils, fingers between a light) backlit by a strong source, they should glow warm/red from blood beneath the skin. If present, is it consistent with the light direction? If absent when expected, flag it.
|
| 90 |
+
|
| 91 |
+
7. CAUSTICS: If glass, water, or transparent objects are present near a surface, look for projected light patterns. Their absence in a brightly lit scene with transparent objects is mildly suspicious. If caustics ARE visible, do they match the shape and position of the transparent object?
|
| 92 |
+
|
| 93 |
+
8. INTER-REFLECTIONS: Strongly colored surfaces near neutral surfaces should tint them. A red blanket next to a white wall should cast a subtle red tint. Look for color bleeding that's present OR suspiciously absent.""" + CONFIDENCE_CALIBRATION
|
| 94 |
+
|
| 95 |
+
USR_LIGHTING = """Analyze this image for lighting inconsistencies across all 8 domains.
|
| 96 |
+
|
| 97 |
+
For each, give a QUALITATIVE assessment based on what you can visually observe — do NOT attempt to compute metric values like exact angles or irradiance.
|
| 98 |
|
| 99 |
Respond in JSON:
|
| 100 |
{
|
| 101 |
+
"shadow_direction_consistent": true/false,
|
| 102 |
+
"shadow_quality_consistent": true/false,
|
| 103 |
"specular_consistent": true/false,
|
| 104 |
+
"ambient_occlusion_present": true/false,
|
| 105 |
"color_temp_consistent": true/false,
|
| 106 |
"sss_correct": true/false/null,
|
| 107 |
"caustics_correct": true/false/null,
|
| 108 |
"interreflections_ok": true/false/null,
|
| 109 |
+
"anomalies": ["specific anomaly descriptions with image region references"],
|
| 110 |
"confidence": 0.0-1.0,
|
| 111 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
|
| 112 |
+
"explanation": "detailed reasoning citing what you observe, not what you compute"
|
| 113 |
}"""
|
| 114 |
|
|
|
|
| 115 |
|
| 116 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 117 |
+
# PROMPT 2: ANATOMY (7 features)
|
| 118 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 119 |
+
|
| 120 |
+
SYS_ANATOMY = """You are a forensic anatomist. You detect anatomical errors in images that indicate AI generation.
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
+
DETECTION PROTOCOL:
|
| 123 |
|
| 124 |
+
1. HANDS — This is your highest-priority check. Procedure:
|
| 125 |
+
a) Locate every visible hand in the image.
|
| 126 |
+
b) For each hand, COUNT fingers individually: thumb, index, middle, ring, pinky. State the count explicitly.
|
| 127 |
+
c) Verify each finger has correct joint count (thumb: 2 joints, others: 3 joints).
|
| 128 |
+
d) Check that joints bend only in anatomically possible directions.
|
| 129 |
+
e) Verify nails are on the correct (dorsal) side of each finger.
|
| 130 |
+
f) If hands are partially occluded, note what's visible vs. hidden.
|
| 131 |
|
| 132 |
+
2. FACIAL SYMMETRY — Flag asymmetry ONLY if it would be noticeable to a casual observer at normal viewing distance. Natural faces have subtle asymmetry; AI faces often have GROSS asymmetry (one ear significantly higher/larger, one eye noticeably different shape, jawline shifted). Do NOT flag sub-pixel or barely perceptible differences.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
+
3. BODY PROPORTIONS — Check against standard human ratios: head ≈ 1/7.5 of height, elbow at waist, fingertips at mid-thigh. Flag only OBVIOUS violations (forearm twice the length of upper arm, head clearly too large).
|
| 135 |
+
|
| 136 |
+
4. SKIN TEXTURE — Look for abrupt texture changes: one patch of skin with visible pores adjacent to a smooth patch. Check for texture that transitions unnaturally between face regions.
|
| 137 |
+
|
| 138 |
+
5. HAIR — Look for: strands that float disconnected from the scalp, hairline that dissolves into skin without natural transition, inconsistent hair direction (some strands defy gravity without explanation).
|
| 139 |
+
|
| 140 |
+
6. EYE DETAILS — Catchlight reflections must appear in the same relative position in both eyes (same light source). Both irises should have the same color. Eyelashes should radiate outward from the lid margin.
|
| 141 |
+
|
| 142 |
+
7. CLOTHING — Fabric must drape under gravity. Seams must be continuous (not disappearing/reappearing). Buttons must have buttonholes. Jewelry must connect to the body.""" + CONFIDENCE_CALIBRATION
|
| 143 |
+
|
| 144 |
+
USR_ANATOMY = """Perform anatomical forensic analysis.
|
| 145 |
+
|
| 146 |
+
MANDATORY: If hands are visible, explicitly count each finger on each hand. State your count clearly (e.g., "Left hand: thumb, index, middle, ring, pinky = 5 fingers").
|
| 147 |
+
|
| 148 |
+
If NO people are visible, set contains_people=false and skip all other fields.
|
| 149 |
|
| 150 |
Respond in JSON:
|
| 151 |
{
|
| 152 |
"contains_people": true/false,
|
| 153 |
"hands_correct": true/false/null,
|
| 154 |
+
"finger_count": "explicit count per hand, e.g. 'Left: 5 (thumb,index,middle,ring,pinky), Right: not visible'",
|
| 155 |
"face_symmetric": true/false/null,
|
| 156 |
"proportions_ok": true/false/null,
|
| 157 |
"skin_natural": true/false/null,
|
| 158 |
"hair_natural": true/false/null,
|
| 159 |
"eyes_consistent": true/false/null,
|
| 160 |
"clothing_ok": true/false/null,
|
| 161 |
+
"anomalies": ["specific anatomical errors with locations"],
|
| 162 |
"confidence": 0.0-1.0,
|
| 163 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
|
| 164 |
+
"explanation": "reasoning with specific observations — for hands, cite your finger count"
|
| 165 |
}"""
|
| 166 |
|
| 167 |
+
|
| 168 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 169 |
+
# PROMPT 3: PHYSICAL PLAUSIBILITY (8 features)
|
| 170 |
+
# Owns: geometry, material appearance, structural mechanics, object interaction
|
| 171 |
+
# Does NOT own: illumination/shadows (that's Lighting), anatomy (that's Anatomy)
|
| 172 |
+
# Explicit partition from Lighting: this agent checks materials, perspective, and
|
| 173 |
+
# structural physics. It does NOT re-analyze shadows, highlights, or light color.
|
| 174 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 175 |
+
|
| 176 |
+
SYS_PHYSICS = """You are a forensic physicist. You detect violations of geometry, material properties, and structural mechanics in images.
|
| 177 |
+
|
| 178 |
+
SCOPE — You analyze these 8 domains. You do NOT analyze lighting/shadows/specular highlights (a separate Lighting Agent handles those). Focus ONLY on:
|
| 179 |
+
|
| 180 |
+
1. MATERIAL APPEARANCE: Does each material look like what it claims to be? Metals should show environment reflections. Wood should have grain. Fabric should have texture. The SAME material across an image should have consistent appearance. Look for: a "metal" railing that looks like plastic, or glass that doesn't distort the background.
|
| 181 |
+
|
| 182 |
+
2. PERSPECTIVE GEOMETRY: Parallel lines in the real world (edges of buildings, railroad tracks, road markings) must converge to consistent vanishing points. Check for: lines that should be parallel but converge to different points, vertical lines that lean inconsistently.
|
| 183 |
+
|
| 184 |
+
3. GRAVITY & STRUCTURE: Everything must obey gravity. Objects rest on surfaces, don't float. Liquids have flat surfaces. Cantilevered structures need support. Fabric hangs down. Hair falls down (unless wind/motion is depicted). Look for: floating objects, impossible structural loads, upward-flowing fabric.
|
| 185 |
+
|
| 186 |
+
4. SCALE & PROPORTION: Objects with known real-world sizes (people ~1.7m, doors ~2m, cars ~4.5m, chairs ~0.45m seat height) should be proportional to each other. Check for: a person who would be 3m tall next to a door, or a cup the size of a head.
|
| 187 |
+
|
| 188 |
+
5. TRANSPARENCY: Glass transmits and distorts. Water refracts. Transparent objects should show what's behind them, distorted appropriately. Frosted glass blurs. Thick glass distorts more. Check for: glass that's perfectly clear with no distortion, or opaque "glass."
|
| 189 |
+
|
| 190 |
+
6. CONTACT PHYSICS: Where objects rest on soft surfaces, there should be deformation (cushion under person, mattress under object). Where heavy objects rest on surfaces, the surface should show appropriate response.
|
| 191 |
+
|
| 192 |
+
7. MOTION COHERENCE: If motion blur is present, its direction and magnitude should be consistent with the depicted motion. A moving car should have horizontal blur. A falling object should have vertical blur. An image with one object blurred and everything else sharp needs a fast-moving object OR selective focus.
|
| 193 |
+
|
| 194 |
+
8. DEPTH & OCCLUSION: Nearer objects must occlude farther ones consistently. No object should appear to be simultaneously in front of AND behind another object. Occlusion boundaries should be clean (no "melting" edges).""" + CONFIDENCE_CALIBRATION
|
| 195 |
+
|
| 196 |
+
USR_PHYSICS = """Analyze this image for physics violations.
|
| 197 |
+
|
| 198 |
+
SCOPE REMINDER: Do NOT analyze lighting, shadows, or specular highlights — that is handled by a separate agent. Focus on materials, geometry, gravity, scale, transparency, contact, motion, and depth.
|
| 199 |
|
| 200 |
Respond in JSON:
|
| 201 |
{
|
| 202 |
+
"material_consistent": true/false,
|
| 203 |
"perspective_correct": true/false,
|
| 204 |
"gravity_ok": true/false,
|
| 205 |
"scale_consistent": true/false,
|
| 206 |
"transparency_ok": true/false/null,
|
| 207 |
+
"contact_ok": true/false,
|
| 208 |
"motion_ok": true/false/null,
|
| 209 |
"depth_ordering_ok": true/false,
|
| 210 |
+
"anomalies": ["specific physics violations — not lighting"],
|
| 211 |
"confidence": 0.0-1.0,
|
| 212 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
|
| 213 |
+
"explanation": "reasoning focused on geometry and material physics"
|
| 214 |
}"""
|
| 215 |
|
|
|
|
| 216 |
|
| 217 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 218 |
+
# PROMPT 4: CONTEXT PLAUSIBILITY (8 features — expanded from 5)
|
| 219 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 220 |
+
|
| 221 |
+
SYS_CONTEXT = """You are a forensic scene analyst specializing in contextual coherence. AI-generated images often combine elements that could not physically coexist in the same real photograph.
|
| 222 |
+
|
| 223 |
+
Your 8 analysis domains:
|
| 224 |
+
|
| 225 |
+
1. TEMPORAL SEASON: Vegetation, foliage color, and flower blooming must match. Snow on the ground requires bare or evergreen trees. Green deciduous leaves + snow is a contradiction. Clothing should match the apparent season.
|
| 226 |
+
|
| 227 |
+
2. TIME OF DAY: Sky color/brightness must match shadow lengths and lighting direction. A bright blue sky requires short shadows (midday) or long shadows from a specific direction. Stars visible + brightly lit ground is contradictory.
|
| 228 |
+
|
| 229 |
+
3. ERA / TECHNOLOGY ANACHRONISM: Visible technology (phones, cars, screens, signage style) should match the apparent era. A scene with 1950s architecture containing modern smartphones is suspicious. Fashion should match the apparent era of other objects.
|
| 230 |
+
|
| 231 |
+
4. GEOGRAPHIC COHERENCE: Architecture style must match vegetation and climate. Tropical palm trees next to Northern European half-timbered houses is impossible. Road markings should match the apparent country (right-hand vs left-hand traffic, line styles). Visible text/signs should be in the expected language for the geography.
|
| 232 |
|
| 233 |
+
5. WEATHER COHERENCE: Sky conditions must match ground conditions. Wet pavement requires recent rain or overcast sky. Dry dust in the air contradicts standing water. Snow requires freezing conditions (visible breath, winter clothing). Fog obscures distant objects.
|
| 234 |
+
|
| 235 |
+
6. ATTIRE-SETTING MATCH: Beach clothing at a business meeting is impossible (unless clearly a party/casual scene). Winter coats in a tropical setting. Formal wear in a construction zone. Analyze whether clothing choices are plausible for the depicted location and activity.
|
| 236 |
+
|
| 237 |
+
7. SIGN & LABEL COHERENCE: Visible signs, labels, and text should be appropriate for the scene type. A restaurant should show food-related signage. A hospital should show medical signage. Signs in a residential area should show house numbers, street names. Complete absence of expected signage in a commercial area is mildly suspicious.
|
| 238 |
+
|
| 239 |
+
8. OBJECT FUNCTION & ARRANGEMENT: Furniture should be arranged for use (chairs face tables). Appliances should be connected (lamps plugged in, or at least near outlets). Tools should be held or stored correctly. Kitchen items should be in kitchens. Check for: objects that serve no function, impossible arrangements, or items placed where they'd be impractical.""" + CONFIDENCE_CALIBRATION
|
| 240 |
+
|
| 241 |
+
USR_CONTEXT = """Analyze contextual plausibility across all 8 domains:
|
| 242 |
+
1. Temporal/Season — vegetation vs clothing vs weather
|
| 243 |
+
2. Time of Day — sky vs shadows vs lighting
|
| 244 |
+
3. Era/Technology — anachronistic objects
|
| 245 |
+
4. Geographic — architecture vs vegetation vs signage language
|
| 246 |
+
5. Weather — sky vs ground conditions vs attire
|
| 247 |
+
6. Attire-Setting — clothing appropriate for location/activity
|
| 248 |
+
7. Sign/Label Coherence — signage matches scene type
|
| 249 |
+
8. Object Arrangement — functional, plausible placement
|
| 250 |
|
| 251 |
Respond in JSON:
|
| 252 |
{
|
| 253 |
+
"season_consistent": true/false,
|
| 254 |
+
"time_of_day_consistent": true/false,
|
| 255 |
+
"era_consistent": true/false,
|
| 256 |
"geographic_consistent": true/false,
|
| 257 |
"weather_consistent": true/false,
|
| 258 |
+
"attire_setting_match": true/false,
|
| 259 |
+
"signage_coherent": true/false,
|
| 260 |
"objects_functional": true/false,
|
| 261 |
+
"anomalies": ["specific contextual violations with reasoning"],
|
| 262 |
"confidence": 0.0-1.0,
|
| 263 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
|
| 264 |
+
"explanation": "detailed reasoning per domain"
|
| 265 |
}"""
|
| 266 |
|
| 267 |
+
|
| 268 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 269 |
+
# AGENT RUNNER
|
| 270 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 271 |
+
|
| 272 |
+
# VLM confidence temperature — applied before feeding into Bayesian Eq.1
# VLMs systematically overstate confidence; this compresses toward 0.5
VLM_CONFIDENCE_TEMPERATURE = 2.0


def _calibrate_vlm_confidence(raw_conf: float) -> float:
    """Post-process a VLM-reported confidence with temperature scaling.

    The value is mapped through ``sigmoid(logit(p) / T)`` with
    ``T = VLM_CONFIDENCE_TEMPERATURE``, which pulls it toward 0.5 to
    counter VLM overconfidence.

    Args:
        raw_conf: Confidence as reported by the VLM. Expected to be a number
            in [0, 1]; numeric strings such as ``"0.8"`` are accepted because
            the value comes from model-generated JSON.

    Returns:
        The calibrated confidence as a plain ``float``. Returns the neutral
        value 0.5 when ``raw_conf`` is missing, non-numeric, NaN/inf, or
        outside [0, 1].
    """
    neutral = 0.5
    try:
        conf = float(raw_conf)
    except (TypeError, ValueError):
        # JSON null, free text ("high"), lists, ... -> no usable signal.
        return neutral

    if not np.isfinite(conf) or conf < 0.0 or conf > 1.0:
        return neutral

    # Clamp instead of bailing out at exactly 0 or 1: the logit stays finite
    # and the mapping stays monotonic (a reported 1.0 must not rank below a
    # reported 0.9).
    eps = 1e-6
    conf = min(max(conf, eps), 1.0 - eps)

    logit = np.log(conf / (1.0 - conf))
    scaled = logit / VLM_CONFIDENCE_TEMPERATURE
    return float(1.0 / (1.0 + np.exp(-scaled)))
|
| 284 |
+
|
| 285 |
+
|
| 286 |
def _explanation_excerpt(parsed, limit):
    """Return at most ``limit`` characters of the parsed "explanation" field."""
    # The VLM may emit null or a non-string here; that must not abort the run.
    return str(parsed.get("explanation") or "")[:limit]


def _calibrated_or_neutral(raw_conf):
    """Calibrate a VLM confidence, falling back to 0.5 if it is not numeric."""
    try:
        return _calibrate_vlm_confidence(raw_conf)
    except (TypeError, ValueError):
        return 0.5


def run_semantic_agent(img):
    """Run the four VLM prompts on ``img`` and aggregate them into one result.

    The lighting, anatomy and physics prompts are run in a loop, followed by
    the context prompt. For every successful response one finding per
    sub-feature (tagged with a ``"parent"`` key) and one summary finding per
    prompt are recorded. The overall score is the mean of all collected
    scores, and the confidence is derived from its magnitude and reduced when
    any VLM call was unavailable.

    NOTE(review): block structure was reconstructed from a flattened diff
    view; confirm the loop nesting against the real file.

    Args:
        img: The image forwarded unchanged to ``_vlm``.

    Returns:
        An ``AgentEvidence`` built from the agent name, the clipped mean
        score, the confidence, ``0.0``/``0.8`` depending on VLM availability,
        the rationale string, and the findings that carry no ``"parent"`` key.
    """
    findings, scores = [], []
    vlm_ok = True

    prompt_groups = [
        (SYS_LIGHTING, USR_LIGHTING, "Lighting Physics",
         ["Shadow Direction", "Shadow Quality", "Specular Consistency", "Ambient Occlusion",
          "Color Temperature", "Subsurface Scattering", "Caustics", "Inter-reflections"]),
        (SYS_ANATOMY, USR_ANATOMY, "Anatomical Analysis",
         ["Hand Anatomy", "Facial Symmetry", "Body Proportions", "Skin Texture",
          "Hair Consistency", "Eye Details", "Clothing Physics"]),
        (SYS_PHYSICS, USR_PHYSICS, "Physical Plausibility",
         ["Material Appearance", "Perspective Geometry", "Gravity & Structure",
          "Scale & Proportion", "Transparency", "Contact Physics", "Motion Coherence",
          "Depth & Occlusion"]),
    ]

    for sys_p, usr_p, name, features in prompt_groups:
        try:
            resp = _vlm(img, sys_p, usr_p)
            if resp and not resp.startswith("VLM_ERROR"):
                parsed = _parse(resp)
                sc = _score(parsed)

                # Calibrate VLM confidence before storing
                raw_conf = parsed.get("confidence", 0.5)
                cal_conf = _calibrated_or_neutral(raw_conf)

                # Anatomy says nothing about an image without people.
                if name == "Anatomical Analysis" and not parsed.get("contains_people", True):
                    sc = 0.0

                anomalies = parsed.get("anomalies", [])
                per_feature = sc / len(features)
                short_note = _explanation_excerpt(parsed, 100)
                for feat in features:
                    findings.append({"test": feat, "score": per_feature,
                                     "note": short_note, "parent": name})
                    scores.append(per_feature)

                findings.append({"test": name, "vlm_analysis": parsed, "anomalies": anomalies,
                                 "score": sc, "confidence": cal_conf,
                                 "raw_vlm_confidence": raw_conf,
                                 "calibrated_confidence": cal_conf,
                                 "note": _explanation_excerpt(parsed, 200)})
                scores.append(sc)
            else:
                vlm_ok = False
                for feat in features:
                    findings.append({"test": feat, "score": 0.0,
                                     "note": "VLM unavailable", "vlm_error": True})
                    scores.append(0.0)
        except Exception as e:
            findings.append({"test": name, "error": str(e), "score": 0})

    # Context plausibility (expanded to 8 sub-features)
    try:
        resp = _vlm(img, SYS_CONTEXT, USR_CONTEXT)
        if resp and not resp.startswith("VLM_ERROR"):
            parsed = _parse(resp)
            sc = _score(parsed)
            raw_conf = parsed.get("confidence", 0.5)
            cal_conf = _calibrated_or_neutral(raw_conf)

            context_features = ["Season Consistency", "Time-of-Day", "Era/Technology",
                                "Geographic Coherence", "Weather Coherence",
                                "Attire-Setting Match", "Sign/Label Coherence",
                                "Object Arrangement"]
            per_feature = sc / len(context_features)
            short_note = _explanation_excerpt(parsed, 100)
            for feat in context_features:
                findings.append({"test": feat, "score": per_feature,
                                 "note": short_note, "parent": "Context"})
                scores.append(per_feature)

            findings.append({"test": "Context Plausibility", "vlm_analysis": parsed,
                             "score": sc, "confidence": cal_conf,
                             "note": _explanation_excerpt(parsed, 200)})
            scores.append(sc)
        else:
            vlm_ok = False
    except Exception as e:
        # Record the failure the same way the loop above does instead of
        # silently discarding it.
        findings.append({"test": "Context Plausibility", "error": str(e), "score": 0})

    avg = float(np.mean(scores)) if scores else 0.0
    conf = min(1.0, 0.4 + 0.5 * abs(avg))
    if not vlm_ok:
        conf *= 0.3

    # Only summary-level findings (those without a "parent") feed the rationale.
    top_level = [f for f in findings if "parent" not in f]
    viol = [f["test"] for f in top_level if f.get("score", 0) > 0.15]
    comp = [f["test"] for f in top_level if f.get("score", 0) < -0.1]
    if viol:
        rat = f"Semantic violations: {', '.join(viol[:5])}."
    elif comp:
        rat = f"Semantically consistent: {', '.join(comp[:5])}."
    else:
        rat = "Semantic inconclusive."
    for f in top_level:
        if f.get("note"):
            rat += f" [{f['test']}]: {f['note'][:100]}."

    return AgentEvidence("Semantic Consistency Agent", np.clip(avg, -1, 1), conf,
                         0.0 if vlm_ok else 0.8, rat,
                         top_level)
|