anky2002 committed on
Commit
ada1738
·
verified ·
1 Parent(s): 2f3f5e8

Upload agents/semantic_agent.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. agents/semantic_agent.py +358 -0
agents/semantic_agent.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FORENSIQ — Semantic Consistency Agent (VLM-powered)
3
+ Uses Qwen2.5-VL via HF Inference to evaluate:
4
+ - Lighting consistency (shadow convergence, inverse square law)
5
+ - Material properties (BRDF anomalies, reflectance)
6
+ - Anatomical errors (finger count, joint angles, facial symmetry)
7
+ - Physical plausibility (gravity, perspective, scale)
8
+ """
9
+
10
+ import os
11
+ import base64
12
+ import io
13
+ import json
14
+ import re
15
+ import numpy as np
16
+ from PIL import Image
17
+ from typing import Dict, Any, Optional
18
+ from dataclasses import dataclass
19
+
20
+ from agents.optical_agent import AgentEvidence
21
+
22
+ # ─── VLM Interface ───────────────────────────────────────────────────
23
+
24
def _encode_image_b64(img: Image.Image, max_size: int = 1024) -> str:
    """Encode a PIL image as a base64 JPEG string for API submission.

    Args:
        img: Image to encode.
        max_size: Upper bound, in pixels, for the longer side. Larger images
            are downscaled with LANCZOS resampling, keeping the aspect ratio.

    Returns:
        The image saved as an RGB JPEG (quality 90), base64-encoded and
        decoded to a ``str``.
    """
    w, h = img.size
    longest = max(w, h)
    if longest > max_size:
        ratio = max_size / longest
        # Clamp each side to at least 1px: for extreme aspect ratios
        # (e.g. 5000x1) int() truncates the short side to 0, which
        # Image.resize rejects with ValueError.
        new_size = (max(1, int(w * ratio)), max(1, int(h * ratio)))
        img = img.resize(new_size, Image.LANCZOS)
    buf = io.BytesIO()
    # JPEG has no alpha/palette support, so normalise the mode first.
    img.convert("RGB").save(buf, format="JPEG", quality=90)
    return base64.b64encode(buf.getvalue()).decode("utf-8")
34
+
35
+
36
def _call_vlm(
    img: Image.Image,
    system_prompt: str,
    user_prompt: str,
    model: str = "Qwen/Qwen2.5-VL-72B-Instruct",
    timeout: float = 120.0,
) -> Optional[str]:
    """Send an image plus prompts to a Qwen2.5-VL model via the HF router.

    The request goes through the OpenAI-compatible endpoint at
    ``https://router.huggingface.co/v1`` and is authenticated with the
    ``HF_TOKEN`` environment variable.

    Args:
        img: Image to analyse; it is JPEG/base64 encoded before submission.
        system_prompt: Content of the ``system`` message.
        user_prompt: Text part of the ``user`` message (sent after the image).
        model: Model identifier passed to the chat-completions call.
        timeout: Per-request timeout in seconds handed to the OpenAI client.

    Returns:
        ``None`` when the ``openai`` package is not installed or ``HF_TOKEN``
        is unset; a string starting with ``"VLM_ERROR"`` when the request
        fails or the model returns no text; otherwise the model's reply.
    """
    try:
        from openai import OpenAI
    except ImportError:
        return None

    token = os.environ.get("HF_TOKEN", "")
    if not token:
        return None

    try:
        client = OpenAI(
            base_url="https://router.huggingface.co/v1",
            api_key=token,
            timeout=timeout,
        )

        b64 = _encode_image_b64(img)

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                        },
                        {"type": "text", "text": user_prompt},
                    ],
                },
            ],
            max_tokens=1500,
            temperature=0.1,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best-effort boundary: callers detect failures via the string prefix.
        return f"VLM_ERROR: {str(e)}"

    if not content:
        # A None/empty completion would otherwise be indistinguishable from
        # the "no token" case in the callers' error messages.
        return "VLM_ERROR: empty response from model"
    return content
76
+
77
+
78
+ def _parse_vlm_json(text: str) -> Dict[str, Any]:
79
+ """Extract JSON from VLM response (handles markdown code blocks)."""
80
+ if text is None:
81
+ return {}
82
+ # Try to find JSON block
83
+ json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
84
+ if json_match:
85
+ try:
86
+ return json.loads(json_match.group(1))
87
+ except json.JSONDecodeError:
88
+ pass
89
+ # Try direct parse
90
+ try:
91
+ return json.loads(text)
92
+ except json.JSONDecodeError:
93
+ pass
94
+ # Try to find any {...} block
95
+ brace_match = re.search(r'\{[^{}]*\}', text, re.DOTALL)
96
+ if brace_match:
97
+ try:
98
+ return json.loads(brace_match.group(0))
99
+ except json.JSONDecodeError:
100
+ pass
101
+ return {"raw_response": text}
102
+
103
+
104
+ # ─── Lighting Consistency ────────────────────────────────────────────
105
+
106
# System message for the lighting test; passed to _call_vlm by analyze_lighting.
LIGHTING_SYSTEM_PROMPT = """You are an expert forensic image analyst specializing in lighting physics and photogrammetry. Your task is to analyze images for lighting consistency violations that indicate AI generation or manipulation.

You understand:
- Shadow direction convergence (all shadows must trace back to consistent light source positions)
- Inverse square law (light intensity falls off as 1/r²)
- Specular highlight placement (must be consistent with light source direction)
- Ambient vs direct lighting ratios
- Multiple light source scenarios
- Reflection consistency in eyes, glasses, and shiny surfaces

Be precise, clinical, and evidence-based. Cite specific image regions when noting anomalies."""
117
+
118
# User message for the lighting test; asks for a JSON reply whose "verdict",
# "anomalies", "confidence" and "explanation" fields analyze_lighting reads.
LIGHTING_USER_PROMPT = """Analyze this image for lighting consistency. Examine:
1. Shadow directions — do all shadows point to consistent light source(s)?
2. Shadow softness — is it consistent with the apparent light source distance?
3. Specular highlights — are reflections in eyes, skin, and objects consistent?
4. Light falloff — does brightness decrease naturally with distance from light?
5. Ambient lighting — is the ambient-to-direct ratio physically plausible?

Respond in JSON format:
{
"lighting_consistent": true/false,
"shadow_direction_consistent": true/false,
"specular_highlights_consistent": true/false,
"light_falloff_natural": true/false,
"anomalies": ["list of specific anomalies found, empty if none"],
"confidence": 0.0-1.0,
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
"explanation": "detailed reasoning"
}"""
136
+
137
+
138
def analyze_lighting(img: Image.Image) -> Dict[str, Any]:
    """Ask the VLM whether the image's lighting is physically consistent.

    Args:
        img: Image to analyse.

    Returns:
        A finding dict with ``"test"``, ``"score"`` and ``"note"``. On success
        it also carries ``"vlm_analysis"``, ``"anomalies"`` and
        ``"confidence"``; when the VLM call fails it carries
        ``"vlm_error": True`` and a score of 0.0. The score is 0.7 for
        MANIPULATED, 0.4 for SUSPICIOUS, -0.4 for AUTHENTIC, else 0.0.
    """
    response = _call_vlm(img, LIGHTING_SYSTEM_PROMPT, LIGHTING_USER_PROMPT)
    if not response or response.startswith("VLM_ERROR"):
        # _call_vlm returns None both for a missing token and a missing
        # openai package, so the message names both possibilities.
        reason = response or "HF_TOKEN not set or openai package missing"
        return {
            "test": "Lighting Consistency",
            "score": 0.0,
            "note": f"VLM unavailable: {reason}",
            "vlm_error": True,
        }

    parsed = _parse_vlm_json(response)
    if not isinstance(parsed, dict):
        # Valid-but-non-object JSON (e.g. a list) has no fields to read.
        parsed = {"raw_response": response}

    # Models do not always honour the requested casing ("Authentic", " suspicious ").
    verdict = str(parsed.get("verdict", "UNKNOWN")).strip().upper()
    verdict_scores = {"MANIPULATED": 0.7, "SUSPICIOUS": 0.4, "AUTHENTIC": -0.4}

    return {
        "test": "Lighting Consistency",
        "vlm_analysis": parsed,
        "anomalies": parsed.get("anomalies", []),
        "score": verdict_scores.get(verdict, 0.0),
        "confidence": parsed.get("confidence", 0.5),
        "note": parsed.get("explanation", response[:200]),
    }
170
+
171
+
172
+ # ─── Anatomical Analysis ────────────────────────────────────────────
173
+
174
# System message for the anatomy test; passed to _call_vlm by analyze_anatomy.
ANATOMY_SYSTEM_PROMPT = """You are an expert forensic analyst specializing in human anatomy verification in images. AI-generated images frequently contain anatomical errors that are physically impossible.

You have encyclopedic knowledge of:
- Hand anatomy: finger count (exactly 5 per hand), joint bending directions, nail placement, proportions
- Facial anatomy: bilateral symmetry, ear alignment, eye spacing, teeth regularity
- Body proportions: limb ratios, joint angles, skeletal plausibility
- Skin texture: pore consistency, wrinkle patterns, hair follicle distribution
- Clothing physics: fabric draping, seam continuity, button alignment

AI-generated images commonly fail on: extra/missing fingers, impossible joint angles, asymmetric ears, teeth anomalies, melted/merged body parts, clothing that defies physics."""
184
+
185
# User message for the anatomy test; asks for a JSON reply whose
# "contains_people", "verdict", "anomalies", "confidence" and "explanation"
# fields analyze_anatomy reads.
ANATOMY_USER_PROMPT = """Carefully examine this image for anatomical correctness. Check:
1. Hands: Count fingers on each visible hand. Check joint angles and proportions.
2. Face: Check bilateral symmetry, ear alignment, eye consistency, teeth.
3. Body: Check limb proportions, joint angles, body part connections.
4. Skin/Hair: Check texture consistency, pore patterns, hairline.
5. Clothing: Check seam continuity, fabric physics, accessory consistency.

Respond in JSON format:
{
"contains_people": true/false,
"finger_count_correct": true/false/null,
"facial_symmetry_ok": true/false/null,
"body_proportions_ok": true/false/null,
"skin_texture_natural": true/false/null,
"clothing_physics_ok": true/false/null,
"anomalies": ["list of specific anatomical errors found"],
"confidence": 0.0-1.0,
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
"explanation": "detailed reasoning with specific observations"
}"""
205
+
206
+
207
def analyze_anatomy(img: Image.Image) -> Dict[str, Any]:
    """Ask the VLM to check the image for anatomical errors.

    Args:
        img: Image to analyse.

    Returns:
        A finding dict with ``"test"``, ``"score"`` and ``"note"``. When the
        model reports that the image contains no people the score is 0.0 and
        only ``"vlm_analysis"`` is added. Otherwise, on success it also
        carries ``"vlm_analysis"``, ``"anomalies"`` and ``"confidence"``, with
        a score of 0.8 for MANIPULATED, 0.4 for SUSPICIOUS, -0.4 for
        AUTHENTIC, else 0.0. When the VLM call fails it carries
        ``"vlm_error": True`` and a score of 0.0.
    """
    response = _call_vlm(img, ANATOMY_SYSTEM_PROMPT, ANATOMY_USER_PROMPT)
    if not response or response.startswith("VLM_ERROR"):
        # _call_vlm returns None both for a missing token and a missing
        # openai package, so the message names both possibilities.
        reason = response or "HF_TOKEN not set or openai package missing"
        return {
            "test": "Anatomical Analysis",
            "score": 0.0,
            "note": f"VLM unavailable: {reason}",
            "vlm_error": True,
        }

    parsed = _parse_vlm_json(response)
    if not isinstance(parsed, dict):
        # Valid-but-non-object JSON (e.g. a list) has no fields to read.
        parsed = {"raw_response": response}

    # A missing key defaults to True so an unparseable reply is not
    # mistaken for "no people".
    if not parsed.get("contains_people", True):
        return {
            "test": "Anatomical Analysis",
            "score": 0.0,
            "note": "No people detected in image — anatomical analysis not applicable",
            "vlm_analysis": parsed,
        }

    # Models do not always honour the requested casing ("Authentic", " suspicious ").
    verdict = str(parsed.get("verdict", "UNKNOWN")).strip().upper()
    verdict_scores = {"MANIPULATED": 0.8, "SUSPICIOUS": 0.4, "AUTHENTIC": -0.4}

    return {
        "test": "Anatomical Analysis",
        "vlm_analysis": parsed,
        "anomalies": parsed.get("anomalies", []),
        "score": verdict_scores.get(verdict, 0.0),
        "confidence": parsed.get("confidence", 0.5),
        "note": parsed.get("explanation", response[:200]),
    }
247
+
248
+
249
+ # ─── Material / Physics Plausibility ────────────────────────────────
250
+
251
# System message for the physics test; passed to _call_vlm by analyze_physics.
PHYSICS_SYSTEM_PROMPT = """You are an expert forensic physicist who analyzes images for violations of physical laws. AI-generated images often violate basic physics because generative models learn visual patterns without understanding underlying physics.

Your expertise covers:
- Material reflectance: metals should reflect surroundings, glass should refract, matte surfaces shouldn't have specular highlights
- BRDF consistency: bidirectional reflectance should be consistent across the same material
- Gravity and structural physics: objects should rest on surfaces, liquids should be level, structures should be load-bearing
- Perspective geometry: parallel lines should converge to consistent vanishing points
- Scale consistency: known objects should be proportional to each other
- Transparency/refraction: glass, water, and transparent objects should distort backgrounds correctly"""
260
+
261
# User message for the physics test; asks for a JSON reply whose "verdict",
# "anomalies", "confidence" and "explanation" fields analyze_physics reads.
PHYSICS_USER_PROMPT = """Analyze this image for physical plausibility violations:
1. Material properties: Are reflections, textures, and surface properties physically correct?
2. Perspective: Do parallel lines converge to consistent vanishing points?
3. Scale: Are objects proportional to each other and known references?
4. Gravity: Do objects rest naturally? Are liquids level? Do fabrics drape correctly?
5. Transparency: Do glass, water, or transparent objects refract/distort correctly?

Respond in JSON format:
{
"materials_consistent": true/false,
"perspective_correct": true/false,
"scale_consistent": true/false,
"gravity_plausible": true/false,
"anomalies": ["list of specific physics violations"],
"confidence": 0.0-1.0,
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED",
"explanation": "detailed reasoning"
}"""
279
+
280
+
281
def analyze_physics(img: Image.Image) -> Dict[str, Any]:
    """Ask the VLM whether the image is physically plausible.

    Args:
        img: Image to analyse.

    Returns:
        A finding dict with ``"test"``, ``"score"`` and ``"note"``. On success
        it also carries ``"vlm_analysis"``, ``"anomalies"`` and
        ``"confidence"``; when the VLM call fails it carries
        ``"vlm_error": True`` and a score of 0.0. The score is 0.6 for
        MANIPULATED, 0.3 for SUSPICIOUS, -0.4 for AUTHENTIC, else 0.0.
    """
    response = _call_vlm(img, PHYSICS_SYSTEM_PROMPT, PHYSICS_USER_PROMPT)
    if not response or response.startswith("VLM_ERROR"):
        # _call_vlm returns None both for a missing token and a missing
        # openai package, so the message names both possibilities.
        reason = response or "HF_TOKEN not set or openai package missing"
        return {
            "test": "Physical Plausibility",
            "score": 0.0,
            "note": f"VLM unavailable: {reason}",
            "vlm_error": True,
        }

    parsed = _parse_vlm_json(response)
    if not isinstance(parsed, dict):
        # Valid-but-non-object JSON (e.g. a list) has no fields to read.
        parsed = {"raw_response": response}

    # Models do not always honour the requested casing ("Authentic", " suspicious ").
    verdict = str(parsed.get("verdict", "UNKNOWN")).strip().upper()
    verdict_scores = {"MANIPULATED": 0.6, "SUSPICIOUS": 0.3, "AUTHENTIC": -0.4}

    return {
        "test": "Physical Plausibility",
        "vlm_analysis": parsed,
        "anomalies": parsed.get("anomalies", []),
        "score": verdict_scores.get(verdict, 0.0),
        "confidence": parsed.get("confidence", 0.5),
        "note": parsed.get("explanation", response[:200]),
    }
312
+
313
+
314
+ # ─── Main Agent Entry Point ─────────────────────────────────────────
315
def run_semantic_agent(img: Image.Image) -> AgentEvidence:
    """Run all semantic consistency tests via VLM and aggregate the results.

    Runs the lighting, anatomy and physics tests in turn, averages their
    scores, and builds a textual rationale from the per-test notes.

    Args:
        img: Image to analyse.

    Returns:
        An ``AgentEvidence`` whose ``violation_score`` is the mean sub-test
        score clipped to [-1, 1], whose ``confidence`` is
        ``min(1.0, 0.4 + 0.5 * |mean|)`` (scaled by 0.3 if any sub-test
        failed), and whose ``failure_prob`` is 0.8 if any sub-test failed,
        else 0.0. ``sub_findings`` holds one dict per sub-test.
    """
    findings = []
    scores = []
    all_tests_ran = True

    for fn in (analyze_lighting, analyze_anatomy, analyze_physics):
        try:
            result = fn(img)
            score = result["score"]
        except Exception as e:
            # A crashed sub-test is treated like an unavailable VLM: it
            # contributes a neutral score (matching the 0 recorded in its
            # finding) and marks the run as degraded, rather than silently
            # dropping out of the average with failure_prob left at 0.
            findings.append({"test": fn.__name__, "error": str(e), "score": 0.0})
            scores.append(0.0)
            all_tests_ran = False
            continue

        findings.append(result)
        scores.append(score)
        if result.get("vlm_error"):
            all_tests_ran = False

    avg_score = float(np.mean(scores)) if scores else 0.0
    confidence = min(1.0, 0.4 + 0.5 * abs(avg_score))

    if not all_tests_ran:
        confidence *= 0.3  # Low confidence without VLM

    violations = [f["test"] for f in findings if f.get("score", 0) > 0.2]
    compliant = [f["test"] for f in findings if f.get("score", 0) < -0.1]

    if violations:
        rationale = f"Semantic violations detected: {', '.join(violations)}."
    elif compliant:
        rationale = f"Semantic consistency confirmed: {', '.join(compliant)}."
    else:
        rationale = "Semantic analysis inconclusive."

    for f in findings:
        note = f.get("note")
        if note:
            # The note comes from model output and may not be a string.
            rationale += f" [{f['test']}]: {str(note)[:150]}."

    return AgentEvidence(
        agent_name="Semantic Consistency Agent",
        violation_score=float(np.clip(avg_score, -1, 1)),
        confidence=confidence,
        failure_prob=0.0 if all_tests_ran else 0.8,
        rationale=rationale,
        sub_findings=findings,
    )