"""FORENSIQ — Text & Typography Agent (9 features via VLM)"""
import os, numpy as np
from PIL import Image
from typing import Dict, Any
from agents.optical_agent import AgentEvidence
from agents.semantic_agent import _vlm, _parse, _score, _calibrate_vlm_confidence, CONFIDENCE_CALIBRATION
# System prompt for the VLM: establishes the forensic-examiner persona, the 9
# typographic analysis domains, and extra emphasis on whiteboard/screen/
# handwritten text.  CONFIDENCE_CALIBRATION (shared with the semantic agent)
# is appended so confidence reporting is calibrated consistently across agents.
SYS_TEXT = """You are a forensic typographic examiner and computational linguist. AI-generated images produce text that is visually plausible but linguistically, typographically, or physically impossible.
Your 9 analysis domains:
1. CHARACTER LEGIBILITY: Every character must be a valid glyph from a real writing system. Look for: strokes that start but don't finish, serifs that appear mid-stroke, characters that resemble but aren't any real letter, hybrid glyphs that mix features of different letters.
2. SPELLING & LANGUAGE: All text must form valid words in an identifiable language. Check for: random letter sequences that resemble but aren't real words ("RESTAUNRT", "COFF3E SH0P"), mixed scripts within a single word (Latin + Cyrillic), grammatically impossible combinations.
3. FONT CONSISTENCY: Within a single text element (sign, label, caption), all characters should share the same font family, weight, and style. AI often mixes serif and sans-serif mid-word, varies x-height, or changes stroke width between adjacent characters.
4. KERNING & SPACING: Professional typography has consistent letter-spacing. Look for: irregular gaps between letters, overlapping characters, inconsistent word spacing within a line, characters that are too tightly packed or too spread out relative to the font.
5. BASELINE ALIGNMENT: Characters in a word should sit on a consistent baseline. AI text often has characters that float above or sink below the line, or a baseline that wobbles rather than being straight or following a consistent curve.
6. TEXT PERSPECTIVE: Text on surfaces (signs, buildings, clothing, products) must follow the surface's perspective geometry. Each character should distort consistently with its position on the surface. Text that appears "pasted on" without perspective distortion is suspicious.
7. TEXT-SURFACE INTEGRATION: Text should look like it belongs on its surface. Painted signs show brush texture. Printed text is flat and uniform. Embossed text has consistent lighting/shadow. Neon signs glow. Digital screens emit light. Check that the text rendering matches the apparent medium.
8. CONTEXTUAL APPROPRIATENESS: Signs should contain text appropriate for their context (restaurant β food items, street β street name, storefront β business name). ALSO check for ABSENCE of expected text: a storefront with no name, a book with no title, a commercial product with no label β these absences can indicate AI generation.
9. REFLECTION/SHADOW TEXT: If text is reflected in mirrors, water, or shiny surfaces, the reflection must be geometrically correct (horizontally mirrored). Text shadows must match the text geometry and scene lighting direction. Shadow letters should match the shape of the source letters.
CRITICAL β WHITEBOARD, SCREEN, AND HANDWRITTEN TEXT ANALYSIS:
AI-generated images frequently contain whiteboards, screens, notebooks, or documents with text that looks plausible at first glance but fails under scrutiny. You MUST carefully analyze:
- WHITEBOARD CONTENT: Read every word on visible whiteboards. Check for: repeated headers/phrases, flowcharts with impossible logic loops, bullet points that say the same thing twice, diagrams that don't connect properly, text that trails off into gibberish.
- SCREEN/MONITOR TEXT: Text on computer screens, phones, or tablets. Is it real readable content or visual noise that resembles text?
- HANDWRITTEN TEXT: Handwriting should have natural variation. AI handwriting is often too uniform or has inconsistent stroke pressure.
- DUPLICATED TEXT: If the same phrase appears twice in different locations on a whiteboard or sign, this is a strong AI indicator β real handwriting is never accidentally duplicated.
These are among the STRONGEST tells for AI-generated professional/office images.""" + CONFIDENCE_CALIBRATION
# User prompt for the VLM: requests the full 9-domain typographic sweep and
# pins the JSON response schema that run_text_agent / _parse rely on
# (text_found, expected_text_absent, text_elements, anomalies, confidence,
# verdict, explanation).  Keep the schema keys in sync with the parsing code.
USR_TEXT = """Perform a comprehensive typographic forensic analysis of ALL visible text in this image.
For each text element you find, analyze all 9 domains.
CRITICAL: Pay special attention to WHITEBOARDS, SCREENS, and HANDWRITTEN TEXT. AI-generated images almost always contain subtle text errors on these surfaces β repeated phrases, nonsensical diagrams, duplicated headers, or text that looks coherent from a distance but contains gibberish up close. Read every word carefully.
ALSO check for ABSENCE of expected text: If the scene contains surfaces that would normally have text (storefronts, product labels, book covers, road signs, vehicle plates, clothing brand logos) but they are blank, blurred, or contain gibberish, flag this as suspicious.
If NO text is visible at all, assess whether the scene type would normally contain text. A close-up portrait with no text is normal. A street scene with no signs at all is unusual.
Respond in JSON:
{
"text_found": true/false,
"expected_text_absent": true/false,
"text_elements": [
{
"content": "exact transcription or GIBBERISH if unreadable",
"location": "where in image",
"legible": true/false,
"spelling_ok": true/false,
"font_consistent": true/false,
"kerning_ok": true/false,
"baseline_ok": true/false,
"perspective_ok": true/false,
"surface_integrated": true/false,
"context_appropriate": true/false,
"reflection_ok": true/false/null
}
],
"anomalies": ["specific text anomalies with image region references"],
"confidence": 0.0-1.0,
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
"explanation": "detailed reasoning"
}"""
def run_text_agent(img):
    """Run the Text & Typography forensic agent over *img* via the VLM.

    Sends the image to the vision-language model with the 9-domain
    typographic prompts, parses the JSON verdict, and expands it into one
    finding per feature plus an overall finding.  Three outcomes:

    * no text, none expected  -> every feature neutral (0.0)
    * no text, text expected  -> mild suspicion (absence is an AI tell)
    * text present            -> VLM anomaly score spread across features

    Args:
        img: image to analyse (passed through to ``_vlm``; presumably a
            PIL image — confirm against the semantic agent's contract).

    Returns:
        AgentEvidence whose score is the mean finding score clipped to
        [-1, 1]; confidence is reduced when the VLM is unavailable or the
        image legitimately contains no text.
    """
    findings, scores = [], []
    vlm_ok = True          # VLM responded and was parsed without error
    no_text_ok = False     # scene has no text AND none was expected (agent N/A)
    FEATURES = ["Character Legibility", "Spelling Validity", "Font Consistency",
                "Kerning & Spacing", "Baseline Alignment", "Text Perspective",
                "Surface Integration", "Context Appropriateness",
                "Reflection/Shadow Text"]
    try:
        resp = _vlm(img, SYS_TEXT, USR_TEXT)
        if resp and not resp.startswith("VLM_ERROR"):
            parsed = _parse(resp)
            raw_conf = parsed.get("confidence", 0.5)
            cal_conf = _calibrate_vlm_confidence(raw_conf)
            no_text = (not parsed.get("text_found", False)
                       or parsed.get("verdict") == "NO_TEXT")
            expected_absent = parsed.get("expected_text_absent", False)
            if no_text and not expected_absent:
                # Genuinely no text and none expected — neutral on all features.
                no_text_ok = True
                for feat in FEATURES:
                    findings.append({"test": feat, "score": 0.0,
                                     "note": "No text in image"})
                    scores.append(0.0)
            elif no_text and expected_absent:
                # Text SHOULD be there but isn't — mildly suspicious.
                for feat in FEATURES:
                    feat_score = 0.1 if feat == "Context Appropriateness" else 0.0
                    findings.append({"test": feat, "score": feat_score,
                                     "note": "Expected text absent from scene"})
                    scores.append(feat_score)
                findings.append({"test": "Expected Text Absent", "score": 0.3,
                                 "note": "Scene should contain text but doesn't β AI indicator"})
                scores.append(0.3)
            else:
                # Text present: spread the overall anomaly score evenly across
                # the 9 features, then record the full VLM analysis once.
                sc = _score(parsed)
                elements = parsed.get("text_elements", [])
                anomalies = parsed.get("anomalies", [])
                for feat in FEATURES:
                    feat_score = sc / len(FEATURES)
                    findings.append({"test": feat, "score": feat_score,
                                     "note": parsed.get("explanation", "")[:100]})
                    scores.append(feat_score)
                findings.append({"test": "Text Overall", "vlm_analysis": parsed,
                                 "text_elements": elements, "anomalies": anomalies,
                                 "score": sc, "confidence": cal_conf,
                                 "raw_vlm_confidence": raw_conf,
                                 "note": parsed.get("explanation", "")[:200]})
                scores.append(sc)
        else:
            vlm_ok = False
            for feat in FEATURES:
                findings.append({"test": feat, "score": 0.0,
                                 "note": "VLM unavailable", "vlm_error": True})
                scores.append(0.0)
    except Exception as e:
        # BUGFIX: vlm_ok previously stayed True here, so a crashed analysis was
        # reported as "no text visible" instead of a VLM failure.
        vlm_ok = False
        findings.append({"test": "Text Analysis", "error": str(e), "score": 0})
    avg = float(np.mean(scores)) if scores else 0.0
    conf = min(1.0, 0.4 + 0.5 * abs(avg))
    if not vlm_ok:
        conf *= 0.3
    # BUGFIX: the rationale branch previously re-derived "text found" via
    # f.get("text_found") on the finding dicts, which never carry that key
    # (the parsed result lives under "vlm_analysis") — so EVERY analysed image
    # was reported as "No text visible" at confidence 0.1, discarding the
    # anomaly rationale.  It also swallowed the expected-text-absent suspicion.
    # We now use explicit flags set while parsing.
    if not vlm_ok:
        rat = "VLM unavailable. Text analysis skipped."
    elif no_text_ok:
        rat = "No text visible in image. Text agent not applicable."
        conf = 0.1
    else:
        viol = [f["test"] for f in findings if f.get("score", 0) > 0.1]
        rat = f"Text anomalies: {', '.join(viol[:5])}." if viol else "Text appears legitimate."
        for f in findings:
            if f.get("note") and "vlm_analysis" not in f:
                rat += f" {f['note'][:80]}"
    # Third positional arg is the "VLM failure" indicator consumed downstream;
    # the full parsed analysis is excluded from the returned finding list.
    return AgentEvidence("Text & Typography Agent", np.clip(avg, -1, 1), conf,
                         0.0 if vlm_ok else 0.9, rat,
                         [f for f in findings if "vlm_analysis" not in f])