Upload agents/text_agent.py with huggingface_hub
Browse files- agents/text_agent.py +40 -31
agents/text_agent.py
CHANGED
|
@@ -3,53 +3,45 @@ import os, numpy as np
|
|
| 3 |
from PIL import Image
|
| 4 |
from typing import Dict, Any
|
| 5 |
from agents.optical_agent import AgentEvidence
|
| 6 |
-
from agents.semantic_agent import _vlm, _parse, _score
|
| 7 |
|
| 8 |
-
SYS_TEXT = """You are a
|
| 9 |
|
| 10 |
-
Your
|
| 11 |
|
| 12 |
-
1. CHARACTER LEGIBILITY: Every character must be a valid glyph from a real writing system. Look for
|
| 13 |
|
| 14 |
-
2. SPELLING & LANGUAGE: All text must form valid words in an identifiable language. Check for: random letter sequences that resemble but aren't real words, mixed scripts within a single word (Latin + Cyrillic), grammatically impossible combinations.
|
| 15 |
|
| 16 |
-
3. FONT CONSISTENCY: Within a single text element (sign, label, caption), all characters should share the same font family, weight, and style. AI often mixes serif and sans-serif, or
|
| 17 |
|
| 18 |
-
4. KERNING & SPACING: Professional typography has consistent letter-spacing. Look for: irregular gaps between letters, overlapping characters, inconsistent word spacing within a line.
|
| 19 |
|
| 20 |
-
5. BASELINE ALIGNMENT: Characters in a word should sit on a consistent baseline. AI text often has characters that float above or sink below the line.
|
| 21 |
|
| 22 |
-
6. TEXT PERSPECTIVE: Text on surfaces (signs, buildings, clothing) must follow the surface's perspective geometry.
|
| 23 |
|
| 24 |
-
7. TEXT-SURFACE INTEGRATION: Text should look like it belongs on its surface
|
| 25 |
|
| 26 |
-
8. CONTEXTUAL APPROPRIATENESS: Signs should contain text appropriate for their context (
|
| 27 |
|
| 28 |
-
9. REFLECTION/SHADOW TEXT: If text is reflected in
|
| 29 |
-
|
| 30 |
-
Be meticulous. AI text errors are often subtle — a letter that's "almost" right but has an extra stroke, or text that's "almost" English but contains no real words."""
|
| 31 |
|
| 32 |
USR_TEXT = """Perform a comprehensive typographic forensic analysis of ALL visible text in this image.
|
| 33 |
|
| 34 |
-
For each text element you find, analyze all 9 domains
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
3. Font Consistency — same font throughout each text element?
|
| 38 |
-
4. Kerning & Spacing — consistent letter/word spacing?
|
| 39 |
-
5. Baseline Alignment — characters on consistent baseline?
|
| 40 |
-
6. Text Perspective — follows surface geometry?
|
| 41 |
-
7. Surface Integration — text looks like it belongs on surface?
|
| 42 |
-
8. Context — text is appropriate for the scene?
|
| 43 |
-
9. Reflections/Shadows — any reflected/shadowed text is geometrically correct?
|
| 44 |
|
| 45 |
-
If NO text is visible,
|
| 46 |
|
| 47 |
Respond in JSON:
|
| 48 |
{
|
| 49 |
"text_found": true/false,
|
|
|
|
| 50 |
"text_elements": [
|
| 51 |
{
|
| 52 |
-
"content": "transcription or GIBBERISH",
|
| 53 |
"location": "where in image",
|
| 54 |
"legible": true/false,
|
| 55 |
"spelling_ok": true/false,
|
|
@@ -62,7 +54,7 @@ Respond in JSON:
|
|
| 62 |
"reflection_ok": true/false/null
|
| 63 |
}
|
| 64 |
],
|
| 65 |
-
"anomalies": ["specific text anomalies
|
| 66 |
"confidence": 0.0-1.0,
|
| 67 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
|
| 68 |
"explanation": "detailed reasoning"
|
|
@@ -80,25 +72,42 @@ def run_text_agent(img):
|
|
| 80 |
resp = _vlm(img, SYS_TEXT, USR_TEXT)
|
| 81 |
if resp and not resp.startswith("VLM_ERROR"):
|
| 82 |
parsed = _parse(resp)
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
for feat in FEATURES:
|
| 85 |
findings.append({"test": feat, "score": 0.0, "note": "No text in image"})
|
| 86 |
scores.append(0.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
else:
|
| 88 |
sc = _score(parsed)
|
| 89 |
elements = parsed.get("text_elements", [])
|
| 90 |
anomalies = parsed.get("anomalies", [])
|
| 91 |
|
| 92 |
-
# Per-feature scoring based on element analysis
|
| 93 |
for feat in FEATURES:
|
| 94 |
feat_score = sc / len(FEATURES)
|
| 95 |
findings.append({"test": feat, "score": feat_score,
|
| 96 |
"note": parsed.get("explanation", "")[:100]})
|
| 97 |
scores.append(feat_score)
|
| 98 |
|
| 99 |
-
findings.append({"test": "Text
|
| 100 |
"text_elements": elements, "anomalies": anomalies,
|
| 101 |
-
"score": sc, "confidence":
|
|
|
|
| 102 |
"note": parsed.get("explanation", "")[:200]})
|
| 103 |
scores.append(sc)
|
| 104 |
else:
|
|
|
|
| 3 |
from PIL import Image
|
| 4 |
from typing import Dict, Any
|
| 5 |
from agents.optical_agent import AgentEvidence
|
| 6 |
+
from agents.semantic_agent import _vlm, _parse, _score, _calibrate_vlm_confidence, CONFIDENCE_CALIBRATION
|
| 7 |
|
| 8 |
+
SYS_TEXT = """You are a forensic typographic examiner and computational linguist. AI-generated images produce text that is visually plausible but linguistically, typographically, or physically impossible.
|
| 9 |
|
| 10 |
+
Your 9 analysis domains:
|
| 11 |
|
| 12 |
+
1. CHARACTER LEGIBILITY: Every character must be a valid glyph from a real writing system. Look for: strokes that start but don't finish, serifs that appear mid-stroke, characters that resemble but aren't any real letter, hybrid glyphs that mix features of different letters.
|
| 13 |
|
| 14 |
+
2. SPELLING & LANGUAGE: All text must form valid words in an identifiable language. Check for: random letter sequences that resemble but aren't real words ("RESTAUNRT", "COFF3E SH0P"), mixed scripts within a single word (Latin + Cyrillic), grammatically impossible combinations.
|
| 15 |
|
| 16 |
+
3. FONT CONSISTENCY: Within a single text element (sign, label, caption), all characters should share the same font family, weight, and style. AI often mixes serif and sans-serif mid-word, varies x-height, or changes stroke width between adjacent characters.
|
| 17 |
|
| 18 |
+
4. KERNING & SPACING: Professional typography has consistent letter-spacing. Look for: irregular gaps between letters, overlapping characters, inconsistent word spacing within a line, characters that are too tightly packed or too spread out relative to the font.
|
| 19 |
|
| 20 |
+
5. BASELINE ALIGNMENT: Characters in a word should sit on a consistent baseline. AI text often has characters that float above or sink below the line, or a baseline that wobbles rather than being straight or following a consistent curve.
|
| 21 |
|
| 22 |
+
6. TEXT PERSPECTIVE: Text on surfaces (signs, buildings, clothing, products) must follow the surface's perspective geometry. Each character should distort consistently with its position on the surface. Text that appears "pasted on" without perspective distortion is suspicious.
|
| 23 |
|
| 24 |
+
7. TEXT-SURFACE INTEGRATION: Text should look like it belongs on its surface. Painted signs show brush texture. Printed text is flat and uniform. Embossed text has consistent lighting/shadow. Neon signs glow. Digital screens emit light. Check that the text rendering matches the apparent medium.
|
| 25 |
|
| 26 |
+
8. CONTEXTUAL APPROPRIATENESS: Signs should contain text appropriate for their context (restaurant → food items, street → street name, storefront → business name). ALSO check for ABSENCE of expected text: a storefront with no name, a book with no title, a commercial product with no label — these absences can indicate AI generation.
|
| 27 |
|
| 28 |
+
9. REFLECTION/SHADOW TEXT: If text is reflected in mirrors, water, or shiny surfaces, the reflection must be geometrically correct (horizontally mirrored). Text shadows must match the text geometry and scene lighting direction. Shadow letters should match the shape of the source letters.""" + CONFIDENCE_CALIBRATION
|
|
|
|
|
|
|
| 29 |
|
| 30 |
USR_TEXT = """Perform a comprehensive typographic forensic analysis of ALL visible text in this image.
|
| 31 |
|
| 32 |
+
For each text element you find, analyze all 9 domains.
|
| 33 |
+
|
| 34 |
+
ALSO check for ABSENCE of expected text: If the scene contains surfaces that would normally have text (storefronts, product labels, book covers, road signs, vehicle plates, clothing brand logos) but they are blank, blurred, or contain gibberish, flag this as suspicious.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
If NO text is visible at all, assess whether the scene type would normally contain text. A close-up portrait with no text is normal. A street scene with no signs at all is unusual.
|
| 37 |
|
| 38 |
Respond in JSON:
|
| 39 |
{
|
| 40 |
"text_found": true/false,
|
| 41 |
+
"expected_text_absent": true/false,
|
| 42 |
"text_elements": [
|
| 43 |
{
|
| 44 |
+
"content": "exact transcription or GIBBERISH if unreadable",
|
| 45 |
"location": "where in image",
|
| 46 |
"legible": true/false,
|
| 47 |
"spelling_ok": true/false,
|
|
|
|
| 54 |
"reflection_ok": true/false/null
|
| 55 |
}
|
| 56 |
],
|
| 57 |
+
"anomalies": ["specific text anomalies with image region references"],
|
| 58 |
"confidence": 0.0-1.0,
|
| 59 |
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
|
| 60 |
"explanation": "detailed reasoning"
|
|
|
|
| 72 |
resp = _vlm(img, SYS_TEXT, USR_TEXT)
|
| 73 |
if resp and not resp.startswith("VLM_ERROR"):
|
| 74 |
parsed = _parse(resp)
|
| 75 |
+
raw_conf = parsed.get("confidence", 0.5)
|
| 76 |
+
cal_conf = _calibrate_vlm_confidence(raw_conf)
|
| 77 |
+
|
| 78 |
+
no_text = not parsed.get("text_found", False) or parsed.get("verdict") == "NO_TEXT"
|
| 79 |
+
expected_absent = parsed.get("expected_text_absent", False)
|
| 80 |
+
|
| 81 |
+
if no_text and not expected_absent:
|
| 82 |
+
# Genuinely no text expected — neutral
|
| 83 |
for feat in FEATURES:
|
| 84 |
findings.append({"test": feat, "score": 0.0, "note": "No text in image"})
|
| 85 |
scores.append(0.0)
|
| 86 |
+
elif no_text and expected_absent:
|
| 87 |
+
# Text SHOULD be there but isn't — suspicious
|
| 88 |
+
for feat in FEATURES:
|
| 89 |
+
feat_score = 0.1 if feat == "Context Appropriateness" else 0.0
|
| 90 |
+
findings.append({"test": feat, "score": feat_score,
|
| 91 |
+
"note": "Expected text absent from scene"})
|
| 92 |
+
scores.append(feat_score)
|
| 93 |
+
findings.append({"test": "Expected Text Absent", "score": 0.3,
|
| 94 |
+
"note": "Scene should contain text but doesn't — AI indicator"})
|
| 95 |
+
scores.append(0.3)
|
| 96 |
else:
|
| 97 |
sc = _score(parsed)
|
| 98 |
elements = parsed.get("text_elements", [])
|
| 99 |
anomalies = parsed.get("anomalies", [])
|
| 100 |
|
|
|
|
| 101 |
for feat in FEATURES:
|
| 102 |
feat_score = sc / len(FEATURES)
|
| 103 |
findings.append({"test": feat, "score": feat_score,
|
| 104 |
"note": parsed.get("explanation", "")[:100]})
|
| 105 |
scores.append(feat_score)
|
| 106 |
|
| 107 |
+
findings.append({"test": "Text Overall", "vlm_analysis": parsed,
|
| 108 |
"text_elements": elements, "anomalies": anomalies,
|
| 109 |
+
"score": sc, "confidence": cal_conf,
|
| 110 |
+
"raw_vlm_confidence": raw_conf,
|
| 111 |
"note": parsed.get("explanation", "")[:200]})
|
| 112 |
scores.append(sc)
|
| 113 |
else:
|