anky2002 committed on
Commit
983b8d8
·
verified ·
1 Parent(s): c46e5d1

Upload agents/text_agent.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. agents/text_agent.py +101 -142
agents/text_agent.py CHANGED
@@ -1,171 +1,130 @@
1
- """
2
- FORENSIQ — Text & Typography Agent (VLM-powered)
3
- Specialized for text detection in images:
4
- - Text legibility (OCR for gibberish detection)
5
- - Typography consistency (font, kerning, stroke width)
6
- - Sign/label plausibility
7
- """
8
-
9
- import os
10
- import numpy as np
11
  from PIL import Image
12
  from typing import Dict, Any
13
-
14
  from agents.optical_agent import AgentEvidence
15
- from agents.semantic_agent import _call_vlm, _parse_vlm_json
 
 
 
 
 
 
 
 
 
 
 
 
16
 
 
17
 
18
- # ─── Text Legibility & Typography ────────────────────────────────────
19
 
20
# System prompt handed to `_call_vlm` by `analyze_text`. It frames the model as
# a typographic forensic analyst and enumerates the text properties to inspect.
# This is runtime text: edit it only when the model's instructions must change.
TEXT_SYSTEM_PROMPT = """You are an expert typographic forensic analyst. AI-generated images frequently produce text that is visually plausible but linguistically or typographically impossible.

Your expertise covers:
- Character formation: letters should have consistent stroke width, proper serifs/sans-serif style
- Spelling and language: text should form real words in identifiable languages
- Kerning and spacing: letter spacing should be typographically correct
- Font consistency: all text in a sign/label should use consistent fonts
- Text perspective: text on surfaces should follow perspective geometry
- Sign plausibility: signs should contain meaningful, contextually appropriate text
- Reflection/shadow text: reflected or shadowed text should be geometrically consistent

Common AI failures: gibberish text, mixed scripts, impossible letter forms, inconsistent fonts within a word, text that doesn't follow surface geometry, misspelled common words."""
32
 
33
# User prompt handed to `_call_vlm` by `analyze_text`. It asks for a JSON reply;
# `analyze_text` reads the keys "text_found", "verdict", "text_elements"
# (per-element "readable", "spelling_correct", "font_consistent"), "anomalies",
# "confidence" and "explanation" from that reply, so keep key names in sync.
# NOTE(review): indentation inside the JSON template is not recoverable from
# this view; the template is shown flush-left.
TEXT_USER_PROMPT = """Examine this image for any visible text (signs, labels, clothing, screens, documents, etc.).

For each text element found, analyze:
1. Is the text readable and does it form real words?
2. Is the spelling correct?
3. Is the font consistent within each text element?
4. Does the text follow the surface geometry correctly?
5. Is the kerning/spacing natural?
6. Is the text contextually appropriate for the scene?

If NO text is visible, report that.

Respond in JSON format:
{
"text_found": true/false,
"text_elements": [
{
"content": "what the text says (or 'GIBBERISH' if unreadable)",
"location": "where in the image",
"readable": true/false,
"spelling_correct": true/false,
"font_consistent": true/false,
"perspective_correct": true/false
}
],
"anomalies": ["list of text anomalies found"],
"confidence": 0.0-1.0,
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
"explanation": "detailed reasoning"
}"""
63
 
64
-
65
def analyze_text(img: Image.Image) -> Dict[str, Any]:
    """Analyze text legibility and typography in *img* via the VLM.

    The image is sent to `_call_vlm` together with TEXT_SYSTEM_PROMPT and
    TEXT_USER_PROMPT, and the reply is decoded with `_parse_vlm_json`.

    Returns:
        A finding dict that always carries "test", "score", "note" and
        "text_found". Three shapes are possible:

        * VLM unavailable (empty reply or a reply starting with "VLM_ERROR"):
          score 0.0 and "vlm_error": True.
        * No text reported (``text_found`` false or verdict "NO_TEXT"):
          score 0.0 and the parsed reply under "vlm_analysis".
        * Text reported: per-element counts plus a score of
          0.8  (verdict MANIPULATED or any unreadable element),
          0.4  (verdict SUSPICIOUS, or any misspelled / mixed-font element),
          -0.4 (verdict AUTHENTIC), otherwise 0.0.
    """
    response = _call_vlm(img, TEXT_SYSTEM_PROMPT, TEXT_USER_PROMPT)

    # Guard clause: no usable reply from the model.
    if not response or response.startswith("VLM_ERROR"):
        return {
            "test": "Text & Typography",
            "score": 0.0,
            "note": f"VLM unavailable: {response or 'no HF_TOKEN'}",
            "vlm_error": True,
            "text_found": False,
        }

    parsed = _parse_vlm_json(response)

    if not parsed.get("text_found", False) or parsed.get("verdict") == "NO_TEXT":
        return {
            "test": "Text & Typography",
            "score": 0.0,
            "note": "No text visible in image — text analysis not applicable",
            "vlm_analysis": parsed,
            "text_found": False,
        }

    verdict = parsed.get("verdict", "UNKNOWN")
    # The model may emit null instead of a list; normalise so len()/iteration
    # cannot raise.
    anomalies = parsed.get("anomalies") or []
    text_elements = parsed.get("text_elements") or []
    if not isinstance(text_elements, list):
        text_elements = []

    # Only dict-shaped elements can be inspected with .get(); anything else
    # (e.g. a bare string) is kept in the output but not counted as a defect.
    inspectable = [t for t in text_elements if isinstance(t, dict)]
    n_elements = len(text_elements)
    n_gibberish = sum(1 for t in inspectable if not t.get("readable", True))
    n_misspelled = sum(1 for t in inspectable if not t.get("spelling_correct", True))
    n_bad_font = sum(1 for t in inspectable if not t.get("font_consistent", True))

    if verdict == "MANIPULATED" or n_gibberish > 0:
        score = 0.8
    elif verdict == "SUSPICIOUS" or n_misspelled > 0 or n_bad_font > 0:
        score = 0.4
    elif verdict == "AUTHENTIC":
        score = -0.4
    else:
        score = 0.0

    return {
        "test": "Text & Typography",
        "vlm_analysis": parsed,
        "text_found": True,
        "text_elements": text_elements,
        "anomalies": anomalies,
        "n_elements": n_elements,
        "n_gibberish": n_gibberish,
        "n_misspelled": n_misspelled,
        "n_bad_font": n_bad_font,
        "score": score,
        "confidence": parsed.get("confidence", 0.5),
        # Fall back to the raw reply when the explanation is missing or empty.
        "note": parsed.get("explanation") or response[:200],
    }
121
-
122
-
123
- # ─── Main Agent Entry Point ─────────────────────────────────────────
124
def run_text_agent(img: Image.Image) -> AgentEvidence:
    """Run text and typography analysis and wrap the result as AgentEvidence.

    Calls `analyze_text(img)` once and derives:

    * violation_score: the finding's score, clipped to [-1, 1];
    * confidence: ``min(1.0, 0.4 + 0.5 * |score|)``, scaled by 0.3 when the
      VLM was unavailable, and forced to 0.1 when no text was found or the
      analysis raised;
    * failure_prob: 0.9 when the VLM was unavailable or the analysis raised,
      otherwise 0.0.

    Any exception from the analysis is caught at this agent boundary and
    recorded as a finding with an "error" key rather than propagated.
    """
    findings = []
    scores = []
    vlm_available = True
    error_message = None

    try:
        result = analyze_text(img)
        findings.append(result)
        scores.append(result["score"])
        if result.get("vlm_error"):
            vlm_available = False
    except Exception as e:  # agent boundary: report the failure, do not crash
        error_message = str(e)
        findings.append({"test": "Text & Typography", "error": error_message, "score": 0})

    avg_score = float(np.mean(scores)) if scores else 0.0
    confidence = min(1.0, 0.4 + 0.5 * abs(avg_score))

    if not vlm_available:
        confidence *= 0.3

    # Check if text was found at all
    text_found = any(f.get("text_found", False) for f in findings)

    if error_message is not None:
        # A crash is not the same as "no text": say so, and flag it as a
        # failure instead of a confident non-finding.
        rationale = f"Text analysis failed: {error_message[:200]}"
        confidence = 0.1
    elif not text_found and vlm_available:
        rationale = "No text visible in image. Text agent not applicable."
        confidence = 0.1
    elif not vlm_available:
        rationale = "VLM service unavailable. Text analysis skipped."
    else:
        violations = [f["test"] for f in findings if f.get("score", 0) > 0.2]
        if violations:
            rationale = "Text anomalies detected."
        else:
            rationale = "Text appears legitimate and consistent."

    # NOTE(review): the original indentation of this loop is not recoverable
    # from the diff view; it is applied to every outcome here so that no
    # finding note (e.g. the VLM error detail) is dropped — confirm.
    for f in findings:
        if f.get("note"):
            rationale += f" {f['note'][:200]}"

    failed = error_message is not None or not vlm_available
    return AgentEvidence(
        agent_name="Text & Typography Agent",
        violation_score=float(np.clip(avg_score, -1, 1)),
        confidence=confidence,
        failure_prob=0.9 if failed else 0.0,
        rationale=rationale,
        sub_findings=findings,
    )
 
1
+ """FORENSIQ — Text & Typography Agent (9 features via VLM)"""
2
+ import os, numpy as np
 
 
 
 
 
 
 
 
3
  from PIL import Image
4
  from typing import Dict, Any
 
5
  from agents.optical_agent import AgentEvidence
6
+ from agents.semantic_agent import _vlm, _parse, _score
7
+
8
# System prompt handed to `_vlm` by `run_text_agent`. It frames the model as a
# typographic forensic examiner and defines the nine analysis domains.
# The dashes below are real em dashes (U+2014); a mis-decoded copy of this
# text ("β€”") would be sent to the model verbatim.
SYS_TEXT = """You are a world-class typographic forensic examiner and computational linguist. AI-generated images produce text that is visually plausible but linguistically, typographically, or physically impossible.

Your expert analysis covers 9 domains:

1. CHARACTER LEGIBILITY: Every character must be a valid glyph from a real writing system. Look for impossible letterforms — strokes that start but don't finish, serifs that appear mid-stroke, characters that resemble but aren't any real letter.

2. SPELLING & LANGUAGE: All text must form valid words in an identifiable language. Check for: random letter sequences that resemble but aren't real words, mixed scripts within a single word (Latin + Cyrillic), grammatically impossible combinations.

3. FONT CONSISTENCY: Within a single text element (sign, label, caption), all characters should share the same font family, weight, and style. AI often mixes serif and sans-serif, or varies stroke width mid-word.

4. KERNING & SPACING: Professional typography has consistent letter-spacing. Look for: irregular gaps between letters, overlapping characters, inconsistent word spacing within a line.

5. BASELINE ALIGNMENT: Characters in a word should sit on a consistent baseline. AI text often has characters that float above or sink below the line.

6. TEXT PERSPECTIVE: Text on surfaces (signs, buildings, clothing) must follow the surface's perspective geometry. The text should distort consistently with the surface it's on.

7. TEXT-SURFACE INTEGRATION: Text should look like it belongs on its surface — painted signs show brush texture, printed text is flat, embossed text has consistent lighting/shadow.

8. CONTEXTUAL APPROPRIATENESS: Signs should contain text appropriate for their context (a restaurant menu should have food items, a street sign should have a street name, a storefront should have a business name).

9. REFLECTION/SHADOW TEXT: If text is reflected in a mirror or water, the reflection should be geometrically correct. Text shadows should match the text geometry and light direction.

Be meticulous. AI text errors are often subtle — a letter that's "almost" right but has an extra stroke, or text that's "almost" English but contains no real words."""
31
 
32
# User prompt handed to `_vlm` by `run_text_agent`. It requests a JSON reply;
# `run_text_agent` reads "text_found", "verdict", "text_elements", "anomalies",
# "confidence" and "explanation" from it, so keep those key names in sync.
# The dashes below are real em dashes (U+2014), not the mis-decoded "β€”".
# NOTE(review): indentation inside the JSON template is not recoverable from
# this view; the template is shown flush-left.
USR_TEXT = """Perform a comprehensive typographic forensic analysis of ALL visible text in this image.

For each text element you find, analyze all 9 domains:
1. Character Legibility — are all characters valid real glyphs?
2. Spelling — do words exist in any language?
3. Font Consistency — same font throughout each text element?
4. Kerning & Spacing — consistent letter/word spacing?
5. Baseline Alignment — characters on consistent baseline?
6. Text Perspective — follows surface geometry?
7. Surface Integration — text looks like it belongs on surface?
8. Context — text is appropriate for the scene?
9. Reflections/Shadows — any reflected/shadowed text is geometrically correct?

If NO text is visible, set text_found=false.

Respond in JSON:
{
"text_found": true/false,
"text_elements": [
{
"content": "transcription or GIBBERISH",
"location": "where in image",
"legible": true/false,
"spelling_ok": true/false,
"font_consistent": true/false,
"kerning_ok": true/false,
"baseline_ok": true/false,
"perspective_ok": true/false,
"surface_integrated": true/false,
"context_appropriate": true/false,
"reflection_ok": true/false/null
}
],
"anomalies": ["specific text anomalies found"],
"confidence": 0.0-1.0,
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
"explanation": "detailed reasoning"
}"""
70
 
71
def run_text_agent(img):
    """Run the nine-feature text & typography analysis on *img* via the VLM.

    One `_vlm` call is made with SYS_TEXT / USR_TEXT; the reply is decoded
    with `_parse` and scored with `_score`. One finding is emitted per feature
    name, plus an overall finding when text was found.

    Outcomes:
        * VLM unavailable (empty reply or a reply starting with "VLM_ERROR"):
          nine zero-score findings, confidence scaled by 0.3, failure_prob 0.9.
        * No text (``text_found`` false or verdict "NO_TEXT"): nine zero-score
          findings, confidence 0.1.
        * Text found: each feature gets ``sc / 9`` and the overall finding
          gets ``sc``, where ``sc = _score(parsed)``.
        * Exception anywhere in the analysis: recorded as an "error" finding,
          confidence 0.1, failure_prob 0.9.

    Returns:
        AgentEvidence built positionally from: agent name, mean score clipped
        to [-1, 1], confidence, failure probability, rationale, and the
        findings without the overall entry (the one holding "vlm_analysis").
    """
    FEATURES = [
        "Character Legibility", "Spelling Validity", "Font Consistency",
        "Kerning & Spacing", "Baseline Alignment", "Text Perspective",
        "Surface Integration", "Context Appropriateness",
        "Reflection/Shadow Text",
    ]

    findings, scores = [], []
    vlm_ok = True
    text_found = False     # set only once the overall finding is recorded
    error_message = None   # set when the analysis raised

    try:
        resp = _vlm(img, SYS_TEXT, USR_TEXT)
        if resp and not resp.startswith("VLM_ERROR"):
            parsed = _parse(resp)
            if not parsed.get("text_found", False) or parsed.get("verdict") == "NO_TEXT":
                for feat in FEATURES:
                    findings.append({"test": feat, "score": 0.0, "note": "No text in image"})
                    scores.append(0.0)
            else:
                sc = _score(parsed)
                elements = parsed.get("text_elements", [])
                anomalies = parsed.get("anomalies", [])
                # The model may return null / a non-string explanation.
                explanation = str(parsed.get("explanation") or "")

                # NOTE(review): every feature receives the same share of the
                # overall score, so the mean over the ten entries below is
                # sc / 5, not sc. Kept as-is because downstream fusion may be
                # calibrated against it — confirm this is intended.
                feat_score = sc / len(FEATURES)
                for feat in FEATURES:
                    findings.append({"test": feat, "score": feat_score,
                                     "note": explanation[:100]})
                    scores.append(feat_score)

                findings.append({"test": "Text & Typography Overall", "vlm_analysis": parsed,
                                 "text_elements": elements, "anomalies": anomalies,
                                 "score": sc, "confidence": parsed.get("confidence", 0.5),
                                 "note": explanation[:200]})
                scores.append(sc)
                text_found = True
        else:
            vlm_ok = False
            for feat in FEATURES:
                findings.append({"test": feat, "score": 0.0, "note": "VLM unavailable", "vlm_error": True})
                scores.append(0.0)
    except Exception as e:  # agent boundary: report the failure, do not crash
        error_message = str(e)
        findings.append({"test": "Text Analysis", "error": error_message, "score": 0})

    avg = float(np.mean(scores)) if scores else 0.0
    conf = min(1.0, 0.4 + 0.5 * abs(avg))
    if not vlm_ok:
        conf *= 0.3

    if error_message is not None:
        # A crash is not the same as "no text": say so and flag it as a
        # failure instead of a confident non-finding.
        rat = f"Text analysis failed: {error_message[:200]}"
        conf = 0.1
    elif not vlm_ok:
        rat = "VLM unavailable. Text analysis skipped."
    elif not text_found:
        rat = "No text visible in image. Text agent not applicable."
        conf = 0.1
    else:
        viol = [f["test"] for f in findings if f.get("score", 0) > 0.1]
        rat = f"Text anomalies: {', '.join(viol[:5])}." if viol else "Text appears legitimate."

    # Append each distinct note once; the per-feature findings all share the
    # same note, which would otherwise be repeated up to nine times.
    # NOTE(review): the original indentation of this loop is not recoverable
    # from the diff view; it is applied to every outcome here — confirm.
    seen_notes = set()
    for f in findings:
        note = f.get("note")
        if note and "vlm_analysis" not in f and note not in seen_notes:
            seen_notes.add(note)
            rat += f" {note[:80]}"

    failed = error_message is not None or not vlm_ok
    return AgentEvidence("Text & Typography Agent", float(np.clip(avg, -1, 1)), conf,
                         0.9 if failed else 0.0, rat,
                         [f for f in findings if "vlm_analysis" not in f])