anky2002 committed on
Commit
25b4b0d
·
verified ·
1 Parent(s): 27f7870

Upload agents/text_agent.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. agents/text_agent.py +171 -0
agents/text_agent.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FORENSIQ — Text & Typography Agent (VLM-powered)
3
+ Specialized for text detection in images:
4
+ - Text legibility (OCR for gibberish detection)
5
+ - Typography consistency (font, kerning, stroke width)
6
+ - Sign/label plausibility
7
+ """
8
+
9
+ import os
10
+ import numpy as np
11
+ from PIL import Image
12
+ from typing import Dict, Any
13
+
14
+ from agents.optical_agent import AgentEvidence
15
+ from agents.semantic_agent import _call_vlm, _parse_vlm_json
16
+
17
+
18
+ # ─── Text Legibility & Typography ────────────────────────────────────
19
+
20
# System prompt passed to the VLM by analyze_text(): it frames the model as a
# typographic forensic analyst and enumerates the text properties to inspect.
TEXT_SYSTEM_PROMPT = """You are an expert typographic forensic analyst. AI-generated images frequently produce text that is visually plausible but linguistically or typographically impossible.

Your expertise covers:
- Character formation: letters should have consistent stroke width, proper serifs/sans-serif style
- Spelling and language: text should form real words in identifiable languages
- Kerning and spacing: letter spacing should be typographically correct
- Font consistency: all text in a sign/label should use consistent fonts
- Text perspective: text on surfaces should follow perspective geometry
- Sign plausibility: signs should contain meaningful, contextually appropriate text
- Reflection/shadow text: reflected or shadowed text should be geometrically consistent

Common AI failures: gibberish text, mixed scripts, impossible letter forms, inconsistent fonts within a word, text that doesn't follow surface geometry, misspelled common words."""
32
+
33
# User prompt passed to the VLM by analyze_text(): it lists the per-element
# checks and the JSON reply schema whose keys analyze_text() reads back
# ("text_found", "text_elements", "anomalies", "confidence", "verdict",
# "explanation").
# NOTE(review): the JSON template is kept flush-left, exactly as the text
# appears in the source; confirm whether nested indentation was intended.
TEXT_USER_PROMPT = """Examine this image for any visible text (signs, labels, clothing, screens, documents, etc.).

For each text element found, analyze:
1. Is the text readable and does it form real words?
2. Is the spelling correct?
3. Is the font consistent within each text element?
4. Does the text follow the surface geometry correctly?
5. Is the kerning/spacing natural?
6. Is the text contextually appropriate for the scene?

If NO text is visible, report that.

Respond in JSON format:
{
"text_found": true/false,
"text_elements": [
{
"content": "what the text says (or 'GIBBERISH' if unreadable)",
"location": "where in the image",
"readable": true/false,
"spelling_correct": true/false,
"font_consistent": true/false,
"perspective_correct": true/false
}
],
"anomalies": ["list of text anomalies found"],
"confidence": 0.0-1.0,
"verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
"explanation": "detailed reasoning"
}"""
63
+
64
+
65
def analyze_text(img: Image.Image) -> Dict[str, Any]:
    """Analyze text legibility and typography via VLM.

    Sends ``img`` to the VLM with TEXT_SYSTEM_PROMPT / TEXT_USER_PROMPT,
    parses the reply and converts it into a finding dict.

    Args:
        img: Image to inspect for visible text.

    Returns:
        A dict that always contains ``"test"``, ``"score"``, ``"note"`` and
        ``"text_found"``:

        * VLM reply empty, prefixed with ``VLM_ERROR``, or not parseable into
          a dict: ``score`` 0.0, ``vlm_error`` True, ``text_found`` False.
        * No text reported: ``score`` 0.0, ``text_found`` False.
        * Text reported: ``score`` is 0.8 (verdict MANIPULATED or any
          gibberish element), 0.4 (verdict SUSPICIOUS or any spelling, font
          or perspective problem), -0.4 (verdict AUTHENTIC) or 0.0
          (anything else), plus the per-category counters, the anomalies
          list, the parsed reply and a ``confidence`` clamped to [0, 1].
    """
    response = _call_vlm(img, TEXT_SYSTEM_PROMPT, TEXT_USER_PROMPT)

    # Guard clause: no reply at all, or a reply flagged as a VLM error.
    if not response or response.startswith("VLM_ERROR"):
        return {
            "test": "Text & Typography",
            "score": 0.0,
            "note": f"VLM unavailable: {response or 'no HF_TOKEN'}",
            "vlm_error": True,
            "text_found": False,
        }

    parsed = _parse_vlm_json(response)
    # NOTE(review): _parse_vlm_json is defined in another module; this assumes
    # a usable result is a dict. Anything else is reported as a VLM failure
    # instead of raising AttributeError on the .get() calls below.
    if not isinstance(parsed, dict):
        return {
            "test": "Text & Typography",
            "score": 0.0,
            "note": f"VLM unavailable: unparseable response: {response[:200]}",
            "vlm_error": True,
            "text_found": False,
        }

    verdict = parsed.get("verdict", "UNKNOWN")

    # The model may emit null or malformed entries; keep only dict elements so
    # the counters below cannot crash on them.
    raw_elements = parsed.get("text_elements")
    if isinstance(raw_elements, list):
        text_elements = [t for t in raw_elements if isinstance(t, dict)]
    else:
        text_elements = []

    raw_anomalies = parsed.get("anomalies")
    anomalies = raw_anomalies if isinstance(raw_anomalies, list) else []

    # When the model omits "text_found", infer it from the element list rather
    # than silently discarding the elements it did report.
    text_found = parsed.get("text_found")
    if text_found is None:
        text_found = bool(text_elements)

    if not text_found or verdict == "NO_TEXT":
        return {
            "test": "Text & Typography",
            "score": 0.0,
            "note": "No text visible in image — text analysis not applicable",
            "vlm_analysis": parsed,
            "text_found": False,
        }

    def _count_failed(field: str) -> int:
        """Count elements whose boolean ``field`` is present and falsy."""
        return sum(1 for t in text_elements if not t.get(field, True))

    # Count problematic elements. The user prompt tells the model to write
    # 'GIBBERISH' as the content of unreadable text, so that marker counts as
    # gibberish even when the "readable" flag was left true.
    n_elements = len(text_elements)
    n_gibberish = sum(
        1
        for t in text_elements
        if not t.get("readable", True)
        or str(t.get("content", "")).strip().upper() == "GIBBERISH"
    )
    n_misspelled = _count_failed("spelling_correct")
    n_bad_font = _count_failed("font_consistent")
    # The prompt also requests "perspective_correct"; treat a failure there
    # like the other per-element typography problems.
    n_bad_perspective = _count_failed("perspective_correct")

    if verdict == "MANIPULATED" or n_gibberish > 0:
        score = 0.8
    elif (
        verdict == "SUSPICIOUS"
        or n_misspelled > 0
        or n_bad_font > 0
        or n_bad_perspective > 0
    ):
        score = 0.4
    elif verdict == "AUTHENTIC":
        score = -0.4
    else:
        score = 0.0

    # The reported confidence is free-form model output: coerce it to a float
    # and clamp it to [0, 1], falling back to 0.5 when it is unusable.
    try:
        confidence = float(parsed.get("confidence", 0.5))
    except (TypeError, ValueError):
        confidence = 0.5
    confidence = min(1.0, max(0.0, confidence))

    # Always hand back a string note so callers can slice it safely.
    explanation = parsed.get("explanation")
    note = str(explanation) if explanation else response[:200]

    return {
        "test": "Text & Typography",
        "vlm_analysis": parsed,
        "text_found": True,
        "text_elements": text_elements,
        "anomalies": anomalies,
        "n_elements": n_elements,
        "n_gibberish": n_gibberish,
        "n_misspelled": n_misspelled,
        "n_bad_font": n_bad_font,
        "n_bad_perspective": n_bad_perspective,
        "score": score,
        "confidence": confidence,
        "note": note,
    }
121
+
122
+
123
+ # ─── Main Agent Entry Point ─────────────────────────────────────────
124
def run_text_agent(img: Image.Image) -> AgentEvidence:
    """Run text and typography analysis.

    Calls analyze_text() once and folds its finding into an AgentEvidence.

    Args:
        img: Image to analyze.

    Returns:
        AgentEvidence with:

        * ``violation_score``: the finding's score clipped to [-1, 1]
          (0.0 when no score was produced).
        * ``confidence``: ``min(1.0, 0.4 + 0.5 * |score|)``, scaled by 0.3
          when the VLM was unavailable or the analysis raised, and fixed at
          0.1 when the analysis ran but found no text.
        * ``failure_prob``: 0.9 when the VLM was unavailable or the analysis
          raised, otherwise 0.0.
        * ``rationale``: a summary sentence followed by the first 200
          characters of each finding's note.
        * ``sub_findings``: the list of finding dicts.
    """
    findings = []
    scores = []
    vlm_available = True
    analysis_failed = False

    try:
        result = analyze_text(img)
    except Exception as e:
        # Best-effort boundary: keep the pipeline alive, but remember the
        # failure so it is not reported as "no text visible" further down.
        analysis_failed = True
        findings.append({"test": "Text & Typography", "error": str(e), "score": 0})
    else:
        findings.append(result)
        scores.append(result.get("score", 0.0))
        if result.get("vlm_error"):
            vlm_available = False

    avg_score = float(np.mean(scores)) if scores else 0.0
    confidence = min(1.0, 0.4 + 0.5 * abs(avg_score))

    # Nothing was actually analyzed in either case, so the result carries
    # little weight.
    degraded = analysis_failed or not vlm_available
    if degraded:
        confidence *= 0.3

    # Check if text was found at all
    text_found = any(f.get("text_found", False) for f in findings)

    if analysis_failed:
        error_text = str(findings[-1].get("error", ""))[:200]
        rationale = f"Text analysis failed: {error_text}"
    elif not vlm_available:
        rationale = "VLM service unavailable. Text analysis skipped."
    elif not text_found:
        rationale = "No text visible in image. Text agent not applicable."
        confidence = 0.1
    else:
        violations = [f["test"] for f in findings if f.get("score", 0) > 0.2]
        if violations:
            rationale = "Text anomalies detected."
        else:
            rationale = "Text appears legitimate and consistent."

    for f in findings:
        note = f.get("note")
        if note:
            # str() guards against a non-string note before slicing.
            rationale += f" {str(note)[:200]}"

    return AgentEvidence(
        agent_name="Text & Typography Agent",
        # float() turns the NumPy scalar from np.clip into a plain float.
        violation_score=float(np.clip(avg_score, -1, 1)),
        confidence=confidence,
        failure_prob=0.9 if degraded else 0.0,
        rationale=rationale,
        sub_findings=findings,
    )