anky2002 committed on
Commit
6ac5014
·
verified ·
1 Parent(s): 0d484f2

Upload agents/text_agent.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. agents/text_agent.py +40 -31
agents/text_agent.py CHANGED
@@ -3,53 +3,45 @@ import os, numpy as np
3
  from PIL import Image
4
  from typing import Dict, Any
5
  from agents.optical_agent import AgentEvidence
6
- from agents.semantic_agent import _vlm, _parse, _score
7
 
8
- SYS_TEXT = """You are a world-class typographic forensic examiner and computational linguist. AI-generated images produce text that is visually plausible but linguistically, typographically, or physically impossible.
9
 
10
- Your expert analysis covers 9 domains:
11
 
12
- 1. CHARACTER LEGIBILITY: Every character must be a valid glyph from a real writing system. Look for impossible letterforms — strokes that start but don't finish, serifs that appear mid-stroke, characters that resemble but aren't any real letter.
13
 
14
- 2. SPELLING & LANGUAGE: All text must form valid words in an identifiable language. Check for: random letter sequences that resemble but aren't real words, mixed scripts within a single word (Latin + Cyrillic), grammatically impossible combinations.
15
 
16
- 3. FONT CONSISTENCY: Within a single text element (sign, label, caption), all characters should share the same font family, weight, and style. AI often mixes serif and sans-serif, or varies stroke width mid-word.
17
 
18
- 4. KERNING & SPACING: Professional typography has consistent letter-spacing. Look for: irregular gaps between letters, overlapping characters, inconsistent word spacing within a line.
19
 
20
- 5. BASELINE ALIGNMENT: Characters in a word should sit on a consistent baseline. AI text often has characters that float above or sink below the line.
21
 
22
- 6. TEXT PERSPECTIVE: Text on surfaces (signs, buildings, clothing) must follow the surface's perspective geometry. The text should distort consistently with the surface it's on.
23
 
24
- 7. TEXT-SURFACE INTEGRATION: Text should look like it belongs on its surface — painted signs show brush texture, printed text is flat, embossed text has consistent lighting/shadow.
25
 
26
- 8. CONTEXTUAL APPROPRIATENESS: Signs should contain text appropriate for their context (a restaurant menu should have food items, a street sign should have a street name, a storefront should have a business name).
27
 
28
- 9. REFLECTION/SHADOW TEXT: If text is reflected in a mirror or water, the reflection should be geometrically correct. Text shadows should match the text geometry and light direction.
29
-
30
- Be meticulous. AI text errors are often subtle — a letter that's "almost" right but has an extra stroke, or text that's "almost" English but contains no real words."""
31
 
32
  USR_TEXT = """Perform a comprehensive typographic forensic analysis of ALL visible text in this image.
33
 
34
- For each text element you find, analyze all 9 domains:
35
- 1. Character Legibility — are all characters valid real glyphs?
36
- 2. Spelling — do words exist in any language?
37
- 3. Font Consistency — same font throughout each text element?
38
- 4. Kerning & Spacing — consistent letter/word spacing?
39
- 5. Baseline Alignment — characters on consistent baseline?
40
- 6. Text Perspective — follows surface geometry?
41
- 7. Surface Integration — text looks like it belongs on surface?
42
- 8. Context — text is appropriate for the scene?
43
- 9. Reflections/Shadows — any reflected/shadowed text is geometrically correct?
44
 
45
- If NO text is visible, set text_found=false.
46
 
47
  Respond in JSON:
48
  {
49
  "text_found": true/false,
 
50
  "text_elements": [
51
  {
52
- "content": "transcription or GIBBERISH",
53
  "location": "where in image",
54
  "legible": true/false,
55
  "spelling_ok": true/false,
@@ -62,7 +54,7 @@ Respond in JSON:
62
  "reflection_ok": true/false/null
63
  }
64
  ],
65
- "anomalies": ["specific text anomalies found"],
66
  "confidence": 0.0-1.0,
67
  "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
68
  "explanation": "detailed reasoning"
@@ -80,25 +72,42 @@ def run_text_agent(img):
80
  resp = _vlm(img, SYS_TEXT, USR_TEXT)
81
  if resp and not resp.startswith("VLM_ERROR"):
82
  parsed = _parse(resp)
83
- if not parsed.get("text_found", False) or parsed.get("verdict") == "NO_TEXT":
 
 
 
 
 
 
 
84
  for feat in FEATURES:
85
  findings.append({"test": feat, "score": 0.0, "note": "No text in image"})
86
  scores.append(0.0)
 
 
 
 
 
 
 
 
 
 
87
  else:
88
  sc = _score(parsed)
89
  elements = parsed.get("text_elements", [])
90
  anomalies = parsed.get("anomalies", [])
91
 
92
- # Per-feature scoring based on element analysis
93
  for feat in FEATURES:
94
  feat_score = sc / len(FEATURES)
95
  findings.append({"test": feat, "score": feat_score,
96
  "note": parsed.get("explanation", "")[:100]})
97
  scores.append(feat_score)
98
 
99
- findings.append({"test": "Text & Typography Overall", "vlm_analysis": parsed,
100
  "text_elements": elements, "anomalies": anomalies,
101
- "score": sc, "confidence": parsed.get("confidence", 0.5),
 
102
  "note": parsed.get("explanation", "")[:200]})
103
  scores.append(sc)
104
  else:
 
3
  from PIL import Image
4
  from typing import Dict, Any
5
  from agents.optical_agent import AgentEvidence
6
+ from agents.semantic_agent import _vlm, _parse, _score, _calibrate_vlm_confidence, CONFIDENCE_CALIBRATION
7
 
8
+ SYS_TEXT = """You are a forensic typographic examiner and computational linguist. AI-generated images produce text that is visually plausible but linguistically, typographically, or physically impossible.
9
 
10
+ Your 9 analysis domains:
11
 
12
+ 1. CHARACTER LEGIBILITY: Every character must be a valid glyph from a real writing system. Look for: strokes that start but don't finish, serifs that appear mid-stroke, characters that resemble but aren't any real letter, hybrid glyphs that mix features of different letters.
13
 
14
+ 2. SPELLING & LANGUAGE: All text must form valid words in an identifiable language. Check for: random letter sequences that resemble but aren't real words ("RESTAUNRT", "COFF3E SH0P"), mixed scripts within a single word (Latin + Cyrillic), grammatically impossible combinations.
15
 
16
+ 3. FONT CONSISTENCY: Within a single text element (sign, label, caption), all characters should share the same font family, weight, and style. AI often mixes serif and sans-serif mid-word, varies x-height, or changes stroke width between adjacent characters.
17
 
18
+ 4. KERNING & SPACING: Professional typography has consistent letter-spacing. Look for: irregular gaps between letters, overlapping characters, inconsistent word spacing within a line, characters that are too tightly packed or too spread out relative to the font.
19
 
20
+ 5. BASELINE ALIGNMENT: Characters in a word should sit on a consistent baseline. AI text often has characters that float above or sink below the line, or a baseline that wobbles rather than being straight or following a consistent curve.
21
 
22
+ 6. TEXT PERSPECTIVE: Text on surfaces (signs, buildings, clothing, products) must follow the surface's perspective geometry. Each character should distort consistently with its position on the surface. Text that appears "pasted on" without perspective distortion is suspicious.
23
 
24
+ 7. TEXT-SURFACE INTEGRATION: Text should look like it belongs on its surface. Painted signs show brush texture. Printed text is flat and uniform. Embossed text has consistent lighting/shadow. Neon signs glow. Digital screens emit light. Check that the text rendering matches the apparent medium.
25
 
26
+ 8. CONTEXTUAL APPROPRIATENESS: Signs should contain text appropriate for their context (restaurant → food items, street → street name, storefront → business name). ALSO check for ABSENCE of expected text: a storefront with no name, a book with no title, a commercial product with no label — these absences can indicate AI generation.
27
 
28
+ 9. REFLECTION/SHADOW TEXT: If text is reflected in mirrors, water, or shiny surfaces, the reflection must be geometrically correct (horizontally mirrored). Text shadows must match the text geometry and scene lighting direction. Shadow letters should match the shape of the source letters.""" + CONFIDENCE_CALIBRATION
 
 
29
 
30
  USR_TEXT = """Perform a comprehensive typographic forensic analysis of ALL visible text in this image.
31
 
32
+ For each text element you find, analyze all 9 domains.
33
+
34
+ ALSO check for ABSENCE of expected text: If the scene contains surfaces that would normally have text (storefronts, product labels, book covers, road signs, vehicle plates, clothing brand logos) but they are blank, blurred, or contain gibberish, flag this as suspicious.
 
 
 
 
 
 
 
35
 
36
+ If NO text is visible at all, assess whether the scene type would normally contain text. A close-up portrait with no text is normal. A street scene with no signs at all is unusual.
37
 
38
  Respond in JSON:
39
  {
40
  "text_found": true/false,
41
+ "expected_text_absent": true/false,
42
  "text_elements": [
43
  {
44
+ "content": "exact transcription or GIBBERISH if unreadable",
45
  "location": "where in image",
46
  "legible": true/false,
47
  "spelling_ok": true/false,
 
54
  "reflection_ok": true/false/null
55
  }
56
  ],
57
+ "anomalies": ["specific text anomalies with image region references"],
58
  "confidence": 0.0-1.0,
59
  "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
60
  "explanation": "detailed reasoning"
 
72
  resp = _vlm(img, SYS_TEXT, USR_TEXT)
73
  if resp and not resp.startswith("VLM_ERROR"):
74
  parsed = _parse(resp)
75
+ raw_conf = parsed.get("confidence", 0.5)
76
+ cal_conf = _calibrate_vlm_confidence(raw_conf)
77
+
78
+ no_text = not parsed.get("text_found", False) or parsed.get("verdict") == "NO_TEXT"
79
+ expected_absent = parsed.get("expected_text_absent", False)
80
+
81
+ if no_text and not expected_absent:
82
+ # Genuinely no text expected — neutral
83
  for feat in FEATURES:
84
  findings.append({"test": feat, "score": 0.0, "note": "No text in image"})
85
  scores.append(0.0)
86
+ elif no_text and expected_absent:
87
+ # Text SHOULD be there but isn't — suspicious
88
+ for feat in FEATURES:
89
+ feat_score = 0.1 if feat == "Context Appropriateness" else 0.0
90
+ findings.append({"test": feat, "score": feat_score,
91
+ "note": "Expected text absent from scene"})
92
+ scores.append(feat_score)
93
+ findings.append({"test": "Expected Text Absent", "score": 0.3,
94
+ "note": "Scene should contain text but doesn't — AI indicator"})
95
+ scores.append(0.3)
96
  else:
97
  sc = _score(parsed)
98
  elements = parsed.get("text_elements", [])
99
  anomalies = parsed.get("anomalies", [])
100
 
 
101
  for feat in FEATURES:
102
  feat_score = sc / len(FEATURES)
103
  findings.append({"test": feat, "score": feat_score,
104
  "note": parsed.get("explanation", "")[:100]})
105
  scores.append(feat_score)
106
 
107
+ findings.append({"test": "Text Overall", "vlm_analysis": parsed,
108
  "text_elements": elements, "anomalies": anomalies,
109
+ "score": sc, "confidence": cal_conf,
110
+ "raw_vlm_confidence": raw_conf,
111
  "note": parsed.get("explanation", "")[:200]})
112
  scores.append(sc)
113
  else: