Spaces:

anky2002
/

FORENSIQ

Running

App Files Community

anky2002 committed 14 days ago

Commit

6ac5014

verified ·

1 Parent(s): 0d484f2

Upload agents/text_agent.py with huggingface_hub

Browse files

Files changed (1) hide show

agents/text_agent.py +40 -31

agents/text_agent.py CHANGED Viewed

@@ -3,53 +3,45 @@ import os, numpy as np
 from PIL import Image
 from typing import Dict, Any
 from agents.optical_agent import AgentEvidence
-from agents.semantic_agent import _vlm, _parse, _score
-SYS_TEXT = """You are a world-class typographic forensic examiner and computational linguist. AI-generated images produce text that is visually plausible but linguistically, typographically, or physically impossible.
-Your expert analysis covers 9 domains:
-1. CHARACTER LEGIBILITY: Every character must be a valid glyph from a real writing system. Look for impossible letterforms — strokes that start but don't finish, serifs that appear mid-stroke, characters that resemble but aren't any real letter.
-2. SPELLING & LANGUAGE: All text must form valid words in an identifiable language. Check for: random letter sequences that resemble but aren't real words, mixed scripts within a single word (Latin + Cyrillic), grammatically impossible combinations.
-3. FONT CONSISTENCY: Within a single text element (sign, label, caption), all characters should share the same font family, weight, and style. AI often mixes serif and sans-serif, or varies stroke width mid-word.
-4. KERNING & SPACING: Professional typography has consistent letter-spacing. Look for: irregular gaps between letters, overlapping characters, inconsistent word spacing within a line.
-5. BASELINE ALIGNMENT: Characters in a word should sit on a consistent baseline. AI text often has characters that float above or sink below the line.
-6. TEXT PERSPECTIVE: Text on surfaces (signs, buildings, clothing) must follow the surface's perspective geometry. The text should distort consistently with the surface it's on.
-7. TEXT-SURFACE INTEGRATION: Text should look like it belongs on its surface — painted signs show brush texture, printed text is flat, embossed text has consistent lighting/shadow.
-8. CONTEXTUAL APPROPRIATENESS: Signs should contain text appropriate for their context (a restaurant menu should have food items, a street sign should have a street name, a storefront should have a business name).
-9. REFLECTION/SHADOW TEXT: If text is reflected in a mirror or water, the reflection should be geometrically correct. Text shadows should match the text geometry and light direction.
-Be meticulous. AI text errors are often subtle — a letter that's "almost" right but has an extra stroke, or text that's "almost" English but contains no real words."""
 USR_TEXT = """Perform a comprehensive typographic forensic analysis of ALL visible text in this image.
-For each text element you find, analyze all 9 domains:
-1. Character Legibility — are all characters valid real glyphs?
-2. Spelling — do words exist in any language?
-3. Font Consistency — same font throughout each text element?
-4. Kerning & Spacing — consistent letter/word spacing?
-5. Baseline Alignment — characters on consistent baseline?
-6. Text Perspective — follows surface geometry?
-7. Surface Integration — text looks like it belongs on surface?
-8. Context — text is appropriate for the scene?
-9. Reflections/Shadows — any reflected/shadowed text is geometrically correct?
-If NO text is visible, set text_found=false.
 Respond in JSON:
 {
     "text_found": true/false,
     "text_elements": [
         {
-            "content": "transcription or GIBBERISH",
             "location": "where in image",
             "legible": true/false,
             "spelling_ok": true/false,
@@ -62,7 +54,7 @@ Respond in JSON:
             "reflection_ok": true/false/null
         }
     ],
-    "anomalies": ["specific text anomalies found"],
     "confidence": 0.0-1.0,
     "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
     "explanation": "detailed reasoning"
@@ -80,25 +72,42 @@ def run_text_agent(img):
         resp = _vlm(img, SYS_TEXT, USR_TEXT)
         if resp and not resp.startswith("VLM_ERROR"):
             parsed = _parse(resp)
-            if not parsed.get("text_found", False) or parsed.get("verdict") == "NO_TEXT":
                 for feat in FEATURES:
                     findings.append({"test": feat, "score": 0.0, "note": "No text in image"})
                     scores.append(0.0)
             else:
                 sc = _score(parsed)
                 elements = parsed.get("text_elements", [])
                 anomalies = parsed.get("anomalies", [])
-                # Per-feature scoring based on element analysis
                 for feat in FEATURES:
                     feat_score = sc / len(FEATURES)
                     findings.append({"test": feat, "score": feat_score,
                                    "note": parsed.get("explanation", "")[:100]})
                     scores.append(feat_score)
-                findings.append({"test": "Text & Typography Overall", "vlm_analysis": parsed,
                                "text_elements": elements, "anomalies": anomalies,
-                               "score": sc, "confidence": parsed.get("confidence", 0.5),
                                "note": parsed.get("explanation", "")[:200]})
                 scores.append(sc)
         else:

 from PIL import Image
 from typing import Dict, Any
 from agents.optical_agent import AgentEvidence
+from agents.semantic_agent import _vlm, _parse, _score, _calibrate_vlm_confidence, CONFIDENCE_CALIBRATION
+SYS_TEXT = """You are a forensic typographic examiner and computational linguist. AI-generated images produce text that is visually plausible but linguistically, typographically, or physically impossible.
+Your 9 analysis domains:
+1. CHARACTER LEGIBILITY: Every character must be a valid glyph from a real writing system. Look for: strokes that start but don't finish, serifs that appear mid-stroke, characters that resemble but aren't any real letter, hybrid glyphs that mix features of different letters.
+2. SPELLING & LANGUAGE: All text must form valid words in an identifiable language. Check for: random letter sequences that resemble but aren't real words ("RESTAUNRT", "COFF3E SH0P"), mixed scripts within a single word (Latin + Cyrillic), grammatically impossible combinations.
+3. FONT CONSISTENCY: Within a single text element (sign, label, caption), all characters should share the same font family, weight, and style. AI often mixes serif and sans-serif mid-word, varies x-height, or changes stroke width between adjacent characters.
+4. KERNING & SPACING: Professional typography has consistent letter-spacing. Look for: irregular gaps between letters, overlapping characters, inconsistent word spacing within a line, characters that are too tightly packed or too spread out relative to the font.
+5. BASELINE ALIGNMENT: Characters in a word should sit on a consistent baseline. AI text often has characters that float above or sink below the line, or a baseline that wobbles rather than being straight or following a consistent curve.
+6. TEXT PERSPECTIVE: Text on surfaces (signs, buildings, clothing, products) must follow the surface's perspective geometry. Each character should distort consistently with its position on the surface. Text that appears "pasted on" without perspective distortion is suspicious.
+7. TEXT-SURFACE INTEGRATION: Text should look like it belongs on its surface. Painted signs show brush texture. Printed text is flat and uniform. Embossed text has consistent lighting/shadow. Neon signs glow. Digital screens emit light. Check that the text rendering matches the apparent medium.
+8. CONTEXTUAL APPROPRIATENESS: Signs should contain text appropriate for their context (restaurant → food items, street → street name, storefront → business name). ALSO check for ABSENCE of expected text: a storefront with no name, a book with no title, a commercial product with no label — these absences can indicate AI generation.
+9. REFLECTION/SHADOW TEXT: If text is reflected in mirrors, water, or shiny surfaces, the reflection must be geometrically correct (horizontally mirrored). Text shadows must match the text geometry and scene lighting direction. Shadow letters should match the shape of the source letters.""" + CONFIDENCE_CALIBRATION
 USR_TEXT = """Perform a comprehensive typographic forensic analysis of ALL visible text in this image.
+For each text element you find, analyze all 9 domains.
+ALSO check for ABSENCE of expected text: If the scene contains surfaces that would normally have text (storefronts, product labels, book covers, road signs, vehicle plates, clothing brand logos) but they are blank, blurred, or contain gibberish, flag this as suspicious.
+If NO text is visible at all, assess whether the scene type would normally contain text. A close-up portrait with no text is normal. A street scene with no signs at all is unusual.
 Respond in JSON:
 {
     "text_found": true/false,
+    "expected_text_absent": true/false,
     "text_elements": [
         {
+            "content": "exact transcription or GIBBERISH if unreadable",
             "location": "where in image",
             "legible": true/false,
             "spelling_ok": true/false,
             "reflection_ok": true/false/null
         }
     ],
+    "anomalies": ["specific text anomalies with image region references"],
     "confidence": 0.0-1.0,
     "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
     "explanation": "detailed reasoning"
         resp = _vlm(img, SYS_TEXT, USR_TEXT)
         if resp and not resp.startswith("VLM_ERROR"):
             parsed = _parse(resp)
+            raw_conf = parsed.get("confidence", 0.5)
+            cal_conf = _calibrate_vlm_confidence(raw_conf)
+            no_text = not parsed.get("text_found", False) or parsed.get("verdict") == "NO_TEXT"
+            expected_absent = parsed.get("expected_text_absent", False)
+            if no_text and not expected_absent:
+                # Genuinely no text expected — neutral
                 for feat in FEATURES:
                     findings.append({"test": feat, "score": 0.0, "note": "No text in image"})
                     scores.append(0.0)
+            elif no_text and expected_absent:
+                # Text SHOULD be there but isn't — suspicious
+                for feat in FEATURES:
+                    feat_score = 0.1 if feat == "Context Appropriateness" else 0.0
+                    findings.append({"test": feat, "score": feat_score,
+                                   "note": "Expected text absent from scene"})
+                    scores.append(feat_score)
+                findings.append({"test": "Expected Text Absent", "score": 0.3,
+                               "note": "Scene should contain text but doesn't — AI indicator"})
+                scores.append(0.3)
             else:
                 sc = _score(parsed)
                 elements = parsed.get("text_elements", [])
                 anomalies = parsed.get("anomalies", [])
                 for feat in FEATURES:
                     feat_score = sc / len(FEATURES)
                     findings.append({"test": feat, "score": feat_score,
                                    "note": parsed.get("explanation", "")[:100]})
                     scores.append(feat_score)
+                findings.append({"test": "Text Overall", "vlm_analysis": parsed,
                                "text_elements": elements, "anomalies": anomalies,
+                               "score": sc, "confidence": cal_conf,
+                               "raw_vlm_confidence": raw_conf,
                                "note": parsed.get("explanation", "")[:200]})
                 scores.append(sc)
         else: