Spaces:

anky2002
/

FORENSIQ

Running

App Files Files Community

anky2002 committed 14 days ago

Commit

983b8d8

verified ·

1 Parent(s): c46e5d1

Upload agents/text_agent.py with huggingface_hub

Browse files

Files changed (1) hide show

agents/text_agent.py +101 -142

agents/text_agent.py CHANGED Viewed

@@ -1,171 +1,130 @@
-"""
-FORENSIQ — Text & Typography Agent (VLM-powered)
-Specialized for text detection in images:
-  - Text legibility (OCR for gibberish detection)
-  - Typography consistency (font, kerning, stroke width)
-  - Sign/label plausibility
-"""
-import os
-import numpy as np
 from PIL import Image
 from typing import Dict, Any
 from agents.optical_agent import AgentEvidence
-from agents.semantic_agent import _call_vlm, _parse_vlm_json
-# ─── Text Legibility & Typography ────────────────────────────────────
-TEXT_SYSTEM_PROMPT = """You are an expert typographic forensic analyst. AI-generated images frequently produce text that is visually plausible but linguistically or typographically impossible.
-Your expertise covers:
-- Character formation: letters should have consistent stroke width, proper serifs/sans-serif style
-- Spelling and language: text should form real words in identifiable languages
-- Kerning and spacing: letter spacing should be typographically correct
-- Font consistency: all text in a sign/label should use consistent fonts
-- Text perspective: text on surfaces should follow perspective geometry
-- Sign plausibility: signs should contain meaningful, contextually appropriate text
-- Reflection/shadow text: reflected or shadowed text should be geometrically consistent
-Common AI failures: gibberish text, mixed scripts, impossible letter forms, inconsistent fonts within a word, text that doesn't follow surface geometry, misspelled common words."""
-TEXT_USER_PROMPT = """Examine this image for any visible text (signs, labels, clothing, screens, documents, etc.).
-For each text element found, analyze:
-1. Is the text readable and does it form real words?
-2. Is the spelling correct?
-3. Is the font consistent within each text element?
-4. Does the text follow the surface geometry correctly?
-5. Is the kerning/spacing natural?
-6. Is the text contextually appropriate for the scene?
-If NO text is visible, report that.
-Respond in JSON format:
 {
     "text_found": true/false,
     "text_elements": [
         {
-            "content": "what the text says (or 'GIBBERISH' if unreadable)",
-            "location": "where in the image",
-            "readable": true/false,
-            "spelling_correct": true/false,
             "font_consistent": true/false,
-            "perspective_correct": true/false
         }
     ],
-    "anomalies": ["list of text anomalies found"],
     "confidence": 0.0-1.0,
     "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
     "explanation": "detailed reasoning"
 }"""
def analyze_text(img: Image.Image) -> Dict[str, Any]:
    """Query the VLM about visible text and turn its reply into one finding.

    Args:
        img: Image handed to ``_call_vlm`` together with the text prompts.

    Returns:
        A dict with at least ``test``, ``score``, ``note`` and ``text_found``.
        The score is 0.8 when the verdict is MANIPULATED or any element is
        unreadable, 0.4 when the verdict is SUSPICIOUS or any element is
        misspelled or font-inconsistent, -0.4 for AUTHENTIC, and 0.0 otherwise
        (including the no-text and VLM-unavailable cases; the latter also
        carries ``vlm_error``).
    """
    reply = _call_vlm(img, TEXT_SYSTEM_PROMPT, TEXT_USER_PROMPT)

    # Guard: an empty reply or one carrying the VLM_ERROR prefix means there
    # is nothing to parse.
    if not reply or reply.startswith("VLM_ERROR"):
        return {
            "test": "Text & Typography",
            "score": 0.0,
            "note": f"VLM unavailable: {reply or 'no HF_TOKEN'}",
            "vlm_error": True,
            "text_found": False,
        }

    analysis = _parse_vlm_json(reply)

    # Guard: the model saw no text, so typography checks do not apply.
    if not analysis.get("text_found", False) or analysis.get("verdict") == "NO_TEXT":
        return {
            "test": "Text & Typography",
            "score": 0.0,
            "note": "No text visible in image — text analysis not applicable",
            "vlm_analysis": analysis,
            "text_found": False,
        }

    verdict = analysis.get("verdict", "UNKNOWN")
    anomalies = analysis.get("anomalies", [])
    elements = analysis.get("text_elements", [])
    element_count = len(elements)

    def failing(flag: str) -> int:
        # A missing flag counts as passing, so only explicit failures add up.
        return sum(1 for item in elements if not item.get(flag, True))

    unreadable = failing("readable")
    misspelled = failing("spelling_correct")
    mixed_fonts = failing("font_consistent")

    # Element-level problems can raise the score even if the verdict is milder.
    if verdict == "MANIPULATED" or unreadable > 0:
        score = 0.8
    elif verdict == "SUSPICIOUS" or misspelled > 0 or mixed_fonts > 0:
        score = 0.4
    elif verdict == "AUTHENTIC":
        score = -0.4
    else:
        score = 0.0

    return {
        "test": "Text & Typography",
        "vlm_analysis": analysis,
        "text_found": True,
        "text_elements": elements,
        "anomalies": anomalies,
        "n_elements": element_count,
        "n_gibberish": unreadable,
        "n_misspelled": misspelled,
        "score": score,
        "confidence": analysis.get("confidence", 0.5),
        "note": analysis.get("explanation", reply[:200]),
    }
-# ─── Main Agent Entry Point ─────────────────────────────────────────
def run_text_agent(img: Image.Image) -> AgentEvidence:
    """Run the text/typography check and wrap the outcome as agent evidence.

    Args:
        img: Image passed to ``analyze_text``.

    Returns:
        AgentEvidence named "Text & Typography Agent". ``failure_prob`` is 0.9
        when the VLM was unavailable or the analysis raised, otherwise 0.0.
        ``sub_findings`` holds the single analysis finding and, on failure,
        an error entry.
    """
    findings = []
    scores = []
    vlm_available = True
    error = None

    # Agent boundary: a failing analysis must not take the caller down, but
    # it is remembered so it is not later mistaken for "no text found".
    try:
        result = analyze_text(img)
        findings.append(result)
        scores.append(result["score"])
        if result.get("vlm_error"):
            vlm_available = False
    except Exception as e:
        error = str(e)
        findings.append({"test": "Text & Typography", "error": error, "score": 0})

    avg_score = float(np.mean(scores)) if scores else 0.0
    confidence = min(1.0, 0.4 + 0.5 * abs(avg_score))
    degraded = error is not None or not vlm_available
    if degraded:
        confidence *= 0.3

    text_found = any(f.get("text_found", False) for f in findings)
    if error is not None:
        rationale = f"Text analysis failed: {error}"
    elif not vlm_available:
        rationale = "VLM service unavailable. Text analysis skipped."
    elif not text_found:
        rationale = "No text visible in image. Text agent not applicable."
        confidence = 0.1
    else:
        has_violation = any(f.get("score", 0) > 0.2 for f in findings)
        if has_violation:
            rationale = "Text anomalies detected."
        else:
            rationale = "Text appears legitimate and consistent."
        for f in findings:
            note = f.get("note")
            if note:
                # The note comes from model output and may not be a string.
                rationale += f" {str(note)[:200]}"

    return AgentEvidence(
        agent_name="Text & Typography Agent",
        violation_score=np.clip(avg_score, -1, 1),
        confidence=confidence,
        failure_prob=0.9 if degraded else 0.0,
        rationale=rationale,
        sub_findings=findings,
    )

+"""FORENSIQ — Text & Typography Agent (9 features via VLM)"""
+import os, numpy as np
 from PIL import Image
 from typing import Dict, Any
 from agents.optical_agent import AgentEvidence
+from agents.semantic_agent import _vlm, _parse, _score
+SYS_TEXT = """You are a world-class typographic forensic examiner and computational linguist. AI-generated images produce text that is visually plausible but linguistically, typographically, or physically impossible.
+Your expert analysis covers 9 domains:
+1. CHARACTER LEGIBILITY: Every character must be a valid glyph from a real writing system. Look for impossible letterforms — strokes that start but don't finish, serifs that appear mid-stroke, characters that resemble but aren't any real letter.
+2. SPELLING & LANGUAGE: All text must form valid words in an identifiable language. Check for: random letter sequences that resemble but aren't real words, mixed scripts within a single word (Latin + Cyrillic), grammatically impossible combinations.
+3. FONT CONSISTENCY: Within a single text element (sign, label, caption), all characters should share the same font family, weight, and style. AI often mixes serif and sans-serif, or varies stroke width mid-word.
+4. KERNING & SPACING: Professional typography has consistent letter-spacing. Look for: irregular gaps between letters, overlapping characters, inconsistent word spacing within a line.
+5. BASELINE ALIGNMENT: Characters in a word should sit on a consistent baseline. AI text often has characters that float above or sink below the line.
+6. TEXT PERSPECTIVE: Text on surfaces (signs, buildings, clothing) must follow the surface's perspective geometry. The text should distort consistently with the surface it's on.
+7. TEXT-SURFACE INTEGRATION: Text should look like it belongs on its surface — painted signs show brush texture, printed text is flat, embossed text has consistent lighting/shadow.
+8. CONTEXTUAL APPROPRIATENESS: Signs should contain text appropriate for their context (a restaurant menu should have food items, a street sign should have a street name, a storefront should have a business name).
+9. REFLECTION/SHADOW TEXT: If text is reflected in a mirror or water, the reflection should be geometrically correct. Text shadows should match the text geometry and light direction.
+Be meticulous. AI text errors are often subtle — a letter that's "almost" right but has an extra stroke, or text that's "almost" English but contains no real words."""
+USR_TEXT = """Perform a comprehensive typographic forensic analysis of ALL visible text in this image.
+For each text element you find, analyze all 9 domains:
+1. Character Legibility — are all characters valid real glyphs?
+2. Spelling — do words exist in any language?
+3. Font Consistency — same font throughout each text element?
+4. Kerning & Spacing — consistent letter/word spacing?
+5. Baseline Alignment — characters on consistent baseline?
+6. Text Perspective — follows surface geometry?
+7. Surface Integration — text looks like it belongs on surface?
+8. Context — text is appropriate for the scene?
+9. Reflections/Shadows — any reflected/shadowed text is geometrically correct?
+If NO text is visible, set text_found=false.
+Respond in JSON:
 {
     "text_found": true/false,
     "text_elements": [
         {
+            "content": "transcription or GIBBERISH",
+            "location": "where in image",
+            "legible": true/false,
+            "spelling_ok": true/false,
             "font_consistent": true/false,
+            "kerning_ok": true/false,
+            "baseline_ok": true/false,
+            "perspective_ok": true/false,
+            "surface_integrated": true/false,
+            "context_appropriate": true/false,
+            "reflection_ok": true/false/null
         }
     ],
+    "anomalies": ["specific text anomalies found"],
     "confidence": 0.0-1.0,
     "verdict": "AUTHENTIC" or "SUSPICIOUS" or "MANIPULATED" or "NO_TEXT",
     "explanation": "detailed reasoning"
 }"""
def run_text_agent(img: Image.Image) -> AgentEvidence:
    """Run the nine-feature text/typography VLM check and package the evidence.

    The image is sent to ``_vlm`` with ``SYS_TEXT``/``USR_TEXT``. A falsy reply
    or one starting with ``"VLM_ERROR"`` is treated as the VLM being
    unavailable. Otherwise the reply is parsed with ``_parse`` and, when text
    is present, scored with ``_score``.

    Args:
        img: Image to analyse; passed through to ``_vlm`` unchanged.

    Returns:
        AgentEvidence named "Text & Typography Agent". ``sub_findings`` holds
        one entry per feature (or a single error entry); the overall entry
        carrying the raw ``vlm_analysis`` is left out. ``failure_prob`` is 0.9
        when the VLM was unavailable or the analysis raised, otherwise 0.0.
    """
    features = (
        "Character Legibility", "Spelling Validity", "Font Consistency",
        "Kerning & Spacing", "Baseline Alignment", "Text Perspective",
        "Surface Integration", "Context Appropriateness",
        "Reflection/Shadow Text",
    )
    findings, scores = [], []
    vlm_ok = True
    text_found = False
    error = None

    # Agent boundary: a failing analysis must not take the caller down, but
    # it is remembered so it is not later mistaken for "no text found".
    try:
        resp = _vlm(img, SYS_TEXT, USR_TEXT)
        if resp and not resp.startswith("VLM_ERROR"):
            parsed = _parse(resp)
            text_found = (bool(parsed.get("text_found", False))
                          and parsed.get("verdict") != "NO_TEXT")
            if not text_found:
                for feat in features:
                    findings.append({"test": feat, "score": 0.0,
                                     "note": "No text in image"})
                    scores.append(0.0)
            else:
                sc = _score(parsed)
                elements = parsed.get("text_elements") or []
                anomalies = parsed.get("anomalies") or []
                # Model output may give null or a non-string here; coerce it
                # so slicing cannot raise and discard the whole analysis.
                explanation = str(parsed.get("explanation") or "")
                # NOTE(review): every feature gets sc / 9, so the mean over
                # the nine feature scores plus the overall score is 0.2 * sc,
                # and the per-element flags requested in USR_TEXT are not
                # consulted. Kept as-is; confirm this weighting is intended.
                feat_score = sc / len(features)
                for feat in features:
                    findings.append({"test": feat, "score": feat_score,
                                     "note": explanation[:100]})
                    scores.append(feat_score)
                findings.append({"test": "Text & Typography Overall",
                                 "vlm_analysis": parsed,
                                 "text_elements": elements,
                                 "anomalies": anomalies,
                                 "score": sc,
                                 "confidence": parsed.get("confidence", 0.5),
                                 "note": explanation[:200]})
                scores.append(sc)
        else:
            vlm_ok = False
            for feat in features:
                findings.append({"test": feat, "score": 0.0,
                                 "note": "VLM unavailable", "vlm_error": True})
                scores.append(0.0)
    except Exception as e:
        error = str(e)
        findings.append({"test": "Text Analysis", "error": error, "score": 0})

    avg = float(np.mean(scores)) if scores else 0.0
    conf = min(1.0, 0.4 + 0.5 * abs(avg))
    degraded = error is not None or not vlm_ok
    if degraded:
        conf *= 0.3

    if error is not None:
        rationale = f"Text analysis failed: {error}"
    elif not vlm_ok:
        rationale = "VLM unavailable. Text analysis skipped."
    elif not text_found:
        rationale = "No text visible in image. Text agent not applicable."
        conf = 0.1
    else:
        viol = [f["test"] for f in findings if f.get("score", 0) > 0.1]
        if viol:
            rationale = f"Text anomalies: {', '.join(viol[:5])}."
        else:
            rationale = "Text appears legitimate."
        # All feature findings share the same explanation; add each distinct
        # snippet once instead of repeating it for every feature.
        seen = set()
        for f in findings:
            if "vlm_analysis" in f:
                continue
            snippet = str(f.get("note") or "")[:80]
            if snippet and snippet not in seen:
                seen.add(snippet)
                rationale += f" {snippet}"

    return AgentEvidence(
        agent_name="Text & Typography Agent",
        violation_score=np.clip(avg, -1, 1),
        confidence=conf,
        failure_prob=0.9 if degraded else 0.0,
        rationale=rationale,
        sub_findings=[f for f in findings if "vlm_analysis" not in f],
    )