Spaces:

RadicalNotionAI
/

modeldna

Running

File size: 9,259 Bytes

#!/usr/bin/env python3
"""
modeldna — HuggingFace Space
Interactive model provenance scanner.
Replaces the stale RadicalNotionAI/modelatlas-dashboard Space.

Deployed at: https://huggingface.co/spaces/RadicalNotionAI/modeldna
Custom domain: modeldna.ai (via HF Space custom domain setting)
"""
import gradio as gr
import json
import sys
import time
from pathlib import Path

# scan.py is in the same directory as app.py in both local hf_space/ and on HF
sys.path.insert(0, str(Path(__file__).parent))
from scan import scan, KNOWN_BASES

# ── Discovery: find derivatives that may not attribute properly ────────────

def find_unattributed_derivatives(base_match: str, scanned_id: str) -> list[dict]:
    """
    Look up previously scanned models that share a base but lack attribution.

    Queries the ``modeldna_scans`` table for rows whose ``confirmed_base``
    equals *base_match*, excluding *scanned_id* itself, where
    ``has_attribution`` is false or NULL. At most five rows are returned,
    ordered by download count (highest first, NULLs last).

    Args:
        base_match: Confirmed base-model identifier to match against.
        scanned_id: Model ID of the scan in progress; excluded from results.

    Returns:
        A list of dicts with keys ``model_id``, ``confirmed_base`` and
        ``downloads`` (``downloads`` is ``None`` when the column is NULL).
        An empty list is returned when the driver is missing, the database
        is unreachable, or the query fails — discovery is best-effort and
        must never break a scan.
    """
    import logging

    try:
        import psycopg2

        conn = psycopg2.connect(
            "postgresql:///modelatlas?host=/var/run/postgresql&port=5433&user=tim"
        )
        try:
            # The cursor context manager closes the cursor; the connection is
            # closed in ``finally`` so a failing query cannot leak it.
            with conn.cursor() as cur:
                # Find models in the scan results that match this base but lack attribution
                # (placeholder query — will be populated as scans accumulate)
                cur.execute("""
                    SELECT model_id, confirmed_base, has_attribution, downloads
                    FROM modeldna_scans
                    WHERE confirmed_base = %s
                      AND model_id != %s
                      AND (has_attribution = false OR has_attribution IS NULL)
                    ORDER BY downloads DESC NULLS LAST
                    LIMIT 5
                """, (base_match, scanned_id))
                rows = cur.fetchall()
        finally:
            conn.close()
    except Exception:
        # Deliberately broad: a missing driver, socket, or table all degrade to
        # "no derivatives known". Keep a debug trace instead of hiding it.
        logging.getLogger(__name__).debug(
            "derivative lookup failed for base %r", base_match, exc_info=True
        )
        return []

    return [
        {"model_id": model_id, "confirmed_base": confirmed_base, "downloads": downloads}
        for model_id, confirmed_base, _has_attribution, downloads in rows
    ]


def store_scan_result(result: dict) -> None:
    """
    Persist a scan result so later scans can discover related models.

    Creates the ``modeldna_scans`` table if needed, then upserts one row
    keyed on ``model_id``. On conflict every stored field — including
    ``confirmed_base`` — is refreshed, so a re-scan never leaves a stale base.

    Failed scans (a result carrying an ``"error"`` key, or one without a
    ``model_id``) are skipped: storing them would either insert a NULL-id row
    or overwrite an earlier good row with empty values.

    Args:
        result: Scan result dict. The keys read are ``model_id``,
            ``verdict`` (``base_model_confirmed``, ``confidence``,
            ``flag_count`` / ``flags``), ``metadata`` (``downloads``) and
            ``evidence`` (``claimed_base``).

    Returns:
        None. Storage is best-effort: any failure is logged at debug level
        and otherwise ignored so it cannot break the scan.
    """
    import logging

    try:
        model_id = result.get("model_id")
        if "error" in result or not model_id:
            return

        verdict = result.get("verdict") or {}
        metadata = result.get("metadata") or {}
        evidence = result.get("evidence") or {}

        # Prefer an explicit count; otherwise derive it from the flags list
        # rather than silently recording zero.
        flag_count = verdict.get("flag_count")
        if flag_count is None:
            flag_count = len(verdict.get("flags") or [])

        row = (
            model_id,
            verdict.get("base_model_confirmed"),
            verdict.get("confidence"),
            bool(evidence.get("claimed_base")),  # attribution == a declared base
            flag_count,
            metadata.get("downloads", 0),
        )

        import psycopg2

        conn = psycopg2.connect(
            "postgresql:///modelatlas?host=/var/run/postgresql&port=5433&user=tim"
        )
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    CREATE TABLE IF NOT EXISTS modeldna_scans (
                        id SERIAL PRIMARY KEY,
                        model_id TEXT UNIQUE,
                        confirmed_base TEXT,
                        confidence TEXT,
                        has_attribution BOOLEAN,
                        flag_count INT,
                        downloads INT,
                        scanned_at TIMESTAMPTZ DEFAULT now()
                    )
                """)
                cur.execute("""
                    INSERT INTO modeldna_scans
                      (model_id, confirmed_base, confidence, has_attribution, flag_count, downloads)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (model_id) DO UPDATE
                      SET confirmed_base=EXCLUDED.confirmed_base,
                          confidence=EXCLUDED.confidence,
                          has_attribution=EXCLUDED.has_attribution,
                          flag_count=EXCLUDED.flag_count,
                          downloads=EXCLUDED.downloads,
                          scanned_at=now()
                """, row)
            conn.commit()
        finally:
            # Closing without a commit discards the open transaction, so a
            # failed statement leaves nothing half-written and no leaked socket.
            conn.close()
    except Exception:
        # graceful — don't break the scan if storage fails
        logging.getLogger(__name__).debug("storing scan result failed", exc_info=True)


def format_verdict(result: dict) -> tuple[str, str, str]:
    """
    Render a scan result as three Markdown fragments for the UI.

    Args:
        result: Scan result dict. If it contains an ``"error"`` key the
            failure is rendered instead of a verdict. Otherwise the
            ``verdict``, ``evidence``, ``metadata`` and ``elapsed_s`` entries
            are read; any of them may be missing or ``None``.

    Returns:
        ``(header, details, flag_text)``:

        * ``header`` — confidence badge, architecture, timing and popularity.
        * ``details`` — confirmed base, confidence, up to two base matches'
          evidence lines and up to three similar reference models.
        * ``flag_text`` — one section per flag, or a "No Flags" notice.

    Raises:
        KeyError: If a flag lacks ``type``/``explanation`` or a similar-model
            entry lacks ``model_id``.
    """
    if "error" in result:
        return (
            "❌ Scan Failed",
            f"**Error**: {result['error']}",
            "",
        )

    # ``or {}`` also covers keys that are present but explicitly None.
    verdict = result.get("verdict") or {}
    evidence = result.get("evidence") or {}
    metadata = result.get("metadata") or {}
    flags = verdict.get("flags") or []

    # Header
    badge = {"HIGH": "✅", "MODERATE": "⚠️", "NONE": "❓"}.get(verdict.get("confidence", ""), "❓")
    # A None count would make the ``:,`` format spec raise TypeError.
    downloads = metadata.get("downloads") or 0
    likes = metadata.get("likes") or 0
    header = (
        f"{badge} **{verdict.get('architecture') or 'Unknown'}**"
        f"\n\n*Scanned in {result.get('elapsed_s', '?')}s · Stage 1 (config-only)*"
        f"\n\n📥 {downloads:,} downloads · 👍 {likes} likes"
    )

    # Verdict details
    detail_parts = [
        "### Architecture Confirmation\n",
        f"**Base model**: {verdict.get('base_model_confirmed') or 'Unrecognized'}\n",
        f"**Confidence**: {verdict.get('confidence', 'None')}\n\n",
    ]

    base_matches = evidence.get("base_matches")
    if base_matches:
        detail_parts.append("**Evidence**:\n")
        for match in base_matches[:2]:
            detail_parts.extend(f"- {item}\n" for item in match.get("evidence", []))
    # Spacer is emitted whether or not evidence was listed.
    detail_parts.append("\n")

    similar = evidence.get("modelatlas_similar")
    if similar:
        detail_parts.append("**Similar verified models** (ModelAtlas reference):\n")
        detail_parts.extend(f"- `{entry['model_id']}`\n" for entry in similar[:3])

    details = "".join(detail_parts)

    # Flags
    if flags:
        flag_text = f"### ⚠️ {len(flags)} Flag(s) Found\n\n" + "".join(
            f"**[{flag['type']}]**\n\n{flag['explanation']}\n\n---\n\n" for flag in flags
        )
    else:
        flag_text = "### ✅ No Flags\n\nNo suspicious claims detected in model name or metadata."

    return header, details, flag_text


# Path segments that follow the repo name on HuggingFace repo sub-pages.
_HF_REPO_SUBPAGES = frozenset(
    {"tree", "blob", "resolve", "raw", "discussions", "commit", "commits"}
)


def _normalize_model_id(raw: str) -> str:
    """Reduce a pasted HuggingFace URL to a bare model ID; pass plain IDs through."""
    if "huggingface.co/" not in raw:
        return raw

    path = raw.split("huggingface.co/")[-1]
    # Query strings and fragments are never part of a model ID.
    for separator in ("?", "#"):
        path = path.split(separator, 1)[0]

    segments: list[str] = []
    for segment in path.split("/"):
        if not segment:
            continue
        # Stop at sub-page routes such as ".../tree/main" or ".../blob/main/x".
        if segments and segment in _HF_REPO_SUBPAGES:
            break
        segments.append(segment)

    # A model ID is at most "owner/name".
    return "/".join(segments[:2])


def run_scan(model_id: str) -> tuple[str, str, str, str]:
    """
    Scan one model and build the four Markdown panels shown by the UI.

    Called by Gradio for both the Scan button and textbox submit.

    Args:
        model_id: A HuggingFace model ID or a full ``huggingface.co`` URL.
            URLs are reduced to the model ID, dropping sub-page paths,
            query strings and fragments.

    Returns:
        ``(header, details, flags, discovery)`` Markdown strings. For blank
        input the first element is a prompt and the rest are empty. When the
        scan reports an error, nothing is stored and ``discovery`` is empty,
        so the UI does not claim a failed scan was recorded.
    """
    prompt = ("Enter a HuggingFace model ID above.", "", "", "")

    model_id = (model_id or "").strip()
    if not model_id:
        return prompt

    # Normalize: handle full URLs
    model_id = _normalize_model_id(model_id)
    if not model_id:
        return prompt

    result = scan(model_id)
    header, details, flags = format_verdict(result)

    if "error" in result:
        return header, details, flags, ""

    # Store result for derivative discovery
    store_scan_result(result)

    # Find unattributed derivatives. Exclude the scanned model using the same
    # ID it was stored under, falling back to what the user typed.
    base = (result.get("verdict") or {}).get("base_model_confirmed")
    stored_id = result.get("model_id") or model_id
    derivatives = find_unattributed_derivatives(base, stored_id) if base else []

    # Derivative discovery section
    if derivatives:
        discovery = (
            f"### 🔍 {len(derivatives)} Related Models Found Without Attribution\n\n"
            "These models share the same architecture base but don't declare it:\n\n"
        )
        # The downloads column may be NULL, which arrives here as None.
        discovery += "".join(
            f"- `{d['model_id']}` ({(d.get('downloads') or 0):,} downloads)\n"
            for d in derivatives
        )
    else:
        discovery = (
            "### 🔍 Derivative Discovery\n\n"
            "This scan has been stored. As similar models are scanned, "
            "derivatives that don't properly attribute their source will appear here."
        )

    return header, details, flags, discovery


# ── Gradio UI ──────────────────────────────────────────────────────────────

# Sample model IDs offered as one-click inputs below the textbox.
EXAMPLES = [
    "Qwen/Qwen3.5-27B",
    "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled",
    "poolside/Laguna-XS.2",
    "deepseek-ai/DeepSeek-R1",
    "mistralai/Mistral-Medium-3.5-128B",
]

# Stylesheet passed to gr.Blocks: caps and centres the page width, enlarges
# anything tagged "verdict-header", and hides the footer element.
CSS = (
    "\n"
    ".gradio-container { max-width: 900px !important; margin: 0 auto; }\n"
    ".verdict-header { font-size: 1.2em; }\n"
    "footer { display: none; }\n"
)

# Page layout: intro, input row, examples, result panels, footer notes.
with gr.Blocks(
    title="ModelDNA — AI Model Provenance",
    theme=gr.themes.Ocean(),
    css=CSS,
) as demo:
    gr.Markdown("""
    # 🧬 ModelDNA
    ### The DNA test for AI models — verify provenance before you download
    *Powered by ModelAtlas · a RadicalNotion product*

    > **Works with:** standard HuggingFace checkpoints (safetensors / PyTorch bin).
    > **Not yet supported:** GGUF quantized models, private/gated models. No weight download needed — Stage 1 reads config.json only.
    ---
    """)

    with gr.Row():
        model_input = gr.Textbox(
            label="HuggingFace Model ID or URL",
            placeholder="e.g. Qwen/Qwen3.5-27B  (not GGUF — use the original checkpoint)",
            scale=4,
        )
        scan_btn = gr.Button("🔬 Scan", variant="primary", scale=1)

    gr.Examples(
        examples=EXAMPLES,
        inputs=model_input,
        label="Try these examples",
    )

    gr.Markdown("---")

    with gr.Row():
        # Carries the "verdict-header" class so the matching rule in CSS
        # actually applies; without it that rule styles nothing.
        header_out = gr.Markdown(label="Verdict", elem_classes="verdict-header")
    with gr.Row():
        with gr.Column():
            details_out = gr.Markdown(label="Evidence")
        with gr.Column():
            flags_out = gr.Markdown(label="Flags")

    gr.Markdown("---")
    discovery_out = gr.Markdown(label="Derivative Discovery")

    gr.Markdown("""
    ---
    *Stage 1 (architecture screening): free, unlimited, no weight download needed.*
    *Stage 2 (weight-level analysis): coming soon — deeper confirmation.*
    *[modeldna.ai](https://modeldna.ai) · [RadicalNotionAI on HF](https://huggingface.co/RadicalNotionAI)*
    """)

    # Clicking the button and pressing Enter in the textbox run the same
    # scan with the same wiring, so the arguments are defined once.
    scan_wiring = dict(
        fn=run_scan,
        inputs=[model_input],
        outputs=[header_out, details_out, flags_out, discovery_out],
    )
    scan_btn.click(**scan_wiring)
    model_input.submit(**scan_wiring)

# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()