Spaces: RadicalNotionAI/modeldna (Running)
Launch modeldna Space: Stage 1 architecture scanner
README.md
CHANGED
@@ -1,13 +1,29 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: ModelDNA
+emoji: 🧬
+colorFrom: blue
+colorTo: gray
 sdk: gradio
-sdk_version:
-python_version: '3.13'
+sdk_version: "4.40.0"
 app_file: app.py
-pinned:
+pinned: true
+license: apache-2.0
+short_description: Verify AI model provenance before you download
 ---
 
-
+# 🧬 ModelDNA
+
+**Verify AI model provenance before you download.**
+
+Paste any HuggingFace model ID (or URL) to instantly check:
+
+- **Architecture confirmation** — what base model does this actually use?
+- **Claim validation** — does the name match the architecture?
+- **Unverifiable claim flags** — e.g. "Claude-distilled" cannot be confirmed from weights
+- **Derivative discovery** — models sharing the same base that don't declare attribution
+
+Stage 1 uses only `config.json` (~2 KB). No weight download. Results in ~2 seconds.
+
+---
+
+*Powered by [ModelAtlas](https://modeldna.ai) · a [RadicalNotion](https://radicalnotion.ai) product*
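
Note: once the files below are in place, the Stage 1 check described in the README can also be driven straight from Python. A minimal sketch (the model ID is only an illustration; `scan` is defined in scan.py at the end of this commit):

# Minimal usage sketch, assuming scan.py (below) is importable.
from scan import scan

result = scan("Qwen/Qwen3.5-27B")            # a HuggingFace model ID (owner/name)
print(result["verdict"]["architecture"])     # CONFIRMED / LIKELY / UNRECOGNIZED verdict line
print(result["verdict"]["flags"])            # unverifiable-claim and name-mismatch flags
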
app.py
ADDED
@@ -0,0 +1,263 @@
#!/usr/bin/env python3
"""
modeldna — HuggingFace Space
Interactive model provenance scanner.
Replaces the stale RadicalNotionAI/modelatlas-dashboard Space.

Deployed at: https://huggingface.co/spaces/RadicalNotionAI/modeldna
Custom domain: modeldna.ai (via HF Space custom domain setting)
"""
import gradio as gr
import json
import sys
import time
from pathlib import Path

# scan.py is in the same directory as app.py in both local hf_space/ and on HF
sys.path.insert(0, str(Path(__file__).parent))
from scan import scan, KNOWN_BASES

# ── Discovery: find derivatives that may not attribute properly ────────────

def find_unattributed_derivatives(base_match: str, scanned_id: str) -> list[dict]:
    """
    Query the scan results database for models sharing the same base
    that don't declare attribution to their source.
    Returns models that appear derivative but lack proper attribution.
    """
    try:
        import psycopg2
        conn = psycopg2.connect(
            "postgresql:///modelatlas?host=/var/run/postgresql&port=5433&user=tim"
        )
        cur = conn.cursor()
        # Find models in the scan results that match this base but lack attribution
        # (placeholder query — will be populated as scans accumulate)
        cur.execute("""
            SELECT model_id, confirmed_base, has_attribution, downloads
            FROM modeldna_scans
            WHERE confirmed_base = %s
              AND model_id != %s
              AND (has_attribution = false OR has_attribution IS NULL)
            ORDER BY downloads DESC NULLS LAST
            LIMIT 5
        """, (base_match, scanned_id))
        rows = cur.fetchall()
        cur.close(); conn.close()
        return [{"model_id": r[0], "confirmed_base": r[1], "downloads": r[3]} for r in rows]
    except Exception:
        return []


def store_scan_result(result: dict) -> None:
    """Store a scan result for future derivative discovery."""
    try:
        import psycopg2
        conn = psycopg2.connect(
            "postgresql:///modelatlas?host=/var/run/postgresql&port=5433&user=tim"
        )
        cur = conn.cursor()
        cur.execute("""
            CREATE TABLE IF NOT EXISTS modeldna_scans (
                id SERIAL PRIMARY KEY,
                model_id TEXT UNIQUE,
                confirmed_base TEXT,
                confidence TEXT,
                has_attribution BOOLEAN,
                flag_count INT,
                downloads INT,
                scanned_at TIMESTAMPTZ DEFAULT now()
            )
        """)
        v = result.get("verdict", {})
        m = result.get("metadata", {})
        e = result.get("evidence", {})
        has_attr = bool(e.get("claimed_base"))
        cur.execute("""
            INSERT INTO modeldna_scans
                (model_id, confirmed_base, confidence, has_attribution, flag_count, downloads)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (model_id) DO UPDATE
                SET confidence=EXCLUDED.confidence,
                    has_attribution=EXCLUDED.has_attribution,
                    flag_count=EXCLUDED.flag_count,
                    downloads=EXCLUDED.downloads,
                    scanned_at=now()
        """, (
            result.get("model_id"),
            v.get("base_model_confirmed"),
            v.get("confidence"),
            has_attr,
            v.get("flag_count", 0),
            m.get("downloads", 0),
        ))
        conn.commit(); cur.close(); conn.close()
    except Exception:
        pass  # graceful — don't break the scan if storage fails


def format_verdict(result: dict) -> tuple[str, str, str]:
    """Format scan result into three UI sections."""
    if "error" in result:
        return (
            "❌ Scan Failed",
            f"**Error**: {result['error']}",
            ""
        )

    v = result.get("verdict", {})
    e = result.get("evidence", {})
    m = result.get("metadata", {})
    flags = v.get("flags", [])

    # Header
    confidence_emoji = {"HIGH": "✅", "MODERATE": "⚠️", "NONE": "❓"}.get(v.get("confidence", ""), "❓")
    header = f"{confidence_emoji} **{v.get('architecture', 'Unknown')}**"
    header += f"\n\n*Scanned in {result.get('elapsed_s', '?')}s · Stage 1 (config-only)*"
    header += f"\n\n📥 {m.get('downloads', 0):,} downloads · 👍 {m.get('likes', 0)} likes"

    # Verdict details
    details = "### Architecture Confirmation\n"
    details += f"**Base model**: {v.get('base_model_confirmed', 'Unrecognized')}\n"
    details += f"**Confidence**: {v.get('confidence', 'None')}\n\n"

    if e.get("base_matches"):
        details += "**Evidence**:\n"
        for bm in e["base_matches"][:2]:
            for ev in bm.get("evidence", []):
                details += f"- {ev}\n"
        details += "\n"

    if e.get("modelatlas_similar"):
        details += "**Similar verified models** (ModelAtlas reference):\n"
        for s in e["modelatlas_similar"][:3]:
            details += f"- `{s['model_id']}`\n"

    # Flags
    flag_text = ""
    if flags:
        flag_text = f"### ⚠️ {len(flags)} Flag(s) Found\n\n"
        for f in flags:
            flag_text += f"**[{f['type']}]**\n\n{f['explanation']}\n\n---\n\n"
    else:
        flag_text = "### ✅ No Flags\n\nNo suspicious claims detected in model name or metadata."

    return header, details, flag_text


def run_scan(model_id: str) -> tuple[str, str, str, str]:
    """Main scan function called by Gradio."""
    model_id = model_id.strip()
    if not model_id:
        return "Enter a HuggingFace model ID above.", "", "", ""

    # Normalize: handle full URLs
    if "huggingface.co/" in model_id:
        model_id = model_id.split("huggingface.co/")[-1].strip("/")

    result = scan(model_id)

    # Store result for derivative discovery
    store_scan_result(result)

    # Find unattributed derivatives
    base = result.get("verdict", {}).get("base_model_confirmed", "")
    derivatives = find_unattributed_derivatives(base, model_id) if base else []

    header, details, flags = format_verdict(result)

    # Derivative discovery section
    discovery = ""
    if derivatives:
        discovery = f"### 🔍 {len(derivatives)} Related Models Found Without Attribution\n\n"
        discovery += "These models share the same architecture base but don't declare it:\n\n"
        for d in derivatives:
            discovery += f"- `{d['model_id']}` ({d.get('downloads', 0):,} downloads)\n"
    else:
        discovery = (
            "### 🔍 Derivative Discovery\n\n"
            "This scan has been stored. As similar models are scanned, "
            "derivatives that don't properly attribute their source will appear here."
        )

    return header, details, flags, discovery


# ── Gradio UI ──────────────────────────────────────────────────────────────

EXAMPLES = [
    "Qwen/Qwen3.5-27B",
    "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled",
    "poolside/Laguna-XS.2",
    "deepseek-ai/DeepSeek-R1",
    "mistralai/Mistral-Medium-3.5-128B",
]

CSS = """
.gradio-container { max-width: 900px !important; margin: 0 auto; }
.verdict-header { font-size: 1.2em; }
footer { display: none; }
"""

with gr.Blocks(
    title="ModelDNA — AI Model Provenance",
    theme=gr.themes.Base(
        primary_hue="cyan",
        neutral_hue="slate",
    ),
    css=CSS,
) as demo:
    gr.Markdown("""
# 🧬 ModelDNA
### The DNA test for AI models — verify provenance before you download
*Powered by ModelAtlas · a RadicalNotion product*
---
""")

    with gr.Row():
        model_input = gr.Textbox(
            label="HuggingFace Model ID",
            placeholder="e.g. Qwen/Qwen3.5-27B or paste a HF URL",
            scale=4,
        )
        scan_btn = gr.Button("🔬 Scan", variant="primary", scale=1)

    gr.Examples(
        examples=EXAMPLES,
        inputs=model_input,
        label="Try these examples",
    )

    gr.Markdown("---")

    with gr.Row():
        header_out = gr.Markdown(label="Verdict")
    with gr.Row():
        with gr.Column():
            details_out = gr.Markdown(label="Evidence")
        with gr.Column():
            flags_out = gr.Markdown(label="Flags")

    gr.Markdown("---")
    discovery_out = gr.Markdown(label="Derivative Discovery")

    gr.Markdown("""
---
*Stage 1 (architecture screening): free, unlimited, no weight download needed.*
*Stage 2 (weight-level analysis): coming soon — deeper confirmation.*
*[modeldna.ai](https://modeldna.ai) · [RadicalNotionAI on HF](https://huggingface.co/RadicalNotionAI)*
""")

    scan_btn.click(
        fn=run_scan,
        inputs=[model_input],
        outputs=[header_out, details_out, flags_out, discovery_out],
    )
    model_input.submit(
        fn=run_scan,
        inputs=[model_input],
        outputs=[header_out, details_out, flags_out, discovery_out],
    )

if __name__ == "__main__":
    demo.launch()
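
The callback above can be smoke-tested without launching the UI; run_scan strips full huggingface.co URLs down to a model ID and returns the four Markdown strings wired into the outputs. A sketch (not part of the commit; it needs network access, and the Postgres-backed steps fail gracefully without the database):

# Sketch: call the Gradio callback directly and inspect the four Markdown outputs.
header, details, flags_md, discovery = run_scan("https://huggingface.co/deepseek-ai/DeepSeek-R1")
print(header)      # confidence emoji plus the architecture verdict
print(discovery)   # derivative-discovery section
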
requirements.txt
ADDED
@@ -0,0 +1,3 @@
gradio>=4.40.0
requests>=2.31.0
psycopg2-binary>=2.9.9
scan.py
ADDED
@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""
modeldna Stage 1 HF Scanner — core logic.
Given a HuggingFace model_id, validates architectural claims against the
ModelAtlas reference database. No weight download needed — uses config.json only.

This is the heart of the modeldna 'test before you download' feature.
"""
from __future__ import annotations
import json, hashlib, re, time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
import requests
import psycopg2, psycopg2.extras

DB = "postgresql:///modelatlas?host=/var/run/postgresql&port=5433&user=tim"
HF_API = "https://huggingface.co"

# Known base model reference configs (canonical identifiers)
KNOWN_BASES = {
    "qwen3_5_text": {
        "name": "Qwen3.5 (dense)",
        "vocab_size": 248320,
        "model_type_patterns": ["qwen3_5_text", "qwen3_5"],
    },
    "qwen3_5_moe_text": {
        "name": "Qwen3.5 MoE",
        "vocab_size": 248320,
        "model_type_patterns": ["qwen3_5_moe_text", "qwen3_5_moe"],
    },
    "qwen3": {
        "name": "Qwen3",
        "vocab_size": [151936, 152064],
        "model_type_patterns": ["qwen3"],
    },
    "qwen2": {
        "name": "Qwen2.5",
        "vocab_size": [151936, 152064],
        "model_type_patterns": ["qwen2"],
    },
    "llama3": {
        "name": "Llama 3.x",
        "vocab_size": 128256,
        "model_type_patterns": ["llama"],
        "num_key_value_heads_hint": [8, 32],
    },
    "llama2": {
        "name": "Llama 2",
        "vocab_size": 32000,
        "model_type_patterns": ["llama"],
    },
    "mistral": {
        "name": "Mistral 7B family",
        "vocab_size": 32000,
        "model_type_patterns": ["mistral", "mixtral"],
    },
    "deepseek_v3": {
        "name": "DeepSeek V3/R1",
        "vocab_size": 129280,
        "model_type_patterns": ["deepseek_v3", "deepseek_v2"],
        "kv_lora_rank": 512,
    },
    "gemma": {
        "name": "Gemma family",
        "vocab_size": [256000, 262144],
        "model_type_patterns": ["gemma"],
    },
}


def fetch_config(model_id: str) -> Optional[dict]:
    """Fetch config.json from HuggingFace. Returns None on failure."""
    url = f"{HF_API}/{model_id}/resolve/main/config.json"
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        return r.json()
    except Exception:
        return None


def fetch_model_metadata(model_id: str) -> dict:
    """Fetch HF model metadata (downloads, likes, author, tags)."""
    try:
        r = requests.get(f"{HF_API}/api/models/{model_id}", timeout=10)
        r.raise_for_status()
        d = r.json()
        return {
            "downloads": d.get("downloads", 0),
            "likes": d.get("likes", 0),
            "author": d.get("author", ""),
            "tags": d.get("tags", []),
            "pipeline_tag": d.get("pipeline_tag", ""),
            "base_model": d.get("cardData", {}).get("base_model", ""),
            "license": d.get("cardData", {}).get("license", ""),
            "created_at": d.get("createdAt", ""),
            "last_modified": d.get("lastModified", ""),
        }
    except Exception:
        return {}


def detect_claimed_base(model_id: str, config: dict, metadata: dict) -> dict:
    """Detect what base model a model claims to be derived from."""
    claims = {}
    name = model_id.split("/")[-1].lower()
    # Explicit base_model field
    if metadata.get("base_model"):
        claims["explicit_base"] = metadata["base_model"]
    # Name-based detection
    name_signals = []
    for term, base_key in [
        ("qwen3.5", "qwen3_5"), ("qwen3-5", "qwen3_5"), ("qwen35", "qwen3_5"),
        ("qwen3", "qwen3"), ("qwen2.5", "qwen2"), ("qwen2", "qwen2"),
        ("llama-3", "llama3"), ("llama3", "llama3"), ("llama-2", "llama2"),
        ("mistral", "mistral"), ("mixtral", "mistral"),
        ("deepseek", "deepseek_v3"), ("gemma", "gemma"),
    ]:
        if term in name:
            name_signals.append(base_key)
    if name_signals:
        claims["name_implies"] = name_signals
    # Suspicious claims in name
    suspicious = []
    for term in ["claude", "gpt", "chatgpt", "openai", "gemini", "anthropic"]:
        if term in name:
            suspicious.append(term)
    if suspicious:
        claims["suspicious_name_terms"] = suspicious
    return claims


def stage1_screen(model_id: str, config: dict) -> dict:
    """
    Stage 1: Architecture screening against ModelAtlas reference.
    Returns a structured verdict without downloading any weights.
    Handles nested text_config (Qwen3.5/3.6, Mistral3, MiMo-V2.5 pattern).
    """
    # Merge text_config into top-level if present (multimodal nested configs)
    if config.get("text_config") and not config.get("vocab_size"):
        tc = config["text_config"]
        config = {**tc, **{k: v for k, v in config.items()
                           if k not in ("text_config", "vision_config", "audio_config")}}

    vocab = config.get("vocab_size")
    model_type = (config.get("model_type") or "").lower()
    hidden = config.get("hidden_size")
    layers = config.get("num_hidden_layers")
    kv_lora = config.get("kv_lora_rank")  # MLA signal
    base_model_field = config.get("base_model") or config.get("_name_or_path", "")

    # Compute architecture signature
    key_fields = sorted([
        f"vocab={vocab}", f"type={model_type}", f"hidden={hidden}",
        f"layers={layers}", f"kv_lora={kv_lora}",
    ])
    arch_sig = hashlib.md5("|".join(str(f) for f in key_fields).encode()).hexdigest()[:12]

    # Match against known bases
    base_matches = []
    for base_key, base_info in KNOWN_BASES.items():
        score = 0
        reasons = []
        # Vocab match
        expected_vocab = base_info.get("vocab_size")
        if isinstance(expected_vocab, list):
            if vocab in expected_vocab: score += 3; reasons.append(f"vocab matches ({vocab})")
        elif vocab == expected_vocab:
            score += 3; reasons.append(f"vocab matches ({vocab})")
        # Model type match
        for pat in base_info.get("model_type_patterns", []):
            if model_type == pat:
                score += 3; reasons.append(f"model_type '{model_type}' exact"); break
            elif model_type.startswith(pat):
                score += 2; reasons.append(f"model_type '{model_type}' matches {pat}"); break
        # MLA signal
        if base_key == "deepseek_v3" and kv_lora and kv_lora > 0:
            score += 2; reasons.append(f"MLA kv_lora_rank={kv_lora}")
        if score >= 3:
            base_matches.append({
                "base": base_key,
                "name": base_info["name"],
                "confidence": "HIGH" if score >= 5 else "MODERATE",
                "score": score,
                "evidence": reasons,
            })

    # Check ModelAtlas DB for exact signature
    db_matches = []
    try:
        conn = psycopg2.connect(DB)
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cur.execute("""
            SELECT m.model_id, o.name AS lab, m.hf_downloads, m.release_date,
                   a.technique_signature, a.total_params, a.num_layers, a.hidden_size, a.vocab_size
            FROM analyses a JOIN models m ON m.id=a.model_id
            JOIN organizations o ON m.org_id=o.id
            WHERE a.is_current=true AND a.vocab_size=%s AND a.hidden_size=%s
              AND m.model_id NOT ILIKE '%%tiny%%' AND m.model_id NOT ILIKE '/%%'
            ORDER BY m.hf_downloads DESC NULLS LAST
            LIMIT 5
        """, (vocab, hidden))
        db_matches = [dict(r) for r in cur.fetchall()]
        cur.close(); conn.close()
    except Exception:
        pass

    return {
        "arch_signature": arch_sig,
        "config_signals": {
            "model_type": model_type,
            "vocab_size": vocab,
            "hidden_size": hidden,
            "num_layers": layers,
            "has_mla": bool(kv_lora and kv_lora > 0),
            "kv_lora_rank": kv_lora,
        },
        "base_matches": sorted(base_matches, key=lambda x: -x["score"]),
        "modelatlas_similar": db_matches,
    }


def generate_verdict(
    model_id: str,
    config: dict,
    metadata: dict,
    claims: dict,
    stage1: dict,
) -> dict:
    """Synthesize all signals into a human-readable verdict."""
    now = datetime.now(timezone.utc).isoformat()
    base_matches = stage1["base_matches"]
    suspicious = claims.get("suspicious_name_terms", [])

    # Headline verdict
    if base_matches:
        top = base_matches[0]
        if top["confidence"] == "HIGH":
            architecture_verdict = f"CONFIRMED — architecture matches {top['name']}"
        else:
            architecture_verdict = f"LIKELY — architecture consistent with {top['name']}"
    else:
        architecture_verdict = "UNRECOGNIZED — architecture does not match any known base model"

    # Claim accuracy flags
    flags = []
    if "claude" in suspicious or "anthropic" in suspicious:
        flags.append({
            "type": "UNVERIFIABLE_CLAIM",
            "term": "claude/anthropic",
            "explanation": (
                "Claude weights are not publicly available — no weight transfer from Claude "
                "is possible. If this model used Claude-generated reasoning traces as training "
                "data (distillation), that is a post-training technique that leaves no "
                "architectural trace and cannot be verified from weights alone. "
                "The base architecture claim can be checked; the Claude claim cannot."
            ),
        })
    if "gpt" in suspicious or "openai" in suspicious or "chatgpt" in suspicious:
        flags.append({
            "type": "UNVERIFIABLE_CLAIM",
            "term": "gpt/openai",
            "explanation": "GPT-4/OpenAI weights are closed. Any weight transfer claim is false. Distillation via outputs is possible but unverifiable from architecture.",
        })
    if "gemini" in suspicious:
        flags.append({
            "type": "UNVERIFIABLE_CLAIM",
            "term": "gemini",
            "explanation": "Gemini weights are closed. Architecture shows no Gemini structure.",
        })

    # Name vs architecture consistency
    name_implied = claims.get("name_implies", [])
    if name_implied and base_matches:
        top_base = base_matches[0]["base"]
        if not any(n in top_base or top_base in n for n in name_implied):
            flags.append({
                "type": "NAME_MISMATCH",
                "explanation": f"Model name implies {name_implied} but architecture suggests {top_base}. Possible mislabeling.",
            })

    return {
        "model_id": model_id,
        "scanned_at": now,
        "verdict": {
            "architecture": architecture_verdict,
            "base_model_confirmed": base_matches[0]["name"] if base_matches else "Unknown",
            "confidence": base_matches[0]["confidence"] if base_matches else "NONE",
            "flags": flags,
            "flag_count": len(flags),
            "stage": "Stage 1 (config-only — no weight download)",
        },
        "evidence": {
            "config_signals": stage1["config_signals"],
            "base_matches": stage1["base_matches"][:3],
            "modelatlas_similar": stage1["modelatlas_similar"][:3],
            "claimed_base": claims.get("explicit_base"),
            "name_implies": name_implied,
        },
        "metadata": {
            "downloads": metadata.get("downloads", 0),
            "likes": metadata.get("likes", 0),
            "license": metadata.get("license", ""),
            "created_at": metadata.get("created_at", ""),
        },
        "note": (
            "Stage 1 validates architecture from config.json only (~2KB). "
            "Stage 2 weight analysis (requires model download) provides stronger confirmation. "
            "Powered by ModelAtlas — modeldna.ai · a RadicalNotion product."
        ),
    }


def scan(model_id: str) -> dict:
    """Full Stage 1 scan. Entry point."""
    t0 = time.time()
    config = fetch_config(model_id)
    if not config:
        return {
            "model_id": model_id,
            "error": "Could not fetch config.json — model may be private, gated, or not exist on HuggingFace.",
            "scanned_at": datetime.now(timezone.utc).isoformat(),
        }
    metadata = fetch_model_metadata(model_id)
    claims = detect_claimed_base(model_id, config, metadata)
    stage1 = stage1_screen(model_id, config)
    verdict = generate_verdict(model_id, config, metadata, claims, stage1)
    verdict["elapsed_s"] = round(time.time() - t0, 2)
    return verdict


if __name__ == "__main__":
    import sys
    model_id = sys.argv[1] if len(sys.argv) > 1 else "Qwen/Qwen3.5-27B"
    result = scan(model_id)
    print(json.dumps(result, indent=2, default=str))
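
To illustrate the scoring in stage1_screen, here is a hypothetical config whose fields line up with the llama3 entry in KNOWN_BASES; the model ID and config values are invented for the example, and the ModelAtlas DB lookup simply returns no rows when the database is unreachable:

# Hypothetical example: a config that should score as Llama 3.x (vocab and model_type both match).
example_config = {
    "model_type": "llama",
    "vocab_size": 128256,
    "hidden_size": 4096,
    "num_hidden_layers": 32,
}
screen = stage1_screen("example-org/llama3-finetune", example_config)
print(screen["base_matches"][0]["name"])    # "Llama 3.x" (score 6 -> HIGH confidence)
print(screen["config_signals"]["has_mla"])  # False (no kv_lora_rank in this config)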