trohrbaugh committed on
Commit
b210edb
·
verified ·
1 Parent(s): 05354d8

Launch modeldna Space: Stage 1 architecture scanner

Browse files
Files changed (4) hide show
  1. README.md +24 -8
  2. app.py +263 -0
  3. requirements.txt +3 -0
  4. scan.py +337 -0
README.md CHANGED
@@ -1,13 +1,29 @@
1
  ---
2
- title: Modeldna
3
- emoji: 🌍
4
- colorFrom: gray
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.14.0
8
- python_version: '3.13'
9
  app_file: app.py
10
- pinned: false
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: ModelDNA
3
+ emoji: 🧬
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: "4.40.0"
 
8
  app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ short_description: Verify AI model provenance before you download
12
  ---
13
 
14
+ # 🧬 ModelDNA
15
+
16
+ **Verify AI model provenance before you download.**
17
+
18
+ Paste any HuggingFace model ID (or URL) to instantly check:
19
+
20
+ - **Architecture confirmation** — what base model does this actually use?
21
+ - **Claim validation** — does the name match the architecture?
22
+ - **Unverifiable claim flags** — e.g. "Claude-distilled" cannot be confirmed from weights
23
+ - **Derivative discovery** — models sharing the same base that don't declare attribution
24
+
25
+ Stage 1 uses only `config.json` (~2 KB). No weight download. Results in ~2 seconds.
26
+
27
+ ---
28
+
29
+ *Powered by [ModelAtlas](https://modeldna.ai) · a [RadicalNotion](https://radicalnotion.ai) product*
app.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ modeldna — HuggingFace Space
4
+ Interactive model provenance scanner.
5
+ Replaces the stale RadicalNotionAI/modelatlas-dashboard Space.
6
+
7
+ Deployed at: https://huggingface.co/spaces/RadicalNotionAI/modeldna
8
+ Custom domain: modeldna.ai (via HF Space custom domain setting)
9
+ """
10
+ import gradio as gr
11
+ import json
12
+ import sys
13
+ import time
14
+ from pathlib import Path
15
+
16
+ # scan.py is in the same directory as app.py in both local hf_space/ and on HF
17
+ sys.path.insert(0, str(Path(__file__).parent))
18
+ from scan import scan, KNOWN_BASES
19
+
20
+ # ── Discovery: find derivatives that may not attribute properly ────────────
21
+
22
def find_unattributed_derivatives(base_match: str, scanned_id: str) -> list[dict]:
    """
    Look up previously scanned models that share a base but lack attribution.

    Queries the ``modeldna_scans`` table for rows whose ``confirmed_base``
    equals ``base_match``, excluding ``scanned_id`` itself, where
    ``has_attribution`` is false or NULL. At most five rows are returned,
    ordered by download count (highest first, NULLs last).

    Args:
        base_match: Confirmed base-model name to search for.
        scanned_id: Model ID of the scan in progress; excluded from results.

    Returns:
        A list of ``{"model_id", "confirmed_base", "downloads"}`` dicts.
        Empty when ``base_match`` is empty, or when the driver/database is
        unavailable or the query fails (discovery is best-effort).
    """
    if not base_match:
        # Nothing to match against; avoid opening a connection at all.
        return []

    conn = None
    try:
        import psycopg2

        conn = psycopg2.connect(
            "postgresql:///modelatlas?host=/var/run/postgresql&port=5433&user=tim"
        )
        # The cursor context manager closes the cursor even if the query fails.
        with conn.cursor() as cur:
            # Find models in the scan results that match this base but lack attribution
            # (placeholder query — will be populated as scans accumulate)
            cur.execute("""
                SELECT model_id, confirmed_base, has_attribution, downloads
                FROM modeldna_scans
                WHERE confirmed_base = %s
                AND model_id != %s
                AND (has_attribution = false OR has_attribution IS NULL)
                ORDER BY downloads DESC NULLS LAST
                LIMIT 5
            """, (base_match, scanned_id))
            rows = cur.fetchall()
    except Exception:
        # Best-effort boundary: a missing driver, unreachable database or
        # missing table must not break the scan that triggered this lookup.
        return []
    finally:
        # Always release the connection, including on query failure.
        if conn is not None:
            conn.close()

    return [
        {"model_id": model_id, "confirmed_base": confirmed_base, "downloads": downloads}
        for model_id, confirmed_base, _has_attribution, downloads in rows
    ]
50
+
51
+
52
def store_scan_result(result: dict) -> None:
    """
    Store a scan result for future derivative discovery.

    Upserts one row per ``model_id`` into ``modeldna_scans`` (creating the
    table if needed). On conflict every scan-derived column is refreshed,
    including ``confirmed_base``.

    Failed scans (results carrying an ``"error"`` key) and results without a
    ``model_id`` are not stored: they have no verdict, and upserting them
    would overwrite a previously good row with NULL/false values.

    Args:
        result: Scan result dict as returned by ``scan``.

    Returns:
        None. Storage is best-effort; driver, connection and SQL errors are
        swallowed so that a storage problem never breaks the scan.
    """
    if "error" in result or not result.get("model_id"):
        return

    conn = None
    try:
        import psycopg2

        verdict = result.get("verdict", {})
        metadata = result.get("metadata", {})
        evidence = result.get("evidence", {})
        # A model counts as attributed when it declares an explicit base model.
        has_attr = bool(evidence.get("claimed_base"))

        conn = psycopg2.connect(
            "postgresql:///modelatlas?host=/var/run/postgresql&port=5433&user=tim"
        )
        with conn.cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS modeldna_scans (
                    id SERIAL PRIMARY KEY,
                    model_id TEXT UNIQUE,
                    confirmed_base TEXT,
                    confidence TEXT,
                    has_attribution BOOLEAN,
                    flag_count INT,
                    downloads INT,
                    scanned_at TIMESTAMPTZ DEFAULT now()
                )
            """)
            cur.execute("""
                INSERT INTO modeldna_scans
                (model_id, confirmed_base, confidence, has_attribution, flag_count, downloads)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (model_id) DO UPDATE
                SET confirmed_base=EXCLUDED.confirmed_base,
                    confidence=EXCLUDED.confidence,
                    has_attribution=EXCLUDED.has_attribution,
                    flag_count=EXCLUDED.flag_count,
                    downloads=EXCLUDED.downloads,
                    scanned_at=now()
            """, (
                result.get("model_id"),
                verdict.get("base_model_confirmed"),
                verdict.get("confidence"),
                has_attr,
                verdict.get("flag_count", 0),
                metadata.get("downloads", 0),
            ))
        conn.commit()
    except Exception:
        pass  # graceful — don't break the scan if storage fails
    finally:
        # Release the connection even when connect succeeded but SQL failed.
        if conn is not None:
            conn.close()
97
+
98
+
99
def format_verdict(result: dict) -> tuple[str, str, str]:
    """
    Format a scan result into three Markdown sections for the UI.

    Args:
        result: Scan result dict. Either an error result (has an ``"error"``
            key) or a full result with ``verdict``, ``evidence`` and
            ``metadata`` sub-dicts.

    Returns:
        ``(header, details, flag_text)`` Markdown strings. For an error
        result the header is a failure banner, the details carry the error
        message and the flag text is empty.
    """
    if "error" in result:
        return (
            "❌ Scan Failed",
            f"**Error**: {result['error']}",
            "",
        )

    verdict = result.get("verdict", {})
    evidence = result.get("evidence", {})
    metadata = result.get("metadata", {})
    flags = verdict.get("flags", [])

    # dict.get's default only applies to a *missing* key; a key present with
    # value None would otherwise crash the "," thousands format below.
    downloads = metadata.get("downloads") or 0
    likes = metadata.get("likes") or 0

    # Header
    confidence_emoji = {"HIGH": "✅", "MODERATE": "⚠️", "NONE": "❓"}.get(
        verdict.get("confidence", ""), "❓"
    )
    header = (
        f"{confidence_emoji} **{verdict.get('architecture', 'Unknown')}**"
        f"\n\n*Scanned in {result.get('elapsed_s', '?')}s · Stage 1 (config-only)*"
        f"\n\n📥 {downloads:,} downloads · 👍 {likes} likes"
    )

    # Verdict details
    parts = [
        "### Architecture Confirmation\n",
        f"**Base model**: {verdict.get('base_model_confirmed', 'Unrecognized')}\n",
        f"**Confidence**: {verdict.get('confidence', 'None')}\n\n",
    ]

    base_matches = evidence.get("base_matches")
    if base_matches:
        parts.append("**Evidence**:\n")
        # Only the two best-scoring bases are shown to keep the panel short.
        parts.extend(
            f"- {reason}\n"
            for match in base_matches[:2]
            for reason in match.get("evidence", [])
        )
        parts.append("\n")

    similar = evidence.get("modelatlas_similar")
    if similar:
        parts.append("**Similar verified models** (ModelAtlas reference):\n")
        parts.extend(f"- `{entry['model_id']}`\n" for entry in similar[:3])

    details = "".join(parts)

    # Flags
    if flags:
        flag_text = f"### ⚠️ {len(flags)} Flag(s) Found\n\n" + "".join(
            f"**[{flag['type']}]**\n\n{flag['explanation']}\n\n---\n\n"
            for flag in flags
        )
    else:
        flag_text = "### ✅ No Flags\n\nNo suspicious claims detected in model name or metadata."

    return header, details, flag_text
146
+
147
+
148
def _normalize_model_id(raw: str) -> str:
    """Reduce user input (a bare model ID or a HuggingFace URL) to a model ID."""
    text = (raw or "").strip()
    if "huggingface.co/" not in text:
        return text
    path = text.split("huggingface.co/")[-1]
    # Drop any query string / fragment pasted along with the URL.
    path = path.split("?", 1)[0].split("#", 1)[0]
    # Keep only "<org>/<name>"; trailing routes such as /tree/main or
    # /blob/main/config.json are not part of the model ID.
    segments = [segment for segment in path.split("/") if segment]
    return "/".join(segments[:2])


def run_scan(model_id: str) -> tuple[str, str, str, str]:
    """
    Main scan function called by Gradio.

    Normalizes the input, runs the Stage 1 scan, stores successful results
    and looks up unattributed derivatives of the confirmed base.

    Args:
        model_id: A HuggingFace model ID or a huggingface.co model URL.

    Returns:
        ``(header, details, flags, discovery)`` Markdown strings. For empty
        input only the header carries a prompt. For a failed scan the
        discovery section is empty, because nothing is stored or looked up.
    """
    model_id = _normalize_model_id(model_id)
    if not model_id:
        return "Enter a HuggingFace model ID above.", "", "", ""

    result = scan(model_id)
    header, details, flags = format_verdict(result)

    if "error" in result:
        # A failed scan has no verdict: do not store it and do not claim
        # in the discovery panel that it was stored.
        return header, details, flags, ""

    # Store result for derivative discovery
    store_scan_result(result)

    # Find unattributed derivatives. An unrecognized architecture is reported
    # with confidence "NONE" and a placeholder base name; looking that up
    # would list unrelated unrecognized models as "sharing the same base".
    verdict = result.get("verdict", {})
    base = verdict.get("base_model_confirmed", "")
    recognized = bool(base) and verdict.get("confidence") != "NONE"
    derivatives = find_unattributed_derivatives(base, model_id) if recognized else []

    # Derivative discovery section
    if derivatives:
        lines = [
            f"### 🔍 {len(derivatives)} Related Models Found Without Attribution\n\n",
            "These models share the same architecture base but don't declare it:\n\n",
        ]
        lines.extend(
            f"- `{d['model_id']}` ({(d.get('downloads') or 0):,} downloads)\n"
            for d in derivatives
        )
        discovery = "".join(lines)
    else:
        discovery = (
            "### 🔍 Derivative Discovery\n\n"
            "This scan has been stored. As similar models are scanned, "
            "derivatives that don't properly attribute their source will appear here."
        )

    return header, details, flags, discovery
184
+
185
+
186
+ # ── Gradio UI ──────────────────────────────────────────────────────────────
187
+
188
# Model IDs offered as one-click examples under the input box.
EXAMPLES = [
    "Qwen/Qwen3.5-27B",
    "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled",
    "poolside/Laguna-XS.2",
    "deepseek-ai/DeepSeek-R1",
    "mistralai/Mistral-Medium-3.5-128B",
]

# Page-level CSS: constrain width, enlarge the verdict header, hide the footer.
CSS = """
.gradio-container { max-width: 900px !important; margin: 0 auto; }
.verdict-header { font-size: 1.2em; }
footer { display: none; }
"""

_theme = gr.themes.Base(primary_hue="cyan", neutral_hue="slate")

with gr.Blocks(title="ModelDNA — AI Model Provenance", theme=_theme, css=CSS) as demo:
    # Title banner.
    gr.Markdown("""
    # 🧬 ModelDNA
    ### The DNA test for AI models — verify provenance before you download
    *Powered by ModelAtlas · a RadicalNotion product*
    ---
    """)

    # Input row: model ID textbox plus the scan button.
    with gr.Row():
        model_input = gr.Textbox(
            label="HuggingFace Model ID",
            placeholder="e.g. Qwen/Qwen3.5-27B or paste a HF URL",
            scale=4,
        )
        scan_btn = gr.Button("🔬 Scan", variant="primary", scale=1)

    gr.Examples(examples=EXAMPLES, inputs=model_input, label="Try these examples")

    gr.Markdown("---")

    # Result panels: headline verdict, then evidence and flags side by side.
    with gr.Row():
        header_out = gr.Markdown(label="Verdict")
    with gr.Row():
        with gr.Column():
            details_out = gr.Markdown(label="Evidence")
        with gr.Column():
            flags_out = gr.Markdown(label="Flags")

    gr.Markdown("---")
    discovery_out = gr.Markdown(label="Derivative Discovery")

    gr.Markdown("""
    ---
    *Stage 1 (architecture screening): free, unlimited, no weight download needed.*
    *Stage 2 (weight-level analysis): coming soon — deeper confirmation.*
    *[modeldna.ai](https://modeldna.ai) · [RadicalNotionAI on HF](https://huggingface.co/RadicalNotionAI)*
    """)

    # The button click and pressing Enter in the textbox run the same scan
    # and fill the same four output panels.
    for _register in (scan_btn.click, model_input.submit):
        _register(
            fn=run_scan,
            inputs=[model_input],
            outputs=[header_out, details_out, flags_out, discovery_out],
        )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio>=4.40.0
2
+ requests>=2.31.0
3
+ psycopg2-binary>=2.9.9
scan.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ modeldna Stage 1 HF Scanner — core logic.
4
+ Given a HuggingFace model_id, validates architectural claims against the
5
+ ModelAtlas reference database. No weight download needed — uses config.json only.
6
+
7
+ This is the heart of the modeldna 'test before you download' feature.
8
+ """
9
+ from __future__ import annotations
10
+ import json, hashlib, re, time
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+ from typing import Optional
14
+ import requests
15
+ import psycopg2, psycopg2.extras
16
+
17
# Connection URI handed to psycopg2.connect() by stage1_screen for the
# ModelAtlas reference lookup.
DB = "postgresql:///modelatlas?host=/var/run/postgresql&port=5433&user=tim"
# Base URL used to build both the config.json and the /api/models request URLs.
HF_API = "https://huggingface.co"

# Known base model reference configs (canonical identifiers)
# Each entry maps a base key to:
#   name                 - display name used in verdicts
#   vocab_size           - a single int, or a list of accepted values
#                          (stage1_screen handles both forms)
#   model_type_patterns  - config "model_type" values compared by exact match
#                          first, then by prefix
# The extra keys "num_key_value_heads_hint" (llama3) and "kv_lora_rank"
# (deepseek_v3) are not read by stage1_screen in this file.
KNOWN_BASES = {
    "qwen3_5_text": {
        "name": "Qwen3.5 (dense)",
        "vocab_size": 248320,
        "model_type_patterns": ["qwen3_5_text", "qwen3_5"],
    },
    "qwen3_5_moe_text": {
        "name": "Qwen3.5 MoE",
        "vocab_size": 248320,
        "model_type_patterns": ["qwen3_5_moe_text", "qwen3_5_moe"],
    },
    "qwen3": {
        "name": "Qwen3",
        "vocab_size": [151936, 152064],
        "model_type_patterns": ["qwen3"],
    },
    "qwen2": {
        "name": "Qwen2.5",
        "vocab_size": [151936, 152064],
        "model_type_patterns": ["qwen2"],
    },
    "llama3": {
        "name": "Llama 3.x",
        "vocab_size": 128256,
        "model_type_patterns": ["llama"],
        "num_key_value_heads_hint": [8, 32],
    },
    "llama2": {
        "name": "Llama 2",
        "vocab_size": 32000,
        "model_type_patterns": ["llama"],
    },
    "mistral": {
        "name": "Mistral 7B family",
        "vocab_size": 32000,
        "model_type_patterns": ["mistral", "mixtral"],
    },
    "deepseek_v3": {
        "name": "DeepSeek V3/R1",
        "vocab_size": 129280,
        "model_type_patterns": ["deepseek_v3", "deepseek_v2"],
        "kv_lora_rank": 512,
    },
    "gemma": {
        "name": "Gemma family",
        "vocab_size": [256000, 262144],
        "model_type_patterns": ["gemma"],
    },
}
70
+
71
+
72
def fetch_config(model_id: str) -> Optional[dict]:
    """
    Fetch ``config.json`` for ``model_id`` from the HuggingFace main branch.

    Args:
        model_id: HuggingFace model ID (e.g. ``org/name``).

    Returns:
        The parsed config as a dict, or None when the request fails (network
        error, timeout, non-2xx status), the body is not valid JSON, or the
        JSON payload is not an object.
    """
    url = f"{HF_API}/{model_id}/resolve/main/config.json"
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection, timeout and HTTP status errors;
        # ValueError covers a body that is not valid JSON.
        return None
    # Callers use dict methods on the config; reject lists/scalars here.
    return payload if isinstance(payload, dict) else None
81
+
82
+
83
def fetch_model_metadata(model_id: str) -> dict:
    """
    Fetch HF model metadata (downloads, likes, author, tags).

    Args:
        model_id: HuggingFace model ID.

    Returns:
        A dict with keys ``downloads``, ``likes``, ``author``, ``tags``,
        ``pipeline_tag``, ``base_model``, ``license``, ``created_at`` and
        ``last_modified``; or an empty dict if the request or parsing fails.
    """
    try:
        response = requests.get(f"{HF_API}/api/models/{model_id}", timeout=10)
        response.raise_for_status()
        info = response.json()
        # dict.get's default only covers a missing key: an explicit
        # "cardData": null would otherwise raise on the chained .get() and
        # throw away every other metadata field via the handler below.
        card = info.get("cardData") or {}
        return {
            # "or 0" keeps the counts numeric when the API reports null.
            "downloads": info.get("downloads") or 0,
            "likes": info.get("likes") or 0,
            "author": info.get("author", ""),
            "tags": info.get("tags", []),
            "pipeline_tag": info.get("pipeline_tag", ""),
            "base_model": card.get("base_model", ""),
            "license": card.get("license", ""),
            "created_at": info.get("createdAt", ""),
            "last_modified": info.get("lastModified", ""),
        }
    except Exception:
        # Metadata is optional for the scan; degrade to "no metadata".
        return {}
102
+
103
+
104
def _match_name_terms(name: str, table) -> list:
    """Return the labels of ``table`` terms found in ``name``, most specific first.

    ``table`` is an ordered sequence of ``(term, label)`` pairs with longer,
    more specific terms listed before the shorter terms they contain. Each
    matched term is blanked out of the working copy of the name so that a
    shorter term cannot re-match the same text, and each label is reported
    at most once.
    """
    remaining = name
    labels = []
    for term, label in table:
        if term in remaining:
            remaining = remaining.replace(term, " ")
            if label not in labels:
                labels.append(label)
    return labels


def detect_claimed_base(model_id: str, config: dict, metadata: dict) -> dict:
    """
    Detect what base model a model claims to be derived from.

    Args:
        model_id: HuggingFace model ID; only the part after the last ``/``
            (lower-cased) is inspected.
        config: Parsed config.json (currently unused).
        metadata: Model metadata; ``base_model`` is read if present.

    Returns:
        A dict with any of these keys (each omitted when empty):
        ``explicit_base`` — the declared ``base_model`` metadata value;
        ``name_implies`` — base keys suggested by the model name;
        ``suspicious_name_terms`` — closed-model vendor/product names found
        in the model name.
    """
    claims = {}
    name = model_id.split("/")[-1].lower()

    # Explicit base_model field
    if metadata.get("base_model"):
        claims["explicit_base"] = metadata["base_model"]

    # Name-based detection. Order matters: "qwen3.5" must be consumed before
    # "qwen3", and "qwen2.5" before "qwen2", so one mention yields one signal.
    name_signals = _match_name_terms(name, [
        ("qwen3.5", "qwen3_5"), ("qwen3-5", "qwen3_5"), ("qwen35", "qwen3_5"),
        ("qwen3", "qwen3"), ("qwen2.5", "qwen2"), ("qwen2", "qwen2"),
        ("llama-3", "llama3"), ("llama3", "llama3"), ("llama-2", "llama2"),
        ("mistral", "mistral"), ("mixtral", "mistral"),
        ("deepseek", "deepseek_v3"), ("gemma", "gemma"),
    ])
    if name_signals:
        claims["name_implies"] = name_signals

    # Suspicious claims in name. "chatgpt" is listed before "gpt" so that it
    # is not reported twice.
    suspicious = _match_name_terms(name, [
        (term, term)
        for term in ("claude", "chatgpt", "gpt", "openai", "gemini", "anthropic")
    ])
    if suspicious:
        claims["suspicious_name_terms"] = suspicious

    return claims
132
+
133
+
134
def _score_against_base(base_key, base_info, vocab, model_type, kv_lora, has_mla):
    """Score one KNOWN_BASES entry against the config signals; return (score, reasons)."""
    score = 0
    reasons = []

    # Vocab match: the reference value is either one int or a list of ints.
    expected_vocab = base_info.get("vocab_size")
    if isinstance(expected_vocab, list):
        vocab_ok = vocab in expected_vocab
    else:
        vocab_ok = vocab == expected_vocab
    if vocab_ok:
        score += 3
        reasons.append(f"vocab matches ({vocab})")

    # Model type match: the first pattern that matches exactly or as a prefix
    # decides the contribution.
    for pat in base_info.get("model_type_patterns", []):
        if model_type == pat:
            score += 3
            reasons.append(f"model_type '{model_type}' exact")
            break
        if model_type.startswith(pat):
            score += 2
            reasons.append(f"model_type '{model_type}' matches {pat}")
            break

    # MLA signal
    if base_key == "deepseek_v3" and has_mla:
        score += 2
        reasons.append(f"MLA kv_lora_rank={kv_lora}")

    return score, reasons


def _lookup_similar_models(vocab, hidden) -> list:
    """Best-effort ModelAtlas lookup of models with the same vocab and hidden size."""
    if vocab is None or hidden is None:
        # "= NULL" never matches in SQL, so the query could not return rows.
        return []
    conn = None
    try:
        conn = psycopg2.connect(DB)
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute("""
                SELECT m.model_id, o.name AS lab, m.hf_downloads, m.release_date,
                       a.technique_signature, a.total_params, a.num_layers, a.hidden_size, a.vocab_size
                FROM analyses a JOIN models m ON m.id=a.model_id
                JOIN organizations o ON m.org_id=o.id
                WHERE a.is_current=true AND a.vocab_size=%s AND a.hidden_size=%s
                AND m.model_id NOT ILIKE '%%tiny%%' AND m.model_id NOT ILIKE '/%%'
                ORDER BY m.hf_downloads DESC NULLS LAST
                LIMIT 5
            """, (vocab, hidden))
            return [dict(row) for row in cur.fetchall()]
    except Exception:
        # The reference database is optional; screening still works without it.
        return []
    finally:
        # Release the connection even if the query raised.
        if conn is not None:
            conn.close()


def stage1_screen(model_id: str, config: dict) -> dict:
    """
    Stage 1: Architecture screening against ModelAtlas reference.
    Returns a structured verdict without downloading any weights.
    Handles nested text_config (Qwen3.5/3.6, Mistral3, MiMo-V2.5 pattern).

    Args:
        model_id: HuggingFace model ID (not used by the screening itself).
        config: Parsed config.json.

    Returns:
        A dict with ``arch_signature`` (12-hex-char digest of the key config
        fields), ``config_signals`` (the extracted fields), ``base_matches``
        (KNOWN_BASES entries scoring >= 3, best first) and
        ``modelatlas_similar`` (up to five database rows; empty when the
        database is unavailable).
    """
    # Merge text_config into top-level if present (multimodal nested configs).
    # Top-level keys win over text_config keys on collision.
    text_cfg = config.get("text_config")
    if text_cfg and isinstance(text_cfg, dict) and not config.get("vocab_size"):
        top_level = {
            k: v for k, v in config.items()
            if k not in ("text_config", "vision_config", "audio_config")
        }
        config = {**text_cfg, **top_level}

    vocab = config.get("vocab_size")
    # str() guards against a non-string model_type in a malformed config.
    model_type = str(config.get("model_type") or "").lower()
    hidden = config.get("hidden_size")
    layers = config.get("num_hidden_layers")
    kv_lora = config.get("kv_lora_rank")  # MLA signal
    # A malformed (non-numeric) kv_lora_rank must not raise on the comparison.
    has_mla = (
        isinstance(kv_lora, (int, float))
        and not isinstance(kv_lora, bool)
        and kv_lora > 0
    )

    # Compute architecture signature
    key_fields = sorted([
        f"vocab={vocab}", f"type={model_type}", f"hidden={hidden}",
        f"layers={layers}", f"kv_lora={kv_lora}",
    ])
    arch_sig = hashlib.md5("|".join(key_fields).encode()).hexdigest()[:12]

    # Match against known bases; a score of 3 means at least one full signal.
    base_matches = []
    for base_key, base_info in KNOWN_BASES.items():
        score, reasons = _score_against_base(
            base_key, base_info, vocab, model_type, kv_lora, has_mla
        )
        if score >= 3:
            base_matches.append({
                "base": base_key,
                "name": base_info["name"],
                "confidence": "HIGH" if score >= 5 else "MODERATE",
                "score": score,
                "evidence": reasons,
            })

    return {
        "arch_signature": arch_sig,
        "config_signals": {
            "model_type": model_type,
            "vocab_size": vocab,
            "hidden_size": hidden,
            "num_layers": layers,
            "has_mla": has_mla,
            "kv_lora_rank": kv_lora,
        },
        "base_matches": sorted(base_matches, key=lambda match: -match["score"]),
        # Check ModelAtlas DB for models with the same vocab/hidden size
        "modelatlas_similar": _lookup_similar_models(vocab, hidden),
    }
222
+
223
+
224
def generate_verdict(
    model_id: str,
    config: dict,
    metadata: dict,
    claims: dict,
    stage1: dict,
) -> dict:
    """
    Combine claims, metadata and Stage 1 screening into the final report.

    Args:
        model_id: HuggingFace model ID being reported on.
        config: Parsed config.json (not read here).
        metadata: Model metadata (downloads, likes, license, created_at).
        claims: Output of ``detect_claimed_base``.
        stage1: Output of ``stage1_screen``.

    Returns:
        A dict with ``model_id``, ``scanned_at`` (UTC ISO timestamp),
        ``verdict``, ``evidence``, ``metadata`` and ``note``.
    """
    scanned_at = datetime.now(timezone.utc).isoformat()
    matches = stage1["base_matches"]
    best = matches[0] if matches else None
    suspicious = claims.get("suspicious_name_terms", [])

    # Headline verdict
    if best is None:
        headline = "UNRECOGNIZED — architecture does not match any known base model"
    elif best["confidence"] == "HIGH":
        headline = f"CONFIRMED — architecture matches {best['name']}"
    else:
        headline = f"LIKELY — architecture consistent with {best['name']}"

    # Claim accuracy flags: (name terms that trigger the flag, reported term,
    # explanation), emitted in this order.
    closed_weight_rules = (
        (
            ("claude", "anthropic"),
            "claude/anthropic",
            "Claude weights are not publicly available — no weight transfer from Claude "
            "is possible. If this model used Claude-generated reasoning traces as training "
            "data (distillation), that is a post-training technique that leaves no "
            "architectural trace and cannot be verified from weights alone. "
            "The base architecture claim can be checked; the Claude claim cannot.",
        ),
        (
            ("gpt", "openai", "chatgpt"),
            "gpt/openai",
            "GPT-4/OpenAI weights are closed. Any weight transfer claim is false. Distillation via outputs is possible but unverifiable from architecture.",
        ),
        (
            ("gemini",),
            "gemini",
            "Gemini weights are closed. Architecture shows no Gemini structure.",
        ),
    )
    flags = [
        {"type": "UNVERIFIABLE_CLAIM", "term": term, "explanation": explanation}
        for triggers, term, explanation in closed_weight_rules
        if any(trigger in suspicious for trigger in triggers)
    ]

    # Name vs architecture consistency
    name_implied = claims.get("name_implies", [])
    if name_implied and best is not None:
        top_base = best["base"]
        consistent = any(
            implied in top_base or top_base in implied for implied in name_implied
        )
        if not consistent:
            flags.append({
                "type": "NAME_MISMATCH",
                "explanation": f"Model name implies {name_implied} but architecture suggests {top_base}. Possible mislabeling.",
            })

    verdict_section = {
        "architecture": headline,
        "base_model_confirmed": best["name"] if best is not None else "Unknown",
        "confidence": best["confidence"] if best is not None else "NONE",
        "flags": flags,
        "flag_count": len(flags),
        "stage": "Stage 1 (config-only — no weight download)",
    }
    evidence_section = {
        "config_signals": stage1["config_signals"],
        "base_matches": matches[:3],
        "modelatlas_similar": stage1["modelatlas_similar"][:3],
        "claimed_base": claims.get("explicit_base"),
        "name_implies": name_implied,
    }
    metadata_section = {
        "downloads": metadata.get("downloads", 0),
        "likes": metadata.get("likes", 0),
        "license": metadata.get("license", ""),
        "created_at": metadata.get("created_at", ""),
    }

    return {
        "model_id": model_id,
        "scanned_at": scanned_at,
        "verdict": verdict_section,
        "evidence": evidence_section,
        "metadata": metadata_section,
        "note": (
            "Stage 1 validates architecture from config.json only (~2KB). "
            "Stage 2 weight analysis (requires model download) provides stronger confirmation. "
            "Powered by ModelAtlas — modeldna.ai · a RadicalNotion product."
        ),
    }
313
+
314
+
315
def scan(model_id: str) -> dict:
    """
    Run the full Stage 1 scan for one model. Entry point.

    Fetches config.json, then metadata, derives name/metadata claims, screens
    the architecture and assembles the verdict. The returned report carries
    ``elapsed_s``, the wall-clock duration rounded to two decimals. If the
    config cannot be fetched, an error result with ``model_id``, ``error``
    and ``scanned_at`` is returned instead.
    """
    started = time.time()
    config = fetch_config(model_id)

    if config:
        metadata = fetch_model_metadata(model_id)
        claims = detect_claimed_base(model_id, config, metadata)
        screening = stage1_screen(model_id, config)
        report = generate_verdict(model_id, config, metadata, claims, screening)
        report["elapsed_s"] = round(time.time() - started, 2)
        return report

    # No usable config: report the failure instead of a verdict.
    return {
        "model_id": model_id,
        "error": "Could not fetch config.json — model may be private, gated, or not exist on HuggingFace.",
        "scanned_at": datetime.now(timezone.utc).isoformat(),
    }
331
+
332
+
333
if __name__ == "__main__":
    # CLI usage: first argument is the model ID; defaults to a sample model.
    import sys

    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else "Qwen/Qwen3.5-27B"
    # default=str stringifies any value json cannot serialize natively.
    print(json.dumps(scan(target), indent=2, default=str))