"""Seed demo artifacts so every showcase path works without external data. Idempotent — skips any artifact that already exists. Safe to call during Docker build OR at container start. Generates: - data/processed/mri_dl_2d/best_model.pt (random resnet18 4-class) - data/processed/mri_model.onnx (dynamic-D/H/W ONNX, biased toward 'abnormal') - data/processed/eeg_clf.joblib (synthetic-separable RandomForest) - data/external_rag/index/rag_index.pkl (4-chunk synthetic clinical TF-IDF) - tests/fixtures/mri_sample/subject_0_axial.png (axial slice from the bundled NIfTI) """ from __future__ import annotations import sys from pathlib import Path def seed_mri_dl_2d() -> Path: out = Path("data/processed/mri_dl_2d/best_model.pt") if out.exists(): return out out.parent.mkdir(parents=True, exist_ok=True) import torch from torchvision import models model = models.resnet18(weights=None) model.fc = torch.nn.Linear(model.fc.in_features, 4) torch.save(model.state_dict(), str(out)) return out def seed_mri_volumetric_onnx() -> Path: out = Path("data/processed/mri_model.onnx") if out.exists(): return out out.parent.mkdir(parents=True, exist_ok=True) import onnx from onnx import TensorProto, helper input_info = helper.make_tensor_value_info( "input", TensorProto.FLOAT, [1, 1, "D", "H", "W"], ) output_info = helper.make_tensor_value_info("logits", TensorProto.FLOAT, [1, 2]) value = helper.make_tensor("const_logits", TensorProto.FLOAT, [1, 2], [0.3, 2.1]) node = helper.make_node("Constant", inputs=[], outputs=["logits"], value=value) graph = helper.make_graph([node], "demo_mri_classifier", [input_info], [output_info]) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) model.ir_version = 10 onnx.save(model, str(out)) return out def seed_eeg_clf() -> Path: out = Path("data/processed/eeg_clf.joblib") if out.exists(): return out out.parent.mkdir(parents=True, exist_ok=True) import joblib import numpy as np from sklearn.ensemble import RandomForestClassifier rng = np.random.default_rng(0) n_features = 16 X_ctrl = rng.normal(0.0, 1.0, size=(100, n_features)) X_alz = rng.normal(2.0, 1.0, size=(100, n_features)) X = np.vstack([X_ctrl, X_alz]) y = np.array([0] * 100 + [1] * 100) clf = RandomForestClassifier(n_estimators=12, max_depth=6, random_state=0) clf.fit(X, y) joblib.dump(clf, str(out)) return out def seed_clinical_rag_index() -> Path: """Tiny synthetic clinical TF-IDF index (4 chunks). Replace with the real pre-built pickle to upgrade quality without code changes.""" out = Path("data/external_rag/index/rag_index.pkl") if out.exists(): return out out.parent.mkdir(parents=True, exist_ok=True) import pickle from datetime import datetime from sklearn.feature_extraction.text import TfidfVectorizer from src.rag.clinical.types import ClinicalChunk chunks = [ ClinicalChunk(0, "alzheimers_lifestyle.pdf", 1, 1, "Aerobic exercise and Mediterranean diet are associated with reduced cognitive decline in older adults at risk for Alzheimer's disease."), ClinicalChunk(1, "parkinsons_motor.pdf", 1, 1, "Levodopa remains the most effective symptomatic treatment for motor symptoms of Parkinson's disease."), ClinicalChunk(2, "alzheimers_mci.pdf", 2, 2, "Mild cognitive impairment may progress to dementia; MMSE and MoCA are standard screening tools."), ClinicalChunk(3, "parkinsons_nutrition.pdf", 1, 1, "Dietary patterns rich in antioxidants and omega-3 fatty acids are linked to lower Parkinson's risk."), ] vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=1, norm="l2") matrix = vectorizer.fit_transform([c.text for c in chunks]) payload = { "created_at": datetime.now().isoformat(timespec="seconds"), "source_dir": str(out.parent), "chunk_words": 220, "overlap_words": 45, "chunks": chunks, "vectorizer": vectorizer, "matrix": matrix, } with out.open("wb") as f: pickle.dump(payload, f) return out def seed_axial_png() -> Path: """Axial mid-slice PNG from the bundled NIfTI fixture for the Researcher tab.""" out = Path("tests/fixtures/mri_sample/subject_0_axial.png") if out.exists(): return out out.parent.mkdir(parents=True, exist_ok=True) import nibabel as nib import numpy as np from PIL import Image src = Path("tests/fixtures/mri_sample/subject_0.nii.gz") vol = np.asarray(nib.load(str(src)).get_fdata(), dtype=np.float32) mid = vol.shape[2] // 2 slc = vol[:, :, mid] norm = (slc - slc.min()) / max(slc.max() - slc.min(), 1e-6) Image.fromarray((norm * 255).astype(np.uint8), mode="L").save(str(out)) return out def main() -> int: seeds = [ ("MRI 2D resnet18 state_dict", seed_mri_dl_2d), ("MRI volumetric ONNX", seed_mri_volumetric_onnx), ("EEG sklearn classifier", seed_eeg_clf), ("Clinical TF-IDF RAG index", seed_clinical_rag_index), ("Axial PNG fixture", seed_axial_png), ] print("Seeding demo artifacts...", flush=True) for name, fn in seeds: try: path = fn() kb = path.stat().st_size // 1024 if path.is_file() else 0 print(f" OK {name:35s} {path} ({kb} KB)", flush=True) except Exception as e: print(f" FAIL {name}: {type(e).__name__}: {e}", flush=True) return 1 print("Done.", flush=True) return 0 if __name__ == "__main__": sys.exit(main())