bertran-yorro committed
Commit 5c85f22 · verified · 1 Parent(s): 2ef4436

Initial upload: model, training scripts, Gradio app, data

Files changed (14)
  1. app.py +287 -0
  2. eval.py +386 -0
  3. examples.png +0 -0
  4. ising.py +20 -0
  5. main.py +26 -0
  6. metadata.json +17 -0
  7. model.py +362 -0
  8. requirements.txt +9 -0
  9. sample.py +208 -0
  10. samples-2-epoch.png +0 -0
  11. spins.npy +3 -0
  12. spins_test.npy +3 -0
  13. train.py +229 -0
  14. vi_train.py +288 -0
app.py ADDED
@@ -0,0 +1,287 @@
+ import shlex
+ import subprocess
+ import sys
+ from pathlib import Path
+
+ import gradio as gr
+ import matplotlib
+ matplotlib.use("Agg")
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ # ---------------------------------------------------------------------------
+ # Paths
+ # ---------------------------------------------------------------------------
+
+ OUT = Path("outputs")
+ CE_CKPT = OUT / "ce_checkpoint.eqx"
+ VI_CKPT = OUT / "vi_checkpoint.eqx"
+ CE_SMPL = OUT / "samples_ce.npy"
+ VI_SMPL = OUT / "samples_vi.npy"
+ TRAIN_DATA = Path("spins.npy")
+ TEST_DATA = Path("spins_test.npy")
+
+
+ # ---------------------------------------------------------------------------
+ # Subprocess helpers
+ # ---------------------------------------------------------------------------
+
+ def _stream(command: list[str]):
+     """Run a command and yield log lines in real time."""
+     log = ["$ " + " ".join(shlex.quote(p) for p in command), ""]
+     yield "\n".join(log)
+     proc = subprocess.Popen(
+         command,
+         stdout=subprocess.PIPE,
+         stderr=subprocess.STDOUT,
+         text=True,
+         bufsize=1,
+     )
+     assert proc.stdout is not None
+     for line in proc.stdout:
+         log.append(line.rstrip())
+         yield "\n".join(log[-300:])
+     rc = proc.wait()
+     log += ["", f"— exited {rc} —"]
+     yield "\n".join(log[-300:])
+
+
+ # ---------------------------------------------------------------------------
+ # Sample grid figure
+ # ---------------------------------------------------------------------------
+
+ def _samples_figure(path: Path, title: str, n: int = 16) -> plt.Figure | None:
+     if not path.exists():
+         return None
+     grids = np.load(path).astype(np.float32)[:n]  # (N, L, L), values ±1
+     cols = min(8, len(grids))
+     rows = (len(grids) + cols - 1) // cols
+     fig, axes = plt.subplots(rows, cols, figsize=(cols * 1.4, rows * 1.4))
+     axes = np.array(axes).reshape(-1)
+     mags = grids.mean(axis=(1, 2))
+     for i, ax in enumerate(axes):
+         if i < len(grids):
+             ax.imshow(grids[i], cmap="gray", vmin=-1, vmax=1, interpolation="nearest")
+             ax.set_title(f"m={mags[i]:.2f}", fontsize=6)
+         ax.axis("off")
+     fig.suptitle(title, fontsize=9)
+     plt.tight_layout()
+     return fig
+
+
+ # ---------------------------------------------------------------------------
+ # Tab 1 – Cross-entropy training
+ # ---------------------------------------------------------------------------
+
+ def run_ce(mode, epochs, batch_size, lr, max_steps):
+     OUT.mkdir(parents=True, exist_ok=True)
+     cmd = [
+         sys.executable, "train.py",
+         "--data", str(TRAIN_DATA),
+         "--batch-size", str(int(batch_size)),
+         "--learning-rate", str(lr),
+         "--output-checkpoint", str(CE_CKPT),
+     ]
+     if mode == "Smoke":
+         cmd += ["--epochs", "1", "--max-train-steps", "5", "--max-eval-batches", "2"]
+     else:
+         cmd += ["--epochs", str(int(epochs))]
+         if int(max_steps) > 0:
+             cmd += ["--max-train-steps", str(int(max_steps))]
+     for log in _stream(cmd):
+         yield log, None
+     ckpt = str(CE_CKPT) if CE_CKPT.exists() else None
+     yield log, ckpt
+
+
+ # ---------------------------------------------------------------------------
+ # Tab 2 – Variational inference fine-tuning
+ # ---------------------------------------------------------------------------
+
+ def run_vi(mode, steps, batch_size, lr, warm_start):
+     OUT.mkdir(parents=True, exist_ok=True)
+     if warm_start and not CE_CKPT.exists():
+         yield "⚠ CE checkpoint not found. Run CE training first, or uncheck warm-start.", None
+         return
+     cmd = [
+         sys.executable, "vi_train.py",
+         "--batch-size", str(int(batch_size)),
+         "--learning-rate", str(lr),
+         "--output-checkpoint", str(VI_CKPT),
+     ]
+     if warm_start and CE_CKPT.exists():
+         cmd += ["--checkpoint", str(CE_CKPT)]
+     if mode == "Smoke":
+         cmd += ["--num-steps", "3", "--log-every", "1"]
+     else:
+         cmd += ["--num-steps", str(int(steps))]
+     for log in _stream(cmd):
+         yield log, None
+     ckpt = str(VI_CKPT) if VI_CKPT.exists() else None
+     yield log, ckpt
+
+
+ # ---------------------------------------------------------------------------
+ # Tab 3 – Sample & Eval
+ # ---------------------------------------------------------------------------
+
+ def run_eval(which, num_samples, seed):
+     OUT.mkdir(parents=True, exist_ok=True)
+     log_lines = []
+
+     def emit(msg=""):
+         log_lines.append(msg)
+         return (
+             "\n".join(log_lines[-200:]),
+             None, None,  # CE figure, VI figure
+         )
+
+     run_ce_ = which in ("CE", "Both")
+     run_vi_ = which in ("VI", "Both")
+
+     if run_ce_ and not CE_CKPT.exists():
+         yield emit("⚠ CE checkpoint not found. Run CE training first.")
+         return
+     if run_vi_ and not VI_CKPT.exists():
+         yield emit("⚠ VI checkpoint not found. Run VI training first.")
+         return
+
+     # ── Generate samples ───────────────────────────────────────────────────
+     for ckpt, out_path, label in [
+         (CE_CKPT, CE_SMPL, "CE"),
+         (VI_CKPT, VI_SMPL, "VI"),
+     ]:
+         if (label == "CE" and not run_ce_) or (label == "VI" and not run_vi_):
+             continue
+         log_lines.append(f"\n── Generating {num_samples} {label} samples ──")
+         yield "\n".join(log_lines[-200:]), None, None
+         cmd = [
+             sys.executable, "sample.py",
+             "--checkpoint", str(ckpt),
+             "--num-samples", str(int(num_samples)),
+             "--output", str(out_path),
+             "--seed", str(int(seed)),
+         ]
+         for chunk in _stream(cmd):
+             log_lines[-1:] = chunk.splitlines()[-10:]
+             yield "\n".join(log_lines[-200:]), None, None
+
+     # ── Run eval ──────────────────────────────────────────────────────────
+     for ckpt, smpl, label in [
+         (CE_CKPT, CE_SMPL, "CE"),
+         (VI_CKPT, VI_SMPL, "VI"),
+     ]:
+         if (label == "CE" and not run_ce_) or (label == "VI" and not run_vi_):
+             continue
+         log_lines.append(f"\n── Evaluating {label} model ──")
+         yield "\n".join(log_lines[-200:]), None, None
+         cmd = [
+             sys.executable, "eval.py",
+             "--checkpoint", str(ckpt),
+             "--test-data", str(TEST_DATA),
+             "--num-samples", str(int(num_samples)),
+             "--samples-file", str(smpl),
+             "--seed", str(int(seed)),
+         ]
+         for chunk in _stream(cmd):
+             log_lines[-1:] = chunk.splitlines()[-20:]
+             yield "\n".join(log_lines[-200:]), None, None
+
+     # ── Build figures ──────────────────────────────────────────────────────
+     ce_fig = _samples_figure(CE_SMPL, "CE samples") if run_ce_ else None
+     vi_fig = _samples_figure(VI_SMPL, "VI samples") if run_vi_ else None
+     yield "\n".join(log_lines[-200:]), ce_fig, vi_fig
+
+
+ # ---------------------------------------------------------------------------
+ # Gradio UI
+ # ---------------------------------------------------------------------------
+
+ with gr.Blocks(title="Ising Transformer", theme=gr.themes.Soft()) as demo:
+     gr.Markdown(
+         "# 2D Ising Transformer\n"
+         "Autoregressive transformer trained on the 2D Ising model at the critical "
+         "temperature T_c ≈ 2.27. "
+         "Run **CE training** first, optionally fine-tune with **Variational Inference**, "
+         "then **Sample & Eval** to compare both against the held-out test set."
+     )
+
+     with gr.Tabs():
+
+         # ── Tab 1: CE training ──────────────────────────────────────────────
+         with gr.Tab("Cross-Entropy Training"):
+             gr.Markdown(
+                 "Trains the model to maximise `log q(s)` on the training spin "
+                 "configurations (teacher forcing, causal attention). "
+                 "A *Smoke* run does 5 steps to verify everything compiles."
+             )
+             with gr.Row():
+                 ce_mode = gr.Radio(["Smoke", "Full"], value="Smoke", label="Mode")
+                 ce_epoch = gr.Number(value=10, precision=0, minimum=1, label="Epochs")
+                 ce_bs = gr.Number(value=32, precision=0, minimum=1, label="Batch size")
+             with gr.Row():
+                 ce_lr = gr.Number(value=1e-4, label="Learning rate")
+                 ce_steps = gr.Number(value=0, precision=0, minimum=0, label="Max steps (0 = no cap)")
+             ce_run = gr.Button("Run CE Training", variant="primary")
+             ce_logs = gr.Textbox(label="Logs", lines=20, max_lines=30)
+             ce_ckpt = gr.File(label="Checkpoint")
+             ce_run.click(
+                 run_ce,
+                 inputs=[ce_mode, ce_epoch, ce_bs, ce_lr, ce_steps],
+                 outputs=[ce_logs, ce_ckpt],
+             )
+
+         # ── Tab 2: VI fine-tuning ─────────────────────────────────────────
+         with gr.Tab("Variational Inference Fine-tuning"):
+             gr.Markdown(
+                 "Minimises the variational free energy `F = ⟨E(s)⟩ − T·H[q]` using "
+                 "the REINFORCE gradient estimator. Warm-starting from the CE "
+                 "checkpoint is strongly recommended. "
+                 "A *Smoke* run does 3 steps."
+             )
+             with gr.Row():
+                 vi_mode = gr.Radio(["Smoke", "Full"], value="Smoke", label="Mode")
+                 vi_steps = gr.Number(value=200, precision=0, minimum=1, label="Steps")
+                 vi_bs = gr.Number(value=16, precision=0, minimum=1, label="Batch size")
+             with gr.Row():
+                 vi_lr = gr.Number(value=1e-4, label="Learning rate")
+                 vi_warm = gr.Checkbox(value=True, label="Warm-start from CE checkpoint")
+             vi_run = gr.Button("Run VI Fine-tuning", variant="primary")
+             vi_logs = gr.Textbox(label="Logs", lines=20, max_lines=30)
+             vi_ckpt = gr.File(label="Checkpoint")
+             vi_run.click(
+                 run_vi,
+                 inputs=[vi_mode, vi_steps, vi_bs, vi_lr, vi_warm],
+                 outputs=[vi_logs, vi_ckpt],
+             )
+
+         # ── Tab 3: Sample & Eval ──────────────────────────────────────────
+         with gr.Tab("Sample & Eval"):
+             gr.Markdown(
+                 "Generates spin configurations from the selected model(s), then runs "
+                 "the physical-observable evaluation against `spins_test.npy` "
+                 "(a held-out set, never seen during training).\n\n"
+                 "Features compared: magnetisation, energy, two-point correlations, "
+                 "cluster statistics. Distance reported as **Mahalanobis D** in the "
+                 "decorrelated feature space."
+             )
+             with gr.Row():
+                 ev_which = gr.Radio(
+                     ["CE", "VI", "Both"], value="Both", label="Model(s) to evaluate"
+                 )
+                 ev_n = gr.Number(value=64, precision=0, minimum=4, label="Num samples")
+                 ev_seed = gr.Number(value=0, precision=0, label="Seed")
+             ev_run = gr.Button("Run Sample & Eval", variant="primary")
+             ev_logs = gr.Textbox(label="Logs", lines=20, max_lines=30)
+             with gr.Row():
+                 ev_ce_img = gr.Plot(label="CE samples")
+                 ev_vi_img = gr.Plot(label="VI samples")
+             ev_run.click(
+                 run_eval,
+                 inputs=[ev_which, ev_n, ev_seed],
+                 outputs=[ev_logs, ev_ce_img, ev_vi_img],
+             )
+
+
+ if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=1).launch()
eval.py ADDED
@@ -0,0 +1,386 @@
+ #!/usr/bin/env python
+ # /// script
+ # dependencies = [
+ #     "jax[cuda12]",
+ #     "equinox",
+ #     "scipy", "tqdm",
+ #     "jaxtyping",
+ # ]
+ # ///
+ """Evaluate a trained Generator against held-out test samples.
+
+ For each configuration we compute an 11-dimensional feature vector of physical
+ observables. The Mahalanobis distance between the real and generated feature
+ distributions gives a single scalar measure of model quality.
+
+ Per-sample feature vector
+ -------------------------
+     m, m^2, |m|   magnetisation and its moments
+     e, e^2        nearest-neighbour energy per spin (periodic BC)
+     C(r=1,2,4,8)  connected two-point correlation at r = 1, 2, 4, 8
+     s_mean/N      mean cluster size (4-connected, open BC)
+     s_max/N       largest cluster size
+
+ Ensemble statistics (printed for reference, not part of Mahalanobis)
+ --------------------------------------------------------------------
+     chi = N · Var(m) / T          magnetic susceptibility
+     C_v = N · Var(e) / T²         specific heat
+     U4  = 1 − <m^4>/(3<m^2>^2)    Binder cumulant
+           → 2/3 in the ordered phase
+           → 0 in the disordered phase
+           ≈ 0.47 at T_c for the 2D Ising model (L→∞)
+
+ Distance
+ --------
+     D = sqrt( Δμ^T Σ_real^{-1} Δμ )
+
+ where Δμ = μ_gen − μ_real and Σ_real is the sample covariance of the
+ real test features. Per-feature z-scores Δμ_i / σ_real_i are also
+ reported so you can see which observables deviate most.
+ """
+
+ import argparse
+ from pathlib import Path
+
+ import numpy as np
+ import scipy.ndimage
+ import jax
+ from tqdm.auto import tqdm
+
+ from model import gen_config, snake_order
+ from sample import load_checkpoint, sample_batch, tokens_to_grids
+ from train import load_ising_data
+
+
+ # ---------------------------------------------------------------------------
+ # Physical constants
+ # ---------------------------------------------------------------------------
+
+ J = 1.0
+ T_C = 2.0 / np.log(1.0 + np.sqrt(2.0))  # exact: 2J / ln(1+√2) ≈ 2.2692
+
+ FEATURE_NAMES = [
+     "m", "m^2", "|m|",
+     "e", "e^2",
+     "C(r=1)", "C(r=2)", "C(r=4)", "C(r=8)",
+     "s_mean/N", "s_max/N",
+ ]
+
+
+ # ---------------------------------------------------------------------------
+ # Per-sample observables
+ # ---------------------------------------------------------------------------
+
+ def energy_per_spin(grid: np.ndarray) -> float:
+     """Nearest-neighbour energy density with periodic boundary conditions.
+
+         E/N = −J/N · Σ_{⟨ij⟩} s_i s_j
+
+     Each bond is counted once via right- and down-shifts.
+     """
+     right = np.roll(grid, -1, axis=1)
+     down = np.roll(grid, -1, axis=0)
+     return float(-J * (grid * right + grid * down).sum() / grid.size)
+
+
+ def connected_correlations(
+     grid: np.ndarray,
+     distances: tuple[int, ...] = (1, 2, 4, 8),
+ ) -> np.ndarray:
+     """Isotropic connected two-point function C(r) = ½[<s_x s_{x+r}> + <s_y s_{y+r}>] − <s>².
+
+     Averaged over both spatial directions and all origin sites using
+     periodic boundary conditions.
+     """
+     m = float(grid.mean())
+     corr = []
+     for r in distances:
+         cx = float((grid * np.roll(grid, r, axis=1)).mean())
+         cy = float((grid * np.roll(grid, r, axis=0)).mean())
+         corr.append((cx + cy) / 2.0 - m ** 2)
+     return np.array(corr, dtype=np.float64)
+
+
+ def cluster_stats(grid: np.ndarray) -> tuple[float, float]:
+     """Mean and maximum cluster size for both spin species.
+
+     Uses 4-connectivity (no diagonals) and open boundary conditions.
+     Returns sizes normalised by the total number of spins so the result
+     is independent of lattice size.
+
+     Note: open BC means edge-spanning clusters are split at the boundary;
+     this is applied consistently to both real and generated samples so the
+     systematic bias cancels in the Mahalanobis comparison.
+     """
+     N = grid.size
+     all_sizes: list[np.ndarray] = []
+     for spin in (1, -1):
+         labeled, n_labels = scipy.ndimage.label(grid == spin)
+         if n_labels > 0:
+             # bincount index 0 is background; skip it
+             all_sizes.append(np.bincount(labeled.ravel())[1:])
+     if not all_sizes:
+         return 0.0, 0.0
+     sizes = np.concatenate(all_sizes).astype(np.float64)
+     return float(sizes.mean()) / N, float(sizes.max()) / N
+
+
+ def compute_features(grid: np.ndarray) -> np.ndarray:
+     """Return the 11-D feature vector for a single ±1 grid of shape (L, L)."""
+     m = float(grid.mean())
+     e = energy_per_spin(grid)
+     cr = connected_correlations(grid)
+     s_mean, s_max = cluster_stats(grid)
+     return np.array(
+         [m, m ** 2, abs(m), e, e ** 2, *cr, s_mean, s_max],
+         dtype=np.float64,
+     )
+
+
+ def compute_feature_matrix(grids: np.ndarray, desc: str = "features") -> np.ndarray:
+     """Compute the (N, 11) feature matrix for a batch of grids."""
+     return np.stack(
+         [compute_features(grids[i])
+          for i in tqdm(range(len(grids)), desc=desc, leave=False)]
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # Ensemble statistics
+ # ---------------------------------------------------------------------------
+
+ def ensemble_stats(X: np.ndarray, T: float = T_C) -> dict[str, float]:
+     """Derive thermodynamic ensemble statistics from a feature matrix.
+
+     Arguments
+     ---------
+     X : (N, 11) feature matrix from ``compute_feature_matrix``.
+     T : temperature used for chi and C_v normalisation.
+     """
+     L = gen_config["lattice_size"]
+     N = L * L
+
+     m = X[:, FEATURE_NAMES.index("m")]
+     m2 = X[:, FEATURE_NAMES.index("m^2")]
+     m4 = m ** 4
+     e = X[:, FEATURE_NAMES.index("e")]
+
+     chi = N * float(m.var()) / T
+     Cv = N * float(e.var()) / T ** 2
+     binder = float(1.0 - m4.mean() / (3.0 * m2.mean() ** 2)) if m2.mean() > 0 else float("nan")
+
+     return {
+         "<|m|>": float(np.abs(m).mean()),
+         "chi": chi,
+         "C_v": Cv,
+         "U4": binder,
+     }
+
+
+ # ---------------------------------------------------------------------------
+ # Mahalanobis distance
+ # ---------------------------------------------------------------------------
+
+ def mahalanobis_distance(
+     X_ref: np.ndarray,
+     X_query: np.ndarray,
+     reg: float = 1e-6,
+ ) -> tuple[float, np.ndarray]:
+     """Mahalanobis distance of the query mean from the reference distribution.
+
+         D = sqrt( Δμ^T Σ_ref^{-1} Δμ )
+
+     Also returns per-feature z-scores z_i = Δμ_i / σ_ref_i,
+     where σ_ref_i = sqrt(Σ_ref[i,i]). |z_i| > 1 indicates a feature
+     whose mean differs by more than one real-sample standard deviation.
+
+     Parameters
+     ----------
+     X_ref   : (N, d) real / reference feature matrix
+     X_query : (M, d) generated / query feature matrix
+     reg     : diagonal regularisation added to Σ_ref before inversion
+     """
+     mu_ref = X_ref.mean(axis=0)
+     mu_query = X_query.mean(axis=0)
+     cov = np.cov(X_ref.T) + reg * np.eye(X_ref.shape[1])
+     cov_inv = np.linalg.inv(cov)
+     delta = mu_query - mu_ref
+     D = float(np.sqrt(max(0.0, delta @ cov_inv @ delta)))
+     z_scores = delta / np.sqrt(np.diag(cov))
+     return D, z_scores
+
+
+ # ---------------------------------------------------------------------------
+ # Reporting
+ # ---------------------------------------------------------------------------
+
+ def print_feature_table(X_real: np.ndarray, X_gen: np.ndarray) -> None:
+     mu_r = X_real.mean(axis=0)
+     sd_r = X_real.std(axis=0)
+     mu_g = X_gen.mean(axis=0)
+     sd_g = X_gen.std(axis=0)
+
+     col = 13
+     hdr = (f" {'Feature':<11} {'Real mean':>{col}} {'Real std':>{col}}"
+            f" {'Gen mean':>{col}} {'Gen std':>{col}} {'z-score':>8}")
+     print(hdr)
+     print(" " + "─" * (len(hdr) - 2))
+     for name, mr, sr, mg, sg in zip(FEATURE_NAMES, mu_r, sd_r, mu_g, sd_g):
+         z = (mg - mr) / (sr + 1e-12)
+         flag = " <" if abs(z) > 1.0 else ""
+         print(f" {name:<11} {mr:>{col}.4f} {sr:>{col}.4f}"
+               f" {mg:>{col}.4f} {sg:>{col}.4f} {z:>+8.3f}{flag}")
+     print()
+
+
+ def print_ensemble_table(stats_real: dict, stats_gen: dict) -> None:
+     labels = {
+         "<|m|>": "mean |m|",
+         "chi": "chi (susceptibility)",
+         "C_v": "C_v (specific heat)",
+         "U4": "U4 (Binder cumulant)",
+     }
+     print(f" {'Observable':<26} {'Real':>10} {'Generated':>10}")
+     print(" " + "─" * 50)
+     for key, label in labels.items():
+         r = stats_real[key]
+         g = stats_gen[key]
+         print(f" {label:<26} {r:>10.4f} {g:>10.4f}")
+     print()
+
+
+ # ---------------------------------------------------------------------------
+ # CLI
+ # ---------------------------------------------------------------------------
+
+ _SAMPLE_BATCH = 4  # fixed vmapped batch; changing it triggers recompilation
+
+
+ def generate_grids(model, n: int, key: jax.Array, L: int) -> np.ndarray:
+     """Sample n grids in batches of _SAMPLE_BATCH with a progress bar.
+
+     Using a fixed batch size means only one JIT compilation happens regardless
+     of n. The final partial batch is sampled in full, then trimmed.
+     """
+     batches = []
+     n_full, remainder = divmod(n, _SAMPLE_BATCH)
+     n_batches = n_full + (1 if remainder else 0)
+
+     with tqdm(total=n, unit="samples", desc="Sampling") as pbar:
+         for i in range(n_batches):
+             key, subkey = jax.random.split(key)
+             tokens = np.asarray(sample_batch(model, _SAMPLE_BATCH, subkey))
+             batches.append(tokens)
+             pbar.update(min(_SAMPLE_BATCH, n - i * _SAMPLE_BATCH))
+
+     return tokens_to_grids(np.concatenate(batches)[:n], L)
+
+
+ def load_test_grids(
+     test_data: Path | None,
+     data: Path,
+     n: int,
+     L: int,
+     rng: np.random.Generator,
+ ) -> np.ndarray:
+     """Load real test grids, preferring a dedicated test file over the val split.
+
+     Parameters
+     ----------
+     test_data : optional path to a standalone test .npy file (N, L, L) int8 {-1,+1}
+     data      : path to the main spins.npy (used only if test_data is None)
+     """
+     if test_data is not None:
+         spins = np.load(test_data)                  # (N, L, L) int8
+         tokens = (spins.astype(np.int32) + 1) // 2  # → {0, 1}
+         rows, cols = snake_order(L)
+         tokens = tokens[:, rows, cols]              # (N, L²)
+     else:
+         _, tokens = load_ising_data(data)           # val split of spins.npy
+
+     n = min(n, len(tokens))
+     idx = rng.choice(len(tokens), size=n, replace=False)
+     return tokens_to_grids(tokens[idx], L)          # (n, L, L), values ±1
+
+
+ def parse_args():
+     p = argparse.ArgumentParser(
+         description="Compare generated vs real Ising samples via physical observables."
+     )
+     p.add_argument("--checkpoint", type=Path, required=True,
+                    help="Path to the .eqx checkpoint file.")
+     p.add_argument("--data", type=Path,
+                    default=Path(__file__).parent / "spins.npy",
+                    help="Path to spins.npy (default: ./spins.npy). "
+                         "Used only if --test-data is not provided.")
+     p.add_argument("--test-data", type=Path,
+                    default=Path(__file__).parent / "spins_test.npy",
+                    help="Dedicated held-out test set (.npy, N×L×L int8 {-1,+1}). "
+                         "Takes priority over the val split of --data.")
+     p.add_argument("--num-samples", type=int, default=50,
+                    help="Number of samples to compare (default: 50).")
+     p.add_argument("--samples-file", type=Path, default=None,
+                    help="Optional .npy of pre-generated {-1,+1} grids (N, L, L) "
+                         "from 'sample.py --output'. Skips generation entirely.")
+     p.add_argument("--seed", type=int, default=0)
+     return p.parse_args()
+
+
+ def main():
+     args = parse_args()
+     L = gen_config["lattice_size"]
+     rng = np.random.default_rng(args.seed)
+
+     # ── Real samples (test split) ─────────────────────────────────────────
+     # Prefer spins_test.npy; fall back to the val split of spins.npy.
+     test_path = args.test_data if (args.test_data and args.test_data.exists()) else None
+     if test_path:
+         print(f"Loading test data from {test_path} …")
+     else:
+         print("Loading test data from val split of spins.npy …")
+     n = args.num_samples
+     real_grids = load_test_grids(test_path, args.data, n, L, rng)
+     n = len(real_grids)  # may be capped by dataset size
+
+     # ── Generated samples ─────────────────────────────────────────────────
+     if args.samples_file is not None:
+         print(f"Loading pre-generated samples from {args.samples_file} …")
+         gen_grids = np.load(args.samples_file).astype(np.int8)[:n]
+         if gen_grids.shape[1:] != (L, L):
+             raise ValueError(
+                 f"samples-file grid shape {gen_grids.shape[1:]} != ({L},{L})"
+             )
+         n = min(n, len(gen_grids))
+         real_grids = real_grids[:n]
+     else:
+         print(f"Loading checkpoint from {args.checkpoint} …")
+         model = load_checkpoint(args.checkpoint)
+         key = jax.random.PRNGKey(args.seed)
+         gen_grids = generate_grids(model, n, key, L)  # (n, L, L), values ±1
+
+     print(f"\nL = {L} | N = {n} samples per group | T_C = {T_C:.6f}\n")
+
+     # ── Feature matrices ──────────────────────────────────────────────────
+     X_real = compute_feature_matrix(real_grids, desc="Features: real ")
+     X_gen = compute_feature_matrix(gen_grids, desc="Features: generated ")
+
+     # ── Per-feature comparison table ──────────────────────────────────────
+     print("Per-feature statistics (z-score = Δμ / σ_real; '<' marks |z| > 1)\n")
+     print_feature_table(X_real, X_gen)
+
+     # ── Ensemble statistics ───────────────────────────────────────────────
+     print("Ensemble statistics\n")
+     print_ensemble_table(ensemble_stats(X_real), ensemble_stats(X_gen))
+
+     # ── Mahalanobis distance ──────────────────────────────────────────────
+     D, z = mahalanobis_distance(X_real, X_gen)
+     print(f"Mahalanobis distance D = {D:.4f}")
+     print("  (D measures how many 'std-devs' the generated feature mean sits")
+     print("  from the real distribution in the decorrelated feature space.)")
+     print()
+     print("  Top deviating features:")
+     order = np.argsort(np.abs(z))[::-1]
+     for i in order[:5]:
+         print(f"    {FEATURE_NAMES[i]:<11} z = {z[i]:+.3f}")
+
+
+ if __name__ == "__main__":
+     main()
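As a quick sanity check of `mahalanobis_distance`, here is a minimal sketch (not part of the commit, and assuming `eval.py` is importable from the working directory): two feature sets drawn from the same distribution should give a D near zero, up to finite-sample noise.

    import numpy as np
    from eval import mahalanobis_distance

    rng = np.random.default_rng(0)
    X_ref = rng.normal(size=(500, 11))    # stand-in "real" feature matrix
    X_query = rng.normal(size=(500, 11))  # stand-in "generated" feature matrix
    D, z = mahalanobis_distance(X_ref, X_query)
    print(f"D = {D:.3f}")  # small but nonzero: finite-sample noise, not model error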
examples.png ADDED
ising.py ADDED
@@ -0,0 +1,20 @@
+ """Legacy entry point. This file has been split into:
+     model.py  — Generator architecture and gen_config
+     train.py  — training loop (python train.py --help)
+     sample.py — checkpoint loading and autoregressive sampling (python sample.py --help)
+ """
+ # Re-export model symbols for any code that still does `from ising import ...`
+ from model import (  # noqa: F401
+     snake_order,
+     EmbedderBlock,
+     FeedForwardBlock,
+     AttentionBlock,
+     TransformerLayer,
+     Encoder,
+     Generator,
+     gen_config,
+ )
+
+ if __name__ == "__main__":
+     import train
+     train.main()
main.py ADDED
@@ -0,0 +1,26 @@
+ import matplotlib
+ matplotlib.use('Agg')
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import sys, jax
+ sys.path.insert(0, '.')
+ from sample import load_checkpoint, sample_batch, tokens_to_grids
+ model = load_checkpoint('checkpoint.eqx')
+ lattice_size = model.encoder.embedder_block.lattice_size
+ key = jax.random.PRNGKey(42)
+ print('Compiling and sampling 16 configurations...')
+ tokens = sample_batch(model, 16, key)
+ tokens = np.asarray(tokens)
+ grids = tokens_to_grids(tokens, lattice_size)
+ mags = grids.mean(axis=(1, 2))
+ print(f'Magnetizations: {np.round(mags, 3)}')
+ print(f'Mean |m|: {np.abs(mags).mean():.4f}')
+ fig, axes = plt.subplots(2, 8, figsize=(14, 4))
+ for i, ax in enumerate(axes.flat):
+     ax.imshow(grids[i], cmap='gray', vmin=-1, vmax=1, interpolation='nearest')
+     ax.set_title(f'm={mags[i]:.2f}', fontsize=7)
+     ax.axis('off')
+ fig.suptitle('Sampled Ising configs (L=32, T=T_c, 2 epochs)', fontsize=10)
+ plt.tight_layout()
+ plt.savefig('samples.png', dpi=150, bbox_inches='tight')
+ print('Saved samples.png')
metadata.json ADDED
@@ -0,0 +1,17 @@
+ {
+     "lattice_size": 32,
+     "sample_count": 10000,
+     "temperature": 2.269185314213022,
+     "temperature_note": "2D Ising critical temperature T_c = 2J/ln(1+sqrt(2))",
+     "coupling": 1.0,
+     "spin_values": [
+         -1,
+         1
+     ],
+     "burn_in_sweeps": 200,
+     "sample_interval_sweeps": 5,
+     "n_chains": 10,
+     "algorithm": "Wolff single-cluster",
+     "base_seed": 1778172101,
+     "generation_time_s": 4.9
+ }
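The data-generation script itself is not part of this commit. As context for the metadata above, here is a minimal sketch of one update of the Wolff single-cluster algorithm it names; `wolff_step` and its signature are illustrative, not code from this repo.

    import numpy as np

    def wolff_step(spins: np.ndarray, T: float, rng: np.random.Generator) -> None:
        """One Wolff cluster flip on an L×L periodic lattice of ±1 spins (J = 1)."""
        L = spins.shape[0]
        p_add = 1.0 - np.exp(-2.0 / T)  # bond-activation probability
        i0, j0 = int(rng.integers(L)), int(rng.integers(L))  # random seed site
        cluster_spin = spins[i0, j0]
        in_cluster = np.zeros_like(spins, dtype=bool)
        in_cluster[i0, j0] = True
        stack = [(i0, j0)]
        while stack:  # grow the cluster over aligned nearest neighbours
            i, j = stack.pop()
            for ni, nj in (((i + 1) % L, j), ((i - 1) % L, j),
                           (i, (j + 1) % L), (i, (j - 1) % L)):
                if (not in_cluster[ni, nj] and spins[ni, nj] == cluster_spin
                        and rng.random() < p_add):
                    in_cluster[ni, nj] = True
                    stack.append((ni, nj))
        spins[in_cluster] *= -1  # flip the whole cluster at once

Per the metadata, each chain would discard 200 burn-in sweeps of such updates before recording, then record a configuration every 5 sweeps, across 10 independent chains.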
model.py ADDED
@@ -0,0 +1,362 @@
+ """Transformer model for autoregressive Ising spin generation.
+
+ Architecture: causal (GPT-style) transformer with per-site positional
+ embeddings in snake (boustrophedon) order. The model is trained to maximise
+ p(s_0, s_1, ..., s_{N-1}) = ∏_t p(s_t | s_0, ..., s_{t-1}), where the spin
+ sites are visited in snake order over the L×L lattice.
+ """
+
+ from collections.abc import Mapping
+
+ import equinox as eqx
+ import jax
+ import jax.numpy as jnp
+ import numpy as np
+ from jaxtyping import Array, Float, Int
+
+
+ def snake_order(size: int) -> tuple[np.ndarray, np.ndarray]:
+     """Return (rows, cols) index arrays traversing an L×L grid in snake order.
+
+     Even rows go left-to-right; odd rows go right-to-left. The returned
+     arrays have length size² and implement numpy advanced indexing:
+         grid[rows, cols]       → 1-D sequence in snake order
+         grid[rows, cols] = seq → scatter a sequence back to the grid
+     """
+     if size <= 0:
+         raise ValueError("size must be positive")
+     rows, cols = [], []
+     for row in range(size):
+         columns = range(size) if row % 2 == 0 else range(size - 1, -1, -1)
+         for col in columns:
+             rows.append(row)
+             cols.append(col)
+     return np.array(rows), np.array(cols)
+
+
+ # ---------------------------------------------------------------------------
+ # Building blocks
+ # ---------------------------------------------------------------------------
+
+ class EmbedderBlock(eqx.Module):
+     """Spin-state + lattice-position embedder.
+
+     Each position in the snake-order sequence gets three embeddings, summed:
+       • a learned spin-state embedding (token ∈ {0, 1})
+       • a learned row-position embedding
+       • a learned column-position embedding
+
+     The row/column indices are derived from `snake_order` at trace time, so
+     they fold to compile-time constants — no extra array parameters are stored.
+     """
+
+     state_embedder: eqx.nn.Embedding
+     row_embedder: eqx.nn.Embedding
+     column_embedder: eqx.nn.Embedding
+     layernorm: eqx.nn.LayerNorm
+     dropout: eqx.nn.Dropout
+     lattice_size: int = eqx.field(static=True)
+
+     def __init__(
+         self,
+         state_size: int,
+         lattice_size: int,
+         embedding_size: int,
+         hidden_size: int,
+         dropout_rate: float,
+         key: jax.Array,
+     ):
+         state_key, row_key, col_key = jax.random.split(key, 3)
+         self.state_embedder = eqx.nn.Embedding(
+             num_embeddings=state_size, embedding_size=embedding_size, key=state_key
+         )
+         self.row_embedder = eqx.nn.Embedding(
+             num_embeddings=lattice_size, embedding_size=embedding_size, key=row_key
+         )
+         self.column_embedder = eqx.nn.Embedding(
+             num_embeddings=lattice_size, embedding_size=embedding_size, key=col_key
+         )
+         self.layernorm = eqx.nn.LayerNorm(shape=hidden_size)
+         self.dropout = eqx.nn.Dropout(dropout_rate)
+         self.lattice_size = lattice_size
+
+     def __call__(
+         self,
+         states: Int[Array, " seq_len"],
+         enable_dropout: bool = False,
+         key: jax.Array | None = None,
+     ) -> Float[Array, "seq_len hidden_size"]:
+         rows, cols = snake_order(self.lattice_size)  # concrete at trace time
+         x_states = jax.vmap(self.state_embedder)(states)
+         x_rows = jax.vmap(self.row_embedder)(jnp.asarray(rows))
+         x_cols = jax.vmap(self.column_embedder)(jnp.asarray(cols))
+         x = x_states + x_rows + x_cols
+         x = jax.vmap(self.layernorm)(x)
+         x = self.dropout(x, inference=not enable_dropout, key=key)
+         return x
+
+
+ class FeedForwardBlock(eqx.Module):
+     """Position-wise feed-forward block with residual connection."""
+
+     mlp: eqx.nn.Linear
+     output: eqx.nn.Linear
+     layernorm: eqx.nn.LayerNorm
+     dropout: eqx.nn.Dropout
+
+     def __init__(
+         self,
+         hidden_size: int,
+         intermediate_size: int,
+         dropout_rate: float,
+         key: jax.Array,
+     ):
+         mlp_key, out_key = jax.random.split(key)
+         self.mlp = eqx.nn.Linear(hidden_size, intermediate_size, key=mlp_key)
+         self.output = eqx.nn.Linear(intermediate_size, hidden_size, key=out_key)
+         self.layernorm = eqx.nn.LayerNorm(shape=hidden_size)
+         self.dropout = eqx.nn.Dropout(dropout_rate)
+
+     def __call__(
+         self,
+         inputs: Float[Array, " hidden_size"],
+         enable_dropout: bool = False,
+         key: jax.Array | None = None,
+     ) -> Float[Array, " hidden_size"]:
+         x = jax.nn.gelu(self.mlp(inputs))
+         x = self.output(x)
+         x = self.dropout(x, inference=not enable_dropout, key=key)
+         x = x + inputs
+         x = self.layernorm(x)
+         return x
+
+
+ class AttentionBlock(eqx.Module):
+     """Multi-head self-attention with causal (lower-triangular) mask."""
+
+     attention: eqx.nn.MultiheadAttention
+     layernorm: eqx.nn.LayerNorm
+     dropout: eqx.nn.Dropout
+     num_heads: int = eqx.field(static=True)
+
+     def __init__(
+         self,
+         hidden_size: int,
+         num_heads: int,
+         dropout_rate: float,
+         attention_dropout_rate: float,
+         key: jax.Array,
+     ):
+         self.num_heads = num_heads
+         self.attention = eqx.nn.MultiheadAttention(
+             num_heads=num_heads,
+             query_size=hidden_size,
+             use_query_bias=True,
+             use_key_bias=True,
+             use_value_bias=True,
+             use_output_bias=True,
+             dropout_p=attention_dropout_rate,
+             key=key,
+         )
+         self.layernorm = eqx.nn.LayerNorm(shape=hidden_size)
+         self.dropout = eqx.nn.Dropout(dropout_rate)
+
+     def __call__(
+         self,
+         inputs: Float[Array, "seq_len hidden_size"],
+         mask: Int[Array, " seq_len"] | None,
+         enable_dropout: bool = False,
+         key: jax.Array | None = None,
+     ) -> Float[Array, "seq_len hidden_size"]:
+         attn_key, drop_key = (None, None) if key is None else jax.random.split(key)
+         if mask is not None:
+             mask = self._causal_mask(mask)
+         x = self.attention(
+             query=inputs, key_=inputs, value=inputs,
+             mask=mask, inference=not enable_dropout, key=attn_key,
+         )
+         x = self.dropout(x, inference=not enable_dropout, key=drop_key)
+         x = x + inputs
+         x = jax.vmap(self.layernorm)(x)
+         return x
+
+     def _causal_mask(
+         self, mask: Int[Array, " seq_len"]
+     ) -> Float[Array, "num_heads seq_len seq_len"]:
+         """Lower-triangular mask combined with a padding mask."""
+         n = mask.shape[0]
+         pad = jnp.multiply(mask[:, None], mask[None, :])        # [n, n]
+         causal = jnp.tril(jnp.ones((n, n), dtype=mask.dtype))   # [n, n]
+         m = jnp.multiply(pad, causal)                           # [n, n]
+         m = jnp.broadcast_to(m[None], (self.num_heads, n, n))   # [H, n, n]
+         return m.astype(jnp.float32)
+
+
+ class TransformerLayer(eqx.Module):
+     """One transformer block: attention followed by feed-forward."""
+
+     attention_block: AttentionBlock
+     ff_block: FeedForwardBlock
+
+     def __init__(
+         self,
+         hidden_size: int,
+         intermediate_size: int,
+         num_heads: int,
+         dropout_rate: float,
+         attention_dropout_rate: float,
+         key: jax.Array,
+     ):
+         attn_key, ff_key = jax.random.split(key)
+         self.attention_block = AttentionBlock(
+             hidden_size=hidden_size,
+             num_heads=num_heads,
+             dropout_rate=dropout_rate,
+             attention_dropout_rate=attention_dropout_rate,
+             key=attn_key,
+         )
+         self.ff_block = FeedForwardBlock(
+             hidden_size=hidden_size,
+             intermediate_size=intermediate_size,
+             dropout_rate=dropout_rate,
+             key=ff_key,
+         )
+
+     def __call__(
+         self,
+         inputs: Float[Array, "seq_len hidden_size"],
+         mask: Int[Array, " seq_len"] | None = None,
+         *,
+         enable_dropout: bool = False,
+         key: jax.Array | None = None,
+     ) -> Float[Array, "seq_len hidden_size"]:
+         attn_key, ff_key = (None, None) if key is None else jax.random.split(key)
+         x = self.attention_block(inputs, mask, enable_dropout=enable_dropout, key=attn_key)
+         n = x.shape[0]
+         ff_keys = None if ff_key is None else jax.random.split(ff_key, n)
+         x = jax.vmap(self.ff_block, in_axes=(0, None, 0))(x, enable_dropout, ff_keys)
+         return x
+
+
+ # ---------------------------------------------------------------------------
+ # Encoder and top-level Generator
+ # ---------------------------------------------------------------------------
+
+ class Encoder(eqx.Module):
+     """Stack of transformer layers over a snake-ordered spin sequence."""
+
+     embedder_block: EmbedderBlock
+     layers: list[TransformerLayer]
+
+     def __init__(
+         self,
+         state_size: int,
+         lattice_size: int,
+         embedding_size: int,
+         hidden_size: int,
+         intermediate_size: int,
+         num_layers: int,
+         num_heads: int,
+         dropout_rate: float,
+         attention_dropout_rate: float,
+         key: jax.Array,
+     ):
+         emb_key, layer_key = jax.random.split(key)
+         self.embedder_block = EmbedderBlock(
+             state_size=state_size,
+             lattice_size=lattice_size,
+             embedding_size=embedding_size,
+             hidden_size=hidden_size,
+             dropout_rate=dropout_rate,
+             key=emb_key,
+         )
+         layer_keys = jax.random.split(layer_key, num_layers)
+         self.layers = [
+             TransformerLayer(
+                 hidden_size=hidden_size,
+                 intermediate_size=intermediate_size,
+                 num_heads=num_heads,
+                 dropout_rate=dropout_rate,
+                 attention_dropout_rate=attention_dropout_rate,
+                 key=lk,
+             )
+             for lk in layer_keys
+         ]
+
+     def __call__(
+         self,
+         states: Int[Array, " seq_len"],
+         *,
+         enable_dropout: bool = False,
+         key: jax.Array | None = None,
+     ) -> Float[Array, "seq_len hidden_size"]:
+         emb_key, l_key = (None, None) if key is None else jax.random.split(key)
+         x = self.embedder_block(states, enable_dropout=enable_dropout, key=emb_key)
+         mask = jnp.ones_like(states, dtype=jnp.int32)  # no padding; causal only
+         for layer in self.layers:
+             cl_key, l_key = (None, None) if l_key is None else jax.random.split(l_key)
+             x = layer(x, mask, enable_dropout=enable_dropout, key=cl_key)
+         return x
+
+
+ class Generator(eqx.Module):
+     """Autoregressive transformer generator for Ising spin configurations.
+
+     Input:  token_ids — integer spin tokens {0=down, 1=up} in snake order.
+     Output: logits — shape (seq_len, state_size), where logits[t] is the
+             predicted distribution over the spin at position t+1
+             given positions 0..t.
+     """
+
+     encoder: Encoder
+     lm_head: eqx.nn.Linear
+     dropout: eqx.nn.Dropout
+
+     def __init__(self, config: Mapping, key: jax.Array):
+         enc_key, head_key = jax.random.split(key)
+         self.encoder = Encoder(
+             state_size=config["state_size"],
+             lattice_size=config["lattice_size"],
+             embedding_size=config["hidden_size"],
+             hidden_size=config["hidden_size"],
+             intermediate_size=config["intermediate_size"],
+             num_layers=config["num_hidden_layers"],
+             num_heads=config["num_attention_heads"],
+             dropout_rate=config["hidden_dropout_prob"],
+             attention_dropout_rate=config["attention_probs_dropout_prob"],
+             key=enc_key,
+         )
+         self.lm_head = eqx.nn.Linear(
+             in_features=config["hidden_size"],
+             out_features=config["state_size"],
+             key=head_key,
+         )
+         self.dropout = eqx.nn.Dropout(config["hidden_dropout_prob"])
+
+     def __call__(
+         self,
+         inputs: dict[str, Int[Array, " seq_len"]],
+         enable_dropout: bool = False,
+         key: jax.Array | None = None,
+     ) -> Float[Array, "seq_len state_size"]:
+         e_key, d_key = (None, None) if key is None else jax.random.split(key)
+         x = self.encoder(inputs["token_ids"], enable_dropout=enable_dropout, key=e_key)
+         x = self.dropout(x, inference=not enable_dropout, key=d_key)
+         return jax.vmap(self.lm_head)(x)
+
+
+ # ---------------------------------------------------------------------------
+ # Default configuration
+ # ---------------------------------------------------------------------------
+
+ gen_config = {
+     "state_size": 2,      # spin tokens: 0 (↓) or 1 (↑)
+     "lattice_size": 32,   # L×L lattice → L² = 1024 sequence length
+     "hidden_size": 128,
+     "num_hidden_layers": 2,
+     "num_attention_heads": 2,
+     "hidden_act": "gelu",
+     "intermediate_size": 512,
+     "hidden_dropout_prob": 0.1,
+     "attention_probs_dropout_prob": 0.1,
+ }
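As a minimal usage sketch of the classes above (assuming model.py is importable; nothing here is beyond what it defines), this instantiates the default Generator and runs one teacher-forced forward pass:

    import jax
    import jax.numpy as jnp
    from model import Generator, gen_config

    model = Generator(config=gen_config, key=jax.random.PRNGKey(0))
    seq_len = gen_config["lattice_size"] ** 2     # 32² = 1024 sites
    tokens = jnp.zeros(seq_len, dtype=jnp.int32)  # an all-"down" configuration
    logits = model({"token_ids": tokens})         # (1024, 2); logits[t] predicts s_{t+1}
    print(logits.shape)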
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ jax[cuda12]
+ equinox
+ optax
+ einops
+ tqdm
+ jaxtyping
+ gradio>=4.0
+ matplotlib
+ scipy
sample.py ADDED
@@ -0,0 +1,208 @@
+ #!/usr/bin/env python
+ # /// script
+ # dependencies = [
+ #     "jax[cuda12]",
+ #     "equinox",
+ #     "matplotlib",
+ #     "jaxtyping",
+ # ]
+ # ///
+ """Sample spin configurations from a trained Ising Generator checkpoint.
+
+ Usage:
+     python sample.py --checkpoint model.eqx [--num-samples 16]
+                      [--output samples.npy] [--plot] [--seed 0]
+
+ How autoregressive sampling works
+ ---------------------------------
+ The model is trained with a causal (lower-triangular) attention mask, so at
+ position t the output logits[t] are a function of spins s_0 … s_t only.
+ We exploit this to sample the full sequence one spin at a time:
+
+   1. Sample s_0 uniformly (the model has no BOS token).
+   2. For t = 0, 1, …, L²-2:
+      a. Run the full forward pass on the current token buffer.
+         Spins at positions > t are still placeholder zeros, but causal
+         masking prevents the network from attending to them.
+      b. Draw s_{t+1} ~ Categorical(softmax(logits[t])).
+      c. Write s_{t+1} into the buffer.
+
+ This is O(L⁶) in compute (L² sampling steps × O(L⁴) attention per step),
+ roughly 10⁹ operations for a 32×32 lattice. `jax.lax.scan` compiles the
+ loop body once so subsequent calls are fast.
+ """
+
+ import argparse
+ import functools
+ from pathlib import Path
+
+ import equinox as eqx
+ import jax
+ import jax.numpy as jnp
+ import numpy as np
+ from jaxtyping import Array, Int
+
+ from model import Generator, gen_config, snake_order
+
+
+ # ---------------------------------------------------------------------------
+ # Checkpoint I/O
+ # ---------------------------------------------------------------------------
+
+ def load_checkpoint(
+     path: Path,
+     config: dict = gen_config,
+     key: jax.Array | None = None,
+ ) -> Generator:
+     """Deserialise a Generator from *path*.
+
+     A fresh model is initialised with *config* (weights are immediately
+     overwritten), so *key* only needs to be reproducible across calls if you
+     care about the random seed used for the skeleton — in practice any key
+     works.
+     """
+     if key is None:
+         key = jax.random.PRNGKey(0)
+     skeleton = Generator(config=config, key=key)
+     return eqx.tree_deserialise_leaves(path, skeleton)
+
+
+ # ---------------------------------------------------------------------------
+ # Sampling
+ # ---------------------------------------------------------------------------
+
+ def sample_sequence(
+     model: Generator,
+     key: jax.Array,
+ ) -> Int[Array, " seq_len"]:
+     """Autoregressively sample one spin sequence in snake order.
+
+     This function is JAX-traceable and safe to use inside ``jax.vmap`` or
+     ``jax.lax.scan``. JIT-compile it (or wrap in ``sample_batch``) for best
+     performance — the first call will take longer due to compilation.
+     """
+     lattice_size = model.encoder.embedder_block.lattice_size
+     seq_len = lattice_size * lattice_size
+
+     def step(carry, t):
+         tokens, step_key = carry
+         step_key, sample_key = jax.random.split(step_key)
+         logits = model({"token_ids": tokens}, enable_dropout=False, key=None)
+         # logits[t] → distribution over s_{t+1}
+         next_token = jax.random.categorical(sample_key, logits[t])
+         tokens = tokens.at[t + 1].set(next_token)
+         return (tokens, step_key), None
+
+     key, first_key = jax.random.split(key)
+     first_token = jax.random.randint(first_key, shape=(), minval=0, maxval=2)
+     tokens = jnp.zeros(seq_len, dtype=jnp.int32).at[0].set(first_token)
+     (tokens, _), _ = jax.lax.scan(step, (tokens, key), jnp.arange(seq_len - 1))
+     return tokens
+
+
+ @eqx.filter_jit
+ def sample_batch(
+     model: Generator,
+     num_samples: int,
+     key: jax.Array,
+ ) -> Int[Array, "num_samples seq_len"]:
+     """Sample *num_samples* configurations in parallel (vmapped + JIT'd).
+
+     The first call triggers compilation; subsequent calls with the same
+     ``num_samples`` reuse the compiled code.
+     """
+     keys = jax.random.split(key, num_samples)
+     return jax.vmap(sample_sequence, in_axes=(None, 0))(model, keys)
+
+
+ # ---------------------------------------------------------------------------
+ # Grid conversion
+ # ---------------------------------------------------------------------------
+
+ def tokens_to_grid(
+     tokens: np.ndarray | Array,
+     lattice_size: int,
+ ) -> np.ndarray:
+     """Convert a snake-ordered token sequence {0, 1} → a {-1, +1} L×L grid."""
+     tokens = np.asarray(tokens)
+     rows, cols = snake_order(lattice_size)
+     grid = np.empty((lattice_size, lattice_size), dtype=np.int8)
+     grid[rows, cols] = (tokens * 2 - 1).astype(np.int8)
+     return grid
+
+
+ def tokens_to_grids(
+     tokens: np.ndarray | Array,
+     lattice_size: int,
+ ) -> np.ndarray:
+     """Batch version of ``tokens_to_grid``. Input shape: (N, L²)."""
+     tokens = np.asarray(tokens)
+     return np.stack([tokens_to_grid(tokens[i], lattice_size) for i in range(len(tokens))])
+
+
+ # ---------------------------------------------------------------------------
+ # CLI
+ # ---------------------------------------------------------------------------
+
+ def parse_args():
+     p = argparse.ArgumentParser(description="Sample from a trained Ising Generator.")
+     p.add_argument("--checkpoint", type=Path, required=True,
+                    help="Path to the .eqx checkpoint file.")
+     p.add_argument("--num-samples", type=int, default=16)
+     p.add_argument("--output", type=Path, default=None,
+                    help="Save sampled {-1,+1} grids as a .npy file (N, L, L).")
+     p.add_argument("--plot", action="store_true",
+                    help="Display a grid of sampled configurations with matplotlib.")
+     p.add_argument("--seed", type=int, default=0)
+     return p.parse_args()
+
+
+ def main():
+     args = parse_args()
+
+     print(f"Loading checkpoint from {args.checkpoint} …")
+     model = load_checkpoint(args.checkpoint)
+     lattice_size = model.encoder.embedder_block.lattice_size
+     print(f"  lattice_size={lattice_size}, seq_len={lattice_size**2}")
+
+     key = jax.random.PRNGKey(args.seed)
+     print(f"Sampling {args.num_samples} configurations "
+           f"(compiling on first call) …")
+     tokens = sample_batch(model, args.num_samples, key)
+     tokens = np.asarray(tokens)
+
+     grids = tokens_to_grids(tokens, lattice_size)  # (N, L, L), values {-1, +1}
+     print(f"  shape: {grids.shape}  dtype: {grids.dtype}")
+     print(f"  mean magnetization  : {grids.mean():.4f}")
+     print(f"  mean |magnetization|: {np.abs(grids.mean(axis=(1, 2))).mean():.4f}")
+
+     if args.output is not None:
+         np.save(args.output, grids)
+         print(f"Saved → {args.output}")
+
+     if args.plot:
+         try:
+             import matplotlib.pyplot as plt
+         except ImportError:
+             print("matplotlib not available; skipping plot (pip install matplotlib).")
+             return
+
+         cols = min(8, args.num_samples)
+         rows = (args.num_samples + cols - 1) // cols
+         fig, axes = plt.subplots(rows, cols, figsize=(cols * 1.5, rows * 1.5))
+         axes = np.array(axes).reshape(-1)
+         for i, ax in enumerate(axes):
+             if i < len(grids):
+                 ax.imshow(grids[i], cmap="gray", vmin=-1, vmax=1, interpolation="nearest")
+             ax.axis("off")
+         fig.suptitle(
+             f"Sampled Ising configurations "
+             f"(L={lattice_size}, n={args.num_samples})",
+             fontsize=10,
+         )
+         plt.tight_layout()
+         plt.show()
+
+
+ if __name__ == "__main__":
+     main()
samples-2-epoch.png ADDED
spins.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1890a0f2baeee4212b3bd79b74bff1f1f31135d9031ffb8c79bc558eae23a83
+ size 10240128
spins_test.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:040dea2a4f3b5753c7531d7a55643a7be3c8be4a7d9b9c1f11f481ea522ff67e
+ size 1024128
train.py ADDED
@@ -0,0 +1,229 @@
1
+ #!/usr/bin/env python
2
+ # /// script
3
+ # dependencies = [
4
+ # "jax[cuda12]",
5
+ # "equinox",
6
+ # "optax",
7
+ # "einops",
8
+ # "tqdm",
9
+ # "jaxtyping",
10
+ # ]
11
+ # ///
12
+ """Training script for the Ising spin Generator.
13
+
14
+ Usage:
15
+ python train.py [--epochs N] [--batch-size B] [--learning-rate LR]
16
+ [--data path/to/spins.npy] [--output-checkpoint model.eqx]
17
+ """
18
+
19
+ import argparse
20
+ import functools
21
+ from pathlib import Path
22
+
23
+ import einops
24
+ import equinox as eqx
25
+ import jax
26
+ import jax.numpy as jnp
27
+ import numpy as np
28
+ import optax
29
+ from tqdm.auto import tqdm
30
+
31
+ from model import Generator, gen_config, snake_order
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Data loading
36
+ # ---------------------------------------------------------------------------
37
+
38
+ def load_ising_data(
39
+ path: Path, train_frac: float = 0.9
40
+ ) -> tuple[np.ndarray, np.ndarray]:
41
+ """Load spins.npy, map {-1,1} β†’ {0,1}, flatten with snake ordering.
42
+
43
+ Returns ``(train_tokens, val_tokens)``, each ``(N, LΒ²)`` int32.
44
+ """
45
+ spins = np.load(path) # (N, L, L) int8
46
+ lattice_size = spins.shape[1]
47
+ tokens = (spins.astype(np.int32) + 1) // 2 # (N, L, L), values in {0,1}
48
+ rows, cols = snake_order(lattice_size)
49
+ tokens = tokens[:, rows, cols] # (N, LΒ²)
50
+ n_train = int(len(tokens) * train_frac)
51
+ return tokens[:n_train], tokens[n_train:]
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # Batch preparation
56
+ # ---------------------------------------------------------------------------
57
+
58
+ def prepare_batch(batch: np.ndarray, num_devices: int) -> dict:
59
+ """Reshape ``(batch, seq)`` β†’ ``(devices, batch//devices, seq)`` for pmap."""
60
+ token_ids = einops.rearrange(
61
+ batch,
62
+ "(devices batch) seq -> devices batch seq",
63
+ devices=num_devices,
64
+ )
65
+ return {"token_ids": token_ids}
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Training / eval steps
70
+ # ---------------------------------------------------------------------------
71
+
72
+ @eqx.filter_value_and_grad
73
+ def compute_loss(model, inputs, key):
74
+ """Autoregressive cross-entropy: logits[:, :-1] predicts token_ids[:, 1:]."""
75
+ batch_size = inputs["token_ids"].shape[0]
76
+ keys = jax.random.split(key, batch_size)
77
+ logits = jax.vmap(model, in_axes=(0, None, 0))(inputs, True, keys)
78
+ return jnp.mean(
79
+ optax.softmax_cross_entropy_with_integer_labels(
80
+ logits=logits[:, :-1, :],
81
+ labels=inputs["token_ids"][:, 1:],
82
+ )
83
+ )
84
+
85
+
86
+ def make_step(model, inputs, opt_state, key, tx):
87
+ key, new_key = jax.random.split(key)
88
+ loss, grads = compute_loss(model, inputs, key)
89
+ grads = jax.lax.pmean(grads, axis_name="devices")
90
+ updates, opt_state = tx.update(grads, opt_state, model)
91
+ model = eqx.apply_updates(model, updates)
92
+ return loss, model, opt_state, new_key
93
+
94
+
95
+ def make_eval_step(model, inputs):
96
+ """Per-device mean NLL (nats/token), called inside pmap."""
97
+ logits = jax.vmap(functools.partial(model, enable_dropout=False))(inputs)
98
+ return jnp.mean(
99
+ optax.softmax_cross_entropy_with_integer_labels(
100
+ logits=logits[:, :-1, :],
101
+ labels=inputs["token_ids"][:, 1:],
102
+ )
103
+ )
104
+
105
+
106
+ p_make_eval_step = eqx.filter_pmap(make_eval_step)


# ---------------------------------------------------------------------------
# pmap helpers
# ---------------------------------------------------------------------------

def replicate_for_pmap(value, devices):
    mesh = jax.sharding.Mesh(np.asarray(devices), ("devices",))
    sharding = jax.sharding.NamedSharding(mesh, jax.sharding.PartitionSpec("devices"))

    def replicate_leaf(leaf):
        leaf = jnp.asarray(leaf)
        leaf = jnp.broadcast_to(leaf, (len(devices),) + leaf.shape)
        return jax.device_put(leaf, sharding)

    return jax.tree.map(replicate_leaf, value)


def unreplicate_from_pmap(value):
    return jax.tree.map(lambda leaf: leaf[0], value)
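
# Round-trip sketch (illustrative; the leaf shape is hypothetical):
# replicate_for_pmap prepends a device axis to every leaf, e.g. a (d_model,)
# bias becomes (num_devices, d_model) holding identical copies, and
# unreplicate_from_pmap takes copy 0 back off, so
# unreplicate_from_pmap(replicate_for_pmap(x, devices)) recovers x
# leaf-for-leaf.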


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def parse_args():
    p = argparse.ArgumentParser(description="Train an Ising spin generator.")
    p.add_argument("--epochs", type=int, default=10)
    p.add_argument("--batch-size", type=int, default=32)
    p.add_argument("--learning-rate", type=float, default=1e-4)
    p.add_argument("--max-train-steps", type=int, default=None)
    p.add_argument("--max-eval-batches", type=int, default=None)
    p.add_argument("--data", type=Path,
                   default=Path(__file__).parent / "spins.npy")
    p.add_argument("--output-checkpoint", type=Path, default=None)
    p.add_argument("--seed", type=int, default=5678)
    return p.parse_args()
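
# Example invocation (a sketch; the checkpoint path is hypothetical):
#   python train.py --epochs 2 --batch-size 32 \
#       --output-checkpoint outputs/ce_checkpoint.eqx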


def main():
    args = parse_args()
    num_devices = jax.device_count()
    print(f"JAX devices: {jax.devices()}")
    assert args.batch_size % num_devices == 0, (
        "batch-size must be a multiple of the number of devices"
    )

    key = jax.random.PRNGKey(args.seed)
    model_key, train_key = jax.random.split(key)
    model = Generator(config=gen_config, key=model_key)

    train_tokens, val_tokens = load_ising_data(args.data)
    print(
        f"Train: {len(train_tokens):,}  Val: {len(val_tokens):,}  "
        f"Seq len: {train_tokens.shape[1]}"
    )

    tx = optax.chain(
        optax.clip_by_global_norm(1.0),
        optax.adam(learning_rate=args.learning_rate),
    )
    # Mask to float-only leaves so integer bookkeeping fields are excluded.
    tx = optax.masked(tx, jax.tree.map(eqx.is_inexact_array, model))
    opt_state = tx.init(model)

    p_make_step = eqx.filter_pmap(
        functools.partial(make_step, tx=tx), axis_name="devices"
    )

    devices = jax.local_devices()
    opt_state = replicate_for_pmap(opt_state, devices)
    model = replicate_for_pmap(model, devices)
    train_key = replicate_for_pmap(train_key, devices)

    global_step = 0
    for epoch in range(args.epochs):
        rng = np.random.default_rng(args.seed + epoch)
        shuffled = train_tokens[rng.permutation(len(train_tokens))]

        num_batches = len(shuffled) // args.batch_size
        if args.max_train_steps is not None:
            num_batches = min(num_batches, max(args.max_train_steps - global_step, 0))

        with tqdm(range(num_batches), unit="steps",
                  desc=f"Epoch {epoch + 1}/{args.epochs}") as pbar:
            for step in pbar:
                batch = shuffled[step * args.batch_size : (step + 1) * args.batch_size]
                inputs = prepare_batch(batch, num_devices)
                loss, model, opt_state, train_key = p_make_step(
                    model, inputs, opt_state, train_key
                )
                global_step += 1
                # loss holds one mean per device; average (rather than sum)
                # so the reported value is comparable across device counts.
                pbar.set_postfix(loss=float(np.mean(loss)))
                if args.max_train_steps and global_step >= args.max_train_steps:
                    break

        if args.max_train_steps and global_step >= args.max_train_steps:
            break

    # ---- validation ----
    num_val = len(val_tokens) // args.batch_size
    if args.max_eval_batches is not None:
        num_val = min(num_val, args.max_eval_batches)

    val_losses = []
    for step in tqdm(range(num_val), unit="steps", desc="Validation"):
        batch = val_tokens[step * args.batch_size : (step + 1) * args.batch_size]
        inputs = prepare_batch(batch, num_devices)
        val_losses.append(float(np.mean(p_make_eval_step(model, inputs))))

    print(f"Val NLL: {np.mean(val_losses):.4f} nats/token")

    if args.output_checkpoint is not None:
        args.output_checkpoint.parent.mkdir(parents=True, exist_ok=True)
        eqx.tree_serialise_leaves(
            args.output_checkpoint, unreplicate_from_pmap(model)
        )
        print(f"Saved checkpoint → {args.output_checkpoint}")


if __name__ == "__main__":
    main()
vi_train.py ADDED
@@ -0,0 +1,288 @@
#!/usr/bin/env python
# /// script
# dependencies = [
#     "jax[cuda12]",
#     "equinox",
#     "optax",
#     "tqdm",
#     "jaxtyping",
# ]
# ///
"""Variational inference fine-tuning of the Ising Generator.

Objective
---------
Minimise the variational free energy

    F[q] = E_{s~q}[E(s)] − T · H[q]
         = T · E_{s~q}[ E(s)/T + log q(s) ]

which equals T · KL(q ∥ p*) up to the additive constant −T·log Z, where
p*(s) ∝ exp(−E(s)/T) is the Ising Boltzmann distribution.
As F decreases, the model q approaches the correct physics.

Gradient estimator (REINFORCE / score-function)
-----------------------------------------------
    ∇_θ F = T · E_{s~q}[( E(s)/T + log q(s) − b ) · ∇_θ log q(s)]

where b, the batch-mean reward, is a baseline (control variate) that
reduces the variance of the estimator without biasing it.

Per training step
-----------------
1. Sample a batch of configs from the current model q_θ (slow on CPU)
2. Compute Ising energy E(s) for each sample (fast, no model)
3. Compute log q(s) via a teacher-forced forward pass (fast, one pass)
4. Assemble reward R = E/T + log q, subtract baseline, backprop (fast)

Speed note
----------
Step 1 dominates on CPU (~5 s / sample on a 32×32 lattice). On GPU it is
typically 10–100× faster, and VI training with batch_size ≥ 32 is practical.
The --checkpoint flag warm-starts from a CE-pretrained model, which
dramatically reduces the number of VI steps needed to converge.

Monitoring
----------
At each step the following quantities are logged:

    e   = ⟨E/N⟩        mean energy per spin (converges to data ≈ −1.45)
    h   = −⟨log q⟩/N   entropy per spin in nats (random init ≈ 0.693)
    f   = e − T·h      Helmholtz free energy per spin (we minimise this)
    |m| = ⟨|Σs / N|⟩   mean absolute magnetisation
"""

import argparse
from pathlib import Path

import equinox as eqx
import jax
import jax.numpy as jnp
import numpy as np
import optax
from tqdm.auto import tqdm

from model import Generator, gen_config, snake_order
from sample import load_checkpoint, sample_batch


# ---------------------------------------------------------------------------
# Physical constants
# ---------------------------------------------------------------------------

J = 1.0
T_C = 2.0 / np.log(1.0 + np.sqrt(2.0))  # exact 2D Ising T_c ≈ 2.2692


# ---------------------------------------------------------------------------
# Ising energy (pure JAX: no model parameters, no gradient needed)
# ---------------------------------------------------------------------------

def ising_energy_per_spin(token_ids: jax.Array) -> jax.Array:
    """Ising energy per spin from a snake-ordered token sequence {0, 1}.

    Hamiltonian: H = −J Σ_{⟨ij⟩} s_i s_j with periodic boundary conditions.
    Returns a scalar.
    """
    L = gen_config["lattice_size"]
    rows, cols = snake_order(L)  # concrete at trace time
    spins = (token_ids * 2 - 1).astype(jnp.float32)  # {0,1} → {−1,+1}
    grid = jnp.zeros((L, L)).at[jnp.asarray(rows), jnp.asarray(cols)].set(spins)
    right = jnp.roll(grid, -1, axis=1)
    down = jnp.roll(grid, -1, axis=0)
    return -J * (grid * right + grid * down).sum() / (L * L)
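
# Sanity check (a sketch, not executed by the script): in the all-up
# configuration every site contributes −J from its right bond and −J from
# its down bond, so the energy per spin is exactly −2J:
#
#     all_up = jnp.ones(gen_config["lattice_size"] ** 2, dtype=jnp.int32)
#     assert float(ising_energy_per_spin(all_up)) == -2.0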


# ---------------------------------------------------------------------------
# VI loss (REINFORCE with per-batch baseline)
# ---------------------------------------------------------------------------

@eqx.filter_value_and_grad(has_aux=True)
def compute_vi_loss(
    model,
    token_ids: jax.Array,
    T: float,
) -> tuple[jax.Array, dict]:
    """REINFORCE proxy loss for ∇_θ F[q].

    The returned scalar is *not* the free energy; it is the REINFORCE
    surrogate whose gradient equals ∇_θ F/T. Use aux["f"] to track F.

    Parameters
    ----------
    model     : Generator
    token_ids : int array (batch, seq_len) samples drawn from q_θ
    T         : target temperature

    Returns
    -------
    (loss, aux), grads
        aux keys: e, h, f (per spin), |m|, reward_std (variance diagnostic)
    """
    N = gen_config["lattice_size"] ** 2

    # ── log q(s) via teacher-forced forward pass ─────────────────────────────
    # Disable dropout so we get the exact model log-probability.
    # in_axes=(0, None, None): vmap over batch; broadcast enable_dropout and key.
    logits = jax.vmap(model, in_axes=(0, None, None))(
        {"token_ids": token_ids}, False, None
    )  # (batch, seq_len, state_size)

    # Σ_t log p(s_t | s_{<t}), summed over the sequence axis
    log_q = -optax.softmax_cross_entropy_with_integer_labels(
        logits[:, :-1, :], token_ids[:, 1:]
    ).sum(axis=-1)  # (batch,)

    # ── Ising energies ────────────────────────────────────────────────────────
    # jax.lax.stop_gradient keeps energies out of the autodiff graph.
    energies = jax.lax.stop_gradient(
        jax.vmap(ising_energy_per_spin)(token_ids)  # (batch,)
    )

    # ── REINFORCE reward R = E/T + log q(s) ──────────────────────────────────
    reward = jax.lax.stop_gradient(energies / T + log_q)
    baseline = reward.mean()

    # Surrogate loss: ∇ loss = E_q[(R − b) · ∇ log q] = ∇ F/T, so gradient
    # descent on this scalar decreases the free energy. (Note the sign:
    # above-average-reward samples must have their log q pushed down.)
    loss = jnp.mean(jax.lax.stop_gradient(reward - baseline) * log_q)

    # ── Diagnostics (returned as aux; they do not enter the gradient) ────────
    e = energies.mean()  # mean energy per spin
    h = -log_q.mean() / N  # entropy per spin (nats)
    f = e - T * h  # Helmholtz free energy per spin
    m = jnp.abs((token_ids * 2 - 1).astype(jnp.float32).mean(axis=-1)).mean()
    aux = {
        "e": e,
        "h": h,
        "f": f,
        "|m|": m,
        "reward_std": reward.std(),  # REINFORCE variance diagnostic
    }
    return loss, aux
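
# Sign intuition (a sketch): with two samples whose centred rewards are
# R − b = [+1, −1], the surrogate is (log q(s1) − log q(s2)) / 2; gradient
# descent lowers log q of the high-reward (high free-energy) sample and
# raises log q of the low-reward one, pushing q toward the Boltzmann
# distribution.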


# ---------------------------------------------------------------------------
# Single training step (JIT-compiled; does NOT include sampling)
# ---------------------------------------------------------------------------

@eqx.filter_jit
def vi_step(
    model,
    token_ids: jax.Array,
    opt_state,
    tx,
    T: float,
):
    """Compute VI loss + gradient and apply one optimiser update.

    Sampling is intentionally excluded so you can profile / replace it
    without re-compiling the gradient computation.
    """
    (loss, aux), grads = compute_vi_loss(model, token_ids, T)
    updates, opt_state = tx.update(grads, opt_state, model)
    model = eqx.apply_updates(model, updates)
    return loss, aux, model, opt_state


# ---------------------------------------------------------------------------
# Sampling helper
# ---------------------------------------------------------------------------

_SAMPLE_BATCH = 4  # fixed call-site batch; changing it triggers recompilation


def draw_samples(model, n: int, key: jax.Array) -> jax.Array:
    """Sample n configurations from the model in fixed-size batches.

    Returns a jnp int32 array of shape (n, L²).
    """
    all_tokens = []
    n_calls = -(-n // _SAMPLE_BATCH)  # ceiling division
    for _ in range(n_calls):
        key, subkey = jax.random.split(key)
        all_tokens.append(np.asarray(sample_batch(model, _SAMPLE_BATCH, subkey)))
    return jnp.asarray(np.concatenate(all_tokens)[:n])
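
# Example (illustrative): draw_samples(model, 10, key) makes
# -(-10 // _SAMPLE_BATCH) == 3 sampling calls of 4 configurations each,
# then truncates the concatenated (12, L²) array back to 10 rows.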


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def parse_args():
    p = argparse.ArgumentParser(
        description="Variational inference fine-tuning of an Ising Generator.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument("--checkpoint", type=Path, default=None,
                   help="Warm-start from this .eqx file (strongly recommended).")
    p.add_argument("--output-checkpoint", type=Path, default=None,
                   help="Save final model to this path.")
    p.add_argument("--num-steps", type=int, default=200)
    p.add_argument("--batch-size", type=int, default=16,
                   help="Configurations sampled from q per gradient step.")
    p.add_argument("--learning-rate", type=float, default=1e-4)
    p.add_argument("--temperature", type=float, default=T_C,
                   help=f"Target Boltzmann temperature (default: T_c ≈ {T_C:.4f}).")
    p.add_argument("--log-every", type=int, default=1)
    p.add_argument("--seed", type=int, default=0)
    return p.parse_args()
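
# Example invocation (a sketch; paths are hypothetical):
#   python vi_train.py --checkpoint outputs/ce_checkpoint.eqx \
#       --num-steps 200 --batch-size 16 \
#       --output-checkpoint outputs/vi_checkpoint.eqx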


def main():
    args = parse_args()
    T = args.temperature
    key = jax.random.PRNGKey(args.seed)

    # ── Model ─────────────────────────────────────────────────────────────────
    if args.checkpoint is not None:
        print(f"Loading checkpoint from {args.checkpoint} …")
        model = load_checkpoint(args.checkpoint)
    else:
        print("Initialising model from scratch.")
        print("  Tip: use --checkpoint to warm-start from a CE-pretrained model.")
        key, model_key = jax.random.split(key)
        model = Generator(config=gen_config, key=model_key)

    # ── Optimiser ─────────────────────────────────────────────────────────────
    tx = optax.chain(
        optax.clip_by_global_norm(1.0),
        optax.adam(learning_rate=args.learning_rate),
    )
    tx = optax.masked(tx, jax.tree.map(eqx.is_inexact_array, model))
    opt_state = tx.init(model)

    L = gen_config["lattice_size"]
    print(f"\nVI training | steps={args.num_steps} "
          f"batch={args.batch_size} T={T:.4f} lr={args.learning_rate} L={L}")
    print("  columns: e = ⟨E/N⟩   h = −⟨log q⟩/N   "
          "f = e−T·h (minimised)   |m| = mean |magnetisation|\n")

    # ── Training loop ─────────────────────────────────────────────────────────
    with tqdm(range(args.num_steps), unit="steps") as pbar:
        for step in pbar:
            # 1. Sample from current model (bottleneck on CPU)
            key, sample_key = jax.random.split(key)
            token_ids = draw_samples(model, args.batch_size, sample_key)

            # 2. VI gradient step (JIT-compiled teacher-forced forward pass)
            loss, aux, model, opt_state = vi_step(
                model, token_ids, opt_state, tx, T
            )

            if step % args.log_every == 0:
                pbar.set_postfix(
                    e=f"{float(aux['e']):.4f}",
                    h=f"{float(aux['h']):.4f}",
                    f=f"{float(aux['f']):.4f}",
                    m=f"{float(aux['|m|']):.3f}",
                    Rstd=f"{float(aux['reward_std']):.3f}",
                )

    # ── Save ──────────────────────────────────────────────────────────────────
    if args.output_checkpoint is not None:
        args.output_checkpoint.parent.mkdir(parents=True, exist_ok=True)
        eqx.tree_serialise_leaves(args.output_checkpoint, model)
        print(f"\nSaved checkpoint → {args.output_checkpoint}")


if __name__ == "__main__":
    main()