Spaces:
Running
Running
| """Numerical tests for TAF Agent formulas — paper §3.3, §5, §7.1. | |
| Verifies the corrected implementations match: | |
| - exact theoretical paper formulas (γ_Padé, D_f closed) | |
| - numerical ground truth (partition_Z at γ=1, mean_log_d) | |
| - paper Table §7.1 compression examples | |
| """ | |
| from __future__ import annotations | |
| import math | |
| import sys | |
| from pathlib import Path | |
| ROOT = Path(__file__).resolve().parent.parent | |
| sys.path.insert(0, str(ROOT / "cli")) | |
| sys.path.insert(0, str(ROOT / "python")) | |
| from diagnose_model import ( # type: ignore | |
| D_f_closed, free_energy_F, partition_Z, mean_log_d, | |
| entropy_S, heat_capacity_Cv, theta_eff_pade, EULER_GAMMA, | |
| ) | |
| from taf_browser import ( # type: ignore | |
| gamma_pade, d_horizon, theta_design, df_window, | |
| gamma_decompose, gamma_decompose_v2, | |
| ) | |
| # ───────────────────────────────────────────────────────────────────────── | |
| # γ_Padé (sanity) | |
| # ───────────────────────────────────────────────────────────────────────── | |
| def test_gamma_pade_T_zero_gives_one(): | |
| assert abs(gamma_pade(10000, 0) - 1.0) < 1e-12 | |
| def test_gamma_pade_at_T_theta_sqrt2_gives_zero(): | |
| """T = θ√2 ⇒ γ_Padé = 0 (paper saturation point).""" | |
| theta = 10000 | |
| T = int(theta * math.sqrt(2)) | |
| g = gamma_pade(theta, T) | |
| assert abs(g) < 1e-3, f"got {g}" | |
| def test_gamma_pade_at_T_theta_over_sqrt2_NOT_zero(): | |
| """T = θ/√2 (= d_alias) gives γ_Padé = 1/3, NOT 0 | |
| (only γ_LINEAR saturates here).""" | |
| theta = 10000 | |
| T = int(theta / math.sqrt(2)) | |
| g = gamma_pade(theta, T) | |
| assert abs(g - 1.0/3.0) < 0.01, f"expected ~1/3, got {g}" | |
| # ───────────────────────────────────────────────────────────────────────── | |
| # partition_Z γ=1: H_N + Euler-Mascheroni | |
| # ───────────────────────────────────────────────────────────────────────── | |
| def test_partition_Z_at_gamma_1_matches_H_N(): | |
| """partition_Z(1, N) should approximate H_N = ∑ 1/d to within 1%.""" | |
| for N in (100, 1000, 10000): | |
| H_N = sum(1.0 / d for d in range(1, N + 1)) | |
| Z_pred = partition_Z(1.0, N) | |
| rel_err = abs(Z_pred - H_N) / H_N | |
| assert rel_err < 0.01, f"N={N}: H_N={H_N:.4f}, code={Z_pred:.4f}, err={rel_err:.4f}" | |
| def test_partition_Z_at_gamma_neq_1_continuous(): | |
| """Z is continuous across γ=1 boundary (limit-consistent).""" | |
| Z_below = partition_Z(0.99999, 10000) | |
| Z_above = partition_Z(1.00001, 10000) | |
| Z_at = partition_Z(1.0, 10000) | |
| assert abs(Z_below - Z_at) < 0.05 * Z_at | |
| assert abs(Z_above - Z_at) < 0.05 * Z_at | |
| # ───────────────────────────────────────────────────────────────────────── | |
| # D_f_closed: exact paper Theorem 7.1 | |
| # ───────────────────────────────────────────────────────────────────────── | |
| def _df_numerical_truth(gamma: float, f: float, N: int) -> int: | |
| """Brute-force compute the smallest D such that ∑_{d=1}^D d^{-γ}/Z ≥ f.""" | |
| weights = [d ** (-gamma) for d in range(1, N + 1)] | |
| total = sum(weights) | |
| cum = 0.0 | |
| for d, w in enumerate(weights, start=1): | |
| cum += w | |
| if cum / total >= f: | |
| return d | |
| return N | |
| def test_D_f_phase_A_pythia_70m(): | |
| """Pythia-70m γ=0.748, paper Table §7.1: D_0.90 ≈ 1383.""" | |
| truth = _df_numerical_truth(0.748, 0.90, 2000) | |
| code = D_f_closed(0.748, 0.90, 2000) | |
| assert abs(code - truth) <= max(15, 0.02 * truth), \ | |
| f"phase A: code={code}, truth={truth}" | |
| def test_D_f_phase_A_pythia_2_8b(): | |
| """pythia-2.8b γ=0.674, paper: D_0.90 ≈ 1476.""" | |
| truth = _df_numerical_truth(0.674, 0.90, 2000) | |
| code = D_f_closed(0.674, 0.90, 2000) | |
| assert abs(code - truth) <= max(15, 0.02 * truth) | |
| def test_D_f_at_gamma_1_matches_discrete_truth(): | |
| """At γ=1: discrete D_f from cumulative ∑ 1/d ≥ f·H_N. | |
| Continuum approximation N^f overestimates by ~6%. | |
| """ | |
| truth = _df_numerical_truth(1.0, 0.9, 2000) | |
| code = D_f_closed(1.0, 0.9, 2000) | |
| assert code == truth, f"γ=1: code={code}, truth={truth}" | |
| # Document continuum-approx discrepancy: | |
| continuum = int(round(2000 ** 0.9)) | |
| assert abs(continuum - truth) > 30, \ | |
| "continuum N^f should differ from discrete truth at γ=1" | |
| def test_D_f_phase_B_severe_compression(): | |
| """γ=1.5: discrete-truth implementation → exact match.""" | |
| truth = _df_numerical_truth(1.5, 0.90, 2000) | |
| code = D_f_closed(1.5, 0.90, 2000) | |
| assert code == truth, f"phase B: code={code}, truth={truth}" | |
| assert code < 200, f"phase B should be tiny, got {code}" | |
| def test_D_f_llama_3_8b_phase_B(): | |
| """LLaMA-3-8B γ=1.046 — discrete truth, exact.""" | |
| truth = _df_numerical_truth(1.046, 0.90, 2000) | |
| code = D_f_closed(1.046, 0.90, 2000) | |
| assert code == truth | |
| def test_D_f_at_boundary_0_99(): | |
| truth = _df_numerical_truth(0.99, 0.90, 2000) | |
| code = D_f_closed(0.99, 0.90, 2000) | |
| assert code == truth | |
| def test_D_f_at_boundary_1_01(): | |
| truth = _df_numerical_truth(1.01, 0.90, 2000) | |
| code = D_f_closed(1.01, 0.90, 2000) | |
| assert code == truth | |
| # ───────────────────────────────────────────────────────────────────────── | |
| # free_energy_F: physics convention F = -log(Z)/γ | |
| # ───────────────────────────────────────────────────────────────────────── | |
| def test_free_energy_F_physics_convention(): | |
| """F = -T·log(Z) = -log(Z)/γ.""" | |
| for gamma in (0.5, 0.75, 1.0, 1.5): | |
| Z = partition_Z(gamma, 2000) | |
| expected = -math.log(Z) / gamma | |
| code = free_energy_F(gamma, 2000) | |
| assert abs(code - expected) < 1e-8, \ | |
| f"γ={gamma}: code={code}, expected={expected}" | |
| def test_thermodynamic_identity_S_equals_U_minus_F_over_T(): | |
| """Sanity: S = (U − F)/T = γ·(U − F). | |
| Equivalently S = γU + log Z when F = -log Z/γ. | |
| """ | |
| for gamma in (0.5, 0.75, 1.0, 1.5): | |
| Z = partition_Z(gamma, 2000) | |
| U = mean_log_d(gamma, 2000) | |
| F = free_energy_F(gamma, 2000) | |
| S_from_eq = gamma * (U - F) | |
| S_direct = entropy_S(gamma, 2000) | |
| # In our entropy_S = log Z + γU, and corrected F = -log Z/γ ⇒ | |
| # γ(U − F) = γU + log Z = S. So they MUST match. | |
| assert abs(S_from_eq - S_direct) < 1e-8, \ | |
| f"γ={gamma}: S_eq={S_from_eq}, S_direct={S_direct}" | |
| # ───────────────────────────────────────────────────────────────────────── | |
| # C_V at Hagedorn — paper §5.2 was wrong, agent's numerical-derivative is OK | |
| # ───────────────────────────────────────────────────────────────────────── | |
| def test_cv_at_hagedorn_matches_corrected_asymptotic(): | |
| """C_V(γ=1, N) ~ (log N)²/12 + sub-leading corrections. | |
| Agent's numerical derivative gives the exact discrete value; ratio to | |
| the leading asymptotic /12 converges slowly (1/log N rate). | |
| Paper §5.2 said /4 — wrong by factor 3. | |
| """ | |
| # Verify agent does NOT match /4 (paper's claim) | |
| cv_10000 = heat_capacity_Cv(1.0, 10000) | |
| pred_paper_wrong = math.log(10000) ** 2 / 4.0 | |
| assert cv_10000 / pred_paper_wrong < 0.5, "C_V should NOT match paper's /4" | |
| # Verify it DOES converge to /12 from above | |
| ratios = [] | |
| for N in (1000, 10000, 100000): | |
| cv = heat_capacity_Cv(1.0, N) | |
| pred_corrected = math.log(N) ** 2 / 12.0 | |
| ratios.append(cv / pred_corrected) | |
| # Monotone decreasing toward 1 from above | |
| assert ratios[0] > ratios[1] > ratios[2] > 1.0 | |
| assert ratios[-1] < 1.20, f"N=10⁵ ratio should approach 1, got {ratios[-1]:.4f}" | |
| # ───────────────────────────────────────────────────────────────────────── | |
| # Browser df_window — exact in calibrated zone, None outside | |
| # ───────────────────────────────────────────────────────────────────────── | |
| def test_df_window_in_zone(): | |
| """γ=0.748 ∈ [0.65, 0.85]: should match exact paper formula.""" | |
| truth = _df_numerical_truth(0.748, 0.90, 2000) | |
| code = df_window(0.748, 2000, 0.90) | |
| assert code is not None | |
| assert abs(code - truth) <= max(15, 0.02 * truth) | |
| def test_df_window_out_of_zone_returns_None(): | |
| assert df_window(0.5, 2000) is None # too low | |
| assert df_window(0.95, 2000) is None # too high | |
| assert df_window(1.5, 2000) is None # phase B | |
| # ───────────────────────────────────────────────────────────────────────── | |
| # Sanity: theta_design + gamma_pade are inverses | |
| # ───────────────────────────────────────────────────────────────────────── | |
| def test_theta_design_inverts_gamma_pade(): | |
| """θ_design(γ, T) should yield θ such that γ_Padé(θ, T) = γ exactly.""" | |
| for gamma_target in (0.3, 0.5, 0.7, 0.85): | |
| for T in (1000, 2000, 8000): | |
| theta = theta_design(gamma_target, T) | |
| recovered = gamma_pade(theta, T) | |
| assert abs(recovered - gamma_target) < 1e-9 | |
| def test_theta_eff_pade_definition(): | |
| """θ_eff_Padé = θ + T/√2 (paper definition).""" | |
| for theta in (10000, 500000, 1_000_000): | |
| for T in (1000, 2000): | |
| assert abs(theta_eff_pade(theta, T) - (theta + T / math.sqrt(2))) < 1e-9 | |
| # ───────────────────────────────────────────────────────────────────────── | |
| # gamma_decompose: audit-driven calibration changes | |
| # ───────────────────────────────────────────────────────────────────────── | |
| def test_decompose_SWA_disabled(): | |
| """δ_SWA was originally fit on n=1 — must NOT apply correction; status flagged.""" | |
| result = gamma_decompose(0.75, has_SWA=True) | |
| assert result["delta_SWA"] == 0.0 | |
| assert "n1_disabled" in result["delta_SWA_status"] | |
| def test_decompose_GQA_still_active(): | |
| """δ_GQA replicates in panel re-audit (+0.115 vs +0.11 hardcoded).""" | |
| on = gamma_decompose(0.75, has_GQA=True) | |
| off = gamma_decompose(0.75, has_GQA=False) | |
| assert abs(on["delta_GQA"] - 0.11) < 1e-9 | |
| assert off["delta_GQA"] == 0.0 | |
| def test_decompose_v2_warnings_present(): | |
| """v2 must emit calibration_warning.""" | |
| r = gamma_decompose_v2(0.75, n_params_M=500, has_SWA=True, is_instruct=True) | |
| assert "calibration_warning" in r | |
| assert r["delta_SWA"] == 0.0 # disabled | |
| assert "exploratory" in r["delta_SWA_status"] or "n1" in r["delta_SWA_status"] | |