ml-intern
File size: 8,696 Bytes
f7eba79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
"""
AlphaEvolve-style Evolutionary Formula Discovery for Zeros
=============================================================
Inspired by arXiv:2511.02864 (AlphaEvolve, Terence Tao et al.)

Key idea: Use an evolutionary coding agent to discover empirical formulas
for zeta zero statistics. Instead of searching in proof space, we search
in CODE space — Python expressions that approximate observed data.

Fitness function: agreement with empirical zero statistics (pair correlation,
spacing distribution, etc.).

This is the "AlphaFold for math" approach: end-to-end search for formulas
that match data, then verify if they generalize.
"""

import numpy as np
from typing import Dict, List, Callable
import random
import math


class FormulaGene:
    """A candidate formula as a Python expression string."""
    def __init__(self, expression: str, fitness: float = None):
        self.expression = expression
        self.fitness = fitness

    def __repr__(self):
        return f"Gene({self.expression}, fitness={self.fitness})"


class AlphaFoldMathEvolver:
    """
    Evolutionary search for formulas matching zero statistics.
    
    Target: Find a closed-form formula f(n) that approximates the n-th
    zero spacing s_n = γ_{n+1} - γ_n.
    
    Expression grammar:
        - Variables: n, log(n), sqrt(n)
        - Constants: pi, e, numbers 1-10
        - Operations: +, -, *, /, exp, log, sqrt, sin, cos
        
    Fitness: L2 distance between f(n) and actual normalized spacings.
    """

    def __init__(self, zeros: List[float]):
        self.zeros = np.array(zeros)
        self.spacings = np.diff(self.zeros)
        self.normalized_spacings = self.spacings / np.mean(self.spacings)
        self.results = {}

    def _random_expression(self, depth: int = 0, max_depth: int = 4) -> str:
        """Generate a random mathematical expression."""
        if depth >= max_depth or random.random() < 0.3:
            # Terminal
            terminals = ['n', 'math.log(n+1)', 'math.sqrt(n)',
                         'math.pi', 'math.e', '1', '2', '0.5']
            return random.choice(terminals)

        # Non-terminal
        ops = ['+', '-', '*', '/']
        funcs = ['math.exp', 'math.log', 'math.sqrt', 'math.sin', 'math.cos']

        if random.random() < 0.5:
            op = random.choice(ops)
            left = self._random_expression(depth + 1, max_depth)
            right = self._random_expression(depth + 1, max_depth)
            # Protect against division by zero
            if op == '/':
                return f"({left}) / ({right} + 0.001)"
            return f"({left}) {op} ({right})"
        else:
            func = random.choice(funcs)
            arg = self._random_expression(depth + 1, max_depth)
            # Protect log domain
            if func == 'math.log':
                return f"{func}(abs({arg}) + 0.001)"
            return f"{func}({arg})"

    def _evaluate_expression(self, expr: str, n_vals: np.ndarray) -> np.ndarray:
        """Safely evaluate expression for array of n values."""
        try:
            # Create safe namespace
            namespace = {'n': n_vals, 'math': math, 'np': np}
            result = eval(expr, {"__builtins__": {}}, namespace)
            result = np.array(result, dtype=float)
            # Ensure 1D array of correct length
            if result.ndim == 0:
                result = np.full(len(n_vals), float(result))
            # Filter invalid values
            result = np.where(np.isfinite(result), result, 0)
            return result
        except Exception:
            return np.zeros(len(n_vals))

    def _fitness(self, expr: str, sample_size: int = 1000) -> float:
        """Compute fitness: negative MSE between formula and actual spacings."""
        n_vals = np.arange(1, min(sample_size + 1, len(self.normalized_spacings)))
        predicted = self._evaluate_expression(expr, n_vals)
        actual = self.normalized_spacings[:len(n_vals)]

        if len(predicted) != len(actual):
            return -1e10

        # Normalize predicted to match actual scale
        if np.std(predicted) > 0:
            predicted = (predicted - np.mean(predicted)) / np.std(predicted)
            predicted = predicted * np.std(actual) + np.mean(actual)

        mse = np.mean((predicted - actual) ** 2)
        # Also reward simplicity (shorter expressions)
        complexity_penalty = len(expr) * 0.0001
        return -mse - complexity_penalty

    def _mutate(self, expr: str) -> str:
        """Randomly mutate an expression."""
        mutations = [
            lambda e: e + f" + {random.choice(['1', '0.1', 'n*0.01'])}",
            lambda e: f"math.sin({e})" if 'sin' not in e else e,
            lambda e: f"math.log(abs({e}) + 0.001)" if 'log' not in e else e,
            lambda e: f"({e}) * {random.choice(['0.9', '1.1', '2'])}",
            lambda e: self._random_expression(depth=1, max_depth=3),
        ]
        return random.choice(mutations)(expr)

    def _crossover(self, expr1: str, expr2: str) -> str:
        """Combine two expressions."""
        if random.random() < 0.5:
            return f"({expr1}) * 0.5 + ({expr2}) * 0.5"
        return f"({expr1}) / (abs({expr2}) + 0.001)"

    def evolve(self, population_size: int = 50, generations: int = 30,
               sample_size: int = 500) -> Dict:
        """Run evolutionary search."""
        print(f"  [AlphaFoldMath] Evolving formulas for {sample_size} spacings...")

        # Initialize population
        population = [FormulaGene(self._random_expression()) for _ in range(population_size)]

        best_fitness_history = []

        for gen in range(generations):
            # Evaluate fitness
            for gene in population:
                if gene.fitness is None:
                    gene.fitness = self._fitness(gene.expression, sample_size)

            # Sort by fitness
            population.sort(key=lambda g: g.fitness, reverse=True)
            best_fitness_history.append(population[0].fitness)

            if gen % 10 == 0:
                print(f"    Gen {gen}: best fitness = {population[0].fitness:.6f}")

            # Selection: keep top 20%
            elite_count = max(1, population_size // 5)
            new_population = population[:elite_count]

            # Generate offspring
            while len(new_population) < population_size:
                if random.random() < 0.7 and len(population) >= 2:
                    # Crossover
                    p1, p2 = random.sample(population[:elite_count*2], 2)
                    child_expr = self._crossover(p1.expression, p2.expression)
                else:
                    # Mutation
                    parent = random.choice(population[:elite_count*2])
                    child_expr = self._mutate(parent.expression)

                new_population.append(FormulaGene(child_expr))

            population = new_population

        # Final evaluation
        for gene in population:
            if gene.fitness is None:
                gene.fitness = self._fitness(gene.expression, sample_size)
        population.sort(key=lambda g: g.fitness, reverse=True)

        best = population[0]
        n_vals = np.arange(1, min(sample_size + 1, len(self.normalized_spacings)))
        predicted = self._evaluate_expression(best.expression, n_vals)

        self.results = {
            'strategy': 'alphafold_math_evolutionary_formula',
            'population_size': population_size,
            'generations': generations,
            'best_expression': best.expression,
            'best_fitness': float(best.fitness),
            'fitness_history': [float(f) for f in best_fitness_history],
            'predicted_vs_actual': {
                'predicted_mean': float(np.mean(predicted)),
                'actual_mean': float(np.mean(self.normalized_spacings[:sample_size])),
                'predicted_std': float(np.std(predicted)),
                'actual_std': float(np.std(self.normalized_spacings[:sample_size])),
            },
            'interpretation': "Negative fitness = -MSE. Higher = better agreement.",
        }
        return self.results

    def summary(self) -> str:
        r = self.results
        s = f"AlphaFold-Math Formula Evolver\n{'='*50}\n"
        s += f"Best formula: {r['best_expression']}\n"
        s += f"Best fitness: {r['best_fitness']:.6f}\n"
        pv = r['predicted_vs_actual']
        s += f"Predicted mean={pv['predicted_mean']:.4f} vs actual={pv['actual_mean']:.4f}\n"
        s += f"Predicted std={pv['predicted_std']:.4f} vs actual={pv['actual_std']:.4f}\n"
        s += f"Note: This is an empirical formula discovered by evolution, not proven.\n"
        return s