File size: 15,466 Bytes
d6b782e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad63797
d6b782e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad63797
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
"""
Preprocessing utilities for ECFlow web app.

Handles:
- CSV/NPZ parsing for both CV and TPD data
- Physical-to-dimensionless unit conversion for CV (Compton convention)
- Formal potential estimation
- Diffusion coefficient estimation via Randles-Sevcik
"""

import io
import numpy as np

# Physical constants
F_CONST = 96485.3329  # Faraday constant (C/mol)
R_CONST = 8.314462    # Gas constant (J/(mol·K))


# =========================================================================
# CV nondimensionalization
# =========================================================================

def nondimensionalize_cv(E_volts, i_amps, v_Vs, E0_V, T_K=298.15,
                         A_cm2=0.0707, C_A_molcm3=1e-6, D_A_cm2s=1e-5, n=1,
                         v_ref_Vs=0.1):
    """
    Convert physical CV data to dimensionless units for the ECFlow model.

    Potential and current are nondimensionalized using the Compton convention
    with the scan-rate-dependent diffusion length d = sqrt(D·RT/(nFv)):
        θ = (E - E₀) / (RT/nF)
        ψ = i / (nFAC·D/d)

    The dimensionless scan rate σ = v / v_ref is computed separately.
    In the Compton convention σ ≡ 1 by construction (d absorbs v), but the
    ECFlow model uses σ as an explicit conditioning variable to distinguish
    experiments at different scan rates. Setting v_ref so that σ spans the
    training range (~0.1–100) gives the model the scan-rate information.

    Args:
        E_volts: potential array (V)
        i_amps: current array (A)
        v_Vs: scan rate (V/s)
        E0_V: formal potential (V)
        T_K: temperature (K)
        A_cm2: electrode area (cm²)
        C_A_molcm3: bulk concentration (mol/cm³)
        D_A_cm2s: diffusion coefficient (cm²/s)
        n: number of electrons
        v_ref_Vs: reference scan rate (V/s) at which σ = 1

    Returns:
        theta: dimensionless potential array
        flux: dimensionless current array
        sigma: dimensionless scan rate (= v_Vs / v_ref_Vs)
    """
    thermal_voltage = R_CONST * T_K / (n * F_CONST)

    theta = (E_volts - E0_V) / thermal_voltage

    d = np.sqrt(D_A_cm2s * R_CONST * T_K / (n * F_CONST * v_Vs))
    flux_scale = n * F_CONST * A_cm2 * C_A_molcm3 * D_A_cm2s / d
    flux = i_amps / flux_scale

    sigma = v_Vs / v_ref_Vs

    return theta.astype(np.float32), flux.astype(np.float32), float(sigma)


def estimate_E0(E, i):
    """
    Estimate formal potential from CV midpoint of anodic/cathodic peaks.

    Args:
        E: potential array (V)
        i: current array (A)

    Returns:
        E0 estimate (V)
    """
    E = np.asarray(E)
    i = np.asarray(i)

    mid = len(E) // 2
    i_anodic = i[:mid] if i[:mid].max() > abs(i[:mid].min()) else i[mid:]
    i_cathodic = i[mid:] if i[mid:].min() < -abs(i[mid:].max()) else i[:mid]

    E_pa = E[np.argmax(i)]
    E_pc = E[np.argmin(i)]

    return float((E_pa + E_pc) / 2.0)


def estimate_D_randles_sevcik(i_peak_A, v_Vs, A_cm2, C_molcm3, n=1, T_K=298.15):
    """
    Estimate diffusion coefficient from Randles-Sevcik equation.

    i_p = 0.4463 * n^(3/2) * F^(3/2) * A * C * sqrt(D * v / (R * T))

    Args:
        i_peak_A: peak current (A)
        v_Vs: scan rate (V/s)
        A_cm2: electrode area (cm^2)
        C_molcm3: concentration (mol/cm^3)
        n: number of electrons
        T_K: temperature (K)

    Returns:
        D estimate (cm^2/s)
    """
    coeff = 0.4463 * n**1.5 * F_CONST**1.5 * A_cm2 * C_molcm3
    if abs(coeff) < 1e-30 or v_Vs <= 0:
        return 1e-5
    ratio = abs(i_peak_A) / coeff
    D = ratio**2 * R_CONST * T_K / v_Vs
    return max(float(D), 1e-10)


# =========================================================================
# CSV parsing
# =========================================================================

def parse_cv_csv(file_content, delimiter=None):
    """
    Parse a CV CSV file with flexible column detection.

    Expected columns: potential (V or mV) and current (A, mA, uA, nA).
    Optionally includes a time column (s) to infer the scan rate.
    Auto-detects column names and units from header.

    Args:
        file_content: string or bytes of CSV content
        delimiter: CSV delimiter (auto-detected if None)

    Returns:
        dict with 'E_V' (potential in V), 'i_A' (current in A),
        and optionally 'scan_rate_Vs' (V/s) if time is available.
    """
    if isinstance(file_content, bytes):
        file_content = file_content.decode("utf-8", errors="replace")

    lines = file_content.strip().split("\n")
    if len(lines) < 2:
        raise ValueError("CSV must have at least a header and one data row")

    if delimiter is None:
        for d in [",", "\t", ";"]:
            if d in lines[0]:
                delimiter = d
                break
        if delimiter is None:
            delimiter = ","

    header = [h.strip().lower() for h in lines[0].split(delimiter)]

    e_col, i_col, t_col = None, None, None
    e_scale, i_scale = 1.0, 1.0

    time_patterns = ["time/s", "time (s)", "time/ms", "time (ms)",
                     "elapsed time", "t/s", "t (s)", "time"]

    potential_patterns = [
        ("e/v", 1.0), ("e (v)", 1.0), ("potential/v", 1.0), ("potential (v)", 1.0),
        ("ewe/v", 1.0), ("working electrode", 1.0),
        ("e/mv", 1e-3), ("e (mv)", 1e-3), ("potential/mv", 1e-3), ("potential (mv)", 1e-3),
        ("voltage", 1.0), ("e", 1.0), ("potential", 1.0),
    ]
    current_patterns = [
        ("i/a", 1.0), ("i (a)", 1.0), ("current/a", 1.0), ("current (a)", 1.0),
        ("<i>/ma", 1e-3),
        ("i/ma", 1e-3), ("i (ma)", 1e-3), ("current/ma", 1e-3), ("current (ma)", 1e-3),
        ("i/ua", 1e-6), ("i (ua)", 1e-6), ("i/µa", 1e-6), ("i (µa)", 1e-6),
        ("current/ua", 1e-6), ("current/µa", 1e-6),
        ("i/na", 1e-9), ("i (na)", 1e-9),
        ("current", 1.0), ("i", 1.0),
    ]

    for idx, col in enumerate(header):
        if t_col is None:
            for pat in time_patterns:
                if pat in col:
                    t_col = idx
                    break
            if t_col == idx:
                continue
        if e_col is None:
            for pat, scale in potential_patterns:
                if pat in col:
                    e_col, e_scale = idx, scale
                    break
        if i_col is None:
            for pat, scale in current_patterns:
                if pat in col:
                    i_col, i_scale = idx, scale
                    break

    if e_col is None or i_col is None:
        non_time = [idx for idx in range(len(header)) if idx != t_col]
        if len(non_time) >= 2:
            e_col, i_col = non_time[0], non_time[1]
        else:
            raise ValueError(
                f"Cannot identify potential/current columns from header: {header}"
            )

    all_cols = {e_col, i_col}
    if t_col is not None:
        all_cols.add(t_col)
    max_col = max(all_cols)

    E_vals, i_vals, t_vals = [], [], []
    for line in lines[1:]:
        parts = line.strip().split(delimiter)
        if len(parts) <= max_col:
            continue
        try:
            E_vals.append(float(parts[e_col]) * e_scale)
            i_vals.append(float(parts[i_col]) * i_scale)
            if t_col is not None:
                t_vals.append(float(parts[t_col]))
        except ValueError:
            continue

    if len(E_vals) < 5:
        raise ValueError(f"Only {len(E_vals)} valid data points found")

    result = {
        "E_V": np.array(E_vals, dtype=np.float32),
        "i_A": np.array(i_vals, dtype=np.float32),
    }

    if t_vals:
        t_arr = np.array(t_vals, dtype=np.float64)
        E_arr = np.array(E_vals, dtype=np.float64)
        mid = len(E_arr) // 2
        dE = np.abs(np.diff(E_arr[:mid]))
        dt = np.abs(np.diff(t_arr[:mid]))
        valid = dt > 1e-12
        if valid.sum() > 10:
            v = float(np.median(dE[valid] / dt[valid]))
            if v > 1e-6:
                result["scan_rate_Vs"] = v

    return result


def parse_tpd_csv(file_content, delimiter=None):
    """
    Parse a TPD CSV file.

    Expected columns: temperature (K or °C) and signal (arb. units).
    Optionally includes a time column (s) to infer the heating rate.
    Auto-detects Celsius vs Kelvin.

    Returns:
        dict with 'T_K' (temperature in K), 'signal' (arb. units),
        and optionally 'beta_Ks' (heating rate in K/s) if time is available.
    """
    if isinstance(file_content, bytes):
        file_content = file_content.decode("utf-8", errors="replace")

    lines = file_content.strip().split("\n")
    if len(lines) < 2:
        raise ValueError("CSV must have at least a header and one data row")

    if delimiter is None:
        for d in [",", "\t", ";"]:
            if d in lines[0]:
                delimiter = d
                break
        if delimiter is None:
            delimiter = ","

    header = [h.strip().lower() for h in lines[0].split(delimiter)]

    t_col, s_col, time_col = None, None, None
    is_celsius = False

    temp_patterns = [
        ("temperature", False), ("temp", False), ("t/k", False), ("t (k)", False),
        ("t/c", True), ("t (c)", True), ("t/°c", True), ("t (°c)", True),
    ]
    signal_patterns = ["signal", "rate", "intensity", "des", "tpd"]
    time_patterns = ["time/s", "time (s)", "time"]

    for idx, col in enumerate(header):
        if t_col is None:
            for pat, celsius in temp_patterns:
                if pat in col:
                    t_col = idx
                    is_celsius = celsius
                    break
        if s_col is None:
            for pat in signal_patterns:
                if pat in col:
                    s_col = idx
                    break
        if time_col is None:
            for pat in time_patterns:
                if pat in col:
                    time_col = idx
                    break

    if t_col is None or s_col is None:
        if len(header) >= 2:
            t_col, s_col = 0, 1
        else:
            raise ValueError(
                f"Cannot identify temperature/signal columns from header: {header}"
            )

    all_cols = {t_col, s_col}
    if time_col is not None:
        all_cols.add(time_col)
    max_col = max(all_cols)

    T_vals, s_vals, time_vals = [], [], []
    for line in lines[1:]:
        parts = line.strip().split(delimiter)
        if len(parts) <= max_col:
            continue
        try:
            T_vals.append(float(parts[t_col]))
            s_vals.append(float(parts[s_col]))
            if time_col is not None:
                time_vals.append(float(parts[time_col]))
        except ValueError:
            continue

    if len(T_vals) < 5:
        raise ValueError(f"Only {len(T_vals)} valid data points found")

    T_arr = np.array(T_vals, dtype=np.float32)
    if is_celsius or T_arr.max() < 200:
        T_arr += 273.15

    result = {
        "T_K": T_arr,
        "signal": np.array(s_vals, dtype=np.float32),
    }

    if time_vals:
        time_arr = np.array(time_vals, dtype=np.float32)
        dt = time_arr[-1] - time_arr[0]
        dT = T_arr[-1] - T_arr[0]
        if dt > 0:
            result["beta_Ks"] = float(dT / dt)

    return result


def parse_dimensionless_cv_csv(file_content, delimiter=None):
    """
    Parse a CSV that already contains dimensionless CV data.

    Expected columns: theta (dimensionless potential), flux (dimensionless current).

    Returns:
        dict with 'theta', 'flux' arrays
    """
    if isinstance(file_content, bytes):
        file_content = file_content.decode("utf-8", errors="replace")

    lines = file_content.strip().split("\n")
    if len(lines) < 2:
        raise ValueError("CSV must have at least a header and one data row")

    if delimiter is None:
        for d in [",", "\t", ";"]:
            if d in lines[0]:
                delimiter = d
                break
        if delimiter is None:
            delimiter = ","

    header = [h.strip().lower() for h in lines[0].split(delimiter)]

    t_col, f_col = None, None
    for idx, col in enumerate(header):
        if t_col is None and any(p in col for p in ["theta", "potential", "e"]):
            t_col = idx
        if f_col is None and any(p in col for p in ["flux", "current", "j", "i"]):
            f_col = idx

    if t_col is None or f_col is None:
        if len(header) >= 2:
            t_col, f_col = 0, 1
        else:
            raise ValueError(f"Cannot identify columns from header: {header}")

    theta_vals, flux_vals = [], []
    for line in lines[1:]:
        parts = line.strip().split(delimiter)
        if len(parts) <= max(t_col, f_col):
            continue
        try:
            theta_vals.append(float(parts[t_col]))
            flux_vals.append(float(parts[f_col]))
        except ValueError:
            continue

    return {
        "theta": np.array(theta_vals, dtype=np.float32),
        "flux": np.array(flux_vals, dtype=np.float32),
    }


# ── TPD summary feature extraction ──────────────────────────────────

MAX_HEATING_RATES = 3
TPD_FEATURES_PER_RATE = 6
TPD_SUMMARY_DIM = MAX_HEATING_RATES * TPD_FEATURES_PER_RATE + MAX_HEATING_RATES  # 21


def extract_tpd_summary_stats(temperature, rate, lengths, heating_rates, n_rates):
    """Extract 21-dim hand-crafted summary statistics from raw TPD data.

    Per heating rate (6 features): normalized peak rate, peak temperature,
    half-peak width, normalized total desorption integral, asymmetry ratio
    (left vs right half-width), log10(peak rate).
    Plus log10(heating_rate) per curve.

    Args:
        temperature: [N, T] array of temperatures (K)
        rate: [N, T] array of desorption rates
        lengths: [N] array of valid lengths per curve
        heating_rates: [N] array of heating rates (K/s)
        n_rates: number of heating rates

    Returns:
        1-D array of shape (21,)
    """
    features = np.zeros(TPD_SUMMARY_DIM, dtype=np.float32)
    for i in range(min(n_rates, MAX_HEATING_RATES)):
        L = int(lengths[i])
        temp = temperature[i, :L]
        r = rate[i, :L]
        peak_abs = np.max(np.abs(r)) + 1e-30

        peak_rate = np.max(r)
        idx_peak = np.argmax(r)
        peak_temp = temp[idx_peak]

        half_max = peak_rate / 2.0
        above_half = r >= half_max
        if np.any(above_half):
            indices = np.where(above_half)[0]
            half_width = temp[indices[-1]] - temp[indices[0]]
            left_width = peak_temp - temp[indices[0]]
            right_width = temp[indices[-1]] - peak_temp
            asymmetry = (right_width - left_width) / (half_width + 1e-30)
        else:
            half_width = 0.0
            asymmetry = 0.0

        if L > 1:
            integral = (np.trapezoid(r, temp)
                        if hasattr(np, 'trapezoid') else np.trapz(r, temp))
        else:
            integral = 0.0

        log_peak = np.log10(peak_abs)

        offset = i * TPD_FEATURES_PER_RATE
        features[offset + 0] = peak_rate / peak_abs
        features[offset + 1] = peak_temp
        features[offset + 2] = half_width
        features[offset + 3] = integral / (peak_abs * (temp.max() - temp.min()) + 1e-30)
        features[offset + 4] = asymmetry
        features[offset + 5] = log_peak

        features[MAX_HEATING_RATES * TPD_FEATURES_PER_RATE + i] = np.log10(heating_rates[i])
    return features