File size: 18,180 Bytes
bd20c6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
"""
tests/test_queueing_controller.py

8 tests for QueueingController (ICML 2026, arXiv:2605.04595).
Covers stability theory, EMA arrival-rate estimation, Welford statistics,
INVARIANT-11, and the Prometheus metrics export.

EMA timing note:
    record_request_arrival() uses time.monotonic() internally (not the
    timestamp argument) to measure inter-arrival dt for the EMA update.
    Tests drive real elapsed time via time.sleep(). A window_seconds of
    1.0–2.0 s is used so EMA samples persist for multiple iterations,
    enabling convergence in 5–15 steps.
"""

import math
import random
import time
from typing import List, Tuple

import pytest

from contextforge.scheduling.queueing_controller import (
    QueueingController,
    QueueingConfig,
    StabilityState,
)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _make_random_params(seed: int) -> List[Tuple[float, float, int]]:
    """Generate 50 deterministic (lambda, mu, blocks) tuples."""
    rng = random.Random(seed)
    params = []
    for _ in range(50):
        lam = rng.uniform(0.05, 5.0)
        mu = rng.uniform(0.3, 8.0)
        blk = rng.randint(8, 512)
        params.append((lam, mu, blk))
    return params


RANDOM_PARAMS = _make_random_params(seed=42)


# ---------------------------------------------------------------------------
# Test class
# ---------------------------------------------------------------------------

class TestQueueingController:
    """8 tests for QueueingController (ICML 2026)."""

    # -----------------------------------------------------------------------
    # test_stability_under_low_load
    # -----------------------------------------------------------------------
    def test_stability_under_low_load(self):
        """
        λ=0.5 req/sec, μ=2.0 req/sec → ρ≈0.25, is_stable=True.

        25 arrivals with 2 s sleep give inter-arrival dt=2 s.
        Service time 0.5 s → μ = 1/0.5 = 2.0.
        With 25 completions service_stats.count=25 ≥ 10 (no fallback).
        """
        ctrl = QueueingController(QueueingConfig(window_seconds=2.0))

        inter_arrival = 2.0    # → λ = 0.5
        service_time = 0.5     # → μ = 2.0

        now = time.monotonic()
        for i in range(25):
            ctrl.record_request_arrival(now, token_count=128, agent_id="a")
            ctrl.record_request_completion(
                now + service_time,
                service_time_ms=service_time * 1000.0,
                blocks_consumed=16,
                agent_id="a",
            )
            time.sleep(inter_arrival)
            now = time.monotonic()

        state = ctrl.compute_stability_state(
            current_free_blocks=128,
            total_blocks=256,
        )

        assert 0.15 <= state.utilization_rho <= 0.40, (
            f"Expected rho≈0.25, got {state.utilization_rho}"
        )
        assert state.is_stable is True, (
            f"System should be stable at rho={state.utilization_rho}"
        )
        assert state.minimum_stable_blocks <= 128

    # -----------------------------------------------------------------------
    # test_instability_detection
    # -----------------------------------------------------------------------
    def test_instability_detection(self):
        """
        λ≈5 req/sec, μ=2 req/sec → theoretical ρ=2.5 (clamped to 0.9999).

        25 arrivals at 0.2 s intervals drive the EMA to λ≈5.
        Service time 0.5 s → μ=2.

        is_stable = False when current_free_blocks (20) < minimum_stable_blocks (42),
        even though rho < 1.0 — the M/G/1 free-blocks floor is violated first.
        """
        ctrl = QueueingController(QueueingConfig(window_seconds=2.0))

        inter_arrival = 0.2    # → λ = 5.0 (EMA converges here)
        service_time = 0.5     # → μ = 2.0

        now = time.monotonic()
        for i in range(25):
            ctrl.record_request_arrival(now, token_count=128, agent_id="a")
            ctrl.record_request_completion(
                now + service_time,
                service_time_ms=service_time * 1000.0,
                blocks_consumed=16,
                agent_id="a",
            )
            time.sleep(inter_arrival)
            now = time.monotonic()

        # With lambda≈5, E[S]=0.5, E[blocks]=16, safety_margin=1.15:
        #   minimum_stable_blocks = ceil(5 * 0.5 * 16 * 1.15) = 46
        # Setting current_free_blocks=20 < 46 triggers is_stable=False
        # regardless of rho (which is clamped at 0.9999).
        state = ctrl.compute_stability_state(
            current_free_blocks=20,
            total_blocks=512,
        )

        # EMA lambda should be close to 5.0 (the driven arrival rate)
        assert state.arrival_rate_lambda >= 4.0, (
            f"Expected λ EMA ≥4.0, got {state.arrival_rate_lambda}"
        )
        # is_stable=False because free_blocks < minimum_stable_blocks
        assert state.is_stable is False, (
            f"System should be unstable: free_blocks=20 < minimum={state.minimum_stable_blocks} "
            f"(lambda={state.arrival_rate_lambda})"
        )

    # -----------------------------------------------------------------------
    # test_invariant_11_never_violated
    # -----------------------------------------------------------------------
    @pytest.mark.parametrize("lambda_val,mu_val,blocks", RANDOM_PARAMS)
    def test_invariant_11_never_violated(
        self, lambda_val: float, mu_val: float, blocks: int
    ):
        """
        INVARIANT-11: after every get_eviction_target_blocks() call,
        free_blocks_after_eviction >= minimum_stable_blocks.

        Uses window_seconds=1.0 and inter_arrival=0.1 s so the EMA
        converges quickly (alpha=0.095 per step → ~10 steps to steady state).
        12 iterations give service_stats.count=12 (≥ 10 threshold, no fallback).

        Only sub-case (b) is tested here (eviction triggered), because for
        large-λ random params the minimum floor exceeds available space,
        making the "no eviction needed" path unreachable with this setup.

        Assertion: result_free >= minimum_stable_blocks after eviction.
        """
        ctrl = QueueingController(QueueingConfig(window_seconds=1.0))

        inter_arrival = 0.1   # fast convergence: alpha=0.095 per step
        service_time_s = min(1.0 / mu_val if mu_val > 0 else 1.0, 1.0)

        now = time.monotonic()
        for _ in range(12):
            ctrl.record_request_arrival(now, token_count=128, agent_id="a")
            ctrl.record_request_completion(
                now + service_time_s,
                service_time_ms=service_time_s * 1000.0,
                blocks_consumed=blocks,
                agent_id="a",
            )
            time.sleep(inter_arrival)
            now = time.monotonic()

        total_blocks = max(2 * blocks, 512)

        # Sub-case (b): eviction triggered — verify INVARIANT-11
        # Use current_free = total_blocks/2 and request blocks/2
        # to force projected below floor, triggering eviction.
        available = total_blocks // 2
        requested = max(1, blocks // 2)

        state = ctrl.compute_stability_state(
            current_free_blocks=available,
            total_blocks=total_blocks,
        )
        target = ctrl.get_eviction_target_blocks(
            current_free_blocks=available,
            total_blocks=total_blocks,
            requested_new_blocks=requested,
        )

        # After eviction: result_free = projected_before + evicted
        result_free = available - requested + target
        assert result_free >= state.minimum_stable_blocks, (
            f"INVARIANT-11 violation: result_free={result_free} "
            f"< minimum_stable_blocks={state.minimum_stable_blocks} "
            f"(lambda={lambda_val}, mu={mu_val}, blocks={blocks})"
        )

    # -----------------------------------------------------------------------
    # test_quantization_bits_ladder
    # -----------------------------------------------------------------------
    @pytest.mark.parametrize(
        "target_rho,expected_bits",
        [
            (0.65, 16),   # < 0.70 → 16-bit
            (0.78, 8),    # 0.70 ≤ ρ < 0.85 → 8-bit
            (0.90, 4),    # 0.85 ≤ ρ < 0.95 → 4-bit
            (0.97, 2),    # ≥ 0.95 → 2-bit
        ],
    )
    def test_quantization_bits_ladder(self, target_rho: float, expected_bits: int):
        """
        get_recommended_quantization_bits() returns the correct bit-width
        for each utilisation regime in arXiv:2605.04595 Table 2.

        Uses inter_arrival=0.1 s (fast convergence) and 15 iterations.
        With window_seconds=1.0 and dt=0.1s → alpha=0.095, EMA converges
        in ~15 steps. Service stats.count=15 (≥ 10, no fallback).
        """
        ctrl = QueueingController(QueueingConfig(window_seconds=1.0))

        mu = 2.0
        lam = target_rho * mu
        inter_arrival = 1.0 / lam
        service_time_s = 1.0 / mu   # 0.5 s

        now = time.monotonic()
        for _ in range(15):
            ctrl.record_request_arrival(now, token_count=128, agent_id="a")
            ctrl.record_request_completion(
                now + service_time_s,
                service_time_ms=service_time_s * 1000.0,
                blocks_consumed=16,
                agent_id="a",
            )
            time.sleep(inter_arrival)
            now = time.monotonic()

        state = ctrl.compute_stability_state(
            current_free_blocks=128,
            total_blocks=256,
        )

        # EMA may be somewhat off; accept ±10% tolerance
        assert abs(state.utilization_rho - target_rho) < 0.10, (
            f"rho={state.utilization_rho:.4f} too far from target={target_rho}"
        )

        bits = ctrl.get_recommended_quantization_bits()
        assert bits == expected_bits, (
            f"For rho={state.utilization_rho:.4f} "
            f"expected bits={expected_bits}, got {bits}"
        )

    # -----------------------------------------------------------------------
    # test_ema_arrival_rate
    # -----------------------------------------------------------------------
    def test_ema_arrival_rate(self):
        """
        6 requests at exactly 1.0 s intervals (λ=1.0 req/sec).

        With window_seconds=1.0 and dt=1.0s → α=1-exp(-1/1)=0.632.
        After 6 arrivals (5 EMA updates) the estimate is well above the
        fallback threshold (0.1) and reflects the true rate.

        We also ensure service_stats.count ≥ 10 so the controller is
        not in fallback mode (μ uses real estimates, not 1.0).
        """
        config = QueueingConfig(window_seconds=1.0)
        ctrl = QueueingController(config)

        now = time.monotonic()
        for i in range(12):   # 12 arrivals + completions → service_stats.count=12 ≥ 10
            ctrl.record_request_arrival(now, token_count=256, agent_id="a")
            ctrl.record_request_completion(
                now + 0.4,
                service_time_ms=400.0,
                blocks_consumed=16,
                agent_id="a",
            )
            time.sleep(1.0)
            now = time.monotonic()

        state = ctrl.compute_stability_state(
            current_free_blocks=64,
            total_blocks=256,
        )

        # Lambda from EMA must be above fallback (0.1)
        assert state.arrival_rate_lambda > 0.1, (
            f"Expected λ from EMA (>0.1), got {state.arrival_rate_lambda}"
        )
        # With α=0.632 and 5 updates, EMA converges to roughly the true rate (≈1.0)
        assert 0.5 <= state.arrival_rate_lambda <= 2.5, (
            f"Expected λ≈1.0 (±factor 2.5), got {state.arrival_rate_lambda}"
        )

    # -----------------------------------------------------------------------
    # test_welford_service_time
    # -----------------------------------------------------------------------
    def test_welford_service_time(self):
        """
        100 completions with deterministic service time 500 ms.
        Welford mean must converge to 0.5 s; variance must be near 0.

        Also verified with heterogeneous samples to confirm correct
        Welford updates across the full value range.
        """
        ctrl = QueueingController(QueueingConfig())

        service_time_ms = 500.0
        n = 100
        now = time.monotonic()

        for i in range(n):
            ctrl.record_request_completion(
                now + i * 0.01,
                service_time_ms=service_time_ms,
                blocks_consumed=16,
                agent_id="a",
            )

        state = ctrl.compute_stability_state(
            current_free_blocks=64,
            total_blocks=256,
        )

        # E[S] = 0.5 s → μ = 1/0.5 = 2.0
        assert abs(state.service_rate_mu - 2.0) < 0.05, (
            f"Expected μ≈2.0, got {state.service_rate_mu}"
        )
        e_service = 1.0 / state.service_rate_mu
        assert abs(e_service - 0.5) < 0.02, (
            f"Expected E[S]=0.5 s, got {e_service:.4f} s"
        )

        # ---- Heterogeneous: linear sweep [0.4, 0.6] s → true mean = 0.5 s
        ctrl2 = QueueingController(QueueingConfig())
        for i in range(100):
            svc = 0.4 + (i / 99.0) * 0.2
            ctrl2.record_request_completion(
                now + i * 0.01,
                service_time_ms=svc * 1000.0,
                blocks_consumed=16,
                agent_id="a",
            )

        state2 = ctrl2.compute_stability_state(
            current_free_blocks=64,
            total_blocks=256,
        )
        e_service2 = 1.0 / state2.service_rate_mu
        assert 0.45 <= e_service2 <= 0.55, (
            f"Heterogeneous: expected E[S]≈0.5, got {e_service2:.4f}"
        )

    # -----------------------------------------------------------------------
    # test_fallback_on_insufficient_data
    # -----------------------------------------------------------------------
    def test_fallback_on_insufficient_data(self):
        """
        When < 10 service completions have been recorded, fallback values:

            λ_fallback = 0.1  req/sec
            E[S]_fallback = 1.0 s  → μ = 1.0 req/sec
            E[blocks]_fallback = config.block_size = 16

        Scenarios:
          (a) cold start — no data at all
          (b) partial data — 5 arrivals but 0 completions
        """
        config = QueueingConfig(block_size=16)

        # (a) Cold start — zero arrivals, zero completions
        ctrl_cold = QueueingController(config)
        state_cold = ctrl_cold.compute_stability_state(
            current_free_blocks=64,
            total_blocks=256,
        )

        assert state_cold.arrival_rate_lambda == 0.1, (
            f"Expected λ_fallback=0.1, got {state_cold.arrival_rate_lambda}"
        )
        assert state_cold.service_rate_mu == 1.0, (
            f"Expected μ_fallback=1.0, got {state_cold.service_rate_mu}"
        )
        assert state_cold.mean_blocks_per_request == 16.0, (
            f"Expected E[blocks]_fallback=16, "
            f"got {state_cold.mean_blocks_per_request}"
        )

        # (b) 5 arrivals, 0 completions → service_stats.count = 0 (< 10)
        ctrl_partial = QueueingController(config)
        now = time.monotonic()
        for _ in range(5):
            ctrl_partial.record_request_arrival(now, token_count=128, agent_id="a")
            time.sleep(0.01)
            now = time.monotonic()

        state_partial = ctrl_partial.compute_stability_state(
            current_free_blocks=64,
            total_blocks=256,
        )

        # service_stats.count = 0 (< 10) → fallback must be active
        assert state_partial.service_rate_mu == 1.0, (
            f"Expected μ_fallback=1.0 with 0 completions, "
            f"got {state_partial.service_rate_mu}"
        )
        assert state_partial.mean_blocks_per_request == 16.0, (
            f"Expected E[blocks]_fallback=16, "
            f"got {state_partial.mean_blocks_per_request}"
        )

    # -----------------------------------------------------------------------
    # test_export_metrics_keys
    # -----------------------------------------------------------------------
    def test_export_metrics_keys(self):
        """
        export_metrics() returns exactly 7 Prometheus-compatible keys,
        all numeric and non-NaN.
        """
        config = QueueingConfig(window_seconds=1.0)
        ctrl = QueueingController(config)

        # Feed enough data to exit fallback regime
        inter_arrival = 1.0
        service_time = 0.4
        now = time.monotonic()
        for i in range(20):
            ctrl.record_request_arrival(now, token_count=128, agent_id="a")
            ctrl.record_request_completion(
                now + service_time,
                service_time_ms=service_time * 1000.0,
                blocks_consumed=16,
                agent_id="a",
            )
            time.sleep(inter_arrival)
            now = time.monotonic()

        metrics = ctrl.export_metrics()

        expected_keys = [
            "queueing_lambda",
            "queueing_mu",
            "queueing_rho",
            "queueing_is_stable",
            "queueing_lambda_critical",
            "queueing_minimum_stable_blocks",
            "queueing_stability_margin_pct",
        ]

        assert set(metrics.keys()) == set(expected_keys), (
            f"Expected keys {expected_keys}, got {sorted(metrics.keys())}"
        )

        for key in expected_keys:
            val = metrics[key]
            assert isinstance(val, (int, float)), (
                f"Metric {key} has non-numeric value: {val!r}"
            )
            assert not math.isnan(val), f"Metric {key} is NaN"

        assert metrics["queueing_is_stable"] in (0.0, 1.0), (
            f"queueing_is_stable should be 0.0 or 1.0, "
            f"got {metrics['queueing_is_stable']}"
        )

        for key in expected_keys:
            assert metrics[key] >= 0.0, f"Metric {key} is negative: {metrics[key]}"