| import sys |
| import os |
| import numpy as np |
| from typing import List |
|
|
| |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) |
|
|
| from server.llmserve_environment import LLMServeEnvironment |
| from llmserve_env.models import ServeAction, QuantizationTier |
|
|
def test_quantization_jitter():
    """Verify INT4 quantization shows more TTFT jitter than FP16 (Chiron 2024).

    Runs 50 steps on the static workload at each quantization tier and
    compares the coefficient of variation (CV) of p50 TTFT; INT4 must be
    strictly noisier than FP16.
    """
    print("[INFO] Testing Quantization Jitter (Chiron 2024)...")
    env = LLMServeEnvironment(seed=42)

    def _latency_cv(tier_value):
        # One fresh static-workload episode: 50 steps at the given tier,
        # returning the coefficient of variation of p50 TTFT.
        env.reset(task_id="static_workload")
        latencies = []
        for _ in range(50):
            obs = env.step(ServeAction(quantization_tier=tier_value, batch_cap=200))
            latencies.append(obs.p50_ttft_ms)
        return np.std(latencies) / np.mean(latencies)

    fp16_cv = _latency_cv(QuantizationTier.FP16.value)
    print(f" FP16 CV: {fp16_cv:.4f}")

    int4_cv = _latency_cv(QuantizationTier.INT4.value)
    print(f" INT4 CV: {int4_cv:.4f}")

    assert int4_cv > fp16_cv, f"INT4 Jitter ({int4_cv:.4f}) must be > FP16 Jitter ({fp16_cv:.4f})"
    # "[SUCCESS] ... Verified." for consistency with the other tests in this
    # file (was "[PASS] Quantization Jitter verified.").
    print("[SUCCESS] Quantization Jitter Verified.")
|
|
def test_thermal_throttling():
    """Verify sustained max-batch load trips the thermal throttle flag.

    Warms up with 100 light steps (batch_cap=10), confirms the environment
    reports no throttling, then drives 120 heavy steps (batch_cap=512) and
    asserts the throttle flag is active.
    """
    print("[INFO] Testing Thermal Throttling Trigger...")
    env = LLMServeEnvironment(seed=42)
    env.reset(task_id="static_workload")

    # Light-load warm-up (loop index was unused; use `_` idiomatically).
    for _ in range(100):
        env.step(ServeAction(batch_cap=10))

    obs_normal = env.step(ServeAction(batch_cap=10))
    assert not obs_normal.metadata["is_throttled"], "Should not be throttled yet"

    # Sustained heavy load should push the simulated device into throttling.
    for _ in range(120):
        obs = env.step(ServeAction(batch_cap=512))

    print(f" Step 120: Throttled={obs.metadata['is_throttled']}")
    assert obs.metadata["is_throttled"], "Thermal throttling should be active"
    print("[SUCCESS] Thermal Throttling Verified.")
|
|
def test_priority_preemption():
    """Check that priority routing under a tight KV budget causes preemption.

    On the adversarial multi-tenant workload, repeatedly steps with
    priority_routing enabled and a starved cache (kv_budget_fraction=0.1,
    batch_cap=2) until a preemption event is observed, failing after 40 steps.
    """
    print("[INFO] Testing Priority Preemption...")
    env = LLMServeEnvironment(seed=42)

    env.reset(task_id="adversarial_multitenant")
    preempted = False
    for step_idx in range(40):
        # Starve the KV cache so high-priority traffic must evict someone.
        obs = env.step(ServeAction(priority_routing=True, kv_budget_fraction=0.1, batch_cap=2))
        events = obs.metadata["preemption_events"]
        if events > 0:
            preempted = True
            print(f" Step {step_idx}: Preemption Triggered! Events: {events}")
            break

    assert preempted, "Priority routing should trigger preemption when cache is full"
    print("[SUCCESS] Priority Preemption Verified.")
|
|
def test_speculative_acceptance():
    """Compare speculative-decoding acceptance between chat and API workloads.

    Takes one step at speculation_depth=4 on each workload and asserts the
    static (chat) workload yields a strictly higher acceptance rate than the
    adversarial multi-tenant (API) workload.
    """
    print("[INFO] Testing Speculative Alpha (Chat vs API)...")
    env = LLMServeEnvironment(seed=42)

    # Chat-style traffic.
    env.reset(task_id="static_workload")
    chat_alpha = env.step(ServeAction(speculation_depth=4)).spec_acceptance_rate

    # API-style adversarial traffic.
    env.reset(task_id="adversarial_multitenant")
    api_alpha = env.step(ServeAction(speculation_depth=4)).spec_acceptance_rate

    print(f" Chat Alpha: {chat_alpha:.4f}")
    print(f" API Alpha: {api_alpha:.4f}")
    assert chat_alpha > api_alpha, "Chat should have higher acceptance than API"
    print("[SUCCESS] Speculative Alpha Verified.")
|
|
if __name__ == "__main__":
    # Run every trigger test in order; any failure aborts with exit code 1.
    _all_tests = (
        test_quantization_jitter,
        test_thermal_throttling,
        test_priority_preemption,
        test_speculative_acceptance,
    )
    try:
        for _run in _all_tests:
            _run()
    except Exception as e:
        print(f"\n[FAIL] Trigger Verification Failed: {e}")
        sys.exit(1)
    else:
        print("\n[ALL TESTS PASSED] Physical Binary Triggers are fully functional.")
|
|