| from __future__ import annotations | |
class KVCacheSimulator:
    """Estimate occupancy and eviction count for a fixed-size cache budget.

    The model is stateless: every call to :meth:`apply` derives its result
    purely from the arguments and the class-level constants below.
    """

    # Budget available when ``kv_budget_fraction`` is 1.0, expressed in the
    # same units as ``queue_depth * mean_prompt_length``
    # (presumably tokens -- TODO confirm against callers).
    FULL_BUDGET = 16000.0

    # With priority routing enabled, an over-budget cache is evicted down to
    # this fraction of the budget instead of down to the budget itself.
    PRIORITY_TARGET_FRACTION = 0.90

    def apply(
        self,
        queue_depth: int,
        mean_prompt_length: float,
        kv_budget_fraction: float,
        priority_routing: bool = False,
    ) -> tuple[float, int]:
        """Return ``(occupancy, evictions)`` for the given load.

        Args:
            queue_depth: Number of queued requests.
            mean_prompt_length: Average size of one request; the total
                demand is ``queue_depth * mean_prompt_length``.
            kv_budget_fraction: Multiplier applied to ``FULL_BUDGET``. The
                resulting budget is never allowed to drop below 1.0.
            priority_routing: When true and demand exceeds the budget,
                evict down to ``PRIORITY_TARGET_FRACTION`` of the budget
                rather than just down to the budget.

        Returns:
            A tuple of the occupancy, clamped to ``[0.0, 1.0]``, and the
            non-negative number of evictions.
        """
        requested = queue_depth * mean_prompt_length
        # Floor the budget at 1.0 so the division below is always defined,
        # even for a zero or negative fraction.
        budget = max(1.0, self.FULL_BUDGET * kv_budget_fraction)

        # Clamp on both sides: negative demand (from bad inputs) must not
        # produce a negative occupancy, mirroring the floor on evictions.
        occupancy = max(0.0, min(1.0, requested / budget))

        if requested <= budget:
            return occupancy, 0

        # Demand exceeds the budget, so occupancy is saturated at 1.0 here;
        # no further occupancy threshold is needed to pick the target.
        target = budget * self.PRIORITY_TARGET_FRACTION if priority_routing else budget

        # Convert the excess into whole requests. Prompt lengths below 1.0
        # are treated as 1.0 to avoid dividing by zero or inflating the
        # count. int() truncates, so an excess smaller than one mean prompt
        # evicts nothing.
        evictions = int((requested - target) / max(mean_prompt_length, 1.0))
        return occupancy, max(0, evictions)