File size: 762 Bytes
4fbc241 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | from __future__ import annotations
class KVCacheSimulator:
    """Estimate KV-cache occupancy and eviction counts for a queued load."""

    def apply(
        self,
        queue_depth: int,
        mean_prompt_length: float,
        kv_budget_fraction: float,
        priority_routing: bool = False,
    ) -> tuple[float, int]:
        """Return ``(occupancy, evictions)`` for the given load.

        Demand is ``queue_depth * mean_prompt_length``. Capacity is
        ``16000.0 * kv_budget_fraction``, floored at 1.0.

        Args:
            queue_depth: Multiplied by ``mean_prompt_length`` to get demand.
            mean_prompt_length: Per-entry size. It is also the divisor
                (floored at 1.0) that turns overflow into an eviction count.
            kv_budget_fraction: Scales the 16000.0 base capacity.
            priority_routing: When true, overflow is measured against 90% of
                capacity rather than the full capacity.

        Returns:
            A tuple of the demand/capacity ratio capped at 1.0 and a
            non-negative eviction count, which is zero unless demand exceeds
            capacity.
        """
        demand = queue_depth * mean_prompt_length
        capacity = max(1.0, 16000.0 * kv_budget_fraction)
        occupancy = min(1.0, demand / capacity)

        if demand > capacity:
            # NOTE(review): on this path the capped occupancy is already 1.0,
            # so the 0.95 comparison looks always-true here -- confirm whether
            # priority eviction was meant to start before capacity is exceeded.
            target = capacity
            if priority_routing and occupancy > 0.95:
                target = capacity * 0.90

            # int() truncates toward zero, so partial entries are not counted.
            overflow = demand - target
            evicted = int(overflow / max(mean_prompt_length, 1.0))
            return occupancy, max(0, evicted)

        return occupancy, 0
|