File size: 762 Bytes
4fbc241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from __future__ import annotations


class KVCacheSimulator:
    """Estimate KV-cache occupancy and evictions for a queue of requests.

    The model is purely arithmetic and stateless: demand is
    ``queue_depth * mean_prompt_length`` and it is compared against a budget
    derived from ``kv_budget_fraction``.
    """

    # Capacity that corresponds to kv_budget_fraction == 1.0, expressed in the
    # same units as mean_prompt_length (presumably tokens -- confirm).
    FULL_BUDGET_CAPACITY = 16000.0
    # Lower bound on the budget so the occupancy division is always defined,
    # even for a zero or negative kv_budget_fraction.
    MIN_BUDGET = 1.0
    # With priority routing, an over-budget cache is drained down to this
    # fraction of the budget rather than to the budget itself.
    PRIORITY_TARGET_FRACTION = 0.90

    def apply(
        self,
        queue_depth: int,
        mean_prompt_length: float,
        kv_budget_fraction: float,
        priority_routing: bool = False,
    ) -> tuple[float, int]:
        """Return ``(occupancy, evictions)`` for the given load.

        Args:
            queue_depth: Number of queued requests.
            mean_prompt_length: Average size of one request.
            kv_budget_fraction: Share of ``FULL_BUDGET_CAPACITY`` available;
                the resulting budget is never smaller than ``MIN_BUDGET``.
            priority_routing: When true and demand exceeds the budget, evict
                down to ``PRIORITY_TARGET_FRACTION`` of the budget instead of
                just down to the budget, which yields more evictions.

        Returns:
            A tuple of the occupancy clamped to ``[0.0, 1.0]`` and the
            non-negative number of requests to evict (truncated toward zero).
        """
        requested = queue_depth * mean_prompt_length
        budget = max(self.MIN_BUDGET, self.FULL_BUDGET_CAPACITY * kv_budget_fraction)

        # Clamp on both sides: negative demand (from invalid negative inputs)
        # must not be reported as a negative occupancy.
        occupancy = min(1.0, max(0.0, requested / budget))

        if not requested > budget:
            return occupancy, 0

        # NOTE(review): the previous code also tested ``occupancy > 0.95``
        # here, but occupancy is always saturated at 1.0 once demand exceeds
        # the budget, so that test could never be false. If the threshold was
        # meant to apply below the budget, that needs a separate change.
        if priority_routing:
            target = budget * self.PRIORITY_TARGET_FRACTION
        else:
            target = budget

        # Convert the excess back into whole requests; the divisor is floored
        # at 1.0 so very short (or zero-length) prompts cannot blow up the count.
        per_request = max(mean_prompt_length, 1.0)
        evictions = int((requested - target) / per_request)
        return occupancy, evictions