File size: 2,357 Bytes
2d86c06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
{
  "model_name": "SpecKV-MLP16",
  "description": "Lightweight acceptance rate predictor for adaptive speculative decoding gamma selection.",
  "architecture": {
    "type": "MLPRegressor",
    "hidden_layers": [16],
    "activation": "relu",
    "output": "regression (acceptance rate, 0-1)"
  },
  "input_features": [
    {"name": "mean_draft_entropy", "description": "Mean entropy of draft token distributions (bits)", "dtype": "float32"},
    {"name": "mean_draft_confidence", "description": "Mean top-1 probability of draft tokens", "dtype": "float32"},
    {"name": "max_draft_entropy", "description": "Max entropy of draft token distributions across the step (bits)", "dtype": "float32"},
    {"name": "min_draft_confidence", "description": "Min top-1 probability across draft tokens in the step", "dtype": "float32"},
    {"name": "comp_enc", "description": "Compression level encoding: 0=fp16, 1=int8, 2=nf4", "dtype": "int"},
    {"name": "gamma", "description": "Candidate speculation length to evaluate", "dtype": "int"}
  ],
  "output": {
    "name": "predicted_acceptance_rate",
    "range": [0.0, 1.0],
    "description": "Predicted fraction of draft tokens the target model will accept"
  },
  "usage": {
    "gamma_selection": "For each candidate gamma in {2, 4, 6, 8}, predict the acceptance rate. Select the gamma that maximizes predicted_acceptance_rate * gamma + 1 (the predicted number of accepted draft tokens plus one).",
    "overhead": "About 0.34 ms per decision (4 forward passes through the MLP, one per candidate gamma)"
  },
  "training": {
    "data": "5112 step-level records from Phase 2 profiling",
    "model_pair": "Llama-3.2-1B-Instruct (draft) / Llama-3.2-3B-Instruct (target)",
    "compression_levels": ["fp16", "int8", "nf4"],
    "gamma_values": [2, 4, 6, 8],
    "tasks": ["code", "math", "chat", "summarization"],
    "framework": "scikit-learn 1.x",
    "random_seed": 42
  },
  "performance": {
    "test_mse": 0.090,
    "test_correlation": 0.685,
    "decision_overhead_ms": 0.336,
    "improvement_over_fixed4": "56.0%",
    "statistical_significance": "p < 0.001 (paired bootstrap, 10K resamples)"
  },
  "files": {
    "speckv_mlp16.pkl": "Full sklearn model (pickle format; load only from a trusted source, since unpickling can execute arbitrary code)",
    "speckv_mlp16_weights.npz": "Raw numpy weights for framework-agnostic loading"
  },
  "license": "MIT",
  "paper": "SpecKV: Adaptive Speculative Decoding with Compression-Aware Gamma Selection",
  "repository": "https://github.com/Amorfati123/SpecKV"
}