Shikhar1
/

SpecKV

speculative-decoding

inference-optimization

adaptive-inference

language: en

pipeline_tag: other

Model card Files Files and versions

SpecKV / config.json

Shikhar1's picture

Upload 4 files

6a0834f verified 1 day ago

2.36 kB

	{
	"model_name": "SpecKV-MLP16",
	"description": "Lightweight acceptance rate predictor for adaptive speculative decoding gamma selection.",
	"architecture": {
	"type": "MLPRegressor",
	"hidden_layers": [16],
	"activation": "relu",
	"output": "regression (acceptance rate, 0-1)"
	},
	"input_features": [
	{"name": "mean_draft_entropy", "description": "Mean entropy of draft token distributions (bits)", "dtype": "float32"},
	{"name": "mean_draft_confidence", "description": "Mean top-1 probability of draft tokens", "dtype": "float32"},
	{"name": "max_draft_entropy", "description": "Max entropy across draft tokens in the step", "dtype": "float32"},
	{"name": "min_draft_confidence", "description": "Min top-1 probability across draft tokens in the step", "dtype": "float32"},
	{"name": "comp_enc", "description": "Compression level encoding: 0=fp16, 1=int8, 2=nf4", "dtype": "int"},
	{"name": "gamma", "description": "Candidate speculation length to evaluate", "dtype": "int"}
	],
	"output": {
	"name": "predicted_acceptance_rate",
	"range": [0.0, 1.0],
	"description": "Predicted fraction of draft tokens the target model will accept"
	},
	"usage": {
	"gamma_selection": "For each candidate gamma in {2, 4, 6, 8}, predict acceptance rate. Select gamma that maximizes: predicted_ar * gamma + 1",
	"overhead": "0.34ms per decision (4 forward passes through the MLP)"
	},
	"training": {
	"data": "5112 step-level records from Phase 2 profiling",
	"model_pair": "Llama-3.2-1B-Instruct (draft) / Llama-3.2-3B-Instruct (target)",
	"compression_levels": ["fp16", "int8", "nf4"],
	"gamma_values": [2, 4, 6, 8],
	"tasks": ["code", "math", "chat", "summarization"],
	"framework": "scikit-learn 1.x",
	"random_seed": 42
	},
	"performance": {
	"test_mse": 0.090,
	"test_correlation": 0.685,
	"decision_overhead_ms": 0.336,
	"improvement_over_fixed4": "56.0%",
	"statistical_significance": "p < 0.001 (paired bootstrap, 10K resamples)"
	},
	"files": {
	"speckv_mlp16.pkl": "Full sklearn model (pickle format)",
	"speckv_mlp16_weights.npz": "Raw numpy weights for framework-agnostic loading"
	},
	"license": "MIT",
	"paper": "SpecKV: Adaptive Speculative Decoding with Compression-Aware Gamma Selection",
	"repository": "https://github.com/Amorfati123/SpecKV"
	}