WCNegentropy
/

BitTransformerLM

Text Generation

language-modeling

Model card Files Files and versions

BitTransformerLM / bit_transformer /collapse.py

WCNegentropy's picture

🤖 Updated BitTransformerLM from development space

36c78b1 verified 8 months ago

history blame contribute delete

3.24 kB

	import json
	import os
	from typing import Dict, List, Optional, Tuple

	import torch

	from .model import BitTransformerLM
	from .training import train_loop


	def collapse_submodel(
	cluster_data: List[List[int]],
	target_params: Dict,
	floors: Optional[Dict[str, float]] = None,
	max_rounds: int = 3,
	width_scale: float = 1.5,
	forward_kwargs: Optional[Dict] = None,
	) -> Tuple[BitTransformerLM, Dict[str, float]]:
	"""Distill a submodel from clustered bit sequences.

	The routine deepens the target model when telemetry floors are unmet and,
	after the first deepening fails, widens the hidden dimensions by
	``width_scale`` once before retrying. Returns the distilled model and its
	final telemetry metrics.
	"""
	if floors is None:
	floors = {"negentropy": 0.5, "lz_complexity": 0.3, "symbiosis_score": 0.5}

	bit_tensor = torch.tensor(cluster_data, dtype=torch.long)
	n = len(bit_tensor)
	split = max(1, int(0.8 * n))
	train_bits = bit_tensor[:split]
	val_bits = bit_tensor[split:]
	if len(val_bits) == 0:
	val_bits = train_bits

	params = target_params.copy()
	metrics: Dict[str, float] = {}
	width_scaled = False
	for round_idx in range(max_rounds):
	model = BitTransformerLM(**params)
	train_loop(
	model,
	train_bits,
	epochs=2,
	compress_prob=0.5,
	direct_prob=0.0,
	log=False,
	forward_kwargs=forward_kwargs,
	)
	with torch.no_grad():
	logits, telemetry = model(val_bits, **(forward_kwargs or {}))
	neg_k = model.negentropy_logits(logits).mean().item()
	lz_c = model.lz_complexity_logits(logits).mean().item()
	sym_s = telemetry["symbiosis_score"].mean().item()
	metrics = {
	"negentropy": neg_k,
	"lz_complexity": lz_c,
	"symbiosis_score": sym_s,
	}
	if (
	neg_k >= floors["negentropy"]
	and lz_c >= floors["lz_complexity"]
	and sym_s >= floors["symbiosis_score"]
	):
	break
	if round_idx == 0:
	params["num_layers"] = max(1, params.get("num_layers", 1)) + 1
	elif not width_scaled:
	params["d_model"] = int(params.get("d_model", 32) * width_scale)
	params["dim_feedforward"] = int(
	params.get("dim_feedforward", 64) * width_scale
	)
	width_scaled = True
	else:
	params["num_layers"] = max(1, params.get("num_layers", 1)) + 1
	return model, metrics


	def save_distilled_model(
	model: BitTransformerLM,
	path: str,
	metrics: Dict[str, float],
	floors: Optional[Dict[str, float]] = None,
	) -> None:
	"""Serialize a distilled model and its metric summary to disk.

	Weights are written to ``path`` and a ``metrics.json`` file is placed in the
	same directory containing the achieved metrics alongside the target floors.
	"""
	torch.save(model.state_dict(), path)
	payload = {"metrics": metrics, "floors": floors or {}}
	metrics_path = os.path.join(os.path.dirname(path), "metrics.json")
	with open(metrics_path, "w") as f:
	json.dump(payload, f)