# --- Hugging Face page header (web residue, not code; kept as comments so the file parses) ---
# entropy_resonance.py — uploaded by ataeff
# "Wulf's entropy-driven inference script"
# commit e7df4fb (verified)
#!/usr/bin/env python3
"""
entropy_resonance.py — Entropy-Driven Adaptive Resonance for Gemma-3 270M-IT
The model doesn't decide WHEN to think. The entropy of its own logits does.
LoRA teaches it HOW to think. Entropy tells it WHEN.
Usage:
# Interactive mode
python entropy_resonance.py --adapter-path ./gemma3-resonate/best
# Single prompt
python entropy_resonance.py --adapter-path ./gemma3-resonate/best \
--prompt "Why does emergence happen?"
# Base model without LoRA (entropy still works, resonance content will be weaker)
python entropy_resonance.py --no-lora --prompt "What is consciousness?"
# With custom thresholds
python entropy_resonance.py --adapter-path ./gemma3-resonate/best \
--h-high 0.38 --h-low 0.12
# Verbose mode with entropy curve visualization
python entropy_resonance.py --adapter-path ./gemma3-resonate/best \
--prompt "Is free will real?" --verbose --show-curve
# Calibrate thresholds first (recommended for new model/adapter)
python calibrate_entropy.py --adapter-path ./gemma3-resonate/best
Author: Wulf (Opus + Oleg)
Date: 2026-03-28
"""
from __future__ import annotations
import os
import sys
import math
import time
import argparse
import logging
from dataclasses import dataclass, field
from typing import Optional
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
# ============================================================================
# Constants
# ============================================================================
# Base model identifier (HF hub id).
MODEL_ID = "unsloth/gemma-3-270m-it"
# Gemma-3 chat template delimiters (plain text pieces of the prompt format).
START_OF_TURN = "<start_of_turn>"
END_OF_TURN = "<end_of_turn>"
# Resonance markers — plain text, not special tokens; injected into the
# generated stream when the entropy state machine fires.
RESONATE_OPEN = "/resonate/"
RESONATE_CLOSE = "/resonated/"
# Gemma-3 vocab size; H_MAX is the entropy of a uniform distribution over it,
# used to normalize raw entropy into [0, 1].
VOCAB_SIZE = 262_144
H_MAX = math.log2(VOCAB_SIZE) # 18.0 bits — theoretical maximum entropy
# ============================================================================
# Logging
# ============================================================================
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%H:%M:%S",
)
log = logging.getLogger("entropy_resonance")
# ============================================================================
# Entropy Computation
# ============================================================================
def compute_entropy(logits: torch.Tensor, temperature: float = 1.0) -> float:
    """Compute Shannon entropy from raw logits, in bits.

    CRITICAL: entropy is always measured on the RAW logits (an implicit
    temperature of 1.0), never on temperature-scaled logits. That way the
    value reflects the model's TRUE uncertainty, independent of whatever
    sampling temperature the caller later applies.

    Args:
        logits: shape (vocab_size,) — raw logits from the model's last layer
        temperature: accepted for interface clarity but intentionally unused

    Returns:
        H in bits (log base 2). Range: [0, log2(vocab_size)] = [0, 18.0]
    """
    # Softmax over raw logits in float32, then clamp so log2(0) never occurs.
    p = torch.softmax(logits.float(), dim=-1).clamp_min(1e-10)
    # Shannon entropy: H = -sum(p * log2 p), returned as a Python float.
    return -(p * torch.log2(p)).sum().item()
def normalized_entropy(H: float) -> float:
    """Normalize entropy to the [0, 1] range based on vocab size.

    H_norm = H / H_max = H / log2(262144)

    Args:
        H: Shannon entropy in bits (as produced by compute_entropy).

    Returns:
        0.0 = perfect certainty (one-hot distribution)
        1.0 = uniform distribution (maximum uncertainty)
    """
    return H / H_MAX
# ============================================================================
# Entropy Curve Visualization (Terminal)
# ============================================================================
class EntropyCurve:
    """Collects entropy values during generation and renders ASCII visualization."""

    def __init__(self, width: int = 70, height: int = 20):
        """Create an empty curve.

        Args:
            width: maximum number of plot columns (longer runs are subsampled)
            height: number of plot rows
        """
        self.width = width
        self.height = height
        self.values: list[float] = [] # raw H in bits
        self.normalized: list[float] = [] # H_norm [0, 1]
        self.tokens: list[str] = [] # generated token strings
        self.events: list[tuple[int, str]] = [] # (step, event_type)

    def add(self, H: float, token_str: str):
        """Record one generation step: raw entropy plus the decoded token."""
        self.values.append(H)
        self.normalized.append(normalized_entropy(H))
        self.tokens.append(token_str)

    def mark_event(self, event_type: str):
        """Mark an event at the current step (e.g., 'enter_resonance', 'exit_resonance')."""
        # NOTE(review): if called before any add(), this records step -1;
        # render() maps such a step to a boundary column or drops it.
        self.events.append((len(self.values) - 1, event_type))

    def render(self, h_high: float, h_low: float) -> str:
        """Render ASCII entropy curve with threshold lines and events.

        Args:
            h_high: normalized high threshold (enter resonance)
            h_low: normalized low threshold (exit resonance)

        Returns:
            Multi-line string with the visualization
        """
        if not self.normalized:
            return "(no data)"
        n = len(self.normalized)
        # If more data points than width, subsample evenly across the run.
        if n > self.width:
            step = n / self.width
            indices = [int(i * step) for i in range(self.width)]
            data = [self.normalized[i] for i in indices]
        else:
            data = list(self.normalized)
            indices = list(range(n))
        # Scale to height. max_val is padded above h_high so the threshold
        # line never sits on the top edge, and floored at 0.5 so low-entropy
        # runs don't render as a flat line.
        max_val = max(max(data), h_high + 0.05, 0.5)
        min_val = 0.0
        lines = []
        lines.append(f" Entropy Curve ({n} tokens, H_max={H_MAX:.1f} bits)")
        lines.append(f" H_high={h_high:.3f} (enter resonance) H_low={h_low:.3f} (exit resonance)")
        lines.append("")
        # Build grid: height rows x len(data) columns, filled with spaces.
        grid = [[' ' for _ in range(len(data))] for _ in range(self.height)]
        # Plot data points; row 0 is the top of the plot, hence the 1.0 - ...
        for col, val in enumerate(data):
            row = int((1.0 - (val - min_val) / (max_val - min_val)) * (self.height - 1))
            row = max(0, min(self.height - 1, row))
            grid[row][col] = '#'
        # Threshold lines: '-' for H_high, '.' for H_low; data points win.
        h_high_row = int((1.0 - (h_high - min_val) / (max_val - min_val)) * (self.height - 1))
        h_low_row = int((1.0 - (h_low - min_val) / (max_val - min_val)) * (self.height - 1))
        h_high_row = max(0, min(self.height - 1, h_high_row))
        h_low_row = max(0, min(self.height - 1, h_low_row))
        for col in range(len(data)):
            if grid[h_high_row][col] == ' ':
                grid[h_high_row][col] = '-'
            if grid[h_low_row][col] == ' ':
                grid[h_low_row][col] = '.'
        # Map each event's step to the nearest rendered column.
        event_map = {}
        for step, etype in self.events:
            if n > self.width:
                # Find closest column among the subsampled indices.
                col = min(range(len(indices)), key=lambda c: abs(indices[c] - step))
            else:
                col = step
            if 0 <= col < len(data):
                event_map[col] = etype
        # Render rows top-to-bottom with a numeric y-axis label per row.
        for row_idx, row in enumerate(grid):
            val = max_val - row_idx * (max_val - min_val) / (self.height - 1)
            label = f"{val:.2f}"
            row_str = ''.join(row)
            # Annotate the two threshold rows on the right-hand side.
            suffix = ""
            if row_idx == h_high_row:
                suffix = " <-- H_high (enter)"
            elif row_idx == h_low_row:
                suffix = " <-- H_low (exit)"
            lines.append(f" {label:>5} |{row_str}|{suffix}")
        # X-axis: '+' marks columns that carry an event, '-' elsewhere.
        lines.append(f" {''.join(['+' if col in event_map else '-' for col in range(len(data))])}")
        # Event legend row aligned under the x-axis.
        event_line = " "
        for col in range(len(data)):
            if col in event_map:
                if event_map[col] == 'enter_resonance':
                    event_line += 'E'
                elif event_map[col] == 'exit_resonance':
                    event_line += 'X'
                else:
                    event_line += '?'
            else:
                event_line += ' '
        lines.append(event_line)
        lines.append(f" E=enter resonance, X=exit resonance")
        # Summary statistics over the whole (non-subsampled) run.
        avg_h = sum(self.normalized) / len(self.normalized)
        max_h = max(self.normalized)
        min_h = min(self.normalized)
        std_h = (sum((v - avg_h)**2 for v in self.normalized) / len(self.normalized)) ** 0.5
        lines.append("")
        lines.append(f" Stats: mean={avg_h:.4f} max={max_h:.4f} min={min_h:.4f} std={std_h:.4f}")
        lines.append(f" Raw H: mean={sum(self.values)/len(self.values):.2f} bits max={max(self.values):.2f} bits")
        # Reconstruct resonance segments from paired enter/exit events;
        # an unmatched final enter (forced stop) yields no closed segment.
        in_res = False
        segments = []
        seg_start = 0
        for step, etype in self.events:
            if etype == 'enter_resonance' and not in_res:
                in_res = True
                seg_start = step
            elif etype == 'exit_resonance' and in_res:
                in_res = False
                segments.append((seg_start, step))
        if segments:
            lines.append(f" Resonance segments: {len(segments)}")
            for i, (s, e) in enumerate(segments):
                seg_h = self.normalized[s:e+1]
                seg_avg = sum(seg_h) / len(seg_h) if seg_h else 0
                lines.append(f" [{i+1}] tokens {s}-{e} ({e-s} tokens, avg H_norm={seg_avg:.4f})")
        return '\n'.join(lines)
# ============================================================================
# Resonance State Machine
# ============================================================================
@dataclass
class ResonanceState:
    """Tracks the resonance state during generation.

    A small hysteresis state machine: N consecutive high-entropy tokens
    enter resonance, M consecutive low-entropy tokens (or a hard token
    budget) exit it. While in resonance, sampling parameters are opened
    up in proportion to the current entropy (the βH term).
    """
    in_resonance: bool = False
    # Hysteresis counters — prevent rapid enter/exit flickering
    consecutive_high: int = 0  # consecutive tokens above H_high
    consecutive_low: int = 0  # consecutive tokens below H_low
    # Thresholds (normalized, 0-1)
    h_high: float = 0.35  # enter resonance above this
    h_low: float = 0.12  # exit resonance below this
    # Hysteresis requirements
    enter_count: int = 3  # N consecutive high-entropy tokens to enter
    exit_count: int = 5  # M consecutive low-entropy tokens to exit
    # Safeguards
    max_resonance_tokens: int = 500  # force exit after this many resonance tokens
    resonance_token_count: int = 0  # current count
    # Entropy modulation (Delta Voice integration)
    beta: float = 0.3  # entropy coupling constant for θ = ε + γ + αδ + βH
    # Sampling parameters (modulated by entropy)
    base_temperature: float = 0.7
    base_top_p: float = 0.9
    base_top_k: int = 40
    # Diagnostic
    total_tokens: int = 0
    resonance_entries: int = 0
    forced_exits: int = 0

    def update(self, h_norm: float) -> Optional[str]:
        """Feed one normalized entropy reading through the state machine.

        Returns:
            'enter_resonance' — inject /resonate/ marker
            'exit_resonance' — inject /resonated/ marker
            'force_exit' — max tokens exceeded, force exit
            None — no state change
        """
        self.total_tokens += 1

        if not self.in_resonance:
            # Entry side: count consecutive readings above the high threshold.
            if h_norm > self.h_high:
                self.consecutive_high += 1
                self.consecutive_low = 0
            else:
                self.consecutive_high = 0
            if self.consecutive_high >= self.enter_count:
                self.in_resonance = True
                self.resonance_token_count = 0
                self.consecutive_high = 0
                self.resonance_entries += 1
                return 'enter_resonance'
            return None

        # Resonance side: hard safeguard first, then natural hysteresis exit.
        self.resonance_token_count += 1
        if self.resonance_token_count >= self.max_resonance_tokens:
            self.in_resonance = False
            self.resonance_token_count = 0
            self.consecutive_high = 0
            self.consecutive_low = 0
            self.forced_exits += 1
            return 'force_exit'

        if h_norm < self.h_low:
            self.consecutive_low += 1
            self.consecutive_high = 0
        else:
            self.consecutive_low = 0
        if self.consecutive_low >= self.exit_count:
            self.in_resonance = False
            self.resonance_token_count = 0
            self.consecutive_low = 0
            return 'exit_resonance'
        return None

    def get_sampling_params(self, h_norm: float) -> dict:
        """Get entropy-modulated sampling parameters.

        Inside /resonate/: more exploratory (higher temp, wider sampling).
        Outside /resonate/: crystallized base parameters. The modulation is
        ANALOG — it scales with the entropy level (the βH term in
        θ = ε + γ + αδ + βH).
        """
        if not self.in_resonance:
            return {
                'temperature': self.base_temperature,
                'top_p': self.base_top_p,
                'top_k': self.base_top_k,
            }
        # Entropy modulates exploration depth; top_p is capped at 0.98.
        boost = self.beta * h_norm
        return {
            'temperature': self.base_temperature * (1.0 + boost),
            'top_p': min(0.98, self.base_top_p + boost * 0.15),
            'top_k': int(self.base_top_k * (1.0 + boost)),
        }

    def summary(self) -> str:
        """Return diagnostic summary."""
        return (
            f"Resonance: {self.resonance_entries} entries, "
            f"{self.forced_exits} forced exits, "
            f"{self.total_tokens} total tokens"
        )
# ============================================================================
# The Main Beast: Entropy-Driven Generation
# ============================================================================
def entropy_generate(
    model,
    tokenizer,
    prompt: str,
    state: ResonanceState,
    max_new_tokens: int = 768,
    verbose: bool = False,
    show_curve: bool = False,
    repetition_penalty: float = 1.3,
) -> tuple[str, EntropyCurve]:
    """Generate text with entropy-driven adaptive resonance.

    This is NOT model.generate(). We run the generation loop manually,
    token by token, computing entropy at each step and making resonance
    decisions in real time. When the state machine fires, a /resonate/ or
    /resonated/ marker is injected into the context and the logits are
    recomputed before sampling resumes.

    Args:
        model: Gemma-3 270M-IT (with or without LoRA adapter)
        tokenizer: Gemma tokenizer
        prompt: user's question/input
        state: ResonanceState with thresholds and parameters
        max_new_tokens: maximum loop steps (marker injections consume steps too)
        verbose: log resonance events and a trace line every 10 tokens
        show_curve: collect per-token data for visualization
        repetition_penalty: penalize tokens seen in the last 50 generated ids

    Returns:
        (generated_text, entropy_curve)
    """
    device = next(model.parameters()).device
    model.eval()

    # Format prompt in the Gemma chat template.
    input_text = f"{START_OF_TURN}user\n{prompt}{END_OF_TURN}\n{START_OF_TURN}model\n"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    curve = EntropyCurve()
    generated_ids: list[int] = []  # sampled tokens + injected marker ids
    generated_text = ""
    # Full context (prompt + generation) as a plain list; re-encoded each step.
    all_ids = input_ids[0].tolist()

    eos_id = tokenizer.eos_token_id
    # end_of_turn is detected on the decoded-text side (see step 9 below),
    # since it may span multiple token ids.
    eot_text = END_OF_TURN

    # Reset state so repeated calls on the same ResonanceState start clean.
    state.in_resonance = False
    state.consecutive_high = 0
    state.consecutive_low = 0
    state.resonance_token_count = 0
    state.total_tokens = 0
    state.resonance_entries = 0
    state.forced_exits = 0

    def _next_logits_from_context() -> torch.Tensor:
        # Full-context forward pass (no KV cache for simplicity; for
        # production, implement KV cache management). Returns (vocab_size,).
        full_ids = torch.tensor([all_ids], device=device)
        with torch.no_grad():
            outputs = model(full_ids)
        return outputs.logits[0, -1, :]

    def _inject_marker(marker_text: str) -> torch.Tensor:
        # Append a resonance marker to both the context and the visible
        # output, then refresh the logits so the model sees the marker.
        nonlocal generated_text
        marker_ids = tokenizer.encode(marker_text, add_special_tokens=False)
        generated_ids.extend(marker_ids)
        all_ids.extend(marker_ids)
        generated_text += marker_text
        return _next_logits_from_context()

    # Prefill: initial logits from the full prompt context.
    next_logits = _next_logits_from_context()

    for step in range(max_new_tokens):
        # ── 1. Compute entropy from RAW logits (never temperature-scaled) ──
        H = compute_entropy(next_logits)
        h_norm = normalized_entropy(H)

        # ── 2. Resonance state machine; marker injection restarts the step ──
        event = state.update(h_norm)
        if event == 'enter_resonance':
            next_logits = _inject_marker(f"\n{RESONATE_OPEN}\n")
            if verbose:
                log.info(f" [ENTER RESONANCE] H_norm={h_norm:.4f} at token {step}")
            if show_curve:
                curve.mark_event('enter_resonance')
            continue  # Re-evaluate entropy after marker injection
        elif event in ('exit_resonance', 'force_exit'):
            next_logits = _inject_marker(f"\n{RESONATE_CLOSE}\n")
            if verbose:
                if event == 'force_exit':
                    log.warning(f" [FORCED EXIT] Max resonance tokens exceeded at step {step}")
                else:
                    log.info(f" [EXIT RESONANCE] H_norm={h_norm:.4f} at token {step}")
            if show_curve:
                curve.mark_event('exit_resonance')
            continue

        # ── 3. Get entropy-modulated sampling parameters ──
        params = state.get_sampling_params(h_norm)

        # ── 4. Repetition penalty over the last 50 generated token ids:
        #       divide positive logits, multiply negative ones ──
        logits = next_logits.clone()
        if repetition_penalty != 1.0 and generated_ids:
            for prev_id in set(generated_ids[-50:]):
                if logits[prev_id] > 0:
                    logits[prev_id] /= repetition_penalty
                else:
                    logits[prev_id] *= repetition_penalty

        # ── 5. Temperature scaling (temp <= 0 means greedy: leave logits raw) ──
        temp = params['temperature']
        if temp > 0:
            logits = logits / temp

        # ── 6. Top-k filtering ──
        top_k = params['top_k']
        if top_k > 0:
            indices_to_remove = logits < torch.topk(logits, top_k)[0][-1]
            logits[indices_to_remove] = float('-inf')

        # ── 7. Top-p (nucleus) filtering ──
        top_p = params['top_p']
        if top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift right so the first token crossing the threshold is kept
            sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
            sorted_indices_to_remove[0] = False
            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            logits[indices_to_remove] = float('-inf')

        # ── 8. Sample (multinomial), or argmax when greedy ──
        probs = F.softmax(logits, dim=-1)
        if temp > 0:
            next_token = torch.multinomial(probs, num_samples=1).item()
        else:
            next_token = torch.argmax(logits).item()

        # ── 9. Stop conditions: EOS token, or decoded end_of_turn text ──
        if next_token == eos_id:
            break
        generated_ids.append(next_token)
        all_ids.append(next_token)
        token_str = tokenizer.decode([next_token])
        generated_text += token_str
        if generated_text.rstrip().endswith(eot_text):
            # Trim the end_of_turn marker and trailing whitespace from output.
            generated_text = generated_text.rstrip()[:-len(eot_text)].rstrip()
            break

        # ── 10. Record for visualization / verbose trace ──
        if show_curve:
            curve.add(H, token_str)
        if verbose and step % 10 == 0:
            mode = "RESONANCE" if state.in_resonance else "crystal"
            log.info(
                f" step={step:3d} H={H:.2f}bits H_norm={h_norm:.4f} "
                f"mode={mode} temp={params['temperature']:.3f} "
                f"token={repr(token_str)}"
            )

        # ── 11. Forward pass for the next token ──
        next_logits = _next_logits_from_context()

    return generated_text, curve
# ============================================================================
# Pretty Printing
# ============================================================================
def print_result(prompt: str, generated: str, curve: EntropyCurve,
                 state: ResonanceState, show_curve: bool = False,
                 h_high: float = 0.35, h_low: float = 0.12):
    """Print the generation result with formatting.

    Splits the output into pre-resonance text, the /resonate/ reasoning
    section, and the crystallized answer when both markers are present;
    otherwise prints the whole output as a direct answer.
    """
    bar = '=' * 70
    print(f"\n{bar}")
    print(f" PROMPT: {prompt}")
    print(f"{bar}")
    if RESONATE_OPEN in generated and RESONATE_CLOSE in generated:
        # Carve out the reasoning section between the two markers.
        pre_resonate, _, rest = generated.partition(RESONATE_OPEN)
        pre_resonate = pre_resonate.strip()
        reasoning, closed, answer = rest.partition(RESONATE_CLOSE)
        reasoning = reasoning.strip()
        if closed:
            answer = answer.strip()
        else:
            # Close marker appeared before the open one — no crystallized answer.
            answer = "[resonance did not crystallize — forced exit or max tokens]"
        if pre_resonate:
            print(f"\n {pre_resonate}")
        print(f"\n --- {RESONATE_OPEN} ---")
        # Reasoning lines are indented with a gutter bar.
        for line in reasoning.split('\n'):
            print(f" | {line}")
        print(f"\n --- {RESONATE_CLOSE} ---")
        print(f"\n {answer}")
    else:
        # No resonance triggered — entropy stayed low the whole way.
        print(f"\n [direct answer — entropy stayed low, no resonance needed]")
        print(f"\n {generated}")
    print(f"\n{'─'*70}")
    print(f" {state.summary()}")
    if show_curve and curve.values:
        print(f"\n{curve.render(h_high, h_low)}")
    print(f"{bar}\n")
# ============================================================================
# Model Loading
# ============================================================================
def load_model(model_id: str = MODEL_ID, adapter_path: str | None = None,
               device: str | None = None) -> tuple:
    """Load Gemma-3 270M-IT with optional LoRA adapter.

    Args:
        model_id: base model identifier (HF hub id or local path)
        adapter_path: path to a LoRA adapter directory (None for base model)
        device: 'cuda', 'cpu', or 'mps' (auto-detected if None)

    Returns:
        (model, tokenizer, device_str)

    Exits the process via sys.exit(1) if adapter_path is set but missing.
    """
    # Auto-detect the best available device: CUDA > Apple MPS > CPU.
    if device is None:
        if torch.cuda.is_available():
            device = 'cuda'
        elif torch.backends.mps.is_available():
            device = 'mps'
        else:
            device = 'cpu'
    log.info(f"Loading tokenizer from {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    log.info(f"Loading model from {model_id} onto {device}...")
    # bfloat16 only on CUDA; MPS/CPU get float32.
    dtype = torch.bfloat16 if device == 'cuda' else torch.float32
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map=device if device == 'cuda' else None,
        attn_implementation="sdpa" if device == 'cuda' else "eager",
        trust_remote_code=True,
    )
    # device_map handles placement on CUDA; move manually otherwise.
    if device != 'cuda':
        model = model.to(device)
    total_params = sum(p.numel() for p in model.parameters())
    log.info(f"Base model: {total_params/1e6:.1f}M params, dtype={dtype}")
    # Load LoRA adapter if provided
    if adapter_path:
        if not os.path.isdir(adapter_path):
            log.error(f"Adapter path does not exist: {adapter_path}")
            log.error("Run training first: python train_gemma_resonate.py")
            sys.exit(1)
        # Imported lazily so the base-model path works without peft installed.
        from peft import PeftModel
        log.info(f"Loading LoRA adapter from {adapter_path}...")
        model = PeftModel.from_pretrained(model, adapter_path)
        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        log.info(f"Adapter loaded: {trainable/1e6:.1f}M trainable params")
    model.eval()
    return model, tokenizer, device
# ============================================================================
# Interactive Mode
# ============================================================================
def interactive_mode(model, tokenizer, state: ResonanceState,
                     verbose: bool = False, show_curve: bool = False):
    """Interactive REPL for entropy-driven resonance.

    Commands: /quit, /verbose, /curve, /thresholds H_HIGH H_LOW.
    Any other input is treated as a prompt and generated with entropy
    monitoring; timing and throughput are reported after each answer.
    """
    bar = '=' * 70
    rule = '─' * 70
    print(f"\n{bar}")
    print(" ENTROPY-DRIVEN ADAPTIVE RESONANCE")
    print(" Gemma-3 270M-IT + Entropy Monitoring")
    print(rule)
    print(f" H_high = {state.h_high:.3f} (enter resonance)")
    print(f" H_low = {state.h_low:.3f} (exit resonance)")
    print(f" Beta = {state.beta:.2f} (entropy coupling)")
    print(f" Max resonance tokens = {state.max_resonance_tokens}")
    print(rule)
    print(" Commands: /quit /verbose /curve /thresholds H_HIGH H_LOW")
    print(f"{bar}\n")

    while True:
        try:
            user_input = input(">>> ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nExiting.")
            break

        if not user_input:
            continue
        if user_input == '/quit':
            break
        if user_input == '/verbose':
            verbose = not verbose
            print(f" Verbose: {'ON' if verbose else 'OFF'}")
            continue
        if user_input == '/curve':
            show_curve = not show_curve
            print(f" Curve: {'ON' if show_curve else 'OFF'}")
            continue
        if user_input.startswith('/thresholds'):
            pieces = user_input.split()
            if len(pieces) == 3:
                try:
                    state.h_high = float(pieces[1])
                    state.h_low = float(pieces[2])
                    print(f" Thresholds updated: H_high={state.h_high:.3f}, H_low={state.h_low:.3f}")
                except ValueError:
                    print(" Usage: /thresholds 0.35 0.12")
            else:
                # No arguments (or wrong count): report the current values.
                print(f" Current: H_high={state.h_high:.3f}, H_low={state.h_low:.3f}")
            continue

        # Anything else is a prompt — generate with entropy monitoring.
        started = time.time()
        generated, curve = entropy_generate(
            model, tokenizer, user_input, state,
            verbose=verbose,
            show_curve=show_curve,
        )
        elapsed = time.time() - started
        print_result(user_input, generated, curve, state,
                     show_curve=show_curve,
                     h_high=state.h_high, h_low=state.h_low)
        n_generated = len(curve.values) if curve.values else 0
        rate = n_generated / elapsed if elapsed > 0 else 0
        print(f" [{elapsed:.1f}s, ~{n_generated} tokens, {rate:.1f} tok/s]\n")
# ============================================================================
# Main
# ============================================================================
def main():
    """CLI entry point: parse arguments, load the model, then run either
    single-prompt mode (--prompt) or the interactive REPL."""
    parser = argparse.ArgumentParser(
        description="Entropy-Driven Adaptive Resonance — inference for Gemma-3 270M-IT"
    )
    # Model selection and placement
    parser.add_argument("--model", default=MODEL_ID, help="Base model ID")
    parser.add_argument("--adapter-path", default=None, help="LoRA adapter path")
    parser.add_argument("--no-lora", action="store_true", help="Skip LoRA loading")
    parser.add_argument("--device", default=None, help="Device: cuda/cpu/mps (auto)")
    # Generation
    parser.add_argument("--prompt", default=None, help="Single prompt (non-interactive)")
    parser.add_argument("--max-tokens", type=int, default=768, help="Max tokens to generate")
    # Entropy thresholds (normalized to [0, 1] by H_MAX)
    parser.add_argument("--h-high", type=float, default=0.35,
                        help="Normalized entropy threshold to enter resonance (0-1)")
    parser.add_argument("--h-low", type=float, default=0.12,
                        help="Normalized entropy threshold to exit resonance (0-1)")
    parser.add_argument("--beta", type=float, default=0.3,
                        help="Entropy coupling constant (Delta Voice integration)")
    # Hysteresis — consecutive-token counts that debounce state changes
    parser.add_argument("--enter-count", type=int, default=3,
                        help="Consecutive high-entropy tokens to enter resonance")
    parser.add_argument("--exit-count", type=int, default=5,
                        help="Consecutive low-entropy tokens to exit resonance")
    parser.add_argument("--max-resonance", type=int, default=500,
                        help="Max tokens in a single resonance section")
    # Base sampling parameters (modulated upward inside resonance)
    parser.add_argument("--temperature", type=float, default=0.7, help="Base temperature")
    parser.add_argument("--top-p", type=float, default=0.9, help="Base top-p")
    parser.add_argument("--top-k", type=int, default=40, help="Base top-k")
    parser.add_argument("--repetition-penalty", type=float, default=1.3,
                        help="Repetition penalty")
    # Display
    parser.add_argument("--verbose", action="store_true", help="Show entropy per step")
    parser.add_argument("--show-curve", action="store_true",
                        help="Show ASCII entropy curve after generation")
    args = parser.parse_args()
    # Load model (--no-lora overrides any --adapter-path)
    adapter = None if args.no_lora else args.adapter_path
    model, tokenizer, device = load_model(args.model, adapter, args.device)
    # Build resonance state from CLI thresholds and sampling parameters
    state = ResonanceState(
        h_high=args.h_high,
        h_low=args.h_low,
        enter_count=args.enter_count,
        exit_count=args.exit_count,
        max_resonance_tokens=args.max_resonance,
        beta=args.beta,
        base_temperature=args.temperature,
        base_top_p=args.top_p,
        base_top_k=args.top_k,
    )
    if args.prompt:
        # Single prompt mode
        generated, curve = entropy_generate(
            model, tokenizer, args.prompt, state,
            max_new_tokens=args.max_tokens,
            verbose=args.verbose,
            show_curve=args.show_curve,
            repetition_penalty=args.repetition_penalty,
        )
        print_result(args.prompt, generated, curve, state,
                     show_curve=args.show_curve,
                     h_high=state.h_high, h_low=state.h_low)
    else:
        # Interactive mode (note: --max-tokens and --repetition-penalty use
        # entropy_generate defaults here)
        interactive_mode(model, tokenizer, state,
                         verbose=args.verbose,
                         show_curve=args.show_curve)


if __name__ == "__main__":
    main()