Upload chimera/model.py
chimera/model.py    +5 -11    CHANGED
@@ -254,9 +254,6 @@ class Chimera51ForCausalLM(nn.Module):
             # Evolution modulation every N layers (lightweight)
             evo_mod = None
             if i % self.evo_every_n_layers == 0 and self.evolution is not None:
-                # Compute modulation from semantic memory
-                # Note: loss parameter requires a scalar loss tensor for TTT/surprise;
-                # pass None during standard forward, compute explicitly for TTT
                 evo_result = self.evolution(
                     hidden_states=x.detach() if not x.requires_grad else x,
                     layer_idx=i,
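Note on the hunk above: the call follows a periodic side-module pattern, where every `evo_every_n_layers`-th layer hands the current hidden states to the evolution module, detached whenever no gradient is flowing so the extra pass stays off the autograd graph. A minimal sketch of that pattern, with the module's dict interface assumed rather than taken from this repo:

import torch
import torch.nn as nn

class EvolutionStub(nn.Module):
    # Stand-in for the repo's evolution module; the returned dict is assumed.
    def __init__(self, hidden_size):
        super().__init__()
        self.gate = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states, layer_idx):
        # Per-token multiplicative modulation derived from the hidden states.
        return {'modulation': torch.sigmoid(self.gate(hidden_states)),
                'ttt_delta': None}

evo = EvolutionStub(hidden_size=64)
x = torch.randn(2, 16, 64)  # (batch, seq, hidden)
for i in range(12):
    if i % 4 == 0:  # evo_every_n_layers = 4
        h = x.detach() if not x.requires_grad else x
        evo_result = evo(hidden_states=h, layer_idx=i)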
@@ -270,7 +267,6 @@ class Chimera51ForCausalLM(nn.Module):
             # TTT update for target layers (only in training, no backprop)
             if self.training and evo_result.get('ttt_delta') is not None:
                 with torch.no_grad():
-                    # Apply TTT to MLP down-projection if this is a target layer
                     if hasattr(layer.mlp, 'w_down'):
                         layer.mlp.w_down.data.add_(evo_result['ttt_delta'] * self.evolution.ttt.inner_lr)

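The TTT line kept above is an in-place, no-grad weight edit: the down-projection moves by `ttt_delta * inner_lr`, but no graph is recorded, so the outer training loss never backpropagates through the update. A self-contained sketch of that update rule; the shapes and `inner_lr` value are illustrative, not the repo's:

import torch
import torch.nn as nn

w_down = nn.Parameter(torch.randn(128, 64) * 0.02)  # stand-in for layer.mlp.w_down
ttt_delta = torch.randn_like(w_down)                 # delta from the TTT module
inner_lr = 1e-3                                      # assumed inner learning rate

with torch.no_grad():
    # Mirrors the diff: mutate .data directly so autograd never sees the update.
    w_down.data.add_(ttt_delta * inner_lr)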
@@ -330,11 +326,11 @@ class Chimera51ForCausalLM(nn.Module):
         effective = num_loops
         if effective is None and not self.training and probe_logits is not None:
             effective = self.entropy_valve.get_loop_count(probe_logits)
-        elif effective is None
-            #
-
-
-            effective =
+        elif effective is None:
+            # FIX: During training, use the loop_controller.loop_default directly
+            # instead of running the loop classifier (which calls .item() and is
+            # expensive). The ProgressiveLoopScheduler already sets loop_default.
+            effective = self.loop_controller.loop_default

         # Loop body
         loop_fn = lambda inp: self._run_layers(
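For context on the FIX above: an entropy valve of this kind maps the model's next-token uncertainty on the probe logits to a recurrence depth, and reading that scalar out requires a `.item()` device sync, which is why the training path now skips it in favor of `loop_controller.loop_default`. One plausible shape for such a valve; the function name and thresholds below are assumptions for illustration, not the repo's implementation:

import torch
import torch.nn.functional as F

def entropy_loop_count(probe_logits, min_loops=1, max_loops=4, high_entropy=3.0):
    # Mean next-token entropy over batch and positions.
    log_probs = F.log_softmax(probe_logits, dim=-1)
    entropy = -(log_probs.exp() * log_probs).sum(-1).mean()
    # .item() forces a GPU->CPU sync: cheap once per inference forward,
    # exactly the cost the FIX keeps off the training path.
    frac = min(entropy.item() / high_entropy, 1.0)
    return min_loops + round(frac * (max_loops - min_loops))

probe_logits = torch.randn(2, 16, 32000)  # (batch, seq, vocab)
effective = entropy_loop_count(probe_logits)  # near-uniform logits -> max_loops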
@@ -395,8 +391,6 @@ class Chimera51ForCausalLM(nn.Module):
         # Store episodic case after forward (for inference mode)
         if not self.training and self.evolution is not None:
             last_hidden = x[:, -1, :].detach()
-            # Schedule episodic storage for end of sequence
-            # (In real use, call model.evolution.store_episodic() explicitly)

         return CausalLMOutput(
             loss=loss,
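The two comments deleted in the last hunk pointed to calling `model.evolution.store_episodic()` explicitly after an inference forward. Its real signature is not shown in this diff; assuming it takes the final-position hidden state as the episodic key, usage would look roughly like this sketch:

import torch

class EvolutionStoreStub:
    # Stand-in exposing the store_episodic hook the deleted comment names.
    def __init__(self):
        self.episodic_cases = []

    def store_episodic(self, key):
        # Keep a detached CPU copy of the final hidden state as one "case".
        self.episodic_cases.append(key.detach().cpu())

evolution = EvolutionStoreStub()
x = torch.randn(2, 16, 64)          # hidden states after the forward pass
last_hidden = x[:, -1, :].detach()  # same extraction as the model code above
evolution.store_episodic(last_hidden)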