# swarm-os / backend / engine / tensor_challenges.py
"""
Tensor Challenge Generator β€” The Dummy Workload Factory
========================================================
Generates precisely calibrated PyTorch tensor workloads that are injected
into the AI's code AFTER their solution, creating the physics test:
Naive code: Loads entire tensor β†’ hits 501MB β†’ OOMKilled / CUDA OOM
Genius code: Uses checkpointing / mixed precision β†’ peaks at ~150MB β†’ PASS
The challenges are designed so that:
- The raw memory footprint EXCEEDS 500MB (the sandbox limit)
- But the mathematical workload CAN be completed within 500MB
if the AI uses efficient strategies
Challenge tiers:
TIER_1 (Warm-up): ~600MB raw, trivially solvable with fp16
TIER_2 (Standard): ~800MB raw, requires checkpointing OR mixed precision
TIER_3 (Adversarial): ~1.2GB raw, requires checkpointing AND mixed precision
"""
import textwrap
import logging
logger = logging.getLogger("swarm-os.tensor-challenges")
# ── Challenge Definitions ──
CHALLENGES = {
# ─── TIER 1: Warm-up ───
# ~600MB in fp32. Solvable with a single optimization.
# A 4-layer MLP with a fat hidden dimension processing a large batch.
"tier_1_mlp_overfit": {
"name": "MLP Overfitting Stress Test",
"tier": 1,
"raw_memory_mb": 600,
"description": "Dense MLP with oversized hidden layers. Naive forward pass exceeds 500MB.",
"hint_to_sre": "Try torch.autocast(dtype=torch.float16) to halve memory.",
"code": textwrap.dedent("""\
# ═══ TENSOR CHALLENGE: TIER 1 β€” MLP Overfit ═══
# Raw memory footprint: ~600MB (fp32)
# Target: Process without exceeding 500MB VRAM
import torch
import torch.nn as nn
class StressModel(nn.Module):
def __init__(self):
super().__init__()
self.layers = nn.Sequential(
nn.Linear(4096, 2048),
nn.ReLU(),
nn.Linear(2048, 2048),
nn.ReLU(),
nn.Linear(2048, 2048),
nn.ReLU(),
nn.Linear(2048, 1000),
)
def forward(self, x):
return self.layers(x)
_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_model = StressModel().to(_device)
# Massive batch: 8192 samples Γ— 4096 features Γ— 4 bytes = 128MB input
# Small fp32 weights (72MB) + Huge intermediate activations:
# Iteration 1 (fp32): Total peak ~720MB β†’ OOMs cleanly
# Iteration 2 (fp16): Autocast halves activations β†’ Peak ~496MB β†’ Passes cleanly
_input = torch.randn(8192, 4096, device=_device)
_output = _model(_input)
_loss = _output.sum()
_loss.backward()
print(f"CHALLENGE_RESULT=PASS|tier=1|output_shape={list(_output.shape)}")
del _model, _input, _output, _loss
torch.cuda.empty_cache()
# ═══ END CHALLENGE ═══
"""),
},
# ─── TIER 2: Standard ───
# ~800MB in fp32. Requires gradient checkpointing OR mixed precision.
# A mini-transformer with multi-head attention and long sequences.
"tier_2_transformer_fwd": {
"name": "Transformer Forward Pass Stress Test",
"tier": 2,
"raw_memory_mb": 800,
"description": "Mini-transformer with long sequences. Activations dominate memory.",
"hint_to_sre": "Use torch.utils.checkpoint or torch.autocast to survive.",
"code": textwrap.dedent("""\
# ═══ TENSOR CHALLENGE: TIER 2 β€” Transformer Forward ═══
# Raw memory footprint: ~800MB (fp32)
# Target: Process without exceeding 500MB VRAM
import torch
import torch.nn as nn
class StressTransformer(nn.Module):
def __init__(self, d_model=1024, nhead=8, num_layers=6, dim_ff=4096):
super().__init__()
self.embedding = nn.Linear(512, d_model)
encoder_layer = nn.TransformerEncoderLayer(
d_model=d_model, nhead=nhead, dim_feedforward=dim_ff,
batch_first=True, dropout=0.0,
)
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
self.head = nn.Linear(d_model, 100)
def forward(self, x):
x = self.embedding(x)
x = self.encoder(x)
return self.head(x[:, -1, :])
_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_model = StressTransformer().to(_device)
# Batch=64, SeqLen=512, Features=512
# Attention matrices: 64 Γ— 8 heads Γ— 512 Γ— 512 Γ— 4 bytes β‰ˆ 500MB alone
_input = torch.randn(64, 512, 512, device=_device)
_output = _model(_input)
_loss = _output.sum()
_loss.backward()
print(f"CHALLENGE_RESULT=PASS|tier=2|output_shape={list(_output.shape)}")
del _model, _input, _output, _loss
torch.cuda.empty_cache()
# ═══ END CHALLENGE ═══
"""),
},
# ─── TIER 3: Adversarial ───
# ~1.2GB in fp32. Requires BOTH checkpointing AND mixed precision.
# A deep residual network with skip connections and large feature maps.
"tier_3_deep_resnet": {
"name": "Deep ResNet Adversarial Stress Test",
"tier": 3,
"raw_memory_mb": 1200,
"description": "Deep residual network. Only combined optimizations survive.",
"hint_to_sre": "Requires gradient checkpointing AND mixed precision together.",
"code": textwrap.dedent("""\
# ═══ TENSOR CHALLENGE: TIER 3 β€” Deep ResNet Adversarial ═══
# Raw memory footprint: ~1.2GB (fp32)
# Target: Process without exceeding 500MB VRAM
# Only solvable with BOTH checkpointing AND mixed precision
import torch
import torch.nn as nn
class ResBlock(nn.Module):
def __init__(self, channels):
super().__init__()
self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
self.bn1 = nn.BatchNorm2d(channels)
self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
self.bn2 = nn.BatchNorm2d(channels)
def forward(self, x):
residual = x
out = torch.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
return torch.relu(out + residual)
class DeepStressNet(nn.Module):
def __init__(self, num_blocks=16, channels=256):
super().__init__()
self.stem = nn.Sequential(
nn.Conv2d(3, channels, 7, stride=2, padding=3),
nn.BatchNorm2d(channels),
nn.ReLU(),
)
self.blocks = nn.ModuleList([ResBlock(channels) for _ in range(num_blocks)])
self.pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(channels, 1000)
def forward(self, x):
x = self.stem(x)
for block in self.blocks:
x = block(x)
x = self.pool(x).flatten(1)
return self.fc(x)
_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_model = DeepStressNet(num_blocks=16, channels=256).to(_device)
# Batch=32, 3Γ—224Γ—224 images
# 16 ResBlocks Γ— 256 channels Γ— 112Γ—112 feature maps Γ— 4 bytes β‰ˆ 1.2GB activations
_input = torch.randn(32, 3, 224, 224, device=_device)
_output = _model(_input)
_loss = _output.sum()
_loss.backward()
print(f"CHALLENGE_RESULT=PASS|tier=3|output_shape={list(_output.shape)}")
del _model, _input, _output, _loss
torch.cuda.empty_cache()
# ═══ END CHALLENGE ═══
"""),
},
}
class TensorChallengeGenerator:
    """
    Generates and manages tensor challenge workloads for the Docker sandbox.

    Selects the appropriate challenge tier based on the training curriculum
    stage. Tracks both the tiers issued and the per-challenge pass/fail
    results so that tier promotion requires *consecutive* passes.
    """

    def __init__(self):
        # Count of challenges handed out via get_challenge().
        self.challenges_issued = 0
        # Count of challenges reported as passed via record_result().
        self.challenges_passed = 0
        # Tiers issued, in order (one entry per get_challenge() call).
        self.tier_history: list = []
        # (tier, passed) tuples in the order reported via record_result().
        # Needed to enforce the consecutive-pass promotion rule.
        self.result_history: list = []

    def get_challenge(self, tier: int = 1) -> dict:
        """
        Get a tensor challenge by tier.

        Args:
            tier: 1 (warm-up), 2 (standard), 3 (adversarial).
                  Unknown tiers fall back to tier 1 rather than raising.

        Returns:
            dict with key, name, tier, raw_memory_mb, description,
            hint_to_sre, and code.
        """
        tier_map = {
            1: "tier_1_mlp_overfit",
            2: "tier_2_transformer_fwd",
            3: "tier_3_deep_resnet",
        }
        key = tier_map.get(tier, "tier_1_mlp_overfit")
        challenge = CHALLENGES[key]
        self.challenges_issued += 1
        self.tier_history.append(tier)
        logger.info(
            "Challenge issued: tier=%d name='%s' raw_memory=%dMB (challenge #%d)",
            tier, challenge["name"], challenge["raw_memory_mb"], self.challenges_issued,
        )
        # Return a fresh dict so callers cannot mutate the shared
        # CHALLENGES definitions.
        return {
            "key": key,
            "name": challenge["name"],
            "tier": challenge["tier"],
            "raw_memory_mb": challenge["raw_memory_mb"],
            "description": challenge["description"],
            "hint_to_sre": challenge["hint_to_sre"],
            "code": challenge["code"],
        }

    def record_result(self, tier: int, passed: bool):
        """Record whether a challenge was passed or failed."""
        if passed:
            self.challenges_passed += 1
        # Keep the full trail of results; get_curriculum_tier() inspects
        # the most recent entries to decide on promotion.
        self.result_history.append((tier, passed))
        logger.info(
            "Challenge result: tier=%d passed=%s (total: %d/%d)",
            tier, passed, self.challenges_passed, self.challenges_issued,
        )

    def get_curriculum_tier(self) -> int:
        """
        Auto-select challenge tier based on the AI's training progress.

        Implements curriculum learning:
        - Start with Tier 1
        - Promote to Tier 2 after 2 consecutive Tier 1 passes
        - Promote to Tier 3 after 2 consecutive Tier 2 passes

        BUG FIX: the previous implementation compared a *cumulative* pass
        count against the recently *issued* tiers, so an interleaved
        pass/fail/pass/fail run could still trigger a promotion.
        Promotion now requires the last two *recorded results* to both be
        passes at the same tier, as documented.
        """
        if len(self.result_history) >= 2:
            recent = self.result_history[-2:]
            # Two consecutive Tier 1 passes → promote to Tier 2.
            if all(t == 1 and ok for t, ok in recent):
                return 2
            # Two consecutive Tier 2 passes → promote to Tier 3.
            if all(t == 2 and ok for t, ok in recent):
                return 3
            # No promotion: stay at the most recently attempted tier.
            return recent[-1][0]
        # Too few results recorded: stay at the most recently issued tier
        # (mirrors the old tier_history behavior), defaulting to Tier 1.
        if len(self.tier_history) >= 2:
            return self.tier_history[-1]
        return 1

    def get_stats(self) -> dict:
        """Get challenge statistics for dashboard display."""
        return {
            "total_issued": self.challenges_issued,
            "total_passed": self.challenges_passed,
            # Guard against division by zero before any challenge is issued.
            "pass_rate": round(
                (self.challenges_passed / max(1, self.challenges_issued)) * 100, 1
            ),
            "current_tier": self.get_curriculum_tier(),
            "tier_history": self.tier_history[-20:],  # Last 20
        }