#!/usr/bin/env bash
# Lightning AI Studios trainer β€” uses 80 free GPU hr/mo (incl. H100/H200/A100).
#
# Strategy: a Lightning H200 offers 141 GB VRAM within the ~4 hr quota window.
# That covers a Qwen3-Coder-480B-A35B QLoRA run (with CPU offload for weight
# shards that exceed VRAM), OR full SFT of Qwen3-Coder-Next.
#
# Auth: requires LIGHTNING_API_KEY + LIGHTNING_USER_ID secrets (from Lightning
# Settings → API Keys); LIGHTNING_USER_KEY is the older alias, exported below for
# compatibility. When either is unset this daemon logs a skip notice and exits 0.
set -uo pipefail
set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a
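# set -a auto-exports everything sourced from .env, so the child python3
# processes below inherit the Lightning credentials without explicit exports.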
LOG="$HOME/.surrogate/logs/lightning-trainer.log"
mkdir -p "$(dirname "$LOG")"
if [[ -z "${LIGHTNING_API_KEY:-}" || -z "${LIGHTNING_USER_ID:-}" ]]; then
echo "[$(date +%H:%M:%S)] lightning-trainer skipping β€” LIGHTNING_API_KEY/USER_ID not set" | tee -a "$LOG"
exit 0
fi
if ! command -v lightning >/dev/null 2>&1; then
pip install --quiet --user lightning lightning-sdk 2>>"$LOG"
export PATH="$HOME/.local/bin:$PATH"
fi
# Lightning SDK reads from env LIGHTNING_USER_ID + LIGHTNING_API_KEY (newer
# format) OR LIGHTNING_USER_KEY (older). Export both for redundancy.
export LIGHTNING_USER_ID LIGHTNING_API_KEY
export LIGHTNING_USER_KEY="$LIGHTNING_API_KEY"
echo "[$(date +%H:%M:%S)] lightning-trainer cycle start" | tee -a "$LOG"
# Build training script — a 4 hr H200 window can train the 480B model with QLoRA
TRAIN_SCRIPT="$HOME/.surrogate/state/lightning-train.py"
cat > "$TRAIN_SCRIPT" << 'PYEOF'
"""Surrogate-1 LoRA training on Lightning AI H200.
H200 has 141 GB VRAM; a Qwen3-Coder-480B-A35B QLoRA run fits the 4 hr free quota
with CPU offload for the weight shards that exceed VRAM. This is the largest
model we can train without paying."""
import os, subprocess, sys
subprocess.check_call([sys.executable,"-m","pip","install","--quiet",
"transformers>=4.45.0","datasets>=3.0.0","peft>=0.13.0",
"accelerate>=1.0.0","bitsandbytes>=0.43.0","huggingface_hub>=0.25.0"])
import torch
from datasets import load_dataset, interleave_datasets, Dataset
from transformers import (AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
Trainer, DataCollatorForSeq2Seq, BitsAndBytesConfig)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
# H200: 141 GB VRAM. bitsandbytes nf4 needs the unquantized checkpoint; the FP8
# repo is already quantized and a second quantization_config is rejected, so
# default to the BF16 weights. device_map="auto" offloads whatever doesn't fit.
BASE = os.environ.get("BASE_MODEL", "Qwen/Qwen3-Coder-480B-A35B-Instruct")
MAX_SAMPLES = int(os.environ.get("MAX_SAMPLES", "30000")) # 4 hr H200 fits ~30K samples
HUB_ID = os.environ.get("HUB_MODEL_ID", "axentx/surrogate-1-coder-480b-a35b-v1")
print(f"━━━ Surrogate-1 LoRA on Lightning H200 ━━━")
print(f"base={BASE} samples={MAX_SAMPLES:,} hub={HUB_ID}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'NO CUDA'}")
SIBLINGS=["axentx/surrogate-1-training-pairs","axentx/surrogate-1-pairs-A",
"axentx/surrogate-1-pairs-B","axentx/surrogate-1-pairs-C","axentx/surrogate-1-pairs-D"]
streams = []
for repo in SIBLINGS:
    try:
        streams.append(load_dataset(repo, split="train", streaming=True))
    except Exception as e:
        print(f"skip {repo}: {e}")
if not streams:
    sys.exit("ERR: no training-pair datasets reachable")
ds = interleave_datasets(streams, stopping_strategy="all_exhausted")
rows = []
for i, ex in enumerate(ds):
    if i >= MAX_SAMPLES:
        break
    prompt = (ex.get("prompt") or ex.get("instruction") or "").strip()
    resp = (ex.get("response") or ex.get("output") or "").strip()
    if len(prompt) >= 20 and len(resp) >= 30:  # drop trivially short pairs
        rows.append({"prompt": prompt, "response": resp})
print(f"kept {len(rows):,} samples")
raw = Dataset.from_list(rows)
tok=AutoTokenizer.from_pretrained(BASE,trust_remote_code=True)
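# some checkpoints ship an eos but no pad token; fall back to eos so batches can pad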
if tok.pad_token is None: tok.pad_token=tok.eos_token
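# nf4 + double quant is roughly 0.5 bytes/param, so the 480B weights still need
# ~240 GB; device_map="auto" keeps what fits on the H200 and offloads the rest
# to CPU RAM.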
bnb=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")
model=AutoModelForCausalLM.from_pretrained(BASE, quantization_config=bnb,
device_map="auto", trust_remote_code=True)
model=prepare_model_for_kbit_training(model)
lora=LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05, # bumped rank since we have GPU headroom
target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
task_type=TaskType.CAUSAL_LM)
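# note: in a MoE base the gate/up/down names match every expert's MLP, so PEFT
# attaches adapters per expert; print_trainable_parameters() shows the real count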
model=get_peft_model(model,lora)
model.print_trainable_parameters()
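# Wrap each pair in the base model's chat template. add_generation_prompt=False
# because the assistant turn is already included in the training text.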
def fmt(ex):
msgs=[{"role":"system","content":"You are Surrogate-1, a senior DevSecOps AI coding agent."},
{"role":"user","content":ex["prompt"]},{"role":"assistant","content":ex["response"]}]
return {"text": tok.apply_chat_template(msgs,tokenize=False,add_generation_prompt=False)}
raw=raw.map(fmt,remove_columns=raw.column_names)
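# Causal LM: labels start as a copy of input_ids; DataCollatorForSeq2Seq later
# pads labels with -100 so padding positions never contribute to the loss.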
def tk(b):
e=tok(b["text"],truncation=True,max_length=4096,padding=False) # longer ctx since H200 has space
e["labels"]=e["input_ids"].copy(); return e
tokenized=raw.map(tk,batched=True,remove_columns=["text"])
args=TrainingArguments(
output_dir="./out", num_train_epochs=1.0,
per_device_train_batch_size=2, gradient_accumulation_steps=8, # bigger batch on H200
learning_rate=2e-4, bf16=True, gradient_checkpointing=True,
logging_steps=20, save_strategy="steps", save_steps=200, save_total_limit=2,
warmup_ratio=0.03, lr_scheduler_type="cosine", report_to="none",
push_to_hub=True, hub_model_id=HUB_ID, hub_strategy="every_save",
hub_token=os.environ.get("HF_TOKEN"),
hub_private_repo=False)  # PUBLIC — multi-checkpoint pushes exceed 500 MB; flip to private after training
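# effective batch = 2 x 8 = 16 sequences; hub_strategy="every_save" pushes the
# adapter every 200 steps so progress survives the 4 hr quota cutoff. HF_TOKEN
# must already be set inside the Studio; this wrapper does not forward it.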
collator=DataCollatorForSeq2Seq(tok,padding=True,return_tensors="pt")
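# newer transformers releases rename Trainer's tokenizer= kwarg to
# processing_class=; tokenizer= still works on the 4.x line with a deprecation warning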
Trainer(model=model,args=args,train_dataset=tokenized,data_collator=collator,
tokenizer=tok).train()
print("βœ… done")
PYEOF
# Submit to Lightning Studios via Python SDK (lightning run app is deprecated).
# Strategy: connect to (or create) a Studio with H200 attached, upload the
# training script, run it via studio.run(). The Studio persists output so
# we can poll later.
echo "[$(date +%H:%M:%S)] submitting to Lightning H200 via SDK" | tee -a "$LOG"
python3 - "$TRAIN_SCRIPT" >> "$LOG" 2>&1 << 'SDK_EOF' || echo "[$(date +%H:%M:%S)] SDK run errored β€” see log" | tee -a "$LOG"
import os, sys, time
script_path = sys.argv[1]
try:
from lightning_sdk import Studio, Machine
except ImportError:
print("ERR: lightning_sdk not installed", flush=True)
sys.exit(2)
studio_name = f"surrogate-1-train-{time.strftime('%Y%m%d-%H%M', time.gmtime())}"
print(f"β–Ά connecting/creating Studio: {studio_name}")
try:
# SDK reads auth from env LIGHTNING_USER_ID + LIGHTNING_API_KEY which the
# bash wrapper exported above. Don't pass them as kwargs (TypeError).
studio = Studio(name=studio_name, teamspace="default", create_ok=True)
    studio.start(machine=Machine.H200)
    print(" ✅ Studio H200 started")
    studio.upload_file(script_path, "train.py")
    print(" ✅ uploaded train.py")
# Fire training in background β€” SDK returns immediately
try:
job = studio.run("python train.py", in_background=True)
except TypeError:
# in_background may not be supported in this SDK version β€” try without
job = studio.run("python train.py")
print(f" βœ… submitted job: {job}")
except Exception as e:
print(f" ❌ {type(e).__name__}: {str(e)[:300]}")
sys.exit(3)
SDK_EOF
echo "[$(date +%H:%M:%S)] lightning-trainer cycle done" | tee -a "$LOG"
# trigger: pickup LIGHTNING_USER_ID + LIGHTNING_API_KEY 2026-04-28T20:29:29Z