#!/usr/bin/env bash
# Lightning AI Studios trainer — uses 80 free GPU hr/mo (incl. H100/H200/A100).
#
# Strategy: Lightning H200 has 141GB VRAM in 4 hr quota — fits Qwen3-Coder-480B-A35B
# QLoRA easily, OR Full SFT of Qwen3-Coder-Next.
#
# Auth: requires LIGHTNING_API_KEY (older SDKs call it LIGHTNING_USER_KEY) +
# LIGHTNING_USER_ID secrets (from Lightning Settings → API Keys). When unset
# this daemon skips silently — it is a best-effort cron-style job, hence no `set -e`.

set -uo pipefail
set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a

LOG="$HOME/.surrogate/logs/lightning-trainer.log"
mkdir -p "$(dirname "$LOG")"

# Accept either spelling of the API key: the header documents LIGHTNING_USER_KEY
# (older SDK) while the guard below requires LIGHTNING_API_KEY (newer SDK).
if [[ -z "${LIGHTNING_API_KEY:-}" && -n "${LIGHTNING_USER_KEY:-}" ]]; then
  LIGHTNING_API_KEY="$LIGHTNING_USER_KEY"
fi

if [[ -z "${LIGHTNING_API_KEY:-}" || -z "${LIGHTNING_USER_ID:-}" ]]; then
  echo "[$(date +%H:%M:%S)] lightning-trainer skipping — LIGHTNING_API_KEY/USER_ID not set" | tee -a "$LOG"
  exit 0
fi

# Best-effort bootstrap of the CLI/SDK; failures land in the log, not stdout.
if ! command -v lightning >/dev/null 2>&1; then
  pip install --quiet --user lightning lightning-sdk 2>>"$LOG"
  export PATH="$HOME/.local/bin:$PATH"
fi

# Lightning SDK reads from env LIGHTNING_USER_ID + LIGHTNING_API_KEY (newer
# format) OR LIGHTNING_USER_KEY (older). Export both for redundancy.
export LIGHTNING_USER_ID LIGHTNING_API_KEY
export LIGHTNING_USER_KEY="$LIGHTNING_API_KEY"

echo "[$(date +%H:%M:%S)] lightning-trainer cycle start" | tee -a "$LOG"

# Build training script — H200 4hr can train massive 480B model with QLoRA.
# The state dir may not exist yet (only logs/ was created above), so make it.
TRAIN_SCRIPT="$HOME/.surrogate/state/lightning-train.py"
mkdir -p "$(dirname "$TRAIN_SCRIPT")"
cat > "$TRAIN_SCRIPT" << 'PYEOF'
"""Surrogate-1 LoRA training on Lightning AI H200.

H200 has 141 GB VRAM → fits Qwen3-Coder-480B-A35B QLoRA in 4 hr free quota.
This is the LARGEST model we can train without paying."""
import os, subprocess, sys

# Install training deps inside the Studio at runtime (fresh machine each run).
subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet",
    "transformers>=4.45.0", "datasets>=3.0.0", "peft>=0.13.0",
    "accelerate>=1.0.0", "bitsandbytes>=0.43.0", "huggingface_hub>=0.25.0"])

import torch
from datasets import load_dataset, interleave_datasets, Dataset
from transformers import (AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
                          Trainer, DataCollatorForSeq2Seq, BitsAndBytesConfig)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

# H200 141 GB → can fit 480B QLoRA. If H200 not available, falls back gracefully.
BASE = os.environ.get("BASE_MODEL", "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8")
MAX_SAMPLES = int(os.environ.get("MAX_SAMPLES", "30000"))  # 4 hr H200 fits ~30K samples
HUB_ID = os.environ.get("HUB_MODEL_ID", "axentx/surrogate-1-coder-480b-a35b-v1")

print(f"━━━ Surrogate-1 LoRA on Lightning H200 ━━━")
print(f"base={BASE} samples={MAX_SAMPLES:,} hub={HUB_ID}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'NO CUDA'}")

# Interleave every sibling pairs dataset that is reachable; skip the rest.
SIBLINGS = ["axentx/surrogate-1-training-pairs", "axentx/surrogate-1-pairs-A",
            "axentx/surrogate-1-pairs-B", "axentx/surrogate-1-pairs-C",
            "axentx/surrogate-1-pairs-D"]
streams = []
for repo in SIBLINGS:
    try:
        streams.append(load_dataset(repo, split="train", streaming=True))
    except Exception as e:
        print(f"skip {repo}: {e}")
if not streams:
    # interleave_datasets() raises a confusing error on an empty list; bail
    # out explicitly before any GPU quota is spent.
    sys.exit("no training datasets reachable — aborting")
ds = interleave_datasets(streams, stopping_strategy="all_exhausted")

# Materialize up to MAX_SAMPLES prompt/response pairs, dropping trivial ones.
rows = []
for i, ex in enumerate(ds):
    if i >= MAX_SAMPLES:
        break
    p = (ex.get("prompt") or ex.get("instruction") or "").strip()
    r = (ex.get("response") or ex.get("output") or "").strip()
    if len(p) >= 20 and len(r) >= 30:
        rows.append({"prompt": p, "response": r})
print(f"kept {len(rows):,} samples")
if not rows:
    sys.exit("0 usable samples after filtering — aborting")
raw = Dataset.from_list(rows)

tok = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# NF4 double-quant 4-bit base + bf16 compute — standard QLoRA recipe.
bnb = BitsAndBytesConfig(load_in_4bit=True,
                         bnb_4bit_compute_dtype=torch.bfloat16,
                         bnb_4bit_use_double_quant=True,
                         bnb_4bit_quant_type="nf4")
model = AutoModelForCausalLM.from_pretrained(BASE, quantization_config=bnb,
                                             device_map="auto", trust_remote_code=True)
model = prepare_model_for_kbit_training(model)
lora = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,  # bumped rank since we have GPU headroom
                  target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                                  "gate_proj", "up_proj", "down_proj"],
                  task_type=TaskType.CAUSAL_LM)
model = get_peft_model(model, lora)
model.print_trainable_parameters()

def fmt(ex):
    """Render one pair through the model's chat template into a 'text' column."""
    msgs = [{"role": "system", "content": "You are Surrogate-1, a senior DevSecOps AI coding agent."},
            {"role": "user", "content": ex["prompt"]},
            {"role": "assistant", "content": ex["response"]}]
    return {"text": tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)}
raw = raw.map(fmt, remove_columns=raw.column_names)

def tk(b):
    """Tokenize a batch; labels mirror input_ids for causal-LM SFT."""
    e = tok(b["text"], truncation=True, max_length=4096, padding=False)  # longer ctx since H200 has space
    e["labels"] = e["input_ids"].copy()
    return e
tokenized = raw.map(tk, batched=True, remove_columns=["text"])

args = TrainingArguments(
    output_dir="./out", num_train_epochs=1.0,
    per_device_train_batch_size=2, gradient_accumulation_steps=8,  # bigger batch on H200
    learning_rate=2e-4, bf16=True, gradient_checkpointing=True,
    logging_steps=20, save_strategy="steps", save_steps=200, save_total_limit=2,
    warmup_ratio=0.03, lr_scheduler_type="cosine", report_to="none",
    push_to_hub=True, hub_model_id=HUB_ID, hub_strategy="every_save",
    hub_token=os.environ.get("HF_TOKEN"),
    hub_private_repo=False)  # PUBLIC — multi-checkpoint > 500MB; flip after train
collator = DataCollatorForSeq2Seq(tok, padding=True, return_tensors="pt")
Trainer(model=model, args=args, train_dataset=tokenized, data_collator=collator,
        tokenizer=tok).train()
print("✅ done")
PYEOF

# Submit to Lightning Studios via Python SDK (lightning run app is deprecated).
# Strategy: connect to (or create) a Studio with H200 attached, upload the
# training script, run it via studio.run(). The Studio persists output so
# we can poll later.
echo "[$(date +%H:%M:%S)] submitting to Lightning H200 via SDK" | tee -a "$LOG"
python3 - "$TRAIN_SCRIPT" >> "$LOG" 2>&1 << 'SDK_EOF' || echo "[$(date +%H:%M:%S)] SDK run errored — see log" | tee -a "$LOG"
import os, sys, time

script_path = sys.argv[1]
try:
    from lightning_sdk import Studio, Machine
except ImportError:
    print("ERR: lightning_sdk not installed", flush=True)
    sys.exit(2)

studio_name = f"surrogate-1-train-{time.strftime('%Y%m%d-%H%M', time.gmtime())}"
print(f"▶ connecting/creating Studio: {studio_name}")
try:
    # SDK reads auth from env LIGHTNING_USER_ID + LIGHTNING_API_KEY which the
    # bash wrapper exported above. Don't pass them as kwargs (TypeError).
    studio = Studio(name=studio_name, teamspace="default", create_ok=True)
    studio.start(machine=Machine.H200)
    print(f" ✅ Studio H200 started")
    studio.upload_file(script_path, "train.py")
    print(f" ✅ uploaded train.py")
    # Fire training in background — SDK returns immediately
    try:
        job = studio.run("python train.py", in_background=True)
    except TypeError:
        # in_background may not be supported in this SDK version — try without
        job = studio.run("python train.py")
    print(f" ✅ submitted job: {job}")
except Exception as e:
    print(f" ❌ {type(e).__name__}: {str(e)[:300]}")
    sys.exit(3)
SDK_EOF

echo "[$(date +%H:%M:%S)] lightning-trainer cycle done" | tee -a "$LOG"
# trigger: pickup LIGHTNING_USER_ID + LIGHTNING_API_KEY 2026-04-28T20:29:29Z