#!/usr/bin/env bash
# Lightning AI Studios trainer - uses 80 free GPU hr/mo (incl. H100/H200/A100).
#
# Strategy: Lightning H200 has 141 GB VRAM in the 4 hr quota - fits Qwen3-Coder-480B-A35B
# QLoRA easily, OR Full SFT of Qwen3-Coder-Next.
#
# Auth: requires LIGHTNING_API_KEY + LIGHTNING_USER_ID secrets (from Lightning
# Settings -> API Keys). When unset this daemon skips silently.
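#
# For reference, the ~/.hermes/.env entries this script expects look roughly like
# the sketch below (placeholder values - both come from the Lightning Settings ->
# API Keys page):
#
#   LIGHTNING_USER_ID=<user-id-from-settings>
#   LIGHTNING_API_KEY=<api-key-from-settings>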
set -uo pipefail
set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a

LOG="$HOME/.surrogate/logs/lightning-trainer.log"
mkdir -p "$(dirname "$LOG")"

if [[ -z "${LIGHTNING_API_KEY:-}" || -z "${LIGHTNING_USER_ID:-}" ]]; then
  echo "[$(date +%H:%M:%S)] lightning-trainer skipping - LIGHTNING_API_KEY/USER_ID not set" | tee -a "$LOG"
  exit 0
fi

if ! command -v lightning >/dev/null 2>&1; then
  pip install --quiet --user lightning lightning-sdk 2>>"$LOG"
  export PATH="$HOME/.local/bin:$PATH"
fi
# Lightning SDK reads auth from env LIGHTNING_USER_ID + LIGHTNING_API_KEY (newer
# format) OR LIGHTNING_USER_KEY (older). Export both for redundancy.
export LIGHTNING_USER_ID LIGHTNING_API_KEY
export LIGHTNING_USER_KEY="$LIGHTNING_API_KEY"

echo "[$(date +%H:%M:%S)] lightning-trainer cycle start" | tee -a "$LOG"

# Build training script - a 4 hr H200 slot can train the massive 480B model with QLoRA.
TRAIN_SCRIPT="$HOME/.surrogate/state/lightning-train.py"
cat > "$TRAIN_SCRIPT" << 'PYEOF'
| """Surrogate-1 LoRA training on Lightning AI H200. | |
| H200 has 141 GB VRAM β fits Qwen3-Coder-480B-A35B QLoRA in 4 hr free quota. | |
| This is the LARGEST model we can train without paying.""" | |
| import os, subprocess, sys | |
| subprocess.check_call([sys.executable,"-m","pip","install","--quiet", | |
| "transformers>=4.45.0","datasets>=3.0.0","peft>=0.13.0", | |
| "accelerate>=1.0.0","bitsandbytes>=0.43.0","huggingface_hub>=0.25.0"]) | |
| import torch | |
| from datasets import load_dataset, interleave_datasets, Dataset | |
| from transformers import (AutoTokenizer, AutoModelForCausalLM, TrainingArguments, | |
| Trainer, DataCollatorForSeq2Seq, BitsAndBytesConfig) | |
| from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType | |
# H200 has 141 GB VRAM - enough for 480B QLoRA. If no H200 is free, the SDK
# submission in the wrapper script fails gracefully and the next cycle retries.
BASE = os.environ.get("BASE_MODEL", "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8")
MAX_SAMPLES = int(os.environ.get("MAX_SAMPLES", "30000"))  # a 4 hr H200 slot fits ~30K samples
HUB_ID = os.environ.get("HUB_MODEL_ID", "axentx/surrogate-1-coder-480b-a35b-v1")
print("=== Surrogate-1 LoRA on Lightning H200 ===")
print(f"base={BASE} samples={MAX_SAMPLES:,} hub={HUB_ID}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'NO CUDA'}")
| SIBLINGS=["axentx/surrogate-1-training-pairs","axentx/surrogate-1-pairs-A", | |
| "axentx/surrogate-1-pairs-B","axentx/surrogate-1-pairs-C","axentx/surrogate-1-pairs-D"] | |
| streams=[] | |
| for r in SIBLINGS: | |
| try: streams.append(load_dataset(r,split="train",streaming=True)) | |
| except Exception as e: print(f"skip {r}: {e}") | |
| ds = interleave_datasets(streams, stopping_strategy="all_exhausted") | |
rows = []
for i, ex in enumerate(ds):
    if i >= MAX_SAMPLES:
        break
    p = (ex.get("prompt") or ex.get("instruction") or "").strip()
    r = (ex.get("response") or ex.get("output") or "").strip()
    if len(p) >= 20 and len(r) >= 30:
        rows.append({"prompt": p, "response": r})
print(f"kept {len(rows):,} samples")
raw = Dataset.from_list(rows)
tok = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16,
                         bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")
model = AutoModelForCausalLM.from_pretrained(BASE, quantization_config=bnb,
                                             device_map="auto", trust_remote_code=True)
model = prepare_model_for_kbit_training(model)

lora = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,  # bumped rank since we have GPU headroom
                  target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                                  "gate_proj", "up_proj", "down_proj"],
                  task_type=TaskType.CAUSAL_LM)
model = get_peft_model(model, lora)
model.print_trainable_parameters()
def fmt(ex):
    msgs = [{"role": "system", "content": "You are Surrogate-1, a senior DevSecOps AI coding agent."},
            {"role": "user", "content": ex["prompt"]},
            {"role": "assistant", "content": ex["response"]}]
    return {"text": tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)}

raw = raw.map(fmt, remove_columns=raw.column_names)

def tk(b):
    e = tok(b["text"], truncation=True, max_length=4096, padding=False)  # longer ctx since H200 has space
    e["labels"] = e["input_ids"].copy()
    return e

tokenized = raw.map(tk, batched=True, remove_columns=["text"])
args = TrainingArguments(
    output_dir="./out", num_train_epochs=1.0,
    per_device_train_batch_size=2, gradient_accumulation_steps=8,  # bigger effective batch on H200
    learning_rate=2e-4, bf16=True, gradient_checkpointing=True,
    logging_steps=20, save_strategy="steps", save_steps=200, save_total_limit=2,
    warmup_ratio=0.03, lr_scheduler_type="cosine", report_to="none",
    push_to_hub=True, hub_model_id=HUB_ID, hub_strategy="every_save",
    hub_token=os.environ.get("HF_TOKEN"),
    hub_private_repo=False)  # PUBLIC - multi-checkpoint pushes exceed 500 MB; flip to private after training
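# Flipping the repo private once training finishes could be done with
# huggingface_hub, roughly like this (a sketch - assumes HF_TOKEN has write
# scope on HUB_ID):
#   from huggingface_hub import HfApi
#   HfApi(token=os.environ.get("HF_TOKEN")).update_repo_visibility(HUB_ID, private=True)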
collator = DataCollatorForSeq2Seq(tok, padding=True, return_tensors="pt")
Trainer(model=model, args=args, train_dataset=tokenized, data_collator=collator,
        tokenizer=tok).train()
print("✓ done")
PYEOF
# Submit to Lightning Studios via the Python SDK ("lightning run app" is deprecated).
# Strategy: connect to (or create) a Studio with an H200 attached, upload the
# training script, and run it via studio.run(). The Studio persists output, so a
# later cycle can poll it (see the sketch at the bottom of this file).
echo "[$(date +%H:%M:%S)] submitting to Lightning H200 via SDK" | tee -a "$LOG"
python3 - "$TRAIN_SCRIPT" >> "$LOG" 2>&1 << 'SDK_EOF' || echo "[$(date +%H:%M:%S)] SDK run errored - see log" | tee -a "$LOG"
import sys, time

script_path = sys.argv[1]
try:
    from lightning_sdk import Studio, Machine
except ImportError:
    print("ERR: lightning_sdk not installed", flush=True)
    sys.exit(2)

studio_name = f"surrogate-1-train-{time.strftime('%Y%m%d-%H%M', time.gmtime())}"
print(f"▶ connecting/creating Studio: {studio_name}")
try:
    # The SDK reads auth from env LIGHTNING_USER_ID + LIGHTNING_API_KEY, which the
    # bash wrapper exported above. Don't pass them as kwargs (TypeError).
    studio = Studio(name=studio_name, teamspace="default", create_ok=True)
    studio.start(machine=Machine.H200)
    print("✓ Studio H200 started")
    studio.upload_file(script_path, "train.py")
    print("✓ uploaded train.py")
    # Fire training in the background - the SDK call returns immediately.
    try:
        job = studio.run("python train.py", in_background=True)
    except TypeError:
        # in_background may not be supported by this SDK version - retry without it
        job = studio.run("python train.py")
    print(f"✓ submitted job: {job}")
except Exception as e:
    print(f"✗ {type(e).__name__}: {str(e)[:300]}")
    sys.exit(3)
SDK_EOF
| echo "[$(date +%H:%M:%S)] lightning-trainer cycle done" | tee -a "$LOG" | |
# trigger: pickup LIGHTNING_USER_ID + LIGHTNING_API_KEY 2026-04-28T20:29:29Z