Spaces:
Runtime error
feat: kaggle-trainer + llm-burst x3 — TRAIN now, MAX synthesis
Browse filesUser: 'ตอนนี้เทรนไปก็ตัวเล็ก หรือเธอค่อยๆเทรนได้ก็เอา' + KAGGLE token.
Stop waiting, start training.
1) kaggle-trainer.sh daemon (cron every 6 hr):
- reads KAGGLE_API_TOKEN + KAGGLE_USERNAME from Space secrets
- writes ~/.kaggle/kaggle.json
- generates the train.py notebook on-the-fly (streams 5 sibling
datasets, QLoRA Qwen2.5-Coder-7B, push to axentx/surrogate-1-coder-lora-v1)
- 'kaggle kernels push' submits to Kaggle T4 GPU
- Free 30 hr/week → ~5 LoRA runs/week per Kaggle account
- LoRA adapter auto-pushes back to HF Hub when training finishes
2) llm-burst FULL THROTTLE:
- batch_size_per_provider 3 -> 8 (every cycle)
- sleep 30-60s -> 10-20s (between cycles)
- With Cerebras 1M tok/day budget that's ~30 RPM, well within quota
- Net: ~3-4x synthetic pair throughput
3) status-server log allowlist + 'kaggle-trainer' added so /logs/kaggle-trainer
exposes training-submission history.
User context note: KAGGLE_API_TOKEN secret will be pushed once HF API
rate-limit clears. Until then daemon skips silently with the explicit
'KAGGLE_API_TOKEN not set' log line.
- bin/hermes-status-server.py +1 -1
- bin/kaggle-trainer.sh +202 -0
- bin/llm-burst-generator.py +7 -7
- start.sh +4 -0
|
@@ -167,7 +167,7 @@ def log_tail(name: str, lines: int = 100) -> PlainTextResponse:
|
|
| 167 |
"auto-orchestrate-loop", "training-push", "ollama", "discord-bot",
|
| 168 |
"hermes-discord-bot", "surrogate-research-loop", "surrogate-research-apply",
|
| 169 |
"surrogate-dev-loop", "domain-scrape-loop", "github-domain-scrape",
|
| 170 |
-
"qwen-coder", "git-clone", "git-pull", "redis", "parquet-direct-ingest", "bulk-ingest-parallel", "rag-vector-builder", "auto-orchestrate-continuous", "dataset-enrich", "hf-dataset-discoverer", "dedup-bootstrap", "github-agentic-crawler", "ollama-pull-granite", "synthetic-data", "self-ingest", "scrape-sre-postmortems", "refresh-cve-feed", "self-heal-watchdog", "gh-actions-ticker", "llm-burst-generator", "expand-role-keywords",
|
| 171 |
"ollama-pull-coder", "ollama-pull-devstral", "ollama-pull-fallback",
|
| 172 |
"ollama-pull-yicoder", "ollama-pull-embed", "ollama-pull-light",
|
| 173 |
}
|
|
|
|
| 167 |
"auto-orchestrate-loop", "training-push", "ollama", "discord-bot",
|
| 168 |
"hermes-discord-bot", "surrogate-research-loop", "surrogate-research-apply",
|
| 169 |
"surrogate-dev-loop", "domain-scrape-loop", "github-domain-scrape",
|
| 170 |
+
"qwen-coder", "git-clone", "git-pull", "redis", "parquet-direct-ingest", "bulk-ingest-parallel", "rag-vector-builder", "auto-orchestrate-continuous", "dataset-enrich", "hf-dataset-discoverer", "dedup-bootstrap", "github-agentic-crawler", "ollama-pull-granite", "synthetic-data", "self-ingest", "scrape-sre-postmortems", "refresh-cve-feed", "self-heal-watchdog", "gh-actions-ticker", "llm-burst-generator", "expand-role-keywords", "kaggle-trainer",
|
| 171 |
"ollama-pull-coder", "ollama-pull-devstral", "ollama-pull-fallback",
|
| 172 |
"ollama-pull-yicoder", "ollama-pull-embed", "ollama-pull-light",
|
| 173 |
}
|
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# Kaggle remote trainer — runs on HF Space, triggers Kaggle T4 GPU training.
|
| 3 |
+
#
|
| 4 |
+
# Architecture:
|
| 5 |
+
# HF Space (this) ── uploads notebook + dataset slice ──→ Kaggle T4 GPU
|
| 6 |
+
# ←── downloads LoRA adapter, pushes to HF Hub ──
|
| 7 |
+
#
|
| 8 |
+
# Free Kaggle quota: 30 hr/week T4 GPU per account. We can run 5-7 LoRA
|
| 9 |
+
# experiments per week per account at no cost.
|
| 10 |
+
#
|
| 11 |
+
# This daemon checks every 6 hours: if no training is currently running on
|
| 12 |
+
# Kaggle for surrogate-1, it kicks a new one with the latest dataset slice.
|
| 13 |
+
|
| 14 |
+
set -uo pipefail
|
| 15 |
+
set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a
|
| 16 |
+
|
| 17 |
+
LOG="$HOME/.surrogate/logs/kaggle-trainer.log"
|
| 18 |
+
mkdir -p "$(dirname "$LOG")"
|
| 19 |
+
|
| 20 |
+
KAGGLE_DIR="$HOME/.kaggle"
|
| 21 |
+
mkdir -p "$KAGGLE_DIR"
|
| 22 |
+
|
| 23 |
+
# Kaggle CLI reads $HOME/.kaggle/kaggle.json (older format) OR $KAGGLE_API_TOKEN
|
| 24 |
+
# env (newer format). User gave us KGAT_... which is the newer format.
|
| 25 |
+
if [[ -n "${KAGGLE_API_TOKEN:-}" ]]; then
|
| 26 |
+
# Newer Kaggle CLI accepts API token directly via env. Older needs the
|
| 27 |
+
# legacy kaggle.json. Try both for compatibility.
|
| 28 |
+
cat > "$KAGGLE_DIR/kaggle.json" << EOF
|
| 29 |
+
{"username":"${KAGGLE_USERNAME:-ashirafuse}","key":"${KAGGLE_API_TOKEN}"}
|
| 30 |
+
EOF
|
| 31 |
+
chmod 600 "$KAGGLE_DIR/kaggle.json"
|
| 32 |
+
fi
|
| 33 |
+
|
| 34 |
+
if ! command -v kaggle >/dev/null 2>&1; then
|
| 35 |
+
pip install --quiet --user kaggle 2>>"$LOG"
|
| 36 |
+
export PATH="$HOME/.local/bin:$PATH"
|
| 37 |
+
fi
|
| 38 |
+
|
| 39 |
+
if [[ -z "${KAGGLE_API_TOKEN:-}" ]] || [[ -z "${HF_TOKEN:-}" ]]; then
|
| 40 |
+
echo "[$(date +%H:%M:%S)] kaggle-trainer skipping — KAGGLE_API_TOKEN or HF_TOKEN not set" | tee -a "$LOG"
|
| 41 |
+
exit 0
|
| 42 |
+
fi
|
| 43 |
+
|
| 44 |
+
# Notebook directory on Kaggle (will be created by kaggle kernels init)
|
| 45 |
+
NB_OWNER="${KAGGLE_USERNAME:-ashirafuse}"
|
| 46 |
+
NB_SLUG="surrogate-1-lora-trainer"
|
| 47 |
+
WORK_DIR="$HOME/.surrogate/state/kaggle-nb"
|
| 48 |
+
mkdir -p "$WORK_DIR"
|
| 49 |
+
cd "$WORK_DIR"
|
| 50 |
+
|
| 51 |
+
echo "[$(date +%H:%M:%S)] kaggle-trainer cycle start" | tee -a "$LOG"
|
| 52 |
+
|
| 53 |
+
# ── Build the notebook ──────────────────────────────────────────────────────
|
| 54 |
+
cat > "$WORK_DIR/kernel-metadata.json" << EOF
|
| 55 |
+
{
|
| 56 |
+
"id": "${NB_OWNER}/${NB_SLUG}",
|
| 57 |
+
"title": "surrogate-1 LoRA trainer",
|
| 58 |
+
"code_file": "train.py",
|
| 59 |
+
"language": "python",
|
| 60 |
+
"kernel_type": "script",
|
| 61 |
+
"is_private": false,
|
| 62 |
+
"enable_gpu": true,
|
| 63 |
+
"enable_tpu": false,
|
| 64 |
+
"enable_internet": true,
|
| 65 |
+
"dataset_sources": [],
|
| 66 |
+
"competition_sources": [],
|
| 67 |
+
"kernel_sources": []
|
| 68 |
+
}
|
| 69 |
+
EOF
|
| 70 |
+
|
| 71 |
+
cat > "$WORK_DIR/train.py" << 'PYEOF'
|
| 72 |
+
"""Surrogate-1 LoRA training on Kaggle T4 GPU.
|
| 73 |
+
Streams data from axentx/surrogate-1-* sibling datasets on HF Hub.
|
| 74 |
+
Saves LoRA adapter back to axentx/surrogate-1-coder-lora-vN."""
|
| 75 |
+
|
| 76 |
+
import os
|
| 77 |
+
import subprocess
|
| 78 |
+
import sys
|
| 79 |
+
import time
|
| 80 |
+
|
| 81 |
+
# install deps (once per kernel-version)
|
| 82 |
+
subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet",
|
| 83 |
+
"transformers>=4.45.0", "datasets>=3.0.0",
|
| 84 |
+
"peft>=0.13.0", "accelerate>=1.0.0", "bitsandbytes>=0.43.0",
|
| 85 |
+
"huggingface_hub>=0.25.0"])
|
| 86 |
+
|
| 87 |
+
# read HF token from Kaggle Secrets
|
| 88 |
+
try:
|
| 89 |
+
from kaggle_secrets import UserSecretsClient
|
| 90 |
+
os.environ["HF_TOKEN"] = UserSecretsClient().get_secret("HF_TOKEN")
|
| 91 |
+
os.environ["HUGGING_FACE_HUB_TOKEN"] = os.environ["HF_TOKEN"]
|
| 92 |
+
except Exception as e:
|
| 93 |
+
print(f"⚠ Kaggle Secrets not available: {e}")
|
| 94 |
+
|
| 95 |
+
import torch
|
| 96 |
+
from datasets import load_dataset, interleave_datasets
|
| 97 |
+
from transformers import (AutoTokenizer, AutoModelForCausalLM,
|
| 98 |
+
TrainingArguments, Trainer, DataCollatorForSeq2Seq, BitsAndBytesConfig)
|
| 99 |
+
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
|
| 100 |
+
|
| 101 |
+
BASE = os.environ.get("BASE_MODEL", "Qwen/Qwen2.5-Coder-7B-Instruct")
|
| 102 |
+
MAX_SAMPLES = int(os.environ.get("MAX_SAMPLES", "30000"))
|
| 103 |
+
EPOCHS = float(os.environ.get("EPOCHS", "1"))
|
| 104 |
+
HUB_ID = os.environ.get("HUB_MODEL_ID", "axentx/surrogate-1-coder-lora-v1")
|
| 105 |
+
|
| 106 |
+
print(f"━━━ Surrogate-1 LoRA on Kaggle T4 ━━━")
|
| 107 |
+
print(f"base={BASE} samples={MAX_SAMPLES:,} epochs={EPOCHS} hub={HUB_ID}")
|
| 108 |
+
|
| 109 |
+
# ── data ────────────────────────────────────────────────────────────────────
|
| 110 |
+
SIBLINGS = [
|
| 111 |
+
"axentx/surrogate-1-training-pairs",
|
| 112 |
+
"axentx/surrogate-1-pairs-A",
|
| 113 |
+
"axentx/surrogate-1-pairs-B",
|
| 114 |
+
"axentx/surrogate-1-pairs-C",
|
| 115 |
+
"axentx/surrogate-1-pairs-D",
|
| 116 |
+
]
|
| 117 |
+
streams = []
|
| 118 |
+
for r in SIBLINGS:
|
| 119 |
+
try:
|
| 120 |
+
streams.append(load_dataset(r, split="train", streaming=True))
|
| 121 |
+
print(f" loaded {r}")
|
| 122 |
+
except Exception as e:
|
| 123 |
+
print(f" skip {r}: {e}")
|
| 124 |
+
ds = interleave_datasets(streams, stopping_strategy="all_exhausted")
|
| 125 |
+
|
| 126 |
+
rows = []
|
| 127 |
+
for i, ex in enumerate(ds):
|
| 128 |
+
if i >= MAX_SAMPLES: break
|
| 129 |
+
p = (ex.get("prompt") or ex.get("instruction") or "").strip()
|
| 130 |
+
r = (ex.get("response") or ex.get("output") or "").strip()
|
| 131 |
+
if len(p) >= 20 and len(r) >= 30:
|
| 132 |
+
rows.append({"prompt": p, "response": r})
|
| 133 |
+
print(f" kept {len(rows):,} samples")
|
| 134 |
+
|
| 135 |
+
from datasets import Dataset
|
| 136 |
+
raw = Dataset.from_list(rows)
|
| 137 |
+
|
| 138 |
+
# ── model ───────────────────────────────────────────────────────────────────
|
| 139 |
+
tok = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
|
| 140 |
+
if tok.pad_token is None: tok.pad_token = tok.eos_token
|
| 141 |
+
|
| 142 |
+
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16,
|
| 143 |
+
bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")
|
| 144 |
+
model = AutoModelForCausalLM.from_pretrained(BASE, quantization_config=bnb,
|
| 145 |
+
device_map="auto", trust_remote_code=True)
|
| 146 |
+
model = prepare_model_for_kbit_training(model)
|
| 147 |
+
|
| 148 |
+
lora = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05,
|
| 149 |
+
target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
|
| 150 |
+
task_type=TaskType.CAUSAL_LM)
|
| 151 |
+
model = get_peft_model(model, lora)
|
| 152 |
+
model.print_trainable_parameters()
|
| 153 |
+
|
| 154 |
+
# ── tokenize ────────────────────────────────────────────────────────────────
|
| 155 |
+
def fmt(ex):
|
| 156 |
+
msgs = [
|
| 157 |
+
{"role":"system","content":"You are Surrogate-1, a senior DevSecOps AI coding agent."},
|
| 158 |
+
{"role":"user","content":ex["prompt"]},
|
| 159 |
+
{"role":"assistant","content":ex["response"]},
|
| 160 |
+
]
|
| 161 |
+
return {"text": tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)}
|
| 162 |
+
|
| 163 |
+
raw = raw.map(fmt, remove_columns=raw.column_names)
|
| 164 |
+
def tk(b):
|
| 165 |
+
e = tok(b["text"], truncation=True, max_length=2048, padding=False)
|
| 166 |
+
e["labels"] = e["input_ids"].copy()
|
| 167 |
+
return e
|
| 168 |
+
tokenized = raw.map(tk, batched=True, remove_columns=["text"])
|
| 169 |
+
|
| 170 |
+
# ── train ───────────────────────────────────────────────────────────────────
|
| 171 |
+
args = TrainingArguments(
|
| 172 |
+
output_dir="./surrogate-1-lora-out",
|
| 173 |
+
num_train_epochs=EPOCHS,
|
| 174 |
+
per_device_train_batch_size=1,
|
| 175 |
+
gradient_accumulation_steps=16,
|
| 176 |
+
learning_rate=2e-4,
|
| 177 |
+
bf16=torch.cuda.is_bf16_supported(),
|
| 178 |
+
fp16=not torch.cuda.is_bf16_supported(),
|
| 179 |
+
gradient_checkpointing=True,
|
| 180 |
+
logging_steps=20,
|
| 181 |
+
save_strategy="steps", save_steps=500, save_total_limit=2,
|
| 182 |
+
warmup_ratio=0.03, lr_scheduler_type="cosine",
|
| 183 |
+
report_to="none",
|
| 184 |
+
push_to_hub=True,
|
| 185 |
+
hub_model_id=HUB_ID,
|
| 186 |
+
hub_strategy="every_save",
|
| 187 |
+
hub_token=os.environ.get("HF_TOKEN"),
|
| 188 |
+
)
|
| 189 |
+
collator = DataCollatorForSeq2Seq(tok, padding=True, return_tensors="pt")
|
| 190 |
+
trainer = Trainer(model=model, args=args, train_dataset=tokenized,
|
| 191 |
+
data_collator=collator, tokenizer=tok)
|
| 192 |
+
trainer.train()
|
| 193 |
+
trainer.push_to_hub(commit_message=f"Surrogate-1 LoRA — {MAX_SAMPLES:,} samples, {EPOCHS} epochs (Kaggle T4)")
|
| 194 |
+
print("✅ done")
|
| 195 |
+
PYEOF
|
| 196 |
+
|
| 197 |
+
# ── Push notebook to Kaggle (creates if not exists, updates if exists) ─────
|
| 198 |
+
echo "[$(date +%H:%M:%S)] kaggle kernels push" | tee -a "$LOG"
|
| 199 |
+
kaggle kernels push -p "$WORK_DIR" 2>&1 | tee -a "$LOG"
|
| 200 |
+
|
| 201 |
+
# kernels push schedules a run; status check later
|
| 202 |
+
echo "[$(date +%H:%M:%S)] kaggle-trainer cycle done — notebook submitted" | tee -a "$LOG"
|
|
@@ -268,9 +268,9 @@ def main():
|
|
| 268 |
|
| 269 |
while True:
|
| 270 |
cycle += 1
|
| 271 |
-
#
|
| 272 |
-
#
|
| 273 |
-
batch_size_per_provider =
|
| 274 |
with ThreadPoolExecutor(max_workers=len(active) * batch_size_per_provider) as pool:
|
| 275 |
futures = []
|
| 276 |
for p in active:
|
|
@@ -301,10 +301,10 @@ def main():
|
|
| 301 |
log(f" diag {name}: {err}")
|
| 302 |
_first_err_per_provider.clear()
|
| 303 |
|
| 304 |
-
#
|
| 305 |
-
#
|
| 306 |
-
#
|
| 307 |
-
time.sleep(
|
| 308 |
|
| 309 |
|
| 310 |
if __name__ == "__main__":
|
|
|
|
| 268 |
|
| 269 |
while True:
|
| 270 |
cycle += 1
|
| 271 |
+
# FULL THROTTLE: 8 templates per provider per cycle (was 3).
|
| 272 |
+
# Cerebras 1M tok/day = ~30 RPM sustained — plenty of headroom.
|
| 273 |
+
batch_size_per_provider = 8
|
| 274 |
with ThreadPoolExecutor(max_workers=len(active) * batch_size_per_provider) as pool:
|
| 275 |
futures = []
|
| 276 |
for p in active:
|
|
|
|
| 301 |
log(f" diag {name}: {err}")
|
| 302 |
_first_err_per_provider.clear()
|
| 303 |
|
| 304 |
+
# FULL THROTTLE — Cerebras 1M tok/day = 11.5 RPS sustainable budget.
|
| 305 |
+
# We're at ~3 providers x 6 templates x cycle. Drop sleep so we
|
| 306 |
+
# actually use the quota allotment instead of leaving it on the table.
|
| 307 |
+
time.sleep(10 + random.randint(0, 10))
|
| 308 |
|
| 309 |
|
| 310 |
if __name__ == "__main__":
|
|
@@ -337,6 +337,10 @@ while true; do
|
|
| 337 |
# Cerebras/Groq → +80 specific job-description-style search terms each).
|
| 338 |
# Discoverer auto-uses the expanded list on its next cycle.
|
| 339 |
[[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
sleep 60
|
| 341 |
done
|
| 342 |
CRONSH
|
|
|
|
| 337 |
# Cerebras/Groq → +80 specific job-description-style search terms each).
|
| 338 |
# Discoverer auto-uses the expanded list on its next cycle.
|
| 339 |
[[ $((M % 1440)) -eq 360 ]] && python3 ~/.surrogate/bin/expand-role-keywords.py >> "$LOG_DIR/expand-role-keywords.log" 2>&1 &
|
| 340 |
+
# Every 6 hours: kick a Kaggle T4 LoRA training run on the latest dataset
|
| 341 |
+
# slice. Free Kaggle quota = 30 hr/week per account; one full run = 4-6 hr,
|
| 342 |
+
# so 4 runs/week comfortable. Notebook self-uploads adapter to HF hub.
|
| 343 |
+
[[ $((M % 360)) -eq 30 ]] && bash ~/.surrogate/bin/kaggle-trainer.sh >> "$LOG_DIR/kaggle-trainer.log" 2>&1 &
|
| 344 |
sleep 60
|
| 345 |
done
|
| 346 |
CRONSH
|