File size: 1,942 Bytes
1bacd77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
ECHO ULTIMATE — HuggingFace Space GPU Training Entrypoint.
Runs full GRPO training then pushes adapter to HF Hub.
Hardware: T4 medium or A10G small (set in Space settings).
"""
import os
import sys
import logging

# Put this script's own directory first on the import path so modules that
# live next to it resolve regardless of the current working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Layout of every log line: timestamp, level, logger name, then the message.
# The separator is a real em dash (the previous literal was mis-encoded and
# would have printed garbage characters in every line).
LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s — %(message)s"

# Root logger: INFO and above, written to stdout with the layout above.
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    handlers=[logging.StreamHandler(sys.stdout)],
)

import threading
from fastapi import FastAPI
from fastapi.responses import PlainTextResponse
import uvicorn

# ── Tiny status server on :7860 so HF Space health checks pass ────────────────
# FastAPI application that exposes the status endpoints.
status_app = FastAPI()

# In-memory buffer of formatted log lines, read by the status endpoints.
training_log = []

# Cap on retained lines so a long run cannot grow the buffer without bound.
# Once reached, the oldest lines are dropped and len(training_log) stays here.
_MAX_TRAINING_LOG_LINES = 10000


class _TrainingLogHandler(logging.Handler):
    """Logging handler that copies every formatted record into ``training_log``.

    Nothing else in this file writes to ``training_log``; without this handler
    the buffer stays empty for the whole run.
    """

    def emit(self, record):
        """Append the formatted record and trim the buffer to its size cap."""
        try:
            training_log.append(self.format(record))
            overflow = len(training_log) - _MAX_TRAINING_LOG_LINES
            if overflow > 0:
                # Trim in place so other references to the list stay valid.
                del training_log[:overflow]
        except Exception:
            # Standard logging contract: never let a handler failure
            # propagate into the code that issued the log call.
            self.handleError(record)


def _install_training_log_handler():
    """Attach a ``_TrainingLogHandler`` to the root logger.

    Reuses the formatter of an already-configured root handler when there is
    one, so buffered lines look the same as the lines written elsewhere.
    """
    root_logger = logging.getLogger()
    buffer_handler = _TrainingLogHandler()
    shared_formatter = next(
        (h.formatter for h in root_logger.handlers if h.formatter is not None),
        None,
    )
    if shared_formatter is not None:
        buffer_handler.setFormatter(shared_formatter)
    root_logger.addHandler(buffer_handler)


_install_training_log_handler()

@status_app.get("/health")
def health():
    """Health probe: return a status label and the number of buffered log lines.

    NOTE(review): the status label is the constant "training"; nothing in this
    handler changes it after the run finishes.
    """
    payload = dict(status="training", log_lines=len(training_log))
    return payload

@status_app.get("/log", response_class=PlainTextResponse)
def log():
    """Return the last 100 entries of ``training_log`` as newline-separated text."""
    recent_lines = training_log[-100:]
    text = "\n".join(recent_lines)
    return text

def run_status_server():
    """Serve ``status_app`` with uvicorn on all interfaces, port 7860.

    Blocks for as long as the server runs, so it is meant to be used as a
    thread target rather than called from the main flow.
    """
    server_options = {
        "host": "0.0.0.0",
        "port": 7860,
        "log_level": "warning",
    }
    uvicorn.run(status_app, **server_options)

# Run the status server in the background. daemon=True means this thread
# does not keep the process alive once the main thread exits.
_status_thread = threading.Thread(target=run_status_server, daemon=True)
_status_thread.start()

# ── Training ──────────────────────────────────────────────────────────────────
# Start-up banner. The emoji and em dash are written as real Unicode
# characters; the previous literals were mis-encoded and printed as garbage.
print("=" * 60)
print("🚀  ECHO ULTIMATE — GRPO Training on HF GPU Space")
print("=" * 60)

# Project modules are imported here rather than at the top of the file.
# NOTE(review): presumably so the status server is already answering while
# these (possibly heavy) imports load — confirm.
from config import cfg
from env.task_bank import TaskBank
from training.train import train

# Build the task bank and fetch its data before training starts.
bank = TaskBank()
bank.download_all()

# NOTE(review): hf_token is read but not passed to anything in this file.
# If the Hub upload mentioned in the module docstring needs it, confirm that
# train() picks HF_TOKEN up from the environment itself.
hf_token = os.environ.get("HF_TOKEN", "")
# Weights & Biases logging is enabled only when an API key is present.
use_wandb = bool(os.environ.get("WANDB_API_KEY", ""))

train(
    model_name=cfg.MODEL_NAME,
    output_dir=cfg.MODEL_SAVE_DIR,
    task_bank=bank,
    use_wandb=use_wandb,
)

print("\n🎉  Training complete! Space will stay running — check /log for details.")
# Keep the main thread alive: the status server runs in a daemon thread and
# would be torn down as soon as the main thread returned.
import time
while True:
    time.sleep(60)