# Commit: 78575eb (verified) — anugrah55
# trainer v0.4: switch to Qwen2.5-3B-Instruct, dynamic task discovery,
# delegated probe sampling, difficulty-weighted rollouts, push to
# opensleuth-qwen2.5-3b-grpo-v2; sentinel cleared on FORCE_TRAIN=1.
#!/usr/bin/env bash
# OpenSleuth training Space entrypoint.
#
# Starts a tiny background HTTP server on $PORT (default 7860) so the HF
# Spaces health probe is satisfied, then runs the actual training script in
# the foreground. All training logs go to stdout and are visible in the
# Space's "Container logs" tab.
set -euo pipefail

# Port the heartbeat server binds to; HF Spaces injects PORT, else 7860.
PORT="${PORT:-7860}"

# Timestamped logger: every diagnostic line shares the
# "[entrypoint HH:MM:SS]" prefix so it is easy to grep in container logs.
log() {
  printf '[entrypoint %s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}
# 1. Background heartbeat HTTP server. Just returns 200 OK on every request.
#    allow_reuse_address is set so a rapid container restart can rebind the
#    port while the previous socket is still in TIME_WAIT (otherwise the
#    server dies with "Address already in use" and the probe never passes).
#    serve_forever() blocks the python process by itself, so no extra
#    daemon thread or sleep loop is needed to keep it alive.
log "starting heartbeat server on :${PORT}"
python -c "
import http.server, socketserver, os

class H(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain')
        self.end_headers()
        self.wfile.write(b'opensleuth-trainer alive\n')

    # Silence per-request access logging so training output stays readable.
    def log_message(self, *a, **kw):
        pass

port = int(os.environ.get('PORT', '7860'))
socketserver.TCPServer.allow_reuse_address = True
srv = socketserver.TCPServer(('0.0.0.0', port), H)
print(f'[heartbeat] listening on :{port}', flush=True)
srv.serve_forever()
" &
# PID kept in case a cleanup trap ever needs to kill the heartbeat; today the
# process simply dies with the container.
HB_PID=$!

# Give the heartbeat a moment to bind before the orchestrator probes it.
sleep 2
# 2. Run training. If a sentinel file exists, training already completed in a
#    previous container start (the Space orchestrator restarts containers
#    that exit cleanly), so just idle on the heartbeat to avoid wasting GPU
#    on duplicate runs. Set FORCE_TRAIN=1 to override.
#
# v0.4 update: when FORCE_TRAIN=1 is set, the stale sentinel is deleted
#    up-front. Without this, the sentinel from a previous v0.2 run
#    (Qwen 0.5B / 9 builtins) blocks the v0.4 run (Qwen 3B / 15 tasks) on
#    Space restart. The sentinel is only ever re-touched after a fresh
#    successful training run completes below.
SENTINEL="/data/.opensleuth-trained"
if [[ -f "$SENTINEL" ]]; then
  if [[ -n "${FORCE_TRAIN:-}" ]]; then
    # Force mode: clear the sentinel and fall through to training.
    log "FORCE_TRAIN=1 set; removing stale sentinel $SENTINEL so we re-train."
    rm -f "$SENTINEL"
  else
    # Normal restart after a completed run: keep serving the heartbeat only.
    log "sentinel $SENTINEL exists; skipping training (set FORCE_TRAIN=1 to retrain). Idling..."
    sleep infinity
  fi
fi
# 3. Training run. On success, write the sentinel so subsequent container
#    starts idle instead of re-training. On failure, propagate train.py's
#    own exit code (captured into rc immediately, since any later command
#    would clobber $?) so the real status surfaces in the orchestrator,
#    instead of flattening every failure to exit 1.
log "starting training (PID $$)"
log "GPU info:"
python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu')"
if python /app/train.py; then
  log "training succeeded; writing sentinel and idling so the container stays alive."
  touch "$SENTINEL"
  sleep infinity
else
  rc=$?  # capture before log/anything else overwrites $?
  log "training failed (exit $rc); container will exit so the error surfaces."
  exit "$rc"
fi