Spaces:

bledden
/

stack-doctor

Build error

App Files Files Community

stack-doctor / serve.py

bledden

Upload folder using huggingface_hub

c75f6b6 verified about 1 month ago

raw

history blame contribute delete

5.62 kB

	"""Unified server for HF Spaces: environment + inference + dashboard on port 7860."""

	import json
	import os
	import sys
	import time
	import threading

	sys.path.insert(0, "/app")

	from fastapi import FastAPI, Request
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import FileResponse, JSONResponse
	import uvicorn

	from server.app import app as env_app

	# Allow cross-origin requests from any origin, with any method and any header.
	_cors_options = {
	    "allow_origins": ["*"],
	    "allow_methods": ["*"],
	    "allow_headers": ["*"],
	}
	env_app.add_middleware(CORSMiddleware, **_cors_options)

	# Shared model state: the loaded model and tokenizer, a readiness flag, and
	# the load error message (None until a load failure is recorded).
	MODEL_STATE = dict(model=None, tokenizer=None, ready=False, error=None)

	# System prompt for the baseline ("untrained") mode.
	# The option lists use a plain "|" separator. A backslash before the pipe is
	# not a valid Python escape: it would leave literal backslashes in the prompt
	# and trigger an invalid-escape-sequence warning on recent Python versions.
	UNTRAINED_SYSTEM = (
	    "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\n"
	    "You receive an incident ticket with hardware/model/backend context, log excerpts, and specialist opinions.\n"
	    "Some specialists may be wrong. Output a JSON array of actions:\n"
	    ' {"type":"inspect","target":"logs|config|snippet|metrics"}\n'
	    ' {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}\n'
	    ' {"type":"apply_fix","fix":"<fix_name>"}\n'
	    ' {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<why>"}'
	)

	# System prompt for the "trained" mode: describes the investigation order,
	# lists the allowed actions, fixes and root causes, and ends with one worked
	# example of the expected JSON output.
	TRAINED_SYSTEM = "".join((
	    "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.\n",
	    "You are methodical: first inspect logs and config, then query specialists to cross-verify (some lie), then apply a fix and submit.\n\n",
	    "Available actions (output as a JSON array):\n",
	    ' {"type":"inspect","target":"logs"} or "config" or "snippet" or "metrics"\n',
	    ' {"type":"ask_specialist","specialist":"runtime"} or "dispatch" or "kernel" or "loader"\n',
	    ' {"type":"apply_fix","fix":"<name>"} -- available fixes: add_whitelist_entry, fix_comm_config, fix_quantization, fix_runtime_path, fix_weight_mapping, relax_arch_check, switch_backend, tune_memory_config, update_driver_config, update_model_config\n',
	    ' {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<detailed reasoning>"}\n\n',
	    "Available root causes: arch_guard, backend_selector, backend_whitelist, distributed_comm, driver_compat, memory_oom, model_config, quantization_error, runtime_loader, weight_layout\n\n",
	    "IMPORTANT: Pick ONE target per inspect, ONE specialist per query. Investigate before submitting. Give a detailed justification.\n\n",
	    "Example output:\n",
	    '[{"type":"inspect","target":"logs"},{"type":"inspect","target":"config"},{"type":"ask_specialist","specialist":"kernel"},',
	    '{"type":"apply_fix","fix":"relax_arch_check"},',
	    '{"type":"submit","root_cause":"arch_guard","fix":"relax_arch_check","justification":"Logs show architecture check failure for SM90. Config confirms guard enabled. Kernel specialist confirmed not a kernel issue."}]',
	))


	def load_model_background():
	    """Load Qwen2.5-1.5B-Instruct on CPU and publish it through MODEL_STATE.

	    Meant to run in a background thread so the server starts fast. On
	    success the model and tokenizer are stored and ``ready`` is set to True.
	    On failure a non-empty message is stored under ``error`` and the
	    exception is not re-raised.
	    """
	    try:
	        print("[Model] Loading Qwen2.5-1.5B-Instruct (CPU)...")
	        t0 = time.time()
	        from transformers import AutoModelForCausalLM, AutoTokenizer
	        import torch

	        model_name = "Qwen/Qwen2.5-1.5B-Instruct"
	        tokenizer = AutoTokenizer.from_pretrained(model_name)
	        model = AutoModelForCausalLM.from_pretrained(
	            model_name,
	            torch_dtype=torch.float32,
	            device_map="cpu",
	        )

	        MODEL_STATE["model"] = model
	        MODEL_STATE["tokenizer"] = tokenizer
	        # Flip the flag last so readers never see ready=True without a model.
	        MODEL_STATE["ready"] = True
	        print(f"[Model] Loaded in {time.time()-t0:.1f}s")
	    except Exception as ex:
	        # Some exceptions (e.g. a bare MemoryError()) stringify to "". An
	        # empty error is falsy, so readers that test MODEL_STATE["error"]
	        # would keep reporting "still loading"; fall back to repr().
	        message = str(ex) or repr(ex)
	        MODEL_STATE["error"] = message
	        print(f"[Model] Failed to load: {message}")


	# Start loading the model at import time on a daemon thread, so the loader
	# never keeps the process alive on shutdown.
	_model_loader = threading.Thread(target=load_model_background, daemon=True)
	_model_loader.start()


	# Held while the model generates so overlapping requests run one at a time
	# instead of competing for the CPU.
	_GENERATE_LOCK = threading.Lock()


	def _bad_request(message):
	    """Build a 400 response carrying *message* under the "error" key."""
	    return JSONResponse({"error": message}, status_code=400)


	def _run_generation(prompt_text, max_tokens, mode):
	    """Generate a completion synchronously; return ``(text, gen_time)``.

	    Blocking: call it from a worker thread, not from the event loop.
	    """
	    import torch

	    model = MODEL_STATE["model"]
	    tokenizer = MODEL_STATE["tokenizer"]
	    system = TRAINED_SYSTEM if mode == "trained" else UNTRAINED_SYSTEM

	    messages = [
	        {"role": "system", "content": system},
	        {"role": "user", "content": prompt_text},
	    ]
	    text_input = tokenizer.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True
	    )
	    inputs = tokenizer(text_input, return_tensors="pt")

	    with _GENERATE_LOCK:
	        # Start the clock after acquiring the lock so gen_time excludes
	        # time spent queued behind another request.
	        t0 = time.time()
	        with torch.no_grad():
	            outputs = model.generate(
	                **inputs,
	                max_new_tokens=max_tokens,
	                do_sample=True,
	                temperature=0.7,
	                top_p=0.9,
	                pad_token_id=tokenizer.eos_token_id,
	            )

	        # Drop the prompt tokens; only the newly generated tail is returned.
	        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
	        text = tokenizer.decode(new_tokens, skip_special_tokens=True)
	        gen_time = time.time() - t0
	    return text, gen_time


	@env_app.post("/generate")
	async def generate_endpoint(request: Request):
	    """Generate a model completion for the posted prompt.

	    Expects a JSON object with optional keys ``prompt`` (default ""),
	    ``max_tokens`` (default 512) and ``mode`` ("trained" selects
	    TRAINED_SYSTEM, anything else UNTRAINED_SYSTEM).

	    Responses:
	        200: ``{"text": ..., "gen_time": ...}``
	        400: body is not a JSON object, or ``max_tokens`` is not a
	             positive integer
	        500: model loading failed (``error`` holds the load error)
	        503: model is still loading
	    """
	    import asyncio
	    import functools

	    # Malformed JSON raises a ValueError subclass; report it as a client
	    # error rather than letting it surface as a 500.
	    try:
	        body = await request.json()
	    except ValueError:
	        return _bad_request("Request body must be valid JSON")
	    if not isinstance(body, dict):
	        return _bad_request("Request body must be a JSON object")

	    prompt_text = body.get("prompt", "")
	    max_tokens = body.get("max_tokens", 512)
	    mode = body.get("mode", "untrained")

	    if not MODEL_STATE["ready"]:
	        if MODEL_STATE["error"]:
	            return JSONResponse({"error": MODEL_STATE["error"]}, status_code=500)
	        return JSONResponse({"error": "Model still loading, please wait..."}, status_code=503)

	    # bool is a subclass of int, so exclude it explicitly.
	    if isinstance(max_tokens, bool) or not isinstance(max_tokens, int) or max_tokens < 1:
	        return _bad_request("max_tokens must be a positive integer")

	    # Generation is CPU-bound and can take a long time; run it in a worker
	    # thread so the event loop keeps serving other routes meanwhile.
	    loop = asyncio.get_running_loop()
	    text, gen_time = await loop.run_in_executor(
	        None, functools.partial(_run_generation, prompt_text, max_tokens, mode)
	    )
	    print(f"[Model] Generated {len(text)} chars in {gen_time:.1f}s (mode={mode})")
	    return JSONResponse({"text": text, "gen_time": gen_time})


	@env_app.get("/model_status")
	async def model_status():
	    """Report the model's readiness flag and any recorded load error."""
	    status = {field: MODEL_STATE[field] for field in ("ready", "error")}
	    return JSONResponse(status)


	@env_app.get("/", include_in_schema=False)
	async def root():
	    """Serve the static index page; the route is left out of the API schema."""
	    index_page = "/app/static/index.html"
	    return FileResponse(index_page)


	if __name__ == "__main__":
	    # When run as a script, serve the combined app on all interfaces, port 7860.
	    server_options = {"host": "0.0.0.0", "port": 7860}
	    uvicorn.run(env_app, **server_options)