Spaces:
Build error
Build error
| """Unified server for HF Spaces: environment + inference + dashboard on port 7860.""" | |
| import json | |
| import os | |
| import sys | |
| import time | |
| import threading | |
| sys.path.insert(0, "/app") | |
| from fastapi import FastAPI, Request | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.responses import FileResponse, JSONResponse | |
| import uvicorn | |
| from server.app import app as env_app | |
# CORS: accept requests from any origin, using any HTTP method and any header.
_cors_options = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
env_app.add_middleware(CORSMiddleware, **_cors_options)
# Shared holder for the language model. It starts empty; load_model_background
# fills in "model"/"tokenizer" and flips "ready", or records a message in "error".
MODEL_STATE = dict(model=None, tokenizer=None, ready=False, error=None)
# System prompt used when the request's mode is anything other than "trained".
# One entry per prompt line; the lines are joined with single newlines and the
# final line carries no trailing newline.
UNTRAINED_SYSTEM = "\n".join(
    [
        "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.",
        "You receive an incident ticket with hardware/model/backend context, log excerpts, and specialist opinions.",
        "Some specialists may be wrong. Output a JSON array of actions:",
        ' {"type":"inspect","target":"logs|config|snippet|metrics"}',
        ' {"type":"ask_specialist","specialist":"runtime|dispatch|kernel|loader"}',
        ' {"type":"apply_fix","fix":"<fix_name>"}',
        ' {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<why>"}',
    ]
)
# System prompt used when the request's mode is "trained".
# The prompt is laid out as paragraphs separated by one blank line; within a
# paragraph the lines are separated by single newlines. The example answer at
# the end is one JSON array on a single line, written here as adjacent literals.
TRAINED_SYSTEM = "\n\n".join(
    [
        "\n".join(
            [
                "You are Stack Doctor, an expert AI agent that diagnoses inference-stack incidents.",
                "You are methodical: first inspect logs and config, then query specialists to cross-verify (some lie), then apply a fix and submit.",
            ]
        ),
        "\n".join(
            [
                "Available actions (output as a JSON array):",
                ' {"type":"inspect","target":"logs"} or "config" or "snippet" or "metrics"',
                ' {"type":"ask_specialist","specialist":"runtime"} or "dispatch" or "kernel" or "loader"',
                ' {"type":"apply_fix","fix":"<name>"} -- available fixes: add_whitelist_entry, fix_comm_config, fix_quantization, fix_runtime_path, fix_weight_mapping, relax_arch_check, switch_backend, tune_memory_config, update_driver_config, update_model_config',
                ' {"type":"submit","root_cause":"<cause>","fix":"<fix>","justification":"<detailed reasoning>"}',
            ]
        ),
        "Available root causes: arch_guard, backend_selector, backend_whitelist, distributed_comm, driver_compat, memory_oom, model_config, quantization_error, runtime_loader, weight_layout",
        "IMPORTANT: Pick ONE target per inspect, ONE specialist per query. Investigate before submitting. Give a detailed justification.",
        "\n".join(
            [
                "Example output:",
                '[{"type":"inspect","target":"logs"},{"type":"inspect","target":"config"},{"type":"ask_specialist","specialist":"kernel"},'
                '{"type":"apply_fix","fix":"relax_arch_check"},'
                '{"type":"submit","root_cause":"arch_guard","fix":"relax_arch_check","justification":"Logs show architecture check failure for SM90. Config confirms guard enabled. Kernel specialist confirmed not a kernel issue."}]',
            ]
        ),
    ]
)
def load_model_background():
    """Load the Qwen2.5-1.5B-Instruct tokenizer and model and publish them in MODEL_STATE.

    Meant to be used as a thread target so the server can start before the
    model is available. On success the model and tokenizer are stored and
    ``MODEL_STATE["ready"]`` is set to True. Any exception is caught, its
    string form is stored under ``MODEL_STATE["error"]``, and nothing is
    re-raised.
    """
    try:
        print("[Model] Loading Qwen2.5-1.5B-Instruct (CPU)...")
        started = time.time()

        # Imported here rather than at module level, so the elapsed time
        # printed below also covers importing these libraries.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch

        checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
        load_options = {"torch_dtype": torch.float32, "device_map": "cpu"}

        loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        loaded_model = AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)

        MODEL_STATE["model"] = loaded_model
        MODEL_STATE["tokenizer"] = loaded_tokenizer
        # Set last: readers test "ready" before touching model/tokenizer.
        MODEL_STATE["ready"] = True

        elapsed = time.time() - started
        print(f"[Model] Loaded in {elapsed:.1f}s")
    except Exception as ex:
        MODEL_STATE["error"] = str(ex)
        print(f"[Model] Failed to load: {ex}")
# Start loading the model when this module is imported. The thread is a daemon,
# so it does not keep the process alive on shutdown.
_model_loader = threading.Thread(target=load_model_background, daemon=True)
_model_loader.start()
# Generation used to run inline in the async handler, which blocked the event
# loop and therefore allowed only one generation at a time. The work now runs
# on worker threads; this lock keeps the one-at-a-time behaviour.
_GENERATE_LOCK = threading.Lock()


def _generate_text(model, tokenizer, messages, max_tokens):
    """Run one blocking chat completion; return ``(text, seconds)``.

    Called on a worker thread. ``seconds`` covers generation and decoding
    only, not prompt templating/tokenization or time spent waiting for the lock.
    """
    import torch

    text_input = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(text_input, return_tensors="pt")
    with _GENERATE_LOCK:
        t0 = time.time()
        # torch.no_grad() is thread-local, so it has to be entered on the
        # thread that actually calls generate().
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,
            )
        # generate() returns prompt + completion; keep only the new tokens.
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        text = tokenizer.decode(new_tokens, skip_special_tokens=True)
        gen_time = time.time() - t0
    return text, gen_time


async def generate_endpoint(request: Request):
    """Generate a model reply for the prompt in the JSON request body.

    Request body (JSON object):
        prompt:     user message text (default "").
        max_tokens: maximum number of new tokens, a positive integer (default 512).
        mode:       "trained" selects TRAINED_SYSTEM; any other value selects
                    UNTRAINED_SYSTEM (default "untrained").

    Responses:
        200 {"text": ..., "gen_time": ...} on success.
        400 {"error": ...} when the body is not a JSON object or a field is invalid.
        500 {"error": ...} when model loading failed.
        503 {"error": ...} while the model is still loading.

    NOTE(review): no route decorator is visible on this handler in this file
    view -- confirm how it is registered on the app.
    """
    # Imported locally so this handler does not depend on a top-of-file change.
    from fastapi.concurrency import run_in_threadpool

    try:
        body = await request.json()
    except ValueError:
        # Covers json.JSONDecodeError and undecodable bytes.
        return JSONResponse({"error": "Request body must be valid JSON"}, status_code=400)
    if not isinstance(body, dict):
        return JSONResponse({"error": "Request body must be a JSON object"}, status_code=400)

    prompt_text = body.get("prompt", "")
    max_tokens = body.get("max_tokens", 512)
    mode = body.get("mode", "untrained")

    if not isinstance(prompt_text, str):
        return JSONResponse({"error": "prompt must be a string"}, status_code=400)
    # bool is a subclass of int; reject it explicitly.
    if isinstance(max_tokens, bool) or not isinstance(max_tokens, int) or max_tokens < 1:
        return JSONResponse({"error": "max_tokens must be a positive integer"}, status_code=400)

    if not MODEL_STATE["ready"]:
        if MODEL_STATE["error"]:
            return JSONResponse({"error": MODEL_STATE["error"]}, status_code=500)
        return JSONResponse({"error": "Model still loading, please wait..."}, status_code=503)

    model = MODEL_STATE["model"]
    tokenizer = MODEL_STATE["tokenizer"]
    system = TRAINED_SYSTEM if mode == "trained" else UNTRAINED_SYSTEM
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt_text},
    ]

    # Off the event loop: generation is CPU-bound and would otherwise stall
    # every other request served by this process until it finished.
    text, gen_time = await run_in_threadpool(
        _generate_text, model, tokenizer, messages, max_tokens
    )
    print(f"[Model] Generated {len(text)} chars in {gen_time:.1f}s (mode={mode})")
    return JSONResponse({"text": text, "gen_time": gen_time})
async def model_status():
    """Return the model's load status as JSON.

    The payload carries the current ``ready`` and ``error`` entries of
    MODEL_STATE, in that order.
    """
    snapshot = {field: MODEL_STATE[field] for field in ("ready", "error")}
    return JSONResponse(snapshot)
async def root():
    """Return the static index.html page (presumably the dashboard) as a file response."""
    index_page = "/app/static/index.html"
    return FileResponse(index_page)
if __name__ == "__main__":
    # Serve the app on every network interface at port 7860.
    _serve_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(env_app, **_serve_options)