import os

os.environ.setdefault('OMP_NUM_THREADS', '5')  # Suppress sklearn KMeans MKL memory leak warning on Windows

from download_weights import ensure_weights

import io
import time
import base64
import json
import asyncio

from fastapi import FastAPI, UploadFile, File, Form, HTTPException, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
import torch
import torchvision.transforms as standard_transforms
import numpy as np
from PIL import Image
import cv2
from sklearn.cluster import KMeans
from sqlmodel import Session

from models import build_model
from tracker import Tracker
from database import init_db, engine, FlightReport

app = FastAPI()

# Allow both local dev and the deployed Vercel frontend.
# The ALLOWED_ORIGINS env var can be set on HF Spaces to your exact Vercel URL.
_raw_origins = os.environ.get(
    "ALLOWED_ORIGINS",
    "http://localhost:5173,http://127.0.0.1:5173,http://localhost:4173,http://127.0.0.1:4173,"
    "http://localhost:3000,http://127.0.0.1:3000,https://praveendatascience-crowd-detection.hf.space"
)
ALLOWED_ORIGINS = [o.strip() for o in _raw_origins.split(",") if o.strip()]

app.add_middleware(
    CORSMiddleware,
    allow_origins=ALLOWED_ORIGINS,
    allow_origin_regex=r"https://.*\.vercel\.app",  # matches any Vercel deployment
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
TEMP_DIR = os.path.join(BASE_DIR, "temp_uploads")
os.makedirs(TEMP_DIR, exist_ok=True)


class Args:
    """Minimal argument container expected by build_model."""

    def __init__(self):
        self.backbone = 'vgg16_bn'
        self.row = 2
        self.line = 2


model = None
device = None
transform = None


@app.get("/")
async def health_check():
    """Health check; HuggingFace Spaces pings this to confirm the app is alive."""
    return {"status": "ok", "model_loaded": model is not None}


@app.on_event("startup")
async def startup_event():
    # Download weights from HuggingFace Hub if not present locally
    ensure_weights()
    init_db()
    global model, device, transform
    device_type = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if device_type.type == 'cuda':
        torch.backends.cudnn.benchmark = True
    args = Args()
    model_obj = build_model(args)
    model_obj.to(device_type)
    if device_type.type == 'cuda':
        model_obj.to(memory_format=torch.channels_last)
    weight_path = os.path.join(BASE_DIR, 'weights', 'SHTechA.pth')
    if os.path.exists(weight_path):
        checkpoint = torch.load(weight_path, map_location=device_type)
        model_obj.load_state_dict(checkpoint['model'])
    model_obj.eval()
    transform_obj = standard_transforms.Compose([
        standard_transforms.ToTensor(),
        standard_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    model = model_obj
    device = device_type
    transform = transform_obj


def score_aware_merge(predictions, radius, orig_width, orig_height):
    """Greedy NMS over point predictions: keep the highest-scoring point and drop
    any lower-scoring point within `radius` pixels of an already-kept one."""
    if not predictions:
        return []
    predictions = sorted(predictions, key=lambda item: item[2], reverse=True)
    final_points = []
    radius_sq = radius * radius
    for x, y, _ in predictions:
        if not (0 <= x < orig_width and 0 <= y < orig_height):
            continue
        duplicate = False
        for fx, fy in final_points:
            if (x - fx) ** 2 + (y - fy) ** 2 <= radius_sq:
                duplicate = True
                break
        if not duplicate:
            final_points.append([float(x), float(y)])
    return final_points


def round_to_stride(value, stride=128):
    """Round a dimension up to the nearest multiple of the model stride (minimum one stride)."""
    return max(stride, int(np.ceil(value / stride) * stride))
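
# Quick sanity check for the two helpers above (illustrative values, not part of
# the running app): round_to_stride pads a dimension up to the model's 128px
# stride, and score_aware_merge keeps only the highest-scoring point inside each
# `radius` neighbourhood.
#
#   round_to_stride(700)  ->  768  (ceil(700 / 128) * 128)
#   round_to_stride(100)  ->  128  (never below one stride)
#
#   preds = [[10.0, 10.0, 0.9], [12.0, 11.0, 0.5], [40.0, 40.0, 0.8]]
#   score_aware_merge(preds, radius=8.0, orig_width=100, orig_height=100)
#   ->  [[10.0, 10.0], [40.0, 40.0]]  (the 0.5 point sits within 8px of the 0.9 point)
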
def process_frame(img_raw, model, device, transform, threshold,
                  max_dim=3840, magnification=1.5, patch_size=512,
                  nms_radius=8.0, batch_size=8, patch_overlap=0.25,
                  inference_strategy="Auto", full_frame_max_dim=1800,
                  fencing_poly=None):
    orig_width, orig_height = img_raw.size

    # CPU optimizations to prevent >60s timeouts on HF Spaces free tier
    if device.type == 'cpu':
        max_dim = min(max_dim, 1280)
        magnification = min(magnification, 1.0)
        inference_strategy = "Single Pass"

    work_width = int(orig_width * magnification)
    work_height = int(orig_height * magnification)
    if max_dim is not None and (work_width > max_dim or work_height > max_dim):
        scale = max_dim / float(max(work_width, work_height))
        work_width = int(work_width * scale)
        work_height = int(work_height * scale)
        magnification = work_width / float(orig_width)

    # Pillow >= 9.1 exposes resampling filters under Image.Resampling
    resample_filter = (Image.Resampling.LANCZOS if hasattr(Image, 'Resampling')
                       else getattr(Image, 'ANTIALIAS', 1))
    img_magnified = img_raw.resize((work_width, work_height), resample_filter)

    use_single_pass = inference_strategy == "Single Pass" or (
        inference_strategy == "Auto" and max(work_width, work_height) <= full_frame_max_dim
    )

    final_points = []
    if use_single_pass:
        # Single pass: resize the whole frame to stride-aligned dimensions and infer once
        model_width = round_to_stride(work_width)
        model_height = round_to_stride(work_height)
        scale_x = model_width / float(orig_width)
        scale_y = model_height / float(orig_height)
        model_img = img_raw.resize((model_width, model_height), resample_filter)
        samples = transform(model_img).unsqueeze(0).to(device, non_blocking=True)
        if device.type == 'cuda':
            samples = samples.contiguous(memory_format=torch.channels_last)
        with torch.inference_mode():
            if device.type == 'cuda':
                with torch.cuda.amp.autocast():
                    outputs = model(samples)
            else:
                outputs = model(samples)
        scores = torch.nn.functional.softmax(outputs['pred_logits'].float(), -1)[:, :, 1][0]
        points = outputs['pred_points'][0].float()
        mask = scores > threshold
        selected_points = points[mask].detach().cpu().numpy()
        selected_scores = scores[mask].detach().cpu().numpy()
        predictions = []
        for point, score in zip(selected_points, selected_scores):
            predictions.append([point[0] / scale_x, point[1] / scale_y, float(score)])
        final_points = score_aware_merge(predictions, nms_radius, orig_width, orig_height)
    else:
        # Tiled pass: pad the magnified frame, slide overlapping patches, and batch them
        pad_border = 256
        new_width = ((work_width + (pad_border * 2) + patch_size - 1) // patch_size) * patch_size
        new_height = ((work_height + (pad_border * 2) + patch_size - 1) // patch_size) * patch_size
        img_padded = Image.new('RGB', (new_width, new_height), (0, 0, 0))
        img_padded.paste(img_magnified, (pad_border, pad_border))

        all_predictions = []
        patch_overlap = min(max(float(patch_overlap), 0.0), 0.75)
        stride = max(64, int(patch_size * (1.0 - patch_overlap)))

        patch_jobs = []
        for y in range(0, new_height - stride + 1, stride):
            for x in range(0, new_width - stride + 1, stride):
                if y + patch_size > new_height or x + patch_size > new_width:
                    continue
                patch = img_padded.crop((x, y, x + patch_size, y + patch_size))
                patch_jobs.append((x, y, patch))

        total_patches = len(patch_jobs)
        batch_size = max(1, int(batch_size))
        for start_idx in range(0, total_patches, batch_size):
            batch_jobs = patch_jobs[start_idx:start_idx + batch_size]
            patch_tensors = [transform(patch) for _, _, patch in batch_jobs]
            samples = torch.stack(patch_tensors, dim=0).to(device, non_blocking=True)
            if device.type == 'cuda':
                samples = samples.contiguous(memory_format=torch.channels_last)
            with torch.inference_mode():
                if device.type == 'cuda':
                    with torch.cuda.amp.autocast():
                        outputs = model(samples)
                else:
                    outputs = model(samples)
            outputs_scores = torch.nn.functional.softmax(outputs['pred_logits'].float(), -1)[:, :, 1]
            outputs_points = outputs['pred_points'].float()
            for batch_idx, (x, y, _) in enumerate(batch_jobs):
                mask = outputs_scores[batch_idx] > threshold
                points = outputs_points[batch_idx][mask].detach().cpu().numpy()
                scores = outputs_scores[batch_idx][mask].detach().cpu().numpy()
                if len(points) > 0:
                    # Map patch-local coordinates back to original-image space
                    points[:, 0] += (x - pad_border)
                    points[:, 1] += (y - pad_border)
                    points = points / float(magnification)
                    for point, score in zip(points, scores):
                        all_predictions.append([point[0], point[1], float(score)])
        final_points = score_aware_merge(all_predictions, nms_radius, orig_width, orig_height)

    # Smart Zone Fencing filter: keep only points inside the user-drawn polygon
    if fencing_poly and len(fencing_poly) > 2:
        poly_arr = np.array([[p['x'] * orig_width, p['y'] * orig_height] for p in fencing_poly],
                            dtype=np.int32)
        filtered_pts = []
        for pt in final_points:
            if cv2.pointPolygonTest(poly_arr, (pt[0], pt[1]), False) >= 0:
                filtered_pts.append(pt)
        final_points = filtered_pts

    return img_raw, len(final_points), final_points
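
# Minimal offline sketch of calling process_frame directly (illustrative only and
# never invoked by the app; assumes startup_event has already populated the
# module-level model/device/transform globals, and that `path` is a local test image).
def _debug_count_image(path, threshold=0.35):
    img = Image.open(path).convert('RGB')
    # Single Pass keeps the sketch fast and avoids the tiled branch entirely
    _, count, points = process_frame(
        img, model, device, transform, threshold,
        max_dim=1920, magnification=1.0, inference_strategy="Single Pass"
    )
    return count, points
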
def process_frame_with_oom_recovery(*args, batch_size=8, **kwargs):
    """Retry process_frame with a halved batch size whenever CUDA runs out of memory."""
    current_batch_size = max(1, int(batch_size))
    while current_batch_size >= 1:
        try:
            return process_frame(*args, batch_size=current_batch_size, **kwargs), current_batch_size
        except RuntimeError as exc:
            if "out of memory" not in str(exc).lower():
                raise
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            if current_batch_size == 1:
                raise
            current_batch_size = max(1, current_batch_size // 2)


def generate_colors(n):
    """Generate n visually distinct BGR colors by cycling through fixed hues."""
    colors = []
    base_hues = [30, 90, 150, 210, 270, 330]  # degrees on the full 0-360 color wheel
    for i in range(n):
        h = base_hues[i % len(base_hues)] // 2  # OpenCV 8-bit HSV uses hue range 0-179
        hsv = np.uint8([[[h, 255, 255]]])
        bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)[0][0]
        colors.append((int(bgr[0]), int(bgr[1]), int(bgr[2])))
    return colors


def draw_points(img, points, use_heatmap=False, use_clustering=False,
                use_motion_vectors=False, prev_points=None):
    img_bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    if use_heatmap:
        # Accumulate a Gaussian blob per detection, then blend an INFERNO colormap over the frame
        h, w = img_bgr.shape[:2]
        heatmap = np.zeros((h, w), dtype=np.float32)
        for p in points:
            px, py = int(p[0]), int(p[1])
            if 0 <= px < w and 0 <= py < h:
                r = 15
                x_min, x_max = max(0, px - r), min(w, px + r + 1)
                y_min, y_max = max(0, py - r), min(h, py + r + 1)
                y, x = np.ogrid[y_min:y_max, x_min:x_max]
                mask = np.exp(-((x - px) ** 2 + (y - py) ** 2) / (2 * 5 ** 2))
                heatmap[y_min:y_max, x_min:x_max] += mask
        heatmap = np.clip(heatmap * 100, 0, 255).astype(np.uint8)
        color_map = cv2.applyColorMap(heatmap, cv2.COLORMAP_INFERNO)
        mask = (heatmap > 10).astype(np.float32)[:, :, np.newaxis]
        img_bgr = (img_bgr * (1 - mask * 0.7) + color_map * (mask * 0.7)).astype(np.uint8)
    elif use_clustering and len(points) >= 3:
        num_clusters = min(len(points) // 10 + 1, 5)
        # Guard: KMeans requires n_clusters <= n_samples
        num_clusters = max(1, min(num_clusters, len(points)))
        if num_clusters > 1:
            pts_array = np.array([[p[0], p[1]] for p in points])
            try:
                kmeans = KMeans(n_clusters=num_clusters, n_init='auto', random_state=42).fit(pts_array)
                labels = kmeans.labels_
                colors = generate_colors(num_clusters)
                for i, p in enumerate(points):
                    cv2.circle(img_bgr, (int(p[0]), int(p[1])), 3, colors[labels[i]], -1)
                for c in range(num_clusters):
                    cluster_pts = pts_array[labels == c].astype(np.int32)
                    if len(cluster_pts) >= 3:
                        hull = cv2.convexHull(cluster_pts)
                        cv2.polylines(img_bgr, [hull], True, colors[c], 2)
            except Exception:
                # Fall back to plain dots if clustering fails for any reason
                for p in points:
                    cv2.circle(img_bgr, (int(p[0]), int(p[1])), 2, (184, 230, 0), -1)
        else:
            for p in points:
                cv2.circle(img_bgr, (int(p[0]), int(p[1])), 2, (184, 230, 0), -1)
    else:
        for p in points:
            cv2.circle(img_bgr, (int(p[0]), int(p[1])), 2, (184, 230, 0), -1)

    # Motion vectors: draw arrows from previous positions to current detections
    if use_motion_vectors and prev_points and len(prev_points) > 0 and len(points) > 0:
        cur_arr = np.array([[p[0], p[1]] for p in points], dtype=np.float32)
        prev_arr = np.array([[p[0], p[1]] for p in prev_points], dtype=np.float32)
        # Match each previous point to its nearest current neighbour
        for pp in prev_arr:
            dists = np.sum((cur_arr - pp) ** 2, axis=1)
            nearest_idx = int(np.argmin(dists))
            if dists[nearest_idx] < 2500:  # max 50px movement
                cp = cur_arr[nearest_idx]
                dx, dy = float(cp[0] - pp[0]), float(cp[1] - pp[1])
                if abs(dx) > 1 or abs(dy) > 1:  # only draw if the point actually moved
                    speed = (dx ** 2 + dy ** 2) ** 0.5
                    # Interpolate color from green (slow) to amber (fast)
                    t = min(speed / 30.0, 1.0)
                    color = (
                        int(11 * (1 - t) + 11 * t),
                        int(230 * (1 - t) + 158 * t),
                        int(184 * (1 - t) + 245 * t)
                    )
                    cv2.arrowedLine(
                        img_bgr,
                        (int(pp[0]), int(pp[1])),
                        (int(cp[0]), int(cp[1])),
                        color, 1, tipLength=0.3
                    )
    return img_bgr
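
# Illustrative rendering sketch (hypothetical helper, never called by the app):
# overlays detection points onto an image from disk and saves the result.
# `pts` uses the [[x, y], ...] format produced by process_frame / score_aware_merge.
def _debug_render(path_in, path_out, pts):
    img = Image.open(path_in).convert('RGB')
    out_bgr = draw_points(img, pts, use_clustering=True)
    cv2.imwrite(path_out, out_bgr)  # draw_points returns a BGR ndarray, which imwrite expects
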
@app.post("/api/upload-video")
async def upload_video(file: UploadFile = File(...)):
    # Sanitize the filename to remove spaces/special chars that break WebSocket URLs
    safe_name = "".join(c if c.isalnum() or c in '._-' else '_' for c in (file.filename or 'video'))
    file_id = f"vid_{int(time.time())}_{safe_name}"
    file_path = os.path.join(TEMP_DIR, file_id)
    # Stream-write in chunks to avoid loading the entire video into RAM
    try:
        with open(file_path, "wb") as out_f:
            while True:
                chunk = await file.read(1024 * 1024)  # 1 MB chunks
                if not chunk:
                    break
                out_f.write(chunk)
    except Exception as exc:
        if os.path.exists(file_path):
            os.remove(file_path)
        raise HTTPException(status_code=500, detail=f"Video upload failed: {exc}")
    return {"file_id": file_id, "size": os.path.getsize(file_path)}
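
# Example client call (hypothetical host/port; the response shape matches the
# handler above, with illustrative values):
#
#   curl -F "file=@crowd.mp4" http://localhost:8000/api/upload-video
#   ->  {"file_id": "vid_1700000000_crowd.mp4", "size": 10485760}
#
# The returned file_id is then passed as the path parameter of
# /api/stream-video/{file_id} below.
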
@app.websocket("/api/stream-video/{file_id}")
async def stream_video(websocket: WebSocket, file_id: str):
    await websocket.accept()
    file_path = os.path.join(TEMP_DIR, file_id)
    if not os.path.exists(file_path):
        await websocket.send_json({"status": "error", "message": "Video file not found on server. Please upload again."})
        await websocket.close()
        return
    # Guard: the model must be loaded before we can process frames
    if model is None:
        await websocket.send_json({"status": "error", "message": "AI model not loaded yet. Please wait and retry."})
        await websocket.close()
        return
    cap = None
    try:
        data = await websocket.receive_json()
        settings_payload = data.get("settings", {})
        confidence_threshold = float(settings_payload.get("confidenceThresh", 0.35))
        magnification = float(settings_payload.get("magnification", 1.5))
        nms_radius = float(settings_payload.get("nmsRadius", 9.0))
        use_heatmap = bool(settings_payload.get("useHeatmap", False))
        use_clustering = bool(settings_payload.get("useClustering", False))
        use_motion_vecs = bool(settings_payload.get("useMotionVecs", False))
        fencing_poly = settings_payload.get("fencingPolygon", [])
        frame_skip = max(1, int(settings_payload.get("frameSkip", 3)))
        patch_overlap = 0.25
        capacity_limit = int(settings_payload.get("capacityLimit", 150))

        cap = cv2.VideoCapture(file_path)
        if not cap.isOpened():
            await websocket.send_json({"status": "error", "message": "Cannot open video file. Format may not be supported by OpenCV."})
            return

        total_video_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 1
        tracker = Tracker(max_distance=50.0, max_age=5)
        frames_processed = 0
        total_unique = 0
        peak_crowd = 0
        total_anomalies = 0
        capacity_breached = False
        prev_raw_points = []

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frames_processed % frame_skip == 0:
                try:
                    img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    pil_img = Image.fromarray(img_rgb)
                    (_, count, raw_points), _ = process_frame_with_oom_recovery(
                        pil_img, model, device, transform, confidence_threshold,
                        max_dim=1920, magnification=magnification, nms_radius=nms_radius,
                        batch_size=4, patch_overlap=patch_overlap,
                        inference_strategy="Auto", fencing_poly=fencing_poly
                    )
                    if count > peak_crowd:
                        peak_crowd = count
                    if count > capacity_limit:
                        capacity_breached = True
                    img_bgr = draw_points(pil_img, raw_points, use_heatmap, use_clustering,
                                          use_motion_vecs, prev_raw_points)
                    prev_raw_points = raw_points[:]
                    active_tracks, cumulative_unique, anomaly = tracker.update(img_bgr, raw_points)
                    total_unique = cumulative_unique
                    if anomaly:
                        total_anomalies += 1
                    for t in active_tracks:
                        color = (11, 158, 245) if (anomaly and hasattr(t, 'velocity') and t.velocity > 35) else (0, 255, 255)
                        cv2.circle(img_bgr, (int(t.pt[0]), int(t.pt[1])), 4, color, -1)
                    _, buffer = cv2.imencode('.jpg', img_bgr, [cv2.IMWRITE_JPEG_QUALITY, 80])
                    encoded_img = base64.b64encode(buffer).decode('utf-8')
                    progress = round(frames_processed / total_video_frames * 100)
                    await websocket.send_json({
                        "status": "playing",
                        "frame": frames_processed,
                        "count": count,
                        "total_unique": total_unique,
                        "anomalyEvent": anomaly,
                        "progress": progress,
                        "imageB64": encoded_img
                    })
                except Exception as frame_err:
                    # Skip this frame and continue rather than crashing the whole stream
                    print(f"[Frame {frames_processed} error]: {frame_err}")
            frames_processed += 1
            await asyncio.sleep(0.001)

        # Release BEFORE any file operations so Windows unlocks the file handle
        cap.release()
        cap = None

        # Log the finished run to the database
        try:
            with Session(engine) as db_session:
                record = FlightReport(
                    filename=file_id,
                    max_capacity_breached=capacity_breached,
                    peak_crowd_count=peak_crowd,
                    duration_frames=frames_processed,
                    chaos_anomalies=total_anomalies
                )
                db_session.add(record)
                db_session.commit()
        except Exception as db_err:
            print(f"[DB error]: {db_err}")

        await websocket.send_json({"status": "done", "total_unique": total_unique})
    except WebSocketDisconnect:
        print("[WebSocket] client disconnected")
    except Exception as e:
        print(f"[Stream error]: {e}")
        try:
            await websocket.send_json({"status": "error", "message": str(e)})
        except Exception:
            pass
    finally:
        # Make sure cap is released before file deletion
        if cap is not None:
            cap.release()
        # Retry deletion: Windows may keep the handle briefly after cap.release()
        for _attempt in range(5):
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
                break
            except PermissionError:
                time.sleep(0.3)
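
# Minimal client sketch for the WebSocket protocol above (illustrative only, never
# called here; assumes the third-party `websockets` package and a local server).
# Protocol: the client sends {"settings": {...}} once, then receives JSON frames
# with status "playing" until a final {"status": "done"} or {"status": "error"}.
async def _debug_stream_client(file_id, uri_base="ws://localhost:8000"):
    import websockets  # deferred import; only needed for this debug sketch
    async with websockets.connect(f"{uri_base}/api/stream-video/{file_id}") as ws:
        await ws.send(json.dumps({"settings": {"confidenceThresh": 0.35, "frameSkip": 3}}))
        while True:
            msg = json.loads(await ws.recv())
            if msg["status"] in ("done", "error"):
                return msg
            print(f"frame {msg['frame']}: count={msg['count']} ({msg['progress']}%)")
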
@app.post("/api/process-image")
async def process_image_api(
    file: UploadFile = File(...),
    confidence_threshold: float = Form(0.35),
    magnification: float = Form(1.5),
    nms_radius: float = Form(9.0),
    use_heatmap: str = Form("false"),
    use_clustering: str = Form("false"),
    use_motion_vectors: str = Form("false"),
    fencing_polygon: str = Form("[]"),
    inference_batch_size: int = Form(8),
    patch_overlap: float = Form(0.25),
    max_resolution: int = Form(3840),
    inference_strategy: str = Form("Auto")
):
    try:
        fencing_poly = json.loads(fencing_polygon) if fencing_polygon else []
        contents = await file.read()
        image = Image.open(io.BytesIO(contents)).convert('RGB')
        start_time = time.perf_counter()
        (processed_img, count, points), used_batch_size = process_frame_with_oom_recovery(
            image, model, device, transform, confidence_threshold,
            max_dim=max_resolution, magnification=magnification, nms_radius=nms_radius,
            batch_size=inference_batch_size, patch_overlap=patch_overlap,
            inference_strategy=inference_strategy, fencing_poly=fencing_poly
        )
        elapsed = time.perf_counter() - start_time
        img_bgr = draw_points(
            image, points,
            use_heatmap.lower() == 'true',
            use_clustering.lower() == 'true',
            use_motion_vectors.lower() == 'true',
            None  # no prev_points for a single image
        )
        _, buffer = cv2.imencode('.jpg', img_bgr)
        encoded_img = base64.b64encode(buffer).decode('utf-8')

        # Save the scan to the database
        try:
            with Session(engine) as session:
                report = FlightReport(
                    filename=file.filename,
                    max_capacity_breached=False,  # capacity limits only apply to video streams
                    peak_crowd_count=count,
                    duration_frames=1,
                    chaos_anomalies=0
                )
                session.add(report)
                session.commit()
        except Exception as e:
            print(f"Failed to save to db: {e}")

        return {
            "count": count,
            "elapsed": elapsed,
            "usedBatchSize": used_batch_size,
            "imageB64": encoded_img
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=False)
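
# Example image request (hypothetical host/port; form fields mirror the Form(...)
# parameters of process_image_api above):
#
#   curl -F "file=@crowd.jpg" \
#        -F "confidence_threshold=0.35" \
#        -F "use_heatmap=true" \
#        http://localhost:8000/api/process-image
#
# The JSON response carries count, elapsed, usedBatchSize, and a base64 JPEG in imageB64.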