#!/usr/bin/env python3
"""
Evaluate SCRFD model on WiderFace validation set.

Usage:
    python scripts/evaluate.py \\
        --model scrfd_34g \\
        --checkpoint checkpoints/scrfd_34g_best.pth \\
        --data-root data/wider_face \\
        --output-dir results/scrfd_34g

Output:
    - WiderFace Easy/Medium/Hard AP
    - Prediction files in WiderFace submission format
    - Speed benchmark results
"""
import os
import sys
import argparse
import time
import json
from pathlib import Path

import numpy as np
import cv2
import torch

# Make the repository root importable when running from scripts/.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from models.detector import build_detector
from evaluation.widerface_eval import WiderFaceEvaluator
from evaluation.speed_benchmark import SpeedBenchmark


def parse_args():
    parser = argparse.ArgumentParser(description='Evaluate SCRFD')
    parser.add_argument('--model', type=str, default='scrfd_34g')
    parser.add_argument('--checkpoint', type=str, required=True)
    parser.add_argument('--data-root', type=str, default='data/wider_face')
    parser.add_argument('--output-dir', type=str, default='results')
    parser.add_argument('--input-size', type=int, default=640)
    parser.add_argument('--score-thresh', type=float, default=0.02)
    parser.add_argument('--nms-thresh', type=float, default=0.4)
    parser.add_argument('--device', type=str, default='cuda')
    # The previous store_true/default=True combination made this flag a no-op.
    # BooleanOptionalAction (Python 3.9+) keeps the benchmark on by default
    # while allowing it to be disabled with --no-benchmark.
    parser.add_argument('--benchmark', action=argparse.BooleanOptionalAction,
                        default=True,
                        help='Run the speed benchmark (disable with --no-benchmark)')
    parser.add_argument('--multi-scale', action='store_true',
                        help='Multi-scale testing (slower, higher AP)')
    parser.add_argument('--scales', nargs='+', type=int,
                        default=[500, 800, 1100, 1400, 1700],
                        help='Scales for multi-scale testing')
    return parser.parse_args()


@torch.no_grad()
def evaluate_single_scale(model, evaluator, data_root, input_size, device, score_thresh):
    """Run single-scale evaluation over WIDER_val and feed predictions to the evaluator."""
    img_dir = os.path.join(data_root, 'WIDER_val', 'images')
    mean = np.array([104.0, 117.0, 123.0], dtype=np.float32)

    total_time = 0.0
    num_images = 0

    for event in sorted(os.listdir(img_dir)):
        event_dir = os.path.join(img_dir, event)
        if not os.path.isdir(event_dir):
            continue
        for img_name in sorted(os.listdir(event_dir)):
            if not img_name.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue
            img_path = os.path.join(event_dir, img_name)
            img = cv2.imread(img_path)
            if img is None:
                continue
            h, w = img.shape[:2]
            filename = f'{event}/{img_name}'

            # Preprocess: resize the long side to input_size, pad to a square,
            # subtract the BGR mean, and convert HWC -> CHW.
            scale = input_size / max(h, w)
            new_h, new_w = int(h * scale), int(w * scale)
            resized = cv2.resize(img, (new_w, new_h))
            padded = np.zeros((input_size, input_size, 3), dtype=np.float32)
            padded[:new_h, :new_w] = resized
            padded = (padded - mean).transpose(2, 0, 1)
            tensor = torch.from_numpy(padded).unsqueeze(0).float().to(device)

            # Inference
            t0 = time.time()
            results = model(tensor)
            total_time += time.time() - t0
            num_images += 1

            # Post-process
            r = results[0]
            boxes = r['boxes'].cpu().numpy()
            scores = r['scores'].cpu().numpy()

            # Rescale boxes back to the original image resolution
            if len(boxes) > 0:
                boxes /= scale

            mask = scores >= score_thresh
            boxes = boxes[mask]
            scores = scores[mask]

            evaluator.add_prediction(filename, boxes, scores)

            if num_images % 200 == 0:
                fps = num_images / max(total_time, 1e-6)
                print(f"  Processed {num_images} images ({fps:.1f} FPS)")

    return total_time, num_images
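

# --- Hedged sketch: multi-scale testing --------------------------------------
# parse_args() exposes --multi-scale and --scales, but only single-scale
# evaluation is implemented above. The helper below is a minimal sketch of how
# those flags could be wired up: run the detector once per scale, map boxes
# back to original-image coordinates, and merge the per-scale detections with
# NMS. Assumptions not confirmed by this repo: torchvision is installed for the
# final NMS, and the detector accepts inputs padded to a multiple of 32. It is
# not called from main(); adapt it to the actual model interface before use.
@torch.no_grad()
def detect_multi_scale(model, img, scales, device, score_thresh, nms_thresh):
    """Detect faces in one BGR image at several scales and merge the results."""
    from torchvision.ops import nms  # assumption: torchvision is available

    mean = np.array([104.0, 117.0, 123.0], dtype=np.float32)
    h, w = img.shape[:2]
    all_boxes, all_scores = [], []

    for target in scales:
        scale = target / max(h, w)
        new_h, new_w = int(h * scale), int(w * scale)
        resized = cv2.resize(img, (new_w, new_h))

        # Pad each side up to a multiple of 32 so the FPN strides divide evenly.
        pad_h = int(np.ceil(new_h / 32) * 32)
        pad_w = int(np.ceil(new_w / 32) * 32)
        padded = np.zeros((pad_h, pad_w, 3), dtype=np.float32)
        padded[:new_h, :new_w] = resized
        padded = (padded - mean).transpose(2, 0, 1)
        tensor = torch.from_numpy(padded).unsqueeze(0).float().to(device)

        r = model(tensor)[0]
        boxes = r['boxes'].cpu().numpy() / scale  # back to original coordinates
        scores = r['scores'].cpu().numpy()
        keep = scores >= score_thresh
        all_boxes.append(boxes[keep])
        all_scores.append(scores[keep])

    boxes = np.concatenate(all_boxes, axis=0)
    scores = np.concatenate(all_scores, axis=0)
    if len(boxes) == 0:
        return boxes, scores

    # Merge detections collected across scales with a final NMS pass.
    keep = nms(torch.from_numpy(boxes).float(),
               torch.from_numpy(scores).float(),
               nms_thresh).numpy()
    return boxes[keep], scores[keep]
# ------------------------------------------------------------------------------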


def main():
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # Load model
    print(f"Loading {args.model} from {args.checkpoint}")
    model = build_detector(
        args.model,
        score_threshold=args.score_thresh,
        nms_threshold=args.nms_thresh,
    ).to(args.device)

    checkpoint = torch.load(args.checkpoint, map_location='cpu')
    state_dict = checkpoint.get('model_state_dict', checkpoint)
    model.load_state_dict(state_dict, strict=False)
    model.eval()

    num_params = sum(p.numel() for p in model.parameters()) / 1e6
    print(f"  Parameters: {num_params:.2f}M")

    # WiderFace evaluation
    print("Running WiderFace evaluation...")
    evaluator = WiderFaceEvaluator(
        gt_dir=os.path.join(args.data_root, 'wider_face_split')
    )
    total_time, num_images = evaluate_single_scale(
        model, evaluator, args.data_root, args.input_size,
        args.device, args.score_thresh
    )

    # Results
    results = evaluator.evaluate()
    report = evaluator.generate_report()
    print(report)

    # Save predictions
    evaluator.save_predictions(os.path.join(args.output_dir, 'predictions'))

    # Speed benchmark
    if args.benchmark:
        print("\nRunning speed benchmark...")
        bench = SpeedBenchmark(device=args.device)
        for size in [320, 480, 640, 960]:
            bench.benchmark_model(model, args.model, input_size=size)
        bench.print_results()

        # Save markdown table
        with open(os.path.join(args.output_dir, 'speed_benchmark.md'), 'w') as f:
            f.write(bench.to_markdown())

    # Save results
    results['num_images'] = num_images
    results['total_time'] = total_time
    results['avg_fps'] = num_images / max(total_time, 1e-6)
    results['model'] = args.model
    results['input_size'] = args.input_size

    with open(os.path.join(args.output_dir, 'results.json'), 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nResults saved to {args.output_dir}/")


if __name__ == '__main__':
    main()