#!/usr/bin/env python3
"""
GAP-CLIP Evaluation Runner
===========================

Orchestrates all evaluation scripts, one per paper section.  Each evaluation
is independent and can be run in isolation via ``--steps``.

Usage
-----
Run everything::

    python evaluation/run_all_evaluations.py

Run specific sections::

    python evaluation/run_all_evaluations.py --steps sec51,sec52
    python evaluation/run_all_evaluations.py --steps annex92,annex93

Available steps
---------------
  sec51       §5.1  Colour model accuracy (Table 1)
  sec52       §5.2  Category model confusion matrix (Table 2)
  sec533      §5.3.3  NN classification accuracy (Table 3)
  sec536      §5.3.6  Embedding structure Tests A/B/C/D (Table 4)
  annex92     Annex 9.2  Pairwise colour similarity heatmaps
  annex93     Annex 9.3  t-SNE visualisations
  annex94     Annex 9.4  Fashion search demo

Author: Lea Attia Sarfati
"""

import argparse
import sys
import traceback
from datetime import datetime
from pathlib import Path

# `config` lives at the repo root and the secXX modules live next to this
# file, so both directories go to the front of the import path.  After this
# statement the evaluation directory is first and the repo root second.
sys.path[:0] = [str(Path(__file__).parent), str(Path(__file__).parent.parent)]

# Every known step name, in the order used when all steps are requested.
ALL_STEPS = [
    "sec51",
    "sec52",
    "sec533",
    "sec536",
    "annex92",
    "annex93",
    "annex94",
]


class ResourceCache:
    """Lazy-loading cache for shared models and raw datasets.

    Each property is loaded at most once and cached for reuse across
    evaluation sections.  This avoids re-downloading Kaggle data (~30s),
    re-loading Fashion-CLIP (~15s) and GAP-CLIP (~20s) multiple times.

    Project modules (``config``, ``utils.*``) are imported inside the
    properties, so constructing the cache does not import them.
    """

    def __init__(self, device=None):
        """Pick the compute device and initialise every cache slot to ``None``.

        Args:
            device: A device name (``str``) or an already-built device
                object.  Strings are converted with ``torch.device``; any
                other value is stored as-is.  When ``None``, the first
                available backend among CUDA, MPS and CPU is used.
        """
        import torch

        if device is None:
            # `torch.backends.mps` does not exist on every torch build, so
            # look it up defensively instead of assuming it is present.
            mps_backend = getattr(torch.backends, "mps", None)
            if torch.cuda.is_available():
                device = "cuda"
            elif mps_backend is not None and mps_backend.is_available():
                device = "mps"
            else:
                device = "cpu"
        self.device = torch.device(device) if isinstance(device, str) else device

        # One slot per cached resource; ``None`` means "not loaded yet".
        self._gap_clip = None
        self._fashion_clip = None
        self._color_model = None
        self._hierarchy_classes = None
        self._kaggle_raw_df = None
        self._local_raw_df = None

    @property
    def gap_clip(self):
        """(model, processor) for GAP-CLIP."""
        if self._gap_clip is None:
            from config import main_model_path
            from utils.model_loader import load_gap_clip
            print("[ResourceCache] Loading GAP-CLIP...")
            self._gap_clip = load_gap_clip(main_model_path, self.device)
        return self._gap_clip

    @property
    def fashion_clip(self):
        """(model, processor) for Fashion-CLIP baseline."""
        if self._fashion_clip is None:
            from utils.model_loader import load_baseline_fashion_clip
            print("[ResourceCache] Loading Fashion-CLIP baseline...")
            self._fashion_clip = load_baseline_fashion_clip(self.device)
        return self._fashion_clip

    @property
    def color_model(self):
        """ColorCLIP model instance."""
        if self._color_model is None:
            from config import color_model_path
            from utils.model_loader import load_color_model
            print("[ResourceCache] Loading ColorCLIP model...")
            # The loader returns a pair; only its first element is kept.
            self._color_model, _ = load_color_model(color_model_path, self.device)
        return self._color_model

    @property
    def hierarchy_classes(self):
        """List of hierarchy class names from the hierarchy model checkpoint.

        Falls back to an empty list when the checkpoint has no
        ``'hierarchy_classes'`` entry.
        """
        if self._hierarchy_classes is None:
            import torch
            from config import hierarchy_model_path
            print("[ResourceCache] Loading hierarchy classes...")
            # Only the class-name entry is read and the checkpoint is then
            # discarded, so map it to CPU rather than copying every stored
            # tensor onto the accelerator.
            checkpoint = torch.load(hierarchy_model_path, map_location="cpu")
            self._hierarchy_classes = checkpoint.get('hierarchy_classes', [])
            print(f"[ResourceCache] Found {len(self._hierarchy_classes)} hierarchy classes")
        return self._hierarchy_classes

    @property
    def kaggle_raw_df(self):
        """Raw Kaggle KAGL DataFrame (downloaded once from HuggingFace)."""
        if self._kaggle_raw_df is None:
            from utils.datasets import download_kaggle_raw_df
            print("[ResourceCache] Downloading Kaggle KAGL dataset...")
            self._kaggle_raw_df = download_kaggle_raw_df()
        return self._kaggle_raw_df

    @property
    def local_raw_df(self):
        """Raw local validation DataFrame (read once from CSV)."""
        if self._local_raw_df is None:
            import pandas as pd
            from config import local_dataset_path
            print("[ResourceCache] Loading local validation CSV...")
            self._local_raw_df = pd.read_csv(local_dataset_path)
            print(f"[ResourceCache] Local dataset: {len(self._local_raw_df)} rows")
        return self._local_raw_df


class EvaluationRunner:
    """Runs one or more evaluation sections and collects pass/fail status.

    Each ``run_<step>`` method implements one step.  ``run`` looks those
    methods up by name, executes them in order and keeps going after a
    failure so that one broken section does not hide the others.

    Attributes:
        output_dir: Results directory, created on construction.  A
            per-section sub-path is passed as ``directory`` to the sec51,
            sec52 and sec533 evaluators.
        timestamp: Construction time as ``YYYYMMDD_HHMMSS`` (shown in the banner).
        results: Maps each attempted step to ``"ok"``, ``"failed"`` or ``"skipped"``.
        cache: ``ResourceCache`` shared by the section runners.
    """

    def __init__(self, output_dir: str = "evaluation_results"):
        """Create the output directory and the shared resource cache.

        Args:
            output_dir: Directory to create (including parents) for results.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True, parents=True)
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.results: dict[str, str] = {}  # step -> "ok" | "failed" | "skipped"
        self.cache = ResourceCache()

    # ------------------------------------------------------------------
    # Individual section runners (lazy imports to allow partial execution)
    #
    # NOTE: the docstring of each run_* method is printed verbatim in the
    # step banner by `_run_step`, so keep them to a single short line.
    # ------------------------------------------------------------------

    def run_sec51(self):
        """§5.1 – Colour model accuracy (Table 1)."""
        from sec51_color_model_eval import ColorEvaluator
        baseline_model, baseline_processor = self.cache.fashion_clip
        evaluator = ColorEvaluator(
            device=self.cache.device,
            directory=str(self.output_dir / "sec51"),
            baseline_model=baseline_model,
            baseline_processor=baseline_processor,
            color_model=self.cache.color_model,
            kaggle_raw_df=self.cache.kaggle_raw_df,
            local_raw_df=self.cache.local_raw_df,
        )
        # The same sample cap is passed to all four evaluations.
        max_samples = 5000
        evaluator.evaluate_kaggle_marqo(max_samples=max_samples)
        evaluator.evaluate_local_validation(max_samples=max_samples)
        evaluator.evaluate_baseline_kaggle_marqo(max_samples=max_samples)
        evaluator.evaluate_baseline_local_validation(max_samples=max_samples)

    def run_sec52(self):
        """§5.2 – Category model confusion matrix (Table 2)."""
        from sec52_category_model_eval import CategoryModelEvaluator
        gap_model, gap_processor = self.cache.gap_clip
        baseline_model, baseline_processor = self.cache.fashion_clip
        evaluator = CategoryModelEvaluator(
            device=self.cache.device,
            directory=str(self.output_dir / "sec52"),
            gap_clip_model=gap_model,
            gap_clip_processor=gap_processor,
            baseline_model=baseline_model,
            baseline_processor=baseline_processor,
            hierarchy_classes=self.cache.hierarchy_classes,
            kaggle_raw_df=self.cache.kaggle_raw_df,
            local_raw_df=self.cache.local_raw_df,
        )
        evaluator.run_full_evaluation()

    def run_sec533(self):
        """§5.3.3 – Nearest-neighbour classification accuracy (Table 3)."""
        from sec533_clip_nn_accuracy import ColorHierarchyEvaluator
        gap_model, gap_processor = self.cache.gap_clip
        baseline_model, baseline_processor = self.cache.fashion_clip
        evaluator = ColorHierarchyEvaluator(
            device=self.cache.device,
            directory=str(self.output_dir / "sec533"),
            gap_clip_model=gap_model,
            gap_clip_processor=gap_processor,
            baseline_model=baseline_model,
            baseline_processor=baseline_processor,
            hierarchy_classes=self.cache.hierarchy_classes,
            kaggle_raw_df=self.cache.kaggle_raw_df,
            local_raw_df=self.cache.local_raw_df,
        )
        evaluator.run_full_evaluation(max_samples=10_000)

    def run_sec536(self):
        """§5.3.6 – Embedding structure Tests A/B/C/D."""
        from sec536_embedding_structure import main as sec536_main
        gap_model, gap_processor = self.cache.gap_clip
        baseline_model, baseline_processor = self.cache.fashion_clip
        sec536_main(
            selected_tests={"A", "B", "C", "D"},
            model=gap_model,
            processor=gap_processor,
            baseline_model=baseline_model,
            baseline_processor=baseline_processor,
        )

    def _run_script(self, filename: str) -> None:
        """Execute a script located next to this file as ``__main__``.

        Running it under the ``__main__`` name makes the script's own
        entry-point guard fire, as if it had been launched directly.

        Args:
            filename: File name of the script, relative to this file's directory.
        """
        import runpy
        runpy.run_path(
            str(Path(__file__).parent / filename),
            run_name="__main__",
        )

    def run_annex92(self):
        """Annex 9.2 – Pairwise colour similarity heatmaps."""
        # annex92 is a self-contained script; run its __main__ guard.
        self._run_script("annex92_color_heatmaps.py")

    def run_annex93(self):
        """Annex 9.3 – t-SNE visualisations."""
        self._run_script("annex93_tsne.py")

    def run_annex94(self):
        """Annex 9.4 – Fashion search demo."""
        self._run_script("annex94_search_demo.py")

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _run_step(self, step: str) -> bool:
        """Run a single step and record its outcome in ``self.results``.

        Args:
            step: Step name; hyphens are treated as underscores when looking
                up the matching ``run_<step>`` method.

        Returns:
            ``True`` when the step completed (or exited with a zero/empty
            status), ``False`` when it is unknown, raised, or exited with a
            non-zero status.
        """
        method = getattr(self, f"run_{step.replace('-', '_')}", None)
        if method is None:
            print(f"⚠️  Unknown step '{step}' – skipping.")
            self.results[step] = "skipped"
            return False

        print(f"\n{'='*70}")
        print(f"▶  Running {step}  ({method.__doc__ or ''})")
        print(f"{'='*70}")
        try:
            method()
        except SystemExit as exc:
            # A step may call sys.exit() (e.g. a script executed as
            # __main__).  SystemExit is not an Exception subclass, so without
            # this clause it would terminate the runner and skip the
            # remaining steps and the summary.
            if exc.code is not None and exc.code != 0:
                self.results[step] = "failed"
                print(f"❌  {step} FAILED: exited with status {exc.code!r}")
                return False
        except Exception:
            self.results[step] = "failed"
            print(f"❌  {step} FAILED:")
            traceback.print_exc()
            return False

        self.results[step] = "ok"
        print(f"✅  {step} completed successfully.")
        return True

    def run(self, steps: list[str]) -> bool:
        """Run ``steps`` in order and print a summary table.

        Args:
            steps: Step names to execute; unknown names are reported as skipped.

        Returns:
            ``False`` if at least one step failed, ``True`` otherwise
            (skipped steps do not count as failures).
        """
        print("=" * 70)
        print(f"🚀  GAP-CLIP Evaluation  ({self.timestamp})")
        print(f"    Steps: {', '.join(steps)}")
        print(f"    Output: {self.output_dir}")
        print("=" * 70)

        for step in steps:
            self._run_step(step)

        # Summary
        print(f"\n{'='*70}")
        print("📊  Summary")
        print(f"{'='*70}")
        all_ok = True
        for step in steps:
            status = self.results.get(step, "skipped")
            icon = {"ok": "✅", "failed": "❌", "skipped": "⚠️ "}.get(status, "?")
            print(f"  {icon}  {step:15s} {status}")
            if status == "failed":
                all_ok = False

        print("=" * 70)
        return all_ok


def main(argv=None):
    """Parse command-line options, run the requested steps and exit.

    Step names are matched case-insensitively, like the special value
    ``all``.  A ``--steps`` value that names no step at all is rejected
    with a usage error instead of silently succeeding.

    Args:
        argv: Argument strings to parse.  ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: Always.  Status 0 when no step failed, 1 when at least
            one did, 2 on a command-line usage error.
    """
    parser = argparse.ArgumentParser(
        description="Run GAP-CLIP evaluations.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="\n".join(
            [
                "Available steps:",
                "  sec51    §5.1  Colour model (Table 1)",
                "  sec52    §5.2  Category model (Table 2)",
                "  sec533   §5.3.3 NN accuracy (Table 3)",
                "  sec536   §5.3.6 Embedding structure tests A/B/C/D (Table 4)",
                "  annex92  Annex 9.2 Colour heatmaps",
                "  annex93  Annex 9.3 t-SNE",
                "  annex94  Annex 9.4 Search demo",
            ]
        ),
    )
    parser.add_argument(
        "--steps",
        type=str,
        default="all",
        help=(
            "Comma-separated list of steps to run, or 'all' to run everything "
            "(default: all).  Example: --steps sec51,sec52,sec536"
        ),
    )
    parser.add_argument(
        "--output",
        type=str,
        default="evaluation_results",
        help="Directory to save results (default: evaluation_results).",
    )
    args = parser.parse_args(argv)

    requested = args.steps.strip().lower()
    if requested == "all":
        # Copy so the module-level list is never handed out by reference.
        steps = list(ALL_STEPS)
    else:
        steps = [name.strip() for name in requested.split(",") if name.strip()]

    if not steps:
        # e.g. `--steps ""` or `--steps ","`: nothing would run, yet the
        # process would report success.
        parser.error("--steps must name at least one step (or 'all').")

    runner = EvaluationRunner(output_dir=args.output)
    success = runner.run(steps)
    sys.exit(0 if success else 1)


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()