#!/usr/bin/env python3
"""
CPU vs MPS 模式下语义分析耗时基准测试

测试 20/200/2000 token 单次语义分析时间，每种情况测 3 次。

用法（从项目根目录运行）：
  # CPU 模式
  FORCE_CPU=1 python scripts/bench_semantic_device.py

  # MPS 模式（Apple Silicon，不设 FORCE_CPU）
  python scripts/bench_semantic_device.py

  # 同时跑两种模式并汇总
  python scripts/bench_semantic_device.py --all
"""

import argparse
import json
import os
import subprocess
import sys
import time
from pathlib import Path

# 确保项目根在 path 中
PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

def _make_text_for_tokens(tokenizer, target_tokens: int) -> str:
    """生成约 target_tokens 个 token 的文本"""
    base = "人工智能正在改变我们的生活。机器学习、深度学习等技术在医疗、金融等领域广泛应用。大模型在自然语言处理、图像识别等方面表现突出。"
    text = base
    while True:
        ids = tokenizer.encode(text, add_special_tokens=False)
        if len(ids) >= target_tokens:
            break
        text += base
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) > target_tokens:
        truncated = tokenizer.decode(ids[:target_tokens])
        return truncated
    return text


def run_benchmark(repeats: int = 3, gradient_checkpointing: bool = True) -> dict:
    from backend.app_context import AppContext
    from backend.data_utils import resolve_data_dir
    from backend.device import DeviceManager
    from backend.model_manager import ensure_semantic_slot_ready
    from backend.semantic_analyzer import analyze_semantic
    from argparse import Namespace

    data_dir = resolve_data_dir(None)
    init_args = Namespace(
        model="default",
        semantic_model="qwen3-0.6b-instruct",
        logits_gradient_submode="topk_sum",
        logits_gradient_prob_weighted=False,
        gradient_checkpointing=gradient_checkpointing,
        address="0.0.0.0",
        port="5001",
        dir=None,
        no_cors=False,
        no_auto_load=False,
    )
    AppContext.init(init_args, data_dir)

    device = DeviceManager.get_device()
    device_name = DeviceManager.get_device_name(device)
    print(f"\n{'='*60}")
    print(f"设备: {device_name} ({device})")
    print("=" * 60)

    tokenizer, _, _ = ensure_semantic_slot_ready()
    target_counts = [500]
    results = {}

    for n_tokens in target_counts:
        text = _make_text_for_tokens(tokenizer, n_tokens)
        actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
        print(f"\n--- {n_tokens} tokens (实际: {actual_tokens}) ---")

        times = []
        for i in range(repeats):
            t0 = time.perf_counter()
            analyze_semantic("人工智能", text)
            elapsed = time.perf_counter() - t0
            times.append(elapsed)
            print(f"  第 {i+1} 次: {elapsed:.3f}s")

        avg = sum(times) / len(times)
        results[str(n_tokens)] = {
            "actual_tokens": actual_tokens,
            "times": [round(t, 4) for t in times],
            "avg": round(avg, 4),
            "min": round(min(times), 4),
            "max": round(max(times), 4),
        }
        print(f"  平均: {avg:.3f}s  最小: {min(times):.3f}s  最大: {max(times):.3f}s")

    return {
        "device": device_name,
        "device_type": device.type,
        "gradient_checkpointing": gradient_checkpointing,
        "results": results,
    }


def main():
    parser = argparse.ArgumentParser(description="CPU/MPS 语义分析耗时基准测试")
    parser.add_argument(
        "--repeats",
        type=int,
        default=3,
        help="每种 token 数重复次数",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="依次运行 CPU 和 MPS 模式并汇总",
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        default=None,
        help="结果输出 JSON 路径",
    )
    parser.add_argument(
        "--no-gradient-checkpointing",
        dest="gradient_checkpointing",
        action="store_false",
        help="关闭 GC（默认开启）",
    )
    parser.set_defaults(gradient_checkpointing=True)
    args = parser.parse_args()

    if args.all:
        import tempfile
        all_results = []
        for label, env in [("CPU", {"FORCE_CPU": "1"}), ("MPS", {})]:
            env_copy = os.environ.copy()
            env_copy.update(env)
            if label == "MPS":
                env_copy.pop("FORCE_CPU", None)
            print(f"\n\n{'#'*60}")
            print(f"# 运行 {label} 模式")
            print("#" * 60)
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
                out_path = f.name
            cmd = [sys.executable, __file__, "--repeats", str(args.repeats), "-o", out_path]
            if not args.gradient_checkpointing:
                cmd.append("--no-gradient-checkpointing")
            proc = subprocess.run(cmd, env=env_copy, cwd=PROJECT_ROOT)
            if proc.returncode != 0:
                print(f"❌ {label} 模式运行失败")
                sys.exit(1)
            data = json.loads(Path(out_path).read_text(encoding="utf-8"))
            os.unlink(out_path)
            all_results.append(data)

        print("\n\n" + "=" * 60)
        print("汇总")
        print("=" * 60)
        for r in all_results:
            print(f"\n{r['device']} ({r['device_type']}):")
            for k, v in r["results"].items():
                print(f"  {k} tokens: avg={v['avg']}s  min={v['min']}s  max={v['max']}s  times={v['times']}")
        if args.output:
            args.output.write_text(
                json.dumps({"modes": all_results}, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            print(f"\n✅ 汇总已写入 {args.output}")
        return

    result = run_benchmark(repeats=args.repeats, gradient_checkpointing=args.gradient_checkpointing)

    if args.output:
        args.output.write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8")
        print(f"\n✅ 结果已写入 {args.output}")

    return result


if __name__ == "__main__":
    main()