"""Generate a ready-to-copy vllm serve command."""

from __future__ import annotations

from llm_cal.architecture.profile import ArchitectureProfile
from llm_cal.engine_compat.loader import EngineCompatEntry


def generate_vllm_command(
    model_id: str,
    profile: ArchitectureProfile,
    tensor_parallel_size: int,
    entry: EngineCompatEntry | None,
    max_model_len: int | None = None,
) -> str:
    """Generate a multi-line `vllm serve ...` command string.

    If `entry` is given, appends required_flags and optional_flags verbatim.
    """
    lines: list[str] = [
        "vllm serve " + model_id,
        f"  --tensor-parallel-size {tensor_parallel_size}",
    ]

    # Pick max-model-len from profile if caller didn't override.
    effective_max = max_model_len
    if effective_max is None and profile.position is not None:
        effective_max = profile.position.max_position_embeddings
    if effective_max:
        lines.append(f"  --max-model-len {effective_max}")

    # DeepSeek and friends need trust-remote-code. Heuristic: non-trivial model_type.
    if _needs_trust_remote_code(profile.model_type):
        lines.append("  --trust-remote-code")

    lines.append("  --gpu-memory-utilization 0.9")

    if entry is not None:
        for flag in entry.required_flags:
            lines.append("  " + _render_flag(flag.flag, flag.value))
        for flag in entry.optional_flags:
            lines.append("  " + _render_flag(flag.flag, flag.value))

    return " \\\n".join(lines)


def _render_flag(flag: str, value: str | None) -> str:
    if value is None:
        return flag
    return f"{flag} {value}"


def _needs_trust_remote_code(model_type: str) -> bool:
    """Models that ship custom modeling code in the repo."""
    return model_type.startswith(("deepseek", "qwen2_moe", "qwen3_moe", "mixtral"))