GitHub Actions
Auto-deploy from GitHub Actions
cc6274a
"""Generate a ready-to-copy vllm serve command."""
from __future__ import annotations
from llm_cal.architecture.profile import ArchitectureProfile
from llm_cal.engine_compat.loader import EngineCompatEntry
def generate_vllm_command(
    model_id: str,
    profile: ArchitectureProfile,
    tensor_parallel_size: int,
    entry: EngineCompatEntry | None,
    max_model_len: int | None = None,
) -> str:
    """Build a backslash-continued `vllm serve ...` command for copy/paste.

    The command always carries the model id, `--tensor-parallel-size` and
    `--gpu-memory-utilization 0.9`. `--max-model-len` is emitted only when a
    truthy length is available: `max_model_len` if the caller passed one,
    otherwise `profile.position.max_position_embeddings` when
    `profile.position` is set. `--trust-remote-code` is emitted when
    `_needs_trust_remote_code(profile.model_type)` says so.

    When `entry` is provided, each of its `required_flags` followed by each
    of its `optional_flags` is rendered through `_render_flag` and appended
    unchanged, in that order.

    Returns:
        All segments joined with a space, a backslash and a newline, so the
        result reads as one shell command spread over several lines.
    """
    segments: list[str] = ["vllm serve " + model_id]
    segments.append(f" --tensor-parallel-size {tensor_parallel_size}")

    # An explicit caller value wins; the profile is consulted only as a fallback.
    context_len = max_model_len
    if context_len is None:
        position = profile.position
        if position is not None:
            context_len = position.max_position_embeddings
    # Falsy lengths (None or 0) mean "no limit known": omit the flag entirely.
    if context_len:
        segments.append(f" --max-model-len {context_len}")

    # Heuristic keyed on model_type; see _needs_trust_remote_code.
    if _needs_trust_remote_code(profile.model_type):
        segments.append(" --trust-remote-code")

    segments.append(" --gpu-memory-utilization 0.9")

    if entry is not None:
        # Required flags come first, then optional ones; both are passed through as-is.
        for group in (entry.required_flags, entry.optional_flags):
            for item in group:
                segments.append(" " + _render_flag(item.flag, item.value))

    return " \\\n".join(segments)
def _render_flag(flag: str, value: str | None) -> str:
if value is None:
return flag
return f"{flag} {value}"
def _needs_trust_remote_code(model_type: str) -> bool:
"""Models that ship custom modeling code in the repo."""
return model_type.startswith(("deepseek", "qwen2_moe", "qwen3_moe", "mixtral"))