Spaces:

bluemoonsoldout
/

llm-cal

Running

llm-cal / src /llm_cal /command_generator /vllm.py

GitHub Actions

Auto-deploy from GitHub Actions

cc6274a 28 days ago

1.84 kB

	"""Generate a ready-to-copy vllm serve command."""

	from __future__ import annotations

	import shlex

	from llm_cal.architecture.profile import ArchitectureProfile
	from llm_cal.engine_compat.loader import EngineCompatEntry


	def generate_vllm_command(
	    model_id: str,
	    profile: ArchitectureProfile,
	    tensor_parallel_size: int,
	    entry: EngineCompatEntry | None,
	    max_model_len: int | None = None,
	) -> str:
	    """Generate a multi-line `vllm serve ...` command string.

	    The model id is shell-quoted so that ids or local paths containing
	    spaces or shell metacharacters still paste as a single argument;
	    ordinary `org/name` ids are emitted unchanged.

	    Args:
	        model_id: Model identifier or path placed right after `vllm serve`.
	        profile: Architecture profile; `profile.position` supplies the
	            default max model length and `profile.model_type` drives the
	            trust-remote-code heuristic.
	        tensor_parallel_size: Value for `--tensor-parallel-size`.
	        entry: Optional engine-compat entry. When given, its
	            required_flags and then its optional_flags are appended
	            verbatim (no quoting is applied to them).
	        max_model_len: Explicit `--max-model-len`; when None the value is
	            taken from `profile.position.max_position_embeddings` if a
	            position section is present.

	    Returns:
	        The command as one string, one argument group per line, with each
	        line except the last ending in a shell line-continuation.
	    """
	    lines: list[str] = [
	        # Quote only the model id: it is free-form caller input, whereas
	        # flag values below are documented as being appended verbatim.
	        "vllm serve " + shlex.quote(model_id),
	        f" --tensor-parallel-size {tensor_parallel_size}",
	    ]

	    # Pick max-model-len from profile if caller didn't override.
	    effective_max = max_model_len
	    if effective_max is None and profile.position is not None:
	        effective_max = profile.position.max_position_embeddings
	    # Truthiness check: a missing (None) or zero length emits no flag.
	    if effective_max:
	        lines.append(f" --max-model-len {effective_max}")

	    # DeepSeek and friends need trust-remote-code; decided from model_type.
	    if _needs_trust_remote_code(profile.model_type):
	        lines.append(" --trust-remote-code")

	    lines.append(" --gpu-memory-utilization 0.9")

	    if entry is not None:
	        # Required flags first, then optional ones, preserving entry order.
	        lines.extend(
	            " " + _render_flag(item.flag, item.value)
	            for item in (*entry.required_flags, *entry.optional_flags)
	        )

	    return " \\\n".join(lines)


	def _render_flag(flag: str, value: str \| None) -> str:
	if value is None:
	return flag
	return f"{flag} {value}"


	def _needs_trust_remote_code(model_type: str) -> bool:
	"""Models that ship custom modeling code in the repo."""
	return model_type.startswith(("deepseek", "qwen2_moe", "qwen3_moe", "mixtral"))