Spaces:

techfreakworm
/

ACE-Music-Studio

Running on Zero

App Files Files Community

ACE-Music-Studio / modes.py

techfreakworm

feat(lyrics): add qwen 2.5 7b lazy loader with mlx and transformers backends

9c07a74 unverified 2 days ago

raw

history blame contribute delete

8.76 kB

	"""Pure mode handlers — one function per generation mode.

	Each handler validates inputs, builds the ACE-Step kwargs for its mode, and
	hands off to `backend.dispatch(...)`. Backend ownership of @spaces.GPU and
	pipeline lifecycle keeps these handlers cheap to test.

	The ``lyrics()`` handler is the odd one out: it does NOT touch the ACE-Step
	backend at all. It calls ``lyrics_lm.generate_lyrics`` directly, since the
	Qwen 2.5 7B LM is its own lazy singleton and doesn't share the DiT / 5Hz
	pipeline lifecycle with the audio modes.
	"""

	from __future__ import annotations

	from typing import Any

	import lyrics_lm


	def _require(params: dict[str, Any], field: str) -> Any:
	v = params.get(field)
	if v is None or (isinstance(v, str) and not v.strip()):
	raise ValueError(f"Missing required field: {field}")
	return v


	def generate(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
	"""Text → song. Vocals + instruments in one stream."""
	prompt = _require(params, "prompt")
	lyrics = params.get("lyrics", "")
	duration_s = int(params.get("duration_s", 30))
	instrumental = bool(params.get("instrumental", False))

	return backend.dispatch(
	mode="generate",
	params={
	"prompt": prompt,
	"lyrics": lyrics,
	"duration_s": duration_s,
	"instrumental": instrumental,
	"seed": params.get("seed"),
	"loras": params.get("loras", []),
	"advanced": params.get("advanced", {}),
	"lm": params.get("lm", {}),
	"dcw": params.get("dcw", {}),
	},
	)


	def cover(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
	"""Audio-reference cover — reference audio + new prompt -> song in that style.

	Maps to ACE-Step's ``GenerationParams(task_type="cover")`` with
	``reference_audio`` set to the uploaded clip and ``audio_cover_strength``
	controlling how tightly the new song hugs the reference timbre/structure.
	"""
	ref_audio = _require(params, "ref_audio")
	prompt = params.get("prompt", "")
	lyrics = params.get("lyrics", "")
	duration_s = int(params.get("duration_s", 30))

	return backend.dispatch(
	mode="cover",
	params={
	"prompt": prompt,
	"ref_audio": ref_audio,
	"lyrics": lyrics,
	"duration_s": duration_s,
	"audio_cover_strength": float(params.get("audio_cover_strength", 0.93)),
	"cover_noise_strength": float(params.get("cover_noise_strength", 0.0)),
	"seed": params.get("seed"),
	"loras": params.get("loras", []),
	"advanced": params.get("advanced", {}),
	"lm": params.get("lm", {}),
	"dcw": params.get("dcw", {}),
	},
	)


	def extend(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
	"""Song continuation — seed audio + extension prompt -> extended song.

	Maps to ACE-Step's ``GenerationParams(task_type="repaint")`` with
	``src_audio`` set to the seed and ``repainting_start``/``repainting_end``
	pointing past the end of the seed so the model paints new audio after it.
	"""
	seed_audio = _require(params, "seed_audio")
	extra_prompt = params.get("extra_prompt", "")
	extra_duration_s = int(params.get("extra_duration_s", 60))

	return backend.dispatch(
	mode="extend",
	params={
	"seed_audio": seed_audio,
	"extra_prompt": extra_prompt,
	"extension_lyrics": params.get("extension_lyrics", ""),
	"extra_duration_s": extra_duration_s,
	"repaint_mode": params.get("repaint_mode", "balanced"),
	"repaint_strength": float(params.get("repaint_strength", 0.5)),
	"wav_crossfade_s": float(params.get("wav_crossfade_s", 2.0)),
	"latent_crossfade_frames": int(params.get("latent_crossfade_frames", 10)),
	"chunk_mask_mode": params.get("chunk_mask_mode", "auto"),
	"seed": params.get("seed"),
	"loras": params.get("loras", []),
	"advanced": params.get("advanced", {}),
	"lm": params.get("lm", {}),
	"dcw": params.get("dcw", {}),
	},
	)


	def edit(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
	"""Segment-level edit — repaint a region OR morph caption-to-caption.

	Two sub-modes:

	- ``"repaint"`` (default): paint over ``[segment_start_s, segment_end_s]``
	using ACE-Step's ``task_type="repaint"`` with the segment bounds wired
	into ``repainting_start`` / ``repainting_end``.
	- ``"flow_edit"``: caption-to-caption morph. The installed ACE-Step
	``GenerationParams`` dataclass has no native ``flow_edit_*`` fields, so
	flow-edit is implemented downstream as a ``task_type="repaint"`` pass
	with a lower ``audio_cover_strength`` to allow more style drift. The
	``flow_source_caption`` / ``flow_n_*`` knobs are carried through the
	internal params dict so the pipeline wrapper can use them if/when the
	upstream dataclass grows native support.
	"""
	source_audio = _require(params, "source_audio")
	sub_mode = params.get("sub_mode", "repaint")

	out_params: dict[str, Any] = {
	"source_audio": source_audio,
	"source_lyrics": params.get("source_lyrics", ""),
	"target_lyrics": params.get("target_lyrics", ""),
	"segment_start_s": float(params.get("segment_start_s", 0.0)),
	"segment_end_s": float(params.get("segment_end_s", 30.0)),
	"sub_mode": sub_mode,
	"seed": params.get("seed"),
	"loras": params.get("loras", []),
	"advanced": params.get("advanced", {}),
	"lm": params.get("lm", {}),
	"dcw": params.get("dcw", {}),
	}
	if sub_mode == "repaint":
	out_params.update(
	{
	"repaint_mode": params.get("repaint_mode", "balanced"),
	"repaint_strength": float(params.get("repaint_strength", 0.5)),
	"chunk_mask_mode": params.get("chunk_mask_mode", "auto"),
	"latent_crossfade_frames": int(params.get("latent_crossfade_frames", 10)),
	"wav_crossfade_s": float(params.get("wav_crossfade_s", 0.0)),
	}
	)
	elif sub_mode == "flow_edit":
	out_params.update(
	{
	"flow_source_caption": params.get("flow_source_caption", ""),
	"flow_n_min": float(params.get("flow_n_min", 0.0)),
	"flow_n_max": float(params.get("flow_n_max", 1.0)),
	"flow_n_avg": int(params.get("flow_n_avg", 1)),
	}
	)

	return backend.dispatch(mode="edit", params=out_params)


	def lyrics(backend, params: dict[str, Any]) -> tuple[str, dict[str, Any]]:
	"""Lyrics-only mode. Returns ``(drafted_text, metadata_dict)``.

	Does NOT touch the ACE-Step backend — Qwen 2.5 7B Instruct is owned
	by ``lyrics_lm`` as its own lazy singleton. The ``backend`` argument
	is kept in the signature for parity with the other mode handlers but
	is unused here.
	"""
	del backend # signature parity with generate/cover/extend/edit
	brief = _require(params, "brief")
	structure = params.get("structure", "intro, verse, chorus, verse, chorus, bridge, chorus, outro")
	language = params.get("language", "en")
	tone = params.get("tone", "")
	verse_lines = int(params.get("verse_lines", 6))
	chorus_lines = int(params.get("chorus_lines", 4))
	bridge_lines = int(params.get("bridge_lines", 2))
	rhyme = params.get("rhyme", "loose")
	temperature = float(params.get("temperature", 0.85))
	top_p = float(params.get("top_p", 0.9))
	top_k = int(params.get("top_k", 40))
	max_new_tokens = int(params.get("max_new_tokens", 600))
	seed = params.get("seed")

	text = lyrics_lm.generate_lyrics(
	brief=brief,
	structure=structure,
	language=language,
	tone=tone,
	verse_lines=verse_lines,
	chorus_lines=chorus_lines,
	bridge_lines=bridge_lines,
	rhyme=rhyme,
	temperature=temperature,
	top_p=top_p,
	top_k=top_k,
	max_new_tokens=max_new_tokens,
	seed=seed,
	)
	meta = {
	"mode": "lyrics",
	"model": lyrics_lm._DEFAULT_MAC_ID,
	"brief_first_line": brief.splitlines()[0] if brief else "",
	"structure": structure,
	"language": language,
	"tone": tone,
	"verse_lines": verse_lines,
	"chorus_lines": chorus_lines,
	"bridge_lines": bridge_lines,
	"rhyme": rhyme,
	"temperature": temperature,
	"top_p": top_p,
	"top_k": top_k,
	"max_new_tokens": max_new_tokens,
	"seed": seed,
	}
	return text, meta