Spaces:
Running on Zero
Running on Zero
LM audio understanding & sample gen
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- acestep/constants.py +1 -0
- acestep/constrained_logits_processor.py +250 -465
- acestep/gradio_ui.py +318 -32
- acestep/llm_inference.py +446 -29
- examples/text2music/example_01.json +7 -2
- examples/text2music/example_02.json +7 -2
- examples/text2music/example_03.json +7 -2
- examples/text2music/example_04.json +7 -2
- examples/text2music/example_05.json +7 -2
- examples/text2music/example_06.json +7 -2
- examples/text2music/example_07.json +7 -2
- examples/text2music/example_08.json +7 -2
- examples/text2music/example_09.json +7 -2
- examples/text2music/example_10.json +7 -2
- examples/text2music/example_11.json +7 -2
- examples/text2music/example_12.json +7 -2
- examples/text2music/example_13.json +7 -2
- examples/text2music/example_14.json +7 -2
- examples/text2music/example_15.json +7 -2
- examples/text2music/example_16.json +7 -2
- examples/text2music/example_17.json +7 -2
- examples/text2music/example_18.json +7 -2
- examples/text2music/example_19.json +7 -2
- examples/text2music/example_20.json +7 -2
- examples/text2music/example_21.json +7 -2
- examples/text2music/example_22.json +7 -2
- examples/text2music/example_23.json +7 -2
- examples/text2music/example_24.json +7 -2
- examples/text2music/example_25.json +7 -2
- examples/text2music/example_26.json +7 -2
- examples/text2music/example_27.json +7 -2
- examples/text2music/example_28.json +7 -2
- examples/text2music/example_29.json +7 -2
- examples/text2music/example_30.json +7 -2
- examples/text2music/example_31.json +7 -2
- examples/text2music/example_32.json +7 -2
- examples/text2music/example_33.json +7 -2
- examples/text2music/example_34.json +7 -2
- examples/text2music/example_35.json +7 -2
- examples/text2music/example_36.json +7 -2
- examples/text2music/example_37.json +7 -2
- examples/text2music/example_38.json +10 -0
- examples/text2music/example_39.json +10 -0
- examples/text2music/example_40.json +10 -0
- examples/text2music/example_41.json +10 -0
- examples/text2music/example_42.json +10 -0
- examples/text2music/example_43.json +10 -0
- examples/text2music/example_44.json +10 -0
- examples/text2music/example_45.json +10 -0
- examples/text2music/example_46.json +10 -0
acestep/constants.py
CHANGED
|
@@ -69,6 +69,7 @@ TASK_TYPES_BASE = ["text2music", "repaint", "cover", "extract", "lego", "complet
|
|
| 69 |
# Default instructions
|
| 70 |
DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
|
| 71 |
DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
|
|
|
|
| 72 |
|
| 73 |
# Instruction templates for each task type
|
| 74 |
# Note: Some instructions use placeholders like {TRACK_NAME} or {TRACK_CLASSES}
|
|
|
|
| 69 |
# Default instructions
|
| 70 |
DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
|
| 71 |
DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
|
| 72 |
+
DEFAULT_LM_UNDERSTAND_INSTRUCTION = "Understand the given musical conditions and describe the audio semantics accordingly:"
|
| 73 |
|
| 74 |
# Instruction templates for each task type
|
| 75 |
# Note: Some instructions use placeholders like {TRACK_NAME} or {TRACK_CLASSES}
|
acestep/constrained_logits_processor.py
CHANGED
|
@@ -35,9 +35,6 @@ class FSMState(Enum):
|
|
| 35 |
DURATION_NAME = auto() # Generating "duration: "
|
| 36 |
DURATION_VALUE = auto() # Generating numeric value 10-600
|
| 37 |
NEWLINE_AFTER_DURATION = auto()
|
| 38 |
-
GENRES_NAME = auto() # Generating "genres: "
|
| 39 |
-
GENRES_VALUE = auto() # Generating any non-empty string
|
| 40 |
-
NEWLINE_AFTER_GENRES = auto()
|
| 41 |
KEYSCALE_NAME = auto() # Generating "keyscale: "
|
| 42 |
KEYSCALE_VALUE = auto() # Generating keyscale pattern
|
| 43 |
NEWLINE_AFTER_KEYSCALE = auto()
|
|
@@ -77,35 +74,28 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 77 |
tokenizer: AutoTokenizer,
|
| 78 |
enabled: bool = True,
|
| 79 |
debug: bool = False,
|
| 80 |
-
|
| 81 |
-
skip_genres: bool = True,
|
| 82 |
):
|
| 83 |
"""
|
| 84 |
Initialize the constrained logits processor.
|
| 85 |
|
| 86 |
This processor should be initialized once when loading the LLM and reused
|
| 87 |
-
for all generations.
|
| 88 |
-
the caption-based genre filtering.
|
| 89 |
-
|
| 90 |
Args:
|
| 91 |
tokenizer: The tokenizer to use for encoding/decoding
|
| 92 |
enabled: Whether to enable constrained decoding
|
| 93 |
debug: Whether to print debug information
|
| 94 |
-
genres_vocab_path: Path to genres vocabulary file (one genre per line)
|
| 95 |
-
If None, defaults to "acestep/genres_vocab.txt"
|
| 96 |
-
skip_genres: Whether to skip genres generation in metadata (default True)
|
| 97 |
"""
|
| 98 |
self.tokenizer = tokenizer
|
| 99 |
self.enabled = enabled
|
| 100 |
self.debug = debug
|
| 101 |
-
self.skip_genres = skip_genres
|
| 102 |
self.skip_caption = False # Set to True to skip caption field generation
|
| 103 |
self.skip_language = False # Set to True to skip language field generation
|
| 104 |
self.caption: Optional[str] = None # Set via update_caption() before each generation
|
| 105 |
|
| 106 |
# User-provided metadata fields (optional)
|
| 107 |
# If provided, these fields will be used directly instead of generating
|
| 108 |
-
# Format: {"bpm": "120", "caption": "...", "duration": "234", "keyscale": "G major", "language": "en", "timesignature": "4"
|
| 109 |
self.user_provided_metadata: Dict[str, Optional[str]] = {
|
| 110 |
"bpm": None,
|
| 111 |
"caption": None,
|
|
@@ -113,7 +103,6 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 113 |
"keyscale": None,
|
| 114 |
"language": None,
|
| 115 |
"timesignature": None,
|
| 116 |
-
"genres": None,
|
| 117 |
}
|
| 118 |
|
| 119 |
# Temperature settings for different generation phases (set per-generation)
|
|
@@ -131,6 +120,10 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 131 |
# Stop at reasoning flag - if True, stop generation after </think> tag
|
| 132 |
self.stop_at_reasoning: bool = False
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
# Current state
|
| 135 |
self.state = FSMState.THINK_TAG
|
| 136 |
self.position_in_state = 0 # Position within current state's fixed string
|
|
@@ -140,6 +133,8 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 140 |
# Caption generation state tracking
|
| 141 |
self.caption_after_newline = False # Track if we're right after a newline in caption
|
| 142 |
self.caption_token_count = 0 # Track token count for caption (max 512)
|
|
|
|
|
|
|
| 143 |
|
| 144 |
# Token queue for user-provided fields (injected directly without generation)
|
| 145 |
self.user_field_token_queue: List[int] = []
|
|
@@ -147,16 +142,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 147 |
|
| 148 |
# Pre-compute token IDs for efficiency
|
| 149 |
self._precompute_tokens()
|
| 150 |
-
|
| 151 |
-
# Genres vocabulary for constrained decoding
|
| 152 |
-
self.genres_vocab_path = genres_vocab_path or os.path.join(
|
| 153 |
-
os.path.dirname(os.path.abspath(__file__)), "genres_vocab.txt"
|
| 154 |
-
)
|
| 155 |
-
self.genres_vocab: List[str] = [] # Full vocab
|
| 156 |
-
self.genres_vocab_mtime: float = 0.0
|
| 157 |
-
self.genres_trie: Dict = {} # Trie for full vocab (fallback)
|
| 158 |
-
self.caption_genres_trie: Dict = {} # Trie for caption-matched genres (priority)
|
| 159 |
-
self.caption_matched_genres: List[str] = [] # Genres matched from caption
|
| 160 |
self._char_to_tokens: Dict[str, set] = {} # Precomputed char -> token IDs mapping
|
| 161 |
|
| 162 |
# Precompute token mappings once (O(vocab_size), runs once at init)
|
|
@@ -199,11 +185,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 199 |
|
| 200 |
# Build language prefix tree (similar to keyscale but for language codes)
|
| 201 |
self.language_prefix_tree = self._build_language_prefix_tree()
|
| 202 |
-
|
| 203 |
-
self._load_genres_vocab()
|
| 204 |
-
|
| 205 |
-
# Note: Caption-based genre filtering is initialized via update_caption() before each generation
|
| 206 |
-
|
| 207 |
# Fixed strings for each state
|
| 208 |
# IMPORTANT: Do NOT include trailing space after colon - tokenizer will handle spacing
|
| 209 |
# All matching should be done at token level, not string level
|
|
@@ -214,14 +196,13 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 214 |
FSMState.BPM_NAME: "bpm:",
|
| 215 |
FSMState.CAPTION_NAME: "caption:",
|
| 216 |
FSMState.DURATION_NAME: "duration:",
|
| 217 |
-
FSMState.GENRES_NAME: "genres:",
|
| 218 |
FSMState.KEYSCALE_NAME: "keyscale:",
|
| 219 |
FSMState.LANGUAGE_NAME: "language:",
|
| 220 |
FSMState.TIMESIG_NAME: "timesignature:",
|
| 221 |
FSMState.THINK_END_TAG: "</think>",
|
| 222 |
}
|
| 223 |
|
| 224 |
-
# State transitions
|
| 225 |
self._build_state_transitions()
|
| 226 |
|
| 227 |
def _get_next_field_state(self, current_field: str) -> Optional[FSMState]:
|
|
@@ -230,19 +211,17 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 230 |
even if the field is user-provided (we still need to generate the field name).
|
| 231 |
|
| 232 |
Args:
|
| 233 |
-
current_field: Current field name ("bpm", "caption", "duration", "
|
| 234 |
|
| 235 |
Returns:
|
| 236 |
Next FSMState (NAME state of next field), or THINK_END_TAG if no more fields
|
| 237 |
"""
|
| 238 |
# New field order: bpm -> caption -> duration -> keyscale -> language -> timesignature
|
| 239 |
-
|
| 240 |
-
field_order = ["bpm", "caption", "duration", "genres", "keyscale", "language", "timesignature"]
|
| 241 |
field_to_state = {
|
| 242 |
"bpm": FSMState.BPM_NAME,
|
| 243 |
"caption": FSMState.CAPTION_NAME,
|
| 244 |
"duration": FSMState.DURATION_NAME,
|
| 245 |
-
"genres": FSMState.GENRES_NAME,
|
| 246 |
"keyscale": FSMState.KEYSCALE_NAME,
|
| 247 |
"language": FSMState.LANGUAGE_NAME,
|
| 248 |
"timesignature": FSMState.TIMESIG_NAME,
|
|
@@ -256,10 +235,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 256 |
# Find next field in order
|
| 257 |
for i in range(current_idx + 1, len(field_order)):
|
| 258 |
field = field_order[i]
|
| 259 |
-
|
| 260 |
-
# Skip fields based on flags
|
| 261 |
-
if field == "genres" and self.skip_genres:
|
| 262 |
-
continue
|
| 263 |
if field == "caption" and self.skip_caption:
|
| 264 |
continue
|
| 265 |
if field == "language" and self.skip_language:
|
|
@@ -272,7 +248,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 272 |
return FSMState.THINK_END_TAG
|
| 273 |
|
| 274 |
def _build_state_transitions(self):
|
| 275 |
-
"""Build state transition map based on
|
| 276 |
self.next_state = {
|
| 277 |
FSMState.THINK_TAG: FSMState.NEWLINE_AFTER_THINK,
|
| 278 |
FSMState.NEWLINE_AFTER_THINK: FSMState.BPM_NAME, # Always start with BPM
|
|
@@ -281,7 +257,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 281 |
}
|
| 282 |
|
| 283 |
# Build transitions for all fields (even if user-provided, we still need to generate field name)
|
| 284 |
-
# Field order: bpm -> caption -> duration ->
|
| 285 |
|
| 286 |
# BPM field: NAME -> VALUE -> next field (caption or duration)
|
| 287 |
self.next_state[FSMState.BPM_NAME] = FSMState.BPM_VALUE
|
|
@@ -296,11 +272,6 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 296 |
self.next_state[FSMState.DURATION_NAME] = FSMState.DURATION_VALUE
|
| 297 |
self.next_state[FSMState.DURATION_VALUE] = self._get_next_field_state("duration")
|
| 298 |
|
| 299 |
-
# Genres field (only if not skipped): NAME -> VALUE -> next field
|
| 300 |
-
if not self.skip_genres:
|
| 301 |
-
self.next_state[FSMState.GENRES_NAME] = FSMState.GENRES_VALUE
|
| 302 |
-
self.next_state[FSMState.GENRES_VALUE] = self._get_next_field_state("genres")
|
| 303 |
-
|
| 304 |
# Keyscale field: NAME -> VALUE -> next field (language or timesignature)
|
| 305 |
self.next_state[FSMState.KEYSCALE_NAME] = FSMState.KEYSCALE_VALUE
|
| 306 |
self.next_state[FSMState.KEYSCALE_VALUE] = self._get_next_field_state("keyscale")
|
|
@@ -314,11 +285,6 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 314 |
self.next_state[FSMState.TIMESIG_NAME] = FSMState.TIMESIG_VALUE
|
| 315 |
self.next_state[FSMState.TIMESIG_VALUE] = FSMState.THINK_END_TAG
|
| 316 |
|
| 317 |
-
def set_skip_genres(self, skip: bool):
|
| 318 |
-
"""Set whether to skip genres generation and rebuild state transitions."""
|
| 319 |
-
self.skip_genres = skip
|
| 320 |
-
self._build_state_transitions()
|
| 321 |
-
|
| 322 |
def set_skip_caption(self, skip: bool):
|
| 323 |
"""Set whether to skip caption generation and rebuild state transitions."""
|
| 324 |
self.skip_caption = skip
|
|
@@ -372,6 +338,21 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 372 |
"""
|
| 373 |
self.stop_at_reasoning = stop
|
| 374 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
def set_user_metadata(self, metadata: Optional[Dict[str, Optional[str]]] = None):
|
| 376 |
"""
|
| 377 |
Set user-provided metadata fields. Fields that are provided will be used directly
|
|
@@ -385,14 +366,13 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 385 |
- "keyscale": Optional[str] - e.g., "G major"
|
| 386 |
- "language": Optional[str] - e.g., "en"
|
| 387 |
- "timesignature": Optional[str] - e.g., "4"
|
| 388 |
-
- "genres": Optional[str] - e.g., "Pop Rock"
|
| 389 |
If None, clears all user-provided metadata.
|
| 390 |
"""
|
| 391 |
if metadata is None:
|
| 392 |
metadata = {}
|
| 393 |
|
| 394 |
# Update user-provided metadata
|
| 395 |
-
for field in ["bpm", "caption", "duration", "keyscale", "language", "timesignature"
|
| 396 |
if field in metadata:
|
| 397 |
self.user_provided_metadata[field] = metadata[field]
|
| 398 |
else:
|
|
@@ -458,10 +438,6 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 458 |
# Vocab size
|
| 459 |
self.vocab_size = len(self.tokenizer)
|
| 460 |
|
| 461 |
-
# Comma token for multi-genre support
|
| 462 |
-
comma_tokens = self.tokenizer.encode(",", add_special_tokens=False)
|
| 463 |
-
self.comma_token = comma_tokens[-1] if comma_tokens else None
|
| 464 |
-
|
| 465 |
# EOS token for duration-constrained codes generation
|
| 466 |
self.eos_token_id = self.tokenizer.eos_token_id
|
| 467 |
|
|
@@ -484,6 +460,8 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 484 |
# Precompute audio code mask for efficient blocking (O(1) instead of O(n))
|
| 485 |
# This mask will be added to scores during caption generation
|
| 486 |
self.audio_code_mask: Optional[torch.Tensor] = None
|
|
|
|
|
|
|
| 487 |
self._build_audio_code_mask()
|
| 488 |
|
| 489 |
# Build valid keyscales set (prefix tree will be built after _char_to_tokens is initialized)
|
|
@@ -519,9 +497,13 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 519 |
This mask can be added to scores in O(1) time instead of O(n) loop.
|
| 520 |
|
| 521 |
The mask is [1, vocab_size] tensor with -inf at audio code token positions.
|
|
|
|
|
|
|
|
|
|
| 522 |
"""
|
| 523 |
if not self.audio_code_token_ids:
|
| 524 |
self.audio_code_mask = None
|
|
|
|
| 525 |
return
|
| 526 |
|
| 527 |
# Create mask tensor: 0 everywhere, -inf at audio code positions
|
|
@@ -536,8 +518,19 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 536 |
|
| 537 |
self.audio_code_mask = mask
|
| 538 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
if self.debug:
|
| 540 |
-
logger.debug(f"Built audio code
|
| 541 |
|
| 542 |
def _build_keyscale_prefix_tree(self) -> Dict[Tuple[int, ...], Set[int]]:
|
| 543 |
"""
|
|
@@ -814,133 +807,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 814 |
print(f" {repr(ks)}")
|
| 815 |
|
| 816 |
print("=" * 60)
|
| 817 |
-
|
| 818 |
-
def _load_genres_vocab(self):
|
| 819 |
-
"""
|
| 820 |
-
Load genres vocabulary from file. Supports hot reload by checking file mtime.
|
| 821 |
-
File format: one genre per line, lines starting with # are comments.
|
| 822 |
-
"""
|
| 823 |
-
if not os.path.exists(self.genres_vocab_path):
|
| 824 |
-
if self.debug:
|
| 825 |
-
logger.debug(f"Genres vocab file not found: {self.genres_vocab_path}")
|
| 826 |
-
return
|
| 827 |
-
|
| 828 |
-
try:
|
| 829 |
-
mtime = os.path.getmtime(self.genres_vocab_path)
|
| 830 |
-
if mtime <= self.genres_vocab_mtime:
|
| 831 |
-
return # File hasn't changed
|
| 832 |
-
|
| 833 |
-
with open(self.genres_vocab_path, 'r', encoding='utf-8') as f:
|
| 834 |
-
genres = []
|
| 835 |
-
for line in f:
|
| 836 |
-
line = line.strip()
|
| 837 |
-
if line and not line.startswith('#'):
|
| 838 |
-
genres.append(line.lower())
|
| 839 |
-
|
| 840 |
-
self.genres_vocab = genres
|
| 841 |
-
self.genres_vocab_mtime = mtime
|
| 842 |
-
self._build_genres_trie()
|
| 843 |
-
|
| 844 |
-
if self.debug:
|
| 845 |
-
logger.debug(f"Loaded {len(self.genres_vocab)} genres from {self.genres_vocab_path}")
|
| 846 |
-
except Exception as e:
|
| 847 |
-
logger.warning(f"Failed to load genres vocab: {e}")
|
| 848 |
-
|
| 849 |
-
def _build_genres_trie(self):
|
| 850 |
-
"""
|
| 851 |
-
Build a trie (prefix tree) from genres vocabulary for efficient prefix matching.
|
| 852 |
-
Each node is a dict with:
|
| 853 |
-
- '_end': True if this node represents a complete genre
|
| 854 |
-
- other keys: next characters in the trie
|
| 855 |
-
"""
|
| 856 |
-
self.genres_trie = {}
|
| 857 |
-
|
| 858 |
-
for genre in self.genres_vocab:
|
| 859 |
-
node = self.genres_trie
|
| 860 |
-
for char in genre:
|
| 861 |
-
if char not in node:
|
| 862 |
-
node[char] = {}
|
| 863 |
-
node = node[char]
|
| 864 |
-
node['_end'] = True # Mark end of a complete genre
|
| 865 |
-
|
| 866 |
-
if self.debug:
|
| 867 |
-
logger.debug(f"Built genres trie with {len(self.genres_vocab)} entries")
|
| 868 |
-
|
| 869 |
-
def _extract_caption_genres(self, caption: str):
|
| 870 |
-
"""
|
| 871 |
-
Extract genres from the user's caption that match entries in the vocabulary.
|
| 872 |
-
This creates a smaller trie for faster and more relevant genre generation.
|
| 873 |
-
|
| 874 |
-
Strategy (optimized - O(words * max_genre_len) instead of O(vocab_size)):
|
| 875 |
-
1. Extract words/phrases from caption
|
| 876 |
-
2. For each word, use trie to find all vocab entries that START with this word
|
| 877 |
-
3. Build a separate trie from matched genres
|
| 878 |
-
"""
|
| 879 |
-
if not caption or not self.genres_vocab:
|
| 880 |
-
return
|
| 881 |
-
|
| 882 |
-
caption_lower = caption.lower()
|
| 883 |
-
matched_genres = set()
|
| 884 |
-
|
| 885 |
-
# Extract words from caption (split by common delimiters)
|
| 886 |
-
import re
|
| 887 |
-
words = re.split(r'[,\s\-_/\\|]+', caption_lower)
|
| 888 |
-
words = [w.strip() for w in words if w.strip() and len(w.strip()) >= 2]
|
| 889 |
-
|
| 890 |
-
# For each word, find genres in trie that start with this word
|
| 891 |
-
for word in words:
|
| 892 |
-
# Find all genres starting with this word using trie traversal
|
| 893 |
-
node = self._get_genres_trie_node(word)
|
| 894 |
-
if node is not None:
|
| 895 |
-
# Collect all complete genres under this node
|
| 896 |
-
self._collect_complete_genres(node, word, matched_genres)
|
| 897 |
-
|
| 898 |
-
# Also check if any word appears as a substring in short genres (< 20 chars)
|
| 899 |
-
# This is a quick check for common single-word genres
|
| 900 |
-
genres_set = set(self.genres_vocab)
|
| 901 |
-
for word in words:
|
| 902 |
-
if word in genres_set:
|
| 903 |
-
matched_genres.add(word)
|
| 904 |
-
|
| 905 |
-
if not matched_genres:
|
| 906 |
-
if self.debug:
|
| 907 |
-
logger.debug(f"No genres matched in caption, using full vocab")
|
| 908 |
-
return
|
| 909 |
-
|
| 910 |
-
# Build a trie from matched genres
|
| 911 |
-
self.caption_matched_genres = list(matched_genres)
|
| 912 |
-
self.caption_genres_trie = {}
|
| 913 |
-
|
| 914 |
-
for genre in matched_genres:
|
| 915 |
-
node = self.caption_genres_trie
|
| 916 |
-
for char in genre:
|
| 917 |
-
if char not in node:
|
| 918 |
-
node[char] = {}
|
| 919 |
-
node = node[char]
|
| 920 |
-
node['_end'] = True
|
| 921 |
-
|
| 922 |
-
if self.debug:
|
| 923 |
-
logger.debug(f"Matched {len(matched_genres)} genres from caption: {list(matched_genres)[:5]}...")
|
| 924 |
-
|
| 925 |
-
def _collect_complete_genres(self, node: Dict, prefix: str, result: set, max_depth: int = 50):
|
| 926 |
-
"""
|
| 927 |
-
Recursively collect all complete genres under a trie node.
|
| 928 |
-
Limited depth to avoid too many matches.
|
| 929 |
-
"""
|
| 930 |
-
if max_depth <= 0:
|
| 931 |
-
return
|
| 932 |
-
|
| 933 |
-
if node.get('_end', False):
|
| 934 |
-
result.add(prefix)
|
| 935 |
-
|
| 936 |
-
# Limit total collected genres to avoid slowdown
|
| 937 |
-
if len(result) >= 100:
|
| 938 |
-
return
|
| 939 |
-
|
| 940 |
-
for char, child_node in node.items():
|
| 941 |
-
if char not in ('_end', '_tokens'):
|
| 942 |
-
self._collect_complete_genres(child_node, prefix + char, result, max_depth - 1)
|
| 943 |
-
|
| 944 |
def _precompute_char_token_mapping(self):
|
| 945 |
"""
|
| 946 |
Precompute mapping from characters to token IDs and token decoded texts.
|
|
@@ -992,36 +859,8 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 992 |
|
| 993 |
if self.debug:
|
| 994 |
logger.debug(f"Precomputed char->token mapping for {len(self._char_to_tokens)} unique characters")
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
"""Check if genres vocab file has been updated and reload if necessary."""
|
| 998 |
-
if not os.path.exists(self.genres_vocab_path):
|
| 999 |
-
return
|
| 1000 |
-
|
| 1001 |
-
try:
|
| 1002 |
-
mtime = os.path.getmtime(self.genres_vocab_path)
|
| 1003 |
-
if mtime > self.genres_vocab_mtime:
|
| 1004 |
-
self._load_genres_vocab()
|
| 1005 |
-
except Exception:
|
| 1006 |
-
pass # Ignore errors during hot reload check
|
| 1007 |
-
|
| 1008 |
-
def _get_genres_trie_node(self, prefix: str) -> Optional[Dict]:
|
| 1009 |
-
"""
|
| 1010 |
-
Get the trie node for a given prefix.
|
| 1011 |
-
Returns None if the prefix is not valid (no genres start with this prefix).
|
| 1012 |
-
"""
|
| 1013 |
-
node = self.genres_trie
|
| 1014 |
-
for char in prefix.lower():
|
| 1015 |
-
if char not in node:
|
| 1016 |
-
return None
|
| 1017 |
-
node = node[char]
|
| 1018 |
-
return node
|
| 1019 |
-
|
| 1020 |
-
def _is_complete_genre(self, text: str) -> bool:
|
| 1021 |
-
"""Check if the given text is a complete genre in the vocabulary."""
|
| 1022 |
-
node = self._get_genres_trie_node(text.strip())
|
| 1023 |
-
return node is not None and node.get('_end', False)
|
| 1024 |
-
|
| 1025 |
def _get_trie_node_from_trie(self, trie: Dict, prefix: str) -> Optional[Dict]:
|
| 1026 |
"""Get a trie node from a specific trie (helper for caption vs full trie)."""
|
| 1027 |
node = trie
|
|
@@ -1030,109 +869,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1030 |
return None
|
| 1031 |
node = node[char]
|
| 1032 |
return node
|
| 1033 |
-
|
| 1034 |
-
def _get_allowed_genres_tokens(self) -> List[int]:
|
| 1035 |
-
"""
|
| 1036 |
-
Get allowed tokens for genres field based on trie matching.
|
| 1037 |
-
|
| 1038 |
-
The entire genres string (including commas) must match a complete entry in the vocab.
|
| 1039 |
-
For example, if vocab contains "pop, rock, jazz", the generated string must exactly
|
| 1040 |
-
match that entry - we don't treat commas as separators for individual genres.
|
| 1041 |
-
|
| 1042 |
-
Strategy:
|
| 1043 |
-
1. If caption-matched genres exist, use that smaller trie first (faster + more relevant)
|
| 1044 |
-
2. If no caption matches or prefix not in caption trie, fallback to full vocab trie
|
| 1045 |
-
3. Get valid next characters from current trie node
|
| 1046 |
-
4. For each candidate token, verify the full decoded text forms a valid trie prefix
|
| 1047 |
-
"""
|
| 1048 |
-
if not self.genres_vocab:
|
| 1049 |
-
# No vocab loaded, allow all except newline if empty
|
| 1050 |
-
return []
|
| 1051 |
-
|
| 1052 |
-
# Use the full accumulated value (don't split by comma - treat as single entry)
|
| 1053 |
-
accumulated = self.accumulated_value.lower()
|
| 1054 |
-
current_genre_prefix = accumulated.strip()
|
| 1055 |
-
|
| 1056 |
-
# Determine which trie to use: caption-matched (priority) or full vocab (fallback)
|
| 1057 |
-
use_caption_trie = False
|
| 1058 |
-
current_node = None
|
| 1059 |
-
|
| 1060 |
-
# Try caption-matched trie first if available
|
| 1061 |
-
if self.caption_genres_trie:
|
| 1062 |
-
if current_genre_prefix == "":
|
| 1063 |
-
current_node = self.caption_genres_trie
|
| 1064 |
-
use_caption_trie = True
|
| 1065 |
-
else:
|
| 1066 |
-
current_node = self._get_trie_node_from_trie(self.caption_genres_trie, current_genre_prefix)
|
| 1067 |
-
if current_node is not None:
|
| 1068 |
-
use_caption_trie = True
|
| 1069 |
-
|
| 1070 |
-
# Fallback to full vocab trie
|
| 1071 |
-
if current_node is None:
|
| 1072 |
-
if current_genre_prefix == "":
|
| 1073 |
-
current_node = self.genres_trie
|
| 1074 |
-
else:
|
| 1075 |
-
current_node = self._get_genres_trie_node(current_genre_prefix)
|
| 1076 |
-
|
| 1077 |
-
if current_node is None:
|
| 1078 |
-
# Invalid prefix, force newline to end
|
| 1079 |
-
if self.newline_token:
|
| 1080 |
-
return [self.newline_token]
|
| 1081 |
-
return []
|
| 1082 |
-
|
| 1083 |
-
# Get valid next characters from trie node
|
| 1084 |
-
valid_next_chars = set(k for k in current_node.keys() if k not in ('_end', '_tokens'))
|
| 1085 |
-
|
| 1086 |
-
# If current value is a complete genre, allow newline to end
|
| 1087 |
-
is_complete = current_node.get('_end', False)
|
| 1088 |
-
|
| 1089 |
-
if not valid_next_chars:
|
| 1090 |
-
# No more characters to match, only allow newline if complete
|
| 1091 |
-
allowed = set()
|
| 1092 |
-
if is_complete and self.newline_token:
|
| 1093 |
-
allowed.add(self.newline_token)
|
| 1094 |
-
return list(allowed)
|
| 1095 |
-
|
| 1096 |
-
# Collect candidate tokens based on first character
|
| 1097 |
-
candidate_tokens = set()
|
| 1098 |
-
for char in valid_next_chars:
|
| 1099 |
-
if char in self._char_to_tokens:
|
| 1100 |
-
candidate_tokens.update(self._char_to_tokens[char])
|
| 1101 |
-
|
| 1102 |
-
# Select the appropriate trie for validation
|
| 1103 |
-
active_trie = self.caption_genres_trie if use_caption_trie else self.genres_trie
|
| 1104 |
-
|
| 1105 |
-
# Validate each candidate token: check if prefix + decoded_token is a valid trie prefix
|
| 1106 |
-
allowed = set()
|
| 1107 |
-
for token_id in candidate_tokens:
|
| 1108 |
-
# Use precomputed decoded text (already normalized)
|
| 1109 |
-
decoded_normalized = self._token_to_text.get(token_id, "")
|
| 1110 |
-
|
| 1111 |
-
if not decoded_normalized or not decoded_normalized.strip():
|
| 1112 |
-
# Token decodes to empty or only whitespace - allow if space/comma is a valid next char
|
| 1113 |
-
if ' ' in valid_next_chars or ',' in valid_next_chars:
|
| 1114 |
-
allowed.add(token_id)
|
| 1115 |
-
continue
|
| 1116 |
-
|
| 1117 |
-
# Build new prefix by appending decoded token
|
| 1118 |
-
# Handle space-prefixed tokens (e.g., " rock" from "pop rock")
|
| 1119 |
-
if decoded_normalized.startswith(' ') or decoded_normalized.startswith(','):
|
| 1120 |
-
# Token has leading space/comma - append directly
|
| 1121 |
-
new_prefix = current_genre_prefix + decoded_normalized
|
| 1122 |
-
else:
|
| 1123 |
-
new_prefix = current_genre_prefix + decoded_normalized
|
| 1124 |
-
|
| 1125 |
-
# Check if new_prefix is a valid prefix in the active trie
|
| 1126 |
-
new_node = self._get_trie_node_from_trie(active_trie, new_prefix)
|
| 1127 |
-
if new_node is not None:
|
| 1128 |
-
allowed.add(token_id)
|
| 1129 |
-
|
| 1130 |
-
# If current value is a complete genre, also allow newline
|
| 1131 |
-
if is_complete and self.newline_token:
|
| 1132 |
-
allowed.add(self.newline_token)
|
| 1133 |
-
|
| 1134 |
-
return list(allowed)
|
| 1135 |
-
|
| 1136 |
def reset(self):
|
| 1137 |
"""Reset the processor state for a new generation."""
|
| 1138 |
self.state = FSMState.THINK_TAG
|
|
@@ -1144,6 +881,8 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1144 |
self.current_user_field = None # Reset current user field
|
| 1145 |
self.caption_after_newline = False # Reset caption newline tracking
|
| 1146 |
self.caption_token_count = 0 # Reset caption token count
|
|
|
|
|
|
|
| 1147 |
|
| 1148 |
def set_target_duration(self, duration: Optional[float]):
|
| 1149 |
"""
|
|
@@ -1163,27 +902,6 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1163 |
if self.debug:
|
| 1164 |
logger.debug("Target duration cleared, no duration constraint")
|
| 1165 |
|
| 1166 |
-
def update_caption(self, caption: Optional[str]):
|
| 1167 |
-
"""
|
| 1168 |
-
Update the caption and rebuild the caption-matched genres trie.
|
| 1169 |
-
Call this before each generation to prioritize genres from the new caption.
|
| 1170 |
-
|
| 1171 |
-
Args:
|
| 1172 |
-
caption: User's input caption. If None or empty, clears caption matching.
|
| 1173 |
-
"""
|
| 1174 |
-
# Check for hot reload of genres vocabulary
|
| 1175 |
-
self._try_reload_genres_vocab()
|
| 1176 |
-
|
| 1177 |
-
self.caption = caption
|
| 1178 |
-
self.caption_genres_trie = {}
|
| 1179 |
-
self.caption_matched_genres = []
|
| 1180 |
-
|
| 1181 |
-
if caption:
|
| 1182 |
-
self._extract_caption_genres(caption)
|
| 1183 |
-
|
| 1184 |
-
# Also reset FSM state for new generation
|
| 1185 |
-
self.reset()
|
| 1186 |
-
|
| 1187 |
def _get_allowed_tokens_for_fixed_string(self, fixed_str: str) -> List[int]:
|
| 1188 |
"""
|
| 1189 |
Get the token IDs that can continue the fixed string from current position.
|
|
@@ -1342,26 +1060,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1342 |
logger.debug(f"Numeric field decision: newline_prob={newline_prob:.4f}, max_digit_prob={max_digit_prob:.4f}")
|
| 1343 |
|
| 1344 |
return newline_prob > max_digit_prob
|
| 1345 |
-
|
| 1346 |
-
def _should_end_text_field(self, logits: torch.Tensor) -> bool:
|
| 1347 |
-
"""
|
| 1348 |
-
Determine if we should end a text field (genres).
|
| 1349 |
-
Returns True if P(newline) > P(any other token) AND we have some content.
|
| 1350 |
-
"""
|
| 1351 |
-
if not self.accumulated_value.strip():
|
| 1352 |
-
return False # Need at least some content
|
| 1353 |
-
|
| 1354 |
-
probs = torch.softmax(logits, dim=-1)
|
| 1355 |
-
newline_prob = probs[0, self.newline_token].item() if self.newline_token else 0
|
| 1356 |
-
|
| 1357 |
-
# Get max probability among non-newline tokens
|
| 1358 |
-
masked_probs = probs.clone()
|
| 1359 |
-
if self.newline_token:
|
| 1360 |
-
masked_probs[0, self.newline_token] = 0
|
| 1361 |
-
max_other_prob = masked_probs[0].max().item()
|
| 1362 |
-
|
| 1363 |
-
return newline_prob > max_other_prob
|
| 1364 |
-
|
| 1365 |
def _get_allowed_keyscale_tokens(self) -> List[int]:
|
| 1366 |
"""
|
| 1367 |
Get allowed tokens for keyscale field using the precomputed prefix tree.
|
|
@@ -1435,9 +1134,32 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1435 |
return self._apply_temperature_scaling(scores)
|
| 1436 |
|
| 1437 |
if self.state == FSMState.COMPLETED:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1438 |
return self._apply_temperature_scaling(scores)
|
| 1439 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1440 |
if self.state == FSMState.CODES_GENERATION:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1441 |
# Apply duration constraint in codes generation phase
|
| 1442 |
if self.target_codes is not None and self.eos_token_id is not None:
|
| 1443 |
if self.codes_count < self.target_codes:
|
|
@@ -1464,6 +1186,31 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1464 |
# Apply temperature scaling after constraint masking
|
| 1465 |
return self._apply_temperature_scaling(scores)
|
| 1466 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1467 |
def _apply_temperature_scaling(self, scores: torch.FloatTensor) -> torch.FloatTensor:
|
| 1468 |
"""
|
| 1469 |
Apply temperature scaling based on current generation phase.
|
|
@@ -1501,7 +1248,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1501 |
Uses the same tokenization logic as prefix tree building.
|
| 1502 |
|
| 1503 |
Args:
|
| 1504 |
-
field_name: Field name ("bpm", "caption", "duration", "keyscale", "language", "timesignature"
|
| 1505 |
|
| 1506 |
Returns:
|
| 1507 |
List of token IDs for the complete field, or None if field is not provided
|
|
@@ -1518,7 +1265,6 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1518 |
"keyscale": "keyscale: ",
|
| 1519 |
"language": "language: ",
|
| 1520 |
"timesignature": "timesignature: ",
|
| 1521 |
-
"genres": "genres: ",
|
| 1522 |
}
|
| 1523 |
prefix = field_to_prefix[field_name]
|
| 1524 |
full_text = f"{prefix}{value}\n"
|
|
@@ -1673,15 +1419,25 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1673 |
|
| 1674 |
# If top token does NOT start with space/tab, it's a new field (like "duration:")
|
| 1675 |
if len(top_token_text) > 0 and top_token_text[0] not in ' \t':
|
| 1676 |
-
# Caption is ending
|
|
|
|
|
|
|
| 1677 |
self.caption_after_newline = False
|
| 1678 |
-
self.
|
| 1679 |
-
|
| 1680 |
-
|
|
|
|
| 1681 |
else:
|
| 1682 |
# It's indentation, continue caption
|
| 1683 |
self.caption_after_newline = False
|
| 1684 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1685 |
# Block backticks (code blocks)
|
| 1686 |
if self.backtick_token is not None:
|
| 1687 |
scores[0, self.backtick_token] = float('-inf')
|
|
@@ -1749,55 +1505,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1749 |
mask[0, self.newline_token] = 0
|
| 1750 |
|
| 1751 |
scores = scores + mask
|
| 1752 |
-
|
| 1753 |
-
elif self.state == FSMState.GENRES_VALUE:
|
| 1754 |
-
# Check if field is user-provided and we haven't started injecting yet
|
| 1755 |
-
if self.user_provided_metadata["genres"] is not None and not self.user_field_token_queue and not self.accumulated_value:
|
| 1756 |
-
# Initialize token queue with field value tokens (value + newline)
|
| 1757 |
-
value = self.user_provided_metadata["genres"]
|
| 1758 |
-
value_text = f" {value}\n"
|
| 1759 |
-
value_tokens = self.tokenizer.encode(value_text, add_special_tokens=False)
|
| 1760 |
-
if value_tokens:
|
| 1761 |
-
self.user_field_token_queue = value_tokens
|
| 1762 |
-
self.current_user_field = "genres"
|
| 1763 |
-
# Inject first token
|
| 1764 |
-
mask[0, value_tokens[0]] = 0
|
| 1765 |
-
scores = scores + mask
|
| 1766 |
-
return scores
|
| 1767 |
-
|
| 1768 |
-
# Try to hot-reload genres vocab if file has changed
|
| 1769 |
-
self._try_reload_genres_vocab()
|
| 1770 |
-
|
| 1771 |
-
# Get allowed tokens based on genres vocabulary
|
| 1772 |
-
allowed = self._get_allowed_genres_tokens()
|
| 1773 |
-
|
| 1774 |
-
if allowed:
|
| 1775 |
-
# Use vocabulary-constrained decoding
|
| 1776 |
-
for t in allowed:
|
| 1777 |
-
mask[0, t] = 0
|
| 1778 |
-
scores = scores + mask
|
| 1779 |
-
elif self.genres_vocab:
|
| 1780 |
-
# Vocab is loaded but no valid continuation found
|
| 1781 |
-
# Force newline to end the field
|
| 1782 |
-
if self.newline_token:
|
| 1783 |
-
mask[0, self.newline_token] = 0
|
| 1784 |
-
if self.debug:
|
| 1785 |
-
logger.debug(f"No valid genre continuation for '{self.accumulated_value}', forcing newline")
|
| 1786 |
-
scores = scores + mask
|
| 1787 |
-
else:
|
| 1788 |
-
# Fallback: no vocab loaded, use probability-based ending
|
| 1789 |
-
if self._should_end_text_field(scores):
|
| 1790 |
-
if self.newline_token:
|
| 1791 |
-
mask[0, self.newline_token] = 0
|
| 1792 |
-
self._transition_to_next_state()
|
| 1793 |
-
scores = scores + mask
|
| 1794 |
-
else:
|
| 1795 |
-
# Allow any token except newline if we don't have content yet
|
| 1796 |
-
if not self.accumulated_value.strip():
|
| 1797 |
-
if self.newline_token:
|
| 1798 |
-
scores[0, self.newline_token] = float('-inf')
|
| 1799 |
-
# Otherwise, don't constrain (fallback behavior)
|
| 1800 |
-
|
| 1801 |
elif self.state == FSMState.KEYSCALE_VALUE:
|
| 1802 |
# Check if field is user-provided and we haven't started injecting yet
|
| 1803 |
if self.user_provided_metadata["keyscale"] is not None and not self.user_field_token_queue and not self.accumulated_token_ids:
|
|
@@ -1835,7 +1543,10 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1835 |
scores = scores + mask
|
| 1836 |
|
| 1837 |
elif self.state == FSMState.LANGUAGE_VALUE:
|
| 1838 |
-
# Language field:
|
|
|
|
|
|
|
|
|
|
| 1839 |
|
| 1840 |
# Check if field is user-provided and we haven't started injecting yet
|
| 1841 |
if self.user_provided_metadata["language"] is not None and not self.user_field_token_queue and not self.accumulated_token_ids:
|
|
@@ -1851,25 +1562,63 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1851 |
scores = scores + mask
|
| 1852 |
return scores
|
| 1853 |
|
| 1854 |
-
#
|
| 1855 |
-
|
| 1856 |
-
if
|
| 1857 |
-
#
|
| 1858 |
-
|
| 1859 |
-
|
| 1860 |
-
|
| 1861 |
-
|
| 1862 |
-
|
| 1863 |
-
|
| 1864 |
-
|
| 1865 |
-
|
| 1866 |
-
|
| 1867 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1868 |
else:
|
| 1869 |
-
#
|
| 1870 |
if self.newline_token:
|
| 1871 |
mask[0, self.newline_token] = 0
|
| 1872 |
scores = scores + mask
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1873 |
|
| 1874 |
elif self.state == FSMState.TIMESIG_VALUE:
|
| 1875 |
# Check if field is user-provided and we haven't started injecting yet
|
|
@@ -1908,12 +1657,18 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1908 |
old_state = self.state
|
| 1909 |
next_state = self.next_state[self.state]
|
| 1910 |
|
| 1911 |
-
#
|
| 1912 |
-
#
|
| 1913 |
-
|
| 1914 |
-
|
| 1915 |
-
|
| 1916 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1917 |
|
| 1918 |
self.state = next_state
|
| 1919 |
self.position_in_state = 0
|
|
@@ -1921,6 +1676,8 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1921 |
self.accumulated_token_ids = [] # Reset token ID sequence for new field
|
| 1922 |
self.caption_after_newline = False # Reset caption newline tracking
|
| 1923 |
self.caption_token_count = 0 # Reset caption token count
|
|
|
|
|
|
|
| 1924 |
if self.debug:
|
| 1925 |
logger.debug(f"FSM transition: {old_state.name} -> {self.state.name}")
|
| 1926 |
|
|
@@ -1991,23 +1748,23 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 1991 |
|
| 1992 |
# Check if we've completed the fixed string
|
| 1993 |
if self.position_in_state >= len(fixed_str):
|
| 1994 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1995 |
|
| 1996 |
elif self.state in [FSMState.BPM_VALUE, FSMState.DURATION_VALUE, FSMState.TIMESIG_VALUE]:
|
| 1997 |
# Accumulate numeric value using token ID sequence
|
| 1998 |
if generated_token_id == self.newline_token:
|
| 1999 |
-
# if self.state == FSMState.DURATION_VALUE and self.accumulated_value:
|
| 2000 |
-
# try:
|
| 2001 |
-
# generated_duration = int(self.accumulated_value)
|
| 2002 |
-
# if self.target_codes is None and generated_duration > 0:
|
| 2003 |
-
# self.target_codes = int(generated_duration * 5)
|
| 2004 |
-
# if self.debug:
|
| 2005 |
-
# logger.debug(f"Synced duration: {generated_duration}s -> Set target_codes limit to {self.target_codes}")
|
| 2006 |
-
# except ValueError:
|
| 2007 |
-
# if self.debug:
|
| 2008 |
-
# logger.warning(f"Could not parse duration value: {self.accumulated_value}")
|
| 2009 |
-
# Newline ends the field
|
| 2010 |
-
# Save old state before transition
|
| 2011 |
old_state = self.state
|
| 2012 |
self._transition_to_next_state()
|
| 2013 |
# IMPORTANT: After state transition, if new state is a fixed_strings state,
|
|
@@ -2022,21 +1779,7 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 2022 |
# Also update legacy accumulated_value for compatibility
|
| 2023 |
if token_str.strip().isdigit():
|
| 2024 |
self.accumulated_value += token_str.strip()
|
| 2025 |
-
|
| 2026 |
-
elif self.state == FSMState.GENRES_VALUE:
|
| 2027 |
-
if generated_token_id == self.newline_token:
|
| 2028 |
-
# Newline ends the field
|
| 2029 |
-
self._transition_to_next_state()
|
| 2030 |
-
# IMPORTANT: After state transition, if new state is a fixed_strings state,
|
| 2031 |
-
# we should NOT update position_in_state with the newline token length,
|
| 2032 |
-
# because that token belongs to the old state, not the new state.
|
| 2033 |
-
# Return early to avoid the fixed_strings update logic below.
|
| 2034 |
-
if self.state in self.fixed_strings:
|
| 2035 |
-
return
|
| 2036 |
-
else:
|
| 2037 |
-
# Genres still uses string-based trie, so keep accumulated_value
|
| 2038 |
-
self.accumulated_value += token_str
|
| 2039 |
-
|
| 2040 |
elif self.state == FSMState.CAPTION_VALUE:
|
| 2041 |
# Track token count for 512 limit
|
| 2042 |
self.caption_token_count += 1
|
|
@@ -2049,9 +1792,51 @@ class MetadataConstrainedLogitsProcessor(LogitsProcessor):
|
|
| 2049 |
# Mark that we need to check next token for field transition
|
| 2050 |
self.caption_after_newline = True
|
| 2051 |
else:
|
| 2052 |
-
# Not a newline - if we were after newline and this is not space,
|
| 2053 |
# transition already happened in _process_single_sequence
|
| 2054 |
self.caption_after_newline = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2055 |
|
| 2056 |
elif self.state == FSMState.KEYSCALE_VALUE:
|
| 2057 |
if generated_token_id == self.newline_token:
|
|
|
|
| 35 |
DURATION_NAME = auto() # Generating "duration: "
|
| 36 |
DURATION_VALUE = auto() # Generating numeric value 10-600
|
| 37 |
NEWLINE_AFTER_DURATION = auto()
|
|
|
|
|
|
|
|
|
|
| 38 |
KEYSCALE_NAME = auto() # Generating "keyscale: "
|
| 39 |
KEYSCALE_VALUE = auto() # Generating keyscale pattern
|
| 40 |
NEWLINE_AFTER_KEYSCALE = auto()
|
|
|
|
| 74 |
tokenizer: AutoTokenizer,
|
| 75 |
enabled: bool = True,
|
| 76 |
debug: bool = False,
|
| 77 |
+
**kwargs: Any,
|
|
|
|
| 78 |
):
|
| 79 |
"""
|
| 80 |
Initialize the constrained logits processor.
|
| 81 |
|
| 82 |
This processor should be initialized once when loading the LLM and reused
|
| 83 |
+
for all generations.
|
|
|
|
|
|
|
| 84 |
Args:
|
| 85 |
tokenizer: The tokenizer to use for encoding/decoding
|
| 86 |
enabled: Whether to enable constrained decoding
|
| 87 |
debug: Whether to print debug information
|
|
|
|
|
|
|
|
|
|
| 88 |
"""
|
| 89 |
self.tokenizer = tokenizer
|
| 90 |
self.enabled = enabled
|
| 91 |
self.debug = debug
|
|
|
|
| 92 |
self.skip_caption = False # Set to True to skip caption field generation
|
| 93 |
self.skip_language = False # Set to True to skip language field generation
|
| 94 |
self.caption: Optional[str] = None # Set via update_caption() before each generation
|
| 95 |
|
| 96 |
# User-provided metadata fields (optional)
|
| 97 |
# If provided, these fields will be used directly instead of generating
|
| 98 |
+
# Format: {"bpm": "120", "caption": "...", "duration": "234", "keyscale": "G major", "language": "en", "timesignature": "4"}
|
| 99 |
self.user_provided_metadata: Dict[str, Optional[str]] = {
|
| 100 |
"bpm": None,
|
| 101 |
"caption": None,
|
|
|
|
| 103 |
"keyscale": None,
|
| 104 |
"language": None,
|
| 105 |
"timesignature": None,
|
|
|
|
| 106 |
}
|
| 107 |
|
| 108 |
# Temperature settings for different generation phases (set per-generation)
|
|
|
|
| 120 |
# Stop at reasoning flag - if True, stop generation after </think> tag
|
| 121 |
self.stop_at_reasoning: bool = False
|
| 122 |
|
| 123 |
+
# Generation phase - "cot" or "codes"
|
| 124 |
+
# Used to determine FSM behavior when prompt already contains CoT
|
| 125 |
+
self.generation_phase: str = "cot"
|
| 126 |
+
|
| 127 |
# Current state
|
| 128 |
self.state = FSMState.THINK_TAG
|
| 129 |
self.position_in_state = 0 # Position within current state's fixed string
|
|
|
|
| 133 |
# Caption generation state tracking
|
| 134 |
self.caption_after_newline = False # Track if we're right after a newline in caption
|
| 135 |
self.caption_token_count = 0 # Track token count for caption (max 512)
|
| 136 |
+
self.caption_ending = False # Track if caption is ending (after detecting non-indented line)
|
| 137 |
+
self.pending_field_name = "" # Accumulate field name tokens when caption is ending
|
| 138 |
|
| 139 |
# Token queue for user-provided fields (injected directly without generation)
|
| 140 |
self.user_field_token_queue: List[int] = []
|
|
|
|
| 142 |
|
| 143 |
# Pre-compute token IDs for efficiency
|
| 144 |
self._precompute_tokens()
|
| 145 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
self._char_to_tokens: Dict[str, set] = {} # Precomputed char -> token IDs mapping
|
| 147 |
|
| 148 |
# Precompute token mappings once (O(vocab_size), runs once at init)
|
|
|
|
| 185 |
|
| 186 |
# Build language prefix tree (similar to keyscale but for language codes)
|
| 187 |
self.language_prefix_tree = self._build_language_prefix_tree()
|
| 188 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
# Fixed strings for each state
|
| 190 |
# IMPORTANT: Do NOT include trailing space after colon - tokenizer will handle spacing
|
| 191 |
# All matching should be done at token level, not string level
|
|
|
|
| 196 |
FSMState.BPM_NAME: "bpm:",
|
| 197 |
FSMState.CAPTION_NAME: "caption:",
|
| 198 |
FSMState.DURATION_NAME: "duration:",
|
|
|
|
| 199 |
FSMState.KEYSCALE_NAME: "keyscale:",
|
| 200 |
FSMState.LANGUAGE_NAME: "language:",
|
| 201 |
FSMState.TIMESIG_NAME: "timesignature:",
|
| 202 |
FSMState.THINK_END_TAG: "</think>",
|
| 203 |
}
|
| 204 |
|
| 205 |
+
# State transitions
|
| 206 |
self._build_state_transitions()
|
| 207 |
|
| 208 |
def _get_next_field_state(self, current_field: str) -> Optional[FSMState]:
|
|
|
|
| 211 |
even if the field is user-provided (we still need to generate the field name).
|
| 212 |
|
| 213 |
Args:
|
| 214 |
+
current_field: Current field name ("bpm", "caption", "duration", "keyscale", "language", "timesignature")
|
| 215 |
|
| 216 |
Returns:
|
| 217 |
Next FSMState (NAME state of next field), or THINK_END_TAG if no more fields
|
| 218 |
"""
|
| 219 |
# New field order: bpm -> caption -> duration -> keyscale -> language -> timesignature
|
| 220 |
+
field_order = ["bpm", "caption", "duration","keyscale", "language", "timesignature"]
|
|
|
|
| 221 |
field_to_state = {
|
| 222 |
"bpm": FSMState.BPM_NAME,
|
| 223 |
"caption": FSMState.CAPTION_NAME,
|
| 224 |
"duration": FSMState.DURATION_NAME,
|
|
|
|
| 225 |
"keyscale": FSMState.KEYSCALE_NAME,
|
| 226 |
"language": FSMState.LANGUAGE_NAME,
|
| 227 |
"timesignature": FSMState.TIMESIG_NAME,
|
|
|
|
| 235 |
# Find next field in order
|
| 236 |
for i in range(current_idx + 1, len(field_order)):
|
| 237 |
field = field_order[i]
|
| 238 |
+
|
|
|
|
|
|
|
|
|
|
| 239 |
if field == "caption" and self.skip_caption:
|
| 240 |
continue
|
| 241 |
if field == "language" and self.skip_language:
|
|
|
|
| 248 |
return FSMState.THINK_END_TAG
|
| 249 |
|
| 250 |
def _build_state_transitions(self):
|
| 251 |
+
"""Build state transition map based on user-provided metadata."""
|
| 252 |
self.next_state = {
|
| 253 |
FSMState.THINK_TAG: FSMState.NEWLINE_AFTER_THINK,
|
| 254 |
FSMState.NEWLINE_AFTER_THINK: FSMState.BPM_NAME, # Always start with BPM
|
|
|
|
| 257 |
}
|
| 258 |
|
| 259 |
# Build transitions for all fields (even if user-provided, we still need to generate field name)
|
| 260 |
+
# Field order: bpm -> caption -> duration -> keyscale -> language -> timesignature
|
| 261 |
|
| 262 |
# BPM field: NAME -> VALUE -> next field (caption or duration)
|
| 263 |
self.next_state[FSMState.BPM_NAME] = FSMState.BPM_VALUE
|
|
|
|
| 272 |
self.next_state[FSMState.DURATION_NAME] = FSMState.DURATION_VALUE
|
| 273 |
self.next_state[FSMState.DURATION_VALUE] = self._get_next_field_state("duration")
|
| 274 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
# Keyscale field: NAME -> VALUE -> next field (language or timesignature)
|
| 276 |
self.next_state[FSMState.KEYSCALE_NAME] = FSMState.KEYSCALE_VALUE
|
| 277 |
self.next_state[FSMState.KEYSCALE_VALUE] = self._get_next_field_state("keyscale")
|
|
|
|
| 285 |
self.next_state[FSMState.TIMESIG_NAME] = FSMState.TIMESIG_VALUE
|
| 286 |
self.next_state[FSMState.TIMESIG_VALUE] = FSMState.THINK_END_TAG
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
def set_skip_caption(self, skip: bool):
|
| 289 |
"""Set whether to skip caption generation and rebuild state transitions."""
|
| 290 |
self.skip_caption = skip
|
|
|
|
| 338 |
"""
|
| 339 |
self.stop_at_reasoning = stop
|
| 340 |
|
| 341 |
+
def set_generation_phase(self, phase: str):
|
| 342 |
+
"""
|
| 343 |
+
Set the generation phase.
|
| 344 |
+
|
| 345 |
+
Args:
|
| 346 |
+
phase: "cot" for CoT metadata generation, "codes" for audio codes generation,
|
| 347 |
+
or "understand" for audio understanding (codes → metadata + lyrics).
|
| 348 |
+
When phase is "codes" and the input prompt already contains </think>,
|
| 349 |
+
the FSM will skip metadata generation and go directly to codes generation.
|
| 350 |
+
When phase is "understand", generate CoT metadata then free-form lyrics.
|
| 351 |
+
"""
|
| 352 |
+
if phase not in ("cot", "codes", "understand"):
|
| 353 |
+
raise ValueError(f"Invalid generation phase: {phase!r}. Must be 'cot', 'codes', or 'understand'")
|
| 354 |
+
self.generation_phase = phase
|
| 355 |
+
|
| 356 |
def set_user_metadata(self, metadata: Optional[Dict[str, Optional[str]]] = None):
|
| 357 |
"""
|
| 358 |
Set user-provided metadata fields. Fields that are provided will be used directly
|
|
|
|
| 366 |
- "keyscale": Optional[str] - e.g., "G major"
|
| 367 |
- "language": Optional[str] - e.g., "en"
|
| 368 |
- "timesignature": Optional[str] - e.g., "4"
|
|
|
|
| 369 |
If None, clears all user-provided metadata.
|
| 370 |
"""
|
| 371 |
if metadata is None:
|
| 372 |
metadata = {}
|
| 373 |
|
| 374 |
# Update user-provided metadata
|
| 375 |
+
for field in ["bpm", "caption", "duration", "keyscale", "language", "timesignature"]:
|
| 376 |
if field in metadata:
|
| 377 |
self.user_provided_metadata[field] = metadata[field]
|
| 378 |
else:
|
|
|
|
| 438 |
# Vocab size
|
| 439 |
self.vocab_size = len(self.tokenizer)
|
| 440 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
# EOS token for duration-constrained codes generation
|
| 442 |
self.eos_token_id = self.tokenizer.eos_token_id
|
| 443 |
|
|
|
|
| 460 |
# Precompute audio code mask for efficient blocking (O(1) instead of O(n))
|
| 461 |
# This mask will be added to scores during caption generation
|
| 462 |
self.audio_code_mask: Optional[torch.Tensor] = None
|
| 463 |
+
# Inverse mask: block all non-audio-code tokens (for CODES_GENERATION state)
|
| 464 |
+
self.non_audio_code_mask: Optional[torch.Tensor] = None
|
| 465 |
self._build_audio_code_mask()
|
| 466 |
|
| 467 |
# Build valid keyscales set (prefix tree will be built after _char_to_tokens is initialized)
|
|
|
|
| 497 |
This mask can be added to scores in O(1) time instead of O(n) loop.
|
| 498 |
|
| 499 |
The mask is [1, vocab_size] tensor with -inf at audio code token positions.
|
| 500 |
+
|
| 501 |
+
Also builds the inverse mask (non_audio_code_mask) for CODES_GENERATION state,
|
| 502 |
+
which blocks all non-audio-code tokens.
|
| 503 |
"""
|
| 504 |
if not self.audio_code_token_ids:
|
| 505 |
self.audio_code_mask = None
|
| 506 |
+
self.non_audio_code_mask = None
|
| 507 |
return
|
| 508 |
|
| 509 |
# Create mask tensor: 0 everywhere, -inf at audio code positions
|
|
|
|
| 518 |
|
| 519 |
self.audio_code_mask = mask
|
| 520 |
|
| 521 |
+
# Build inverse mask: -inf everywhere EXCEPT at audio code positions
|
| 522 |
+
# This is used in CODES_GENERATION state to only allow audio codes
|
| 523 |
+
inverse_mask = torch.full((1, self.vocab_size), float('-inf'), dtype=torch.float32)
|
| 524 |
+
inverse_mask[0, audio_code_indices] = 0
|
| 525 |
+
|
| 526 |
+
# Also allow EOS token in codes generation (will be controlled by duration constraint)
|
| 527 |
+
if self.eos_token_id is not None:
|
| 528 |
+
inverse_mask[0, self.eos_token_id] = 0
|
| 529 |
+
|
| 530 |
+
self.non_audio_code_mask = inverse_mask
|
| 531 |
+
|
| 532 |
if self.debug:
|
| 533 |
+
logger.debug(f"Built audio code masks for {len(self.audio_code_token_ids)} tokens")
|
| 534 |
|
| 535 |
def _build_keyscale_prefix_tree(self) -> Dict[Tuple[int, ...], Set[int]]:
|
| 536 |
"""
|
|
|
|
| 807 |
print(f" {repr(ks)}")
|
| 808 |
|
| 809 |
print("=" * 60)
|
| 810 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 811 |
def _precompute_char_token_mapping(self):
|
| 812 |
"""
|
| 813 |
Precompute mapping from characters to token IDs and token decoded texts.
|
|
|
|
| 859 |
|
| 860 |
if self.debug:
|
| 861 |
logger.debug(f"Precomputed char->token mapping for {len(self._char_to_tokens)} unique characters")
|
| 862 |
+
|
| 863 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 864 |
def _get_trie_node_from_trie(self, trie: Dict, prefix: str) -> Optional[Dict]:
|
| 865 |
"""Get a trie node from a specific trie (helper for caption vs full trie)."""
|
| 866 |
node = trie
|
|
|
|
| 869 |
return None
|
| 870 |
node = node[char]
|
| 871 |
return node
|
| 872 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 873 |
def reset(self):
|
| 874 |
"""Reset the processor state for a new generation."""
|
| 875 |
self.state = FSMState.THINK_TAG
|
|
|
|
| 881 |
self.current_user_field = None # Reset current user field
|
| 882 |
self.caption_after_newline = False # Reset caption newline tracking
|
| 883 |
self.caption_token_count = 0 # Reset caption token count
|
| 884 |
+
self.caption_ending = False # Reset caption ending tracking
|
| 885 |
+
self.pending_field_name = "" # Reset pending field name
|
| 886 |
|
| 887 |
def set_target_duration(self, duration: Optional[float]):
|
| 888 |
"""
|
|
|
|
| 902 |
if self.debug:
|
| 903 |
logger.debug("Target duration cleared, no duration constraint")
|
| 904 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 905 |
def _get_allowed_tokens_for_fixed_string(self, fixed_str: str) -> List[int]:
|
| 906 |
"""
|
| 907 |
Get the token IDs that can continue the fixed string from current position.
|
|
|
|
| 1060 |
logger.debug(f"Numeric field decision: newline_prob={newline_prob:.4f}, max_digit_prob={max_digit_prob:.4f}")
|
| 1061 |
|
| 1062 |
return newline_prob > max_digit_prob
|
| 1063 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1064 |
def _get_allowed_keyscale_tokens(self) -> List[int]:
|
| 1065 |
"""
|
| 1066 |
Get allowed tokens for keyscale field using the precomputed prefix tree.
|
|
|
|
| 1134 |
return self._apply_temperature_scaling(scores)
|
| 1135 |
|
| 1136 |
if self.state == FSMState.COMPLETED:
|
| 1137 |
+
# In understanding phase, block audio codes during lyrics generation (COMPLETED state)
|
| 1138 |
+
if self.generation_phase == "understand" and self.audio_code_mask is not None:
|
| 1139 |
+
# Move mask to same device/dtype as scores if needed
|
| 1140 |
+
if self.audio_code_mask.device != scores.device or self.audio_code_mask.dtype != scores.dtype:
|
| 1141 |
+
self.audio_code_mask = self.audio_code_mask.to(device=scores.device, dtype=scores.dtype)
|
| 1142 |
+
scores = scores + self.audio_code_mask
|
| 1143 |
return self._apply_temperature_scaling(scores)
|
| 1144 |
|
| 1145 |
+
# For codes phase, detect if input already contains </think> and skip to CODES_GENERATION
|
| 1146 |
+
if self.generation_phase == "codes" and self.state == FSMState.THINK_TAG:
|
| 1147 |
+
# Check if input contains </think> token sequence
|
| 1148 |
+
if self._input_contains_think_end_tag(input_ids):
|
| 1149 |
+
# Skip metadata generation, go directly to codes generation
|
| 1150 |
+
self.state = FSMState.CODES_GENERATION
|
| 1151 |
+
self.codes_count = 0
|
| 1152 |
+
if self.debug:
|
| 1153 |
+
logger.debug("Codes phase: detected </think> in input, skipping to CODES_GENERATION")
|
| 1154 |
+
|
| 1155 |
if self.state == FSMState.CODES_GENERATION:
|
| 1156 |
+
# Block all non-audio-code tokens (only allow audio codes and EOS)
|
| 1157 |
+
if self.non_audio_code_mask is not None:
|
| 1158 |
+
# Move mask to same device/dtype as scores if needed
|
| 1159 |
+
if self.non_audio_code_mask.device != scores.device or self.non_audio_code_mask.dtype != scores.dtype:
|
| 1160 |
+
self.non_audio_code_mask = self.non_audio_code_mask.to(device=scores.device, dtype=scores.dtype)
|
| 1161 |
+
scores = scores + self.non_audio_code_mask
|
| 1162 |
+
|
| 1163 |
# Apply duration constraint in codes generation phase
|
| 1164 |
if self.target_codes is not None and self.eos_token_id is not None:
|
| 1165 |
if self.codes_count < self.target_codes:
|
|
|
|
| 1186 |
# Apply temperature scaling after constraint masking
|
| 1187 |
return self._apply_temperature_scaling(scores)
|
| 1188 |
|
| 1189 |
+
def _input_contains_think_end_tag(self, input_ids: torch.LongTensor) -> bool:
|
| 1190 |
+
"""
|
| 1191 |
+
Check if input contains the </think> closing tag.
|
| 1192 |
+
|
| 1193 |
+
Args:
|
| 1194 |
+
input_ids: [batch_size, seq_len] input token IDs
|
| 1195 |
+
|
| 1196 |
+
Returns:
|
| 1197 |
+
True if </think> is found in the input (any sequence in batch)
|
| 1198 |
+
"""
|
| 1199 |
+
# Tokenize </think> to get its token sequence
|
| 1200 |
+
think_end_tokens = self.tokenizer.encode("</think>", add_special_tokens=False)
|
| 1201 |
+
if not think_end_tokens:
|
| 1202 |
+
return False
|
| 1203 |
+
|
| 1204 |
+
# Check each sequence in batch
|
| 1205 |
+
for b in range(input_ids.shape[0]):
|
| 1206 |
+
seq = input_ids[b].tolist()
|
| 1207 |
+
# Search for the token sequence in the input
|
| 1208 |
+
for i in range(len(seq) - len(think_end_tokens) + 1):
|
| 1209 |
+
if seq[i:i+len(think_end_tokens)] == think_end_tokens:
|
| 1210 |
+
return True
|
| 1211 |
+
|
| 1212 |
+
return False
|
| 1213 |
+
|
| 1214 |
def _apply_temperature_scaling(self, scores: torch.FloatTensor) -> torch.FloatTensor:
|
| 1215 |
"""
|
| 1216 |
Apply temperature scaling based on current generation phase.
|
|
|
|
| 1248 |
Uses the same tokenization logic as prefix tree building.
|
| 1249 |
|
| 1250 |
Args:
|
| 1251 |
+
field_name: Field name ("bpm", "caption", "duration", "keyscale", "language", "timesignature")
|
| 1252 |
|
| 1253 |
Returns:
|
| 1254 |
List of token IDs for the complete field, or None if field is not provided
|
|
|
|
| 1265 |
"keyscale": "keyscale: ",
|
| 1266 |
"language": "language: ",
|
| 1267 |
"timesignature": "timesignature: ",
|
|
|
|
| 1268 |
}
|
| 1269 |
prefix = field_to_prefix[field_name]
|
| 1270 |
full_text = f"{prefix}{value}\n"
|
|
|
|
| 1419 |
|
| 1420 |
# If top token does NOT start with space/tab, it's a new field (like "duration:")
|
| 1421 |
if len(top_token_text) > 0 and top_token_text[0] not in ' \t':
|
| 1422 |
+
# Caption is ending - LM is generating next field name
|
| 1423 |
+
# Instead of forcing state transition to DURATION_NAME (which would regenerate the field name),
|
| 1424 |
+
# we enter a "caption_ending" mode where we allow free generation until we detect the field value
|
| 1425 |
self.caption_after_newline = False
|
| 1426 |
+
self.caption_ending = True # Start tracking field name
|
| 1427 |
+
self.pending_field_name = "" # Reset pending field name
|
| 1428 |
+
# Allow free generation (no constraints) so LM can generate field name naturally
|
| 1429 |
+
return scores
|
| 1430 |
else:
|
| 1431 |
# It's indentation, continue caption
|
| 1432 |
self.caption_after_newline = False
|
| 1433 |
|
| 1434 |
+
# If caption is ending (LM generating next field name), allow free generation
|
| 1435 |
+
# and track the field name until we see colon
|
| 1436 |
+
if self.caption_ending:
|
| 1437 |
+
# Allow any token (free generation)
|
| 1438 |
+
# The field name detection will happen in update_state()
|
| 1439 |
+
return scores
|
| 1440 |
+
|
| 1441 |
# Block backticks (code blocks)
|
| 1442 |
if self.backtick_token is not None:
|
| 1443 |
scores[0, self.backtick_token] = float('-inf')
|
|
|
|
| 1505 |
mask[0, self.newline_token] = 0
|
| 1506 |
|
| 1507 |
scores = scores + mask
|
| 1508 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1509 |
elif self.state == FSMState.KEYSCALE_VALUE:
|
| 1510 |
# Check if field is user-provided and we haven't started injecting yet
|
| 1511 |
if self.user_provided_metadata["keyscale"] is not None and not self.user_field_token_queue and not self.accumulated_token_ids:
|
|
|
|
| 1543 |
scores = scores + mask
|
| 1544 |
|
| 1545 |
elif self.state == FSMState.LANGUAGE_VALUE:
|
| 1546 |
+
# Language field: Use top-1 probability language (greedy selection)
|
| 1547 |
+
# Unlike other fields, we don't use prefix tree sampling.
|
| 1548 |
+
# Instead, we select the highest probability language at the start,
|
| 1549 |
+
# then force generate the rest of that language code.
|
| 1550 |
|
| 1551 |
# Check if field is user-provided and we haven't started injecting yet
|
| 1552 |
if self.user_provided_metadata["language"] is not None and not self.user_field_token_queue and not self.accumulated_token_ids:
|
|
|
|
| 1562 |
scores = scores + mask
|
| 1563 |
return scores
|
| 1564 |
|
| 1565 |
+
# If we haven't started generating language yet (empty accumulated_token_ids),
|
| 1566 |
+
# select the top-1 probability token from all valid first tokens
|
| 1567 |
+
if not self.accumulated_token_ids:
|
| 1568 |
+
# Get all possible first tokens for all languages
|
| 1569 |
+
empty_prefix = tuple()
|
| 1570 |
+
if empty_prefix in self.language_prefix_tree:
|
| 1571 |
+
candidate_tokens = list(self.language_prefix_tree[empty_prefix])
|
| 1572 |
+
|
| 1573 |
+
if candidate_tokens:
|
| 1574 |
+
# Find the token with highest probability (top-1)
|
| 1575 |
+
# Create a mask that blocks all tokens except candidates
|
| 1576 |
+
temp_mask = torch.full_like(scores, float('-inf'))
|
| 1577 |
+
for t in candidate_tokens:
|
| 1578 |
+
temp_mask[0, t] = 0
|
| 1579 |
+
temp_scores = scores + temp_mask
|
| 1580 |
+
|
| 1581 |
+
# Get the highest probability token among candidates
|
| 1582 |
+
top_token_id = torch.argmax(temp_scores[0]).item()
|
| 1583 |
+
|
| 1584 |
+
# Only allow this top-1 token, block all others (including other language tokens)
|
| 1585 |
+
mask[0, top_token_id] = 0
|
| 1586 |
+
scores = scores + mask
|
| 1587 |
+
|
| 1588 |
+
if self.debug:
|
| 1589 |
+
top_token_text = self.tokenizer.decode([top_token_id])
|
| 1590 |
+
logger.debug(f"Language field: selected top-1 token {top_token_id} ({repr(top_token_text)}) from {len(candidate_tokens)} candidates")
|
| 1591 |
+
else:
|
| 1592 |
+
# No valid first tokens found - force newline
|
| 1593 |
+
if self.newline_token:
|
| 1594 |
+
mask[0, self.newline_token] = 0
|
| 1595 |
+
scores = scores + mask
|
| 1596 |
else:
|
| 1597 |
+
# Empty prefix not in tree - force newline
|
| 1598 |
if self.newline_token:
|
| 1599 |
mask[0, self.newline_token] = 0
|
| 1600 |
scores = scores + mask
|
| 1601 |
+
else:
|
| 1602 |
+
# We've started generating a language, continue with prefix tree constraints
|
| 1603 |
+
# Check if current token sequence is complete (allows newline)
|
| 1604 |
+
token_prefix = tuple(self.accumulated_token_ids)
|
| 1605 |
+
if token_prefix in self.language_prefix_tree and self.newline_token in self.language_prefix_tree[token_prefix]:
|
| 1606 |
+
# Complete language, allow newline
|
| 1607 |
+
if self.newline_token:
|
| 1608 |
+
mask[0, self.newline_token] = 0
|
| 1609 |
+
scores = scores + mask
|
| 1610 |
+
else:
|
| 1611 |
+
# Not complete, allow valid continuation tokens
|
| 1612 |
+
allowed = self._get_allowed_language_tokens()
|
| 1613 |
+
if allowed:
|
| 1614 |
+
for t in allowed:
|
| 1615 |
+
mask[0, t] = 0
|
| 1616 |
+
scores = scores + mask
|
| 1617 |
+
else:
|
| 1618 |
+
# No valid tokens found - force newline to end field
|
| 1619 |
+
if self.newline_token:
|
| 1620 |
+
mask[0, self.newline_token] = 0
|
| 1621 |
+
scores = scores + mask
|
| 1622 |
|
| 1623 |
elif self.state == FSMState.TIMESIG_VALUE:
|
| 1624 |
# Check if field is user-provided and we haven't started injecting yet
|
|
|
|
| 1657 |
old_state = self.state
|
| 1658 |
next_state = self.next_state[self.state]
|
| 1659 |
|
| 1660 |
+
# Handle different cases at THINK_END_TAG based on generation phase
|
| 1661 |
+
# NOTE: Do NOT override next_state here when stop_at_reasoning=True
|
| 1662 |
+
# because we need to transition to the tag state first to generate </think>,
|
| 1663 |
+
# then handle stop_at_reasoning in update_state() AFTER the tag is complete
|
| 1664 |
+
if old_state == FSMState.THINK_END_TAG:
|
| 1665 |
+
if self.generation_phase == "understand":
|
| 1666 |
+
# Understanding mode: allow free-form lyrics after metadata
|
| 1667 |
+
# No more constrained decoding after </think>
|
| 1668 |
+
next_state = FSMState.COMPLETED
|
| 1669 |
+
if self.debug:
|
| 1670 |
+
logger.debug(f"generation_phase='understand': allowing free-form lyrics after </think>")
|
| 1671 |
+
# else: default to CODES_GENERATION (for "codes" phase) or respect stop_at_reasoning flag
|
| 1672 |
|
| 1673 |
self.state = next_state
|
| 1674 |
self.position_in_state = 0
|
|
|
|
| 1676 |
self.accumulated_token_ids = [] # Reset token ID sequence for new field
|
| 1677 |
self.caption_after_newline = False # Reset caption newline tracking
|
| 1678 |
self.caption_token_count = 0 # Reset caption token count
|
| 1679 |
+
self.caption_ending = False # Reset caption ending tracking
|
| 1680 |
+
self.pending_field_name = "" # Reset pending field name
|
| 1681 |
if self.debug:
|
| 1682 |
logger.debug(f"FSM transition: {old_state.name} -> {self.state.name}")
|
| 1683 |
|
|
|
|
| 1748 |
|
| 1749 |
# Check if we've completed the fixed string
|
| 1750 |
if self.position_in_state >= len(fixed_str):
|
| 1751 |
+
# Special handling for THINK_END_TAG with stop_at_reasoning
|
| 1752 |
+
if self.state == FSMState.THINK_END_TAG and self.stop_at_reasoning:
|
| 1753 |
+
# </think> tag is complete, now we can stop generation
|
| 1754 |
+
# Force transition to COMPLETED instead of CODES_GENERATION
|
| 1755 |
+
old_state = self.state
|
| 1756 |
+
self.state = FSMState.COMPLETED
|
| 1757 |
+
self.position_in_state = 0
|
| 1758 |
+
self.accumulated_value = ""
|
| 1759 |
+
self.accumulated_token_ids = []
|
| 1760 |
+
if self.debug:
|
| 1761 |
+
logger.debug(f"FSM transition (stop_at_reasoning): {old_state.name} -> {self.state.name}")
|
| 1762 |
+
else:
|
| 1763 |
+
self._transition_to_next_state()
|
| 1764 |
|
| 1765 |
elif self.state in [FSMState.BPM_VALUE, FSMState.DURATION_VALUE, FSMState.TIMESIG_VALUE]:
|
| 1766 |
# Accumulate numeric value using token ID sequence
|
| 1767 |
if generated_token_id == self.newline_token:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1768 |
old_state = self.state
|
| 1769 |
self._transition_to_next_state()
|
| 1770 |
# IMPORTANT: After state transition, if new state is a fixed_strings state,
|
|
|
|
| 1779 |
# Also update legacy accumulated_value for compatibility
|
| 1780 |
if token_str.strip().isdigit():
|
| 1781 |
self.accumulated_value += token_str.strip()
|
| 1782 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1783 |
elif self.state == FSMState.CAPTION_VALUE:
|
| 1784 |
# Track token count for 512 limit
|
| 1785 |
self.caption_token_count += 1
|
|
|
|
| 1792 |
# Mark that we need to check next token for field transition
|
| 1793 |
self.caption_after_newline = True
|
| 1794 |
else:
|
| 1795 |
+
# Not a newline - if we were after newline and this is not space,
|
| 1796 |
# transition already happened in _process_single_sequence
|
| 1797 |
self.caption_after_newline = False
|
| 1798 |
+
|
| 1799 |
+
# If caption is ending, accumulate field name and detect field completion
|
| 1800 |
+
if self.caption_ending:
|
| 1801 |
+
self.pending_field_name += token_str
|
| 1802 |
+
|
| 1803 |
+
# Check if we've completed a field name (detected colon)
|
| 1804 |
+
if ':' in token_str or token_str.strip() == ':':
|
| 1805 |
+
# Extract field name (before colon)
|
| 1806 |
+
field_name_full = self.pending_field_name.strip()
|
| 1807 |
+
# Remove trailing colon if present
|
| 1808 |
+
field_name = field_name_full.rstrip(':').strip().lower()
|
| 1809 |
+
|
| 1810 |
+
if self.debug:
|
| 1811 |
+
logger.debug(f"Detected field name after caption: {repr(field_name)}")
|
| 1812 |
+
|
| 1813 |
+
# Map field name to VALUE state
|
| 1814 |
+
field_name_to_value_state = {
|
| 1815 |
+
"duration": FSMState.DURATION_VALUE,
|
| 1816 |
+
"keyscale": FSMState.KEYSCALE_VALUE,
|
| 1817 |
+
"language": FSMState.LANGUAGE_VALUE,
|
| 1818 |
+
"timesignature": FSMState.TIMESIG_VALUE,
|
| 1819 |
+
}
|
| 1820 |
+
|
| 1821 |
+
if field_name in field_name_to_value_state:
|
| 1822 |
+
# Transition directly to the field's VALUE state
|
| 1823 |
+
old_state = self.state
|
| 1824 |
+
self.state = field_name_to_value_state[field_name]
|
| 1825 |
+
self.position_in_state = 0
|
| 1826 |
+
self.accumulated_value = ""
|
| 1827 |
+
self.accumulated_token_ids = []
|
| 1828 |
+
self.caption_ending = False
|
| 1829 |
+
self.pending_field_name = ""
|
| 1830 |
+
|
| 1831 |
+
if self.debug:
|
| 1832 |
+
logger.debug(f"FSM transition (caption ending): {old_state.name} -> {self.state.name}")
|
| 1833 |
+
else:
|
| 1834 |
+
# Unknown field name, force transition to next field
|
| 1835 |
+
if self.debug:
|
| 1836 |
+
logger.warning(f"Unknown field name after caption: {repr(field_name)}, forcing transition")
|
| 1837 |
+
self.caption_ending = False
|
| 1838 |
+
self.pending_field_name = ""
|
| 1839 |
+
self._transition_to_next_state()
|
| 1840 |
|
| 1841 |
elif self.state == FSMState.KEYSCALE_VALUE:
|
| 1842 |
if generated_token_id == self.newline_token:
|
acestep/gradio_ui.py
CHANGED
|
@@ -363,12 +363,20 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
|
|
| 363 |
|
| 364 |
# Audio Codes for text2music
|
| 365 |
with gr.Accordion("🎼 LM Codes Hints", open=False, visible=True) as text2music_audio_codes_group:
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
# Repainting controls
|
| 374 |
with gr.Group(visible=False) as repainting_group:
|
|
@@ -639,6 +647,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
|
|
| 639 |
"src_audio": src_audio,
|
| 640 |
"convert_src_to_codes_btn": convert_src_to_codes_btn,
|
| 641 |
"text2music_audio_code_string": text2music_audio_code_string,
|
|
|
|
| 642 |
"text2music_audio_codes_group": text2music_audio_codes_group,
|
| 643 |
"lm_temperature": lm_temperature,
|
| 644 |
"lm_cfg_scale": lm_cfg_scale,
|
|
@@ -679,6 +688,9 @@ def create_results_section(dit_handler) -> dict:
|
|
| 679 |
with gr.Group():
|
| 680 |
gr.HTML('<div class="section-header"><h3>🎧 Generated Results</h3></div>')
|
| 681 |
|
|
|
|
|
|
|
|
|
|
| 682 |
status_output = gr.Textbox(label="Generation Status", interactive=False)
|
| 683 |
|
| 684 |
with gr.Row():
|
|
@@ -725,6 +737,7 @@ def create_results_section(dit_handler) -> dict:
|
|
| 725 |
align_plot_2 = gr.Plot(label="Attention Focus Score Heatmap (Sample 2)")
|
| 726 |
|
| 727 |
return {
|
|
|
|
| 728 |
"status_output": status_output,
|
| 729 |
"generated_audio_1": generated_audio_1,
|
| 730 |
"generated_audio_2": generated_audio_2,
|
|
@@ -751,7 +764,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 751 |
task_type: The task type (e.g., "text2music")
|
| 752 |
|
| 753 |
Returns:
|
| 754 |
-
Tuple of (
|
| 755 |
"""
|
| 756 |
try:
|
| 757 |
# Get the project root directory
|
|
@@ -764,14 +777,14 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 764 |
# Check if directory exists
|
| 765 |
if not os.path.exists(examples_dir):
|
| 766 |
gr.Warning(f"Examples directory not found: examples/{task_type}/")
|
| 767 |
-
return "", "", True
|
| 768 |
|
| 769 |
# Find all JSON files in the directory
|
| 770 |
json_files = glob.glob(os.path.join(examples_dir, "*.json"))
|
| 771 |
|
| 772 |
if not json_files:
|
| 773 |
gr.Warning(f"No JSON files found in examples/{task_type}/")
|
| 774 |
-
return "", "", True
|
| 775 |
|
| 776 |
# Randomly select one file
|
| 777 |
selected_file = random.choice(json_files)
|
|
@@ -796,19 +809,111 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 796 |
if not isinstance(think_value, bool):
|
| 797 |
think_value = True
|
| 798 |
|
| 799 |
-
|
| 800 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 801 |
|
| 802 |
except json.JSONDecodeError as e:
|
| 803 |
gr.Warning(f"Failed to parse JSON file {os.path.basename(selected_file)}: {str(e)}")
|
| 804 |
-
return "", "", True
|
| 805 |
except Exception as e:
|
| 806 |
gr.Warning(f"Error reading file {os.path.basename(selected_file)}: {str(e)}")
|
| 807 |
-
return "", "", True
|
| 808 |
|
| 809 |
except Exception as e:
|
| 810 |
gr.Warning(f"Error loading example: {str(e)}")
|
| 811 |
-
return "", "", True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 812 |
|
| 813 |
def update_init_status(status_msg, enable_btn):
|
| 814 |
"""Update initialization status and enable/disable generate button"""
|
|
@@ -1105,8 +1210,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 1105 |
metadata_lines.append(f"- **User Query Rewritten Caption:** {lm_generated_metadata['caption']}")
|
| 1106 |
if lm_generated_metadata.get('duration'):
|
| 1107 |
metadata_lines.append(f"- **Duration:** {lm_generated_metadata['duration']} seconds")
|
| 1108 |
-
if lm_generated_metadata.get('genres'):
|
| 1109 |
-
metadata_lines.append(f"- **Genres:** {lm_generated_metadata['genres']}")
|
| 1110 |
if lm_generated_metadata.get('keyscale'):
|
| 1111 |
metadata_lines.append(f"- **KeyScale:** {lm_generated_metadata['keyscale']}")
|
| 1112 |
if lm_generated_metadata.get('language'):
|
|
@@ -1134,7 +1237,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 1134 |
align_score_2,
|
| 1135 |
align_text_2,
|
| 1136 |
align_plot_2,
|
| 1137 |
-
updated_audio_codes # Update audio codes in UI
|
|
|
|
| 1138 |
)
|
| 1139 |
|
| 1140 |
generation_section["generate_btn"].click(
|
|
@@ -1186,7 +1290,8 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 1186 |
results_section["align_score_2"],
|
| 1187 |
results_section["align_text_2"],
|
| 1188 |
results_section["align_plot_2"],
|
| 1189 |
-
generation_section["text2music_audio_code_string"] # Update audio codes display
|
|
|
|
| 1190 |
]
|
| 1191 |
)
|
| 1192 |
|
|
@@ -1306,33 +1411,214 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 1306 |
]
|
| 1307 |
)
|
| 1308 |
|
| 1309 |
-
# Send generated audio to src_audio
|
| 1310 |
-
def
|
| 1311 |
-
"""Send generated audio file to src_audio input
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1312 |
if audio_file is None:
|
| 1313 |
-
return None
|
| 1314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1315 |
|
| 1316 |
results_section["send_to_src_btn_1"].click(
|
| 1317 |
-
fn=
|
| 1318 |
-
inputs=[
|
| 1319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1320 |
)
|
| 1321 |
|
| 1322 |
results_section["send_to_src_btn_2"].click(
|
| 1323 |
-
fn=
|
| 1324 |
-
inputs=[
|
| 1325 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1326 |
)
|
| 1327 |
|
| 1328 |
-
# Sample button -
|
| 1329 |
generation_section["sample_btn"].click(
|
| 1330 |
-
fn=
|
| 1331 |
inputs=[generation_section["task_type"]],
|
| 1332 |
outputs=[
|
| 1333 |
generation_section["captions"],
|
| 1334 |
generation_section["lyrics"],
|
| 1335 |
-
generation_section["think_checkbox"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1336 |
]
|
| 1337 |
)
|
| 1338 |
|
|
|
|
| 363 |
|
| 364 |
# Audio Codes for text2music
|
| 365 |
with gr.Accordion("🎼 LM Codes Hints", open=False, visible=True) as text2music_audio_codes_group:
|
| 366 |
+
with gr.Row(equal_height=True):
|
| 367 |
+
text2music_audio_code_string = gr.Textbox(
|
| 368 |
+
label="LM Codes Hints",
|
| 369 |
+
placeholder="<|audio_code_10695|><|audio_code_54246|>...",
|
| 370 |
+
lines=6,
|
| 371 |
+
info="Paste LM codes hints for text2music generation",
|
| 372 |
+
scale=9,
|
| 373 |
+
)
|
| 374 |
+
transcribe_btn = gr.Button(
|
| 375 |
+
"Transcribe",
|
| 376 |
+
variant="secondary",
|
| 377 |
+
size="sm",
|
| 378 |
+
scale=1,
|
| 379 |
+
)
|
| 380 |
|
| 381 |
# Repainting controls
|
| 382 |
with gr.Group(visible=False) as repainting_group:
|
|
|
|
| 647 |
"src_audio": src_audio,
|
| 648 |
"convert_src_to_codes_btn": convert_src_to_codes_btn,
|
| 649 |
"text2music_audio_code_string": text2music_audio_code_string,
|
| 650 |
+
"transcribe_btn": transcribe_btn,
|
| 651 |
"text2music_audio_codes_group": text2music_audio_codes_group,
|
| 652 |
"lm_temperature": lm_temperature,
|
| 653 |
"lm_cfg_scale": lm_cfg_scale,
|
|
|
|
| 688 |
with gr.Group():
|
| 689 |
gr.HTML('<div class="section-header"><h3>🎧 Generated Results</h3></div>')
|
| 690 |
|
| 691 |
+
# Hidden state to store LM-generated metadata
|
| 692 |
+
lm_metadata_state = gr.State(value=None)
|
| 693 |
+
|
| 694 |
status_output = gr.Textbox(label="Generation Status", interactive=False)
|
| 695 |
|
| 696 |
with gr.Row():
|
|
|
|
| 737 |
align_plot_2 = gr.Plot(label="Attention Focus Score Heatmap (Sample 2)")
|
| 738 |
|
| 739 |
return {
|
| 740 |
+
"lm_metadata_state": lm_metadata_state,
|
| 741 |
"status_output": status_output,
|
| 742 |
"generated_audio_1": generated_audio_1,
|
| 743 |
"generated_audio_2": generated_audio_2,
|
|
|
|
| 764 |
task_type: The task type (e.g., "text2music")
|
| 765 |
|
| 766 |
Returns:
|
| 767 |
+
Tuple of (caption, lyrics, think, bpm, duration, keyscale, language, timesignature) for updating UI components
|
| 768 |
"""
|
| 769 |
try:
|
| 770 |
# Get the project root directory
|
|
|
|
| 777 |
# Check if directory exists
|
| 778 |
if not os.path.exists(examples_dir):
|
| 779 |
gr.Warning(f"Examples directory not found: examples/{task_type}/")
|
| 780 |
+
return "", "", True, None, None, "", "", ""
|
| 781 |
|
| 782 |
# Find all JSON files in the directory
|
| 783 |
json_files = glob.glob(os.path.join(examples_dir, "*.json"))
|
| 784 |
|
| 785 |
if not json_files:
|
| 786 |
gr.Warning(f"No JSON files found in examples/{task_type}/")
|
| 787 |
+
return "", "", True, None, None, "", "", ""
|
| 788 |
|
| 789 |
# Randomly select one file
|
| 790 |
selected_file = random.choice(json_files)
|
|
|
|
| 809 |
if not isinstance(think_value, bool):
|
| 810 |
think_value = True
|
| 811 |
|
| 812 |
+
# Extract optional metadata fields
|
| 813 |
+
bpm_value = None
|
| 814 |
+
if 'bpm' in data and data['bpm'] not in [None, "N/A", ""]:
|
| 815 |
+
try:
|
| 816 |
+
bpm_value = int(data['bpm'])
|
| 817 |
+
except (ValueError, TypeError):
|
| 818 |
+
pass
|
| 819 |
+
|
| 820 |
+
duration_value = None
|
| 821 |
+
if 'duration' in data and data['duration'] not in [None, "N/A", ""]:
|
| 822 |
+
try:
|
| 823 |
+
duration_value = float(data['duration'])
|
| 824 |
+
except (ValueError, TypeError):
|
| 825 |
+
pass
|
| 826 |
+
|
| 827 |
+
keyscale_value = data.get('keyscale', '')
|
| 828 |
+
if keyscale_value in [None, "N/A"]:
|
| 829 |
+
keyscale_value = ''
|
| 830 |
+
|
| 831 |
+
language_value = data.get('language', '')
|
| 832 |
+
if language_value in [None, "N/A"]:
|
| 833 |
+
language_value = ''
|
| 834 |
+
|
| 835 |
+
timesignature_value = data.get('timesignature', '')
|
| 836 |
+
if timesignature_value in [None, "N/A"]:
|
| 837 |
+
timesignature_value = ''
|
| 838 |
+
|
| 839 |
+
gr.Info(f"📁 Loaded example from {os.path.basename(selected_file)}")
|
| 840 |
+
return caption_value, lyrics_value, think_value, bpm_value, duration_value, keyscale_value, language_value, timesignature_value
|
| 841 |
|
| 842 |
except json.JSONDecodeError as e:
|
| 843 |
gr.Warning(f"Failed to parse JSON file {os.path.basename(selected_file)}: {str(e)}")
|
| 844 |
+
return "", "", True, None, None, "", "", ""
|
| 845 |
except Exception as e:
|
| 846 |
gr.Warning(f"Error reading file {os.path.basename(selected_file)}: {str(e)}")
|
| 847 |
+
return "", "", True, None, None, "", "", ""
|
| 848 |
|
| 849 |
except Exception as e:
|
| 850 |
gr.Warning(f"Error loading example: {str(e)}")
|
| 851 |
+
return "", "", True, None, None, "", "", ""
|
| 852 |
+
|
| 853 |
+
def sample_example_smart(task_type: str):
    """Smart sample function that uses LM if initialized, otherwise falls back to examples

    Args:
        task_type: The task type (e.g., "text2music")

    Returns:
        Tuple of (caption, lyrics, think, bpm, duration, keyscale, language, timesignature) for updating UI components
    """
    # Without an initialized LM we can only serve pre-baked examples from disk
    if not llm_handler.llm_initialized:
        return load_random_example(task_type)

    try:
        # Ask the LM to invent an example from scratch (NO USER INPUT sentinel)
        metadata, status = llm_handler.understand_audio_from_codes(
            audio_codes="NO USER INPUT",
            use_constrained_decoding=True,
            temperature=0.85,
        )

        if not metadata:
            gr.Warning("Failed to generate example using LM, falling back to examples directory")
            return load_random_example(task_type)

        def _numeric(key, cast):
            # Parse a numeric metadata entry; absent/placeholder/unparsable -> None
            if key not in metadata or metadata[key] in [None, "N/A", ""]:
                return None
            try:
                return cast(metadata[key])
            except (ValueError, TypeError):
                return None

        def _text(key):
            # Text metadata entry; placeholder values collapse to ''
            raw = metadata.get(key, '')
            return '' if raw in [None, "N/A"] else raw

        result = (
            metadata.get('caption', ''),
            metadata.get('lyrics', ''),
            True,  # Always enable think when using LM-generated examples
            _numeric('bpm', int),
            _numeric('duration', float),
            _text('keyscale'),
            _text('language'),
            _text('timesignature'),
        )
        gr.Info("🤖 Generated example using LM (Language Model)")
        return result

    except Exception as e:
        gr.Warning(f"Error generating example with LM: {str(e)}, falling back to examples directory")
        return load_random_example(task_type)
|
| 917 |
|
| 918 |
def update_init_status(status_msg, enable_btn):
|
| 919 |
"""Update initialization status and enable/disable generate button"""
|
|
|
|
| 1210 |
metadata_lines.append(f"- **User Query Rewritten Caption:** {lm_generated_metadata['caption']}")
|
| 1211 |
if lm_generated_metadata.get('duration'):
|
| 1212 |
metadata_lines.append(f"- **Duration:** {lm_generated_metadata['duration']} seconds")
|
|
|
|
|
|
|
| 1213 |
if lm_generated_metadata.get('keyscale'):
|
| 1214 |
metadata_lines.append(f"- **KeyScale:** {lm_generated_metadata['keyscale']}")
|
| 1215 |
if lm_generated_metadata.get('language'):
|
|
|
|
| 1237 |
align_score_2,
|
| 1238 |
align_text_2,
|
| 1239 |
align_plot_2,
|
| 1240 |
+
updated_audio_codes, # Update audio codes in UI
|
| 1241 |
+
lm_generated_metadata # Store metadata for "Send to src audio" buttons
|
| 1242 |
)
|
| 1243 |
|
| 1244 |
generation_section["generate_btn"].click(
|
|
|
|
| 1290 |
results_section["align_score_2"],
|
| 1291 |
results_section["align_text_2"],
|
| 1292 |
results_section["align_plot_2"],
|
| 1293 |
+
generation_section["text2music_audio_code_string"], # Update audio codes display
|
| 1294 |
+
results_section["lm_metadata_state"] # Store metadata
|
| 1295 |
]
|
| 1296 |
)
|
| 1297 |
|
|
|
|
| 1411 |
]
|
| 1412 |
)
|
| 1413 |
|
| 1414 |
+
# Send generated audio to src_audio and populate metadata
|
| 1415 |
+
def send_audio_to_src_with_metadata(audio_file, lm_metadata):
|
| 1416 |
+
"""Send generated audio file to src_audio input and populate metadata fields
|
| 1417 |
+
|
| 1418 |
+
Args:
|
| 1419 |
+
audio_file: Audio file path
|
| 1420 |
+
lm_metadata: Dictionary containing LM-generated metadata
|
| 1421 |
+
|
| 1422 |
+
Returns:
|
| 1423 |
+
Tuple of (audio_file, bpm, caption, duration, key_scale, language, time_signature)
|
| 1424 |
+
"""
|
| 1425 |
if audio_file is None:
|
| 1426 |
+
return None, None, None, None, None, None, None
|
| 1427 |
+
|
| 1428 |
+
# Extract metadata fields if available
|
| 1429 |
+
bpm_value = None
|
| 1430 |
+
caption_value = None
|
| 1431 |
+
duration_value = None
|
| 1432 |
+
key_scale_value = None
|
| 1433 |
+
language_value = None
|
| 1434 |
+
time_signature_value = None
|
| 1435 |
+
|
| 1436 |
+
if lm_metadata:
|
| 1437 |
+
# BPM
|
| 1438 |
+
if lm_metadata.get('bpm'):
|
| 1439 |
+
bpm_str = lm_metadata.get('bpm')
|
| 1440 |
+
if bpm_str and bpm_str != "N/A":
|
| 1441 |
+
try:
|
| 1442 |
+
bpm_value = int(bpm_str)
|
| 1443 |
+
except (ValueError, TypeError):
|
| 1444 |
+
pass
|
| 1445 |
+
|
| 1446 |
+
# Caption (Rewritten Caption)
|
| 1447 |
+
if lm_metadata.get('caption'):
|
| 1448 |
+
caption_value = lm_metadata.get('caption')
|
| 1449 |
+
|
| 1450 |
+
# Duration
|
| 1451 |
+
if lm_metadata.get('duration'):
|
| 1452 |
+
duration_str = lm_metadata.get('duration')
|
| 1453 |
+
if duration_str and duration_str != "N/A":
|
| 1454 |
+
try:
|
| 1455 |
+
duration_value = float(duration_str)
|
| 1456 |
+
except (ValueError, TypeError):
|
| 1457 |
+
pass
|
| 1458 |
+
|
| 1459 |
+
# KeyScale
|
| 1460 |
+
if lm_metadata.get('keyscale'):
|
| 1461 |
+
key_scale_str = lm_metadata.get('keyscale')
|
| 1462 |
+
if key_scale_str and key_scale_str != "N/A":
|
| 1463 |
+
key_scale_value = key_scale_str
|
| 1464 |
+
|
| 1465 |
+
# Language
|
| 1466 |
+
if lm_metadata.get('language'):
|
| 1467 |
+
language_str = lm_metadata.get('language')
|
| 1468 |
+
if language_str and language_str != "N/A":
|
| 1469 |
+
language_value = language_str
|
| 1470 |
+
|
| 1471 |
+
# Time Signature
|
| 1472 |
+
if lm_metadata.get('timesignature'):
|
| 1473 |
+
time_sig_str = lm_metadata.get('timesignature')
|
| 1474 |
+
if time_sig_str and time_sig_str != "N/A":
|
| 1475 |
+
time_signature_value = time_sig_str
|
| 1476 |
+
|
| 1477 |
+
return (
|
| 1478 |
+
audio_file,
|
| 1479 |
+
bpm_value,
|
| 1480 |
+
caption_value,
|
| 1481 |
+
duration_value,
|
| 1482 |
+
key_scale_value,
|
| 1483 |
+
language_value,
|
| 1484 |
+
time_signature_value
|
| 1485 |
+
)
|
| 1486 |
|
| 1487 |
results_section["send_to_src_btn_1"].click(
|
| 1488 |
+
fn=send_audio_to_src_with_metadata,
|
| 1489 |
+
inputs=[
|
| 1490 |
+
results_section["generated_audio_1"],
|
| 1491 |
+
results_section["lm_metadata_state"]
|
| 1492 |
+
],
|
| 1493 |
+
outputs=[
|
| 1494 |
+
generation_section["src_audio"],
|
| 1495 |
+
generation_section["bpm"],
|
| 1496 |
+
generation_section["captions"],
|
| 1497 |
+
generation_section["audio_duration"],
|
| 1498 |
+
generation_section["key_scale"],
|
| 1499 |
+
generation_section["vocal_language"],
|
| 1500 |
+
generation_section["time_signature"]
|
| 1501 |
+
]
|
| 1502 |
)
|
| 1503 |
|
| 1504 |
results_section["send_to_src_btn_2"].click(
|
| 1505 |
+
fn=send_audio_to_src_with_metadata,
|
| 1506 |
+
inputs=[
|
| 1507 |
+
results_section["generated_audio_2"],
|
| 1508 |
+
results_section["lm_metadata_state"]
|
| 1509 |
+
],
|
| 1510 |
+
outputs=[
|
| 1511 |
+
generation_section["src_audio"],
|
| 1512 |
+
generation_section["bpm"],
|
| 1513 |
+
generation_section["captions"],
|
| 1514 |
+
generation_section["audio_duration"],
|
| 1515 |
+
generation_section["key_scale"],
|
| 1516 |
+
generation_section["vocal_language"],
|
| 1517 |
+
generation_section["time_signature"]
|
| 1518 |
+
]
|
| 1519 |
)
|
| 1520 |
|
| 1521 |
+
# Sample button - smart sample (uses LM if initialized, otherwise examples)
|
| 1522 |
generation_section["sample_btn"].click(
|
| 1523 |
+
fn=sample_example_smart,
|
| 1524 |
inputs=[generation_section["task_type"]],
|
| 1525 |
outputs=[
|
| 1526 |
generation_section["captions"],
|
| 1527 |
generation_section["lyrics"],
|
| 1528 |
+
generation_section["think_checkbox"],
|
| 1529 |
+
generation_section["bpm"],
|
| 1530 |
+
generation_section["audio_duration"],
|
| 1531 |
+
generation_section["key_scale"],
|
| 1532 |
+
generation_section["vocal_language"],
|
| 1533 |
+
generation_section["time_signature"],
|
| 1534 |
+
]
|
| 1535 |
+
)
|
| 1536 |
+
|
| 1537 |
+
# Transcribe audio codes to metadata (or generate example if empty)
|
| 1538 |
+
def transcribe_audio_codes(audio_code_string):
|
| 1539 |
+
"""
|
| 1540 |
+
Transcribe audio codes to metadata using LLM understanding.
|
| 1541 |
+
If audio_code_string is empty, generate a sample example instead.
|
| 1542 |
+
|
| 1543 |
+
Args:
|
| 1544 |
+
audio_code_string: String containing audio codes (or empty for example generation)
|
| 1545 |
+
|
| 1546 |
+
Returns:
|
| 1547 |
+
Tuple of (status_message, caption, lyrics, bpm, duration, keyscale, language, timesignature)
|
| 1548 |
+
"""
|
| 1549 |
+
if not llm_handler.llm_initialized:
|
| 1550 |
+
return "❌ 5Hz LM not initialized. Please initialize it first.", "", "", None, None, "", "", ""
|
| 1551 |
+
|
| 1552 |
+
# If codes are empty, this becomes a "generate example" task
|
| 1553 |
+
# Use "NO USER INPUT" as the input to generate a sample
|
| 1554 |
+
if not audio_code_string or not audio_code_string.strip():
|
| 1555 |
+
audio_code_string = "NO USER INPUT"
|
| 1556 |
+
|
| 1557 |
+
# Call LLM understanding
|
| 1558 |
+
metadata, status = llm_handler.understand_audio_from_codes(audio_codes=audio_code_string, use_constrained_decoding=True)
|
| 1559 |
+
|
| 1560 |
+
# Extract fields for UI update
|
| 1561 |
+
caption = metadata.get('caption', '')
|
| 1562 |
+
lyrics = metadata.get('lyrics', '')
|
| 1563 |
+
bpm = metadata.get('bpm')
|
| 1564 |
+
duration = metadata.get('duration')
|
| 1565 |
+
keyscale = metadata.get('keyscale', '')
|
| 1566 |
+
language = metadata.get('language', '')
|
| 1567 |
+
timesignature = metadata.get('timesignature', '')
|
| 1568 |
+
|
| 1569 |
+
# Convert to appropriate types
|
| 1570 |
+
try:
|
| 1571 |
+
bpm = int(bpm) if bpm and bpm != 'N/A' else None
|
| 1572 |
+
except:
|
| 1573 |
+
bpm = None
|
| 1574 |
+
|
| 1575 |
+
try:
|
| 1576 |
+
duration = float(duration) if duration and duration != 'N/A' else None
|
| 1577 |
+
except:
|
| 1578 |
+
duration = None
|
| 1579 |
+
|
| 1580 |
+
return (
|
| 1581 |
+
status,
|
| 1582 |
+
caption,
|
| 1583 |
+
lyrics,
|
| 1584 |
+
bpm,
|
| 1585 |
+
duration,
|
| 1586 |
+
keyscale,
|
| 1587 |
+
language,
|
| 1588 |
+
timesignature
|
| 1589 |
+
)
|
| 1590 |
+
|
| 1591 |
+
# Update transcribe button text based on whether codes are present
|
| 1592 |
+
def update_transcribe_button_text(audio_code_string):
|
| 1593 |
+
"""
|
| 1594 |
+
Update the transcribe button text based on input content.
|
| 1595 |
+
If empty: "Generate Example"
|
| 1596 |
+
If has content: "Transcribe"
|
| 1597 |
+
"""
|
| 1598 |
+
if not audio_code_string or not audio_code_string.strip():
|
| 1599 |
+
return gr.update(value="Generate Example")
|
| 1600 |
+
else:
|
| 1601 |
+
return gr.update(value="Transcribe")
|
| 1602 |
+
|
| 1603 |
+
# Update button text when codes change
|
| 1604 |
+
generation_section["text2music_audio_code_string"].change(
|
| 1605 |
+
fn=update_transcribe_button_text,
|
| 1606 |
+
inputs=[generation_section["text2music_audio_code_string"]],
|
| 1607 |
+
outputs=[generation_section["transcribe_btn"]]
|
| 1608 |
+
)
|
| 1609 |
+
|
| 1610 |
+
generation_section["transcribe_btn"].click(
|
| 1611 |
+
fn=transcribe_audio_codes,
|
| 1612 |
+
inputs=[generation_section["text2music_audio_code_string"]],
|
| 1613 |
+
outputs=[
|
| 1614 |
+
results_section["status_output"], # Show status
|
| 1615 |
+
generation_section["captions"], # Update caption field
|
| 1616 |
+
generation_section["lyrics"], # Update lyrics field
|
| 1617 |
+
generation_section["bpm"], # Update BPM field
|
| 1618 |
+
generation_section["audio_duration"], # Update duration field
|
| 1619 |
+
generation_section["key_scale"], # Update keyscale field
|
| 1620 |
+
generation_section["vocal_language"], # Update language field
|
| 1621 |
+
generation_section["time_signature"], # Update time signature field
|
| 1622 |
]
|
| 1623 |
)
|
| 1624 |
|
acestep/llm_inference.py
CHANGED
|
@@ -8,6 +8,7 @@ import time
|
|
| 8 |
from typing import Optional, Dict, Any, Tuple, List
|
| 9 |
from contextlib import contextmanager
|
| 10 |
|
|
|
|
| 11 |
import torch
|
| 12 |
from loguru import logger
|
| 13 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
@@ -17,7 +18,7 @@ from transformers.generation.logits_process import (
|
|
| 17 |
RepetitionPenaltyLogitsProcessor,
|
| 18 |
)
|
| 19 |
from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
|
| 20 |
-
from acestep.constants import DEFAULT_LM_INSTRUCTION
|
| 21 |
|
| 22 |
|
| 23 |
class LLMHandler:
|
|
@@ -247,6 +248,10 @@ class LLMHandler:
|
|
| 247 |
stop_at_reasoning: bool = False,
|
| 248 |
skip_caption: bool = False,
|
| 249 |
skip_language: bool = False,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
) -> str:
|
| 251 |
"""Shared vllm path: accept prebuilt formatted prompt and return text."""
|
| 252 |
from nanovllm import SamplingParams
|
|
@@ -258,12 +263,14 @@ class LLMHandler:
|
|
| 258 |
# Use shared constrained processor if enabled
|
| 259 |
constrained_processor = None
|
| 260 |
if use_constrained_decoding or use_phase_temperatures:
|
|
|
|
|
|
|
|
|
|
| 261 |
# Use shared processor, just update caption and settings
|
| 262 |
self.constrained_processor.enabled = use_constrained_decoding
|
| 263 |
self.constrained_processor.debug = constrained_decoding_debug
|
| 264 |
self.constrained_processor.metadata_temperature = metadata_temperature if use_phase_temperatures else None
|
| 265 |
self.constrained_processor.codes_temperature = codes_temperature if use_phase_temperatures else None
|
| 266 |
-
self.constrained_processor.update_caption(formatted_prompt) # Use formatted prompt for genre extraction
|
| 267 |
self.constrained_processor.set_target_duration(target_duration)
|
| 268 |
# Always call set_user_metadata to ensure previous settings are cleared if None
|
| 269 |
self.constrained_processor.set_user_metadata(user_metadata)
|
|
@@ -271,6 +278,8 @@ class LLMHandler:
|
|
| 271 |
# Set skip_caption and skip_language based on flags
|
| 272 |
self.constrained_processor.set_skip_caption(skip_caption)
|
| 273 |
self.constrained_processor.set_skip_language(skip_language)
|
|
|
|
|
|
|
| 274 |
|
| 275 |
constrained_processor = self.constrained_processor
|
| 276 |
|
|
@@ -286,7 +295,21 @@ class LLMHandler:
|
|
| 286 |
)
|
| 287 |
|
| 288 |
if cfg_scale > 1.0:
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
outputs = self.llm.generate(
|
| 291 |
[formatted_prompt],
|
| 292 |
sampling_params,
|
|
@@ -326,6 +349,10 @@ class LLMHandler:
|
|
| 326 |
stop_at_reasoning: bool = False,
|
| 327 |
skip_caption: bool = False,
|
| 328 |
skip_language: bool = False,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
) -> str:
|
| 330 |
"""Shared PyTorch path: accept prebuilt formatted prompt and return text."""
|
| 331 |
inputs = self.llm_tokenizer(
|
|
@@ -338,10 +365,12 @@ class LLMHandler:
|
|
| 338 |
# Use shared constrained processor if enabled
|
| 339 |
constrained_processor = None
|
| 340 |
if use_constrained_decoding:
|
|
|
|
|
|
|
|
|
|
| 341 |
# Use shared processor, just update caption and settings
|
| 342 |
self.constrained_processor.enabled = use_constrained_decoding
|
| 343 |
self.constrained_processor.debug = constrained_decoding_debug
|
| 344 |
-
self.constrained_processor.update_caption(formatted_prompt) # Use formatted prompt for genre extraction
|
| 345 |
self.constrained_processor.set_target_duration(target_duration)
|
| 346 |
# Always call set_user_metadata to ensure previous settings are cleared if None
|
| 347 |
self.constrained_processor.set_user_metadata(user_metadata)
|
|
@@ -349,6 +378,8 @@ class LLMHandler:
|
|
| 349 |
# Set skip_caption and skip_language based on flags
|
| 350 |
self.constrained_processor.set_skip_caption(skip_caption)
|
| 351 |
self.constrained_processor.set_skip_language(skip_language)
|
|
|
|
|
|
|
| 352 |
|
| 353 |
constrained_processor = self.constrained_processor
|
| 354 |
|
|
@@ -366,7 +397,18 @@ class LLMHandler:
|
|
| 366 |
logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
|
| 367 |
|
| 368 |
if cfg_scale > 1.0:
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
# Tokenize both prompts together to ensure same length (with left padding)
|
| 372 |
# Left padding is important for generation tasks
|
|
@@ -464,9 +506,33 @@ class LLMHandler:
|
|
| 464 |
"""Check if all required metadata are present."""
|
| 465 |
if user_metadata is None:
|
| 466 |
return False
|
| 467 |
-
if 'bpm' in user_metadata and 'keyscale' in user_metadata and 'timesignature' in user_metadata and 'duration' in user_metadata
|
| 468 |
return True
|
| 469 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
|
| 471 |
def generate_with_stop_condition(
|
| 472 |
self,
|
|
@@ -486,10 +552,10 @@ class LLMHandler:
|
|
| 486 |
use_cot_caption: bool = True,
|
| 487 |
use_cot_language: bool = True,
|
| 488 |
) -> Tuple[Dict[str, Any], str, str]:
|
| 489 |
-
"""
|
| 490 |
|
| 491 |
-
- infer_type='dit':
|
| 492 |
-
- infer_type='llm_dit':
|
| 493 |
|
| 494 |
Args:
|
| 495 |
target_duration: Target duration in seconds for codes generation constraint.
|
|
@@ -503,17 +569,21 @@ class LLMHandler:
|
|
| 503 |
if infer_type not in {"dit", "llm_dit"}:
|
| 504 |
return {}, "", f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
|
| 505 |
|
| 506 |
-
|
| 507 |
-
formatted_prompt = self.build_formatted_prompt(caption, lyrics)
|
| 508 |
-
|
| 509 |
-
# Determine stop condition
|
| 510 |
-
stop_at_reasoning = (infer_type == "dit")
|
| 511 |
-
has_all_metas = self.has_all_metas(user_metadata)
|
| 512 |
audio_codes = ""
|
|
|
|
| 513 |
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
formatted_prompt=formatted_prompt,
|
| 518 |
cfg={
|
| 519 |
"temperature": temperature,
|
|
@@ -522,39 +592,121 @@ class LLMHandler:
|
|
| 522 |
"top_k": top_k,
|
| 523 |
"top_p": top_p,
|
| 524 |
"repetition_penalty": repetition_penalty,
|
| 525 |
-
"target_duration":
|
| 526 |
"user_metadata": user_metadata,
|
| 527 |
"skip_caption": not use_cot_caption,
|
| 528 |
"skip_language": not use_cot_language,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 529 |
},
|
| 530 |
use_constrained_decoding=use_constrained_decoding,
|
| 531 |
constrained_decoding_debug=constrained_decoding_debug,
|
| 532 |
-
stop_at_reasoning=
|
| 533 |
)
|
| 534 |
-
|
|
|
|
| 535 |
return {}, "", status
|
| 536 |
-
|
| 537 |
-
# Parse output
|
| 538 |
-
metadata,
|
| 539 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
|
| 541 |
-
|
|
|
|
|
|
|
| 542 |
return metadata, audio_codes, status_msg
|
| 543 |
|
| 544 |
-
def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False) -> str:
|
| 545 |
"""
|
| 546 |
Build the chat-formatted prompt for 5Hz LM from caption/lyrics.
|
| 547 |
Raises a ValueError if the tokenizer is not initialized.
|
| 548 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
Example:
|
| 550 |
prompt = handler.build_formatted_prompt("calm piano", "hello world")
|
| 551 |
"""
|
| 552 |
if self.llm_tokenizer is None:
|
| 553 |
raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")
|
|
|
|
| 554 |
if is_negative_prompt:
|
| 555 |
-
prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
else:
|
|
|
|
| 557 |
prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
|
|
|
|
| 558 |
return self.llm_tokenizer.apply_chat_template(
|
| 559 |
[
|
| 560 |
{"role": "system", "content": f"# Instruction\n{DEFAULT_LM_INSTRUCTION}\n\n"},
|
|
@@ -563,7 +715,258 @@ class LLMHandler:
|
|
| 563 |
tokenize=False,
|
| 564 |
add_generation_prompt=True,
|
| 565 |
)
|
| 566 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
def generate_from_formatted_prompt(
|
| 568 |
self,
|
| 569 |
formatted_prompt: str,
|
|
@@ -583,6 +986,7 @@ class LLMHandler:
|
|
| 583 |
- negative_prompt (str) used when cfg_scale > 1
|
| 584 |
- top_k (int), top_p (float), repetition_penalty (float)
|
| 585 |
- target_duration (float): Target duration in seconds for codes generation
|
|
|
|
| 586 |
use_constrained_decoding: Whether to use FSM-based constrained decoding
|
| 587 |
constrained_decoding_debug: Whether to enable debug logging for constrained decoding
|
| 588 |
stop_at_reasoning: If True, stop generation immediately after </think> tag (no audio codes)
|
|
@@ -610,6 +1014,11 @@ class LLMHandler:
|
|
| 610 |
user_metadata = cfg.get("user_metadata") # User-provided metadata fields
|
| 611 |
skip_caption = cfg.get("skip_caption", False) # Skip caption generation in CoT
|
| 612 |
skip_language = cfg.get("skip_language", False) # Skip language generation in CoT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
|
| 614 |
try:
|
| 615 |
if self.llm_backend == "vllm":
|
|
@@ -628,6 +1037,10 @@ class LLMHandler:
|
|
| 628 |
stop_at_reasoning=stop_at_reasoning,
|
| 629 |
skip_caption=skip_caption,
|
| 630 |
skip_language=skip_language,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
)
|
| 632 |
return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
|
| 633 |
|
|
@@ -647,6 +1060,10 @@ class LLMHandler:
|
|
| 647 |
stop_at_reasoning=stop_at_reasoning,
|
| 648 |
skip_caption=skip_caption,
|
| 649 |
skip_language=skip_language,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
)
|
| 651 |
return output_text, f"✅ Generated successfully (pt) | length={len(output_text)}"
|
| 652 |
|
|
|
|
| 8 |
from typing import Optional, Dict, Any, Tuple, List
|
| 9 |
from contextlib import contextmanager
|
| 10 |
|
| 11 |
+
import yaml
|
| 12 |
import torch
|
| 13 |
from loguru import logger
|
| 14 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
|
|
| 18 |
RepetitionPenaltyLogitsProcessor,
|
| 19 |
)
|
| 20 |
from acestep.constrained_logits_processor import MetadataConstrainedLogitsProcessor
|
| 21 |
+
from acestep.constants import DEFAULT_LM_INSTRUCTION, DEFAULT_LM_UNDERSTAND_INSTRUCTION
|
| 22 |
|
| 23 |
|
| 24 |
class LLMHandler:
|
|
|
|
| 248 |
stop_at_reasoning: bool = False,
|
| 249 |
skip_caption: bool = False,
|
| 250 |
skip_language: bool = False,
|
| 251 |
+
generation_phase: str = "cot",
|
| 252 |
+
caption: str = "",
|
| 253 |
+
lyrics: str = "",
|
| 254 |
+
cot_text: str = "",
|
| 255 |
) -> str:
|
| 256 |
"""Shared vllm path: accept prebuilt formatted prompt and return text."""
|
| 257 |
from nanovllm import SamplingParams
|
|
|
|
| 263 |
# Use shared constrained processor if enabled
|
| 264 |
constrained_processor = None
|
| 265 |
if use_constrained_decoding or use_phase_temperatures:
|
| 266 |
+
# Reset processor state for new generation
|
| 267 |
+
self.constrained_processor.reset()
|
| 268 |
+
|
| 269 |
# Use shared processor, just update caption and settings
|
| 270 |
self.constrained_processor.enabled = use_constrained_decoding
|
| 271 |
self.constrained_processor.debug = constrained_decoding_debug
|
| 272 |
self.constrained_processor.metadata_temperature = metadata_temperature if use_phase_temperatures else None
|
| 273 |
self.constrained_processor.codes_temperature = codes_temperature if use_phase_temperatures else None
|
|
|
|
| 274 |
self.constrained_processor.set_target_duration(target_duration)
|
| 275 |
# Always call set_user_metadata to ensure previous settings are cleared if None
|
| 276 |
self.constrained_processor.set_user_metadata(user_metadata)
|
|
|
|
| 278 |
# Set skip_caption and skip_language based on flags
|
| 279 |
self.constrained_processor.set_skip_caption(skip_caption)
|
| 280 |
self.constrained_processor.set_skip_language(skip_language)
|
| 281 |
+
# Set generation phase for phase-aware processing
|
| 282 |
+
self.constrained_processor.set_generation_phase(generation_phase)
|
| 283 |
|
| 284 |
constrained_processor = self.constrained_processor
|
| 285 |
|
|
|
|
| 295 |
)
|
| 296 |
|
| 297 |
if cfg_scale > 1.0:
|
| 298 |
+
# Build unconditional prompt based on generation phase
|
| 299 |
+
if generation_phase == "codes":
|
| 300 |
+
# Codes phase: use empty CoT in unconditional prompt
|
| 301 |
+
# formatted_prompt was built with build_formatted_prompt_with_cot(caption, lyrics, cot_text)
|
| 302 |
+
# For unconditional, we use empty CoT: build_formatted_prompt_with_cot(caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=...)
|
| 303 |
+
formatted_unconditional_prompt = self.build_formatted_prompt_with_cot(
|
| 304 |
+
caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
|
| 305 |
+
)
|
| 306 |
+
else:
|
| 307 |
+
# CoT phase: unconditional prompt
|
| 308 |
+
# If negative_prompt is provided, use it as caption; otherwise remove caption and keep only lyrics
|
| 309 |
+
formatted_unconditional_prompt = self.build_formatted_prompt(
|
| 310 |
+
caption, lyrics, is_negative_prompt=True, generation_phase="cot", negative_prompt=negative_prompt
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
outputs = self.llm.generate(
|
| 314 |
[formatted_prompt],
|
| 315 |
sampling_params,
|
|
|
|
| 349 |
stop_at_reasoning: bool = False,
|
| 350 |
skip_caption: bool = False,
|
| 351 |
skip_language: bool = False,
|
| 352 |
+
generation_phase: str = "cot",
|
| 353 |
+
caption: str = "",
|
| 354 |
+
lyrics: str = "",
|
| 355 |
+
cot_text: str = "",
|
| 356 |
) -> str:
|
| 357 |
"""Shared PyTorch path: accept prebuilt formatted prompt and return text."""
|
| 358 |
inputs = self.llm_tokenizer(
|
|
|
|
| 365 |
# Use shared constrained processor if enabled
|
| 366 |
constrained_processor = None
|
| 367 |
if use_constrained_decoding:
|
| 368 |
+
# Reset processor state for new generation
|
| 369 |
+
self.constrained_processor.reset()
|
| 370 |
+
|
| 371 |
# Use shared processor, just update caption and settings
|
| 372 |
self.constrained_processor.enabled = use_constrained_decoding
|
| 373 |
self.constrained_processor.debug = constrained_decoding_debug
|
|
|
|
| 374 |
self.constrained_processor.set_target_duration(target_duration)
|
| 375 |
# Always call set_user_metadata to ensure previous settings are cleared if None
|
| 376 |
self.constrained_processor.set_user_metadata(user_metadata)
|
|
|
|
| 378 |
# Set skip_caption and skip_language based on flags
|
| 379 |
self.constrained_processor.set_skip_caption(skip_caption)
|
| 380 |
self.constrained_processor.set_skip_language(skip_language)
|
| 381 |
+
# Set generation phase for phase-aware processing
|
| 382 |
+
self.constrained_processor.set_generation_phase(generation_phase)
|
| 383 |
|
| 384 |
constrained_processor = self.constrained_processor
|
| 385 |
|
|
|
|
| 397 |
logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
|
| 398 |
|
| 399 |
if cfg_scale > 1.0:
|
| 400 |
+
# Build unconditional prompt based on generation phase
|
| 401 |
+
if generation_phase == "codes":
|
| 402 |
+
# Codes phase: use empty CoT in unconditional prompt
|
| 403 |
+
formatted_unconditional_prompt = self.build_formatted_prompt_with_cot(
|
| 404 |
+
caption, lyrics, cot_text, is_negative_prompt=True, negative_prompt=negative_prompt
|
| 405 |
+
)
|
| 406 |
+
else:
|
| 407 |
+
# CoT phase: unconditional prompt
|
| 408 |
+
# If negative_prompt is provided, use it as caption; otherwise remove caption and keep only lyrics
|
| 409 |
+
formatted_unconditional_prompt = self.build_formatted_prompt(
|
| 410 |
+
caption, lyrics, is_negative_prompt=True, generation_phase="cot", negative_prompt=negative_prompt
|
| 411 |
+
)
|
| 412 |
|
| 413 |
# Tokenize both prompts together to ensure same length (with left padding)
|
| 414 |
# Left padding is important for generation tasks
|
|
|
|
| 506 |
"""Check if all required metadata are present."""
|
| 507 |
if user_metadata is None:
|
| 508 |
return False
|
| 509 |
+
if 'bpm' in user_metadata and 'keyscale' in user_metadata and 'timesignature' in user_metadata and 'duration' in user_metadata:
|
| 510 |
return True
|
| 511 |
return False
|
| 512 |
+
|
| 513 |
+
def _format_metadata_as_cot(self, metadata: Dict[str, Any]) -> str:
|
| 514 |
+
"""
|
| 515 |
+
Format parsed metadata as CoT text using YAML format (matching training format).
|
| 516 |
+
|
| 517 |
+
Args:
|
| 518 |
+
metadata: Dictionary with keys: bpm, caption, duration, keyscale, language, timesignature
|
| 519 |
+
|
| 520 |
+
Returns:
|
| 521 |
+
Formatted CoT text: "<think>\n{yaml_content}\n</think>"
|
| 522 |
+
"""
|
| 523 |
+
# Build cot_items dict with only non-None values
|
| 524 |
+
cot_items = {}
|
| 525 |
+
for key in ['bpm', 'caption', 'duration', 'keyscale', 'language', 'timesignature']:
|
| 526 |
+
if key in metadata and metadata[key] is not None:
|
| 527 |
+
cot_items[key] = metadata[key]
|
| 528 |
+
|
| 529 |
+
# Format as YAML (sorted keys, unicode support)
|
| 530 |
+
if len(cot_items) > 0:
|
| 531 |
+
cot_yaml = yaml.dump(cot_items, allow_unicode=True, sort_keys=True).strip()
|
| 532 |
+
else:
|
| 533 |
+
cot_yaml = ""
|
| 534 |
+
|
| 535 |
+
return f"<think>\n{cot_yaml}\n</think>"
|
| 536 |
|
| 537 |
def generate_with_stop_condition(
|
| 538 |
self,
|
|
|
|
| 552 |
use_cot_caption: bool = True,
|
| 553 |
use_cot_language: bool = True,
|
| 554 |
) -> Tuple[Dict[str, Any], str, str]:
|
| 555 |
+
"""Two-phase LM generation: CoT generation followed by audio codes generation.
|
| 556 |
|
| 557 |
+
- infer_type='dit': Phase 1 only - generate CoT and return metas (no audio codes)
|
| 558 |
+
- infer_type='llm_dit': Phase 1 + Phase 2 - generate CoT then audio codes
|
| 559 |
|
| 560 |
Args:
|
| 561 |
target_duration: Target duration in seconds for codes generation constraint.
|
|
|
|
| 569 |
if infer_type not in {"dit", "llm_dit"}:
|
| 570 |
return {}, "", f"❌ invalid infer_type: {infer_type!r} (expected 'dit' or 'llm_dit')"
|
| 571 |
|
| 572 |
+
metadata = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 573 |
audio_codes = ""
|
| 574 |
+
has_all_metas = self.has_all_metas(user_metadata)
|
| 575 |
|
| 576 |
+
# ========== PHASE 1: CoT Generation ==========
|
| 577 |
+
# Always generate CoT unless all metadata are user-provided
|
| 578 |
+
if not has_all_metas:
|
| 579 |
+
logger.info("Phase 1: Generating CoT metadata...")
|
| 580 |
+
|
| 581 |
+
# Build formatted prompt for CoT phase
|
| 582 |
+
formatted_prompt = self.build_formatted_prompt(caption, lyrics, generation_phase="cot")
|
| 583 |
+
|
| 584 |
+
logger.info(f"generate_with_stop_condition: formatted_prompt={formatted_prompt}")
|
| 585 |
+
# Generate CoT (stop at </think>)
|
| 586 |
+
cot_output_text, status = self.generate_from_formatted_prompt(
|
| 587 |
formatted_prompt=formatted_prompt,
|
| 588 |
cfg={
|
| 589 |
"temperature": temperature,
|
|
|
|
| 592 |
"top_k": top_k,
|
| 593 |
"top_p": top_p,
|
| 594 |
"repetition_penalty": repetition_penalty,
|
| 595 |
+
"target_duration": None, # No duration constraint for CoT phase
|
| 596 |
"user_metadata": user_metadata,
|
| 597 |
"skip_caption": not use_cot_caption,
|
| 598 |
"skip_language": not use_cot_language,
|
| 599 |
+
"generation_phase": "cot",
|
| 600 |
+
# Pass context for building unconditional prompt in CoT phase
|
| 601 |
+
"caption": caption,
|
| 602 |
+
"lyrics": lyrics,
|
| 603 |
},
|
| 604 |
use_constrained_decoding=use_constrained_decoding,
|
| 605 |
constrained_decoding_debug=constrained_decoding_debug,
|
| 606 |
+
stop_at_reasoning=True, # Always stop at </think> in Phase 1
|
| 607 |
)
|
| 608 |
+
|
| 609 |
+
if not cot_output_text:
|
| 610 |
return {}, "", status
|
| 611 |
+
|
| 612 |
+
# Parse metadata from CoT output
|
| 613 |
+
metadata, _ = self.parse_lm_output(cot_output_text)
|
| 614 |
+
logger.info(f"Phase 1 completed. Generated metadata: {list(metadata.keys())}")
|
| 615 |
+
else:
|
| 616 |
+
# Use user-provided metadata
|
| 617 |
+
logger.info("Phase 1: Using user-provided metadata (skipping generation)")
|
| 618 |
+
metadata = {k: v for k, v in user_metadata.items() if v is not None}
|
| 619 |
+
|
| 620 |
+
# If infer_type is 'dit', stop here and return only metadata
|
| 621 |
+
if infer_type == "dit":
|
| 622 |
+
status_msg = f"✅ Generated CoT metadata successfully\nFields: {', '.join(metadata.keys())}"
|
| 623 |
+
return metadata, "", status_msg
|
| 624 |
+
|
| 625 |
+
# ========== PHASE 2: Audio Codes Generation ==========
|
| 626 |
+
logger.info("Phase 2: Generating audio codes...")
|
| 627 |
+
|
| 628 |
+
# Format metadata as CoT using YAML (matching training format)
|
| 629 |
+
cot_text = self._format_metadata_as_cot(metadata)
|
| 630 |
+
|
| 631 |
+
# Build formatted prompt with CoT for codes generation phase
|
| 632 |
+
formatted_prompt_with_cot = self.build_formatted_prompt_with_cot(caption, lyrics, cot_text)
|
| 633 |
+
logger.info(f"generate_with_stop_condition: formatted_prompt_with_cot={formatted_prompt_with_cot}")
|
| 634 |
+
# Generate audio codes
|
| 635 |
+
codes_output_text, status = self.generate_from_formatted_prompt(
|
| 636 |
+
formatted_prompt=formatted_prompt_with_cot,
|
| 637 |
+
cfg={
|
| 638 |
+
"temperature": temperature,
|
| 639 |
+
"cfg_scale": cfg_scale,
|
| 640 |
+
"negative_prompt": negative_prompt,
|
| 641 |
+
"top_k": top_k,
|
| 642 |
+
"top_p": top_p,
|
| 643 |
+
"repetition_penalty": repetition_penalty,
|
| 644 |
+
"target_duration": target_duration,
|
| 645 |
+
"user_metadata": None, # No user metadata injection in Phase 2
|
| 646 |
+
"skip_caption": True, # Skip caption since CoT is already included
|
| 647 |
+
"skip_language": True, # Skip language since CoT is already included
|
| 648 |
+
"generation_phase": "codes",
|
| 649 |
+
# Pass context for building unconditional prompt in codes phase
|
| 650 |
+
"caption": caption,
|
| 651 |
+
"lyrics": lyrics,
|
| 652 |
+
"cot_text": cot_text,
|
| 653 |
+
},
|
| 654 |
+
use_constrained_decoding=use_constrained_decoding,
|
| 655 |
+
constrained_decoding_debug=constrained_decoding_debug,
|
| 656 |
+
stop_at_reasoning=False, # Generate codes until EOS
|
| 657 |
+
)
|
| 658 |
+
|
| 659 |
+
if not codes_output_text:
|
| 660 |
+
return metadata, "", status
|
| 661 |
+
|
| 662 |
+
# Parse audio codes from output (metadata should be same as Phase 1)
|
| 663 |
+
_, audio_codes = self.parse_lm_output(codes_output_text)
|
| 664 |
+
|
| 665 |
codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
|
| 666 |
+
logger.info(f"Phase 2 completed. Generated {codes_count} audio codes")
|
| 667 |
+
|
| 668 |
+
status_msg = f"✅ Generated successfully (2-phase)\nPhase 1: CoT metadata\nPhase 2: {codes_count} audio codes"
|
| 669 |
return metadata, audio_codes, status_msg
|
| 670 |
|
| 671 |
+
def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False, generation_phase: str = "cot", negative_prompt: str = "NO USER INPUT") -> str:
|
| 672 |
"""
|
| 673 |
Build the chat-formatted prompt for 5Hz LM from caption/lyrics.
|
| 674 |
Raises a ValueError if the tokenizer is not initialized.
|
| 675 |
|
| 676 |
+
Args:
|
| 677 |
+
caption: Caption text
|
| 678 |
+
lyrics: Lyrics text
|
| 679 |
+
is_negative_prompt: If True, builds unconditional prompt for CFG
|
| 680 |
+
generation_phase: "cot" or "codes" - affects unconditional prompt format
|
| 681 |
+
negative_prompt: Negative prompt for CFG (used when is_negative_prompt=True)
|
| 682 |
+
|
| 683 |
Example:
|
| 684 |
prompt = handler.build_formatted_prompt("calm piano", "hello world")
|
| 685 |
"""
|
| 686 |
if self.llm_tokenizer is None:
|
| 687 |
raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")
|
| 688 |
+
|
| 689 |
if is_negative_prompt:
|
| 690 |
+
# Unconditional prompt for CFG
|
| 691 |
+
# Check if user provided a meaningful negative prompt (not the default)
|
| 692 |
+
has_negative_prompt = negative_prompt and negative_prompt.strip() and negative_prompt.strip() != "NO USER INPUT"
|
| 693 |
+
|
| 694 |
+
if generation_phase == "cot":
|
| 695 |
+
# CoT phase unconditional prompt
|
| 696 |
+
if has_negative_prompt:
|
| 697 |
+
# If negative prompt provided, use it as caption
|
| 698 |
+
prompt = f"# Caption\n{negative_prompt}\n\n# Lyric\n{lyrics}\n"
|
| 699 |
+
else:
|
| 700 |
+
# No negative prompt: remove caption, keep only lyrics
|
| 701 |
+
prompt = f"# Lyric\n{lyrics}\n"
|
| 702 |
+
else:
|
| 703 |
+
# Codes phase: will be handled by build_formatted_prompt_with_cot
|
| 704 |
+
# For backward compatibility, use simple caption as before
|
| 705 |
+
prompt = caption
|
| 706 |
else:
|
| 707 |
+
# Conditional prompt: include both caption and lyrics
|
| 708 |
prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
|
| 709 |
+
|
| 710 |
return self.llm_tokenizer.apply_chat_template(
|
| 711 |
[
|
| 712 |
{"role": "system", "content": f"# Instruction\n{DEFAULT_LM_INSTRUCTION}\n\n"},
|
|
|
|
| 715 |
tokenize=False,
|
| 716 |
add_generation_prompt=True,
|
| 717 |
)
|
| 718 |
+
|
| 719 |
+
def build_formatted_prompt_with_cot(self, caption: str, lyrics: str, cot_text: str, is_negative_prompt: bool = False, negative_prompt: str = "NO USER INPUT") -> str:
    """
    Build the chat-formatted prompt for the codes generation phase with a
    pre-generated CoT.

    Args:
        caption: Caption text
        lyrics: Lyrics text
        cot_text: Pre-generated CoT text (e.g., "<think>\\nbpm: 120\\n...\\n</think>")
        is_negative_prompt: If True, uses an empty CoT for the CFG unconditional prompt
        negative_prompt: Negative prompt for CFG (used when is_negative_prompt=True)

    Returns:
        Formatted prompt string

    Raises:
        ValueError: If the tokenizer has not been initialized.

    Example:
        cot = "<think>\\nbpm: 120\\ncaption: calm piano\\n...\\n</think>"
        prompt = handler.build_formatted_prompt_with_cot("calm piano", "hello", cot)
    """
    if self.llm_tokenizer is None:
        raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")

    if not is_negative_prompt:
        # Conditional prompt: keep the caller's caption and the full CoT.
        caption_for_prompt, cot_for_prompt = caption, cot_text
    else:
        # Unconditional (CFG) prompt for the codes phase always uses an
        # empty CoT block.
        cot_for_prompt = "<think>\n</think>"
        # A meaningful negative prompt replaces the caption; blank input or
        # the sentinel default "NO USER INPUT" keeps the original caption.
        use_negative = bool(negative_prompt) and negative_prompt.strip() not in ("", "NO USER INPUT")
        caption_for_prompt = negative_prompt if use_negative else caption

    # The user turn carries ONLY caption and lyrics; the CoT belongs in the
    # assistant turn so the model continues generation right after it.
    chat = [
        {"role": "system", "content": f"# Instruction\n{DEFAULT_LM_INSTRUCTION}\n\n"},
        {"role": "user", "content": f"# Caption\n{caption_for_prompt}\n\n# Lyric\n{lyrics}\n"},
        {"role": "assistant", "content": cot_for_prompt},
    ]
    # add_generation_prompt=False: the CoT already sits in the assistant
    # message, so no fresh generation header must be appended.
    formatted = self.llm_tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=False,
    )

    # Guarantee a trailing newline after </think> so the model emits audio
    # codes on a fresh line.
    return formatted if formatted.endswith('\n') else formatted + '\n'
|
| 780 |
+
|
| 781 |
+
def build_formatted_prompt_for_understanding(
    self,
    audio_codes: str,
    is_negative_prompt: bool = False,
    negative_prompt: str = "NO USER INPUT"
) -> str:
    """
    Build the chat-formatted prompt for audio understanding from codes.

    This is the reverse of generation: given audio codes, the model is asked
    to produce metadata and lyrics.

    Args:
        audio_codes: Audio code string (e.g., "<|audio_code_123|><|audio_code_456|>...")
        is_negative_prompt: If True, builds the unconditional prompt for CFG
        negative_prompt: Negative prompt for CFG (used when is_negative_prompt=True)

    Returns:
        Formatted prompt string

    Raises:
        ValueError: If the tokenizer has not been initialized.

    Example:
        codes = "<|audio_code_18953|><|audio_code_13833|>..."
        prompt = handler.build_formatted_prompt_for_understanding(codes)
    """
    if self.llm_tokenizer is None:
        raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")

    # For the understanding task, the user turn carries the audio codes.
    # The unconditional (CFG) prompt uses the negative prompt instead, or an
    # empty string when none was provided. The sentinel default
    # "NO USER INPUT" is treated as "no negative prompt" for consistency
    # with build_formatted_prompt / build_formatted_prompt_with_cot
    # (previously the sentinel text itself leaked into the prompt).
    if is_negative_prompt:
        has_negative_prompt = bool(negative_prompt) and negative_prompt.strip() and negative_prompt.strip() != "NO USER INPUT"
        user_content = negative_prompt if has_negative_prompt else ""
    else:
        user_content = audio_codes

    return self.llm_tokenizer.apply_chat_template(
        [
            {
                "role": "system",
                "content": f"# Instruction\n{DEFAULT_LM_UNDERSTAND_INSTRUCTION}\n\n"
            },
            {
                "role": "user",
                "content": user_content
            },
        ],
        tokenize=False,
        add_generation_prompt=True,
    )
|
| 828 |
+
|
| 829 |
+
def understand_audio_from_codes(
    self,
    audio_codes: str,
    temperature: float = 0.3,
    cfg_scale: float = 1.0,
    negative_prompt: str = "NO USER INPUT",
    top_k: Optional[int] = None,
    top_p: Optional[float] = None,
    repetition_penalty: float = 1.0,
    use_constrained_decoding: bool = True,
    constrained_decoding_debug: bool = False,
) -> Tuple[Dict[str, Any], str]:
    """
    Understand audio codes and generate metadata + lyrics.

    This is the reverse of the normal generation flow:
    - Input: Audio codes
    - Output: Metadata (bpm, caption, duration, etc.) + Lyrics

    Args:
        audio_codes: String of audio code tokens (e.g., "<|audio_code_123|><|audio_code_456|>...")
        temperature: Sampling temperature for generation
        cfg_scale: Classifier-Free Guidance scale (1.0 = no CFG, >1.0 = use CFG)
        negative_prompt: Negative prompt for CFG
        top_k: Top-K sampling (None = disabled)
        top_p: Top-P (nucleus) sampling (None = disabled)
        repetition_penalty: Repetition penalty (1.0 = no penalty)
        use_constrained_decoding: Whether to use FSM-based constrained decoding for metadata
        constrained_decoding_debug: Whether to enable debug logging for constrained decoding

    Returns:
        Tuple of (metadata_dict, status_message)
        metadata_dict contains:
            - bpm: int or str
            - caption: str
            - duration: int or str
            - genres: str
            - keyscale: str
            - language: str
            - timesignature: str
            - lyrics: str (extracted from output after </think>)

    Example:
        codes = "<|audio_code_18953|><|audio_code_13833|>..."
        metadata, status = handler.understand_audio_from_codes(codes)
        print(metadata['caption'])  # "A cinematic orchestral piece..."
        print(metadata['lyrics'])   # "[Intro: ...]\\n..."
    """
    if not getattr(self, "llm_initialized", False):
        return {}, "❌ 5Hz LM not initialized. Please initialize it first."

    if not audio_codes or not audio_codes.strip():
        return {}, "❌ No audio codes provided. Please paste audio codes first."

    logger.info(f"Understanding audio codes (length: {len(audio_codes)} chars)")

    # Build formatted prompt for understanding
    formatted_prompt = self.build_formatted_prompt_for_understanding(audio_codes)
    # Fix: was a stray print() to stdout; route through the module logger
    # like the rest of this method so the full prompt only shows at DEBUG.
    logger.debug(f"formatted_prompt: {formatted_prompt}")
    # Generate using constrained decoding (understand phase)
    # We want to generate metadata first (CoT), then lyrics (natural text)
    output_text, status = self.generate_from_formatted_prompt(
        formatted_prompt=formatted_prompt,
        cfg={
            "temperature": temperature,
            "cfg_scale": cfg_scale,
            "negative_prompt": negative_prompt,
            "top_k": top_k,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "target_duration": None,  # No duration constraint for understanding
            "user_metadata": None,  # No user metadata injection
            "skip_caption": False,  # Generate caption
            "skip_language": False,  # Generate language
            "generation_phase": "understand",  # Understanding phase: generate CoT metadata, then free-form lyrics
            # Context for building unconditional prompt
            "caption": "",
            "lyrics": "",
        },
        use_constrained_decoding=use_constrained_decoding,
        constrained_decoding_debug=constrained_decoding_debug,
        stop_at_reasoning=False,  # Continue after </think> to generate lyrics
    )

    if not output_text:
        return {}, status

    # Parse metadata and extract lyrics
    metadata, _ = self.parse_lm_output(output_text)

    # Extract lyrics section (everything after </think>)
    lyrics = self._extract_lyrics_from_output(output_text)
    if lyrics:
        metadata['lyrics'] = lyrics

    logger.info(f"Understanding completed. Generated {len(metadata)} metadata fields")
    if constrained_decoding_debug:
        logger.debug(f"Generated metadata: {list(metadata.keys())}")
        logger.debug(f"Output text preview: {output_text[:200]}...")

    status_msg = f"✅ Understanding completed successfully\nGenerated fields: {', '.join(metadata.keys())}"
    return metadata, status_msg
|
| 931 |
+
|
| 932 |
+
def _extract_lyrics_from_output(self, output_text: str) -> str:
|
| 933 |
+
"""
|
| 934 |
+
Extract lyrics section from LLM output.
|
| 935 |
+
|
| 936 |
+
The lyrics appear after the </think> tag and typically start with "# Lyric"
|
| 937 |
+
or directly with lyric content.
|
| 938 |
+
|
| 939 |
+
Args:
|
| 940 |
+
output_text: Full LLM output text
|
| 941 |
+
|
| 942 |
+
Returns:
|
| 943 |
+
Extracted lyrics string, or empty string if no lyrics found
|
| 944 |
+
"""
|
| 945 |
+
import re
|
| 946 |
+
|
| 947 |
+
# Find the </think> tag
|
| 948 |
+
think_end_pattern = r'</think>'
|
| 949 |
+
match = re.search(think_end_pattern, output_text)
|
| 950 |
+
|
| 951 |
+
if not match:
|
| 952 |
+
# No </think> tag found, no lyrics
|
| 953 |
+
return ""
|
| 954 |
+
|
| 955 |
+
# Extract everything after </think>
|
| 956 |
+
after_think = output_text[match.end():].strip()
|
| 957 |
+
|
| 958 |
+
if not after_think:
|
| 959 |
+
return ""
|
| 960 |
+
|
| 961 |
+
# Remove "# Lyric" header if present
|
| 962 |
+
lyric_header_pattern = r'^#\s*Lyri[c|cs]?\s*\n'
|
| 963 |
+
after_think = re.sub(lyric_header_pattern, '', after_think, flags=re.IGNORECASE)
|
| 964 |
+
|
| 965 |
+
# Remove <|im_end|> tag at the end if present
|
| 966 |
+
after_think = re.sub(r'<\|im_end\|>\s*$', '', after_think)
|
| 967 |
+
|
| 968 |
+
return after_think.strip()
|
| 969 |
+
|
| 970 |
def generate_from_formatted_prompt(
|
| 971 |
self,
|
| 972 |
formatted_prompt: str,
|
|
|
|
| 986 |
- negative_prompt (str) used when cfg_scale > 1
|
| 987 |
- top_k (int), top_p (float), repetition_penalty (float)
|
| 988 |
- target_duration (float): Target duration in seconds for codes generation
|
| 989 |
+
- generation_phase (str): "cot" or "codes" for phase-aware CFG
|
| 990 |
use_constrained_decoding: Whether to use FSM-based constrained decoding
|
| 991 |
constrained_decoding_debug: Whether to enable debug logging for constrained decoding
|
| 992 |
stop_at_reasoning: If True, stop generation immediately after </think> tag (no audio codes)
|
|
|
|
| 1014 |
user_metadata = cfg.get("user_metadata") # User-provided metadata fields
|
| 1015 |
skip_caption = cfg.get("skip_caption", False) # Skip caption generation in CoT
|
| 1016 |
skip_language = cfg.get("skip_language", False) # Skip language generation in CoT
|
| 1017 |
+
generation_phase = cfg.get("generation_phase", "cot") # "cot" or "codes"
|
| 1018 |
+
# Additional context for codes phase unconditional prompt building
|
| 1019 |
+
caption = cfg.get("caption", "")
|
| 1020 |
+
lyrics = cfg.get("lyrics", "")
|
| 1021 |
+
cot_text = cfg.get("cot_text", "")
|
| 1022 |
|
| 1023 |
try:
|
| 1024 |
if self.llm_backend == "vllm":
|
|
|
|
| 1037 |
stop_at_reasoning=stop_at_reasoning,
|
| 1038 |
skip_caption=skip_caption,
|
| 1039 |
skip_language=skip_language,
|
| 1040 |
+
generation_phase=generation_phase,
|
| 1041 |
+
caption=caption,
|
| 1042 |
+
lyrics=lyrics,
|
| 1043 |
+
cot_text=cot_text,
|
| 1044 |
)
|
| 1045 |
return output_text, f"✅ Generated successfully (vllm) | length={len(output_text)}"
|
| 1046 |
|
|
|
|
| 1060 |
stop_at_reasoning=stop_at_reasoning,
|
| 1061 |
skip_caption=skip_caption,
|
| 1062 |
skip_language=skip_language,
|
| 1063 |
+
generation_phase=generation_phase,
|
| 1064 |
+
caption=caption,
|
| 1065 |
+
lyrics=lyrics,
|
| 1066 |
+
cot_text=cot_text,
|
| 1067 |
)
|
| 1068 |
return output_text, f"✅ Generated successfully (pt) | length={len(output_text)}"
|
| 1069 |
|
examples/text2music/example_01.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "pop,
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An explosive, high-energy pop-rock track with a strong anime theme song feel. The song kicks off with a catchy, synthesized brass fanfare over a driving rock beat with punchy drums and a solid bassline. A powerful, clear male vocal enters with a theatrical and energetic delivery, soaring through the verses and hitting powerful high notes in the chorus. The arrangement is dense and dynamic, featuring rhythmic electric guitar chords, brief instrumental breaks with synth flourishes, and a consistent, danceable groove throughout. The overall mood is triumphant, adventurous, and exhilarating.",
|
| 4 |
+
"lyrics": "[Intro - Synth Brass Fanfare]\n\n[Verse 1]\n黑夜里的风吹过耳畔\n甜蜜时光转瞬即万\n脚步飘摇在星光上\n心追节奏心跳狂乱\n耳边传来电吉他呼唤\n手指轻触碰点流点燃\n梦在云端任它蔓延\n疯狂跳跃自由无间\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Instrumental Break - Synth Brass Melody]\n\n[Verse 2]\n鼓点撞击黑夜的底端\n跳动节拍连接你我俩\n在这里让灵魂发光\n燃尽所有不留遗憾\n\n[Instrumental Break - Synth Brass Melody]\n\n[Bridge]\n光影交错彼此的视线\n霓虹之下夜空的蔚蓝\n月光洒下温热心田\n追逐梦想它不会遥远\n\n[Chorus]\n心电感应在震动间\n拥抱未来勇敢冒险\n那旋律在心中无限\n世界变得如此耀眼\n\n[Outro - Instrumental with Synth Brass Melody]\n[Song ends abruptly]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 160,
|
| 7 |
+
"keyscale": "B minor",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_02.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A melancholic Latin trap track built on a foundation of deep 808 sub-bass and crisp, rolling hi-hats from a drum machine. A somber synth pad provides an atmospheric backdrop for the emotional male lead vocal, which is treated with noticeable auto-tune and spacious reverb. The chorus introduces layered vocals for added intensity and features prominent echoed ad-libs that drift through the mix. The arrangement includes a brief breakdown where the beat recedes to emphasize the raw vocal delivery before returning to the full instrumental for a final section featuring melodic synth lines over the main groove.",
|
| 4 |
+
"lyrics": "[Intro]\nYeah\nMe dicen ven, pero yo ya no quiero na'\nTe cansaste de estar pa' mí, ya no hay marcha atrás\nTu esencia se fue sin avisar\nPero este mundo no me va a borrar\n\n[Verse 1]\nNo hay más pa' ti ni de mami, ahora ves\nNo vengas con drama, lo sabes\nMe dijeron pa' siempre, pero mentían\nComo si tú no estuvieras\nSi yo fui tu juego y tú fuiste mi cárcel\nAhora no lloro aunque cierre la cana\nMe arde el aire, no hay vuelta, no hay miedo\nHoy dejo atrás todo lo que me da miedo\nTú me botaste, te fuiste del puerto\nYo me quedé solo, fue pura derrota\n\n[Pre-Chorus]\nNo era de por ti, lo entiendo, mi pana\nTe vas con otros aunque duela la mañana\nY si vuelvo, mira donde he llegado\nYa no hay vuelta, no\nNi tu plan en la mano\n\n[Chorus]\nMe dicen ven, pero yo ya no quiero na'\nNo quiero\nTe cansaste de estar pa' mí, ya no hay marcha atrás\nTu esencia se fue sin avisar\nPero este mundo no me va a borrar\nNo me va a borrar\nSigo solo, sigo vacío, sigo abajo de mi tierra\nAhora dejo atrás lo que me diste\nEn un año me despido, te miro a los ojos\nYa no soy tu tonto\nTe lo avisé, pero nunca intentaste conmigo\n\n[Verse 2]\nTe vi delirar, te vi mirarte\nAhora entiendo que no fuiste amante\nPero ya no me duele, ya no es rabia\nYa no soy el tonto que dejaste en tu cama\n\n[Chorus]\nMe dicen ven, pero yo ya no quiero na'\nNo quiero\nTe cansaste de estar pa' mí, ya no hay marcha atrás\nTu esencia se fue sin avisar\nPero este mundo no me va a borrar\n\n[Outro]\n[Instrumental fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 159,
|
| 7 |
+
"keyscale": "G# minor",
|
| 8 |
+
"language": "es",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_03.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A dark, atmospheric trap track driven by a deep 808 sub-bass line and crisp, rolling hi-hats from a drum machine. A male vocalist delivers assertive French rap verses with a confident flow over moody synth pads that create an urban, nocturnal soundscape. The chorus features layered vocals for emphasis, enhancing the hypnotic feel. The arrangement includes sections where vocal chops and ad-libs are used rhythmically alongside subtle sound effects like record scratches, culminating in a filtered outro that fades into silence before reprising the main hook one last time.",
|
| 4 |
+
"lyrics": "[Intro]\nAh\nLa rue est froide\nEh, eh, eh\n\n[Chorus]\nLa nuit tombe et je rentre, je fume sous la lune\nLes ombres m'appellent, je suis pas seul, je continue\nLes rêves sous la pluie, mon cœur sous la béton\nJe trace ma route même si la route est longue\n\n[Verse 1]\nLes murs ont des oreilles, les pavés des secrets\nUn faux sourire et t'es déjà trop proche de ta mère\nJ'ai des chaînes invisibles mais elles claquent plus fort\nChaque épreuve c'est une chance, faut qu'on en profite encore\nJ'entends les sirènes, mais je suis déjà en bas\nLe silence est dur mais ça me tient le bras\nTrop de choses à dire, je te jure c'est une blague\nJe sais que le monde tourne mais j'avance sans drague\nFaut que ça pète, pas de place pour les faux\nLa rue c'est un jeu, faut pas se prendre trop au sérieux\nLes rêves c'est du béton, pas des gratte-ciels\nJe fais mon truc, je suis un soldat, pas un pasteur\nLes vautours tournent, je suis dans mon délire\nChaque erreur me fait rire, chaque choix me fait frémir\nJ'ai le cœur glacé mais le mental est solide\nLa rue m'apprend qu'il faut jamais on la liquide\nJ'entends des voix mais elles ne disent rien\nLe silence est une arme, faut que je prenne un destin\nFaut que ça tourne, laisse-les rêver ou c'est du passé\nC'est la vie, faut jamais lâcher\n\n[Chorus]\nLa nuit tombe et je rentre, je fume sous la lune\nLes ombres m'appellent, je suis pas seul, je continue\nLes rêves sous la pluie, mon cœur sous la béton\nJe trace ma route même si la route est longue\nLa nuit tombe et je rentre, je fume sous la lune\nLes ombres m'appellent, je suis pas seul, je continue\nLes rêves sous la pluie, mon cœur sous la béton\nJe trace ma route même si la route est longue\n\n[Verse 2]\nTrop de gens cherchent une main, je trouve que des pièges\nChaque pas est un combat, faut que je lâche\nLe monde est lent mais les vrais sont lucides\nJ'ai gravé mon nom dans la peur et l'acide\nLa ville dort mais je garde mon miroir\nLes faux veulent ma peau mais le 
regard est noir\nLa rue c'est un livre, j'y crache ma libération\nJe suis là pour tout prendre, pas pour la direction\nLes ombres murmurent mais je suis déjà loin\nLe silence m'éteint, je garde la lumière au poignet\nLes regards sont froids, je vois des âmes en veille\nJe fais mon taf, je vois le ciel qui surveille\n\n[Chorus]\nLa nuit tombe et je rentre, je fume sous la lune\nLes ombres m'appellent, je suis pas seul, je continue\nLes rêves sous la pluie, mon cœur sous la béton\nJe trace ma route même si la route est longue\n\n[Outro]\nLa nuit tombe\nEh, eh, eh\nLa nuit tombe\nEh, eh, eh\nLa nuit tombe et je rentre, je fume sous la lune\nLes ombres m'appellent, je suis pas seul, je continue\nLes rêves sous la pluie, mon cœur sous la béton\nJe trace ma route même si la route est longue\n[Beat fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 142,
|
| 7 |
+
"keyscale": "E minor",
|
| 8 |
+
"language": "fr",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_04.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A mid-tempo Mandopop ballad built on a steady electronic drum machine groove and a clean synth bassline. The arrangement features layered synthesizers providing chordal pads and melodic counterpoints, complemented by a clean electric guitar playing arpeggiated figures. The emotional male lead vocal, sung in Mandarin, soars through the verses and choruses, often reaching into a powerful falsetto. The track includes an expressive, melodic electric guitar solo and a dynamic bridge that builds tension before a final, passionate chorus filled with vocal ad-libs and a climactic guitar flourish.",
|
| 4 |
+
"lyrics": "[Intro]\nYeah\nOne\n\n[Verse 1]\n[zh] ye4 wan3 de5 feng1 qing1 qing1 chui1 guo4\n[zh] ni3 de5 xiao4 rong2 zai4 wo3 xin1 zhong1 shan3 shuo4\n[zh] yi1 ju4 hua4 shuo1 bu4 chu1 kou3 de5 zhi2 zhuo2\n[zh] xiang4 xing1 guang1 sa3 man3 wu2 jin4 de5 ye4 kong1\n\n\n[Chorus]\n[zh] ni3 shi4 wo3 xin1 di3 wu2 fa3 mo3 qu4 de5 meng4\n[zh] zai4 mei3 yi2 ge4 shun4 jian1 dou1 rang4 wo3 chen2 zui4 tong4\n[zh] ai4 zai4 xuan2 lv4 zhong1 wo3 de5 ling2 hun2 xiang1 tong1\n[zh] gen1 sui2 jie2 pai1 yi1 qie4 dou1 shi4 xin1 dong4\n\n\n[Verse 2]\n[zh] jie1 jiao3 de5 deng1 ying3 ying4 chu1 bi3 ci3 yan3 shen2\n[zh] xin1 tiao4 de5 jie2 zou4 bian4 de2 geng4 mi2 ren2\n[zh] yong1 bao4 de5 wen1 du4 rang4 shi4 jie4 bian4 qing1 chen2\n[zh] ai4 xiang4 yi1 shou3 wu2 fa3 ting2 zhi3 de5 qu1 ben3\n\n\n[Pre-Chorus]\n[zh] ni3 de5 sheng1 yin1 zai4 yin1 fu2 li3 piao1 you2\n[zh] mei3 yi2 ge4 shun4 jian1 dou1 rang4 wo3 xin1 gan3 shou4\n[zh] ji2 ta1 de5 sheng1 bo1 hua4 chu1 ni3 de5 xiao4 rong2\n[zh] he2 sheng1 qi3 he2 gu3 dian3 dou1 zai4 zhi4 re4 you2 dong4\n\n\n[Chorus]\n[zh] ni3 shi4 wo3 xin1 di3 wu2 fa3 mo3 qu4 de5 meng4\n[zh] zai4 mei3 yi2 ge4 shun4 jian1 dou1 rang4 wo3 chen2 zui4 tong4\n[zh] ai4 zai4 xuan2 lv4 zhong1 wo3 de5 ling2 hun2 xiang1 tong1\n[zh] gen1 sui2 jie2 pai1 yi1 qie4 dou1 shi4 xin1 dong4\n\n\n[Guitar Solo]\nOh\nYeah\n\n[Bridge]\n[zh] dang1 dian4 liu2 de5 shan3 shuo4 rang4 ye4 wan3 fei4 teng2\n[zh] wo3 men5 de5 xin1 ling2 ru2 gu3 sheng1 ban1 fan1 gun3\n[zh] jiu4 rang4 zhe4 yi1 ke4 sui2 yin1 yue4 geng4 zhen1\n[zh] ni3 shi4 wo3 de5 yin1 yue4 shi4 yong3 heng2 de5 wen1 hen2\n\n\n[Outro]\n[Vocal ad-libs]\nOh\nOh\nOh\nOh\n[Final guitar chord and fade out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 235,
|
| 7 |
+
"keyscale": "G# minor",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_05.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An explosive, high-energy J-rock and pop-rock anthem driven by a powerful, punchy drum beat and a thick wall of distorted, chugging rhythm guitars. A passionate male lead vocal, delivered in a mix of Japanese and English, soars over the top with layered harmonies and echoed ad-libs that amplify the song's anthemic quality. The arrangement is dynamic, featuring a blistering, melodic guitar solo and a brief, atmospheric bridge with synth pads that provides a moment of contrast before slamming back into the high-octane chorus. The track concludes with a final, powerful guitar chord and a lingering sense of energy.",
|
| 4 |
+
"lyrics": "[Intro]\nOne more time, just one more time\nMou modorenai, mou modorenai\nThe GURU, my dream come true, my dream come true\n\n[Guitar Riff]\n\n[Verse 1]\nNani ga seigi ka, nani ga koukai ka\nHakase tsuzukeru, hashirinukeru\nNo life's a game, no rules to follow\nDakedo arata na chikara ga yomigaeru\n\n[Pre-Chorus]\nNo, this is our new life\nAisuru chikara, ima koso\nWe believe in ourselves\nMou atomodori dekinai\n\n[Chorus]\nEikou no hikari\nSono saki ni iru jiyuu ga aru\nOne more time, just one more time\nMou modorenai, mou modorenai\nThe GURU, my dream come true, my dream come true\n\n[Verse 2]\nKizutsuita tsubasa mo aruita michi mo\nOnaji hibi no naka de susumu\nNo stage to the hill, sore demo susumu\nMayowanai de susumu, susumu\n\n[Pre-Chorus]\nAisuru chikara, ima koso\nWe believe in ourselves\nMou atomodori dekinai\n\n[Chorus]\nEikou no hikari\nSono saki ni iru jiyuu ga aru\nOne more time, just one more time\nMou modorenai, mou modorenai\nThe GURU, my dream come true, my dream come true\n\n[Guitar Solo]\n\n[Bridge]\nDonna arashi ni matsu ka? I know\nNando mo yami ni shizumu ka?\nOre wa yuku, ore wa yuku\nChikara tsuyoku, yume wo motte\nChikara tsuyoku, yume wo motte\nTooi hikari, moeagaru\nKono kokoro ni hibiku takami\nWe are in the zone, we are in the zone\n\n[Breakdown]\nSono saki ni iru mirai ga aru\nOne more time, just one more time\nMou modorenai, mou modorenai\nThe GURU, my dream come true, my dream come true\n\n[Outro]\nKizutsuite mo\nMakenai yume wo\nNo stage to the hill, sore demo\nOre wa susumu, ore wa susumu\nMata tachiagaru, mata tachiagaru\n\n[Final Guitar Riff and Fade Out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 200,
|
| 7 |
+
"keyscale": "E♭ minor",
|
| 8 |
+
"language": "ja",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_06.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An energetic and uplifting J-pop track with strong anime theme song characteristics. The song opens with a bright piano melody and shimmering synth arpeggios before launching into a driving pop-rock arrangement. A clear, powerful male vocal leads the track, supported by a tight rhythm section of punchy drums and a melodic bassline. Clean and slightly overdriven electric guitars provide rhythmic chords and soaring lead lines, particularly during the anthemic chorus and a brief, expressive guitar solo. The structure includes a reflective bridge that momentarily softens the dynamics, followed by a powerful final chorus and an outro featuring melodic guitar licks and vocal ad-libs that fade to a clean finish.",
|
| 4 |
+
"lyrics": "[Intro - Synth Arpeggio & Guitar Riff]\n\n[Verse 1]\n空に浮かぶ雲の切れ間に\n願い事をそっと浮かべて\n今日がまた新しい始まり\n今この瞬間を生きていこう\n\n[Pre-Chorus]\nほら 聞こえるよ\n心の奥の声が\n何か始まる予感を抱いて\n今すぐに飛び出そう\n\n[Chorus]\n飛び出そう 明日へと\n風をまとって進むだけさ\nどんな日もが物語るから\n輝く未来を掴もう\n光の中へ踊ろう\n\n[Instrumental Break]\n\n[Verse 2]\n朝日が差し込む君の影\n迷わずに進んでいけばいい\n不安も希望も抱きしめて\n強く踏み出してゆくよ\n\n[Pre-Chorus]\nほら 聞こえるよ\n胸の奥の鼓動が\n夢の扉を開いてゆく\n輝きを捕まえる\n\n[Chorus]\n飛び出そう 明日へと\n風をまとって進むだけさ\nどんな日もが物語るから\n輝く未来を掴もう\n光の中へ踊ろう\n\n[Guitar Solo]\n\n[Bridge]\nもしも道に迷って立ち止まる日が来ても\nこの道の先は笑顔が待ってるから\n信じてみようよ\n未来は輝いてる\n一つの星に生まれ変わるんだ\n\n[Chorus]\n飛び出そう 明日へと\n風をまとって進むだけさ\nどんな日もが物語るから\n輝く未来を掴もう\n光の中へ踊ろう\n\n[Outro]\n音楽と共に飛ぶよ\n[Synth arpeggio fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 205,
|
| 7 |
+
"keyscale": "F# major",
|
| 8 |
+
"language": "ja",
|
| 9 |
+
"timesignature": "2"
|
| 10 |
}
|
examples/text2music/example_07.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An energetic electro-pop track driven by a squelchy, funky synth bassline and a punchy drum machine beat. The song opens with a distinctive, pitch-bending synth lead that serves as a recurring hook. The male vocals are delivered in a confident, rhythmic, almost rap-like cadence during the verses, transitioning to a more melodic and anthemic style in the chorus. The production is clean and modern, with layered synths and dynamic builds. A brief, atmospheric bridge offers a moment of contrast with sustained synth pads before the track slams back into a high-energy instrumental section and a final, powerful chorus.",
|
| 4 |
+
"lyrics": "[Intro - Synth Bass Riff]\n\n[Verse 1]\nChaque jour, c'est la retraite, c'est le plein de repas\nOn est déjà en vacances, on est là pour ça\nLes copains ont tout ce qu'on veut, on est prêts à bosser\nMais on revient à pied, on va tout explodir\nLes clients regardent, faut qu'on les contacte\nC'est pas une course, c'est un moment de spectacle\nOn attend, on sourit, on fait la loi\nMais on préfère l'action, pas l'effort du choix\n\n[Chorus]\nOn est là pour marquer, là pour tout exploser\nLe feu, la patate, faut vite tout chambouler\nMême en retard, faut bouger\nParce qu'on a plus peur d'exister\n\n[Instrumental Break - Synth Bass Riff]\n\n[Verse 2]\nPas besoin d'armure, faut du capital\nOn est les mêmes, on est dans l'agenda digital\nOn se frotte les mains, c'est l'énergie pure\nMême s'il y a bientôt dix jours à défendre sans murmure\nLes affiches tremblent, les commandes explosent\nL'agence, c'est notre trône, pas juste une cause\nOn fait le taf, on fait bouger le pays\nMais ensemble on a tout, c'est pour rassoumer ici\nLes copains, les meufs, c'est la règle de tout\nSi on part en morceaux, c'est qu'on y va jusqu'au bout\nLa patate en flammes, l'action dans la main\nOn est là pour marquer, faut pas lâcher notre refrain\n\n[Chorus]\nOn est là pour marquer, là pour tout exploser\nLe feu, la patate, faut vite tout chambouler\nMême en retard, faut bouger\nParce qu'on a plus peur d'exister\n\n[Bridge]\nLe stress monte, la galère s'installe\nOn est là pour gratter tout, peu importe le débat fatal\nOn rêve grand, on part à l'aventure\nLa patate au niveau haut, c'est notre vraie mur\nAlors levons nos verres, remplissons notre esprit\nLes coups de malaxe, on\n'est pas à se trahir\nC'est l'ambiance qu'on déclare, pas besoin d'armure\nOn va marquer l'équipe, on est là, on assure\n\n[Instrumental Break - Synth Bass Riff]\n\n[Outro]\nLe stress monte, la galère s'installe\nOn est là pour gratter tout, peu importe le débat fatal\nOn rêve grand, on part à l'aventure\nLa 
patate au niveau haut, c'est notre vraie mur\nAlors levons nos verres, remplissons notre esprit\nLes coups de malaxe, on\n'est pas à se trahir\nC'est l'ambiance qu'on déclare, pas besoin d'armure\nOn va marquer l'équipe, on est là, on assure\n[Synth bass riff fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 180,
|
| 7 |
+
"keyscale": "F minor",
|
| 8 |
+
"language": "fr",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_08.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An upbeat and celebratory Brazilian samba-pop track driven by a vibrant rhythm section. A nimble nylon-string acoustic guitar lays down the chordal foundation, while a lively brass section, featuring trumpets and trombones, punctuates the arrangement with festive fanfares and melodic lines. The lead male vocal is clear and joyful, singing over a tight groove provided by a clean electric bass and a full drum kit. The song's structure is built for dancing, with an infectious chorus and an instrumental break showcasing a spirited brass and guitar interplay.",
|
| 4 |
+
"lyrics": "[Intro - Brass Section Melody]\n\n[Verse 1]\nOlha a morena dançando na rua\nVestido vermelho é pura magia\nParece rainha de chapéu de bua\nMeu coração bate na tua melodia\n\n[Chorus]\nParabéns, malagueira morena\nPra todos os cães do nosso Brasil\nCaminho no pôr do sol, pequena\nEntri no nosso show\nFeliz sinal\n\n[Instrumental Break - Brass Section]\n\n[Verse 2]\nMorena gira com sua piquenino\nE os olhos d'água refletindo no céu\nCabelo caído com o rostinho fino\nDesperta no ritmo do carrossel\n\n[Chorus]\nParabéns, malagueira morena\nPra todos os cães do nosso Brasil\nCaminho no pôr do sol, pequena\nEntri no nosso show\nFeliz sinal\n\n[Instrumental Break - Brass Section]\n\n[Bridge]\nLuzes estreladas iluminam a noite\nReflete nos becos, nas ruas, nos bares\nVem brincar no canto, sorrir no derroche\nEssa morena dança onde as estrelas caem\n\n[Chorus]\nParabéns, malagueira morena\nPra todos os cães do nosso Brasil\nCaminho no pôr do sol, pequena\nEntri no nosso show\nFeliz sinal\n\n[Outro - Brass Section Melody]\n[Final chord fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 174,
|
| 7 |
+
"keyscale": "C major",
|
| 8 |
+
"language": "pt",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_09.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "pop,
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An explosive, high-energy J-pop track with strong hyperpop and chiptune influences. The song is driven by a relentless four-on-the-floor electronic drum beat and a punchy synth bassline. A high-pitched, digitally processed female vocal, reminiscent of Vocaloid, delivers rapid-fire melodies over layers of bright, arpeggiated synths and video game-style sound effects. The structure is dynamic, featuring intense verses and anthemic choruses that build into a frenetic instrumental break filled with complex synth runs and glitched-out fills. The track concludes with a dramatic tape-stop effect, deconstructing the beat into a wash of fading synth noise.",
|
| 4 |
+
"lyrics": "[Intro]\nKimi no koe ga kikoeru\nMotto motto tooku tooku\nAnata no ai wa eien ni\nSugiru omoi koto ni kaete\n\n[Instrumental Break]\n\n[Verse 1]\nYume no naka de kawasu kotoba\nTada no kakehiki\nKokoro no naka de narihibiku\nAnata e no ai koi suru\n\n[Chorus]\nAnata no ai\nItsumademo yume mitai\nKienai omoi\nItsumademo tsunagatteru\n\n[Post-Chorus]\nKimi to deatta ano hi\nSubete ga chigatta\nAnata no ai de tsunagatte\nOwaranai you ni kono mama\n\n[Instrumental Break]\n\n[Verse 2]\nKurikaesu hibi no naka de\nAi ga afuredasu\nAnata no nukumori o kanjiru\nYukuate no nai koi\n\n[Chorus]\nAnata no ai\nItsumademo yume mitai\nKienai omoi\nItsumademo tsunagatteru\n\n[Instrumental Solo]\n\n[Bridge]\nKurikaesu hibi no naka de\nAi ga afuredasu\nAnata no nukumori o kanjiru\nYukuate no nai koi\n\n[Chorus]\nAnata no ai\nItsumademo yume mitai\nKienai omoi\nItsumademo tsunagatteru\n\n[Outro]\n[Instrumental Breakdown]\n[Synth arpeggio fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 167,
|
| 7 |
+
"keyscale": "E♭ minor",
|
| 8 |
+
"language": "ja",
|
| 9 |
+
"timesignature": "2"
|
| 10 |
}
|
examples/text2music/example_10.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A smooth, contemporary R&B track built on a foundation of clean, chorused electric guitar chords and a steady, unobtrusive drum machine beat. A round synth bass provides a warm low end. The lead male vocal is clear and emotive, delivered in a smooth tenor with occasional falsetto ad-libs and layered harmonies that swell during the anthemic chorus. The arrangement features a brief, intimate bridge with whispered vocals before transitioning into a melodic guitar-led instrumental break. The track concludes with a layered vocal outro, combining sung melodies with whispered phrases that fade to a clean finish.",
|
| 4 |
+
"lyrics": "[Intro]\nYeah\nOh\nYeah\n\n[Verse 1]\nMidnight whispers in my ear\nStars above, they're shining clear\nCity lights, they start to fade\nUnderneath, our plans we've made\nMoonlit shadows twist and turn\nHearts on fire, passions burn\nLaughter echoes through the night\nHolding hands, we take flight\n\n[Chorus]\nYou're my heartbeat symphony\nEchoes of sweet destiny\nEvery beat a symphony\nYou and me, we're wild and free\n\n[Verse 2]\nNeon dreams and city haze\nLost in love's enchanting maze\nWhisper secrets in the breeze\nTime just stops as we appease\n\n[Chorus]\nYou're my heartbeat symphony\nEchoes of sweet destiny\nEvery beat a symphony\nYou and me, we're wild and free\n\n[Bridge]\nDance through stars and find our way\nIn your arms, I'll always stay\nLove so strong, it never bends\nTogether 'til the journey ends\n\n[Instrumental Break]\n\n[Outro]\nDance through stars and find our way\nIn your arms, I'll always stay\nLove so strong, it never bends\nTogether 'til the journey ends\nYou're my heartbeat symphony\nEchoes of sweet destiny\nEvery beat a symphony\nYou and me, we're wild and free\nDance through stars and find our way\nIn your arms, I'll always stay\nLove so strong, it never bends\nTogether 'til the journey ends",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 228,
|
| 7 |
+
"keyscale": "A major",
|
| 8 |
+
"language": "en",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_11.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An explosive, high-energy K-pop and EDM track driven by a relentless four-on-the-floor beat and a pulsing synth bassline. The arrangement opens with a bright piano melody and shimmering arpeggiated synths before slamming into the main groove. A powerful, clear male lead vocal delivers an anthemic melody, punctuated by energetic ad-libs and hype-man shouts. The production is dense with layered synthesizers, including soaring leads, atmospheric pads, and dynamic FX like risers and sweeps that build tension into the explosive choruses. A mid-song instrumental break features a melodic synth lead and vocal chops, leading into a final, climactic chorus and an abrupt ending with a tape-stop sound effect.",
|
| 4 |
+
"lyrics": "[Intro]\n[Synth arpeggio and pads]\n\n[Verse 1]\n[ko] hwangholhan i jilseo-e neon ppajyeo\n\n[en] (oh yeah)\n\n[ko] lideum wi-e sinhoneul jilleo\n\n[en] (oh, let's go)\n\n[ko] eodum sog-e bich-eul kkeul-eo\n[ko] modeun geos-i bultago\n\n\n[Pre-Chorus]\n[ko] bich-i nal mag-eulyeo\n[ko] moduga geudaelo bichna\n[ko] geo-ul sog-e bichwojwo\n[ko] naneun jasin-ui jibjunghae\n\n\n[Chorus]\n[ko] oelo-un i jilseo gip-eojyeo\n\n[en] (yeah, yeah)\n\n[ko] lideum wi-e him-eul jwi-yeo\n\n[en] (let's go)\n\n[ko] saelo-un sidaeleul yeol-eo\n\n[en] (oh yeah)\n\n[ko] gaseumsog-e bulkkoch-i pi-eo\n\n\n[Verse 2]\n[ko] geodaehan mudae sog-eseo\n[ko] nan tae-eonal geo-ya\n[ko] modeun geos-i choego-ui jilseo\n[ko] uli hamkke chumchwo\n\n\n[Bridge]\n[ko] bichgwa eodum-i salajigo\n[ko] ulin seololeul bichwo\n[ko] eunmilhan i sungan sog-eseo\n[ko] ulin hamkke mandeul-eoga\n\n\n[Chorus]\n[ko] oelo-un i jilseo gip-eojyeo\n\n[en] (yeah, yeah)\n\n[ko] lideum wi-e him-eul jwi-yeo\n\n[en] (let's go)\n\n[ko] saelo-un sidaeleul yeol-eo\n\n[en] (oh yeah)\n\n[ko] gaseumsog-e bulkkoch-i pi-eo\n\n\n[Instrumental Break]\n[Synth lead melody with vocal chops]\n\n[Outro]\n[Synth arpeggio fades out]\n[abrupt silence]",
|
| 5 |
+
"bpm": 40,
|
| 6 |
+
"duration": 210,
|
| 7 |
+
"keyscale": "E♭ minor",
|
| 8 |
+
"language": "ko",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_12.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An energetic and anthemic pop-rock track driven by a clean, funky electric guitar riff and a punchy, tight drum and bass groove. The song opens with a catchy guitar hook before settling into a verse led by a clear, powerful male vocal. The arrangement builds dynamically into a soaring, uplifting chorus with layered vocals and driving instrumentation. A melodic guitar solo section features expressive bends and fills. The track includes a brief, atmospheric bridge with wordless vocalizations before launching into a final, powerful chorus and concluding with the initial guitar riff.",
|
| 4 |
+
"lyrics": "[Intro - Guitar Riff]\n\n[Verse 1]\n翻过山又像迷雾\n心也照亮每一步\n大地在脚下伸出手\n追逐梦想不要停留\n\n[Pre-Chorus]\n星光摇曳在夜幕\n未来就在眼前呼\n跳动脉搏像闪电火\n展开翅膀冲破束缚\n\n[Chorus]\n在梦的边缘跳跃光辉\n无数的光点汇聚成美\n燃烧的希望把世界敲碎\n一切再次新的节奏到位\n\n[Guitar Interlude]\n\n[Verse 2]\n音符交错夜色深处\n心的律动难以抗拒\n脚步合拍如同律步\n舞出一曲全新的篇幅\n\n[Pre-Chorus]\n银河展开狂奔翅膀\n你和我合奏不息光\n跨越万千热浪激荡\n前行不停冲破过往\n\n[Chorus]\n在梦的边缘跳跃光辉\n无数的光点汇聚成美\n燃烧的希望把世界敲碎\n一切再次新的节奏到位\n\n[Guitar Solo]\n\n[Bridge - Vocal Harmonies]\n(Hmm-mm-mm-mm)\n(Hmm-mm-mm-mm)\n(Hmm-mm-mm-mm)\n(Hmm-mm-mm-mm)\n\n[Chorus]\n在梦的边缘跳跃光辉\n无数的光点汇聚成美\n燃烧的希望把世界敲碎\n一切再次新的节奏到位\n\n[Outro - Guitar Riff]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 190,
|
| 7 |
+
"keyscale": "C# minor",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_13.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A classic boom-bap hip-hop track built on a nostalgic foundation. A steady drum machine groove with a crisp snare drives the song forward beneath atmospheric synth pads that create an introspective mood. The male rapper delivers confident verses with a clear flow, while the chorus shifts to melodic, layered vocal harmonies soaked in reverb for an anthemic feel. The arrangement is punctuated by spoken ad-libs treated with echo effects and concludes with an extended outro featuring heavily filtered vocals over a stripped-back beat before fading into ambient textures.",
|
| 4 |
+
"lyrics": "[Intro]\nYo\nIt's Laðum\nYou already know\nMoney talks\nThey don't talk back\n\n[Verse 1]\nI stay with the heat, streets raised me\nNever folded under pressure, now I'm way crazy\nChains on my neck, still my grind daily\nBut my soul stay heavy, yeah the grind ain't lazy\nOld heads stackin', but I flip through the beat\nStackin' chips on chips, now they movin' in the street\nBut the streets ain't ready, now my path got heat\nUsed to dream to win, now I move with the beat\nCopped a cold brick, got heat in my heart\nNow my chain swing tight, I been playin' my part\nMoney ain't a thing, but it takes its part\nStill I carry my soul like it's built from the start\n\n[Chorus]\nSo tell me, why they hate when you live?\nGot the heart of a king, but the soul in the bills\nTell me, who the heck do they feel?\nWhen the world keep spinnin' but you lost it all still\n(Still, still, still)\nWhy they hate when you live?\nWith the heart of a king, but the soul in the bills\nTell me, who the heck do they feel?\nWhen the world keep spinnin' but you lost it all still\n(Still, still, still)\nWhy they hate when you live?\nWith the heart of a king, but the soul in the bills\nTell me, who the heck do they feel?\nWhen the world keep spinnin' but you lost it all still\n(Still, still, still)\n\n[Verse 2]\nThey talk about the grind but I'm broke, I'm grindin'\nEvery dollar spent, now the people still whinin'\nBut I keep my head high, never fold, never lyin'\nAnd my heart's still cold even when the day's lyin'\nOne road left but I can't ride that chain\nI'ma keep it movin', yeah they all feel my pain\nThey say money talks, but I'm answerin' it's still\nSo I made it in silence, now I'm free from the deal\n\n[Chorus]\nTell me, why they hate when you live?\nGot the heart of a king, but the soul in the bills\nTell me, who the heck do they feel?\nWhen the world keep spinnin' but you lost it all still\n(Still, still, still)\nWhy they hate when you live?\nWith the heart of a king, but the 
soul in the bills\nTell me, who the heck do they feel?\nWhen the world keep spinnin' but you lost it all still\n(Still, still, still)\nWhy they hate when you live?\nWith the heart of a king, but the soul in the bills\nTell me, who the heck do they feel?\nWhen the world keep spinnin' but you lost it all still\n(Still, still, still)\n\n[Outro]\nThey can't cage the vibe, yeah\nYou gotta stay in\nIt's money and power, a universe, one rule\nMoney is the money\nAnd I'm not at the bottom\n[Instrumental fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 210,
|
| 7 |
+
"keyscale": "F# major",
|
| 8 |
+
"language": "en",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_14.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An aggressive, high-energy electronic track driven by a relentless four-on-the-floor kick drum and a gritty, distorted synth bassline. The arrangement opens with a chiptune-style arpeggiated synth melody before slamming into the main groove. A forceful male vocal, delivered in a rhythmic, almost-rapped style, cuts through the dense mix of sharp synth stabs and atmospheric pads. The chorus is anthemic and powerful, with layered vocals creating a sense of defiance. The track features dynamic shifts, including a breakdown with filtered vocals and a build-up that reintroduces the intense beat, culminating in a powerful outro with vocal chops and a final, abrupt stop.",
|
| 4 |
+
"lyrics": "[Intro]\n[Synth arpeggio intro]\n[Drum build-up]\n\n[Verse 1]\nEfo, siren klocha' mi kocha\nDu mevi shemol al kol ha'bachar me'ha'ba\nTachat, kai la'pata\nNima bila mefila\n\n[Chorus]\nTishuvot lefater bach'ka\nBatecha nolecha im telev elor lachta\nTirot lanu, venashmeach\nKa'otzot pachta\n\n[Instrumental Drop]\n\n[Verse 2]\nHakavat et hasnan\nHakol bishvili al kol hachol rotza\nHesham tamash, tsurad lo\nHesham rishon, rosh, orad\n\n[Chorus]\nTishuvot lefater bach'ka\nBatecha nolecha im telev elor lachta\nTirot lanu, venashmeach\nKa'otzot pachta\n\n[Instrumental Drop]\n\n[Bridge]\nTishuvot, hetta, bach'ka\nBatecha nolecha im telev elor lachta\nTirot lanu, venashmeach\nKa'otzot pachta\n\n[Breakdown]\n[Filtered synth arpeggio]\n(Tishuvot lefater bach'ka) [pitched down, reverbed]\n\n[Chorus]\nTishuvot lefater bach'ka\nBatecha nolecha im telev elor lachta\nTirot lanu, venashmeach\nKa'otzot pachta\n\n[Outro]\n[Synth arpeggio fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 254,
|
| 7 |
+
"keyscale": "E minor",
|
| 8 |
+
"language": "he",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_15.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An intense, high-energy electronic track driven by a relentless, arpeggiated synth melody and a hard-hitting trap beat with rapid-fire hi-hats and a deep sub-bass. A powerful female vocal, delivered in Mandarin, cuts through the mix with a clear, assertive tone. The arrangement builds through tense verses into an anthemic, soaring chorus where the vocals become more melodic and layered. The track features dynamic drops and builds, including a brief, atmospheric bridge with filtered vocals and pads before slamming back into a final, climactic chorus and an instrumental outro that deconstructs the main synth motif.",
|
| 4 |
+
"lyrics": "[Intro - Arpeggiated Synth Melody]\n\n[Verse 1]\n午夜城市灯光闪\n黑夜中你藏心弦\n一脚踏入深渊洞\n脚下跳动电波涌\n喧嚣夜深云雾涌\n孤单的你在夜空\n光与影交织梦境\n每一拍都在追忆\n\n[Chorus]\nYeah\n黑夜之上我是逃亡\n风暴之中寻找方向\n燃烧灵魂去疯狂\n这世界锁不住渴望\n\n[Instrumental Break - Synth Melody]\n\n[Verse 2]\n时间律动催人转\n繁星作画流星卷\n故事未完灵魂闪\n快将心跳连接遍\n\n[Instrumental Drop]\n\n[Bridge]\n冲破那无边黑暗\n光束驱散悲伤酸\n虚伪遮眼无需问\n黑夜是我唯一的吻\n\n[Chorus]\n黑夜之上我是逃亡\n风暴之中寻找方向\n燃烧灵魂去疯狂\n这世界锁不住渴望\n\n[Instrumental Break]\n\n[Chorus - Layered Vocals]\n冲破那无边黑暗\n光束驱散悲伤酸\n虚伪遮眼无需问\n黑夜是我唯一的吻\n\n[Outro - Synth Melody Fades Out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 194,
|
| 7 |
+
"keyscale": "E minor",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_16.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An upbeat, retro-flavored synth-pop track with a distinct 80s city pop aesthetic. The song is built on a foundation of a punchy, gated-reverb drum machine beat and a funky, melodic synth bassline that drives the groove. Layers of bright, brassy synth stabs and shimmering pads create a vibrant, nostalgic soundscape. A clean, funky electric guitar adds rhythmic texture with muted strums and short licks. The lead vocal is a clear, smooth female voice, delivering a catchy melody with a touch of reverb. The track follows a classic pop structure, with an energetic chorus and a brief, more reflective bridge before a final instrumental fade-out.",
|
| 4 |
+
"lyrics": "[Intro - Synth & Drum Machine]\n\n[Verse 1]\n月亮挂在夜空轻轻摇\n城市霓虹在眼前飘\n你的脚步像慢动作漂\n心跳加速无法逃跑\n\n[Chorus]\n跟着节奏慢慢靠近\n梦里的旋律吸引不停\n你的眼神如星光的印\n我的世界因你而安静\n\n[Instrumental Break]\n\n[Verse 2]\n街头热气弥漫着热浪\n脚步和节拍越来越长\n空气中弥漫淡淡芬芳\n我们的距离瞬间在曝光\n\n[Chorus]\n跟着节奏慢慢靠近\n梦里的旋律吸引不停\n你的眼神如星光的印\n我的世界因你而安静\n\n[Bridge]\n午夜钟声在耳边回荡\n时间却静止无法隐藏\n你的轮廓在脑海点亮\n每一瞬间都让我疯狂\n\n[Instrumental Break]\n\n[Chorus]\n跟着节奏慢慢靠近\n梦里的旋律吸引不停\n你的眼神如星光的印\n我的世界因你而安静\n\n[Outro - Instrumental Fade Out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 187,
|
| 7 |
+
"keyscale": "E minor",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_17.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[Verse]\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A melancholic piano melody opens over atmospheric pads, setting an introspective tone before a crisp lo-fi hip-hop beat enters with a deep sub-bass foundation. The lead male vocal is smooth and emotive, delivered with significant reverb that enhances its spacious quality. The arrangement builds dynamically into a powerful chorus where layered vocals soar over swelling synth strings and intensified drums. A brief bridge offers a moment of reflection with more sparse instrumentation before launching back into a climactic final chorus featuring passionate ad-libs and harmonies. The track concludes with fading ambient textures and lingering piano notes.",
|
| 4 |
+
"lyrics": "[Intro: Piano Arpeggio]\n\n[Verse 1]\nFootsteps echo in the quiet night\nA lamp flicker, casting cold light\nShadows stretch long on the pavement below\nWhat they say, I don't know\n\n[Pre-Chorus]\nWhispers in the misty air\nA promise made, left somewhere\nWhoa\n\n[Chorus]\nI've got dreams that fade, but they stay\nLost in the glow of yesterday\nWhoa\nI'm chasing shadows in the rain\nBut every step feels the same\n\n[Verse 2]\nNeon signs hum their lonesome tune\nThe sky painted dark, a fading moon\nWindows fogged with stories untold\nThis city's heart is icy cold\n(Ooooh)\n\n[Pre-Chorus]\nEvery corner hides a piece of me\nA ghost of who I used to be\nOh\n\n[Chorus]\nI've got dreams that fade, but they stay\nLost in the glow of yesterday\nWhoa\nI'm chasing shadows in the rain\nBut every step feels the same\n\n[Outro]\n[Piano arpeggio fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 185,
|
| 7 |
+
"keyscale": "B♭ minor",
|
| 8 |
+
"language": "en",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_18.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A clean, melodic electric guitar riff with a touch of chorus effect opens the track, establishing a catchy, looping motif. A male vocalist enters with a confident, rhythmic rap flow over a steady, mid-tempo hip-hop beat and a smooth bassline. The chorus elevates the energy with a more powerful, sung vocal delivery, punctuated by echoed, soaring ad-libs. The arrangement is built around the interplay between the crisp guitar and the dynamic vocal performance, which shifts between rapping and melodic singing. The track concludes with the initial guitar riff returning, accompanied by a whispered vocal, before fading to silence.",
|
| 4 |
+
"lyrics": "[Intro: Electric Guitar Melody]\n\n[Verse 1]\nJCC的爱\n是永不凋零的rose\nJCC的爱\n是永不凋零的rose\nJCC的爱\n是永不凋零的rose\nJCC的爱\n是永不凋零的rose\n\n[Pre-Chorus]\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\n\n[Chorus]\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\n\n[Verse 2]\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\n\n[Chorus]\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\nJCC的爱是永不凋零的rose\n\n[Outro: Electric Guitar Melody]\nJCC的rose\nJCC的rose",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 175,
|
| 7 |
+
"keyscale": "E minor",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_19.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A classic boom-bap hip-hop track built on a steady drum machine groove and a prominent, funky bassline. A clean electric guitar plays a recurring melodic riff and chords, giving the song a slightly jazzy, contemplative feel. The lead vocal is a male rapper delivering a clear, narrative-driven story in French. The choruses introduce layered, harmonized male vocals that add a melodic and slightly melancholic dimension. The track concludes with a spoken-word outro over the fading instrumental, reinforcing the song's humorous and relatable theme.",
|
| 4 |
+
"lyrics": "[Intro]\nYo, écoute l'histoire de Cécile 8\nLa femme du crocodile\nElle achète pas du pain mais du vernis sans malice\nQuand elle passe, ça devient trop pire, trop radical\nElle préfère des roupettes, c'est pas méchant, c'est capital\n\n[Verse 1]\nSa femme se fait des coups par trois équipes sévères\nMais quand elle sort, c'est pas des coups mais des bières\nElle a montré un CV, pas un collègue à l'école\nElle a frappé le premier, dit que c'était du mollo sans bol\n\n[Chorus]\nCécile 8 est ma muse, elle a le front censé clair\nMais derrière ses lunettes, y'a que des bonnes affaires\nCécile 8 est ma muse, elle a le front censé clair\nMais derrière ses lunettes, y'a que des bonnes affaires\n\n[Verse 2]\nUn soir elle s'est fait pousser, trois pierres de regards\nElle l'a pris pour une coquée, pas le genre de fille qui s'égare\nMais dans la cour, elle court, se balade sur le trottoir\nElle traque ses fichiers bidons comme un dandy sans espoir\nElle a vu l'argent du renouveau, pas la galère\nMais dans la cour elle plane comme un résidu dans la mer\nCertains s'excusent du bien avec leurs valises\nMais moi ce que je veux, c'est des billets frits en vitrine\n\n[Chorus]\nCécile 8 est ma muse, elle a le front censé clair\nMais derrière ses lunettes, y'a que des bonnes affaires\nCécile 8 est ma muse, elle a le front censé clair\nMais derrière ses lunettes, y'a que des bonnes affaires\n\n[Bridge]\nL'argent monte à l'eau mais c'est du cocaïne\nElle veut des femmes en trop mais c'est pas l'amour qui gagne\nLes romans partent plus tard, elle compte des milliers\nUn service de boulot, c'est pas ce qu'elle veut pour les oubliés\nLe seul travail qui paye, c'est la richesse du décor\nLe seul job qu'elle trouve, c'est lui même en cas de mort\nElle arrive à la porte, amour que reine sa libido\nCe mec, t'as capté ? 
C'est du vin dans ton vino, mon amigo\n\n[Verse 3]\nSon bar tabou, Sécile 8 au premier virage\nElle s'installe dans le théâtre, love dans le carnage\nMais la seule vérité, c'est qu'elle fait pas le bien\nElle veut voler, elle veut ça, sans jamais aller loin\n\n[Chorus]\nCécile 8 est ma muse, elle a le front censé clair\nMais derrière ses lunettes, y'a que des bonnes affaires\nCécile 8 est ma muse, elle a le front censé clair\nMais derrière ses lunettes, y'a que des bonnes affaires\n\n[Outro]\nYo, et dans le chill, y'a des flammes à manger\nMais Cécile 8, elle peut pas les oublier\nElle rêve de biff ou d'un voyage loin d'ici\nUn jour, je dirai ce que j'ai déjà pris\nC'est pas l'amour, c'est un carnaval\nUn jour, elle verra, je suis pas ton carnaval\nCecile 8, la fille du coin dans la vie\nTu dois courir après l'argent ou mourir pour la justice\nCécile 8, la femme du crocodile\nElle aime, elle craint, mais elle veut son style\nCécile 8, la femme du crocodile\nElle aime, elle craint, mais elle veut son style\nCécile 8, la femme du crocodile\nElle aime, elle craint, mais elle veut son style\nCecile 8, la femme du crocodile\nElle aime, elle craint, mais elle veut son style",
|
| 5 |
+
"bpm": 200,
|
| 6 |
+
"duration": 213,
|
| 7 |
+
"keyscale": "D minor",
|
| 8 |
+
"language": "fr",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_20.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[Intro]\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A classic, gritty hip-hop track built on a steady boom-bap drum machine groove and a deep, foundational sub-bass line. A male vocalist delivers a confident, narrative rap in Spanish, detailing the life of a tough-guy truck driver with a clear, rhythmic flow. The production is clean yet raw, with subtle atmospheric pads in the background and standard vocal reverb. The track transitions into a more reflective, spoken-word style bridge before concluding with an instrumental outro featuring a melodic synth line over the persistent beat.",
|
| 4 |
+
"lyrics": "[Intro]\nHey\nEste es pa' la güera\nEn San Antonio\nEl corazón frío\nPa' que quede frío\nAsí\nYeah\n\n[Verse 1]\nPa' la güera en el hospital\nDonde el 11 de 2001 suena criminal\nYa se metió pa' pegarle a la lejanía\nDonde el calor es de verdad, ¿dónde empieza el día?\nYa no es la abuela, la sonrisa en los zapatos\nAquí el café en la mesa ya tiene los cuajazos\nAquí el taller de ley ya viste el tajío\nY el viejo sin lluvia llega, qué desafío\nConecta, te conecta, siente el mensaje\nDale que en la calle no hay sabotaje\nLa calle es caldo, el asfalto pura vida\nAquí se escribe pa' cada chiva la movida\nYa es tradición, la historia bien escrita\nEn cada puerta que toca hay mil citas\nEl tío Javier ya se jodió el lío\nY ahora en el punk ya está su artilgo\nDesde Jalisco hasta la 115 en la piel\nYa está el vecindario y el volante a romper\nYa es borrar pendejos, el arte respeto\nPonte trucha en lo que te pillo, es nuestro decreto\n\n[Verse 2]\nUnos en ayuda, otros en parque\nOtros tienen GPS, otros cruzan barcos\nEn caminos que pasan sin conocer\nPero, ¿quién te pone el estilo, quién?\nEn 4L, el gas bajo, pa' allá nadie\nSomos tronos, tú ya cayó que esto no es botarla\nAsí que chinguen a su madre\nCon buenas frases las cantas, bro\n\n[Outro]\nY te voy pa' allá\nPa' la ruta\nAh\nHey\n[Instrumental fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 124,
|
| 7 |
+
"keyscale": "F# minor",
|
| 8 |
+
"language": "es",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_21.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[Intro
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A smooth, jazzy lo-fi hip-hop track built on a foundation of a gentle piano melody and a relaxed, steady drum machine groove. A warm, round bassline provides a solid harmonic base. The song features a duet between a clear, melodic female vocalist and a smooth, conversational male vocalist who trade verses and harmonize beautifully in the choruses. The arrangement is punctuated by tasteful, melodic saxophone fills that enhance the jazzy, late-night atmosphere. The track concludes with an extended instrumental outro where the saxophone takes center stage with an expressive, improvisational solo over the core piano and rhythm section, before fading out with a final, lingering piano chord and a soft whoosh effect.",
|
| 4 |
+
"lyrics": "[Intro: Piano & Saxophone]\n\n[Verse 1: Female Vocal]\n月光落进窗\n像是起色的那杯\n你还在我的心底旋转\n抛开所有烦忧\n这爱像流星坠落\n在夜空闪耀\n无需解答 心动是唯一的解药\n\n[Verse 2: Male Vocal]\n你的眼神像迷宫\n我徘徊其中\n每个音符都带了心脏的轰鸣\n触碰你的温度就像电流穿心\n无法抗拒你的爱把我拉入梦境\n\n[Chorus: Duet]\n心跳随着你狂奔\n如海潮汹涌\n爱的节奏翻滚着\n像梦境中梦\n你的名字是旋律\n旋律中成风\n每一拍都诉说爱无声的悸动\n\n[Instrumental Break: Saxophone Solo]\n\n[Verse 3: Male Vocal]\n黑夜像个舞台\n我们点了彩灯\n节拍跟随心跳\n默契融成一吻\n别让夜晚停下来\n爱永远在认真\n每一秒都像歌\n开启奇妙人生\n\n[Chorus: Duet]\n心跳随着你狂奔\n如海潮汹涌\n爱的节奏翻滚着\n像梦境中梦\n你的名字是旋律\n旋律中成风\n每一拍都诉说爱无声的悸动\n\n[Bridge: Duet]\n当黎明来临时\n我们点燃光芒\n你的爱是我心中最暖的信仰\n再多风雨兼程\n也不愿停下\n因为你的存在是我唯一牵挂\n\n[Instrumental Break: Saxophone Solo]\n\n[Chorus: Duet]\n心跳随着你狂奔\n如海潮汹涌\n爱的节奏翻滚着\n像梦境中梦\n你的名字是旋律\n旋律中成风\n每一拍都诉说爱无声的悸动\n\n[Outro: Duet & Saxophone]\n当黎明来临时\n我们点燃光芒\n你的爱是我心中最暖的信仰\n再多风雨兼程\n也不愿停下\n因为你的存在是我唯一牵挂\n[Saxophone melody fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 258,
|
| 7 |
+
"keyscale": "B♭ major",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_22.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An upbeat, celebratory synth-pop track driven by a punchy four-on-the-floor drum machine beat and a vibrant synth bassline. The song opens with a catchy, wordless vocal chop melody that serves as a recurring hook. A clear, enthusiastic male lead vocal carries the verses and soars into an anthemic, layered chorus. The production is polished and modern, featuring bright synth pads, melodic synth leads, and subtle atmospheric effects that create a sense of scale and joy. The track builds dynamically into each chorus, followed by an instrumental break that reprises the main vocal chop hook, before culminating in a powerful final chorus and a clean, decisive ending.",
|
| 4 |
+
"lyrics": "[Intro - Vocal Chop Melody]\n\n[Verse 1]\nÉbred a természet, vigan\nA végtelen a tájba\nAhol a nyár áll az iskola\nA szívünkben tombol a zaj\n\n[Pre-Chorus]\nFel is encontrató mámor\nEgy szolidi táncparkett\nEz a miénk tű early\nEnrég a tűz, enyhék a fák közt\n\n[Chorus]\nEz a mi időnk, rád gondolok\nTáncolok a holdvilág\nA szívünk dobban, de mégis újra élek\nMost és újra\nÉbred a természet, mindkötöm nevét most az élet\nSzállj velem, csak értünk lennem\nA város szívében élj egy szebb világban\n\n[Instrumental Break - Vocal Chop Melody]\n\n[Verse 2]\nA színek és áram szövedezik\nA parkett üteme hallom\nNézd, ahogy a fény táncot jár\nA világ velünk egy táncot vonva\n\n[Pre-Chorus]\nCsillagok mutatója\nEgyütt lélegzünk\nA tenger színeivel esküszünk\n\n[Chorus]\nEz a mi időnk, rád gondolok\nTáncolok a holdvilág\nA szívünk dobban, de mégis újra élek\nMost és újra\nÉbred a természet, mindkötöm nevét most az élet\nSzállj velem, csak értünk lennem\nA város szívében élj egy szebb világban\n\n[Bridge]\nA naplemente vezető ma\nCsak miénk az idő és a pillanat\nA természetünk vivan már\nMost és most húz a szívbe, hopp!\n\n[Instrumental Break - Vocal Chop Melody]\n\n[Outro]\nÉbredj már, együtt repülünk\nAhol a nyár a tengernek ad\nÚjra itt leszünk\nA természet vivan most és mindkötöm nevem",
|
| 5 |
+
"bpm": 201,
|
| 6 |
+
"duration": 184,
|
| 7 |
+
"keyscale": "F# minor",
|
| 8 |
+
"language": "hu",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_23.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[Intro]\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A heartfelt Latin pop-rock ballad driven by a clean, melodic electric guitar that weaves intricate fills and arpeggios throughout the arrangement. The track opens with a distinctive guitar hook before settling into a steady groove laid down by an acoustic drum kit and a solid bassline. The lead male vocal is clear and emotional, delivering a nostalgic and melancholic narrative with increasing passion in the soaring choruses. The song's structure includes a dynamic bridge that shifts the mood towards hopefulness, followed by an expressive and melodic guitar solo that carries the emotional weight of the track before a final, powerful chorus and a gentle, reflective outro.",
|
| 4 |
+
"lyrics": "[Intro - Electric Guitar Melody]\n\n[Verse 1]\nTe extraño demasiado\nSin ti ya no es igual\nEstoy viendo tus fotos\nY el mundo se me va\nEspero que un día\nMe hayas vuelto a ver\nQue yo por lo menos\nPueda hacerte saber\n\n[Pre-Chorus]\nQue aunque ya no estés aquí\nMi corazón te extraña, sí\nY eso es todo lo que soy\nSiempre te recordaré\n\n[Chorus]\nY aunque ya no estés aquí\nMi corazón te extraña, sí\nY eso es todo lo que soy\nSiempre te recordaré\n\n[Instrumental Break - Guitar Melody]\n\n[Verse 2]\nSi acaso he pensado\nPero jamás te olvidé\nEres parte de mi vida\nUn regalo que soñé\nMe sobran tus recuerdos\nPero falta tu calor\nMe falta la esperanza\nDe que vuelvas, por favor\n\n[Pre-Chorus]\nY aunque ya no estés aquí\nMi corazón te extraña, sí\nY eso es todo lo que soy\nSiempre te recordaré\n\n[Chorus]\nY aunque ya no estés aquí\nMi corazón te extraña, sí\nY eso es todo lo que soy\nSiempre te recordaré\n\n[Bridge]\nTal vez el tiempo me traiga de vuelta a tu vida\nTal vez en tus días\nYa no quede cicatriz\nPero yo quiero contarte\nQue aún vives en mí\nQue te quiero, lo sabes\nY te extraño, lo sé\n\n[Chorus]\nY aunque ya no estés aquí\nMi corazón te extraña, sí\nY eso es todo lo que soy\nSiempre te recordaré\n\n[Outro]\nCariño mío\nLo sentiste tan real\nTe amo tanto\nQue ni lo puedo olvidar\nEres mi razón\nY mi eterno despertar\n\n[Guitar Solo]\n\n[Final Outro]\nCariño mío\nLo sentiste tan real\nTe amo tanto\nQue ni lo puedo olvidar\nEres mi razón\nY mi eterno despertar\n[Song fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 245,
|
| 7 |
+
"keyscale": "A major",
|
| 8 |
+
"language": "es",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_24.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An energetic J-rock or anime-style rock track driven by powerful, distorted electric guitars and a driving drum beat. The song opens with a clean, arpeggiated guitar figure before exploding into a full-band arrangement with a soaring lead guitar melody. A clear, powerful female vocal delivers an emotional performance in Japanese, building from the verses into an anthemic, belted chorus. The arrangement features a dynamic bridge with atmospheric vocalizations and a melodic, technically proficient guitar solo complete with expressive bends and fast runs. The track concludes with a powerful final chorus and a climactic guitar flourish.",
|
| 4 |
+
"lyrics": "[Intro - Arpeggiated Electric Guitar]\n\n[Verse 1]\n晨光吻醒陌生的街道\n耳边风轻拂昨日的欢笑\n时针转为脚边的影子奔跑\n梦醒之后谁还在拥抱\n\n[Pre-Chorus]\n轻快的节拍 穿越了时间\n青春像火在燃烧着热血\n遗失的时间 留不住一切\n青春像光它忽明又忽灭\n\n[Chorus]\n黄昏的余音在耳边倾诉\n追逐的脚印已模糊模糊\n翻滚的青春是烈焰的舞\n我们在光阴里漫步反复\n\n[Instrumental Break - Guitar Melody]\n\n[Verse 2]\n汗水挥洒出闪亮的汗珠\n双手扣紧那未完的赌注\n岁月灼热月却描摹着温度\n青春绽放刺破天空深处\n\n[Pre-Chorus]\n轻快的节拍 穿越了时间\n青春像火在燃烧着热血\n遗失的时间 留不住一切\n青春像光它忽明又忽灭\n\n[Bridge - Vocalizations]\n(Ooooh)\n(Ooooh)\n(Ooooh)\n(Ooooh)\n\n[Guitar Solo]\n\n[Pre-Chorus]\n轻快的节拍 穿越了时间\n青春像火在燃烧着热血\n遗失的时间 留不住一切\n青春像光它忽明又忽灭\n\n[Chorus]\n黄昏的余音在耳边倾诉\n追逐的脚印已模糊模糊\n翻滚的青春是烈焰的舞\n我们在光阴里漫步反复\n\n[Outro - Guitar Solo over Chorus Chords]\n[Song ends abruptly]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 213,
|
| 7 |
+
"keyscale": "F minor",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "2"
|
| 10 |
}
|
examples/text2music/example_25.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[Intro]\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An explosive, high-energy anime rock anthem driven by a powerful male tenor vocal performance in Mandarin. The track kicks off with an anthemic 'whoa-oh' vocal hook over a driving four-on-the-floor beat, immediately establishing a J-rock or C-pop rock feel. The production is dense and polished, layering crunchy, distorted power-chord guitars with a punchy synth bass and a tight, modern drum kit. The choruses erupt with soaring, emotional vocals and layered harmonies, creating a massive wall of sound. A brief instrumental bridge features a chiptune-style synth arpeggio before a final, powerful chorus and an outro that fades out with the signature vocal hook.",
|
| 4 |
+
"lyrics": "[Intro]\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\n\n[Verse 1]\n满天星辰都沉默\n谁在黑暗中闪烁\n脚步轻踏入枷锁\n世界若你如此执着\n\n[Pre-Chorus]\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\n\n[Verse 2]\n你听风声在飘散\n梦的影子太凌乱\n无边黑夜探不安\n你的光芒怎能熄灭\n\n[Chorus]\n燃烧吧烈火之光\n黑暗中我们敢闯\n即使跌倒心不慌\n听那世界心跳响\n\n[Post-Chorus]\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\n\n[Verse 3]\n冷雨敲窗滴作响\n谁的伤痛扎中央\n双拳紧握去前方\n哪怕破碎也不会慌\n\n[Bridge]\n与爱相逢也能追\n日出时分睁开眼\n点起它如火蔓延\n这一切都有它的未来\n\n[Chorus]\n燃烧吧烈火之光\n黑暗中我们敢闯\n即使跌倒心不慌\n听那世界心跳响\n\n[Post-Chorus]\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\n\n[Instrumental Break]\n\n[Bridge]\n与爱相逢也能追\n日出时分睁开眼\n点起它如火蔓延\n这一切都有它的未来\n\n[Chorus]\n燃烧吧烈火之光\n黑暗中我们敢闯\n即使跌倒心不慌\n听那世界心跳响\n\n[Outro]\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\nWoah-oh-oh-oh-oh-oh\n[abrupt silence]",
|
| 5 |
+
"bpm": 200,
|
| 6 |
+
"duration": 190,
|
| 7 |
+
"keyscale": "E♭ major",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "2"
|
| 10 |
}
|
examples/text2music/example_26.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A modern reggaeton track with a confident, defiant energy, built on a classic dembow drum machine beat and a deep sub-bass line. The song opens with a catchy, pitched-up vocal chop melody that serves as a recurring hook. The lead female vocal is clear and assertive, delivering lyrics with a rhythmic, almost confrontational flow. The chorus expands with layered vocal harmonies, adding power and emphasis. A bridge section introduces a filtered, telephone-like vocal effect for contrast before the track builds back into a final, powerful chorus and fades out with the initial vocal chop melody and atmospheric synth pads.",
|
| 4 |
+
"lyrics": "[Intro - Vocal Chop Melody]\n\n[Verse 1]\nMe decían que no iba a llegar\nPero este paso me hizo dudar\nTodo lo que quise, todo lo que fui\nHoy vuelven y quieren venir hacia mí\nDicen que soy fría, que no valgo na'\nPero lo que ves es lo que gané ya\nNo me digas pobre, yo ya me cansé\nConozco lo duro que vengo y lo sé\n\n[Chorus]\nDicen que no valgo, que yo soy raro\nPero si me ven raro, mejor ni me hablen claro\nLa envidia y la envidia, eso no se nota\nNo me vendo, no me borran, no me joden en la boca\n\n[Verse 2]\nTengo el estilo que quieren copiar\nPero yo prefiero a quien me quiera mirar\nDicen que me odian, que quieren juzgar\nPero no me importa, ya aprendí a luchar\nTengo el estilo que quieren copiar\nPero yo prefiero a quien me quiera mirar\nDicen que me odian, que quieren juzgar\nPero no me importa, ya aprendí a luchar\n\n[Chorus]\nDicen que no valgo, que yo soy raro\nPero si me ven raro, mejor ni me hablen claro\nLa envidia y la envidia, eso no se nota\nNo me vendo, no me borran, no me joden en la boca\n\n[Bridge - Filtered Vocals]\nNo soy de los que suelen juzgar\nNo soy de los que saben tratar\nDicen que he cambiado, que ya no soy igual\nPero no me quejo, no voy a parar\n\n[Chorus - Layered Vocals]\nDicen que no valgo, que yo soy raro\nPero si me ven raro, mejor ni me hablen claro\nLa envidia y la envidia, eso no se nota\nNo me vendo, no me borran, no me joden en la boca\n\n[Outro]\nNo me vendo, no me borran, no me joden en la boca\nNo me vendo, no me borran, no me joden en la boca\n[Vocal chop melody fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 140,
|
| 7 |
+
"keyscale": "E♭ minor",
|
| 8 |
+
"language": "es",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_27.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A moody, atmospheric trap track opens with a clean, arpeggiated synth melody and a deep sub-bass foundation. A crisp trap beat with punchy kicks and rattling hi-hats drives the rhythm. The male lead vocal, delivered in Polish, shifts between melodic rapping and singing, conveying a sense of urban melancholy and introspection. The choruses are marked by a prominent, expressive saxophone melody that weaves through the mix, adding a layer of jazzy, melancholic character. The track progresses through verses and choruses, culminating in an extended instrumental outro where the beat deconstructs, leaving behind atmospheric synth pads and a final, lingering saxophone motif before fading into a music box-like synth melody.",
|
| 4 |
+
"lyrics": "[Intro]\nYeah\nYeah\n\n[Verse 1]\nDrzwi otwarte na zakrętach, suki uderzały w lód\nEj, ej\nW białych obłokach mrok unosi, to sen dla nas jest blues\nBłyskawice na kolanach, a serca na dłoni\nŻycie bez reguł, jedno ja i ona, patrząc w prawdę, ale bez słów\nŻycie bez reguł, jedno ja i ona, patrząc w prawdę, ale bez słów\n\n[Chorus - Instrumental Drop]\n[Synth lead melody with vocal chops]\n\n[Verse 2]\nZnów nie widać dla nich, patrz, szkło zasłania się światłem\nEj, ej\nKażda zła chwila to ciężar, by zgubić ślad choćby jasne\nBłyskawice na kolanach, a serca na dłoni\nŻycie bez reguł, jedno ja i ona, patrząc w prawdę, ale bez słów\nŻycie bez reguł, jedno ja i ona, patrząc w prawdę, ale bez słów\n\n[Chorus - Instrumental Drop]\n[Synth lead melody with vocal chops]\n\n[Bridge]\nZnów nie widać dla nich, patrz, szkło zasłania się światłem\nKażda zła chwila to ciężar, by zgubić ślad choćby jasne\n[Synth arpeggios and beat]\n\n[Chorus - Instrumental Drop]\n[Synth lead melody with vocal chops]\n\n[Outro]\n[Synth lead melody with vocal chops and beat]\n[Beat fades out, synth melody continues and fades]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 177,
|
| 7 |
+
"keyscale": "F minor",
|
| 8 |
+
"language": "pl",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_28.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[Intro
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An energetic, high-tempo J-pop and chiptune fusion track driven by a relentless four-on-the-floor drum machine beat and a pulsing synth bassline. The soundscape is dominated by bright, arpeggiated 8-bit style synthesizers that create a vibrant, video-game-like atmosphere. A clear, high-pitched female vocal delivers the melody with an upbeat, almost frantic energy, soaring over the dense electronic arrangement. The structure includes dynamic builds and drops, with a brief, atmospheric bridge featuring a whispered English vocal line before launching back into the powerful, synth-laden chorus and an instrumental synth-heavy outro.",
|
| 4 |
+
"lyrics": "[Intro - Synth Arpeggio]\n\n[Verse 1]\n森の奥で育ちの音\n素衣ざなたてが木が笑う\n流れてゆく時の中で\nそれたちの声が踊り出す\n\n[Pre-Chorus]\n鳥が歌えば風がふわりと\n母の愛を運んできたよ\n子供たちの背中を追って\n深い森の上で生きる\n\n[Chorus]\n森の鳥語るよ\n好き君を想うその時に\n月夜の影忍び寄る\n静けさの中君を呼ぶ\n\n[Instrumental Break]\n\n[Verse 2]\n木の下で続く坂道\n鳥たちの声は夢を誘う\n絵を描いた木の夢\n木が叫ぶ運命の声\n\n[Pre-Chorus]\n時は流れて僕は行く\n名もない木は今もここに\n君がくれたそのぬくもり\n風がそっと背中を押す\n\n[Chorus]\n森の鳥語るよ\n好き君を想うその時に\n月夜の影忍び寄る\n静けさの中君を呼ぶ\n\n[Bridge]\n月の影に隠れた船\n季節巡り風が歌う\n風が呼ぶ声の先に\n静かにささやく君の名を\n\n[Breakdown]\n[whispered]\n響けその歌で\n静かに響く木の音\n\n[Instrumental Break]\n\n[Outro]\n森の奥の果てまで\n[Synth arpeggio fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 218,
|
| 7 |
+
"keyscale": "F minor",
|
| 8 |
+
"language": "ja",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_29.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An energetic chiptune-pop track driven by a punchy electronic drum machine and a bright, catchy 8-bit synth lead that plays a memorable riff. A clear, powerful male vocal delivers an anthemic melody in Mandarin, soaring over the vibrant digital arrangement. The song follows a classic pop structure with verses building into an explosive, uplifting chorus. A brief, more atmospheric bridge provides a moment of reflection before launching back into the final chorus and an instrumental outro that fades out on the iconic synth melody.",
|
| 4 |
+
"lyrics": "[Intro - Chiptune Synth Melody]\nOh!\n\n[Verse 1]\n翻过山峰\n找寻光\n脚步轻快像飞翔\n天空洒满\n梦的光\n未来在呼唤我方向\n\n[Chorus]\n一起飞越那片天\n勇敢拥抱每个瞬间\n冒险是我们的语言\n心跳同步无边无边\n\n[Instrumental Break - Chiptune Synth Melody]\n\n[Verse 2]\n风吹过\n沙漠青烟\n勇气化作星的线\n脚步带着\n希望远\n未知旅途从不停歇\n\n[Chorus]\n一起飞越那片天\n勇敢拥抱每个瞬间\n冒险是我们的语言\n心跳同步无边无边\n\n[Instrumental Break - Chiptune Synth Melody]\n\n[Bridge]\n脚印落在时间线\n画下每一个明天\n星辰为我们兑现\n爱是唯一的心愿\n\n[Chorus]\n一起飞越那片天\n勇敢拥抱每个瞬间\n冒险是我们的语言\n心跳同步无边无边\n\n[Outro - Chiptune Synth Melody & Beat]\n[Beat fades out, synth melody continues and fades]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 163,
|
| 7 |
+
"keyscale": "B♭ major",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_30.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An upbeat French pop-rap track built on a clean, catchy electric guitar riff that loops throughout. The production is modern and crisp, featuring a punchy drum machine beat with sharp claps and steady hi-hats, anchored by a smooth synth bassline. A confident male lead vocal delivers rhythmic verses and an anthemic chorus filled with layered harmonies for emphasis. The song includes sections where the vocals become more processed and filtered, adding texture before transitioning into a brief instrumental break highlighting the main guitar motif alongside melodic humming ad-libs.",
|
| 4 |
+
"lyrics": "[Intro]\n[Clean electric guitar riff]\nYeah\nYeah\n\n[Verse 1]\nDans le game, on trace, on trace\nDès le matin, on ne se relâche\nRêve en couleur, rêve en or\nTous les autres, quelqu'un ignore\n\n[Chorus]\nMains dans les poches\nLèvres dans les nuits\nLis-moi pas, moi\nC'est quoi? C'est qu'une vraie vie\n\n[Verse 2]\nJe pars en bataille, sourire au coin\nNouvelle vision, non, j'en ai rien\nToujours au top, jamais en panne\nLa révolution vient d'une autre âge, c'est l'heure du barrage\n\n[Pre-Chorus]\nMains dans les poches, lèvres dans les nuits\nLis-moi pas, moi, c'est quoi? C'est qu'une vraie vie\nJe pars en bataille, sourire au coin\nNouvelle vision, non, j'en ai rien\nToujours au top, jamais en panne\nLa révolution vient d'une autre âge, c'est l'heure du barrage\n\n[Chorus]\nMains dans les poches\nLèvres dans les nuits\nLis-moi pas, moi\nC'est quoi? C'est qu'une vraie vie\n\n[Bridge]\nJe pars en bataille, sourire au coin\nNouvelle vision, non, j'en ai rien\nToujours au top, jamais en panne\nLa révolution vient d'une autre âge, c'est l'heure du barrage\n\n[Pre-Chorus]\nMains dans les poches, lèvres dans les nuits\nLis-moi pas, moi, c'est quoi? C'est qu'une vraie vie\n\n[Chorus]\nMains dans les poches, lèvres dans les nuits\nLis-moi pas, moi, c'est quoi? C'est qu'une vraie vie\nJe pars en bataille, sourire au coin\nNouvelle vision, non, j'en ai rien\nToujours au top, jamais en panne\nLa révolution vient d'une autre âge, c'est l'heure du barrage\n\n[Outro]\nMains dans les poches\nLèvres dans les nuits\nLis-moi pas, moi\nC'est quoi? C'est qu'une vraie vie\n[Instrumental break with guitar solo]\nJe pars en bataille, sourire au coin\nNouvelle vision, non, j'en ai rien\nToujours au top, jamais en panne\nLa révolution vient d'une autre âge, c'est l'heure du barrage\nMains dans les poches, lèvres dans les nuits\nLis-moi pas, moi, c'est quoi? 
C'est qu'une vraie vie\nJe pars en bataille, sourire au coin\nNouvelle vision, non, j'en ai rien\nToujours au top, jamais en panne\nLa révolution vient d'une autre âge, c'est l'heure du barrage\n[Music fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 220,
|
| 7 |
+
"keyscale": "A major",
|
| 8 |
+
"language": "fr",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_31.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A classic country-folk ballad driven by a steady acoustic guitar rhythm and a warm, earnest male vocal with a gentle twang. A simple bassline and a straightforward drum beat establish a heartfelt, mid-tempo groove. A plaintive harmonica weaves in and out, providing melodic fills and taking the lead during the instrumental breaks and the extended outro. The production is clean and direct, focusing on the storytelling and the sincere vocal performance, creating a nostalgic and romantic atmosphere.",
|
| 4 |
+
"lyrics": "[Intro - Acoustic Guitar]\n\n[Verse 1]\nFound you standing in the river's roar\nHeartbeats racing like never before\nEyes met mine and time stood still\nDreams were spinning 'round this hill\nWalking barefoot on a dusty road\nFingers tracing where the pelo grow\nHear your laugh, it's a sweet melody\nLove a tune that sets me free\n\n[Chorus]\nI'll ride these heartstrings all night long\nSing our song where we belong\nIn the quiet moments where we're strong\nYour heartbeat's my favorite song\n\n[Instrumental Break - Harmonica Solo]\n\n[Verse 2]\nStars above us start to glow\nEvening whispers, soft and low\nUnderneath the moonlit sky\nWe're just two souls, you and I\nIn the silence, hearts will find their tune\nUnderneath the prairie moon\n\n[Instrumental Break - Acoustic Guitar]\n\n[Bridge]\nI'll ride those heartstrings 'til they stray\nWe'll find our way back someday\n\n[Chorus]\nI'll ride these heartstrings all night long\nSing our song where we belong\nIn the quiet moments where we're strong\nYour heartbeat's my favorite song\n\n[Outro - Harmonica and Guitar]\n[Final guitar strum]",
|
| 5 |
+
"bpm": 200,
|
| 6 |
+
"duration": 202,
|
| 7 |
+
"keyscale": "A major",
|
| 8 |
+
"language": "en",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_32.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "J-
|
| 4 |
-
"lyrics": "[Intro]\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An energetic, bilingual J-pop and French pop vocal performance over a driving electronic beat. The track is built on a foundation of punchy synth bass, crisp drum machine rhythms, and layered synthesizers. A powerful female lead vocal delivers catchy melodies in Japanese, contrasted by a male rapper who performs a verse in French and English. The production is polished and modern, featuring vocal chops, synth arpeggios, and dynamic builds that lead into an anthemic, soaring chorus with layered harmonies and expressive vocal ad-libs. The song concludes with a brief, reflective spoken-word section in Japanese before fading out.",
|
| 4 |
+
"lyrics": "[Intro]\nKaze ni notte\nAtarashii\nDoboji to omoide\n\n[Verse 1]\nJono wa hirogaru 100% yorokobi\nKamishime dori mo imi ni naru shi\nMinna ore o yurugasu\nTaifuu ga shinai no kamo\n\n[Pre-Chorus]\nJinchou onna no you ni ikki\nKoukou de kyou mo ikitsuku\nJiyuu no kaze ni notte yotte\nDoboji to omoide\n\n[Chorus]\nChoushi sawarazu\nHidari ni wo terashite\nKyou mo tokubetsu sugite\nOre no jinsei wo kakeru\n\n[Post-Chorus]\nSubete wo yakareta ore no machi\nIchido ikiru nara imi ga wakaru\nOmae wa sou yatte karada wo katte\nKuzusenai nai\nKoko de doko made mo yukou ka\n\n[Instrumental Break]\n\n[Verse 2 - Male Rap]\nCulte de fiesta\nHousou shite\nKyou mo tokubetsu sugite\nOre no jinsei wo kakeru\n\n[Bridge]\nSubete wo yakareta ore no machi\nIchido ikiru nara imi ga wakaru\nOmae wa sou yatte karada wo katte\nKuzusenai nai\nKoko de doko made mo yukou ka\n\n[Chorus]\nSubete wo yakareta ore no machi\nIchido ikiru nara imi ga wakaru\nOmae wa sou yatte karada wo katte\nKuzusenai nai\nKoko de doko made mo yukou ka\n\n[Outro]\nSubete wo yakareta ore no machi\nIchido ikiru nara imi ga wakaru\nOmae wa sou yatte karada wo katte\nKuzusenai nai\nKoko de doko made mo yukou ka\nSaa saa doko made mo oikakero\nJiyuu no kaze ni notte yotte\nDoboji to omoide\n[Synth arpeggio fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 197,
|
| 7 |
+
"keyscale": "F# major",
|
| 8 |
+
"language": "ja",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_33.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A romantic Latin pop track opens with a clean, arpeggiated piano melody that forms the song's harmonic core. A heartfelt male vocal enters, singing in Spanish with an earnest and emotional delivery. The arrangement builds as a steady, mid-tempo reggaeton-style drum machine beat and a subtle synth bassline kick in, transforming the ballad into a danceable yet tender piece. The chorus is memorable and features layered vocal harmonies for emphasis. The track includes an instrumental break highlighting the piano and synth melody before concluding with a final chorus and a brief, atmospheric piano-led outro.",
|
| 4 |
+
"lyrics": "[Intro: Piano Melody]\n\n[Verse 1]\nMe enamoré de ti\nPorque tú eres la princesa\nY yo un príncipe soñando con tu belleza\nY ya no sé cómo puedo explicarlo\nPero ya la vida me trajo\nA tu lado, me siento orgulloso de ser el caballo más enamorado\n\n[Pre-Chorus]\nPero si yo pudiera retroceder el tiempo\nMe acercaría más a ti\n\n[Chorus]\nPero, oh-oh-oh\nPero, oh-oh-oh\nSi yo pudiera retroceder el tiempo\nMe acercaría más a ti\nPero, oh-oh-oh\nPero, oh-oh-oh\n\n[Verse 2]\nTú te quedaste para siempre en mi mente\nSi te miro a los ojos, siento todo lo diferente\nPorque contigo se detiene el tiempo\nSolo tú y yo sabemos el momento\nMe gustaría detener el tiempo\nAbrazarte, besarte, bailarte lento\nTu mirada me atrapa, es un cuento\nQue aún no termina, no, no\n\n[Bridge]\nEs imposible\nQue yo te quiera más\nY es increíble\nQue yo te quiera más\nY es increíble\nQue yo te quiera más\nMe haces falta\nTe amo y duele tanto, mi vida\n\n[Chorus]\nPero si yo pudiera retroceder el tiempo\nMe acercaría más a ti\nPero, oh-oh-oh\nPero, oh-oh-oh\nPero si yo pudiera retroceder el tiempo\nMe acercaría más a ti\nPero, oh-oh-oh\nPero, oh-oh-oh\n\n[Outro: Instrumental with Piano and Synth Pads]\n[Song fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 150,
|
| 7 |
+
"keyscale": "G major",
|
| 8 |
+
"language": "es",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_34.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An explosive modern metal track driven by high-gain, palm-muted guitar riffs and relentless double-bass drumming. The song kicks off with an aggressive, chugging guitar intro that sets a powerful, energetic tone. A clean, forceful male vocal enters, delivering an anthemic melody over the driving rhythm section. The arrangement features dynamic shifts, including a melodic and technical guitar solo section with harmonized leads. The production is polished and punchy, emphasizing the tight, powerful interplay between the dual guitars and the rhythm section, creating a sound reminiscent of high-energy anime theme music or J-rock fusedic metal.",
|
| 4 |
+
"lyrics": "[Intro - Guitar Riff]\n\n[Verse 1]\nMain selfie, bukan main kejar\nKata-kata manis jadi gelombang besar\nInsta buat teman, semua terkata\nTapi Wi-Fi raya tarik ke mana?\n\n[Chorus]\nMain dengan lowkey, buat hidup terasa\nHarga naik, story mahal punya\nBicara macam maya, suka duka\nTapi cuma idea buat dunia\n\n[Guitar Solo]\n\n[Verse 2]\nScroll TikTok semua berisi hiburan\nCaption kejar hilang, semua tumpang keban\nTapi lapar jalan gelak ketawa\nIngat kenapa setiap malam termenung rasa\n\n[Chorus]\nMain dengan lowkey, buat hidup terasa\nHarga naik, story mahal punya\nBicara macam maya, suka duka\nTapi cuma idea buat dunia\n\n[Bridge]\nPandai dengan semua orang\nTapi tahu di mana damaikan\nRekam-mu belaka, sini hulur tangan\nWalau dunia ini penuh cabaran\n\n[Guitar Solo]\n\n[Chorus]\nMain dengan lowkey, buat hidup terasa\nHarga naik, story mahal punya\nBicara macam maya, suka duka\nTapi cuma idea buat dunia\n\n[Outro - Guitar Solo]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 218,
|
| 7 |
+
"keyscale": "B♭ minor",
|
| 8 |
+
"language": "ms",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_35.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[Intro]\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An energetic, bilingual pop-rap track built on a foundation of punchy trap drums and a deep sub-bass. The song's signature element is a highly catchy, synthesized woodwind melody with a distinct Middle Eastern or Balkan flavor, which serves as the main hook. A male vocalist delivers confident, rhythmic rap verses, seamlessly switching between English and French. The production is clean and modern, with hype-man ad-libs and vocal shouts punctuating the arrangement, creating a vibrant, club-ready atmosphere perfect for a night out.",
|
| 4 |
+
"lyrics": "[Intro]\n[Synth melody intro]\nYo, c'est Big Boss sur le beat\nMaxi sans détour\nOMG, hey baby, viens goûter au détour\nOn lève les bras, tourne le ciel\nLaisse l'énergie nous appeler\nLaisse-moi chauffer ton corps\nDécollage, gars!\n\n[Instrumental Drop]\n\n[Verse 1]\nSway, DJ, pousse ton petto\nLes basses cognent, c'est trop chaud\nLa vibe est dans ma tête, pas de règles ni de stop\nJe suis déjà hors de contrôle, ready to drop\nShort lu, ta playlist, elle déchire le trône\nJ'essaie de conduire mais la passion m'emprisonne\nLa vibe est divine, dansons sans limite\nGarde la vitesse, ce soir c'est la frénésie\n\n[Chorus]\nOn est les boss, on chauffe sans hésitation\nMaxi, Silan, Moustique dans la mission\nJusque la nuit, fais-moi vibrer\nTant que la musique monte, je ne peux plus freiner\n\n[Instrumental Drop]\n\n[Verse 2]\nOn est les boss, on chauffe sans hésitation\nMaxi, Silan, Moustique dans la mission\nJusque la nuit, fais-moi vibrer\nTant que la musique monte, je ne peux plus freiner\nDes ladies all in, te fait sourire en public\nJe suis prêt pour l'action, pas pour te faire diviser\nLa peau douce, ton corps contre le mien\nOn se donne à fond, plus rien ne nous retient\nJe t'invite à la danse, je suis ton vestin\nQuand tu danses, c'est le moment, laisse-toi porter par le refrain\nOn est là pour le show, pas besoin de mots\nJe t'emmène au-delà de ce tempo\nDanse, balance, oublie tout, on brille ensemble\nLa nuit est jeune, l'énergie nous rassemble\nAlors viens, collé-serré, laisse-toi porter\nBig Boss, on est là pour tout casser\n\n[Instrumental Drop]\n\n[Outro]\n[Synth melody fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 135,
|
| 7 |
+
"keyscale": "G minor",
|
| 8 |
+
"language": "fr",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_36.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[Intro
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "An intense, high-energy industrial metal track driven by a powerful, pounding drum machine and chugging, heavily distorted rhythm guitars. The song opens with a tense, cinematic synth arpeggio before erupting into a full-throttle verse. A commanding female lead vocal soars over the dense instrumentation, delivering a dramatic and powerful performance. The arrangement features a dynamic shift into a spoken-word bridge with a menacing, processed male vocal, adding to the dark, narrative atmosphere. The track culminates in a massive, anthemic chorus with layered vocals and a final, screamed vocal flourish before fading out with atmospheric synths and a lingering guitar motif.",
|
| 4 |
+
"lyrics": "[Intro - Synth Arpeggio & Driving Beat]\n\n[Verse 1]\nSous les cieux de la nuit, elle se lève\nLes mains caressant l'épaule en alerte\nSes doigts effleurent les vêtements qui comptent\nSon rire est une promesse qu'il comprendra\n\n[Pre-Chorus]\nElle est parée de rouge et de sang\nForte, sans force, infinie, forte\nSes épaules sont chaotiques, liées par la réalité\nL'écho de son corps gronde, la pression qui monte\n\n[Chorus]\nLa sorcière au caoutchouc se couvre des habits vêtements\nDevaient emporter les fils de son corps\nElle brandit sa lame tendue et ses yeux ne brillent plus\nQuand elle embrasse les ténèbres de son ombre\n\n[Instrumental Break]\n\n[Verse 2]\nLes chasseurs se battent dans l'ombre, mais elle ne chancelle pas\nSa sagesse est une bataille et la menace est son instinct\nElle évoque les légendes des plus terres\nElle aime la nuit, la transpirance infinie\nL'attirance, la puissance qui les hante\n\n[Instrumental Break]\n\n[Bridge - Male Spoken Word, Filtered]\nLes indices se cachent, les espoirs s'effacent\nElle est l'empereur de son château sacré\nSon père est une victime\nElle règne à sa loi\nDe la sorcière au caoutchouc, les cris de son esprit\n\n[Chorus]\nLa sorcière au caoutchouc, la danse du monde se nourrit\nDe ses coups de poing plus fin, de son rire plus fatal\nDans ce chaos de la nuit, elle est le chant de la mort\nLa sorcière au caoutchouc, la mort, la mort, la mort\n\n[Instrumental Outro with Vocal Ad-libs]\n(Ahhh-ah-ah-ahhh)\n(Ahhh-ah-ah-ahhh)\n\n[Final Verse - Layered Vocals]\nElle est parée de rouge et de sang\nForte, sans force, infinie, forte\nElle est parée de rouge et de sang\nForte, sans force, infinie, forte\nElle est parée de rouge et de sang\nForte, sans force, infinie, forte\nElle est parée de rouge et de sang\nForte, sans force, infinie, forte\nForte!\n\n[Outro - Male Spoken Word, Filtered]\nLa sorcière au caoutchouc, la mort, la mort, la mort\n[Song ends abruptly]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 252,
|
| 7 |
+
"keyscale": "E minor",
|
| 8 |
+
"language": "fr",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_37.json
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
-
"caption": "
|
| 4 |
-
"lyrics": "[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"think": true,
|
| 3 |
+
"caption": "A classic blues-rock arrangement with a soulful, nocturnal groove. The track is built on a tight rhythm section of a round-toned bass guitar and a crisp acoustic drum kit. A clean, slightly overdriven electric guitar provides bluesy licks and chordal stabs throughout the verses. The lead male vocal is smooth and soulful with a touch of rasp, soaring into layered harmonies during the expansive chorus. The song features a melodic and expressive guitar solo with classic bends and vibrato, followed by a breakdown and a final, stripped-down vocal section that fades into a concluding guitar flourish.",
|
| 4 |
+
"lyrics": "[Intro - Guitar Riff]\n\n[Verse 1]\nCity lights, they dance for miles\nNeon dreams, they hum in style\nLate night tales in smoky bars\nWishing on these falling stars\n\n[Verse 2]\nFootsteps echo on the street\nMelodies in perfect beat\nLonely hearts with hidden scars\nStill we chase these falling stars\n\n[Chorus]\nOh, let's take this ride tonight\nThrough the shadows in the night\nFeel the rhythm of guitars\nLost in wishes, flying far\n\n[Verse 3]\nIn a world of glass and steel\nBroken dreams feel so unreal\nYet we sing beneath the stars\nWishing on these falling stars\n\n[Guitar Solo]\n\n[Bridge]\nBaselines thumping through the dark\nPianos playing in the park\nEvery note a shot of heart\nEvery whisper, falling stars\n\n[Chorus]\nOh, let's take this ride tonight\nThrough the shadows in the night\nFeel the rhythm of guitars\nLost in wishes, flying far\n\n[Outro]\nIn a world of glass and steel\nBroken dreams feel so unreal\nYet we sing beneath the stars\nWishing on these falling stars\n\n[Extended Guitar Solo and Outro]\n[Song fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 235,
|
| 7 |
+
"keyscale": "G minor",
|
| 8 |
+
"language": "en",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
}
|
examples/text2music/example_38.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"think": true,
|
| 3 |
+
"caption": "A driving post-punk arrangement kicks off with layered electric guitars—one clean and arpeggiated, the other providing distorted chordal texture—over a solid bassline and powerful live drums. The male lead vocal is delivered with an angsty, strained quality that builds into an anthemic, shouted chorus where his voice cracks with emotion. Following a melodic yet noisy guitar solo filled with feedback and expressive bends, the track breaks down to its core rhythmic elements before fading out on lingering guitar noise.",
|
| 4 |
+
"lyrics": "[Intro - Arpeggiated Electric Guitar]\n\n[Verse 1]\nUnder neon lights, they flicker and fade\nLost in the hum of this restless parade\nA city's heartbeat pulses in my veins\nSearching for answers in the acid rain\nShadows stretch long, they're clawing my way\nThe night is alive but it's mine to obey\n\n[Chorus]\nOh, shadows of neon\nWhere do you run?\nBleeding out the night just to make us one\nOh, shadows of neon\nTell me the truth\nAre you the ghost or the one I chase with you?\n\n[Guitar Solo]\n\n[Verse 2]\nThe walls are whispering secrets they can't keep\nEchoes of footsteps buried six feet deep\nEvery corner's a story you'll never forget\nI'm tangled in chaos, dressed up as regret\n\n[Chorus]\nOh, shadows of neon\nWhere do you run?\nBleeding out the night just to make us one\nOh, shadows of neon\nTell me the truth\nAre you the ghost or the one I chase with you?\n\n[Outro - Extended Guitar Solo]\n[Music fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 178,
|
| 7 |
+
"keyscale": "A minor",
|
| 8 |
+
"language": "en",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
+
}
|
examples/text2music/example_39.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"think": true,
|
| 3 |
+
"caption": "An explosive fusion of chiptune and punk rock, this track erupts with a high-energy, 8-bit synth melody playing over a driving, distorted power-chord guitar riff and a punchy, tight drum beat. The lead male vocal is delivered with a clear, energetic, and slightly nasal punk-rock inflection. The song maintains a relentless pace, structured around catchy, anthemic choruses and verses that playfully list various types of coffee with satiric words. The chiptune synth hook serves as a recurring motif, reinforcing the track's video game-inspired sonic identity before ending abruptly after a final guitar flourish.",
|
| 4 |
+
"lyrics": "[Intro - Chiptune Synth Melody]\n\n[Verse 1]\nVos p'tits mots le pes, c'est pour me réconforter\nJe rentre chez moi avec rien à boire\nOn fait caca du pain chaud sur le lait qui grêle\nOn fume du shit sur le clocher des pommes\nOn oublie de nettoyer le four\n\n[Pre-Chorus]\nVos p'tits mots le pes, c'est le vin qui me délecte\nL'assiette\n'est plus un médicament\nOn danse au musée avec des pieds en terrasse\nUn Ricard qui tourne à plein poumons\nOn boit un verre pour oublier\n\n[Chorus]\nJ'aimerais mieux, j'ai pas la cocaïne\nQu'on ferme les yeux pour être tous ensemble\nLes wokos sont tristes, la bière est toxique\nLes poches sont vides mais vous y pensez\nJ'aimerais mieux, j'ai pas la cocaïne\nQu'on ferme les yeux pour être tous ensemble\nLes wokos sont tristes, la bière est toxique\nLes poches sont vides mais vous y pensez\n\n[Instrumental Break - Chiptune Synth Melody]\n\n[Verse 2]\nVos p'tits mots le pes, c'est pour me rassasier\nOn sort en berne avec des céréales\nOn boit pour oublier le cocaïne qu'on boit\nOn oublie de saluer le pot de bon matin\nLa bise aux potes, le rhum, le rhum\n\n[Pre-Chorus]\nVos p'tits mots le pes, c'est pour m'amender le cocaïne\nLa table en grand avec du café\nOn rit aux larmes quand on a rien à perdre\nOn s'amuse dans le coin de la cuisine\nÀ faire de la merde avec des souris\n\n[Chorus]\nJ'aimerais mieux, j'ai pas la cocaïne\nQu'on ferme les yeux pour être tous ensemble\nLes wokos sont tristes, la bière est toxique\nLes poches sont vides mais vous y pensez\nJ'aimerais mieux, j'ai pas la cocaïne\nQu'on ferme les yeux pour être tous ensemble\nLes wokos sont tristes, la bière est toxique\nLes poches sont vides mais vous y pensez\n\n[Outro - Chiptune Synth Melody]\n[abrupt silence]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 190,
|
| 7 |
+
"keyscale": "E major",
|
| 8 |
+
"language": "fr",
|
| 9 |
+
"timesignature": "2"
|
| 10 |
+
}
|
examples/text2music/example_40.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"think": true,
|
| 3 |
+
"caption": "A driving indie rock track with dream pop undertones, built on a steady drum machine beat and an active bassline that anchors the song's momentum. Clean electric guitars weave intricate arpeggios throughout the verses; one plays rhythmic chords while another adds atmospheric lead lines soaked in delay during instrumental breaks. A clear female vocal delivers a melancholic melody in Polish, soaring into layered harmonies for the expansive chorus sections. The arrangement features dynamic shifts between sparser verses and fuller choruses, culminating in an extended outro where melodic guitar leads soar over wordless 'ah' vocals before fading out to a final sustained chord.",
|
| 4 |
+
"lyrics": "[Intro - Instrumental]\n\n[Verse 1]\nChciałbym zatrzymać czas\nGdy zatapiam się w tobie\nByleby nie palić się\nA byłem w niebie\n\n[Chorus]\nBo twoje oczy\nA ja tworzę świat\nPrzez ciebie\nNiebo niebo\nI światło jest we mnie\nBo jesteś tu\n\n[Guitar Solo]\n\n[Verse 2]\nKiedy widzę z daleka twoją twarz\nNic już nie muszę cieszyć się\nRozczaruję twoją ciszę\nI pragnę tej chwili\n\n[Chorus]\nBo twoje oczy\nA ja tworzę świat\nPrzez ciebie\nNiebo niebo\nI światło jest we mnie\nBo jesteś tu\n\n[Instrumental Bridge with Guitar Solo]\n\n[Outro]\n[Scat singing]\nAhhh...\nAhhh...\nAhhh...\nAhhh...\n[Song fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 194,
|
| 7 |
+
"keyscale": "D major",
|
| 8 |
+
"language": "pl",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
+
}
|
examples/text2music/example_41.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"think": true,
|
| 3 |
+
"caption": "An energetic and quirky funk-rock track driven by a tight, groovy bassline and a punchy drum machine beat. The song opens with a catchy, scatted vocal hook that sets a playful tone. The lead male vocal is delivered in a rapid-fire, almost rap-like cadence during the verses, shifting to a more melodic and anthemic style for the chorus. The arrangement is punctuated by bright, synthesized brass stabs and a lively saxophone and trumpet-led instrumental break. The track breaks down into a more contemplative section with clean electric guitar before rebuilding to a high-energy finale that fades out with the initial scat motif.",
|
| 4 |
+
"lyrics": "[Intro]\n[Scatting vocal sample]\nLig na...\nLig na...\nLig na...\nLig na...\n\n[Verse 1]\nCzasem chciałbym woda na prysznic\nNa lajczak zawsze zemykać\nMyśli, że mam złe wieści\nŻe to za mała kawiatura\nA może niedziela, mam chwilę później\nWtedy to poszło nie tak\nMój ziomal umyślny stek w tym cyfrowym mieście\nZa oknem jeszcze\n\n[Pre-Chorus]\nDzień jak co dzień w kaploferze\nPod goliszkiem jak na różowo\nJak już obchodzi, co jutro się dzieje\nI co nam znowu przyszło\n\n[Chorus]\nŻycie to nie te dni, ani te dni\nCzyli zatem trzy, do tego trochę chwili\nLecz jestem tym zimnym i zimnym\nZimnym i zimnym\nBo w zimie jest coś, co ma zapalić\nCztery, podpisane pod jajami\nZimy, to chyba nie obchodzi nas\nI tu każda parada jest zimna jak twoja krew\n\n[Instrumental Break with Synth Brass]\n\n[Verse 2]\nLecimy dalej\nNa celownik wziął\nA więc rano\nPod szkołą pod osłoną nocy\nPłacąc za stare nieco\nWszystko się kończy, a my możemy się przejąć\nZegarek kolejny leci\nMiasto żyje naprawdę\nTo jeszcze nie koniec\nTo jeszcze nie koniec\nNie\n\n[Bridge]\nA co to za pech, że wiecznie upiętne dni mijają\nI mimo, że już za późno\nDziś, jutro, niezniszczalny\nJuż nie zniknął\n\n[Instrumental Break with Synth Brass]\n\n[Outro]\n[Scatting vocal sample]\n[Synth brass melody]\n[Final chord]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 208,
|
| 7 |
+
"keyscale": "D minor",
|
| 8 |
+
"language": "pl",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
+
}
|
examples/text2music/example_42.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"think": true,
|
| 3 |
+
"caption": "An explosive big band jazz arrangement kicks off with a powerful brass fanfare and a driving swing rhythm section featuring a walking upright bass and crisp ride cymbals. A commanding female vocalist enters with a theatrical, powerful delivery, her voice soaring over the dynamic horn stabs and saxophone lines. The track includes an energetic, improvisational saxophone solo over the tight rhythm section. The arrangement is dynamic, shifting between full-band shouts and more subdued passages, culminating in a dramatic final vocal statement and a classic big band flourish to finish.",
|
| 4 |
+
"lyrics": "[Intro - Full Band Fanfare]\n\n[Verse 1]\nKani\nKuni no kodoku ni omoi dashi\nKuni no yasuraki mo yoyotsu mo nashi\nTsuranuku utagoe yo\nIshisakiyo moeyo\nKanawanu negai o\n\n[Pre-Chorus]\nKujike no sakeba\nKono yo no owari\nKodoku no naka de\nKasumi ni ikiru\n\n[Chorus]\nTsuranuku utagoe yo\nKanawanu negai o\nKujike no sakeba\nKono yo no owari\nKodoku no naka de\nKasumi ni ikiru\n\n[Instrumental Break - Saxophone Solo]\n\n[Bridge]\nKujike no sakeba\nKono yo no owari\nKodoku no naka de\nKasumi ni ikiru\n\n[Chorus]\nTsuranuku utagoe yo\nKanawanu negai o\nKujike no sakeba\nKono yo no owari\nKodoku no naka de\nKasumi ni ikiru\n\n[Outro - Full Band Crescendo and Final Chord]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 175,
|
| 7 |
+
"keyscale": "D minor",
|
| 8 |
+
"language": "ja",
|
| 9 |
+
"timesignature": "2"
|
| 10 |
+
}
|
examples/text2music/example_43.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"think": true,
|
| 3 |
+
"caption": "An energetic and playful children's pop-rock song with a driving, upbeat tempo. The track is built on a foundation of a punchy drum machine beat and a clean electric bassline. Distorted electric guitars play catchy power-chord riffs, giving the chorus a rock edge. A clear, enthusiastic male lead vocal delivers the lyrics with a sing-along quality, supported by layered backing vocals in the chorus. The arrangement includes an instrumental break featuring a synth lead melody and a brief, stripped-down bridge before launching back into the high-energy chorus and a final guitar-led outro.",
|
| 4 |
+
"lyrics": "[Intro]\nHé, csávó!\nMicsoda csávó, a főnök királyfi, figyelj jól!\n[Verse 1]\nPörög a főnix, pörög a táj\nMindenki nevet, ez nem vitás\nCsini, Miki, vidám sztorikat\nA kocsma így rogy, a tűz benned ragad\n[Chorus]\nHé, hé, kicsi Miki, gyere vissza hajnalig\nTámadjon hát, és szórja a vizet\nHé, hé, Miki Miki, hajnalig jön, de\nNem áll meg senki, jöhet a tánc, ne késlekedj el!\n[Verse 2]\nA kocsma illatát mesteri a nevetés\nMiki meg a csini is énekel, ez az egész éves élmény\nPoci, csiki, pörög a rúzs\nMindenki vele, ezt sose fogja\n[Chorus]\nHé, hé, kicsi Miki, gyere vissza hajnalig\nTámadjon hát, és szórja a vizet\nHé, hé, Miki Miki, hajnalig jön, de\nNem áll meg senki, jöhet a tánc, ne késlekedj el!\n[Bridge]\nHé, hé, há, te Miki Miki, mesteri\nEzt mi sose hagynád\nÓ, micsoda, micsoda, Miki Miki, gyere vissza, gyere vissza!\n[Instrumental Break - Synth Solo]\n[Verse 3]\nA kocsma illatát mesteri a nevetés\nMiki meg a csini is énekel, ez az egész éves élmény\nPoci, csiki, pörög a rúzs\nMindenki vele, ezt sose fogja\n[Chorus]\nHé, hé, kicsi Miki, gyere vissza hajnalig\nTámadjon hát, és szórja a vizet\nHé, hé, Miki Miki, hajnalig jön, de\nNem áll meg senki, jöhet a tánc, ne késlekedj el!\n[Outro]\nHé, hih, teli a kocsma\nMiki Miki, Miki Miki, az öröm\nÓ, micsoda, micsoda, Miki Miki\nGyere vissza, gyere vissza!\n[Instrumental Outro with Synth Solo]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 181,
|
| 7 |
+
"keyscale": "C major",
|
| 8 |
+
"language": "hu",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
+
}
|
examples/text2music/example_44.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"think": true,
|
| 3 |
+
"caption": "A smooth, contemporary R&B track built on a foundation of clean, melodic electric guitar and a steady, unobtrusive drum machine beat. The song opens with a catchy, wordless vocal hook that sets a romantic, nighttime city mood. A clear male tenor vocal, delivered in a mix of Mandarin and English, glides over the arrangement, supported by lush, layered harmonies and echoed ad-libs that add depth and space to the mix. The production is polished and modern, with a focus on creating an immersive, heartfelt atmosphere through tasteful reverb and a well-balanced low end. The track concludes with a brief, melodic guitar solo that fades out, reinforcing its soulful character.",
|
| 4 |
+
"lyrics": "[Intro]\n(Ooh-ooh-ooh-ooh)\n(Ooh-ooh-ooh-ooh)\n(Ooh-ooh-ooh-ooh)\n(Ooh-ooh-ooh-ooh)\n\n[Verse 1]\n[zh] ni2 hong2 deng1 xia4 wo3 men5 man4 bu4 zai4 ye4 wan3\n[en] (ooh-ooh-ooh-ooh)\n\n[zh] ni3 de5 yan3 shen2 xiang4 xing1 chen2 zai4 wo3 xin1 shang4 shan3\n[en] (ooh-ooh-ooh-ooh)\n\n[zh] ye4 feng1 chui1 dong4 zhe5 jie1 tou2 de5 lang4 man4\n[en] (ooh-ooh-ooh-ooh)\n\n[zh] mei3 ge4 jiao3 luo4 dou1 you3 wei4 zhi1 de5 da2 an4\n[en] (ooh-ooh-ooh-ooh)\n\n[en] [chorus]\n\n[zh] ai4 zai4 yong1 ji3 de5 di4 fang1 jing4 jing4 su4 shuo1\n[zh] ni3 de5 wen1 rou2 rang4 wo3 mei2 ban4 fa3 duo3\n[zh] shi4 jie4 tai4 da4 que4 yin1 ni3 yi3 jing1 wo3\n[zh] wu2 lun4 duo1 yuan3 ye3 bu2 hui4 mi2 shi1 zi4 wo3\n\n\n[Post-Chorus]\n[zh] ni3 shi4 wo3 de5 deng1 ta3 zhi3 yin3 wo3 qian2 xing2\n[zh] sheng1 ming4 de5 lv3 tu2 yin1 ni3 bian4 de2 an1 ding4\n[zh] tian1 bian1 de5 yun2 duo3 jiu4 zai4 tou1 tou1 kan4 ni3\n[zh] zhe4 yi1 miao3 zhong1 zai4 wo3 xin1 li3 hua4 xuan2 lv4\n[en] (ooh-ooh-ooh-ooh)\n\n[en] [verse two]\n\n[zh] ni3 de5 shou3 zhi3 hua2 guo4 wo3 xin1 tiao4 de5 mei3\n[zh] mei3 ge4 zi4 jie2 xiang4 shi4 liu2 xing1 hua2 guo4 de5 hui1\n[zh] gu3 dian3 chen2 zhong4 dan4 shi4 yin1 ni3 zhong4 yue4\n[zh] xin1 tiao4 li3 shi4 ni3 ai4 de5 jie2 zou4 mei2 yan2\n[en] (ooh-ooh-ooh-ooh)\n\n[zh] ni3 de5 shou3 zhi3 hua2 guo4 wo3 xin1 tiao4 de5 mei3\n[zh] mei3 ge4 zi4 jie2 xiang4 shi4 liu2 xing1 hua2 guo4 de5 hui1\n[zh] gu3 dian3 chen2 zhong4 dan4 shi4 yin1 ni3 zhong4 yue4\n[zh] xin1 tiao4 li3 shi4 ni3 ai4 de5 jie2 zou4 mei2 yan2\n[en] (ooh-ooh-ooh-ooh)\n\n[en] [bridge]\n\n[zh] ai4 zai4 yong1 ji3 de5 di4 fang1 jing4 jing4 su4 shuo1\n[zh] ni3 de5 wen1 rou2 rang4 wo3 mei2 ban4 fa3 duo3\n[zh] shi4 jie4 tai4 da4 que4 yin1 ni3 yi3 jing1 wo3\n[zh] wu2 lun4 duo1 yuan3 ye3 bu2 hui4 mi2 shi1 zi4 wo3\n[zh] yin1 fu2 zai4 wo3 men5 zhi1 jian1 qing1 qing1 xuan2 zhuan3\n[zh] ling2 hun2 de5 xuan2 lv4 tiao4 dong4 liu2 xia4 shi1 pian1\n[zh] zhe4 yi1 shou3 ge1 de5 ge1 ci2 ni3 lai2 ting1 jian4\n[zh] ai4 zai4 yong1 ji3 
zhong1 rong2 hua4 zai4 shun4 jian1\n[en] (ooh-ooh-ooh-ooh)\n\n[en] [chorus]\n\n[zh] ai4 zai4 yong1 ji3 de5 di4 fang1 jing4 jing4 su4 shuo1\n[zh] ni3 de5 wen1 rou2 rang4 wo3 mei2 ban4 fa3 duo3\n[zh] shi4 jie4 tai4 da4 que4 yin1 ni3 yi3 jing1 wo3\n[zh] wu2 lun4 duo1 yuan3 ye3 bu2 hui4 mi2 shi1 zi4 wo3\n\n\n[Outro]\n[Instrumental with vocal ad-libs]\n(Ooh-ooh-ooh-ooh)\n(Ooh-ooh-ooh-ooh)\n(Ooh-ooh-ooh-ooh)\n(Ooh-ooh-ooh-ooh)\n[Music fades out]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 202,
|
| 7 |
+
"keyscale": "C# major",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
+
}
|
examples/text2music/example_45.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"think": true,
|
| 3 |
+
"caption": "An explosive burst of high-energy J-rock and chiptune, the track kicks off with a blistering, harmonized guitar and synth arpeggio intro reminiscent of a video game boss battle. A powerful, clear female vocal drives the verses over a tight, funky slap bassline and a driving drum beat. The arrangement builds into an anthemic, soaring chorus with layered vocals and powerful guitar chords. The track is punctuated by two virtuosic, shred-style guitar solos filled with rapid-fire licks and technical flourishes. The song concludes with a final, triumphant chorus and a flurry of guitar pyrotechnics before an abrupt end.",
|
| 4 |
+
"lyrics": "[Intro - Guitar & Synth Lead]\n\n[Verse 1]\n[zh] ye4 feng1 fu2 guo4 wo3 de5 lian3\n[zh] ni2 hong2 xia4 de5 meng4 jing4 wu2 bian1\n[zh] cheng2 shi4 jie2 zou4 xiang4 xin1 tiao4 zai4 bian4\n[zh] mei3 ge4 jiao3 bu4 dai4 zhe5 xin1 de5 xuan2 nian4\n[zh] ni2 hong2 jian4 yue4 que4 kan4 bu4 qing1 lian3\n[zh] cang2 zhe5 huan1 xiao4 he2 wu2 sheng1 de5 zhai4\n[zh] wo3 gen1 sui2 zhe5 xin1 zhong1 de5 bo1 lan2\n[zh] yue4 guo4 fan2 hua2 zhui1 sui2 zhe5 guang1 huan2\n\n\n[Chorus]\n[zh] tiao4 jin4 ye4 se4 de5 yin2 he2 xuan2 lv4\n[zh] fan1 yue4 gu1 dan1 yu3 wei4 zhi1 de5 mi2\n[zh] xin1 zai4 fei1 xiang2 zhui1 zhu2 de5 ji4 yi4\n[zh] rang4 meng4 ru2 xing1 chen2 zai4 ye4 kong1 yan2 xu4\n\n\n[Guitar & Synth Solo]\n\n[Verse 2]\n[zh] mei3 shan4 chuang1 wai4 dou1 cang2 zhe5 hua4 yu3\n[zh] guo4 qu4 wei4 lai2 dou1 qiao3 ran2 kao4 jin4\n[zh] jiao3 bu4 wu2 sheng1 dan4 ji4 yi4 tou4 ming2\n[zh] zai4 chen2 mo4 zhong1 zhao3 dao4 xin1 de5 su1 xing3\n[zh] shui2 shuo1 meng4 jing4 zong3 shi4 bu4 ming2 bai2\n[zh] zhui1 zhu2 xing1 chen2 yue4 guo4 zhe4 shan1 mai4\n[zh] yi1 pian4 guang1 ying3 jiang1 ling2 hun2 jie3 kai1\n[zh] ye4 wan3 dai4 zhe5 yi1 chang2 wu2 jin4 de5 ai4\n\n\n[Chorus]\n[zh] tiao4 jin4 ye4 se4 de5 yin2 he2 xuan2 lv4\n[zh] fan1 yue4 gu1 dan1 yu3 wei4 zhi1 de5 mi2\n[zh] xin1 zai4 fei1 xiang2 zhui1 zhu2 de5 ji4 yi4\n[zh] rang4 meng4 ru2 xing1 chen2 zai4 ye4 kong1 yan2 xu4\n\n\n[Guitar & Synth Solo]\n\n[Bridge]\n[zh] shui2 shuo1 meng4 jing4 zong3 shi4 bu4 ming2 bai2\n[zh] zhui1 zhu2 xing1 chen2 yue4 guo4 zhe4 shan1 mai4\n[zh] yi1 pian4 guang1 ying3 jiang1 ling2 hun2 jie3 kai1\n[zh] ye4 wan3 dai4 zhe5 yi1 chang2 wu2 jin4 de5 ai4\n\n\n[Chorus]\n[zh] tiao4 jin4 ye4 se4 de5 yin2 he2 xuan2 lv4\n[zh] fan1 yue4 gu1 dan1 yu3 wei4 zhi1 de5 mi2\n[zh] xin1 zai4 fei1 xiang2 zhui1 zhu2 de5 ji4 yi4\n[zh] rang4 meng4 ru2 xing1 chen2 zai4 ye4 kong1 yan2 xu4\n\n\n[Outro - Guitar & Synth Solo]\n[Song ends abruptly]",
|
| 5 |
+
"bpm": 40,
|
| 6 |
+
"duration": 224,
|
| 7 |
+
"keyscale": "E minor",
|
| 8 |
+
"language": "zh",
|
| 9 |
+
"timesignature": "2"
|
| 10 |
+
}
|
examples/text2music/example_46.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"think": true,
|
| 3 |
+
"caption": "An energetic indie rock track driven by a clean, catchy electric guitar riff and a punchy, straightforward drum and bass groove. The male lead vocal is clear and powerful, delivering an anthemic melody that soars during the high-energy chorus. The arrangement follows a classic verse-chorus structure, punctuated by a brief, melodic guitar break and a dynamic bridge that builds back into a final, powerful chorus before concluding with the initial guitar riff.",
|
| 4 |
+
"lyrics": "[Intro - Guitar Riff]\n\n[Verse 1]\nWoke up in the electric city, bright light\nNeon signs and shadows we ignite\nStepping to the beat, feel the heart race\nLife's a wild ride in this endless place\n\n[Pre-Chorus]\nSynths and bass, they carry us above\nFunk rhythms mixed with electric love\nDrums pounding loud in the crowded street\nGuitar strings strum to our moving feet\n\n[Chorus]\nTurn it up, feel the electric heartbeat\nEvery pulse bringing us to our feet\nDance through the night, we're invincible\nElectric heartbeat, unstoppable\n\n[Instrumental Break - Guitar Riff]\n\n[Verse 2]\nCity lights flicker in a rhythmic trance\nUnder moon's glow, we take a chance\nVoices harmonize in a primal scream\nEvery note a spark, fueling our dream\n\n[Pre-Chorus]\nFeel the rush, let it take control\nElectric fire burning in our soul\nNo turning back, we're lost in the sound\nIn this moment, forever unbound\n\n[Chorus]\nTurn it up, feel the electric heartbeat\nEvery pulse bringing us to our feet\nDance through the night, we're invincible\nElectric heartbeat, unstoppable\n\n[Outro - Guitar Riff and Drums]\n[abrupt silence]",
|
| 5 |
+
"bpm": 100,
|
| 6 |
+
"duration": 153,
|
| 7 |
+
"keyscale": "A major",
|
| 8 |
+
"language": "en",
|
| 9 |
+
"timesignature": "4"
|
| 10 |
+
}
|