Initial release: TheArtist chord-generation paper companion

Browse files

Files changed (7) hide show

README.md +98 -0
best.pt +3 -0
config.json +25 -0
eval_results.csv +8 -0
model.py +294 -0
tokenizer.json +356 -0
tokenizer.py +379 -0

README.md ADDED Viewed

	@@ -0,0 +1,98 @@

+---
+license: other
+library_name: pytorch
+tags:
+  - music
+  - music-generation
+  - chord-generation
+  - symbolic-music
+  - music-transformer
+  - jazz
+  - pop
+language:
+  - en
+pipeline_tag: text-generation
+---
+# TheArtist Music Transformer — F2 (Pop 5K Mix)
+**Jazz-adapted chord model with a 5,000-sequence pop rehearsal buffer. Calibration point that the paper finds is dominated by F3 on every axis.**
+One of six checkpoints released alongside the paper *Empirical Study of Pop and Jazz Mix Ratios for Genre-Adaptive Chord Generation* (Lee, 2026). See the collection overview at `PearlLeeStudio/TheArtist-MusicTransformer-pop-baseline`.
+## Model summary
+| Field | Value |
+|---|---|
+| Architecture | Music Transformer with relative positional attention |
+| Parameters | 25,661,440 |
+| Vocabulary size | 351 tokens |
+| Max sequence length | 256 |
+| d_model / heads / FFN / layers | 512 / 8 / 2048 / 8 |
+| Fine-tune resumed from | Phase 0 pop baseline |
+| Best epoch | 4 |
+## Training data
+All 1,513 jazz training sequences plus 5,000 pop rehearsal sequences (seed 42). Pop:jazz ≈ 3.3:1.
+## Evaluation (held-out per-genre test sets)
+| Metric | Pop test | Jazz test |
+|---|---:|---:|
+| Top-1 accuracy | 84.07% | 79.90% |
+| Top-5 accuracy | 97.04% | 92.14% |
+| Perplexity | 1.75 | 2.33 |
+| Δ vs. Phase 0 baseline | −0.17 | +7.04 |
+F2 is dominated by F3 on every axis. It is released for reproducibility of the saturation curve described in the paper (see paper §6.1, §7.3) but is not the recommended choice for any operating point. Prefer F3 for the balanced setting, F1 for pop-leaning, or F4 for jazz-leaning.
+## Intended use
+Reference checkpoint for replication and saturation-curve analysis. Not recommended as a default for chord-composition workflows.
+## Usage
+```python
import sys
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download

REPO_ID = "PearlLeeStudio/TheArtist-MusicTransformer-ft-pop67"

# model.py and tokenizer.py are plain files published next to best.pt, not an
# installable package. Fetch them and put their folder on sys.path so the two
# imports below resolve even when the repository has not been cloned.
# (Assumes REPO_ID is the repository that hosts these files.)
for source_file in ("model.py", "tokenizer.py"):
    source_dir = str(Path(hf_hub_download(repo_id=REPO_ID, filename=source_file)).parent)
    if source_dir not in sys.path:
        sys.path.insert(0, source_dir)

from model import MusicTransformer
from tokenizer import ChordTokenizer

ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="best.pt")
tokenizer = ChordTokenizer()
# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code. Only load checkpoints from a source you trust.
ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
# Architecture values mirror config.json; dropout is set to 0.0 for inference.
model = MusicTransformer(
    vocab_size=tokenizer.vocab_size,
    d_model=512, n_heads=8, d_ff=2048, n_layers=8,
    max_seq_len=256, dropout=0.0, pad_id=tokenizer.pad_id,
)
model.load_state_dict(ckpt["model_state_dict"])
model.eval()
+```
+## Training-data licenses
+| Dataset | License |
+|---|---|
+| Chordonomicon | Public (user-generated) |
+| McGill Billboard | CC0 |
+| Jazz Harmony Treebank | Public |
+| JazzStandards (iReal Pro) | Community redistribution |
+| Weimar Jazz Database | ODbL |
+| JAAH | Research-use public |
+## Citation
+```bibtex
+@misc{lee2026chordmix,
+  title  = {Empirical Study of Pop and Jazz Mix Ratios for Genre-Adaptive Chord Generation},
+  author = {Lee, Jinju},
+  year   = {2026},
+  eprint = {arXiv:XXXX.XXXXX}
+}
+```

best.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9aebcae5294c331aab43c509af698062f9bc7e50fb06e92280ee93126491d7b
+size 308077642

config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "run_name": "ft_jazz_pop67",
+  "resume_from": "checkpoints/phase0_pop_baseline/best.pt",
+  "pop_mix_count": 5000,
+  "epochs": 10,
+  "batch_size": 64,
+  "gradient_accumulation_steps": 2,
+  "lr": 2e-05,
+  "weight_decay": 0.01,
+  "warmup_epochs": 2,
+  "max_grad_norm": 1.0,
+  "d_model": 512,
+  "n_heads": 8,
+  "d_ff": 2048,
+  "n_layers": 8,
+  "max_seq_len": 256,
+  "dropout": 0.1,
+  "use_amp": true,
+  "checkpoint_every": 1,
+  "patience": 5,
+  "num_workers": 4,
+  "persistent_workers": true,
+  "prefetch_factor": 4,
+  "log_every_steps": 200
+}

eval_results.csv ADDED Viewed

	@@ -0,0 +1,8 @@

+epoch,lr,train_loss,val_loss,val_ppl,val_top1,val_top5,pop_loss,pop_ppl,pop_top1,pop_top5,jazz_loss,jazz_ppl,jazz_top1,jazz_top5
+3,2.56e-04,0.7450,0.5703,1.77,83.99,96.81,0.5490,1.73,84.21,97.09,1.3893,4.01,72.86,86.51
+4,2.07e-04,0.6459,0.5660,1.76,84.03,96.05,0.5599,1.75,84.06,96.17,0.8482,2.34,79.91,91.46
+5,1.50e-04,0.6020,0.5750,1.78,83.83,96.01,0.5694,1.77,83.87,96.12,0.8305,2.29,80.28,91.86
+6,9.26e-05,0.5770,0.5843,1.79,83.67,95.97,0.5790,1.78,83.69,96.08,0.8298,2.29,80.33,91.73
+7,4.39e-05,0.5587,0.5926,1.81,83.54,95.94,0.5879,1.80,83.54,96.05,0.8339,2.30,80.19,91.77
+8,1.14e-05,0.5471,0.5983,1.82,83.40,96.78,0.5937,1.81,83.41,96.88,0.8365,2.31,80.09,92.58
+9,0.00e+00,0.5410,0.6005,1.82,83.38,96.78,0.5958,1.81,83.39,96.87,0.8374,2.31,80.12,92.62

model.py ADDED Viewed

	@@ -0,0 +1,294 @@

+"""Music Transformer with relative attention for chord generation.
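+Architecture: Transformer decoder (autoregressive) with relative position
+encoding (Shaw et al. 2018). Relative logits are computed with an explicit
+index gather into the embedding table, not the memory-efficient skewing
+trick of Huang et al. 2018.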
+Default config (~25M params):
+    d_model=512, n_heads=8, d_ff=2048, n_layers=8
+"""
+from __future__ import annotations
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
class RelativeMultiHeadAttention(nn.Module):
    """Multi-head self-attention whose logits include a relative-position term.

    Each head adds ``Q · R`` to the usual ``Q · K^T`` logits, where ``R`` is
    looked up from a learned table indexed by the offset ``j - i`` between
    the key position ``j`` and the query position ``i``. The table is shared
    by all heads of the layer.
    """

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        max_seq_len: int,
        dropout: float = 0.1,
    ) -> None:
        super().__init__()
        assert d_model % n_heads == 0
        head_dim = d_model // n_heads
        self.n_heads = n_heads
        self.d_k = head_dim
        # Logits are divided by sqrt(head_dim) before the softmax.
        self.scale = math.sqrt(head_dim)
        # Query / key / value / output projections.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        # One embedding per offset in [-(max_seq_len - 1), max_seq_len - 1].
        self.max_seq_len = max_seq_len
        self.rel_emb = nn.Embedding(2 * max_seq_len - 1, head_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        """Attend over ``x``.

        Args:
            x: (B, L, D) input activations.
            mask: optional (L, L) bool tensor; True marks pairs that must
                not attend.

        Returns:
            (B, L, D) tensor after the output projection.
        """
        batch, seq_len, _ = x.shape
        heads, head_dim = self.n_heads, self.d_k

        def split_heads(projection: nn.Linear) -> torch.Tensor:
            # (B, L, D) -> (B, H, L, dk)
            return projection(x).view(batch, seq_len, heads, head_dim).transpose(1, 2)

        queries = split_heads(self.w_q)
        keys = split_heads(self.w_k)
        values = split_heads(self.w_v)

        # Content term Q K^T and position term Q R^T, both (B, H, L, L).
        content_logits = torch.matmul(queries, keys.transpose(-2, -1))
        position_logits = self._relative_attention(queries, seq_len)
        logits = (content_logits + position_logits) / self.scale

        if mask is not None:
            # Broadcast the (L, L) mask over batch and head dimensions.
            logits = logits.masked_fill(mask[None, None], float("-inf"))

        weights = self.dropout(F.softmax(logits, dim=-1))
        mixed = torch.matmul(weights, values)  # (B, H, L, dk)
        merged = mixed.transpose(1, 2).contiguous().view(batch, seq_len, -1)
        return self.w_o(merged)

    def _relative_attention(self, Q: torch.Tensor, L: int) -> torch.Tensor:
        """Return the relative-position logits ``Q @ R^T`` as (B, H, L, L).

        For the pair (i, j) the table row is ``j - i + max_seq_len - 1``;
        indices are clamped to the table bounds, so offsets beyond the table
        reuse its first or last row.
        """
        n_batch, n_head = Q.shape[0], Q.shape[1]
        steps = torch.arange(L, device=Q.device)
        # Row i, column j holds the shifted offset j - i.
        table_idx = steps.unsqueeze(0) - steps.unsqueeze(1) + self.max_seq_len - 1
        table_idx = table_idx.clamp(0, 2 * self.max_seq_len - 2)
        rel_vectors = self.rel_emb(table_idx)  # (L, L, dk)

        # Fold batch and heads together, then for every query position l take
        # the dot product of its query with the L offset vectors of row l.
        flat_queries = Q.reshape(n_batch * n_head, L, self.d_k)  # (B*H, L, dk)
        scores = torch.einsum("bld,lsd->bls", flat_queries, rel_vectors)  # (B*H, L, L)
        return scores.view(n_batch, n_head, L, L)
class TransformerBlock(nn.Module):
    """One pre-norm decoder layer: relative self-attention, then a GELU MLP.

    LayerNorm is applied to the input of each sub-layer, and each sub-layer's
    output is added back to the residual stream.
    """

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        d_ff: int,
        max_seq_len: int,
        dropout: float = 0.1,
    ) -> None:
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.attn = RelativeMultiHeadAttention(d_model, n_heads, max_seq_len, dropout)
        self.norm2 = nn.LayerNorm(d_model)
        # Position-wise feed-forward: expand to d_ff, GELU, project back.
        feed_forward = [
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        ]
        self.ffn = nn.Sequential(*feed_forward)
        # Dropout on the attention output before the residual addition.
        self.drop = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        """Apply the attention and feed-forward sub-layers to ``x``.

        ``mask`` is passed unchanged to the attention module.
        """
        attended = self.attn(self.norm1(x), mask)
        x = x + self.drop(attended)
        return x + self.ffn(self.norm2(x))
class MusicTransformer(nn.Module):
    """Autoregressive Music Transformer for chord generation.

    Token embeddings are scaled by ``sqrt(d_model)``, passed through a stack
    of ``TransformerBlock`` layers under a causal mask, layer-normalized and
    projected to vocabulary logits. The output projection shares its weight
    matrix with the token embedding.
    """

    def __init__(
        self,
        vocab_size: int,
        d_model: int = 512,
        n_heads: int = 8,
        d_ff: int = 2048,
        n_layers: int = 8,
        max_seq_len: int = 512,
        dropout: float = 0.1,
        pad_id: int = 0,
    ) -> None:
        super().__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len
        self.pad_id = pad_id
        self.token_emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.drop = nn.Dropout(dropout)
        self.layers = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff, max_seq_len, dropout)
            for _ in range(n_layers)
        ])
        self.norm = nn.LayerNorm(d_model)
        self.out_proj = nn.Linear(d_model, vocab_size, bias=False)
        # Weight tying (embedding <-> output projection): both modules hold
        # the same Parameter object.
        self.out_proj.weight = self.token_emb.weight
        self._init_weights()

    def _init_weights(self) -> None:
        """Xavier-initialize every matrix except the (tied) token embedding."""
        for name, p in self.named_parameters():
            if p.dim() > 1 and "token_emb" not in name:
                nn.init.xavier_uniform_(p)
        # Embedding std=1/sqrt(d_model) so that after *sqrt(d_model) scaling
        # inputs have unit variance, and weight-tied output logits stay small.
        # NOTE: this draws the whole matrix, including the row at pad_id.
        nn.init.normal_(self.token_emb.weight, mean=0.0, std=self.d_model ** -0.5)

    @staticmethod
    def _causal_mask(L: int, device: torch.device) -> torch.Tensor:
        """Upper-triangular (L, L) bool mask; True = masked (future position)."""
        return torch.triu(torch.ones(L, L, device=device, dtype=torch.bool), diagonal=1)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Compute next-token logits for every position.

        Only the causal mask is applied; padding positions are not masked
        out of attention here.

        Args:
            input_ids: (B, L) token IDs.

        Returns:
            logits: (B, L, vocab_size)
        """
        _, L = input_ids.shape
        x = self.token_emb(input_ids) * math.sqrt(self.d_model)
        x = self.drop(x)
        mask = self._causal_mask(L, input_ids.device)
        for layer in self.layers:
            x = layer(x, mask)
        return self.out_proj(self.norm(x))

    def count_parameters(self) -> int:
        """Return the number of trainable parameter elements."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    @torch.no_grad()
    def generate(
        self,
        prompt_ids: torch.Tensor,
        max_new_tokens: int = 64,
        temperature: float = 1.0,
        top_k: int = 0,
        top_p: float = 0.9,
        eos_id: int = 2,
        repetition_penalty: float = 1.0,
        no_repeat_ngram_size: int = 0,
        ignore_repeat_token_ids: set[int] | None = None,
    ) -> torch.Tensor:
        """Autoregressive generation from a prompt.

        Side effect: the module is switched to eval mode and left there.
        The repetition controls read only the first row of the batch, so
        the prompt is expected to have batch size 1.

        Args:
            prompt_ids: (1, L) token IDs including [BOS] and context.
            max_new_tokens: maximum tokens to generate.
            temperature: sampling temperature (lower = more deterministic);
                values below 1e-8 are clamped to 1e-8.
            top_k: keep only top-k logits (0 = disabled).
            top_p: nucleus sampling threshold; applied only when
                0 < top_p < 1.
            eos_id: stop token.
            repetition_penalty: penalty factor for previously-seen tokens
                (HF convention): positive logits are divided by it and
                negative logits multiplied. > 1.0 discourages repeats.
                1.0 disables. Typical: 1.2-1.5.
            no_repeat_ngram_size: ban candidate tokens that would complete
                an n-gram already present in the current sequence (n =
                this value). 0 disables. Typical: 3 for chord sequences.
            ignore_repeat_token_ids: token ids exempt from the two repetition
                controls above — e.g. [BAR] or other separators that
                *should* recur. Any iterable of ints (set, list, tuple) is
                accepted. If None, no exemptions.

        Returns:
            (1, L') full sequence including prompt and generated tokens.
        """
        self.eval()
        ids = prompt_ids.clone()
        # Coerce to a set so lists/tuples work with the set arithmetic below
        # (a plain list would raise TypeError on ``set - list``).
        exempt = set(ignore_repeat_token_ids) if ignore_repeat_token_ids else set()

        for _ in range(max_new_tokens):
            # Feed at most the last max_seq_len tokens.
            ctx = ids[:, -self.max_seq_len :]
            logits = self(ctx)[:, -1, :] / max(temperature, 1e-8)

            # Repetition penalty (HuggingFace-style): scale already-seen token
            # logits so they are less attractive. Positive logits get divided,
            # negative logits get multiplied (stays "less attractive" either sign).
            if repetition_penalty != 1.0:
                seen = set(ids[0].tolist()) - exempt
                if seen:
                    idx = torch.tensor(list(seen), device=logits.device, dtype=torch.long)
                    vals = logits[0, idx]
                    vals = torch.where(
                        vals > 0,
                        vals / repetition_penalty,
                        vals * repetition_penalty,
                    )
                    logits[0, idx] = vals

            # No-repeat n-gram: block any candidate token that would complete
            # an n-gram already present earlier in the sequence.
            if no_repeat_ngram_size > 0 and ids.shape[1] >= no_repeat_ngram_size:
                n = no_repeat_ngram_size
                seq = ids[0].tolist()
                # The last n-1 tokens are the prefix the next token would extend.
                prefix = tuple(seq[-(n - 1):]) if n > 1 else ()
                banned: set[int] = set()
                for i in range(len(seq) - n + 1):
                    if tuple(seq[i : i + n - 1]) == prefix:
                        banned.add(seq[i + n - 1])
                banned -= exempt
                if banned:
                    bidx = torch.tensor(list(banned), device=logits.device, dtype=torch.long)
                    logits[0, bidx] = float("-inf")

            # Top-k: drop everything below the k-th largest logit.
            if top_k > 0:
                topk_vals, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < topk_vals[:, -1:]] = float("-inf")

            # Top-p (nucleus): drop tokens whose preceding cumulative mass
            # already exceeds top_p; the most likely token is always kept.
            if 0 < top_p < 1.0:
                sorted_logits, sorted_idx = torch.sort(logits, descending=True)
                sorted_probs = F.softmax(sorted_logits, dim=-1)
                cum_probs = torch.cumsum(sorted_probs, dim=-1)
                remove = cum_probs - sorted_probs > top_p
                sorted_logits[remove] = float("-inf")
                # Undo the sort: write each value back to its vocabulary slot.
                logits = sorted_logits.scatter(1, sorted_idx, sorted_logits)

            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            ids = torch.cat([ids, next_id], dim=-1)
            if (next_id == eos_id).all():
                break
        return ids

tokenizer.json ADDED Viewed

	@@ -0,0 +1,356 @@

+{
+  "token2id": {
+    "[PAD]": 0,
+    "[BOS]": 1,
+    "[EOS]": 2,
+    "[BAR]": 3,
+    "[KEY:Cmaj]": 4,
+    "[KEY:Dbmaj]": 5,
+    "[KEY:Dmaj]": 6,
+    "[KEY:Ebmaj]": 7,
+    "[KEY:Emaj]": 8,
+    "[KEY:Fmaj]": 9,
+    "[KEY:F#maj]": 10,
+    "[KEY:Gmaj]": 11,
+    "[KEY:Abmaj]": 12,
+    "[KEY:Amaj]": 13,
+    "[KEY:Bbmaj]": 14,
+    "[KEY:Bmaj]": 15,
+    "[KEY:Cmin]": 16,
+    "[KEY:Dbmin]": 17,
+    "[KEY:Dmin]": 18,
+    "[KEY:Ebmin]": 19,
+    "[KEY:Emin]": 20,
+    "[KEY:Fmin]": 21,
+    "[KEY:F#min]": 22,
+    "[KEY:Gmin]": 23,
+    "[KEY:Abmin]": 24,
+    "[KEY:Amin]": 25,
+    "[KEY:Bbmin]": 26,
+    "[KEY:Bmin]": 27,
+    "[TIME:4/4]": 28,
+    "[TIME:3/4]": 29,
+    "[TIME:6/8]": 30,
+    "[TIME:2/4]": 31,
+    "[TIME:5/4]": 32,
+    "[GENRE:jazz]": 33,
+    "[GENRE:pop]": 34,
+    "[GENRE:rock]": 35,
+    "[GENRE:blues]": 36,
+    "[GENRE:bossa]": 37,
+    "[GENRE:none]": 38,
+    "Cmaj": 39,
+    "Cm": 40,
+    "C7": 41,
+    "Cmaj7": 42,
+    "Cm7": 43,
+    "Cm7b5": 44,
+    "Cdim7": 45,
+    "Cdim": 46,
+    "Caug": 47,
+    "Csus4": 48,
+    "Csus2": 49,
+    "C6": 50,
+    "Cm6": 51,
+    "C9": 52,
+    "Cm9": 53,
+    "Cmaj9": 54,
+    "C11": 55,
+    "Cm11": 56,
+    "C13": 57,
+    "Cm13": 58,
+    "Cadd9": 59,
+    "CmMaj7": 60,
+    "C7b9": 61,
+    "C7#9": 62,
+    "C7#11": 63,
+    "C7b13": 64,
+    "Dbmaj": 65,
+    "Dbm": 66,
+    "Db7": 67,
+    "Dbmaj7": 68,
+    "Dbm7": 69,
+    "Dbm7b5": 70,
+    "Dbdim7": 71,
+    "Dbdim": 72,
+    "Dbaug": 73,
+    "Dbsus4": 74,
+    "Dbsus2": 75,
+    "Db6": 76,
+    "Dbm6": 77,
+    "Db9": 78,
+    "Dbm9": 79,
+    "Dbmaj9": 80,
+    "Db11": 81,
+    "Dbm11": 82,
+    "Db13": 83,
+    "Dbm13": 84,
+    "Dbadd9": 85,
+    "DbmMaj7": 86,
+    "Db7b9": 87,
+    "Db7#9": 88,
+    "Db7#11": 89,
+    "Db7b13": 90,
+    "Dmaj": 91,
+    "Dm": 92,
+    "D7": 93,
+    "Dmaj7": 94,
+    "Dm7": 95,
+    "Dm7b5": 96,
+    "Ddim7": 97,
+    "Ddim": 98,
+    "Daug": 99,
+    "Dsus4": 100,
+    "Dsus2": 101,
+    "D6": 102,
+    "Dm6": 103,
+    "D9": 104,
+    "Dm9": 105,
+    "Dmaj9": 106,
+    "D11": 107,
+    "Dm11": 108,
+    "D13": 109,
+    "Dm13": 110,
+    "Dadd9": 111,
+    "DmMaj7": 112,
+    "D7b9": 113,
+    "D7#9": 114,
+    "D7#11": 115,
+    "D7b13": 116,
+    "Ebmaj": 117,
+    "Ebm": 118,
+    "Eb7": 119,
+    "Ebmaj7": 120,
+    "Ebm7": 121,
+    "Ebm7b5": 122,
+    "Ebdim7": 123,
+    "Ebdim": 124,
+    "Ebaug": 125,
+    "Ebsus4": 126,
+    "Ebsus2": 127,
+    "Eb6": 128,
+    "Ebm6": 129,
+    "Eb9": 130,
+    "Ebm9": 131,
+    "Ebmaj9": 132,
+    "Eb11": 133,
+    "Ebm11": 134,
+    "Eb13": 135,
+    "Ebm13": 136,
+    "Ebadd9": 137,
+    "EbmMaj7": 138,
+    "Eb7b9": 139,
+    "Eb7#9": 140,
+    "Eb7#11": 141,
+    "Eb7b13": 142,
+    "Emaj": 143,
+    "Em": 144,
+    "E7": 145,
+    "Emaj7": 146,
+    "Em7": 147,
+    "Em7b5": 148,
+    "Edim7": 149,
+    "Edim": 150,
+    "Eaug": 151,
+    "Esus4": 152,
+    "Esus2": 153,
+    "E6": 154,
+    "Em6": 155,
+    "E9": 156,
+    "Em9": 157,
+    "Emaj9": 158,
+    "E11": 159,
+    "Em11": 160,
+    "E13": 161,
+    "Em13": 162,
+    "Eadd9": 163,
+    "EmMaj7": 164,
+    "E7b9": 165,
+    "E7#9": 166,
+    "E7#11": 167,
+    "E7b13": 168,
+    "Fmaj": 169,
+    "Fm": 170,
+    "F7": 171,
+    "Fmaj7": 172,
+    "Fm7": 173,
+    "Fm7b5": 174,
+    "Fdim7": 175,
+    "Fdim": 176,
+    "Faug": 177,
+    "Fsus4": 178,
+    "Fsus2": 179,
+    "F6": 180,
+    "Fm6": 181,
+    "F9": 182,
+    "Fm9": 183,
+    "Fmaj9": 184,
+    "F11": 185,
+    "Fm11": 186,
+    "F13": 187,
+    "Fm13": 188,
+    "Fadd9": 189,
+    "FmMaj7": 190,
+    "F7b9": 191,
+    "F7#9": 192,
+    "F7#11": 193,
+    "F7b13": 194,
+    "F#maj": 195,
+    "F#m": 196,
+    "F#7": 197,
+    "F#maj7": 198,
+    "F#m7": 199,
+    "F#m7b5": 200,
+    "F#dim7": 201,
+    "F#dim": 202,
+    "F#aug": 203,
+    "F#sus4": 204,
+    "F#sus2": 205,
+    "F#6": 206,
+    "F#m6": 207,
+    "F#9": 208,
+    "F#m9": 209,
+    "F#maj9": 210,
+    "F#11": 211,
+    "F#m11": 212,
+    "F#13": 213,
+    "F#m13": 214,
+    "F#add9": 215,
+    "F#mMaj7": 216,
+    "F#7b9": 217,
+    "F#7#9": 218,
+    "F#7#11": 219,
+    "F#7b13": 220,
+    "Gmaj": 221,
+    "Gm": 222,
+    "G7": 223,
+    "Gmaj7": 224,
+    "Gm7": 225,
+    "Gm7b5": 226,
+    "Gdim7": 227,
+    "Gdim": 228,
+    "Gaug": 229,
+    "Gsus4": 230,
+    "Gsus2": 231,
+    "G6": 232,
+    "Gm6": 233,
+    "G9": 234,
+    "Gm9": 235,
+    "Gmaj9": 236,
+    "G11": 237,
+    "Gm11": 238,
+    "G13": 239,
+    "Gm13": 240,
+    "Gadd9": 241,
+    "GmMaj7": 242,
+    "G7b9": 243,
+    "G7#9": 244,
+    "G7#11": 245,
+    "G7b13": 246,
+    "Abmaj": 247,
+    "Abm": 248,
+    "Ab7": 249,
+    "Abmaj7": 250,
+    "Abm7": 251,
+    "Abm7b5": 252,
+    "Abdim7": 253,
+    "Abdim": 254,
+    "Abaug": 255,
+    "Absus4": 256,
+    "Absus2": 257,
+    "Ab6": 258,
+    "Abm6": 259,
+    "Ab9": 260,
+    "Abm9": 261,
+    "Abmaj9": 262,
+    "Ab11": 263,
+    "Abm11": 264,
+    "Ab13": 265,
+    "Abm13": 266,
+    "Abadd9": 267,
+    "AbmMaj7": 268,
+    "Ab7b9": 269,
+    "Ab7#9": 270,
+    "Ab7#11": 271,
+    "Ab7b13": 272,
+    "Amaj": 273,
+    "Am": 274,
+    "A7": 275,
+    "Amaj7": 276,
+    "Am7": 277,
+    "Am7b5": 278,
+    "Adim7": 279,
+    "Adim": 280,
+    "Aaug": 281,
+    "Asus4": 282,
+    "Asus2": 283,
+    "A6": 284,
+    "Am6": 285,
+    "A9": 286,
+    "Am9": 287,
+    "Amaj9": 288,
+    "A11": 289,
+    "Am11": 290,
+    "A13": 291,
+    "Am13": 292,
+    "Aadd9": 293,
+    "AmMaj7": 294,
+    "A7b9": 295,
+    "A7#9": 296,
+    "A7#11": 297,
+    "A7b13": 298,
+    "Bbmaj": 299,
+    "Bbm": 300,
+    "Bb7": 301,
+    "Bbmaj7": 302,
+    "Bbm7": 303,
+    "Bbm7b5": 304,
+    "Bbdim7": 305,
+    "Bbdim": 306,
+    "Bbaug": 307,
+    "Bbsus4": 308,
+    "Bbsus2": 309,
+    "Bb6": 310,
+    "Bbm6": 311,
+    "Bb9": 312,
+    "Bbm9": 313,
+    "Bbmaj9": 314,
+    "Bb11": 315,
+    "Bbm11": 316,
+    "Bb13": 317,
+    "Bbm13": 318,
+    "Bbadd9": 319,
+    "BbmMaj7": 320,
+    "Bb7b9": 321,
+    "Bb7#9": 322,
+    "Bb7#11": 323,
+    "Bb7b13": 324,
+    "Bmaj": 325,
+    "Bm": 326,
+    "B7": 327,
+    "Bmaj7": 328,
+    "Bm7": 329,
+    "Bm7b5": 330,
+    "Bdim7": 331,
+    "Bdim": 332,
+    "Baug": 333,
+    "Bsus4": 334,
+    "Bsus2": 335,
+    "B6": 336,
+    "Bm6": 337,
+    "B9": 338,
+    "Bm9": 339,
+    "Bmaj9": 340,
+    "B11": 341,
+    "Bm11": 342,
+    "B13": 343,
+    "Bm13": 344,
+    "Badd9": 345,
+    "BmMaj7": 346,
+    "B7b9": 347,
+    "B7#9": 348,
+    "B7#11": 349,
+    "B7b13": 350
+  },
+  "vocab_size": 351
+}

tokenizer.py ADDED Viewed

	@@ -0,0 +1,379 @@

+"""Chord sequence tokenizer for Music Transformer training.
+Vocabulary (351 tokens = 4 special + 24 key + 5 time-signature + 6 genre + 312 chord):
+  [PAD]=0, [BOS]=1, [EOS]=2, [BAR]=3
+  [KEY:Cmaj] ... [KEY:Bmin]  (24 keys)
+  [TIME:4/4] ... [TIME:5/4]  (5 time sigs)
+  [GENRE:jazz] ... [GENRE:none]  (6 genres)
+  Cmaj, Cm, C7, ... B7b13      (12 roots x 26 qualities = 312 chords)
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
# The twelve canonical root names in ascending semitone order, starting at C
# (jazz convention: flats are preferred; F# is the only sharp spelling).
# Chord token ids follow this order.
ROOTS = "C Db D Eb E F F# G Ab A Bb B".split()

# Alternative root spellings mapped onto the canonical names above.
ROOT_ALIASES: dict[str, str] = {
    # Enharmonic spellings
    "C#": "Db",
    "D#": "Eb",
    "E#": "F",
    "Fb": "E",
    "G#": "Ab",
    "A#": "Bb",
    "B#": "C",
    "Cb": "B",
    "Gb": "F#",
    # Lowercase naturals
    "c": "C",
    "d": "D",
    "e": "E",
    "f": "F",
    "g": "G",
    "a": "A",
    "b": "B",
    # Lowercase accidentals
    "c#": "Db",
    "db": "Db",
    "d#": "Eb",
    "eb": "Eb",
    "f#": "F#",
    "gb": "F#",
    "g#": "Ab",
    "ab": "Ab",
    "a#": "Bb",
    "bb": "Bb",
    "cb": "B",
    "fb": "E",
}

# The 26 chord qualities of the vocabulary. Chord token ids follow this
# order within each root, so the order must not change.
QUALITIES = [
    "maj", "m", "7", "maj7", "m7", "m7b5", "dim7", "dim", "aug",
    "sus4", "sus2",
    "6", "m6",
    "9", "m9", "maj9",
    "11", "m11",
    "13", "m13",
    "add9", "mMaj7",
    "7b9", "7#9", "7#11", "7b13",
]

# Alternative quality spellings -> canonical quality. Insertion order is kept
# as-is because the case-insensitive lookup returns the first match.
_QUALITY_ALIASES: dict[str, str] = {
    # Major
    "major": "maj",
    "M": "maj",
    # Minor
    "min": "m",
    "minor": "m",
    "-": "m",
    "mi": "m",
    # Dominant 7
    "dom7": "7",
    "dom": "7",
    # Major 7
    "^7": "maj7",
    "M7": "maj7",
    "Maj7": "maj7",
    "major7": "maj7",
    "j7": "maj7",
    "^": "maj7",
    "delta": "maj7",
    # Minor 7
    "min7": "m7",
    "-7": "m7",
    "mi7": "m7",
    # Half-diminished
    "hdim7": "m7b5",
    "hdim": "m7b5",
    "h7": "m7b5",
    "%7": "m7b5",
    "%": "m7b5",
    # Diminished
    "o": "dim",
    "o7": "dim7",
    # Augmented
    "+": "aug",
    # Suspended
    "sus": "sus4",
    # 6th
    "min6": "m6",
    "-6": "m6",
    # 9th
    "min9": "m9",
    "-9": "m9",
    "M9": "maj9",
    "^9": "maj9",
    "Maj9": "maj9",
    # 11th
    "min11": "m11",
    "-11": "m11",
    # 13th
    "min13": "m13",
    "-13": "m13",
    # Minor-major 7
    "minmaj7": "mMaj7",
    "-^7": "mMaj7",
    "mM7": "mMaj7",
    # Altered dominants
    "7alt": "7b9",
}

# Key, time-signature and genre values that get their own metadata tokens.
MAJOR_KEYS = [root + "maj" for root in ROOTS]
MINOR_KEYS = [root + "min" for root in ROOTS]
ALL_KEYS = [*MAJOR_KEYS, *MINOR_KEYS]
TIME_SIGS = ["4/4", "3/4", "6/8", "2/4", "5/4"]
GENRES = ["jazz", "pop", "rock", "blues", "bossa"]
class ChordTokenizer:
    """Deterministic tokenizer for chord sequences.

    The vocabulary is rebuilt from the module-level constants on every
    construction: four special tokens, then key, time-signature and genre
    tags, then one token per (root, quality) pair.
    """

    # Ids of the special tokens; they equal the positions of the first four
    # tokens appended in _build_vocab.
    PAD = 0
    BOS = 1
    EOS = 2
    BAR = 3

    def __init__(self) -> None:
        self.token2id: dict[str, int] = {}
        self.id2token: dict[int, str] = {}
        self._build_vocab()

    # ------------------------------------------------------------------
    # Vocab construction
    # ------------------------------------------------------------------
    def _build_vocab(self) -> None:
        """Fill ``token2id`` / ``id2token``; ids are assigned in append order."""
        tokens: list[str] = ["[PAD]", "[BOS]", "[EOS]", "[BAR]"]
        tokens.extend(f"[KEY:{key}]" for key in ALL_KEYS)
        tokens.extend(f"[TIME:{ts}]" for ts in TIME_SIGS)
        tokens.extend(f"[GENRE:{genre}]" for genre in GENRES)
        tokens.append("[GENRE:none]")
        tokens.extend(f"{root}{quality}" for root in ROOTS for quality in QUALITIES)
        for i, tok in enumerate(tokens):
            self.token2id[tok] = i
            self.id2token[i] = tok

    @property
    def vocab_size(self) -> int:
        """Number of tokens in the vocabulary."""
        return len(self.token2id)

    @property
    def pad_id(self) -> int:
        return self.PAD

    @property
    def bos_id(self) -> int:
        return self.BOS

    @property
    def eos_id(self) -> int:
        return self.EOS

    @property
    def bar_id(self) -> int:
        return self.BAR

    # ------------------------------------------------------------------
    # Encoding helpers
    # ------------------------------------------------------------------
    def encode_chord(self, chord_str: str) -> int | None:
        """Return the id of a chord symbol, or None if it cannot be normalized."""
        token = self.normalize_chord(chord_str)
        return self.token2id.get(token) if token else None

    def encode_key(self, key_str: str) -> int | None:
        """Return the id of ``[KEY:<key_str>]``, or None if unknown."""
        return self.token2id.get(f"[KEY:{key_str}]")

    def encode_time_sig(self, ts: str) -> int | None:
        """Return the id of ``[TIME:<ts>]``, or None if unknown."""
        return self.token2id.get(f"[TIME:{ts}]")

    def encode_genre(self, genre: str) -> int | None:
        """Return the id of ``[GENRE:<genre>]``, or None if unknown."""
        return self.token2id.get(f"[GENRE:{genre}]")

    def encode_sequence(self, song: dict) -> list[int]:
        """Encode a unified song dict to a token-ID sequence.

        Layout: ``[BOS]``, key, time signature, genre, then for each bar a
        ``[BAR]`` token followed by its chords, and finally ``[EOS]``.
        Metadata values and chords that have no token are skipped silently.

        Expected *song* format::

            {
                "key": "Cmaj",
                "time_signature": "4/4",
                "genre": "jazz",
                "bars": [["Cmaj7", "Am7"], ["Dm7", "G7"], ...]
            }

        Missing fields default to key "Cmaj", time signature "4/4",
        genre "none" and no bars.
        """
        ids: list[int] = [self.BOS]
        header = (
            self.encode_key(song.get("key", "Cmaj")),
            self.encode_time_sig(song.get("time_signature", "4/4")),
            self.encode_genre(song.get("genre", "none")),
        )
        ids.extend(tag for tag in header if tag is not None)
        for bar in song.get("bars", []):
            ids.append(self.BAR)
            for chord in bar:
                cid = self.encode_chord(chord)
                if cid is not None:
                    ids.append(cid)
        ids.append(self.EOS)
        return ids

    def decode(self, ids: list[int]) -> list[str]:
        """Map ids back to token strings; unknown ids become ``"[UNK]"``."""
        return [self.id2token.get(i, "[UNK]") for i in ids]

    # ------------------------------------------------------------------
    # Chord normalization
    # ------------------------------------------------------------------
    @staticmethod
    def normalize_root(root: str) -> str | None:
        """Normalize a root note name to canonical form, or None if unknown."""
        if root in ROOTS:
            return root
        if root in ROOT_ALIASES:
            return ROOT_ALIASES[root]
        # Try capitalize first letter
        cap = root[0].upper() + root[1:] if len(root) > 1 else root.upper()
        if cap in ROOTS:
            return cap
        if cap in ROOT_ALIASES:
            return ROOT_ALIASES[cap]
        return None

    @staticmethod
    def normalize_chord(chord_str: str) -> str | None:
        """Normalize any chord notation to ``{Root}{quality}`` in our vocab.

        Returns None for empty input, no-chord markers, an unknown root, or
        a symbol that is empty once its slash bass is removed (e.g. "/G").
        """
        if not chord_str or chord_str in (
            "N", "NC", "N.C.", "X", "x",
            "pause", "silence", "&pause", "end",
        ):
            return None
        # Strip slash-chord bass
        if "/" in chord_str:
            chord_str = chord_str.split("/")[0]
            if not chord_str:
                # Nothing left before the slash; indexing below would fail.
                return None
        # Billboard colon format  Root:Quality
        if ":" in chord_str:
            root_part, qual_part = chord_str.split(":", 1)
            # qual_part may also have /bass — already stripped above
        else:
            root_part = chord_str[0]
            qual_part = chord_str[1:]
            # An accidental right after the letter belongs to the root.
            if qual_part and qual_part[0] in ("b", "#"):
                root_part += qual_part[0]
                qual_part = qual_part[1:]
        norm_root = ChordTokenizer.normalize_root(root_part)
        if norm_root is None:
            return None
        quality = ChordTokenizer._normalize_quality(qual_part)
        if quality is None or quality not in QUALITIES:
            return None
        return f"{norm_root}{quality}"

    @staticmethod
    def _normalize_quality(q: str) -> str | None:
        """Map various quality notations to our canonical set.

        Lookup order: empty string (plain major), exact vocabulary entry,
        exact alias, case-insensitive alias, then heuristics for altered
        dominants, compound major and compound minor spellings. Anything
        still unmatched is approximated as "maj".
        """
        if not q:
            return "maj"
        # Direct hit
        if q in QUALITIES:
            return q
        # Alias table
        if q in _QUALITY_ALIASES:
            return _QUALITY_ALIASES[q]
        # Case-insensitive alias search
        lowered = q.lower()
        for alias, canon in _QUALITY_ALIASES.items():
            if lowered == alias.lower():
                return canon
        # ---- Heuristic fallbacks for unusual notations ----
        # WJazzD altered dominants: "79b" → 7b9, "79#" → 7#9, etc.
        if q.startswith("7"):
            tail = q[1:]
            if "b9" in tail or "9b" in tail:
                return "7b9"
            if "#9" in tail or "9#" in tail:
                return "7#9"
            if "#11" in tail or "11#" in tail:
                return "7#11"
            if "b13" in tail or "13b" in tail:
                return "7b13"
        # Compound major qualities ("maj13", "maj7#11", "maj6", ...). These
        # start with "m", so they must be handled before the minor branch,
        # which would otherwise turn them into minor chords.
        if lowered.startswith("maj"):
            ext = lowered[3:]
            if "add9" in ext:
                return "add9"
            if "7" in ext:
                return "maj7"
            if "6" in ext:
                return "6"
            if "9" in ext:
                return "maj9"
            if "11" in ext or "13" in ext:
                # No maj11/maj13 token exists; maj7 is the nearest entry.
                return "maj7"
            return "maj"
        # Compound minor qualities
        if q.startswith("m") or q.startswith("-"):
            inner = q.lstrip("m").lstrip("-")
            if "7" in inner and "b5" in inner:
                return "m7b5"
            if "7" in inner:
                return "m7"
            if "9" in inner:
                return "m9"
            if "11" in inner:
                return "m11"
            if "13" in inner:
                return "m13"
            if "6" in inner:
                return "m6"
            return "m"
        # Bare numbers ("7", "9", "6", "11", "13") are vocabulary entries and
        # were already returned by the direct-hit check above.
        # If nothing matched, approximate as major
        return "maj"

    # ------------------------------------------------------------------
    # Transposition
    # ------------------------------------------------------------------
    def transpose_chord_token(self, token: str, semitones: int) -> str | None:
        """Transpose a chord token string by *semitones*.

        Returns None for empty strings, bracketed (non-chord) tokens and
        tokens whose root is not recognized.
        """
        if not token or token.startswith("["):
            return None
        root = token[0]
        rest = token[1:]
        if rest and rest[0] in ("b", "#"):
            root += rest[0]
            rest = rest[1:]
        norm_root = self.normalize_root(root)
        if norm_root is None:
            return None
        new_root = ROOTS[(ROOTS.index(norm_root) + semitones) % 12]
        return f"{new_root}{rest}"

    def transpose_key_token(self, token: str, semitones: int) -> str:
        """Transpose a key token like ``[KEY:Cmaj]``.

        The token is returned unchanged when its mode suffix is neither
        "maj" nor "min" or its root is not recognized.
        """
        inner = token[5:-1]  # strip [KEY: and ]
        if inner.endswith("maj"):
            root, mode = inner[:-3], "maj"
        elif inner.endswith("min"):
            root, mode = inner[:-3], "min"
        else:
            return token
        norm = self.normalize_root(root)
        if norm is None:
            return token
        new_root = ROOTS[(ROOTS.index(norm) + semitones) % 12]
        return f"[KEY:{new_root}{mode}]"

    def transpose_sequence(self, ids: list[int], semitones: int) -> list[int]:
        """Transpose every chord & key token in *ids* by *semitones*.

        Other tokens and unknown ids are copied through; a token whose
        transposed form is not in the vocabulary keeps its original id.
        A whole number of octaves returns a plain copy.
        """
        if semitones % 12 == 0:
            return list(ids)
        out: list[int] = []
        for tid in ids:
            tok = self.id2token.get(tid)
            if tok is None:
                out.append(tid)
            elif tok.startswith("[KEY:"):
                new = self.transpose_key_token(tok, semitones)
                out.append(self.token2id.get(new, tid))
            elif tok.startswith("[") or tid <= self.BAR:
                out.append(tid)
            else:
                new = self.transpose_chord_token(tok, semitones)
                out.append(self.token2id[new] if new and new in self.token2id else tid)
        return out

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def save(self, path: str | Path) -> None:
        """Write ``token2id`` and ``vocab_size`` to *path* as UTF-8 JSON."""
        payload = {
            "token2id": self.token2id,
            "vocab_size": self.vocab_size,
        }
        Path(path).write_text(
            json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
        )

    @classmethod
    def load(cls, path: str | Path) -> "ChordTokenizer":
        """Build a tokenizer and verify that the file at *path* matches it.

        The vocabulary always comes from the module constants; the file is
        used only as a consistency check.

        Raises:
            AssertionError: if the stored ``vocab_size`` or ``token2id``
                differs from the freshly built vocabulary. Raised
                explicitly so the check also runs under ``python -O``.
        """
        tok = cls()
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        if data["vocab_size"] != tok.vocab_size:
            raise AssertionError(
                f"Vocab mismatch: file={data['vocab_size']}, current={tok.vocab_size}"
            )
        stored = data.get("token2id")
        if stored is not None and stored != tok.token2id:
            raise AssertionError(
                "Vocab mismatch: token2id in file differs from the current vocabulary"
            )
        return tok