Rahma89 committed
Commit 27441d2 · verified · 1 parent: 11742d1

Upload Conv-TasNet best checkpoint

Files changed (8)
  1. README.md +95 -0
  2. best.ckpt +3 -0
  3. configs/data.yaml +10 -0
  4. configs/train.yaml +35 -0
  5. requirements.txt +5 -0
  6. src/model.py +114 -0
  7. src/separate.py +141 -0
  8. training_metadata.json +9 -0
README.md ADDED
@@ -0,0 +1,95 @@
---
license: apache-2.0
tags:
- audio
- speech
- source-separation
- conv-tasnet
- asteroid
- pytorch
library_name: pytorch
pipeline_tag: audio-to-audio
---

# Cocktail Party AI - Conv-TasNet 3-Source Separator

This repository contains the best checkpoint from a Conv-TasNet model trained for speech source separation.
The model takes a mixed speech waveform and estimates 3 separated source waveforms.

## Checkpoint

- File: `best.ckpt`
- Architecture: Asteroid `ConvTasNet`
- Number of sources: 3
- Sample rate: 16 kHz
- Training checkpoint epoch: 68
- Best validation loss: -2.909952
- Approximate validation SI-SNR: 2.91 dB

## Files

```text
best.ckpt
configs/data.yaml
configs/train.yaml
requirements.txt
src/model.py
src/separate.py
```

## Usage

Install dependencies:

```bash
pip install -r requirements.txt
```

Load the checkpoint with the project code:

```python
import yaml
import torch

from src.model import build_model, load_checkpoint

with open("configs/train.yaml") as f:
    train_cfg = yaml.safe_load(f)
with open("configs/data.yaml") as f:
    data_cfg = yaml.safe_load(f)

mod = train_cfg["model"]
ds = data_cfg["dataset"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = build_model(
    n_src=ds["n_src"],
    sample_rate=ds["sample_rate"],
    n_filters=mod["n_filters"],
    filter_length=mod["filter_length"],
    stride=mod["stride"],
    n_blocks=mod["n_blocks"],
    n_repeats=mod["n_repeats"],
    bn_chan=mod["bn_chan"],
    hid_chan=mod["hid_chan"],
    skip_chan=mod["skip_chan"],
    norm_type=mod["norm_type"],
    mask_act=mod["mask_act"],
    use_gradient_checkpointing=False,
).to(device)

load_checkpoint(model, "best.ckpt", device)
model.eval()
```
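
Once loaded, inference mirrors what `src/separate.py` does. A minimal sketch, continuing from the snippet above (`mixture.wav` is a placeholder path; stereo input is downmixed to mono as in the project script):

```python
import torchaudio

# Load the mixture and match the model's 16 kHz training rate.
mixture, sr = torchaudio.load("mixture.wav")  # placeholder path
if sr != ds["sample_rate"]:
    mixture = torchaudio.functional.resample(mixture, sr, ds["sample_rate"])
if mixture.shape[0] > 1:
    mixture = mixture.mean(dim=0, keepdim=True)  # downmix to mono (1, T)

with torch.no_grad():
    # Add a batch dimension; the output is (n_src, T) after squeezing.
    est_sources = model(mixture.unsqueeze(0).to(device)).squeeze(0)
```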
85
+
86
+ To separate a WAV file using this project:
87
+
88
+ ```bash
89
+ python src/separate.py --mix path/to/mixture.wav --ckpt best.ckpt
90
+ ```

## Notes

This is a research/training checkpoint, not a fully packaged `transformers` pipeline.
It depends on PyTorch, Torchaudio, and Asteroid.
best.ckpt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ca8736f8da084c480bd765a99cb0fb4f2f1b2a9c94437cf345896c75bab8ee4
size 23479878
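
This is a Git LFS pointer, so a plain `git clone` without LFS installed fetches only these three lines, not the 23 MB checkpoint. Run `git lfs pull` after cloning, or download the file programmatically; a sketch using `huggingface_hub`, where the repo id is a placeholder for wherever this model is hosted:

```python
from huggingface_hub import hf_hub_download

# Placeholder repo id; substitute the actual Hub repository name.
ckpt_path = hf_hub_download(repo_id="user/conv-tasnet-3src", filename="best.ckpt")
```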
configs/data.yaml ADDED
@@ -0,0 +1,10 @@
dataset:
  root_dir: data/mixtures       # path to the mixtures directory
  n_src: 3                      # number of sources (source_1 to source_n)
  sample_rate: 16000            # sampling rate in Hz
  segment_duration: 5.0         # segment length in seconds (0 = whole file)

splits:
  train_ratio: 0.8              # 80% train
  val_ratio: 0.1                # 10% validation
  test_ratio: 0.1               # computed automatically
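
The dataset loader itself is not part of this upload. For illustration only, a deterministic way these ratios are commonly applied (the function name and the reproducible shuffle with the training seed are assumptions, not project code):

```python
import random

def split_indices(n_items, train_ratio=0.8, val_ratio=0.1, seed=42):
    """Shuffle indices reproducibly, then slice into train/val/test."""
    idx = list(range(n_items))
    random.Random(seed).shuffle(idx)
    n_train = int(n_items * train_ratio)
    n_val = int(n_items * val_ratio)
    # The test split is whatever remains, matching "computed automatically".
    return idx[:n_train], idx[n_train:n_train + n_val], idx[n_train + n_val:]
```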
configs/train.yaml ADDED
@@ -0,0 +1,35 @@
training:
  epochs: 200
  early_stopping: 20
  batch_size: 48
  accumulation_steps: 2         # effective batch = 48 × 2 = 96
  learning_rate: 0.00005
  grad_clip: 30.0               # loosened: clipping at 1.0 caused instabilities
  save_every: 10
  num_workers: 4
  seed: 42

scheduler:
  name: warmup_cosine
  warmup_epochs: 5              # increased: more warmup helps with little data
  min_lr: 0.0000001
optimizer:
  name: adam
  weight_decay: 0.001

model:
  n_filters: 256
  filter_length: 16
  stride: 8
  n_blocks: 6
  n_repeats: 3
  bn_chan: 128
  hid_chan: 256
  skip_chan: 128
  norm_type: gLN
  mask_act: relu
  gradient_checkpointing: true

paths:
  checkpoint_dir: checkpoints_3src_2
  log_dir: logs_3src_2
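
The `warmup_cosine` scheduler implementation is not included in this upload. A common formulation consistent with these settings, linear warmup to `learning_rate` followed by cosine decay to `min_lr`, is sketched below (per-epoch; an assumption about the schedule's shape, not the project's code):

```python
import math

def warmup_cosine_lr(epoch, base_lr=5e-5, min_lr=1e-7,
                     warmup_epochs=5, total_epochs=200):
    """Linear warmup for the first epochs, then cosine decay to min_lr."""
    if epoch < warmup_epochs:
        return base_lr * (epoch + 1) / warmup_epochs
    progress = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * progress))
```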
requirements.txt ADDED
@@ -0,0 +1,5 @@
torch>=2.0.0
torchaudio>=2.0.0
asteroid>=0.6.0
numpy>=1.24.0
matplotlib>=3.10.8
src/model.py ADDED
@@ -0,0 +1,114 @@
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

from asteroid.models import ConvTasNet


def build_model(n_src=5, sample_rate=8000,
                n_filters=512, filter_length=16,
                stride=8, n_blocks=8, n_repeats=3,
                bn_chan=128, hid_chan=512, skip_chan=128,
                norm_type="gLN", mask_act="relu",
                use_gradient_checkpointing=False):

    model = ConvTasNet(
        n_src=n_src, sample_rate=sample_rate,
        # Asteroid's ConvTasNet takes the encoder filter length as `kernel_size`.
        n_filters=n_filters, kernel_size=filter_length,
        stride=stride, n_blocks=n_blocks, n_repeats=n_repeats,
        bn_chan=bn_chan, hid_chan=hid_chan, skip_chan=skip_chan,
        norm_type=norm_type, mask_act=mask_act,
    )

    if use_gradient_checkpointing:
        _apply_gradient_checkpointing(model)
        print("[Model] Gradient checkpointing: ENABLED (-50% VRAM, +30% time)")
    else:
        print("[Model] Gradient checkpointing: disabled")

    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"[Model] Conv-TasNet | Trainable parameters: {n_params:,}")
    return model


def _apply_gradient_checkpointing(model):
    if not hasattr(model, "masker") or not hasattr(model.masker, "TCN"):
        print("[Warning] masker.TCN not found; gradient checkpointing not applied.")
        return

    original_blocks = list(model.masker.TCN.named_children())
    if not original_blocks:
        return

    for name, block in original_blocks:
        _wrap_block(model.masker.TCN, name, block)

    print(f"[Model] {len(original_blocks)} TCN blocks checkpointed.")


def _wrap_block(parent, name, block):
    class CheckpointedBlock(nn.Module):
        def __init__(self, inner):
            super().__init__()
            self.inner = inner

        def forward(self, x):
            if not x.requires_grad:
                x = x.requires_grad_(True)
            return checkpoint(self.inner, x, use_reentrant=False)

    setattr(parent, name, CheckpointedBlock(block))


def load_checkpoint(model, path, device="cpu"):
    """
    Load a checkpoint safely.
    Automatically handles the '.inner.' key mismatch caused by the
    gradient checkpointing wrapper (CheckpointedBlock).
    """
    ckpt = torch.load(path, map_location=device)
    state = ckpt.get("model_state_dict", ckpt)

    # Try loading as-is first
    missing, unexpected = model.load_state_dict(state, strict=False)
    missing_set = set(missing)
    unexpected_set = set(unexpected)

    # Case 1: checkpoint has .inner. but model doesn't → strip .inner.
    if any(".inner." in k for k in unexpected_set) and \
       any(".inner." not in k for k in missing_set):
        state = {k.replace(".inner.", "."): v for k, v in state.items()}
        model.load_state_dict(state, strict=True)
        print("[Model] '.inner.' stripped from checkpoint keys (GC ON → OFF)")

    # Case 2: model has .inner. but checkpoint doesn't → add .inner.
    elif any(".inner." in k for k in missing_set) and \
         any(".inner." not in k for k in unexpected_set):
        new_state = {}
        for k, v in state.items():
            if "masker.TCN." in k and ".inner." not in k:
                parts = k.split(".")
                parts.insert(3, "inner")
                k = ".".join(parts)
            new_state[k] = v
        model.load_state_dict(new_state, strict=True)
        print("[Model] '.inner.' added to checkpoint keys (GC OFF → ON)")

    # Case 3: loaded fine on the first try
    elif len(missing) == 0 and len(unexpected) == 0:
        print("[Model] Checkpoint loaded without key rewriting")

    else:
        raise RuntimeError(
            f"Cannot load checkpoint, unresolvable key mismatch:\n"
            f"  Missing   : {list(missing)[:3]}...\n"
            f"  Unexpected: {list(unexpected)[:3]}..."
        )

    epoch = ckpt.get("epoch", "?")
    val_loss = ckpt.get("best_val_loss", "?")
    print(f"[Model] Checkpoint loaded from {path} "
          f"(epoch {epoch}, val loss {val_loss})")
    return model
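
To make the `.inner.` remapping in `load_checkpoint` concrete: wrapping a TCN block in `CheckpointedBlock` moves its parameters one level down, so Case 2 rewrites the plain keys. A toy illustration (the parameter name is an example, not taken from an actual checkpoint):

```python
# A checkpoint saved without gradient checkpointing stores keys like:
plain_key = "masker.TCN.0.res_conv.weight"

# After wrapping, the same tensor lives under an extra ".inner" segment,
# which is exactly what parts.insert(3, "inner") produces:
parts = plain_key.split(".")
parts.insert(3, "inner")
print(".".join(parts))  # masker.TCN.0.inner.res_conv.weight
```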
src/separate.py ADDED
@@ -0,0 +1,141 @@
"""
separate.py - Source separation with the trained model.
Usage:
    python src/separate.py --mix data/mixture/mix_0/mixture.wav
    python src/separate.py --mix data/mixture/mix_0/mixture.wav --ckpt checkpoints/best.ckpt
    python src/separate.py --mix my_audio.wav --out_dir outputs/separated_audio
"""

import os
import argparse

import torch
import torchaudio
import yaml

from src.model import build_model, load_checkpoint


def load_config(path):
    with open(path, "r") as f:
        return yaml.safe_load(f)


def parse_args():
    p = argparse.ArgumentParser(description="Conv-TasNet source separation")
    p.add_argument("--mix", type=str, required=True,
                   help="Path to the mixture.wav file to separate")
    p.add_argument("--ckpt", type=str, default="checkpoints/best.ckpt",
                   help="Trained model checkpoint")
    p.add_argument("--out_dir", type=str, default="outputs/separated_audio",
                   help="Output directory for the separated sources")
    p.add_argument("--train_cfg", type=str, default="configs/train.yaml")
    p.add_argument("--data_cfg", type=str, default="configs/data.yaml")
    return p.parse_args()


def separate(mix_path, model, sample_rate, device, out_dir):
    """Load a mixture.wav, separate the sources, and save them as .wav files."""

    # ── Load the audio file ──────────────────
    mixture, sr = torchaudio.load(mix_path)

    if sr != sample_rate:
        print(f"  Resample {sr} Hz → {sample_rate} Hz")
        mixture = torchaudio.functional.resample(mixture, sr, sample_rate)

    # Downmix to mono (1, T)
    if mixture.shape[0] > 1:
        mixture = mixture.mean(dim=0, keepdim=True)

    print(f"  Duration: {mixture.shape[-1] / sample_rate:.2f}s "
          f"({mixture.shape[-1]} samples)")

    # ── Inference ────────────────────────────
    mixture = mixture.to(device)  # (1, T)
    with torch.no_grad():
        # Add a batch dimension: (1, T) → (1, 1, T); Asteroid accepts (batch, 1, time)
        est_sources = model(mixture.unsqueeze(0))  # (1, n_src, T)
        est_sources = est_sources.squeeze(0)       # (n_src, T)

    # ── Save the separated sources ───────────
    os.makedirs(out_dir, exist_ok=True)
    mix_name = os.path.splitext(os.path.basename(mix_path))[0]

    for i, src in enumerate(est_sources):
        src_cpu = src.unsqueeze(0).cpu()  # (1, T)

        # Peak-normalize to avoid clipping
        max_val = src_cpu.abs().max()
        if max_val > 0:
            src_cpu = src_cpu / max_val * 0.9

        out_path = os.path.join(out_dir, f"{mix_name}_source_{i+1}.wav")
        torchaudio.save(out_path, src_cpu, sample_rate)
        print(f"  ✓ Source {i+1} saved: {out_path}")

    return est_sources


def main():
    args = parse_args()
    tcfg = load_config(args.train_cfg)
    dcfg = load_config(args.data_cfg)

    mod = tcfg["model"]
    ds = dcfg["dataset"]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\n[Config] Device     : {device}")
    print(f"[Config] Checkpoint : {args.ckpt}")
    print(f"[Config] Mix file   : {args.mix}\n")

    # ── Build the model ──────────────────────
    model = build_model(
        n_src=ds["n_src"],
        sample_rate=ds["sample_rate"],
        n_filters=mod["n_filters"],
        filter_length=mod["filter_length"],
        stride=mod["stride"],
        n_blocks=mod["n_blocks"],
        n_repeats=mod["n_repeats"],
        bn_chan=mod["bn_chan"],
        hid_chan=mod["hid_chan"],
        skip_chan=mod["skip_chan"],
        norm_type=mod["norm_type"],
        mask_act=mod["mask_act"],
        use_gradient_checkpointing=False,  # not needed for inference
    )

    # ── Load the trained weights ─────────────
    if not os.path.exists(args.ckpt):
        raise FileNotFoundError(
            f"Checkpoint not found: {args.ckpt}\n"
            f"Run training first: python main.py train"
        )

    load_checkpoint(model, args.ckpt, device)
    model.to(device)
    model.eval()

    ckpt = torch.load(args.ckpt, map_location="cpu")
    epoch = ckpt.get("epoch", "?")
    val = ckpt.get("best_val_loss", None)
    if val is not None:
        print(f"[Model] Checkpoint loaded (epoch {epoch}, val loss {val:.4f})\n")
    else:
        print(f"[Model] Checkpoint loaded (epoch {epoch})\n")

    # ── Separation ───────────────────────────
    separate(
        mix_path=args.mix,
        model=model,
        sample_rate=ds["sample_rate"],
        device=device,
        out_dir=args.out_dir,
    )

    print(f"\n[Done] Separated sources in: {args.out_dir}/")


if __name__ == "__main__":
    main()
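
With the defaults above, `python src/separate.py --mix mixture.wav --ckpt best.ckpt` writes one peak-normalized file per source to `outputs/separated_audio/`, named `<mix>_source_1.wav` through `<mix>_source_3.wav`. Loading them back for a quick check:

```python
import torchaudio

for i in range(1, 4):  # n_src = 3
    wav, sr = torchaudio.load(f"outputs/separated_audio/mixture_source_{i}.wav")
    print(f"source {i}: {wav.shape[-1] / sr:.2f}s at {sr} Hz")
```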
training_metadata.json ADDED
@@ -0,0 +1,9 @@
{
  "checkpoint_file": "best.ckpt",
  "epoch": 68,
  "best_val_loss": -2.9099522034327188,
  "validation_si_snr_db": 2.9099522034327188,
  "contains_optimizer_state": true,
  "contains_scaler_state": true,
  "model_state_tensors": 261
}
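
Note that `best_val_loss` is the negative SI-SNR training objective, so the two metrics are the same value with opposite signs. Reading the metadata back:

```python
import json

with open("training_metadata.json") as f:
    meta = json.load(f)

# The SI-SNR in dB is the negated validation loss.
assert meta["validation_si_snr_db"] == -meta["best_val_loss"]
print(f"epoch {meta['epoch']}: {meta['validation_si_snr_db']:.2f} dB SI-SNR")
```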