File size: 3,039 Bytes
fdc2b0b
1636761
 
08c5e28
1636761
 
08c5e28
1636761
 
08c5e28
1636761
 
08c5e28
1636761
 
08c5e28
1636761
 
08c5e28
 
1636761
fdc2b0b
1636761
 
 
 
08c5e28
1636761
08c5e28
 
1636761
08c5e28
1636761
 
 
 
 
 
 
08c5e28
1636761
 
 
08c5e28
1636761
 
 
08c5e28
 
 
 
 
 
 
 
 
 
 
 
 
1636761
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# DramaBox IC-LoRA training config — values become the defaults for
# `accelerate launch src/train.py --config configs/training_args.example.yaml`.
# Any flag explicitly passed on the CLI overrides the YAML.

# ── Data ───────────────────────────────────────────────────────────────────
# One entry per preprocessed dataset (output dirs from src/preprocess.py).
# The /path/to/... values below are placeholders; replace them before training.
data_dir:
  - /path/to/preprocessed_dataset_a/
  - /path/to/preprocessed_dataset_b/

# One index file per data_dir entry. Each line follows the format you fed to
# preprocess.py — see README "Prepare your index file".
# NOTE(review): entries are presumably matched to data_dir by position, so keep
# both lists the same length and in the same order — confirm in src/train.py.
speaker_index:
  - /path/to/preprocessed_dataset_a/index.txt
  - /path/to/preprocessed_dataset_b/index.txt

# Output directory for LoRA shards + logs (relative paths resolve against the
# repo root).
output_dir: tts_iclora_v1

# ── Base model ─────────────────────────────────────────────────────────────
# Train your LoRA on top of DramaBox itself (recommended) — the trimmed audio
# components are enough; no need to ship the raw LTX-2.3 base.
# NOTE(review): both checkpoint values are bare filenames; where they are
# looked up (cwd, repo root, or a download cache) is not stated here — confirm
# in src/train.py.
checkpoint: dramabox-dit-v1.safetensors
full_checkpoint: dramabox-audio-components.safetensors
base_model: dev          # 'dev' = ShiftedLogitNormal sampler; 'distilled' = DistilledTimestepSampler

# ── LoRA hyperparams (rank == alpha → scale = 1.0) ─────────────────────────
# lora_rank and lora_alpha are deliberately equal, which gives a LoRA scale of
# 1.0 (scale is presumably alpha / rank — confirm in src/train.py). Change
# them together unless you intend to rescale the adapter.
lora_rank: 128
lora_alpha: 128
lora_dropout: 0.1        # ~0.1 helps regularize on small datasets

# Resume an existing LoRA — step number parsed from the filename
# (e.g. lora_step_05000.safetensors → starts at step 5000).
# Uncomment the next line to enable.
# resume_lora: tts_iclora_v0/lora_step_05000.safetensors

# ── Voice-cloning reference tokens ─────────────────────────────────────────
# ref_ratio is a fraction and text_dropout (below) a probability, so both are
# expected to lie in [0, 1].
ref_ratio: 0.3           # fraction of training samples that get a ref-token tail
max_ref_tokens: 200      # cap on appended ref tokens after patchification

# Classifier-free guidance (CFG) training: probability of zeroing the text
# condition (forces reliance on the voice ref / unconditional path).
# NOTE(review): whether this is drawn independently of ref_ratio is not shown
# here — confirm in src/train.py.
text_dropout: 0.4

# ── Schedule ───────────────────────────────────────────────────────────────
# Cosine + 1e-4 = from-scratch fine-tune.
# Constant + 1e-5 = polish on top of an existing LoRA (use with `resume_lora`).
# When editing `lr`, keep the decimal point and the signed exponent
# (1.0e-04, 1.0e-05): YAML 1.1 loaders such as PyYAML read a bare `1e-5` as a
# string, not a float.
steps: 10000
lr: 1.0e-04
lr_scheduler: cosine
warmup_steps: 500

# NOTE(review): effective batch is presumably batch_size * grad_accum (= 4)
# per process — confirm against src/train.py and your accelerate config.
batch_size: 1
grad_accum: 4
max_grad_norm: 1.0

# NOTE(review): save_every / log_every look like intervals measured in
# training steps — confirm in src/train.py.
save_every: 500
log_every: 50
seed: 53

# Optional per-save-step validation pass. Generates a sample for every speaker
# in the val_config so you can A/B listen during training.
# val_config: configs/val_config.example.yaml