Spaces:
Running on Zero
Running on Zero
File size: 3,039 Bytes
# DramaBox IC-LoRA training config — values become the defaults for
# `accelerate launch src/train.py --config configs/training_args.example.yaml`.
# Any flag explicitly passed on the CLI overrides the YAML.
# ── Data ────────────────────────────────────────────────────────────────────
# Preprocessed datasets, one list entry each (the output directories written
# by src/preprocess.py).
data_dir:
  - /path/to/preprocessed_dataset_a/
  - /path/to/preprocessed_dataset_b/

# Index files, one per data_dir entry. Every line uses the same format that
# was fed to preprocess.py — see README "Prepare your index file".
speaker_index:
  - /path/to/preprocessed_dataset_a/index.txt
  - /path/to/preprocessed_dataset_b/index.txt

# Where LoRA shards and logs are written; a relative path resolves against
# the repo root.
output_dir: tts_iclora_v1
# ── Base model ──────────────────────────────────────────────────────────────
# Recommended: train the LoRA on top of DramaBox itself — the trimmed audio
# components are enough, so there is no need to ship the raw LTX-2.3 base.
checkpoint: dramabox-dit-v1.safetensors
full_checkpoint: dramabox-audio-components.safetensors
# Sampler selection:
#   'dev'       = ShiftedLogitNormal sampler
#   'distilled' = DistilledTimestepSampler
base_model: dev
# ── LoRA hyperparams (rank == alpha → scale = 1.0) ──────────────────────────
lora_rank: 128
lora_alpha: 128
# A value around 0.1 helps regularize on small datasets.
lora_dropout: 0.1

# To resume an existing LoRA, uncomment the line below — the step number is
# parsed from the filename
# (e.g. lora_step_05000.safetensors → starts at step 5000).
# resume_lora: tts_iclora_v0/lora_step_05000.safetensors
# ── Voice-cloning reference tokens ──────────────────────────────────────────
# Fraction of training samples that get a ref-token tail.
ref_ratio: 0.3
# Cap on appended ref tokens after patchification.
max_ref_tokens: 200

# CFG training: probability of zeroing the text condition, which forces
# reliance on the voice ref / unconditional path.
text_dropout: 0.4
# ── Schedule ────────────────────────────────────────────────────────────────
# Cosine + 1e-4 = from-scratch fine-tune.
# Constant + 1e-5 = polish on top of an existing LoRA (use with `resume_lora`).
steps: 10000
# Keep the decimal point and the signed exponent when editing `lr`: some
# YAML 1.1 loaders (e.g. PyYAML) read a bare `1e-4` as a string, not a float.
lr: 1.0e-04
lr_scheduler: cosine
warmup_steps: 500
batch_size: 1
grad_accum: 4
max_grad_norm: 1.0
save_every: 500
log_every: 50
seed: 53
# Optional per-save-step validation pass. Generates a sample for every speaker
# in the val_config so you can A/B listen during training.
# val_config: configs/val_config.example.yaml
|