| import torch |
| from torch.nn import Parameter |
| from ..models.factory import create_model_from_config |
|
|
def _copy_weights_to_ema(model, ema_copy):
    """Copy every entry of ``model.state_dict()`` into ``ema_copy`` in place.

    Args:
        model: Module whose state dict supplies the values.
        ema_copy: Module whose state-dict tensors are overwritten. It must
            expose the same state-dict keys as ``model``.

    Returns:
        The same ``ema_copy`` object, for call-site convenience.

    Raises:
        KeyError: If ``model`` has a state-dict key that ``ema_copy`` lacks.
    """
    # state_dict() builds a new mapping on each call, so fetch the target
    # mapping once instead of once per entry. The in-place copy_ below is what
    # actually updates ema_copy.
    ema_state = ema_copy.state_dict()
    for name, value in model.state_dict().items():
        if isinstance(value, Parameter):
            # Unwrap Parameter objects to their underlying tensor.
            value = value.data
        ema_state[name].copy_(value)
    return ema_copy


def create_training_wrapper_from_config(model_config, model):
    """Build the training wrapper that matches ``model_config['model_type']``.

    Args:
        model_config: Mapping with at least ``'model_type'`` and ``'training'``
            entries. The ``'training'`` mapping supplies the per-wrapper
            options (learning rate, EMA settings, optimizer configs, ...).
        model: The already-constructed model to wrap.

    Returns:
        A training wrapper instance for the requested model type.

    Raises:
        AssertionError: If ``'model_type'`` or ``'training'`` is missing.
        KeyError: If a required option (for example ``'learning_rate'`` or
            ``'sample_rate'`` for the autoencoder) is missing.
        ValueError: If a teacher model is configured without a checkpoint, or
            if the prior type is unknown.
        NotImplementedError: If the model type is not recognised.
    """
    model_type = model_config.get('model_type', None)
    assert model_type is not None, 'model_type must be specified in model config'

    training_config = model_config.get('training', None)
    assert training_config is not None, 'training config must be specified in model config'

    if model_type == 'autoencoder':
        from .autoencoders import AutoencoderTrainingWrapper

        use_ema = training_config.get("use_ema", False)

        ema_copy = None
        if use_ema:
            # NOTE(review): the original code builds the model twice and
            # discards the first instance. It looks redundant, but it is kept
            # because the factory may have side effects that are not visible
            # here - TODO confirm and collapse to a single call.
            create_model_from_config(model_config)
            ema_copy = _copy_weights_to_ema(model, create_model_from_config(model_config))

        latent_mask_ratio = training_config.get("latent_mask_ratio", 0.0)

        teacher_model = training_config.get("teacher_model", None)
        if teacher_model is not None:
            # Validate the checkpoint path before paying for model construction.
            teacher_model_ckpt = training_config.get("teacher_model_ckpt", None)
            if teacher_model_ckpt is None:
                raise ValueError("teacher_model_ckpt must be specified if teacher_model is specified")

            teacher_model = create_model_from_config(teacher_model)
            # The teacher is frozen and kept in eval mode.
            teacher_model = teacher_model.eval().requires_grad_(False)

            # NOTE(review): torch.load unpickles the file, so only trusted
            # checkpoints should be referenced from the config.
            teacher_model.load_state_dict(torch.load(teacher_model_ckpt)["state_dict"])

        return AutoencoderTrainingWrapper(
            model,
            lr=training_config["learning_rate"],
            warmup_steps=training_config.get("warmup_steps", 0),
            encoder_freeze_on_warmup=training_config.get("encoder_freeze_on_warmup", False),
            sample_rate=model_config["sample_rate"],
            loss_config=training_config.get("loss_configs", None),
            optimizer_configs=training_config.get("optimizer_configs", None),
            use_ema=use_ema,
            # ema_copy is already None whenever use_ema is falsy.
            ema_copy=ema_copy,
            force_input_mono=training_config.get("force_input_mono", False),
            latent_mask_ratio=latent_mask_ratio,
            teacher_model=teacher_model,
        )

    elif model_type == 'diffusion_uncond':
        from .diffusion import DiffusionUncondTrainingWrapper

        return DiffusionUncondTrainingWrapper(
            model,
            lr=training_config["learning_rate"],
            pre_encoded=training_config.get("pre_encoded", False),
        )

    elif model_type == 'diffusion_cond':
        from .diffusion import DiffusionCondTrainingWrapper

        return DiffusionCondTrainingWrapper(
            model,
            lr=training_config.get("learning_rate", None),
            mask_padding=training_config.get("mask_padding", False),
            mask_padding_dropout=training_config.get("mask_padding_dropout", 0.0),
            use_ema=training_config.get("use_ema", True),
            log_loss_info=training_config.get("log_loss_info", False),
            optimizer_configs=training_config.get("optimizer_configs", None),
            pre_encoded=training_config.get("pre_encoded", False),
            cfg_dropout_prob=training_config.get("cfg_dropout_prob", 0.1),
            timestep_sampler=training_config.get("timestep_sampler", "uniform"),
        )

    elif model_type == 'diffusion_prior':
        from .diffusion import DiffusionPriorTrainingWrapper
        from ..models.diffusion_prior import PriorType

        # Reject an unknown prior type before building the EMA model.
        prior_type = training_config.get("prior_type", "mono_stereo")
        if prior_type == "mono_stereo":
            prior_type_enum = PriorType.MonoToStereo
        else:
            raise ValueError(f"Unknown prior type: {prior_type}")

        ema_copy = _copy_weights_to_ema(model, create_model_from_config(model_config))

        return DiffusionPriorTrainingWrapper(
            model,
            lr=training_config["learning_rate"],
            ema_copy=ema_copy,
            prior_type=prior_type_enum,
            log_loss_info=training_config.get("log_loss_info", False),
            use_reconstruction_loss=training_config.get("use_reconstruction_loss", False),
        )

    elif model_type == 'diffusion_cond_inpaint':
        from .diffusion import DiffusionCondInpaintTrainingWrapper

        return DiffusionCondInpaintTrainingWrapper(
            model,
            lr=training_config.get("learning_rate", None),
            max_mask_segments=training_config.get("max_mask_segments", 10),
            log_loss_info=training_config.get("log_loss_info", False),
            optimizer_configs=training_config.get("optimizer_configs", None),
            use_ema=training_config.get("use_ema", True),
            pre_encoded=training_config.get("pre_encoded", False),
            cfg_dropout_prob=training_config.get("cfg_dropout_prob", 0.1),
            timestep_sampler=training_config.get("timestep_sampler", "uniform"),
        )

    elif model_type == 'diffusion_autoencoder':
        from .diffusion import DiffusionAutoencoderTrainingWrapper

        ema_copy = _copy_weights_to_ema(model, create_model_from_config(model_config))

        return DiffusionAutoencoderTrainingWrapper(
            model,
            ema_copy=ema_copy,
            lr=training_config["learning_rate"],
            use_reconstruction_loss=training_config.get("use_reconstruction_loss", False),
        )

    elif model_type == 'lm':
        from .lm import AudioLanguageModelTrainingWrapper

        # The EMA copy is built here regardless of the "use_ema" setting; the
        # flag is forwarded to the wrapper unchanged.
        ema_copy = _copy_weights_to_ema(model, create_model_from_config(model_config))

        return AudioLanguageModelTrainingWrapper(
            model,
            ema_copy=ema_copy,
            lr=training_config.get("learning_rate", None),
            use_ema=training_config.get("use_ema", False),
            optimizer_configs=training_config.get("optimizer_configs", None),
            pre_encoded=training_config.get("pre_encoded", False),
        )

    else:
        raise NotImplementedError(f'Unknown model type: {model_type}')
|
|
def create_demo_callback_from_config(model_config, **kwargs):
    """Build the demo callback that matches ``model_config['model_type']``.

    Args:
        model_config: Mapping with at least ``'model_type'`` and ``'training'``
            entries. Demo options are read from ``training['demo']`` (an empty
            mapping is used when that entry is absent).
        **kwargs: Extra keyword arguments. They are forwarded to the callback
            for every model type except ``'diffusion_uncond'`` and
            ``'diffusion_cond'``, which do not receive them.

    Returns:
        A demo callback instance for the requested model type.

    Raises:
        AssertionError: If ``'model_type'`` or ``'training'`` is missing.
        KeyError: If a required entry (for example ``'sample_rate'``, or
            ``'num_demos'`` for ``'diffusion_cond'``) is missing.
        NotImplementedError: If the model type is not recognised.
    """
    kind = model_config.get('model_type', None)
    assert kind is not None, 'model_type must be specified in model config'

    train_cfg = model_config.get('training', None)
    assert train_cfg is not None, 'training config must be specified in model config'

    demo_cfg = train_cfg.get("demo", {})

    # Each branch imports its callback lazily and builds the keyword
    # arguments in the same order the values were originally evaluated.
    if kind == 'autoencoder':
        from .autoencoders import AutoencoderDemoCallback

        options = {
            "demo_every": demo_cfg.get("demo_every", 2000),
            "sample_size": model_config["sample_size"],
            "sample_rate": model_config["sample_rate"],
        }
        return AutoencoderDemoCallback(**options, **kwargs)

    if kind == 'diffusion_uncond':
        from .diffusion import DiffusionUncondDemoCallback

        # Extra keyword arguments are not forwarded for this model type.
        options = {
            "demo_every": demo_cfg.get("demo_every", 2000),
            "demo_steps": demo_cfg.get("demo_steps", 250),
            "sample_rate": model_config["sample_rate"],
        }
        return DiffusionUncondDemoCallback(**options)

    if kind == "diffusion_autoencoder":
        from .diffusion import DiffusionAutoencoderDemoCallback

        options = {
            "demo_every": demo_cfg.get("demo_every", 2000),
            "demo_steps": demo_cfg.get("demo_steps", 250),
            "sample_size": model_config["sample_size"],
            "sample_rate": model_config["sample_rate"],
        }
        return DiffusionAutoencoderDemoCallback(**options, **kwargs)

    if kind == "diffusion_prior":
        from .diffusion import DiffusionPriorDemoCallback

        options = {
            "demo_every": demo_cfg.get("demo_every", 2000),
            "demo_steps": demo_cfg.get("demo_steps", 250),
            "sample_size": model_config["sample_size"],
            "sample_rate": model_config["sample_rate"],
        }
        return DiffusionPriorDemoCallback(**options, **kwargs)

    if kind == "diffusion_cond":
        from .diffusion import DiffusionCondDemoCallback

        # Extra keyword arguments are not forwarded for this model type.
        options = {
            "demo_every": demo_cfg.get("demo_every", 2000),
            "sample_size": model_config["sample_size"],
            "sample_rate": model_config["sample_rate"],
            "demo_steps": demo_cfg.get("demo_steps", 250),
            "num_demos": demo_cfg["num_demos"],
            "demo_cfg_scales": demo_cfg["demo_cfg_scales"],
            "demo_conditioning": demo_cfg.get("demo_cond", {}),
            "demo_cond_from_batch": demo_cfg.get("demo_cond_from_batch", False),
            "display_audio_cond": demo_cfg.get("display_audio_cond", False),
        }
        return DiffusionCondDemoCallback(**options)

    if kind == "diffusion_cond_inpaint":
        from .diffusion import DiffusionCondInpaintDemoCallback

        options = {
            "demo_every": demo_cfg.get("demo_every", 2000),
            "sample_size": model_config["sample_size"],
            "sample_rate": model_config["sample_rate"],
            "demo_steps": demo_cfg.get("demo_steps", 250),
            "demo_cfg_scales": demo_cfg["demo_cfg_scales"],
        }
        return DiffusionCondInpaintDemoCallback(**options, **kwargs)

    if kind == "lm":
        from .lm import AudioLanguageModelDemoCallback

        options = {
            "demo_every": demo_cfg.get("demo_every", 2000),
            "sample_size": model_config["sample_size"],
            "sample_rate": model_config["sample_rate"],
            "demo_cfg_scales": demo_cfg.get("demo_cfg_scales", [1]),
            "demo_conditioning": demo_cfg.get("demo_cond", None),
            "num_demos": demo_cfg.get("num_demos", 8),
        }
        return AudioLanguageModelDemoCallback(**options, **kwargs)

    raise NotImplementedError(f'Unknown model type: {kind}')