Upload 2 files

- config.yaml  +105  -0
- main.py      +195  -0

config.yaml  ADDED
@@ -0,0 +1,105 @@
log_dir: "Models/Output"
save_freq: 5
log_interval: 10
device: "cuda"
epochs: 50
batch_size: 8
max_len: 400
pretrained_model: ""
second_stage_load_pretrained: true
load_only_params: true

external_models:
  asr:
    input_dim: 80
    hidden_dim: 256
    n_token: 178
  plbert:
    vocab_size: 178
    hidden_size: 768
    num_attention_heads: 12
    intermediate_size: 2048
    dropout: 0.1

data_params:
  train_data: "shethjenil/audiodata"
  root_path: ""
  min_length: 50

preprocess_params:
  sr: 24000
  n_fft: 2048
  win_length: 1200
  hop_length: 300

model_params:
  multispeaker: true
  dim_in: 64
  hidden_dim: 128
  max_conv_dim: 512
  n_layer: 2
  n_mels: 80
  n_token: 178
  max_dur: 50
  style_dim: 128
  dropout: 0.2
  decoder:
    type: "istftnet"
    hidden_dim: 256
    decoder_out_dim: 256
    asr_res_in: 128
    resblock_kernel_sizes: [3, 3]
    upsample_rates: [10, 6]
    upsample_initial_channel: 256
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_kernel_sizes: [20, 12]
    gen_istft_n_fft: 20
    gen_istft_hop_size: 5
    disable_complex: true
  slm:
    model: "microsoft/wavlm-base-plus"
    sr: 16000
    hidden: 768
    nlayers: 13
    initial_channel: 64
  diffusion:
    embedding_mask_proba: 0.1
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2
    dist:
      sigma_data: 0.2
      estimate_sigma_data: true
      mean: -3.0
      std: 1.0

loss_params:
  lambda_mel: 5.0
  lambda_gen: 1.0
  lambda_slm: 1.0
  lambda_mono: 1.0
  lambda_s2s: 1.0
  lambda_F0: 1.0
  lambda_norm: 1.0
  lambda_dur: 1.0
  lambda_ce: 20.0
  lambda_sty: 1.0
  lambda_diff: 1.0
  diff_epoch: 10
  joint_epoch: 30

optimizer_params:
  lr: 0.0001
  bert_lr: 0.00001
  ft_lr: 0.0001

slmadv_params:
  min_len: 400
  max_len: 500
  batch_percentage: 0.5
  iter: 10
  thresh: 5.0
  scale: 0.01
  sig: 1.5
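A quick way to sanity-check this file before conversion is to load it as a plain dict; a minimal sketch, assuming PyYAML is available (main.py below hints at StyleTTS2Config.from_yaml("config.yaml") as the intended entry point, so this is only for inspection):

    import yaml

    # Plain-dict load for inspection only; the conversion script builds
    # StyleTTS2Config from a dict or via from_yaml.
    with open("config.yaml") as f:
        cfg = yaml.safe_load(f)

    print(cfg["model_params"]["decoder"]["type"])   # "istftnet"
    print(cfg["preprocess_params"]["sr"])           # 24000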
main.py  ADDED
@@ -0,0 +1,195 @@
# %%bash
# pip uninstall -q styletts2 -y
# pip install git+https://github.com/dummyjenil/styletts2 -q


import torch
from torch import nn
import onnx
import torch.nn.utils.parametrize as parametrize
from styletts2.models.styletts2 import StyleTTS2Model, StyleTTS2Config
from onnx_toolkit import ONNXParser
from safetensors.torch import save_file
from huggingface_hub import hf_hub_download


# -----------------------------
# Utils
# -----------------------------
def get_layer_from_key(model: nn.Module, key: str):
    # Walk the module tree following a state_dict key, stopping before the
    # final parameter name (e.g. "encoder.lstm.weight_ih_l0" -> encoder.lstm).
    module = model
    for part in key.split(".")[:-1]:
        module = module[int(part)] if part.isdigit() else getattr(module, part)
    return module


def is_bidirectional(node):
    for attr in node.attribute:
        if attr.name == "direction":
            value = onnx.helper.get_attribute_value(attr)
            if isinstance(value, bytes):
                value = value.decode("utf-8")
            return value == "bidirectional"
    return False


# -----------------------------
# LSTM Conversion
# -----------------------------
def convert_onnx_lstm(W, R, B, layer_name="lstm", bidirectional=False):
    # The bidirectional flag is informational only; the number of directions
    # is taken from W.shape[0] below.
    def reorder_gates(w):
        # ONNX packs LSTM gates as [i, o, f, c]; PyTorch expects [i, f, c, o].
        i, o, f, c = torch.chunk(w, 4, dim=0)
        return torch.cat([i, f, c, o], dim=0)

    state_dict = {}

    for d in range(W.shape[0]):
        suffix = "" if d == 0 else "_reverse"

        w_ih = reorder_gates(torch.tensor(W[d]))
        w_hh = reorder_gates(torch.tensor(R[d]))

        b_ih, b_hh = torch.chunk(torch.tensor(B[d]), 2, dim=0)
        b_ih = reorder_gates(b_ih)
        b_hh = reorder_gates(b_hh)

        state_dict[f"{layer_name}.weight_ih_l0{suffix}"] = w_ih
        state_dict[f"{layer_name}.weight_hh_l0{suffix}"] = w_hh
        state_dict[f"{layer_name}.bias_ih_l0{suffix}"] = b_ih
        state_dict[f"{layer_name}.bias_hh_l0{suffix}"] = b_hh

    return state_dict

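# Illustrative sketch (not part of the conversion flow): shapes follow the
# ONNX LSTM spec for one unidirectional layer with hidden_size=4 and
# input_size=3; "demo_lstm" is a made-up layer name.
# import numpy as np
# W = np.zeros((1, 4 * 4, 3), dtype=np.float32)   # [num_dirs, 4*hidden, input]
# R = np.zeros((1, 4 * 4, 4), dtype=np.float32)   # [num_dirs, 4*hidden, hidden]
# B = np.zeros((1, 8 * 4), dtype=np.float32)      # [num_dirs, 8*hidden]
# sd = convert_onnx_lstm(W, R, B, layer_name="demo_lstm")
# # -> keys: demo_lstm.weight_ih_l0 / weight_hh_l0 / bias_ih_l0 / bias_hh_l0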

# -----------------------------
# MAIN FUNCTION (UPDATED)
# -----------------------------
def convert_model_from_local(
    onnx_path: str,
    config,
    save_path: str
):
    """
    onnx_path: local path (a path returned by hf_hub_download also works)
    config: StyleTTS2Config OR dict
    save_path: output safetensors path
    """

    # ---- Config handling ----
    if isinstance(config, dict):
        config = StyleTTS2Config(**config)

    model = StyleTTS2Model(config)

    # Remove unused modules
    for attr in [
        "wd", "msd", "mpd", "pitch_extractor",
        "text_aligner", "diffusion",
        "predictor_encoder", "style_encoder"
    ]:
        if hasattr(model, attr):
            delattr(model, attr)

    # Remove parametrizations
    for module in model.modules():
        if hasattr(module, "parametrizations") and "weight" in module.parametrizations:
            parametrize.remove_parametrizations(module, "weight", leave_parametrized=True)

    state_dict = model.state_dict()

    # ---- Load ONNX (LOCAL PATH) ----
    m = ONNXParser(onnx_path)

    # -------- LSTM handling --------
    pytorch_lstm = {
        name: module for name, module in model.named_modules()
        if isinstance(module, nn.LSTM)
    }

    # Map ONNX node names like "/encoder/lstm/LSTM" to PyTorch module paths
    # ("encoder.lstm"); removesuffix/removeprefix strip exact strings (Python 3.9+).
    onnx_lstm_layers = {
        i.name.removesuffix("/LSTM").removeprefix("/").replace("/", "."): i
        for i in m.find().find_by_op_type("LSTM")
    }

    predefined_dict = {}

    for pt_name in pytorch_lstm:
        if pt_name not in onnx_lstm_layers:
            continue

        node = onnx_lstm_layers[pt_name]
        block = m.find().find_by_name(node.name, exact=True)

        tensors = list(block.tensor().values())
        if len(tensors) != 3:
            continue

        w, r, b = tensors

        converted = convert_onnx_lstm(
            w, r, b,
            pt_name,
            is_bidirectional(block.single_node)
        )

        predefined_dict.update(converted)

    # -------- Build state_dict --------
    finder = m.find()
    new_state_dict = {}

    for name, tensor in state_dict.items():
        full_key = "kmodel." + name
        results = finder.find_by_tensor(full_key)

        if results:
            new_state_dict[name] = torch.tensor(results[0].tensor()[full_key])
            continue

        module = get_layer_from_key(model, name)

        if isinstance(module, nn.LSTM) and name in predefined_dict:
            new_state_dict[name] = predefined_dict[name]
            continue

        new_state_dict[name] = tensor  # fallback

    # Load weights
    new_state_dict['decoder.generator.stft.window'] = model.decoder.generator.stft.window.clone()
    model.load_state_dict(new_state_dict)

    # Make contiguous
    final_sd = {
        k: v.contiguous() if not v.is_contiguous() else v
        for k, v in model.state_dict().items()
    }

    save_file(final_sd, save_path)



# config = StyleTTS2Config.from_yaml("config.yaml")
# convert_model_from_local(
#     onnx_path=hf_hub_download("KittenML/kitten-tts-nano-0.8-fp32", "kitten_tts_nano_v0_8.onnx"),
#     config=config,
#     save_path="mini_model.safetensors"
# )



# ----
# quantization is still pending


# config.model_params.style_dim = 512
# config.model_params.hidden_dim = 512
# config.model_params.decoder.hidden_dim = 1024
# config.model_params.decoder.decoder_out_dim = 512
# config.model_params.decoder.asr_res_in = 256
# config.model_params.decoder.upsample_initial_channel = 512
# convert_model_from_local(
#     onnx_path=hf_hub_download("KittenML/kitten-tts-mini-0.8", "kitten_tts_mini_v0_8.onnx"),
#     config=config,
#     save_path="mini_model.safetensors"
# )
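One way to verify the exported weights is to load the .safetensors file back into a freshly built model; a minimal sketch under the same assumptions as the commented usage above (the styletts2 fork and the config.yaml from this commit):

    from safetensors.torch import load_file
    from styletts2.models.styletts2 import StyleTTS2Model, StyleTTS2Config

    config = StyleTTS2Config.from_yaml("config.yaml")
    model = StyleTTS2Model(config)
    sd = load_file("mini_model.safetensors")
    # strict=False because convert_model_from_local deletes training-only
    # submodules (discriminators, aligner, diffusion) before saving.
    missing, unexpected = model.load_state_dict(sd, strict=False)
    print(len(missing), len(unexpected))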