Leon299 committed on
Commit
8337fa0
·
verified ·
1 Parent(s): 834120b

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. MuCodec/.gitattributes +2 -0
  2. MuCodec/.gitignore +3 -0
  3. MuCodec/LICENSE +21 -0
  4. MuCodec/LICENSE_weights +399 -0
  5. MuCodec/__pycache__/generate.cpython-310.pyc +0 -0
  6. MuCodec/__pycache__/generate.cpython-312.pyc +0 -0
  7. MuCodec/__pycache__/model.cpython-310.pyc +0 -0
  8. MuCodec/__pycache__/model.cpython-312.pyc +0 -0
  9. MuCodec/configs/models/transformer2D.json +25 -0
  10. MuCodec/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json +14 -0
  11. MuCodec/generate.py +247 -0
  12. MuCodec/libs/rvq/__pycache__/descript_quantize3.cpython-310.pyc +0 -0
  13. MuCodec/libs/rvq/descript_quantize3.py +298 -0
  14. MuCodec/model.py +367 -0
  15. MuCodec/models/attention.py +682 -0
  16. MuCodec/models/transformer_2d_flow.py +545 -0
  17. MuCodec/mp3_to_code.py +187 -0
  18. MuCodec/muq_dev/test.py +22 -0
  19. MuCodec/readme.md +67 -0
  20. MuCodec/requirements.txt +335 -0
  21. MuCodec/tools/get_melvaehifigan48k.py +1551 -0
  22. MuCodec/tools/torch_tools.py +100 -0
  23. __pycache__/audio_tokens.cpython-310.pyc +0 -0
  24. __pycache__/audio_tokens.cpython-312.pyc +0 -0
  25. __pycache__/condition_encoders.cpython-310.pyc +0 -0
  26. __pycache__/condition_encoders.cpython-312.pyc +0 -0
  27. __pycache__/dataset.cpython-310.pyc +0 -0
  28. __pycache__/dataset.cpython-312.pyc +0 -0
  29. __pycache__/decoders.cpython-310.pyc +0 -0
  30. __pycache__/decoders.cpython-312.pyc +0 -0
  31. __pycache__/inference_full.cpython-310.pyc +0 -0
  32. __pycache__/inference_full.cpython-312.pyc +0 -0
  33. __pycache__/modelling_qwen3.cpython-310.pyc +0 -0
  34. __pycache__/modelling_qwen3.cpython-312.pyc +0 -0
  35. __pycache__/runtime_utils.cpython-310.pyc +0 -0
  36. __pycache__/runtime_utils.cpython-312.pyc +0 -0
  37. audio_tokens.py +21 -0
  38. batch_infer_checkpoints.py +402 -0
  39. condition_encoders.py +149 -0
  40. dataset.py +513 -0
  41. decoders.py +158 -0
  42. inference_full.py +1084 -0
  43. modelling_qwen3.py +237 -0
  44. muse_mucodec_chord.ds/dataset_dict.json +1 -0
  45. runtime_utils.py +111 -0
  46. train.py +259 -0
  47. vocab/__init__.py +51 -0
  48. vocab/chord.py +144 -0
  49. vocab/sections.py +105 -0
  50. wandb/debug-cli.root.log +0 -0
MuCodec/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
2
+ *.pth filter=lfs diff=lfs merge=lfs -text
MuCodec/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__
2
+ *.pt
3
+ *.pth
MuCodec/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Meta Platforms, Inc. and affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
MuCodec/LICENSE_weights ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More_considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+ Section 1 -- Definitions.
71
+
72
+ a. Adapted Material means material subject to Copyright and Similar
73
+ Rights that is derived from or based upon the Licensed Material
74
+ and in which the Licensed Material is translated, altered,
75
+ arranged, transformed, or otherwise modified in a manner requiring
76
+ permission under the Copyright and Similar Rights held by the
77
+ Licensor. For purposes of this Public License, where the Licensed
78
+ Material is a musical work, performance, or sound recording,
79
+ Adapted Material is always produced where the Licensed Material is
80
+ synched in timed relation with a moving image.
81
+
82
+ b. Adapter's License means the license You apply to Your Copyright
83
+ and Similar Rights in Your contributions to Adapted Material in
84
+ accordance with the terms and conditions of this Public License.
85
+
86
+ c. Copyright and Similar Rights means copyright and/or similar rights
87
+ closely related to copyright including, without limitation,
88
+ performance, broadcast, sound recording, and Sui Generis Database
89
+ Rights, without regard to how the rights are labeled or
90
+ categorized. For purposes of this Public License, the rights
91
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
92
+ Rights.
93
+ d. Effective Technological Measures means those measures that, in the
94
+ absence of proper authority, may not be circumvented under laws
95
+ fulfilling obligations under Article 11 of the WIPO Copyright
96
+ Treaty adopted on December 20, 1996, and/or similar international
97
+ agreements.
98
+
99
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
100
+ any other exception or limitation to Copyright and Similar Rights
101
+ that applies to Your use of the Licensed Material.
102
+
103
+ f. Licensed Material means the artistic or literary work, database,
104
+ or other material to which the Licensor applied this Public
105
+ License.
106
+
107
+ g. Licensed Rights means the rights granted to You subject to the
108
+ terms and conditions of this Public License, which are limited to
109
+ all Copyright and Similar Rights that apply to Your use of the
110
+ Licensed Material and that the Licensor has authority to license.
111
+
112
+ h. Licensor means the individual(s) or entity(ies) granting rights
113
+ under this Public License.
114
+
115
+ i. NonCommercial means not primarily intended for or directed towards
116
+ commercial advantage or monetary compensation. For purposes of
117
+ this Public License, the exchange of the Licensed Material for
118
+ other material subject to Copyright and Similar Rights by digital
119
+ file-sharing or similar means is NonCommercial provided there is
120
+ no payment of monetary compensation in connection with the
121
+ exchange.
122
+
123
+ j. Share means to provide material to the public by any means or
124
+ process that requires permission under the Licensed Rights, such
125
+ as reproduction, public display, public performance, distribution,
126
+ dissemination, communication, or importation, and to make material
127
+ available to the public including in ways that members of the
128
+ public may access the material from a place and at a time
129
+ individually chosen by them.
130
+
131
+ k. Sui Generis Database Rights means rights other than copyright
132
+ resulting from Directive 96/9/EC of the European Parliament and of
133
+ the Council of 11 March 1996 on the legal protection of databases,
134
+ as amended and/or succeeded, as well as other essentially
135
+ equivalent rights anywhere in the world.
136
+
137
+ l. You means the individual or entity exercising the Licensed Rights
138
+ under this Public License. Your has a corresponding meaning.
139
+
140
+ Section 2 -- Scope.
141
+
142
+ a. License grant.
143
+
144
+ 1. Subject to the terms and conditions of this Public License,
145
+ the Licensor hereby grants You a worldwide, royalty-free,
146
+ non-sublicensable, non-exclusive, irrevocable license to
147
+ exercise the Licensed Rights in the Licensed Material to:
148
+
149
+ a. reproduce and Share the Licensed Material, in whole or
150
+ in part, for NonCommercial purposes only; and
151
+
152
+ b. produce, reproduce, and Share Adapted Material for
153
+ NonCommercial purposes only.
154
+
155
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
156
+ Exceptions and Limitations apply to Your use, this Public
157
+ License does not apply, and You do not need to comply with
158
+ its terms and conditions.
159
+
160
+ 3. Term. The term of this Public License is specified in Section
161
+ 6(a).
162
+
163
+ 4. Media and formats; technical modifications allowed. The
164
+ Licensor authorizes You to exercise the Licensed Rights in
165
+ all media and formats whether now known or hereafter created,
166
+ and to make technical modifications necessary to do so. The
167
+ Licensor waives and/or agrees not to assert any right or
168
+ authority to forbid You from making technical modifications
169
+ necessary to exercise the Licensed Rights, including
170
+ technical modifications necessary to circumvent Effective
171
+ Technological Measures. For purposes of this Public License,
172
+ simply making modifications authorized by this Section 2(a)
173
+ (4) never produces Adapted Material.
174
+
175
+ 5. Downstream recipients.
176
+
177
+ a. Offer from the Licensor -- Licensed Material. Every
178
+ recipient of the Licensed Material automatically
179
+ receives an offer from the Licensor to exercise the
180
+ Licensed Rights under the terms and conditions of this
181
+ Public License.
182
+
183
+ b. No downstream restrictions. You may not offer or impose
184
+ any additional or different terms or conditions on, or
185
+ apply any Effective Technological Measures to, the
186
+ Licensed Material if doing so restricts exercise of the
187
+ Licensed Rights by any recipient of the Licensed
188
+ Material.
189
+
190
+ 6. No endorsement. Nothing in this Public License constitutes or
191
+ may be construed as permission to assert or imply that You
192
+ are, or that Your use of the Licensed Material is, connected
193
+ with, or sponsored, endorsed, or granted official status by,
194
+ the Licensor or others designated to receive attribution as
195
+ provided in Section 3(a)(1)(A)(i).
196
+
197
+ b. Other rights.
198
+
199
+ 1. Moral rights, such as the right of integrity, are not
200
+ licensed under this Public License, nor are publicity,
201
+ privacy, and/or other similar personality rights; however, to
202
+ the extent possible, the Licensor waives and/or agrees not to
203
+ assert any such rights held by the Licensor to the limited
204
+ extent necessary to allow You to exercise the Licensed
205
+ Rights, but not otherwise.
206
+
207
+ 2. Patent and trademark rights are not licensed under this
208
+ Public License.
209
+
210
+ 3. To the extent possible, the Licensor waives any right to
211
+ collect royalties from You for the exercise of the Licensed
212
+ Rights, whether directly or through a collecting society
213
+ under any voluntary or waivable statutory or compulsory
214
+ licensing scheme. In all other cases the Licensor expressly
215
+ reserves any right to collect such royalties, including when
216
+ the Licensed Material is used other than for NonCommercial
217
+ purposes.
218
+
219
+ Section 3 -- License Conditions.
220
+
221
+ Your exercise of the Licensed Rights is expressly made subject to the
222
+ following conditions.
223
+
224
+ a. Attribution.
225
+
226
+ 1. If You Share the Licensed Material (including in modified
227
+ form), You must:
228
+
229
+ a. retain the following if it is supplied by the Licensor
230
+ with the Licensed Material:
231
+
232
+ i. identification of the creator(s) of the Licensed
233
+ Material and any others designated to receive
234
+ attribution, in any reasonable manner requested by
235
+ the Licensor (including by pseudonym if
236
+ designated);
237
+
238
+ ii. a copyright notice;
239
+
240
+ iii. a notice that refers to this Public License;
241
+
242
+ iv. a notice that refers to the disclaimer of
243
+ warranties;
244
+
245
+ v. a URI or hyperlink to the Licensed Material to the
246
+ extent reasonably practicable;
247
+
248
+ b. indicate if You modified the Licensed Material and
249
+ retain an indication of any previous modifications; and
250
+
251
+ c. indicate the Licensed Material is licensed under this
252
+ Public License, and include the text of, or the URI or
253
+ hyperlink to, this Public License.
254
+
255
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
256
+ reasonable manner based on the medium, means, and context in
257
+ which You Share the Licensed Material. For example, it may be
258
+ reasonable to satisfy the conditions by providing a URI or
259
+ hyperlink to a resource that includes the required
260
+ information.
261
+
262
+ 3. If requested by the Licensor, You must remove any of the
263
+ information required by Section 3(a)(1)(A) to the extent
264
+ reasonably practicable.
265
+
266
+ 4. If You Share Adapted Material You produce, the Adapter's
267
+ License You apply must not prevent recipients of the Adapted
268
+ Material from complying with this Public License.
269
+
270
+ Section 4 -- Sui Generis Database Rights.
271
+
272
+ Where the Licensed Rights include Sui Generis Database Rights that
273
+ apply to Your use of the Licensed Material:
274
+
275
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
276
+ to extract, reuse, reproduce, and Share all or a substantial
277
+ portion of the contents of the database for NonCommercial purposes
278
+ only;
279
+
280
+ b. if You include all or a substantial portion of the database
281
+ contents in a database in which You have Sui Generis Database
282
+ Rights, then the database in which You have Sui Generis Database
283
+ Rights (but not its individual contents) is Adapted Material; and
284
+
285
+ c. You must comply with the conditions in Section 3(a) if You Share
286
+ all or a substantial portion of the contents of the database.
287
+
288
+ For the avoidance of doubt, this Section 4 supplements and does not
289
+ replace Your obligations under this Public License where the Licensed
290
+ Rights include other Copyright and Similar Rights.
291
+
292
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
293
+
294
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
295
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
296
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
297
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
298
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
299
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
300
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
301
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
302
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
303
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
304
+
305
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
306
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
307
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
308
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
309
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
310
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
311
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
312
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
313
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
314
+
315
+ c. The disclaimer of warranties and limitation of liability provided
316
+ above shall be interpreted in a manner that, to the extent
317
+ possible, most closely approximates an absolute disclaimer and
318
+ waiver of all liability.
319
+
320
+ Section 6 -- Term and Termination.
321
+
322
+ a. This Public License applies for the term of the Copyright and
323
+ Similar Rights licensed here. However, if You fail to comply with
324
+ this Public License, then Your rights under this Public License
325
+ terminate automatically.
326
+
327
+ b. Where Your right to use the Licensed Material has terminated under
328
+ Section 6(a), it reinstates:
329
+
330
+ 1. automatically as of the date the violation is cured, provided
331
+ it is cured within 30 days of Your discovery of the
332
+ violation; or
333
+
334
+ 2. upon express reinstatement by the Licensor.
335
+
336
+ For the avoidance of doubt, this Section 6(b) does not affect any
337
+ right the Licensor may have to seek remedies for Your violations
338
+ of this Public License.
339
+
340
+ c. For the avoidance of doubt, the Licensor may also offer the
341
+ Licensed Material under separate terms or conditions or stop
342
+ distributing the Licensed Material at any time; however, doing so
343
+ will not terminate this Public License.
344
+
345
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
346
+ License.
347
+
348
+ Section 7 -- Other Terms and Conditions.
349
+
350
+ a. The Licensor shall not be bound by any additional or different
351
+ terms or conditions communicated by You unless expressly agreed.
352
+
353
+ b. Any arrangements, understandings, or agreements regarding the
354
+ Licensed Material not stated herein are separate from and
355
+ independent of the terms and conditions of this Public License.
356
+
357
+ Section 8 -- Interpretation.
358
+
359
+ a. For the avoidance of doubt, this Public License does not, and
360
+ shall not be interpreted to, reduce, limit, restrict, or impose
361
+ conditions on any use of the Licensed Material that could lawfully
362
+ be made without permission under this Public License.
363
+
364
+ b. To the extent possible, if any provision of this Public License is
365
+ deemed unenforceable, it shall be automatically reformed to the
366
+ minimum extent necessary to make it enforceable. If the provision
367
+ cannot be reformed, it shall be severed from this Public License
368
+ without affecting the enforceability of the remaining terms and
369
+ conditions.
370
+
371
+ c. No term or condition of this Public License will be waived and no
372
+ failure to comply consented to unless expressly agreed to by the
373
+ Licensor.
374
+
375
+ d. Nothing in this Public License constitutes or may be interpreted
376
+ as a limitation upon, or waiver of, any privileges and immunities
377
+ that apply to the Licensor or You, including from the legal
378
+ processes of any jurisdiction or authority.
379
+
380
+ =======================================================================
381
+
382
+ Creative Commons is not a party to its public
383
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
384
+ its public licenses to material it publishes and in those instances
385
+ will be considered the “Licensor.” The text of the Creative Commons
386
+ public licenses is dedicated to the public domain under the CC0 Public
387
+ Domain Dedication. Except for the limited purpose of indicating that
388
+ material is shared under a Creative Commons public license or as
389
+ otherwise permitted by the Creative Commons policies published at
390
+ creativecommons.org/policies, Creative Commons does not authorize the
391
+ use of the trademark "Creative Commons" or any other trademark or logo
392
+ of Creative Commons without its prior written consent including,
393
+ without limitation, in connection with any unauthorized modifications
394
+ to any of its public licenses or any other arrangements,
395
+ understandings, or agreements concerning use of licensed material. For
396
+ the avoidance of doubt, this paragraph does not form part of the
397
+ public licenses.
398
+
399
+ Creative Commons may be contacted at creativecommons.org.
MuCodec/__pycache__/generate.cpython-310.pyc ADDED
Binary file (8.18 kB). View file
 
MuCodec/__pycache__/generate.cpython-312.pyc ADDED
Binary file (18 kB). View file
 
MuCodec/__pycache__/model.cpython-310.pyc ADDED
Binary file (12.2 kB). View file
 
MuCodec/__pycache__/model.cpython-312.pyc ADDED
Binary file (21.9 kB). View file
 
MuCodec/configs/models/transformer2D.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Transformer2DModel",
3
+ "activation_fn": "gelu-approximate",
4
+ "attention_bias": true,
5
+ "attention_head_dim": 72,
6
+ "attention_type": "default",
7
+ "cross_attention_dim": null,
8
+ "double_self_attention": false,
9
+ "dropout": 0.0,
10
+ "in_channels": 96,
11
+ "norm_elementwise_affine": false,
12
+ "norm_eps": 1e-06,
13
+ "norm_num_groups": 32,
14
+ "norm_type": "ada_norm_single",
15
+ "num_attention_heads": 22,
16
+ "num_embeds_ada_norm": 1000,
17
+ "num_layers": 24,
18
+ "num_vector_embeds": null,
19
+ "only_cross_attention": false,
20
+ "out_channels": 32,
21
+ "patch_size": 2,
22
+ "sample_size": 384,
23
+ "upcast_attention": false,
24
+ "use_linear_projection": false
25
+ }
MuCodec/configs/scheduler/stable_diffusion_2.1_largenoise_sample.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.8.0",
4
+ "beta_end": 0.02,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.0015,
7
+ "clip_sample": false,
8
+ "num_train_timesteps": 1000,
9
+ "prediction_type": "sample",
10
+ "set_alpha_to_one": false,
11
+ "skip_prk_steps": true,
12
+ "steps_offset": 1,
13
+ "trained_betas": null
14
+ }
MuCodec/generate.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import torch
3
+ from tqdm import tqdm
4
+ import sys
5
+ from model import PromptCondAudioDiffusion
6
+ from diffusers import DDIMScheduler, DDPMScheduler
7
+ import torchaudio
8
+ import librosa
9
+ import os
10
+ import math
11
+ import numpy as np
12
+ from tools.get_melvaehifigan48k import build_pretrained_models
13
+ import tools.torch_tools as torch_tools
14
+ from safetensors.torch import load_file
15
+
16
+ class MuCodec:
17
+ def __init__(self, \
18
+ model_path, \
19
+ layer_num, \
20
+ load_main_model=True, \
21
+ device="cuda:0"):
22
+
23
+ self.layer_num = layer_num - 1
24
+ self.sample_rate = 48000
25
+ self.device = device
26
+
27
+ self.MAX_DURATION = 360
28
+ if load_main_model:
29
+ audio_ldm_path = os.path.dirname(os.path.abspath(__file__)) + "/tools/audioldm_48k.pth"
30
+ self.vae, self.stft = build_pretrained_models(audio_ldm_path)
31
+ self.vae, self.stft = self.vae.eval().to(device), self.stft.eval().to(device)
32
+ main_config = {
33
+ "num_channels":32,
34
+ "unet_model_name":None,
35
+ "unet_model_config_path":os.path.dirname(os.path.abspath(__file__)) + "/configs/models/transformer2D.json",
36
+ "snr_gamma":None,
37
+ }
38
+ self.model = PromptCondAudioDiffusion(**main_config)
39
+ if model_path.endswith('.safetensors'):
40
+ main_weights = load_file(model_path)
41
+ else:
42
+ main_weights = torch.load(model_path, map_location='cpu')
43
+ self.model.load_state_dict(main_weights, strict=False)
44
+ self.model = self.model.to(device)
45
+ print ("Successfully loaded checkpoint from:", model_path)
46
+ else:
47
+ main_config = {
48
+ "num_channels":32,
49
+ "unet_model_name":None,
50
+ "unet_model_config_path":None,
51
+ "snr_gamma":None,
52
+ }
53
+ self.model = PromptCondAudioDiffusion(**main_config).to(device)
54
+ main_weights = torch.load(model_path, map_location='cpu')
55
+ self.model.load_state_dict(main_weights, strict=False)
56
+ self.model = self.model.to(device)
57
+ print ("Successfully loaded checkpoint from:", model_path)
58
+
59
+ self.model.eval()
60
+ self.model.init_device_dtype(torch.device(device), torch.float32)
61
+ print("scaling factor: ", self.model.normfeat.std)
62
+
63
+ def file2code(self, fname):
64
+ orig_samples, fs = torchaudio.load(fname)
65
+ if(fs!=self.sample_rate):
66
+ orig_samples = torchaudio.functional.resample(orig_samples, fs, self.sample_rate)
67
+ fs = self.sample_rate
68
+ if orig_samples.shape[0] == 1:
69
+ orig_samples = torch.cat([orig_samples, orig_samples], 0)
70
+ return self.sound2code(orig_samples)
71
+
72
    @torch.no_grad()
    @torch.autocast(device_type="cuda", dtype=torch.float32)
    def sound2code(self, orig_samples, batch_size=3):
        """Encode stereo waveform samples into discrete codec codes.

        Accepts a (channels, samples) or (batch, channels, samples) tensor,
        tiles it to a whole number of ~40.96 s windows, encodes each window
        in batches through the model's quantizer, and returns codes trimmed
        to the original audio length (25 code frames per second, plus one).
        """
        if(orig_samples.ndim == 2):
            audios = orig_samples.unsqueeze(0).to(self.device)
        elif(orig_samples.ndim == 3):
            audios = orig_samples.to(self.device)
        else:
            # Unreachable guard: only 2-D / 3-D inputs are supported.
            assert orig_samples.ndim in (2,3), orig_samples.shape
        audios = self.preprocess_audio(audios)
        audios = audios.squeeze(0)
        orig_length = audios.shape[-1]
        # One encoding window is 40.96 s of audio; 480 extra samples of
        # lookahead are appended to each window below.
        min_samples = int(40.96 * self.sample_rate)
        # 25 code frames per second of input, plus one trailing frame.
        output_len = int(orig_length / float(self.sample_rate) * 25) + 1
        print("output_len: ", output_len)

        # Tile short clips until at least one full window (+ lookahead) exists.
        while(audios.shape[-1] < min_samples + 480):
            audios = torch.cat([audios, audios], -1)
        int_max_len=audios.shape[-1]//min_samples+1
        # print("int_max_len: ", int_max_len)
        # Double once more so truncation below always has enough samples.
        audios = torch.cat([audios, audios], -1)
        # print("audios:",audios.shape)
        audios=audios[:,:int(int_max_len*(min_samples+480))]
        codes_list=[]

        # Split the stereo stream into windows: (2, n_win, W) -> (n_win, 2, W).
        audio_input = audios.reshape(2, -1, min_samples+480).permute(1, 0, 2).reshape(-1, 2, min_samples+480)

        for audio_inx in range(0, audio_input.shape[0], batch_size):
            # import pdb; pdb.set_trace()
            # layer=self.layer_num selects how many RVQ layers to fetch.
            codes, _, spk_embeds = self.model.fetch_codes_batch((audio_input[audio_inx:audio_inx+batch_size]), additional_feats=[],layer=self.layer_num)
            codes_list.append(torch.cat(codes, 1))
            # print("codes_list",codes_list[0].shape)

        # Reassemble windows into one sequence, then trim the padding
        # introduced by tiling back to the true output length.
        codes = torch.cat(codes_list, 0).permute(1,0,2).reshape(1, -1)[None] # B 3 T -> 3 B T
        codes=codes[:,:,:output_len]

        return codes
109
+
110
    @torch.no_grad()
    def code2sound(self, codes, prompt=None, duration=40.96, guidance_scale=1.5, num_steps=20, disable_progress=False):
        """Decode RVQ codes (B x N x T) back into a stereo waveform.

        Optionally conditions the first window on an audio `prompt`: the prompt
        is encoded to VAE latents and codes, prepended, and later cut back off.
        Long code streams are decoded in overlapping windows (75% hop) and the
        waveforms are stitched with a linear crossfade.

        NOTE(review): the `guidance_scale` argument is ignored here; 1.5 is
        hard-coded in the inference_codes calls below — confirm intended.
        """
        codes = codes.to(self.device)
        # Noise-initialized latent canvas for the first window (B, 32, 512, 32).
        first_latent = torch.randn(codes.shape[0], 32, 512, 32).to(self.device)
        first_latent_length = 0
        first_latent_codes_length = 0
        if(isinstance(prompt, torch.Tensor)):
            # Normalize the prompt to shape (2, samples): drop a singleton batch
            # dim, or duplicate a mono channel into stereo.
            prompt = prompt.to(self.device)
            if(prompt.ndim == 3):
                assert prompt.shape[0] == 1, prompt.shape
                prompt = prompt[0]
            elif(prompt.ndim == 1):
                prompt = prompt.unsqueeze(0).repeat(2,1)
            elif(prompt.ndim == 2):
                if(prompt.shape[0] == 1):
                    prompt = prompt.repeat(2,1)

            # Take a 10.24 s excerpt: the head for short prompts, otherwise the
            # 20.48–30.72 s region.
            if(prompt.shape[-1] < int(30.76 * self.sample_rate)):
                prompt = prompt[:,:int(10.24*self.sample_rate)] # limit max length to 10.24
            else:
                prompt = prompt[:,int(20.48*self.sample_rate):int(30.72*self.sample_rate)] # limit max length to 10.24

            # Encode the prompt to mel, then per-channel VAE latents, then fold
            # the two channels into the channel dim of a single latent.
            true_mel , _, _ = torch_tools.wav_to_fbank2(prompt, -1, fn_STFT=self.stft) # maximum 10.24s
            true_mel = true_mel.unsqueeze(1).to(self.device)
            true_latent = torch.cat([self.vae.get_first_stage_encoding(self.vae.encode_first_stage(true_mel[[m]])) for m in range(true_mel.shape[0])],0)
            true_latent = true_latent.reshape(true_latent.shape[0]//2, -1, true_latent.shape[2], true_latent.shape[3]).detach()

            first_latent[:,:,0:true_latent.shape[2],:] = true_latent
            first_latent_length = true_latent.shape[2]
            # Prepend the prompt's codes so the first window is continuation,
            # not free generation. 2 code frames per latent frame (see reshape
            # in inference_codes) — TODO confirm.
            first_latent_codes = self.sound2code(prompt)[:,:,0:first_latent_length*2] # B 4 T
            first_latent_codes_length = first_latent_codes.shape[-1]
            codes = torch.cat([first_latent_codes, codes], -1)

        # Windowing parameters in code frames: 1024-frame windows, 75% hop,
        # 25% overlap; latent frames are half the code frame count.
        min_samples = 1024
        hop_samples = min_samples // 4 * 3
        ovlp_samples = min_samples - hop_samples
        hop_frames = hop_samples // 2
        ovlp_frames = ovlp_samples // 2

        codes_len= codes.shape[-1]
        # Output length in waveform samples, excluding the prompt's codes.
        # Assumes 100 code frames per 4 s of audio — TODO confirm rate.
        target_len = int((codes_len - first_latent_codes_length) / 100 * 4 * self.sample_rate)

        # Tile the code stream by self-repetition until it fills whole windows.
        if(codes_len < min_samples):
            while(codes.shape[-1] < min_samples):
                codes = torch.cat([codes, codes], -1)
            codes = codes[:,:,0:min_samples]
        codes_len = codes.shape[-1]
        if((codes_len - ovlp_frames) % hop_samples > 0):
            len_codes=math.ceil((codes_len - ovlp_samples) / float(hop_samples)) * hop_samples + ovlp_samples
            while(codes.shape[-1] < len_codes):
                codes = torch.cat([codes, codes], -1)
            codes = codes[:,:,0:len_codes]
        latent_length = 512
        latent_list = []
        # Dummy speaker embedding (unused downstream) — TODO confirm.
        spk_embeds = torch.zeros([1, 32, 1, 32], device=codes.device)
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            for sinx in range(0, codes.shape[-1]-hop_samples, hop_samples):
                codes_input=[]
                codes_input.append(codes[:,:,sinx:sinx+min_samples])
                if(sinx == 0):
                    # First window: in-context frames come from the prompt latent
                    # (0 when there is no prompt).
                    incontext_length = first_latent_length
                    latents = self.model.inference_codes(codes_input, spk_embeds, first_latent, latent_length, incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
                    latent_list.append(latents)
                else:
                    # Later windows: seed with the overlap tail of the previous
                    # latent, padded with noise up to the 512-frame window.
                    true_latent = latent_list[-1][:,:,-ovlp_frames:,:]
                    len_add_to_512 = 512 - true_latent.shape[-2]
                    incontext_length = true_latent.shape[-2]
                    true_latent = torch.cat([true_latent, torch.randn(true_latent.shape[0], true_latent.shape[1], len_add_to_512, true_latent.shape[-1]).to(self.device)], -2)
                    latents = self.model.inference_codes(codes_input, spk_embeds, true_latent, latent_length, incontext_length, additional_feats=[], guidance_scale=1.5, num_steps = num_steps, disable_progress=disable_progress, scenario='other_seg')
                    latent_list.append(latents)

        latent_list = [l.float() for l in latent_list]
        # Drop the prompt-conditioning frames from the first window.
        latent_list[0] = latent_list[0][:,:,first_latent_length:,:]
        # Re-derive the windowing parameters in waveform samples for overlap-add.
        min_samples = int(duration * self.sample_rate)
        hop_samples = min_samples // 4 * 3
        ovlp_samples = min_samples - hop_samples
        with torch.no_grad():
            output = None
            for i in range(len(latent_list)):
                latent = latent_list[i]
                bsz , ch, t, f = latent.shape
                # Unfold the stereo pair back out of the channel dim before the
                # per-channel VAE decode.
                latent = latent.reshape(bsz*2, ch//2, t, f)
                mel = self.vae.decode_first_stage(latent)
                cur_output = self.vae.decode_to_waveform(mel)
                cur_output = torch.from_numpy(cur_output)[:, 0:min_samples]

                if output is None:
                    output = cur_output
                else:
                    # Linear crossfade: previous tail ramps 1->0, new head 0->1.
                    ov_win = torch.from_numpy(np.linspace(0, 1, ovlp_samples)[None, :])
                    ov_win = torch.cat([ov_win, 1 - ov_win], -1)
                    output[:, -ovlp_samples:] = output[:, -ovlp_samples:] * ov_win[:, -ovlp_samples:] + cur_output[:, 0:ovlp_samples] * ov_win[:, 0:ovlp_samples]
                    output = torch.cat([output, cur_output[:, ovlp_samples:]], -1)
            output = output[:, 0:target_len]
        return output
205
+
206
+ @torch.no_grad()
207
+ def preprocess_audio(self, input_audios, threshold=0.8):
208
+ assert len(input_audios.shape) == 3, input_audios.shape
209
+ nchan = input_audios.shape[1]
210
+ input_audios = input_audios.reshape(input_audios.shape[0], -1)
211
+ norm_value = torch.ones_like(input_audios[:,0])
212
+ max_volume = input_audios.abs().max(dim=-1)[0]
213
+ norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
214
+ return input_audios.reshape(input_audios.shape[0], nchan, -1)/norm_value.unsqueeze(-1).unsqueeze(-1)
215
+
216
+ @torch.no_grad()
217
+ def sound2sound(self, sound, prompt=None, min_duration=40.96, steps=50, disable_progress=False):
218
+ codes = self.sound2code(sound)
219
+ wave = self.code2sound(codes, prompt, duration=min_duration, guidance_scale=1.5, num_steps=steps, disable_progress=disable_progress)
220
+ return wave
221
+
222
if __name__=="__main__":
    # Demo driver: reconstruct every audio file in ./test_wav through the codec
    # and write the results to ./reconstructed.
    ckpt_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ckpt/mucodec.pt")
    mucodec = MuCodec(model_path=ckpt_path,layer_num=7,load_main_model=True)

    filelist = []

    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_wav")
    for f in [os.path.join(root_dir, f) for f in os.listdir(root_dir) if '.flac' in f or '.wav' in f or '.mp3' in f]:
        a, fs = torchaudio.load(f)
        # The codec expects 48 kHz stereo: resample and duplicate mono channels.
        if(fs!=48000):
            a = torchaudio.functional.resample(a, fs, 48000)
        if(a.shape[0]==1):
            a = torch.cat([a,a],0)
        ori_len = a.shape[-1]
        # Entry layout: [audio, lyric (unused), [start, end] seconds, path, length].
        filelist.append([a, '', [0, a.shape[-1]/48000.], f,ori_len])

    reconstructed_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "reconstructed")

    os.makedirs(reconstructed_dir, exist_ok=True)

    for sample_idx, (orig_samples, lyric, st_et, fname,ori_len) in enumerate(filelist):
        print(fname, lyric)
        wave = mucodec.sound2sound(orig_samples,None)
        # Trim the padding introduced by window tiling back to the input length.
        wave = wave[:,0:ori_len]
        torchaudio.save(os.path.join(reconstructed_dir, os.path.basename(fname)),wave.detach().cpu(), 48000)
247
+
MuCodec/libs/rvq/__pycache__/descript_quantize3.cpython-310.pyc ADDED
Binary file (10 kB). View file
 
MuCodec/libs/rvq/descript_quantize3.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from torch.nn.utils import weight_norm
9
+
10
def WNConv1d(*args, **kwargs):
    """Construct an ``nn.Conv1d`` (same arguments) wrapped with weight normalization."""
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)
12
+
13
class VectorQuantize(nn.Module):
    """
    Implementation of VQ similar to Karpathy's repo:
    https://github.com/karpathy/deep-vector-quantization
    Additionally uses following tricks from Improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
           for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
           improves training stability
    """

    def __init__(self, input_dim: int, codebook_size: int, codebook_dim: int, stale_tolerance: int = 100):
        super().__init__()
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim

        # Factorized codes: the nearest-neighbour search happens in the
        # low-dimensional codebook space; 1x1 convs map in and out of it.
        self.in_proj = WNConv1d(input_dim, codebook_dim, kernel_size=1)
        self.out_proj = WNConv1d(codebook_dim, input_dim, kernel_size=1)
        self.codebook = nn.Embedding(codebook_size, codebook_dim)
        # Per-code count of consecutive training batches without a single hit;
        # codes reaching `stale_tolerance` are re-seeded (see decode_latents).
        self.register_buffer("stale_counter", torch.zeros(self.codebook_size,))
        self.stale_tolerance = stale_tolerance

    def forward(self, z):
        """Quantized the input tensor using a fixed codebook and returns
        the corresponding codebook vectors

        Parameters
        ----------
        z : Tensor[B x D x T]

        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        Tensor[B]
            Commitment loss to train encoder to predict vectors closer to codebook
            entries (per batch item; reduced over channel and time)
        Tensor[B]
            Codebook loss to update the codebook (per batch item)
        Tensor[B x T]
            Codebook indices (quantized discrete representation of input)
        Tensor[B x D x T]
            Projected latents (continuous representation of input before quantization)
        """

        # Factorized codes (ViT-VQGAN) Project input into low-dimensional space
        z_e = self.in_proj(z)  # z_e : (B x D x T)
        z_q, indices = self.decode_latents(z_e)

        commitment_loss = F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
        codebook_loss = F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])

        z_q = (
            z_e + (z_q - z_e).detach()
        )  # noop in forward pass, straight-through gradient estimator in backward pass

        z_q = self.out_proj(z_q)

        return z_q, commitment_loss, codebook_loss, indices, z_e

    def embed_code(self, embed_id):
        # Look up codebook vectors for given indices: (B, T) -> (B, T, D).
        return F.embedding(embed_id, self.codebook.weight)

    def decode_code(self, embed_id):
        # Channel-first lookup: (B, T) -> (B, D, T).
        return self.embed_code(embed_id).transpose(1, 2)

    def decode_latents(self, latents):
        """Nearest-neighbour quantization of `latents` (B x D x T).

        Returns the (un-normalized) codebook vectors (B x D x T) and the chosen
        indices (B x T). During training, also re-seeds codes that have been
        unused for `stale_tolerance` consecutive batches with random batch
        encodings, to combat codebook collapse.
        """
        encodings = rearrange(latents, "b d t -> (b t) d")
        codebook = self.codebook.weight  # codebook: (N x D)

        # L2 normalize encodings and codebook (ViT-VQGAN)
        encodings = F.normalize(encodings)
        codebook = F.normalize(codebook)

        # Compute euclidean distance with codebook
        dist = (
            encodings.pow(2).sum(1, keepdim=True)
            - 2 * encodings @ codebook.t()
            + codebook.pow(2).sum(1, keepdim=True).t()
        )
        indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
        z_q = self.decode_code(indices)

        if(self.training):
            onehots = torch.nn.functional.one_hot(indices, self.codebook_size).float()  # B, T, codebook_size
            # 1.0 for codes never selected in this batch, 0.0 otherwise.
            stale_codes = (onehots.sum(0).sum(0) == 0).float()
            # Increment counters of unused codes; reset counters of used ones.
            self.stale_counter = self.stale_counter * stale_codes + stale_codes

            # random replace codes that haven't been used for a while
            replace_code = (self.stale_counter == self.stale_tolerance).float()  # codebook_size
            if replace_code.sum(-1) > 0:
                print("Replace {} codes".format(replace_code.sum(-1)))
                # Draw replacements from a random permutation of this batch's
                # normalized encodings, tiling when the batch is smaller than
                # the codebook.
                random_input_idx = torch.randperm(encodings.shape[0])
                random_input = encodings[random_input_idx].view(encodings.shape)
                if random_input.shape[0] < self.codebook_size:
                    random_input = torch.cat([random_input]*(self.codebook_size // random_input.shape[0] + 1), 0)
                random_input = random_input[:self.codebook_size,:].contiguous()  # codebook_size, dim

                self.codebook.weight.data = self.codebook.weight.data * (1 - replace_code).unsqueeze(-1) + random_input * replace_code.unsqueeze(-1)
                self.stale_counter = self.stale_counter * (1 - replace_code)

        return z_q, indices
116
+
117
+
118
class ResidualVectorQuantize(nn.Module):
    """
    Introduced in SoundStream: An end2end neural audio codec
    https://arxiv.org/abs/2107.03312

    Chains `n_codebooks` VectorQuantize stages; each stage quantizes the
    residual left by the previous stages, so the sum of stage outputs
    approximates the input increasingly well.
    """

    def __init__(
        self,
        input_dim: int = 512,
        n_codebooks: int = 9,
        codebook_size: int = 1024,
        codebook_dim: Union[int, list] = 8,
        quantizer_dropout: float = 0.0,
        stale_tolerance: int = 100,
    ):
        super().__init__()
        # A single int codebook_dim applies to every stage.
        if isinstance(codebook_dim, int):
            codebook_dim = [codebook_dim for _ in range(n_codebooks)]

        self.n_codebooks = n_codebooks
        self.codebook_dim = codebook_dim
        self.codebook_size = codebook_size

        self.quantizers = nn.ModuleList(
            [
                VectorQuantize(input_dim, codebook_size, codebook_dim[i], stale_tolerance=stale_tolerance)
                for i in range(n_codebooks)
            ]
        )
        self.quantizer_dropout = quantizer_dropout

    def forward(self, z, n_quantizers: int = None):
        """Quantized the input tensor using a fixed set of `n` codebooks and returns
        the corresponding codebook vectors
        Parameters
        ----------
        z : Tensor[B x D x T]
        n_quantizers : int, optional
            No. of quantizers to use
            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
            Note: if `self.quantizer_dropout` is True, this argument is ignored
            when in training mode, and a random number of quantizers is used.
        Returns
        -------
        tuple
            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
            rvq_usage : LongTensor[B]
                Index of the last quantizer applied per batch item
        """
        z_q = 0
        residual = z
        commitment_loss = 0
        codebook_loss = 0

        codebook_indices = []
        latents = []

        if n_quantizers is None:
            n_quantizers = self.n_codebooks
        if self.training:
            # Quantizer dropout: the first n_dropout batch items use a random
            # number of codebooks; the rest use all of them.
            n_quantizers = torch.ones((z.shape[0],)) * self.n_codebooks + 1
            dropout = torch.randint(1, self.n_codebooks + 1, (z.shape[0],))
            n_dropout = int(z.shape[0] * self.quantizer_dropout)
            n_quantizers[:n_dropout] = dropout[:n_dropout]
            n_quantizers = n_quantizers.to(z.device)
        else:
            n_quantizers = torch.ones((z.shape[0],)) * n_quantizers + 1
            n_quantizers = n_quantizers.to(z.device)

        for i, quantizer in enumerate(self.quantizers):
            # if self.training is False and i >= n_quantizers:
            #     break

            z_q_i, commitment_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
                residual
            )

            # Create mask to apply quantizer dropout
            mask = (
                torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
            )
            z_q = z_q + z_q_i * mask[:, None, None]
            residual = residual - z_q_i

            # Sum losses
            commitment_loss += (commitment_loss_i * mask).mean()
            codebook_loss += (codebook_loss_i * mask).mean()

            codebook_indices.append(indices_i)
            latents.append(z_e_i)

        codes = torch.stack(codebook_indices, dim=1)
        latents = torch.cat(latents, dim=1)

        # Debug aid: report per-layer codebook utilization.
        # Fix: the original message read "Lyaer" — corrected to "Layer".
        encodings = F.one_hot(codes, self.codebook_size).float()  # B N T 1024
        for n in range(encodings.shape[1]):
            print("Layer {}, Ratio of unused vector : {:.1f}".format(n,
                (encodings[:,n,:,:].sum(0).sum(0) < 1.0).sum()/torch.numel(encodings[:,n,:,:].sum(0).sum(0) < 1.0) * 100.
            ))

        return z_q, codes, latents, commitment_loss, codebook_loss, n_quantizers.clamp(max=self.n_codebooks).long() - 1

    def from_codes(self, codes: torch.Tensor):
        """Given the quantized codes, reconstruct the continuous representation
        Parameters
        ----------
        codes : Tensor[B x N x T]
            Quantized discrete representation of input
        Returns
        -------
        Tensor[B x D x T]
            Quantized continuous representation of input
        Tensor[B x sum(codebook_dim) x T]
            Concatenated per-stage codebook vectors
        Tensor[B x N x T]
            The input codes, passed through
        """
        z_q = 0.0
        z_p = []
        n_codebooks = codes.shape[1]
        for i in range(n_codebooks):
            z_p_i = self.quantizers[i].decode_code(codes[:, i, :])
            z_p.append(z_p_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i
        return z_q, torch.cat(z_p, dim=1), codes

    def from_latents(self, latents: torch.Tensor):
        """Given the unquantized latents, reconstruct the
        continuous representation after quantization.

        Parameters
        ----------
        latents : Tensor[B x N x T]
            Continuous representation of input after projection

        Returns
        -------
        Tensor[B x D x T]
            Quantized representation of full-projected space
        Tensor[B x D x T]
            Quantized representation of latent space
        Tensor[B x N x T]
            Codebook indices per stage
        """
        z_q = 0
        z_p = []
        codes = []
        # Channel offsets of each stage's slice within `latents`.
        dims = np.cumsum([0] + [q.codebook_dim for q in self.quantizers])

        # Use only as many codebooks as fully fit in the provided channels.
        n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[
            0
        ]
        for i in range(n_codebooks):
            j, k = dims[i], dims[i + 1]
            z_p_i, codes_i = self.quantizers[i].decode_latents(latents[:, j:k, :])
            z_p.append(z_p_i)
            codes.append(codes_i)

            z_q_i = self.quantizers[i].out_proj(z_p_i)
            z_q = z_q + z_q_i

        return z_q, torch.cat(z_p, dim=1), torch.stack(codes, dim=1)
287
+
288
+
289
if __name__ == "__main__":
    # Smoke test: quantize a random batch and print output shapes, then form
    # the two typical training losses.
    rvq = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 4, codebook_size = 1024, codebook_dim = 32, quantizer_dropout = 0.0)
    x = torch.randn(16, 1024, 80)
    quantized_prompt_embeds, codes, _, commitment_loss, codebook_loss, rvq_usage = rvq(x)
    print(quantized_prompt_embeds.shape)
    print(codes.shape)
    # w/o reconstruction
    loss = commitment_loss * 0.25 + codebook_loss * 1.0
    # w/ reconstruction
    loss = commitment_loss * 0.25 + codebook_loss * 1.0 + (x - quantized_prompt_embeds).abs().mean()
MuCodec/model.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import random
3
+ import inspect
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ import typing as tp
7
+ from abc import ABC
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torchaudio
13
+
14
+ from einops import repeat
15
+ from tools.torch_tools import wav_to_fbank
16
+ import os
17
+ import diffusers
18
+ from diffusers.utils.torch_utils import randn_tensor
19
+ from diffusers import DDPMScheduler
20
+ from models.transformer_2d_flow import Transformer2DModel
21
+ from libs.rvq.descript_quantize3 import ResidualVectorQuantize
22
+ from torch.cuda.amp import autocast
23
+ from muq_dev.test import load_model
24
+
25
+
26
+
27
+
28
class SampleProcessor(torch.nn.Module):
    """Identity pre/post-processor base class.

    Subclasses override both hooks to map samples into (and back out of) the
    space where diffusion runs; this base implementation changes nothing.
    """

    def project_sample(self, x: torch.Tensor):
        """Project the original sample to the 'space' where the diffusion will happen."""
        return x

    def return_sample(self, z: torch.Tensor):
        """Project back from diffusion space to the actual sample space."""
        return z
36
+
37
class Feature2DProcessor(SampleProcessor):
    """Running-statistics whitener for 4-D features shaped (B, dim, T, 32).

    Accumulates per-(channel, freq-bin) mean/std over batches seen through
    `project_sample` (until `num_samples` items), normalizes inputs with
    (x - mean) * (target_std / std) ** power_std, and inverts the mapping in
    `return_sample`. Statistics live in buffers so they persist in checkpoints.
    """
    def __init__(self, dim: int = 8, power_std: tp.Union[float, tp.List[float], torch.Tensor] = 1., \
        num_samples: int = 100_000):
        super().__init__()
        self.num_samples = num_samples
        self.dim = dim
        self.power_std = power_std
        self.register_buffer('counts', torch.zeros(1))
        self.register_buffer('sum_x', torch.zeros(dim, 32))
        self.register_buffer('sum_x2', torch.zeros(dim, 32))
        # NOTE(review): sum_target_x2 is registered but never updated or read
        # in this class — possibly kept for checkpoint compatibility; confirm.
        self.register_buffer('sum_target_x2', torch.zeros(dim, 32))
        self.counts: torch.Tensor
        self.sum_x: torch.Tensor
        self.sum_x2: torch.Tensor

    @property
    def mean(self):
        # Running mean per (dim, 32) cell.
        mean = self.sum_x / self.counts
        return mean

    @property
    def std(self):
        # Running std; clamp(min=0) guards against negative rounding error.
        std = (self.sum_x2 / self.counts - self.mean**2).clamp(min=0).sqrt()
        return std

    @property
    def target_std(self):
        # Unit target variance after projection.
        return 1

    def project_sample(self, x: torch.Tensor):
        assert x.dim() == 4
        # Keep accumulating statistics until num_samples items have been seen.
        # Time dim is averaged out, so stats are per (channel, freq-bin).
        if self.counts.item() < self.num_samples:
            self.counts += len(x)
            self.sum_x += x.mean(dim=(2,)).sum(dim=0)
            self.sum_x2 += x.pow(2).mean(dim=(2,)).sum(dim=0)
        rescale = (self.target_std / self.std.clamp(min=1e-12)) ** self.power_std  # same output size
        x = (x - self.mean.view(1, -1, 1, 32).contiguous()) * rescale.view(1, -1, 1, 32).contiguous()
        return x

    def return_sample(self, x: torch.Tensor):
        assert x.dim() == 4
        # Exact inverse of project_sample (given unchanged statistics).
        rescale = (self.std / self.target_std) ** self.power_std
        x = x * rescale.view(1, -1, 1, 32).contiguous() + self.mean.view(1, -1, 1, 32).contiguous()
        return x
81
+
82
+
83
class BASECFM(torch.nn.Module, ABC):
    """Conditional flow-matching sampler wrapping an estimator network.

    Generates latents by integrating the learned velocity field from noise
    (t=0) to data (t=1) with a fixed-step Euler solver, optionally pinning a
    leading span of in-context frames and applying classifier-free guidance.
    """
    def __init__(
        self,
        estimator,
    ):
        super().__init__()
        # Minimum noise level of the flow-matching interpolation path.
        self.sigma_min = 1e-4

        self.estimator = estimator

    @torch.inference_mode()
    def forward(self, mu, n_timesteps, temperature=1.0):
        """Forward diffusion

        Args:
            mu (torch.Tensor): output of encoder
                shape: (batch_size, n_channels, mel_timesteps, n_feats)
            n_timesteps (int): number of diffusion steps
            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.

        Returns:
            sample: generated mel-spectrogram
                shape: (batch_size, n_channels, mel_timesteps, n_feats)
        """
        z = torch.randn_like(mu) * temperature
        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device)
        # NOTE(review): solve_euler requires incontext_x/incontext_length/mu/
        # added_cond_kwargs/guidance_scale, which are not supplied here, so this
        # entry point would raise if called — verify it is unused (inference
        # paths call solve_euler directly).
        return self.solve_euler(z, t_span=t_span)

    def solve_euler(self, x, incontext_x, incontext_length, t_span, mu, added_cond_kwargs, guidance_scale):
        """
        Fixed euler solver for ODEs.
        Args:
            x (torch.Tensor): random noise
            incontext_x (torch.Tensor): clean latents whose first
                `incontext_length` frames are re-imposed at every step
            incontext_length (int): number of leading frames to keep pinned
            t_span (torch.Tensor): n_timesteps interpolated
                shape: (n_timesteps + 1,)
            mu (torch.Tensor): conditioning features concatenated onto the
                estimator input; zeroed for the unconditional CFG branch
                shape: (batch_size, n_channels, mel_timesteps, n_feats)
            added_cond_kwargs (dict): extra conditioning tensors forwarded to
                the estimator
            guidance_scale (float): classifier-free guidance weight; values > 1
                trigger a doubled (uncond + cond) batch per step
        """
        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
        noise = x.clone()

        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
        # Or in future might add like a return_all_steps flag
        sol = []

        for step in tqdm(range(1, len(t_span))):
            # Re-noise the pinned context frames to the current time t so they
            # stay on the flow-matching interpolation path.
            x[:,:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,:,0:incontext_length,:] + t * incontext_x[:,:,0:incontext_length,:]
            if(guidance_scale > 1.0):
                # CFG: run unconditional (mu zeroed) and conditional passes in
                # one doubled batch, then extrapolate between them.
                dphi_dt = self.estimator( \
                    torch.cat([ \
                        torch.cat([x, x], 0), \
                        torch.cat([incontext_x, incontext_x], 0), \
                        torch.cat([torch.zeros_like(mu), mu], 0), \
                    ], 1), \
                    timestep = t.unsqueeze(-1).repeat(2), \
                    added_cond_kwargs={k:torch.cat([v,v],0) for k,v in added_cond_kwargs.items()}).sample
                dphi_dt_uncond, dhpi_dt_cond = dphi_dt.chunk(2,0)
                dphi_dt = dphi_dt_uncond + guidance_scale * (dhpi_dt_cond - dphi_dt_uncond)
            else:
                dphi_dt = self.estimator(torch.cat([x, incontext_x, mu], 1), \
                    timestep = t.unsqueeze(-1),
                    added_cond_kwargs=added_cond_kwargs).sample

            # Explicit Euler update.
            x = x + dt * dphi_dt
            t = t + dt
            sol.append(x)
            if step < len(t_span) - 1:
                dt = t_span[step + 1] - t

        return sol[-1]
153
+
154
+
155
class PromptCondAudioDiffusion(nn.Module):
    """Flow-matching latent generator conditioned on MuEncoder RVQ codes.

    Bundles: a frozen MuEncoder feature extractor, a residual vector quantizer
    over its embeddings (the discrete codec), a projection of dequantized codes
    into a 2-D conditioning map, and a Transformer2D estimator sampled through
    a BASECFM Euler solver.
    """

    def __init__(
        self,
        num_channels,
        unet_model_name=None,
        unet_model_config_path=None,
        snr_gamma=None,
        uncondition=True,
        out_paint=False,
    ):
        """
        Args:
            num_channels: channel count of the latent space to generate.
            unet_model_name / unet_model_config_path: at least one is required;
                only the config path is used below to build the Transformer2D.
            snr_gamma, uncondition, out_paint: stored; not used in this class's
                inference paths.
        """
        super().__init__()

        assert unet_model_name is not None or unet_model_config_path is not None, "Either UNet pretrain model name or a config file path is required"

        self.unet_model_name = unet_model_name
        self.unet_model_config_path = unet_model_config_path
        self.snr_gamma = snr_gamma
        self.uncondition = uncondition
        self.num_channels = num_channels

        # https://huggingface.co/docs/diffusers/v0.14.0/en/api/schedulers/overview
        self.normfeat = Feature2DProcessor(dim=num_channels)

        self.sample_rate = 48000
        self.rsp48toclap = torchaudio.transforms.Resample(48000, 24000)
        self.rsq48towav2vec = torchaudio.transforms.Resample(48000, 16000)
        muencoder_dir = "muq_dev/muq_fairseq"
        muencoder_ckpt = "muq_dev/muq.pt"

        # Frozen MuEncoder used purely as a feature extractor.
        self.muencoder = load_model(
            model_dir=os.path.abspath(muencoder_dir),
            checkpoint_dir=os.path.abspath(muencoder_ckpt),
        )
        self.rsq48tomuencoder = torchaudio.transforms.Resample(48000, 24000)
        for v in self.muencoder.parameters():
            v.requires_grad = False
        # Single-codebook RVQ over 1024-d MuEncoder embeddings.
        self.rvq_muencoder_emb = ResidualVectorQuantize(input_dim = 1024, n_codebooks = 1, codebook_size = 16_384, codebook_dim = 32, quantizer_dropout = 0.0, stale_tolerance=200)
        # Projects each dequantized frame to a 16x32 conditioning patch.
        self.cond_muencoder_emb = nn.Linear(1024, 16*32)
        # Learned null condition substituted on masked-out frames.
        self.zero_cond_embedding1 = nn.Parameter(torch.randn(32*32,))

        unet = Transformer2DModel.from_config(
            unet_model_config_path,
        )
        self.set_from = "random"
        self.cfm_wrapper = BASECFM(unet)
        print("Transformer initialized from pretrain.")

    def compute_snr(self, timesteps):
        """
        Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
        """
        # NOTE(review): relies on self.noise_scheduler, which is never assigned
        # in this class — confirm it is attached externally before calling.
        alphas_cumprod = self.noise_scheduler.alphas_cumprod
        sqrt_alphas_cumprod = alphas_cumprod**0.5
        sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5

        # Expand the tensors.
        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
        sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
        while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
        alpha = sqrt_alphas_cumprod.expand(timesteps.shape)

        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float()
        while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
        sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)

        # Compute SNR.
        snr = (alpha / sigma) ** 2
        return snr

    def preprocess_audio(self, input_audios, threshold=0.9):
        """Peak-limit (B, T) audio: items whose absolute peak exceeds
        `threshold` are scaled down so the peak equals `threshold`."""
        assert len(input_audios.shape) == 2, input_audios.shape
        norm_value = torch.ones_like(input_audios[:,0])
        max_volume = input_audios.abs().max(dim=-1)[0]
        norm_value[max_volume>threshold] = max_volume[max_volume>threshold] / threshold
        return input_audios/norm_value.unsqueeze(-1)

    def extract_muencoder_embeds(self, input_audio_0, input_audio_1, layer):
        """Run the (frozen) MuEncoder on the downmixed stereo pair and return
        the hidden states of `layer`, channel-first (B, 1024, T)."""
        input_wav_mean = (input_audio_0 + input_audio_1) / 2.0
        input_wav_mean = self.muencoder(self.rsq48tomuencoder(input_wav_mean), features_only = True)
        layer_results = input_wav_mean['layer_results']
        muencoder_emb = layer_results[layer]
        muencoder_emb = muencoder_emb.permute(0,2,1).contiguous()
        return muencoder_emb

    def init_device_dtype(self, device, dtype):
        """Record the device/dtype used when preparing inference latents."""
        self.device = device
        self.dtype = dtype

    @torch.no_grad()
    def fetch_codes(self, input_audios, additional_feats, layer):
        """Quantize a single stereo clip (2, T) into RVQ codes.

        Returns ([codes], [embeddings], None) — the trailing None is a
        placeholder for speaker embeddings.
        """
        input_audio_0 = input_audios[[0],:]
        input_audio_1 = input_audios[[1],:]
        input_audio_0 = self.preprocess_audio(input_audio_0)
        input_audio_1 = self.preprocess_audio(input_audio_1)

        self.muencoder.eval()

        muencoder_emb = self.extract_muencoder_embeds(input_audio_0, input_audio_1, layer)
        muencoder_emb = muencoder_emb.detach()

        self.rvq_muencoder_emb.eval()
        quantized_muencoder_emb, codes_muencoder_emb, *_ = self.rvq_muencoder_emb(muencoder_emb)

        spk_embeds = None

        return [codes_muencoder_emb], [muencoder_emb], spk_embeds

    @torch.no_grad()
    def fetch_codes_batch(self, input_audios, additional_feats, layer):
        """Batched variant of fetch_codes for input shaped (B, 2, T)."""
        input_audio_0 = input_audios[:,0,:]
        input_audio_1 = input_audios[:,1,:]
        input_audio_0 = self.preprocess_audio(input_audio_0)
        input_audio_1 = self.preprocess_audio(input_audio_1)

        self.muencoder.eval()

        muencoder_emb = self.extract_muencoder_embeds(input_audio_0, input_audio_1, layer)
        muencoder_emb = muencoder_emb.detach()

        self.rvq_muencoder_emb.eval()
        quantized_muencoder_emb, codes_muencoder_emb, *_ = self.rvq_muencoder_emb(muencoder_emb)  # b,d,t

        spk_embeds = None

        return [codes_muencoder_emb], [muencoder_emb], spk_embeds

    @torch.no_grad()
    def inference_codes(self, codes, spk_embeds, true_latents, latent_length, incontext_length, additional_feats,
        guidance_scale=2, num_steps=20,
        disable_progress=True, scenario='start_seg'):
        """Decode RVQ codes into VAE latents with the CFM Euler sampler.

        Args:
            codes: list whose first element is the code tensor (B, N, T).
            true_latents: clean latents used for the in-context frames.
            latent_length: number of latent frames to actually generate.
            incontext_length: leading frames pinned to `true_latents`
                (only honored when scenario == 'other_seg').
        """
        classifier_free_guidance = guidance_scale > 1.0
        device = self.device
        dtype = self.dtype
        codes_muencoder_emb = codes[0]

        batch_size = codes_muencoder_emb.shape[0]

        # Dequantize codes back into continuous MuEncoder embeddings.
        quantized_muencoder_emb,_,_=self.rvq_muencoder_emb.from_codes(codes_muencoder_emb)

        # Project each frame to 16x32 and fold frame pairs into the channel
        # dim: (B, T, 16*32) -> (B, 32, T/2, 32).
        quantized_muencoder_emb = self.cond_muencoder_emb(quantized_muencoder_emb.permute(0,2,1))  # b t 16*32
        quantized_muencoder_emb = quantized_muencoder_emb.reshape(quantized_muencoder_emb.shape[0], quantized_muencoder_emb.shape[1]//2, 2, 16, 32).reshape(quantized_muencoder_emb.shape[0], quantized_muencoder_emb.shape[1]//2, 2*16, 32).permute(0,2,1,3).contiguous()  # b 32 t f

        num_frames = quantized_muencoder_emb.shape[-2]

        num_channels_latents = self.num_channels
        latents = self.prepare_latents(batch_size, num_frames, num_channels_latents, dtype, device)

        # Resolution/aspect-ratio micro-conditioning for the transformer.
        bsz, _, height, width = latents.shape
        resolution = torch.tensor([height, width]).repeat(bsz, 1)
        aspect_ratio = torch.tensor([float(height / width)]).repeat(bsz, 1)
        resolution = resolution.to(dtype=quantized_muencoder_emb.dtype, device=device)
        aspect_ratio = aspect_ratio.to(dtype=quantized_muencoder_emb.dtype, device=device)
        if classifier_free_guidance:
            resolution = torch.cat([resolution, resolution], 0)
            aspect_ratio = torch.cat([aspect_ratio, aspect_ratio], 0)
        added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}

        # Mask semantics per frame: 0 = outside segment, 1 = in-context
        # (continuation) frame, 2 = frame to generate.
        latent_masks = torch.zeros(latents.shape[0], latents.shape[2], dtype=torch.int64, device=latents.device)
        latent_masks[:,0:latent_length] = 2
        if(scenario=='other_seg'):
            latent_masks[:,0:incontext_length] = 1

        # Swap in the learned null condition wherever the mask is 0.
        quantized_muencoder_emb = (latent_masks > 0.5).unsqueeze(1).unsqueeze(-1) * quantized_muencoder_emb \
            + (latent_masks < 0.5).unsqueeze(1).unsqueeze(-1) * self.zero_cond_embedding1.reshape(1,32,1,32)
        true_latents = self.normfeat.project_sample(true_latents)
        incontext_latents = true_latents * ((latent_masks > 0.5) * (latent_masks < 1.5)).unsqueeze(1).unsqueeze(-1).float()
        # Recompute the effective in-context length from the masks.
        incontext_length = ((latent_masks > 0.5) * (latent_masks < 1.5)).sum(-1)[0]

        additional_model_input = torch.cat([quantized_muencoder_emb],1)

        temperature = 1.0
        t_span = torch.linspace(0, 1, num_steps + 1, device=quantized_muencoder_emb.device)
        latents = self.cfm_wrapper.solve_euler(latents * temperature, incontext_latents, incontext_length, t_span, additional_model_input, added_cond_kwargs, guidance_scale)

        # Pin the in-context frames exactly, then undo feature normalization.
        latents[:,:,0:incontext_length,:] = incontext_latents[:,:,0:incontext_length,:]
        latents = self.normfeat.return_sample(latents)
        return latents

    @torch.no_grad()
    def inference(self, input_audios, lyric, true_latents, latent_length, additional_feats, guidance_scale=2, num_steps=20,
        disable_progress=True, layer=5, scenario='start_seg'):
        """End-to-end: encode `input_audios` to codes, then sample latents.

        `lyric` is accepted for interface compatibility but unused here.
        """
        codes, embeds, spk_embeds = self.fetch_codes(input_audios, additional_feats, layer)

        # Fix: inference_codes takes `incontext_length` before
        # `additional_feats`; the original call omitted it, shifting arguments
        # by one and raising TypeError. 0 is safe: incontext_length is only
        # read for scenario='other_seg' and is recomputed from the latent
        # masks inside inference_codes.
        latents = self.inference_codes(codes, spk_embeds, true_latents, latent_length, 0, additional_feats, \
            guidance_scale=guidance_scale, num_steps=num_steps, \
            disable_progress=disable_progress, scenario=scenario)
        return latents

    def prepare_latents(self, batch_size, num_frames, num_channels_latents, dtype, device):
        """Draw Gaussian starting latents (B, C, T, 32), rounding the frame
        count to a multiple of 4 as the transformer requires."""
        divisor = 4
        shape = (batch_size, num_channels_latents, num_frames, 32)
        if(num_frames%divisor>0):
            num_frames = round(num_frames/float(divisor))*divisor
            shape = (batch_size, num_channels_latents, num_frames, 32)
        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
        return latents
366
+
367
+
MuCodec/models/attention.py ADDED
@@ -0,0 +1,682 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Optional
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from torch import nn
19
+
20
+ from diffusers.utils import USE_PEFT_BACKEND
21
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
22
+ from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
23
+ from diffusers.models.attention_processor import Attention
24
+ from diffusers.models.embeddings import SinusoidalPositionalEmbedding
25
+ from diffusers.models.lora import LoRACompatibleLinear
26
+ from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
27
+
28
+
29
+ def _chunked_feed_forward(
30
+ ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int, lora_scale: Optional[float] = None
31
+ ):
32
+ # "feed_forward_chunk_size" can be used to save memory
33
+ if hidden_states.shape[chunk_dim] % chunk_size != 0:
34
+ raise ValueError(
35
+ f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
36
+ )
37
+
38
+ num_chunks = hidden_states.shape[chunk_dim] // chunk_size
39
+ if lora_scale is None:
40
+ ff_output = torch.cat(
41
+ [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
42
+ dim=chunk_dim,
43
+ )
44
+ else:
45
+ # TOOD(Patrick): LoRA scale can be removed once PEFT refactor is complete
46
+ ff_output = torch.cat(
47
+ [ff(hid_slice, scale=lora_scale) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
48
+ dim=chunk_dim,
49
+ )
50
+
51
+ return ff_output
52
+
53
+
54
@maybe_allow_in_graph
class GatedSelfAttentionDense(nn.Module):
    r"""
    A gated self-attention dense layer that combines visual features and object features.

    Used for GLIGEN-style conditioning: object features are projected into the
    visual feature space, jointly self-attended, and blended back through
    learnable zero-initialized gates.

    Parameters:
        query_dim (`int`): The number of channels in the query.
        context_dim (`int`): The number of channels in the context.
        n_heads (`int`): The number of heads to use for attention.
        d_head (`int`): The number of channels in each head.
    """

    def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
        super().__init__()

        # we need a linear projection since we need cat visual feature and obj feature
        self.linear = nn.Linear(context_dim, query_dim)

        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
        self.ff = FeedForward(query_dim, activation_fn="geglu")

        self.norm1 = nn.LayerNorm(query_dim)
        self.norm2 = nn.LayerNorm(query_dim)

        # Gates start at tanh(0) == 0, so the module begins as an identity mapping.
        self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
        self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))

        # Runtime switch: when False, forward() is a passthrough.
        self.enabled = True

    def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
        if not self.enabled:
            return x

        n_visual = x.shape[1]
        objs = self.linear(objs)

        # Self-attend over the concatenation [visual; objects], then keep only
        # the visual positions; each residual is scaled by its learned gate.
        x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))

        return x
94
+
95
+
96
@maybe_allow_in_graph
class BasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block: self-attention, optional cross-attention, feed-forward,
    with one of several adaptive-normalization schemes selected by `norm_type`.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        num_embeds_ada_norm (:
            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (:
            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
        only_cross_attention (`bool`, *optional*):
            Whether to use only cross-attention layers. In this case two cross attention layers are used.
        double_self_attention (`bool`, *optional*):
            Whether to use two self-attention layers. In this case no cross attention layers are used.
        upcast_attention (`bool`, *optional*):
            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"`, `"ada_norm_zero"`,
            `"ada_norm_single"` or `"ada_norm_continuous"`.
        final_dropout (`bool` *optional*, defaults to False):
            Whether to apply a final dropout after the last feed-forward layer.
        attention_type (`str`, *optional*, defaults to `"default"`):
            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
        positional_embeddings (`str`, *optional*, defaults to `None`):
            The type of positional embeddings to apply to.
        num_positional_embeddings (`int`, *optional*, defaults to `None`):
            The maximum number of positional embeddings to apply.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
        norm_eps: float = 1e-5,
        final_dropout: bool = False,
        attention_type: str = "default",
        positional_embeddings: Optional[str] = None,
        num_positional_embeddings: Optional[int] = None,
        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
        ada_norm_bias: Optional[int] = None,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
    ):
        super().__init__()
        self.only_cross_attention = only_cross_attention

        # Mutually exclusive flags derived from `norm_type`; exactly one is used
        # per forward pass to pick the normalization/modulation flavor.
        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
        self.use_layer_norm = norm_type == "layer_norm"
        self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        if positional_embeddings and (num_positional_embeddings is None):
            raise ValueError(
                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
            )

        if positional_embeddings == "sinusoidal":
            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
        else:
            self.pos_embed = None

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        if self.use_ada_layer_norm:
            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
        elif self.use_ada_layer_norm_zero:
            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
        elif self.use_ada_layer_norm_continuous:
            self.norm1 = AdaLayerNormContinuous(
                dim,
                ada_norm_continous_conditioning_embedding_dim,
                norm_elementwise_affine,
                norm_eps,
                ada_norm_bias,
                "rms_norm",
            )
        else:
            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
            out_bias=attention_out_bias,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
            # the second cross attention block.
            if self.use_ada_layer_norm:
                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
            elif self.use_ada_layer_norm_continuous:
                self.norm2 = AdaLayerNormContinuous(
                    dim,
                    ada_norm_continous_conditioning_embedding_dim,
                    norm_elementwise_affine,
                    norm_eps,
                    ada_norm_bias,
                    "rms_norm",
                )
            else:
                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
                out_bias=attention_out_bias,
            )  # is self-attn if encoder_hidden_states is none
        else:
            # norm2 is still created (PixArt's ada_norm_single reuses it before the FF).
            self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
            self.attn2 = None

        # 3. Feed-forward
        if self.use_ada_layer_norm_continuous:
            self.norm3 = AdaLayerNormContinuous(
                dim,
                ada_norm_continous_conditioning_embedding_dim,
                norm_elementwise_affine,
                norm_eps,
                ada_norm_bias,
                "layer_norm",
            )
        elif not self.use_ada_layer_norm_single:
            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

        # 4. Fuser
        if attention_type == "gated" or attention_type == "gated-text-image":
            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)

        # 5. Scale-shift for PixArt-Alpha.
        if self.use_ada_layer_norm_single:
            # 6 modulation vectors: shift/scale/gate for attention and for the FF.
            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        timestep: Optional[torch.LongTensor] = None,
        cross_attention_kwargs: Dict[str, Any] = None,
        class_labels: Optional[torch.LongTensor] = None,
        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ) -> torch.FloatTensor:
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Self-Attention
        batch_size = hidden_states.shape[0]

        if self.use_ada_layer_norm:
            norm_hidden_states = self.norm1(hidden_states, timestep)
        elif self.use_ada_layer_norm_zero:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
            )
        elif self.use_layer_norm:
            norm_hidden_states = self.norm1(hidden_states)
        elif self.use_ada_layer_norm_continuous:
            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
        elif self.use_ada_layer_norm_single:
            # PixArt-Alpha style: `timestep` here is assumed to already be the
            # projected embedding of shape (batch, 6 * dim) — TODO confirm against caller.
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
            ).chunk(6, dim=1)
            norm_hidden_states = self.norm1(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
            norm_hidden_states = norm_hidden_states.squeeze(1)
        else:
            raise ValueError("Incorrect norm used")

        if self.pos_embed is not None:
            norm_hidden_states = self.pos_embed(norm_hidden_states)


        # 1. Retrieve lora scale.
        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0

        # 2. Prepare GLIGEN inputs
        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)

        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )
        if self.use_ada_layer_norm_zero:
            attn_output = gate_msa.unsqueeze(1) * attn_output
        elif self.use_ada_layer_norm_single:
            attn_output = gate_msa * attn_output

        hidden_states = attn_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        # 2.5 GLIGEN Control
        if gligen_kwargs is not None:
            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])

        # 3. Cross-Attention
        if self.attn2 is not None:
            if self.use_ada_layer_norm:
                norm_hidden_states = self.norm2(hidden_states, timestep)
            elif self.use_ada_layer_norm_zero or self.use_layer_norm:
                norm_hidden_states = self.norm2(hidden_states)
            elif self.use_ada_layer_norm_single:
                # For PixArt norm2 isn't applied here:
                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
                norm_hidden_states = hidden_states
            elif self.use_ada_layer_norm_continuous:
                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
            else:
                raise ValueError("Incorrect norm")

            if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
                norm_hidden_states = self.pos_embed(norm_hidden_states)

            attn_output = self.attn2(
                norm_hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                **cross_attention_kwargs,
            )
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        if self.use_ada_layer_norm_continuous:
            norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
        elif not self.use_ada_layer_norm_single:
            norm_hidden_states = self.norm3(hidden_states)

        if self.use_ada_layer_norm_zero:
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        if self.use_ada_layer_norm_single:
            # PixArt reuses norm2 ahead of the FF (there is no norm3 in this mode).
            norm_hidden_states = self.norm2(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

        if self._chunk_size is not None:
            # "feed_forward_chunk_size" can be used to save memory
            ff_output = _chunked_feed_forward(
                self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
            )
        else:
            ff_output = self.ff(norm_hidden_states, scale=lora_scale)

        if self.use_ada_layer_norm_zero:
            ff_output = gate_mlp.unsqueeze(1) * ff_output
        elif self.use_ada_layer_norm_single:
            ff_output = gate_mlp * ff_output

        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states
419
+
420
+
421
@maybe_allow_in_graph
class TemporalBasicTransformerBlock(nn.Module):
    r"""
    A basic Transformer block for video like data.

    Attention runs along the *time* axis: the input of shape
    (batch * frames, seq, channels) is transposed so each spatial position
    attends over its frames.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        time_mix_inner_dim (`int`): The number of channels for temporal attention.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
    """

    def __init__(
        self,
        dim: int,
        time_mix_inner_dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        cross_attention_dim: Optional[int] = None,
    ):
        super().__init__()
        # Residual connections are only valid when input/inner widths match.
        self.is_res = dim == time_mix_inner_dim

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        # Fix: `self.norm_in` was assigned twice in the original; the duplicate
        # (behaviorally inert) assignment has been removed.
        self.norm_in = nn.LayerNorm(dim)
        self.ff_in = FeedForward(
            dim,
            dim_out=time_mix_inner_dim,
            activation_fn="geglu",
        )

        self.norm1 = nn.LayerNorm(time_mix_inner_dim)
        self.attn1 = Attention(
            query_dim=time_mix_inner_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            cross_attention_dim=None,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None:
            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
            # the second cross attention block.
            self.norm2 = nn.LayerNorm(time_mix_inner_dim)
            self.attn2 = Attention(
                query_dim=time_mix_inner_dim,
                cross_attention_dim=cross_attention_dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
            )  # is self-attn if encoder_hidden_states is none
        else:
            self.norm2 = None
            self.attn2 = None

        # 3. Feed-forward
        self.norm3 = nn.LayerNorm(time_mix_inner_dim)
        self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = None

    def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        # chunk dim should be hardcoded to 1 to have better speed vs. memory trade-off
        self._chunk_dim = 1

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        num_frames: int,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Reshape (batch*frames, seq, C) -> (batch*seq, frames, C) so attention mixes time.
        batch_frames, seq_length, channels = hidden_states.shape
        batch_size = batch_frames // num_frames

        hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, seq_length, channels)
        hidden_states = hidden_states.permute(0, 2, 1, 3)
        hidden_states = hidden_states.reshape(batch_size * seq_length, num_frames, channels)

        residual = hidden_states
        hidden_states = self.norm_in(hidden_states)

        if self._chunk_size is not None:
            hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size)
        else:
            hidden_states = self.ff_in(hidden_states)

        if self.is_res:
            hidden_states = hidden_states + residual

        # 1. Self-Attention (temporal)
        norm_hidden_states = self.norm1(hidden_states)
        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
        hidden_states = attn_output + hidden_states

        # 3. Cross-Attention
        if self.attn2 is not None:
            norm_hidden_states = self.norm2(hidden_states)
            attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        norm_hidden_states = self.norm3(hidden_states)

        if self._chunk_size is not None:
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)

        if self.is_res:
            hidden_states = ff_output + hidden_states
        else:
            hidden_states = ff_output

        # Restore the original (batch*frames, seq, C) layout.
        hidden_states = hidden_states[None, :].reshape(batch_size, seq_length, num_frames, channels)
        hidden_states = hidden_states.permute(0, 2, 1, 3)
        hidden_states = hidden_states.reshape(batch_size * num_frames, seq_length, channels)

        return hidden_states
550
+
551
+
552
class SkipFFTransformerBlock(nn.Module):
    """Transformer block with two attention layers and *no* feed-forward ("skip FF").

    Both attention layers attend to `encoder_hidden_states`; when the encoder
    width differs from `dim`, a SiLU + linear `kv_mapper` first projects it.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        kv_input_dim: int,
        kv_input_dim_proj_use_bias: bool,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        attention_out_bias: bool = True,
    ):
        super().__init__()

        # Project encoder states to `dim` only when widths differ.
        if kv_input_dim == dim:
            self.kv_mapper = None
        else:
            self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias)

        self.norm1 = RMSNorm(dim, 1e-06)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim,
            out_bias=attention_out_bias,
        )

        self.norm2 = RMSNorm(dim, 1e-06)
        self.attn2 = Attention(
            query_dim=dim,
            cross_attention_dim=cross_attention_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            out_bias=attention_out_bias,
        )

    def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs):
        attn_kwargs = {} if cross_attention_kwargs is None else cross_attention_kwargs.copy()

        if self.kv_mapper is not None:
            encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states))

        # Two identical pre-norm residual attention stages.
        for norm, attn in ((self.norm1, self.attn1), (self.norm2, self.attn2)):
            attn_output = attn(
                norm(hidden_states),
                encoder_hidden_states=encoder_hidden_states,
                **attn_kwargs,
            )
            hidden_states = attn_output + hidden_states

        return hidden_states
622
+
623
+
624
class FeedForward(nn.Module):
    r"""
    A feed-forward layer.

    Parameters:
        dim (`int`): The number of channels in the input.
        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
            One of `"gelu"`, `"gelu-approximate"`, `"geglu"`, `"geglu-approximate"`.
        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.

    Raises:
        ValueError: If `activation_fn` is not one of the supported activations.
    """

    def __init__(
        self,
        dim: int,
        dim_out: Optional[int] = None,
        mult: int = 4,
        dropout: float = 0.0,
        activation_fn: str = "geglu",
        final_dropout: bool = False,
        inner_dim=None,
        bias: bool = True,
    ):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim

        # Fix: the original used `if` + `if/elif`, which left `act_fn` unbound
        # (NameError) for an unrecognized `activation_fn`; use a single chain
        # and fail fast with a clear error instead.
        if activation_fn == "gelu":
            act_fn = GELU(dim, inner_dim, bias=bias)
        elif activation_fn == "gelu-approximate":
            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
        elif activation_fn == "geglu":
            act_fn = GEGLU(dim, inner_dim, bias=bias)
        elif activation_fn == "geglu-approximate":
            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
        else:
            raise ValueError(f"Unsupported activation_fn: {activation_fn!r}")

        linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear

        self.net = nn.ModuleList([])
        # project in
        self.net.append(act_fn)
        # project dropout
        self.net.append(nn.Dropout(dropout))
        # project out
        self.net.append(linear_cls(inner_dim, dim_out, bias=bias))
        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
        if final_dropout:
            self.net.append(nn.Dropout(dropout))

    def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
        # Only LoRA-aware modules accept the `scale` keyword; plain modules do not.
        compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear)
        for module in self.net:
            if isinstance(module, compatible_cls):
                hidden_states = module(hidden_states, scale)
            else:
                hidden_states = module(hidden_states)
        return hidden_states
MuCodec/models/transformer_2d_flow.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ import math
16
+ from typing import Any, Dict, Optional, Tuple
17
+
18
+ import torch
19
+ import torch.nn.functional as F
20
+ from torch import nn
21
+
22
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
23
+ from diffusers.models.embeddings import ImagePositionalEmbeddings
24
+ from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, is_torch_version
25
+ from models.attention import BasicTransformerBlock
26
+ from diffusers.models.embeddings import PatchEmbed, PixArtAlphaTextProjection
27
+ from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
28
+ from diffusers.models.modeling_utils import ModelMixin
29
+ from diffusers.models.embeddings import TimestepEmbedding
30
+
31
class PixArtAlphaCombinedFlowEmbeddings(nn.Module):
    """
    Flow-matching variant of the PixArt-Alpha combined timestep/size embeddings.

    Continuous flow times are lifted to a 512-dim sinusoidal feature vector and
    projected by an MLP; optional resolution / aspect-ratio conditions are
    embedded and added to the timestep embedding.

    Reference:
    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
    """

    def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False):
        super().__init__()

        self.flow_t_size = 512  # width of the raw sinusoidal features
        self.outdim = size_emb_dim
        self.timestep_embedder = TimestepEmbedding(in_channels=self.flow_t_size, time_embed_dim=embedding_dim)

        self.use_additional_conditions = use_additional_conditions
        if use_additional_conditions:
            # Fix: `Timesteps` was never imported at module level, so enabling
            # this branch raised NameError; import it where it is needed.
            from diffusers.models.embeddings import Timesteps

            self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
            self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
            self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)

    # https://github.com/atong01/conditional-flow-matching/blob/main/torchcfm/models/unet/nn.py#L87
    def timestep_embedding(self, timesteps, max_period=10000, scale=1000):
        """Create sinusoidal timestep embeddings.

        :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional.
        :param max_period: controls the minimum frequency of the embeddings.
        :param scale: multiplier applied to `timesteps` before the sinusoids
            (flow times presumably live in [0, 1], so this maps them into a
            diffusion-like range — TODO confirm against the caller).
        :return: an [N x flow_t_size] Tensor of positional embeddings.
        """
        half = self.flow_t_size // 2
        freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, device=timesteps.device) / half).type(timesteps.type())
        args = timesteps[:, None] * freqs[None] * scale
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if self.flow_t_size % 2:
            # Odd target width: pad with one zero column.
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
        """Return the conditioning vector for `timestep` (plus optional size conditions)."""
        timesteps_proj = self.timestep_embedding(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)

        if self.use_additional_conditions:
            resolution_emb = self.additional_condition_proj(resolution.flatten()).to(hidden_dtype)
            resolution_emb = self.resolution_embedder(resolution_emb).reshape(batch_size, -1)
            aspect_ratio_emb = self.additional_condition_proj(aspect_ratio.flatten()).to(hidden_dtype)
            aspect_ratio_emb = self.aspect_ratio_embedder(aspect_ratio_emb).reshape(batch_size, -1)
            conditioning = timesteps_emb + torch.cat([resolution_emb, aspect_ratio_emb], dim=1)
        else:
            conditioning = timesteps_emb

        return conditioning
83
+
84
class AdaLayerNormSingleFlow(nn.Module):
    r"""
    Adaptive layer-norm single (adaLN-single) conditioning head.

    Produces the six modulation chunks consumed by PixArt-style transformer
    blocks from a flow-matching timestep, as proposed in PixArt-Alpha
    (see: https://arxiv.org/abs/2310.00426; Section 2.3).

    Parameters:
        embedding_dim (`int`): The size of each embedding vector.
        use_additional_conditions (`bool`): To use additional conditions for normalization or not.
    """

    def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
        super().__init__()

        self.emb = PixArtAlphaCombinedFlowEmbeddings(
            embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
        )

        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)

    def forward(
        self,
        timestep: torch.Tensor,
        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
        batch_size: Optional[int] = None,
        hidden_dtype: Optional[torch.dtype] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # Embed the timestep (plus any extra conditions); no modulation happens here.
        embedded = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
        # Project to the 6 * dim modulation vector expected by the blocks.
        modulation = self.linear(self.silu(embedded))
        return modulation, embedded
115
+
116
+
117
@dataclass
class Transformer2DModelOutput(BaseOutput):
    """
    Output container for [`Transformer2DModel`].

    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
            The hidden states output conditioned on the `encoder_hidden_states` input. For discrete
            (vectorized) inputs this holds probability distributions for the unnoised latent pixels.
    """

    sample: torch.FloatTensor
129
+
130
+
131
class Transformer2DModel(ModelMixin, ConfigMixin):
    """
    A 2D Transformer model for image-like data.

    NOTE(review): this is a modified copy of diffusers' `Transformer2DModel` in which the
    `ada_norm_single` path uses the flow-matching variants `AdaLayerNormSingleFlow` /
    `PixArtAlphaCombinedFlowEmbeddings` defined above instead of the diffusion-step embeddings.

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            The number of channels in the input and output (specify if the input is **continuous**).
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
            This is fixed during training since it is used to learn a number of position embeddings.
        num_vector_embeds (`int`, *optional*):
            The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
            Includes the class for the masked latent pixel.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
        num_embeds_ada_norm ( `int`, *optional*):
            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
            added to the hidden states.

            During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
        attention_bias (`bool`, *optional*):
            Configure if the `TransformerBlocks` attention should contain a bias parameter.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        num_attention_heads: int = 16,
        attention_head_dim: int = 88,
        in_channels: Optional[int] = None,
        out_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        norm_num_groups: int = 32,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        sample_size: Optional[int] = None,
        num_vector_embeds: Optional[int] = None,
        patch_size: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        use_linear_projection: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_type: str = "layer_norm",
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
        attention_type: str = "default",
        caption_channels: Optional[int] = None,
    ):
        super().__init__()
        self.use_linear_projection = use_linear_projection
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        inner_dim = num_attention_heads * attention_head_dim

        # LoRA-compatible layer classes are only needed when PEFT is not managing adapters.
        conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
        linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear

        # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
        # Define whether input is continuous or discrete depending on configuration
        self.is_input_continuous = (in_channels is not None) and (patch_size is None)
        self.is_input_vectorized = num_vector_embeds is not None
        self.is_input_patches = in_channels is not None and patch_size is not None

        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
            deprecation_message = (
                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
                " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
            )
            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
            norm_type = "ada_norm"

        # The three input modes are mutually exclusive; fail fast on invalid combinations.
        if self.is_input_continuous and self.is_input_vectorized:
            raise ValueError(
                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
                " sure that either `in_channels` or `num_vector_embeds` is None."
            )
        elif self.is_input_vectorized and self.is_input_patches:
            raise ValueError(
                f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
                " sure that either `num_vector_embeds` or `num_patches` is None."
            )
        elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
            raise ValueError(
                f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
                f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
            )

        # 2. Define input layers
        if self.is_input_continuous:
            self.in_channels = in_channels

            self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
            if use_linear_projection:
                self.proj_in = linear_cls(in_channels, inner_dim)
            else:
                self.proj_in = conv_cls(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
        elif self.is_input_vectorized:
            assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
            assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"

            self.height = sample_size
            self.width = sample_size
            self.num_vector_embeds = num_vector_embeds
            self.num_latent_pixels = self.height * self.width

            self.latent_image_embedding = ImagePositionalEmbeddings(
                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
            )
        elif self.is_input_patches:
            assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"

            self.height = sample_size
            self.width = sample_size

            self.patch_size = patch_size
            interpolation_scale = self.config.sample_size // 64  # => 64 (= 512 pixart) has interpolation scale 1
            interpolation_scale = max(interpolation_scale, 1)
            self.pos_embed = PatchEmbed(
                height=sample_size,
                width=sample_size,
                patch_size=patch_size,
                in_channels=in_channels,
                embed_dim=inner_dim,
                interpolation_scale=interpolation_scale,
            )

        # 3. Define transformers blocks
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
                    activation_fn=activation_fn,
                    num_embeds_ada_norm=num_embeds_ada_norm,
                    attention_bias=attention_bias,
                    only_cross_attention=only_cross_attention,
                    double_self_attention=double_self_attention,
                    upcast_attention=upcast_attention,
                    norm_type=norm_type,
                    norm_elementwise_affine=norm_elementwise_affine,
                    norm_eps=norm_eps,
                    attention_type=attention_type,
                )
                for d in range(num_layers)
            ]
        )

        # 4. Define output layers
        self.out_channels = in_channels if out_channels is None else out_channels
        if self.is_input_continuous:
            # TODO: should use out_channels for continuous projections
            if use_linear_projection:
                self.proj_out = linear_cls(inner_dim, in_channels)
            else:
                self.proj_out = conv_cls(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
        elif self.is_input_vectorized:
            self.norm_out = nn.LayerNorm(inner_dim)
            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
        elif self.is_input_patches and norm_type != "ada_norm_single":
            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
            self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
            self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
        elif self.is_input_patches and norm_type == "ada_norm_single":
            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
            # Learned global shift/scale added to the per-timestep modulation.
            self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
            self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)

        # 5. PixArt-Alpha blocks.
        self.adaln_single = None
        self.use_additional_conditions = False
        if norm_type == "ada_norm_single":
            self.use_additional_conditions = self.config.sample_size == 128
            # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
            # additional conditions until we find better name
            self.adaln_single = AdaLayerNormSingleFlow(inner_dim, use_additional_conditions=self.use_additional_conditions)

        self.caption_projection = None
        if caption_channels is not None:
            self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)

        self.gradient_checkpointing = False

    def _set_gradient_checkpointing(self, module, value=False):
        # Toggle activation checkpointing on any submodule that supports it.
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        timestep: Optional[torch.LongTensor] = None,
        added_cond_kwargs: Dict[str, torch.Tensor] = None,
        class_labels: Optional[torch.LongTensor] = None,
        cross_attention_kwargs: Dict[str, Any] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ):
        """
        The [`Transformer2DModel`] forward method.

        Args:
            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
                Input `hidden_states`.
            encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            timestep ( `torch.LongTensor`, *optional*):
                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
                `AdaLayerZeroNorm`.
            cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            attention_mask ( `torch.Tensor`, *optional*):
                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
                negative values to the attention scores corresponding to "discard" tokens.
            encoder_attention_mask ( `torch.Tensor`, *optional*):
                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:

                * Mask `(batch, sequence_length)` True = keep, False = discard.
                * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.

                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
                above. This bias will be added to the cross-attention scores.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
        # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
        # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
        # expects mask of shape:
        #   [batch, key_tokens]
        # adds singleton query_tokens dimension:
        #   [batch, 1, key_tokens]
        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
        #   [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
        if attention_mask is not None and attention_mask.ndim == 2:
            # assume that mask is expressed as:
            #   (1 = keep, 0 = discard)
            # convert mask into a bias that can be added to attention scores:
            #   (keep = +0, discard = -10000.0)
            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # convert encoder_attention_mask to a bias the same way we do for attention_mask
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        # Retrieve lora scale.
        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0

        # 1. Input
        if self.is_input_continuous:
            batch, _, height, width = hidden_states.shape
            residual = hidden_states

            hidden_states = self.norm(hidden_states)
            if not self.use_linear_projection:
                # conv projection first, then flatten spatial dims into a token axis
                hidden_states = (
                    self.proj_in(hidden_states, scale=lora_scale)
                    if not USE_PEFT_BACKEND
                    else self.proj_in(hidden_states)
                )
                inner_dim = hidden_states.shape[1]
                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
            else:
                # flatten first, then linear projection
                inner_dim = hidden_states.shape[1]
                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
                hidden_states = (
                    self.proj_in(hidden_states, scale=lora_scale)
                    if not USE_PEFT_BACKEND
                    else self.proj_in(hidden_states)
                )

        elif self.is_input_vectorized:
            hidden_states = self.latent_image_embedding(hidden_states)
        elif self.is_input_patches:
            height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
            hidden_states = self.pos_embed(hidden_states)

            # NOTE(review): nesting reconstructed to match upstream diffusers — the
            # adaln_single conditioning is computed only for patched inputs here.
            # Confirm against the MuCodec transformer config, since continuous inputs
            # would skip this and pass the raw `timestep` into the blocks.
            if self.adaln_single is not None:
                if self.use_additional_conditions and added_cond_kwargs is None:
                    raise ValueError(
                        "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
                    )
                batch_size = hidden_states.shape[0]
                timestep, embedded_timestep = self.adaln_single(
                    timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
                )

        # 2. Blocks
        if self.caption_projection is not None:
            batch_size = hidden_states.shape[0]
            encoder_hidden_states = self.caption_projection(encoder_hidden_states)
            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])

        for block in self.transformer_blocks:
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    timestep,
                    cross_attention_kwargs,
                    class_labels,
                    **ckpt_kwargs,
                )
            else:
                hidden_states = block(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    timestep=timestep,
                    cross_attention_kwargs=cross_attention_kwargs,
                    class_labels=class_labels,
                )

        # 3. Output
        if self.is_input_continuous:
            if not self.use_linear_projection:
                # un-flatten back to (B, C, H, W) before the conv projection
                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
                hidden_states = (
                    self.proj_out(hidden_states, scale=lora_scale)
                    if not USE_PEFT_BACKEND
                    else self.proj_out(hidden_states)
                )
            else:
                hidden_states = (
                    self.proj_out(hidden_states, scale=lora_scale)
                    if not USE_PEFT_BACKEND
                    else self.proj_out(hidden_states)
                )
                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()

            # residual connection around the whole transformer stack
            output = hidden_states + residual
        elif self.is_input_vectorized:
            hidden_states = self.norm_out(hidden_states)
            logits = self.out(hidden_states)
            # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
            logits = logits.permute(0, 2, 1)

            # log(p(x_0))
            output = F.log_softmax(logits.double(), dim=1).float()

        if self.is_input_patches:
            if self.config.norm_type != "ada_norm_single":
                conditioning = self.transformer_blocks[0].norm1.emb(
                    timestep, class_labels, hidden_dtype=hidden_states.dtype
                )
                shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
                hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
                hidden_states = self.proj_out_2(hidden_states)
            elif self.config.norm_type == "ada_norm_single":
                shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
                hidden_states = self.norm_out(hidden_states)
                # Modulation
                hidden_states = hidden_states * (1 + scale) + shift
                hidden_states = self.proj_out(hidden_states)
                hidden_states = hidden_states.squeeze(1)

            # unpatchify
            if self.adaln_single is None:
                height = width = int(hidden_states.shape[1] ** 0.5)
            hidden_states = hidden_states.reshape(
                shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
            )
            hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
            output = hidden_states.reshape(
                shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
            )

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
MuCodec/mp3_to_code.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from pathlib import Path
4
+ import tempfile
5
+ import traceback
6
+
7
+ import numpy as np
8
+ import torch
9
+ from tqdm import tqdm
10
+
11
+
12
def parse_args():
    """Parse CLI options for batch MP3 -> MuCodec code encoding."""
    arg_parser = argparse.ArgumentParser(
        description="Batch encode MP3 files to MuCodec codes (recursive)."
    )
    # Positional paths: source tree to scan and mirror-destination for codes.
    arg_parser.add_argument("input_dir", type=Path, help="Input folder (recursive scan)")
    arg_parser.add_argument("output_dir", type=Path, help="Output folder for saved codes")

    default_ckpt = Path(__file__).resolve().parent / "ckpt" / "mucodec.pt"
    arg_parser.add_argument("--ckpt", type=Path, default=default_ckpt, help="Path to MuCodec checkpoint")
    arg_parser.add_argument("--layer-num", type=int, default=7, help="MuCodec layer num (default follows generate.py)")
    arg_parser.add_argument("--device", default="cuda:0", help="Torch device, e.g. cuda:0")
    arg_parser.add_argument("--ext", nargs="+", default=[".mp3"], help="Audio extensions to include, e.g. .mp3 .wav .flac")
    arg_parser.add_argument(
        "--format",
        choices=["npz", "pt", "npy", "both", "all"],
        default="npz",
        help="Output format for code files",
    )
    arg_parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Recompute files even if output already exists (disable resume)",
    )
    arg_parser.add_argument("--strict", action="store_true", help="Stop immediately on first failed file")
    return arg_parser.parse_args()
58
+
59
+
60
def list_audio_files(root: Path, exts):
    """Recursively collect files under ``root`` whose suffix matches ``exts``.

    Extensions are matched case-insensitively and may be passed with or
    without a leading dot. Results are sorted for deterministic processing.
    """
    wanted = {ext.lower() if ext.startswith(".") else f".{ext.lower()}" for ext in exts}
    return sorted(
        candidate
        for candidate in root.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in wanted
    )
69
+
70
+
71
def expected_output_paths(output_stem: Path, fmt: str):
    """Return the output file paths implied by ``fmt`` for a given stem.

    Raises:
        ValueError: if ``fmt`` is not one of the supported format names.
    """
    suffixes_by_format = {
        "npz": (".npz",),
        "pt": (".pt",),
        "npy": (".npy",),
        "both": (".pt", ".npy"),
        "all": (".npz", ".pt", ".npy"),
    }
    if fmt not in suffixes_by_format:
        raise ValueError(f"Unsupported format: {fmt}")
    return [output_stem.with_suffix(suffix) for suffix in suffixes_by_format[fmt]]
87
+
88
+
89
def save_npz_atomic(codes_np: np.ndarray, output_path: Path):
    """Atomically write ``codes_np`` to ``output_path`` as a compressed .npz.

    The archive is first written to a temporary file in the destination
    directory and then moved into place with ``os.replace``, so readers never
    observe a partially-written file. The temp file is removed on failure.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_file = tempfile.NamedTemporaryFile(
        mode="wb", suffix=".npz", dir=output_path.parent, delete=False
    )
    tmp_path = Path(tmp_file.name)
    try:
        with tmp_file:
            np.savez_compressed(tmp_file, codes=codes_np)
        os.replace(tmp_path, output_path)
    except Exception:
        if tmp_path.exists():
            tmp_path.unlink()
        raise
106
+
107
+
108
def save_codes(codes: torch.Tensor, output_stem: Path, fmt: str):
    """Persist ``codes`` next to ``output_stem`` in the requested format(s).

    ``fmt`` follows the same names as :func:`expected_output_paths`; the
    "both"/"all" values write multiple files.
    """
    cpu_codes = codes.detach().cpu()
    numpy_codes = cpu_codes.numpy()
    if fmt in {"npz", "all"}:
        save_npz_atomic(numpy_codes, output_stem.with_suffix(".npz"))
    if fmt in {"pt", "both", "all"}:
        torch.save(cpu_codes, output_stem.with_suffix(".pt"))
    if fmt in {"npy", "both", "all"}:
        np.save(output_stem.with_suffix(".npy"), numpy_codes)
117
+
118
+
119
def main():
    """CLI entry point: recursively encode audio under ``input_dir`` into MuCodec
    codes, mirroring the directory layout under ``output_dir``.

    Supports resume (skips files whose expected outputs already exist unless
    ``--overwrite`` is given), several output formats, and a ``--strict`` mode
    that aborts on the first failure. Prints a summary and any failed files.
    """
    args = parse_args()

    # Imported lazily so argument parsing / --help works without the heavy model stack.
    from generate import MuCodec

    if not args.input_dir.exists() or not args.input_dir.is_dir():
        raise ValueError(f"input_dir does not exist or is not a directory: {args.input_dir}")

    if not args.ckpt.exists():
        raise FileNotFoundError(f"Checkpoint not found: {args.ckpt}")

    if args.device.startswith("cuda") and not torch.cuda.is_available():
        raise RuntimeError("CUDA device requested but torch.cuda.is_available() is False")

    audio_files = list_audio_files(args.input_dir, args.ext)
    if not audio_files:
        print("No audio files found.")
        return

    args.output_dir.mkdir(parents=True, exist_ok=True)

    mucodec = MuCodec(
        model_path=str(args.ckpt),
        layer_num=args.layer_num,
        load_main_model=True,
        device=args.device,
    )

    resume_enabled = not args.overwrite
    ok = 0
    skipped = 0
    failed = []  # (source_path, error_message) pairs, reported at the end

    for src in tqdm(audio_files, desc="Encoding", unit="file"):
        # Mirror the input tree under output_dir, dropping the audio suffix.
        rel = src.relative_to(args.input_dir)
        output_stem = (args.output_dir / rel).with_suffix("")
        output_paths = expected_output_paths(output_stem, args.format)

        # Resume: skip only when every expected output for the chosen format exists.
        if resume_enabled and all(p.exists() for p in output_paths):
            skipped += 1
            continue

        output_stem.parent.mkdir(parents=True, exist_ok=True)

        try:
            codes = mucodec.file2code(str(src))
            save_codes(codes, output_stem, args.format)
            ok += 1
        except Exception as e:
            # Best-effort batch: record the failure and keep going unless --strict.
            failed.append((src, str(e)))
            print(f"[FAILED] {src}: {e}")
            if args.strict:
                print("--strict enabled, stopping on first failure.")
                traceback.print_exc()
                break

    print(
        "Done. "
        f"success={ok}, skipped={skipped}, failed={len(failed)}, total={len(audio_files)}"
    )
    if failed:
        print("Failed files:")
        for path, err in failed:
            print(f"- {path}: {err}")


if __name__ == "__main__":
    main()
187
+
MuCodec/muq_dev/test.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from dataclasses import dataclass
3
+ import fairseq
4
+ import os.path as op
5
+
6
+ root = op.dirname(op.abspath(__file__))
7
+
8
+
9
@dataclass
class UserDirModule:
    # Minimal stand-in for a fairseq argparse namespace:
    # fairseq.utils.import_user_module() only reads the `user_dir` attribute.
    user_dir: str
13
def load_model(model_dir, checkpoint_dir):
    '''Load Fairseq SSL model'''

    # Register the user module directory so fairseq can resolve custom tasks/models.
    user_module = UserDirModule(model_dir)
    fairseq.utils.import_user_module(user_module)

    # Non-strict load tolerates checkpoints with extra/missing keys.
    ensemble, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [checkpoint_dir], strict=False
    )
    return ensemble[0]
MuCodec/readme.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MuCodec: Ultra Low-Bitrate Music Codec
2
+
3
+ This repository is the official code repository for MuCodec: Ultra Low-Bitrate Music Codec. You can find our paper on [arXiv](https://arxiv.org/pdf/2409.13216). The demo page is available [here](https://xuyaoxun.github.io/MuCodec_demo/).
4
+
5
+ In this repository, we provide the MuCodec model, inference scripts, and the checkpoint that has been trained on the Million Song Dataset. Specifically, we have released the model and inference code corresponding to the lowest bitrate of 0.35 kbps as mentioned in the paper, to demonstrate the effectiveness of our work.
6
+
7
+
8
+ MuCodec supports 48kHz, dual-channel (stereo) audio reconstruction. If the original audio is in a different format, it will first be converted to 48kHz, dual-channel audio.
9
+
10
+ ## Installation
11
+
12
+ You can install the necessary dependencies using the `requirements.txt` file with Python 3.8.12:
13
+
14
+ ```bash
15
+ pip install -r requirements.txt
16
+ ```
17
+
18
+ Due to storage limitations, we have saved the model checkpoints on Hugging Face at https://huggingface.co/yaoxunxu/mucodec. You can easily download the models from Hugging Face and save them in the following directories:
19
+
20
+ - Save `audioldm_48k.pth` in the `tools` folder.
21
+ - Save `muq.pt` in the `muq_dev` folder.
22
+ - Save `mucodec.pt` in the `ckpt` folder.
23
+
24
+ Please note that all three checkpoints must be downloaded completely for the model to load correctly. The final file paths should be:
25
+
26
+ ```
27
+ tools/audioldm_48k.pth
28
+ muq_dev/muq.pt
29
+ ckpt/mucodec.pt
30
+ ```
31
+
32
+ The file `audioldm_48k.pth` is sourced from https://huggingface.co/haoheliu/audioldm_48k/blob/main/audioldm_48k.pth.
33
+
34
+ ## Inference
35
+
36
+ To run inference, use the following command:
37
+
38
+ ```bash
39
+ python3 generate.py
40
+ ```
41
+
42
+ We have provided a sample song `test.wav`, randomly sampled from the Million Song Dataset, in the `test_wav` folder. The default input path is `test_wav/test.wav`, and the output path for the reconstructed audio is `reconstruct/test.wav`.
43
+
44
+ In the `generate.py` file, we have implemented several functions to facilitate the music compression and reconstruction process. You can easily obtain compressed tokens from audio using the `sound2code` function, and reconstruct the audio from tokens using the `code2sound` function.
45
+
46
+ ## Note
47
+
48
+ Please note that the open-sourced model was trained solely on the Million Song Dataset. Considering the quality issues of this dataset, the open-sourced model may not achieve the same performance as demonstrated in the demo. Unfortunately, due to copyright restrictions, we are unable to release the checkpoints trained on additional datasets. However, you can use your own dataset to further train the model and achieve better results.
49
+
50
+ ## License
51
+
52
+ The code in this repository is released under the MIT license as found in the [LICENSE](LICENSE) file.
53
+
54
+ The model weights (muq.pt, mucodec.pt) in this repository are released under the CC-BY-NC 4.0 license, as detailed in the [LICENSE_weights](LICENSE_weights) file.
55
+
56
+ ## Citation
57
+
58
+ If you find our work useful, please cite our paper:
59
+
60
+ ```bibtex
61
+ @article{xu2024mucodec,
62
+ title={MuCodec: Ultra Low-Bitrate Music Codec},
63
+ author={Xu, Yaoxun and Chen, Hangting and Yu, Jianwei and Tan, Wei and Gu, Rongzhi and Lei, Shun and Lin, Zhiwei and Wu, Zhiyong},
64
+ journal={arXiv preprint arXiv:2409.13216},
65
+ year={2024}
66
+ }
67
+ ```
MuCodec/requirements.txt ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.0.0
2
+ accelerate==0.30.1
3
+ aeiou==0.0.20
4
+ aiobotocore==2.13.1
5
+ aiofiles==23.2.1
6
+ aiohttp==3.9.3
7
+ aioitertools==0.11.0
8
+ aiosignal==1.3.1
9
+ alias-free-torch==0.0.6
10
+ altair==5.3.0
11
+ annotated-types==0.6.0
12
+ antlr4-python3-runtime==4.8
13
+ anyio==4.3.0
14
+ appdirs==1.4.4
15
+ argbind==0.3.9
16
+ asttokens==2.4.1
17
+ astunparse==1.6.3
18
+ async-timeout==4.0.3
19
+ attrs==23.1.0
20
+ audioread==3.0.1
21
+ auraloss==0.4.0
22
+ av==11.0.0
23
+ backcall==0.2.0
24
+ beartype==0.18.5
25
+ bitarray==2.9.2
26
+ bleach==6.1.0
27
+ blis==0.7.11
28
+ bokeh==3.1.1
29
+ botocore==1.34.131
30
+ braceexpand==0.1.7
31
+ cachetools==5.3.2
32
+ catalogue==2.0.10
33
+ certifi==2023.11.17
34
+ cffi==1.16.0
35
+ charset-normalizer==3.3.2
36
+ clean-fid==0.1.35
37
+ click==8.1.7
38
+ clip-anytorch==2.6.0
39
+ cloudpathlib==0.16.0
40
+ cloudpickle==3.0.0
41
+ cn2an==0.5.22
42
+ colorama==0.4.6
43
+ colorcet==3.1.0
44
+ colorlog==6.8.2
45
+ confection==0.1.4
46
+ configparser==7.0.0
47
+ contourpy==1.1.1
48
+ cycler==0.12.1
49
+ cymem==2.0.8
50
+ Cython==3.0.10
51
+ dataclasses==0.6
52
+ datasets
53
+ dctorch==0.1.2
54
+ decorator==5.1.1
55
+ decord==0.6.0
56
+ deepspeed==0.14.0
57
+ demucs==4.0.1
58
+ descript-audio-codec==1.0.0
59
+ descript-audiotools==0.7.2
60
+ diffusers==0.27.2
61
+ dill==0.3.8
62
+ Distance==0.1.3
63
+ docker-pycreds==0.4.0
64
+ docopt==0.6.2
65
+ docstring_parser==0.16
66
+ dora_search==0.1.12
67
+ einops==0.7.0
68
+ einops-exts==0.0.4
69
+ einx==0.3.0
70
+ ema-pytorch==0.2.3
71
+ encodec==0.1.1
72
+ exceptiongroup==1.2.0
73
+ executing==2.0.1
74
+ expecttest==0.1.6
75
+ fairseq==0.12.2
76
+ fastapi==0.110.3
77
+ fastcore==1.6.3
78
+ ffmpy==0.3.2
79
+ filelock==3.13.1
80
+ fire==0.6.0
81
+ flashy==0.0.2
82
+ flatten-dict==0.4.2
83
+ fonttools==4.49.0
84
+ frozendict==2.4.4
85
+ frozenlist==1.4.1
86
+ fsspec==2024.6.1
87
+ ftfy==6.1.3
88
+ future==1.0.0
89
+ g2p-en==2.1.0
90
+ gin-config==0.5.0
91
+ gitdb==4.0.11
92
+ GitPython==3.1.43
93
+ google-auth==2.23.4
94
+ google-auth-oauthlib==1.0.0
95
+ gradio==4.26.0
96
+ gradio_client==0.15.1
97
+ grpcio==1.59.3
98
+ h11==0.14.0
99
+ h5py==3.11.0
100
+ hjson==3.1.0
101
+ holoviews==1.17.1
102
+ httpcore==1.0.5
103
+ httpx==0.27.0
104
+ huggingface-hub==0.23.5
105
+ hydra-colorlog==1.2.0
106
+ hydra-core==1.0.7
107
+ hypothesis==6.90.0
108
+ idna==3.4
109
+ imageio==2.34.2
110
+ importlib-metadata==6.8.0
111
+ importlib-resources==5.12.0
112
+ inflect==7.0.0
113
+ ipython==8.12.3
114
+ jedi==0.19.1
115
+ jieba-fast==0.53
116
+ Jinja2==3.1.2
117
+ jmespath==1.0.1
118
+ joblib==1.3.2
119
+ json5==0.9.25
120
+ jsonlines==4.0.0
121
+ jsonmerge==1.9.2
122
+ jsonschema==4.22.0
123
+ jsonschema-specifications==2023.12.1
124
+ julius==0.2.7
125
+ k-diffusion==0.1.1
126
+ kaldiio==2.18.0
127
+ kiwisolver==1.4.5
128
+ kornia==0.7.3
129
+ kornia_rs==0.1.5
130
+ laion-clap==1.1.4
131
+ lameenc==1.7.0
132
+ langcodes==3.4.0
133
+ language_data==1.2.0
134
+ lazy_loader==0.3
135
+ librosa==0.9.2
136
+ lightning==2.2.1
137
+ lightning-utilities==0.10.1
138
+ linkify-it-py==2.0.3
139
+ lion-pytorch==0.2.2
140
+ llvmlite==0.41.1
141
+ local-attention==1.8.6
142
+ loguru==0.7.2
143
+ lxml==5.2.2
144
+ marisa-trie==1.1.1
145
+ Markdown==3.5.1
146
+ markdown-it-py==3.0.0
147
+ markdown2==2.5.0
148
+ MarkupSafe==2.1.3
149
+ matplotlib==3.7.5
150
+ matplotlib-inline==0.1.7
151
+ mdit-py-plugins==0.4.1
152
+ mdurl==0.1.2
153
+ mpmath==1.3.0
154
+ msgpack==1.0.8
155
+ multidict==6.0.5
156
+ multiprocess==0.70.16
157
+ murmurhash==1.0.10
158
+ mypy-extensions==1.0.0
159
+ networkx==3.1
160
+ ninja==1.11.1.1
161
+ nltk==3.8.1
162
+ nnAudio==0.3.3
163
+ num2words==0.5.13
164
+ numba==0.58.1
165
+ numpy==1.23.5
166
+ nvidia-cublas-cu11==11.11.3.6
167
+ nvidia-cuda-cupti-cu11==11.8.87
168
+ nvidia-cuda-nvrtc-cu11==11.8.89
169
+ nvidia-cuda-runtime-cu11==11.8.89
170
+ nvidia-cudnn-cu11==8.7.0.84
171
+ nvidia-cufft-cu11==10.9.0.58
172
+ nvidia-curand-cu11==10.3.0.86
173
+ nvidia-cusolver-cu11==11.4.1.48
174
+ nvidia-cusparse-cu11==11.7.5.86
175
+ nvidia-nccl-cu11==2.19.3
176
+ nvidia-nvtx-cu11==11.8.86
177
+ oauthlib==3.2.2
178
+ omegaconf
179
+ opencv-contrib-python==4.8.1.78
180
+ opencv-python==4.8.1.78
181
+ openunmix==1.2.1
182
+ orjson==3.10.3
183
+ packaging==23.2
184
+ pandas==2.0.2
185
+ panel==1.2.3
186
+ param==2.1.1
187
+ parso==0.8.4
188
+ pathtools==0.1.2
189
+ pedalboard==0.7.4
190
+ peft==0.10.0
191
+ pexpect==4.9.0
192
+ pickleshare==0.7.5
193
+ Pillow==10.1.0
194
+ pkgutil_resolve_name==1.3.10
195
+ platformdirs==4.2.0
196
+ plotly==5.23.0
197
+ pooch==1.8.1
198
+ portalocker==2.10.1
199
+ prefigure==0.0.9
200
+ preshed==3.0.9
201
+ proces==0.1.7
202
+ prodict==0.8.18
203
+ progressbar==2.5
204
+ prompt_toolkit==3.0.47
205
+ protobuf==3.19.6
206
+ psutil==5.9.6
207
+ ptyprocess==0.7.0
208
+ pure_eval==0.2.3
209
+ py-cpuinfo==9.0.0
210
+ pyarrow==17.0.0
211
+ pyarrow-hotfix==0.6
212
+ pyasn1==0.5.1
213
+ pyasn1-modules==0.3.0
214
+ pybind11==2.11.1
215
+ pycparser==2.21
216
+ pydantic==2.6.3
217
+ pydantic_core==2.16.3
218
+ pydub==0.25.1
219
+ Pygments==2.18.0
220
+ pyloudnorm==0.1.1
221
+ pynndescent==0.5.13
222
+ pynvml==11.5.0
223
+ pyparsing==3.1.2
224
+ pypinyin==0.51.0
225
+ pyre-extensions==0.0.29
226
+ pyreaper==0.0.10
227
+ pystoi==0.4.1
228
+ python-dateutil==2.8.2
229
+ python-multipart==0.0.9
230
+ pytorch-lightning==2.1.0
231
+ pytz==2023.3.post1
232
+ pyviz_comms==3.0.3
233
+ PyWavelets==1.4.1
234
+ PyYAML==6.0.1
235
+ randomname==0.2.1
236
+ referencing==0.35.1
237
+ regex==2023.10.3
238
+ requests==2.32.3
239
+ requests-oauthlib==1.3.1
240
+ resampy==0.4.3
241
+ retrying==1.3.4
242
+ rich==13.7.1
243
+ rpds-py==0.18.1
244
+ rsa==4.9
245
+ ruamel.yaml==0.18.5
246
+ ruamel.yaml.clib==0.2.8
247
+ ruff==0.4.4
248
+ s3fs==2024.6.1
249
+ s3transfer==0.7.0
250
+ sacrebleu==2.4.2
251
+ safetensors==0.4.3
252
+ scikit-image==0.21.0
253
+ scikit-learn==1.3.2
254
+ scipy==1.10.1
255
+ semantic-version==2.10.0
256
+ sentencepiece==0.1.99
257
+ sentry-sdk==2.10.0
258
+ setproctitle==1.3.3
259
+ shellingham==1.5.4
260
+ six==1.16.0
261
+ smart-open==6.4.0
262
+ smmap==5.0.1
263
+ sniffio==1.3.1
264
+ sortedcontainers==2.4.0
265
+ SoundFile==0.10.2
266
+ sox==1.4.1
267
+ soxr==0.3.7
268
+ spacy==3.7.4
269
+ spacy-legacy==3.0.12
270
+ spacy-loggers==1.0.5
271
+ srsly==2.4.8
272
+ stack-data==0.6.3
273
+ starlette==0.37.2
274
+ submitit==1.5.1
275
+ sympy==1.12
276
+ tabulate==0.9.0
277
+ tenacity==9.0.0
278
+ tensorboard==2.14.0
279
+ tensorboard-data-server==0.7.2
280
+ termcolor==2.3.0
281
+ thinc==8.2.3
282
+ threadpoolctl==3.3.0
283
+ tifffile==2023.7.10
284
+ timm==0.9.11
285
+ tokenizers==0.19.1
286
+ tomlkit==0.12.0
287
+ toolz==0.12.1
288
+ torch==2.2.0+cu118
289
+ torch-stoi==0.2.1
290
+ torchaudio==2.2.0+cu118
291
+ torchdata==0.7.1
292
+ torchdiffeq==0.2.4
293
+ torchlibrosa==0.1.0
294
+ torchmetrics==0.11.4
295
+ torchsde==0.2.6
296
+ torchtext==0.17.0
297
+ torchvision==0.17.0+cu118
298
+ tornado==6.4.1
299
+ tqdm==4.66.4
300
+ traitlets==5.14.3
301
+ trampoline==0.1.2
302
+ transformers==4.42.4
303
+ treetable==0.2.5
304
+ triton==2.2.0
305
+ typeguard==2.13.0
306
+ typer==0.9.4
307
+ types-dataclasses==0.6.6
308
+ typing-inspect==0.9.0
309
+ typing_extensions==4.8.0
310
+ tzdata==2023.3
311
+ uc-micro-py==1.0.3
312
+ umap-learn==0.5.6
313
+ Unidecode==1.3.8
314
+ urllib3==1.26.18
315
+ uvicorn==0.29.0
316
+ v-diffusion-pytorch==0.0.2
317
+ vector-quantize-pytorch==1.9.14
318
+ wandb==0.15.4
319
+ wasabi==1.1.2
320
+ wcwidth==0.2.12
321
+ weasel==0.3.4
322
+ webdataset==0.2.48
323
+ webencodings==0.5.1
324
+ websockets==11.0.3
325
+ Werkzeug==3.0.1
326
+ wget==3.2
327
+ wordsegment==1.3.1
328
+ wrapt==1.16.0
329
+ x-clip==0.14.4
330
+ x-transformers==1.26.6
331
+ xformers==0.0.24+cu118
332
+ xxhash==3.4.1
333
+ xyzservices==2024.6.0
334
+ yarl==1.9.4
335
+ zipp==3.17.0
MuCodec/tools/get_melvaehifigan48k.py ADDED
@@ -0,0 +1,1551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import soundfile as sf
3
+ import os
4
+ from librosa.filters import mel as librosa_mel_fn
5
+ import sys
6
+ sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
7
+ import tools.torch_tools as torch_tools
8
+ import torch.nn as nn
9
+ import torch
10
+ import numpy as np
11
+ from einops import rearrange
12
+ from scipy.signal import get_window
13
+ from librosa.util import pad_center, tiny
14
+ import librosa.util as librosa_util
15
+
16
class AttrDict(dict):
    """Dict whose entries are also readable/writable as attributes (cfg.key == cfg["key"])."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Point the attribute namespace at the mapping itself so attribute and
        # item access always stay in sync.
        self.__dict__ = self
20
+
21
def init_weights(m, mean=0.0, std=0.01):
    """In-place N(mean, std) initialisation of conv-like modules.

    Any module whose class name contains "Conv" gets its weights redrawn;
    every other module is left untouched.  Intended for ``Module.apply``.
    """
    if "Conv" in type(m).__name__:
        m.weight.data.normal_(mean, std)
25
+
26
+
27
def get_padding(kernel_size, dilation=1):
    """Return the symmetric padding that keeps a stride-1 dilated conv length-preserving."""
    return (kernel_size - 1) * dilation // 2
29
+
30
# Negative slope shared by every LeakyReLU in the HiFi-GAN generator and resblocks.
LRELU_SLOPE = 0.1
31
+
32
class ResBlock(torch.nn.Module):
    """HiFi-GAN residual block.

    Each of the three residual branches applies LeakyReLU -> dilated conv ->
    LeakyReLU -> plain (dilation-1) conv, then adds the branch input back.
    All convs are weight-normalised and initialised via ``init_weights``.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock, self).__init__()
        self.h = h

        def _wn_conv(dil):
            # Weight-normalised, length-preserving 1-D conv.
            return torch.nn.utils.weight_norm(
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=dil,
                    padding=get_padding(kernel_size, dil),
                )
            )

        # First conv of each branch uses the configured dilations ...
        self.convs1 = nn.ModuleList([_wn_conv(d) for d in dilation])
        # ... the second conv of each branch always uses dilation 1.
        self.convs2 = nn.ModuleList([_wn_conv(1) for _ in dilation])
        self.convs1.apply(init_weights)
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            residual = x
            x = c1(torch.nn.functional.leaky_relu(x, LRELU_SLOPE))
            x = c2(torch.nn.functional.leaky_relu(x, LRELU_SLOPE))
            x = x + residual
        return x

    def remove_weight_norm(self):
        """Strip weight-norm reparameterisation from every conv (for inference/export)."""
        for conv in list(self.convs1) + list(self.convs2):
            torch.nn.utils.remove_weight_norm(conv)
122
+
123
+
124
class Generator_old(torch.nn.Module):
    """HiFi-GAN style generator: mel spectrogram -> raw waveform.

    Input is (B, h.num_mels, T); output is (B, 1, T * prod(h.upsample_rates)),
    squashed to [-1, 1] by the final tanh.

    ``h`` is an AttrDict-like config providing at least: num_mels,
    upsample_initial_channel, upsample_rates, upsample_kernel_sizes,
    resblock_kernel_sizes, resblock_dilation_sizes.

    NOTE: the order in which ``ups`` / ``resblocks`` are appended determines
    state_dict keys, so it must not change if pretrained checkpoints
    (e.g. audioldm_48k.pth) are to load.
    """

    def __init__(self, h):
        super(Generator_old, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        # Weight-normalised "pre" conv lifting mel bins to the widest channel count.
        self.conv_pre = torch.nn.utils.weight_norm(
            nn.Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)
        )
        resblock = ResBlock

        # Transposed convs; channel width halves at every upsampling stage.
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.ups.append(
                torch.nn.utils.weight_norm(
                    nn.ConvTranspose1d(
                        h.upsample_initial_channel // (2**i),
                        h.upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

        # One ResBlock per (upsample stage x resblock kernel): the multi-receptive
        # field fusion stack, indexed as i * num_kernels + j in forward().
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(h, ch, k, d))

        # Final conv down to a single waveform channel (ch is the last stage width).
        self.conv_post = torch.nn.utils.weight_norm(nn.Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        # x: (B, num_mels, T) mel spectrogram.
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            # Average the outputs of this stage's ResBlocks (MRF fusion).
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = torch.nn.functional.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight-norm from all convs; call once before inference/export."""
        # print("Removing weight norm...")
        for l in self.ups:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        torch.nn.utils.remove_weight_norm(self.conv_pre)
        torch.nn.utils.remove_weight_norm(self.conv_post)
187
+
188
+
189
+
190
def nonlinearity(x):
    """Swish/SiLU activation: elementwise x * sigmoid(x)."""
    return torch.sigmoid(x) * x
193
+
194
+
195
def Normalize(in_channels, num_groups=32):
    """GroupNorm layer (affine, eps=1e-6) used throughout this VAE's blocks."""
    return torch.nn.GroupNorm(
        num_channels=in_channels, num_groups=num_groups, eps=1e-6, affine=True
    )
199
+
200
class Downsample(nn.Module):
    """2x spatial downsampling via a stride-2 conv (``with_conv``) or average pooling."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # Torch convs only support symmetric padding, so the asymmetric
            # (right/bottom-only) padding is applied manually in forward().
            self.conv = torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=3, stride=2, padding=0
            )

    def forward(self, x):
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        # Pad one column on the right and one row at the bottom, then conv.
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
219
+
220
+
221
class DownsampleTimeStride4(nn.Module):
    """Downsampling that shrinks the first spatial axis 4x and the second 2x.

    Uses a stride-(4, 2) conv when ``with_conv`` else a (4, 2) average pool.
    """

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # Asymmetric extra padding is applied manually in forward().
            self.conv = torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=5, stride=(4, 2), padding=1
            )

    def forward(self, x):
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(
                x, kernel_size=(4, 2), stride=(4, 2)
            )
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
240
+
241
class Upsample(nn.Module):
    """2x nearest-neighbour upsampling, optionally followed by a 3x3 conv."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=3, stride=1, padding=1
            )

    def forward(self, x):
        up = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        return self.conv(up) if self.with_conv else up
255
+
256
+
257
class UpsampleTimeStride4(nn.Module):
    """Nearest-neighbour upsampling: 4x on the first spatial axis, 2x on the second.

    Mirrors DownsampleTimeStride4; optionally followed by a 5x5 conv.
    """

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=5, stride=1, padding=2
            )

    def forward(self, x):
        up = torch.nn.functional.interpolate(
            x, scale_factor=(4.0, 2.0), mode="nearest"
        )
        return self.conv(up) if self.with_conv else up
271
+
272
class AttnBlock(nn.Module):
    """Single-head self-attention over the flattened (h*w) spatial positions.

    1x1 convs produce q/k/v from the GroupNorm-ed input; attention weights are
    softmax(q.k / sqrt(c)); the attended values are projected by another 1x1
    conv and added residually to the input.  Input/output: (B, C, H, W).
    """

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        self.q = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.k = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.v = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )
        self.proj_out = torch.nn.Conv2d(
            in_channels, in_channels, kernel_size=1, stride=1, padding=0
        )

    def forward(self, x):
        h_ = x
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention
        b, c, h, w = q.shape
        q = q.reshape(b, c, h * w).contiguous()
        q = q.permute(0, 2, 1).contiguous()  # b,hw,c
        k = k.reshape(b, c, h * w).contiguous()  # b,c,hw
        w_ = torch.bmm(q, k).contiguous()  # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
        w_ = w_ * (int(c) ** (-0.5))  # scale by 1/sqrt(c) before softmax
        w_ = torch.nn.functional.softmax(w_, dim=2)

        # attend to values
        v = v.reshape(b, c, h * w).contiguous()
        w_ = w_.permute(0, 2, 1).contiguous()  # b,hw,hw (first hw of k, second of q)
        h_ = torch.bmm(
            v, w_
        ).contiguous()  # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
        h_ = h_.reshape(b, c, h, w).contiguous()

        h_ = self.proj_out(h_)

        # Residual connection around the whole attention operation.
        return x + h_
318
+
319
+
320
def make_attn(in_channels, attn_type="vanilla"):
    """Factory for the attention layer used by Encoder/Decoder.

    "vanilla" -> AttnBlock, "none" -> nn.Identity.  NOTE: "linear" passes the
    assert below but has no implementation, so it falls through to ValueError.
    """
    assert attn_type in ["vanilla", "linear", "none"], f"attn_type {attn_type} unknown"
    # print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
    if attn_type == "vanilla":
        return AttnBlock(in_channels)
    if attn_type == "none":
        # nn.Identity ignores its constructor argument.
        return nn.Identity(in_channels)
    raise ValueError(attn_type)
329
+
330
+
331
class ResnetBlock(nn.Module):
    """Pre-activation residual block: (GroupNorm -> swish -> 3x3 conv) twice.

    When in/out channel counts differ, the skip path is adapted by a 3x3 conv
    (``conv_shortcut=True``) or a 1x1 conv.  If ``temb_channels > 0`` a
    timestep embedding ``temb`` can be projected and added after the first
    conv; the Encoder/Decoder in this file construct it with temb_channels=0
    and call forward with temb=None.
    """

    def __init__(
        self,
        *,
        in_channels,
        out_channels=None,
        conv_shortcut=False,
        dropout,
        temb_channels=512,
    ):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        if temb_channels > 0:
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(
                    in_channels, out_channels, kernel_size=3, stride=1, padding=1
                )
            else:
                self.nin_shortcut = torch.nn.Conv2d(
                    in_channels, out_channels, kernel_size=1, stride=1, padding=0
                )

    def forward(self, x, temb):
        h = x
        h = self.norm1(h)
        h = nonlinearity(h)
        h = self.conv1(h)

        # Inject the (swish-activated) timestep embedding, broadcast over H, W.
        if temb is not None:
            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]

        h = self.norm2(h)
        h = nonlinearity(h)
        h = self.dropout(h)
        h = self.conv2(h)

        # Adapt the skip path only when the channel count changes.
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                x = self.conv_shortcut(x)
            else:
                x = self.nin_shortcut(x)

        return x + h
389
+
390
+
391
class Encoder(nn.Module):
    """Taming-transformers style convolutional VAE encoder.

    Downsamples a (B, in_channels, H, W) input through ``len(ch_mult)``
    resolution levels of ResnetBlocks (attention is inserted at resolutions
    listed in ``attn_resolutions``), runs a block/attention/block middle
    stack, and projects to ``2 * z_channels`` channels when ``double_z``
    (mean + logvar of a diagonal Gaussian) else ``z_channels``.

    Levels listed in ``downsample_time_stride4_levels`` downsample the first
    spatial axis by 4 instead of 2 (presumably the time axis — see
    DownsampleTimeStride4; confirm against the caller's mel layout).

    NOTE: registration order of ``self.down`` determines state_dict keys.
    ``downsample_time_stride4_levels=[]`` is a mutable default but is never
    mutated here.
    """

    def __init__(
        self,
        *,
        ch,
        out_ch,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks,
        attn_resolutions,
        dropout=0.0,
        resamp_with_conv=True,
        in_channels,
        resolution,
        z_channels,
        double_z=True,
        use_linear_attn=False,
        attn_type="vanilla",
        downsample_time_stride4_levels=[],
        **ignore_kwargs,
    ):
        super().__init__()
        # NOTE(review): attn_type="linear" is not implemented by make_attn and
        # will raise ValueError there if use_linear_attn is set.
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0  # no timestep conditioning on the VAE path
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.downsample_time_stride4_levels = downsample_time_stride4_levels

        if len(self.downsample_time_stride4_levels) > 0:
            assert max(self.downsample_time_stride4_levels) < self.num_resolutions, (
                "The level to perform downsample 4 operation need to be smaller than the total resolution number %s"
                % str(self.num_resolutions)
            )

        # downsampling
        self.conv_in = torch.nn.Conv2d(
            in_channels, self.ch, kernel_size=3, stride=1, padding=1
        )

        curr_res = resolution
        # Channel multiplier for each level's *input*: (1, ch_mult...).
        in_ch_mult = (1,) + tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(
                    ResnetBlock(
                        in_channels=block_in,
                        out_channels=block_out,
                        temb_channels=self.temb_ch,
                        dropout=dropout,
                    )
                )
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(make_attn(block_in, attn_type=attn_type))
            down = nn.Module()
            down.block = block
            down.attn = attn
            # Every level except the last ends with a downsampling layer.
            if i_level != self.num_resolutions - 1:
                if i_level in self.downsample_time_stride4_levels:
                    down.downsample = DownsampleTimeStride4(block_in, resamp_with_conv)
                else:
                    down.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )

        # end: project to latent (doubled for mean/logvar when double_z)
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in,
            2 * z_channels if double_z else z_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )

    def forward(self, x):
        # timestep embedding (unused on the VAE path)
        temb = None
        # downsampling
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](hs[-1], temb)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions - 1:
                hs.append(self.down[i_level].downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
516
+
517
+
518
class Decoder(nn.Module):
    """Taming-transformers style convolutional VAE decoder (mirror of Encoder).

    Maps a latent (B, z_channels, h, w) through a middle block/attention/block
    stack, then upsamples through ``len(ch_mult)`` resolution levels of
    ResnetBlocks (attention at resolutions in ``attn_resolutions``) back to
    ``out_ch`` channels.  ``give_pre_end`` returns the feature map before the
    final norm/act/conv; ``tanh_out`` squashes the output to [-1, 1].

    Levels whose index-1 is in ``downsample_time_stride4_levels`` upsample the
    first spatial axis by 4 instead of 2 (mirrors the encoder's stride-4
    levels).  NOTE: registration order of ``self.up`` (built reversed, then
    prepended) determines state_dict keys and must not change.
    """

    def __init__(
        self,
        *,
        ch,
        out_ch,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks,
        attn_resolutions,
        dropout=0.0,
        resamp_with_conv=True,
        in_channels,
        resolution,
        z_channels,
        give_pre_end=False,
        tanh_out=False,
        use_linear_attn=False,
        downsample_time_stride4_levels=[],
        attn_type="vanilla",
        **ignorekwargs,
    ):
        super().__init__()
        # NOTE(review): attn_type="linear" is not implemented by make_attn and
        # will raise ValueError there if use_linear_attn is set.
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0  # no timestep conditioning on the VAE path
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end
        self.tanh_out = tanh_out
        self.downsample_time_stride4_levels = downsample_time_stride4_levels

        if len(self.downsample_time_stride4_levels) > 0:
            assert max(self.downsample_time_stride4_levels) < self.num_resolutions, (
                "The level to perform downsample 4 operation need to be smaller than the total resolution number %s"
                % str(self.num_resolutions)
            )

        # Channel count and spatial size at the lowest resolution.
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)

        # z to block_in
        self.conv_in = torch.nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1
        )

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = ResnetBlock(
            in_channels=block_in,
            out_channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
        )

        # upsampling (built from lowest to highest resolution, prepended so that
        # self.up[0] is the highest-resolution level, matching forward()).
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            # One extra ResnetBlock per level compared to the encoder.
            for i_block in range(self.num_res_blocks + 1):
                block.append(
                    ResnetBlock(
                        in_channels=block_in,
                        out_channels=block_out,
                        temb_channels=self.temb_ch,
                        dropout=dropout,
                    )
                )
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(make_attn(block_in, attn_type=attn_type))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                if i_level - 1 in self.downsample_time_stride4_levels:
                    up.upsample = UpsampleTimeStride4(block_in, resamp_with_conv)
                else:
                    up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, out_ch, kernel_size=3, stride=1, padding=1
        )

    def forward(self, z):
        # assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape

        # timestep embedding (unused on the VAE path)
        temb = None

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h, temb)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        if self.give_pre_end:
            return h

        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        if self.tanh_out:
            h = torch.tanh(h)
        return h
659
+
660
+
661
class DiagonalGaussianDistribution(object):
    """Diagonal Gaussian parameterised by a tensor of concatenated [mean, logvar].

    ``parameters`` is split in half along dim=1; logvar is clamped to
    [-30, 20] for numerical stability.  With ``deterministic=True`` the
    variance is zeroed so sample() == mode() == mean.
    """

    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean).to(
                device=self.parameters.device
            )

    def sample(self):
        # Reparameterised sample: mean + std * eps, eps ~ N(0, I).
        x = self.mean + self.std * torch.randn(self.mean.shape).to(
            device=self.parameters.device
        )
        return x

    def kl(self, other=None):
        """KL divergence to N(0, I) (other=None) or to another diagonal Gaussian.

        NOTE: reduced with torch.mean over dims [1, 2, 3] (not summed), so the
        value is a per-element average KL per batch item.
        """
        if self.deterministic:
            # NOTE(review): returned on CPU regardless of parameters.device.
            return torch.Tensor([0.0])
        else:
            if other is None:
                return 0.5 * torch.mean(
                    torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
                    dim=[1, 2, 3],
                )
            else:
                return 0.5 * torch.mean(
                    torch.pow(self.mean - other.mean, 2) / other.var
                    + self.var / other.var
                    - 1.0
                    - self.logvar
                    + other.logvar,
                    dim=[1, 2, 3],
                )

    def nll(self, sample, dims=[1, 2, 3]):
        """Negative log-likelihood of ``sample`` under this Gaussian, summed over ``dims``."""
        if self.deterministic:
            # NOTE(review): returned on CPU regardless of parameters.device.
            return torch.Tensor([0.0])
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims,
        )

    def mode(self):
        # Mode of a Gaussian is its mean.
        return self.mean
710
+
711
def get_vocoder_config_48k():
    """Return the hyper-parameter dictionary for the 48 kHz HiFi-GAN vocoder."""
    # Training hyper-parameters.
    config = {
        "resblock": "1",
        "num_gpus": 8,
        "batch_size": 128,
        "learning_rate": 0.0001,
        "adam_b1": 0.8,
        "adam_b2": 0.99,
        "lr_decay": 0.999,
        "seed": 1234,
    }
    # Generator topology: 6*5*4*2*2 = 480x upsampling, i.e. one hop per mel frame.
    config.update({
        "upsample_rates": [6, 5, 4, 2, 2],
        "upsample_kernel_sizes": [12, 10, 8, 4, 4],
        "upsample_initial_channel": 1536,
        "resblock_kernel_sizes": [3, 7, 11, 15],
        "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5], [1, 3, 5]],
    })
    # Feature-extraction / audio settings.
    config.update({
        "segment_size": 15360,
        "num_mels": 256,
        "n_fft": 2048,
        "hop_size": 480,
        "win_size": 2048,
        "sampling_rate": 48000,
        "fmin": 20,
        "fmax": 24000,
        "fmax_for_loss": None,
        "num_workers": 8,
        "dist_config": {
            "dist_backend": "nccl",
            "dist_url": "tcp://localhost:18273",
            "world_size": 1,
        },
    })
    return config
748
+
749
def get_vocoder(config, device, mel_bins):
    """Build a mel-to-waveform vocoder and move it to `device`.

    Only the HiFi-GAN path is active (`name` is hard-coded below); the MelGAN
    branch is retained for reference. `mel_bins` selects the generator
    configuration — only 256 bins (48 kHz) is supported, anything else raises.
    The `config` argument is ignored on the 256-bin path (overwritten by
    get_vocoder_config_48k()).
    """
    name = "HiFi-GAN"
    speaker = ""
    if name == "MelGAN":
        # Dead branch while name is hard-coded to "HiFi-GAN".
        # NOTE(review): if speaker is neither "LJSpeech" nor "universal",
        # `vocoder` would be unbound here — confirm before re-enabling.
        if speaker == "LJSpeech":
            vocoder = torch.hub.load(
                "descriptinc/melgan-neurips", "load_melgan", "linda_johnson"
            )
        elif speaker == "universal":
            vocoder = torch.hub.load(
                "descriptinc/melgan-neurips", "load_melgan", "multi_speaker"
            )
        vocoder.mel2wav.eval()
        vocoder.mel2wav.to(device)
    elif name == "HiFi-GAN":
        if(mel_bins == 256):
            config = get_vocoder_config_48k()
            config = AttrDict(config)
            vocoder = Generator_old(config)
            # Checkpoint loading is handled elsewhere; kept for reference.
            # print("Load hifigan/g_01080000")
            # ckpt = torch.load(os.path.join(ROOT, "hifigan/g_01080000"))
            # ckpt = torch.load(os.path.join(ROOT, "hifigan/g_00660000"))
            # ckpt = torch_version_orig_mod_remove(ckpt)
            # vocoder.load_state_dict(ckpt["generator"])
            vocoder.eval()
            # Fuse weight norm for inference speed/stability.
            vocoder.remove_weight_norm()
            vocoder.to(device)
        else:
            raise ValueError(mel_bins)
    return vocoder
779
+
780
def vocoder_infer(mels, vocoder, lengths=None):
    """Run the vocoder on a batch of mels and return numpy waveforms.

    `lengths`, when given, is a single sample count applied to every item
    in the batch (the output is sliced to `[:, :lengths]`).
    """
    with torch.no_grad():
        generated = vocoder(mels)
    waveforms = generated.squeeze(1).cpu().numpy()
    if lengths is None:
        return waveforms
    # Trim every waveform in the batch to the requested number of samples.
    return waveforms[:, :lengths]
797
+
798
@torch.no_grad()
def vocoder_chunk_infer(mels, vocoder, lengths=None):
    """Vocode a long mel spectrogram in overlapping chunks with linear cross-fade.

    Processes `mels` (batch, mel_bins, frames) in windows of `chunk_size`
    frames advanced by `shift_size` frames, and blends consecutive chunk
    waveforms over the overlap region to hide chunk-boundary artifacts.
    Returns numpy waveforms, optionally trimmed to `lengths` samples.
    """
    chunk_size = 256*4
    shift_size = 256*1
    # Overlap (in frames) between consecutive chunks.
    ov_size = chunk_size-shift_size
    # import pdb;pdb.set_trace()

    for cinx in range(0, mels.shape[2], shift_size):
        if(cinx==0):
            # First chunk: establishes samples-per-chunk and the cross-fade window.
            wavs = vocoder(mels[:,:,cinx:cinx+chunk_size]).squeeze(1).cpu()
            # Truncate to a whole number of chunk-sized sample groups.
            num_samples = int(wavs.shape[-1]/chunk_size)*chunk_size
            wavs = wavs[:,0:num_samples]
            # Overlap length in samples, proportional to the frame overlap.
            ov_sample = int(float(wavs.shape[-1]) * ov_size / chunk_size)
            # ov_win = [ramp up 0->1 | ramp down 1->0]; the two halves are the
            # fade-in (new chunk head) and fade-out (previous tail) weights.
            ov_win = torch.from_numpy(np.linspace(0,1,ov_sample)[None,:])
            ov_win = torch.cat([ov_win,1-ov_win],-1)
            if(cinx+chunk_size>=mels.shape[2]):
                break
        else:
            cur_wav = vocoder(mels[:,:,cinx:cinx+chunk_size]).squeeze(1).cpu()[:,0:num_samples]
            # Cross-fade: previous tail fades out while the new head fades in.
            wavs[:,-ov_sample:] = wavs[:,-ov_sample:] * ov_win[:,-ov_sample:] + cur_wav[:,0:ov_sample] * ov_win[:,0:ov_sample]
            # wavs[:,-ov_sample:] = wavs[:,-ov_sample:] * 1.0 + cur_wav[:,0:ov_sample] * 0.0
            # Append the non-overlapping remainder of the new chunk.
            wavs = torch.cat([wavs, cur_wav[:,ov_sample:]],-1)
            if(cinx+chunk_size>=mels.shape[2]):
                break
    # print(wavs.shape)

    wavs = (wavs.cpu().numpy())

    if lengths is not None:
        # Single sample count applied to the whole batch.
        wavs = wavs[:, :lengths]
    # print(wavs.shape)
    return wavs
830
+
831
def synth_one_sample(mel_input, mel_prediction, labels, vocoder):
    """Vocode ground-truth and predicted mels for one sample.

    Returns (wav_reconstruction, wav_prediction); both are None when no
    vocoder is supplied. `labels` is currently unused.
    """
    if vocoder is None:
        return None, None
    # The vocoder expects (batch, mel_bins, frames); inputs arrive as
    # (batch, frames, mel_bins), hence the permute.
    wav_reconstruction = vocoder_infer(mel_input.permute(0, 2, 1), vocoder)
    wav_prediction = vocoder_infer(mel_prediction.permute(0, 2, 1), vocoder)
    return wav_reconstruction, wav_prediction
846
+
847
+
848
class AutoencoderKL(nn.Module):
    """Mel-spectrogram VAE (KL-regularized) with an attached HiFi-GAN vocoder.

    Encodes fbank/stft images into a diagonal-Gaussian latent and decodes
    back; `decode_to_waveform` turns decoded mels into audio via the vocoder.
    `lossconfig` and `batchsize` are accepted but unused here.
    """

    def __init__(
        self,
        ddconfig=None,
        lossconfig=None,
        batchsize=None,
        embed_dim=None,
        time_shuffle=1,
        subband=1,
        sampling_rate=16000,
        ckpt_path=None,
        reload_from_ckpt=None,
        ignore_keys=[],  # NOTE(review): mutable default, shared across calls
        image_key="fbank",
        colorize_nlabels=None,
        monitor=None,
        base_learning_rate=1e-5,
        scale_factor=1
    ):
        super().__init__()
        self.automatic_optimization = False
        assert (
            "mel_bins" in ddconfig.keys()
        ), "mel_bins is not specified in the Autoencoder config"
        num_mel = ddconfig["mel_bins"]
        self.image_key = image_key
        self.sampling_rate = sampling_rate
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)

        self.loss = None
        self.subband = int(subband)

        if self.subband > 1:
            print("Use subband decomposition %s" % self.subband)

        # quant_conv maps encoder output to 2*embed_dim (mean + logvar halves);
        # post_quant_conv maps a sampled latent back to decoder channels.
        assert ddconfig["double_z"]
        self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)

        if self.image_key == "fbank":
            # Vocoder is built on CPU and moved later by the caller if needed.
            self.vocoder = get_vocoder(None, "cpu", num_mel)
        self.embed_dim = embed_dim
        if colorize_nlabels is not None:
            assert type(colorize_nlabels) == int
            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
        if monitor is not None:
            self.monitor = monitor
        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
        self.learning_rate = float(base_learning_rate)
        # print("Initial learning rate %s" % self.learning_rate)

        self.time_shuffle = time_shuffle
        self.reload_from_ckpt = reload_from_ckpt
        self.reloaded = False
        self.mean, self.std = None, None

        self.feature_cache = None
        self.flag_first_run = True
        self.train_step = 0

        self.logger_save_dir = None
        self.logger_exp_name = None
        # Latents are multiplied by scale_factor on encode and divided on decode.
        self.scale_factor = scale_factor

        print("Num parameters:")
        print("Encoder : ", sum(p.numel() for p in self.encoder.parameters()))
        print("Decoder : ", sum(p.numel() for p in self.decoder.parameters()))
        print("Vocoder : ", sum(p.numel() for p in self.vocoder.parameters()))

    def get_log_dir(self):
        """Directory used for logging artifacts (explicit override or logger default)."""
        if self.logger_save_dir is None and self.logger_exp_name is None:
            return os.path.join(self.logger.save_dir, self.logger._project)
        else:
            return os.path.join(self.logger_save_dir, self.logger_exp_name)

    def set_log_dir(self, save_dir, exp_name):
        """Override the log directory components used by get_log_dir()."""
        self.logger_save_dir = save_dir
        self.logger_exp_name = exp_name

    def init_from_ckpt(self, path, ignore_keys=list()):
        """Load weights from a Lightning-style checkpoint, dropping ignored key prefixes.

        Uses strict=False so partial state dicts are accepted.
        """
        sd = torch.load(path, map_location="cpu")["state_dict"]
        keys = list(sd.keys())
        for k in keys:
            for ik in ignore_keys:
                if k.startswith(ik):
                    print("Deleting key {} from state_dict.".format(k))
                    del sd[k]
        self.load_state_dict(sd, strict=False)
        print(f"Restored from {path}")

    def encode(self, x):
        """Encode an input image to a DiagonalGaussianDistribution posterior."""
        # x = self.time_shuffle_operation(x)
        # x = self.freq_split_subband(x)
        h = self.encoder(x)
        moments = self.quant_conv(h)
        posterior = DiagonalGaussianDistribution(moments)
        return posterior

    def decode(self, z):
        """Decode a latent z back to the mel/stft image domain."""
        z = self.post_quant_conv(z)
        dec = self.decoder(z)
        # bs, ch, shuffled_timesteps, fbins = dec.size()
        # dec = self.time_unshuffle_operation(dec, bs, int(ch*shuffled_timesteps), fbins)
        # dec = self.freq_merge_subband(dec)
        return dec

    def decode_to_waveform(self, dec):
        """Convert a decoded spectrogram image into waveforms.

        fbank inputs go through the chunked HiFi-GAN vocoder; stft inputs go
        through `self.wave_decoder` (assigned elsewhere — not set in __init__).
        """
        if self.image_key == "fbank":
            dec = dec.squeeze(1).permute(0, 2, 1)
            wav_reconstruction = vocoder_chunk_infer(dec, self.vocoder)
        elif self.image_key == "stft":
            dec = dec.squeeze(1).permute(0, 2, 1)
            wav_reconstruction = self.wave_decoder(dec)
        return wav_reconstruction

    def mel_spectrogram_to_waveform(
        self, mel, savepath=".", bs=None, name="outwav", save=True
    ):
        """Vocode a mel batch directly (no chunking); returns numpy waveforms."""
        # Mel: [bs, 1, t-steps, fbins]
        if len(mel.size()) == 4:
            mel = mel.squeeze(1)
        mel = mel.permute(0, 2, 1)
        waveform = self.vocoder(mel)
        waveform = waveform.cpu().detach().numpy()
        #if save:
        #    self.save_waveform(waveform, savepath, name)
        return waveform

    @torch.no_grad()
    def encode_first_stage(self, x):
        """Gradient-free wrapper around encode()."""
        return self.encode(x)

    @torch.no_grad()
    def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
        """Gradient-free decode; un-scales the latent by 1/scale_factor first."""
        if predict_cids:
            # Codebook path (VQ): convert logits to codebook entries.
            if z.dim() == 4:
                z = torch.argmax(z.exp(), dim=1).long()
            z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
            z = rearrange(z, "b h w c -> b c h w").contiguous()

        z = 1.0 / self.scale_factor * z
        return self.decode(z)

    def decode_first_stage_withgrad(self, z):
        """Decode with gradients enabled (for end-to-end fine-tuning)."""
        z = 1.0 / self.scale_factor * z
        return self.decode(z)

    def get_first_stage_encoding(self, encoder_posterior, use_mode=False):
        """Turn an encoder posterior (or raw tensor) into a scaled latent tensor."""
        if isinstance(encoder_posterior, DiagonalGaussianDistribution) and not use_mode:
            z = encoder_posterior.sample()
        elif isinstance(encoder_posterior, DiagonalGaussianDistribution) and use_mode:
            z = encoder_posterior.mode()
        elif isinstance(encoder_posterior, torch.Tensor):
            z = encoder_posterior
        else:
            raise NotImplementedError(
                f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented"
            )
        return self.scale_factor * z

    def visualize_latent(self, input):
        """Debug helper: dump latents for time/frequency-masked inputs as .npy + plots."""
        import matplotlib.pyplot as plt

        # for i in range(10):
        #     zero_input = torch.zeros_like(input) - 11.59
        #     zero_input[:,:,i * 16: i * 16 + 16,:16] += 13.59

        #     posterior = self.encode(zero_input)
        #     latent = posterior.sample()
        #     avg_latent = torch.mean(latent, dim=1)[0]
        #     plt.imshow(avg_latent.cpu().detach().numpy().T)
        #     plt.savefig("%s.png" % i)
        #     plt.close()

        np.save("input.npy", input.cpu().detach().numpy())
        # zero_input = torch.zeros_like(input) - 11.59
        time_input = input.clone()
        # -11.59 presumably corresponds to silence in log-mel space — TODO confirm.
        time_input[:, :, :, :32] *= 0
        time_input[:, :, :, :32] -= 11.59

        np.save("time_input.npy", time_input.cpu().detach().numpy())

        posterior = self.encode(time_input)
        latent = posterior.sample()
        np.save("time_latent.npy", latent.cpu().detach().numpy())
        avg_latent = torch.mean(latent, dim=1)
        for i in range(avg_latent.size(0)):
            plt.imshow(avg_latent[i].cpu().detach().numpy().T)
            plt.savefig("freq_%s.png" % i)
            plt.close()

        freq_input = input.clone()
        freq_input[:, :, :512, :] *= 0
        freq_input[:, :, :512, :] -= 11.59

        np.save("freq_input.npy", freq_input.cpu().detach().numpy())

        posterior = self.encode(freq_input)
        latent = posterior.sample()
        np.save("freq_latent.npy", latent.cpu().detach().numpy())
        avg_latent = torch.mean(latent, dim=1)
        for i in range(avg_latent.size(0)):
            plt.imshow(avg_latent[i].cpu().detach().numpy().T)
            plt.savefig("time_%s.png" % i)
            plt.close()

    def get_input(self, batch):
        """Unpack a dataloader batch dict into the tensors this model consumes."""
        fname, text, label_indices, waveform, stft, fbank = (
            batch["fname"],
            batch["text"],
            batch["label_vector"],
            batch["waveform"],
            batch["stft"],
            batch["log_mel_spec"],
        )
        # if(self.time_shuffle != 1):
        #     if(fbank.size(1) % self.time_shuffle != 0):
        #         pad_len = self.time_shuffle - (fbank.size(1) % self.time_shuffle)
        #         fbank = torch.nn.functional.pad(fbank, (0,0,0,pad_len))

        ret = {}

        # Add a channel dimension so spectrograms are image-like (B, 1, T, F).
        ret["fbank"], ret["stft"], ret["fname"], ret["waveform"] = (
            fbank.unsqueeze(1),
            stft.unsqueeze(1),
            fname,
            waveform.unsqueeze(1),
        )

        return ret

    def save_wave(self, batch_wav, fname, save_dir):
        """Write each waveform in the batch to save_dir under its basename."""
        os.makedirs(save_dir, exist_ok=True)

        for wav, name in zip(batch_wav, fname):
            name = os.path.basename(name)

            sf.write(os.path.join(save_dir, name), wav, samplerate=self.sampling_rate)

    def get_last_layer(self):
        """Weight of the final decoder conv (used by adaptive GAN-loss weighting)."""
        return self.decoder.conv_out.weight

    @torch.no_grad()
    def log_images(self, batch, train=True, only_inputs=False, waveform=None, **kwargs):
        """Log input/reconstruction/sample spectrograms (and audio) to the logger."""
        log = dict()
        x = batch.to(self.device)
        if not only_inputs:
            xrec, posterior = self(x)
            log["samples"] = self.decode(posterior.sample())
            log["reconstructions"] = xrec

        log["inputs"] = x
        wavs = self._log_img(log, train=train, index=0, waveform=waveform)
        return wavs

    def _log_img(self, log, train=True, index=0, waveform=None):
        """Render one batch item as images + audio and push to the logger.

        Returns (wav_original, wav_prediction, wav_samples).
        """
        images_input = self.tensor2numpy(log["inputs"][index, 0]).T
        images_reconstruct = self.tensor2numpy(log["reconstructions"][index, 0]).T
        images_samples = self.tensor2numpy(log["samples"][index, 0]).T

        if train:
            name = "train"
        else:
            name = "val"

        if self.logger is not None:
            self.logger.log_image(
                "img_%s" % name,
                [images_input, images_reconstruct, images_samples],
                caption=["input", "reconstruct", "samples"],
            )

        inputs, reconstructions, samples = (
            log["inputs"],
            log["reconstructions"],
            log["samples"],
        )

        if self.image_key == "fbank":
            wav_original, wav_prediction = synth_one_sample(
                inputs[index],
                reconstructions[index],
                labels="validation",
                vocoder=self.vocoder,
            )
            wav_original, wav_samples = synth_one_sample(
                inputs[index], samples[index], labels="validation", vocoder=self.vocoder
            )
            wav_original, wav_samples, wav_prediction = (
                wav_original[0],
                wav_samples[0],
                wav_prediction[0],
            )
        elif self.image_key == "stft":
            wav_prediction = (
                self.decode_to_waveform(reconstructions)[index, 0]
                .cpu()
                .detach()
                .numpy()
            )
            wav_samples = (
                self.decode_to_waveform(samples)[index, 0].cpu().detach().numpy()
            )
            wav_original = waveform[index, 0].cpu().detach().numpy()

        if self.logger is not None:
            self.logger.experiment.log(
                {
                    "original_%s"
                    % name: wandb.Audio(
                        wav_original, caption="original", sample_rate=self.sampling_rate
                    ),
                    "reconstruct_%s"
                    % name: wandb.Audio(
                        wav_prediction,
                        caption="reconstruct",
                        sample_rate=self.sampling_rate,
                    ),
                    "samples_%s"
                    % name: wandb.Audio(
                        wav_samples, caption="samples", sample_rate=self.sampling_rate
                    ),
                }
            )

        return wav_original, wav_prediction, wav_samples

    def tensor2numpy(self, tensor):
        """Detach a tensor and move it to a numpy array on CPU."""
        return tensor.cpu().detach().numpy()

    def to_rgb(self, x):
        """Project a segmentation map to 3 channels via a fixed random conv, scaled to [-1, 1]."""
        assert self.image_key == "segmentation"
        if not hasattr(self, "colorize"):
            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
        x = torch.nn.functional.conv2d(x, weight=self.colorize)
        x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0
        return x
1188
+
1189
+
1190
class IdentityFirstStage(torch.nn.Module):
    """Pass-through stand-in for a first-stage autoencoder (no-op codec)."""

    def __init__(self, *args, vq_interface=False, **kwargs):
        super().__init__()
        # TODO: Should be true by default but check to not break older stuff
        self.vq_interface = vq_interface

    def encode(self, x, *args, **kwargs):
        """Identity encode."""
        return x

    def decode(self, x, *args, **kwargs):
        """Identity decode."""
        return x

    def quantize(self, x, *args, **kwargs):
        """Identity quantize; mimics a VQ layer's (x, loss, info) tuple when requested."""
        return (x, None, [None, None, None]) if self.vq_interface else x

    def forward(self, x, *args, **kwargs):
        """Identity forward."""
        return x
1208
+
1209
+
1210
def window_sumsquare(
    window,
    n_frames,
    hop_length,
    win_length,
    n_fft,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    norm : [optional]
        Normalization mode forwarded to `librosa_util.normalize`.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # `size` is keyword-only in librosa >= 0.10; the positional form raised a
    # TypeError there. This also matches the STFT class's pad_center call.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x
1267
+
1268
def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
    """Log-compress magnitudes: normalize_fun(clamp(x, clip_val) * C).

    PARAMS
    ------
    C: compression factor
    clip_val: lower bound that keeps the log argument strictly positive
    """
    clamped = torch.clamp(x, min=clip_val)
    return normalize_fun(clamped * C)
1275
+
1276
+
1277
def dynamic_range_decompression(x, C=1):
    """Invert dynamic_range_compression: exp(x) then undo the gain C.

    PARAMS
    ------
    C: compression factor used to compress
    """
    decompressed = torch.exp(x)
    return decompressed / C
1284
+
1285
+
1286
class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft

    Implements forward/inverse STFT as 1-D convolutions against a fixed
    (windowed) Fourier basis registered as buffers.
    """

    def __init__(self, filter_length, hop_length, win_length, window="hann"):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        # DFT of the identity yields the Fourier basis rows.
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        # Keep only the non-redundant half; stack real and imaginary parts.
        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack(
            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
        )

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        # Pseudo-inverse of the scaled basis gives the synthesis filters.
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :]
        )

        if window is not None:
            assert filter_length >= win_length
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, size=filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer("forward_basis", forward_basis.float())
        self.register_buffer("inverse_basis", inverse_basis.float())

    def transform(self, input_data):
        """Analysis: waveform (B, T) -> (magnitude, phase), each (B, cutoff, frames)."""
        device = self.forward_basis.device
        input_data = input_data.to(device)

        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = torch.nn.functional.pad(
            input_data.unsqueeze(1),
            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
            mode="reflect",
        )
        input_data = input_data.squeeze(1)

        # Strided conv against the Fourier basis == hop-wise DFT.
        forward_transform = torch.nn.functional.conv1d(
            input_data,
            torch.autograd.Variable(self.forward_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0,
        )#.cpu()

        # First half of output channels are real parts, second half imaginary.
        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))

        return magnitude, phase

    def inverse(self, magnitude, phase):
        """Synthesis: (magnitude, phase) -> waveform (B, 1, T) via transposed conv."""
        device = self.forward_basis.device
        magnitude, phase = magnitude.to(device), phase.to(device)

        # Rebuild real/imaginary channels from polar form.
        recombine_magnitude_phase = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
        )

        inverse_transform = torch.nn.functional.conv_transpose1d(
            recombine_magnitude_phase,
            torch.autograd.Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0,
        )

        if self.window is not None:
            # Compensate for the overlap-added squared window envelope.
            window_sum = window_sumsquare(
                self.window,
                magnitude.size(-1),
                hop_length=self.hop_length,
                win_length=self.win_length,
                n_fft=self.filter_length,
                dtype=np.float32,
            )
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0]
            )
            window_sum = torch.autograd.Variable(
                torch.from_numpy(window_sum), requires_grad=False
            )
            window_sum = window_sum  # no-op; kept from upstream
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
                approx_nonzero_indices
            ]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        # Strip the reflect-padding added in transform().
        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]

        return inverse_transform

    def forward(self, input_data):
        """Round-trip: analyze then resynthesize the input waveform."""
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
1407
+
1408
+
1409
class TacotronSTFT(torch.nn.Module):
    """Tacotron-style front-end: STFT followed by a mel filterbank and log compression."""

    def __init__(
        self,
        filter_length,
        hop_length,
        win_length,
        n_mel_channels,
        sampling_rate,
        mel_fmin,
        mel_fmax,
    ):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Fixed mel filterbank, stored as a buffer so it follows .to(device).
        mel_basis = librosa_mel_fn(
            sr = sampling_rate, n_fft = filter_length, n_mels = n_mel_channels, fmin = mel_fmin, fmax = mel_fmax
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)

    def spectral_normalize(self, magnitudes, normalize_fun):
        """Apply dynamic-range (log) compression to magnitudes."""
        output = dynamic_range_compression(magnitudes, normalize_fun)
        return output

    def spectral_de_normalize(self, magnitudes):
        """Invert spectral_normalize (exp)."""
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y, normalize_fun=torch.log):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert torch.min(y.data) >= -1, torch.min(y.data)
        assert torch.max(y.data) <= 1, torch.max(y.data)

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output, normalize_fun)
        # Per-frame energy (L2 norm over frequency bins).
        energy = torch.norm(magnitudes, dim=1)

        log_magnitudes = self.spectral_normalize(magnitudes, normalize_fun)

        return mel_output, log_magnitudes, energy
1460
+
1461
+
1462
def build_pretrained_models(ckpt):
    """Load the pretrained mel-VAE and its STFT front-end from a checkpoint.

    Extracts the `first_stage_model.*` weights and the stored `scale_factor`
    from a Lightning-style checkpoint, instantiates AutoencoderKL with the
    hard-coded 48 kHz configuration below, and returns (vae, fn_STFT), both
    in eval mode.
    """
    checkpoint = torch.load(ckpt, map_location="cpu")
    scale_factor = checkpoint["state_dict"]["scale_factor"].item()
    print("scale_factor: ", scale_factor)

    # Strip the "first_stage_model." prefix (18 characters) from matching keys.
    vae_state_dict = {k[18:]: v for k, v in checkpoint["state_dict"].items() if "first_stage_model." in k}

    # Hard-coded configuration matching the released 48 kHz checkpoint.
    config = {
        "preprocessing": {
            "audio": {
                "sampling_rate": 48000,
                "max_wav_value": 32768,
                "duration": 10.24
            },
            "stft": {
                "filter_length": 2048,
                "hop_length": 480,
                "win_length": 2048
            },
            "mel": {
                "n_mel_channels": 256,
                "mel_fmin": 20,
                "mel_fmax": 24000
            }
        },
        "model": {
            "params": {
                "first_stage_config": {
                    "params": {
                        "sampling_rate": 48000,
                        "batchsize": 4,
                        "monitor": "val/rec_loss",
                        "image_key": "fbank",
                        "subband": 1,
                        "embed_dim": 16,
                        "time_shuffle": 1,
                        # lossconfig is unused at inference (AutoencoderKL sets loss=None).
                        "lossconfig": {
                            "target": "audioldm2.latent_diffusion.modules.losses.LPIPSWithDiscriminator",
                            "params": {
                                "disc_start": 50001,
                                "kl_weight": 1000,
                                "disc_weight": 0.5,
                                "disc_in_channels": 1
                            }
                        },
                        "ddconfig": {
                            "double_z": True,
                            "mel_bins": 256,
                            "z_channels": 16,
                            "resolution": 256,
                            "downsample_time": False,
                            "in_channels": 1,
                            "out_ch": 1,
                            "ch": 128,
                            "ch_mult": [
                                1,
                                2,
                                4,
                                8
                            ],
                            "num_res_blocks": 2,
                            "attn_resolutions": [],
                            "dropout": 0
                        }
                    }
                },
            }
        }
    }
    vae_config = config["model"]["params"]["first_stage_config"]["params"]
    vae_config["scale_factor"] = scale_factor

    vae = AutoencoderKL(**vae_config)
    vae.load_state_dict(vae_state_dict)

    fn_STFT = TacotronSTFT(
        config["preprocessing"]["stft"]["filter_length"],
        config["preprocessing"]["stft"]["hop_length"],
        config["preprocessing"]["stft"]["win_length"],
        config["preprocessing"]["mel"]["n_mel_channels"],
        config["preprocessing"]["audio"]["sampling_rate"],
        config["preprocessing"]["mel"]["mel_fmin"],
        config["preprocessing"]["mel"]["mel_fmax"],
    )

    vae.eval()
    fn_STFT.eval()
    return vae, fn_STFT
1550
+
1551
+
MuCodec/tools/torch_tools.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import random
4
+ import itertools
5
+ import numpy as np
6
+
7
+
8
+
9
def normalize_wav(waveform):
    """Zero-center the waveform and rescale its peak amplitude to 0.5."""
    centered = waveform - torch.mean(waveform)
    # Epsilon avoids division by zero for an all-constant input.
    peak = torch.max(torch.abs(centered)) + 1e-8
    return (centered / peak) * 0.5
13
+
14
+
15
def pad_wav(waveform, segment_length):
    """Crop or zero-pad a 1-D waveform to exactly segment_length samples.

    A segment_length of None returns the waveform unchanged.
    """
    current_length = len(waveform)

    if segment_length is None or current_length == segment_length:
        return waveform
    if current_length > segment_length:
        return waveform[:segment_length]
    # Too short: append zeros on the same device.
    tail = torch.zeros(segment_length - current_length).to(waveform.device)
    return torch.cat([waveform, tail])
26
+
27
+
28
+ def _pad_spec(fbank, target_length=1024):
29
+ batch, n_frames, channels = fbank.shape
30
+ p = target_length - n_frames
31
+ if p > 0:
32
+ pad = torch.zeros(batch, p, channels).to(fbank.device)
33
+ fbank = torch.cat([fbank, pad], 1)
34
+ elif p < 0:
35
+ fbank = fbank[:, :target_length, :]
36
+
37
+ if channels % 2 != 0:
38
+ fbank = fbank[:, :, :-1]
39
+
40
+ return fbank
41
+
42
+
43
def read_wav_file(filename, segment_length):
    """Load an audio file, resample to 16 kHz mono, normalize, and pad/crop.

    Returns a (1, segment_length) tensor with peak amplitude 0.5. On a
    normalization failure the file is replaced by a dummy constant signal so
    one bad file does not abort a batch.
    """
    waveform, sr = torchaudio.load(filename)  # Faster!!!
    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)[0]
    try:
        waveform = normalize_wav(waveform)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit pass through.
        print("Exception normalizing:", filename)
        waveform = torch.ones(160000)
    waveform = pad_wav(waveform, segment_length).unsqueeze(0)
    # Re-normalize after padding; epsilon guards against an all-zero waveform
    # (a plain division produced NaNs in that edge case).
    waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-8)
    waveform = 0.5 * waveform
    return waveform
55
+
56
+
57
def get_mel_from_wav(audio, _stft):
    """Compute (mel, log-magnitude STFT, energy) from a waveform batch via `_stft`.

    Clips the audio to [-1, 1] and replaces NaNs before feature extraction.
    """
    sanitized = torch.nan_to_num(torch.clip(audio, -1, 1))
    sanitized = torch.autograd.Variable(sanitized, requires_grad=False)
    return _stft.mel_spectrogram(sanitized)
62
+
63
+
64
def wav_to_fbank(paths, target_length=1024, fn_STFT=None):
    """Load wav files and return (fbank, log-magnitude STFT, waveform) batches,
    each spectrogram padded/cropped to target_length frames."""
    assert fn_STFT is not None

    # hop size is 160 samples, so target_length frames span target_length * 160 samples
    segment_samples = target_length * 160
    waveform = torch.cat([read_wav_file(path, segment_samples) for path in paths], 0)

    fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
    # (batch, mel, frames) -> (batch, frames, mel) for padding.
    fbank = fbank.transpose(1, 2)
    log_magnitudes_stft = log_magnitudes_stft.transpose(1, 2)

    fbank = _pad_spec(fbank, target_length)
    log_magnitudes_stft = _pad_spec(log_magnitudes_stft, target_length)

    return fbank, log_magnitudes_stft, waveform
78
+
79
def wav_to_fbank2(waveform, target_length=-1, fn_STFT=None):
    """Like wav_to_fbank but for an already-loaded waveform batch.

    Padding/cropping to target_length frames is skipped when target_length <= 0.
    """
    assert fn_STFT is not None

    fbank, log_magnitudes_stft, energy = get_mel_from_wav(waveform, fn_STFT)
    # (batch, mel, frames) -> (batch, frames, mel).
    fbank = fbank.transpose(1, 2)
    log_magnitudes_stft = log_magnitudes_stft.transpose(1, 2)
    # print(fbank.shape, log_magnitudes_stft.shape)

    if target_length > 0:
        fbank = _pad_spec(fbank, target_length)
        log_magnitudes_stft = _pad_spec(log_magnitudes_stft, target_length)

    return fbank, log_magnitudes_stft, waveform
93
+
94
+
95
def uncapitalize(s):
    """Lower-case only the first character of s; empty/falsy input yields ""."""
    if not s:
        return ""
    return s[:1].lower() + s[1:]
100
+
__pycache__/audio_tokens.cpython-310.pyc ADDED
Binary file (826 Bytes). View file
 
__pycache__/audio_tokens.cpython-312.pyc ADDED
Binary file (951 Bytes). View file
 
__pycache__/condition_encoders.cpython-310.pyc ADDED
Binary file (4.24 kB). View file
 
__pycache__/condition_encoders.cpython-312.pyc ADDED
Binary file (6.85 kB). View file
 
__pycache__/dataset.cpython-310.pyc ADDED
Binary file (12.6 kB). View file
 
__pycache__/dataset.cpython-312.pyc ADDED
Binary file (21.5 kB). View file
 
__pycache__/decoders.cpython-310.pyc ADDED
Binary file (3.86 kB). View file
 
__pycache__/decoders.cpython-312.pyc ADDED
Binary file (5.77 kB). View file
 
__pycache__/inference_full.cpython-310.pyc ADDED
Binary file (24.5 kB). View file
 
__pycache__/inference_full.cpython-312.pyc ADDED
Binary file (35 kB). View file
 
__pycache__/modelling_qwen3.cpython-310.pyc ADDED
Binary file (5.33 kB). View file
 
__pycache__/modelling_qwen3.cpython-312.pyc ADDED
Binary file (9.53 kB). View file
 
__pycache__/runtime_utils.cpython-310.pyc ADDED
Binary file (3.07 kB). View file
 
__pycache__/runtime_utils.cpython-312.pyc ADDED
Binary file (3.65 kB). View file
 
audio_tokens.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Special-token strings used to delimit and mask audio spans in the
# tokenized text stream (registered on the tokenizer in
# add_audio_special_tokens below).
SOA_TOKEN = "[SOA]"  # start-of-audio marker
EOA_TOKEN = "[EOA]"  # end-of-audio marker
MASK_AUDIO_TOKEN = "[MASK_AUDIO]"  # placeholder for masked audio positions
7
+
8
+
9
def audio_id_to_token(audio_id: int) -> str:
    """Map an integer audio codebook id to its special-token string."""
    return "<AUDIO_{}>".format(int(audio_id))
11
+
12
def add_audio_special_tokens(tokenizer, num_audio_token: int) -> int:
    """Register all audio special tokens on *tokenizer*.

    Adds one ``<AUDIO_i>`` token per codebook entry followed by the mask /
    start / end markers, and returns whatever ``tokenizer.add_tokens``
    reports (the number of tokens actually added).
    """
    extra_tokens = [audio_id_to_token(i) for i in range(num_audio_token)]
    extra_tokens.extend([MASK_AUDIO_TOKEN, SOA_TOKEN, EOA_TOKEN])
    return tokenizer.add_tokens(extra_tokens, special_tokens=True)
batch_infer_checkpoints.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import json
6
+ import traceback
7
+ from pathlib import Path
8
+
9
+ import datasets
10
+ import torch
11
+
12
+ from inference_full import (
13
+ TokenLayout,
14
+ batch_generate_segmentwise,
15
+ build_mucodec_decoder,
16
+ generate_segmentwise,
17
+ load_hf_template_sample_from_music_dataset,
18
+ save_outputs,
19
+ )
20
+ from runtime_utils import (
21
+ load_magel_checkpoint,
22
+ load_music_dataset,
23
+ maybe_compile_model,
24
+ resolve_device,
25
+ seed_everything,
26
+ )
27
+
28
+
29
def parse_args() -> argparse.Namespace:
    """Parse CLI arguments for multi-checkpoint batch inference.

    Requires at least one of --checkpoint_list / --checkpoint_dir;
    parser.error() exits with a usage message otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Run audio inference on validation samples for multiple checkpoints."
    )
    parser.add_argument(
        "--checkpoint_list",
        type=str,
        default=None,
        help="Text file with one checkpoint path per line.",
    )
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        default=None,
        help="Directory to scan for checkpoint-* subdirectories and optional final.",
    )
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="muse_mucodec_chord.ds",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="validation",
    )
    parser.add_argument(
        "--tokenizer_path",
        type=str,
        default="checkpoints/Qwen3-0.6B",
    )
    parser.add_argument(
        "--sample_indices",
        type=int,
        nargs="*",
        default=None,
        help="Specific sample indices to infer. Leave unset to run the full split.",
    )
    parser.add_argument(
        "--max_samples",
        type=int,
        default=0,
        help="Run only the first N samples from the split. Ignored if --sample_indices is set.",
    )
    parser.add_argument(
        "--infer_batch_size",
        type=int,
        default=1,
        help="Number of samples to decode together per step for the same checkpoint.",
    )
    # Sampling hyper-parameters for the audio-token decoder.
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=50)
    parser.add_argument("--top_p", type=float, default=0.90)
    parser.add_argument("--greedy", action="store_true", default=False)
    parser.add_argument("--max_audio_tokens", type=int, default=0)
    parser.add_argument("--fps", type=int, default=25)
    parser.add_argument("--seed", type=int, default=1234)
    parser.add_argument("--device", type=str, default="auto")
    parser.add_argument(
        "--dtype",
        type=str,
        default="bfloat16",
        choices=["float32", "float16", "bfloat16"],
    )
    parser.add_argument(
        "--attn_implementation",
        type=str,
        default="sdpa",
        choices=["eager", "sdpa", "flash_attention_2"],
    )
    # NOTE(review): --use_cache defaults to True, so passing the flag is a
    # no-op; caching is effectively disabled only via --no_cache (combined
    # in main() as `use_cache and not no_cache`).
    parser.add_argument("--use_cache", action="store_true", default=True)
    parser.add_argument("--no_cache", action="store_true", default=False)
    parser.add_argument("--compile", action="store_true", default=False)
    parser.add_argument(
        "--compile_mode",
        type=str,
        default="reduce-overhead",
        choices=["default", "reduce-overhead", "max-autotune"],
    )
    # MuCodec decoder settings (forwarded to build_mucodec_decoder).
    parser.add_argument("--mucodec_device", type=str, default="auto")
    parser.add_argument("--mucodec_layer_num", type=int, default=7)
    parser.add_argument("--mucodec_duration", type=float, default=40.96)
    parser.add_argument("--mucodec_guidance_scale", type=float, default=1.5)
    parser.add_argument("--mucodec_num_steps", type=int, default=20)
    parser.add_argument("--mucodec_sample_rate", type=int, default=48000)
    parser.add_argument(
        "--output_dir",
        type=str,
        default="/root/new_batch_predictions",
        help="Root output dir. Each checkpoint gets its own subdirectory.",
    )
    parser.add_argument(
        "--summary_json",
        type=str,
        default="/root/new_batch_predictions/summary.json",
    )
    args = parser.parse_args()
    if not args.checkpoint_list and not args.checkpoint_dir:
        parser.error("one of --checkpoint_list or --checkpoint_dir is required")
    return args
129
+
130
+
131
def parse_checkpoint_list(path: str) -> list[str]:
    """Read checkpoint paths from a text file, one per line.

    Blank lines and lines starting with '#' (after stripping) are skipped.

    Raises:
        ValueError: if the file yields no checkpoint paths.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped = [line.strip() for line in handle]

    checkpoints = [entry for entry in stripped if entry and not entry.startswith("#")]
    if not checkpoints:
        raise ValueError(f"No checkpoints found in list: {path}")
    return checkpoints
142
+
143
+
144
def scan_checkpoint_dir(path: str) -> list[str]:
    """Collect checkpoint directories under *path*.

    Finds every ``checkpoint-*`` subdirectory, sorted by training step
    (numeric suffixes ascending first, then non-numeric suffixes
    lexicographically), and appends an optional ``final`` directory.

    Bug fix: the previous sort key returned ``int`` for numeric suffixes and
    ``str`` otherwise, so a directory mixing ``checkpoint-100`` with e.g.
    ``checkpoint-best`` raised ``TypeError`` inside ``sorted``. A homogeneous
    tuple key is used instead.

    Raises:
        NotADirectoryError: if *path* is not a directory.
        ValueError: if no checkpoint directories are found.
    """
    root = Path(path)
    if not root.is_dir():
        raise NotADirectoryError(f"Checkpoint directory not found: {path}")

    def _step_key(item: Path) -> tuple[int, int, str]:
        # (group, numeric step, name) — comparable for every directory name.
        suffix = item.name.split("-", 1)[1]
        if suffix.isdigit():
            return (0, int(suffix), "")
        return (1, 0, suffix)

    checkpoint_dirs = sorted(
        (
            item
            for item in root.iterdir()
            if item.is_dir() and item.name.startswith("checkpoint-")
        ),
        key=_step_key,
    )

    final_dir = root / "final"
    if final_dir.is_dir():
        checkpoint_dirs.append(final_dir)

    checkpoints = [str(path_obj) for path_obj in checkpoint_dirs]
    if not checkpoints:
        raise ValueError(f"No checkpoint-* directories found under: {path}")
    return checkpoints
169
+
170
+
171
def get_dtype(name: str) -> torch.dtype:
    """Resolve a dtype name string to its torch dtype (KeyError if unknown)."""
    name_to_dtype = {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }
    return name_to_dtype[name]
177
+
178
+
179
def get_split_size(dataset_path: str, split: str) -> int:
    """Return the row count of *split* in the dataset saved at *dataset_path*.

    A plain Dataset (no splits) ignores *split*; a DatasetDict raises
    KeyError when the split is missing.
    """
    loaded = datasets.load_from_disk(dataset_path)
    if not isinstance(loaded, datasets.DatasetDict):
        return len(loaded)
    if split not in loaded:
        raise KeyError(f"Split not found: {split}")
    return len(loaded[split])
186
+
187
+
188
def resolve_sample_indices(
    dataset_path: str,
    split: str,
    sample_indices: list[int] | None,
    max_samples: int,
) -> list[int]:
    """Decide which sample indices to run.

    Explicit *sample_indices* win outright; otherwise the whole split is
    used, optionally truncated to the first *max_samples* rows.
    """
    if sample_indices:
        return list(sample_indices)

    count = get_split_size(dataset_path, split)
    if max_samples and max_samples > 0:
        count = min(count, max_samples)
    return list(range(count))
200
+
201
+
202
def sanitize_checkpoint_name(checkpoint_path: str) -> str:
    """Build a flat, filesystem-friendly name '<parent>__<leaf>' for a path.

    A bare leaf (no parent component) is returned unchanged.
    """
    leaf = Path(checkpoint_path.rstrip("/"))
    parent_name = leaf.parent.name
    return f"{parent_name}__{leaf.name}" if parent_name else leaf.name
207
+
208
+
209
def chunk_list(items: list[int], chunk_size: int) -> list[list[int]]:
    """Split *items* into consecutive chunks of at most *chunk_size* elements."""
    chunks: list[list[int]] = []
    for start in range(0, len(items), chunk_size):
        chunks.append(items[start : start + chunk_size])
    return chunks
211
+
212
+
213
def main() -> None:
    """Run segment-wise audio generation on a set of samples for every
    checkpoint, decode tokens to WAV via MuCodec, and write a JSON summary.

    Per checkpoint: load model, decode all requested samples (batched when
    --infer_batch_size > 1, with a single-sample fallback on batch failure),
    save outputs, then free the model before moving on. A failure on one
    checkpoint is recorded in the summary and does not stop the others.
    """
    args = parse_args()
    seed_everything(args.seed)

    # --checkpoint_list takes precedence over --checkpoint_dir.
    if args.checkpoint_list:
        checkpoints = parse_checkpoint_list(args.checkpoint_list)
    else:
        checkpoints = scan_checkpoint_dir(args.checkpoint_dir)
    sample_indices = resolve_sample_indices(
        dataset_path=args.dataset_path,
        split=args.split,
        sample_indices=args.sample_indices,
        max_samples=args.max_samples,
    )

    # --use_cache defaults True, so this reduces to "not --no_cache".
    use_cache = args.use_cache and not args.no_cache
    device = resolve_device(args.device)
    dtype = get_dtype(args.dtype)
    if device.type == "cpu" and dtype != torch.float32:
        print(f"[WARN] dtype {dtype} on CPU may be unsupported; fallback to float32.")
        dtype = torch.float32

    output_root = Path(args.output_dir)
    output_root.mkdir(parents=True, exist_ok=True)

    print(f"[INFO] checkpoints={len(checkpoints)}")
    print(f"[INFO] samples_per_checkpoint={len(sample_indices)}")
    print(f"[INFO] device={device}, dtype={dtype}, use_cache={use_cache}")

    # The MuCodec decoder is checkpoint-independent: build it once.
    mucodec_decoder = build_mucodec_decoder(args)
    summary: list[dict] = []

    for checkpoint_path in checkpoints:
        ckpt_name = sanitize_checkpoint_name(checkpoint_path)
        ckpt_output_dir = output_root / ckpt_name
        json_dir = ckpt_output_dir / "json"
        wav_dir = ckpt_output_dir / "wav"

        print(f"\n[INFO] loading model from {checkpoint_path}")
        model = load_magel_checkpoint(
            checkpoint_path=checkpoint_path,
            device=device,
            dtype=dtype,
            attn_implementation=args.attn_implementation,
        )
        model = maybe_compile_model(
            model,
            enabled=bool(args.compile),
            mode=str(args.compile_mode),
        )
        # Dataset tokenization depends on the checkpoint's audio vocab size,
        # so the dataset is (re)loaded per checkpoint.
        num_audio_codebook = int(getattr(model.config, "magel_num_audio_token", 16384))
        music_ds = load_music_dataset(
            dataset_path=args.dataset_path,
            split=args.split,
            tokenizer_path=args.tokenizer_path,
            num_audio_token=num_audio_codebook,
            use_fast=True,
        )

        checkpoint_record = {
            "checkpoint_path": checkpoint_path,
            "checkpoint_name": ckpt_name,
            "status": "ok",
            "num_samples_requested": len(sample_indices),
            "results": [],
        }

        try:
            for batch_indices in chunk_list(sample_indices, max(1, int(args.infer_batch_size))):
                samples = []
                for sample_idx in batch_indices:
                    print(
                        f"[INFO] checkpoint={ckpt_name} sample_idx={sample_idx} split={args.split}"
                    )
                    samples.append(
                        load_hf_template_sample_from_music_dataset(
                            music_ds=music_ds,
                            sample_idx=sample_idx,
                            num_audio_codebook=num_audio_codebook,
                        )
                    )

                # Layout is taken from the first sample of the batch;
                # presumably num_text_token is identical across samples of
                # one dataset — TODO confirm.
                layout = TokenLayout(
                    num_text_token=samples[0].num_text_token,
                    num_audio_codebook=num_audio_codebook,
                )

                if len(samples) == 1:
                    batch_outputs = [
                        generate_segmentwise(
                            model=model,
                            sample=samples[0],
                            layout=layout,
                            device=device,
                            use_cache=use_cache,
                            temperature=float(args.temperature),
                            top_k=int(args.top_k),
                            top_p=float(args.top_p),
                            greedy=bool(args.greedy),
                            max_audio_tokens=max(0, int(args.max_audio_tokens)),
                        )
                    ]
                else:
                    # Batched decode is best-effort: on any failure, fall back
                    # to one-by-one decoding with identical sampling settings.
                    try:
                        batch_outputs = batch_generate_segmentwise(
                            model=model,
                            samples=samples,
                            layout=layout,
                            device=device,
                            use_cache=use_cache,
                            temperature=float(args.temperature),
                            top_k=int(args.top_k),
                            top_p=float(args.top_p),
                            greedy=bool(args.greedy),
                            max_audio_tokens=max(0, int(args.max_audio_tokens)),
                        )
                    except Exception as exc:
                        print(
                            "[WARN] batch_generate_segmentwise failed; "
                            f"falling back to single-sample decode. error={exc!r}"
                        )
                        traceback.print_exc()
                        batch_outputs = [
                            generate_segmentwise(
                                model=model,
                                sample=sample,
                                layout=layout,
                                device=device,
                                use_cache=use_cache,
                                temperature=float(args.temperature),
                                top_k=int(args.top_k),
                                top_p=float(args.top_p),
                                greedy=bool(args.greedy),
                                max_audio_tokens=max(0, int(args.max_audio_tokens)),
                            )
                            for sample in samples
                        ]

                for sample_idx, sample, batch_output in zip(batch_indices, samples, batch_outputs):
                    generated_ids, sampled_count, sampled_chord_ids, sampled_segment_ids = batch_output
                    prefix = f"{sample_idx:05d}_{sample.song_id}"

                    # save_outputs expects these attributes on args.
                    # (Mutating the parsed namespace per sample is a known wart.)
                    args.sample_idx = sample_idx
                    args.json_output_dir = str(json_dir)
                    args.wav_output_dir = str(wav_dir)

                    save_outputs(
                        output_dir=str(ckpt_output_dir),
                        output_prefix=prefix,
                        sample=sample,
                        layout=layout,
                        generated_ids=generated_ids,
                        sampled_chord_ids=sampled_chord_ids,
                        sampled_segment_ids=sampled_segment_ids,
                        args=args,
                        mucodec_decoder=mucodec_decoder,
                    )

                    # NOTE(review): wav/json paths are reconstructed here, not
                    # returned by save_outputs — verify naming stays in sync.
                    checkpoint_record["results"].append(
                        {
                            "sample_idx": sample_idx,
                            "song_id": sample.song_id,
                            "generated_audio_tokens": sampled_count,
                            "wav_path": str(wav_dir / f"{prefix}.wav"),
                            "json_path": str(json_dir / f"{prefix}.chord_segment.json"),
                        }
                    )
        except Exception as exc:
            # Record the failure but keep iterating remaining checkpoints.
            checkpoint_record["status"] = "error"
            checkpoint_record["error"] = str(exc)
            print(f"[ERROR] checkpoint {checkpoint_path}: {exc!r}")
            traceback.print_exc()

        summary.append(checkpoint_record)

        # Release GPU memory before loading the next checkpoint.
        del model
        if device.type == "cuda":
            torch.cuda.empty_cache()

    summary_path = Path(args.summary_json)
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print(f"\nSaved summary to: {summary_path}")
400
+
401
+ if __name__ == "__main__":
402
+ main()
condition_encoders.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from typing import Optional
5
+
6
+ from vocab import NUM_CHORD_CLASSES, NUM_STRUCTURE_CLASSES
7
+
8
+
9
class EncoderBlock(nn.Module):
    """A small stack of bidirectional (non-causal) Transformer encoder layers.

    Pre-norm, GELU, batch-first; feed-forward width is fixed at 4*d_model.
    The submodule is named ``enc`` — do not rename, state_dict keys depend
    on it.
    """

    def __init__(
        self, d_model: int, n_layers: int = 2, n_heads: int = 8, dropout: float = 0.0
    ):
        super().__init__()
        base_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=4 * d_model,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.enc = nn.TransformerEncoder(base_layer, num_layers=n_layers)

    def forward(
        self, x: torch.Tensor, pad_mask: Optional[torch.BoolTensor] = None
    ) -> torch.Tensor:
        """Encode ``x`` [B,T,D]; ``pad_mask`` [B,T] marks PAD positions with True."""
        encoded = self.enc(x, src_key_padding_mask=pad_mask)
        return encoded
30
+
31
+
32
class ConditionEncoder(nn.Module):
    """
    Condition encoder for AdaLN-Zero:
    Inputs: two aligned sequences
    x_chord: [B,T,D_in]
    x_seg: [B,T,D_in]

    Output:
    cond_expanded: [B,T,H] (feed into your AdaLN layers as cond_expanded)

    What it encodes per token:
    - token-level: chord/segment content at time t
    - position: global position (always) + optional segment-relative position
    - segment context: via x_seg + bidirectional transformer mixing

    Notes:
    - Non-causal (sees future): good for "guidance" conditions.
    - Compute once per sample at generation start; slice per step.
    - Index 0 is the padding id for both embeddings (padding_idx=0).
    """

    def __init__(
        self,
        hidden_size: int,
        chord_embed_dim: int = 512,
        structure_embed_dim: int = 512,
        n_layers: int = 2,
        n_heads: int = 8,
        dropout: float = 0.0,
    ):
        super().__init__()

        self.hidden_size = hidden_size
        # Vocabulary sizes come from the shared `vocab` module; id 0 embeds
        # to a frozen zero vector (padding_idx).
        self.chord_embedding = nn.Embedding(
            NUM_CHORD_CLASSES, chord_embed_dim, padding_idx=0
        )
        self.structure_embedding = nn.Embedding(
            NUM_STRUCTURE_CLASSES, structure_embed_dim, padding_idx=0
        )

        # Chord and structure embeddings are concatenated, then projected
        # down to the model hidden size.
        self.cond_dim = chord_embed_dim + structure_embed_dim
        self.cond_proj = nn.Linear(self.cond_dim, hidden_size)

        # Small bidirectional transformer
        self.encoder = EncoderBlock(
            d_model=hidden_size, n_layers=n_layers, n_heads=n_heads, dropout=dropout
        )

        self.proj_out = nn.Linear(hidden_size, hidden_size)

    @staticmethod
    def _sincos_pos(
        positions: torch.Tensor, dim: int, dtype: torch.dtype
    ) -> torch.Tensor:
        """
        positions: [B, T], absolute positions (0..T-1)
        returns: [B, T, dim] sinusoidal positional encoding

        Computed in float32 and cast to *dtype* at the end for numeric
        stability. For odd *dim*, the last channel is left at zero.
        """
        if dim <= 0:
            raise ValueError("dim must be > 0 for positional encoding.")

        half = dim // 2
        if half == 0:
            # dim == 1: no sin/cos pair fits; return all-zero encoding.
            return torch.zeros(
                positions.size(0),
                positions.size(1),
                dim,
                device=positions.device,
                dtype=dtype,
            )

        pos = positions.to(dtype=torch.float32)
        # Geometric frequency ladder, 1 .. 1/10000, as in "Attention Is All
        # You Need".
        freqs = torch.exp(
            -math.log(10000.0)
            * torch.arange(half, device=positions.device, dtype=torch.float32)
            / half
        )
        angles = pos.unsqueeze(-1) * freqs  # [B, T, half]

        enc = torch.zeros(
            positions.size(0),
            positions.size(1),
            dim,
            device=positions.device,
            dtype=torch.float32,
        )
        # Interleave: sin on even channels, cos on odd channels.
        enc[..., 0 : 2 * half : 2] = torch.sin(angles)
        enc[..., 1 : 2 * half : 2] = torch.cos(angles)

        return enc.to(dtype=dtype)

    def forward(
        self,
        chord_ids: torch.Tensor,  # [B, T]
        structure_ids: torch.Tensor,  # [B, T]
    ) -> torch.Tensor:
        """Encode aligned chord/structure id sequences into [B, T, hidden_size]."""

        chord_emb = self.chord_embedding(chord_ids)  # [B, T, chord_dim]
        structure_emb = self.structure_embedding(structure_ids)  # [B, T, struct_dim]

        cond = torch.cat([chord_emb, structure_emb], dim=-1)
        cond = self.cond_proj(cond)

        # Encoder attention mask is computed separately from condition content.
        # True means this token can be attended by the condition encoder.
        # A position is valid if either id is non-padding.
        valid_tokens = chord_ids.ne(0) | structure_ids.ne(0)
        pad_mask = ~valid_tokens

        # Position ids are contiguous only on valid condition-id tokens.
        # (Padding positions are forced back to position 0.)
        pos = valid_tokens.to(torch.long).cumsum(dim=1) - 1
        pos = torch.where(valid_tokens, pos, torch.zeros_like(pos))

        # Positional encoding is added only at valid positions; padding rows
        # stay at their (zero) content embedding.
        pos_enc = self._sincos_pos(pos, self.hidden_size, cond.dtype)
        valid_mask = valid_tokens.unsqueeze(-1)
        cond = cond + pos_enc * valid_mask.to(dtype=cond.dtype)
        encoded = self.encoder(cond, pad_mask=pad_mask)

        # [B, T, hidden_size]
        return self.proj_out(encoded)
dataset.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding:utf-8 -*-
3
+ """Dataset/collate implementation for music training data."""
4
+
5
+ import math
6
+ import re
7
+
8
+ import torch
9
+ from torch.nn.utils.rnn import pad_sequence
10
+ from transformers import AutoTokenizer
11
+
12
+ from audio_tokens import (
13
+ EOA_TOKEN,
14
+ MASK_AUDIO_TOKEN,
15
+ SOA_TOKEN,
16
+ add_audio_special_tokens,
17
+ audio_id_to_token,
18
+ )
19
+ from vocab import (
20
+ CHORD_BOS_ID,
21
+ CHORD_EOS_ID,
22
+ STRUCTURE_BOS_ID,
23
+ STRUCTURE_EOS_ID,
24
+ build_frame_chord_ids,
25
+ build_frame_structure_ids,
26
+ normalize_structure_label,
27
+ )
28
+
29
+
30
# Language tags (lower-cased) that trigger Chinese-specific lyric handling
# in detect_language().
CN_LANGUAGE_LABELS = {"cn", "zh", "zh-cn", "chinese"}
# Display names for normalized structure labels (keys produced by
# normalize_structure_label).
SECTION_NAME_MAP = {
    "intro": "Intro",
    "verse": "Verse",
    "chorus": "Chorus",
    "prechorus": "Pre-Chorus",
    "bridge": "Bridge",
    "outro": "Outro",
    "pad": "Pad",
}
# Sections that normally occur once per song; their first occurrence is
# rendered without a number.
SINGLETON_SECTION_NAMES = {"intro", "outro", "pad"}
# Characters accepted as sentence-final punctuation (ASCII + full-width);
# normalize_section_text appends ';' when the text ends with anything else.
ENDING_PUNCTUATION = {".", ";", "!", "?", "。", "?", "!", ";"}
42
+
43
+
44
+ def _pad_batch_field(batch, key: str, padding_value):
45
+ return pad_sequence(
46
+ [row[key] for row in batch],
47
+ batch_first=True,
48
+ padding_value=padding_value,
49
+ )
50
+
51
+
52
+ def detect_language(text: str, language: str | None = None) -> str:
53
+ return (
54
+ text.replace(" ", ";")
55
+ if str(language).strip().lower() in CN_LANGUAGE_LABELS
56
+ else text
57
+ )
58
+
59
+
60
+ def normalize_section_text(
61
+ text: str, structure: str, language: str | None = None
62
+ ) -> str:
63
+ text = str(text or "")
64
+ text = (
65
+ text.replace(f"[{structure.upper()}]", "")
66
+ .replace(f"[{structure.lower()}]", "")
67
+ .replace(",", ";")
68
+ .replace(".", ";")
69
+ .replace(",", ";")
70
+ .replace("。", ";")
71
+ )
72
+ text = detect_language(text, language=language)
73
+ text = re.sub(r";(?=[A-Za-z])", "; ", text)
74
+ if text and text[-1] not in ENDING_PUNCTUATION:
75
+ text += ";"
76
+ return text
77
+
78
+
79
class DataCollate:
    """Collate callable: pads each per-sample field to a rectangular batch."""

    def __call__(self, batch):
        token_ids = _pad_batch_field(batch, "token_ids", 0)
        return {
            "input_ids": token_ids,
            # Labels alias input_ids; loss masking is applied downstream.
            "labels": token_ids,
            "masks": _pad_batch_field(batch, "mask", 0),
            "attention_mask": _pad_batch_field(batch, "attention_mask", 0),
            "chord_ids": _pad_batch_field(batch, "chord_ids", 0),
            "structure_ids": _pad_batch_field(batch, "structure_ids", 0),
            "condition_mask": _pad_batch_field(batch, "condition_mask", False),
        }
98
+
99
+
100
+ class MusicDataset(torch.utils.data.Dataset):
101
+ """Fly dataset with music-code tokens and section-conditioned text."""
102
+
103
+ def __init__(
104
+ self,
105
+ datasets,
106
+ split: str,
107
+ tokenizer_path: str,
108
+ num_audio_token=16384,
109
+ fps=25,
110
+ use_fast=True,
111
+ ):
112
+ self._data = datasets[split]
113
+ self.tokenizer_path = tokenizer_path
114
+ self.use_fast = use_fast
115
+ self.num_audio_token = num_audio_token
116
+ self.fps = fps
117
+
118
+ self.tokenizer = AutoTokenizer.from_pretrained(
119
+ self.tokenizer_path,
120
+ local_files_only=True,
121
+ use_fast=self.use_fast,
122
+ )
123
+ add_audio_special_tokens(self.tokenizer, self.num_audio_token)
124
+ self.tokenizer_vocab_size = len(self.tokenizer)
125
+
126
+ self.audio_prefix_length = int(
127
+ self.tokenizer.convert_tokens_to_ids(audio_id_to_token(0))
128
+ )
129
+ self.num_text_token = self.audio_prefix_length
130
+ self.MASK_AUDIO = int(self.tokenizer.convert_tokens_to_ids(MASK_AUDIO_TOKEN))
131
+ self.BOS_AUDIO = int(self.tokenizer.convert_tokens_to_ids(SOA_TOKEN))
132
+ self.EOS_AUDIO = int(self.tokenizer.convert_tokens_to_ids(EOA_TOKEN))
133
+ self._assistant_audio_placeholder = f"{SOA_TOKEN}{EOA_TOKEN}"
134
+ self._chat_template_kwargs = {"enable_thinking": False}
135
+
136
+ def __len__(self):
137
+ return len(self._data)
138
+
139
+ @staticmethod
140
+ def _positions(token_ids: torch.Tensor, target_id: int) -> torch.Tensor:
141
+ return torch.nonzero(token_ids == target_id, as_tuple=False).squeeze(-1)
142
+
143
+ @staticmethod
144
+ def _sorted_sections(sample: dict) -> list[dict]:
145
+ return sorted(
146
+ (
147
+ {
148
+ "raw_index": raw_index,
149
+ "text": str(seg["text"]),
150
+ "desc": str(seg["desc"]).strip(),
151
+ "start": float(seg["start"]),
152
+ "end": float(seg["end"]),
153
+ "structure": normalize_structure_label(seg["section"]),
154
+ }
155
+ for raw_index, seg in enumerate(sample.get("sections", []))
156
+ ),
157
+ key=lambda seg: (seg["start"], seg["end"], seg["raw_index"]),
158
+ )
159
+
160
+ @staticmethod
161
+ def _sorted_chords(sample: dict) -> list[dict]:
162
+ return sorted(
163
+ (
164
+ {
165
+ "raw_index": raw_index,
166
+ "type": str(seg.get("type")),
167
+ "start": float(seg.get("start", 0.0)),
168
+ "end": float(seg.get("end", 0.0)),
169
+ }
170
+ for raw_index, seg in enumerate(sample.get("chords", []))
171
+ ),
172
+ key=lambda seg: (seg["start"], seg["end"], seg["raw_index"]),
173
+ )
174
+
175
+ def __getitem__(self, idx):
176
+ sample = self._data[idx]
177
+ sections = self._prepare_sections(sample)
178
+ chords = self._prepare_chords(sample)
179
+ token_ids, attention_mask, frame_idx_map = self._tokenize_messages(
180
+ self._build_messages(sample, sections),
181
+ sample["mucodec_codes"],
182
+ sections,
183
+ )
184
+
185
+ total_frames = len(sample["mucodec_codes"])
186
+ structure_ids = build_frame_structure_ids(sections, total_frames, fps=self.fps)
187
+ chord_ids = build_frame_chord_ids(chords, total_frames, fps=self.fps)
188
+
189
+ structure_ids = torch.from_numpy(structure_ids)
190
+ chord_ids = torch.from_numpy(chord_ids)
191
+
192
+ (
193
+ audio_codebook_mask,
194
+ bos_audio_mask,
195
+ eos_mask,
196
+ label_mask,
197
+ condition_mask,
198
+ ) = self._build_token_masks(token_ids)
199
+
200
+ chord_ids_aligned, structure_ids_aligned = self._align_condition_ids(
201
+ token_ids=token_ids,
202
+ frame_idx_map=frame_idx_map,
203
+ total_frames=total_frames,
204
+ chord_ids=chord_ids,
205
+ structure_ids=structure_ids,
206
+ audio_codebook_mask=audio_codebook_mask,
207
+ bos_audio_mask=bos_audio_mask,
208
+ eos_mask=eos_mask,
209
+ )
210
+
211
+ return {
212
+ "token_ids": token_ids,
213
+ "mask": label_mask,
214
+ "attention_mask": attention_mask,
215
+ "chord_ids": chord_ids_aligned,
216
+ "structure_ids": structure_ids_aligned,
217
+ "condition_mask": condition_mask,
218
+ }
219
+
220
+ def _tokenize_messages(
221
+ self,
222
+ messages: list[dict[str, str]],
223
+ full_audio_codes,
224
+ sections: list[dict],
225
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
226
+
227
+ chat_inputs = self.tokenizer.apply_chat_template(
228
+ messages,
229
+ tokenize=True,
230
+ add_generation_prompt=False,
231
+ return_tensors="pt",
232
+ return_dict=True,
233
+ **self._chat_template_kwargs,
234
+ )
235
+
236
+ token_ids = chat_inputs["input_ids"]
237
+ attention_mask = chat_inputs["attention_mask"]
238
+
239
+ token_ids = token_ids.squeeze(0)
240
+ attention_mask = attention_mask.squeeze(0)
241
+
242
+ token_ids = token_ids.to(torch.long)
243
+ attention_mask = attention_mask.to(torch.long)
244
+
245
+ return self._expand_audio_tokens(
246
+ token_ids=token_ids,
247
+ attention_mask=attention_mask,
248
+ full_audio_codes=full_audio_codes,
249
+ sections=sections,
250
+ )
251
+
252
+ def _frame_bounds(
253
+ self,
254
+ start: float,
255
+ end: float,
256
+ total_frames: int,
257
+ prev_end_idx: int = 0,
258
+ ) -> tuple[int, int]:
259
+ start_idx = int(start * self.fps)
260
+ end_idx = int(math.ceil(end * self.fps))
261
+ start_idx = max(prev_end_idx, min(total_frames, start_idx))
262
+ end_idx = max(start_idx, min(total_frames, end_idx))
263
+
264
+ return start_idx, end_idx
265
+
266
+ def _prepare_sections(self, sample: dict) -> list[dict]:
267
+ sections = []
268
+ section_counts: dict[str, int] = {}
269
+ sample_language = sample.get("language")
270
+ total_frames = len(sample["mucodec_codes"])
271
+ prev_end_idx = 0
272
+
273
+ for seg in self._sorted_sections(sample):
274
+ structure = seg["structure"]
275
+ section_counts[structure] = section_counts.get(structure, 0) + 1
276
+ raw_start_idx = max(0, min(total_frames, int(seg["start"] * self.fps)))
277
+ raw_end_idx = max(
278
+ raw_start_idx,
279
+ min(total_frames, int(math.ceil(seg["end"] * self.fps))),
280
+ )
281
+ start_idx = prev_end_idx
282
+ end_idx = max(start_idx, raw_end_idx)
283
+
284
+ sections.append(
285
+ {
286
+ "text": normalize_section_text(
287
+ seg["text"], structure, language=sample_language
288
+ ),
289
+ "desc": seg["desc"],
290
+ "start": start_idx / float(self.fps),
291
+ "end": end_idx / float(self.fps),
292
+ "start_frame": start_idx,
293
+ "end_frame": end_idx,
294
+ "structure": structure,
295
+ "tag": f"{structure}{section_counts[structure]}",
296
+ "index": section_counts[structure],
297
+ }
298
+ )
299
+ prev_end_idx = end_idx
300
+
301
+ if sections:
302
+ sections[-1]["end_frame"] = total_frames
303
+ sections[-1]["end"] = total_frames / float(self.fps)
304
+
305
+ return sections
306
+
307
+ def _prepare_chords(self, sample: dict) -> list[dict]:
308
+ chords = []
309
+ total_frames = len(sample["mucodec_codes"])
310
+ prev_end_idx = 0
311
+
312
+ for seg in self._sorted_chords(sample):
313
+ start_idx, end_idx = self._frame_bounds(
314
+ seg["start"],
315
+ seg["end"],
316
+ total_frames,
317
+ prev_end_idx=prev_end_idx,
318
+ )
319
+
320
+ chords.append(
321
+ {
322
+ "type": seg["type"],
323
+ "start": start_idx / float(self.fps),
324
+ "end": end_idx / float(self.fps),
325
+ "start_frame": start_idx,
326
+ "end_frame": end_idx,
327
+ }
328
+ )
329
+ prev_end_idx = end_idx
330
+
331
+ return chords
332
+
333
+ def _format_section_label(self, section: dict) -> str:
334
+ structure = section["structure"]
335
+ index = section["index"]
336
+ label = SECTION_NAME_MAP[structure]
337
+ if structure in SINGLETON_SECTION_NAMES and index == 1:
338
+ return label
339
+ return f"{label} {index}"
340
+
341
+ def _build_section_user_content(
342
+ self, sample: dict, section: dict, is_first_turn: bool
343
+ ) -> str:
344
+ parts = []
345
+ if is_first_turn:
346
+ style = sample["style"].strip()
347
+ if style:
348
+ parts.append(
349
+ f"Please generate a song in the following style:{style}\n"
350
+ "Next, I will tell you the requirements and lyrics for the song "
351
+ "fragment to be generated, section by section."
352
+ )
353
+ else:
354
+ parts.append(
355
+ "Please generate the song section by section. "
356
+ "Next, I will tell you the requirements and lyrics for each fragment."
357
+ )
358
+
359
+ section_parts = [f"[{self._format_section_label(section)}]"]
360
+ desc = section["desc"]
361
+ if desc:
362
+ section_parts.append(f"[desc:{desc}]")
363
+
364
+ lyrics = section["text"]
365
+ if lyrics:
366
+ section_parts.append(f"[lyrics:{lyrics}]")
367
+
368
+ parts.append("".join(section_parts))
369
+
370
+ return "\n".join(part for part in parts if part)
371
+
372
+ def _build_messages(
373
+ self,
374
+ sample: dict,
375
+ sections: list[dict],
376
+ ) -> list[dict[str, str]]:
377
+ messages: list[dict[str, str]] = [None] * (2 * len(sections))
378
+
379
+ for i, section in enumerate(sections):
380
+ msg_idx = 2 * i
381
+ messages[msg_idx] = {
382
+ "role": "user",
383
+ "content": self._build_section_user_content(
384
+ sample, section, is_first_turn=(i == 0)
385
+ ),
386
+ }
387
+ messages[msg_idx + 1] = {
388
+ "role": "assistant",
389
+ "content": self._assistant_audio_placeholder,
390
+ }
391
+
392
+ return messages
393
+
394
    def _expand_audio_tokens(
        self,
        token_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        full_audio_codes,
        sections: list[dict],
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Splice per-section audio codes into the token scaffold.

        For each section, the audio frames [start_frame, end_frame) are taken
        from ``full_audio_codes``, shifted by ``self.audio_prefix_length`` into
        the audio-codebook id range, and inserted between that section's
        BOS_AUDIO and EOS_AUDIO markers. A frame-index map is built so later
        stages can recover which source frame each expanded position came from
        (-1 for non-audio positions; BOS carries the first frame, EOS the last).

        Returns:
            (expanded_token_ids, expanded_attention_mask, frame_idx_map),
            all 1-D long tensors of the same expanded length.
        """

        if not sections:
            # Nothing to insert: pass inputs through with an all -1 frame map.
            return (
                token_ids,
                attention_mask,
                torch.full(token_ids.shape, -1, dtype=torch.long),
            )

        # Assumes one BOS/EOS marker pair per section, in order — TODO confirm
        # this invariant holds for every sample upstream.
        bos_positions = self._positions(token_ids, self.BOS_AUDIO)
        eos_positions = self._positions(token_ids, self.EOS_AUDIO)

        audio_code_tensor = torch.as_tensor(full_audio_codes, dtype=torch.long)
        extra_audio_tokens = sum(
            int(section["end_frame"]) - int(section["start_frame"])
            for section in sections
        )
        final_len = token_ids.numel() + extra_audio_tokens

        expanded_token_ids = torch.empty(final_len, dtype=torch.long)
        expanded_attention_mask = torch.empty(final_len, dtype=torch.long)
        frame_idx_map = torch.full((final_len,), -1, dtype=torch.long)

        read_pos = 0   # cursor into the original token_ids
        write_pos = 0  # cursor into the expanded buffers

        for bos_pos, eos_pos, section in zip(
            bos_positions.tolist(), eos_positions.tolist(), sections
        ):
            start_idx = int(section["start_frame"])
            end_idx = int(section["end_frame"])
            audio_len = end_idx - start_idx

            # Copy everything up to and including the BOS marker verbatim.
            prefix_len = bos_pos + 1 - read_pos
            next_write = write_pos + prefix_len
            expanded_token_ids[write_pos:next_write] = token_ids[read_pos : bos_pos + 1]
            expanded_attention_mask[write_pos:next_write] = attention_mask[
                read_pos : bos_pos + 1
            ]
            # The BOS position carries the section's first frame index (if any).
            frame_idx_map[next_write - 1] = start_idx if audio_len > 0 else -1
            write_pos = next_write

            if audio_len > 0:
                # Insert the section's audio codes, shifted into codebook range,
                # with attention enabled and a dense frame-index mapping.
                next_write = write_pos + audio_len
                expanded_token_ids[write_pos:next_write] = audio_code_tensor[
                    start_idx:end_idx
                ]
                expanded_token_ids[write_pos:next_write].add_(self.audio_prefix_length)
                expanded_attention_mask[write_pos:next_write] = 1
                frame_idx_map[write_pos:next_write] = torch.arange(
                    start_idx, end_idx, dtype=torch.long
                )
                write_pos = next_write

            # The EOS position carries the section's last frame index.
            expanded_token_ids[write_pos] = token_ids[eos_pos]
            expanded_attention_mask[write_pos] = attention_mask[eos_pos]
            frame_idx_map[write_pos] = end_idx - 1 if audio_len > 0 else -1
            write_pos += 1
            read_pos = eos_pos + 1

        # Copy any trailing tokens after the final EOS marker.
        tail_len = token_ids.numel() - read_pos
        if tail_len > 0:
            expanded_token_ids[write_pos : write_pos + tail_len] = token_ids[read_pos:]
            expanded_attention_mask[write_pos : write_pos + tail_len] = attention_mask[
                read_pos:
            ]

        return expanded_token_ids, expanded_attention_mask, frame_idx_map
468
+
469
+ def _build_token_masks(
470
+ self, token_ids: torch.Tensor
471
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
472
+
473
+ audio_codebook_mask = (token_ids >= self.audio_prefix_length) & (
474
+ token_ids < self.MASK_AUDIO
475
+ )
476
+ bos_audio_mask = token_ids == self.BOS_AUDIO
477
+ eos_mask = token_ids == self.EOS_AUDIO
478
+ label_mask = (audio_codebook_mask | eos_mask).long()
479
+ condition_mask = audio_codebook_mask | bos_audio_mask | eos_mask
480
+
481
+ return audio_codebook_mask, bos_audio_mask, eos_mask, label_mask, condition_mask
482
+
483
+ def _align_condition_ids(
484
+ self,
485
+ token_ids: torch.Tensor,
486
+ frame_idx_map: torch.Tensor,
487
+ total_frames: int,
488
+ chord_ids: torch.Tensor,
489
+ structure_ids: torch.Tensor,
490
+ audio_codebook_mask: torch.Tensor,
491
+ bos_audio_mask: torch.Tensor,
492
+ eos_mask: torch.Tensor,
493
+ ) -> tuple[torch.Tensor, torch.Tensor]:
494
+
495
+ seq_len = token_ids.numel()
496
+ chord_ids_aligned = torch.zeros(seq_len, dtype=torch.long)
497
+ structure_ids_aligned = torch.zeros(seq_len, dtype=torch.long)
498
+
499
+ bos_positions = torch.nonzero(bos_audio_mask, as_tuple=False).squeeze(-1)
500
+ chord_ids_aligned[bos_positions] = CHORD_BOS_ID
501
+ structure_ids_aligned[bos_positions] = STRUCTURE_BOS_ID
502
+
503
+ audio_positions = torch.nonzero(audio_codebook_mask, as_tuple=False).squeeze(-1)
504
+ cur_frame_idx = frame_idx_map[audio_positions]
505
+ cur_frame_idx = cur_frame_idx.clamp(0, max(total_frames - 1, 0))
506
+ chord_ids_aligned[audio_positions] = chord_ids[cur_frame_idx]
507
+ structure_ids_aligned[audio_positions] = structure_ids[cur_frame_idx]
508
+
509
+ eos_positions = torch.nonzero(eos_mask, as_tuple=False).squeeze(-1)
510
+ chord_ids_aligned[eos_positions] = CHORD_EOS_ID
511
+ structure_ids_aligned[eos_positions] = STRUCTURE_EOS_ID
512
+
513
+ return chord_ids_aligned, structure_ids_aligned
decoders.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Optional, Tuple
3
+ from torch import nn as nn
4
+
5
+ from transformers.cache_utils import Cache
6
+ from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer
7
+
8
+
9
class AdaLN(nn.Module):
    """DiT-style adaptive LayerNorm modulation head.

    Projects a conditioning token into six per-position modulation tensors:
    (shift, scale, gate) for the attention branch and for the MLP branch.
    With ``zero_init=True`` the projection starts at exactly zero, so a
    modulated layer initially reproduces the unconditioned base model.
    """

    def __init__(
        self,
        hidden_size: int,
        cond_dim: int,
        zero_init: bool = True,
    ):
        super().__init__()
        self.hidden_size = hidden_size

        self.act = nn.SiLU()
        self.linear = nn.Linear(cond_dim, 6 * hidden_size, bias=True)

        if zero_init:
            # Step-0 identity: every shift/scale/gate is exactly zero.
            nn.init.zeros_(self.linear.weight)
            nn.init.zeros_(self.linear.bias)

    def forward(self, cond_token: torch.Tensor) -> Tuple[torch.Tensor, ...]:
        """Map ``cond_token`` [B, T, cond_dim] to six [B, T, H] tensors.

        Order: shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp.
        """
        modulation = self.linear(self.act(cond_token))  # [B, T, 6H]
        return modulation.chunk(6, dim=-1)
50
+
51
+
52
def apply_adaln(
    x_norm: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor
) -> torch.Tensor:
    """Affine-modulate a normalized activation: ``x * (1 + scale) + shift``."""
    return shift + x_norm * (1.0 + scale)
57
+
58
+
59
class Qwen3DecoderLayerAdaLN(Qwen3DecoderLayer):
    """
    Qwen3 decoder layer with AdaLN injection:
    - Modulate normalized input with (shift, scale) on masked positions.
    - IMPORTANT: gate must preserve base behavior at gate=0:
        out = out_base * (1 + gate)   (on masked positions)
      so that when gate==0, out==out_base.

    Only applied on audio positions (condition_mask==True).
    """

    def __init__(
        self,
        config,
        layer_idx: int,
        cond_dim: int,
        zero_init: bool = True,
    ):
        super().__init__(config, layer_idx)

        # Zero-initialized by default so the layer starts out exactly equal to
        # a plain Qwen3 decoder layer (see AdaLN docstring).
        self.dit_adaln = AdaLN(
            hidden_size=config.hidden_size,
            cond_dim=cond_dim,
            zero_init=zero_init,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        cond_expanded: Optional[torch.Tensor] = None,  # [B, T, cond_dim]
        condition_mask: Optional[torch.BoolTensor] = None,  # [B, T]
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ):
        """Run one decoder layer, AdaLN-modulated only where condition_mask is True.

        When either ``cond_expanded`` or ``condition_mask`` is None, the layer
        behaves exactly like the base Qwen3 decoder layer.
        """
        # Keep the condition path fully tensor-based; avoid .item() checks that
        # can force GPU-CPU synchronization in autoregressive decoding.
        do_cond = (cond_expanded is not None) and (condition_mask is not None)

        if do_cond:
            (
                shift_msa,
                scale_msa,
                gate_msa,
                shift_mlp,
                scale_mlp,
                gate_mlp,
            ) = self.dit_adaln(cond_expanded)
            mask_expanded = condition_mask.unsqueeze(-1)  # [B, T, 1]

        # ---- Self-Attention branch ----
        residual = hidden_states
        x_norm = self.input_layernorm(hidden_states)  # RMSNorm in Qwen3

        if do_cond:
            # Shift/scale only on masked positions; others see the plain norm.
            x_mod = apply_adaln(x_norm, shift_msa, scale_msa)
            x_in = torch.where(mask_expanded, x_mod, x_norm)
        else:
            x_in = x_norm

        attn_out, _ = self.self_attn(
            hidden_states=x_in,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )

        if do_cond:
            # Preserve base when gate==0: attn_out_audio = (1 + gate) * attn_out_base
            attn_out = torch.where(mask_expanded, (1.0 + gate_msa) * attn_out, attn_out)

        hidden_states = residual + attn_out

        # ---- MLP branch ----
        residual = hidden_states
        x_norm = self.post_attention_layernorm(hidden_states)

        if do_cond:
            x_mod = apply_adaln(x_norm, shift_mlp, scale_mlp)
            x_in = torch.where(mask_expanded, x_mod, x_norm)
        else:
            x_in = x_norm

        mlp_out = self.mlp(x_in)

        if do_cond:
            # Preserve base when gate==0
            mlp_out = torch.where(mask_expanded, (1.0 + gate_mlp) * mlp_out, mlp_out)

        hidden_states = residual + mlp_out

        return hidden_states
inference_full.py ADDED
@@ -0,0 +1,1084 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ HF-driven inference for MAGEL with segment-level autoregressive generation.
5
+
6
+ Uses from HF sample:
7
+ - text instruction/template tokens (token_ids scaffold)
8
+ - control tokens: chord_ids/structure_ids
9
+
10
+ Does NOT use:
11
+ - ground-truth audio token values as input (audio codebook positions are masked)
12
+ """
13
+
14
+ import argparse
15
+ import contextlib
16
+ import importlib
17
+ import json
18
+ import os
19
+ import sys
20
+ from dataclasses import dataclass
21
+ from datetime import datetime
22
+ from pathlib import Path
23
+ from typing import Any, Optional
24
+
25
+ import numpy as np
26
+ import torch
27
+
28
+ from runtime_utils import (
29
+ load_magel_checkpoint,
30
+ load_music_dataset,
31
+ maybe_compile_model,
32
+ maybe_mark_compile_step_begin,
33
+ resolve_device,
34
+ seed_everything,
35
+ )
36
+ from vocab import (
37
+ CHORD_BOS_ID,
38
+ CHORD_EOS_ID,
39
+ STRUCTURE_EOS_ID,
40
+ chord_id_to_label,
41
+ structure_id_to_label,
42
+ )
43
+ from modelling_qwen3 import MAGEL
44
+
45
+ REPO_ROOT = Path(__file__).resolve().parent
46
+ MUCODEC_ROOT = REPO_ROOT / "MuCodec"
47
+
48
+
49
@dataclass
class TokenLayout:
    """Vocabulary layout: text ids first, then the audio codebook, then specials.

    Special ids after the codebook: MASK_AUDIO, BOS_AUDIO, EOS_AUDIO.
    """

    num_text_token: int
    num_audio_codebook: int = 16384

    @property
    def audio_start(self) -> int:
        # First audio-codebook id sits right after the text vocabulary.
        return self.num_text_token

    @property
    def audio_end(self) -> int:
        # One past the last audio-codebook id.
        return self.audio_start + self.num_audio_codebook

    @property
    def mask_audio(self) -> int:
        return self.audio_end

    @property
    def bos_audio(self) -> int:
        return self.audio_end + 1

    @property
    def eos_audio(self) -> int:
        return self.audio_end + 2
73
+
74
+
75
@dataclass
class SegmentSpan:
    """Location of one audio segment inside the token scaffold."""

    seg_idx: int  # 0-based segment index within the sample
    bos_pos: int  # position of the BOS_AUDIO marker
    eos_pos: int  # position of the matching EOS_AUDIO marker
    audio_positions: list[int]  # audio-codebook slot positions between BOS and EOS
81
+
82
+
83
@dataclass
class HFTemplateSample:
    """One HF dataset row prepared as an inference scaffold.

    Ground-truth audio token values are removed from ``input_ids`` (replaced
    with MASK_AUDIO); controls and masks stay aligned to the token sequence.
    """

    song_id: str
    num_text_token: int
    template_ids: torch.Tensor  # [T], original token_ids
    input_ids: torch.Tensor  # [T], audio codebook replaced with MASK_AUDIO
    chord_ids: torch.Tensor  # [T]
    structure_ids: torch.Tensor  # [T]
    condition_mask: torch.Tensor  # [T]
    is_audio_codebook: torch.Tensor  # [T]
    is_eos: torch.Tensor  # [T]
    segments: list[SegmentSpan]
    raw_item: dict[str, Any]

    @property
    def seq_len(self) -> int:
        """Scaffold length T, in tokens."""
        return int(self.input_ids.numel())
100
+
101
+
102
def parse_args() -> argparse.Namespace:
    """Parse CLI arguments for segment-wise AR generation plus MuCodec decoding."""
    parser = argparse.ArgumentParser(
        description="Segment-wise AR generation from HF controls/scaffold."
    )
    # --- model / data selection ---
    parser.add_argument(
        "--model_path",
        type=str,
        default="./output_qwen3_0p6b_train/final",
    )
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="muse_mucodec_chord.ds",
    )
    parser.add_argument("--split", type=str, default="validation")
    parser.add_argument("--sample_idx", type=int, default=0)
    parser.add_argument(
        "--tokenizer_path", type=str, default="checkpoints/Qwen3-0.6B"
    )
    parser.add_argument(
        "--num_audio_codebook",
        type=int,
        default=None,
        help="Audio codebook size. Defaults to checkpoint metadata when available.",
    )

    # --- sampling controls ---
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=50)
    parser.add_argument("--top_p", type=float, default=0.90)
    parser.add_argument("--greedy", action="store_true", default=False)
    parser.add_argument("--max_audio_tokens", type=int, default=0)
    parser.add_argument("--fps", type=int, default=25)

    # --- runtime / execution ---
    parser.add_argument("--seed", type=int, default=1234)
    parser.add_argument("--device", type=str, default="auto")
    parser.add_argument(
        "--dtype",
        type=str,
        default="bfloat16",
        choices=["float32", "float16", "bfloat16"],
    )
    # NOTE(review): with default=True, passing --use_cache is a no-op; the
    # effective toggle is --no_cache. Consider argparse.BooleanOptionalAction.
    parser.add_argument("--use_cache", action="store_true", default=True)
    parser.add_argument("--no_cache", action="store_true", default=False)
    parser.add_argument("--compile", action="store_true", default=False)
    parser.add_argument(
        "--compile_mode",
        type=str,
        default="reduce-overhead",
        choices=["default", "reduce-overhead", "max-autotune"],
    )
    parser.add_argument(
        "--attn_implementation",
        type=str,
        default="sdpa",
        choices=["eager", "sdpa", "flash_attention_2"],
    )
    # --- outputs ---
    parser.add_argument("--output_dir", type=str, default="predictions")
    parser.add_argument("--output_prefix", type=str, default="")
    parser.add_argument(
        "--json_output_dir",
        type=str,
        default="predictions/json",
        help="Directory for chord/segment json. Default: <output_dir>/json",
    )
    # --- MuCodec decoding ---
    parser.add_argument(
        "--mucodec_device",
        type=str,
        default="auto",
        help="Device string for MuCodec, for example cuda:0.",
    )
    parser.add_argument(
        "--mucodec_layer_num",
        type=int,
        default=7,
        help="MuCodec layer_num passed to the official decoder.",
    )
    parser.add_argument(
        "--mucodec_duration",
        type=float,
        default=40.96,
        help="Chunk duration argument passed to MuCodec code2sound.",
    )
    parser.add_argument(
        "--mucodec_guidance_scale",
        type=float,
        default=1.5,
        help="Guidance scale argument passed to MuCodec code2sound.",
    )
    parser.add_argument(
        "--mucodec_num_steps",
        type=int,
        default=20,
        help="Sampling steps argument passed to MuCodec code2sound.",
    )
    parser.add_argument(
        "--mucodec_sample_rate",
        type=int,
        default=48000,
        help="Sample rate used when saving decoded wav.",
    )
    parser.add_argument(
        "--wav_output_dir",
        type=str,
        default="predictions/wav",
        help="Directory for decoded wav. Default: <output_dir>/wav",
    )
    return parser.parse_args()
209
+
210
+
211
def resolve_runtime_device_str(device_arg: str) -> str:
    """Resolve "auto" to the best available backend; pass other values through.

    Preference order for "auto": CUDA device 0, then MPS, then CPU.
    """
    if device_arg != "auto":
        return device_arg
    if torch.cuda.is_available():
        return "cuda:0"
    return "mps" if torch.backends.mps.is_available() else "cpu"
219
+
220
+
221
@contextlib.contextmanager
def pushd(path: str):
    """Temporarily chdir into ``path``, restoring the previous cwd on exit."""
    original_cwd = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(original_cwd)
229
+
230
+
231
def ensure_sys_path(path: str) -> None:
    """Prepend ``path`` to ``sys.path`` unless it is empty or already present."""
    if not path:
        return
    if path not in sys.path:
        sys.path.insert(0, path)
234
+
235
+
236
def get_mucodec_root() -> str:
    """Return the MuCodec repo path after validating its layout.

    Raises:
        FileNotFoundError: if the directory or its ``generate.py`` is missing.
    """
    if not MUCODEC_ROOT.is_dir():
        raise FileNotFoundError(f"MuCodec directory not found: {MUCODEC_ROOT}")
    entrypoint = MUCODEC_ROOT / "generate.py"
    if not entrypoint.is_file():
        raise FileNotFoundError(f"MuCodec entrypoint not found: {entrypoint}")
    return str(MUCODEC_ROOT)
244
+
245
+
246
def import_mucodec_class():
    """Import the official ``MuCodec`` class from the vendored repo.

    Returns:
        Tuple of (MuCodec class object, resolved repo path).

    Raises:
        ImportError: if the module or attribute cannot be loaded; the original
            exception is chained as the cause for debuggability.
    """
    repo_path = get_mucodec_root()
    ensure_sys_path(repo_path)
    try:
        module = importlib.import_module("generate")
        return getattr(module, "MuCodec"), repo_path
    except Exception as exc:  # pragma: no cover - env dependent
        # Chain the cause explicitly (B904) so the real import failure is
        # preserved in the traceback instead of only its stringified message.
        raise ImportError(
            f"Could not import MuCodec from {repo_path}/generate.py: {exc}"
        ) from exc
254
+
255
+
256
def build_mucodec_decoder(args: argparse.Namespace) -> Any:
    """Instantiate the official MuCodec decoder from the vendored repo.

    Validates the checkpoint and the auxiliary weight files expected by the
    current folder layout, then constructs MuCodec with the repo as cwd
    (presumably because its loaders use repo-relative paths — confirm against
    MuCodec/generate.py). The resolved repo path is stashed on the decoder so
    decoding can ``pushd`` back into it later.

    Raises:
        FileNotFoundError: if the checkpoint or a required dependency is missing.
    """
    MuCodec, resolved_repo = import_mucodec_class()

    ckpt_path = os.path.join(resolved_repo, "ckpt", "mucodec.pt")
    if not os.path.exists(ckpt_path):
        raise FileNotFoundError(f"MuCodec checkpoint not found: {ckpt_path}")

    # Auxiliary weights the decoder needs beyond the main checkpoint.
    required_local_files = [
        os.path.join(resolved_repo, "tools", "audioldm_48k.pth"),
        os.path.join(resolved_repo, "muq_dev", "muq.pt"),
    ]
    for path in required_local_files:
        if not os.path.exists(path):
            raise FileNotFoundError(
                f"Required MuCodec dependency not found for current folder structure: {path}"
            )

    mucodec_device = resolve_runtime_device_str(args.mucodec_device)
    if resolved_repo:
        print(f"[INFO] resolved MuCodec repo: {resolved_repo}")
    print(f"[INFO] loading MuCodec from {ckpt_path} on {mucodec_device}")
    with pushd(resolved_repo):
        decoder = MuCodec(
            model_path=ckpt_path,
            layer_num=int(args.mucodec_layer_num),
            load_main_model=True,
            device=mucodec_device,
        )
    # Remember where the repo lives so decode_mucodec_codes can chdir into it.
    setattr(decoder, "_magel_mucodec_repo", resolved_repo)
    return decoder
286
+
287
+
288
def decode_mucodec_codes(
    mucodec_decoder: Any,
    shifted_codes: np.ndarray,
    args: argparse.Namespace,
) -> torch.Tensor:
    """Decode a 1-D stream of MuCodec token ids into a waveform tensor.

    Returns:
        A float32 CPU tensor; a 1-D decoder output is promoted to [1, samples].

    Raises:
        ValueError: if ``shifted_codes`` is not 1-D.
    """
    if shifted_codes.ndim != 1:
        raise ValueError(
            f"Expected 1D MuCodec token stream, got shape {shifted_codes.shape}"
        )

    codes = torch.from_numpy(shifted_codes.astype(np.int64, copy=False))
    # Shape to [1, 1, T] — assumes code2sound expects (batch, layer, frames);
    # TODO confirm against the MuCodec code2sound signature.
    codes = codes.unsqueeze(0).unsqueeze(0)
    repo_path = getattr(mucodec_decoder, "_magel_mucodec_repo", "")
    # Decode from inside the repo so its relative resource paths resolve.
    decode_ctx = pushd(repo_path) if repo_path else contextlib.nullcontext()
    with decode_ctx:
        wave = mucodec_decoder.code2sound(
            codes,
            prompt=None,
            duration=float(args.mucodec_duration),
            guidance_scale=float(args.mucodec_guidance_scale),
            num_steps=int(args.mucodec_num_steps),
            disable_progress=True,
        )
    if not torch.is_tensor(wave):
        wave = torch.as_tensor(wave)
    if wave.ndim == 1:
        wave = wave.unsqueeze(0)
    return wave.detach().cpu().to(torch.float32)
316
+
317
+
318
def build_segment_spans(
    template_ids: torch.Tensor,
    is_audio_codebook: torch.Tensor,
    layout: TokenLayout,
) -> list[SegmentSpan]:
    """Pair each BOS_AUDIO with the next EOS_AUDIO and collect audio slots between.

    EOS markers at or before a BOS are skipped; a BOS without a following EOS
    ends the scan. Returns one SegmentSpan per matched pair, in order.
    """
    bos_positions = torch.where(template_ids.eq(layout.bos_audio))[0].tolist()
    eos_positions = torch.where(template_ids.eq(layout.eos_audio))[0].tolist()
    if not bos_positions or not eos_positions:
        return []

    # Hoisted out of the loop: the position-index vector is loop-invariant,
    # so there is no need to rebuild it once per BOS marker.
    idx = torch.arange(template_ids.numel(), device=template_ids.device)

    spans: list[SegmentSpan] = []
    eos_ptr = 0
    for b in bos_positions:
        # Advance to the first EOS strictly after this BOS.
        while eos_ptr < len(eos_positions) and eos_positions[eos_ptr] <= b:
            eos_ptr += 1
        if eos_ptr >= len(eos_positions):
            break
        e = eos_positions[eos_ptr]
        eos_ptr += 1
        # Audio slots strictly between the BOS/EOS markers.
        mask = is_audio_codebook & (idx > b) & (idx < e)
        audio_positions = torch.where(mask)[0].tolist()
        spans.append(
            SegmentSpan(
                seg_idx=len(spans),
                bos_pos=int(b),
                eos_pos=int(e),
                audio_positions=[int(p) for p in audio_positions],
            )
        )
    return spans
349
+
350
+
351
def load_hf_template_sample(
    dataset_path: str,
    split: str,
    tokenizer_path: str,
    sample_idx: int,
    num_audio_codebook: int,
) -> HFTemplateSample:
    """Load the music dataset and build the inference scaffold for one sample.

    Thin convenience wrapper around load_music_dataset +
    load_hf_template_sample_from_music_dataset.
    """
    music_ds = load_music_dataset(
        dataset_path=dataset_path,
        split=split,
        tokenizer_path=tokenizer_path,
        num_audio_token=num_audio_codebook,
        use_fast=True,
    )
    return load_hf_template_sample_from_music_dataset(
        music_ds=music_ds,
        sample_idx=sample_idx,
        num_audio_codebook=num_audio_codebook,
    )
370
+
371
+
372
def load_hf_template_sample_from_music_dataset(
    music_ds,
    sample_idx: int,
    num_audio_codebook: int,
) -> HFTemplateSample:
    """Turn one dataset row into an inference scaffold.

    Ground-truth audio codebook tokens are replaced by MASK_AUDIO so the
    scaffold carries only text/control structure; chord/structure controls
    and the condition mask stay aligned to the token sequence.

    Raises:
        ValueError: if any control tensor length differs from the token length.
    """
    layout = TokenLayout(
        num_text_token=music_ds.num_text_token,
        num_audio_codebook=num_audio_codebook,
    )

    # NOTE(review): reaches into the private ``_data`` for raw metadata — this
    # couples to the dataset implementation; verify if that class changes.
    raw_item = music_ds._data[sample_idx]
    row = music_ds[sample_idx]

    template_ids = row["token_ids"].to(torch.long)
    chord_ids = row["chord_ids"].to(torch.long)
    structure_ids = row["structure_ids"].to(torch.long)
    condition_mask = row["condition_mask"].to(torch.bool)

    seq_len = int(template_ids.numel())
    for name, t in [
        ("chord_ids", chord_ids),
        ("structure_ids", structure_ids),
        ("condition_mask", condition_mask),
    ]:
        if int(t.numel()) != seq_len:
            raise ValueError(f"{name} length mismatch: {int(t.numel())} != {seq_len}")

    is_audio_codebook = (template_ids >= layout.audio_start) & (
        template_ids < layout.audio_end
    )
    is_eos = template_ids.eq(layout.eos_audio)

    # Remove GT audio token values from input scaffold.
    input_ids = template_ids.clone()
    input_ids[is_audio_codebook] = layout.mask_audio

    spans = build_segment_spans(template_ids, is_audio_codebook, layout)

    return HFTemplateSample(
        song_id=str(raw_item.get("song_id", f"sample_{sample_idx}")),
        num_text_token=music_ds.num_text_token,
        template_ids=template_ids,
        input_ids=input_ids,
        chord_ids=chord_ids,
        structure_ids=structure_ids,
        condition_mask=condition_mask,
        is_audio_codebook=is_audio_codebook,
        is_eos=is_eos,
        segments=spans,
        raw_item=raw_item,
    )
423
+
424
+
425
def apply_top_k_top_p(logits: torch.Tensor, top_k: int, top_p: float) -> torch.Tensor:
    """Mask logits outside the top-k / nucleus (top-p) sets with -inf.

    ``top_k <= 0`` (or None) disables top-k; ``top_p`` outside (0, 1) disables
    nucleus filtering. The highest-probability token is always retained.
    """
    neg_inf = float("-inf")

    if top_k is not None and top_k > 0:
        keep = min(top_k, logits.shape[-1])
        kth_value = torch.topk(logits, keep, dim=-1).values[:, -1].unsqueeze(-1)
        logits = logits.masked_fill(logits < kth_value, neg_inf)

    if top_p is not None and 0.0 < top_p < 1.0:
        ordered, order = torch.sort(logits, descending=True, dim=-1)
        cumulative = torch.cumsum(torch.softmax(ordered, dim=-1), dim=-1)
        drop = cumulative > top_p
        drop[:, 0] = False  # never drop the single most probable token
        ordered = ordered.masked_fill(drop, neg_inf)
        logits = torch.full_like(logits, neg_inf).scatter_(
            dim=-1, index=order, src=ordered
        )
    return logits
443
+
444
+
445
def sample_from_logits(
    logits: torch.Tensor,
    temperature: float,
    top_k: int,
    top_p: float,
    greedy: bool,
) -> int:
    """Draw a single token id from [1, V] logits.

    Greedy mode (or a non-positive temperature) returns the argmax; otherwise
    temperature-scaled logits are filtered by top-k/top-p and sampled.

    Raises:
        RuntimeError: if filtering leaves no finite logit to sample from.
    """
    if greedy or temperature <= 0:
        return int(torch.argmax(logits, dim=-1).item())

    scaled = logits / max(temperature, 1e-6)
    filtered = apply_top_k_top_p(scaled, top_k=top_k, top_p=top_p)
    if not torch.isfinite(filtered).any():
        raise RuntimeError("All logits are -inf after filtering.")
    distribution = torch.softmax(filtered, dim=-1)
    return int(torch.multinomial(distribution, num_samples=1).item())
460
+
461
+
462
def sample_audio_token_from_logits(
    logits: torch.Tensor,
    layout: TokenLayout,
    temperature: float,
    top_k: int,
    top_p: float,
    greedy: bool,
) -> int:
    """Sample within the audio-codebook slice and map back to a global token id."""
    codebook_slice = logits[:, layout.audio_start : layout.audio_end]
    local_idx = sample_from_logits(
        codebook_slice,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        greedy=greedy,
    )
    return int(layout.audio_start + local_idx)
479
+
480
+
481
def chord_id_to_type(chord_id: int) -> str:
    """Decode a chord id to its label; tag undecodable ids as ``unknown_<id>``.

    A decoded "N" is trusted only for id 1 and the chord BOS/EOS control ids;
    any other id that decodes to "N" is reported as unknown.
    """
    label = chord_id_to_label(chord_id)
    if label != "N":
        return label
    if chord_id in {1, CHORD_BOS_ID, CHORD_EOS_ID}:
        return label
    return f"unknown_{chord_id}"
484
+
485
+
486
def segment_id_to_type(segment_id: int) -> str:
    """Decode a structure id to its label; out-of-range ids become ``unknown_<id>``."""
    decoded_label = structure_id_to_label(segment_id)
    if 0 <= segment_id <= STRUCTURE_EOS_ID:
        return decoded_label
    return f"unknown_{segment_id}"
489
+
490
+
491
def to_intervals(type_ids: list[int], fps: int, mapper) -> list[dict[str, Any]]:
    """Run-length encode frame-level ids into [{start, end, type}] intervals.

    Times are in seconds (frame / fps, rounded to 6 decimals); ``mapper``
    converts each run's id to its display type.
    """
    if not type_ids:
        return []

    intervals: list[dict[str, Any]] = []
    frame_rate = float(fps)
    run_start = 0
    total = len(type_ids)
    for pos in range(1, total + 1):
        at_end = pos == total
        if at_end or type_ids[pos] != type_ids[run_start]:
            intervals.append(
                {
                    "start": round(run_start / frame_rate, 6),
                    "end": round(pos / frame_rate, 6),
                    "type": mapper(int(type_ids[run_start])),
                }
            )
            run_start = pos
    return intervals
510
+
511
+
512
def merge_same_type_with_small_gap(
    intervals: list[dict[str, Any]], fps: int, max_gap_frames: int = 1
) -> list[dict[str, Any]]:
    """Merge consecutive same-type intervals separated by at most ``max_gap_frames``.

    Input intervals are copied, never mutated; a small epsilon absorbs float
    rounding in the gap comparison.
    """
    if not intervals:
        return []

    gap_limit_s = float(max_gap_frames) / float(fps)
    result = [dict(intervals[0])]
    for interval in intervals[1:]:
        last = result[-1]
        gap_s = float(interval["start"]) - float(last["end"])
        same_type = last.get("type") == interval.get("type")
        if same_type and gap_s <= (gap_limit_s + 1e-9):
            last["end"] = interval["end"]
        else:
            result.append(dict(interval))
    return result
527
+
528
+
529
@torch.inference_mode()
def generate_segmentwise(
    model: MAGEL,
    sample: HFTemplateSample,
    layout: TokenLayout,
    device: torch.device,
    use_cache: bool,
    temperature: float,
    top_k: int,
    top_p: float,
    greedy: bool,
    max_audio_tokens: int,
) -> tuple[torch.Tensor, int, list[int], list[int]]:
    """Fill the scaffold's audio slots autoregressively, one token per step.

    Walks the scaffold from the first audio/EOS slot to the last segment's
    EOS, sampling audio-codebook tokens at audio positions, forcing EOS_AUDIO
    at EOS positions, and copying template tokens everywhere else. Conditions
    (chord/structure) are precomputed once over the full sequence so cached
    decoding matches the training-time condition-encoder context.

    Returns:
        (generated_ids on CPU, number of sampled audio tokens,
         chord ids at sampled positions, structure ids at sampled positions).
    """
    import time

    seq_template = sample.input_ids.to(device)
    chord_template = sample.chord_ids.to(device)
    structure_template = sample.structure_ids.to(device)
    condition_mask_template = sample.condition_mask.to(device)
    is_audio_code = sample.is_audio_codebook.to(device)
    is_eos = sample.is_eos.to(device)

    slot_positions = torch.where(is_audio_code | is_eos)[0]
    if slot_positions.numel() == 0:
        # No generation slot: return scaffold as-is.
        return seq_template.detach().cpu(), 0, [], []

    start_pos = int(slot_positions[0].item())
    # Stop at the last segment's EOS when segment spans are known; otherwise
    # fall back to the last audio/EOS slot.
    if sample.segments:
        end_pos = int(sample.segments[-1].eos_pos)
    else:
        end_pos = int(slot_positions[-1].item())

    sampled_chord_ids: list[int] = []
    sampled_segment_ids: list[int] = []

    generated_ids = seq_template.clone()
    sampled_count = 0
    past_key_values: Optional[tuple] = None

    # Precompute full-sequence condition once so cached decoding keeps
    # the same global condition-encoder context as training.
    cond_template: torch.Tensor = model.condition_encoder(
        chord_template.unsqueeze(0),
        structure_template.unsqueeze(0),
    )

    # Prefill with fixed prefix.
    full_attention_mask = torch.ones(
        (1, sample.seq_len), dtype=torch.long, device=device
    )
    prefix_ids = generated_ids[:start_pos].unsqueeze(0)
    prefix_attn = full_attention_mask[:, :start_pos]
    model_kwargs = dict(
        input_ids=prefix_ids,
        attention_mask=prefix_attn,
        condition_mask=condition_mask_template[:start_pos].unsqueeze(0),
        cond_precomputed=cond_template[:, :start_pos, :],
        use_cache=use_cache,
    )
    maybe_mark_compile_step_begin(model)
    prefill_t0 = time.perf_counter()
    out = model(**model_kwargs)
    prefill_time_s = time.perf_counter() - prefill_t0
    logits_next = out.logits[:, -1, :]
    if use_cache:
        past_key_values = out.past_key_values
    # Reusable [1, 1] buffer for single-token cached decode steps.
    step_ids = torch.empty((1, 1), dtype=torch.long, device=device)

    decode_time_s = 0.0
    for i in range(start_pos, end_pos + 1):
        if bool(is_audio_code[i].item()):
            if max_audio_tokens > 0 and sampled_count >= max_audio_tokens:
                break
            next_id = sample_audio_token_from_logits(
                logits_next,
                layout=layout,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                greedy=greedy,
            )
            sampled_count += 1
            # Controls are input-aligned to the token sequence.
            cond_pos = i
            sampled_chord_ids.append(int(chord_template[cond_pos].item()))
            sampled_segment_ids.append(int(structure_template[cond_pos].item()))
        elif bool(is_eos[i].item()):
            # EOS positions are forced, never sampled.
            next_id = layout.eos_audio
        else:
            # Non-slot positions copy the template token verbatim.
            next_id = int(seq_template[i].item())

        generated_ids[i] = int(next_id)

        if i >= end_pos:
            break

        if use_cache:
            # Single-token step against the KV cache.
            step_ids[0, 0] = int(next_id)
            step_attn = full_attention_mask[:, : i + 2]
            model_kwargs = dict(
                input_ids=step_ids,
                attention_mask=step_attn,
                condition_mask=condition_mask_template[i : i + 1].unsqueeze(0),
                cond_precomputed=cond_template[:, i : i + 1, :],
                past_key_values=past_key_values,
                use_cache=True,
            )
            maybe_mark_compile_step_begin(model)
            step_t0 = time.perf_counter()
            out = model(**model_kwargs)
            decode_time_s += time.perf_counter() - step_t0
            logits_next = out.logits[:, -1, :]
            past_key_values = out.past_key_values
        else:
            # Cache-less fallback: re-run the full prefix every step (O(T^2)).
            cur_len = i + 1
            model_kwargs = dict(
                input_ids=generated_ids[:cur_len].unsqueeze(0),
                attention_mask=full_attention_mask[:, :cur_len],
                condition_mask=condition_mask_template[:cur_len].unsqueeze(0),
                cond_precomputed=cond_template[:, :cur_len, :],
                use_cache=False,
            )
            maybe_mark_compile_step_begin(model)
            step_t0 = time.perf_counter()
            out = model(**model_kwargs)
            decode_time_s += time.perf_counter() - step_t0
            logits_next = out.logits[:, -1, :]

    total_gen_time_s = prefill_time_s + decode_time_s
    tokens_per_second = (
        float(sampled_count) / decode_time_s if decode_time_s > 0 and sampled_count > 0 else 0.0
    )
    print(
        "[PROFILE] generation "
        f"prefill_s={prefill_time_s:.3f} "
        f"decode_s={decode_time_s:.3f} "
        f"total_s={total_gen_time_s:.3f} "
        f"sampled_audio_tokens={sampled_count} "
        f"decode_tok_per_s={tokens_per_second:.3f}"
    )

    return (
        generated_ids.detach().cpu(),
        sampled_count,
        sampled_chord_ids,
        sampled_segment_ids,
    )
677
+
678
+
679
@torch.inference_mode()
def batch_generate_segmentwise(
    model: MAGEL,
    samples: list[HFTemplateSample],
    layout: TokenLayout,
    device: torch.device,
    use_cache: bool,
    temperature: float,
    top_k: int,
    top_p: float,
    greedy: bool,
    max_audio_tokens: int,
) -> list[tuple[torch.Tensor, int, list[int], list[int]]]:
    """Run template-constrained autoregressive decoding for a batch of samples.

    Samples are right-padded to the longest sequence length and decoded in
    lockstep with a shared KV cache: one batched prefill over each sample's
    prompt prefix, then one batched forward per decode step.  At each position
    the template decides what happens: audio-codebook slots are sampled from
    the model's logits, EOS slots are forced to ``layout.eos_audio``, and all
    other positions are copied verbatim from the input template.

    Returns one ``(generated_ids, sampled_count, chord_ids, segment_ids)``
    tuple per input sample, with tensors detached and moved to CPU.  When
    ``use_cache`` is False this falls back to per-sample
    ``generate_segmentwise`` calls (no batching benefit without a cache).
    """
    import time

    if not samples:
        return []
    if not use_cache:
        # Cacheless decoding recomputes the full prefix each step; batching
        # it here would not help, so defer to the single-sample path.
        return [
            generate_segmentwise(
                model=model,
                sample=sample,
                layout=layout,
                device=device,
                use_cache=use_cache,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                greedy=greedy,
                max_audio_tokens=max_audio_tokens,
            )
            for sample in samples
        ]

    batch_size = len(samples)
    seq_lens = [sample.seq_len for sample in samples]
    max_seq_len = max(seq_lens)

    # Right-padded per-row template buffers (token ids, controls, and the
    # boolean masks that classify each position).
    seq_templates = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)
    generated_ids = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)
    chord_templates = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)
    structure_templates = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)
    condition_mask_templates = torch.zeros(
        (batch_size, max_seq_len), dtype=torch.bool, device=device
    )
    is_audio_code_templates = torch.zeros(
        (batch_size, max_seq_len), dtype=torch.bool, device=device
    )
    is_eos_templates = torch.zeros((batch_size, max_seq_len), dtype=torch.bool, device=device)

    start_positions: list[int] = []
    end_positions: list[int] = []
    sampled_counts = [0 for _ in samples]
    sampled_chord_ids: list[list[int]] = [[] for _ in samples]
    sampled_segment_ids: list[list[int]] = [[] for _ in samples]
    valid_sample_mask = torch.ones(batch_size, dtype=torch.bool, device=device)

    for row_idx, sample in enumerate(samples):
        seq_templates[row_idx, : sample.seq_len] = sample.input_ids.to(device)
        generated_ids[row_idx, : sample.seq_len] = sample.input_ids.to(device)
        chord_templates[row_idx, : sample.seq_len] = sample.chord_ids.to(device)
        structure_templates[row_idx, : sample.seq_len] = sample.structure_ids.to(device)
        condition_mask_templates[row_idx, : sample.seq_len] = sample.condition_mask.to(device)
        is_audio_code_templates[row_idx, : sample.seq_len] = sample.is_audio_codebook.to(device)
        is_eos_templates[row_idx, : sample.seq_len] = sample.is_eos.to(device)

        # Decoding spans from the first audio/EOS slot to the last segment's
        # EOS position (or the last slot when no segment metadata exists).
        slot_positions = torch.where(
            is_audio_code_templates[row_idx, : sample.seq_len]
            | is_eos_templates[row_idx, : sample.seq_len]
        )[0]
        if slot_positions.numel() == 0:
            # Nothing to decode for this row; mark it invalid so the step
            # loop skips it entirely.
            valid_sample_mask[row_idx] = False
            start_positions.append(sample.seq_len)
            end_positions.append(sample.seq_len - 1)
            continue
        start_pos = int(slot_positions[0].item())
        if sample.segments:
            end_pos = int(sample.segments[-1].eos_pos)
        else:
            end_pos = int(slot_positions[-1].item())
        start_positions.append(start_pos)
        end_positions.append(end_pos)

    if not bool(valid_sample_mask.any().item()):
        # No row has anything to decode: return the untouched templates.
        return [
            (sample.input_ids.detach().cpu(), 0, [], [])
            for sample in samples
        ]

    start_positions_t = torch.tensor(start_positions, dtype=torch.long, device=device)
    end_positions_t = torch.tensor(end_positions, dtype=torch.long, device=device)
    prefix_lens = start_positions_t.clone()
    max_prefix_len = int(prefix_lens.max().item())
    max_decode_steps = int((end_positions_t - start_positions_t + 1).clamp_min(0).max().item())

    # Encode the chord/structure controls once for the whole padded batch and
    # slice per position afterwards.
    cond_template = model.condition_encoder(chord_templates, structure_templates)

    # NOTE(review): all rows are prefilled to max_prefix_len with per-row
    # attention masking, so shorter rows have masked tail positions inside
    # the KV cache before their first decode step — confirm that position
    # handling inside the model matches this layout.
    prefix_attention_mask = (
        torch.arange(max_prefix_len, device=device).unsqueeze(0) < prefix_lens.unsqueeze(1)
    ).to(torch.long)
    prefill_t0 = time.perf_counter()
    maybe_mark_compile_step_begin(model)
    out = model(
        input_ids=generated_ids[:, :max_prefix_len],
        attention_mask=prefix_attention_mask,
        condition_mask=condition_mask_templates[:, :max_prefix_len],
        cond_precomputed=cond_template[:, :max_prefix_len, :],
        use_cache=True,
    )
    prefill_time_s = time.perf_counter() - prefill_t0

    # Pick each row's logits at its own last real prefix token (rows may have
    # different prefix lengths inside the shared padded prefill).
    gather_idx = (prefix_lens - 1).clamp_min(0)
    batch_indices = torch.arange(batch_size, device=device)
    logits_next = out.logits[batch_indices, gather_idx, :]
    past_key_values = out.past_key_values

    step_ids = torch.zeros((batch_size, 1), dtype=torch.long, device=device)
    # decode_valid_mask[row, step] records whether the row produced a token at
    # that step; it doubles as the attention mask for the decode tail.
    decode_valid_mask = torch.zeros(
        (batch_size, max_decode_steps), dtype=torch.bool, device=device
    )
    decode_time_s = 0.0

    for step_idx in range(max_decode_steps):
        cur_positions = start_positions_t + step_idx
        active_mask = valid_sample_mask & cur_positions.le(end_positions_t)
        if not bool(active_mask.any().item()):
            break

        # Per-row token selection: sample audio slots, force EOS slots, and
        # copy everything else from the template.
        next_ids = torch.zeros(batch_size, dtype=torch.long, device=device)
        for row_idx in range(batch_size):
            if not bool(active_mask[row_idx].item()):
                continue
            cur_pos = int(cur_positions[row_idx].item())
            if bool(is_audio_code_templates[row_idx, cur_pos].item()):
                if max_audio_tokens > 0 and sampled_counts[row_idx] >= max_audio_tokens:
                    # Token budget exhausted: permanently retire this row.
                    valid_sample_mask[row_idx] = False
                    continue
                next_id = sample_audio_token_from_logits(
                    logits_next[row_idx : row_idx + 1],
                    layout=layout,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                    greedy=greedy,
                )
                sampled_counts[row_idx] += 1
                # Record the input-aligned controls next to each sampled token.
                sampled_chord_ids[row_idx].append(
                    int(chord_templates[row_idx, cur_pos].item())
                )
                sampled_segment_ids[row_idx].append(
                    int(structure_templates[row_idx, cur_pos].item())
                )
            elif bool(is_eos_templates[row_idx, cur_pos].item()):
                next_id = layout.eos_audio
            else:
                next_id = int(seq_templates[row_idx, cur_pos].item())

            generated_ids[row_idx, cur_pos] = int(next_id)
            next_ids[row_idx] = int(next_id)
            decode_valid_mask[row_idx, step_idx] = True

        if step_idx >= max_decode_steps - 1:
            # Last step: no further forward pass needed.
            break

        step_ids[:, 0] = next_ids
        # Attend over the (masked) prefill region plus every decode step that
        # actually produced a token for this row.
        step_attention_mask = torch.cat(
            [
                prefix_attention_mask,
                decode_valid_mask[:, : step_idx + 1].to(torch.long),
            ],
            dim=1,
        )
        step_condition_mask = torch.zeros((batch_size, 1), dtype=torch.bool, device=device)
        step_cond = torch.zeros(
            (batch_size, 1, cond_template.shape[-1]),
            dtype=cond_template.dtype,
            device=device,
        )
        for row_idx in range(batch_size):
            if not bool(decode_valid_mask[row_idx, step_idx].item()):
                continue
            cur_pos = int(cur_positions[row_idx].item())
            step_condition_mask[row_idx, 0] = condition_mask_templates[row_idx, cur_pos]
            step_cond[row_idx, 0, :] = cond_template[row_idx, cur_pos, :]

        step_t0 = time.perf_counter()
        maybe_mark_compile_step_begin(model)
        out = model(
            input_ids=step_ids,
            attention_mask=step_attention_mask,
            condition_mask=step_condition_mask,
            cond_precomputed=step_cond,
            past_key_values=past_key_values,
            use_cache=True,
        )
        decode_time_s += time.perf_counter() - step_t0
        logits_next = out.logits[:, -1, :]
        past_key_values = out.past_key_values

    total_sampled_tokens = sum(sampled_counts)
    total_gen_time_s = prefill_time_s + decode_time_s
    tokens_per_second = (
        float(total_sampled_tokens) / decode_time_s
        if decode_time_s > 0 and total_sampled_tokens > 0
        else 0.0
    )
    print(
        "[PROFILE] batch_generation "
        f"batch_size={batch_size} "
        f"prefill_s={prefill_time_s:.3f} "
        f"decode_s={decode_time_s:.3f} "
        f"total_s={total_gen_time_s:.3f} "
        f"sampled_audio_tokens={total_sampled_tokens} "
        f"decode_tok_per_s={tokens_per_second:.3f}"
    )

    # Assemble per-sample outputs, trimming each row back to its true length.
    outputs: list[tuple[torch.Tensor, int, list[int], list[int]]] = []
    for row_idx, sample in enumerate(samples):
        if not bool((torch.where(sample.is_audio_codebook | sample.is_eos)[0]).numel()):
            # Rows with no slots were never decoded; echo the input template.
            outputs.append((sample.input_ids.detach().cpu(), 0, [], []))
            continue
        outputs.append(
            (
                generated_ids[row_idx, : sample.seq_len].detach().cpu(),
                sampled_counts[row_idx],
                sampled_chord_ids[row_idx],
                sampled_segment_ids[row_idx],
            )
        )
    return outputs
909
+
910
+
911
def save_outputs(
    output_dir: str,
    output_prefix: str,
    sample: HFTemplateSample,
    layout: TokenLayout,
    generated_ids: torch.Tensor,
    sampled_chord_ids: list[int],
    sampled_segment_ids: list[int],
    args: argparse.Namespace,
    mucodec_decoder: Any = None,
) -> None:
    """Persist one generation result as a decoded wav plus a chord/segment JSON.

    Audio tokens are extracted from ``generated_ids`` by the id range
    ``[layout.audio_start, layout.audio_end)``, shifted back to codebook space,
    and decoded through MuCodec; the sampled per-token chord/segment ids are
    converted to time intervals, cleaned of "pad" entries, and merged across
    one-frame gaps before being written out.

    Output locations: ``args.json_output_dir``/``args.wav_output_dir`` when
    set, otherwise ``<output_dir>/json`` and ``<output_dir>/wav``; file names
    use ``output_prefix`` or an auto-generated ``song_id_idx_timestamp``.
    """
    import time

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    prefix = output_prefix or f"{sample.song_id}_{args.sample_idx}_{stamp}"

    json_dir = args.json_output_dir or os.path.join(output_dir, "json")
    wav_dir = args.wav_output_dir or os.path.join(output_dir, "wav")
    Path(json_dir).mkdir(parents=True, exist_ok=True)
    Path(wav_dir).mkdir(parents=True, exist_ok=True)

    json_path = os.path.join(json_dir, f"{prefix}.chord_segment.json")
    wav_path = os.path.join(wav_dir, f"{prefix}.wav")

    gen_full = generated_ids.cpu().numpy().astype(np.int64)

    # Keep only ids inside the audio-token range, then shift them back to
    # raw MuCodec codebook indices.
    gen_audio_raw = gen_full[
        (gen_full >= layout.audio_start) & (gen_full < layout.audio_end)
    ]
    gen_audio_shift = gen_audio_raw - layout.audio_start

    save_t0 = time.perf_counter()
    if gen_audio_shift.size == 0:
        print("[WARN] No generated MuCodec tokens; skipping wav decode.")
    else:
        # torchaudio is only needed on the decode path; import lazily.
        # NOTE(review): this branch assumes mucodec_decoder is usable despite
        # its None default — confirm callers always pass a decoder when audio
        # tokens can be generated.
        import torchaudio

        wave = decode_mucodec_codes(mucodec_decoder, gen_audio_shift, args)
        torchaudio.save(wav_path, wave, int(args.mucodec_sample_rate))
        print(f"[OK] {wav_path}")

    chord_intervals = to_intervals(
        sampled_chord_ids, fps=int(args.fps), mapper=chord_id_to_type
    )
    segment_intervals = to_intervals(
        sampled_segment_ids, fps=int(args.fps), mapper=segment_id_to_type
    )

    # PAD is used for EOS-related conditioning; drop it in exported json.
    chord_intervals = [x for x in chord_intervals if x.get("type") != "pad"]
    segment_intervals = [x for x in segment_intervals if x.get("type") != "pad"]
    # Dropping "pad" can split an interval in two; merging across a one-frame
    # gap stitches same-type neighbors back together.
    chord_intervals = merge_same_type_with_small_gap(
        chord_intervals, fps=int(args.fps), max_gap_frames=1
    )
    segment_intervals = merge_same_type_with_small_gap(
        segment_intervals, fps=int(args.fps), max_gap_frames=1
    )

    chord_segment = {
        "song_id": sample.song_id,
        "sample_idx": int(args.sample_idx),
        "fps": int(args.fps),
        "generated_audio_count": int(gen_audio_raw.shape[0]),
        "chord": chord_intervals,
        "segment": segment_intervals,
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(chord_segment, f, ensure_ascii=False, indent=2)

    print(f"[OK] {json_path}")
    save_time_s = time.perf_counter() - save_t0
    print(
        "[PROFILE] save "
        f"save_s={save_time_s:.3f} "
        f"generated_audio_count={int(gen_audio_raw.shape[0])}"
    )
988
+
989
+
990
def main() -> None:
    """End-to-end single-sample inference: load model and sample, decode, save.

    Pipeline: parse args, seed RNGs, resolve device/dtype, load the MAGEL
    checkpoint (optionally torch.compile'd), fetch one template sample from
    the HF dataset, run segment-wise autoregressive generation, then write
    the wav + chord/segment JSON via ``save_outputs``.
    """
    import time

    args = parse_args()
    seed_everything(args.seed)

    # --no_cache overrides --use_cache.
    use_cache = args.use_cache and not args.no_cache

    device = resolve_device(args.device)
    dtype = {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }[args.dtype]
    if device.type == "cpu" and dtype != torch.float32:
        # Half-precision kernels are commonly missing on CPU; degrade safely.
        print(f"[WARN] dtype {dtype} on CPU may be unsupported; fallback to float32.")
        dtype = torch.float32

    print(f"[INFO] device={device}, dtype={dtype}, use_cache={use_cache}")
    print(f"[INFO] loading model from {args.model_path}")
    model = load_magel_checkpoint(
        checkpoint_path=args.model_path,
        device=device,
        dtype=dtype,
        attn_implementation=args.attn_implementation,
    )
    model = maybe_compile_model(
        model,
        enabled=bool(args.compile),
        mode=str(args.compile_mode),
    )
    # CLI flag wins; otherwise fall back to the checkpoint config (16384 as a
    # last resort when the config lacks magel_num_audio_token).
    num_audio_codebook = (
        int(args.num_audio_codebook)
        if args.num_audio_codebook is not None
        else int(getattr(model.config, "magel_num_audio_token", 16384))
    )
    print(f"[INFO] num_audio_codebook={num_audio_codebook}")

    print(f"[INFO] loading HF sample idx={args.sample_idx} from {args.dataset_path}")
    sample = load_hf_template_sample(
        dataset_path=args.dataset_path,
        split=args.split,
        tokenizer_path=args.tokenizer_path,
        sample_idx=args.sample_idx,
        num_audio_codebook=num_audio_codebook,
    )
    layout = TokenLayout(
        num_text_token=sample.num_text_token,
        num_audio_codebook=num_audio_codebook,
    )
    print(
        f"[INFO] song_id={sample.song_id}, seq_len={sample.seq_len}, segments={len(sample.segments)}"
    )
    mucodec_decoder = build_mucodec_decoder(args)
    print("[INFO] running segment-level autoregressive generation...")
    t1 = time.time()
    (
        generated_ids,
        sampled_count,
        sampled_chord_ids,
        sampled_segment_ids,
    ) = generate_segmentwise(
        model=model,
        sample=sample,
        layout=layout,
        device=device,
        use_cache=use_cache,
        temperature=float(args.temperature),
        top_k=int(args.top_k),
        top_p=float(args.top_p),
        greedy=bool(args.greedy),
        max_audio_tokens=max(0, int(args.max_audio_tokens)),
    )

    print(f"[INFO] sampled audio tokens: {sampled_count}")
    print(f"[INFO] output sequence length: {generated_ids.numel()}")
    t2 = time.time()

    print("total time:", t2 - t1)

    save_outputs(
        output_dir=args.output_dir,
        output_prefix=args.output_prefix,
        sample=sample,
        layout=layout,
        generated_ids=generated_ids,
        sampled_chord_ids=sampled_chord_ids,
        sampled_segment_ids=sampled_segment_ids,
        args=args,
        mucodec_decoder=mucodec_decoder,
    )
1081
+
1082
+
1083
+ if __name__ == "__main__":
1084
+ main()
modelling_qwen3.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from typing import Any, Optional
4
+
5
+ from transformers.modeling_outputs import CausalLMOutputWithPast
6
+ from transformers.models.qwen3.modeling_qwen3 import Qwen3Config, Qwen3ForCausalLM
7
+ from transformers.cache_utils import Cache
8
+
9
+ from decoders import Qwen3DecoderLayerAdaLN
10
+ from condition_encoders import ConditionEncoder
11
+ from vocab import (
12
+ CHORD_BOS_ID,
13
+ CHORD_EOS_ID,
14
+ CHORD_N_ID,
15
+ SEGMENT_FALLBACK_ID,
16
+ STRUCTURE_BOS_ID,
17
+ STRUCTURE_EOS_ID,
18
+ )
19
+
20
+
21
class MAGEL(Qwen3ForCausalLM):
    """Qwen3 causal LM adapted for conditioned music-token generation.

    Differences from the stock Qwen3ForCausalLM:
    - masks-based CE loss (``ce_loss``): only positions flagged in ``masks``
      contribute to the cross-entropy.
    - decoder layers replaced with ``Qwen3DecoderLayerAdaLN`` so that a
      chord/structure condition embedding can modulate each layer (AdaLN).
    - during training, contiguous spans of chord/structure conditions are
      randomly dropped to a neutral id (classifier-free-guidance-style
      condition dropout).
    """

    def __init__(
        self,
        config: Qwen3Config,
        **kwargs: Any,
    ):
        super().__init__(config)

        # AdaLN conditioning dimension mirrors the transformer hidden size.
        adaln_dim = int(config.hidden_size)
        chord_dropout_trigger_prob = float(config.magel_chord_dropout_trigger_prob)
        structure_dropout_trigger_prob = float(config.magel_structure_dropout_trigger_prob)

        self.vocab_size = config.vocab_size
        self.adaln_dim = adaln_dim

        self.condition_encoder = ConditionEncoder(hidden_size=adaln_dim)
        self.chord_dropout_trigger_prob = chord_dropout_trigger_prob
        self.structure_dropout_trigger_prob = structure_dropout_trigger_prob

        # Swap every decoder layer for its AdaLN-conditioned variant.
        for layer_idx in range(len(self.model.layers)):
            self.model.layers[layer_idx] = Qwen3DecoderLayerAdaLN(
                config,
                layer_idx=layer_idx,
                cond_dim=adaln_dim,
            )

        # Persist MAGEL-specific ctor args so checkpoints can be reloaded without
        # out-of-band flags.
        self.config.magel_chord_dropout_trigger_prob = chord_dropout_trigger_prob
        self.config.magel_structure_dropout_trigger_prob = structure_dropout_trigger_prob

        self.post_init()

    @staticmethod
    def _drop_audio_condition_spans(
        ids: torch.LongTensor,
        condition_mask: torch.BoolTensor,
        trigger_prob: float,
        replacement_id: int,
        bos_id: int,
        eos_id: int,
    ) -> torch.LongTensor:
        """Randomly replace spans of condition ids with ``replacement_id``.

        For each batch row, with probability ``trigger_prob``, a random
        fraction of that row's eligible condition positions is overwritten in
        chunks of up to 25 contiguous eligible positions.  BOS/EOS ids and
        positions outside ``condition_mask`` are never touched.  Returns the
        input unchanged (same tensor) when no dropout applies, otherwise a
        modified clone.
        """
        if trigger_prob <= 0.0:
            return ids

        # Only drop aligned audio-condition positions; keep BOS/EOS untouched.
        eligible_mask = condition_mask & (ids != bos_id) & (ids != eos_id)

        if not eligible_mask.any():
            return ids

        dropped = ids.clone()
        # One Bernoulli draw per row decides whether that row gets dropout.
        trigger_mask = torch.rand(ids.size(0), device=ids.device) < trigger_prob
        span_len = 25

        for batch_idx in torch.nonzero(trigger_mask, as_tuple=False).flatten():
            candidate_positions = torch.nonzero(
                eligible_mask[batch_idx], as_tuple=False
            ).flatten()
            num_candidates = int(candidate_positions.numel())
            if num_candidates == 0:
                continue
            # Uniform fraction of this row's candidates gets dropped.
            drop_ratio = torch.rand((), device=ids.device).item()
            num_to_drop = int(round(drop_ratio * num_candidates))
            if num_to_drop <= 0:
                continue

            # Drop in contiguous (in candidate order) chunks of <= span_len,
            # removing used positions from the pool after each chunk.
            remaining = num_to_drop
            available_positions = candidate_positions.clone()
            while remaining > 0:
                num_available = int(available_positions.numel())
                if num_available == 0:
                    break

                cur_span_len = min(span_len, remaining)
                if num_available <= cur_span_len:
                    start_idx = 0
                    selected_positions = available_positions[:cur_span_len]
                else:
                    max_start = num_available - cur_span_len + 1
                    start_idx = int(
                        torch.randint(0, max_start, (1,), device=ids.device).item()
                    )
                    selected_positions = available_positions[
                        start_idx : start_idx + cur_span_len
                    ]
                dropped[batch_idx, selected_positions] = replacement_id

                keep_mask = torch.ones(
                    num_available,
                    dtype=torch.bool,
                    device=ids.device,
                )
                keep_mask[start_idx : start_idx + int(selected_positions.numel())] = False
                available_positions = available_positions[keep_mask]
                remaining -= int(selected_positions.numel())

        return dropped

    def _build_condition(
        self,
        chord_ids: Optional[torch.LongTensor],
        structure_ids: Optional[torch.LongTensor],
        condition_mask: Optional[torch.BoolTensor],
        cond_precomputed: Optional[torch.FloatTensor],
    ) -> Optional[torch.FloatTensor]:
        """Resolve the AdaLN condition embedding for this forward pass.

        Precedence: a caller-supplied ``cond_precomputed`` is used verbatim
        (the inference path precomputes it once per sequence); otherwise the
        condition is encoded from ``chord_ids``/``structure_ids``, with span
        dropout applied first when in training mode.  Returns ``None`` when
        no condition inputs are available.
        """
        if cond_precomputed is not None:
            return cond_precomputed
        if chord_ids is None or structure_ids is None:
            return None
        if self.training:
            if condition_mask is None:
                raise ValueError("condition_mask is required during training.")
            chord_ids = self._drop_audio_condition_spans(
                ids=chord_ids,
                condition_mask=condition_mask,
                trigger_prob=self.chord_dropout_trigger_prob,
                replacement_id=CHORD_N_ID,
                bos_id=CHORD_BOS_ID,
                eos_id=CHORD_EOS_ID,
            )
            structure_ids = self._drop_audio_condition_spans(
                ids=structure_ids,
                condition_mask=condition_mask,
                trigger_prob=self.structure_dropout_trigger_prob,
                replacement_id=SEGMENT_FALLBACK_ID,
                bos_id=STRUCTURE_BOS_ID,
                eos_id=STRUCTURE_EOS_ID,
            )
        return self.condition_encoder(chord_ids, structure_ids)

    def ce_loss(
        self,
        logits: torch.FloatTensor,
        labels: Optional[torch.LongTensor],
        masks: Optional[torch.LongTensor],
    ) -> Optional[torch.Tensor]:
        """Masked next-token cross-entropy, averaged over valid positions.

        Standard causal shift (predict position t+1 from t); positions where
        ``masks`` is falsy are set to ignore_index.  Returns ``None`` when
        labels or masks are absent, and a zero scalar when no position is
        valid (keeps the graph differentiable without dividing by zero).
        """
        if labels is None or masks is None:
            return None

        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = labels[:, 1:].clone()
        valid_token_mask = masks[:, 1:].bool().contiguous()

        if not valid_token_mask.any():
            return shift_logits.new_zeros(())

        shift_labels.masked_fill_(~valid_token_mask, -100)
        # Sum-then-divide gives an exact mean over valid tokens only.
        loss_sum = F.cross_entropy(
            shift_logits.view(-1, self.config.vocab_size),
            shift_labels.view(-1).to(shift_logits.device),
            ignore_index=-100,
            reduction="sum",
        )
        valid_count = valid_token_mask.sum().to(
            device=loss_sum.device,
            dtype=loss_sum.dtype,
        )
        return loss_sum / valid_count.clamp_min(1)

    def forward(
        self,
        input_ids: torch.LongTensor,
        masks: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        chord_ids: Optional[torch.LongTensor] = None,
        structure_ids: Optional[torch.LongTensor] = None,
        condition_mask: Optional[torch.BoolTensor] = None,
        cond_precomputed: Optional[torch.FloatTensor] = None,
    ) -> CausalLMOutputWithPast:
        """Conditioned causal-LM forward pass.

        Builds (or reuses) the condition embedding, runs the AdaLN-modified
        backbone with ``cond_expanded``/``condition_mask`` extras, and applies
        the masked CE loss when ``labels`` and ``masks`` are provided.
        """

        if use_cache is None:
            use_cache = self.config.use_cache

        if inputs_embeds is None:
            inputs_embeds = self.model.embed_tokens(input_ids)

        cond = self._build_condition(
            chord_ids=chord_ids,
            structure_ids=structure_ids,
            condition_mask=condition_mask,
            cond_precomputed=cond_precomputed,
        )

        # The backbone's decoder layers consume cond_expanded/condition_mask
        # (extra kwargs accepted by Qwen3DecoderLayerAdaLN).
        base_out = self.model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cond_expanded=cond,
            condition_mask=condition_mask,
            cache_position=cache_position,
        )

        hidden_states = base_out.last_hidden_state
        logits = self.lm_head(hidden_states)
        loss = self.ce_loss(logits=logits, labels=labels, masks=masks)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=base_out.past_key_values,
            hidden_states=base_out.hidden_states,
            attentions=base_out.attentions,
        )
muse_mucodec_chord.ds/dataset_dict.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"splits": ["train", "validation"]}
runtime_utils.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ import datasets
4
+ import numpy as np
5
+ import torch
6
+ from datasets import DatasetDict
7
+ from transformers import AutoConfig
8
+
9
+ from dataset import MusicDataset
10
+ from modelling_qwen3 import MAGEL
11
+
12
+
13
def seed_everything(seed: int) -> None:
    """Seed every RNG the pipeline touches (Python, NumPy, torch, CUDA)."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
19
+
20
+
21
def resolve_device(device_arg: str) -> torch.device:
    """Turn a CLI device string into a torch.device.

    ``"auto"`` picks the best available backend (cuda > mps > cpu); any
    other value is passed straight to ``torch.device``.
    """
    if device_arg == "auto":
        if torch.cuda.is_available():
            return torch.device("cuda")
        if torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")
    return torch.device(device_arg)
29
+
30
+
31
def move_batch_to_device(
    batch: dict[str, torch.Tensor], device: torch.device
) -> dict[str, torch.Tensor]:
    """Return a copy of *batch* with every tensor value moved to *device*.

    Non-tensor values (strings, ints, lists, ...) are passed through as-is.
    """
    moved: dict[str, torch.Tensor] = {}
    for key, value in batch.items():
        moved[key] = value.to(device) if torch.is_tensor(value) else value
    return moved
38
+
39
def load_music_dataset(
    dataset_path: str,
    split: str,
    tokenizer_path: str,
    num_audio_token: int = 16384,
    fps: int = 25,
    use_fast: bool = True,
) -> MusicDataset:
    """Load an on-disk HF dataset and wrap the requested split in MusicDataset.

    Accepts either a ``DatasetDict`` (the split must exist in it) or a bare
    dataset, which is wrapped under the requested split name.
    """
    loaded = datasets.load_from_disk(dataset_path)
    if not isinstance(loaded, DatasetDict):
        container = {split: loaded}
    else:
        if split not in loaded:
            raise KeyError(f"Split not found: {split}")
        container = loaded
    return MusicDataset(
        datasets=container,
        split=split,
        tokenizer_path=tokenizer_path,
        num_audio_token=num_audio_token,
        fps=fps,
        use_fast=use_fast,
    )
62
+
63
+
64
def load_magel_checkpoint(
    checkpoint_path: str,
    device: torch.device,
    dtype: torch.dtype = torch.float32,
    attn_implementation: str = "sdpa",
) -> MAGEL:
    """Load a MAGEL checkpoint from a local directory, in eval mode on *device*.

    Both the config and the weights are read with ``local_files_only=True``,
    so no hub access is attempted.
    """
    checkpoint_config = AutoConfig.from_pretrained(
        checkpoint_path,
        local_files_only=True,
    )

    loaded_model = MAGEL.from_pretrained(
        checkpoint_path,
        config=checkpoint_config,
        torch_dtype=dtype,
        attn_implementation=attn_implementation,
        local_files_only=True,
    )
    loaded_model.to(device=device)
    loaded_model.eval()
    return loaded_model
85
+
86
+
87
def maybe_compile_model(
    model,
    enabled: bool = False,
    mode: str = "reduce-overhead",
):
    """Optionally wrap *model* in ``torch.compile``.

    Tags the returned object with ``_magel_is_compiled`` so later code (see
    the step-begin marker helper) can tell whether CUDA-graph bookkeeping is
    needed.  Raises RuntimeError when compilation is requested but this
    PyTorch build has no ``torch.compile``.
    """
    if not enabled:
        model._magel_is_compiled = False
        return model
    if not hasattr(torch, "compile"):
        raise RuntimeError("torch.compile is not available in this PyTorch build.")
    compiled = torch.compile(model, mode=mode)
    compiled._magel_is_compiled = True
    return compiled
100
+
101
+
102
def maybe_mark_compile_step_begin(model) -> None:
    """Signal a new CUDA-graph step for compiled models; no-op otherwise.

    Does nothing when the model was not tagged by ``maybe_compile_model`` or
    when this PyTorch build lacks ``torch.compiler.cudagraph_mark_step_begin``.
    """
    if not getattr(model, "_magel_is_compiled", False):
        return
    mark_step_begin = getattr(
        getattr(torch, "compiler", None), "cudagraph_mark_step_begin", None
    )
    if mark_step_begin is not None:
        mark_step_begin()
train.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Train MAGEL directly from a vanilla Qwen3 checkpoint.
5
+
6
+ Compared with train.py/train_newparaonly.py, this script:
7
+ 1) Loads an original Qwen3 base checkpoint.
8
+ 2) Resolves MAGEL hparams explicitly at construction time.
9
+ 3) Initializes MAGEL extra modules from scratch and trains end-to-end.
10
+ """
11
+
12
+ import argparse
13
+ import os
14
+
15
+ import torch
16
+ from transformers import (
17
+ AutoConfig,
18
+ Trainer,
19
+ TrainingArguments,
20
+ )
21
+ import datasets
22
+ from dataset import DataCollate, MusicDataset
23
+ from modelling_qwen3 import MAGEL
24
+
25
+
26
+ def resolve_model_source(model_path: str, resume_from_checkpoint: str | None) -> str:
27
+ if not resume_from_checkpoint:
28
+ return model_path
29
+
30
+ if os.path.abspath(model_path) != os.path.abspath(resume_from_checkpoint):
31
+ print(
32
+ "Ignoring --model_path during resume and loading config/model from: "
33
+ f"{resume_from_checkpoint}"
34
+ )
35
+ return resume_from_checkpoint
36
+
37
+
38
def create_model(
    model_path: str,
    model_dtype: torch.dtype,
    target_vocab_size: int,
    attn_implementation: str,
) -> MAGEL:
    """Build a MAGEL model from a local Qwen3 checkpoint and report its size.

    Loads config + weights with ``local_files_only=True`` (size mismatches
    from the MAGEL extras are ignored), resizes the token embeddings to
    *target_vocab_size*, and prints parameter counts plus the resolved MAGEL
    hyper-parameters.
    """
    print(f"Loading Qwen3 model from: {model_path}")

    base_config = AutoConfig.from_pretrained(
        model_path,
        local_files_only=True,
    )

    magel_model = MAGEL.from_pretrained(
        model_path,
        torch_dtype=model_dtype,
        config=base_config,
        attn_implementation=attn_implementation,
        ignore_mismatched_sizes=True,
        local_files_only=True,
    )
    magel_model.resize_token_embeddings(target_vocab_size)

    # Parameter accounting: total, trainable, and MAGEL-only extras
    # (condition encoder + per-layer AdaLN modules).
    total_params = sum(p.numel() for p in magel_model.parameters())
    trainable_params = sum(
        p.numel() for p in magel_model.parameters() if p.requires_grad
    )
    magel_extra_params = sum(
        p.numel()
        for name, p in magel_model.named_parameters()
        if ("condition_encoder" in name or "dit_adaln" in name)
    )

    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"MAGEL extra parameters: {magel_extra_params:,}")
    print(
        "MAGEL config: "
        f"adaln_dim={magel_model.adaln_dim}, "
        f"chord_dropout_trigger_prob={magel_model.chord_dropout_trigger_prob}, "
        f"structure_dropout_trigger_prob={magel_model.structure_dropout_trigger_prob}"
    )

    return magel_model
80
+
81
+
82
def create_dataset(
    dataset_path: str,
    tokenizer_path: str,
    num_audio_token: int = 16384,
) -> MusicDataset:
    """Load the on-disk HF dataset and wrap its train split in MusicDataset."""
    print(f"Loading dataset from: {dataset_path}")
    print(f"Loading tokenizer from: {tokenizer_path}")

    raw_dataset = datasets.load_from_disk(dataset_path)

    train_dataset = MusicDataset(
        raw_dataset,
        split="train",
        tokenizer_path=tokenizer_path,
        num_audio_token=num_audio_token,
        use_fast=True,
    )
    print(f"Dataset size: {len(train_dataset)}")
    return train_dataset
103
+
104
+
105
def build_arg_parser() -> argparse.ArgumentParser:
    """Define the training CLI.

    Extracted from `main` so the argument surface (names, defaults, choices)
    can be inspected and tested without touching model or dataset code.
    """
    parser = argparse.ArgumentParser(
        description="Train MAGEL directly from a vanilla Qwen3 base checkpoint."
    )

    # Data / checkpoint locations.
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="muse_mucodec_chord.ds",
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default="checkpoints/Qwen3-0.6B",
        help="Local Qwen3 base checkpoint path.",
    )
    parser.add_argument(
        "--tokenizer_path",
        type=str,
        default="checkpoints/Qwen3-0.6B",
        help="Local tokenizer checkpoint path.",
    )
    parser.add_argument(
        "--model_dtype",
        type=str,
        default="bfloat16",
        choices=["float32", "float16", "bfloat16"],
    )
    parser.add_argument(
        "--attn_implementation",
        type=str,
        default="sdpa",
        choices=["eager", "sdpa", "flash_attention_2"],
    )

    # Optimization schedule.
    parser.add_argument("--output_dir", type=str, default="./output_qwen3_0p6b_train")
    parser.add_argument("--per_device_train_batch_size", type=int, default=1)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=1e-4)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--num_train_epochs", type=float, default=20)
    parser.add_argument("--warmup_steps", type=int, default=1000)
    parser.add_argument("--max_grad_norm", type=float, default=5.0)
    parser.add_argument("--logging_steps", type=int, default=10)
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="Resume training from a Trainer checkpoint directory such as output_dir/checkpoint-500.",
    )

    # Runtime behavior.
    parser.add_argument("--dataloader_num_workers", type=int, default=12)
    parser.add_argument(
        "--gradient_checkpointing",
        dest="gradient_checkpointing",
        action="store_true",
    )
    parser.add_argument(
        "--deepspeed",
        type=str,
        default=None,
        help="Path to DeepSpeed config. Leave unset to disable DeepSpeed.",
    )

    # Experiment tracking.
    parser.add_argument("--report_to", type=str, default="wandb")
    parser.add_argument("--wandb_project", type=str, default="vaultum-qwen3-0p6b")
    parser.add_argument("--wandb_run_name", type=str, default=None)

    return parser


def main():
    """Entry point: parse args, build dataset and model, run HF Trainer."""
    args = build_arg_parser().parse_args()

    model_dtype = {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }[args.model_dtype]

    # When resuming, config/weights are read from the checkpoint directory
    # rather than the base model path.
    model_source = resolve_model_source(
        model_path=args.model_path,
        resume_from_checkpoint=args.resume_from_checkpoint,
    )

    base_config = AutoConfig.from_pretrained(
        model_source,
        local_files_only=True,
    )

    # Audio vocab size is taken from the checkpoint config so that dataset
    # tokenization and the model's audio head stay in sync.
    num_audio_token = int(base_config.magel_num_audio_token)
    print(f"Using num_audio_token={num_audio_token}")

    train_dataset = create_dataset(
        dataset_path=args.dataset_path,
        tokenizer_path=args.tokenizer_path,
        num_audio_token=num_audio_token,
    )

    target_vocab_size = train_dataset.tokenizer_vocab_size

    model = create_model(
        model_path=model_source,
        model_dtype=model_dtype,
        attn_implementation=args.attn_implementation,
        target_vocab_size=target_vocab_size,
    )

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.per_device_train_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        num_train_epochs=args.num_train_epochs,
        warmup_steps=args.warmup_steps,
        max_grad_norm=args.max_grad_norm,
        logging_steps=args.logging_steps,
        save_strategy="epoch",
        dataloader_num_workers=args.dataloader_num_workers,
        bf16=(args.model_dtype == "bfloat16"),
        fp16=(args.model_dtype == "float16"),
        gradient_checkpointing=args.gradient_checkpointing,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        deepspeed=args.deepspeed,
        remove_unused_columns=False,
        dataloader_drop_last=True,
        report_to=args.report_to,
        logging_dir=None,
        run_name=args.wandb_run_name,
    )

    # Must be exported before the Trainer initializes its wandb callback.
    if args.wandb_project and "wandb" in args.report_to:
        os.environ["WANDB_PROJECT"] = args.wandb_project

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollate(),
    )

    if args.resume_from_checkpoint:
        print(f"Resuming training from checkpoint: {args.resume_from_checkpoint}")
    else:
        print("Starting training from current model initialization.")

    trainer.train(resume_from_checkpoint=args.resume_from_checkpoint)

    # Persist final weights and tokenizer together for later inference.
    final_dir = os.path.join(args.output_dir, "final")
    trainer.save_model(final_dir)
    train_dataset.tokenizer.save_pretrained(final_dir)

    print(f"Training complete. Final model saved to: {final_dir}")


if __name__ == "__main__":
    main()
vocab/__init__.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""Condition vocab package.

Re-exports the chord and section/structure vocabularies so callers can use
``from vocab import chord_to_id`` etc. ``__all__`` mirrors the public
constants and helpers of the two submodules.
"""

from .chord import (
    CHORD_BOS_ID,
    CHORD_EOS_ID,
    CHORD_LABELS,
    CHORD_LABEL_TO_ID,
    CHORD_N_ID,
    NUM_CHORD_CLASSES,
    build_frame_chord_ids,
    chord_id_to_label,
    chord_to_id,
    normalize_chord_text,
)
from .sections import (
    SEGMENT_FALLBACK_ID,
    SEGMENT_LABELS,
    SEGMENT_LABEL_TO_ID,
    NUM_STRUCTURE_CLASSES,
    STRUCTURE_BOS_ID,
    STRUCTURE_EOS_ID,
    build_frame_structure_ids,
    normalize_structure_label,
    structure_id_to_label,
    structure_to_id,
)

# Explicit public API for `from vocab import *`.
__all__ = [
    "CHORD_BOS_ID",
    "CHORD_EOS_ID",
    "CHORD_LABELS",
    "CHORD_LABEL_TO_ID",
    "CHORD_N_ID",
    "NUM_CHORD_CLASSES",
    "normalize_chord_text",
    "chord_to_id",
    "chord_id_to_label",
    "build_frame_chord_ids",
    "SEGMENT_LABELS",
    "SEGMENT_LABEL_TO_ID",
    "SEGMENT_FALLBACK_ID",
    "STRUCTURE_BOS_ID",
    "STRUCTURE_EOS_ID",
    "NUM_STRUCTURE_CLASSES",
    "normalize_structure_label",
    "structure_to_id",
    "structure_id_to_label",
    "build_frame_structure_ids",
]
vocab/chord.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""Chord vocab helpers.

Dataset inspection summary for `muse_mucodec_chord.ds`:
- `chords[].type` uses only 25 labels
- labels are either `N` or `Root:maj|min`
- roots are represented with sharps rather than flats
"""

import math
import re

import numpy as np

# Id layout: 0 = pad, 1 = N (no chord), 2..25 = Root:maj|min, 26 = bos, 27 = eos.
CHORD_BOS_ID = 26
CHORD_EOS_ID = 27
CHORD_N_ID = 1
NUM_CHORD_CLASSES = 28

# Canonical roots are spelled with sharps; enharmonic flat/edge spellings
# fold onto the same pitch class.
_SHARP_ROOTS = (
    "C",
    "C#",
    "D",
    "D#",
    "E",
    "F",
    "F#",
    "G",
    "G#",
    "A",
    "A#",
    "B",
)
_ENHARMONIC_PC = {
    "B#": 0,
    "Db": 1,
    "Eb": 3,
    "Fb": 4,
    "E#": 5,
    "Gb": 6,
    "Ab": 8,
    "Bb": 10,
    "Cb": 11,
}
_PITCH_TO_PC = {root: pc for pc, root in enumerate(_SHARP_ROOTS)}
_PITCH_TO_PC.update(_ENHARMONIC_PC)

CHORD_ROOTS = _SHARP_ROOTS
_QUALITIES = ("maj", "min")
CHORD_LABELS = (
    ("pad", "N")
    + tuple(f"{root}:{quality}" for root in CHORD_ROOTS for quality in _QUALITIES)
    + ("bos", "eos")
)

CHORD_LABEL_TO_ID = {label: index for index, label in enumerate(CHORD_LABELS)}


def normalize_chord_text(label: str) -> str:
    """Canonicalize raw chord text to `Root:maj|min` with a sharp root, or `N`."""
    if not isinstance(label, str):
        return "N"

    text = label.strip()
    if not text or text.lower() in {"n", "none"}:
        return "N"

    # Fold unicode accidentals to ASCII before matching.
    cleaned = text.replace("♯", "#").replace("♭", "b").strip()
    match = re.fullmatch(r"([A-Ga-g])([#b]?):(maj|min)", cleaned, flags=re.IGNORECASE)
    if match is None:
        return "N"

    pitch_class = _PITCH_TO_PC.get(match.group(1).upper() + match.group(2))
    if pitch_class is None:
        return "N"

    return f"{CHORD_ROOTS[pitch_class]}:{match.group(3).lower()}"


def chord_to_id(label: str) -> int:
    """Return the class id for a chord label; unrecognized text maps to `N`."""
    canonical = normalize_chord_text(label)
    return CHORD_LABEL_TO_ID.get(canonical, CHORD_N_ID)


def chord_id_to_label(chord_id: int) -> str:
    """Inverse of `chord_to_id`; out-of-range ids map to `N`."""
    if chord_id < 0 or chord_id >= len(CHORD_LABELS):
        return "N"
    return CHORD_LABELS[chord_id]


def build_frame_chord_ids(
    chord_segments: list[dict], total_frames: int, fps: int = 25
) -> np.ndarray:
    """Rasterize chord segments into a per-frame id array of length `total_frames`.

    Segments may carry explicit `start_frame`/`end_frame` indices, or
    `start`/`end` times in seconds (converted at `fps` frames per second).
    Frames not covered by any segment keep CHORD_N_ID.
    """
    if total_frames > 0:
        frame_ids = np.full((total_frames,), CHORD_N_ID, dtype=np.int64)
    else:
        # Non-positive lengths yield an empty array instead of raising.
        frame_ids = np.zeros((0,), dtype=np.int64)

    if not isinstance(chord_segments, (list, tuple)):
        return frame_ids

    for segment in chord_segments:
        if not isinstance(segment, dict):
            continue

        if "start_frame" in segment and "end_frame" in segment:
            start = int(segment.get("start_frame", 0))
            end = int(segment.get("end_frame", 0))
        else:
            start = int(float(segment.get("start", 0.0)) * fps)
            end = int(math.ceil(float(segment.get("end", 0.0)) * fps))

        # Clamp into [0, total_frames]; empty spans are skipped.
        start = max(0, min(total_frames, start))
        end = max(start, min(total_frames, end))
        if end <= start:
            continue

        raw = segment.get("type") or segment.get("chord") or segment.get("label") or "N"
        frame_ids[start:end] = chord_to_id(str(raw))

    return frame_ids


if __name__ == "__main__":
    import datasets

    ds = datasets.load_from_disk("muse_mucodec_chord.ds")
    sample = ds["train"][0]
    total_frames = 1500  # 60 seconds of audio at 25 fps
    chord_arr = build_frame_chord_ids(sample["chords"], total_frames)
    print(chord_arr)
vocab/sections.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""Section vocab helpers.

Dataset inspection summary for `muse_mucodec_chord.ds`:
- `sections[]` has fields `section/text/start/end/desc`
- `sections[].section` uses only 6 labels:
  `Intro`, `Verse`, `Prechorus`, `Chorus`, `Bridge`, `Outro`
"""

import math
import re

import numpy as np

# Frame-level class ids: 0 is the padding/unknown class, 1..6 the sections.
SEGMENT_LABELS = (
    "pad",
    "intro",
    "verse",
    "chorus",
    "prechorus",
    "bridge",
    "outro",
)
SEGMENT_LABEL_TO_ID = {label: index for index, label in enumerate(SEGMENT_LABELS)}
SEGMENT_FALLBACK_ID = SEGMENT_LABEL_TO_ID["pad"]

# BOS/EOS ids are appended after the section classes for sequence decoding.
STRUCTURE_BOS_ID = len(SEGMENT_LABELS)
STRUCTURE_EOS_ID = STRUCTURE_BOS_ID + 1
NUM_STRUCTURE_CLASSES = STRUCTURE_EOS_ID + 1


def normalize_structure_label(label: str) -> str:
    """Canonicalize a raw section name to one of SEGMENT_LABELS.

    Case, whitespace, dashes/underscores, and numbering are ignored
    (e.g. "Verse 2" -> "verse"); anything unrecognized maps to "pad".
    """
    if not isinstance(label, str):
        return "pad"

    normalized = re.sub(r"[\s_-]+", "", label.strip().lower())
    normalized = re.sub(r"\d+", "", normalized)

    # "pad" itself also maps to "pad", so membership alone is sufficient.
    if normalized in SEGMENT_LABEL_TO_ID:
        return normalized
    return "pad"


def structure_to_id(structure: str) -> int:
    """Return the class id for a section label (unknown -> SEGMENT_FALLBACK_ID)."""
    return SEGMENT_LABEL_TO_ID.get(
        normalize_structure_label(structure), SEGMENT_FALLBACK_ID
    )


def structure_id_to_label(segment_id: int) -> str:
    """Inverse of `structure_to_id`, including the bos/eos sentinel ids."""
    if segment_id == STRUCTURE_BOS_ID:
        return "bos"
    if segment_id == STRUCTURE_EOS_ID:
        return "eos"
    if 0 <= segment_id < len(SEGMENT_LABELS):
        return SEGMENT_LABELS[segment_id]
    return "pad"


def build_frame_structure_ids(
    sections: list[dict], total_frames: int, fps: int = 25
) -> np.ndarray:
    """Rasterize section annotations into a per-frame id array.

    Segments may carry explicit `start_frame`/`end_frame` indices, or
    `start`/`end` times in seconds (converted at `fps` frames per second).
    Frames not covered by any segment keep SEGMENT_FALLBACK_ID.
    """
    # Guard non-positive lengths so np.full never sees a negative shape;
    # mirrors the behavior of build_frame_chord_ids in vocab/chord.py.
    if total_frames <= 0:
        return np.zeros((0,), dtype=np.int64)

    labels = np.full((total_frames,), SEGMENT_FALLBACK_ID, dtype=np.int64)
    if not isinstance(sections, (list, tuple)):
        return labels

    for seg in sections:
        if not isinstance(seg, dict):
            continue

        if "start_frame" in seg and "end_frame" in seg:
            start = int(seg.get("start_frame", 0))
            end = int(seg.get("end_frame", 0))
        else:
            start = int(float(seg.get("start", 0.0)) * fps)
            end = int(math.ceil(float(seg.get("end", 0.0)) * fps))

        # Clamp into [0, total_frames]; empty spans are skipped.
        start = max(0, min(total_frames, start))
        end = max(start, min(total_frames, end))
        if end <= start:
            continue

        label = seg.get("section", seg.get("structure", "")) or ""
        labels[start:end] = structure_to_id(str(label))

    return labels


if __name__ == "__main__":
    # Example usage: a 50-second clip rasterized at 25 fps.
    segments = [
        {"section": "intro", "start": 0.0, "end": 10.0},
        {"section": "Verse", "start": 10.0, "end": 30.0},
        {"section": "Chorus", "start": 30.0, "end": 50.0},
    ]
    total_frames = 50 * 25  # 1250 frames for 50 seconds at 25 fps
    labels = build_frame_structure_ids(segments, total_frames)
    print(labels)
wandb/debug-cli.root.log ADDED
File without changes