File size: 1,679 Bytes
0afe769
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import torch
from safetensors.torch import load_model
import json
from importlib.resources import files
from types import SimpleNamespace

from .amphion_codec.codec import CodecEncoder, CodecDecoder

class amphion_codec:
    def __init__(self, configs, device=None):
        self.device = device

        cfg_path = files(__package__) / "maskgct.json"
        with open(cfg_path, "r", encoding="utf-8") as f:
            cfg = json.load(f, object_hook=lambda d: SimpleNamespace(**d))

        self.codec_encoder, self.codec_decoder = self.build_acoustic_codec(cfg.model.acoustic_codec, device)
        load_model(self.codec_encoder, f"{configs.infer_ckpt_dir}/MaskGCT_codec_encoder.safetensors")
        load_model(self.codec_decoder, f"{configs.infer_ckpt_dir}/MaskGCT_codec_decoder.safetensors")

    def build_acoustic_codec(self, cfg, device):
        codec_encoder = CodecEncoder(cfg=cfg.encoder)
        codec_decoder = CodecDecoder(cfg=cfg.decoder)
        codec_encoder.eval()
        codec_decoder.eval()
        codec_encoder.to(device)
        codec_decoder.to(device)
        return codec_encoder, codec_decoder
    
    @torch.no_grad()
    def encode(self, speech_24k):
        vq_emb = self.codec_encoder(speech_24k)
        _, vq, _, _, _ = self.codec_decoder.quantizer(vq_emb)
        acoustic_code = vq.permute(1, 2, 0)
        return acoustic_code # [b, l, c]
    
    @torch.no_grad()
    def decode(self, tokens, n_quantizers=12):
        vq_emb = self.codec_decoder.vq2emb(tokens.permute(2, 0, 1), n_quantizers)
        recovered_audio = self.codec_decoder(vq_emb)
        recovered_audio = recovered_audio[0][0].cpu().numpy()
        return recovered_audio