jingyaogong commited on
Commit
477708d
·
verified ·
1 Parent(s): bce19b2

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +10 -3
  2. config.json +49 -0
  3. model.safetensors +3 -0
  4. preprocessor_config.json +10 -0
README.md CHANGED
@@ -1,3 +1,10 @@
1
- ---
2
- license: cc-by-4.0
3
- ---
 
 
 
 
 
 
 
 
1
+ # Mimi FP16
2
+
3
+ This directory contains an FP16-converted version of [Kyutai Mimi](https://huggingface.co/kyutai/mimi), used as an audio codec for encoding and decoding. The original model was released by Kyutai, and the model architecture, weights, authorship, and license remain attributed to the original authors.
4
+
5
+ The original Mimi model is released under the `CC-BY-4.0` license. The files here have only been converted to FP16 to reduce memory usage and make local usage more convenient. When using, redistributing, or citing these files, please keep the original Kyutai attribution, model link, and license notice.
6
+
7
+ Original links:
8
+
9
+ - Hugging Face: https://huggingface.co/kyutai/mimi
10
+ - License: https://creativecommons.org/licenses/by/4.0/
config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MimiModel"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "audio_channels": 1,
7
+ "codebook_dim": 256,
8
+ "codebook_size": 2048,
9
+ "compress": 2,
10
+ "dilation_growth_rate": 2,
11
+ "frame_rate": 12.5,
12
+ "head_dim": 64,
13
+ "hidden_act": "gelu",
14
+ "hidden_size": 512,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 2048,
17
+ "kernel_size": 7,
18
+ "last_kernel_size": 3,
19
+ "layer_scale_initial_scale": 0.01,
20
+ "max_position_embeddings": 8000,
21
+ "model_type": "mimi",
22
+ "norm_eps": 1e-05,
23
+ "normalize": false,
24
+ "num_attention_heads": 8,
25
+ "num_filters": 64,
26
+ "num_hidden_layers": 8,
27
+ "num_key_value_heads": 8,
28
+ "num_residual_layers": 1,
29
+ "num_semantic_quantizers": 1,
30
+ "pad_mode": "constant",
31
+ "residual_kernel_size": 3,
32
+ "rope_theta": 10000.0,
33
+ "sampling_rate": 24000,
34
+ "sliding_window": 250,
35
+ "torch_dtype": "float16",
36
+ "transformers_version": "4.45.0.dev0",
37
+ "trim_right_ratio": 1.0,
38
+ "upsample_groups": 512,
39
+ "upsampling_ratios": [
40
+ 8,
41
+ 6,
42
+ 5,
43
+ 4
44
+ ],
45
+ "use_cache": false,
46
+ "use_causal_conv": true,
47
+ "use_conv_shortcut": false,
48
+ "vector_quantization_hidden_dimension": 256
49
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7542ee039d3025d5089cf227d21df64b6b8eff08fcd376a11a1fbd178dd9d3f5
3
+ size 192346842
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length_s": null,
3
+ "feature_extractor_type": "EncodecFeatureExtractor",
4
+ "feature_size": 1,
5
+ "overlap": null,
6
+ "padding_side": "right",
7
+ "padding_value": 0.0,
8
+ "return_attention_mask": true,
9
+ "sampling_rate": 24000
10
+ }