Chuhaojin committed on
Commit
242b203
·
verified ·
1 Parent(s): 3a98029

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ hubert_kmeans/model.mdl filter=lfs diff=lfs merge=lfs -text
37
+ llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ llm/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,31 @@
1
- ---
2
- license: cc-by-nc-4.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SentiAvatar Model Checkpoints
2
+
3
+ 请从以下位置下载模型权重,并放置到此目录下:
4
+
5
+ ## 目录结构
6
+
7
+ ```
8
+ checkpoints/
9
+ ├── llm/ # Qwen2-0.5B SFT (Motion Token Planner)
10
+ │ ├── config.json
11
+ │ ├── model.safetensors
12
+ │ ├── tokenizer.json
13
+ │ └── ...
14
+ ├── mask_transformer/ # Audio-Motion Mask Transformer
15
+ │ ├── config.json
16
+ │ └── model.safetensors
17
+ ├── rvqvae/ # Residual VQ-VAE
18
+ │ ├── opt.txt # 模型配置
19
+ │ └── model/
20
+ │ └── epoch_30.pth # 模型权重
21
+ ├── face_vqvae/ # Face VQVAE
22
+ │ ├── pytorch_model_face_fad2cl_260116_codesize2048_codelength512.bin
23
+ │ ├── mat_final.npy
24
+ │ └── mat_final_R_I.npy
25
+ ├── chinese-hubert-base/ # Chinese HuBERT
26
+ │ ├── config.json
27
+ │ ├── preprocessor_config.json
28
+ │ └── pytorch_model.bin
29
+ └── eval_model/ # ChronAccRet 评测模型
30
+ └── best_model.pt
31
+ ```
chinese-hubert-base/README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+ Pretrained on 10k hours WenetSpeech L subset. More details in [TencentGameMate/chinese_speech_pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
5
+
6
+ This model does not have a tokenizer as it was pretrained on audio alone.
7
+ In order to use this model for speech recognition, a tokenizer should be created and the model should be fine-tuned on labeled text data.
8
+
9
+ python package:
10
+ transformers==4.16.2
11
+
12
+ ```python
13
+
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import soundfile as sf
18
+
19
+ from transformers import (
20
+ Wav2Vec2FeatureExtractor,
21
+ HubertModel,
22
+ )
23
+
24
+
25
+ model_path=""
26
+ wav_path=""
27
+ device = "cuda" if torch.cuda.is_available() else "cpu"
28
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
29
+ model = HubertModel.from_pretrained(model_path)
30
+
31
+ # for pretrain: Wav2Vec2ForPreTraining
32
+ # model = Wav2Vec2ForPreTraining.from_pretrained(model_path)
33
+
34
+ model = model.to(device)
35
+ model = model.half()
36
+ model.eval()
37
+
38
+ wav, sr = sf.read(wav_path)
39
+ input_values = feature_extractor(wav, return_tensors="pt").input_values
40
+ input_values = input_values.half()
41
+ input_values = input_values.to(device)
42
+
43
+ with torch.no_grad():
44
+ outputs = model(input_values)
45
+ last_hidden_state = outputs.last_hidden_state
46
+
47
+
48
+ ```
chinese-hubert-base/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModel"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "sum",
39
+ "ctc_zero_infinity": false,
40
+ "do_stable_layer_norm": false,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_norm": "group",
44
+ "feat_proj_dropout": 0.0,
45
+ "feat_proj_layer_norm": true,
46
+ "final_dropout": 0.1,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout": 0.1,
49
+ "hidden_size": 768,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 3072,
52
+ "layer_norm_eps": 1e-05,
53
+ "layerdrop": 0.1,
54
+ "mask_feature_length": 10,
55
+ "mask_feature_min_masks": 0,
56
+ "mask_feature_prob": 0.0,
57
+ "mask_time_length": 10,
58
+ "mask_time_min_masks": 2,
59
+ "mask_time_prob": 0.05,
60
+ "model_type": "hubert",
61
+ "num_attention_heads": 12,
62
+ "num_conv_pos_embedding_groups": 16,
63
+ "num_conv_pos_embeddings": 128,
64
+ "num_feat_extract_layers": 7,
65
+ "num_hidden_layers": 12,
66
+ "pad_token_id": 0,
67
+ "torch_dtype": "float32",
68
+ "transformers_version": "4.20.0.dev0",
69
+ "use_weighted_layer_sum": false,
70
+ "vocab_size": 32
71
+ }
chinese-hubert-base/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
chinese-hubert-base/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fefccd26c2794a583b80f6f7210c721873cb7ebae2c1cde3baf9b27855e24d8
3
+ size 377552987
eval_model/best_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:828fe6b931e1ca8cc8f092836290f998507be286f1ac5149ed503b49d65ddb01
3
+ size 454859165
face_vqvae/mat_final.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f055de09c64182696499a26c2d6109349c627195bcd40c6adc3dd27f3922b34b
3
+ size 21140
face_vqvae/mat_final_R_I.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67befad9e98e372995b5c5d6883bf98a4f6d993f09e139857d12fe16c7257242
3
+ size 21140
face_vqvae/pytorch_model_face_fad2cl_260116_codesize2048_codelength512.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1353b1f67308a4ebe6a5c81a0c8a255963b806125717c0d5eb32165767c974f0
3
+ size 51968811
hubert_kmeans/model.mdl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1faf1a70098f1853427347520475a802de9aaf7dfb955c3af2cd83b6ca3857cd
3
+ size 1538989
llm/added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
llm/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 896,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 4864,
12
+ "max_position_embeddings": 32768,
13
+ "max_window_layers": 24,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 14,
16
+ "num_hidden_layers": 24,
17
+ "num_key_value_heads": 2,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_scaling": null,
20
+ "rope_theta": 1000000.0,
21
+ "sliding_window": 32768,
22
+ "tie_word_embeddings": true,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.50.0",
25
+ "use_cache": false,
26
+ "use_sliding_window": false,
27
+ "vocab_size": 225250
28
+ }
llm/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.50.0"
14
+ }
llm/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
llm/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:148ea91b5a9c20a6e388e4145b24228b178174c5d04cf2a24c8ee45d2a8426c7
3
+ size 1119476616
llm/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
llm/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77ea10323ffa96cd0baf4ad882dc2bced4d304f9fafd10ee05b4950d98c1179b
3
+ size 25286251
llm/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d454d38e6758b1e98515e8a7f5460aed3b0af55156b7dd1557cd0185114dd544
3
+ size 13133512
llm/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
mask_transformer/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "AudioMotionTransformer"
4
+ ],
5
+ "audio_feat_dim": 768,
6
+ "codebook_size": 512,
7
+ "cond_drop_prob": 0.2,
8
+ "dropout": 0.2,
9
+ "dtype": "float32",
10
+ "hidden_act": "gelu",
11
+ "hidden_size": 512,
12
+ "intermediate_size": 1536,
13
+ "max_position_embeddings": 512,
14
+ "model_type": "audio_motion_transformer",
15
+ "num_frames": 5,
16
+ "num_heads": 16,
17
+ "num_layers": 8,
18
+ "num_tokens_per_frame": 4,
19
+ "rms_norm_eps": 1e-06,
20
+ "transformers_version": "4.57.1",
21
+ "vocab_size": 2049
22
+ }
mask_transformer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8ed946d32423cc7bf393f9967f7d3fbd11894b5f0b67ff55dbffdb5471358bb
3
+ size 96170728
mask_transformer/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6113ea4928940205ec665c711dc80be83466909f2cbb7dcdc9a05479a1d5b970
3
+ size 192411979
mask_transformer/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba6cdd9db0560aaf782233459fbc22e7b29251377516e638dc99072fc9275b68
3
+ size 15429
mask_transformer/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f166488621222660ee59b49e91252a3d8629c4c55a1695f1ee697de614582c8c
3
+ size 15429
mask_transformer/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abac5d9e6b4349d5e1569132ca1055d02315f27bf7dbc7383a633a787ac0de72
3
+ size 15429
mask_transformer/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5ea0c1e69201fb66e3dede3dfb473772b1a42c59a22368456e5dfc002b652e3
3
+ size 15429
mask_transformer/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2a68cc89bc205077640e629149b54fb06ae537dfdeb010a8c11a0804c012cda
3
+ size 1465
mask_transformer/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
mask_transformer/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:845ce5cf164cb40b2109e8c7665fb79c6e8ebf8e4aadeab38af2e24879078735
3
+ size 5777
rvqvae/model/epoch_30.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92e3acf2b8a34a8705b24a8005b891d18b590d78fb2d37a11debe29389530a43
3
+ size 790198921
rvqvae/opt.txt ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ------------ Options -------------
2
+ batch_size: 256
3
+ body_dim: 153
4
+ body_joints_num: 24
5
+ body_parts: ['body', 'left', 'right', 'positions']
6
+ checkpoints_dir: ./checkpoints
7
+ code_dim: 512
8
+ commit: 0.02
9
+ data_root: /disk1/chuhao/dataset/mocap/mocap_susu_gen_demo/quat63nodes_v4_fix_pos
10
+ dataset_name: quat63nodes_v2_0120
11
+ debug: False
12
+ depth: 3
13
+ dilation_growth_rate: 3
14
+ down_t: 1
15
+ eval_every_e: 1
16
+ feat_bias: 5
17
+ fps: 20
18
+ gamma: 0.05
19
+ gpu_id: 0
20
+ is_continue: False
21
+ left_dim: 120
22
+ left_joints_num: 20
23
+ local_rank: 0
24
+ log_dir: ./log/vq
25
+ log_every: 10
26
+ loss_vel: 50.0
27
+ lr: 0.0001
28
+ max_epoch: 100
29
+ milestones: [50000, 1000000]
30
+ mu: 0.99
31
+ name: gqzV4
32
+ nb_code: 512
33
+ num_quantizers: 4
34
+ num_workers: 4
35
+ quantize_dropout_cutoff_index: 1
36
+ quantize_dropout_prob: 0.8
37
+ recons_loss: l1_smooth
38
+ right_dim: 120
39
+ right_joints_num: 20
40
+ save_every_e: 2
41
+ save_latest: 500
42
+ seed: 3407
43
+ shared_codebook: False
44
+ start_positions_epoch: 0
45
+ stride_t: 2
46
+ total_joints_num: 63
47
+ use_whole_encoder: False
48
+ vq_act: relu
49
+ vq_cnn_depth: 3
50
+ vq_norm: None
51
+ warm_up_iter: 2000
52
+ weight_decay: 0.0
53
+ weight_rec: 5.0
54
+ which_epoch: latest
55
+ whole_dim: 393
56
+ width: 512
57
+ window_size: 64
58
+ -------------- End ----------------