yujiepan commited on
Commit
80d69e7
·
verified ·
1 Parent(s): 001a3af

Upload folder using huggingface_hub

Browse files
.meta.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "torch": "2.11.0+cu126",
3
+ "transformers": "5.7.0.dev0"
4
+ }
README.md ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ base_model:
4
+ - tencent/Hy3-preview
5
+ ---
6
+
7
+ This tiny model is intended for debugging. It is randomly initialized using the configuration adapted from [tencent/Hy3-preview](https://huggingface.co/tencent/Hy3-preview).
8
+
9
+ | File path | Size |
10
+ |------|------|
11
+ | model.safetensors | 5.4MB |
12
+
13
+
14
+ ### Example usage:
15
+
16
+ ```python
17
+ from transformers import AutoModelForCausalLM, AutoTokenizer
18
+
19
+ model_id = "yujiepan/hy-v3-tiny-random"
20
+
21
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
22
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", trust_remote_code=True)
23
+ messages = [
24
+ {"role": "user", "content": "Write a short poem about AI."},
25
+ ]
26
+ inputs = tokenizer.apply_chat_template(
27
+ messages,
28
+ tokenize=True,
29
+ return_tensors="pt",
30
+ add_generation_prompt=True,
31
+ reasoning_effort='high',
32
+ )
33
+ print(inputs)
34
+ outputs = model.generate(**inputs.to(model.device), max_new_tokens=32)
35
+ output_text = tokenizer.decode(outputs[0])
36
+ print(output_text)
37
+ ```
38
+
39
+ ### Code used to create this repo:
40
+
41
+ <details>
42
+ <summary>Click to expand</summary>
43
+
44
+ ```python
45
+ import json
46
+ from copy import deepcopy
47
+ from pathlib import Path
48
+
49
+ import torch
50
+ import torch.nn as nn
51
+
52
+ from huggingface_hub import file_exists, hf_hub_download
53
+ from transformers import (
54
+ AutoConfig,
55
+ AutoModelForCausalLM,
56
+ AutoTokenizer,
57
+ GenerationConfig,
58
+ set_seed,
59
+ )
60
+
61
+ source_model_id = "tencent/Hy3-preview"
62
+ save_folder = "/tmp/yujiepan/hy-v3-tiny-random"
63
+
64
+ processor = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
65
+ processor.save_pretrained(save_folder)
66
+
67
+ with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
68
+ config_json = json.load(f)
69
+ config_json.update({
70
+ 'expert_hidden_dim': 32,
71
+ 'moe_intermediate_size': 32,
72
+ 'head_dim': 32,
73
+ 'hidden_size': 8,
74
+ 'intermediate_size': 32,
75
+ 'num_attention_heads': 8,
76
+ 'num_hidden_layers': 4,
77
+ 'num_key_value_heads': 4,
78
+ })
79
+ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
80
+ json.dump(config_json, f, indent=2)
81
+
82
+ config = AutoConfig.from_pretrained(
83
+ save_folder,
84
+ trust_remote_code=True,
85
+ )
86
+ print(config)
87
+ torch.set_default_dtype(torch.bfloat16)
88
+ model = AutoModelForCausalLM.from_config(config, trust_remote_code=True).eval().cpu()
89
+ torch.set_default_dtype(torch.float32)
90
+ if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
91
+ model.generation_config = GenerationConfig.from_pretrained(
92
+ source_model_id, trust_remote_code=True,
93
+ )
94
+ model.generation_config.top_k = 40  # original value in source model is -1, which is invalid
95
+
96
+ # mtp
97
+ mtp = deepcopy(model.model.layers[-1])
98
+ mtp.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
99
+ mtp.enorm = nn.RMSNorm(config.hidden_size)
100
+ mtp.hnorm = nn.RMSNorm(config.hidden_size)
101
+ mtp.final_layernorm = nn.RMSNorm(config.hidden_size)
102
+ model.model.layers.append(mtp)
103
+
104
+ # init weights
105
+ set_seed(42)
106
+ model = model.cpu().eval()
107
+ n_params = sum(p.numel() for p in model.parameters())
108
+ with torch.no_grad():
109
+ for name, p in sorted(model.named_parameters()):
110
+ torch.nn.init.normal_(p, 0, 0.2)
111
+ print(name, p.shape, p.dtype, f'{p.numel() / n_params * 100: .2f}%')
112
+
113
+ # expert bias is in float32
114
+ for i in range(config.first_k_dense_replace, config.num_hidden_layers, 1):
115
+ model.model.layers[i].mlp.e_score_correction_bias = nn.Parameter(torch.randn_like(
116
+ model.model.layers[i].mlp.e_score_correction_bias
117
+ ).float() * 0.002)
118
+
119
+ model.save_pretrained(save_folder)
120
+ print(model)
121
+ ```
122
+
123
+ </details>
124
+
125
+ ### Printing the model:
126
+
127
+ <details><summary>Click to expand</summary>
128
+
129
+ ```text
130
+ HYV3ForCausalLM(
131
+ (model): HYV3Model(
132
+ (embed_tokens): Embedding(120832, 8, padding_idx=120002)
133
+ (layers): ModuleList(
134
+ (0): HYV3DecoderLayer(
135
+ (self_attn): HYV3Attention(
136
+ (q_proj): Linear(in_features=8, out_features=256, bias=False)
137
+ (k_proj): Linear(in_features=8, out_features=128, bias=False)
138
+ (v_proj): Linear(in_features=8, out_features=128, bias=False)
139
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
140
+ (q_norm): HYV3RMSNorm((32,), eps=1e-05)
141
+ (k_norm): HYV3RMSNorm((32,), eps=1e-05)
142
+ )
143
+ (mlp): HYV3MLP(
144
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
145
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
146
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
147
+ (act_fn): SiLUActivation()
148
+ )
149
+ (input_layernorm): HYV3RMSNorm((8,), eps=1e-05)
150
+ (post_attention_layernorm): HYV3RMSNorm((8,), eps=1e-05)
151
+ )
152
+ (1-3): 3 x HYV3DecoderLayer(
153
+ (self_attn): HYV3Attention(
154
+ (q_proj): Linear(in_features=8, out_features=256, bias=False)
155
+ (k_proj): Linear(in_features=8, out_features=128, bias=False)
156
+ (v_proj): Linear(in_features=8, out_features=128, bias=False)
157
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
158
+ (q_norm): HYV3RMSNorm((32,), eps=1e-05)
159
+ (k_norm): HYV3RMSNorm((32,), eps=1e-05)
160
+ )
161
+ (mlp): HYV3MoE(
162
+ (gate): HYV3TopKRouter()
163
+ (experts): HYV3Experts(
164
+ (act_fn): SiLUActivation()
165
+ )
166
+ (shared_experts): HYV3MLP(
167
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
168
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
169
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
170
+ (act_fn): SiLUActivation()
171
+ )
172
+ )
173
+ (input_layernorm): HYV3RMSNorm((8,), eps=1e-05)
174
+ (post_attention_layernorm): HYV3RMSNorm((8,), eps=1e-05)
175
+ )
176
+ (4): HYV3DecoderLayer(
177
+ (self_attn): HYV3Attention(
178
+ (q_proj): Linear(in_features=8, out_features=256, bias=False)
179
+ (k_proj): Linear(in_features=8, out_features=128, bias=False)
180
+ (v_proj): Linear(in_features=8, out_features=128, bias=False)
181
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
182
+ (q_norm): HYV3RMSNorm((32,), eps=1e-05)
183
+ (k_norm): HYV3RMSNorm((32,), eps=1e-05)
184
+ )
185
+ (mlp): HYV3MoE(
186
+ (gate): HYV3TopKRouter()
187
+ (experts): HYV3Experts(
188
+ (act_fn): SiLUActivation()
189
+ )
190
+ (shared_experts): HYV3MLP(
191
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
192
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
193
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
194
+ (act_fn): SiLUActivation()
195
+ )
196
+ )
197
+ (input_layernorm): HYV3RMSNorm((8,), eps=1e-05)
198
+ (post_attention_layernorm): HYV3RMSNorm((8,), eps=1e-05)
199
+ (eh_proj): Linear(in_features=16, out_features=8, bias=False)
200
+ (enorm): RMSNorm((8,), eps=None, elementwise_affine=True)
201
+ (hnorm): RMSNorm((8,), eps=None, elementwise_affine=True)
202
+ (final_layernorm): RMSNorm((8,), eps=None, elementwise_affine=True)
203
+ )
204
+ )
205
+ (norm): HYV3RMSNorm((8,), eps=1e-05)
206
+ (rotary_emb): HYV3RotaryEmbedding()
207
+ )
208
+ (lm_head): Linear(in_features=8, out_features=120832, bias=False)
209
+ )
210
+ ```
211
+
212
+ </details>
213
+
214
+ ### Test environment:
215
+
216
+ - torch: 2.11.0+cu126
217
+ - transformers: 5.7.0.dev0
chat_template.jinja ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {#- ----------‑‑‑ special token variables ‑‑‑---------- -#}
2
+ {%- set bos_token = '<|hy_begin▁of▁sentence|>' %}
3
+ {%- set pad_token = '<|hy_▁pad▁|>' %}
4
+ {%- set user_token = '<|hy_User|>' %}
5
+ {%- set assistant_token = '<|hy_Assistant|>' %}
6
+ {%- set eos_token = '<|hy_eos|>' %}
7
+ {%- set think_begin_token = '<think>' %}
8
+ {%- set think_end_token = '</think>' %}
9
+ {%- set toolcalls_begin_token = '<tool_calls>' %}
10
+ {%- set toolcalls_end_token = '</tool_calls>' %}
11
+ {%- set toolcall_begin_token = '<tool_call>' %}
12
+ {%- set toolcall_end_token = '</tool_call>' %}
13
+ {%- set toolsep_token = '<tool_sep>' %}
14
+ {%- set argkey_begin_token = '<arg_key>' %}
15
+ {%- set argkey_end_token = '</arg_key>' %}
16
+ {%- set argvalue_begin_token = '<arg_value>' %}
17
+ {%- set argvalue_end_token = '</arg_value>' %}
18
+ {%- set toolresponses_begin_token = '<tool_responses>' %}
19
+ {%- set toolresponses_end_token = '</tool_responses>' %}
20
+ {%- set toolresponse_begin_token = '<tool_response>' %}
21
+ {%- set toolresponse_end_token = '</tool_response>' %}
22
+ {%- set reasoning_mode_token = '<|reasoning_mode|>' %}
23
+ {#- ----------‑‑‑ hyperparameters variables ‑‑‑---------- -#}
24
+ {%- if not add_generation_prompt is defined %}
25
+ {%- set add_generation_prompt = false %}
26
+ {%- endif %}
27
+ {%- if not interleaved_thinking is defined %}
28
+ {%- set interleaved_thinking = false %}
29
+ {%- endif %}
30
+ {%- if not tools %}
31
+ {%- set interleaved_thinking = false %}
32
+ {%- endif %}
33
+ {%- if not is_training is defined %}
34
+ {%- set is_training = false %}
35
+ {%- endif %}
36
+ {%- if not reasoning_effort is defined or reasoning_effort not in ['high', 'low', 'no_think'] %}
37
+ {%- set reasoning_effort = 'no_think' %}
38
+ {%- endif %}
39
+
40
+ {%- macro visible_text(content) -%}
41
+ {%- if content is string -%}
42
+ {{- content }}
43
+ {%- elif content is iterable and content is not mapping -%}
44
+ {%- for item in content -%}
45
+ {%- if item is mapping and item.type == 'text' -%}
46
+ {{- item.text }}
47
+ {%- elif item is string -%}
48
+ {{- item }}
49
+ {%- endif -%}
50
+ {%- endfor -%}
51
+ {%- elif content is none -%}
52
+ {{- '' }}
53
+ {%- else -%}
54
+ {{- content }}
55
+ {%- endif -%}
56
+ {%- endmacro -%}
57
+
58
+ {%- set ns = namespace(last_user_index=-1) %}
59
+ {%- set sp_ns = namespace(system_prompt='', is_first_sp=true) %}
60
+ {%- for message in messages %}
61
+ {%- if message['role'] == 'system' %}
62
+ {%- set sp_ns.system_prompt = sp_ns.system_prompt + visible_text(message['content']) %}
63
+ {%- endif %}
64
+ {%- if message['role'] == 'user' %}
65
+ {%- set ns.last_user_index = loop.index0 %}
66
+ {%- endif %}
67
+ {%- endfor %}
68
+ {%- if reasoning_effort is defined and reasoning_effort is string and reasoning_effort != '' and not tools %}
69
+ {%- set sp_ns.system_prompt = sp_ns.system_prompt + reasoning_mode_token + 'reasoning_effort:' + reasoning_effort %}
70
+ {%- endif %}
71
+ {{- bos_token }}
72
+ {{- sp_ns.system_prompt }}
73
+ {%- if tools %}
74
+ {%- if sp_ns.system_prompt != '' %}
75
+ {{- '\n\n# Tools\n\nYou may call one or more functions to assist with the user query.' }}
76
+ {%- else %}
77
+ {{- '# Tools\n\nYou may call one or more functions to assist with the user query.' }}
78
+ {%- endif %}
79
+ {{- '\n\nYou are provided with function signatures within <tools></tools> XML tags:' }}
80
+ {{- '\n<tools>\n' }}
81
+ {%- for tool in tools %}
82
+ {%- if loop.index0 > 0 %}
83
+ {{- '\n' }}
84
+ {%- endif %}
85
+ {{- tool | tojson }}
86
+ {%- endfor %}
87
+ {{- '\n</tools>\n\n' }}
88
+ {{- 'For function call returns, you should first print ' + toolcalls_begin_token + '\n' }}
89
+ {{- 'For each function call, you should return object like:\n' }}
90
+ {{- toolcall_begin_token + '{function-name}' + toolsep_token + '\n' }}
91
+ {{- argkey_begin_token + '{arg-key-1}' + argkey_end_token + '\n' }}
92
+ {{- argvalue_begin_token + '{arg-value-1}' + argvalue_end_token + '\n' }}
93
+ {{- argkey_begin_token + '{arg-key-2}' + argkey_end_token + '\n' }}
94
+ {{- argvalue_begin_token + '{arg-value-2}' + argvalue_end_token + '\n' }}
95
+ {{- '...\n' }}
96
+ {{- toolcall_end_token + '\n' }}
97
+ {%- if reasoning_effort is defined and reasoning_effort is string and reasoning_effort != '' %}
98
+ {{- 'At the end of function call returns, you should print ' + toolcalls_end_token + reasoning_mode_token + 'reasoning_effort:' + reasoning_effort }}
99
+ {%- else %}
100
+ {{- 'At the end of function call returns, you should print ' + toolcalls_end_token }}
101
+ {%- endif %}
102
+ {%- endif %}
103
+
104
+ {%- set prev_ns = namespace(is_tool=false, is_tool_first=true) %}
105
+ {%- set last_ns = namespace(last_is_assistant=false) %}
106
+ {%- for message in messages %}
107
+ {%- if message['role'] == 'user' %}
108
+ {%- if prev_ns.is_tool %}
109
+ {{- toolresponses_end_token }}
110
+ {%- endif %}
111
+ {{- user_token + visible_text(message['content']) }}
112
+ {%- set prev_ns.is_tool = false %}
113
+ {%- endif %}
114
+ {%- if message['role'] == 'assistant' %}
115
+ {%- if 'reasoning_content' in message and message['reasoning_content'] is string %}
116
+ {%- set rc = message['reasoning_content'] %}
117
+ {%- elif 'reasoning' in message and message['reasoning'] is string %}
118
+ {%- set rc = message['reasoning'] %}
119
+ {%- else %}
120
+ {%- set rc = none %}
121
+ {%- endif %}
122
+ {%- if is_training %}
123
+ {%- if rc is not none %}
124
+ {%- set content = think_begin_token + rc + think_end_token + visible_text(message['content']) %}
125
+ {%- else %}
126
+ {%- set content = think_begin_token + think_end_token + visible_text(message['content']) %}
127
+ {%- endif %}
128
+ {%- else %}
129
+ {%- if interleaved_thinking %}
130
+ {%- if loop.index0 > ns.last_user_index and rc is not none %}
131
+ {%- set content = think_begin_token + rc + think_end_token + visible_text(message['content']) %}
132
+ {%- else %}
133
+ {%- set content = think_begin_token + think_end_token + visible_text(message['content']) %}
134
+ {%- endif %}
135
+ {%- else %}
136
+ {%- set content = think_begin_token + think_end_token + visible_text(message['content']) %}
137
+ {%- endif %}
138
+ {%- endif %}
139
+ {%- if prev_ns.is_tool %}
140
+ {{- toolresponses_end_token }}
141
+ {%- endif %}
142
+ {{- assistant_token }}
143
+ {%- if message['tool_calls'] is defined and message['tool_calls'] %}
144
+ {%- set prev_ns.is_tool_first = true %}
145
+ {{- content }}
146
+ {{- toolcalls_begin_token + '\n' }}
147
+ {%- for tool in message['tool_calls'] %}
148
+ {%- set arguments = tool['function']['arguments'] %}
149
+ {{- toolcall_begin_token + tool['function']['name'] + toolsep_token + '\n' }}
150
+ {%- for key, value in arguments.items() %}
151
+ {{- argkey_begin_token + key + argkey_end_token + '\n' }}
152
+ {%- if value is not string %}
153
+ {%- set value = value | tojson(ensure_ascii=False) %}
154
+ {%- endif %}
155
+ {{- argvalue_begin_token + value + argvalue_end_token + '\n' }}
156
+ {%- endfor %}
157
+ {{- toolcall_end_token + '\n' }}
158
+ {%- endfor %}
159
+ {{- toolcalls_end_token + eos_token }}
160
+ {%- else %}
161
+ {%- if not loop.last or is_training %}
162
+ {{- content + eos_token }}
163
+ {%- else %}
164
+ {{- content }}
165
+ {%- endif %}
166
+ {%- endif %}
167
+ {%- set prev_ns.is_tool = false %}
168
+ {%- endif %}
169
+ {%- if message['role'] == 'tool' %}
170
+ {%- set prev_ns.is_tool = true %}
171
+ {%- if prev_ns.is_tool_first %}
172
+ {{- toolresponses_begin_token + '\n' }}
173
+ {%- set prev_ns.is_tool_first = false %}
174
+ {%- endif %}
175
+ {{- toolresponse_begin_token + '\n' + visible_text(message['content']) + '\n' + toolresponse_end_token + '\n' }}
176
+ {%- endif %}
177
+ {%- if loop.last and message['role'] == 'assistant' %}
178
+ {%- set last_ns.last_is_assistant = true %}
179
+ {%- endif %}
180
+
181
+ {%- endfor %}
182
+ {%- if prev_ns.is_tool %}
183
+ {{- toolresponses_end_token }}
184
+ {%- endif %}
185
+ {%- if add_generation_prompt %}
186
+ {%- if not last_ns.last_is_assistant %}
187
+ {%- if reasoning_effort is defined and reasoning_effort in ['low', 'high'] %}
188
+ {{- assistant_token + think_begin_token }}
189
+ {%- elif reasoning_effort is defined and reasoning_effort == 'no_think' %}
190
+ {{- assistant_token + think_begin_token + think_end_token }}
191
+ {%- else %}
192
+ {{- assistant_token }}
193
+ {%- endif %}
194
+ {%- endif %}
195
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "HYV3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 120000,
8
+ "dtype": "bfloat16",
9
+ "enable_attention_fp32_softmax": false,
10
+ "enable_lm_head_fp32": true,
11
+ "enable_moe_fp32_combine": false,
12
+ "eod_token_id": 120026,
13
+ "eos_token_id": 120025,
14
+ "expert_hidden_dim": 32,
15
+ "first_k_dense_replace": 1,
16
+ "head_dim": 32,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 8,
19
+ "initializer_range": 0.006,
20
+ "intermediate_size": 32,
21
+ "max_position_embeddings": 262144,
22
+ "mlp_bias": false,
23
+ "mlp_layer_types": [
24
+ "dense",
25
+ "sparse",
26
+ "sparse",
27
+ "sparse"
28
+ ],
29
+ "model_type": "hy_v3",
30
+ "moe_intermediate_size": 32,
31
+ "moe_router_enable_expert_bias": true,
32
+ "moe_router_use_sigmoid": true,
33
+ "num_attention_heads": 8,
34
+ "num_experts": 192,
35
+ "num_experts_per_tok": 8,
36
+ "num_hidden_layers": 4,
37
+ "num_key_value_heads": 4,
38
+ "num_nextn_predict_layers": 1,
39
+ "num_shared_experts": 1,
40
+ "output_router_logits": true,
41
+ "pad_token_id": 120002,
42
+ "qk_norm": true,
43
+ "rms_norm_eps": 1e-05,
44
+ "rope_parameters": {
45
+ "rope_theta": 11158840.0,
46
+ "rope_type": "default"
47
+ },
48
+ "route_norm": true,
49
+ "router_scaling_factor": 2.826,
50
+ "sep_token_id": 120007,
51
+ "tie_word_embeddings": false,
52
+ "transformers_version": "5.7.0.dev0",
53
+ "use_cache": true,
54
+ "use_grouped_mm": false,
55
+ "vocab_size": 120832
56
+ }
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 120000,
3
+ "do_sample": true,
4
+ "eos_token_id": 120025,
5
+ "pad_token_id": 120002,
6
+ "temperature": 0.9,
7
+ "top_k": 40,
8
+ "top_p": 1,
9
+ "transformers_version": "5.7.0.dev0",
10
+ "trust_remote_code": true
11
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30b39e65fb3d102afe15a892641b41b7ff8eb4a5f01f5a199dfb0bda397c5401
3
+ size 5401256
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|hy_begin▁of▁sentence|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|hy_eos|>",
6
+ "is_local": false,
7
+ "local_files_only": false,
8
+ "model_max_length": 1000000000000000019884624838656,
9
+ "pad_token": "<|hy_▁pad▁|>",
10
+ "tokenizer_class": "TokenizersBackend"
11
+ }