YongganFu committed on
Commit
804d4b4
·
verified ·
1 Parent(s): 39c8fdc

Sync tokenizer to the instruct variant (vocab + special tokens for chat template)

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +1 -1
  2. tokenizer.json +2 -2
  3. tokenizer_config.json +17 -17
special_tokens_map.json CHANGED
@@ -7,7 +7,7 @@
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "</s>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
 
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "<|im_end|>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3277c00fe5fb3963b3cb7c07b7f183722d2af4d775a4aea7cfb3684d7cccbc2f
3
- size 17078330
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:623c34567aebb18582765289fbe23d901c62704d6518d71866e0e58db892b5b7
3
+ size 17077484
tokenizer_config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "add_bos_token": true,
3
  "add_eos_token": false,
4
  "add_prefix_space": false,
5
  "added_tokens_decoder": {
@@ -84,7 +84,7 @@
84
  "special": true
85
  },
86
  "10": {
87
- "content": "<SPECIAL_10>",
88
  "lstrip": false,
89
  "normalized": false,
90
  "rstrip": false,
@@ -92,7 +92,7 @@
92
  "special": true
93
  },
94
  "11": {
95
- "content": "<SPECIAL_11>",
96
  "lstrip": false,
97
  "normalized": false,
98
  "rstrip": false,
@@ -100,52 +100,52 @@
100
  "special": true
101
  },
102
  "12": {
103
- "content": "<SPECIAL_12>",
104
  "lstrip": false,
105
  "normalized": false,
106
  "rstrip": false,
107
  "single_word": false,
108
- "special": true
109
  },
110
  "13": {
111
- "content": "<SPECIAL_13>",
112
  "lstrip": false,
113
  "normalized": false,
114
  "rstrip": false,
115
  "single_word": false,
116
- "special": true
117
  },
118
  "14": {
119
- "content": "<SPECIAL_14>",
120
  "lstrip": false,
121
  "normalized": false,
122
  "rstrip": false,
123
  "single_word": false,
124
- "special": true
125
  },
126
  "15": {
127
- "content": "<SPECIAL_15>",
128
  "lstrip": false,
129
  "normalized": false,
130
  "rstrip": false,
131
  "single_word": false,
132
- "special": true
133
  },
134
  "16": {
135
- "content": "<SPECIAL_16>",
136
  "lstrip": false,
137
  "normalized": false,
138
  "rstrip": false,
139
  "single_word": false,
140
- "special": true
141
  },
142
  "17": {
143
- "content": "<SPECIAL_17>",
144
  "lstrip": false,
145
  "normalized": false,
146
  "rstrip": false,
147
  "single_word": false,
148
- "special": true
149
  },
150
  "18": {
151
  "content": "<SPECIAL_18>",
@@ -8006,13 +8006,13 @@
8006
  },
8007
  "bos_token": "<s>",
8008
  "clean_up_tokenization_spaces": false,
8009
- "eos_token": "</s>",
8010
  "extra_special_tokens": {},
8011
  "model_input_names": [
8012
  "input_ids",
8013
  "attention_mask"
8014
  ],
8015
- "model_max_length": 8192,
8016
  "tokenizer_class": "PreTrainedTokenizerFast",
8017
  "unk_token": "<unk>"
8018
  }
 
1
  {
2
+ "add_bos_token": false,
3
  "add_eos_token": false,
4
  "add_prefix_space": false,
5
  "added_tokens_decoder": {
 
84
  "special": true
85
  },
86
  "10": {
87
+ "content": "<|im_start|>",
88
  "lstrip": false,
89
  "normalized": false,
90
  "rstrip": false,
 
92
  "special": true
93
  },
94
  "11": {
95
+ "content": "<|im_end|>",
96
  "lstrip": false,
97
  "normalized": false,
98
  "rstrip": false,
 
100
  "special": true
101
  },
102
  "12": {
103
+ "content": "<think>",
104
  "lstrip": false,
105
  "normalized": false,
106
  "rstrip": false,
107
  "single_word": false,
108
+ "special": false
109
  },
110
  "13": {
111
+ "content": "</think>",
112
  "lstrip": false,
113
  "normalized": false,
114
  "rstrip": false,
115
  "single_word": false,
116
+ "special": false
117
  },
118
  "14": {
119
+ "content": "<tool_call>",
120
  "lstrip": false,
121
  "normalized": false,
122
  "rstrip": false,
123
  "single_word": false,
124
+ "special": false
125
  },
126
  "15": {
127
+ "content": "</tool_call>",
128
  "lstrip": false,
129
  "normalized": false,
130
  "rstrip": false,
131
  "single_word": false,
132
+ "special": false
133
  },
134
  "16": {
135
+ "content": "<tool_response>",
136
  "lstrip": false,
137
  "normalized": false,
138
  "rstrip": false,
139
  "single_word": false,
140
+ "special": false
141
  },
142
  "17": {
143
+ "content": "</tool_response>",
144
  "lstrip": false,
145
  "normalized": false,
146
  "rstrip": false,
147
  "single_word": false,
148
+ "special": false
149
  },
150
  "18": {
151
  "content": "<SPECIAL_18>",
 
8006
  },
8007
  "bos_token": "<s>",
8008
  "clean_up_tokenization_spaces": false,
8009
+ "eos_token": "<|im_end|>",
8010
  "extra_special_tokens": {},
8011
  "model_input_names": [
8012
  "input_ids",
8013
  "attention_mask"
8014
  ],
8015
+ "model_max_length": 262144,
8016
  "tokenizer_class": "PreTrainedTokenizerFast",
8017
  "unk_token": "<unk>"
8018
  }