tokenizer_config.json · michelinolinolino/gemma4-4b-sci at main

gemma4-4b-sci / tokenizer_config.json

michelino

Upload folder using huggingface_hub

e9ebb45 verified 6 days ago

4.27 kB

	{
	"audio_token": "<\|audio\|>",
	"backend": "tokenizers",
	"boa_token": "<\|audio>",
	"boi_token": "<\|image>",
	"bos_token": "<bos>",
	"eoa_token": "<audio\|>",
	"eoc_token": "<channel\|>",
	"eoi_token": "<image\|>",
	"eos_token": "<turn\|>",
	"eot_token": "<turn\|>",
	"escape_token": "<\|\"\|>",
	"etc_token": "<tool_call\|>",
	"etd_token": "<tool\|>",
	"etr_token": "<tool_response\|>",
	"extra_special_tokens": [
	"<\|video\|>"
	],
	"image_token": "<\|image\|>",
	"is_local": false,
	"mask_token": "<mask>",
	"model_max_length": 131072,
	"model_specific_special_tokens": {
	"audio_token": "<\|audio\|>",
	"boa_token": "<\|audio>",
	"boi_token": "<\|image>",
	"eoa_token": "<audio\|>",
	"eoc_token": "<channel\|>",
	"eoi_token": "<image\|>",
	"eot_token": "<turn\|>",
	"escape_token": "<\|\"\|>",
	"etc_token": "<tool_call\|>",
	"etd_token": "<tool\|>",
	"etr_token": "<tool_response\|>",
	"image_token": "<\|image\|>",
	"soc_token": "<\|channel>",
	"sot_token": "<\|turn>",
	"stc_token": "<\|tool_call>",
	"std_token": "<\|tool>",
	"str_token": "<\|tool_response>",
	"think_token": "<\|think\|>"
	},
	"pad_token": "<pad>",
	"padding_side": "right",
	"processor_class": "Gemma4Processor",
	"response_schema": {
	"properties": {
	"content": {
	"type": "string"
	},
	"role": {
	"const": "assistant"
	},
	"thinking": {
	"type": "string"
	},
	"tool_calls": {
	"items": {
	"properties": {
	"function": {
	"properties": {
	"arguments": {
	"additionalProperties": {},
	"type": "object",
	"x-parser": "gemma4-tool-call"
	},
	"name": {
	"type": "string"
	}
	},
	"type": "object",
	"x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
	},
	"type": {
	"const": "function"
	}
	},
	"type": "object"
	},
	"type": "array",
	"x-regex-iterator": "<\\\|tool_call>(.*?)<tool_call\\\|>"
	}
	},
	"type": "object",
	"x-regex": "(\\<\\\|channel\\>thought\\n(?P<thinking>.?)\\<channel\\\|\\>)?(?P<tool_calls>\\<\\\|tool_call\\>.\\<tool_call\\\|\\>)?(?P<content>(?:(?!\\<turn\\\|\\>)(?!\\<\\\|tool_response\\>).)+)?(?:\\<turn\\\|\\>\|\\<\\\|tool_response\\>)?"
	},
	"soc_token": "<\|channel>",
	"sot_token": "<\|turn>",
	"stc_token": "<\|tool_call>",
	"std_token": "<\|tool>",
	"str_token": "<\|tool_response>",
	"think_token": "<\|think\|>",
	"tokenizer_class": "GemmaTokenizer",
	"unk_token": "<unk>",
	"chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<\|turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] \| trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'audio' -%}\n {{ '<\|audio\|>' }}\n {%- elif item['type'] == 'image' -%}\n {{ '<\|image\|>' }}\n {%- elif item['type'] == 'video' -%}\n {{ '<\|video\|>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] \| trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<turn\|>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<\|turn>model\n'}}\n{%- endif -%}\n"
	}