| """Tokenization classes for Arctic.""" |
|
|
| from typing import Any, Dict, Optional |
|
|
| from transformers.models.llama import LlamaTokenizer |
|
|
|
|
class ArcticTokenizer(LlamaTokenizer):
    """Tokenizer for Arctic models.

    Thin wrapper around :class:`~transformers.models.llama.LlamaTokenizer`
    that only adjusts a few constructor defaults (``legacy=False``,
    ``add_prefix_space=True``, ``clean_up_tokenization_spaces=False``) and
    supplies a ChatML-style default chat template.
    """

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        legacy=False,
        add_prefix_space=True,
        **kwargs,
    ):
        """Forward every argument verbatim to ``LlamaTokenizer.__init__``.

        The defaults above are the only thing that differs from the
        parent class; no behavior is added here.
        """
        forwarded = dict(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            sp_model_kwargs=sp_model_kwargs,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            legacy=legacy,
            add_prefix_space=add_prefix_space,
        )
        forwarded.update(kwargs)
        super().__init__(vocab_file, **forwarded)

    @property
    def default_chat_template(self):
        """Return the ChatML-style Jinja template used by Arctic.

        Each message is wrapped in ``<|im_start|>{role}`` / ``<|im_end|>``
        markers; when ``add_generation_prompt`` is set, an opening
        assistant marker is appended so generation continues as the
        assistant.
        """
        template = (
            "{% for message in messages %}"
            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            "{{ '<|im_start|>assistant\n' }}"
            "{% endif %}"
        )
        return template
|
|