from typing import List, Optional

from packaging import version

import transformers

if version.parse(transformers.__version__) < version.parse("4.31.0"):
    raise ImportError(
        f"You are using transformers=={transformers.__version__}, "
        "but transformers>=4.31.0 is required to use DeciLM. Please upgrade transformers."
    )

from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DeciLMConfig(LlamaConfig):
| r""" |
| |
| Args: |
| num_key_value_heads_per_layer (`List[int]`): |
| The number of key-value heads per layer. |
| naive_attention_prefill (`bool`, *optional*, defaults to False): |
| Whether to use naive matmul or scaled dot product attention during prefill. |
| naive_attention_decode_batched (`bool`, *optional*, defaults to True): |
| Whether to use naive matmul or scaled dot product attention during decode for batch_size > 1. |
| naive_attention_decode_single (`bool`, *optional*, defaults to False): |
| Whether to use naive matmul or scaled dot product attention during decode for batch_size == 1. |
| |
| |
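
    Example (a minimal sketch with illustrative values; the 4-layer shape and the
    per-layer head counts below are hypothetical, not those of a released checkpoint):

    ```python
    >>> config = DeciLMConfig(
    ...     num_hidden_layers=4,
    ...     num_attention_heads=32,
    ...     num_key_value_heads_per_layer=[4, 2, 2, 8],
    ... )
    >>> config.num_key_value_heads_per_layer
    [4, 2, 2, 8]
    ```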
    """
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        num_key_value_heads_per_layer: Optional[List[int]] = None,
        naive_attention_prefill: bool = False,
        naive_attention_decode_batched: bool = False,
        naive_attention_decode_single: bool = False,
        **kwargs,
    ):
        # Store the DeciLM-specific attention settings, then delegate the remaining
        # Llama configuration arguments to the parent class.
        self.num_key_value_heads_per_layer = num_key_value_heads_per_layer
        self.naive_attention_prefill = naive_attention_prefill
        self.naive_attention_decode_batched = naive_attention_decode_batched
        self.naive_attention_decode_single = naive_attention_decode_single
        super().__init__(**kwargs)
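

if __name__ == "__main__":
    # Minimal smoke test with illustrative, hypothetical values: build a 4-layer
    # config and round-trip it through the dict serialization that
    # `save_pretrained`/`from_pretrained` rely on.
    cfg = DeciLMConfig(
        num_hidden_layers=4,
        num_attention_heads=32,
        num_key_value_heads_per_layer=[4, 2, 2, 8],
        naive_attention_decode_batched=True,
    )
    restored = DeciLMConfig.from_dict(cfg.to_dict())
    assert restored.num_key_value_heads_per_layer == [4, 2, 2, 8]
    assert restored.naive_attention_decode_batched is True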