| """ |
| OLMo configuration |
| """ |
|
|
| from transformers import AutoConfig, PretrainedConfig |
| from transformers.utils import logging |
|
|
| from .config import ModelConfig |
| from .aliases import PathOrStr |
| from .beam_search import Sampler |
| from .exceptions import OLMoError |
| from .initialization import ModuleType |
| from .optim import Optimizer |
| from .util import StrEnum |
| from .safetensors_util import STKey |
| from .torch_util import seed_all |
|
|
| logger = logging.get_logger(__name__) |
|
|
|
|
| class OLMoConfig(PretrainedConfig): |
| model_type = "olmo" |
| keys_to_ignore_at_inference = ["past_key_values"] |
|
|
| def __init__(self, use_cache: bool = False, **kwargs): |
| model_config = ModelConfig() |
| all_kwargs = model_config.asdict() |
| all_kwargs.update(kwargs) |
| all_kwargs.update({"use_cache": use_cache}) |
| all_kwargs.update( |
| { |
| "architectures": all_kwargs.get("architectures", ["OLMoModelForCausalLM"]) |
| or ["OLMoModelForCausalLM"] |
| } |
| ) |
| super().__init__(**all_kwargs) |
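
    # Note on the `architectures` guard above: the `or` fallback matters because
    # an explicit `architectures=None` in kwargs slips past `.get()`'s default
    # and would otherwise survive as None.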

    @property
    def num_attention_heads(self):
        return self.n_heads

    @property
    def num_hidden_layers(self):
        return self.n_layers

    @property
    def hidden_size(self):
        return self.d_model
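

# A minimal usage sketch (hypothetical values): the properties above expose
# OLMo's native field names (`n_heads`, `n_layers`, `d_model`, which come from
# `ModelConfig`) under the attribute names the Hugging Face ecosystem expects.
#
#     config = OLMoConfig(n_heads=16, n_layers=24, d_model=2048)
#     assert config.num_attention_heads == config.n_heads == 16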
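

# `AutoConfig` is imported above but otherwise unused, which suggests the module
# is meant to end by registering this config for transformers auto-loading; a
# minimal sketch of the conventional registration call, keyed by `model_type`:
AutoConfig.register("olmo", OLMoConfig)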