HemanthSai7 committed on
Commit
094f4e0
·
verified ·
1 Parent(s): 3749884

Fix configuration_nandi.py: use `__init__` instead of `@strict`/`__post_init__` for compatibility with released versions of transformers

Browse files
Files changed (1) hide show
  1. configuration_nandi.py +67 -58
configuration_nandi.py CHANGED
@@ -1,10 +1,4 @@
1
- # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
- # This file was automatically generated from src/transformers/models/nandi/modular_nandi.py.
3
- # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
- # the file from the modular. If any change should be done, please apply the change to the
5
- # modular_nandi.py file directly. One of our CI enforces this.
6
- # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
- # Copyright 2026 The HuggingFace Inc. team. All rights reserved.
8
  #
9
  # Licensed under the Apache License, Version 2.0 (the "License");
10
  # you may not use this file except in compliance with the License.
@@ -18,29 +12,25 @@
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
 
21
- from huggingface_hub.dataclasses import strict
22
-
23
  from transformers.configuration_utils import PretrainedConfig
24
- from transformers.modeling_rope_utils import RopeParameters
25
 
26
 
27
- @strict(accept_kwargs=True)
28
  class NandiConfig(PretrainedConfig):
29
  r"""
 
 
30
  Example:
31
 
32
  ```python
33
- >>> from transformers import NandiConfig, NandiForCausalLM
34
 
35
- >>> # Initializing a Nandi style configuration
36
- >>> configuration = NandiConfig()
37
 
38
- >>> # Initializing a model from the Nandi style configuration
39
- >>> model = NandiForCausalLM(configuration)
40
 
41
- >>> # Accessing the model configuration
42
  >>> configuration = model.config
43
- ```"""
 
44
 
45
  model_type = "nandi"
46
  keys_to_ignore_at_inference = ["past_key_values"]
@@ -55,44 +45,56 @@ class NandiConfig(PretrainedConfig):
55
  "layers.*.mlp.down_proj": "rowwise",
56
  }
57
 
58
- # Defaults from the provided Nanotron training config.
59
- vocab_size: int = 131072
60
- hidden_size: int = 832
61
- intermediate_size: int = 2496
62
- num_hidden_layers: int = 16
63
- num_attention_heads: int = 16
64
- num_key_value_heads: int | None = 4
65
- head_dim: int | None = None
66
- hidden_act: str = "silu"
67
- max_position_embeddings: int = 2048
68
- initializer_range: float = 0.008
69
- rms_norm_eps: float = 1e-5
70
- use_cache: bool = True
71
- pad_token_id: int | None = None
72
- bos_token_id: int | None = 1
73
- eos_token_id: int | list[int] | None = 0
74
- pretraining_tp: int | None = 1
75
- tie_word_embeddings: bool = True
76
- rope_parameters: RopeParameters | dict | None = None
77
- attention_bias: bool = False
78
- attention_dropout: float = 0.0
79
- mlp_bias: bool = False
80
-
81
- # Nandi-specific options.
82
- factorized_embedding: bool = True
83
- embedding_rank: int = 196
84
- layer_sharing: bool = True
85
- layer_sharing_repeats: int = 2
86
-
87
- def __post_init__(self, **kwargs):
88
- if self.num_key_value_heads is None:
89
- self.num_key_value_heads = self.num_attention_heads
90
- if self.head_dim is None:
91
- self.head_dim = self.hidden_size // self.num_attention_heads
92
- if self.rope_parameters is None:
93
- self.rope_parameters = {"rope_theta": 100000.0}
94
- if not self.layer_sharing:
95
- self.layer_sharing_repeats = 1
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  if self.factorized_embedding and self.embedding_rank <= 0:
98
  raise ValueError(
@@ -100,12 +102,19 @@ class NandiConfig(PretrainedConfig):
100
  )
101
  if self.hidden_size % self.num_attention_heads != 0:
102
  raise ValueError(
103
- f"`hidden_size` ({self.hidden_size}) must be divisible by `num_attention_heads` ({self.num_attention_heads})."
 
104
  )
105
  if self.layer_sharing_repeats < 1:
106
  raise ValueError(f"`layer_sharing_repeats` must be >= 1, got {self.layer_sharing_repeats}.")
107
 
108
- super().__post_init__(**kwargs)
 
 
 
 
 
 
109
 
110
 
111
  __all__ = ["NandiConfig"]
 
1
+ # Copyright 2026 RTA AI Labs. All rights reserved.
 
 
 
 
 
 
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
  # you may not use this file except in compliance with the License.
 
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
 
 
 
15
  from transformers.configuration_utils import PretrainedConfig
 
16
 
17
 
 
18
  class NandiConfig(PretrainedConfig):
19
  r"""
20
+ Configuration class for the Nandi model.
21
+
22
  Example:
23
 
24
  ```python
25
+ >>> from transformers import AutoConfig, AutoModelForCausalLM
26
 
27
+ >>> configuration = AutoConfig.from_pretrained("Rta-AILabs/Nandi-150M-remote", trust_remote_code=True)
 
28
 
29
+ >>> model = AutoModelForCausalLM.from_pretrained("Rta-AILabs/Nandi-150M-remote", trust_remote_code=True)
 
30
 
 
31
  >>> configuration = model.config
32
+ ```
33
+ """
34
 
35
  model_type = "nandi"
36
  keys_to_ignore_at_inference = ["past_key_values"]
 
45
  "layers.*.mlp.down_proj": "rowwise",
46
  }
47
 
48
+ def __init__(
49
+ self,
50
+ vocab_size=131072,
51
+ hidden_size=832,
52
+ intermediate_size=2496,
53
+ num_hidden_layers=16,
54
+ num_attention_heads=16,
55
+ num_key_value_heads=4,
56
+ head_dim=None,
57
+ hidden_act="silu",
58
+ max_position_embeddings=2048,
59
+ initializer_range=0.008,
60
+ rms_norm_eps=1e-5,
61
+ use_cache=True,
62
+ pad_token_id=None,
63
+ bos_token_id=1,
64
+ eos_token_id=0,
65
+ pretraining_tp=1,
66
+ tie_word_embeddings=True,
67
+ rope_parameters=None,
68
+ attention_bias=False,
69
+ attention_dropout=0.0,
70
+ mlp_bias=False,
71
+ factorized_embedding=True,
72
+ embedding_rank=196,
73
+ layer_sharing=True,
74
+ layer_sharing_repeats=2,
75
+ **kwargs,
76
+ ):
77
+ self.vocab_size = vocab_size
78
+ self.hidden_size = hidden_size
79
+ self.intermediate_size = intermediate_size
80
+ self.num_hidden_layers = num_hidden_layers
81
+ self.num_attention_heads = num_attention_heads
82
+ self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads
83
+ self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads
84
+ self.hidden_act = hidden_act
85
+ self.max_position_embeddings = max_position_embeddings
86
+ self.initializer_range = initializer_range
87
+ self.rms_norm_eps = rms_norm_eps
88
+ self.use_cache = use_cache
89
+ self.pretraining_tp = pretraining_tp
90
+ self.rope_parameters = rope_parameters if rope_parameters is not None else {"rope_theta": 100000.0}
91
+ self.attention_bias = attention_bias
92
+ self.attention_dropout = attention_dropout
93
+ self.mlp_bias = mlp_bias
94
+ self.factorized_embedding = factorized_embedding
95
+ self.embedding_rank = embedding_rank
96
+ self.layer_sharing = layer_sharing
97
+ self.layer_sharing_repeats = layer_sharing_repeats if layer_sharing else 1
98
 
99
  if self.factorized_embedding and self.embedding_rank <= 0:
100
  raise ValueError(
 
102
  )
103
  if self.hidden_size % self.num_attention_heads != 0:
104
  raise ValueError(
105
+ f"`hidden_size` ({self.hidden_size}) must be divisible by "
106
+ f"`num_attention_heads` ({self.num_attention_heads})."
107
  )
108
  if self.layer_sharing_repeats < 1:
109
  raise ValueError(f"`layer_sharing_repeats` must be >= 1, got {self.layer_sharing_repeats}.")
110
 
111
+ super().__init__(
112
+ pad_token_id=pad_token_id,
113
+ bos_token_id=bos_token_id,
114
+ eos_token_id=eos_token_id,
115
+ tie_word_embeddings=tie_word_embeddings,
116
+ **kwargs,
117
+ )
118
 
119
 
120
  __all__ = ["NandiConfig"]