Sailesh Panda committed on
Commit ·
7c50ab0
1
Parent(s): 85af162
Experiments
Browse files- config.json +2 -0
- configuration_hinvec.py +1 -1
- modeling_hinvec.py +2 -3
config.json
CHANGED
|
@@ -33,6 +33,8 @@
|
|
| 33 |
"use_cache": true,
|
| 34 |
"use_sliding_window": true,
|
| 35 |
"vocab_size": 256002,
|
|
|
|
|
|
|
| 36 |
"auto_map": {
|
| 37 |
"AutoConfig": "Sailesh97/Hinvec2--configuration_hinvec.HinvecConfig",
|
| 38 |
"AutoModel": "Sailesh97/Hinvec2--modeling_hinvec.HinvecModel"}
|
|
|
|
| 33 |
"use_cache": true,
|
| 34 |
"use_sliding_window": true,
|
| 35 |
"vocab_size": 256002,
|
| 36 |
+
"pad_token_id": 3,
|
| 37 |
+
"bos_token_id": 256000,
|
| 38 |
"auto_map": {
|
| 39 |
"AutoConfig": "Sailesh97/Hinvec2--configuration_hinvec.HinvecConfig",
|
| 40 |
"AutoModel": "Sailesh97/Hinvec2--modeling_hinvec.HinvecModel"}
|
configuration_hinvec.py
CHANGED
|
@@ -151,7 +151,7 @@ class HinvecConfig(PretrainedConfig):
|
|
| 151 |
rope_theta=10000.0,
|
| 152 |
rope_scaling=None,
|
| 153 |
use_sliding_window=False,
|
| 154 |
-
sliding_window=
|
| 155 |
attention_bias=False,
|
| 156 |
max_window_layers=28,
|
| 157 |
attention_dropout=0.0,
|
|
|
|
| 151 |
rope_theta=10000.0,
|
| 152 |
rope_scaling=None,
|
| 153 |
use_sliding_window=False,
|
| 154 |
+
sliding_window=512,
|
| 155 |
attention_bias=False,
|
| 156 |
max_window_layers=28,
|
| 157 |
attention_dropout=0.0,
|
modeling_hinvec.py
CHANGED
|
@@ -997,7 +997,6 @@ class HinvecModel(HinvecPreTrainedModel):
|
|
| 997 |
attention_mask: Optional[torch.Tensor] = None,
|
| 998 |
position_ids: Optional[torch.LongTensor] = None,
|
| 999 |
inputs_embeds: Optional[torch.FloatTensor] = None,
|
| 1000 |
-
cls_token_id: Optional[int] = 256000, # Add CLS token ID as parameter
|
| 1001 |
**kwargs: Unpack[TransformersKwargs],
|
| 1002 |
) -> BaseModelOutputWithPast:
|
| 1003 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
|
@@ -1021,9 +1020,9 @@ class HinvecModel(HinvecPreTrainedModel):
|
|
| 1021 |
global_attention_mask = torch.zeros((batch_size, seq_len), device=device, dtype=torch.long)
|
| 1022 |
|
| 1023 |
# If input_ids is available, find CLS tokens
|
| 1024 |
-
if input_ids is not None and cls_token_id is not None:
|
| 1025 |
# Mark all CLS token positions as global attention
|
| 1026 |
-
global_attention_mask = (input_ids == cls_token_id).long()
|
| 1027 |
else:
|
| 1028 |
# Default: assume first token is CLS (common convention)
|
| 1029 |
global_attention_mask[:, 0] = 1
|
|
|
|
| 997 |
attention_mask: Optional[torch.Tensor] = None,
|
| 998 |
position_ids: Optional[torch.LongTensor] = None,
|
| 999 |
inputs_embeds: Optional[torch.FloatTensor] = None,
|
|
|
|
| 1000 |
**kwargs: Unpack[TransformersKwargs],
|
| 1001 |
) -> BaseModelOutputWithPast:
|
| 1002 |
if (input_ids is None) ^ (inputs_embeds is not None):
|
|
|
|
| 1020 |
global_attention_mask = torch.zeros((batch_size, seq_len), device=device, dtype=torch.long)
|
| 1021 |
|
| 1022 |
# If input_ids is available, find CLS tokens
|
| 1023 |
+
if input_ids is not None and self.config.bos_token_id is not None:
|
| 1024 |
# Mark all CLS token positions as global attention
|
| 1025 |
+
global_attention_mask = (input_ids == self.config.bos_token_id).long()
|
| 1026 |
else:
|
| 1027 |
# Default: assume first token is CLS (common convention)
|
| 1028 |
global_attention_mask[:, 0] = 1
|