Sailesh Panda committed on
Commit
7c50ab0
·
1 Parent(s): 85af162

Experiments

Browse files
Files changed (3) hide show
  1. config.json +2 -0
  2. configuration_hinvec.py +1 -1
  3. modeling_hinvec.py +2 -3
config.json CHANGED
@@ -33,6 +33,8 @@
33
  "use_cache": true,
34
  "use_sliding_window": true,
35
  "vocab_size": 256002,
 
 
36
  "auto_map": {
37
  "AutoConfig": "Sailesh97/Hinvec2--configuration_hinvec.HinvecConfig",
38
  "AutoModel": "Sailesh97/Hinvec2--modeling_hinvec.HinvecModel"}
 
33
  "use_cache": true,
34
  "use_sliding_window": true,
35
  "vocab_size": 256002,
36
+ "pad_token_id": 3,
37
+ "bos_token_id": 256000,
38
  "auto_map": {
39
  "AutoConfig": "Sailesh97/Hinvec2--configuration_hinvec.HinvecConfig",
40
  "AutoModel": "Sailesh97/Hinvec2--modeling_hinvec.HinvecModel"}
configuration_hinvec.py CHANGED
@@ -151,7 +151,7 @@ class HinvecConfig(PretrainedConfig):
151
  rope_theta=10000.0,
152
  rope_scaling=None,
153
  use_sliding_window=False,
154
- sliding_window=4096,
155
  attention_bias=False,
156
  max_window_layers=28,
157
  attention_dropout=0.0,
 
151
  rope_theta=10000.0,
152
  rope_scaling=None,
153
  use_sliding_window=False,
154
+ sliding_window=512,
155
  attention_bias=False,
156
  max_window_layers=28,
157
  attention_dropout=0.0,
modeling_hinvec.py CHANGED
@@ -997,7 +997,6 @@ class HinvecModel(HinvecPreTrainedModel):
997
  attention_mask: Optional[torch.Tensor] = None,
998
  position_ids: Optional[torch.LongTensor] = None,
999
  inputs_embeds: Optional[torch.FloatTensor] = None,
1000
- cls_token_id: Optional[int] = 256000, # Add CLS token ID as parameter
1001
  **kwargs: Unpack[TransformersKwargs],
1002
  ) -> BaseModelOutputWithPast:
1003
  if (input_ids is None) ^ (inputs_embeds is not None):
@@ -1021,9 +1020,9 @@ class HinvecModel(HinvecPreTrainedModel):
1021
  global_attention_mask = torch.zeros((batch_size, seq_len), device=device, dtype=torch.long)
1022
 
1023
  # If input_ids is available, find CLS tokens
1024
- if input_ids is not None and cls_token_id is not None:
1025
  # Mark all CLS token positions as global attention
1026
- global_attention_mask = (input_ids == cls_token_id).long()
1027
  else:
1028
  # Default: assume first token is CLS (common convention)
1029
  global_attention_mask[:, 0] = 1
 
997
  attention_mask: Optional[torch.Tensor] = None,
998
  position_ids: Optional[torch.LongTensor] = None,
999
  inputs_embeds: Optional[torch.FloatTensor] = None,
 
1000
  **kwargs: Unpack[TransformersKwargs],
1001
  ) -> BaseModelOutputWithPast:
1002
  if (input_ids is None) ^ (inputs_embeds is not None):
 
1020
  global_attention_mask = torch.zeros((batch_size, seq_len), device=device, dtype=torch.long)
1021
 
1022
  # If input_ids is available, find CLS tokens
1023
+ if input_ids is not None and self.config.bos_token_id is not None:
1024
  # Mark all CLS token positions as global attention
1025
+ global_attention_mask = (input_ids == self.config.bos_token_id).long()
1026
  else:
1027
  # Default: assume first token is CLS (common convention)
1028
  global_attention_mask[:, 0] = 1