vishesh-t27 committed on
Commit
bdf2e42
·
verified ·
1 Parent(s): ba4702b

Update tokenization_nandi.py

Browse files
Files changed (1) hide show
  1. tokenization_nandi.py +33 -27
tokenization_nandi.py CHANGED
@@ -91,34 +91,40 @@ class NandiTokenizer(TokenizersBackend):
91
  **kwargs,
92
  )
93
 
94
- def encode(
95
- self,
96
- text,
97
- text_pair=None,
98
- add_special_tokens: bool = True,
99
- padding=False,
100
- truncation=None,
101
- max_length=None,
102
- stride: int = 0,
103
- padding_side=None,
104
- return_tensors=None,
105
- **kwargs,
106
- ):
107
  if isinstance(text, str):
108
- # This is a temporary fix to match the behaviour of the training pipeline
109
- text = "<|im_start|>" + " " + text
110
- return super().encode(
111
- text,
112
- text_pair=text_pair,
113
- add_special_tokens=add_special_tokens,
114
- padding=padding,
115
- truncation=truncation,
116
- max_length=max_length,
117
- stride=stride,
118
- padding_side=padding_side,
119
- return_tensors=return_tensors,
120
- **kwargs,
121
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
 
124
  __all__ = ["NandiTokenizer"]
 
91
  **kwargs,
92
  )
93
 
94
+ def prepare_for_tokenization(self, text, **kwargs):
95
+
 
 
 
 
 
 
 
 
 
 
 
96
  if isinstance(text, str):
97
+ text = "<|im_start|> " + text
98
+ return (text, kwargs)
99
+
100
+ # def encode(
101
+ # self,
102
+ # text,
103
+ # text_pair=None,
104
+ # add_special_tokens: bool = True,
105
+ # padding=False,
106
+ # truncation=None,
107
+ # max_length=None,
108
+ # stride: int = 0,
109
+ # padding_side=None,
110
+ # return_tensors=None,
111
+ # **kwargs,
112
+ # ):
113
+ # if isinstance(text, str):
114
+ # # This is a temporary fix to match the behaviour of the training pipeline
115
+ # text = "<|im_start|>" + " " + text
116
+ # return super().encode(
117
+ # text,
118
+ # text_pair=text_pair,
119
+ # add_special_tokens=add_special_tokens,
120
+ # padding=padding,
121
+ # truncation=truncation,
122
+ # max_length=max_length,
123
+ # stride=stride,
124
+ # padding_side=padding_side,
125
+ # return_tensors=return_tensors,
126
+ # **kwargs,
127
+ # )
128
 
129
 
130
  __all__ = ["NandiTokenizer"]