oyly committed on
Commit
af727db
·
1 Parent(s): 56aad4b

fix tokenizer bug

Browse files
Files changed (1) hide show
  1. flux/modules/conditioner_lore.py +8 -7
flux/modules/conditioner_lore.py CHANGED
@@ -112,23 +112,24 @@ class HFEmbedder(nn.Module):
112
  if (words is None) or start_idx<0: # some samples do not need this
113
  return [-1]
114
  res = []
115
- flag = 0
 
116
  for i in range(start_idx,len(tokens)):
117
  this_token = tokens[i].strip('▁')
118
  if this_token == "":
119
  continue
120
  if words.startswith(this_token):
121
  res.append(i)
122
- flag = 1
123
- if words.endswith(this_token):
124
  break
125
  else:
126
  continue
127
- if flag and words.endswith(this_token):
128
- res.append(i)
129
- break
130
- if flag:
131
  res.append(i)
 
 
132
  return res
133
 
134
  for src_words, tgt_words, src_index, tgt_index in replacements:
 
112
  if (words is None) or start_idx<0: # some samples do not need this
113
  return [-1]
114
  res = []
115
+ l_words = len(words.replace(" ", ""))
116
+ l_find = 0
117
  for i in range(start_idx,len(tokens)):
118
  this_token = tokens[i].strip('▁')
119
  if this_token == "":
120
  continue
121
  if words.startswith(this_token):
122
  res.append(i)
123
+ l_find += len(this_token)
124
+ if l_find >= l_words:
125
  break
126
  else:
127
  continue
128
+ if l_find:
129
+ l_find += len(this_token)
 
 
130
  res.append(i)
131
+ if l_find >= l_words:
132
+ break
133
  return res
134
 
135
  for src_words, tgt_words, src_index, tgt_index in replacements: