Charlie81 committed on
Commit
5c05368
·
1 Parent(s): 52bdc02

tokenize fn

Browse files
Files changed (1) hide show
  1. scripts/train.py +5 -3
scripts/train.py CHANGED
@@ -39,7 +39,6 @@ def main():
39
  def tokenize_function(examples):
40
  texts = []
41
  for message_list in examples["messages"]:
42
- # Format the conversation history into a single string
43
  formatted = ""
44
  for msg in message_list:
45
  role = msg["role"]
@@ -51,13 +50,16 @@ def main():
51
  else:
52
  formatted += f"{role.capitalize()}: {content}\n"
53
  texts.append(formatted)
54
-
55
- return tokenizer(
56
  texts,
57
  truncation=True,
58
  max_length=4096,
59
  padding="max_length"
60
  )
 
 
 
61
 
62
 
63
  tokenized_dataset = dataset.map(
 
39
  def tokenize_function(examples):
40
  texts = []
41
  for message_list in examples["messages"]:
 
42
  formatted = ""
43
  for msg in message_list:
44
  role = msg["role"]
 
50
  else:
51
  formatted += f"{role.capitalize()}: {content}\n"
52
  texts.append(formatted)
53
+
54
+ tokenized = tokenizer(
55
  texts,
56
  truncation=True,
57
  max_length=4096,
58
  padding="max_length"
59
  )
60
+ # Add labels for language modeling
61
+ tokenized["labels"] = tokenized["input_ids"].copy()
62
+ return tokenized
63
 
64
 
65
  tokenized_dataset = dataset.map(