Charlie81
/

LoRE

Charlie81 committed on Jul 6, 2025

Commit

5c05368

1 Parent(s): 52bdc02

tokenize fn

Files changed (1) hide show

scripts/train.py CHANGED Viewed

@@ -39,7 +39,6 @@ def main():
     def tokenize_function(examples):
         texts = []
         for message_list in examples["messages"]:
-            # Format the conversation history into a single string
             formatted = ""
             for msg in message_list:
                 role = msg["role"]
@@ -51,13 +50,16 @@ def main():
                 else:
                     formatted += f"{role.capitalize()}: {content}\n"
             texts.append(formatted)
-        return tokenizer(
             texts,
             truncation=True,
             max_length=4096,
             padding="max_length"
         )
     tokenized_dataset = dataset.map(

     def tokenize_function(examples):
         texts = []
         for message_list in examples["messages"]:
             formatted = ""
             for msg in message_list:
                 role = msg["role"]
                 else:
                     formatted += f"{role.capitalize()}: {content}\n"
             texts.append(formatted)
+        tokenized = tokenizer(
             texts,
             truncation=True,
             max_length=4096,
             padding="max_length"
         )
+        # Add labels for language modeling
+        tokenized["labels"] = tokenized["input_ids"].copy()
+        return tokenized
     tokenized_dataset = dataset.map(