asdf98 committed on
Commit
8df17e2
·
verified ·
1 Parent(s): dc0b1ad

Upload EthicalHacking_Qwen3-8B_Colab.ipynb

Browse files
Files changed (1) hide show
  1. EthicalHacking_Qwen3-8B_Colab.ipynb +24 -2
EthicalHacking_Qwen3-8B_Colab.ipynb CHANGED
@@ -82,7 +82,28 @@
82
  "ds1 = ds1.map(to_messages, remove_columns=ds1.column_names, batched=False)\n",
83
  "ds2 = ds2.map(to_messages, remove_columns=ds2.column_names, batched=False)\n",
84
  "train_dataset = concatenate_datasets([ds1, ds2])\n",
85
- "print(f\"✅ Combined: {len(train_dataset)} rows\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  ]
87
  },
88
  {
@@ -98,7 +119,7 @@
98
  " model=model,\n",
99
  " tokenizer=tokenizer,\n",
100
  " train_dataset=train_dataset,\n",
101
- " dataset_text_field=\"messages\",\n",
102
  " max_seq_length=MAX_SEQ_LENGTH,\n",
103
  " dataset_num_proc=2,\n",
104
  " packing=False,\n",
@@ -119,6 +140,7 @@
119
  " report_to=\"none\",\n",
120
  " ),\n",
121
  ")\n",
 
122
  "trainer.train()\n",
123
  "model.save_pretrained(\"./cyber-lora-adapter\")\n",
124
  "tokenizer.save_pretrained(\"./cyber-lora-adapter\")\n",
 
82
  "ds1 = ds1.map(to_messages, remove_columns=ds1.column_names, batched=False)\n",
83
  "ds2 = ds2.map(to_messages, remove_columns=ds2.column_names, batched=False)\n",
84
  "train_dataset = concatenate_datasets([ds1, ds2])\n",
85
+ "print(f\"✅ Messages dataset: {len(train_dataset)} rows\")\n",
86
+ "\n",
87
+ "# ========== PRE-PROCESS: messages → text with chat template ==========\n",
88
+ "def convert_messages_to_text(examples):\n",
89
+ " texts = []\n",
90
+ " for msgs in examples[\"messages\"]:\n",
91
+ " text = tokenizer.apply_chat_template(\n",
92
+ " msgs,\n",
93
+ " tokenize=False,\n",
94
+ " add_generation_prompt=False,\n",
95
+ " )\n",
96
+ " texts.append(text)\n",
97
+ " return {\"text\": texts}\n",
98
+ "\n",
99
+ "print(\"🔄 Converting messages to text...\")\n",
100
+ "train_dataset = train_dataset.map(\n",
101
+ " convert_messages_to_text,\n",
102
+ " batched=True,\n",
103
+ " remove_columns=[\"messages\"],\n",
104
+ " batch_size=100,\n",
105
+ ")\n",
106
+ "print(f\"✅ Dataset ready with columns: {train_dataset.column_names}\")"
107
  ]
108
  },
109
  {
 
119
  " model=model,\n",
120
  " tokenizer=tokenizer,\n",
121
  " train_dataset=train_dataset,\n",
122
+ " dataset_text_field=\"text\", # ← standard text format\n",
123
  " max_seq_length=MAX_SEQ_LENGTH,\n",
124
  " dataset_num_proc=2,\n",
125
  " packing=False,\n",
 
140
  " report_to=\"none\",\n",
141
  " ),\n",
142
  ")\n",
143
+ "\n",
144
  "trainer.train()\n",
145
  "model.save_pretrained(\"./cyber-lora-adapter\")\n",
146
  "tokenizer.save_pretrained(\"./cyber-lora-adapter\")\n",