Upload EthicalHacking_Qwen3-8B_Colab.ipynb
Browse files
EthicalHacking_Qwen3-8B_Colab.ipynb
CHANGED
|
@@ -82,7 +82,28 @@
|
|
| 82 |
"ds1 = ds1.map(to_messages, remove_columns=ds1.column_names, batched=False)\n",
|
| 83 |
"ds2 = ds2.map(to_messages, remove_columns=ds2.column_names, batched=False)\n",
|
| 84 |
"train_dataset = concatenate_datasets([ds1, ds2])\n",
|
| 85 |
-
"print(f\"✅ [… old line truncated in diff rendering]
|
| 86 |
]
|
| 87 |
},
|
| 88 |
{
|
|
@@ -98,7 +119,7 @@
|
|
| 98 |
" model=model,\n",
|
| 99 |
" tokenizer=tokenizer,\n",
|
| 100 |
" train_dataset=train_dataset,\n",
|
| 101 |
-
" dataset_text_field=\"[… old line truncated in diff rendering]
|
| 102 |
" max_seq_length=MAX_SEQ_LENGTH,\n",
|
| 103 |
" dataset_num_proc=2,\n",
|
| 104 |
" packing=False,\n",
|
|
@@ -119,6 +140,7 @@
|
|
| 119 |
" report_to=\"none\",\n",
|
| 120 |
" ),\n",
|
| 121 |
")\n",
|
|
|
|
| 122 |
"trainer.train()\n",
|
| 123 |
"model.save_pretrained(\"./cyber-lora-adapter\")\n",
|
| 124 |
"tokenizer.save_pretrained(\"./cyber-lora-adapter\")\n",
|
|
|
|
| 82 |
"ds1 = ds1.map(to_messages, remove_columns=ds1.column_names, batched=False)\n",
|
| 83 |
"ds2 = ds2.map(to_messages, remove_columns=ds2.column_names, batched=False)\n",
|
| 84 |
"train_dataset = concatenate_datasets([ds1, ds2])\n",
|
| 85 |
+
"print(f\"✅ Messages dataset: {len(train_dataset)} rows\")\n",
|
| 86 |
+
"\n",
|
| 87 |
+
"# ========== PRE-PROCESS: messages → text with chat template ==========\n",
|
| 88 |
+
"def convert_messages_to_text(examples):\n",
|
| 89 |
+
" texts = []\n",
|
| 90 |
+
" for msgs in examples[\"messages\"]:\n",
|
| 91 |
+
" text = tokenizer.apply_chat_template(\n",
|
| 92 |
+
" msgs,\n",
|
| 93 |
+
" tokenize=False,\n",
|
| 94 |
+
" add_generation_prompt=False,\n",
|
| 95 |
+
" )\n",
|
| 96 |
+
" texts.append(text)\n",
|
| 97 |
+
" return {\"text\": texts}\n",
|
| 98 |
+
"\n",
|
| 99 |
+
"print(\"🔄 Converting messages to text...\")\n",
|
| 100 |
+
"train_dataset = train_dataset.map(\n",
|
| 101 |
+
" convert_messages_to_text,\n",
|
| 102 |
+
" batched=True,\n",
|
| 103 |
+
" remove_columns=[\"messages\"],\n",
|
| 104 |
+
" batch_size=100,\n",
|
| 105 |
+
")\n",
|
| 106 |
+
"print(f\"✅ Dataset ready with columns: {train_dataset.column_names}\")"
|
| 107 |
]
|
| 108 |
},
|
| 109 |
{
|
|
|
|
| 119 |
" model=model,\n",
|
| 120 |
" tokenizer=tokenizer,\n",
|
| 121 |
" train_dataset=train_dataset,\n",
|
| 122 |
+
" dataset_text_field=\"text\", # ← standard text format\n",
|
| 123 |
" max_seq_length=MAX_SEQ_LENGTH,\n",
|
| 124 |
" dataset_num_proc=2,\n",
|
| 125 |
" packing=False,\n",
|
|
|
|
| 140 |
" report_to=\"none\",\n",
|
| 141 |
" ),\n",
|
| 142 |
")\n",
|
| 143 |
+
"\n",
|
| 144 |
"trainer.train()\n",
|
| 145 |
"model.save_pretrained(\"./cyber-lora-adapter\")\n",
|
| 146 |
"tokenizer.save_pretrained(\"./cyber-lora-adapter\")\n",
|