ggdpx commited on
Commit
0e038ee
·
verified ·
1 Parent(s): 10843c5

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. baseline.py +90 -0
  2. data_loader.py +43 -0
  3. train.py +164 -0
  4. verify_setup.py +29 -0
baseline.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import torch
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ from sklearn.metrics import classification_report, confusion_matrix
5
+ from tqdm import tqdm
6
+ import os
7
+
8
def get_device():
    """Return the best available torch device name: 'cuda', 'mps', or 'cpu'."""
    # Probe accelerators in order of preference; fall through to CPU.
    for name, available in (
        ("cuda", torch.cuda.is_available),
        ("mps", torch.backends.mps.is_available),
    ):
        if available():
            return name
    return "cpu"
14
+
15
def main():
    """Zero-shot phishing-classification baseline for SmolLM2-135M-Instruct.

    Loads the instruct model, samples up to 100 rows from data/test.csv
    (produced by data_loader.py), prompts the model to answer 'Safe' or
    'Phishing' for each email, and prints a classification report and a
    confusion matrix.
    """
    device = get_device()
    print(f"Using device: {device}")

    model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
    print(f"Loading model and tokenizer: {model_id}")

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Using float16 for efficiency on MPS/CUDA, or float32 on CPU
    torch_dtype = torch.float16 if device != "cpu" else torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        device_map=device
    )

    # Load test data
    test_path = "data/test.csv"
    if not os.path.exists(test_path):
        print(f"Error: {test_path} not found. Please run data_loader.py first.")
        return

    df = pd.read_csv(test_path)
    # To keep the baseline test fast, let's run on 100 for a quick baseline.
    sample_size = min(100, len(df))
    df_sample = df.sample(sample_size, random_state=42)

    predictions = []
    labels = []

    print(f"Evaluating zero-shot performance on {sample_size} samples...")

    for _, row in tqdm(df_sample.iterrows(), total=sample_size):
        text = str(row['text'])
        label = int(row['phishing'])  # 0 for safe, 1 for phishing

        # SmolLM2-Instruct prompt format
        messages = [{"role": "user", "content": f"""Classify the following email text as either 'Safe' or 'Phishing'. Respond with only one word: 'Safe' or 'Phishing'.

Email text: {text}

Classification:"""}]
        input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        inputs = tokenizer(input_text, return_tensors="pt").to(device)

        with torch.no_grad():
            # Greedy decoding. NOTE: the original passed temperature=0.1 here,
            # but temperature is ignored (and transformers warns) when
            # do_sample=False, so it has been removed.
            output = model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens, not the prompt.
        response = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip().lower()

        if 'phishing' in response:
            predictions.append(1)
        elif 'safe' in response:
            predictions.append(0)
        else:
            # Fallback if the model doesn't follow instructions well:
            # default to the 'Safe' class.
            predictions.append(0)

        labels.append(label)

    print("\nBaseline Results (Zero-Shot):")
    print(classification_report(labels, predictions, target_names=['Safe', 'Phishing'], zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(labels, predictions))


if __name__ == "__main__":
    main()
data_loader.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datasets import load_dataset
3
+ from sklearn.model_selection import train_test_split
4
+ import pandas as pd
5
+
6
+
7
def prepare_data(dataset_name: str = "David-Egea/phishing-texts"):
    """Download the phishing dataset and write stratified CSV splits to data/.

    The 'train' split of *dataset_name* is divided 80/10/10 into
    train/val/test, stratified on the 'phishing' label, and each split is
    saved as data/<split>.csv.
    """
    print(f"Loading dataset: {dataset_name}...")
    # The dataset usually loads into a 'train' split if not specified
    ds = load_dataset(dataset_name)

    # Work in pandas so sklearn's splitter can operate on it directly.
    df: pd.DataFrame = ds["train"].to_pandas()  # type: ignore

    print(f"Total samples: {len(df)}")
    print(f"Class distribution:\n{df['phishing'].value_counts(normalize=True)}")

    # Carve off 20% as a holdout pool, then halve it into val/test
    # (each 10% of the total). Stratify keeps class balance in every split.
    train_df, holdout_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df["phishing"]
    )
    val_df, test_df = train_test_split(
        holdout_df, test_size=0.5, random_state=42, stratify=holdout_df["phishing"]
    )

    print(f"Train samples: {len(train_df)}")
    print(f"Val samples: {len(val_df)}")
    print(f"Test samples: {len(test_df)}")

    # Ensure data directory exists before writing the CSVs.
    os.makedirs("data", exist_ok=True)

    for stem, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
        split_df.to_csv(f"data/{stem}.csv", index=False)
    print("Splits saved to data/ folder.")


if __name__ == "__main__":
    prepare_data()
train.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from datasets import load_dataset, Dataset
4
+ from transformers import (
5
+ AutoModelForCausalLM,
6
+ AutoTokenizer,
7
+ )
8
+ from peft import LoraConfig
9
+ from trl.trainer.sft_trainer import SFTTrainer
10
+ from trl.trainer.sft_config import SFTConfig
11
+ import argparse
12
+ import pandas as pd
13
+
14
# Module-level tokenizer slot; main() assigns it before datasets .map() runs.
tokenizer = None


def format_instruction(sample):
    """Render one dataset row as a chat-formatted SFT training example.

    Returns ``{"text": <chat-template string>}`` built from the global
    tokenizer, or ``{"text": ""}`` if the tokenizer is not initialised yet.
    """
    if sample["phishing"] == 1:
        label_str = "Phishing"
    else:
        label_str = "Safe"

    user_prompt = (
        "Classify the following email text as either 'Safe' or 'Phishing'. "
        "Respond with only one word: 'Safe' or 'Phishing'."
        f"\n\nEmail text: {sample['text']}\n\nClassification:"
    )
    messages = [
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": label_str},
    ]

    if not tokenizer:
        # Defensive default when called before the tokenizer is loaded.
        return {"text": ""}
    return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}
35
+
36
+
37
def main(args):
    """Fine-tune a causal LM on the phishing dataset with LoRA via TRL's SFTTrainer.

    Loads model/tokenizer from ``args.model_id``, reads CSV splits from
    ``args.dataset_name`` (or a hub dataset if the path does not exist),
    formats rows into chat-template text, trains, and saves/pushes the result.
    """
    # format_instruction() reads the module-level tokenizer, so bind it here.
    global tokenizer
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )
    print(f"Using device: {device}")

    model_id = args.model_id
    print(f"Loading tokenizer and model: {model_id}")

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Reuse EOS as PAD so batched training can pad sequences.
    tokenizer.pad_token = tokenizer.eos_token

    # Load Model. bf16 only on CUDA; device_map does not handle "mps",
    # so that case is moved manually below.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map=device if device != "mps" else None,
    )
    if device == "mps":
        model.to("mps")  # type: ignore

    # LoRA Configuration: adapt all attention and MLP projections.
    peft_config = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Load Data: a local directory of CSVs (from data_loader.py) takes
    # precedence; otherwise treat dataset_name as a hub dataset id.
    print(f"Loading data from {args.dataset_name}...")
    if os.path.exists(args.dataset_name):
        train_df = pd.read_csv(os.path.join(args.dataset_name, "train.csv"))
        val_df = pd.read_csv(os.path.join(args.dataset_name, "val.csv"))
        if args.quick_test:
            # Tiny subset for a fast smoke run.
            train_df = train_df.head(100)
            val_df = val_df.head(20)
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
    else:
        # NOTE(review): --quick_test is not applied on this hub-loading path.
        dataset = load_dataset(args.dataset_name)
        train_dataset = dataset["train"]
        val_dataset = dataset["validation"] if "validation" in dataset else None

    # Apply formatting: each row becomes a {"text": chat-template} example.
    print("Formatting datasets...")
    train_dataset = train_dataset.map(format_instruction)
    if val_dataset:
        val_dataset = val_dataset.map(format_instruction)

    # Use SFTConfig for modern TRL; evaluation only if a val split exists.
    sft_config = SFTConfig(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.lr,
        logging_steps=10,
        num_train_epochs=args.epochs,
        max_steps=args.max_steps,
        eval_strategy="steps" if val_dataset else "no",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        bf16=torch.cuda.is_available(),
        push_to_hub=args.push_to_hub,
        report_to="tensorboard" if not args.no_report else "none",
        remove_unused_columns=False,
        dataset_text_field="text",
        max_length=args.max_seq_length,
    )

    # Standard HF SFTTrainer; peft_config makes it wrap the model with LoRA.
    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        peft_config=peft_config,
        processing_class=tokenizer,
        args=sft_config,
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    if args.push_to_hub:
        trainer.push_to_hub()
142
+
143
+
144
if __name__ == "__main__":
    # CLI entry point: parse hyperparameters and launch training.
    parser = argparse.ArgumentParser()
    # Model and data locations.
    parser.add_argument(
        "--model_id", type=str, default="HuggingFaceTB/SmolLM2-135M-Instruct"
    )
    parser.add_argument("--dataset_name", type=str, default="data/")
    parser.add_argument("--output_dir", type=str, default="models/smollm2-phish-sft")
    # Optimisation hyperparameters.
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--grad_accum", type=int, default=4)
    parser.add_argument("--lr", type=float, default=2e-4)
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--max_steps", type=int, default=-1)
    parser.add_argument("--max_seq_length", type=int, default=512)
    # LoRA adapter hyperparameters.
    parser.add_argument("--lora_r", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=32)
    parser.add_argument("--lora_dropout", type=float, default=0.05)
    # Behaviour flags: tiny smoke run, hub upload, disable tensorboard.
    parser.add_argument("--quick_test", action="store_true")
    parser.add_argument("--push_to_hub", action="store_true")
    parser.add_argument("--no_report", action="store_true")
    args = parser.parse_args()
    main(args)
verify_setup.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import transformers
3
+ import datasets
4
+ import peft
5
+ import accelerate
6
+ import platform
7
+
8
+
9
def verify():
    """Print platform info, ML library versions, and accelerator availability."""
    print(f"OS: {platform.system()} {platform.release()}")
    print(f"Python: {platform.python_version()}")
    print("-" * 20)

    # Report each library version in a fixed order.
    libraries = (
        ("PyTorch", torch),
        ("Transformers", transformers),
        ("Datasets", datasets),
        ("PEFT", peft),
        ("Accelerate", accelerate),
    )
    for label, module in libraries:
        print(f"{label} version: {module.__version__}")

    # Check for GPU, preferring CUDA over Apple MPS.
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)} (CUDA available)")
    elif torch.backends.mps.is_available():
        print("GPU: Apple Silicon MPS available")
    else:
        print("GPU: Not available (using CPU)")


if __name__ == "__main__":
    verify()