Samarthrr committed on
Commit
1e7af6f
Β·
verified Β·
1 Parent(s): ba08842

Update train_engine.py

Browse files
Files changed (1) hide show
  1. train_engine.py +37 -19
train_engine.py CHANGED
@@ -5,62 +5,80 @@ from transformers import (
5
  Trainer,
6
  TrainingArguments
7
  )
8
- from datasets import load_dataset
9
  import pandas as pd
10
  import os
11
 
12
def train_on_devign(base_model="microsoft/codebert-base", output_dir="./trained_model"):
    """Fine-tune a sequence-classification model on the Devign dataset.

    Downloads the ``DetectVul/devign`` dataset from the Hugging Face Hub,
    tokenizes it with the base model's tokenizer, trains a binary
    (vulnerable / safe) classification head for three epochs, and writes
    the trained model and tokenizer to ``output_dir``.

    Args:
        base_model: Hub id of the encoder to fine-tune.
        output_dir: Directory where checkpoints, the final model, and the
            tokenizer are saved.

    Returns:
        None. Returns early (without raising) if the dataset download fails.
    """
    print(f"🚀 Initializing Autotrain Engine for {base_model}")

    # 1. Load specialized Devign dataset
    print("📥 Loading Devign dataset from Hugging Face Hub...")
    try:
        dataset = load_dataset("DetectVul/devign")
    except Exception as e:
        # Hub/network failures are treated as non-fatal: report and bail out.
        print(f"Failed to load Devign: {e}. Falling back to sample dataset.")
        return

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    def tokenize_function(examples):
        # Devign stores the source code under the "func" column.
        return tokenizer(examples["func"], padding="max_length", truncation=True, max_length=512)

    print("✂️ Tokenizing dataset...")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # 2. Load Model (2 labels: vulnerable vs. safe)
    print("🧠 Loading Base Model...")
    model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

    # 3. Setup Training
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,  # Optimized for high-performance
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,
        logging_dir='./logs',
    )

    # NOTE(review): assumes the Hub dataset exposes "train" and "test"
    # splits — confirm against the dataset card.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
    )

    # 4. Train
    print("🔥 Starting Fine-tuning cycle...")
    trainer.train()

    # 5. Save & Update
    print(f"✅ Training Complete. Saving to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)


if __name__ == "__main__":
    # In a real scenario, this would be triggered by /train
    train_on_devign()
 
5
  Trainer,
6
  TrainingArguments
7
  )
8
+ from datasets import load_dataset, Dataset
9
  import pandas as pd
10
  import os
11
 
12
def train_on_devign(base_model="microsoft/codebert-base", output_dir="./trained_model"):
    """Fine-tune ``base_model`` on Devign plus any local feedback examples.

    Pipeline:
      1. Pull the first 5k rows of ``DetectVul/devign`` from the Hub.
      2. If ``feedback_dataset.csv`` exists, merge its rows into the
         training set (active-learning loop); unlabelled feedback rows are
         labelled 1 (vulnerable).
      3. Tokenize, train for 3 epochs, and save the model and tokenizer
         to ``output_dir``.

    Args:
        base_model: Hub id of the encoder to fine-tune.
        output_dir: Directory where the final model and tokenizer are saved.

    Returns:
        None. Returns early (without raising) if the remote dataset
        cannot be loaded.
    """
    # Function-scope imports: torch was referenced but never imported
    # (NameError at runtime); concatenate_datasets was imported mid-body.
    import torch
    from datasets import concatenate_datasets

    print(f"🚀 Initializing Autotrain Engine (Precision v2) for {base_model}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"💻 Using hardware: {device}")

    # 1. Load specialized Devign dataset
    print("📥 Loading Devign dataset from Hugging Face Hub...")
    try:
        remote_data = load_dataset("DetectVul/devign", split="train[:5000]")  # Limit to 5k for speed
    except Exception as e:
        # Hub/network failures are treated as non-fatal: report and bail out.
        print(f"Failed to load Devign: {e}. Falling back to sample dataset.")
        return

    # 2. Integrate Local Feedback Data (Active Learning)
    feedback_file = "feedback_dataset.csv"
    if os.path.exists(feedback_file):
        print("📈 Merging local feedback data into training set...")
        fb_df = pd.read_csv(feedback_file)
        # The feedback CSV carries the code under 'original_code'; the
        # model pipeline expects it under 'func'.
        fb_data = Dataset.from_pandas(fb_df.rename(columns={'original_code': 'func'}))
        if 'label' not in fb_data.column_names:
            # Treat feedback items as vulnerable patterns we should recognize.
            fb_data = fb_data.add_column("label", [1] * len(fb_data))

        # concatenate_datasets requires identical schemas on both sides, so
        # drop any extra CSV columns (e.g. a pandas index column).
        # NOTE(review): assumes the remote split exposes exactly
        # 'func' + 'label' — confirm against the dataset card.
        extra_cols = [c for c in fb_data.column_names if c not in ("func", "label")]
        if extra_cols:
            fb_data = fb_data.remove_columns(extra_cols)

        # Merge remote and local
        dataset = concatenate_datasets([remote_data, fb_data])
    else:
        dataset = remote_data

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    def tokenize_function(examples):
        return tokenizer(examples["func"], padding="max_length", truncation=True, max_length=512)

    print("✂️ Tokenizing hybrid dataset...")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # 3. Load Model (2 labels: vulnerable vs. safe)
    print("🧠 Loading Base Model...")
    model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2).to(device)

    # 4. Setup Training
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,  # Reduced for stability on wider range of hardware
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir='./logs',
        save_strategy="no",
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
    )

    # 5. Train
    print("🔥 Starting active learning cycle...")
    trainer.train()

    # 6. Save results
    print(f"✅ Training Complete. Saving weights to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)


if __name__ == "__main__":
    train_on_devign()