Samarthrr committed on
Commit
ee63cc2
·
verified ·
1 Parent(s): 22a0620

Upload train_engine.py

Browse files
Files changed (1) hide show
  1. train_engine.py +66 -0
train_engine.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModelForSequenceClassification,
5
+ Trainer,
6
+ TrainingArguments
7
+ )
8
+ from datasets import load_dataset
9
+ import pandas as pd
10
+ import os
11
+
12
def train_on_devign(base_model="microsoft/codebert-base", output_dir="./trained_model"):
    """Fine-tune a sequence-classification model on the Devign vulnerability dataset.

    Args:
        base_model: Hugging Face model ID to fine-tune (default: CodeBERT).
        output_dir: Directory where the fine-tuned model and tokenizer are saved.

    Side effects:
        Downloads the dataset and base model from the Hugging Face Hub,
        runs training, and writes the model + tokenizer to ``output_dir``.
        Returns early (with a message) if the dataset cannot be loaded.
    """
    print(f"🚀 Initializing Autotrain Engine for {base_model}")

    # 1. Load specialized Devign dataset
    print("📥 Loading Devign dataset from Hugging Face Hub...")
    try:
        dataset = load_dataset("DetectVul/devign")
    except Exception as e:
        # BUG FIX: the original message promised a "fallback to sample
        # dataset" that was never implemented — the function just returned.
        # Report the abort honestly instead of claiming a fallback.
        print(f"Failed to load Devign: {e}. Aborting training run.")
        return

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    def tokenize_function(examples):
        # Devign stores the raw source code under the "func" column.
        return tokenizer(examples["func"], padding="max_length", truncation=True, max_length=512)

    print("✂️ Tokenizing dataset...")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # 2. Load Model (binary classification: vulnerable vs. not vulnerable)
    print("🧠 Loading Base Model...")
    model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2)

    # 3. Setup Training
    # NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in
    # recent transformers releases (and the old name later removed) — confirm
    # against the pinned transformers version before deploying.
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,  # Optimized for high-performance
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,
        logging_dir='./logs',
    )

    # ROBUSTNESS: not every Devign mirror ships a "test" split; fall back to
    # "validation", then to the train split, so evaluation never KeyErrors.
    eval_split = next(
        (s for s in ("test", "validation") if s in tokenized_datasets),
        "train",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets[eval_split],
    )

    # 4. Train
    print("🔥 Starting Fine-tuning cycle...")
    trainer.train()

    # 5. Save & Update
    print(f"✅ Training Complete. Saving to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
64
if __name__ == "__main__":
    # Script entry point — in production this run would be triggered by the
    # /train endpoint rather than direct execution.
    train_on_devign()