import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset, Dataset, concatenate_datasets
import pandas as pd
import os


def train_on_devign(base_model="microsoft/codebert-base", output_dir="./trained_model"):
print(f"🚀 Initializing Autotrain Engine (Precision v2) for {base_model}")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"💻 Using hardware: {device}")

    # 1. Load the specialized Devign dataset
    print("📥 Loading Devign dataset from Hugging Face Hub...")
    try:
        remote_data = load_dataset("DetectVul/devign", split="train[:5000]")  # Limit to 5k examples for speed
    except Exception as e:
        print(f"❌ Failed to load Devign: {e}. Aborting training run.")
        return

    # Devign-style datasets commonly name the label column 'target'; normalize it
    # if needed so the Trainer sees the 'label' column this script expects
    if "target" in remote_data.column_names and "label" not in remote_data.column_names:
        remote_data = remote_data.rename_column("target", "label")

    # 2. Integrate local feedback data (active learning)
    feedback_file = "feedback_dataset.csv"
    if os.path.exists(feedback_file):
        print("📈 Merging local feedback data into training set...")
        fb_df = pd.read_csv(feedback_file)
        # Assuming the CSV has an 'original_code' column, rename it to Devign's
        # 'func' so both sources share a schema; for simplicity every feedback
        # row gets a label below rather than marking applied fixes as safe (0)
        fb_data = Dataset.from_pandas(fb_df.rename(columns={'original_code': 'func'}))
        # Add labels if missing: treat feedback items as vulnerable patterns (1) we should recognize
        if 'label' not in fb_data.column_names:
            fb_data = fb_data.add_column("label", [1] * len(fb_data))
        # Align both sources on the same columns: concatenate_datasets requires
        # identical schemas, so extra CSV or metadata columns would break the merge
        remote_data = remote_data.select_columns(["func", "label"])
        fb_data = fb_data.select_columns(["func", "label"])
        # Merge remote and local data
        dataset = concatenate_datasets([remote_data, fb_data])
    else:
        dataset = remote_data

    tokenizer = AutoTokenizer.from_pretrained(base_model)

    def tokenize_function(examples):
        return tokenizer(examples["func"], padding="max_length", truncation=True, max_length=512)

    print("✂️ Tokenizing hybrid dataset...")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # 3. Load the base model with a binary classification head (safe vs. vulnerable)
    print("🧠 Loading Base Model...")
    model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels=2).to(device)

    # 4. Set up training
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,  # Reduced for stability on a wider range of hardware
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir='./logs',
        save_strategy="no",  # Skip intermediate checkpoints; final weights are saved manually below
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
    )

    # 5. Train
    print("🔥 Starting active learning cycle...")
    trainer.train()

    # 6. Save results
    print(f"✅ Training Complete. Saving weights to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)


if __name__ == "__main__":
    train_on_devign()
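
# Minimal inference sketch, assuming the default output_dir above (a hedged
# example, not part of the training flow): load the saved weights and classify
# a snippet, where label 1 follows the "vulnerable" convention used in training.
#
#     from transformers import AutoTokenizer, AutoModelForSequenceClassification
#     import torch
#
#     tok = AutoTokenizer.from_pretrained("./trained_model")
#     clf = AutoModelForSequenceClassification.from_pretrained("./trained_model")
#     enc = tok("char buf[8]; strcpy(buf, user_input);", return_tensors="pt",
#               truncation=True, max_length=512)
#     pred = torch.argmax(clf(**enc).logits, dim=-1).item()  # 1 => flagged vulnerable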