av-codes
/

prompt-injection-hrm-text

+#!/usr/bin/env python3
+"""Evaluate and fine-tune DistilBERT on Bordair multimodal dataset.
+Two tests:
+1. Zero-shot: existing av-codes/prompt-injection-detector-v2 on bordair eval split
+2. Fine-tune: distilbert-base-uncased trained on bordair train split, 1 epoch
+Uses the same train/eval split as HRM-Text (seed=42, stratified 90/10).
+"""
+import json
+import glob
+import time
+import numpy as np
+import datasets as hf_datasets
+import evaluate
+import torch
+from datasets import Dataset
+from huggingface_hub import snapshot_download
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+    pipeline,
+)
+def load_bordair_multimodal():
+    print("📦 Downloading Bordair/bordair-multimodal...")
+    path = snapshot_download(repo_id="Bordair/bordair-multimodal", repo_type="dataset")
+    print(f"   Downloaded to: {path}")
+    all_samples = []
+    patterns = [
+        "benign/*.json",
+        "payloads/*/*.json",
+        "payloads_v5/*.json",
+        "payloads_v5_external/*/*.json",
+    ]
+    for pattern in patterns:
+        files = sorted(glob.glob(f"{path}/{pattern}"))
+        for f in files:
+            fname = f.split("/")[-1]
+            if fname in ("summary.json", "_pool.json", "summary_old.json"):
+                continue
+            try:
+                with open(f, "r") as fh:
+                    data = json.load(fh)
+            except (json.JSONDecodeError, UnicodeDecodeError):
+                continue
+            if isinstance(data, list):
+                for item in data:
+                    if isinstance(item, dict) and item.get("expected_detection") is not None:
+                        text_parts = [item.get("text", "")]
+                        for k in ("image_content", "document_content", "audio_content"):
+                            if item.get(k):
+                                text_parts.append(item[k])
+                        all_samples.append({
+                            "text": "\n".join(text_parts),
+                            "label": 1 if item["expected_detection"] else 0,
+                        })
+        print(f"   {pattern}: {len(all_samples)} cumulative")
+    ds = Dataset.from_list(all_samples)
+    print(f"\n✅ Total: {len(ds)} samples ({sum(1 for s in all_samples if s['label']==1)} injection, {sum(1 for s in all_samples if s['label']==0)} safe)")
+    return ds
+def compute_metrics(eval_pred):
+    accuracy = evaluate.load("accuracy")
+    precision_m = evaluate.load("precision")
+    recall_m = evaluate.load("recall")
+    f1_m = evaluate.load("f1")
+    logits, labels = eval_pred
+    preds = np.argmax(logits, axis=-1)
+    return {
+        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
+        "precision": precision_m.compute(predictions=preds, references=labels)["precision"],
+        "recall": recall_m.compute(predictions=preds, references=labels)["recall"],
+        "f1": f1_m.compute(predictions=preds, references=labels)["f1"],
+    }
+def main():
+    merged = load_bordair_multimodal()
+    merged = merged.cast_column("label", hf_datasets.ClassLabel(names=["safe", "injection"]))
+    split = merged.train_test_split(test_size=0.1, seed=42, stratify_by_column="label")
+    train_dataset = split["train"]
+    eval_dataset = split["test"]
+    print(f"   Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
+    # ── Test 1: Zero-shot eval of existing model ─────────────────────────
+    print("\n" + "="*60)
+    print("TEST 1: Zero-shot eval of av-codes/prompt-injection-detector-v2")
+    print("="*60)
+    zs_model_id = "av-codes/prompt-injection-detector-v2"
+    zs_tokenizer = AutoTokenizer.from_pretrained(zs_model_id)
+    zs_model = AutoModelForSequenceClassification.from_pretrained(zs_model_id)
+    zs_args = TrainingArguments(
+        output_dir="/tmp/zs_eval",
+        per_device_eval_batch_size=64,
+        fp16=torch.cuda.is_available(),
+        report_to="none",
+        disable_tqdm=True,
+        use_cpu=not torch.cuda.is_available(),
+        remove_unused_columns=False,
+    )
+    def zs_tokenize(batch):
+        return zs_tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)
+    eval_tok = eval_dataset.map(zs_tokenize, batched=True, batch_size=1000)
+    zs_trainer = Trainer(
+        model=zs_model,
+        args=zs_args,
+        eval_dataset=eval_tok,
+        compute_metrics=compute_metrics,
+    )
+    t0 = time.time()
+    zs_results = zs_trainer.evaluate()
+    t1 = time.time()
+    print(f"\n📊 Zero-shot results ({t1-t0:.0f}s):")
+    for k, v in zs_results.items():
+        print(f"   {k}: {v}")
+    del zs_model, zs_trainer
+    torch.cuda.empty_cache()
+    # ── Test 2: Fine-tune DistilBERT on bordair ──────────────────────────
+    print("\n" + "="*60)
+    print("TEST 2: Fine-tune distilbert-base-uncased on bordair (1 epoch)")
+    print("="*60)
+    ft_model_id = "distilbert-base-uncased"
+    ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_id)
+    ft_model = AutoModelForSequenceClassification.from_pretrained(ft_model_id, num_labels=2)
+    def ft_tokenize(batch):
+        return ft_tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)
+    train_tok = train_dataset.map(ft_tokenize, batched=True, batch_size=1000)
+    eval_tok2 = eval_dataset.map(ft_tokenize, batched=True, batch_size=1000)
+    ft_args = TrainingArguments(
+        output_dir="/tmp/ft_distilbert",
+        learning_rate=2e-5,
+        per_device_train_batch_size=32,
+        per_device_eval_batch_size=64,
+        num_train_epochs=1,
+        weight_decay=0.01,
+        warmup_steps=500,
+        lr_scheduler_type="cosine",
+        eval_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=False,
+        logging_strategy="steps",
+        logging_steps=100,
+        logging_first_step=True,
+        disable_tqdm=True,
+        fp16=torch.cuda.is_available(),
+        report_to="none",
+        use_cpu=not torch.cuda.is_available(),
+        dataloader_num_workers=4,
+        seed=42,
+        remove_unused_columns=False,
+    )
+    ft_trainer = Trainer(
+        model=ft_model,
+        args=ft_args,
+        train_dataset=train_tok,
+        eval_dataset=eval_tok2,
+        compute_metrics=compute_metrics,
+    )
+    t0 = time.time()
+    ft_trainer.train()
+    t1 = time.time()
+    print(f"\n⏱️  Training time: {t1-t0:.0f}s ({(t1-t0)/3600:.1f}h)")
+    ft_results = ft_trainer.evaluate()
+    print(f"\n📊 Fine-tuned DistilBERT results:")
+    for k, v in ft_results.items():
+        print(f"   {k}: {v}")
+    # ── Summary ──────────────────────────────────────────────────────────
+    print("\n" + "="*60)
+    print("SUMMARY — Bordair multimodal eval set (47,644 samples)")
+    print("="*60)
+    print(f"   Zero-shot DistilBERT v2 (61K data):  F1={zs_results.get('eval_f1', '?')}")
+    print(f"   Fine-tuned DistilBERT (bordair 1ep):  F1={ft_results.get('eval_f1', '?')}")
+    print(f"   HRM-Text (bordair, in progress):      F1=TBD (check HF Jobs)")
+if __name__ == "__main__":  # run the benchmark only when executed as a script, not on import
+    main()