av-codes
/

prompt-injection-hrm-text

@@ -1,202 +0,0 @@
-#!/usr/bin/env python3
-"""Evaluate and fine-tune DistilBERT on Bordair multimodal dataset.
-Two tests:
-1. Zero-shot: existing av-codes/prompt-injection-detector-v2 on bordair eval split
-2. Fine-tune: distilbert-base-uncased trained on bordair train split, 1 epoch
-Uses the same train/eval split as HRM-Text (seed=42, stratified 90/10).
-"""
-import json
-import glob
-import time
-import numpy as np
-import datasets as hf_datasets
-import evaluate
-import torch
-from datasets import Dataset
-from huggingface_hub import snapshot_download
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    Trainer,
-    TrainingArguments,
-    pipeline,
-)
-def load_bordair_multimodal():
-    """Download Bordair/bordair-multimodal and flatten it into one text dataset.
-    Every JSON file matching the sub-directory patterns below is read (except
-    the summary/pool bookkeeping files). Each list entry that is a dict with a
-    non-null ``expected_detection`` becomes one sample: its ``text`` field
-    followed by any non-empty image/document/audio content fields, joined
-    with newlines.
-    Returns:
-        A ``Dataset`` with a ``text`` column and an integer ``label`` column
-        (1 when ``expected_detection`` is truthy, otherwise 0).
-    """
-    import os  # local import: the file header does not import os
-    print("📦 Downloading Bordair/bordair-multimodal...")
-    path = snapshot_download(repo_id="Bordair/bordair-multimodal", repo_type="dataset")
-    print(f"   Downloaded to: {path}")
-    skipped_names = {"summary.json", "_pool.json", "summary_old.json"}
-    content_keys = ("image_content", "document_content", "audio_content")
-    patterns = [
-        "benign/*.json",
-        "payloads/*/*.json",
-        "payloads_v5/*.json",
-        "payloads_v5_external/*/*.json",
-    ]
-    all_samples = []
-    for pattern in patterns:
-        # sorted() keeps sample order deterministic so the seeded split is reproducible.
-        for json_path in sorted(glob.glob(os.path.join(path, pattern))):
-            # basename() instead of split("/") so the skip list also works
-            # when glob returns backslash-separated paths.
-            if os.path.basename(json_path) in skipped_names:
-                continue
-            try:
-                # Explicit UTF-8: the platform default encoding is locale-dependent.
-                with open(json_path, "r", encoding="utf-8") as fh:
-                    data = json.load(fh)
-            except (json.JSONDecodeError, UnicodeDecodeError) as err:
-                # Still best-effort, but no longer silent about dropped files.
-                print(f"   ⚠️  Skipping unreadable file {json_path}: {err}")
-                continue
-            if not isinstance(data, list):
-                continue
-            for item in data:
-                if not isinstance(item, dict) or item.get("expected_detection") is None:
-                    continue
-                # `or ""` also covers an explicit JSON null, which would
-                # otherwise make "\n".join() raise TypeError.
-                text_parts = [item.get("text") or ""]
-                text_parts.extend(item[k] for k in content_keys if item.get(k))
-                all_samples.append({
-                    "text": "\n".join(text_parts),
-                    "label": 1 if item["expected_detection"] else 0,
-                })
-        print(f"   {pattern}: {len(all_samples)} cumulative")
-    ds = Dataset.from_list(all_samples)
-    # Labels are 0/1, so their sum is the injection count (single pass).
-    n_injection = sum(sample["label"] for sample in all_samples)
-    n_safe = len(all_samples) - n_injection
-    print(f"\n✅ Total: {len(ds)} samples ({n_injection} injection, {n_safe} safe)")
-    return ds
-# Metric modules are loaded once and reused: compute_metrics runs on every
-# evaluation pass, and evaluate.load() is too costly to repeat each time.
-_METRIC_NAMES = ("accuracy", "precision", "recall", "f1")
-_METRIC_CACHE = {}
-def compute_metrics(eval_pred):
-    """Compute accuracy, precision, recall and F1 from a (logits, labels) pair.
-    Args:
-        eval_pred: Two-element iterable ``(logits, labels)``; the predicted
-            class is the argmax of ``logits`` over the last axis.
-    Returns:
-        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
-    """
-    for name in _METRIC_NAMES:
-        if name not in _METRIC_CACHE:
-            _METRIC_CACHE[name] = evaluate.load(name)
-    logits, labels = eval_pred
-    preds = np.argmax(logits, axis=-1)
-    # Each metric's result dict is keyed by the metric's own name.
-    return {
-        name: _METRIC_CACHE[name].compute(predictions=preds, references=labels)[name]
-        for name in _METRIC_NAMES
-    }
-def _tokenize_split(tokenizer, dataset):
-    """Tokenize the ``text`` column of *dataset*, truncated/padded to 512 tokens."""
-    def encode(batch):
-        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)
-    return dataset.map(encode, batched=True, batch_size=1000)
-def _run_zero_shot(eval_dataset):
-    """Evaluate av-codes/prompt-injection-detector-v2 as-is; return its metrics dict."""
-    print("\n" + "="*60)
-    print("TEST 1: Zero-shot eval of av-codes/prompt-injection-detector-v2")
-    print("="*60)
-    zs_model_id = "av-codes/prompt-injection-detector-v2"
-    zs_tokenizer = AutoTokenizer.from_pretrained(zs_model_id)
-    zs_model = AutoModelForSequenceClassification.from_pretrained(zs_model_id)
-    zs_args = TrainingArguments(
-        output_dir="/tmp/zs_eval",
-        per_device_eval_batch_size=64,
-        fp16=torch.cuda.is_available(),
-        report_to="none",
-        disable_tqdm=True,
-        use_cpu=not torch.cuda.is_available(),
-        remove_unused_columns=False,
-    )
-    eval_tok = _tokenize_split(zs_tokenizer, eval_dataset)
-    zs_trainer = Trainer(
-        model=zs_model,
-        args=zs_args,
-        eval_dataset=eval_tok,
-        compute_metrics=compute_metrics,
-    )
-    t0 = time.time()
-    zs_results = zs_trainer.evaluate()
-    t1 = time.time()
-    print(f"\n📊 Zero-shot results ({t1-t0:.0f}s):")
-    for k, v in zs_results.items():
-        print(f"   {k}: {v}")
-    # Drop the model references and clear the CUDA cache before the
-    # fine-tuning run allocates its own model.
-    del zs_model, zs_trainer
-    torch.cuda.empty_cache()
-    return zs_results
-def _run_fine_tune(train_dataset, eval_dataset):
-    """Fine-tune distilbert-base-uncased for one epoch; return the final eval metrics dict."""
-    print("\n" + "="*60)
-    print("TEST 2: Fine-tune distilbert-base-uncased on bordair (1 epoch)")
-    print("="*60)
-    ft_model_id = "distilbert-base-uncased"
-    ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_id)
-    ft_model = AutoModelForSequenceClassification.from_pretrained(ft_model_id, num_labels=2)
-    train_tok = _tokenize_split(ft_tokenizer, train_dataset)
-    eval_tok = _tokenize_split(ft_tokenizer, eval_dataset)
-    ft_args = TrainingArguments(
-        output_dir="/tmp/ft_distilbert",
-        learning_rate=2e-5,
-        per_device_train_batch_size=32,
-        per_device_eval_batch_size=64,
-        num_train_epochs=1,
-        weight_decay=0.01,
-        warmup_steps=500,
-        lr_scheduler_type="cosine",
-        eval_strategy="epoch",
-        save_strategy="epoch",
-        load_best_model_at_end=False,
-        logging_strategy="steps",
-        logging_steps=100,
-        logging_first_step=True,
-        disable_tqdm=True,
-        fp16=torch.cuda.is_available(),
-        report_to="none",
-        use_cpu=not torch.cuda.is_available(),
-        dataloader_num_workers=4,
-        seed=42,
-        remove_unused_columns=False,
-    )
-    ft_trainer = Trainer(
-        model=ft_model,
-        args=ft_args,
-        train_dataset=train_tok,
-        eval_dataset=eval_tok,
-        compute_metrics=compute_metrics,
-    )
-    t0 = time.time()
-    ft_trainer.train()
-    t1 = time.time()
-    print(f"\n⏱️  Training time: {t1-t0:.0f}s ({(t1-t0)/3600:.1f}h)")
-    ft_results = ft_trainer.evaluate()
-    print("\n📊 Fine-tuned DistilBERT results:")
-    for k, v in ft_results.items():
-        print(f"   {k}: {v}")
-    return ft_results
-def main():
-    """Run the zero-shot and fine-tuning experiments on one shared split.
-    The dataset is split 90/10 with seed 42, stratified on the label. Both
-    experiments are scored on the same 10% eval split, then an F1 summary
-    is printed.
-    """
-    merged = load_bordair_multimodal()
-    # stratify_by_column needs a ClassLabel feature, hence the cast.
-    merged = merged.cast_column("label", hf_datasets.ClassLabel(names=["safe", "injection"]))
-    split = merged.train_test_split(test_size=0.1, seed=42, stratify_by_column="label")
-    train_dataset = split["train"]
-    eval_dataset = split["test"]
-    print(f"   Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
-    # ── Test 1: Zero-shot eval of existing model ─────────────────────────
-    zs_results = _run_zero_shot(eval_dataset)
-    # ── Test 2: Fine-tune DistilBERT on bordair ──────────────────────────
-    ft_results = _run_fine_tune(train_dataset, eval_dataset)
-    # ── Summary ──────────────────────────────────────────────────────────
-    print("\n" + "="*60)
-    # Report the real eval-set size instead of a hard-coded count that
-    # goes stale whenever the upstream dataset changes.
-    print(f"SUMMARY — Bordair multimodal eval set ({len(eval_dataset):,} samples)")
-    print("="*60)
-    print(f"   Zero-shot DistilBERT v2 (61K data):  F1={zs_results.get('eval_f1', '?')}")
-    print(f"   Fine-tuned DistilBERT (bordair 1ep):  F1={ft_results.get('eval_f1', '?')}")
-    print(f"   HRM-Text (bordair, in progress):      F1=TBD (check HF Jobs)")
-# Call main() only when this file is executed as a script, not when imported.
-if __name__ == "__main__":
-    main()