mr233
/

TokenHD-8B-Mix

@@ -68,3 +68,63 @@ TokenHD models are evaluated with two metrics:
 - **S_incor**: Token-level F1 on hallucinated (incorrect) responses — measures how precisely the detector localizes errors.
 - **S_cor**: Recall on hallucination-free (correct) responses — measures how rarely the detector raises false alarms.

 - **S_incor**: Token-level F1 on hallucinated (incorrect) responses — measures how precisely the detector localizes errors.
 - **S_cor**: Recall on hallucination-free (correct) responses — measures how rarely the detector raises false alarms.
+---
+## Evaluation
+The script below scores the model on the `tokenhd_eval_math_500.jsonl` file of the [TokenHD eval dataset](https://huggingface.co/datasets/mr233/TokenHD-eval-data) and prints S_incor and S_cor:
+```python
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+import torch
+import numpy as np
+def hard_f1(y_true, y_pred):
+    if max(y_true) == 0:
+        y_true, y_pred = 1 - y_true, 1 - y_pred
+    tp = np.sum((y_pred == 1) & (y_true == 1))
+    fp = np.sum((y_pred == 1) & (y_true == 0))
+    fn = np.sum((y_pred == 0) & (y_true == 1))
+    precision = tp / (tp + fp + 1e-7)
+    recall    = tp / (tp + fn + 1e-7)
+    f1        = 2 * precision * recall / (precision + recall + 1e-7)
+    return precision, recall, f1
+model_id = "mr233/TokenHD-8B-Mix"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForTokenClassification.from_pretrained(
+    model_id, num_labels=1, torch_dtype=torch.bfloat16, device_map="auto"
+)
+model.eval()
+dataset = load_dataset("mr233/TokenHD-eval-data",
+                       data_files="tokenhd_eval_math_500.jsonl", split="train")
+f1_incor, f1_cor = [], []
+for item in dataset:
+    problem, raw_answer = item["problem"], item["raw_answer"]
+    token_weights_gt = np.array(item["token_weights"], dtype=np.float32)
+    gt_hard = (token_weights_gt > 0.5).astype(np.float32)
+    messages = [{"role": "user", "content": problem},
+                {"role": "assistant", "content": raw_answer}]
+    input_ids = tokenizer.apply_chat_template(
+        messages, tokenize=True, add_generation_prompt=False)[:-2]
+    input_tensor = torch.tensor(input_ids, device=model.device).unsqueeze(0)
+    with torch.no_grad():
+        logits = model(input_ids=input_tensor).logits
+    scores = torch.sigmoid(logits.squeeze(-1).squeeze(0))[-len(token_weights_gt):]
+    pred_hard = (scores.float().cpu().numpy() > 0.5).astype(np.float32)
+    _, _, f1 = hard_f1(gt_hard, pred_hard)
+    if item["correctness"] == -1:
+        f1_incor.append(f1)
+    else:
+        f1_cor.append(f1)
+print(f"S_incor (F1 on hallucinated): {np.mean(f1_incor)*100:.2f}")
+print(f"S_cor   (recall on correct):  {np.mean(f1_cor)*100:.2f}")
+```