mr233
/

TokenHD-8B-Mix

@@ -64,16 +64,7 @@ scores = torch.sigmoid(logits.squeeze(-1).squeeze(0))[-len(response_ids):]
 ## Evaluation
-TokenHD models are evaluated with two metrics:
-- **S_incor**: Token-level F1 on hallucinated (incorrect) responses — measures how precisely the detector localizes errors.
-- **S_cor**: Recall on hallucination-free (correct) responses — measures how rarely the detector raises false alarms.
----
-## Evaluation
-Evaluate using the [TokenHD eval dataset](https://huggingface.co/datasets/mr233/TokenHD-eval-data):
 ```python
 from datasets import load_dataset
@@ -99,32 +90,43 @@ model = AutoModelForTokenClassification.from_pretrained(
 )
 model.eval()
-dataset = load_dataset("mr233/TokenHD-eval-data",
-                       data_files="tokenhd_eval_math_500.jsonl", split="train")
-f1_incor, f1_cor = [], []
-for item in dataset:
-    problem, raw_answer = item["problem"], item["raw_answer"]
-    token_weights_gt = np.array(item["token_weights"], dtype=np.float32)
-    gt_hard = (token_weights_gt > 0.5).astype(np.float32)
-    messages = [{"role": "user", "content": problem},
-                {"role": "assistant", "content": raw_answer}]
-    input_ids = tokenizer.apply_chat_template(
-        messages, tokenize=True, add_generation_prompt=False)[:-2]
-    input_tensor = torch.tensor(input_ids, device=model.device).unsqueeze(0)
-    with torch.no_grad():
-        logits = model(input_ids=input_tensor).logits
-    scores = torch.sigmoid(logits.squeeze(-1).squeeze(0))[-len(token_weights_gt):]
-    pred_hard = (scores.float().cpu().numpy() > 0.5).astype(np.float32)
-    _, _, f1 = hard_f1(gt_hard, pred_hard)
-    if item["correctness"] == -1:
-        f1_incor.append(f1)
-    else:
-        f1_cor.append(f1)
-print(f"S_incor (F1 on hallucinated): {np.mean(f1_incor)*100:.2f}")
-print(f"S_cor   (recall on correct):  {np.mean(f1_cor)*100:.2f}")
 ```

 ## Evaluation
+Use the [TokenHD eval dataset](https://huggingface.co/datasets/mr233/TokenHD-eval-data) to compute, for each benchmark, **S_incor** (token-level F1 on hallucinated responses) and **S_cor** (recall on hallucination-free responses):
 ```python
 from datasets import load_dataset
 )
 model.eval()
+benchmarks = [
+    "tokenhd_eval_math_500",
+    "tokenhd_eval_math_aime",
+    "tokenhd_eval_math_gpqa",
+    "tokenhd_eval_math_fin_qa",
+    "tokenhd_eval_math_olym",
+    "tokenhd_eval_math_olym_phy",
+    "tokenhd_eval_code_codeelo",
+    "tokenhd_eval_code_live_code_lite",
+]
+for bench in benchmarks:
+    dataset = load_dataset("mr233/TokenHD-eval-data",
+                           data_files=f"{bench}.jsonl", split="train")
+    f1_incor, f1_cor = [], []
+    for item in dataset:
+        token_weights_gt = np.array(item["token_weights"], dtype=np.float32)
+        gt_hard = (token_weights_gt > 0.5).astype(np.float32)
+        messages = [{"role": "user",      "content": item["problem"]},
+                    {"role": "assistant", "content": item["raw_answer"]}]
+        input_ids = tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=False)[:-2]
+        input_tensor = torch.tensor(input_ids, device=model.device).unsqueeze(0)
+        with torch.no_grad():
+            logits = model(input_ids=input_tensor).logits
+        scores = torch.sigmoid(logits.squeeze(-1).squeeze(0))[-len(gt_hard):]
+        pred_hard = (scores.float().cpu().numpy() > 0.5).astype(np.float32)
+        _, _, f1 = hard_f1(gt_hard, pred_hard)
+        if item["correctness"] == -1:
+            f1_incor.append(f1)
+        else:
+            f1_cor.append(f1)
+    s_incor = np.mean(f1_incor) * 100 if f1_incor else float("nan")
+    s_cor   = np.mean(f1_cor)   * 100 if f1_cor   else float("nan")
+    print(f"{bench:<44s}  S_incor={s_incor:.2f}  S_cor={s_cor:.2f}")
 ```