mr233 committed on
Commit
3910817
·
verified ·
1 Parent(s): c60fad1

Upload README.md with huggingface_hub

Browse files
Files changed (1) hide show
  1. README.md +38 -38
README.md CHANGED
@@ -64,16 +64,7 @@ scores = torch.sigmoid(logits.squeeze(-1).squeeze(0))[-len(response_ids):]
64
 
65
  ## Evaluation
66
 
67
- TokenHD models are evaluated with two metrics:
68
-
69
- - **S_incor**: Token-level F1 on hallucinated (incorrect) responses — measures how precisely the detector localizes errors.
70
- - **S_cor**: Recall on hallucination-free (correct) responses — measures how rarely the detector raises false alarms.
71
-
72
- ---
73
-
74
- ## Evaluation
75
-
76
- Evaluate using the [TokenHD eval dataset](https://huggingface.co/datasets/mr233/TokenHD-eval-data):
77
 
78
  ```python
79
  from datasets import load_dataset
@@ -99,32 +90,41 @@ model = AutoModelForTokenClassification.from_pretrained(
99
  )
100
  model.eval()
101
 
102
- dataset = load_dataset("mr233/TokenHD-eval-data",
103
- data_files="tokenhd_eval_math_500.jsonl", split="train")
104
-
105
- f1_incor, f1_cor = [], []
106
- for item in dataset:
107
- problem, raw_answer = item["problem"], item["raw_answer"]
108
- token_weights_gt = np.array(item["token_weights"], dtype=np.float32)
109
- gt_hard = (token_weights_gt > 0.5).astype(np.float32)
110
-
111
- messages = [{"role": "user", "content": problem},
112
- {"role": "assistant", "content": raw_answer}]
113
- input_ids = tokenizer.apply_chat_template(
114
- messages, tokenize=True, add_generation_prompt=False)[:-2]
115
- input_tensor = torch.tensor(input_ids, device=model.device).unsqueeze(0)
116
-
117
- with torch.no_grad():
118
- logits = model(input_ids=input_tensor).logits
119
- scores = torch.sigmoid(logits.squeeze(-1).squeeze(0))[-len(token_weights_gt):]
120
- pred_hard = (scores.float().cpu().numpy() > 0.5).astype(np.float32)
121
-
122
- _, _, f1 = hard_f1(gt_hard, pred_hard)
123
- if item["correctness"] == -1:
124
- f1_incor.append(f1)
125
- else:
126
- f1_cor.append(f1)
127
-
128
- print(f"S_incor (F1 on hallucinated): {np.mean(f1_incor)*100:.2f}")
129
- print(f"S_cor (recall on correct): {np.mean(f1_cor)*100:.2f}")
 
 
 
 
 
 
 
 
 
130
  ```
 
64
 
65
  ## Evaluation
66
 
67
+ Use the [TokenHD eval dataset](https://huggingface.co/datasets/mr233/TokenHD-eval-data) to compute **S_incor** (token F1 on hallucinated samples) and **S_cor** (recall on hallucination-free samples):
 
 
 
 
 
 
 
 
 
68
 
69
  ```python
70
  from datasets import load_dataset
 
90
  )
91
  model.eval()
92
 
93
+ benchmarks = [
94
+ "tokenhd_eval_math_500",
95
+ "tokenhd_eval_math_aime",
96
+ "tokenhd_eval_math_gpqa",
97
+ "tokenhd_eval_math_fin_qa",
98
+ "tokenhd_eval_math_olym",
99
+ "tokenhd_eval_math_olym_phy",
100
+ ]
101
+
102
+ for bench in benchmarks:
103
+ dataset = load_dataset("mr233/TokenHD-eval-data",
104
+ data_files=f"{bench}.jsonl", split="train")
105
+ f1_incor, f1_cor = [], []
106
+ for item in dataset:
107
+ token_weights_gt = np.array(item["token_weights"], dtype=np.float32)
108
+ gt_hard = (token_weights_gt > 0.5).astype(np.float32)
109
+
110
+ messages = [{"role": "user", "content": item["problem"]},
111
+ {"role": "assistant", "content": item["raw_answer"]}]
112
+ input_ids = tokenizer.apply_chat_template(
113
+ messages, tokenize=True, add_generation_prompt=False)[:-2]
114
+ input_tensor = torch.tensor(input_ids, device=model.device).unsqueeze(0)
115
+
116
+ with torch.no_grad():
117
+ logits = model(input_ids=input_tensor).logits
118
+ scores = torch.sigmoid(logits.squeeze(-1).squeeze(0))[-len(gt_hard):]
119
+ pred_hard = (scores.float().cpu().numpy() > 0.5).astype(np.float32)
120
+
121
+ _, _, f1 = hard_f1(gt_hard, pred_hard)
122
+ if item["correctness"] == -1:
123
+ f1_incor.append(f1)
124
+ else:
125
+ f1_cor.append(f1)
126
+
127
+ s_incor = np.mean(f1_incor) * 100 if f1_incor else float("nan")
128
+ s_cor = np.mean(f1_cor) * 100 if f1_cor else float("nan")
129
+ print(f"{bench:<40s} S_incor={s_incor:.2f} S_cor={s_cor:.2f}")
130
  ```