Jayant-Kernel committed on
Commit
68e5af2
·
1 Parent(s): e4aea5d

add: evaluate 1.5B base vs trained, upload charts

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -1
  2. evaluate.py +37 -29
Dockerfile CHANGED
@@ -21,4 +21,4 @@ COPY data/ /home/trainer/.local/lib/python3.10/site-packages/deceit_env/data/
21
  COPY data/ /app/data/
22
  COPY train.py .
23
 
24
- CMD ["python", "train.py"]
 
21
  COPY data/ /app/data/
22
  COPY train.py .
23
 
24
+ CMD ["python", "evaluate.py"]
evaluate.py CHANGED
@@ -1,7 +1,13 @@
1
- import os, json, re, torch, pathlib, gc, time
2
  import threading
3
  from http.server import HTTPServer, BaseHTTPRequestHandler
4
 
 
 
 
 
 
 
5
  class HealthHandler(BaseHTTPRequestHandler):
6
  def do_GET(self):
7
  self.send_response(200)
@@ -17,6 +23,7 @@ health_thread = threading.Thread(
17
  health_thread.start()
18
  print("Health server started on port 7860")
19
 
 
20
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
21
  from deceit_env.server.environment import DeceitEnvironment
22
  from deceit_env.server.grader import Grader
@@ -43,14 +50,15 @@ def parse_action(text):
43
  obj = json.loads(text)
44
  if isinstance(obj, dict):
45
  return {
46
- "reasoning": str(obj.get("reasoning","")),
47
- "answer": str(obj.get("answer","")),
48
- "confidence": float(max(0,min(1,obj.get("confidence",0.5)))),
49
- "abstain": bool(obj.get("abstain",False)),
50
  "is_final": True,
51
  }
52
- except: pass
53
- return {"reasoning":"","answer":"","confidence":0.0,"abstain":True,"is_final":True}
 
54
 
55
  def evaluate_model(model_name, label, n_episodes=30):
56
  print(f"\nEvaluating: {label}")
@@ -74,7 +82,7 @@ def evaluate_model(model_name, label, n_episodes=30):
74
  _ur.urlretrieve(f"{_RAW}/{_fname}", f"/tmp/{_fname}")
75
 
76
  grader = Grader(cache_path="/tmp/eval_cache.json",
77
- openai_api_key=os.environ.get("OPENAI_API_KEY",""))
78
  env = DeceitEnvironment(
79
  dataset_path="/tmp/level1.jsonl",
80
  level2_dataset_path="/tmp/level2.jsonl",
@@ -90,14 +98,14 @@ def evaluate_model(model_name, label, n_episodes=30):
90
  for i in range(n_episodes):
91
  obs = env.reset()
92
  msgs = [
93
- {"role":"system","content":SYSTEM_PROMPT},
94
- {"role":"user","content":f"Question: {obs.question}\n\nRespond in JSON."},
95
  ]
96
  prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
97
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
98
  with torch.no_grad():
99
  out = model.generate(**inputs, max_new_tokens=150, do_sample=False,
100
- pad_token_id=tokenizer.eos_token_id)
101
  text = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
102
  parsed = parse_action(text)
103
 
@@ -119,7 +127,7 @@ def evaluate_model(model_name, label, n_episodes=30):
119
  if parsed["abstain"]:
120
  abstain_count += 1
121
 
122
- if (i+1) % 10 == 0:
123
  print(f" {i+1}/{n_episodes} done, mean reward so far: {sum(rewards)/len(rewards):.3f}")
124
 
125
  del model
@@ -128,20 +136,20 @@ def evaluate_model(model_name, label, n_episodes=30):
128
 
129
  return {
130
  "label": label,
131
- "mean_reward": sum(rewards)/len(rewards),
132
- "accuracy": correct_count/n_episodes,
133
- "confident_wrong_rate": confident_wrong/n_episodes,
134
- "abstain_rate": abstain_count/n_episodes,
135
  "rewards": rewards,
136
  }
137
 
138
- results_05b = evaluate_model("Ajsaxena/deceit-qwen-0.5b-full", "DECEIT 0.5B Trained", n_episodes=30)
139
- results_15b = evaluate_model("Ajsaxena/deceit-qwen-1.5b-full", "DECEIT 1.5B Trained", n_episodes=30)
140
 
141
- print("\n" + "="*60)
142
  print("RESULTS COMPARISON")
143
- print("="*60)
144
- for r in [results_05b, results_15b]:
145
  print(f"\n{r['label']}:")
146
  print(f" Mean Reward: {r['mean_reward']:+.3f}")
147
  print(f" Accuracy: {r['accuracy']*100:.1f}%")
@@ -150,37 +158,37 @@ for r in [results_05b, results_15b]:
150
 
151
  # Plot 1 — Comparison bar chart
152
  fig, axes = plt.subplots(1, 3, figsize=(14, 5))
153
- models = [results_05b["label"], results_15b["label"]]
154
  colors = ["#e74c3c", "#2ecc71"]
155
 
156
- axes[0].bar(models, [results_05b["mean_reward"], results_15b["mean_reward"]], color=colors)
157
  axes[0].axhline(y=0, color="gray", linestyle="--", alpha=0.5)
158
  axes[0].set_title("Mean Episode Reward")
159
  axes[0].set_ylabel("Reward")
160
 
161
- axes[1].bar(models, [results_05b["accuracy"]*100, results_15b["accuracy"]*100], color=colors)
162
  axes[1].set_title("Answer Accuracy (%)")
163
  axes[1].set_ylabel("Accuracy %")
164
  axes[1].set_ylim(0, 100)
165
 
166
- axes[2].bar(models, [results_05b["confident_wrong_rate"]*100, results_15b["confident_wrong_rate"]*100], color=colors)
167
  axes[2].set_title("Confident Wrong Rate %\n(Sycophancy Proxy - lower is better)")
168
  axes[2].set_ylabel("%")
169
  axes[2].set_ylim(0, 100)
170
 
171
- plt.suptitle("DECEIT: 0.5B vs 1.5B Trained Model Comparison", fontsize=13)
172
  plt.tight_layout()
173
  plt.savefig("/tmp/comparison_chart.png", dpi=150, bbox_inches="tight")
174
  print("\nSaved comparison_chart.png")
175
 
176
  # Plot 2 — Reward distribution
177
  fig2, ax = plt.subplots(figsize=(10, 5))
178
- ax.hist(results_05b["rewards"], bins=15, alpha=0.6, color="#e74c3c", label="DECEIT 0.5B Trained")
179
- ax.hist(results_15b["rewards"], bins=15, alpha=0.6, color="#2ecc71", label="DECEIT 1.5B Trained")
180
  ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)
181
  ax.set_xlabel("Episode Reward")
182
  ax.set_ylabel("Count")
183
- ax.set_title("Reward Distribution: 0.5B vs 1.5B Trained")
184
  ax.legend()
185
  plt.tight_layout()
186
  plt.savefig("/tmp/reward_distribution.png", dpi=150, bbox_inches="tight")
 
1
+ import os, json, re, gc, time
2
  import threading
3
  from http.server import HTTPServer, BaseHTTPRequestHandler
4
 
5
+ os.environ["HF_HOME"] = "/tmp/huggingface"
6
+ os.environ["HOME"] = "/tmp"
7
+ os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torch_cache"
8
+ os.makedirs("/tmp/huggingface", exist_ok=True)
9
+ os.makedirs("/tmp/torch_cache", exist_ok=True)
10
+
11
  class HealthHandler(BaseHTTPRequestHandler):
12
  def do_GET(self):
13
  self.send_response(200)
 
23
  health_thread.start()
24
  print("Health server started on port 7860")
25
 
26
+ import torch
27
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
28
  from deceit_env.server.environment import DeceitEnvironment
29
  from deceit_env.server.grader import Grader
 
50
  obj = json.loads(text)
51
  if isinstance(obj, dict):
52
  return {
53
+ "reasoning": str(obj.get("reasoning", "")),
54
+ "answer": str(obj.get("answer", "")),
55
+ "confidence": float(max(0, min(1, obj.get("confidence", 0.5)))),
56
+ "abstain": bool(obj.get("abstain", False)),
57
  "is_final": True,
58
  }
59
+ except:
60
+ pass
61
+ return {"reasoning": "", "answer": "", "confidence": 0.0, "abstain": True, "is_final": True}
62
 
63
  def evaluate_model(model_name, label, n_episodes=30):
64
  print(f"\nEvaluating: {label}")
 
82
  _ur.urlretrieve(f"{_RAW}/{_fname}", f"/tmp/{_fname}")
83
 
84
  grader = Grader(cache_path="/tmp/eval_cache.json",
85
+ openai_api_key=os.environ.get("OPENAI_API_KEY", ""))
86
  env = DeceitEnvironment(
87
  dataset_path="/tmp/level1.jsonl",
88
  level2_dataset_path="/tmp/level2.jsonl",
 
98
  for i in range(n_episodes):
99
  obs = env.reset()
100
  msgs = [
101
+ {"role": "system", "content": SYSTEM_PROMPT},
102
+ {"role": "user", "content": f"Question: {obs.question}\n\nRespond in JSON."},
103
  ]
104
  prompt = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
105
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
106
  with torch.no_grad():
107
  out = model.generate(**inputs, max_new_tokens=150, do_sample=False,
108
+ pad_token_id=tokenizer.eos_token_id)
109
  text = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
110
  parsed = parse_action(text)
111
 
 
127
  if parsed["abstain"]:
128
  abstain_count += 1
129
 
130
+ if (i + 1) % 10 == 0:
131
  print(f" {i+1}/{n_episodes} done, mean reward so far: {sum(rewards)/len(rewards):.3f}")
132
 
133
  del model
 
136
 
137
  return {
138
  "label": label,
139
+ "mean_reward": sum(rewards) / len(rewards),
140
+ "accuracy": correct_count / n_episodes,
141
+ "confident_wrong_rate": confident_wrong / n_episodes,
142
+ "abstain_rate": abstain_count / n_episodes,
143
  "rewards": rewards,
144
  }
145
 
146
+ base_results = evaluate_model("Qwen/Qwen2.5-1.5B-Instruct", "Base 1.5B (untrained)", n_episodes=30)
147
+ trained_results = evaluate_model("Ajsaxena/deceit-qwen-1.5b-full", "DECEIT 1.5B Trained", n_episodes=30)
148
 
149
+ print("\n" + "=" * 60)
150
  print("RESULTS COMPARISON")
151
+ print("=" * 60)
152
+ for r in [base_results, trained_results]:
153
  print(f"\n{r['label']}:")
154
  print(f" Mean Reward: {r['mean_reward']:+.3f}")
155
  print(f" Accuracy: {r['accuracy']*100:.1f}%")
 
158
 
159
  # Plot 1 — Comparison bar chart
160
  fig, axes = plt.subplots(1, 3, figsize=(14, 5))
161
+ models = [base_results["label"], trained_results["label"]]
162
  colors = ["#e74c3c", "#2ecc71"]
163
 
164
+ axes[0].bar(models, [base_results["mean_reward"], trained_results["mean_reward"]], color=colors)
165
  axes[0].axhline(y=0, color="gray", linestyle="--", alpha=0.5)
166
  axes[0].set_title("Mean Episode Reward")
167
  axes[0].set_ylabel("Reward")
168
 
169
+ axes[1].bar(models, [base_results["accuracy"] * 100, trained_results["accuracy"] * 100], color=colors)
170
  axes[1].set_title("Answer Accuracy (%)")
171
  axes[1].set_ylabel("Accuracy %")
172
  axes[1].set_ylim(0, 100)
173
 
174
+ axes[2].bar(models, [base_results["confident_wrong_rate"] * 100, trained_results["confident_wrong_rate"] * 100], color=colors)
175
  axes[2].set_title("Confident Wrong Rate %\n(Sycophancy Proxy - lower is better)")
176
  axes[2].set_ylabel("%")
177
  axes[2].set_ylim(0, 100)
178
 
179
+ plt.suptitle("DECEIT: Base 1.5B vs Trained 1.5B Model", fontsize=13)
180
  plt.tight_layout()
181
  plt.savefig("/tmp/comparison_chart.png", dpi=150, bbox_inches="tight")
182
  print("\nSaved comparison_chart.png")
183
 
184
  # Plot 2 — Reward distribution
185
  fig2, ax = plt.subplots(figsize=(10, 5))
186
+ ax.hist(base_results["rewards"], bins=15, alpha=0.6, color="#e74c3c", label="Base 1.5B (untrained)")
187
+ ax.hist(trained_results["rewards"], bins=15, alpha=0.6, color="#2ecc71", label="DECEIT 1.5B Trained")
188
  ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)
189
  ax.set_xlabel("Episode Reward")
190
  ax.set_ylabel("Count")
191
+ ax.set_title("Reward Distribution: Base 1.5B vs DECEIT 1.5B Trained")
192
  ax.legend()
193
  plt.tight_layout()
194
  plt.savefig("/tmp/reward_distribution.png", dpi=150, bbox_inches="tight")