Jayant-Kernel committed on
Commit ·
0bdaeb6
1
Parent(s): 20bb6de
update: compare 0.5B vs 1.5B trained models
Browse files - evaluate.py +43 -56
evaluate.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import os, json, re, torch, pathlib, gc
|
| 2 |
import threading
|
| 3 |
from http.server import HTTPServer, BaseHTTPRequestHandler
|
| 4 |
|
|
@@ -10,24 +10,22 @@ class HealthHandler(BaseHTTPRequestHandler):
|
|
| 10 |
def log_message(self, format, *args):
|
| 11 |
pass
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
health_thread = threading.Thread(target=start_health_server, daemon=True)
|
| 18 |
health_thread.start()
|
| 19 |
print("Health server started on port 7860")
|
| 20 |
|
| 21 |
-
from
|
| 22 |
from deceit_env.server.environment import DeceitEnvironment
|
| 23 |
from deceit_env.server.grader import Grader
|
| 24 |
from deceit_env.models import DeceitAction
|
| 25 |
import matplotlib.pyplot as plt
|
| 26 |
import matplotlib
|
| 27 |
matplotlib.use('Agg')
|
|
|
|
| 28 |
|
| 29 |
-
# Auth
|
| 30 |
-
from huggingface_hub import login
|
| 31 |
login(token=os.environ["HF_TOKEN"])
|
| 32 |
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
|
| 33 |
os.environ["DECEIT_GRADER_CACHE"] = "/tmp/eval_cache.json"
|
|
@@ -56,15 +54,20 @@ def parse_action(text):
|
|
| 56 |
|
| 57 |
def evaluate_model(model_name, label, n_episodes=30):
|
| 58 |
print(f"\nEvaluating: {label}")
|
| 59 |
-
|
| 60 |
-
model_name=model_name,
|
| 61 |
-
max_seq_length=1024,
|
| 62 |
-
dtype=None,
|
| 63 |
load_in_4bit=True,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
)
|
| 65 |
-
|
|
|
|
| 66 |
|
| 67 |
-
# Download dataset from GitHub
|
| 68 |
import urllib.request as _ur
|
| 69 |
_RAW = "https://raw.githubusercontent.com/Jayant-kernel/DECEIT-the-ai-truth-environment-/main/src/deceit_env/data"
|
| 70 |
for _fname in ["level1.jsonl", "level2.jsonl", "level3.jsonl"]:
|
|
@@ -119,7 +122,6 @@ def evaluate_model(model_name, label, n_episodes=30):
|
|
| 119 |
if (i+1) % 10 == 0:
|
| 120 |
print(f" {i+1}/{n_episodes} done, mean reward so far: {sum(rewards)/len(rewards):.3f}")
|
| 121 |
|
| 122 |
-
# Free model from GPU
|
| 123 |
del model
|
| 124 |
torch.cuda.empty_cache()
|
| 125 |
gc.collect()
|
|
@@ -133,85 +135,70 @@ def evaluate_model(model_name, label, n_episodes=30):
|
|
| 133 |
"rewards": rewards,
|
| 134 |
}
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
trained_results = evaluate_model("Ajsaxena/deceit-qwen-0.5b-full", "DECEIT Trained", n_episodes=30)
|
| 139 |
|
| 140 |
-
# Print comparison
|
| 141 |
print("\n" + "="*60)
|
| 142 |
print("RESULTS COMPARISON")
|
| 143 |
print("="*60)
|
| 144 |
-
for r in [base_results, trained_results]:
|
| 145 |
print(f"\n{r['label']}:")
|
| 146 |
print(f" Mean Reward: {r['mean_reward']:+.3f}")
|
| 147 |
print(f" Accuracy: {r['accuracy']*100:.1f}%")
|
| 148 |
print(f" Confident Wrong Rate: {r['confident_wrong_rate']*100:.1f}% <- sycophancy proxy")
|
| 149 |
print(f" Abstain Rate: {r['abstain_rate']*100:.1f}%")
|
| 150 |
|
| 151 |
-
# Plot 1 —
|
| 152 |
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
|
| 153 |
-
|
| 154 |
-
models = [base_results["label"], trained_results["label"]]
|
| 155 |
colors = ["#e74c3c", "#2ecc71"]
|
| 156 |
|
| 157 |
-
|
| 158 |
-
axes[0].bar(models, [base_results["mean_reward"], trained_results["mean_reward"]], color=colors)
|
| 159 |
axes[0].axhline(y=0, color="gray", linestyle="--", alpha=0.5)
|
| 160 |
axes[0].set_title("Mean Episode Reward")
|
| 161 |
axes[0].set_ylabel("Reward")
|
| 162 |
|
| 163 |
-
|
| 164 |
-
axes[1].bar(models, [base_results["accuracy"]*100, trained_results["accuracy"]*100], color=colors)
|
| 165 |
axes[1].set_title("Answer Accuracy (%)")
|
| 166 |
axes[1].set_ylabel("Accuracy %")
|
| 167 |
axes[1].set_ylim(0, 100)
|
| 168 |
|
| 169 |
-
|
| 170 |
-
axes[2].bar(models, [base_results["confident_wrong_rate"]*100, trained_results["confident_wrong_rate"]*100], color=colors)
|
| 171 |
axes[2].set_title("Confident Wrong Rate %\n(Sycophancy Proxy - lower is better)")
|
| 172 |
axes[2].set_ylabel("%")
|
| 173 |
axes[2].set_ylim(0, 100)
|
| 174 |
|
| 175 |
-
plt.suptitle("DECEIT:
|
| 176 |
plt.tight_layout()
|
| 177 |
-
plt.savefig("comparison_chart.png", dpi=150, bbox_inches="tight")
|
| 178 |
print("\nSaved comparison_chart.png")
|
| 179 |
|
| 180 |
# Plot 2 — Reward distribution
|
| 181 |
fig2, ax = plt.subplots(figsize=(10, 5))
|
| 182 |
-
ax.hist(
|
| 183 |
-
ax.hist(
|
| 184 |
ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)
|
| 185 |
ax.set_xlabel("Episode Reward")
|
| 186 |
ax.set_ylabel("Count")
|
| 187 |
-
ax.set_title("Reward Distribution:
|
| 188 |
ax.legend()
|
| 189 |
plt.tight_layout()
|
| 190 |
-
plt.savefig("reward_distribution.png", dpi=150, bbox_inches="tight")
|
| 191 |
print("Saved reward_distribution.png")
|
| 192 |
|
| 193 |
-
print("\nDone! Download comparison_chart.png and reward_distribution.png")
|
| 194 |
-
|
| 195 |
-
from huggingface_hub import upload_file
|
| 196 |
-
import time
|
| 197 |
-
|
| 198 |
try:
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
repo_id="Ajsaxena/deceit-qwen-0.5b-full",
|
| 209 |
-
repo_type="model"
|
| 210 |
-
)
|
| 211 |
-
print("Charts uploaded to HF Hub successfully!")
|
| 212 |
except Exception as e:
|
| 213 |
print(f"Upload error: {e}")
|
| 214 |
|
| 215 |
-
print("Keeping alive
|
| 216 |
-
time.sleep(
|
| 217 |
print("Done.")
|
|
|
|
| 1 |
+
import os, json, re, torch, pathlib, gc, time
|
| 2 |
import threading
|
| 3 |
from http.server import HTTPServer, BaseHTTPRequestHandler
|
| 4 |
|
|
|
|
| 10 |
def log_message(self, format, *args):
|
| 11 |
pass
|
| 12 |
|
| 13 |
+
health_thread = threading.Thread(
|
| 14 |
+
target=lambda: HTTPServer(("0.0.0.0", 7860), HealthHandler).serve_forever(),
|
| 15 |
+
daemon=True
|
| 16 |
+
)
|
|
|
|
| 17 |
health_thread.start()
|
| 18 |
print("Health server started on port 7860")
|
| 19 |
|
| 20 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 21 |
from deceit_env.server.environment import DeceitEnvironment
|
| 22 |
from deceit_env.server.grader import Grader
|
| 23 |
from deceit_env.models import DeceitAction
|
| 24 |
import matplotlib.pyplot as plt
|
| 25 |
import matplotlib
|
| 26 |
matplotlib.use('Agg')
|
| 27 |
+
from huggingface_hub import login, upload_file
|
| 28 |
|
|
|
|
|
|
|
| 29 |
login(token=os.environ["HF_TOKEN"])
|
| 30 |
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
|
| 31 |
os.environ["DECEIT_GRADER_CACHE"] = "/tmp/eval_cache.json"
|
|
|
|
| 54 |
|
| 55 |
def evaluate_model(model_name, label, n_episodes=30):
|
| 56 |
print(f"\nEvaluating: {label}")
|
| 57 |
+
bnb_config = BitsAndBytesConfig(
|
|
|
|
|
|
|
|
|
|
| 58 |
load_in_4bit=True,
|
| 59 |
+
bnb_4bit_quant_type="nf4",
|
| 60 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 61 |
+
)
|
| 62 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 63 |
+
model_name,
|
| 64 |
+
quantization_config=bnb_config,
|
| 65 |
+
device_map="auto",
|
| 66 |
+
trust_remote_code=True,
|
| 67 |
)
|
| 68 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
| 69 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 70 |
|
|
|
|
| 71 |
import urllib.request as _ur
|
| 72 |
_RAW = "https://raw.githubusercontent.com/Jayant-kernel/DECEIT-the-ai-truth-environment-/main/src/deceit_env/data"
|
| 73 |
for _fname in ["level1.jsonl", "level2.jsonl", "level3.jsonl"]:
|
|
|
|
| 122 |
if (i+1) % 10 == 0:
|
| 123 |
print(f" {i+1}/{n_episodes} done, mean reward so far: {sum(rewards)/len(rewards):.3f}")
|
| 124 |
|
|
|
|
| 125 |
del model
|
| 126 |
torch.cuda.empty_cache()
|
| 127 |
gc.collect()
|
|
|
|
| 135 |
"rewards": rewards,
|
| 136 |
}
|
| 137 |
|
| 138 |
+
results_05b = evaluate_model("Ajsaxena/deceit-qwen-0.5b-full", "DECEIT 0.5B Trained", n_episodes=30)
|
| 139 |
+
results_15b = evaluate_model("Ajsaxena/deceit-qwen-1.5b-full", "DECEIT 1.5B Trained", n_episodes=30)
|
|
|
|
| 140 |
|
|
|
|
| 141 |
print("\n" + "="*60)
|
| 142 |
print("RESULTS COMPARISON")
|
| 143 |
print("="*60)
|
| 144 |
+
for r in [results_05b, results_15b]:
|
| 145 |
print(f"\n{r['label']}:")
|
| 146 |
print(f" Mean Reward: {r['mean_reward']:+.3f}")
|
| 147 |
print(f" Accuracy: {r['accuracy']*100:.1f}%")
|
| 148 |
print(f" Confident Wrong Rate: {r['confident_wrong_rate']*100:.1f}% <- sycophancy proxy")
|
| 149 |
print(f" Abstain Rate: {r['abstain_rate']*100:.1f}%")
|
| 150 |
|
| 151 |
+
# Plot 1 — Comparison bar chart
|
| 152 |
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
|
| 153 |
+
models = [results_05b["label"], results_15b["label"]]
|
|
|
|
| 154 |
colors = ["#e74c3c", "#2ecc71"]
|
| 155 |
|
| 156 |
+
axes[0].bar(models, [results_05b["mean_reward"], results_15b["mean_reward"]], color=colors)
|
|
|
|
| 157 |
axes[0].axhline(y=0, color="gray", linestyle="--", alpha=0.5)
|
| 158 |
axes[0].set_title("Mean Episode Reward")
|
| 159 |
axes[0].set_ylabel("Reward")
|
| 160 |
|
| 161 |
+
axes[1].bar(models, [results_05b["accuracy"]*100, results_15b["accuracy"]*100], color=colors)
|
|
|
|
| 162 |
axes[1].set_title("Answer Accuracy (%)")
|
| 163 |
axes[1].set_ylabel("Accuracy %")
|
| 164 |
axes[1].set_ylim(0, 100)
|
| 165 |
|
| 166 |
+
axes[2].bar(models, [results_05b["confident_wrong_rate"]*100, results_15b["confident_wrong_rate"]*100], color=colors)
|
|
|
|
| 167 |
axes[2].set_title("Confident Wrong Rate %\n(Sycophancy Proxy - lower is better)")
|
| 168 |
axes[2].set_ylabel("%")
|
| 169 |
axes[2].set_ylim(0, 100)
|
| 170 |
|
| 171 |
+
plt.suptitle("DECEIT: 0.5B vs 1.5B Trained Model Comparison", fontsize=13)
|
| 172 |
plt.tight_layout()
|
| 173 |
+
plt.savefig("/tmp/comparison_chart.png", dpi=150, bbox_inches="tight")
|
| 174 |
print("\nSaved comparison_chart.png")
|
| 175 |
|
| 176 |
# Plot 2 — Reward distribution
|
| 177 |
fig2, ax = plt.subplots(figsize=(10, 5))
|
| 178 |
+
ax.hist(results_05b["rewards"], bins=15, alpha=0.6, color="#e74c3c", label="DECEIT 0.5B Trained")
|
| 179 |
+
ax.hist(results_15b["rewards"], bins=15, alpha=0.6, color="#2ecc71", label="DECEIT 1.5B Trained")
|
| 180 |
ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)
|
| 181 |
ax.set_xlabel("Episode Reward")
|
| 182 |
ax.set_ylabel("Count")
|
| 183 |
+
ax.set_title("Reward Distribution: 0.5B vs 1.5B Trained")
|
| 184 |
ax.legend()
|
| 185 |
plt.tight_layout()
|
| 186 |
+
plt.savefig("/tmp/reward_distribution.png", dpi=150, bbox_inches="tight")
|
| 187 |
print("Saved reward_distribution.png")
|
| 188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
try:
|
| 190 |
+
for fname in ["comparison_chart.png", "reward_distribution.png"]:
|
| 191 |
+
upload_file(
|
| 192 |
+
path_or_fileobj=f"/tmp/{fname}",
|
| 193 |
+
path_in_repo=fname,
|
| 194 |
+
repo_id="Ajsaxena/deceit-qwen-1.5b-full",
|
| 195 |
+
repo_type="model"
|
| 196 |
+
)
|
| 197 |
+
print(f"Uploaded {fname} to HF Hub")
|
| 198 |
+
print("All charts uploaded!")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
except Exception as e:
|
| 200 |
print(f"Upload error: {e}")
|
| 201 |
|
| 202 |
+
print("Keeping alive...")
|
| 203 |
+
time.sleep(3600)
|
| 204 |
print("Done.")
|