Jayant-Kernel committed on
Commit ·
6b64fd2
1
Parent(s): e662a77
evaluate: switch to 0.5B model comparison, 200 episodes
Browse files
- evaluate.py +9 -9
evaluate.py
CHANGED
|
@@ -37,8 +37,8 @@ import matplotlib.pyplot as plt
|
|
| 37 |
|
| 38 |
login(token=os.environ["HF_TOKEN"])
|
| 39 |
|
| 40 |
-
BASE_MODEL = "Qwen/Qwen2.5-
|
| 41 |
-
TRAINED_MODEL = "Ajsaxena/deceit-qwen-
|
| 42 |
N_EPISODES = 200
|
| 43 |
|
| 44 |
from deceit_env.server.environment import DeceitEnvironment
|
|
@@ -179,14 +179,14 @@ def evaluate_model(model_name, label, n_episodes=200, is_trained=False):
|
|
| 179 |
return results
|
| 180 |
|
| 181 |
# Run evaluations
|
| 182 |
-
base_results = evaluate_model(BASE_MODEL, "Base
|
| 183 |
-
trained_results = evaluate_model(TRAINED_MODEL, "DECEIT
|
| 184 |
|
| 185 |
# Calculate percentages
|
| 186 |
def pct(val, total):
|
| 187 |
return round((val / total) * 100, 1)
|
| 188 |
|
| 189 |
-
labels = ["Base
|
| 190 |
colors = ["#e74c3c", "#2ecc71"]
|
| 191 |
|
| 192 |
mean_rewards = [
|
|
@@ -234,20 +234,20 @@ axes[3].set_title("Abstain Rate %\n(Honest Uncertainty — higher is better)")
|
|
| 234 |
axes[3].set_ylabel("%")
|
| 235 |
axes[3].set_ylim(0, 100)
|
| 236 |
|
| 237 |
-
plt.suptitle("DECEIT: Base
|
| 238 |
plt.tight_layout()
|
| 239 |
-
plt.savefig("/tmp/
|
| 240 |
plt.close()
|
| 241 |
print("Chart saved")
|
| 242 |
|
| 243 |
# Upload to HF Hub
|
| 244 |
for fname, hf_name in [
|
| 245 |
-
("/tmp/
|
| 246 |
]:
|
| 247 |
upload_file(
|
| 248 |
path_or_fileobj=fname,
|
| 249 |
path_in_repo=hf_name,
|
| 250 |
-
repo_id="Ajsaxena/deceit-qwen-
|
| 251 |
repo_type="model"
|
| 252 |
)
|
| 253 |
print(f"Uploaded {hf_name} to HF Hub")
|
|
|
|
| 37 |
|
| 38 |
login(token=os.environ["HF_TOKEN"])
|
| 39 |
|
| 40 |
+
BASE_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
|
| 41 |
+
TRAINED_MODEL = "Ajsaxena/deceit-qwen-0.5b-full"
|
| 42 |
N_EPISODES = 200
|
| 43 |
|
| 44 |
from deceit_env.server.environment import DeceitEnvironment
|
|
|
|
| 179 |
return results
|
| 180 |
|
| 181 |
# Run evaluations
|
| 182 |
+
base_results = evaluate_model(BASE_MODEL, "Base 0.5B (untrained)", N_EPISODES, is_trained=False)
|
| 183 |
+
trained_results = evaluate_model(TRAINED_MODEL, "DECEIT 0.5B Trained", N_EPISODES, is_trained=True)
|
| 184 |
|
| 185 |
# Calculate percentages
|
| 186 |
def pct(val, total):
|
| 187 |
return round((val / total) * 100, 1)
|
| 188 |
|
| 189 |
+
labels = ["Base 0.5B\n(untrained)", "DECEIT 0.5B\nTrained"]
|
| 190 |
colors = ["#e74c3c", "#2ecc71"]
|
| 191 |
|
| 192 |
mean_rewards = [
|
|
|
|
| 234 |
axes[3].set_ylabel("%")
|
| 235 |
axes[3].set_ylim(0, 100)
|
| 236 |
|
| 237 |
+
plt.suptitle("DECEIT: Base 0.5B vs Trained 0.5B Model\n(200 episodes each)", fontsize=13)
|
| 238 |
plt.tight_layout()
|
| 239 |
+
plt.savefig("/tmp/comparison_0.5b_200ep.png", dpi=150, bbox_inches="tight")
|
| 240 |
plt.close()
|
| 241 |
print("Chart saved")
|
| 242 |
|
| 243 |
# Upload to HF Hub
|
| 244 |
for fname, hf_name in [
|
| 245 |
+
("/tmp/comparison_0.5b_200ep.png", "comparison_0.5b_200ep.png"),
|
| 246 |
]:
|
| 247 |
upload_file(
|
| 248 |
path_or_fileobj=fname,
|
| 249 |
path_in_repo=hf_name,
|
| 250 |
+
repo_id="Ajsaxena/deceit-qwen-0.5b-full",
|
| 251 |
repo_type="model"
|
| 252 |
)
|
| 253 |
print(f"Uploaded {hf_name} to HF Hub")
|