Spaces:

Ajsaxena
/

deceit1

Paused

Jayant-Kernel committed 13 days ago

Commit

6b64fd2

1 Parent(s): e662a77

evaluate: switch to 0.5B model comparison, 200 episodes

Files changed (1) hide show

evaluate.py CHANGED Viewed

@@ -37,8 +37,8 @@ import matplotlib.pyplot as plt
 login(token=os.environ["HF_TOKEN"])
-BASE_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
-TRAINED_MODEL = "Ajsaxena/deceit-qwen-1.5b-full"
 N_EPISODES = 200
 from deceit_env.server.environment import DeceitEnvironment
@@ -179,14 +179,14 @@ def evaluate_model(model_name, label, n_episodes=200, is_trained=False):
     return results
 # Run evaluations
-base_results = evaluate_model(BASE_MODEL, "Base 1.5B (untrained)", N_EPISODES, is_trained=False)
-trained_results = evaluate_model(TRAINED_MODEL, "DECEIT 1.5B Trained", N_EPISODES, is_trained=True)
 # Calculate percentages
 def pct(val, total):
     """Return ``val`` as a percentage of ``total``, rounded to one decimal place.

     NOTE(review): ``val / total`` raises ``ZeroDivisionError`` for numeric
     inputs when ``total`` is 0 -- confirm callers never pass an empty total.
     """
     return round((val / total) * 100, 1)
-labels = ["Base 1.5B\n(untrained)", "DECEIT 1.5B\nTrained"]
 colors = ["#e74c3c", "#2ecc71"]
 mean_rewards = [
@@ -234,20 +234,20 @@ axes[3].set_title("Abstain Rate %\n(Honest Uncertainty — higher is better)")
 axes[3].set_ylabel("%")
 axes[3].set_ylim(0, 100)
-plt.suptitle("DECEIT: Base 1.5B vs Trained 1.5B Model\n(30 episodes each)", fontsize=13)
 plt.tight_layout()
-plt.savefig("/tmp/comparison_1.5b.png", dpi=150, bbox_inches="tight")
 plt.close()
 print("Chart saved")
 # Upload to HF Hub
 for fname, hf_name in [
-    ("/tmp/comparison_1.5b.png", "comparison_1.5b.png"),
 ]:
     upload_file(
         path_or_fileobj=fname,
         path_in_repo=hf_name,
-        repo_id="Ajsaxena/deceit-qwen-1.5b-full",
         repo_type="model"
     )
     print(f"Uploaded {hf_name} to HF Hub")

 login(token=os.environ["HF_TOKEN"])
+BASE_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
+TRAINED_MODEL = "Ajsaxena/deceit-qwen-0.5b-full"
 N_EPISODES = 200
 from deceit_env.server.environment import DeceitEnvironment
     return results
 # Run evaluations
+base_results = evaluate_model(BASE_MODEL, "Base 0.5B (untrained)", N_EPISODES, is_trained=False)
+trained_results = evaluate_model(TRAINED_MODEL, "DECEIT 0.5B Trained", N_EPISODES, is_trained=True)
 # Calculate percentages
 def pct(val, total):
     """Return ``val`` as a percentage of ``total``, rounded to one decimal place.

     NOTE(review): ``val / total`` raises ``ZeroDivisionError`` for numeric
     inputs when ``total`` is 0 -- confirm callers never pass an empty total.
     """
     return round((val / total) * 100, 1)
+labels = ["Base 0.5B\n(untrained)", "DECEIT 0.5B\nTrained"]
 colors = ["#e74c3c", "#2ecc71"]
 mean_rewards = [
 axes[3].set_ylabel("%")
 axes[3].set_ylim(0, 100)
+plt.suptitle("DECEIT: Base 0.5B vs Trained 0.5B Model\n(200 episodes each)", fontsize=13)
 plt.tight_layout()
+plt.savefig("/tmp/comparison_0.5b_200ep.png", dpi=150, bbox_inches="tight")
 plt.close()
 print("Chart saved")
 # Upload to HF Hub
 for fname, hf_name in [
+    ("/tmp/comparison_0.5b_200ep.png", "comparison_0.5b_200ep.png"),
 ]:
     upload_file(
         path_or_fileobj=fname,
         path_in_repo=hf_name,
+        repo_id="Ajsaxena/deceit-qwen-0.5b-full",
         repo_type="model"
     )
     print(f"Uploaded {hf_name} to HF Hub")