Jayant-Kernel committed on
update: evaluate retrained model, upload charts to HF Hub
Browse files - evaluate.py +44 -1
evaluate.py
CHANGED
|
@@ -1,4 +1,23 @@
|
|
| 1 |
import os, json, re, torch, pathlib, gc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from unsloth import FastLanguageModel
|
| 3 |
from deceit_env.server.environment import DeceitEnvironment
|
| 4 |
from deceit_env.server.grader import Grader
|
|
@@ -116,7 +135,7 @@ def evaluate_model(model_name, label, n_episodes=30):
|
|
| 116 |
|
| 117 |
# Evaluate both models — 200 episodes each (env samples randomly with replacement)
|
| 118 |
base_results = evaluate_model("unsloth/Qwen2.5-0.5B-Instruct", "Base Model (untrained)", n_episodes=200)
|
| 119 |
-
trained_results = evaluate_model("Ajsaxena/deceit-qwen-0.5b-full", "DECEIT Trained", n_episodes=
|
| 120 |
|
| 121 |
# Print comparison
|
| 122 |
print("\n" + "="*60)
|
|
@@ -172,3 +191,27 @@ plt.savefig("reward_distribution.png", dpi=150, bbox_inches="tight")
|
|
| 172 |
print("Saved reward_distribution.png")
|
| 173 |
|
| 174 |
print("\nDone! Download comparison_chart.png and reward_distribution.png")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os, json, re, torch, pathlib, gc
|
| 2 |
+
import threading
|
| 3 |
+
from http.server import HTTPServer, BaseHTTPRequestHandler
|
| 4 |
+
|
| 5 |
+
class HealthHandler(BaseHTTPRequestHandler):
|
| 6 |
+
def do_GET(self):
|
| 7 |
+
self.send_response(200)
|
| 8 |
+
self.end_headers()
|
| 9 |
+
self.wfile.write(b"Evaluation in progress...")
|
| 10 |
+
def log_message(self, format, *args):
|
| 11 |
+
pass
|
| 12 |
+
|
| 13 |
+
def start_health_server():
|
| 14 |
+
server = HTTPServer(("0.0.0.0", 7860), HealthHandler)
|
| 15 |
+
server.serve_forever()
|
| 16 |
+
|
| 17 |
+
health_thread = threading.Thread(target=start_health_server, daemon=True)
|
| 18 |
+
health_thread.start()
|
| 19 |
+
print("Health server started on port 7860")
|
| 20 |
+
|
| 21 |
from unsloth import FastLanguageModel
|
| 22 |
from deceit_env.server.environment import DeceitEnvironment
|
| 23 |
from deceit_env.server.grader import Grader
|
|
|
|
| 135 |
|
| 136 |
# Evaluate both models — 200 episodes each (env samples randomly with replacement)
|
| 137 |
base_results = evaluate_model("unsloth/Qwen2.5-0.5B-Instruct", "Base Model (untrained)", n_episodes=200)
|
| 138 |
+
trained_results = evaluate_model("Ajsaxena/deceit-qwen-0.5b-full", "DECEIT Trained", n_episodes=30)
|
| 139 |
|
| 140 |
# Print comparison
|
| 141 |
print("\n" + "="*60)
|
|
|
|
| 191 |
print("Saved reward_distribution.png")
|
| 192 |
|
| 193 |
print("\nDone! Download comparison_chart.png and reward_distribution.png")
|
| 194 |
+
|
| 195 |
+
from huggingface_hub import upload_file
|
| 196 |
+
import time
|
| 197 |
+
|
| 198 |
+
try:
|
| 199 |
+
upload_file(
|
| 200 |
+
path_or_fileobj="comparison_chart.png",
|
| 201 |
+
path_in_repo="comparison_chart.png",
|
| 202 |
+
repo_id="Ajsaxena/deceit-qwen-0.5b-full",
|
| 203 |
+
repo_type="model"
|
| 204 |
+
)
|
| 205 |
+
upload_file(
|
| 206 |
+
path_or_fileobj="reward_distribution.png",
|
| 207 |
+
path_in_repo="reward_distribution.png",
|
| 208 |
+
repo_id="Ajsaxena/deceit-qwen-0.5b-full",
|
| 209 |
+
repo_type="model"
|
| 210 |
+
)
|
| 211 |
+
print("Charts uploaded to HF Hub successfully!")
|
| 212 |
+
except Exception as e:
|
| 213 |
+
print(f"Upload error: {e}")
|
| 214 |
+
|
| 215 |
+
print("Keeping alive for 120 seconds...")
|
| 216 |
+
time.sleep(120)
|
| 217 |
+
print("Done.")
|