Spaces:

Ajsaxena
/

deceit1

Paused

App Files Files Community

Jayant-Kernel committed 13 days ago

Commit

0bdaeb6

1 Parent(s): 20bb6de

update: compare 0.5B vs 1.5B trained models

Browse files

Files changed (1) hide show

evaluate.py +43 -56

evaluate.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import os, json, re, torch, pathlib, gc
 import threading
 from http.server import HTTPServer, BaseHTTPRequestHandler
@@ -10,24 +10,22 @@ class HealthHandler(BaseHTTPRequestHandler):
     def log_message(self, format, *args):
         pass
-def start_health_server():
-    server = HTTPServer(("0.0.0.0", 7860), HealthHandler)
-    server.serve_forever()
-health_thread = threading.Thread(target=start_health_server, daemon=True)
 health_thread.start()
 print("Health server started on port 7860")
-from unsloth import FastLanguageModel
 from deceit_env.server.environment import DeceitEnvironment
 from deceit_env.server.grader import Grader
 from deceit_env.models import DeceitAction
 import matplotlib.pyplot as plt
 import matplotlib
 matplotlib.use('Agg')
-# Auth
-from huggingface_hub import login
 login(token=os.environ["HF_TOKEN"])
 os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
 os.environ["DECEIT_GRADER_CACHE"] = "/tmp/eval_cache.json"
@@ -56,15 +54,20 @@ def parse_action(text):
 def evaluate_model(model_name, label, n_episodes=30):
     print(f"\nEvaluating: {label}")
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name=model_name,
-        max_seq_length=1024,
-        dtype=None,
         load_in_4bit=True,
     )
-    FastLanguageModel.for_inference(model)
-    # Download dataset from GitHub
     import urllib.request as _ur
     _RAW = "https://raw.githubusercontent.com/Jayant-kernel/DECEIT-the-ai-truth-environment-/main/src/deceit_env/data"
     for _fname in ["level1.jsonl", "level2.jsonl", "level3.jsonl"]:
@@ -119,7 +122,6 @@ def evaluate_model(model_name, label, n_episodes=30):
         if (i+1) % 10 == 0:
             print(f"  {i+1}/{n_episodes} done, mean reward so far: {sum(rewards)/len(rewards):.3f}")
-    # Free model from GPU
     del model
     torch.cuda.empty_cache()
     gc.collect()
@@ -133,85 +135,70 @@ def evaluate_model(model_name, label, n_episodes=30):
         "rewards": rewards,
     }
-# Evaluate both models — 200 episodes each (env samples randomly with replacement)
-base_results = evaluate_model("unsloth/Qwen2.5-0.5B-Instruct", "Base Model (untrained)", n_episodes=200)
-trained_results = evaluate_model("Ajsaxena/deceit-qwen-0.5b-full", "DECEIT Trained", n_episodes=30)
-# Print comparison
 print("\n" + "="*60)
 print("RESULTS COMPARISON")
 print("="*60)
-for r in [base_results, trained_results]:
     print(f"\n{r['label']}:")
     print(f"  Mean Reward:          {r['mean_reward']:+.3f}")
     print(f"  Accuracy:             {r['accuracy']*100:.1f}%")
     print(f"  Confident Wrong Rate: {r['confident_wrong_rate']*100:.1f}%  <- sycophancy proxy")
     print(f"  Abstain Rate:         {r['abstain_rate']*100:.1f}%")
-# Plot 1 — Reward comparison bar chart
 fig, axes = plt.subplots(1, 3, figsize=(14, 5))
-models = [base_results["label"], trained_results["label"]]
 colors = ["#e74c3c", "#2ecc71"]
-# Bar 1 — Mean reward
-axes[0].bar(models, [base_results["mean_reward"], trained_results["mean_reward"]], color=colors)
 axes[0].axhline(y=0, color="gray", linestyle="--", alpha=0.5)
 axes[0].set_title("Mean Episode Reward")
 axes[0].set_ylabel("Reward")
-# Bar 2 — Accuracy
-axes[1].bar(models, [base_results["accuracy"]*100, trained_results["accuracy"]*100], color=colors)
 axes[1].set_title("Answer Accuracy (%)")
 axes[1].set_ylabel("Accuracy %")
 axes[1].set_ylim(0, 100)
-# Bar 3 — Confident wrong rate (sycophancy proxy)
-axes[2].bar(models, [base_results["confident_wrong_rate"]*100, trained_results["confident_wrong_rate"]*100], color=colors)
 axes[2].set_title("Confident Wrong Rate %\n(Sycophancy Proxy - lower is better)")
 axes[2].set_ylabel("%")
 axes[2].set_ylim(0, 100)
-plt.suptitle("DECEIT: Base Model vs Trained Model\n(Qwen 2.5 0.5B, 200 episodes each)", fontsize=13)
 plt.tight_layout()
-plt.savefig("comparison_chart.png", dpi=150, bbox_inches="tight")
 print("\nSaved comparison_chart.png")
 # Plot 2 — Reward distribution
 fig2, ax = plt.subplots(figsize=(10, 5))
-ax.hist(base_results["rewards"], bins=15, alpha=0.6, color="#e74c3c", label="Base Model")
-ax.hist(trained_results["rewards"], bins=15, alpha=0.6, color="#2ecc71", label="DECEIT Trained")
 ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)
 ax.set_xlabel("Episode Reward")
 ax.set_ylabel("Count")
-ax.set_title("Reward Distribution: Base vs Trained")
 ax.legend()
 plt.tight_layout()
-plt.savefig("reward_distribution.png", dpi=150, bbox_inches="tight")
 print("Saved reward_distribution.png")
-print("\nDone! Download comparison_chart.png and reward_distribution.png")
-from huggingface_hub import upload_file
-import time
 try:
-    upload_file(
-        path_or_fileobj="comparison_chart.png",
-        path_in_repo="comparison_chart.png",
-        repo_id="Ajsaxena/deceit-qwen-0.5b-full",
-        repo_type="model"
-    )
-    upload_file(
-        path_or_fileobj="reward_distribution.png",
-        path_in_repo="reward_distribution.png",
-        repo_id="Ajsaxena/deceit-qwen-0.5b-full",
-        repo_type="model"
-    )
-    print("Charts uploaded to HF Hub successfully!")
 except Exception as e:
     print(f"Upload error: {e}")
-print("Keeping alive for 120 seconds...")
-time.sleep(120)
 print("Done.")

+import os, json, re, torch, pathlib, gc, time
 import threading
 from http.server import HTTPServer, BaseHTTPRequestHandler
     def log_message(self, format, *args):
         pass
+health_thread = threading.Thread(
+    target=lambda: HTTPServer(("0.0.0.0", 7860), HealthHandler).serve_forever(),
+    daemon=True
+)
 health_thread.start()
 print("Health server started on port 7860")
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from deceit_env.server.environment import DeceitEnvironment
 from deceit_env.server.grader import Grader
 from deceit_env.models import DeceitAction
 import matplotlib.pyplot as plt
 import matplotlib
 matplotlib.use('Agg')
+from huggingface_hub import login, upload_file
 login(token=os.environ["HF_TOKEN"])
 os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
 os.environ["DECEIT_GRADER_CACHE"] = "/tmp/eval_cache.json"
 def evaluate_model(model_name, label, n_episodes=30):
     print(f"\nEvaluating: {label}")
+    bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        quantization_config=bnb_config,
+        device_map="auto",
+        trust_remote_code=True,
     )
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    tokenizer.pad_token = tokenizer.eos_token
     import urllib.request as _ur
     _RAW = "https://raw.githubusercontent.com/Jayant-kernel/DECEIT-the-ai-truth-environment-/main/src/deceit_env/data"
     for _fname in ["level1.jsonl", "level2.jsonl", "level3.jsonl"]:
         if (i+1) % 10 == 0:
             print(f"  {i+1}/{n_episodes} done, mean reward so far: {sum(rewards)/len(rewards):.3f}")
     del model
     torch.cuda.empty_cache()
     gc.collect()
         "rewards": rewards,
     }
+results_05b = evaluate_model("Ajsaxena/deceit-qwen-0.5b-full", "DECEIT 0.5B Trained", n_episodes=30)
+results_15b = evaluate_model("Ajsaxena/deceit-qwen-1.5b-full", "DECEIT 1.5B Trained", n_episodes=30)
 print("\n" + "="*60)
 print("RESULTS COMPARISON")
 print("="*60)
+for r in [results_05b, results_15b]:
     print(f"\n{r['label']}:")
     print(f"  Mean Reward:          {r['mean_reward']:+.3f}")
     print(f"  Accuracy:             {r['accuracy']*100:.1f}%")
     print(f"  Confident Wrong Rate: {r['confident_wrong_rate']*100:.1f}%  <- sycophancy proxy")
     print(f"  Abstain Rate:         {r['abstain_rate']*100:.1f}%")
+# Plot 1 — Comparison bar chart
 fig, axes = plt.subplots(1, 3, figsize=(14, 5))
+models = [results_05b["label"], results_15b["label"]]
 colors = ["#e74c3c", "#2ecc71"]
+axes[0].bar(models, [results_05b["mean_reward"], results_15b["mean_reward"]], color=colors)
 axes[0].axhline(y=0, color="gray", linestyle="--", alpha=0.5)
 axes[0].set_title("Mean Episode Reward")
 axes[0].set_ylabel("Reward")
+axes[1].bar(models, [results_05b["accuracy"]*100, results_15b["accuracy"]*100], color=colors)
 axes[1].set_title("Answer Accuracy (%)")
 axes[1].set_ylabel("Accuracy %")
 axes[1].set_ylim(0, 100)
+axes[2].bar(models, [results_05b["confident_wrong_rate"]*100, results_15b["confident_wrong_rate"]*100], color=colors)
 axes[2].set_title("Confident Wrong Rate %\n(Sycophancy Proxy - lower is better)")
 axes[2].set_ylabel("%")
 axes[2].set_ylim(0, 100)
+plt.suptitle("DECEIT: 0.5B vs 1.5B Trained Model Comparison", fontsize=13)
 plt.tight_layout()
+plt.savefig("/tmp/comparison_chart.png", dpi=150, bbox_inches="tight")
 print("\nSaved comparison_chart.png")
 # Plot 2 — Reward distribution
 fig2, ax = plt.subplots(figsize=(10, 5))
+ax.hist(results_05b["rewards"], bins=15, alpha=0.6, color="#e74c3c", label="DECEIT 0.5B Trained")
+ax.hist(results_15b["rewards"], bins=15, alpha=0.6, color="#2ecc71", label="DECEIT 1.5B Trained")
 ax.axvline(x=0, color="gray", linestyle="--", alpha=0.5)
 ax.set_xlabel("Episode Reward")
 ax.set_ylabel("Count")
+ax.set_title("Reward Distribution: 0.5B vs 1.5B Trained")
 ax.legend()
 plt.tight_layout()
+plt.savefig("/tmp/reward_distribution.png", dpi=150, bbox_inches="tight")
 print("Saved reward_distribution.png")
 try:
+    for fname in ["comparison_chart.png", "reward_distribution.png"]:
+        upload_file(
+            path_or_fileobj=f"/tmp/{fname}",
+            path_in_repo=fname,
+            repo_id="Ajsaxena/deceit-qwen-1.5b-full",
+            repo_type="model"
+        )
+        print(f"Uploaded {fname} to HF Hub")
+    print("All charts uploaded!")
 except Exception as e:
     print(f"Upload error: {e}")
+print("Keeping alive...")
+time.sleep(3600)
 print("Done.")