# Spaces: Ajsaxena / deceit1 (Paused)
# File size: 4,061 Bytes

import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
import io, os, threading
from PIL import Image
from http.server import HTTPServer, BaseHTTPRequestHandler
from huggingface_hub import login, upload_file

# Health server
class HealthHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler that answers every GET with a fixed 200 reply.

    The request path is never inspected: any GET receives the same short
    plain-text status body. Per-request logging is silenced.
    """

    # Fixed payload returned for every GET request.
    _BODY = b"Generating charts..."

    def do_GET(self):
        """Send a 200 response carrying the fixed status text."""
        self.send_response(200)
        # Declare the payload type and size so clients do not have to guess
        # the encoding or wait for the connection to close to find the end
        # of the body.
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Content-Length", str(len(self._BODY)))
        self.end_headers()
        self.wfile.write(self._BODY)

    def log_message(self, format, *args):
        """Discard the base class's per-request log line (intentional no-op)."""

def _run_health_server():
    """Bind HealthHandler on 0.0.0.0:7860 and serve requests indefinitely."""
    HTTPServer(("0.0.0.0", 7860), HealthHandler).serve_forever()


# Run the server on a daemon thread so it never blocks the chart generation
# below and does not keep the process alive by itself.
health_thread = threading.Thread(target=_run_health_server, daemon=True)
health_thread.start()
print("Health server started")

# Auth
# The Hub token is read from the HF_TOKEN environment variable; if the
# variable is missing, the lookup raises KeyError before any chart is drawn.
hf_token = os.environ["HF_TOKEN"]
login(token=hf_token)

# Data
# One row per model; the parallel per-metric lists used by the charts are
# derived column-wise from this table so the values of a model stay together.
_RESULTS = [
    # (label, bar color, mean reward, accuracy %, confident-wrong %, abstain %)
    ('Base Model\n(untrained)', '#e74c3c', 0.137, 50.0, 36.7, 10.0),
    ('DECEIT 1.5B Trained', '#2ecc71', 0.130, 36.7, 26.7, 36.7),
]
models, colors, mean_rewards, accuracy, confident_wrong, abstain_rate = (
    list(column) for column in zip(*_RESULTS)
)

# Chart 1 - Comparison bar chart
# Four side-by-side panels, one per metric, each with one bar per model.
fig, axes = plt.subplots(1, 4, figsize=(16, 5))

# Reward panel: free y-range plus a dashed reference line at zero.
reward_ax = axes[0]
reward_ax.bar(models, mean_rewards, color=colors)
reward_ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
reward_ax.set_title('Mean Episode Reward')
reward_ax.set_ylabel('Reward')

# The remaining panels are percentages and share a fixed 0-100 y-range.
percent_panels = (
    (axes[1], accuracy, 'Accuracy %'),
    (axes[2], confident_wrong, 'Confident Wrong %\n(Sycophancy - lower is better)'),
    (axes[3], abstain_rate, 'Abstain Rate %\n(Honest Uncertainty - higher is better)'),
)
for panel, values, title in percent_panels:
    panel.bar(models, values, color=colors)
    panel.set_title(title)
    panel.set_ylabel('%')
    panel.set_ylim(0, 100)

fig.suptitle('DECEIT: Base Model vs Trained Model\n(Qwen 2.5 1.5B, 30 episodes each)', fontsize=13)
fig.tight_layout()
fig.savefig('/tmp/comparison_chart.png', dpi=150, bbox_inches='tight')
# Close the figure once it is on disk so it no longer holds memory.
plt.close(fig)
print("Saved comparison_chart.png")

# Chart 2 - Sycophancy focus
# Single-panel bar chart of the confident-wrong rate per model, with the
# value printed above each bar and an arrow calling out the relative change.
fig2, ax = plt.subplots(figsize=(8, 6))
x = range(len(models))
bars = ax.bar(x, confident_wrong, color=colors, width=0.5)
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylabel('Confident Wrong Rate %')
ax.set_title('Sycophancy Reduction\n(Confident Wrong Rate - lower is better)', fontsize=13)
ax.set_ylim(0, 60)
for bar, val in zip(bars, confident_wrong):
    # Center the label horizontally on the bar, one unit above its top.
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
            f'{val}%', ha='center', va='bottom', fontweight='bold', fontsize=14)

# Derive the call-out from the data instead of repeating the numbers by hand,
# so the arrow target and the quoted percentage cannot drift from the bars.
base_wrong, trained_wrong = confident_wrong[0], confident_wrong[1]
# Relative drop versus the base model; a zero baseline would make the ratio
# undefined, so report 0 in that case.
reduction_pct = (base_wrong - trained_wrong) / base_wrong * 100 if base_wrong else 0.0
ax.annotate(f'{reduction_pct:.0f}% reduction\nin sycophancy',
            xy=(1, trained_wrong), xytext=(0.5, 45),
            arrowprops=dict(arrowstyle='->', color='black'),
            fontsize=12, fontweight='bold', color='green')
plt.tight_layout()
plt.savefig('/tmp/sycophancy_chart.png', dpi=150, bbox_inches='tight')
plt.close()
print("Saved sycophancy_chart.png")

# Chart 3 - Abstain rate (honesty)
# Single-panel bar chart of the abstain rate per model, with the value printed
# above each bar and an arrow calling out the relative change.
fig3, ax = plt.subplots(figsize=(8, 6))
# Compute the bar positions here rather than reusing a variable left over
# from an earlier chart section, so this section works on its own.
x = range(len(models))
bars = ax.bar(x, abstain_rate, color=colors, width=0.5)
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylabel('Abstain Rate %')
ax.set_title('Honest Uncertainty\n(Abstain Rate - higher means more honest)', fontsize=13)
ax.set_ylim(0, 60)
for bar, val in zip(bars, abstain_rate):
    # Center the label horizontally on the bar, one unit above its top.
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
            f'{val}%', ha='center', va='bottom', fontweight='bold', fontsize=14)

# Derive the call-out from the data instead of repeating the numbers by hand,
# so the arrow target and the quoted percentage cannot drift from the bars.
base_abstain, trained_abstain = abstain_rate[0], abstain_rate[1]
# Relative rise versus the base model; a zero baseline would make the ratio
# undefined, so report 0 in that case.
increase_pct = (trained_abstain - base_abstain) / base_abstain * 100 if base_abstain else 0.0
ax.annotate(f'{increase_pct:.0f}% increase\nin honest abstention',
            xy=(1, trained_abstain), xytext=(0.3, 50),
            arrowprops=dict(arrowstyle='->', color='black'),
            fontsize=12, fontweight='bold', color='green')
plt.tight_layout()
plt.savefig('/tmp/honesty_chart.png', dpi=150, bbox_inches='tight')
plt.close()
print("Saved honesty_chart.png")

# Upload all charts to HF Hub
# A single repo id feeds both the upload calls and the final message, so the
# printed location always matches where the files were actually sent.
repo_id = "Ajsaxena/deceit-qwen-1.5b-full"
chart_paths = [
    '/tmp/comparison_chart.png',
    '/tmp/sycophancy_chart.png',
    '/tmp/honesty_chart.png',
]
for filename in chart_paths:
    # NOTE(review): the local absolute path is also used as the path inside
    # the repo, so the files presumably end up under a tmp/ folder rather than
    # at the repo root. Left unchanged because existing links may point
    # there; confirm the intended location.
    upload_file(
        path_or_fileobj=filename,
        path_in_repo=filename,
        repo_id=repo_id,
        repo_type="model",
    )
    print(f"Uploaded {filename} to HF Hub")

print(f"All charts uploaded! Check huggingface.co/{repo_id}")

# Keep the process alive for one hour after the uploads finish. The health
# server runs on a daemon thread, so it stops when this sleep returns and the
# script exits.
import time

time.sleep(60 * 60)