File size: 4,061 Bytes
76117fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
088adaf
76117fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
088adaf
76117fc
d407740
76117fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d407740
76117fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d407740
76117fc
 
 
 
d407740
76117fc
 
 
d407740
76117fc
 
 
 
 
 
 
20bb6de
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
import io, os, threading
from PIL import Image
from http.server import HTTPServer, BaseHTTPRequestHandler
from huggingface_hub import login, upload_file

# Health server
class HealthHandler(BaseHTTPRequestHandler):
    """Minimal request handler that answers every GET with a 200 status message."""

    def do_GET(self):
        """Send a 200 response with no explicit headers and a fixed status body."""
        payload = b"Generating charts..."
        self.send_response(200)
        self.end_headers()
        self.wfile.write(payload)

    def log_message(self, format, *args):
        """Drop per-request log output so health probes do not clutter the console."""
        return None

def _serve_health_forever():
    """Bind an HTTPServer to 0.0.0.0:7860 and serve HealthHandler requests until exit."""
    server = HTTPServer(("0.0.0.0", 7860), HealthHandler)
    server.serve_forever()


# Daemon thread: it never blocks interpreter shutdown once the main script ends.
health_thread = threading.Thread(target=_serve_health_forever, daemon=True)
health_thread.start()
print("Health server started")

# Auth - log in to the Hugging Face Hub with the token taken from the environment
hf_token = os.environ["HF_TOKEN"]  # KeyError here means HF_TOKEN is not set
login(token=hf_token)

# Data
# One row per model:
# (label, bar color, mean reward, accuracy %, confident-wrong %, abstain rate %)
_results = [
    ('Base Model\n(untrained)', '#e74c3c', 0.137, 50.0, 36.7, 10.0),
    ('DECEIT 1.5B Trained', '#2ecc71', 0.130, 36.7, 26.7, 36.7),
]
# Transpose the rows into the per-metric lists consumed by the charts below.
models, colors, mean_rewards, accuracy, confident_wrong, abstain_rate = (
    list(column) for column in zip(*_results)
)

# Chart 1 - Comparison bar chart
# Four side-by-side panels, one metric each, with one bar per model.
fig, axes = plt.subplots(1, 4, figsize=(16, 5))

# First panel: mean reward, auto-scaled y-axis with a dashed zero baseline.
reward_ax = axes[0]
reward_ax.bar(models, mean_rewards, color=colors)
reward_ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
reward_ax.set_title('Mean Episode Reward')
reward_ax.set_ylabel('Reward')

# Remaining panels are percentages drawn on a shared fixed 0-100 scale.
percent_panels = [
    (accuracy, 'Accuracy %'),
    (confident_wrong, 'Confident Wrong %\n(Sycophancy - lower is better)'),
    (abstain_rate, 'Abstain Rate %\n(Honest Uncertainty - higher is better)'),
]
for panel_ax, (panel_values, panel_title) in zip(axes[1:], percent_panels):
    panel_ax.bar(models, panel_values, color=colors)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('%')
    panel_ax.set_ylim(0, 100)

fig.suptitle('DECEIT: Base Model vs Trained Model\n(Qwen 2.5 1.5B, 30 episodes each)', fontsize=13)
fig.tight_layout()
fig.savefig('/tmp/comparison_chart.png', dpi=150, bbox_inches='tight')
plt.close(fig)
print("Saved comparison_chart.png")

# Chart 2 - Sycophancy focus
# Single-panel bar chart of the confident-wrong rate for both models. Each bar is
# labelled with its value and an arrow annotation points at the trained model's bar.
fig2, ax = plt.subplots(figsize=(8, 6))
x = range(len(models))  # integer bar positions; the model names become tick labels
bars = ax.bar(x, confident_wrong, color=colors, width=0.5)
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylabel('Confident Wrong Rate %')
ax.set_title('Sycophancy Reduction\n(Confident Wrong Rate - lower is better)', fontsize=13)
ax.set_ylim(0, 60)
for bar, val in zip(bars, confident_wrong):
    # Value label centred horizontally, one unit above the top of the bar.
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
            f'{val}%', ha='center', va='bottom', fontweight='bold', fontsize=14)
# Derive the headline figure and the arrow target from the data instead of
# hard-coding them, so the annotation cannot drift if the numbers are updated.
base_wrong = confident_wrong[0]
trained_wrong = confident_wrong[1]
reduction_pct = (base_wrong - trained_wrong) / base_wrong * 100
ax.annotate(f'{reduction_pct:.0f}% reduction\nin sycophancy', xy=(1, trained_wrong), xytext=(0.5, 45),
            arrowprops=dict(arrowstyle='->', color='black'),
            fontsize=12, fontweight='bold', color='green')
plt.tight_layout()
plt.savefig('/tmp/sycophancy_chart.png', dpi=150, bbox_inches='tight')
plt.close()
print("Saved sycophancy_chart.png")

# Chart 3 - Abstain rate (honesty)
# Single-panel bar chart of the abstain rate for both models. Each bar is labelled
# with its value and an arrow annotation points at the trained model's bar.
fig3, ax = plt.subplots(figsize=(8, 6))
# Bar positions are defined here so this chart does not depend on a variable
# left behind by the previous chart's code.
x = range(len(models))
bars = ax.bar(x, abstain_rate, color=colors, width=0.5)
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylabel('Abstain Rate %')
ax.set_title('Honest Uncertainty\n(Abstain Rate - higher means more honest)', fontsize=13)
ax.set_ylim(0, 60)
for bar, val in zip(bars, abstain_rate):
    # Value label centred horizontally, one unit above the top of the bar.
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
            f'{val}%', ha='center', va='bottom', fontweight='bold', fontsize=14)
# Derive the headline figure and the arrow target from the data instead of
# hard-coding them, so the annotation cannot drift if the numbers are updated.
base_abstain = abstain_rate[0]
trained_abstain = abstain_rate[1]
increase_pct = (trained_abstain - base_abstain) / base_abstain * 100
ax.annotate(f'{increase_pct:.0f}% increase\nin honest abstention', xy=(1, trained_abstain), xytext=(0.3, 50),
            arrowprops=dict(arrowstyle='->', color='black'),
            fontsize=12, fontweight='bold', color='green')
plt.tight_layout()
plt.savefig('/tmp/honesty_chart.png', dpi=150, bbox_inches='tight')
plt.close()
print("Saved honesty_chart.png")

# Upload all charts to HF Hub
# The repo id is defined once so the upload target and the final message cannot
# disagree (the message previously named a different "0.5b" repository).
HUB_REPO_ID = "Ajsaxena/deceit-qwen-1.5b-full"

for filename in ['/tmp/comparison_chart.png', '/tmp/sycophancy_chart.png', '/tmp/honesty_chart.png']:
    upload_file(
        path_or_fileobj=filename,
        # NOTE(review): the absolute local path ('/tmp/...') is also used as the
        # destination path in the repo; confirm that layout is intended, or pass
        # os.path.basename(filename) to place the charts at the repo root.
        path_in_repo=filename,
        repo_id=HUB_REPO_ID,
        repo_type="model",
    )
    print(f"Uploaded {filename} to HF Hub")

print(f"All charts uploaded! Check huggingface.co/{HUB_REPO_ID}")

# Block for an hour so the process does not exit right after the uploads; the
# health server runs in a daemon thread and would stop together with the process.
import time

KEEP_ALIVE_SECONDS = 3600
time.sleep(KEEP_ALIVE_SECONDS)