Jayant-Kernel committed on
Commit
53e9ac9
·
unverified ·
0 Parent(s):

initial: DECEIT results Gradio Space

Browse files
Files changed (3) hide show
  1. README.md +12 -0
  2. app.py +78 -0
  3. requirements.txt +3 -0
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: DECEIT Results
3
+ emoji: 📊
4
+ colorFrom: red
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+ # DECEIT Evaluation Results
12
+ Interactive charts showing base vs trained model comparison.
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import matplotlib.pyplot as plt
3
+ import matplotlib
4
+ matplotlib.use('Agg')
5
+ import io
6
+ from PIL import Image
7
+
8
+ def show_results():
9
+ models = ['Base Model\n(untrained)', 'DECEIT Trained']
10
+ colors = ['#e74c3c', '#2ecc71']
11
+
12
+ fig, axes = plt.subplots(1, 4, figsize=(16, 5))
13
+
14
+ axes[0].bar(models, [0.137, 0.130], color=colors)
15
+ axes[0].axhline(y=0, color='gray', linestyle='--', alpha=0.5)
16
+ axes[0].set_title('Mean Episode Reward')
17
+ axes[0].set_ylabel('Reward')
18
+
19
+ axes[1].bar(models, [50.0, 36.7], color=colors)
20
+ axes[1].set_title('Accuracy %')
21
+ axes[1].set_ylabel('%')
22
+ axes[1].set_ylim(0, 100)
23
+
24
+ axes[2].bar(models, [36.7, 26.7], color=colors)
25
+ axes[2].set_title('Confident Wrong %\n(Sycophancy - lower is better)')
26
+ axes[2].set_ylabel('%')
27
+ axes[2].set_ylim(0, 100)
28
+
29
+ axes[3].bar(models, [10.0, 36.7], color=colors)
30
+ axes[3].set_title('Abstain Rate %\n(Honest Uncertainty - higher is better)')
31
+ axes[3].set_ylabel('%')
32
+ axes[3].set_ylim(0, 100)
33
+
34
+ plt.suptitle('DECEIT: Base Model vs Trained Model\n(Qwen 2.5 0.5B, 30 episodes each)', fontsize=13)
35
+ plt.tight_layout()
36
+
37
+ buf = io.BytesIO()
38
+ plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
39
+ buf.seek(0)
40
+ plt.close()
41
+ return Image.open(buf)
42
+
43
+ with gr.Blocks(title="DECEIT Results") as demo:
44
+ gr.Markdown("# DECEIT — Evaluation Results")
45
+ gr.Markdown("## Comparing Base Qwen 2.5 0.5B vs DECEIT-Trained Model")
46
+ gr.Markdown("""
47
+ ### Key Finding: Sycophancy reduced by 27%
48
+ The trained model learns to say 'I don't know' instead of confidently hallucinating.
49
+ """)
50
+
51
+ with gr.Row():
52
+ gr.Markdown("""
53
+ | Metric | Base Model | DECEIT Trained | Change |
54
+ |--------|-----------|----------------|--------|
55
+ | Mean Reward | +0.137 | +0.130 | similar |
56
+ | Accuracy | 50.0% | 36.7% | ↓ abstains more |
57
+ | **Confident Wrong (Sycophancy)** | **36.7%** | **26.7%** | **↓ 27% reduction** |
58
+ | **Abstain Rate (Honest Uncertainty)** | **10.0%** | **36.7%** | **↑ 267% increase** |
59
+ """)
60
+
61
+ chart = gr.Image(label="Comparison Chart")
62
+ btn = gr.Button("Regenerate Chart", variant="primary")
63
+ btn.click(show_results, outputs=chart)
64
+ demo.load(show_results, outputs=chart)
65
+
66
+ gr.Markdown("""
67
+ ### What the results mean
68
+ - **Confident Wrong Rate dropped 27%** — the model is less sycophantic
69
+ - **Abstain Rate increased 267%** — the model learned honest uncertainty
70
+ - **Accuracy appears lower** because abstaining on hard questions is correct behavior
71
+
72
+ ### Links
73
+ - [HF Space (live env)](https://huggingface.co/spaces/Ajsaxena/DECEIT)
74
+ - [Trained Model](https://huggingface.co/Ajsaxena/deceit-qwen-0.5b-full)
75
+ - [GitHub](https://github.com/Jayant-kernel/DECEIT-the-ai-truth-environment-)
76
+ """)
77
+
78
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ matplotlib
3
+ Pillow