File size: 2,865 Bytes
53e9ac9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import gradio as gr
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
import io
from PIL import Image

def show_results():
    """Render the base-vs-trained comparison chart and return it as an image.

    Draws four side-by-side bar charts (mean episode reward, accuracy,
    confident-wrong rate, abstain rate) from the hard-coded evaluation
    numbers below, rasterizes the figure to an in-memory PNG and opens it
    with PIL.

    Returns:
        PIL.Image.Image: the rendered chart, read from an in-memory buffer.
    """
    models = ['Base Model\n(untrained)', 'DECEIT Trained']
    colors = ['#e74c3c', '#2ecc71']

    # One entry per subplot: (title, y-axis label, [base, trained], is_percent).
    panels = [
        ('Mean Episode Reward', 'Reward', [0.137, 0.130], False),
        ('Accuracy %', '%', [50.0, 36.7], True),
        ('Confident Wrong %\n(Sycophancy - lower is better)', '%',
         [36.7, 26.7], True),
        ('Abstain Rate %\n(Honest Uncertainty - higher is better)', '%',
         [10.0, 36.7], True),
    ]

    fig, axes = plt.subplots(1, 4, figsize=(16, 5))
    try:
        for ax, (title, ylabel, values, is_percent) in zip(axes, panels):
            ax.bar(models, values, color=colors)
            if is_percent:
                # Percentages share a fixed 0-100 scale so panels are comparable.
                ax.set_ylim(0, 100)
            else:
                # Reward can be negative, so mark the zero baseline instead.
                ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
            ax.set_title(title)
            ax.set_ylabel(ylabel)

        # Operate on `fig` explicitly rather than pyplot's implicit "current
        # figure": this function is bound to several UI events, and overlapping
        # calls would otherwise be able to title/save/close each other's figure.
        fig.suptitle('DECEIT: Base Model vs Trained Model\n(Qwen 2.5 0.5B, 30 episodes each)', fontsize=13)
        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)

# Static results page: headline text, a metrics table, the comparison chart
# (rendered by show_results) and explanatory notes with project links.
with gr.Blocks(title="DECEIT Results") as demo:
    gr.Markdown("# DECEIT — Evaluation Results")
    gr.Markdown("## Comparing Base Qwen 2.5 0.5B vs DECEIT-Trained Model")
    gr.Markdown("""
    ### Key Finding: Sycophancy reduced by 27%
    The trained model learns to say 'I don't know' instead of confidently hallucinating.
    """)

    with gr.Row():
        gr.Markdown("""
        | Metric | Base Model | DECEIT Trained | Change |
        |--------|-----------|----------------|--------|
        | Mean Reward | +0.137 | +0.130 | similar |
        | Accuracy | 50.0% | 36.7% | ↓ abstains more |
        | **Confident Wrong (Sycophancy)** | **36.7%** | **26.7%** | **↓ 27% reduction** |
        | **Abstain Rate (Honest Uncertainty)** | **10.0%** | **36.7%** | **↑ 267% increase** |
        """)

    chart = gr.Image(label="Comparison Chart")
    btn = gr.Button("Regenerate Chart", variant="primary")
    # The same renderer feeds the image on button click and on page load.
    btn.click(show_results, outputs=chart)
    demo.load(show_results, outputs=chart)

    gr.Markdown("""
    ### What the results mean
    - **Confident Wrong Rate dropped 27%** — the model is less sycophantic
    - **Abstain Rate increased 267%** — the model learned honest uncertainty
    - **Accuracy appears lower** because abstaining on hard questions is correct behavior

    ### Links
    - [HF Space (live env)](https://huggingface.co/spaces/Ajsaxena/DECEIT)
    - [Trained Model](https://huggingface.co/Ajsaxena/deceit-qwen-0.5b-full)
    - [GitHub](https://github.com/Jayant-kernel/DECEIT-the-ai-truth-environment-)
    """)

demo.launch()