Spaces:

Ajsaxena
/

deceit-results

Sleeping

Jayant-Kernel

initial: DECEIT results Gradio Space

53e9ac9 unverified 13 days ago

2.87 kB

	import gradio as gr
	import matplotlib.pyplot as plt
	import matplotlib
	matplotlib.use('Agg')
	import io
	from PIL import Image

	def show_results():
	    """Render the DECEIT base-vs-trained comparison chart as a PIL image.

	    Draws four side-by-side bar charts (mean episode reward, accuracy,
	    confident-wrong rate, abstain rate) from the values hard-coded below,
	    rasterises the figure to PNG in memory and returns it.

	    Returns:
	        PIL.Image.Image: the rendered chart, read from an in-memory buffer.
	    """
	    models = ['Base Model\n(untrained)', 'DECEIT Trained']
	    colors = ['#e74c3c', '#2ecc71']

	    fig, axes = plt.subplots(1, 4, figsize=(16, 5))
	    try:
	        axes[0].bar(models, [0.137, 0.130], color=colors)
	        # Zero reference line for the reward panel.
	        axes[0].axhline(y=0, color='gray', linestyle='--', alpha=0.5)
	        axes[0].set_title('Mean Episode Reward')
	        axes[0].set_ylabel('Reward')

	        axes[1].bar(models, [50.0, 36.7], color=colors)
	        axes[1].set_title('Accuracy %')
	        axes[1].set_ylabel('%')
	        axes[1].set_ylim(0, 100)

	        axes[2].bar(models, [36.7, 26.7], color=colors)
	        axes[2].set_title('Confident Wrong %\n(Sycophancy - lower is better)')
	        axes[2].set_ylabel('%')
	        axes[2].set_ylim(0, 100)

	        axes[3].bar(models, [10.0, 36.7], color=colors)
	        axes[3].set_title('Abstain Rate %\n(Honest Uncertainty - higher is better)')
	        axes[3].set_ylabel('%')
	        axes[3].set_ylim(0, 100)

	        # Operate on `fig` directly rather than through pyplot's global
	        # "current figure", so overlapping calls cannot save or close
	        # each other's figure.
	        fig.suptitle('DECEIT: Base Model vs Trained Model\n(Qwen 2.5 0.5B, 30 episodes each)', fontsize=13)
	        fig.tight_layout()

	        buf = io.BytesIO()
	        fig.savefig(buf, format='png', dpi=150, bbox_inches='tight')
	    finally:
	        # Always release the figure, even if drawing or saving failed.
	        plt.close(fig)

	    buf.seek(0)
	    return Image.open(buf)

	# Results page: headline text, metrics table, comparison chart and links.
	with gr.Blocks(title="DECEIT Results") as demo:
	    gr.Markdown("# DECEIT — Evaluation Results")
	    gr.Markdown("## Comparing Base Qwen 2.5 0.5B vs DECEIT-Trained Model")
	    gr.Markdown("""
	### Key Finding: Sycophancy reduced by 27%
	The trained model learns to say 'I don't know' instead of confidently hallucinating.
	""")

	    with gr.Row():
	        # Column separators are plain `|`: a backslash-escaped pipe is a
	        # literal character in Markdown, which stops the table from
	        # rendering, and `\|` is an invalid escape in a Python string.
	        gr.Markdown("""
	| Metric | Base Model | DECEIT Trained | Change |
	|--------|-----------|----------------|--------|
	| Mean Reward | +0.137 | +0.130 | similar |
	| Accuracy | 50.0% | 36.7% | ↓ abstains more |
	| Confident Wrong (Sycophancy) | 36.7% | 26.7% | ↓ 27% reduction |
	| Abstain Rate (Honest Uncertainty) | 10.0% | 36.7% | ↑ 267% increase |
	""")

	    chart = gr.Image(label="Comparison Chart")
	    btn = gr.Button("Regenerate Chart", variant="primary")
	    # The chart is drawn on button click and once when the page loads.
	    btn.click(show_results, outputs=chart)
	    demo.load(show_results, outputs=chart)

	    gr.Markdown("""
	### What the results mean
	- Confident Wrong Rate dropped 27% — the model is less sycophantic
	- Abstain Rate increased 267% — the model learned honest uncertainty
	- Accuracy appears lower because abstaining on hard questions is correct behavior

	### Links
	- [HF Space (live env)](https://huggingface.co/spaces/Ajsaxena/DECEIT)
	- [Trained Model](https://huggingface.co/Ajsaxena/deceit-qwen-0.5b-full)
	- [GitHub](https://github.com/Jayant-kernel/DECEIT-the-ai-truth-environment-)
	""")

	demo.launch()