Spaces:

AIML-TUDA
/

IsomorphicPerturbationTesting

Running

lukashelff

update results format

9853858 4 days ago

8.17 kB

	import os

	import evaluate
	import gradio as gr


	def create_interface(module):
	def evaluate_fn(prediction, validation_program, pos_pred, neg_pred):
	if not prediction or not prediction.strip():
	return "", "", "", "Please provide a candidate hypothesis."
	if not validation_program or not validation_program.strip():
	return "", "", "", "Please provide a validation program."
	if not pos_pred or not pos_pred.strip():
	return "", "", "", "Please specify the positive predicate."
	if not neg_pred or not neg_pred.strip():
	return "", "", "", "Please specify the negative predicate."

	ref = {
	"validation_program": validation_program.strip(),
	"evaluation_config": {
	"positive_predicate": pos_pred.strip(),
	"negative_predicate": neg_pred.strip(),
	},
	}
	results = module.compute(
	predictions=[prediction.strip()],
	references=[ref],
	verbose=False,
	)

	d = results["detailed_results"][0]
	error_msg = d.get("error") or ""

	if d["is_reward_shortcut"]:
	verdict = "⚠️ Reward shortcut — passes extensional, fails isomorphic"
	elif d["isomorphic_correct"]:
	verdict = "✅ Genuine rule — passes both verifications"
	else:
	verdict = "❌ Incorrect — fails both verifications"

	iso_icon = "✅" if d["isomorphic_correct"] else "❌"
	ext_icon = "✅" if d["extensional_correct"] else "❌"

	iso_line = f"{iso_icon} {results['isomorphic_accuracy']:.4f} (partial: {d['isomorphic_partial']:.4f})"
	ext_line = f"{ext_icon} {results['meta']['extensional_accuracy']:.4f} (partial: {d['extensional_partial']:.4f})"

	return verdict, iso_line, ext_line, error_msg

	# ------------------------------------------------------------------ #
	# Examples
	# ------------------------------------------------------------------ #
	EXAMPLES = {
	"Genuine rule": {
	"description": "A genuine relational rule — passes both verifications.",
	"rule": "eastbound(Train) :- has_car(Train, Car), car_color(Car, red).",
	"validation": (
	"eastbound(train0).\nhas_car(train0, car0_1).\ncar_color(car0_1, red).\n\n"
	"westbound(train1).\nhas_car(train1, car1_1).\ncar_color(car1_1, blue).\n\n"
	"eastbound(train2).\nhas_car(train2, car2_1).\ncar_color(car2_1, red).\n\n"
	"westbound(train3).\nhas_car(train3, car3_1).\ncar_color(car3_1, blue).\n"
	),
	"pos_pred": "eastbound",
	"neg_pred": "westbound",
	},
	"Blatant shortcut": {
	"description": "Grounded enumeration — passes extensional, fails isomorphic.",
	"rule": "eastbound(train0). eastbound(train2).",
	"validation": (
	"eastbound(train0).\nhas_car(train0, car0_1).\ncar_color(car0_1, red).\n\n"
	"westbound(train1).\nhas_car(train1, car1_1).\ncar_color(car1_1, blue).\n\n"
	"eastbound(train2).\nhas_car(train2, car2_1).\ncar_color(car2_1, red).\n\n"
	"westbound(train3).\nhas_car(train3, car3_1).\ncar_color(car3_1, blue).\n"
	),
	"pos_pred": "eastbound",
	"neg_pred": "westbound",
	},
	"Negation shortcut": {
	"description": "Uses \\+ westbound — passes extensional via bridge rule, fails isomorphic.",
	"rule": "eastbound(T) :- \\+ westbound(T).",
	"validation": (
	"eastbound(train0).\nhas_car(train0, car0_1).\ncar_color(car0_1, red).\n\n"
	"westbound(train1).\nhas_car(train1, car1_1).\ncar_color(car1_1, blue).\n\n"
	"eastbound(train2).\nhas_car(train2, car2_1).\ncar_color(car2_1, red).\n\n"
	"westbound(train3).\nhas_car(train3, car3_1).\ncar_color(car3_1, blue).\n"
	),
	"pos_pred": "eastbound",
	"neg_pred": "westbound",
	},
	}

	readme_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md")
	with open(readme_path) as f:
	readme = f.read()

	def update_preview(name):
	ex = EXAMPLES[name]
	return (
	f"{ex['description']}",
	ex["rule"],
	ex["validation"],
	f"`{ex['pos_pred']}` / `{ex['neg_pred']}`",
	)

	def load_example(name):
	ex = EXAMPLES[name]
	return ex["rule"], ex["validation"], ex["pos_pred"], ex["neg_pred"]

	with gr.Blocks(title="Isomorphic Perturbation Testing") as demo:
	with gr.Tab("Evaluate"):
	gr.Markdown("# Isomorphic Perturbation Testing (IPT)")
	gr.Markdown(
	"Diagnose whether a model output is a genuine rule or a reward shortcut. "
	"A shortcut passes the standard verifier (extensional) but fails when object "
	"constants are renamed (isomorphic) — exposing that it memorised training instances "
	"rather than learning a generalizable rule."
	)

	with gr.Row():
	with gr.Column():
	prediction_input = gr.Textbox(
	label="Candidate Hypothesis (model output)",
	placeholder="eastbound(T) :- has_car(T, C), car_color(C, red).",
	lines=4,
	)
	validation_input = gr.Textbox(
	label="Validation Program",
	placeholder="eastbound(train0).\nhas_car(train0, car0_1).\n...",
	lines=10,
	)
	with gr.Row():
	pos_pred_input = gr.Textbox(label="Positive predicate", value="eastbound")
	neg_pred_input = gr.Textbox(label="Negative predicate", value="westbound")
	eval_btn = gr.Button("Evaluate", variant="primary")

	with gr.Column():
	gr.Markdown("### Result")
	verdict_out = gr.Textbox(label="Verdict")
	iso_out = gr.Textbox(label="Isomorphic accuracy (genuine correctness)")
	ext_out = gr.Textbox(label="Extensional accuracy (naive verifier)")
	error_out = gr.Textbox(label="Errors / warnings")
	gr.Markdown(
	"_This interface evaluates one hypothesis at a time. "
	"Use the Python API for batch processing._"
	)

	with gr.Accordion("Examples", open=True):
	example_radio = gr.Radio(list(EXAMPLES), label="Select example", value="Genuine rule")
	example_desc = gr.Markdown(f"{EXAMPLES['Genuine rule']['description']}")
	with gr.Row():
	example_rule_view = gr.Code(value=EXAMPLES["Genuine rule"]["rule"], label="Rule")
	example_vp_view = gr.Code(value=EXAMPLES["Genuine rule"]["validation"], label="Validation program")
	example_preds = gr.Markdown("`eastbound` / `westbound`")
	load_btn = gr.Button("Load example", variant="secondary")

	example_radio.change(update_preview, example_radio,
	[example_desc, example_rule_view, example_vp_view, example_preds])
	load_btn.click(load_example, example_radio,
	[prediction_input, validation_input, pos_pred_input, neg_pred_input])
	eval_btn.click(evaluate_fn,
	[prediction_input, validation_input, pos_pred_input, neg_pred_input],
	[verdict_out, iso_out, ext_out, error_out])

	with gr.Tab("Documentation"):
	gr.Markdown(readme)

	return demo


	module = evaluate.load(os.path.join(os.path.dirname(os.path.abspath(__file__)), "IsomorphicPerturbationTesting.py"))
	demo = create_interface(module)

	if __name__ == "__main__":
	demo.launch()