LUNA-Training / check_sft_alignment.py

Upload check_sft_alignment.py with huggingface_hub

56ea5bf verified 16 days ago

1.64 kB

	"""Quick check: what does the SFT dataset say for the eval questions?"""
	import json, re

	# Location of the SFT training set, relative to the current working directory.
	DATASET_PATH = "Base/Datasets/sft_clean/train.json"

	# Eval-question label -> regex searched (case-insensitively) against each
	# entry's stripped "instruction" text. Anchored patterns (^...$) demand an
	# exact instruction; the others match anywhere inside the instruction.
	exact_queries = {
	    "Who are you?": r"^Who are you\?$",
	    "What is your name?": r"^What is your name\?$",
	    "Who created you?": r"^Who created you\?$",
	    "Who is Asterizer?": r"^Who is Asterizer\?$",
	    "What is LUNA?": r"^What is LUNA\?$",
	    "Are you an AI?": r"are you an ai",
	    "Tell me about yourself": r"tell me about yourself",
	    # ".*" rather than "." so that "Translate hello to Spanish" itself matches;
	    # a single "." only allowed one character between the words.
	    "Translate hello to Spanish": r"translate.*hello.*spanish",
	    "Write a poem about the moon": r"^Write a.*poem about the moon",
	    "Summarize evolution": r"summarize.*theory of evolution",
	    "Explain photosynthesis": r"explain photosynthesis",
	    # Plain "|" is alternation in Python's re; "\|" (grep-style) is a literal
	    # pipe character and made this pattern unmatchable.
	    "What is 25 times 4?": r"25 times 4|25 \* 4|25\*4|25 x 4",
	    "Capital of France": r"capital of france",
	}


	def load_entries(path=DATASET_PATH):
	    """Load and return the parsed JSON content of the dataset file at *path*.

	    Raises:
	        OSError: if the file cannot be opened.
	        json.JSONDecodeError: if the file is not valid JSON.
	    """
	    with open(path, "r", encoding="utf-8") as f:
	        return json.load(f)


	def text_field(entry, key):
	    """Return ``entry[key]``, or ``""`` when the key is missing or falsy (e.g. null)."""
	    return entry.get(key) or ""


	def find_matches(entries, pattern):
	    """Return the entries whose stripped instruction matches *pattern*.

	    Args:
	        entries: iterable of dicts; the "instruction" key is read if present.
	        pattern: regular expression, searched case-insensitively.

	    Returns:
	        A list of the matching entries, in their original order.
	    """
	    regex = re.compile(pattern, re.I)  # compile once, reuse for every entry
	    return [e for e in entries if regex.search(text_field(e, "instruction").strip())]


	def find_identity_entries(entries):
	    """Return the entries whose output looks identity-branded.

	    An entry qualifies when its lower-cased "output" contains "asterizer"
	    anywhere, or contains "luna" within its first 50 characters.
	    """
	    found = []
	    for e in entries:
	        lowered = text_field(e, "output").lower()
	        if "asterizer" in lowered or "luna" in lowered[:50]:
	            found.append(e)
	    return found


	def print_banner(text):
	    """Print *text* between two 60-character '=' rules, preceded by a blank line."""
	    print(f"\n{'='*60}")
	    print(text)
	    print(f"{'='*60}")


	def main(path=DATASET_PATH):
	    """Print, for each eval question, the match count and up to three samples.

	    Finishes with the number of entries whose output mentions Asterizer/LUNA.
	    """
	    data = load_entries(path)

	    for label, pat in exact_queries.items():
	        matches = find_matches(data, pat)
	        print_banner(f" {label} ({len(matches)} matches in dataset)")
	        # Show only the first few hits, truncated to keep the report readable.
	        for m in matches[:3]:
	            inst = text_field(m, "instruction")[:100]
	            out = text_field(m, "output")[:300]
	            print(f" Q: {inst}")
	            print(f" A: {out}")
	            print()

	    # Count all Asterizer-branded identity entries
	    identity_kw = find_identity_entries(data)
	    print_banner(f" Entries mentioning Asterizer/LUNA in output: {len(identity_kw)}")


	if __name__ == "__main__":
	    main()