LUNA-Training / check_sft_alignment.py
ASTERIZER's picture
Upload check_sft_alignment.py with huggingface_hub
56ea5bf verified
"""Quick check: what does the SFT dataset say for the eval questions?"""
import json, re
# Load the cleaned SFT training split into memory. The path is relative, so
# the script has to be started from the directory that contains "Base/".
# Later code iterates `data` as a sequence of dict-like records.
with open("Base/Datasets/sft_clean/train.json", "r", encoding="utf-8") as train_file:
    data = json.load(train_file)
# Maps a human-readable eval question label to the regex used to find the
# corresponding instruction in the SFT dataset. Patterns are applied with
# re.search(..., re.I) against the stripped instruction text.
exact_queries = {
    # Identity questions: anchored at both ends so only the exact question
    # (modulo case) is counted.
    "Who are you?": r"^Who are you\?$",
    "What is your name?": r"^What is your name\?$",
    "Who created you?": r"^Who created you\?$",
    "Who is Asterizer?": r"^Who is Asterizer\?$",
    "What is LUNA?": r"^What is LUNA\?$",
    # Looser substring-style probes. Word boundaries keep "are you an ai"
    # from matching longer words such as "airline".
    "Are you an AI?": r"\bare you an ai\b",
    "Tell me about yourself": r"tell me about yourself",
    "Translate hello to Spanish": r"translate.*hello.*spanish",
    "Write a poem about the moon": r"^Write a.*poem about the moon",
    "Summarize evolution": r"summarize.*theory of evolution",
    "Explain photosynthesis": r"explain photosynthesis",
    # Word boundaries stop the arithmetic probe from matching other numbers
    # that merely contain these digits (e.g. "125 times 40", "25 x 400").
    "What is 25 times 4?": r"\b(?:25 times 4|25 \* 4|25\*4|25 x 4)\b",
    "Capital of France": r"capital of france",
}
# For every eval question, report how many dataset instructions match its
# pattern and show up to three sample question/answer pairs.
for label, pat in exact_queries.items():
    # Compile once per query instead of re-resolving the pattern per entry.
    query_re = re.compile(pat, re.I)
    # `or ""` covers both a missing key and an explicit JSON null, so a
    # malformed record is skipped as a non-match instead of crashing.
    matches = [
        e for e in data
        if query_re.search(str(e.get("instruction") or "").strip())
    ]
    print(f"\n{'='*60}")
    print(f" {label} ({len(matches)} matches in dataset)")
    print(f"{'='*60}")
    for m in matches[:3]:
        # Truncate so long samples do not flood the terminal. A record
        # without an "output" prints an empty answer rather than raising.
        inst = str(m.get("instruction") or "")[:100]
        out = str(m.get("output") or "")[:300]
        print(f" Q: {inst}")
        print(f" A: {out}")
        print()
# Count all Asterizer-branded identity entries.
# An entry counts when its output mentions "asterizer" anywhere, or "luna"
# within the first 50 characters of the lowercased output (case-insensitive).
identity_kw = []
for e in data:
    # Lowercase once; `or ""` tolerates a missing or null "output" field.
    out_lower = str(e.get("output") or "").lower()
    if "asterizer" in out_lower or "luna" in out_lower[:50]:
        identity_kw.append(e)
print(f"\n{'='*60}")
print(f" Entries mentioning Asterizer/LUNA in output: {len(identity_kw)}")
print(f"{'='*60}")