| """Quick check: what does the SFT dataset say for the eval questions?"""
|
| import json, re
|
|
|
# Load the cleaned SFT training split once; the rest of the script iterates
# `data` and reads the "instruction" / "output" fields of each entry.
with open("Base/Datasets/sft_clean/train.json", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)
|
|
|
# Maps each eval question (used as the report label) to the regex that finds
# the corresponding instructions in the dataset. The report loop applies the
# patterns with re.search and re.I, so unanchored patterns match anywhere in
# the instruction, case-insensitively.
exact_queries = {
    # Identity questions: anchored so only the exact question matches.
    "Who are you?": r"^Who are you\?$",
    "What is your name?": r"^What is your name\?$",
    "Who created you?": r"^Who created you\?$",
    "Who is Asterizer?": r"^Who is Asterizer\?$",
    "What is LUNA?": r"^What is LUNA\?$",
    # Trailing \b keeps "are you an airline pilot" and similar from matching.
    "Are you an AI?": r"are you an ai\b",
    "Tell me about yourself": r"tell me about yourself",
    "Translate hello to Spanish": r"translate.*hello.*spanish",
    "Write a poem about the moon": r"^Write a.*poem about the moon",
    "Summarize evolution": r"summarize.*theory of evolution",
    "Explain photosynthesis": r"explain photosynthesis",
    # Word boundaries stop "125 times 4" / "25 times 40" from being counted
    # as the 25 x 4 question.
    "What is 25 times 4?": r"\b25 (?:times|\*|x) 4\b|\b25\*4\b",
    "Capital of France": r"capital of france",
}
|
|
|
# For every eval question, report how many dataset instructions match its
# pattern and show up to three sample question/answer pairs.
for label, pat in exact_queries.items():
    # Compile once per question rather than passing the raw pattern to
    # re.search for every dataset entry.
    matcher = re.compile(pat, re.I)
    # `or ""` covers both a missing key and an explicit JSON null, either of
    # which would otherwise break .strip().
    matches = [
        e for e in data
        if matcher.search((e.get("instruction") or "").strip())
    ]
    print(f"\n{'='*60}")
    print(f" {label} ({len(matches)} matches in dataset)")
    print(f"{'='*60}")
    for m in matches[:3]:
        # Truncate so long samples do not flood the terminal. Missing or
        # null fields print as empty text instead of raising KeyError.
        inst = (m.get("instruction") or "")[:100]
        out = (m.get("output") or "")[:300]
        print(f" Q: {inst}")
        print(f" A: {out}")
        print()
|
|
|
|
|
def _mentions_identity(entry):
    """Return True if the entry's output contains the identity keywords.

    "asterizer" is searched in the whole (lowercased) output, while "luna"
    only counts when it appears within the first 50 characters. A missing or
    null output is treated as empty text.
    """
    answer = (entry.get("output") or "").lower()
    return "asterizer" in answer or "luna" in answer[:50]


# Count the entries whose answer carries the Asterizer / LUNA identity.
identity_kw = [e for e in data if _mentions_identity(e)]
print(f"\n{'='*60}")
print(f" Entries mentioning Asterizer/LUNA in output: {len(identity_kw)}")
print(f"{'='*60}")
|
|
|