ASTERIZER committed on
Commit
56ea5bf
·
verified ·
1 Parent(s): 01e6957

Upload check_sft_alignment.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. check_sft_alignment.py +39 -0
check_sft_alignment.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Quick check: what does the SFT dataset say for the eval questions?

Loads the SFT training JSON (a list of dict entries), and for each eval
question prints how many entries have a matching ``instruction`` together
with a truncated preview of the first three matches. Finally prints how many
entries mention Asterizer/LUNA in their ``output``.
"""
import json
import re

# Path is relative to the working directory the script is launched from.
DATASET_PATH = "Base/Datasets/sft_clean/train.json"

# Eval-question label -> regex searched (case-insensitively) against the
# stripped ``instruction`` text. Anchored patterns require the whole
# instruction to be that question; unanchored ones match anywhere in it.
exact_queries = {
    "Who are you?": r"^Who are you\?$",
    "What is your name?": r"^What is your name\?$",
    "Who created you?": r"^Who created you\?$",
    "Who is Asterizer?": r"^Who is Asterizer\?$",
    "What is LUNA?": r"^What is LUNA\?$",
    "Are you an AI?": r"are you an ai",
    "Tell me about yourself": r"tell me about yourself",
    "Translate hello to Spanish": r"translate.*hello.*spanish",
    "Write a poem about the moon": r"^Write a.*poem about the moon",
    "Summarize evolution": r"summarize.*theory of evolution",
    "Explain photosynthesis": r"explain photosynthesis",
    "What is 25 times 4?": r"25 times 4|25 \* 4|25\*4|25 x 4",
    "Capital of France": r"capital of france",
}


def find_matches(data, pattern):
    """Return the entries whose stripped ``instruction`` matches *pattern*.

    The search is case-insensitive. Entries without an ``instruction`` key
    are treated as having an empty instruction.
    """
    # Compile once per query instead of once per dataset entry.
    regex = re.compile(pattern, re.I)
    return [e for e in data if regex.search(e.get("instruction", "").strip())]


def preview(entry):
    """Return ``(instruction[:100], output[:300])`` for display.

    Missing keys yield empty strings, so an entry that matched on its
    instruction but has no ``output`` no longer raises ``KeyError``.
    """
    return entry.get("instruction", "")[:100], entry.get("output", "")[:300]


def find_identity_entries(data):
    """Return entries whose ``output`` mentions Asterizer or LUNA.

    "asterizer" is searched in the whole lowercased output, while "luna" is
    only searched in its first 50 characters.
    NOTE(review): the 50-char window presumably limits LUNA hits to answers
    that open with the name -- confirm this is intended.
    """
    found = []
    for entry in data:
        lowered = entry.get("output", "").lower()
        if "asterizer" in lowered or "luna" in lowered[:50]:
            found.append(entry)
    return found


def main():
    """Load the dataset and print the per-question and identity reports."""
    with open(DATASET_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)

    for label, pat in exact_queries.items():
        matches = find_matches(data, pat)
        print(f"\n{'='*60}")
        print(f" {label} ({len(matches)} matches in dataset)")
        print(f"{'='*60}")
        # Only the first three matches are previewed to keep output short.
        for m in matches[:3]:
            inst, out = preview(m)
            print(f" Q: {inst}")
            print(f" A: {out}")
            print()

    # Count all Asterizer-branded identity entries
    identity_kw = find_identity_entries(data)
    print(f"\n{'='*60}")
    print(f" Entries mentioning Asterizer/LUNA in output: {len(identity_kw)}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()