s23deepak committed on
Commit
0f23a85
·
verified ·
1 Parent(s): b644b23

Upload format_dataset.py

Browse files
Files changed (1) hide show
  1. format_dataset.py +133 -0
format_dataset.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Format BothBosu scam-dialogue (and optionally Scammer-Conversation)
4
+ into standardized chat-template JSONL for SFT.
5
+
6
+ REQUIREMENTS:
7
+ pip install datasets transformers
8
+
9
+ USAGE:
10
+ python format_dataset.py --out_dir ./formatted_scam_data
11
+
12
+ OUTPUT:
13
+ formatted_scam_data/
14
+ train.jsonl (chat-format messages per line)
15
+ test.jsonl
16
+ README.md (dataset card fragment)
+ stats.json (per-split total / scam / legit counts)
17
+
18
+ Each JSONL line:
19
+ {"messages": [
20
+ {"role": "system", "content": "You are a phone scam detection expert."},
21
+ {"role": "user", "content": "Read this transcript...\n\n{transcript}"},
22
+ {"role": "assistant", "content": "SCAM"}
23
+ ]}
24
+ """
25
+
26
+ import argparse
27
+ import json
28
+ from pathlib import Path
29
+
30
+ from datasets import load_dataset, concatenate_datasets
31
+
32
+
33
# System-role message placed first in every generated example.
SYSTEM = "You are a phone scam detection expert."

# User-turn prompt; the "{transcript}" placeholder is filled via str.format().
# The three parts are separated by blank lines.
PROMPT_TEMPLATE = "\n\n".join(
    [
        "Read this phone call transcript and classify it:",
        "{transcript}",
        "Answer with exactly ONE word: SCAM or LEGITIMATE.",
    ]
)
39
+
40
+
41
def parse_args(argv=None):
    """Parse the command-line options of the dataset formatter.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what a
            plain ``parse_args()`` call did before; passing a list allows
            programmatic use and testing.

    Returns:
        argparse.Namespace with the attributes ``primary``, ``secondary``
        and ``out_dir``. An empty ``--secondary ""`` yields a falsy value.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--primary",
        default="BothBosu/scam-dialogue",
        help="Dataset providing the train and test splits",
    )
    parser.add_argument(
        "--secondary",
        default="BothBosu/Scammer-Conversation",
        help="Optional extra dataset to merge into train",
    )
    parser.add_argument(
        "--out_dir",
        default="./formatted_scam_data",
        help="Directory the formatted files are written to",
    )
    return parser.parse_args(argv)
48
+
49
+
50
def row_to_chat(row):
    """Convert a raw dataset row into a chat-format training example.

    Args:
        row: Mapping with a ``label`` entry (a value equal to 1 means scam,
            anything else is treated as legitimate) and the transcript text
            under either ``dialogue`` or ``conversation``.

    Returns:
        ``{"messages": [system, user, assistant]}`` where the user turn is
        ``PROMPT_TEMPLATE`` filled with the transcript and the assistant
        turn is ``"SCAM"`` or ``"LEGITIMATE"``.

    Raises:
        KeyError: If ``row`` has no ``label`` key.
        ValueError: If neither transcript column holds non-empty text.
    """
    answer = "SCAM" if row["label"] == 1 else "LEGITIMATE"
    # Handle different column names across datasets: take the first column
    # that actually holds text.
    transcript = row.get("dialogue") or row.get("conversation")
    if not transcript:
        # Fail loudly: formatting a missing transcript would otherwise put
        # the literal text "None" into the training prompt.
        raise ValueError(
            "Row has no transcript under 'dialogue' or 'conversation' "
            f"(available columns: {sorted(row)})"
        )
    return {
        "messages": [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": PROMPT_TEMPLATE.format(transcript=transcript)},
            {"role": "assistant", "content": answer},
        ]
    }
62
+
63
+
64
def save_jsonl(rows, path: Path):
    """Write ``rows`` to ``path`` as JSON Lines (one JSON object per line).

    Args:
        rows: Iterable of JSON-serialisable objects. Lists, generators and
            other unsized iterables are all accepted.
        path: Destination file. Missing parent directories are created and
            an existing file is overwritten.

    The file is written as UTF-8 with non-ASCII characters kept verbatim
    (``ensure_ascii=False``). A one-line summary is printed when done.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Count while writing instead of calling len(rows) afterwards, so that
    # iterables without a length do not fail once the file is already written.
    count = 0
    with open(path, "w", encoding="utf-8") as f:
        for count, r in enumerate(rows, start=1):
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    print(f"Saved {count} rows → {path}")
70
+
71
+
72
def _label_counts(rows):
    """Return total / scam / legit counts for a list of chat-format rows."""
    # The assistant turn (index 2) carries the label text.
    n_scam = sum(1 for r in rows if r["messages"][2]["content"] == "SCAM")
    return {"total": len(rows), "scam": n_scam, "legit": len(rows) - n_scam}


def main():
    """Load the datasets, convert them to chat format and write all outputs.

    Steps:
      1. Load the ``train`` and ``test`` splits of the primary dataset.
      2. If a secondary dataset is given, try to append its ``train`` split
         to the training data; any failure is reported and skipped.
      3. Convert every row with ``row_to_chat`` and write ``train.jsonl``
         and ``test.jsonl`` into the output directory.
      4. Write ``stats.json`` (label counts per split) and a ``README.md``
         fragment describing sources and statistics.
    """
    args = parse_args()
    out_dir = Path(args.out_dir)

    # Load primary
    print(f"Loading primary dataset: {args.primary}")
    ds_train = load_dataset(args.primary, split="train")
    ds_test = load_dataset(args.primary, split="test")

    # Optional secondary merge. Track what really happened so the README
    # does not list a source that was never merged.
    secondary_source = "None"
    if args.secondary:
        try:
            ds_extra = load_dataset(args.secondary, split="train")
            n_before = len(ds_train)
            ds_train = concatenate_datasets([ds_train, ds_extra])
        except Exception as e:
            # Deliberately broad: the secondary dataset is best-effort, so
            # any load or merge failure is reported and the run continues
            # with the primary data only.
            print(f"Skipped secondary dataset: {e}")
            secondary_source = f"{args.secondary} (skipped: failed to load or merge)"
        else:
            print(f"Merged {args.secondary}: {n_before} → {len(ds_train)} train rows")
            secondary_source = args.secondary

    # Convert
    train_rows = [row_to_chat(r) for r in ds_train]
    test_rows = [row_to_chat(r) for r in ds_test]

    # Save
    save_jsonl(train_rows, out_dir / "train.jsonl")
    save_jsonl(test_rows, out_dir / "test.jsonl")

    # Stats
    stats = {
        "train": _label_counts(train_rows),
        "test": _label_counts(test_rows),
    }
    stats_json = json.dumps(stats, indent=2)

    # Make sure the directory exists without relying on save_jsonl.
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / "stats.json").write_text(stats_json, encoding="utf-8")
    print(f"\nStats:\n{stats_json}")

    # README fragment, built line by line so the output does not depend on
    # the indentation of this source file.
    readme = "\n".join(
        [
            "# Formatted Scam-Call Dataset (ChatML)",
            "",
            "Generated by `format_dataset.py`.",
            "",
            "## Sources",
            f"- Primary: {args.primary}",
            f"- Secondary: {secondary_source}",
            "",
            "## Statistics",
            "```json",
            stats_json,
            "```",
            "",
            "## Schema",
            "Each `.jsonl` line is a ChatML message list compatible with TRL / Unsloth SFTTrainer.",
            "",
        ]
    )
    # Explicit UTF-8: dataset names come from the command line and the
    # platform default encoding may not be able to represent them.
    (out_dir / "README.md").write_text(readme, encoding="utf-8")
    print(f"\nDone. Output directory: {out_dir.absolute()}")
130
+
131
+
132
# Script entry point: run the formatter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()