#!/usr/bin/env python3
"""
Format BothBosu scam-dialogue (and optionally Scammer-Conversation)
into standardized chat-template JSONL for SFT.
REQUIREMENTS:
pip install datasets transformers
USAGE:
python format_dataset.py --out_dir ./formatted_scam_data
OUTPUT:
formatted_scam_data/
train.jsonl (chat-format messages per line)
test.jsonl
README.md (dataset card fragment)
Each JSONL line:
{"messages": [
{"role": "system", "content": "You are a phone scam detection expert."},
{"role": "user", "content": "Read this transcript...\n\n{transcript}"},
{"role": "assistant", "content": "SCAM"}
]}
"""
import argparse
import json
from pathlib import Path
from datasets import load_dataset, concatenate_datasets
# User-turn prompt; {transcript} is filled with the raw call transcript.
# The final line constrains the model to a single-word label so the
# assistant turn ("SCAM"/"LEGITIMATE") is trivially parseable.
PROMPT_TEMPLATE = (
    "Read this phone call transcript and classify it:\n\n"
    "{transcript}\n\n"
    "Answer with exactly ONE word: SCAM or LEGITIMATE."
)
# System message prepended to every example.
SYSTEM = "You are a phone scam detection expert."
def parse_args(argv=None):
    """Parse command-line arguments.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``,
            which makes argparse read ``sys.argv[1:]`` — identical to the
            original behavior; passing an explicit list makes the function
            testable without patching ``sys.argv``.

    Returns:
        argparse.Namespace with ``primary``, ``secondary`` and ``out_dir``.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--primary", default="BothBosu/scam-dialogue")
    p.add_argument("--secondary", default="BothBosu/Scammer-Conversation",
                   help="Optional extra dataset to merge into train")
    p.add_argument("--out_dir", default="./formatted_scam_data")
    return p.parse_args(argv)
def row_to_chat(row):
    """Convert a raw dataset row into a ChatML message dict.

    Args:
        row: Mapping with an integer ``label`` (1 = scam) and the transcript
            under either ``dialogue`` or ``conversation`` — the column name
            differs between the source datasets.

    Returns:
        ``{"messages": [...]}`` — a three-turn chat (system/user/assistant).

    Raises:
        ValueError: if the row has no non-empty transcript column.
    """
    answer = "SCAM" if row["label"] == 1 else "LEGITIMATE"
    # Handle different column names across datasets; `or` also falls
    # through when the first column exists but is empty.
    transcript = row.get("dialogue") or row.get("conversation")
    if not transcript:
        # Fail loudly instead of silently formatting the literal string
        # "None" (or "") into a training prompt.
        raise ValueError(f"row has no transcript column: {sorted(row)}")
    return {
        "messages": [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": PROMPT_TEMPLATE.format(transcript=transcript)},
            {"role": "assistant", "content": answer},
        ]
    }
def save_jsonl(rows, path: Path):
    """Serialize *rows* to *path* as JSON Lines (one object per line, UTF-8)."""
    # Create the output directory tree on demand.
    path.parent.mkdir(parents=True, exist_ok=True)
    encoded = (json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(encoded)
    print(f"Saved {len(rows)} rows → {path}")
def main():
    """Load the dataset(s), convert to ChatML, and write JSONL + stats + README."""
    args = parse_args()
    out_dir = Path(args.out_dir)

    # --- Load primary dataset (must provide both train and test splits) ---
    print(f"Loading primary dataset: {args.primary}")
    ds_train = load_dataset(args.primary, split="train")
    ds_test = load_dataset(args.primary, split="test")

    # --- Optional secondary dataset, merged into train only ---
    if args.secondary:
        try:
            ds_extra = load_dataset(args.secondary, split="train")
            n_before = len(ds_train)
            ds_train = concatenate_datasets([ds_train, ds_extra])
            print(f"Merged {args.secondary}: {n_before} → {len(ds_train)} train rows")
        except Exception as e:
            # Best-effort: a missing/renamed secondary dataset must not
            # abort the primary export.
            print(f"Skipped secondary dataset: {e}")

    # --- Convert rows to ChatML and save both splits ---
    train_rows = [row_to_chat(r) for r in ds_train]
    test_rows = [row_to_chat(r) for r in ds_test]
    save_jsonl(train_rows, out_dir / "train.jsonl")
    save_jsonl(test_rows, out_dir / "test.jsonl")

    # --- Class-balance statistics (assistant turn is messages[2]) ---
    n_scam_train = sum(1 for r in train_rows if r["messages"][2]["content"] == "SCAM")
    n_scam_test = sum(1 for r in test_rows if r["messages"][2]["content"] == "SCAM")
    stats = {
        "train": {"total": len(train_rows), "scam": n_scam_train, "legit": len(train_rows) - n_scam_train},
        "test": {"total": len(test_rows), "scam": n_scam_test, "legit": len(test_rows) - n_scam_test},
    }
    stats_json = json.dumps(stats, indent=2)  # hoisted: reused three times below
    # Explicit UTF-8: Path.write_text defaults to the locale encoding,
    # which can garble or reject non-ASCII content (e.g. "→", dataset names)
    # on some platforms.
    (out_dir / "stats.json").write_text(stats_json, encoding="utf-8")
    print(f"\nStats:\n{stats_json}")

    # --- README fragment documenting provenance, stats and schema ---
    readme = f"""# Formatted Scam-Call Dataset (ChatML)
Generated by `format_dataset.py`.
## Sources
- Primary: {args.primary}
- Secondary: {args.secondary or "None"}
## Statistics
```json
{stats_json}
```
## Schema
Each `.jsonl` line is a ChatML message list compatible with TRL / Unsloth SFTTrainer.
"""
    (out_dir / "README.md").write_text(readme, encoding="utf-8")
    print(f"\nDone. Output directory: {out_dir.absolute()}")


if __name__ == "__main__":
    main()