File size: 4,048 Bytes
0f23a85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
"""
Format BothBosu scam-dialogue (and optionally Scammer-Conversation)
into standardized chat-template JSONL for SFT.

REQUIREMENTS:
  pip install datasets transformers

USAGE:
  python format_dataset.py --out_dir ./formatted_scam_data

OUTPUT:
  formatted_scam_data/
    train.jsonl   (chat-format messages per line)
    test.jsonl
    README.md     (dataset card fragment)

Each JSONL line:
  {"messages": [
    {"role": "system",    "content": "You are a phone scam detection expert."},
    {"role": "user",      "content": "Read this transcript...\n\n{transcript}"},
    {"role": "assistant", "content": "SCAM"}
  ]}
"""

import argparse
import json
from pathlib import Path

from datasets import load_dataset, concatenate_datasets


# User-turn template: the transcript is interpolated via str.format, and the
# closing instruction constrains the assistant to a one-word label so the
# SFT target (see row_to_chat) is a single deterministic token sequence.
PROMPT_TEMPLATE = (
    "Read this phone call transcript and classify it:\n\n"
    "{transcript}\n\n"
    "Answer with exactly ONE word: SCAM or LEGITIMATE."
)
# System message prepended to every example (role "system" in the chat format).
SYSTEM = "You are a phone scam detection expert."


def parse_args():
    """Build and evaluate the command-line interface for this script.

    Returns the parsed argparse.Namespace with attributes:
    primary (hub id of the main dataset), secondary (optional extra
    dataset merged into train), and out_dir (output directory path).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--primary", default="BothBosu/scam-dialogue")
    parser.add_argument(
        "--secondary",
        default="BothBosu/Scammer-Conversation",
        help="Optional extra dataset to merge into train",
    )
    parser.add_argument("--out_dir", default="./formatted_scam_data")
    return parser.parse_args()


def row_to_chat(row):
    """Convert a raw dataset row → ChatML dict.

    Args:
        row: Mapping with a ``label`` key (1 = scam, anything else =
            legitimate) and the transcript under either ``dialogue``
            (primary dataset) or ``conversation`` (secondary dataset).

    Returns:
        Dict with a single ``messages`` key: system / user / assistant
        turns suitable for TRL / Unsloth SFTTrainer.

    Raises:
        KeyError: If the row has no non-empty transcript column, instead
            of silently embedding the string "None" into the prompt.
    """
    answer = "SCAM" if row["label"] == 1 else "LEGITIMATE"
    # Handle different column names across datasets; `or` also skips an
    # empty-string dialogue and falls through to the other column.
    transcript = row.get("dialogue") or row.get("conversation")
    if not transcript:
        # Fail loudly: formatting None would produce a prompt containing
        # the literal text "None" and poison the training data.
        raise KeyError(
            "Row has no transcript under 'dialogue' or 'conversation'"
        )
    return {
        "messages": [
            {"role": "system",    "content": SYSTEM},
            {"role": "user",      "content": PROMPT_TEMPLATE.format(transcript=transcript)},
            {"role": "assistant", "content": answer},
        ]
    }


def save_jsonl(rows, path: Path):
    """Write *rows* to *path* as UTF-8 JSON Lines, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )
    print(f"Saved {len(rows)} rows → {path}")


def main():
    """Load, merge, convert, and save the scam-call datasets.

    Pipeline: load primary train/test splits → optionally concatenate a
    secondary train split (best-effort; failures are logged and skipped)
    → convert every row to ChatML → write train.jsonl / test.jsonl,
    stats.json, and a README.md fragment into --out_dir.
    """
    args = parse_args()
    out_dir = Path(args.out_dir)

    # Load primary
    print(f"Loading primary dataset: {args.primary}")
    ds_train = load_dataset(args.primary, split="train")
    ds_test  = load_dataset(args.primary, split="test")

    # Optional secondary merge — best-effort: a missing/renamed hub repo
    # should not abort formatting of the primary dataset.
    if args.secondary:
        try:
            ds_extra = load_dataset(args.secondary, split="train")
            n_before = len(ds_train)
            ds_train = concatenate_datasets([ds_train, ds_extra])
            # BUG FIX: separator was missing between the two counts,
            # producing a garbled concatenated number in the log.
            print(f"Merged {args.secondary}: {n_before} → {len(ds_train)} train rows")
        except Exception as e:
            print(f"Skipped secondary dataset: {e}")

    # Convert
    train_rows = [row_to_chat(r) for r in ds_train]
    test_rows  = [row_to_chat(r) for r in ds_test]

    # Save
    save_jsonl(train_rows, out_dir / "train.jsonl")
    save_jsonl(test_rows,  out_dir / "test.jsonl")

    # Stats — assistant turn is messages[2] (system/user/assistant order).
    n_scam_train = sum(1 for r in train_rows if r["messages"][2]["content"] == "SCAM")
    n_scam_test  = sum(1 for r in test_rows  if r["messages"][2]["content"] == "SCAM")

    stats = {
        "train": {"total": len(train_rows), "scam": n_scam_train, "legit": len(train_rows) - n_scam_train},
        "test":  {"total": len(test_rows),  "scam": n_scam_test,  "legit": len(test_rows)  - n_scam_test},
    }

    # BUG FIX: write with explicit UTF-8 — Path.write_text defaults to the
    # locale encoding, which breaks on non-UTF-8 platforms (README contains "→").
    (out_dir / "stats.json").write_text(json.dumps(stats, indent=2), encoding="utf-8")
    print(f"\nStats:\n{json.dumps(stats, indent=2)}")

    # README fragment
    readme = f"""# Formatted Scam-Call Dataset (ChatML)

Generated by `format_dataset.py`.

## Sources
- Primary: {args.primary}
- Secondary: {args.secondary or "None"}

## Statistics
```json
{json.dumps(stats, indent=2)}
```

## Schema
Each `.jsonl` line is a ChatML message list compatible with TRL / Unsloth SFTTrainer.
"""
    (out_dir / "README.md").write_text(readme, encoding="utf-8")
    print(f"\nDone. Output directory: {out_dir.absolute()}")


if __name__ == "__main__":
    main()