#!/usr/bin/env python3
"""
Create dual-task training dataset by mixing question-generation and solution-generation examples.

This script:
1. Loads existing solution data (GSM8K format)
2. Loads question-generation data (synthetic)
3. Adds task prefixes to distinguish tasks
4. Mixes datasets according to specified ratio
5. Shuffles and splits into train/validation

Usage:
    python scripts/create_dual_task_dataset.py \
        --solution-data data/sft/gsm8k_sft.jsonl \
        --question-data data/sft/question_generation.jsonl \
        --output-train data/sft/dual_task_train.jsonl \
        --output-val data/sft/dual_task_val.jsonl \
        --mix-ratio 0.8 \
        --val-split 0.1
"""

from __future__ import annotations

import argparse
import json
import random
import sys
from pathlib import Path
from typing import Any

# Directory one level above the folder containing this script. It is placed
# first on sys.path so the ``src`` package imported below can be resolved when
# the script is run directly (e.g. ``python scripts/create_dual_task_dataset.py``).
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))

# Task-prefix strings from ``src/config/prompts``; this import must stay after
# the sys.path adjustment above.
from src.config.prompts import SOLVE_TASK_PREFIX, GENERATE_TASK_PREFIX


def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """
    Parse a JSON Lines file and return its records in file order.

    Each line is stripped of surrounding whitespace before parsing; lines
    that are empty after stripping are ignored.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        One decoded JSON value per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open(encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]


def _apply_task_prefix(
    record: dict[str, Any],
    prefix: str,
    task_type: str,
) -> dict[str, Any]:
    """
    Return a tagged copy of ``record`` with ``prefix`` on every user turn.

    The same rule is applied to both representations of the example so they
    stay in agreement:

    * ``messages``: each message whose ``role`` is ``"user"`` gets ``prefix``
      prepended to its ``content`` unless the content already starts with it.
    * ``text`` (optional): for every segment that follows a ``<|user|>``
      marker, ``prefix`` is inserted right after the segment's existing
      leading whitespace, unless the segment already starts with it. Keeping
      the original whitespace (instead of adding an extra newline) makes the
      rendered text match ``prefix + content`` from ``messages``.

    The input record and its message dicts are not mutated.

    Args:
        record: Example with a ``messages`` list of ``{"role", "content"}``
            dicts and, optionally, a pre-rendered ``text`` string.
        prefix: Task prefix to place in front of user content.
        task_type: Value stored under the ``task_type`` key of the result.

    Returns:
        A shallow copy of ``record`` with rebuilt ``messages``, an updated
        ``text`` (when present and containing a user marker) and
        ``task_type`` set.

    Raises:
        KeyError: If ``record`` has no ``messages`` key or a message lacks
            ``role`` / ``content``.
    """
    user_marker = "<|user|>"
    tagged = record.copy()

    # Build a fresh list from per-message copies so the caller's data is
    # left untouched.
    messages = []
    for msg in record["messages"]:
        new_msg = msg.copy()
        if msg["role"] == "user" and not msg["content"].startswith(prefix):
            new_msg["content"] = prefix + msg["content"]
        messages.append(new_msg)
    tagged["messages"] = messages

    text = tagged.get("text")
    # A missing or non-string ``text`` is simply left as-is.
    if isinstance(text, str) and user_marker in text:
        segments = text.split(user_marker)
        # segments[0] precedes the first marker; every later segment starts
        # with a user turn.
        for idx in range(1, len(segments)):
            segment = segments[idx]
            body = segment.lstrip()
            if body.startswith(prefix):
                continue
            leading = segment[: len(segment) - len(body)]
            segments[idx] = leading + prefix + body
        tagged["text"] = user_marker.join(segments)

    tagged["task_type"] = task_type
    return tagged


def add_solve_prefix(record: dict[str, Any]) -> dict[str, Any]:
    """
    Add the 'Solve Problem' task prefix to a solution example.

    Prepends ``SOLVE_TASK_PREFIX`` to user content in ``messages`` (and in
    ``text`` when present), skipping content that already carries it, and
    marks the result with ``task_type == "solve"``.

    Args:
        record: Solution example; not modified.

    Returns:
        Prefixed, tagged copy of ``record``.
    """
    return _apply_task_prefix(record, SOLVE_TASK_PREFIX, "solve")


def verify_question_prefix(record: dict[str, Any]) -> dict[str, Any]:
    """
    Ensure a question-generation example carries its task prefix.

    Records are expected to arrive already prefixed; any user content that
    does not start with ``GENERATE_TASK_PREFIX`` gets it added (in
    ``messages`` and in ``text`` when present). The result is marked with
    ``task_type == "generate"``.

    Args:
        record: Question-generation example; not modified.

    Returns:
        Prefixed, tagged copy of ``record``.
    """
    return _apply_task_prefix(record, GENERATE_TASK_PREFIX, "generate")


def sample_with_ratio(
    solution_records: list[dict[str, Any]],
    question_records: list[dict[str, Any]],
    mix_ratio: float,
    target_total: int | None = None,
) -> list[dict[str, Any]]:
    """
    Sample and mix datasets according to specified ratio.

    The requested counts are ``int(target_total * mix_ratio)`` solutions and
    the remainder as questions. If either request exceeds what is available,
    that side is capped (with a printed warning) and the other side is left
    unchanged, so the achieved ratio can differ from ``mix_ratio``; the
    achieved ratio is printed.

    Args:
        solution_records: Solution examples
        question_records: Question generation examples
        mix_ratio: Fraction of solutions in final dataset (0.8 = 80% solutions, 20% questions)
        target_total: Target total examples (None = use all available data)

    Returns:
        Mixed dataset: the sampled solutions followed by the sampled
        questions (not shuffled). Empty when nothing could be selected.

    Raises:
        ValueError: Propagated from ``random.sample`` if a computed sample
            size is negative (e.g. negative ``target_total``).
    """
    n_solutions = len(solution_records)
    n_questions = len(question_records)

    if target_total is None:
        # Use all available data
        target_total = n_solutions + n_questions

    # Calculate target counts
    n_sol_target = int(target_total * mix_ratio)
    n_q_target = target_total - n_sol_target

    # Check availability
    if n_sol_target > n_solutions:
        print(f"Warning: Requested {n_sol_target} solutions but only {n_solutions} available.")
        n_sol_target = n_solutions

    if n_q_target > n_questions:
        print(f"Warning: Requested {n_q_target} questions but only {n_questions} available.")
        n_q_target = n_questions

    # Sample
    selected_solutions = random.sample(solution_records, n_sol_target)
    selected_questions = random.sample(question_records, n_q_target)

    n_selected = n_sol_target + n_q_target
    print(f"Sampled {n_sol_target} solutions and {n_q_target} questions")
    if n_selected:
        print(f"Actual ratio: {n_sol_target/n_selected:.2%} solutions, "
              f"{n_q_target/n_selected:.2%} questions")
    else:
        # Nothing was selected (empty inputs or a zero target); a ratio is
        # undefined, so report that instead of dividing by zero.
        print("Actual ratio: n/a (no examples selected)")

    return selected_solutions + selected_questions


def write_jsonl(records: list[dict[str, Any]], path: Path) -> None:
    """
    Serialize ``records`` to ``path`` as JSON Lines.

    Missing parent directories are created first. The file is opened in
    write mode (replacing any existing content) with UTF-8 encoding, and each
    record becomes one line of JSON with non-ASCII characters kept verbatim.

    Args:
        records: Items to serialize, written in the given order.
        path: Destination file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = (json.dumps(item, ensure_ascii=False) + "\n" for item in records)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(serialized)


def _format_share(count: int, total: int) -> str:
    """Format ``count / total`` as a one-decimal percentage; ``n/a`` if ``total`` is 0."""
    return f"{count / total:.1%}" if total else "n/a"


def _print_composition(label: str, records: list[dict[str, Any]]) -> None:
    """Print how many records in one split are tagged ``solve`` vs ``generate``."""
    n_solve = sum(1 for r in records if r.get("task_type") == "solve")
    n_generate = sum(1 for r in records if r.get("task_type") == "generate")

    print(f"   {label} composition:")
    print(f"     Solve: {n_solve} ({_format_share(n_solve, len(records))})")
    print(f"     Generate: {n_generate} ({_format_share(n_generate, len(records))})")


def main() -> None:
    """
    Command-line entry point: build the dual-task train/validation files.

    Steps, in order: parse and validate arguments, seed ``random``, load both
    JSONL inputs, apply task prefixes, mix with ``sample_with_ratio``,
    shuffle, split off the last ``int(total * val_split)`` examples as
    validation, print a composition report and write both outputs.

    Small inputs can yield an empty validation split; that case is reported
    with a warning rather than failing while printing percentages.

    Raises:
        SystemExit: If an input file does not exist, or ``--mix-ratio`` /
            ``--val-split`` is not strictly between 0 and 1.
    """
    parser = argparse.ArgumentParser(
        description="Create dual-task training dataset from solution and question-generation examples."
    )
    parser.add_argument(
        "--solution-data",
        type=Path,
        required=True,
        help="Path to solution training data (GSM8K format)",
    )
    parser.add_argument(
        "--question-data",
        type=Path,
        required=True,
        help="Path to question-generation training data",
    )
    parser.add_argument(
        "--output-train",
        type=Path,
        required=True,
        help="Output path for training split",
    )
    parser.add_argument(
        "--output-val",
        type=Path,
        required=True,
        help="Output path for validation split",
    )
    parser.add_argument(
        "--mix-ratio",
        type=float,
        default=0.8,
        help="Fraction of solutions in mixed dataset (default: 0.8 = 80%% solutions)",
    )
    parser.add_argument(
        "--val-split",
        type=float,
        default=0.1,
        help="Fraction of data to use for validation (default: 0.1 = 10%%)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducibility",
    )
    parser.add_argument(
        "--max-total",
        type=int,
        default=None,
        help="Maximum total examples to include (None = use all available)",
    )
    args = parser.parse_args()

    # Validate inputs
    if not args.solution_data.exists():
        raise SystemExit(f"Error: Solution data not found at {args.solution_data}")
    if not args.question_data.exists():
        raise SystemExit(f"Error: Question data not found at {args.question_data}")

    if not (0 < args.mix_ratio < 1):
        raise SystemExit("Error: --mix-ratio must be between 0 and 1")
    if not (0 < args.val_split < 1):
        raise SystemExit("Error: --val-split must be between 0 and 1")

    # Seed before any sampling/shuffling so runs are reproducible.
    random.seed(args.seed)

    print("=" * 60)
    print("Dual-Task Dataset Creation")
    print("=" * 60)

    # Load data
    print("\n1. Loading data...")
    print(f"   Solution data: {args.solution_data}")
    solution_records = load_jsonl(args.solution_data)
    print(f"   Loaded {len(solution_records)} solution examples")

    print(f"   Question data: {args.question_data}")
    question_records = load_jsonl(args.question_data)
    print(f"   Loaded {len(question_records)} question-generation examples")

    # Add task prefixes
    print("\n2. Adding task prefixes...")
    print("   Adding 'Solve Problem' prefix to solution examples...")
    solution_records = [add_solve_prefix(r) for r in solution_records]

    print("   Verifying 'Generate Question' prefix on question examples...")
    question_records = [verify_question_prefix(r) for r in question_records]

    # Mix datasets
    print(f"\n3. Mixing datasets (ratio: {args.mix_ratio:.0%} solutions, {1-args.mix_ratio:.0%} questions)...")
    mixed_records = sample_with_ratio(
        solution_records=solution_records,
        question_records=question_records,
        mix_ratio=args.mix_ratio,
        target_total=args.max_total,
    )

    # Shuffle
    print(f"\n4. Shuffling {len(mixed_records)} total examples...")
    random.shuffle(mixed_records)

    # Split train/val: the validation split is the tail of the shuffled list.
    n_total = len(mixed_records)
    n_val = int(n_total * args.val_split)
    n_train = n_total - n_val

    train_records = mixed_records[:n_train]
    val_records = mixed_records[n_train:]

    print("\n5. Splitting data:")
    print(f"   Training: {len(train_records)} examples ({_format_share(len(train_records), n_total)})")
    print(f"   Validation: {len(val_records)} examples ({_format_share(len(val_records), n_total)})")
    if not val_records:
        # int() truncation gives zero validation examples for small datasets.
        print("   Warning: validation split is empty; provide more data or increase --val-split.")

    # Verify split composition
    print()
    _print_composition("Train", train_records)
    _print_composition("Val", val_records)

    # Write outputs
    print("\n6. Writing output files...")
    print(f"   Training data: {args.output_train}")
    write_jsonl(train_records, args.output_train)

    print(f"   Validation data: {args.output_val}")
    write_jsonl(val_records, args.output_val)

    print("\n" + "=" * 60)
    print("Dual-task dataset creation complete!")
    print("=" * 60)
    print("\nOutput files:")
    print(f"  Train: {args.output_train} ({len(train_records)} examples)")
    print(f"  Val:   {args.output_val} ({len(val_records)} examples)")
    print("\nNext step: Train dual-task model using these files")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()