#!/usr/bin/env python3
"""
wipe_mock_data.py
Removes any training records that were generated in mock/keyless mode.
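A record is "bad" (mock) if any of the following holds:
- any conversation turn's utterance contains the sentinel string "MOCK"
  (matched case-insensitively), OR
- step_rewards is non-empty and every entry is exactly 0.0 (no real grading
  happened), OR
- the terminal reward is 0.0 (or absent), no deal was reached, and
  deal_efficiency is missing (the episode was never graded), OR
- offer_amount is None in the first conversation turn.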
Run before real data generation to avoid contaminating the training set.
Usage: python scripts/wipe_mock_data.py [--dry-run]
"""
import argparse
import json
import os
import shutil
from pathlib import Path
from datetime import datetime
# JSONL file of training episodes that main() reads and rewrites.
# Relative path, so it is resolved against the current working directory.
DATA_FILE = Path("data/episodes.jsonl")
# Directory where main() stores a timestamped copy of DATA_FILE before
# rewriting it (created on demand).
BACKUP_DIR = Path("data/backups")
def is_mock_record(record: dict) -> bool:
    """Return True if this record should be considered mock/bad data.

    A record is flagged when any of the following holds:

    * any conversation turn's ``utterance`` contains "MOCK"
      (case-insensitive);
    * ``step_rewards`` is non-empty and every entry equals 0.0;
    * ``reward`` is 0.0 (or absent), ``deal_reached`` is falsy, and
      ``deal_efficiency`` is ``None`` (or absent);
    * the first conversation turn has no ``offer_amount`` (``None`` or
      absent).

    Fields that are JSON ``null`` or of an unexpected type (a non-list
    ``conversation``, a non-dict turn, a non-string utterance) are treated as
    missing instead of raising, so a single odd record cannot abort a run.

    Args:
        record: One decoded episode object.

    Returns:
        True when the record matches at least one mock criterion.
    """
    # A JSON null comes back as None (the .get default only covers a missing
    # key), so normalise both cases to an empty conversation.
    conversation = record.get("conversation") or []
    if not isinstance(conversation, (list, tuple)):
        conversation = []

    # Sentinel string check
    for turn in conversation:
        if not isinstance(turn, dict):
            continue
        utterance = turn.get("utterance")
        if isinstance(utterance, str) and "MOCK" in utterance.upper():
            return True

    # All rewards zero — no real grading
    rewards = record.get("step_rewards") or []
    if isinstance(rewards, (list, tuple)) and rewards:
        if all(r == 0.0 for r in rewards):
            return True

    # Terminal reward exactly 0 with no deal (indicates mock grader output).
    # Only flag if there's also no efficiency score (truly ungraded).
    no_terminal_reward = record.get("reward", 0.0) == 0.0
    no_deal = not record.get("deal_reached", False)
    if no_terminal_reward and no_deal and record.get("deal_efficiency") is None:
        return True

    # offer_amount None in first turn is a strong mock signal
    if conversation:
        first_turn = conversation[0]
        if isinstance(first_turn, dict) and first_turn.get("offer_amount") is None:
            return True

    return False
def main():
    """Strip mock records from ``DATA_FILE``, keeping a timestamped backup.

    Reads the JSONL file line by line and classifies each record with
    ``is_mock_record``. Unless ``--dry-run`` is given, the original file is
    copied into ``BACKUP_DIR`` and ``DATA_FILE`` is replaced by a file that
    holds only the records that were not flagged. Returns early, without
    touching anything, when the file is missing or no record is flagged.

    Blank lines are ignored. Lines that are not valid JSON, or whose JSON
    value is not an object, are reported and skipped; they are therefore not
    carried over into the rewritten file (the backup still contains them).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true",
                        help="Print what would be deleted without deleting")
    args = parser.parse_args()

    if not DATA_FILE.exists():
        print(f"[INFO] {DATA_FILE} does not exist — nothing to wipe.")
        return

    # Classify every record exactly once while streaming the file.
    good = []
    bad = []
    skipped = 0
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                record = None
            # is_mock_record needs a JSON object; a bare list/number/null line
            # is as unusable as text that does not parse at all.
            if not isinstance(record, dict):
                skipped += 1
                print(f"[WARN] Skipping malformed line: {line[:80]}")
                continue
            (bad if is_mock_record(record) else good).append(record)

    total = len(good) + len(bad)
    print(f"[INFO] Total records: {total}")
    print(f"[INFO] Good records: {len(good)}")
    print(f"[INFO] Mock/bad records to remove: {len(bad)}")

    if not bad:
        print("[INFO] No mock data found. Nothing to wipe.")
        return

    if args.dry_run:
        print("[DRY RUN] Would remove the following records (first 5 shown):")
        for r in bad[:5]:
            print(f" episode_id={r.get('episode_id', '?')} "
                  f"persona={r.get('persona', '?')} "
                  f"reward={r.get('reward', '?')}")
        return

    # Back up existing file before modifying
    BACKUP_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_path = BACKUP_DIR / f"episodes_backup_{ts}.jsonl"
    shutil.copy2(DATA_FILE, backup_path)
    print(f"[INFO] Backup saved to {backup_path}")

    if skipped:
        print(f"[WARN] {skipped} malformed line(s) will not be carried over "
              f"(still available in the backup).")

    # Write the survivors to a sibling temp file and swap it in, so an
    # interrupted run never leaves DATA_FILE half-written.
    tmp_path = DATA_FILE.with_name(DATA_FILE.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(r) + "\n" for r in good)
        tmp_path.replace(DATA_FILE)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up when writing or swapping failed.
        if tmp_path.exists():
            tmp_path.unlink()

    print(f"[OK] Wiped {len(bad)} mock records. {len(good)} good records remain.")
if __name__ == "__main__":
    # Run the wipe only when executed as a script, not when imported.
    main()