narcolepticchicken committed on
Commit
a7bd4f1
·
verified ·
1 Parent(s): dd5bf53

Fix Nemotron-PII spans parsing with ast.literal_eval fallback

Browse files
Files changed (1) hide show
  1. train.py +5 -2
train.py CHANGED
@@ -1,5 +1,5 @@
1
  #!/usr/bin/env python3
2
- import json, random, argparse
3
  import numpy as np, torch
4
  from datasets import load_dataset, Dataset
5
  from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
@@ -66,7 +66,10 @@ def load_nemotron_split(split, max_examples=10000):
66
  text = ex["text"]
67
  spans_raw = ex["spans"]
68
  if isinstance(spans_raw, str):
69
- spans_raw = json.loads(spans_raw)
 
 
 
70
  spans = []
71
  for sp in spans_raw:
72
  lab = NEMOTRON_MAP.get(sp["label"])
 
1
  #!/usr/bin/env python3
2
+ import json, random, argparse, ast
3
  import numpy as np, torch
4
  from datasets import load_dataset, Dataset
5
  from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
 
66
  text = ex["text"]
67
  spans_raw = ex["spans"]
68
  if isinstance(spans_raw, str):
69
+ try:
70
+ spans_raw = json.loads(spans_raw)
71
+ except json.JSONDecodeError:
72
+ spans_raw = ast.literal_eval(spans_raw)
73
  spans = []
74
  for sp in spans_raw:
75
  lab = NEMOTRON_MAP.get(sp["label"])