Fix Nemotron-PII spans parsing with ast.literal_eval fallback
Browse files
train.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
import json, random, argparse
|
| 3 |
import numpy as np, torch
|
| 4 |
from datasets import load_dataset, Dataset
|
| 5 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
|
|
@@ -66,7 +66,10 @@ def load_nemotron_split(split, max_examples=10000):
|
|
| 66 |
text = ex["text"]
|
| 67 |
spans_raw = ex["spans"]
|
| 68 |
if isinstance(spans_raw, str):
|
| 69 |
-
spans_raw = json.loads(spans_raw)
|
|
|
|
|
|
|
|
|
|
| 70 |
spans = []
|
| 71 |
for sp in spans_raw:
|
| 72 |
lab = NEMOTRON_MAP.get(sp["label"])
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
+
import json, random, argparse, ast
|
| 3 |
import numpy as np, torch
|
| 4 |
from datasets import load_dataset, Dataset
|
| 5 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
|
|
|
|
| 66 |
text = ex["text"]
|
| 67 |
spans_raw = ex["spans"]
|
| 68 |
if isinstance(spans_raw, str):
|
| 69 |
+
try:
|
| 70 |
+
spans_raw = json.loads(spans_raw)
|
| 71 |
+
except json.JSONDecodeError:
|
| 72 |
+
spans_raw = ast.literal_eval(spans_raw)
|
| 73 |
spans = []
|
| 74 |
for sp in spans_raw:
|
| 75 |
lab = NEMOTRON_MAP.get(sp["label"])
|