{
  "source_dataset": "HuggingFaceFW/fineweb-edu",
  "source_config": "sample-10BT",
  "train_rows": 80000,
  "validation_rows": 4000,
  "text_column": "text",
  "cleaning": "remove null bytes, strip, collapse whitespace, filter by minimum character length",
  "min_text_chars": 200
}