{ "source_dataset": "HuggingFaceFW/fineweb-edu", "source_config": "sample-10BT", "train_rows": 80000, "validation_rows": 4000, "text_column": "text", "cleaning": "remove null bytes, strip, collapse whitespace, filter by minimum character length", "min_text_chars": 200 }