ARBS / training /data /tokenize_from_hf.py
CLIWorks's picture
Upload folder using huggingface_hub
d8bc908 verified
"""Download and tokenize a HuggingFace dataset to the ARB byte vocabulary.
Usage:
python training/data/tokenize_from_hf.py \\
--repo "HuggingFaceFW/fineweb-edu" \\
--subset "sample-10BT" \\
--split "train" \\
--max_samples 10000 \\
--output "training/data/fineweb-edu.pt"
The script downloads text data, encodes it as UTF-8 bytes, and saves
as a tensor of byte indices compatible with ARB's VOCAB=297.
"""
import os, sys, torch
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from arbitor.config import VOCAB, SPECIAL_VOCAB
def encode_text(text: str) -> list[int]:
"""Encode text to byte token indices (0-255) with BOS/EOS markers."""
tokens = [SPECIAL_VOCAB['BOS']]
for byte in text.encode('utf-8'):
if byte < VOCAB: # bytes 0-255 fit in our vocab
tokens.append(byte)
tokens.append(SPECIAL_VOCAB['EOS'])
return tokens
def download_dataset(repo: str, subset: str = None, split: str = "train",
max_samples: int = None, token: str = None,
text_col: str = "text"):
"""Download and tokenize a HuggingFace text dataset.
Args:
repo: HF dataset repo (e.g., "HuggingFaceFW/fineweb-edu")
subset: Config/subset name (e.g., "sample-10BT")
split: Dataset split ("train", "test", etc.)
max_samples: Limit number of samples (None = all)
token: HF API token for private/gated datasets
text_col: Column name containing text data
Returns:
torch.Tensor of concatenated byte token sequences
"""
try:
from datasets import load_dataset
except ImportError:
print("Install datasets: pip install datasets")
sys.exit(1)
kwargs = {"split": split, "streaming": True}
if subset:
kwargs["name"] = subset
if token:
kwargs["token"] = token
print(f"Loading {repo}/{subset or ''} ({split})...")
ds = load_dataset(repo, **kwargs)
if max_samples:
ds = ds.take(max_samples)
all_tokens = []
count = 0
for example in ds:
text = example.get(text_col, "")
if not text or not isinstance(text, str):
continue
tokens = encode_text(text)
all_tokens.extend(tokens)
count += 1
if count % 1000 == 0:
print(f" Processed {count} samples, {len(all_tokens):,} tokens")
# Truncate or pad to align with model's expected dimensions
tensor = torch.tensor(all_tokens, dtype=torch.long)
print(f"Done: {count} samples → {len(tensor):,} byte tokens")
return tensor
def save_tensor(tensor: torch.Tensor, path: str):
"""Save tokenized data to .pt file, creating directory if needed."""
os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
torch.save(tensor, path)
print(f"Saved to {path}")
def load_tensor(path: str) -> torch.Tensor:
"""Load previously tokenized .pt file."""
return torch.load(path, weights_only=True)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Tokenize HF dataset to ARB byte format")
parser.add_argument("--repo", type=str, default="HuggingFaceFW/fineweb-edu",
help="HF dataset repo")
parser.add_argument("--subset", type=str, default="sample-10BT",
help="Dataset config/subset")
parser.add_argument("--split", type=str, default="train")
parser.add_argument("--output", type=str, default="training/data/dataset.pt",
help="Output .pt file path")
parser.add_argument("--max-samples", type=int, default=None,
help="Limit samples (default: all)")
parser.add_argument("--token", type=str, default=None,
help="HF API token for private datasets")
parser.add_argument("--text-col", type=str, default="text",
help="Text column name")
parser.add_argument("--verify", action="store_true",
help="Load and verify an existing .pt file")
args = parser.parse_args()
if args.verify:
data = load_tensor(args.output)
print(f"Loaded {args.output}: {len(data):,} tokens, "
f"min={data.min().item()}, max={data.max().item()}")
print(f" Valid range: 0-{VOCAB-1}")
sys.exit(0)
tensor = download_dataset(args.repo, args.subset, args.split,
args.max_samples, args.token, args.text_col)
save_tensor(tensor, args.output)