# File: ARBS/training/data/prepare_starcoder.py
# Hugging Face Hub page header (kept for provenance): "CLIWorks's picture"
# Commit message: Upload folder using huggingface_hub
# Commit: d8bc908 (verified)
"""StarCoderData code data streaming.
Dataset: https://huggingface.co/datasets/bigcode/starcoderdata
Storage: ~50 GB for sampled subset
License: Permissive (varies by language)
"""
import torch
from dataclasses import dataclass
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from arbitor.config import SPECIAL_VOCAB
@dataclass
class StarCoderConfig:
    """Settings for streaming StarCoderData as byte-level training batches.

    All fields have defaults, so ``StarCoderConfig()`` is a usable
    configuration; override individual fields by keyword.
    """

    # Language subsets of the corpus to load, one stream per entry.
    languages: tuple = ("python", "javascript", "java", "cpp", "rust")
    # Sequence length of the input/target tensors; each example is cut to
    # ``ctx + 1`` tokens before the one-token shift.
    ctx: int = 2048
    # Number of examples stacked into every yielded batch.
    batch_size: int = 8
    # Size of the streaming shuffle buffer handed to ``Dataset.shuffle``.
    shuffle_buffer: int = 10000
    # Dataset split requested from the Hub.
    split: str = "train"
    # NOTE(review): not read by any code visible in this file; presumably the
    # fraction of the corpus meant to be sampled -- confirm against callers.
    sample_rate: float = 0.05
class StarCoderStream:
    """Stream StarCoderData source files as padded byte-level token batches.

    Each file's text is UTF-8 encoded and every byte value (0-255) is used
    directly as a token id, framed by the BOS/EOS ids from ``SPECIAL_VOCAB``.
    The Hub dataset is only opened on the first call to :meth:`batches`.

    NOTE(review): assumes the BOS/EOS/PAD ids in ``SPECIAL_VOCAB`` do not
    collide with the raw byte range 0-255 -- confirm in ``arbitor.config``.
    """

    def __init__(self, cfg: "StarCoderConfig"):
        """Store the configuration and look up the special token ids.

        Args:
            cfg: Streaming settings (languages, ctx, batch size, ...).
        """
        self.cfg = cfg
        self._ds = None  # populated lazily by _lazy_init()
        self._bos = SPECIAL_VOCAB['BOS']
        self._eos = SPECIAL_VOCAB['EOS']
        self._pad = SPECIAL_VOCAB['PAD']

    def _lazy_init(self):
        """Open one streaming dataset per language and build the shuffled stream.

        Does nothing when the stream has already been built.

        Raises:
            ValueError: If ``cfg.languages`` is empty.
        """
        if self._ds is not None:
            return
        if not self.cfg.languages:
            raise ValueError(
                "StarCoderConfig.languages must name at least one language"
            )
        # Imported here so the module can be imported without `datasets`.
        from datasets import load_dataset, concatenate_datasets

        # StarCoderData keeps each language in its own directory of the
        # repository; the dataset card selects it with `data_dir`, not with a
        # builder-config name.
        streams = [
            load_dataset(
                "bigcode/starcoderdata",
                data_dir=lang,
                split=self.cfg.split,
                streaming=True,
            )
            for lang in self.cfg.languages
        ]
        # NOTE(review): concatenation presumably keeps the languages back to
        # back, so the shuffle buffer would only mix examples locally --
        # confirm whether interleaving the streams is wanted instead.
        merged = concatenate_datasets(streams)
        self._ds = merged.shuffle(buffer_size=self.cfg.shuffle_buffer, seed=42)

    def _tokenize(self, text: str) -> torch.Tensor:
        """Encode ``text`` as ``[BOS, byte..., EOS]`` cut to ``ctx + 1`` tokens.

        Args:
            text: Source-file contents.

        Returns:
            1-D ``torch.long`` tensor with at most ``cfg.ctx + 1`` entries.
            EOS is only present when the whole text fits.
        """
        ctx = self.cfg.ctx
        raw = text.encode("utf-8")
        # Slice the bytes before expanding them into Python ints so a huge
        # file never becomes a huge list; the final slice drops EOS when the
        # text fills the window on its own.
        tokens = [self._bos, *raw[:ctx], self._eos][:ctx + 1]
        return torch.tensor(tokens, dtype=torch.long)

    def batches(self):
        """Yield ``(inputs, targets)`` next-token batches from the stream.

        Examples shorter than 5 tokens are skipped. Every batch stacks
        ``cfg.batch_size`` examples, right-padded with the PAD id to the
        longest one; ``targets`` is ``inputs`` shifted left by one position,
        so both have shape ``(batch_size, longest - 1)``. Examples left over
        when the stream ends (fewer than a full batch) are not emitted.

        Yields:
            Tuple of two contiguous ``torch.long`` tensors.
        """
        self._lazy_init()
        pending = []
        for example in self._ds:
            tokens = self._tokenize(example["content"])
            if tokens.numel() < 5:
                continue
            pending.append(tokens)
            if len(pending) < self.cfg.batch_size:
                continue
            # The buffer is flushed as soon as it is full, so it holds
            # exactly one batch here.
            batch, pending = pending, []
            padded = torch.nn.utils.rnn.pad_sequence(
                batch, batch_first=True, padding_value=self._pad
            )
            yield padded[:, :-1].contiguous(), padded[:, 1:].contiguous()

    def num_samples(self) -> int:
        """Return a rough count of ``ctx``-sized windows in the corpus.

        Computed as ``250_000_000_000 // cfg.ctx``; only ``cfg.ctx`` is read,
        so the configured languages and ``sample_rate`` do not affect it.
        NOTE(review): the constant is presumably an approximate corpus token
        count -- confirm.
        """
        return 250_000_000_000 // self.cfg.ctx