| import os |
| from tokenizers import Tokenizer |
| from tokenizers.models import BPE |
| from tokenizers.trainers import BpeTrainer |
| from tokenizers.pre_tokenizers import Whitespace, ByteLevel |
| from tokenizers.processors import TemplateProcessing |
|
|
def train_sovereign_tokenizer(corpus_path, vocab_size=50257):
    """
    Train a custom BPE tokenizer optimized for Indic and Ecological technical terms.

    Args:
        corpus_path: Directory containing ``.txt`` training files.
        vocab_size: Target vocabulary size. Default 50,257 to match
            model_config.yaml.

    Raises:
        FileNotFoundError: If ``corpus_path`` contains no ``.txt`` files.

    Side effects:
        Writes the trained tokenizer to
        ``data/processed/aravalli_tokenizer.json`` (directory created if
        missing) and prints progress messages.
    """
    tokenizer = Tokenizer(BPE(unk_token="<|unk|>"))

    # Byte-level pre-tokenization covers arbitrary Unicode input (Indic
    # scripts included) without needing an explicit character vocabulary.
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[
            "<|endoftext|>",
            "<|unk|>",
            "<|pad|>",
            "CATEGORY_SN",
            "CATEGORY_NE",
            "CATEGORY_IPN",
        ],
        show_progress=True,
        # Seed the vocab with the full byte-level alphabet so every input
        # byte is representable.
        initial_alphabet=ByteLevel.alphabet(),
    )

    files = [
        os.path.join(corpus_path, f)
        for f in os.listdir(corpus_path)
        if f.endswith(".txt")
    ]
    # Fail fast with a clear message instead of training on an empty corpus.
    if not files:
        raise FileNotFoundError(f"No .txt training files found in {corpus_path}")

    print(f"Commencing Tokenizer Training on {corpus_path}...")
    tokenizer.train(files, trainer)

    # Append <|endoftext|> to every encoded sequence. Look up the real id
    # rather than hard-coding 0, which would silently break if the
    # special-token ordering ever changed.
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>",
        special_tokens=[("<|endoftext|>", eot_id)],
    )

    # Tokenizer.save does not create missing directories — ensure it exists.
    os.makedirs("data/processed", exist_ok=True)
    tokenizer.save("data/processed/aravalli_tokenizer.json")
    print("Sovereign Tokenizer Enacted and Saved to data/processed/")
|
|
if __name__ == "__main__":
    # exist_ok avoids the check-then-create race of the
    # os.path.exists()/os.makedirs() pattern.
    os.makedirs("data/raw/", exist_ok=True)
    train_sovereign_tokenizer("data/raw/")
|
|