| { | |
| "model": "multilingual_32k", | |
| "vocab_size": 32000, | |
| "bos_id": 1, | |
| "eos_id": 2, | |
| "config": { | |
| "character_coverage": 0.9995, | |
| "model_type": "bpe", | |
| "byte_fallback": true, | |
| "split_digits": true, | |
| "max_sentence_length": 16384, | |
| "input_sentence_size": 10000000 | |
| }, | |
| "data_sources": { | |
| "en": "allenai/c4 (en)", | |
| "ar": "wikimedia/wikipedia (20231101.ar)", | |
| "he": "wikimedia/wikipedia (20231101.he)", | |
| "fa": "wikimedia/wikipedia (20231101.fa)" | |
| }, | |
| "languages": { | |
| "en": { | |
| "num_tokens": 131858, | |
| "num_bytes": 502591, | |
| "num_words": 85508, | |
| "num_chars": 500000, | |
| "bytes_per_token": 3.81, | |
| "tokens_per_word": 1.54 | |
| }, | |
| "ar": { | |
| "num_tokens": 138572, | |
| "num_bytes": 900643, | |
| "num_words": 81698, | |
| "num_chars": 500000, | |
| "bytes_per_token": 6.5, | |
| "tokens_per_word": 1.7 | |
| }, | |
| "he": { | |
| "num_tokens": 150214, | |
| "num_bytes": 876334, | |
| "num_words": 81962, | |
| "num_chars": 500000, | |
| "bytes_per_token": 5.83, | |
| "tokens_per_word": 1.83 | |
| }, | |
| "fa": { | |
| "num_tokens": 129491, | |
| "num_bytes": 902876, | |
| "num_words": 91425, | |
| "num_chars": 500000, | |
| "bytes_per_token": 6.97, | |
| "tokens_per_word": 1.42 | |
| } | |
| }, | |
| "timestamp": "2026-04-01T14:12:42Z" | |
| } |