ronnengmail committed on
Commit
bbf3fc6
·
verified ·
1 Parent(s): fb8a501

Upload tokenizer/fertility_report.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer/fertility_report.json +55 -0
tokenizer/fertility_report.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "multilingual_32k",
3
+ "vocab_size": 32000,
4
+ "bos_id": 1,
5
+ "eos_id": 2,
6
+ "config": {
7
+ "character_coverage": 0.9995,
8
+ "model_type": "bpe",
9
+ "byte_fallback": true,
10
+ "split_digits": true,
11
+ "max_sentence_length": 16384,
12
+ "input_sentence_size": 10000000
13
+ },
14
+ "data_sources": {
15
+ "en": "allenai/c4 (en)",
16
+ "ar": "wikimedia/wikipedia (20231101.ar)",
17
+ "he": "wikimedia/wikipedia (20231101.he)",
18
+ "fa": "wikimedia/wikipedia (20231101.fa)"
19
+ },
20
+ "languages": {
21
+ "en": {
22
+ "num_tokens": 131858,
23
+ "num_bytes": 502591,
24
+ "num_words": 85508,
25
+ "num_chars": 500000,
26
+ "bytes_per_token": 3.81,
27
+ "tokens_per_word": 1.54
28
+ },
29
+ "ar": {
30
+ "num_tokens": 138572,
31
+ "num_bytes": 900643,
32
+ "num_words": 81698,
33
+ "num_chars": 500000,
34
+ "bytes_per_token": 6.5,
35
+ "tokens_per_word": 1.7
36
+ },
37
+ "he": {
38
+ "num_tokens": 150214,
39
+ "num_bytes": 876334,
40
+ "num_words": 81962,
41
+ "num_chars": 500000,
42
+ "bytes_per_token": 5.83,
43
+ "tokens_per_word": 1.83
44
+ },
45
+ "fa": {
46
+ "num_tokens": 129491,
47
+ "num_bytes": 902876,
48
+ "num_words": 91425,
49
+ "num_chars": 500000,
50
+ "bytes_per_token": 6.97,
51
+ "tokens_per_word": 1.42
52
+ }
53
+ },
54
+ "timestamp": "2026-04-01T14:12:42Z"
55
+ }