pavan207 commited on
Commit
02a3ce5
·
verified ·
1 Parent(s): a236c03

Upload tokenizer_parakeet_48k_261k/tokenizer_full_stats.json with huggingface_hub

Browse files
tokenizer_parakeet_48k_261k/tokenizer_full_stats.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "en": {
3
+ "tokens": 453320060.0,
4
+ "words": 349957279.0,
5
+ "lines": 16916959.0,
6
+ "tokens_per_word": 1.2953582828605774,
7
+ "avg_tokens_per_line": 26.796781856597278
8
+ },
9
+ "hi": {
10
+ "tokens": 280684874.0,
11
+ "words": 210299384.0,
12
+ "lines": 10398436.0,
13
+ "tokens_per_word": 1.3346918505476935,
14
+ "avg_tokens_per_line": 26.992989522655137
15
+ },
16
+ "te": {
17
+ "tokens": 209728016.0,
18
+ "words": 119466611.0,
19
+ "lines": 8520112.0,
20
+ "tokens_per_word": 1.7555366662238372,
21
+ "avg_tokens_per_line": 24.61564073336125
22
+ },
23
+ "ml": {
24
+ "tokens": 151621954.0,
25
+ "words": 77461860.0,
26
+ "lines": 5431191.0,
27
+ "tokens_per_word": 1.9573755910328,
28
+ "avg_tokens_per_line": 27.916888579318975
29
+ },
30
+ "pa": {
31
+ "tokens": 149498418.0,
32
+ "words": 108515094.0,
33
+ "lines": 5036773.0,
34
+ "tokens_per_word": 1.377673948289627,
35
+ "avg_tokens_per_line": 29.68138885750857
36
+ },
37
+ "ta": {
38
+ "tokens": 132895268.0,
39
+ "words": 72855361.0,
40
+ "lines": 4533355.0,
41
+ "tokens_per_word": 1.8240973097367537,
42
+ "avg_tokens_per_line": 29.314992538638602
43
+ },
44
+ "kn": {
45
+ "tokens": 86369605.0,
46
+ "words": 45502553.0,
47
+ "lines": 2988443.0,
48
+ "tokens_per_word": 1.8981265732496373,
49
+ "avg_tokens_per_line": 28.90120541030898
50
+ },
51
+ "gu": {
52
+ "tokens": 75126588.0,
53
+ "words": 44453889.0,
54
+ "lines": 2626542.0,
55
+ "tokens_per_word": 1.6899891030906204,
56
+ "avg_tokens_per_line": 28.602850439855903
57
+ },
58
+ "bn": {
59
+ "tokens": 66371126.0,
60
+ "words": 39086017.0,
61
+ "lines": 2441433.0,
62
+ "tokens_per_word": 1.6980785225570567,
63
+ "avg_tokens_per_line": 27.18531534553682
64
+ },
65
+ "mr": {
66
+ "tokens": 61685551.0,
67
+ "words": 38015683.0,
68
+ "lines": 2256325.0,
69
+ "tokens_per_word": 1.6226342954301256,
70
+ "avg_tokens_per_line": 27.338947625009695
71
+ },
72
+ "or": {
73
+ "tokens": 38244695.0,
74
+ "words": 19515379.0,
75
+ "lines": 1241428.0,
76
+ "tokens_per_word": 1.9597208437509719,
77
+ "avg_tokens_per_line": 30.807018208063617
78
+ },
79
+ "as": {
80
+ "tokens": 23542984.0,
81
+ "words": 12977320.0,
82
+ "lines": 785702.0,
83
+ "tokens_per_word": 1.8141637872842775,
84
+ "avg_tokens_per_line": 29.96426635034657
85
+ }
86
+ }