fschlatt committed on
Commit
b55bd0b
·
verified ·
1 Parent(s): e1aa043

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "absolute_positional_embedding_type": null,
3
+ "architectures": [
4
+ "TiteForPreTraining"
5
+ ],
6
+ "dropout_prob": 0.1,
7
+ "hidden_act": "gelu_pytorch_tanh",
8
+ "hidden_sizes": [
9
+ 768,
10
+ 768,
11
+ 768,
12
+ 768,
13
+ 768,
14
+ 768,
15
+ 768,
16
+ 768,
17
+ 768,
18
+ 768,
19
+ 768,
20
+ 768
21
+ ],
22
+ "initializer_range": 0.02,
23
+ "intermediate_sizes": [
24
+ 3072,
25
+ 3072,
26
+ 3072,
27
+ 3072,
28
+ 3072,
29
+ 3072,
30
+ 3072,
31
+ 3072,
32
+ 3072,
33
+ 3072,
34
+ 3072,
35
+ 3072
36
+ ],
37
+ "kernel_sizes": [
38
+ null,
39
+ null,
40
+ null,
41
+ 2,
42
+ 2,
43
+ 2,
44
+ 2,
45
+ 2,
46
+ 2,
47
+ 2,
48
+ 2,
49
+ 2
50
+ ],
51
+ "layer_norm_eps": 1e-12,
52
+ "max_position_embeddings": 512,
53
+ "model_type": "tite",
54
+ "norm_location": "post",
55
+ "norm_type": "layer",
56
+ "num_attention_heads": [
57
+ 12,
58
+ 12,
59
+ 12,
60
+ 12,
61
+ 12,
62
+ 12,
63
+ 12,
64
+ 12,
65
+ 12,
66
+ 12,
67
+ 12,
68
+ 12
69
+ ],
70
+ "num_hidden_layers": 12,
71
+ "pad_token_id": 0,
72
+ "pooling_implementation": "triton",
73
+ "pooling_location": "intra",
74
+ "positional_embedding_type": null,
75
+ "relative_positional_embedding_type": "rotary",
76
+ "rope_implementation": "eager",
77
+ "rotary_interleaved": true,
78
+ "strides": [
79
+ null,
80
+ null,
81
+ null,
82
+ 2,
83
+ 2,
84
+ 2,
85
+ 2,
86
+ 2,
87
+ 2,
88
+ 2,
89
+ 2,
90
+ 2
91
+ ],
92
+ "torch_dtype": "float32",
93
+ "transformers_version": "4.52.4",
94
+ "vocab_size": 30522
95
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5bb85190c15abf78d033a207e29af4e8312336a2b783102c9438dd2f8643add
3
+ size 467345784
pl_config.yaml ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # lightning.pytorch==2.5.2
2
+ seed_everything: 42
3
+ trainer:
4
+ accelerator: auto
5
+ strategy: auto
6
+ devices: auto
7
+ num_nodes: 1
8
+ precision: bf16-mixed
9
+ callbacks:
10
+ - class_path: lightning.pytorch.callbacks.ModelCheckpoint
11
+ init_args:
12
+ dirpath: null
13
+ filename: null
14
+ monitor: null
15
+ verbose: false
16
+ save_last: null
17
+ save_top_k: 1
18
+ save_weights_only: false
19
+ mode: min
20
+ auto_insert_metric_name: true
21
+ every_n_train_steps: null
22
+ train_time_interval: null
23
+ every_n_epochs: null
24
+ save_on_train_epoch_end: null
25
+ enable_version_counter: true
26
+ fast_dev_run: false
27
+ max_epochs: null
28
+ min_epochs: null
29
+ max_steps: 200000
30
+ min_steps: null
31
+ max_time: null
32
+ limit_train_batches: null
33
+ limit_val_batches: null
34
+ limit_test_batches: null
35
+ limit_predict_batches: null
36
+ overfit_batches: 0.0
37
+ val_check_interval: 50000
38
+ check_val_every_n_epoch: 1
39
+ num_sanity_val_steps: null
40
+ log_every_n_steps: null
41
+ enable_checkpointing: null
42
+ enable_progress_bar: false
43
+ enable_model_summary: null
44
+ accumulate_grad_batches: 2
45
+ gradient_clip_val: 1
46
+ gradient_clip_algorithm: null
47
+ deterministic: null
48
+ benchmark: null
49
+ inference_mode: true
50
+ use_distributed_sampler: true
51
+ profiler: null
52
+ detect_anomaly: false
53
+ barebones: false
54
+ plugins: null
55
+ sync_batchnorm: false
56
+ reload_dataloaders_every_n_epochs: 0
57
+ default_root_dir: null
58
+ model_registry: null
59
+ model:
60
+ class_path: tite.module.TiteModule
61
+ init_args:
62
+ model:
63
+ class_path: tite.model.TiteForPreTraining
64
+ init_args:
65
+ config:
66
+ class_path: tite.model.TiteConfig
67
+ init_args:
68
+ vocab_size: 30522
69
+ num_hidden_layers: 12
70
+ hidden_sizes: 768
71
+ num_attention_heads: 12
72
+ intermediate_sizes: 3072
73
+ kernel_sizes:
74
+ - null
75
+ - null
76
+ - null
77
+ - 2
78
+ - 2
79
+ - 2
80
+ - 2
81
+ - 2
82
+ - 2
83
+ - 2
84
+ - 2
85
+ - 2
86
+ strides:
87
+ - null
88
+ - null
89
+ - null
90
+ - 2
91
+ - 2
92
+ - 2
93
+ - 2
94
+ - 2
95
+ - 2
96
+ - 2
97
+ - 2
98
+ - 2
99
+ dropout_prob: 0.1
100
+ max_position_embeddings: 512
101
+ initializer_range: 0.02
102
+ layer_norm_eps: 1.0e-12
103
+ pad_token_id: 0
104
+ hidden_act: gelu_pytorch_tanh
105
+ absolute_positional_embedding_type: null
106
+ relative_positional_embedding_type: rotary
107
+ pooling_location: intra
108
+ rotary_interleaved: true
109
+ norm_location: post
110
+ norm_type: layer
111
+ pooling_implementation: triton
112
+ rope_implementation: eager
113
+ positional_embedding_type: null
114
+ enhanced_masked_auto_encoding: true
115
+ bow_auto_encoding: true
116
+ tokenizer:
117
+ class_path: tite.model.TiteTokenizer
118
+ init_args:
119
+ vocab_file: tokenizers/tite/vocab.txt
120
+ tokenizer_file: tokenizers/tite/tokenizer.json
121
+ do_lower_case: true
122
+ unk_token: '[UNK]'
123
+ sep_token: '[SEP]'
124
+ pad_token: '[PAD]'
125
+ cls_token: '[CLS]'
126
+ mask_token: '[MASK]'
127
+ tokenize_chinese_chars: true
128
+ strip_accents: null
129
+ dict_kwargs:
130
+ model_max_length: 512
131
+ validate_on_glue: true
132
+ validate_on_trec_dl: true
133
+ log_gradients: false
134
+ compile: true
135
+ data:
136
+ class_path: tite.datasets.FineWebDataModule
137
+ init_args:
138
+ collator:
139
+ class_path: tite.datasets.TransformationCollator
140
+ init_args:
141
+ text_keys:
142
+ - text
143
+ - null
144
+ string_transformations: null
145
+ token_transformations:
146
+ - class_path: tite.transformation.TokenMask
147
+ init_args:
148
+ mask_id: 103
149
+ mask_prob: 0.3
150
+ transformation_prob: 1.0
151
+ max_length: 512
152
+ path: HuggingFaceFW/fineweb-edu
153
+ batch_size: 128
154
+ seed: null
155
+ num_workers: 8
156
+ streaming: true
157
+ lr_scheduler:
158
+ class_path: tite.utils.lr_schedulers.SigmoidLRSchedulerWithLinearWarmup
159
+ init_args:
160
+ num_warmup_steps: 3000
161
+ final_value: 0.02
162
+ num_delay_steps: 0
163
+ optimizer:
164
+ class_path: tite.utils.adamw.AdamWNoWeightDecayBiasNorm
165
+ init_args:
166
+ lr: 0.0001
167
+ betas:
168
+ - 0.9
169
+ - 0.999
170
+ eps: 1.0e-08
171
+ weight_decay: 0.01
172
+ amsgrad: false
173
+ maximize: false
174
+ foreach: null
175
+ capturable: false
176
+ differentiable: false
177
+ fused: null
178
+ ckpt_path: null
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "max_length": 512,
50
+ "model_max_length": 512,
51
+ "pad_to_multiple_of": 8,
52
+ "pad_token": "[PAD]",
53
+ "pad_token_type_id": 0,
54
+ "padding_side": "right",
55
+ "sep_token": "[SEP]",
56
+ "stride": 0,
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "TiteTokenizer",
60
+ "truncation_side": "right",
61
+ "truncation_strategy": "longest_first",
62
+ "unk_token": "[UNK]"
63
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff