Zohaib002 committed on
Commit
fb939a3
·
verified ·
1 Parent(s): 8b4797b

End of training

Browse files
README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: mit
4
+ base_model: gavin124/gpt2-finetuned-cnn-summarization-v2
5
+ tags:
6
+ - generated_from_trainer
7
+ model-index:
8
+ - name: GPT2-Fixed-Train-final
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # GPT2-Fixed-Train-final
16
+
17
+ This model is a fine-tuned version of [gavin124/gpt2-finetuned-cnn-summarization-v2](https://huggingface.co/gavin124/gpt2-finetuned-cnn-summarization-v2) on an unspecified dataset.
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 2e-05
37
+ - train_batch_size: 8
38
+ - eval_batch_size: 1
39
+ - seed: 42
40
+ - optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
41
+ - lr_scheduler_type: linear
42
+ - num_epochs: 1
43
+ - mixed_precision_training: Native AMP
44
+
45
+ ### Training results
46
+
47
+
48
+
49
+ ### Framework versions
50
+
51
+ - Transformers 4.57.2
52
+ - Pytorch 2.9.0+cu126
53
+ - Datasets 4.0.0
54
+ - Tokenizers 0.22.1
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<pad>": 50258,
3
+ "<|startoftext|>": 50257,
4
+ "<|summarize|>": 50259
5
+ }
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50257,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "pad_token_id": 50256,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "transformers_version": "4.57.2",
37
+ "use_cache": true,
38
+ "vocab_size": 50260
39
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50257,
4
+ "eos_token_id": [
5
+ 50256
6
+ ],
7
+ "pad_token_id": 50256,
8
+ "transformers_version": "4.57.2"
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06a5f5a3d7b7d0720c7f7ddb554bb1d3dab7fcdc90d3fac306ac3d68b4040820
3
+ size 497783424
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:949879ee8578b7a8bc6161e1874f5b0865ad3daa398adda547b70ec6ba97fd92
3
+ size 497825203
special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|summarize|>"
4
+ ],
5
+ "bos_token": {
6
+ "content": "<|startoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "eos_token": {
13
+ "content": "<|endoftext|>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "pad_token": "<|endoftext|>",
20
+ "unk_token": {
21
+ "content": "<|endoftext|>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ }
27
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "50257": {
14
+ "content": "<|startoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "50258": {
22
+ "content": "<pad>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "50259": {
30
+ "content": "<|summarize|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ }
37
+ },
38
+ "additional_special_tokens": [
39
+ "<|summarize|>"
40
+ ],
41
+ "bos_token": "<|startoftext|>",
42
+ "clean_up_tokenization_spaces": false,
43
+ "eos_token": "<|endoftext|>",
44
+ "errors": "replace",
45
+ "extra_special_tokens": {},
46
+ "model_max_length": 1024,
47
+ "pad_token": "<|endoftext|>",
48
+ "tokenizer_class": "GPT2Tokenizer",
49
+ "unk_token": "<|endoftext|>"
50
+ }
trainer_state.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 548,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.9124087591240876,
14
+ "grad_norm": 2.4354753494262695,
15
+ "learning_rate": 1.788321167883212e-06,
16
+ "loss": 2.268,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 1.0,
21
+ "step": 548,
22
+ "total_flos": 572229550080000.0,
23
+ "train_loss": 2.278221241749116,
24
+ "train_runtime": 323.4271,
25
+ "train_samples_per_second": 13.542,
26
+ "train_steps_per_second": 1.694
27
+ }
28
+ ],
29
+ "logging_steps": 500,
30
+ "max_steps": 548,
31
+ "num_input_tokens_seen": 0,
32
+ "num_train_epochs": 1,
33
+ "save_steps": 500,
34
+ "stateful_callbacks": {
35
+ "TrainerControl": {
36
+ "args": {
37
+ "should_epoch_stop": false,
38
+ "should_evaluate": false,
39
+ "should_log": false,
40
+ "should_save": true,
41
+ "should_training_stop": true
42
+ },
43
+ "attributes": {}
44
+ }
45
+ },
46
+ "total_flos": 572229550080000.0,
47
+ "train_batch_size": 8,
48
+ "trial_name": null,
49
+ "trial_params": null
50
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10b11a0b93c0f23c79f67959b9a8581dc1b9cabbe554b060372ffbbd563c3d21
3
+ size 5969
training_args.json ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/content/drive/MyDrive/GPT2-Fixed-Train-final",
3
+ "overwrite_output_dir": false,
4
+ "do_train": false,
5
+ "do_eval": false,
6
+ "do_predict": false,
7
+ "eval_strategy": "no",
8
+ "prediction_loss_only": false,
9
+ "per_device_train_batch_size": 8,
10
+ "per_device_eval_batch_size": 1,
11
+ "per_gpu_train_batch_size": null,
12
+ "per_gpu_eval_batch_size": null,
13
+ "gradient_accumulation_steps": 1,
14
+ "eval_accumulation_steps": null,
15
+ "eval_delay": 0,
16
+ "torch_empty_cache_steps": null,
17
+ "learning_rate": 2e-05,
18
+ "weight_decay": 0.01,
19
+ "adam_beta1": 0.9,
20
+ "adam_beta2": 0.999,
21
+ "adam_epsilon": 1e-08,
22
+ "max_grad_norm": 1.0,
23
+ "num_train_epochs": 1,
24
+ "max_steps": -1,
25
+ "lr_scheduler_type": "linear",
26
+ "lr_scheduler_kwargs": {},
27
+ "warmup_ratio": 0.0,
28
+ "warmup_steps": 0,
29
+ "log_level": "passive",
30
+ "log_level_replica": "warning",
31
+ "log_on_each_node": true,
32
+ "logging_dir": "GPT2-Fixed-Train/runs/Dec02_17-10-05_a30a22f2a9f5",
33
+ "logging_strategy": "steps",
34
+ "logging_first_step": false,
35
+ "logging_steps": 500,
36
+ "logging_nan_inf_filter": true,
37
+ "save_strategy": "steps",
38
+ "save_steps": 500,
39
+ "save_total_limit": 3,
40
+ "save_safetensors": true,
41
+ "save_on_each_node": false,
42
+ "save_only_model": false,
43
+ "restore_callback_states_from_checkpoint": false,
44
+ "no_cuda": false,
45
+ "use_cpu": false,
46
+ "use_mps_device": false,
47
+ "seed": 42,
48
+ "data_seed": null,
49
+ "jit_mode_eval": false,
50
+ "bf16": false,
51
+ "fp16": true,
52
+ "fp16_opt_level": "O1",
53
+ "half_precision_backend": "auto",
54
+ "bf16_full_eval": false,
55
+ "fp16_full_eval": false,
56
+ "tf32": null,
57
+ "local_rank": 0,
58
+ "ddp_backend": null,
59
+ "tpu_num_cores": null,
60
+ "tpu_metrics_debug": false,
61
+ "debug": [],
62
+ "dataloader_drop_last": false,
63
+ "eval_steps": null,
64
+ "dataloader_num_workers": 0,
65
+ "dataloader_prefetch_factor": null,
66
+ "past_index": -1,
67
+ "run_name": null,
68
+ "disable_tqdm": false,
69
+ "remove_unused_columns": true,
70
+ "label_names": null,
71
+ "load_best_model_at_end": false,
72
+ "metric_for_best_model": null,
73
+ "greater_is_better": null,
74
+ "ignore_data_skip": false,
75
+ "fsdp": [],
76
+ "fsdp_min_num_params": 0,
77
+ "fsdp_config": {
78
+ "min_num_params": 0,
79
+ "xla": false,
80
+ "xla_fsdp_v2": false,
81
+ "xla_fsdp_grad_ckpt": false
82
+ },
83
+ "fsdp_transformer_layer_cls_to_wrap": null,
84
+ "accelerator_config": {
85
+ "split_batches": false,
86
+ "dispatch_batches": null,
87
+ "even_batches": true,
88
+ "use_seedable_sampler": true,
89
+ "non_blocking": false,
90
+ "gradient_accumulation_kwargs": null
91
+ },
92
+ "parallelism_config": null,
93
+ "deepspeed": null,
94
+ "label_smoothing_factor": 0.0,
95
+ "optim": "adamw_torch_fused",
96
+ "optim_args": null,
97
+ "adafactor": false,
98
+ "group_by_length": false,
99
+ "length_column_name": "length",
100
+ "report_to": [],
101
+ "project": "huggingface",
102
+ "trackio_space_id": "trackio",
103
+ "ddp_find_unused_parameters": null,
104
+ "ddp_bucket_cap_mb": null,
105
+ "ddp_broadcast_buffers": null,
106
+ "dataloader_pin_memory": true,
107
+ "dataloader_persistent_workers": false,
108
+ "skip_memory_metrics": true,
109
+ "use_legacy_prediction_loop": false,
110
+ "push_to_hub": false,
111
+ "resume_from_checkpoint": null,
112
+ "hub_model_id": null,
113
+ "hub_strategy": "every_save",
114
+ "hub_token": "<HUB_TOKEN>",
115
+ "hub_private_repo": null,
116
+ "hub_always_push": false,
117
+ "hub_revision": null,
118
+ "gradient_checkpointing": false,
119
+ "gradient_checkpointing_kwargs": null,
120
+ "include_inputs_for_metrics": false,
121
+ "include_for_metrics": [],
122
+ "eval_do_concat_batches": true,
123
+ "fp16_backend": "auto",
124
+ "push_to_hub_model_id": null,
125
+ "push_to_hub_organization": null,
126
+ "push_to_hub_token": "<PUSH_TO_HUB_TOKEN>",
127
+ "mp_parameters": "",
128
+ "auto_find_batch_size": false,
129
+ "full_determinism": false,
130
+ "torchdynamo": null,
131
+ "ray_scope": "last",
132
+ "ddp_timeout": 1800,
133
+ "torch_compile": false,
134
+ "torch_compile_backend": null,
135
+ "torch_compile_mode": null,
136
+ "include_tokens_per_second": false,
137
+ "include_num_input_tokens_seen": "no",
138
+ "neftune_noise_alpha": null,
139
+ "optim_target_modules": null,
140
+ "batch_eval_metrics": false,
141
+ "eval_on_start": false,
142
+ "use_liger_kernel": false,
143
+ "liger_kernel_config": null,
144
+ "eval_use_gather_object": false,
145
+ "average_tokens_across_devices": true,
146
+ "sortish_sampler": false,
147
+ "predict_with_generate": false,
148
+ "generation_max_length": null,
149
+ "generation_num_beams": null,
150
+ "generation_config": null
151
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff