Training in progress, step 10

Browse files

Files changed (11) hide show

README.md +10 -13
adapter_config.json +5 -9
adapter_model.safetensors +1 -1
added_tokens.json +24 -0
all_results.json +6 -6
merges.txt +0 -0
tokenizer_config.json +1 -1
train_results.json +6 -6
trainer_state.json +713 -13
training_args.bin +2 -2
vocab.json +0 -0

README.md CHANGED Viewed

@@ -3,19 +3,16 @@ library_name: peft
 license: llama3.2
 base_model: meta-llama/Llama-3.2-1B
 tags:
-- base_model:adapter:meta-llama/Llama-3.2-1B
-- lora
-- transformers
-pipeline_tag: text-generation
 model-index:
-- name: testing
   results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
-# testing
 This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on an unknown dataset.
@@ -45,10 +42,10 @@ The following hyperparameters were used during training:
 - gradient_accumulation_steps: 10
 - total_train_batch_size: 160
 - total_eval_batch_size: 64
-- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
-- lr_scheduler_warmup_ratio: 0.03
-- training_steps: 20
 ### Training results
@@ -56,8 +53,8 @@ The following hyperparameters were used during training:
 ### Framework versions
-- PEFT 0.16.0
-- Transformers 4.53.2
-- Pytorch 2.7.1+cu126
-- Datasets 4.0.0
 - Tokenizers 0.21.2

 license: llama3.2
 base_model: meta-llama/Llama-3.2-1B
 tags:
+- generated_from_trainer
 model-index:
+- name: multi-gpu-llama-3-2-1b-40k-1e-4-custom-sft-2048-seqlen
   results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
+# multi-gpu-llama-3-2-1b-40k-1e-4-custom-sft-2048-seqlen
 This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on an unknown dataset.
 - gradient_accumulation_steps: 10
 - total_train_batch_size: 160
 - total_eval_batch_size: 64
+- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- training_steps: 1000
 ### Training results
 ### Framework versions
+- PEFT 0.14.0
+- Transformers 4.51.3
+- Pytorch 2.3.0+cu121
+- Datasets 2.15.0
 - Tokenizers 0.21.2

adapter_config.json CHANGED Viewed

@@ -3,7 +3,6 @@
   "auto_mapping": null,
   "base_model_name_or_path": "meta-llama/Llama-3.2-1B",
   "bias": "none",
-  "corda_config": null,
   "eva_config": null,
   "exclude_modules": null,
   "fan_in_fan_out": false,
@@ -13,29 +12,26 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
-  "lora_alpha": 16,
   "lora_bias": false,
   "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "qalora_group_size": 16,
   "r": 8,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "k_proj",
     "v_proj",
     "o_proj",
-    "down_proj",
     "q_proj",
-    "gate_proj",
-    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
-  "trainable_token_indices": null,
   "use_dora": false,
-  "use_qalora": false,
   "use_rslora": false
 }

   "auto_mapping": null,
   "base_model_name_or_path": "meta-llama/Llama-3.2-1B",
   "bias": "none",
   "eva_config": null,
   "exclude_modules": null,
   "fan_in_fan_out": false,
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
+  "lora_alpha": 16.0,
   "lora_bias": false,
   "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
   "r": 8,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "gate_proj",
     "v_proj",
+    "up_proj",
     "o_proj",
     "q_proj",
+    "down_proj",
+    "k_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
   "use_rslora": false
 }

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b9a4110b2f067c1c3df33d9875e16ff46ec4326efa78cd8ecf7e6b17dfa169e1
 size 11301520

 version https://git-lfs.github.com/spec/v1
+oid sha256:daf11edf266a10e84e0736bf55479df8aadd34e14d6515e4f3e2b5c89e4ebecc
 size 11301520

added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "epoch": 0.0365296803652968,
-    "total_flos": 3.848728354383462e+16,
-    "train_loss": 1.5816513061523438,
-    "train_runtime": 267.2013,
-    "train_samples_per_second": 11.976,
-    "train_steps_per_second": 0.075
 }

 {
+    "epoch": 4.0,
+    "total_flos": 1.9243641771917312e+18,
+    "train_loss": 0.050279883230105044,
+    "train_runtime": 7527.0065,
+    "train_samples_per_second": 21.257,
+    "train_steps_per_second": 0.133
 }

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json CHANGED Viewed

@@ -2060,5 +2060,5 @@
   "model_max_length": 131072,
   "pad_token": "<|end_of_text|>",
   "padding_side": "right",
-  "tokenizer_class": "PreTrainedTokenizerFast"
 }

   "model_max_length": 131072,
   "pad_token": "<|end_of_text|>",
   "padding_side": "right",
+  "tokenizer_class": "PreTrainedTokenizer"
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "epoch": 0.0365296803652968,
-    "total_flos": 3.848728354383462e+16,
-    "train_loss": 1.5816513061523438,
-    "train_runtime": 267.2013,
-    "train_samples_per_second": 11.976,
-    "train_steps_per_second": 0.075
 }

 {
+    "epoch": 4.0,
+    "total_flos": 1.9243641771917312e+18,
+    "train_loss": 0.050279883230105044,
+    "train_runtime": 7527.0065,
+    "train_samples_per_second": 21.257,
+    "train_steps_per_second": 0.133
 }

trainer_state.json CHANGED Viewed

@@ -2,27 +2,727 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.0365296803652968,
   "eval_steps": 0,
-  "global_step": 20,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0365296803652968,
-      "step": 20,
-      "total_flos": 3.848728354383462e+16,
-      "train_loss": 1.5816513061523438,
-      "train_runtime": 267.2013,
-      "train_samples_per_second": 11.976,
-      "train_steps_per_second": 0.075
     }
   ],
-  "logging_steps": 100,
-  "max_steps": 20,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -36,7 +736,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 3.848728354383462e+16,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 4.0,
   "eval_steps": 0,
+  "global_step": 1000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.04,
+      "grad_norm": 2.592644453048706,
+      "learning_rate": 5.8859191006777906e-05,
+      "loss": 4.3679,
+      "step": 10
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.18703584372997284,
+      "learning_rate": 7.657757302033369e-05,
+      "loss": 0.0344,
+      "step": 20
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.10068075358867645,
+      "learning_rate": 8.694216207171606e-05,
+      "loss": 0.0263,
+      "step": 30
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.06798414140939713,
+      "learning_rate": 9.429595503388949e-05,
+      "loss": 0.0251,
+      "step": 40
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.0597367137670517,
+      "learning_rate": 0.0001,
+      "loss": 0.0245,
+      "step": 50
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.04497462511062622,
+      "learning_rate": 0.0001,
+      "loss": 0.0236,
+      "step": 60
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.04295974224805832,
+      "learning_rate": 0.0001,
+      "loss": 0.0231,
+      "step": 70
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0416204072535038,
+      "learning_rate": 0.0001,
+      "loss": 0.0225,
+      "step": 80
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.03994525969028473,
+      "learning_rate": 0.0001,
+      "loss": 0.022,
+      "step": 90
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.045212067663669586,
+      "learning_rate": 0.0001,
+      "loss": 0.0215,
+      "step": 100
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.034988779574632645,
+      "learning_rate": 0.0001,
+      "loss": 0.0209,
+      "step": 110
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.03513108938932419,
+      "learning_rate": 0.0001,
+      "loss": 0.0205,
+      "step": 120
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.03687143325805664,
+      "learning_rate": 0.0001,
+      "loss": 0.0197,
+      "step": 130
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.031846143305301666,
+      "learning_rate": 0.0001,
+      "loss": 0.0192,
+      "step": 140
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.04322415590286255,
+      "learning_rate": 0.0001,
+      "loss": 0.0184,
+      "step": 150
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.03877222165465355,
+      "learning_rate": 0.0001,
+      "loss": 0.0176,
+      "step": 160
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.03259311988949776,
+      "learning_rate": 0.0001,
+      "loss": 0.0165,
+      "step": 170
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.04251185432076454,
+      "learning_rate": 0.0001,
+      "loss": 0.0151,
+      "step": 180
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.055991314351558685,
+      "learning_rate": 0.0001,
+      "loss": 0.0135,
+      "step": 190
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.049590013921260834,
+      "learning_rate": 0.0001,
+      "loss": 0.0112,
+      "step": 200
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.029243191704154015,
+      "learning_rate": 0.0001,
+      "loss": 0.0097,
+      "step": 210
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.028539329767227173,
+      "learning_rate": 0.0001,
+      "loss": 0.0087,
+      "step": 220
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.02026432566344738,
+      "learning_rate": 0.0001,
+      "loss": 0.008,
+      "step": 230
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.017264919355511665,
+      "learning_rate": 0.0001,
+      "loss": 0.0076,
+      "step": 240
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.022517086938023567,
+      "learning_rate": 0.0001,
+      "loss": 0.0071,
+      "step": 250
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 0.016512203961610794,
+      "learning_rate": 0.0001,
+      "loss": 0.007,
+      "step": 260
+    },
+    {
+      "epoch": 1.08,
+      "grad_norm": 0.01752372272312641,
+      "learning_rate": 0.0001,
+      "loss": 0.0065,
+      "step": 270
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 0.014535382390022278,
+      "learning_rate": 0.0001,
+      "loss": 0.006,
+      "step": 280
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 0.01353287324309349,
+      "learning_rate": 0.0001,
+      "loss": 0.0057,
+      "step": 290
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.014846131205558777,
+      "learning_rate": 0.0001,
+      "loss": 0.0055,
+      "step": 300
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.01546796876937151,
+      "learning_rate": 0.0001,
+      "loss": 0.0052,
+      "step": 310
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.013868402689695358,
+      "learning_rate": 0.0001,
+      "loss": 0.0049,
+      "step": 320
+    },
+    {
+      "epoch": 1.32,
+      "grad_norm": 0.013835090212523937,
+      "learning_rate": 0.0001,
+      "loss": 0.0049,
+      "step": 330
+    },
+    {
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.01379898190498352,
+      "learning_rate": 0.0001,
+      "loss": 0.0047,
+      "step": 340
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 0.014727466739714146,
+      "learning_rate": 0.0001,
+      "loss": 0.0045,
+      "step": 350
+    },
+    {
+      "epoch": 1.44,
+      "grad_norm": 0.011309986934065819,
+      "learning_rate": 0.0001,
+      "loss": 0.0044,
+      "step": 360
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 0.021115651354193687,
+      "learning_rate": 0.0001,
+      "loss": 0.0042,
+      "step": 370
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 0.01117760967463255,
+      "learning_rate": 0.0001,
+      "loss": 0.0041,
+      "step": 380
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.010027006268501282,
+      "learning_rate": 0.0001,
+      "loss": 0.0041,
+      "step": 390
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.012770233675837517,
+      "learning_rate": 0.0001,
+      "loss": 0.0039,
+      "step": 400
+    },
+    {
+      "epoch": 1.6400000000000001,
+      "grad_norm": 0.012774297036230564,
+      "learning_rate": 0.0001,
+      "loss": 0.0037,
+      "step": 410
+    },
+    {
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.018494736403226852,
+      "learning_rate": 0.0001,
+      "loss": 0.0038,
+      "step": 420
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 0.010668110102415085,
+      "learning_rate": 0.0001,
+      "loss": 0.0035,
+      "step": 430
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 0.009887272492051125,
+      "learning_rate": 0.0001,
+      "loss": 0.0036,
+      "step": 440
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 0.009353808127343655,
+      "learning_rate": 0.0001,
+      "loss": 0.0034,
+      "step": 450
+    },
+    {
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.009478743188083172,
+      "learning_rate": 0.0001,
+      "loss": 0.0034,
+      "step": 460
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.012139740400016308,
+      "learning_rate": 0.0001,
+      "loss": 0.0034,
+      "step": 470
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.011693431995809078,
+      "learning_rate": 0.0001,
+      "loss": 0.0034,
+      "step": 480
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 0.009398797526955605,
+      "learning_rate": 0.0001,
+      "loss": 0.0031,
+      "step": 490
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.008883015252649784,
+      "learning_rate": 0.0001,
+      "loss": 0.0032,
+      "step": 500
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 0.00975903868675232,
+      "learning_rate": 0.0001,
+      "loss": 0.0031,
+      "step": 510
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 0.009527013637125492,
+      "learning_rate": 0.0001,
+      "loss": 0.0029,
+      "step": 520
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 0.011014975607395172,
+      "learning_rate": 0.0001,
+      "loss": 0.003,
+      "step": 530
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 0.00897703692317009,
+      "learning_rate": 0.0001,
+      "loss": 0.0029,
+      "step": 540
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.009742386639118195,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 550
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.010376248508691788,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 560
+    },
+    {
+      "epoch": 2.2800000000000002,
+      "grad_norm": 0.010401038452982903,
+      "learning_rate": 0.0001,
+      "loss": 0.0028,
+      "step": 570
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 0.009547604247927666,
+      "learning_rate": 0.0001,
+      "loss": 0.0027,
+      "step": 580
+    },
+    {
+      "epoch": 2.36,
+      "grad_norm": 0.007607370615005493,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 590
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.008813534863293171,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 600
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 0.010278712958097458,
+      "learning_rate": 0.0001,
+      "loss": 0.0026,
+      "step": 610
+    },
+    {
+      "epoch": 2.48,
+      "grad_norm": 0.011279788799583912,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 620
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.009542998857796192,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 630
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.008792417123913765,
+      "learning_rate": 0.0001,
+      "loss": 0.0024,
+      "step": 640
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 0.008578693494200706,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 650
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.00964912585914135,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 660
+    },
+    {
+      "epoch": 2.68,
+      "grad_norm": 0.010405980981886387,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 670
+    },
+    {
+      "epoch": 2.7199999999999998,
+      "grad_norm": 0.010341090150177479,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 680
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 0.008614353835582733,
+      "learning_rate": 0.0001,
+      "loss": 0.0025,
+      "step": 690
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.007937280461192131,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 700
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.01031468715518713,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 710
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.00910355243831873,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 720
+    },
+    {
+      "epoch": 2.92,
+      "grad_norm": 0.00943271815776825,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 730
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 0.009353592060506344,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 740
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.010902035981416702,
+      "learning_rate": 0.0001,
+      "loss": 0.0023,
+      "step": 750
+    },
+    {
+      "epoch": 3.04,
+      "grad_norm": 0.009516863152384758,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 760
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 0.011545202694833279,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 770
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 0.007929637096822262,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 780
+    },
+    {
+      "epoch": 3.16,
+      "grad_norm": 0.008281239308416843,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 790
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 0.010873930528759956,
+      "learning_rate": 0.0001,
+      "loss": 0.0022,
+      "step": 800
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 0.008918201550841331,
+      "learning_rate": 0.0001,
+      "loss": 0.0021,
+      "step": 810
+    },
+    {
+      "epoch": 3.2800000000000002,
+      "grad_norm": 0.010175652801990509,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 820
+    },
+    {
+      "epoch": 3.32,
+      "grad_norm": 0.010831949301064014,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 830
+    },
+    {
+      "epoch": 3.36,
+      "grad_norm": 0.008530331775546074,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 840
+    },
+    {
+      "epoch": 3.4,
+      "grad_norm": 0.009083152748644352,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 850
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 0.009484045207500458,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 860
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 0.008554712869226933,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 870
+    },
+    {
+      "epoch": 3.52,
+      "grad_norm": 0.010989188216626644,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 880
+    },
+    {
+      "epoch": 3.56,
+      "grad_norm": 0.009386119432747364,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 890
+    },
+    {
+      "epoch": 3.6,
+      "grad_norm": 0.007983546704053879,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 900
+    },
+    {
+      "epoch": 3.64,
+      "grad_norm": 0.008688435889780521,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 910
+    },
+    {
+      "epoch": 3.68,
+      "grad_norm": 0.00950028095394373,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 920
+    },
+    {
+      "epoch": 3.7199999999999998,
+      "grad_norm": 0.010507030412554741,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 930
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 0.009756983257830143,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 940
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 0.008237377740442753,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 950
+    },
+    {
+      "epoch": 3.84,
+      "grad_norm": 0.0092322314158082,
+      "learning_rate": 0.0001,
+      "loss": 0.002,
+      "step": 960
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 0.006983477156609297,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 970
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 0.009244760498404503,
+      "learning_rate": 0.0001,
+      "loss": 0.0019,
+      "step": 980
+    },
+    {
+      "epoch": 3.96,
+      "grad_norm": 0.009358828887343407,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 990
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.009272817522287369,
+      "learning_rate": 0.0001,
+      "loss": 0.0018,
+      "step": 1000
+    },
+    {
+      "epoch": 4.0,
+      "step": 1000,
+      "total_flos": 1.9243641771917312e+18,
+      "train_loss": 0.050279883230105044,
+      "train_runtime": 7527.0065,
+      "train_samples_per_second": 21.257,
+      "train_steps_per_second": 0.133
     }
   ],
+  "logging_steps": 10,
+  "max_steps": 1000,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 1.9243641771917312e+18,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0b4ef0ada0bb7702fd24371930da3d2298189a4d27c86b3c9aac6c85f106d575
-size 6289

 version https://git-lfs.github.com/spec/v1
+oid sha256:2d1e2f02fb5745ba0f4e76e2eff21d03ad5d85b29be9ff6ff5010da0e9f41530
+size 9528

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff