sravanthib committed on
Commit
1918cb3
·
verified ·
1 Parent(s): bdfc007

Training completed

Browse files
README.md CHANGED
@@ -38,8 +38,10 @@ The following hyperparameters were used during training:
38
  - eval_batch_size: 8
39
  - seed: 42
40
  - distributed_type: multi-GPU
 
41
  - gradient_accumulation_steps: 10
42
- - total_train_batch_size: 20
 
43
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
44
  - lr_scheduler_type: cosine
45
  - lr_scheduler_warmup_ratio: 0.05
@@ -55,4 +57,4 @@ The following hyperparameters were used during training:
55
  - Transformers 4.51.3
56
  - Pytorch 2.3.0+cu121
57
  - Datasets 2.15.0
58
- - Tokenizers 0.21.2
 
38
  - eval_batch_size: 8
39
  - seed: 42
40
  - distributed_type: multi-GPU
41
+ - num_devices: 2
42
  - gradient_accumulation_steps: 10
43
+ - total_train_batch_size: 40
44
+ - total_eval_batch_size: 16
45
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
  - lr_scheduler_type: cosine
47
  - lr_scheduler_warmup_ratio: 0.05
 
57
  - Transformers 4.51.3
58
  - Pytorch 2.3.0+cu121
59
  - Datasets 2.15.0
60
+ - Tokenizers 0.21.4
adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
 
 
 
27
  "k_proj",
28
  "up_proj",
29
- "gate_proj",
30
- "down_proj",
31
- "q_proj",
32
- "o_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
24
  "revision": null,
25
  "target_modules": [
26
  "v_proj",
27
+ "q_proj",
28
+ "down_proj",
29
+ "o_proj",
30
  "k_proj",
31
  "up_proj",
32
+ "gate_proj"
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40179f18b5d2cbc4f5334b8b416bea9dba2886ec9136b756b5516f5eddb7ccad
3
  size 11301520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d035c7d217578a4cf81b6f3d9058e6b661d834dc982c560bb61d99f7eb41ab0b
3
  size 11301520
all_results.json CHANGED
@@ -1,8 +1,11 @@
1
  {
2
- "epoch": 0.005,
3
- "total_flos": 2405455221489664.0,
4
- "train_loss": 3.154673767089844,
5
- "train_runtime": 69.8938,
6
- "train_samples_per_second": 2.861,
7
- "train_steps_per_second": 0.143
 
 
 
8
  }
 
1
  {
2
+ "avg_step_time": 6.669609618186951,
3
+ "epoch": 0.01,
4
+ "total_flos": 4810910442979328.0,
5
+ "total_training_time": 89.90108489990234,
6
+ "total_training_time_mins": 1.4983514149983723,
7
+ "train_loss": 3.1294017791748048,
8
+ "train_runtime": 82.5533,
9
+ "train_samples_per_second": 4.845,
10
+ "train_steps_per_second": 0.121
11
  }
tokenizer_config.json CHANGED
@@ -2050,6 +2050,7 @@
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
 
2053
  "clean_up_tokenization_spaces": true,
2054
  "eos_token": "<|end_of_text|>",
2055
  "extra_special_tokens": {},
 
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant\n'}}{% endif %}",
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|end_of_text|>",
2056
  "extra_special_tokens": {},
train_results.json CHANGED
@@ -1,8 +1,11 @@
1
  {
2
- "epoch": 0.005,
3
- "total_flos": 2405455221489664.0,
4
- "train_loss": 3.154673767089844,
5
- "train_runtime": 69.8938,
6
- "train_samples_per_second": 2.861,
7
- "train_steps_per_second": 0.143
 
 
 
8
  }
 
1
  {
2
+ "avg_step_time": 6.669609618186951,
3
+ "epoch": 0.01,
4
+ "total_flos": 4810910442979328.0,
5
+ "total_training_time": 89.90108489990234,
6
+ "total_training_time_mins": 1.4983514149983723,
7
+ "train_loss": 3.1294017791748048,
8
+ "train_runtime": 82.5533,
9
+ "train_samples_per_second": 4.845,
10
+ "train_steps_per_second": 0.121
11
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.005,
6
  "eval_steps": 0,
7
  "global_step": 10,
8
  "is_hyper_param_search": false,
@@ -10,20 +10,20 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.005,
14
- "grad_norm": 1.493022084236145,
15
  "learning_rate": 0.0001,
16
- "loss": 3.1547,
17
  "step": 10
18
  },
19
  {
20
- "epoch": 0.005,
21
  "step": 10,
22
- "total_flos": 2405455221489664.0,
23
- "train_loss": 3.154673767089844,
24
- "train_runtime": 69.8938,
25
- "train_samples_per_second": 2.861,
26
- "train_steps_per_second": 0.143
27
  }
28
  ],
29
  "logging_steps": 10,
@@ -43,7 +43,7 @@
43
  "attributes": {}
44
  }
45
  },
46
- "total_flos": 2405455221489664.0,
47
  "train_batch_size": 2,
48
  "trial_name": null,
49
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.01,
6
  "eval_steps": 0,
7
  "global_step": 10,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.01,
14
+ "grad_norm": 0.5258329510688782,
15
  "learning_rate": 0.0001,
16
+ "loss": 3.1294,
17
  "step": 10
18
  },
19
  {
20
+ "epoch": 0.01,
21
  "step": 10,
22
+ "total_flos": 4810910442979328.0,
23
+ "train_loss": 3.1294017791748048,
24
+ "train_runtime": 82.5533,
25
+ "train_samples_per_second": 4.845,
26
+ "train_steps_per_second": 0.121
27
  }
28
  ],
29
  "logging_steps": 10,
 
43
  "attributes": {}
44
  }
45
  },
46
+ "total_flos": 4810910442979328.0,
47
  "train_batch_size": 2,
48
  "trial_name": null,
49
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5b476429fe4dd0fdd2be265d8440ed606c5801b51bea1e75391a43581bf7dd3
3
- size 9528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6302d7204c11d65102ccfe9a727ca8d3bba190dd19170c0e413b309c5b094043
3
+ size 8440