---
# Hydra defaults list: compose this config from the named config groups
# (base settings, experiment manager, model, trainer). Per Hydra semantics,
# placing _self_ last means values in this file override the composed groups.
defaults:
  - base
  - exp_manager: sft
  - model: llm-jp-3-13b
  - trainer: sft
  - _self_

# Dataset I/O for training and validation. ${data_dir}, ${gbs} and ${mbs} are
# OmegaConf interpolations; ${gbs}/${mbs} resolve to the hyperparameters
# defined later in this file, ${data_dir} presumably comes from the composed
# base config — confirm there.
data:
  train_ds:
    data_dir: ${data_dir}/tuning/train  # training split location
    global_batch_size: ${gbs}
    micro_batch_size: ${mbs}
  validation_ds:
    data_dir: ${data_dir}/tuning/dev  # validation (dev) split location
    global_batch_size: ${gbs}
    micro_batch_size: ${mbs}

# Tuning datasets. Per-dataset options:
#   max_train_samples: max number of samples to use for training. -1 means all. 0 means not to use.
#   split_dev: whether to split the dataset into training and validation datasets. If false, the dataset is used for training only.
#   upsampling_factor: upsampling factor for the dataset. 1 means no upsampling. Valid for both training and validation datasets.
datasets:
  # answer_carefully is heavily upsampled (x16) and used for training only.
  answer_carefully:
    max_train_samples: -1 # -1 means all
    split_dev: false
    upsampling_factor: 16
  calm3_22b_chat_20241018083433--Qwen2.5_32B_Instruct_20241022115410:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  calm3_22b_chat_20241022133932--Qwen2.5_32B_Instruct_20241024100350:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  calm3_22b_chat_20241022155627--Qwen2.5_32B_Instruct_20241024144245:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  daring_anteater_en:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  flan:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  ichikara:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  logical_math_coding_wizard8x22b:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  multiturn_calm3:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  random_to_fixed_multiturn_calm3:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  synthetic_jp_en_coding_0:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
# Caps on the validation split carved out of each dataset: the number of dev
# samples taken is min(max_dev_samples, max_dev_ratio * dataset size).
max_dev_samples: 1000
max_dev_ratio: 0.1

# hyperparameters
gbs: 64  # global batch size (interpolated as ${gbs} in data above)
mbs: 1  # micro batch size (interpolated as ${mbs} in data above)
dropout: 0.0
# NOTE(review): bare "2e-5"/"2e-6" are strings under strict YAML 1.1 loaders
# (no dot before the exponent); OmegaConf/Hydra resolve them as floats, so
# they are left unquoted here.
lr: 2e-5  # peak learning rate
min_lr: 2e-6  # lower bound of the lr schedule — presumably decay target; verify in trainer config

# other options
use_mpi: false  # NOTE(review): presumably selects MPI-based launch — confirm in launcher code
use_slurm: false # This option should be set to true when using Slurm and MPI. Otherwise, set it to false.

# If true, drop the keys listed in hparams_to_ignore_on_save when saving hyperparameters.
ignore_hparams_on_save: false

# constants
# Keys excluded from saved hyperparameters when ignore_hparams_on_save is
# true — presumably run-/environment-specific values that should not define
# the experiment; confirm against the saving code.
hparams_to_ignore_on_save:
  - project
  - work_dir
  - data_dir
  - seed
  - name
  - exp_dir
  - run_id
  - run_dir
  - config_name
  - logger
  - hparams_to_ignore_on_save
  - per_device_train_batch_size
  - per_device_eval_batch_size
  - gradient_checkpointing
  - logging_steps
  - eval_steps
  - save_steps
  - use_mpi
  - use_slurm