defaults:
# Hydra defaults list: configs merged in order; _self_ last so the
# values in this file override the composed defaults.
- base
- exp_manager: sft
- model: llm-jp-3-13b
- trainer: sft
- _self_
data:
  # Dataset locations and batch sizes.
  # ${gbs} and ${mbs} are OmegaConf interpolations of the top-level
  # hyperparameters in this file; ${data_dir} is presumably defined in a
  # composed config (it is listed in hparams_to_ignore_on_save) — verify.
  train_ds:
    data_dir: ${data_dir}/tuning/train
    global_batch_size: ${gbs}
    micro_batch_size: ${mbs}
  validation_ds:
    data_dir: ${data_dir}/tuning/dev
    global_batch_size: ${gbs}
    micro_batch_size: ${mbs}
# tuning datasets
# max_train_samples: max number of samples to use for training. -1 means all. 0 means not to use.
# split_dev: whether to split the dataset into training and validation datasets. If false, the dataset is used for training only.
# upsampling_factor: upsampling factor for the dataset. 1 means no upsampling. Valid for both training and validation datasets.
datasets:
  answer_carefully:
    max_train_samples: -1  # -1 means all
    split_dev: false
    upsampling_factor: 16
  calm3_22b_chat_20241018083433--Qwen2.5_32B_Instruct_20241022115410:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  calm3_22b_chat_20241022133932--Qwen2.5_32B_Instruct_20241024100350:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  calm3_22b_chat_20241022155627--Qwen2.5_32B_Instruct_20241024144245:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  daring_anteater_en:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  flan:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  ichikara:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  logical_math_coding_wizard8x22b:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  multiturn_calm3:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  random_to_fixed_multiturn_calm3:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
  synthetic_jp_en_coding_0:
    max_train_samples: -1
    split_dev: true
    upsampling_factor: 1
# The number of dev samples is the minimum of max_dev_samples and
# max_dev_ratio * (number of samples in the dataset).
max_dev_samples: 1000
max_dev_ratio: 0.1
# hyperparameters
gbs: 64  # global batch size, interpolated as ${gbs} in data.*_ds
mbs: 1  # micro batch size, interpolated as ${mbs} in data.*_ds
dropout: 0.0
lr: 2e-5
min_lr: 2e-6
# other options
use_mpi: false
use_slurm: false  # This option should be set to true when using Slurm and MPI. Otherwise, set it to false.
ignore_hparams_on_save: false
# constants
# Hyperparameters excluded when saving the config (takes effect when
# ignore_hparams_on_save is true).
hparams_to_ignore_on_save:
- project
- work_dir
- data_dir
- seed
- name
- exp_dir
- run_id
- run_dir
- config_name
- logger
- hparams_to_ignore_on_save
- per_device_train_batch_size
- per_device_eval_batch_size
- gradient_checkpointing
- logging_steps
- eval_steps
- save_steps
- use_mpi
- use_slurm