YAML Metadata Warning: empty or missing YAML metadata in repo card.
Check out the documentation for more information.
Model trained with the following setup:
# llm-foundry fine-tuning configuration.
# NOTE(review): the original paste lost all indentation; the nesting below is
# reconstructed following llm-foundry's standard finetune-YAML layout — confirm
# against the original file (in particular that config_overrides sits under
# model rather than under copy_config).
variables:
  global_seed: 46
  max_seq_len: 8192
  run_name: llama32_V3_DFT_Q512xP500_8192_TOP3GT

max_seq_len: ${variables.max_seq_len}
run_name: ${variables.run_name}

fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false

model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mfajcik/Llama-3.2-3B-Instruct_PLUS2IDLAYERS  # plus2layers exp
  use_flash_attention_2: true
  copy_config:
    copy_implementation: v3.4_fulljoint_ss
    copy_token: "|copy|"
    # train_copy_head_only: true  # freezes all the model weights, except for the copyhead.
    # train_copy_head_and_2extra_id_layers: true  # plus2layers exp
    # copy_head_matrices_init_path: /storage/brno2/home/ifajcik/code/llm-foundry25/extra_saved/pretrained_matrices/llama32_3B_8192_v3_copyhead_matrices_init_1500s.pt  # plus2layers exp
  config_overrides:
    span_heads: 256
    span_d: 6144
    # Queries
    K_past_positives: 312  # how many 'positives' to take from tokens with copy position annotated (span gt)
    K_past_negatives: 200  # how many 'negatives' to sample from tokens with without copy position annotation (token gt)
    # Past states - how many candidates to consider for starts/ends for each query (number includes gt if available + negatives)
    K_start: 500
    K_end: 500
    include_prefix_candidates: true
    include_prefix_candidates_n: 40
    include_suffix_candidates: true
    include_suffix_candidates_n: 10
    smart_sampling: true
    hn_topk_positions: 800
    distill_length_reward_beta: 5
    distill_teacher_span_topk: 3
    distill_teacher_span_topk_includegt: true

# Tokenizer
tokenizer:
  name: meta-llama/Llama-3.2-3B-Instruct
  kwargs:
    model_max_length: ${variables.max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: v3_data
    split: train
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: true
    decoder_only_format: true
    shuffle: true
    preprocessing_fn: src.preprocessing.utils:filter_prompt_response
  drop_last: true
  num_workers: 4
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 30

eval_loaders:
  - name: finetuning
    label: wildchat
    dataset:
      hf_name: v3_validation_data/wildchat_4k
      split: validation
      max_seq_len: ${variables.max_seq_len}
      allow_pad_trimming: false
      decoder_only_format: true
      shuffle: false
      preprocessing_fn: src.preprocessing.utils:filter_prompt_response
    drop_last: true
    num_workers: 4
    pin_memory: false
    prefetch_factor: 2
    persistent_workers: true
    timeout: 30
  # - name: finetuning
  #   label: ultrachat
  #   dataset:
  #     hf_name: v3_validation_data/ultrachat_4k
  #     split: validation
  #     max_seq_len: ${variables.max_seq_len}
  #     allow_pad_trimming: false
  #     decoder_only_format: true
  #     shuffle: false
  #     preprocessing_fn: src.preprocessing.utils:filter_prompt_response
  #   drop_last: true
  #   num_workers: 8
  #   pin_memory: false
  #   prefetch_factor: 2
  #   persistent_workers: true
  #   timeout: 0

## Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 200ba
  alpha_f: 0.1

optimizer:
  name: decoupled_lionw
  lr: 5e-6
  betas:
    - 0.9
    - 0.95
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 2.0

max_duration: 1ep
eval_interval: 500ba
# eval_first: true
global_train_batch_size: 128

# System
seed: ${variables.global_seed}
device_eval_batch_size: 1
device_train_microbatch_size: 1
precision: amp_bf16

# Logging
progress_bar: true
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

loggers:
  wandb:
    project: "copylm"  # Replace with your project name
    entity: "ifajcik-brno-university-of-technology"  # Replace with your username or team name
    name: "v3_DPT_P2_Q512xP500_8192_FULLFT_TOPK3GTBETA5"  # Optional: name of the current experiment

# Checkpoint to local filesystem or remote object store
save_interval: 500ba
autoresume: true
save_num_checkpoints_to_keep: 20  # Important, this cleans up checkpoints saved to DISK
load_path: /storage/brno2/home/ifajcik/code/llm-foundry25/.saved/llama32_V3_DFT_Q512xP400_8192/latest-rank0.pt
load_weights_only: true
save_folder: /storage/brno2/home/ifajcik/code/llm-foundry25/.saved/${variables.run_name}
Downloads last month: 17
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support