You need to agree to share your contact information to access this model

This repository is publicly accessible, but you have to accept the conditions to access its files and content.

Log in or Sign Up to review the conditions and access this model content.

YAML Metadata Warning: empty or missing YAML metadata in the repo card

Check out the documentation for more information.

The model was trained with the following setup:

# Shared values referenced elsewhere in this config via ${variables.*}
# (OmegaConf-style interpolation — presumably resolved by the training
# framework; a plain YAML parser keeps the literal "${...}" strings).
variables:
  global_seed: 46
  max_seq_len: 8192  # context length; reused by tokenizer and both dataloaders below
  run_name: llama32_V3_DFT_Q512xP500_8192_TOP3GT  # also used in save_folder path

# Promote shared values to the top-level keys the trainer reads.
max_seq_len: ${variables.max_seq_len}
run_name: ${variables.run_name}

# Fully Sharded Data Parallel settings.
fsdp_config:
  sharding_strategy: FULL_SHARD  # shard parameters, gradients and optimizer state
  mixed_precision: PURE
  activation_checkpointing: true  # recompute activations in backward to save memory
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false

# HF causal LM with a custom copy-head extension (copy_config / config_overrides
# are consumed by the project's model code, not by vanilla HF transformers —
# NOTE(review): semantics below inferred from key names and inline comments;
# confirm against the copy-head implementation).
model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mfajcik/Llama-3.2-3B-Instruct_PLUS2IDLAYERS # plus2layers exp
  use_flash_attention_2: true
  copy_config:
    copy_implementation: v3.4_fulljoint_ss
    copy_token: "|copy|"  # special marker token; quoted so '|' is unambiguously a string
  #    train_copy_head_only: true # freezes all the model weights, except for the copyhead.
  #    train_copy_head_and_2extra_id_layers: true # plus2layers exp
  #    copy_head_matrices_init_path: /storage/brno2/home/ifajcik/code/llm-foundry25/extra_saved/pretrained_matrices/llama32_3B_8192_v3_copyhead_matrices_init_1500s.pt # plus2layers exp

  config_overrides:
    span_heads: 256
    span_d: 6144
    # Queriess
    K_past_positives: 312 # how many 'positives' to take from tokens with copy position annotated (span gt)
    K_past_negatives: 200 # how many 'negatives' to sample from tokens with without copy position annotation (token gt)
    # Past states - how many candidates to consider for starts/ends for each query (number includes gt if available + negatives)
    K_start: 500
    K_end: 500
    include_prefix_candidates: true
    include_prefix_candidates_n: 40
    include_suffix_candidates: true
    include_suffix_candidates_n: 10
    smart_sampling: true
    hn_topk_positions: 800

    # Distillation of teacher span predictions (top-3 spans per the run name).
    distill_length_reward_beta: 5
    distill_teacher_span_topk: 3
    distill_teacher_span_topk_includegt: true

# Tokenizer — base Llama 3.2 tokenizer, capped at the shared sequence length.
tokenizer:
  name: meta-llama/Llama-3.2-3B-Instruct
  kwargs:
    model_max_length: ${variables.max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: v3_data  # local/remote HF dataset name — presumably a project dataset; confirm
    split: train
    max_seq_len: ${variables.max_seq_len}
    allow_pad_trimming: true
    decoder_only_format: true
    shuffle: true
    preprocessing_fn: src.preprocessing.utils:filter_prompt_response  # module:function hook
  drop_last: true
  num_workers: 4
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 30  # worker timeout in seconds

# Evaluation dataloaders; only wildchat is active — the ultrachat loader
# below is kept commented out for easy re-enabling.
eval_loaders:
  - name: finetuning
    label: wildchat
    dataset:
      hf_name: v3_validation_data/wildchat_4k
      split: validation
      max_seq_len: ${variables.max_seq_len}
      allow_pad_trimming: false  # keep padding intact for eval, unlike train_loader
      decoder_only_format: true
      shuffle: false
      preprocessing_fn: src.preprocessing.utils:filter_prompt_response
    drop_last: true
    num_workers: 4
    pin_memory: false
    prefetch_factor: 2
    persistent_workers: true
    timeout: 30

#  - name: finetuning
#    label: ultrachat
#    dataset:
#      hf_name: v3_validation_data/ultrachat_4k
#      split: validation
#      max_seq_len: ${variables.max_seq_len}
#      allow_pad_trimming: false
#      decoder_only_format: true
#      shuffle: false
#      preprocessing_fn: src.preprocessing.utils:filter_prompt_response
#    drop_last: true
#    num_workers: 8
#    pin_memory: false
#    prefetch_factor: 2
#    persistent_workers: true
#    timeout: 0

## Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 200ba  # Composer duration string: 200 batches of warmup
  alpha_f: 0.1  # final LR multiplier at the end of the cosine decay

# Decoupled LionW optimizer (weight decay decoupled from the update).
optimizer:
  name: decoupled_lionw
  # Written as 5.0e-6 rather than 5e-6: YAML 1.1 resolvers (e.g. PyYAML,
  # which OmegaConf loads through) require a dot in the mantissa, so a bare
  # "5e-6" is parsed as the *string* "5e-6", not a float. "5.0e-6" is a
  # float under both YAML 1.1 and 1.2, with the identical value.
  lr: 5.0e-6
  betas:
    - 0.9
    - 0.95
  weight_decay: 0.0

# Training-time algorithms (Composer-style).
algorithms:
  gradient_clipping:
    clipping_type: norm  # clip by global gradient norm
    clipping_threshold: 2.0

max_duration: 1ep  # train for one full epoch


eval_interval: 500ba  # evaluate every 500 batches
#eval_first: true


# Effective batch size across all devices; with microbatch size 1 below,
# gradient accumulation makes up the difference.
global_train_batch_size: 128

# System
seed: ${variables.global_seed}
device_eval_batch_size: 1
device_train_microbatch_size: 1
precision: amp_bf16  # bfloat16 automatic mixed precision

# Logging
progress_bar: true
log_to_console: true
console_log_interval: 1ba  # log every batch

# Monitoring callbacks; an empty mapping enables a callback with its defaults.
callbacks:
  speed_monitor:
    window_size: 10  # moving-average window (in batches) for throughput
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

# Experiment tracking.
loggers:
  wandb:
    project: "copylm"  # Replace with your project name
    entity: "ifajcik-brno-university-of-technology"  # Replace with your username or team name
    name: "v3_DPT_P2_Q512xP500_8192_FULLFT_TOPK3GTBETA5"  # Optional: name of the current experiment

# Checkpoint to local filesystem or remote object store
save_interval: 500ba
autoresume: true
save_num_checkpoints_to_keep: 20  # Important, this cleans up checkpoints saved to DISK

# Warm-start from a previous run's weights (weights only — no optimizer state).
load_path: /storage/brno2/home/ifajcik/code/llm-foundry25/.saved/llama32_V3_DFT_Q512xP400_8192/latest-rank0.pt
load_weights_only: true

save_folder: /storage/brno2/home/ifajcik/code/llm-foundry25/.saved/${variables.run_name}
Downloads last month
17
Safetensors
Model size
3B params
Tensor type
F32
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support