Built with Axolotl

See axolotl config

axolotl version: 0.10.1

# base_model: ./models/almanach--gprn-live-8b-test
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
# Run-level switches: debug and strict are both turned off.
debug: false
strict: false

# Liger integration: the plugin entry plus the four liger_* options,
# every one of which is enabled.
plugins:
  - 'axolotl.integrations.liger.LigerPlugin'
liger_fused_linear_cross_entropy: true
liger_glu_activation: true
liger_rms_norm: true
liger_rope: true

# Chat formatting and the single training dataset (train split, local path).
chat_template: 'llama3'
train_on_inputs: false

datasets:
  - name: 'tulu-3-sft-mixture-gaperon'
    path: './data/tulu-3-sft-mixture-gaperon'
    split: 'train'
    type: 'chat_template'
    field_messages: 'messages'
    # Identity mappings: `role` and `content` map to themselves.
    message_property_mappings:
      content: 'content'
      role: 'role'
    # Each role maps to a one-element list holding the same name.
    roles:
      assistant: ['assistant']
      system: ['system']
      user: ['user']

# Filesystem locations used by the run.
dataset_prepared_path: './cache/tulu-3-sft-mixture-gaperon'
output_dir: './runs/'

# Validation split size; sample packing is off for evaluation.
val_set_size: 0.02
eval_sample_packing: false

# Training sequences: length 4096, with packing and padding both enabled.
sequence_len: 4096
pad_to_sequence_len: true
sample_packing: true

# Weights & Biases settings are all left unset. Each value is written as an
# explicit `null`, which loads the same as the bare `key:` form.
wandb_entity: null
wandb_log_model: null
wandb_name: null
wandb_project: null
wandb_watch: null

# Optimisation schedule: 4 epochs, fused torch AdamW, cosine LR scheduler.
num_epochs: 4
optimizer: adamw_torch_fused
lr_scheduler: cosine
# Written with an explicit mantissa ("3.0e-5", not "3e-5"): YAML 1.1 loaders
# such as PyYAML only resolve exponent notation to a float when the number
# contains a ".", so the dot-less form is loaded as the string "3e-5".
learning_rate: 3.0e-5

# Numeric precision: bf16 is left on "auto", tf32 is off.
bf16: 'auto'
tf32: false

# Attention implementation and logging cadence.
flash_attention: true
logging_steps: 10

# Gradient checkpointing is disabled; the kwargs mapping is still supplied.
gradient_checkpointing: false
gradient_checkpointing_kwargs: {use_reentrant: false}

# No explicit checkpoint to resume from (null); auto-resume is switched on.
resume_from_checkpoint: null
auto_resume_from_checkpoints: true

# Warmup, weight decay, and per-epoch evaluation/save counts.
warmup_ratio: 0.1
weight_decay: 0.01
evals_per_epoch: 2
saves_per_epoch: 1
# fsdp:
#   - full_shard
#   - auto_wrap
# fsdp_config:
#   fsdp_limit_all_gathers: true
#   fsdp_sync_module_states: true
#   fsdp_offload_params: true
#   fsdp_use_orig_params: false
#   fsdp_cpu_ram_efficient_loading: true
#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
#   fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
#   fsdp_state_dict_type: FULL_STATE_DICT
#   fsdp_sharding_strategy: FULL_SHARD
#   fsdp_backward_prefetch: BACKWARD_PRE

# Path to the DeepSpeed JSON configuration file.
deepspeed: 'deepspeed_configs/zero3_bf16.json'

# Token strings, all quoted the same way. `eos_token` is the same string as
# the single `eot_tokens` entry.
special_tokens:
  bos_token: '<|begin_of_text|>'
  eos_token: '<|eot_id|>'
  pad_token: '<|end_of_text|>'
eot_tokens: ['<|eot_id|>']
# chat_template: jinja
# chat_template_jinja: |-
#   {{- bos_token }}
#   {%- if custom_tools is defined %}
#       {%- set tools = custom_tools %}
#   {%- endif %}
#   {%- if not tools_in_user_message is defined %}
#       {%- set tools_in_user_message = true %}
#   {%- endif %}
#   {%- if not tools is defined %}
#       {%- set tools = none %}
#   {%- endif %}

#   {#- This block extracts the system message, so we can slot it into the right place. #}
#   {%- if messages[0]['role'] == 'system' %}
#       {%- set system_message = messages[0]['content']|trim %}
#       {%- set messages = messages[1:] %}
#   {%- else %}
#       {%- set system_message = "You are OLMo 2, a helpful and harmless AI Assistant built by the Allen Institute for AI." %}
#   {%- endif %}

#   {#- System message + builtin tools #}
#   {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
#   {%- if builtin_tools is defined or tools is not none %}
#       {{- "Environment: ipython\n" }}
#   {%- endif %}
#   {%- if builtin_tools is defined %}
#       {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
#   {%- endif %}
#   {%- if tools is not none and not tools_in_user_message %}
#       {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
#       {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
#       {{- "Do not use variables.\n\n" }}
#       {%- for t in tools %}
#           {{- t | tojson(indent=4) }}
#           {{- "\n\n" }}
#       {%- endfor %}
#   {%- endif %}
#   {{- system_message }}
#   {{- "<|eot_id|>" }}

#   {#- Custom tools are passed in a user message with some extra guidance #}
#   {%- if tools_in_user_message and not tools is none %}
#       {#- Extract the first user message so we can plug it in here #}
#       {%- if messages | length != 0 %}
#           {%- set first_user_message = messages[0]['content']|trim %}
#           {%- set messages = messages[1:] %}
#       {%- else %}
#           {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
#       {%- endif %}
#       {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
#       {{- "Given the following functions, please respond with a JSON for a function call " }}
#       {{- "with its proper arguments that best answers the given prompt.\n\n" }}
#       {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
#       {{- "Do not use variables.\n\n" }}
#       {%- for t in tools %}
#           {{- t | tojson(indent=4) }}
#           {{- "\n\n" }}
#       {%- endfor %}
#       {{- first_user_message + "<|eot_id|>"}}
#   {%- endif %}

#   {%- for message in messages %}
#       {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
#           {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
#       {%- elif 'tool_calls' in message %}
#           {%- if not message.tool_calls|length == 1 %}
#               {{- raise_exception("This model only supports single tool-calls at once!") }}
#           {%- endif %}
#           {%- set tool_call = message.tool_calls[0].function %}
#           {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
#               {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
#               {{- "<|python_tag|>" + tool_call.name + ".call(" }}
#               {%- for arg_name, arg_val in tool_call.arguments | items %}
#                   {{- arg_name + '="' + arg_val + '"' }}
#                   {%- if not loop.last %}
#                       {{- ", " }}
#                   {%- endif %}
#               {%- endfor %}
#               {{- ")" }}
#           {%- else  %}
#               {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
#               {{- '{"name": "' + tool_call.name + '", ' }}
#               {{- '"parameters": ' }}
#               {{- tool_call.arguments | tojson }}
#               {{- "}" }}
#           {%- endif %}
#           {%- if builtin_tools is defined %}
#               {#- This means we're in ipython mode #}
#               {{- "<|eom_id|>" }}
#           {%- else %}
#               {{- "<|eot_id|>" }}
#           {%- endif %}
#       {%- elif message.role == "tool" or message.role == "ipython" %}
#           {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
#           {%- if message.content is mapping or message.content is iterable %}
#               {{- message.content | tojson }}
#           {%- else %}
#               {{- message.content }}
#           {%- endif %}
#           {{- "<|eot_id|>" }}
#       {%- endif %}
#   {%- endfor %}
#   {%- if add_generation_prompt %}
#       {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
#   {%- endif %}

runs/llama8b-gap2-phase7-bf16true-cd_375896--36k.hf/grpn_tulu_en_high_lr

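This model was trained with Axolotl on the tulu-3-sft-mixture-gaperon dataset, with 2% held out for evaluation (`val_set_size: 0.02` in the config above); the auto-generated card did not record the base model. It achieves the following results on the evaluation set: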

  • Loss: 0.0506

Model description

More information needed

Intended uses & limitations

More information needed

Training and evaluation data

More information needed

Training procedure

Training hyperparameters

The following hyperparameters were used during training:

  • learning_rate: 3e-05
  • train_batch_size: 4
  • eval_batch_size: 4
  • seed: 42
  • distributed_type: multi-GPU
  • num_devices: 16
  • total_train_batch_size: 64
  • total_eval_batch_size: 64
  • optimizer: adamw_torch_fused (OptimizerNames.ADAMW_TORCH_FUSED) with betas=(0.9, 0.999), epsilon=1e-08, and no additional optimizer arguments
  • lr_scheduler_type: cosine
  • lr_scheduler_warmup_steps: 736
  • training_steps: 7364

Training results

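| Training Loss | Epoch  | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| No log        | 0      | 0    | 5.4402          |
| 0.6513        | 0.5003 | 921  | 0.0550          |
| 0.6206        | 1.0    | 1841 | 0.0529          |
| 0.6206        | 1.0005 | 1842 | 0.0525          |
| 0.5506        | 1.5008 | 2763 | 0.0523          |
| 0.5256        | 2.0011 | 3684 | 0.0520          |
| 0.458         | 2.5014 | 4605 | 0.0514          |
| 0.449         | 3.0    | 5523 | 0.0512          |
| 0.449         | 3.0016 | 5526 | 0.0509          |
| 0.398         | 3.5019 | 6447 | 0.0505          |
| 0.4128        | 4.0    | 7364 | 0.0506          |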

Framework versions

  • Transformers 4.52.4
  • Pytorch 2.6.0+rocm6.2.4
  • Datasets 3.6.0
  • Tokenizers 0.21.1
Downloads last month
-
Safetensors
Model size
8B params
Tensor type
BF16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support