Leon299 committed on 13 days ago

Commit

fd1afc8

verified ·

1 Parent(s): 60f02df

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

__pycache__/inference_full.cpython-312.pyc +0 -0
__pycache__/runtime_utils.cpython-312.pyc +0 -0
batch_infer.sh +148 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_18.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_19.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_2.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_20.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_21.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_22.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_23.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_24.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_25.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_26.pth +3 -0
output_qwen3_plain_ar/checkpoint-17233/rng_state_27.pth +3 -0
output_qwen3_plain_ar/checkpoint-18140/trainer_state.json +0 -0
output_qwen3_plain_ar/checkpoint-18140/zero_to_fp32.py +760 -0
output_qwen3_plain_ar/checkpoint-2721/config.json +66 -0
output_qwen3_plain_ar/checkpoint-2721/generation_config.json +13 -0
output_qwen3_plain_ar/checkpoint-2721/latest +1 -0
output_qwen3_plain_ar/checkpoint-2721/trainer_state.json +1938 -0
output_qwen3_plain_ar/checkpoint-2721/zero_to_fp32.py +760 -0
output_qwen3_plain_ar/checkpoint-3628/config.json +66 -0
output_qwen3_plain_ar/checkpoint-3628/generation_config.json +13 -0
output_qwen3_plain_ar/checkpoint-3628/latest +1 -0
output_qwen3_plain_ar/checkpoint-3628/trainer_state.json +2568 -0
output_qwen3_plain_ar/checkpoint-3628/zero_to_fp32.py +760 -0
output_qwen3_plain_ar/checkpoint-4535/config.json +66 -0
output_qwen3_plain_ar/checkpoint-4535/generation_config.json +13 -0
output_qwen3_plain_ar/checkpoint-4535/latest +1 -0
output_qwen3_plain_ar/checkpoint-4535/trainer_state.json +3205 -0
output_qwen3_plain_ar/checkpoint-4535/zero_to_fp32.py +760 -0
output_qwen3_plain_ar/checkpoint-5442/config.json +66 -0
output_qwen3_plain_ar/checkpoint-5442/generation_config.json +13 -0
output_qwen3_plain_ar/checkpoint-5442/latest +1 -0
output_qwen3_plain_ar/checkpoint-5442/trainer_state.json +0 -0
output_qwen3_plain_ar/checkpoint-5442/zero_to_fp32.py +760 -0
output_qwen3_plain_ar/checkpoint-6349/config.json +66 -0
output_qwen3_plain_ar/checkpoint-6349/generation_config.json +13 -0
output_qwen3_plain_ar/checkpoint-6349/latest +1 -0
output_qwen3_plain_ar/checkpoint-6349/trainer_state.json +0 -0
output_qwen3_plain_ar/checkpoint-6349/zero_to_fp32.py +760 -0
output_qwen3_plain_ar/checkpoint-7256/config.json +66 -0
output_qwen3_plain_ar/checkpoint-7256/generation_config.json +13 -0
output_qwen3_plain_ar/checkpoint-7256/latest +1 -0
output_qwen3_plain_ar/checkpoint-7256/trainer_state.json +0 -0
output_qwen3_plain_ar/checkpoint-7256/zero_to_fp32.py +760 -0
output_qwen3_plain_ar/checkpoint-8163/config.json +66 -0
output_qwen3_plain_ar/checkpoint-8163/generation_config.json +13 -0
output_qwen3_plain_ar/checkpoint-8163/latest +1 -0
output_qwen3_plain_ar/checkpoint-8163/trainer_state.json +0 -0

__pycache__/inference_full.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/inference_full.cpython-312.pyc and b/__pycache__/inference_full.cpython-312.pyc differ

__pycache__/runtime_utils.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/runtime_utils.cpython-312.pyc and b/__pycache__/runtime_utils.cpython-312.pyc differ

batch_infer.sh ADDED Viewed

	@@ -0,0 +1,148 @@

+#!/usr/bin/env bash
+set -uo pipefail
+########################################
+# 配置区（你只需要改这里）
+########################################
+SCRIPT_PATH="qwen3_plain_ar.py"
+DATASET_PATH="muse_mucodec_chord.ds"
+# tokenizer（必须是带 chat_template 的）
+TOKENIZER_PATH="/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/final"
+# checkpoint 列表
+CHECKPOINTS=(
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-907"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-1814"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-2721"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-3628"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-4535"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-5442"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-6349"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-7256"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-8163"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-9070"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-9977"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-10884"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-11791"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-12698"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-13605"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-14512"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-15419"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-16326"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-17233"
+  "/algo-intern/user/leonchen/cond_gen/output_qwen3_plain_ar/checkpoint-18140"
+)
+# 输出根目录
+OUTPUT_ROOT="/root/batch_preditions_ablation"
+# 每个 checkpoint 推理多少条
+NUM_SAMPLES=20
+########################################
+# 推理参数（可以调）
+########################################
+DEVICE="cuda:0"
+DTYPE="bfloat16"
+ATTN_IMPLEMENTATION="sdpa"
+TEMPERATURE=1.0
+TOP_K=50
+TOP_P=0.9
+MAX_NEW_TOKENS=4096
+# 是否跳过音频解码（调试建议先开）
+SKIP_DECODE=false
+########################################
+# 日志文件
+########################################
+FAILED_LOG="${OUTPUT_ROOT}/failed_cases.log"
+SUCCESS_LOG="${OUTPUT_ROOT}/success_cases.log"
+########################################
+# 开始执行
+########################################
+mkdir -p "${OUTPUT_ROOT}"
+touch "${FAILED_LOG}"
+touch "${SUCCESS_LOG}"
+echo "======================================" | tee -a "${SUCCESS_LOG}"
+echo "Batch inference started at $(date)" | tee -a "${SUCCESS_LOG}"
+echo "Output root: ${OUTPUT_ROOT}" | tee -a "${SUCCESS_LOG}"
+echo "======================================" | tee -a "${SUCCESS_LOG}"
+for CKPT in "${CHECKPOINTS[@]}"; do
+  CKPT_NAME=$(basename "${CKPT}")
+  OUT_DIR="${OUTPUT_ROOT}/${CKPT_NAME}"
+  CKPT_LOG="${OUT_DIR}/run.log"
+  echo "======================================"
+  echo "Running checkpoint: ${CKPT_NAME}"
+  echo "Output dir: ${OUT_DIR}"
+  echo "======================================"
+  if [ ! -d "${CKPT}" ]; then
+    echo "[ERROR] checkpoint directory not found: ${CKPT}" | tee -a "${FAILED_LOG}"
+    continue
+  fi
+  mkdir -p "${OUT_DIR}"
+  touch "${CKPT_LOG}"
+  for ((i=0; i<NUM_SAMPLES; i++)); do
+    echo "[INFO] checkpoint=${CKPT_NAME} sample_idx=${i}" | tee -a "${CKPT_LOG}"
+    CMD=(
+      python "${SCRIPT_PATH}" infer
+      --model_path "${CKPT}"
+      --tokenizer_path "${TOKENIZER_PATH}"
+      --dataset_path "${DATASET_PATH}"
+      --split validation
+      --sample_idx "${i}"
+      --device "${DEVICE}"
+      --dtype "${DTYPE}"
+      --attn_implementation "${ATTN_IMPLEMENTATION}"
+      --temperature "${TEMPERATURE}"
+      --top_k "${TOP_K}"
+      --top_p "${TOP_P}"
+      --max_new_tokens_per_section "${MAX_NEW_TOKENS}"
+      --output_dir "${OUT_DIR}"
+      --output_prefix "sample_${i}"
+    )
+    if [ "${SKIP_DECODE}" = true ]; then
+      CMD+=(--skip_decode)
+    fi
+    {
+      echo "[CMD] ${CMD[*]}"
+      "${CMD[@]}"
+    } >> "${CKPT_LOG}" 2>&1
+    EXIT_CODE=$?
+    if [ ${EXIT_CODE} -ne 0 ]; then
+      echo "[ERROR] checkpoint=${CKPT_NAME} sample_idx=${i} exit_code=${EXIT_CODE}" | tee -a "${FAILED_LOG}"
+      continue
+    else
+      echo "[OK] checkpoint=${CKPT_NAME} sample_idx=${i}" | tee -a "${SUCCESS_LOG}"
+    fi
+  done
+  echo "[DONE] checkpoint=${CKPT_NAME}" | tee -a "${SUCCESS_LOG}"
+done
+echo "======================================" | tee -a "${SUCCESS_LOG}"
+echo "Batch inference finished at $(date)" | tee -a "${SUCCESS_LOG}"
+echo "Success log: ${SUCCESS_LOG}" | tee -a "${SUCCESS_LOG}"
+echo "Failed log: ${FAILED_LOG}" | tee -a "${SUCCESS_LOG}"
+echo "All done."

output_qwen3_plain_ar/checkpoint-17233/rng_state_18.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd17b3bc8809e659c44ad5767c09471365bf1aaf99af587ed6ceb8212a83647f
+size 16340

output_qwen3_plain_ar/checkpoint-17233/rng_state_19.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48baee25b7a07901dc04101212281542a2c94ba2e39212de0833d10d73bcff15
+size 16340

output_qwen3_plain_ar/checkpoint-17233/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab8c0ea40d6071acfd801325ef5fce06795f95b25fd7cf033726000b4174406a
+size 16325

output_qwen3_plain_ar/checkpoint-17233/rng_state_20.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b4d0f639063a020849c066d4bdf38ac11a72c2fb705b5d36090aaec746bc0c
+size 16340

output_qwen3_plain_ar/checkpoint-17233/rng_state_21.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1f3338bc26fbbe65cb13f269b236abdee71e69af31f680d19dd714839c3cb60
+size 16340

output_qwen3_plain_ar/checkpoint-17233/rng_state_22.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9190d2dd67803c301fed8e4bb7793ee439602ceff38c5fd9b3ddfa7f8b3ebee1
+size 16340

output_qwen3_plain_ar/checkpoint-17233/rng_state_23.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fcc224b33040d41584dd3cbdfb1bca14ca82f95e91c4ce8da7d1fa5af39fb996
+size 16340

output_qwen3_plain_ar/checkpoint-17233/rng_state_24.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b75d0da8e588491425dfede07d4fcf9a4236407463ca5cdf5cdf29b1a8fca5d1
+size 16340

output_qwen3_plain_ar/checkpoint-17233/rng_state_25.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f509a161c9a3c4078f25919a5284e639b26b17196b37c50b6359eb10892a6477
+size 16340

output_qwen3_plain_ar/checkpoint-17233/rng_state_26.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa1a5e34bfbf6b2ef864a91a119b55c4972c987b01ac5ce6edbd95a8dbfaf56b
+size 16340

output_qwen3_plain_ar/checkpoint-17233/rng_state_27.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9105492960e0409542231deb8d2dc0148b7dbd7179c17107bf77eab0f18f32f
+size 16340

output_qwen3_plain_ar/checkpoint-18140/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

output_qwen3_plain_ar/checkpoint-18140/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,760 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# Set to a truthy value to enable the diagnostic prints guarded by ``if debug:`` in this file.
+debug = 0
+# load to cpu
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir`` (see ``get_checkpoint_files``)."""
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state shards and pull out the fp32 parameter groups.
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` shards
+        - ``ds_checkpoint_dir``: checkpoint folder; used only in the error message
+    Returns:
+        - ``(zero_stage, world_size, fp32_flat_groups)``; ``fp32_flat_groups`` has one entry per shard
+    Raises:
+        - ``ValueError`` if the first shard has no ZeRO stage entry, if the number of shards differs
+          from the recorded partition count, or if the stage is neither <= 2 nor 3
+    """
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        # SECURITY: ``weights_only=False`` unpickles arbitrary objects - only use on checkpoints you trust.
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    # stage and partition count are read from the first shard only
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: when true, frozen parameters are not merged into the result
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    # dispatch on the stage; parse_optim_states has already rejected any stage other than <= 2 or 3
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameters recorded in the first model state into ``state_dict`` (in place).
+    Does nothing when the first model state has no frozen parameter shapes.
+    """
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # the fragment stored under ``name`` is used as-is, without concatenation or reshaping
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters and store them in ``state_dict`` (in place).
+    For every parameter group the per-rank partitions are concatenated into one flat vector, which
+    is then cut into named tensors following the shapes recorded in the first model state.
+    Raises:
+        - ``ValueError`` if, after rounding both up to a multiple of ``2 * world_size``, the number
+          of consumed elements differs from the number available in a group
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # group i as stored by every rank, concatenated in rank order
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        # from here on avail_numel refers to the current group only
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shape may expose ``numel()`` or be a plain sequence of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state dict for a ZeRO stage <= 2 checkpoint.
+    Entries are added in this order: buffers, frozen parameters (unless
+    ``exclude_frozen_parameters``), trainable parameters, then shared parameters.
+    Returns:
+        - ``OrderedDict`` mapping names to tensors
+    """
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        # pair[0] becomes another name for the entry already stored under pair[1]
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO-3 checkpoint and store them in ``state_dict`` (in place).
+    For each name the fragments of all model states are concatenated, the first
+    ``shape.numel()`` elements are kept and the result is viewed as ``shape``.
+    Does nothing when the first model state has no frozen parameter shapes.
+    """
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        # narrow() drops whatever trails the first unpartitioned_numel elements before reshaping
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        # the two values below are only used by the debug print
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    Exposes ``shape`` and ``dtype`` attributes; the real tensor is only built by ``contiguous()``.
+    """
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        # flat_groups is indexed as [rank][group] below
+        self.flat_groups = flat_groups
+        # flat_groups_offset[g] .. flat_groups_offset[g + 1] is the element range covered by group g
+        self.flat_groups_offset = flat_groups_offset
+        # start of this parameter's slice, on the same axis as flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        # dtype is taken from the first group of the first rank
+        self.dtype = self.flat_groups[0][0].dtype
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            # NOTE(review): the search relies on ``break`` firing before the last index, where
+            # ``group_id + 1`` would be out of range - confirm end_idx never exceeds the last offset
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                # NOTE(review): for a group after start_group_id this value is negative -
+                # confirm a parameter never spans more than one group
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        # keep the first shape.numel() elements; anything after them is dropped
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state dict for a ZeRO stage 3 checkpoint.
+    Entries are added in this order: buffers, frozen parameters (unless
+    ``exclude_frozen_parameters``), trainable parameters, then shared parameters.
+    Returns:
+        - ``OrderedDict`` mapping names to the stored entries
+    """
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        # pair[0] becomes another name for the entry already stored under pair[1]
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+    Returns:
+        - pytorch ``state_dict``
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

output_qwen3_plain_ar/checkpoint-2721/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "magel_chord_dropout_trigger_prob": 0.6,
+  "magel_num_audio_token": 16384,
+  "magel_structure_dropout_trigger_prob": 0.6,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.4.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 168056
+}

output_qwen3_plain_ar/checkpoint-2721/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.4.0"
+}

output_qwen3_plain_ar/checkpoint-2721/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step2721

output_qwen3_plain_ar/checkpoint-2721/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1938 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 2721,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011028398125172319,
+      "grad_norm": 435.2422180175781,
+      "learning_rate": 9e-07,
+      "loss": 20.84569549560547,
+      "step": 10
+    },
+    {
+      "epoch": 0.022056796250344637,
+      "grad_norm": 141.7341766357422,
+      "learning_rate": 1.9e-06,
+      "loss": 18.69615936279297,
+      "step": 20
+    },
+    {
+      "epoch": 0.033085194375516956,
+      "grad_norm": 74.42520904541016,
+      "learning_rate": 2.9e-06,
+      "loss": 16.079673767089844,
+      "step": 30
+    },
+    {
+      "epoch": 0.044113592500689275,
+      "grad_norm": 24.73248863220215,
+      "learning_rate": 3.9e-06,
+      "loss": 13.684315490722657,
+      "step": 40
+    },
+    {
+      "epoch": 0.055141990625861594,
+      "grad_norm": 7.049101829528809,
+      "learning_rate": 4.9000000000000005e-06,
+      "loss": 12.474874877929688,
+      "step": 50
+    },
+    {
+      "epoch": 0.06617038875103391,
+      "grad_norm": 2.3411474227905273,
+      "learning_rate": 5.9e-06,
+      "loss": 12.072142028808594,
+      "step": 60
+    },
+    {
+      "epoch": 0.07719878687620624,
+      "grad_norm": 1.126215934753418,
+      "learning_rate": 6.900000000000001e-06,
+      "loss": 11.938906860351562,
+      "step": 70
+    },
+    {
+      "epoch": 0.08822718500137855,
+      "grad_norm": 1.2050226926803589,
+      "learning_rate": 7.9e-06,
+      "loss": 11.81988296508789,
+      "step": 80
+    },
+    {
+      "epoch": 0.09925558312655088,
+      "grad_norm": 1.444793462753296,
+      "learning_rate": 8.9e-06,
+      "loss": 11.602033996582032,
+      "step": 90
+    },
+    {
+      "epoch": 0.11028398125172319,
+      "grad_norm": 5.791665077209473,
+      "learning_rate": 9.900000000000002e-06,
+      "loss": 11.201815032958985,
+      "step": 100
+    },
+    {
+      "epoch": 0.12131237937689551,
+      "grad_norm": 9.492277145385742,
+      "learning_rate": 1.09e-05,
+      "loss": 10.535708618164062,
+      "step": 110
+    },
+    {
+      "epoch": 0.13234077750206782,
+      "grad_norm": 2.7546133995056152,
+      "learning_rate": 1.19e-05,
+      "loss": 9.847169494628906,
+      "step": 120
+    },
+    {
+      "epoch": 0.14336917562724014,
+      "grad_norm": 1.0953313112258911,
+      "learning_rate": 1.29e-05,
+      "loss": 9.429026031494141,
+      "step": 130
+    },
+    {
+      "epoch": 0.15439757375241248,
+      "grad_norm": 0.7153559327125549,
+      "learning_rate": 1.3900000000000002e-05,
+      "loss": 9.266969299316406,
+      "step": 140
+    },
+    {
+      "epoch": 0.1654259718775848,
+      "grad_norm": 0.5888933539390564,
+      "learning_rate": 1.49e-05,
+      "loss": 9.1935546875,
+      "step": 150
+    },
+    {
+      "epoch": 0.1764543700027571,
+      "grad_norm": 0.4850365221500397,
+      "learning_rate": 1.59e-05,
+      "loss": 9.19604034423828,
+      "step": 160
+    },
+    {
+      "epoch": 0.1874827681279294,
+      "grad_norm": 0.5772538185119629,
+      "learning_rate": 1.69e-05,
+      "loss": 9.17010726928711,
+      "step": 170
+    },
+    {
+      "epoch": 0.19851116625310175,
+      "grad_norm": 0.4283920228481293,
+      "learning_rate": 1.79e-05,
+      "loss": 9.172830200195312,
+      "step": 180
+    },
+    {
+      "epoch": 0.20953956437827406,
+      "grad_norm": 0.8650698065757751,
+      "learning_rate": 1.8900000000000002e-05,
+      "loss": 9.154988098144532,
+      "step": 190
+    },
+    {
+      "epoch": 0.22056796250344637,
+      "grad_norm": 0.42017608880996704,
+      "learning_rate": 1.9900000000000003e-05,
+      "loss": 9.146849060058594,
+      "step": 200
+    },
+    {
+      "epoch": 0.23159636062861869,
+      "grad_norm": 0.9125994443893433,
+      "learning_rate": 2.09e-05,
+      "loss": 9.164442443847657,
+      "step": 210
+    },
+    {
+      "epoch": 0.24262475875379103,
+      "grad_norm": 0.6468876004219055,
+      "learning_rate": 2.19e-05,
+      "loss": 9.159596252441407,
+      "step": 220
+    },
+    {
+      "epoch": 0.25365315687896334,
+      "grad_norm": 0.4124819338321686,
+      "learning_rate": 2.29e-05,
+      "loss": 9.13860626220703,
+      "step": 230
+    },
+    {
+      "epoch": 0.26468155500413565,
+      "grad_norm": 1.990302562713623,
+      "learning_rate": 2.39e-05,
+      "loss": 9.145040893554688,
+      "step": 240
+    },
+    {
+      "epoch": 0.27570995312930796,
+      "grad_norm": 0.7875277400016785,
+      "learning_rate": 2.4900000000000002e-05,
+      "loss": 9.152925109863281,
+      "step": 250
+    },
+    {
+      "epoch": 0.2867383512544803,
+      "grad_norm": 0.8343706130981445,
+      "learning_rate": 2.5900000000000003e-05,
+      "loss": 9.132975769042968,
+      "step": 260
+    },
+    {
+      "epoch": 0.2977667493796526,
+      "grad_norm": 3.00996470451355,
+      "learning_rate": 2.6900000000000003e-05,
+      "loss": 9.097848510742187,
+      "step": 270
+    },
+    {
+      "epoch": 0.30879514750482495,
+      "grad_norm": 2.4282069206237793,
+      "learning_rate": 2.7900000000000004e-05,
+      "loss": 9.042235565185546,
+      "step": 280
+    },
+    {
+      "epoch": 0.31982354562999726,
+      "grad_norm": 4.171019554138184,
+      "learning_rate": 2.8899999999999998e-05,
+      "loss": 8.927298736572265,
+      "step": 290
+    },
+    {
+      "epoch": 0.3308519437551696,
+      "grad_norm": 2.197887659072876,
+      "learning_rate": 2.9900000000000002e-05,
+      "loss": 8.805252075195312,
+      "step": 300
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 10.306541442871094,
+      "learning_rate": 3.09e-05,
+      "loss": 8.673678588867187,
+      "step": 310
+    },
+    {
+      "epoch": 0.3529087400055142,
+      "grad_norm": 8.463860511779785,
+      "learning_rate": 3.19e-05,
+      "loss": 8.570347595214844,
+      "step": 320
+    },
+    {
+      "epoch": 0.3639371381306865,
+      "grad_norm": 3.999753475189209,
+      "learning_rate": 3.29e-05,
+      "loss": 8.429109191894531,
+      "step": 330
+    },
+    {
+      "epoch": 0.3749655362558588,
+      "grad_norm": 5.259007930755615,
+      "learning_rate": 3.3900000000000004e-05,
+      "loss": 8.334149169921876,
+      "step": 340
+    },
+    {
+      "epoch": 0.38599393438103113,
+      "grad_norm": 8.362598419189453,
+      "learning_rate": 3.49e-05,
+      "loss": 8.196139526367187,
+      "step": 350
+    },
+    {
+      "epoch": 0.3970223325062035,
+      "grad_norm": 10.273512840270996,
+      "learning_rate": 3.59e-05,
+      "loss": 8.040153503417969,
+      "step": 360
+    },
+    {
+      "epoch": 0.4080507306313758,
+      "grad_norm": 5.111108303070068,
+      "learning_rate": 3.69e-05,
+      "loss": 7.866473388671875,
+      "step": 370
+    },
+    {
+      "epoch": 0.4190791287565481,
+      "grad_norm": 9.192107200622559,
+      "learning_rate": 3.79e-05,
+      "loss": 7.695774841308594,
+      "step": 380
+    },
+    {
+      "epoch": 0.43010752688172044,
+      "grad_norm": 5.393336772918701,
+      "learning_rate": 3.8900000000000004e-05,
+      "loss": 7.498152160644532,
+      "step": 390
+    },
+    {
+      "epoch": 0.44113592500689275,
+      "grad_norm": 10.53490161895752,
+      "learning_rate": 3.99e-05,
+      "loss": 7.270246887207032,
+      "step": 400
+    },
+    {
+      "epoch": 0.45216432313206506,
+      "grad_norm": 6.174643516540527,
+      "learning_rate": 4.09e-05,
+      "loss": 7.127191162109375,
+      "step": 410
+    },
+    {
+      "epoch": 0.46319272125723737,
+      "grad_norm": 4.522936820983887,
+      "learning_rate": 4.19e-05,
+      "loss": 6.871500396728516,
+      "step": 420
+    },
+    {
+      "epoch": 0.4742211193824097,
+      "grad_norm": 4.3594207763671875,
+      "learning_rate": 4.29e-05,
+      "loss": 6.702586364746094,
+      "step": 430
+    },
+    {
+      "epoch": 0.48524951750758205,
+      "grad_norm": 5.950730323791504,
+      "learning_rate": 4.39e-05,
+      "loss": 6.493560791015625,
+      "step": 440
+    },
+    {
+      "epoch": 0.49627791563275436,
+      "grad_norm": 6.233413219451904,
+      "learning_rate": 4.49e-05,
+      "loss": 6.293489074707031,
+      "step": 450
+    },
+    {
+      "epoch": 0.5073063137579267,
+      "grad_norm": 7.656834125518799,
+      "learning_rate": 4.5900000000000004e-05,
+      "loss": 6.102347946166992,
+      "step": 460
+    },
+    {
+      "epoch": 0.518334711883099,
+      "grad_norm": 4.319094657897949,
+      "learning_rate": 4.69e-05,
+      "loss": 5.928083419799805,
+      "step": 470
+    },
+    {
+      "epoch": 0.5293631100082713,
+      "grad_norm": 5.585537433624268,
+      "learning_rate": 4.79e-05,
+      "loss": 5.77436637878418,
+      "step": 480
+    },
+    {
+      "epoch": 0.5403915081334436,
+      "grad_norm": 5.104014873504639,
+      "learning_rate": 4.89e-05,
+      "loss": 5.636859130859375,
+      "step": 490
+    },
+    {
+      "epoch": 0.5514199062586159,
+      "grad_norm": 5.453028202056885,
+      "learning_rate": 4.99e-05,
+      "loss": 5.507636260986328,
+      "step": 500
+    },
+    {
+      "epoch": 0.5624483043837882,
+      "grad_norm": 7.728854179382324,
+      "learning_rate": 5.0900000000000004e-05,
+      "loss": 5.411964416503906,
+      "step": 510
+    },
+    {
+      "epoch": 0.5734767025089605,
+      "grad_norm": 4.50288724899292,
+      "learning_rate": 5.19e-05,
+      "loss": 5.295291900634766,
+      "step": 520
+    },
+    {
+      "epoch": 0.5845051006341329,
+      "grad_norm": 4.245919704437256,
+      "learning_rate": 5.2900000000000005e-05,
+      "loss": 5.194162750244141,
+      "step": 530
+    },
+    {
+      "epoch": 0.5955334987593052,
+      "grad_norm": 6.278975963592529,
+      "learning_rate": 5.390000000000001e-05,
+      "loss": 5.113618087768555,
+      "step": 540
+    },
+    {
+      "epoch": 0.6065618968844775,
+      "grad_norm": 4.214662075042725,
+      "learning_rate": 5.4900000000000006e-05,
+      "loss": 5.038372039794922,
+      "step": 550
+    },
+    {
+      "epoch": 0.6175902950096499,
+      "grad_norm": 3.5404605865478516,
+      "learning_rate": 5.590000000000001e-05,
+      "loss": 4.935391235351562,
+      "step": 560
+    },
+    {
+      "epoch": 0.6286186931348222,
+      "grad_norm": 3.6460280418395996,
+      "learning_rate": 5.69e-05,
+      "loss": 4.896538543701172,
+      "step": 570
+    },
+    {
+      "epoch": 0.6396470912599945,
+      "grad_norm": 5.254800796508789,
+      "learning_rate": 5.79e-05,
+      "loss": 4.829419708251953,
+      "step": 580
+    },
+    {
+      "epoch": 0.6506754893851668,
+      "grad_norm": 5.132180690765381,
+      "learning_rate": 5.89e-05,
+      "loss": 4.793368148803711,
+      "step": 590
+    },
+    {
+      "epoch": 0.6617038875103392,
+      "grad_norm": 4.222960948944092,
+      "learning_rate": 5.99e-05,
+      "loss": 4.746239852905274,
+      "step": 600
+    },
+    {
+      "epoch": 0.6727322856355115,
+      "grad_norm": 4.070414066314697,
+      "learning_rate": 6.09e-05,
+      "loss": 4.688523864746093,
+      "step": 610
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 3.4652583599090576,
+      "learning_rate": 6.19e-05,
+      "loss": 4.692922973632813,
+      "step": 620
+    },
+    {
+      "epoch": 0.6947890818858561,
+      "grad_norm": 4.559128284454346,
+      "learning_rate": 6.29e-05,
+      "loss": 4.639920043945312,
+      "step": 630
+    },
+    {
+      "epoch": 0.7058174800110284,
+      "grad_norm": 3.197758436203003,
+      "learning_rate": 6.390000000000001e-05,
+      "loss": 4.601907348632812,
+      "step": 640
+    },
+    {
+      "epoch": 0.7168458781362007,
+      "grad_norm": 4.209578514099121,
+      "learning_rate": 6.49e-05,
+      "loss": 4.56639404296875,
+      "step": 650
+    },
+    {
+      "epoch": 0.727874276261373,
+      "grad_norm": 3.701484203338623,
+      "learning_rate": 6.59e-05,
+      "loss": 4.545608901977539,
+      "step": 660
+    },
+    {
+      "epoch": 0.7389026743865453,
+      "grad_norm": 3.951927900314331,
+      "learning_rate": 6.690000000000001e-05,
+      "loss": 4.493326187133789,
+      "step": 670
+    },
+    {
+      "epoch": 0.7499310725117176,
+      "grad_norm": 4.219130039215088,
+      "learning_rate": 6.790000000000001e-05,
+      "loss": 4.482691955566406,
+      "step": 680
+    },
+    {
+      "epoch": 0.76095947063689,
+      "grad_norm": 6.267204284667969,
+      "learning_rate": 6.89e-05,
+      "loss": 4.4599052429199215,
+      "step": 690
+    },
+    {
+      "epoch": 0.7719878687620623,
+      "grad_norm": 3.367382764816284,
+      "learning_rate": 6.99e-05,
+      "loss": 4.429808807373047,
+      "step": 700
+    },
+    {
+      "epoch": 0.7830162668872346,
+      "grad_norm": 3.8906455039978027,
+      "learning_rate": 7.09e-05,
+      "loss": 4.4144752502441404,
+      "step": 710
+    },
+    {
+      "epoch": 0.794044665012407,
+      "grad_norm": 6.759398460388184,
+      "learning_rate": 7.19e-05,
+      "loss": 4.385488891601563,
+      "step": 720
+    },
+    {
+      "epoch": 0.8050730631375793,
+      "grad_norm": 3.520167350769043,
+      "learning_rate": 7.29e-05,
+      "loss": 4.397706985473633,
+      "step": 730
+    },
+    {
+      "epoch": 0.8161014612627516,
+      "grad_norm": 2.7510974407196045,
+      "learning_rate": 7.390000000000001e-05,
+      "loss": 4.374617385864258,
+      "step": 740
+    },
+    {
+      "epoch": 0.8271298593879239,
+      "grad_norm": 4.395699977874756,
+      "learning_rate": 7.49e-05,
+      "loss": 4.3302146911621096,
+      "step": 750
+    },
+    {
+      "epoch": 0.8381582575130962,
+      "grad_norm": 3.277766704559326,
+      "learning_rate": 7.59e-05,
+      "loss": 4.313335418701172,
+      "step": 760
+    },
+    {
+      "epoch": 0.8491866556382686,
+      "grad_norm": 2.466207981109619,
+      "learning_rate": 7.69e-05,
+      "loss": 4.3226570129394535,
+      "step": 770
+    },
+    {
+      "epoch": 0.8602150537634409,
+      "grad_norm": 3.637355327606201,
+      "learning_rate": 7.790000000000001e-05,
+      "loss": 4.295929718017578,
+      "step": 780
+    },
+    {
+      "epoch": 0.8712434518886132,
+      "grad_norm": 3.155527353286743,
+      "learning_rate": 7.890000000000001e-05,
+      "loss": 4.287591552734375,
+      "step": 790
+    },
+    {
+      "epoch": 0.8822718500137855,
+      "grad_norm": 3.593884229660034,
+      "learning_rate": 7.99e-05,
+      "loss": 4.267314147949219,
+      "step": 800
+    },
+    {
+      "epoch": 0.8933002481389578,
+      "grad_norm": 2.361081123352051,
+      "learning_rate": 8.090000000000001e-05,
+      "loss": 4.265741348266602,
+      "step": 810
+    },
+    {
+      "epoch": 0.9043286462641301,
+      "grad_norm": 2.7084105014801025,
+      "learning_rate": 8.19e-05,
+      "loss": 4.261878204345703,
+      "step": 820
+    },
+    {
+      "epoch": 0.9153570443893024,
+      "grad_norm": 3.6093873977661133,
+      "learning_rate": 8.29e-05,
+      "loss": 4.211677551269531,
+      "step": 830
+    },
+    {
+      "epoch": 0.9263854425144747,
+      "grad_norm": 3.9739396572113037,
+      "learning_rate": 8.39e-05,
+      "loss": 4.224007034301758,
+      "step": 840
+    },
+    {
+      "epoch": 0.9374138406396471,
+      "grad_norm": 2.174050807952881,
+      "learning_rate": 8.49e-05,
+      "loss": 4.211782836914063,
+      "step": 850
+    },
+    {
+      "epoch": 0.9484422387648194,
+      "grad_norm": 2.7151405811309814,
+      "learning_rate": 8.59e-05,
+      "loss": 4.204391098022461,
+      "step": 860
+    },
+    {
+      "epoch": 0.9594706368899917,
+      "grad_norm": 3.7480661869049072,
+      "learning_rate": 8.69e-05,
+      "loss": 4.175582504272461,
+      "step": 870
+    },
+    {
+      "epoch": 0.9704990350151641,
+      "grad_norm": 3.1127700805664062,
+      "learning_rate": 8.790000000000001e-05,
+      "loss": 4.183733749389648,
+      "step": 880
+    },
+    {
+      "epoch": 0.9815274331403364,
+      "grad_norm": 2.750716209411621,
+      "learning_rate": 8.89e-05,
+      "loss": 4.167971801757813,
+      "step": 890
+    },
+    {
+      "epoch": 0.9925558312655087,
+      "grad_norm": 4.02509880065918,
+      "learning_rate": 8.99e-05,
+      "loss": 4.170472717285156,
+      "step": 900
+    },
+    {
+      "epoch": 1.0033085194375517,
+      "grad_norm": 3.0058505535125732,
+      "learning_rate": 9.090000000000001e-05,
+      "loss": 4.1449127197265625,
+      "step": 910
+    },
+    {
+      "epoch": 1.014336917562724,
+      "grad_norm": 2.553403377532959,
+      "learning_rate": 9.190000000000001e-05,
+      "loss": 4.1404258728027346,
+      "step": 920
+    },
+    {
+      "epoch": 1.0253653156878964,
+      "grad_norm": 2.8066084384918213,
+      "learning_rate": 9.290000000000001e-05,
+      "loss": 4.110780334472656,
+      "step": 930
+    },
+    {
+      "epoch": 1.0363937138130686,
+      "grad_norm": 3.904608726501465,
+      "learning_rate": 9.39e-05,
+      "loss": 4.134862899780273,
+      "step": 940
+    },
+    {
+      "epoch": 1.047422111938241,
+      "grad_norm": 2.217729330062866,
+      "learning_rate": 9.49e-05,
+      "loss": 4.112079620361328,
+      "step": 950
+    },
+    {
+      "epoch": 1.0584505100634134,
+      "grad_norm": 2.498760938644409,
+      "learning_rate": 9.59e-05,
+      "loss": 4.097566986083985,
+      "step": 960
+    },
+    {
+      "epoch": 1.0694789081885856,
+      "grad_norm": 3.577143907546997,
+      "learning_rate": 9.69e-05,
+      "loss": 4.081307220458984,
+      "step": 970
+    },
+    {
+      "epoch": 1.080507306313758,
+      "grad_norm": 3.283250570297241,
+      "learning_rate": 9.790000000000001e-05,
+      "loss": 4.103987503051758,
+      "step": 980
+    },
+    {
+      "epoch": 1.0915357044389302,
+      "grad_norm": 2.1897776126861572,
+      "learning_rate": 9.89e-05,
+      "loss": 4.084938812255859,
+      "step": 990
+    },
+    {
+      "epoch": 1.1025641025641026,
+      "grad_norm": 2.6925997734069824,
+      "learning_rate": 9.99e-05,
+      "loss": 4.058921051025391,
+      "step": 1000
+    },
+    {
+      "epoch": 1.1135925006892748,
+      "grad_norm": 3.4118456840515137,
+      "learning_rate": 9.994749124854142e-05,
+      "loss": 4.061585235595703,
+      "step": 1010
+    },
+    {
+      "epoch": 1.1246208988144473,
+      "grad_norm": 2.6139297485351562,
+      "learning_rate": 9.988914819136523e-05,
+      "loss": 4.070050048828125,
+      "step": 1020
+    },
+    {
+      "epoch": 1.1356492969396195,
+      "grad_norm": 1.8616399765014648,
+      "learning_rate": 9.983080513418903e-05,
+      "loss": 4.0413330078125,
+      "step": 1030
+    },
+    {
+      "epoch": 1.146677695064792,
+      "grad_norm": 2.361706018447876,
+      "learning_rate": 9.977246207701284e-05,
+      "loss": 4.023075866699219,
+      "step": 1040
+    },
+    {
+      "epoch": 1.157706093189964,
+      "grad_norm": 3.815014123916626,
+      "learning_rate": 9.971411901983664e-05,
+      "loss": 4.036756134033203,
+      "step": 1050
+    },
+    {
+      "epoch": 1.1687344913151365,
+      "grad_norm": 2.4410274028778076,
+      "learning_rate": 9.965577596266045e-05,
+      "loss": 4.020483779907226,
+      "step": 1060
+    },
+    {
+      "epoch": 1.1797628894403087,
+      "grad_norm": 2.768084764480591,
+      "learning_rate": 9.959743290548426e-05,
+      "loss": 4.021839141845703,
+      "step": 1070
+    },
+    {
+      "epoch": 1.1907912875654811,
+      "grad_norm": 1.9342570304870605,
+      "learning_rate": 9.953908984830806e-05,
+      "loss": 4.026360321044922,
+      "step": 1080
+    },
+    {
+      "epoch": 1.2018196856906533,
+      "grad_norm": 2.8184762001037598,
+      "learning_rate": 9.948074679113187e-05,
+      "loss": 4.007581329345703,
+      "step": 1090
+    },
+    {
+      "epoch": 1.2128480838158258,
+      "grad_norm": 3.2656188011169434,
+      "learning_rate": 9.942240373395566e-05,
+      "loss": 3.9965087890625,
+      "step": 1100
+    },
+    {
+      "epoch": 1.223876481940998,
+      "grad_norm": 2.4359538555145264,
+      "learning_rate": 9.936406067677947e-05,
+      "loss": 3.9959388732910157,
+      "step": 1110
+    },
+    {
+      "epoch": 1.2349048800661704,
+      "grad_norm": 1.9357632398605347,
+      "learning_rate": 9.930571761960327e-05,
+      "loss": 3.9851417541503906,
+      "step": 1120
+    },
+    {
+      "epoch": 1.2459332781913428,
+      "grad_norm": 2.1269352436065674,
+      "learning_rate": 9.924737456242708e-05,
+      "loss": 3.9773223876953123,
+      "step": 1130
+    },
+    {
+      "epoch": 1.256961676316515,
+      "grad_norm": 3.3491597175598145,
+      "learning_rate": 9.918903150525088e-05,
+      "loss": 3.9877471923828125,
+      "step": 1140
+    },
+    {
+      "epoch": 1.2679900744416872,
+      "grad_norm": 1.8646328449249268,
+      "learning_rate": 9.913068844807468e-05,
+      "loss": 3.9694965362548826,
+      "step": 1150
+    },
+    {
+      "epoch": 1.2790184725668596,
+      "grad_norm": 2.6204631328582764,
+      "learning_rate": 9.907234539089849e-05,
+      "loss": 3.9611881256103514,
+      "step": 1160
+    },
+    {
+      "epoch": 1.290046870692032,
+      "grad_norm": 1.872028112411499,
+      "learning_rate": 9.901400233372228e-05,
+      "loss": 3.964163970947266,
+      "step": 1170
+    },
+    {
+      "epoch": 1.3010752688172043,
+      "grad_norm": 3.490435838699341,
+      "learning_rate": 9.895565927654609e-05,
+      "loss": 3.959897994995117,
+      "step": 1180
+    },
+    {
+      "epoch": 1.3121036669423767,
+      "grad_norm": 2.862489700317383,
+      "learning_rate": 9.88973162193699e-05,
+      "loss": 3.9567939758300783,
+      "step": 1190
+    },
+    {
+      "epoch": 1.3231320650675489,
+      "grad_norm": 3.0570664405822754,
+      "learning_rate": 9.883897316219371e-05,
+      "loss": 3.9470645904541017,
+      "step": 1200
+    },
+    {
+      "epoch": 1.3341604631927213,
+      "grad_norm": 1.9254627227783203,
+      "learning_rate": 9.878063010501752e-05,
+      "loss": 3.9442317962646483,
+      "step": 1210
+    },
+    {
+      "epoch": 1.3451888613178935,
+      "grad_norm": 3.606224298477173,
+      "learning_rate": 9.872228704784131e-05,
+      "loss": 3.9380733489990236,
+      "step": 1220
+    },
+    {
+      "epoch": 1.356217259443066,
+      "grad_norm": 2.1184027194976807,
+      "learning_rate": 9.866394399066512e-05,
+      "loss": 3.9452835083007813,
+      "step": 1230
+    },
+    {
+      "epoch": 1.3672456575682381,
+      "grad_norm": 1.8997142314910889,
+      "learning_rate": 9.860560093348892e-05,
+      "loss": 3.9270603179931642,
+      "step": 1240
+    },
+    {
+      "epoch": 1.3782740556934105,
+      "grad_norm": 2.9672305583953857,
+      "learning_rate": 9.854725787631273e-05,
+      "loss": 3.9120155334472657,
+      "step": 1250
+    },
+    {
+      "epoch": 1.389302453818583,
+      "grad_norm": 1.9220951795578003,
+      "learning_rate": 9.848891481913652e-05,
+      "loss": 3.900279235839844,
+      "step": 1260
+    },
+    {
+      "epoch": 1.4003308519437552,
+      "grad_norm": 2.013521194458008,
+      "learning_rate": 9.843057176196033e-05,
+      "loss": 3.9147193908691404,
+      "step": 1270
+    },
+    {
+      "epoch": 1.4113592500689274,
+      "grad_norm": 1.451686143875122,
+      "learning_rate": 9.837222870478413e-05,
+      "loss": 3.906220245361328,
+      "step": 1280
+    },
+    {
+      "epoch": 1.4223876481940998,
+      "grad_norm": 4.606860637664795,
+      "learning_rate": 9.831388564760794e-05,
+      "loss": 3.905352020263672,
+      "step": 1290
+    },
+    {
+      "epoch": 1.4334160463192722,
+      "grad_norm": 1.779123306274414,
+      "learning_rate": 9.825554259043175e-05,
+      "loss": 3.9137496948242188,
+      "step": 1300
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 2.086585521697998,
+      "learning_rate": 9.819719953325554e-05,
+      "loss": 3.89554443359375,
+      "step": 1310
+    },
+    {
+      "epoch": 1.4554728425696168,
+      "grad_norm": 3.3514609336853027,
+      "learning_rate": 9.813885647607935e-05,
+      "loss": 3.8901123046875,
+      "step": 1320
+    },
+    {
+      "epoch": 1.466501240694789,
+      "grad_norm": 2.1145269870758057,
+      "learning_rate": 9.808051341890316e-05,
+      "loss": 3.8892486572265623,
+      "step": 1330
+    },
+    {
+      "epoch": 1.4775296388199615,
+      "grad_norm": 1.5503329038619995,
+      "learning_rate": 9.802217036172697e-05,
+      "loss": 3.8922355651855467,
+      "step": 1340
+    },
+    {
+      "epoch": 1.4885580369451337,
+      "grad_norm": 2.3014304637908936,
+      "learning_rate": 9.796382730455076e-05,
+      "loss": 3.8860099792480467,
+      "step": 1350
+    },
+    {
+      "epoch": 1.499586435070306,
+      "grad_norm": 1.9633557796478271,
+      "learning_rate": 9.790548424737457e-05,
+      "loss": 3.875183868408203,
+      "step": 1360
+    },
+    {
+      "epoch": 1.5106148331954783,
+      "grad_norm": 2.228351593017578,
+      "learning_rate": 9.784714119019837e-05,
+      "loss": 3.8726768493652344,
+      "step": 1370
+    },
+    {
+      "epoch": 1.5216432313206507,
+      "grad_norm": 3.0888657569885254,
+      "learning_rate": 9.778879813302218e-05,
+      "loss": 3.872690963745117,
+      "step": 1380
+    },
+    {
+      "epoch": 1.5326716294458231,
+      "grad_norm": 2.0078868865966797,
+      "learning_rate": 9.773045507584599e-05,
+      "loss": 3.8612388610839843,
+      "step": 1390
+    },
+    {
+      "epoch": 1.5437000275709953,
+      "grad_norm": 2.1966569423675537,
+      "learning_rate": 9.767211201866978e-05,
+      "loss": 3.8649852752685545,
+      "step": 1400
+    },
+    {
+      "epoch": 1.5547284256961675,
+      "grad_norm": 2.1047487258911133,
+      "learning_rate": 9.761376896149359e-05,
+      "loss": 3.8632328033447267,
+      "step": 1410
+    },
+    {
+      "epoch": 1.56575682382134,
+      "grad_norm": 1.9347233772277832,
+      "learning_rate": 9.755542590431739e-05,
+      "loss": 3.8362571716308596,
+      "step": 1420
+    },
+    {
+      "epoch": 1.5767852219465124,
+      "grad_norm": 1.7961437702178955,
+      "learning_rate": 9.74970828471412e-05,
+      "loss": 3.8461585998535157,
+      "step": 1430
+    },
+    {
+      "epoch": 1.5878136200716846,
+      "grad_norm": 2.4657342433929443,
+      "learning_rate": 9.743873978996499e-05,
+      "loss": 3.842551040649414,
+      "step": 1440
+    },
+    {
+      "epoch": 1.5988420181968568,
+      "grad_norm": 2.043138027191162,
+      "learning_rate": 9.73803967327888e-05,
+      "loss": 3.8387855529785155,
+      "step": 1450
+    },
+    {
+      "epoch": 1.6098704163220292,
+      "grad_norm": 3.732532262802124,
+      "learning_rate": 9.732205367561261e-05,
+      "loss": 3.8399681091308593,
+      "step": 1460
+    },
+    {
+      "epoch": 1.6208988144472016,
+      "grad_norm": 2.43684720993042,
+      "learning_rate": 9.726371061843642e-05,
+      "loss": 3.8324966430664062,
+      "step": 1470
+    },
+    {
+      "epoch": 1.6319272125723738,
+      "grad_norm": 2.4433460235595703,
+      "learning_rate": 9.720536756126023e-05,
+      "loss": 3.817783737182617,
+      "step": 1480
+    },
+    {
+      "epoch": 1.642955610697546,
+      "grad_norm": 2.1049606800079346,
+      "learning_rate": 9.714702450408402e-05,
+      "loss": 3.804280090332031,
+      "step": 1490
+    },
+    {
+      "epoch": 1.6539840088227185,
+      "grad_norm": 3.529686450958252,
+      "learning_rate": 9.708868144690783e-05,
+      "loss": 3.805449295043945,
+      "step": 1500
+    },
+    {
+      "epoch": 1.6650124069478909,
+      "grad_norm": 2.0984089374542236,
+      "learning_rate": 9.703033838973162e-05,
+      "loss": 3.788246917724609,
+      "step": 1510
+    },
+    {
+      "epoch": 1.6760408050730633,
+      "grad_norm": 1.9434291124343872,
+      "learning_rate": 9.697199533255543e-05,
+      "loss": 3.7875442504882812,
+      "step": 1520
+    },
+    {
+      "epoch": 1.6870692031982355,
+      "grad_norm": 1.99173903465271,
+      "learning_rate": 9.691365227537923e-05,
+      "loss": 3.7807193756103517,
+      "step": 1530
+    },
+    {
+      "epoch": 1.6980976013234077,
+      "grad_norm": 2.5006911754608154,
+      "learning_rate": 9.685530921820304e-05,
+      "loss": 3.744763946533203,
+      "step": 1540
+    },
+    {
+      "epoch": 1.7091259994485801,
+      "grad_norm": 2.1816165447235107,
+      "learning_rate": 9.679696616102685e-05,
+      "loss": 3.760245513916016,
+      "step": 1550
+    },
+    {
+      "epoch": 1.7201543975737525,
+      "grad_norm": 2.123291492462158,
+      "learning_rate": 9.673862310385064e-05,
+      "loss": 3.738916778564453,
+      "step": 1560
+    },
+    {
+      "epoch": 1.7311827956989247,
+      "grad_norm": 2.378187894821167,
+      "learning_rate": 9.668028004667445e-05,
+      "loss": 3.734139251708984,
+      "step": 1570
+    },
+    {
+      "epoch": 1.742211193824097,
+      "grad_norm": 2.54819393157959,
+      "learning_rate": 9.662193698949825e-05,
+      "loss": 3.715302276611328,
+      "step": 1580
+    },
+    {
+      "epoch": 1.7532395919492694,
+      "grad_norm": 4.285822868347168,
+      "learning_rate": 9.656359393232206e-05,
+      "loss": 3.72213134765625,
+      "step": 1590
+    },
+    {
+      "epoch": 1.7642679900744418,
+      "grad_norm": 1.8676700592041016,
+      "learning_rate": 9.650525087514586e-05,
+      "loss": 3.7252479553222657,
+      "step": 1600
+    },
+    {
+      "epoch": 1.775296388199614,
+      "grad_norm": 1.6977792978286743,
+      "learning_rate": 9.644690781796967e-05,
+      "loss": 3.704994964599609,
+      "step": 1610
+    },
+    {
+      "epoch": 1.7863247863247862,
+      "grad_norm": 1.8334232568740845,
+      "learning_rate": 9.638856476079347e-05,
+      "loss": 3.6980815887451173,
+      "step": 1620
+    },
+    {
+      "epoch": 1.7973531844499586,
+      "grad_norm": 2.6574559211730957,
+      "learning_rate": 9.633022170361728e-05,
+      "loss": 3.683759307861328,
+      "step": 1630
+    },
+    {
+      "epoch": 1.808381582575131,
+      "grad_norm": 2.085084915161133,
+      "learning_rate": 9.627187864644109e-05,
+      "loss": 3.67755126953125,
+      "step": 1640
+    },
+    {
+      "epoch": 1.8194099807003032,
+      "grad_norm": 1.685441017150879,
+      "learning_rate": 9.621353558926488e-05,
+      "loss": 3.656099319458008,
+      "step": 1650
+    },
+    {
+      "epoch": 1.8304383788254754,
+      "grad_norm": 2.4462475776672363,
+      "learning_rate": 9.615519253208869e-05,
+      "loss": 3.668656921386719,
+      "step": 1660
+    },
+    {
+      "epoch": 1.8414667769506479,
+      "grad_norm": 1.54155433177948,
+      "learning_rate": 9.609684947491249e-05,
+      "loss": 3.66968994140625,
+      "step": 1670
+    },
+    {
+      "epoch": 1.8524951750758203,
+      "grad_norm": 3.862130880355835,
+      "learning_rate": 9.60385064177363e-05,
+      "loss": 3.6412506103515625,
+      "step": 1680
+    },
+    {
+      "epoch": 1.8635235732009927,
+      "grad_norm": 1.7317070960998535,
+      "learning_rate": 9.598016336056009e-05,
+      "loss": 3.639806365966797,
+      "step": 1690
+    },
+    {
+      "epoch": 1.874551971326165,
+      "grad_norm": 2.2640931606292725,
+      "learning_rate": 9.59218203033839e-05,
+      "loss": 3.6341064453125,
+      "step": 1700
+    },
+    {
+      "epoch": 1.8855803694513371,
+      "grad_norm": 3.653146743774414,
+      "learning_rate": 9.586347724620771e-05,
+      "loss": 3.6380882263183594,
+      "step": 1710
+    },
+    {
+      "epoch": 1.8966087675765095,
+      "grad_norm": 1.8987306356430054,
+      "learning_rate": 9.58051341890315e-05,
+      "loss": 3.6405975341796877,
+      "step": 1720
+    },
+    {
+      "epoch": 1.907637165701682,
+      "grad_norm": 2.202659845352173,
+      "learning_rate": 9.574679113185531e-05,
+      "loss": 3.6375991821289064,
+      "step": 1730
+    },
+    {
+      "epoch": 1.9186655638268542,
+      "grad_norm": 1.5091872215270996,
+      "learning_rate": 9.568844807467912e-05,
+      "loss": 3.6208465576171873,
+      "step": 1740
+    },
+    {
+      "epoch": 1.9296939619520264,
+      "grad_norm": 1.9811325073242188,
+      "learning_rate": 9.563010501750293e-05,
+      "loss": 3.600755310058594,
+      "step": 1750
+    },
+    {
+      "epoch": 1.9407223600771988,
+      "grad_norm": 3.184499979019165,
+      "learning_rate": 9.557176196032673e-05,
+      "loss": 3.6109405517578126,
+      "step": 1760
+    },
+    {
+      "epoch": 1.9517507582023712,
+      "grad_norm": 2.340125322341919,
+      "learning_rate": 9.551341890315054e-05,
+      "loss": 3.6129817962646484,
+      "step": 1770
+    },
+    {
+      "epoch": 1.9627791563275434,
+      "grad_norm": 1.7258495092391968,
+      "learning_rate": 9.545507584597433e-05,
+      "loss": 3.590809631347656,
+      "step": 1780
+    },
+    {
+      "epoch": 1.9738075544527156,
+      "grad_norm": 1.6129754781723022,
+      "learning_rate": 9.539673278879814e-05,
+      "loss": 3.5866302490234374,
+      "step": 1790
+    },
+    {
+      "epoch": 1.984835952577888,
+      "grad_norm": 2.7458667755126953,
+      "learning_rate": 9.533838973162195e-05,
+      "loss": 3.596644973754883,
+      "step": 1800
+    },
+    {
+      "epoch": 1.9958643507030605,
+      "grad_norm": 2.258280038833618,
+      "learning_rate": 9.528004667444574e-05,
+      "loss": 3.5881332397460937,
+      "step": 1810
+    },
+    {
+      "epoch": 2.0066170388751035,
+      "grad_norm": 2.1228580474853516,
+      "learning_rate": 9.522170361726955e-05,
+      "loss": 3.5709766387939452,
+      "step": 1820
+    },
+    {
+      "epoch": 2.017645437000276,
+      "grad_norm": 1.588876485824585,
+      "learning_rate": 9.516336056009335e-05,
+      "loss": 3.5627593994140625,
+      "step": 1830
+    },
+    {
+      "epoch": 2.028673835125448,
+      "grad_norm": 2.451474189758301,
+      "learning_rate": 9.510501750291716e-05,
+      "loss": 3.5535301208496093,
+      "step": 1840
+    },
+    {
+      "epoch": 2.0397022332506203,
+      "grad_norm": 2.0007503032684326,
+      "learning_rate": 9.504667444574095e-05,
+      "loss": 3.553875732421875,
+      "step": 1850
+    },
+    {
+      "epoch": 2.0507306313757927,
+      "grad_norm": 1.4410080909729004,
+      "learning_rate": 9.498833138856476e-05,
+      "loss": 3.550189971923828,
+      "step": 1860
+    },
+    {
+      "epoch": 2.061759029500965,
+      "grad_norm": 2.062835216522217,
+      "learning_rate": 9.492998833138857e-05,
+      "loss": 3.5456893920898436,
+      "step": 1870
+    },
+    {
+      "epoch": 2.072787427626137,
+      "grad_norm": 2.4534783363342285,
+      "learning_rate": 9.487164527421238e-05,
+      "loss": 3.536829376220703,
+      "step": 1880
+    },
+    {
+      "epoch": 2.0838158257513095,
+      "grad_norm": 2.2788970470428467,
+      "learning_rate": 9.481330221703619e-05,
+      "loss": 3.5525283813476562,
+      "step": 1890
+    },
+    {
+      "epoch": 2.094844223876482,
+      "grad_norm": 1.4259227514266968,
+      "learning_rate": 9.475495915985998e-05,
+      "loss": 3.5479995727539064,
+      "step": 1900
+    },
+    {
+      "epoch": 2.1058726220016544,
+      "grad_norm": 2.672534465789795,
+      "learning_rate": 9.469661610268379e-05,
+      "loss": 3.5359420776367188,
+      "step": 1910
+    },
+    {
+      "epoch": 2.116901020126827,
+      "grad_norm": 2.0648045539855957,
+      "learning_rate": 9.463827304550759e-05,
+      "loss": 3.5452896118164063,
+      "step": 1920
+    },
+    {
+      "epoch": 2.1279294182519988,
+      "grad_norm": 1.6846543550491333,
+      "learning_rate": 9.45799299883314e-05,
+      "loss": 3.5434345245361327,
+      "step": 1930
+    },
+    {
+      "epoch": 2.138957816377171,
+      "grad_norm": 1.9105942249298096,
+      "learning_rate": 9.452158693115519e-05,
+      "loss": 3.5351535797119142,
+      "step": 1940
+    },
+    {
+      "epoch": 2.1499862145023436,
+      "grad_norm": 1.8230890035629272,
+      "learning_rate": 9.4463243873979e-05,
+      "loss": 3.5190963745117188,
+      "step": 1950
+    },
+    {
+      "epoch": 2.161014612627516,
+      "grad_norm": 1.6383274793624878,
+      "learning_rate": 9.440490081680281e-05,
+      "loss": 3.5228431701660154,
+      "step": 1960
+    },
+    {
+      "epoch": 2.172043010752688,
+      "grad_norm": 1.7378439903259277,
+      "learning_rate": 9.43465577596266e-05,
+      "loss": 3.520981216430664,
+      "step": 1970
+    },
+    {
+      "epoch": 2.1830714088778604,
+      "grad_norm": 1.941454529762268,
+      "learning_rate": 9.428821470245041e-05,
+      "loss": 3.519342803955078,
+      "step": 1980
+    },
+    {
+      "epoch": 2.194099807003033,
+      "grad_norm": 1.8295516967773438,
+      "learning_rate": 9.422987164527421e-05,
+      "loss": 3.5412979125976562,
+      "step": 1990
+    },
+    {
+      "epoch": 2.2051282051282053,
+      "grad_norm": 1.8052620887756348,
+      "learning_rate": 9.417152858809802e-05,
+      "loss": 3.5153289794921876,
+      "step": 2000
+    },
+    {
+      "epoch": 2.2161566032533773,
+      "grad_norm": 2.1949570178985596,
+      "learning_rate": 9.411318553092183e-05,
+      "loss": 3.521608352661133,
+      "step": 2010
+    },
+    {
+      "epoch": 2.2271850013785497,
+      "grad_norm": 1.746172308921814,
+      "learning_rate": 9.405484247374564e-05,
+      "loss": 3.5008296966552734,
+      "step": 2020
+    },
+    {
+      "epoch": 2.238213399503722,
+      "grad_norm": 2.5374276638031006,
+      "learning_rate": 9.399649941656943e-05,
+      "loss": 3.5140228271484375,
+      "step": 2030
+    },
+    {
+      "epoch": 2.2492417976288945,
+      "grad_norm": 1.7763218879699707,
+      "learning_rate": 9.393815635939324e-05,
+      "loss": 3.510652542114258,
+      "step": 2040
+    },
+    {
+      "epoch": 2.2602701957540665,
+      "grad_norm": 1.6599587202072144,
+      "learning_rate": 9.387981330221705e-05,
+      "loss": 3.5122325897216795,
+      "step": 2050
+    },
+    {
+      "epoch": 2.271298593879239,
+      "grad_norm": 2.1496078968048096,
+      "learning_rate": 9.382147024504085e-05,
+      "loss": 3.5139747619628907,
+      "step": 2060
+    },
+    {
+      "epoch": 2.2823269920044114,
+      "grad_norm": 1.64266836643219,
+      "learning_rate": 9.376312718786465e-05,
+      "loss": 3.507743072509766,
+      "step": 2070
+    },
+    {
+      "epoch": 2.293355390129584,
+      "grad_norm": 2.1241567134857178,
+      "learning_rate": 9.370478413068845e-05,
+      "loss": 3.5162708282470705,
+      "step": 2080
+    },
+    {
+      "epoch": 2.304383788254756,
+      "grad_norm": 1.8391071557998657,
+      "learning_rate": 9.364644107351226e-05,
+      "loss": 3.4955375671386717,
+      "step": 2090
+    },
+    {
+      "epoch": 2.315412186379928,
+      "grad_norm": 2.7478973865509033,
+      "learning_rate": 9.358809801633605e-05,
+      "loss": 3.497519302368164,
+      "step": 2100
+    },
+    {
+      "epoch": 2.3264405845051006,
+      "grad_norm": 1.938588261604309,
+      "learning_rate": 9.352975495915986e-05,
+      "loss": 3.490141677856445,
+      "step": 2110
+    },
+    {
+      "epoch": 2.337468982630273,
+      "grad_norm": 1.5637104511260986,
+      "learning_rate": 9.347141190198366e-05,
+      "loss": 3.499908447265625,
+      "step": 2120
+    },
+    {
+      "epoch": 2.3484973807554455,
+      "grad_norm": 1.882504940032959,
+      "learning_rate": 9.341306884480747e-05,
+      "loss": 3.491979217529297,
+      "step": 2130
+    },
+    {
+      "epoch": 2.3595257788806174,
+      "grad_norm": 1.8528521060943604,
+      "learning_rate": 9.335472578763128e-05,
+      "loss": 3.4961143493652345,
+      "step": 2140
+    },
+    {
+      "epoch": 2.37055417700579,
+      "grad_norm": 1.8050177097320557,
+      "learning_rate": 9.329638273045509e-05,
+      "loss": 3.4948150634765627,
+      "step": 2150
+    },
+    {
+      "epoch": 2.3815825751309623,
+      "grad_norm": 1.816784381866455,
+      "learning_rate": 9.32380396732789e-05,
+      "loss": 3.4910873413085937,
+      "step": 2160
+    },
+    {
+      "epoch": 2.3926109732561347,
+      "grad_norm": 1.9779244661331177,
+      "learning_rate": 9.317969661610269e-05,
+      "loss": 3.492570495605469,
+      "step": 2170
+    },
+    {
+      "epoch": 2.4036393713813067,
+      "grad_norm": 1.8939772844314575,
+      "learning_rate": 9.31213535589265e-05,
+      "loss": 3.473868560791016,
+      "step": 2180
+    },
+    {
+      "epoch": 2.414667769506479,
+      "grad_norm": 2.1493656635284424,
+      "learning_rate": 9.30630105017503e-05,
+      "loss": 3.494515228271484,
+      "step": 2190
+    },
+    {
+      "epoch": 2.4256961676316515,
+      "grad_norm": 1.8989397287368774,
+      "learning_rate": 9.30046674445741e-05,
+      "loss": 3.487537384033203,
+      "step": 2200
+    },
+    {
+      "epoch": 2.436724565756824,
+      "grad_norm": 1.881856918334961,
+      "learning_rate": 9.294632438739791e-05,
+      "loss": 3.475904083251953,
+      "step": 2210
+    },
+    {
+      "epoch": 2.447752963881996,
+      "grad_norm": 1.9463883638381958,
+      "learning_rate": 9.288798133022171e-05,
+      "loss": 3.4829254150390625,
+      "step": 2220
+    },
+    {
+      "epoch": 2.4587813620071683,
+      "grad_norm": 2.01379656791687,
+      "learning_rate": 9.282963827304552e-05,
+      "loss": 3.472850036621094,
+      "step": 2230
+    },
+    {
+      "epoch": 2.4698097601323408,
+      "grad_norm": 2.442741632461548,
+      "learning_rate": 9.277129521586931e-05,
+      "loss": 3.47030029296875,
+      "step": 2240
+    },
+    {
+      "epoch": 2.480838158257513,
+      "grad_norm": 1.5051734447479248,
+      "learning_rate": 9.271295215869312e-05,
+      "loss": 3.489413833618164,
+      "step": 2250
+    },
+    {
+      "epoch": 2.4918665563826856,
+      "grad_norm": 1.9489309787750244,
+      "learning_rate": 9.265460910151692e-05,
+      "loss": 3.464769744873047,
+      "step": 2260
+    },
+    {
+      "epoch": 2.5028949545078576,
+      "grad_norm": 2.319654941558838,
+      "learning_rate": 9.259626604434072e-05,
+      "loss": 3.469140625,
+      "step": 2270
+    },
+    {
+      "epoch": 2.51392335263303,
+      "grad_norm": 1.7984129190444946,
+      "learning_rate": 9.253792298716453e-05,
+      "loss": 3.466594696044922,
+      "step": 2280
+    },
+    {
+      "epoch": 2.5249517507582024,
+      "grad_norm": 1.640869379043579,
+      "learning_rate": 9.247957992998833e-05,
+      "loss": 3.463022994995117,
+      "step": 2290
+    },
+    {
+      "epoch": 2.5359801488833744,
+      "grad_norm": 1.6698195934295654,
+      "learning_rate": 9.242123687281214e-05,
+      "loss": 3.4695220947265626,
+      "step": 2300
+    },
+    {
+      "epoch": 2.547008547008547,
+      "grad_norm": 2.2945683002471924,
+      "learning_rate": 9.236289381563595e-05,
+      "loss": 3.469150924682617,
+      "step": 2310
+    },
+    {
+      "epoch": 2.5580369451337193,
+      "grad_norm": 1.7678370475769043,
+      "learning_rate": 9.230455075845976e-05,
+      "loss": 3.470307159423828,
+      "step": 2320
+    },
+    {
+      "epoch": 2.5690653432588917,
+      "grad_norm": 1.8386255502700806,
+      "learning_rate": 9.224620770128355e-05,
+      "loss": 3.4638832092285154,
+      "step": 2330
+    },
+    {
+      "epoch": 2.580093741384064,
+      "grad_norm": 2.0348527431488037,
+      "learning_rate": 9.218786464410736e-05,
+      "loss": 3.460480880737305,
+      "step": 2340
+    },
+    {
+      "epoch": 2.5911221395092365,
+      "grad_norm": 1.845974326133728,
+      "learning_rate": 9.212952158693116e-05,
+      "loss": 3.4529083251953123,
+      "step": 2350
+    },
+    {
+      "epoch": 2.6021505376344085,
+      "grad_norm": 2.0843095779418945,
+      "learning_rate": 9.207117852975496e-05,
+      "loss": 3.4576786041259764,
+      "step": 2360
+    },
+    {
+      "epoch": 2.613178935759581,
+      "grad_norm": 1.7627031803131104,
+      "learning_rate": 9.201283547257876e-05,
+      "loss": 3.4450752258300783,
+      "step": 2370
+    },
+    {
+      "epoch": 2.6242073338847534,
+      "grad_norm": 1.371972918510437,
+      "learning_rate": 9.195449241540257e-05,
+      "loss": 3.464734649658203,
+      "step": 2380
+    },
+    {
+      "epoch": 2.6352357320099253,
+      "grad_norm": 1.6781940460205078,
+      "learning_rate": 9.189614935822638e-05,
+      "loss": 3.444991683959961,
+      "step": 2390
+    },
+    {
+      "epoch": 2.6462641301350978,
+      "grad_norm": 1.8782585859298706,
+      "learning_rate": 9.183780630105017e-05,
+      "loss": 3.4558509826660155,
+      "step": 2400
+    },
+    {
+      "epoch": 2.65729252826027,
+      "grad_norm": 1.942812204360962,
+      "learning_rate": 9.177946324387398e-05,
+      "loss": 3.4555503845214846,
+      "step": 2410
+    },
+    {
+      "epoch": 2.6683209263854426,
+      "grad_norm": 1.404680609703064,
+      "learning_rate": 9.172112018669778e-05,
+      "loss": 3.438182830810547,
+      "step": 2420
+    },
+    {
+      "epoch": 2.679349324510615,
+      "grad_norm": 1.7656677961349487,
+      "learning_rate": 9.166277712952159e-05,
+      "loss": 3.4622947692871096,
+      "step": 2430
+    },
+    {
+      "epoch": 2.690377722635787,
+      "grad_norm": 1.8348901271820068,
+      "learning_rate": 9.16044340723454e-05,
+      "loss": 3.438182830810547,
+      "step": 2440
+    },
+    {
+      "epoch": 2.7014061207609594,
+      "grad_norm": 2.0641167163848877,
+      "learning_rate": 9.15460910151692e-05,
+      "loss": 3.441473388671875,
+      "step": 2450
+    },
+    {
+      "epoch": 2.712434518886132,
+      "grad_norm": 1.726035475730896,
+      "learning_rate": 9.148774795799301e-05,
+      "loss": 3.441991424560547,
+      "step": 2460
+    },
+    {
+      "epoch": 2.7234629170113043,
+      "grad_norm": 1.854658603668213,
+      "learning_rate": 9.142940490081681e-05,
+      "loss": 3.4441551208496093,
+      "step": 2470
+    },
+    {
+      "epoch": 2.7344913151364763,
+      "grad_norm": 1.8229296207427979,
+      "learning_rate": 9.137106184364062e-05,
+      "loss": 3.441034698486328,
+      "step": 2480
+    },
+    {
+      "epoch": 2.7455197132616487,
+      "grad_norm": 1.6627975702285767,
+      "learning_rate": 9.131271878646441e-05,
+      "loss": 3.4399124145507813,
+      "step": 2490
+    },
+    {
+      "epoch": 2.756548111386821,
+      "grad_norm": 1.4111251831054688,
+      "learning_rate": 9.125437572928822e-05,
+      "loss": 3.4374462127685548,
+      "step": 2500
+    },
+    {
+      "epoch": 2.7675765095119935,
+      "grad_norm": 2.015869379043579,
+      "learning_rate": 9.119603267211202e-05,
+      "loss": 3.4262016296386717,
+      "step": 2510
+    },
+    {
+      "epoch": 2.778604907637166,
+      "grad_norm": 2.2818591594696045,
+      "learning_rate": 9.113768961493583e-05,
+      "loss": 3.446285629272461,
+      "step": 2520
+    },
+    {
+      "epoch": 2.789633305762338,
+      "grad_norm": 1.8643262386322021,
+      "learning_rate": 9.107934655775962e-05,
+      "loss": 3.4362293243408204,
+      "step": 2530
+    },
+    {
+      "epoch": 2.8006617038875103,
+      "grad_norm": 1.248988151550293,
+      "learning_rate": 9.102100350058343e-05,
+      "loss": 3.441702651977539,
+      "step": 2540
+    },
+    {
+      "epoch": 2.8116901020126828,
+      "grad_norm": 1.5247464179992676,
+      "learning_rate": 9.096266044340724e-05,
+      "loss": 3.4388256072998047,
+      "step": 2550
+    },
+    {
+      "epoch": 2.8227185001378547,
+      "grad_norm": 1.9120620489120483,
+      "learning_rate": 9.090431738623103e-05,
+      "loss": 3.4206756591796874,
+      "step": 2560
+    },
+    {
+      "epoch": 2.833746898263027,
+      "grad_norm": 1.4591054916381836,
+      "learning_rate": 9.084597432905484e-05,
+      "loss": 3.4229709625244142,
+      "step": 2570
+    },
+    {
+      "epoch": 2.8447752963881996,
+      "grad_norm": 2.24849796295166,
+      "learning_rate": 9.078763127187865e-05,
+      "loss": 3.426911163330078,
+      "step": 2580
+    },
+    {
+      "epoch": 2.855803694513372,
+      "grad_norm": 1.5658804178237915,
+      "learning_rate": 9.072928821470246e-05,
+      "loss": 3.445120620727539,
+      "step": 2590
+    },
+    {
+      "epoch": 2.8668320926385444,
+      "grad_norm": 1.483583688735962,
+      "learning_rate": 9.067094515752626e-05,
+      "loss": 3.430312728881836,
+      "step": 2600
+    },
+    {
+      "epoch": 2.8778604907637164,
+      "grad_norm": 1.5759658813476562,
+      "learning_rate": 9.061260210035007e-05,
+      "loss": 3.4178386688232423,
+      "step": 2610
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": 1.9259848594665527,
+      "learning_rate": 9.055425904317386e-05,
+      "loss": 3.430949401855469,
+      "step": 2620
+    },
+    {
+      "epoch": 2.8999172870140613,
+      "grad_norm": 1.470717191696167,
+      "learning_rate": 9.049591598599767e-05,
+      "loss": 3.439757537841797,
+      "step": 2630
+    },
+    {
+      "epoch": 2.9109456851392337,
+      "grad_norm": 1.8934212923049927,
+      "learning_rate": 9.043757292882148e-05,
+      "loss": 3.430719757080078,
+      "step": 2640
+    },
+    {
+      "epoch": 2.9219740832644057,
+      "grad_norm": 1.6267489194869995,
+      "learning_rate": 9.037922987164527e-05,
+      "loss": 3.4224998474121096,
+      "step": 2650
+    },
+    {
+      "epoch": 2.933002481389578,
+      "grad_norm": 1.6213353872299194,
+      "learning_rate": 9.032088681446908e-05,
+      "loss": 3.4213233947753907,
+      "step": 2660
+    },
+    {
+      "epoch": 2.9440308795147505,
+      "grad_norm": 1.961879849433899,
+      "learning_rate": 9.026254375729288e-05,
+      "loss": 3.4108352661132812,
+      "step": 2670
+    },
+    {
+      "epoch": 2.955059277639923,
+      "grad_norm": 1.7363910675048828,
+      "learning_rate": 9.020420070011669e-05,
+      "loss": 3.423554229736328,
+      "step": 2680
+    },
+    {
+      "epoch": 2.9660876757650954,
+      "grad_norm": 1.6161952018737793,
+      "learning_rate": 9.014585764294048e-05,
+      "loss": 3.418962860107422,
+      "step": 2690
+    },
+    {
+      "epoch": 2.9771160738902673,
+      "grad_norm": 1.8065682649612427,
+      "learning_rate": 9.008751458576429e-05,
+      "loss": 3.4218765258789063,
+      "step": 2700
+    },
+    {
+      "epoch": 2.9881444720154398,
+      "grad_norm": 1.4285337924957275,
+      "learning_rate": 9.00291715285881e-05,
+      "loss": 3.413957214355469,
+      "step": 2710
+    },
+    {
+      "epoch": 2.999172870140612,
+      "grad_norm": 1.30274498462677,
+      "learning_rate": 8.997082847141191e-05,
+      "loss": 3.4176124572753905,
+      "step": 2720
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 18140,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1083298732048384.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

output_qwen3_plain_ar/checkpoint-2721/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,760 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# Set to a truthy value to enable the diagnostic prints guarded by ``if debug:`` throughout this script.
+debug = 0
+# load to cpu: used as ``map_location`` for every ``torch.load`` call in this script
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict``.
+    For every parameter group the per-rank partitions are concatenated into one flat vector; each
+    parameter is then sliced out of that vector at a running offset and viewed with its shape.
+    Args:
+        - ``state_dict``: dict updated in place with ``name -> tensor`` entries
+        - ``world_size``: number of partitions, also used to compute the alignment unit
+        - ``fp32_flat_groups``: indexed as ``[rank][group]``, the fp32 partition of each group
+        - ``zero_model_states``: only element 0 is read, for its ``param_shapes``
+    Raises:
+        - ``ValueError``: after alignment, the consumed element count of a group differs from its size
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # group i of every rank, in rank order, concatenated into one flat vector
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # ``offset`` restarts for every group; ``avail_numel`` is rebound to this group's size
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shapes exposing ``numel()`` are used directly, anything else is treated as a sequence of dims
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # ``narrow`` + ``view`` produce a view into the merged vector rather than a copy
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            # round ``x`` up to the next multiple of ``align_to``
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    The instance only records where the parameter lives inside the per-rank flat groups; the real
+    tensor is built on demand by ``contiguous()``.
+    """
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        """
+        Args:
+            - ``flat_groups``: indexed as ``[rank][group]``, the flat fp32 tensors of every rank
+            - ``flat_groups_offset``: cumulative element offsets of the groups, starting with 0
+            - ``offset``: start of this parameter's partition within one rank's concatenated groups
+            - ``partitioned_numel``: number of elements this parameter occupies on each rank
+            - ``shape``: full shape of the parameter; must provide ``numel()``
+        """
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        # dtype is taken from the first group of the first rank
+        self.dtype = self.flat_groups[0][0].dtype
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        Returns the parameter as a contiguous tensor of ``self.shape``.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            # locate the groups containing the first and the last element of the partition; the
+            # search stops at the end group, before ``group_id + 1`` can index past the offsets list
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                # NOTE(review): for a group after the start group this difference is negative, which
+                # slices from the end of the tensor - presumably a partition never spans groups; confirm
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        # drop the elements past the parameter's real size before reshaping
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+    Returns:
+        - pytorch ``state_dict``
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    The output is written as one or more shard files in ``output_dir``, plus a JSON index file when
+    the state dict was split into several shards.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` disables sharding
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError``: ``safetensors`` or ``huggingface_hub`` is required by the chosen options but not installed
+    """
+    # Dependency pre-check: fail early, before any checkpoint data is loaded
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict (lazy mode: tensors are materialized shard by shard below)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty tensors that only carry
+        # shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: a minimal stand-in exposing the two attributes used below
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        # materialize only the tensors that belong to this shard
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

output_qwen3_plain_ar/checkpoint-3628/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "magel_chord_dropout_trigger_prob": 0.6,
+  "magel_num_audio_token": 16384,
+  "magel_structure_dropout_trigger_prob": 0.6,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.4.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 168056
+}

output_qwen3_plain_ar/checkpoint-3628/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.4.0"
+}

output_qwen3_plain_ar/checkpoint-3628/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step3628

output_qwen3_plain_ar/checkpoint-3628/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2568 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.0,
+  "eval_steps": 500,
+  "global_step": 3628,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011028398125172319,
+      "grad_norm": 435.2422180175781,
+      "learning_rate": 9e-07,
+      "loss": 20.84569549560547,
+      "step": 10
+    },
+    {
+      "epoch": 0.022056796250344637,
+      "grad_norm": 141.7341766357422,
+      "learning_rate": 1.9e-06,
+      "loss": 18.69615936279297,
+      "step": 20
+    },
+    {
+      "epoch": 0.033085194375516956,
+      "grad_norm": 74.42520904541016,
+      "learning_rate": 2.9e-06,
+      "loss": 16.079673767089844,
+      "step": 30
+    },
+    {
+      "epoch": 0.044113592500689275,
+      "grad_norm": 24.73248863220215,
+      "learning_rate": 3.9e-06,
+      "loss": 13.684315490722657,
+      "step": 40
+    },
+    {
+      "epoch": 0.055141990625861594,
+      "grad_norm": 7.049101829528809,
+      "learning_rate": 4.9000000000000005e-06,
+      "loss": 12.474874877929688,
+      "step": 50
+    },
+    {
+      "epoch": 0.06617038875103391,
+      "grad_norm": 2.3411474227905273,
+      "learning_rate": 5.9e-06,
+      "loss": 12.072142028808594,
+      "step": 60
+    },
+    {
+      "epoch": 0.07719878687620624,
+      "grad_norm": 1.126215934753418,
+      "learning_rate": 6.900000000000001e-06,
+      "loss": 11.938906860351562,
+      "step": 70
+    },
+    {
+      "epoch": 0.08822718500137855,
+      "grad_norm": 1.2050226926803589,
+      "learning_rate": 7.9e-06,
+      "loss": 11.81988296508789,
+      "step": 80
+    },
+    {
+      "epoch": 0.09925558312655088,
+      "grad_norm": 1.444793462753296,
+      "learning_rate": 8.9e-06,
+      "loss": 11.602033996582032,
+      "step": 90
+    },
+    {
+      "epoch": 0.11028398125172319,
+      "grad_norm": 5.791665077209473,
+      "learning_rate": 9.900000000000002e-06,
+      "loss": 11.201815032958985,
+      "step": 100
+    },
+    {
+      "epoch": 0.12131237937689551,
+      "grad_norm": 9.492277145385742,
+      "learning_rate": 1.09e-05,
+      "loss": 10.535708618164062,
+      "step": 110
+    },
+    {
+      "epoch": 0.13234077750206782,
+      "grad_norm": 2.7546133995056152,
+      "learning_rate": 1.19e-05,
+      "loss": 9.847169494628906,
+      "step": 120
+    },
+    {
+      "epoch": 0.14336917562724014,
+      "grad_norm": 1.0953313112258911,
+      "learning_rate": 1.29e-05,
+      "loss": 9.429026031494141,
+      "step": 130
+    },
+    {
+      "epoch": 0.15439757375241248,
+      "grad_norm": 0.7153559327125549,
+      "learning_rate": 1.3900000000000002e-05,
+      "loss": 9.266969299316406,
+      "step": 140
+    },
+    {
+      "epoch": 0.1654259718775848,
+      "grad_norm": 0.5888933539390564,
+      "learning_rate": 1.49e-05,
+      "loss": 9.1935546875,
+      "step": 150
+    },
+    {
+      "epoch": 0.1764543700027571,
+      "grad_norm": 0.4850365221500397,
+      "learning_rate": 1.59e-05,
+      "loss": 9.19604034423828,
+      "step": 160
+    },
+    {
+      "epoch": 0.1874827681279294,
+      "grad_norm": 0.5772538185119629,
+      "learning_rate": 1.69e-05,
+      "loss": 9.17010726928711,
+      "step": 170
+    },
+    {
+      "epoch": 0.19851116625310175,
+      "grad_norm": 0.4283920228481293,
+      "learning_rate": 1.79e-05,
+      "loss": 9.172830200195312,
+      "step": 180
+    },
+    {
+      "epoch": 0.20953956437827406,
+      "grad_norm": 0.8650698065757751,
+      "learning_rate": 1.8900000000000002e-05,
+      "loss": 9.154988098144532,
+      "step": 190
+    },
+    {
+      "epoch": 0.22056796250344637,
+      "grad_norm": 0.42017608880996704,
+      "learning_rate": 1.9900000000000003e-05,
+      "loss": 9.146849060058594,
+      "step": 200
+    },
+    {
+      "epoch": 0.23159636062861869,
+      "grad_norm": 0.9125994443893433,
+      "learning_rate": 2.09e-05,
+      "loss": 9.164442443847657,
+      "step": 210
+    },
+    {
+      "epoch": 0.24262475875379103,
+      "grad_norm": 0.6468876004219055,
+      "learning_rate": 2.19e-05,
+      "loss": 9.159596252441407,
+      "step": 220
+    },
+    {
+      "epoch": 0.25365315687896334,
+      "grad_norm": 0.4124819338321686,
+      "learning_rate": 2.29e-05,
+      "loss": 9.13860626220703,
+      "step": 230
+    },
+    {
+      "epoch": 0.26468155500413565,
+      "grad_norm": 1.990302562713623,
+      "learning_rate": 2.39e-05,
+      "loss": 9.145040893554688,
+      "step": 240
+    },
+    {
+      "epoch": 0.27570995312930796,
+      "grad_norm": 0.7875277400016785,
+      "learning_rate": 2.4900000000000002e-05,
+      "loss": 9.152925109863281,
+      "step": 250
+    },
+    {
+      "epoch": 0.2867383512544803,
+      "grad_norm": 0.8343706130981445,
+      "learning_rate": 2.5900000000000003e-05,
+      "loss": 9.132975769042968,
+      "step": 260
+    },
+    {
+      "epoch": 0.2977667493796526,
+      "grad_norm": 3.00996470451355,
+      "learning_rate": 2.6900000000000003e-05,
+      "loss": 9.097848510742187,
+      "step": 270
+    },
+    {
+      "epoch": 0.30879514750482495,
+      "grad_norm": 2.4282069206237793,
+      "learning_rate": 2.7900000000000004e-05,
+      "loss": 9.042235565185546,
+      "step": 280
+    },
+    {
+      "epoch": 0.31982354562999726,
+      "grad_norm": 4.171019554138184,
+      "learning_rate": 2.8899999999999998e-05,
+      "loss": 8.927298736572265,
+      "step": 290
+    },
+    {
+      "epoch": 0.3308519437551696,
+      "grad_norm": 2.197887659072876,
+      "learning_rate": 2.9900000000000002e-05,
+      "loss": 8.805252075195312,
+      "step": 300
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 10.306541442871094,
+      "learning_rate": 3.09e-05,
+      "loss": 8.673678588867187,
+      "step": 310
+    },
+    {
+      "epoch": 0.3529087400055142,
+      "grad_norm": 8.463860511779785,
+      "learning_rate": 3.19e-05,
+      "loss": 8.570347595214844,
+      "step": 320
+    },
+    {
+      "epoch": 0.3639371381306865,
+      "grad_norm": 3.999753475189209,
+      "learning_rate": 3.29e-05,
+      "loss": 8.429109191894531,
+      "step": 330
+    },
+    {
+      "epoch": 0.3749655362558588,
+      "grad_norm": 5.259007930755615,
+      "learning_rate": 3.3900000000000004e-05,
+      "loss": 8.334149169921876,
+      "step": 340
+    },
+    {
+      "epoch": 0.38599393438103113,
+      "grad_norm": 8.362598419189453,
+      "learning_rate": 3.49e-05,
+      "loss": 8.196139526367187,
+      "step": 350
+    },
+    {
+      "epoch": 0.3970223325062035,
+      "grad_norm": 10.273512840270996,
+      "learning_rate": 3.59e-05,
+      "loss": 8.040153503417969,
+      "step": 360
+    },
+    {
+      "epoch": 0.4080507306313758,
+      "grad_norm": 5.111108303070068,
+      "learning_rate": 3.69e-05,
+      "loss": 7.866473388671875,
+      "step": 370
+    },
+    {
+      "epoch": 0.4190791287565481,
+      "grad_norm": 9.192107200622559,
+      "learning_rate": 3.79e-05,
+      "loss": 7.695774841308594,
+      "step": 380
+    },
+    {
+      "epoch": 0.43010752688172044,
+      "grad_norm": 5.393336772918701,
+      "learning_rate": 3.8900000000000004e-05,
+      "loss": 7.498152160644532,
+      "step": 390
+    },
+    {
+      "epoch": 0.44113592500689275,
+      "grad_norm": 10.53490161895752,
+      "learning_rate": 3.99e-05,
+      "loss": 7.270246887207032,
+      "step": 400
+    },
+    {
+      "epoch": 0.45216432313206506,
+      "grad_norm": 6.174643516540527,
+      "learning_rate": 4.09e-05,
+      "loss": 7.127191162109375,
+      "step": 410
+    },
+    {
+      "epoch": 0.46319272125723737,
+      "grad_norm": 4.522936820983887,
+      "learning_rate": 4.19e-05,
+      "loss": 6.871500396728516,
+      "step": 420
+    },
+    {
+      "epoch": 0.4742211193824097,
+      "grad_norm": 4.3594207763671875,
+      "learning_rate": 4.29e-05,
+      "loss": 6.702586364746094,
+      "step": 430
+    },
+    {
+      "epoch": 0.48524951750758205,
+      "grad_norm": 5.950730323791504,
+      "learning_rate": 4.39e-05,
+      "loss": 6.493560791015625,
+      "step": 440
+    },
+    {
+      "epoch": 0.49627791563275436,
+      "grad_norm": 6.233413219451904,
+      "learning_rate": 4.49e-05,
+      "loss": 6.293489074707031,
+      "step": 450
+    },
+    {
+      "epoch": 0.5073063137579267,
+      "grad_norm": 7.656834125518799,
+      "learning_rate": 4.5900000000000004e-05,
+      "loss": 6.102347946166992,
+      "step": 460
+    },
+    {
+      "epoch": 0.518334711883099,
+      "grad_norm": 4.319094657897949,
+      "learning_rate": 4.69e-05,
+      "loss": 5.928083419799805,
+      "step": 470
+    },
+    {
+      "epoch": 0.5293631100082713,
+      "grad_norm": 5.585537433624268,
+      "learning_rate": 4.79e-05,
+      "loss": 5.77436637878418,
+      "step": 480
+    },
+    {
+      "epoch": 0.5403915081334436,
+      "grad_norm": 5.104014873504639,
+      "learning_rate": 4.89e-05,
+      "loss": 5.636859130859375,
+      "step": 490
+    },
+    {
+      "epoch": 0.5514199062586159,
+      "grad_norm": 5.453028202056885,
+      "learning_rate": 4.99e-05,
+      "loss": 5.507636260986328,
+      "step": 500
+    },
+    {
+      "epoch": 0.5624483043837882,
+      "grad_norm": 7.728854179382324,
+      "learning_rate": 5.0900000000000004e-05,
+      "loss": 5.411964416503906,
+      "step": 510
+    },
+    {
+      "epoch": 0.5734767025089605,
+      "grad_norm": 4.50288724899292,
+      "learning_rate": 5.19e-05,
+      "loss": 5.295291900634766,
+      "step": 520
+    },
+    {
+      "epoch": 0.5845051006341329,
+      "grad_norm": 4.245919704437256,
+      "learning_rate": 5.2900000000000005e-05,
+      "loss": 5.194162750244141,
+      "step": 530
+    },
+    {
+      "epoch": 0.5955334987593052,
+      "grad_norm": 6.278975963592529,
+      "learning_rate": 5.390000000000001e-05,
+      "loss": 5.113618087768555,
+      "step": 540
+    },
+    {
+      "epoch": 0.6065618968844775,
+      "grad_norm": 4.214662075042725,
+      "learning_rate": 5.4900000000000006e-05,
+      "loss": 5.038372039794922,
+      "step": 550
+    },
+    {
+      "epoch": 0.6175902950096499,
+      "grad_norm": 3.5404605865478516,
+      "learning_rate": 5.590000000000001e-05,
+      "loss": 4.935391235351562,
+      "step": 560
+    },
+    {
+      "epoch": 0.6286186931348222,
+      "grad_norm": 3.6460280418395996,
+      "learning_rate": 5.69e-05,
+      "loss": 4.896538543701172,
+      "step": 570
+    },
+    {
+      "epoch": 0.6396470912599945,
+      "grad_norm": 5.254800796508789,
+      "learning_rate": 5.79e-05,
+      "loss": 4.829419708251953,
+      "step": 580
+    },
+    {
+      "epoch": 0.6506754893851668,
+      "grad_norm": 5.132180690765381,
+      "learning_rate": 5.89e-05,
+      "loss": 4.793368148803711,
+      "step": 590
+    },
+    {
+      "epoch": 0.6617038875103392,
+      "grad_norm": 4.222960948944092,
+      "learning_rate": 5.99e-05,
+      "loss": 4.746239852905274,
+      "step": 600
+    },
+    {
+      "epoch": 0.6727322856355115,
+      "grad_norm": 4.070414066314697,
+      "learning_rate": 6.09e-05,
+      "loss": 4.688523864746093,
+      "step": 610
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 3.4652583599090576,
+      "learning_rate": 6.19e-05,
+      "loss": 4.692922973632813,
+      "step": 620
+    },
+    {
+      "epoch": 0.6947890818858561,
+      "grad_norm": 4.559128284454346,
+      "learning_rate": 6.29e-05,
+      "loss": 4.639920043945312,
+      "step": 630
+    },
+    {
+      "epoch": 0.7058174800110284,
+      "grad_norm": 3.197758436203003,
+      "learning_rate": 6.390000000000001e-05,
+      "loss": 4.601907348632812,
+      "step": 640
+    },
+    {
+      "epoch": 0.7168458781362007,
+      "grad_norm": 4.209578514099121,
+      "learning_rate": 6.49e-05,
+      "loss": 4.56639404296875,
+      "step": 650
+    },
+    {
+      "epoch": 0.727874276261373,
+      "grad_norm": 3.701484203338623,
+      "learning_rate": 6.59e-05,
+      "loss": 4.545608901977539,
+      "step": 660
+    },
+    {
+      "epoch": 0.7389026743865453,
+      "grad_norm": 3.951927900314331,
+      "learning_rate": 6.690000000000001e-05,
+      "loss": 4.493326187133789,
+      "step": 670
+    },
+    {
+      "epoch": 0.7499310725117176,
+      "grad_norm": 4.219130039215088,
+      "learning_rate": 6.790000000000001e-05,
+      "loss": 4.482691955566406,
+      "step": 680
+    },
+    {
+      "epoch": 0.76095947063689,
+      "grad_norm": 6.267204284667969,
+      "learning_rate": 6.89e-05,
+      "loss": 4.4599052429199215,
+      "step": 690
+    },
+    {
+      "epoch": 0.7719878687620623,
+      "grad_norm": 3.367382764816284,
+      "learning_rate": 6.99e-05,
+      "loss": 4.429808807373047,
+      "step": 700
+    },
+    {
+      "epoch": 0.7830162668872346,
+      "grad_norm": 3.8906455039978027,
+      "learning_rate": 7.09e-05,
+      "loss": 4.4144752502441404,
+      "step": 710
+    },
+    {
+      "epoch": 0.794044665012407,
+      "grad_norm": 6.759398460388184,
+      "learning_rate": 7.19e-05,
+      "loss": 4.385488891601563,
+      "step": 720
+    },
+    {
+      "epoch": 0.8050730631375793,
+      "grad_norm": 3.520167350769043,
+      "learning_rate": 7.29e-05,
+      "loss": 4.397706985473633,
+      "step": 730
+    },
+    {
+      "epoch": 0.8161014612627516,
+      "grad_norm": 2.7510974407196045,
+      "learning_rate": 7.390000000000001e-05,
+      "loss": 4.374617385864258,
+      "step": 740
+    },
+    {
+      "epoch": 0.8271298593879239,
+      "grad_norm": 4.395699977874756,
+      "learning_rate": 7.49e-05,
+      "loss": 4.3302146911621096,
+      "step": 750
+    },
+    {
+      "epoch": 0.8381582575130962,
+      "grad_norm": 3.277766704559326,
+      "learning_rate": 7.59e-05,
+      "loss": 4.313335418701172,
+      "step": 760
+    },
+    {
+      "epoch": 0.8491866556382686,
+      "grad_norm": 2.466207981109619,
+      "learning_rate": 7.69e-05,
+      "loss": 4.3226570129394535,
+      "step": 770
+    },
+    {
+      "epoch": 0.8602150537634409,
+      "grad_norm": 3.637355327606201,
+      "learning_rate": 7.790000000000001e-05,
+      "loss": 4.295929718017578,
+      "step": 780
+    },
+    {
+      "epoch": 0.8712434518886132,
+      "grad_norm": 3.155527353286743,
+      "learning_rate": 7.890000000000001e-05,
+      "loss": 4.287591552734375,
+      "step": 790
+    },
+    {
+      "epoch": 0.8822718500137855,
+      "grad_norm": 3.593884229660034,
+      "learning_rate": 7.99e-05,
+      "loss": 4.267314147949219,
+      "step": 800
+    },
+    {
+      "epoch": 0.8933002481389578,
+      "grad_norm": 2.361081123352051,
+      "learning_rate": 8.090000000000001e-05,
+      "loss": 4.265741348266602,
+      "step": 810
+    },
+    {
+      "epoch": 0.9043286462641301,
+      "grad_norm": 2.7084105014801025,
+      "learning_rate": 8.19e-05,
+      "loss": 4.261878204345703,
+      "step": 820
+    },
+    {
+      "epoch": 0.9153570443893024,
+      "grad_norm": 3.6093873977661133,
+      "learning_rate": 8.29e-05,
+      "loss": 4.211677551269531,
+      "step": 830
+    },
+    {
+      "epoch": 0.9263854425144747,
+      "grad_norm": 3.9739396572113037,
+      "learning_rate": 8.39e-05,
+      "loss": 4.224007034301758,
+      "step": 840
+    },
+    {
+      "epoch": 0.9374138406396471,
+      "grad_norm": 2.174050807952881,
+      "learning_rate": 8.49e-05,
+      "loss": 4.211782836914063,
+      "step": 850
+    },
+    {
+      "epoch": 0.9484422387648194,
+      "grad_norm": 2.7151405811309814,
+      "learning_rate": 8.59e-05,
+      "loss": 4.204391098022461,
+      "step": 860
+    },
+    {
+      "epoch": 0.9594706368899917,
+      "grad_norm": 3.7480661869049072,
+      "learning_rate": 8.69e-05,
+      "loss": 4.175582504272461,
+      "step": 870
+    },
+    {
+      "epoch": 0.9704990350151641,
+      "grad_norm": 3.1127700805664062,
+      "learning_rate": 8.790000000000001e-05,
+      "loss": 4.183733749389648,
+      "step": 880
+    },
+    {
+      "epoch": 0.9815274331403364,
+      "grad_norm": 2.750716209411621,
+      "learning_rate": 8.89e-05,
+      "loss": 4.167971801757813,
+      "step": 890
+    },
+    {
+      "epoch": 0.9925558312655087,
+      "grad_norm": 4.02509880065918,
+      "learning_rate": 8.99e-05,
+      "loss": 4.170472717285156,
+      "step": 900
+    },
+    {
+      "epoch": 1.0033085194375517,
+      "grad_norm": 3.0058505535125732,
+      "learning_rate": 9.090000000000001e-05,
+      "loss": 4.1449127197265625,
+      "step": 910
+    },
+    {
+      "epoch": 1.014336917562724,
+      "grad_norm": 2.553403377532959,
+      "learning_rate": 9.190000000000001e-05,
+      "loss": 4.1404258728027346,
+      "step": 920
+    },
+    {
+      "epoch": 1.0253653156878964,
+      "grad_norm": 2.8066084384918213,
+      "learning_rate": 9.290000000000001e-05,
+      "loss": 4.110780334472656,
+      "step": 930
+    },
+    {
+      "epoch": 1.0363937138130686,
+      "grad_norm": 3.904608726501465,
+      "learning_rate": 9.39e-05,
+      "loss": 4.134862899780273,
+      "step": 940
+    },
+    {
+      "epoch": 1.047422111938241,
+      "grad_norm": 2.217729330062866,
+      "learning_rate": 9.49e-05,
+      "loss": 4.112079620361328,
+      "step": 950
+    },
+    {
+      "epoch": 1.0584505100634134,
+      "grad_norm": 2.498760938644409,
+      "learning_rate": 9.59e-05,
+      "loss": 4.097566986083985,
+      "step": 960
+    },
+    {
+      "epoch": 1.0694789081885856,
+      "grad_norm": 3.577143907546997,
+      "learning_rate": 9.69e-05,
+      "loss": 4.081307220458984,
+      "step": 970
+    },
+    {
+      "epoch": 1.080507306313758,
+      "grad_norm": 3.283250570297241,
+      "learning_rate": 9.790000000000001e-05,
+      "loss": 4.103987503051758,
+      "step": 980
+    },
+    {
+      "epoch": 1.0915357044389302,
+      "grad_norm": 2.1897776126861572,
+      "learning_rate": 9.89e-05,
+      "loss": 4.084938812255859,
+      "step": 990
+    },
+    {
+      "epoch": 1.1025641025641026,
+      "grad_norm": 2.6925997734069824,
+      "learning_rate": 9.99e-05,
+      "loss": 4.058921051025391,
+      "step": 1000
+    },
+    {
+      "epoch": 1.1135925006892748,
+      "grad_norm": 3.4118456840515137,
+      "learning_rate": 9.994749124854142e-05,
+      "loss": 4.061585235595703,
+      "step": 1010
+    },
+    {
+      "epoch": 1.1246208988144473,
+      "grad_norm": 2.6139297485351562,
+      "learning_rate": 9.988914819136523e-05,
+      "loss": 4.070050048828125,
+      "step": 1020
+    },
+    {
+      "epoch": 1.1356492969396195,
+      "grad_norm": 1.8616399765014648,
+      "learning_rate": 9.983080513418903e-05,
+      "loss": 4.0413330078125,
+      "step": 1030
+    },
+    {
+      "epoch": 1.146677695064792,
+      "grad_norm": 2.361706018447876,
+      "learning_rate": 9.977246207701284e-05,
+      "loss": 4.023075866699219,
+      "step": 1040
+    },
+    {
+      "epoch": 1.157706093189964,
+      "grad_norm": 3.815014123916626,
+      "learning_rate": 9.971411901983664e-05,
+      "loss": 4.036756134033203,
+      "step": 1050
+    },
+    {
+      "epoch": 1.1687344913151365,
+      "grad_norm": 2.4410274028778076,
+      "learning_rate": 9.965577596266045e-05,
+      "loss": 4.020483779907226,
+      "step": 1060
+    },
+    {
+      "epoch": 1.1797628894403087,
+      "grad_norm": 2.768084764480591,
+      "learning_rate": 9.959743290548426e-05,
+      "loss": 4.021839141845703,
+      "step": 1070
+    },
+    {
+      "epoch": 1.1907912875654811,
+      "grad_norm": 1.9342570304870605,
+      "learning_rate": 9.953908984830806e-05,
+      "loss": 4.026360321044922,
+      "step": 1080
+    },
+    {
+      "epoch": 1.2018196856906533,
+      "grad_norm": 2.8184762001037598,
+      "learning_rate": 9.948074679113187e-05,
+      "loss": 4.007581329345703,
+      "step": 1090
+    },
+    {
+      "epoch": 1.2128480838158258,
+      "grad_norm": 3.2656188011169434,
+      "learning_rate": 9.942240373395566e-05,
+      "loss": 3.9965087890625,
+      "step": 1100
+    },
+    {
+      "epoch": 1.223876481940998,
+      "grad_norm": 2.4359538555145264,
+      "learning_rate": 9.936406067677947e-05,
+      "loss": 3.9959388732910157,
+      "step": 1110
+    },
+    {
+      "epoch": 1.2349048800661704,
+      "grad_norm": 1.9357632398605347,
+      "learning_rate": 9.930571761960327e-05,
+      "loss": 3.9851417541503906,
+      "step": 1120
+    },
+    {
+      "epoch": 1.2459332781913428,
+      "grad_norm": 2.1269352436065674,
+      "learning_rate": 9.924737456242708e-05,
+      "loss": 3.9773223876953123,
+      "step": 1130
+    },
+    {
+      "epoch": 1.256961676316515,
+      "grad_norm": 3.3491597175598145,
+      "learning_rate": 9.918903150525088e-05,
+      "loss": 3.9877471923828125,
+      "step": 1140
+    },
+    {
+      "epoch": 1.2679900744416872,
+      "grad_norm": 1.8646328449249268,
+      "learning_rate": 9.913068844807468e-05,
+      "loss": 3.9694965362548826,
+      "step": 1150
+    },
+    {
+      "epoch": 1.2790184725668596,
+      "grad_norm": 2.6204631328582764,
+      "learning_rate": 9.907234539089849e-05,
+      "loss": 3.9611881256103514,
+      "step": 1160
+    },
+    {
+      "epoch": 1.290046870692032,
+      "grad_norm": 1.872028112411499,
+      "learning_rate": 9.901400233372228e-05,
+      "loss": 3.964163970947266,
+      "step": 1170
+    },
+    {
+      "epoch": 1.3010752688172043,
+      "grad_norm": 3.490435838699341,
+      "learning_rate": 9.895565927654609e-05,
+      "loss": 3.959897994995117,
+      "step": 1180
+    },
+    {
+      "epoch": 1.3121036669423767,
+      "grad_norm": 2.862489700317383,
+      "learning_rate": 9.88973162193699e-05,
+      "loss": 3.9567939758300783,
+      "step": 1190
+    },
+    {
+      "epoch": 1.3231320650675489,
+      "grad_norm": 3.0570664405822754,
+      "learning_rate": 9.883897316219371e-05,
+      "loss": 3.9470645904541017,
+      "step": 1200
+    },
+    {
+      "epoch": 1.3341604631927213,
+      "grad_norm": 1.9254627227783203,
+      "learning_rate": 9.878063010501752e-05,
+      "loss": 3.9442317962646483,
+      "step": 1210
+    },
+    {
+      "epoch": 1.3451888613178935,
+      "grad_norm": 3.606224298477173,
+      "learning_rate": 9.872228704784131e-05,
+      "loss": 3.9380733489990236,
+      "step": 1220
+    },
+    {
+      "epoch": 1.356217259443066,
+      "grad_norm": 2.1184027194976807,
+      "learning_rate": 9.866394399066512e-05,
+      "loss": 3.9452835083007813,
+      "step": 1230
+    },
+    {
+      "epoch": 1.3672456575682381,
+      "grad_norm": 1.8997142314910889,
+      "learning_rate": 9.860560093348892e-05,
+      "loss": 3.9270603179931642,
+      "step": 1240
+    },
+    {
+      "epoch": 1.3782740556934105,
+      "grad_norm": 2.9672305583953857,
+      "learning_rate": 9.854725787631273e-05,
+      "loss": 3.9120155334472657,
+      "step": 1250
+    },
+    {
+      "epoch": 1.389302453818583,
+      "grad_norm": 1.9220951795578003,
+      "learning_rate": 9.848891481913652e-05,
+      "loss": 3.900279235839844,
+      "step": 1260
+    },
+    {
+      "epoch": 1.4003308519437552,
+      "grad_norm": 2.013521194458008,
+      "learning_rate": 9.843057176196033e-05,
+      "loss": 3.9147193908691404,
+      "step": 1270
+    },
+    {
+      "epoch": 1.4113592500689274,
+      "grad_norm": 1.451686143875122,
+      "learning_rate": 9.837222870478413e-05,
+      "loss": 3.906220245361328,
+      "step": 1280
+    },
+    {
+      "epoch": 1.4223876481940998,
+      "grad_norm": 4.606860637664795,
+      "learning_rate": 9.831388564760794e-05,
+      "loss": 3.905352020263672,
+      "step": 1290
+    },
+    {
+      "epoch": 1.4334160463192722,
+      "grad_norm": 1.779123306274414,
+      "learning_rate": 9.825554259043175e-05,
+      "loss": 3.9137496948242188,
+      "step": 1300
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 2.086585521697998,
+      "learning_rate": 9.819719953325554e-05,
+      "loss": 3.89554443359375,
+      "step": 1310
+    },
+    {
+      "epoch": 1.4554728425696168,
+      "grad_norm": 3.3514609336853027,
+      "learning_rate": 9.813885647607935e-05,
+      "loss": 3.8901123046875,
+      "step": 1320
+    },
+    {
+      "epoch": 1.466501240694789,
+      "grad_norm": 2.1145269870758057,
+      "learning_rate": 9.808051341890316e-05,
+      "loss": 3.8892486572265623,
+      "step": 1330
+    },
+    {
+      "epoch": 1.4775296388199615,
+      "grad_norm": 1.5503329038619995,
+      "learning_rate": 9.802217036172697e-05,
+      "loss": 3.8922355651855467,
+      "step": 1340
+    },
+    {
+      "epoch": 1.4885580369451337,
+      "grad_norm": 2.3014304637908936,
+      "learning_rate": 9.796382730455076e-05,
+      "loss": 3.8860099792480467,
+      "step": 1350
+    },
+    {
+      "epoch": 1.499586435070306,
+      "grad_norm": 1.9633557796478271,
+      "learning_rate": 9.790548424737457e-05,
+      "loss": 3.875183868408203,
+      "step": 1360
+    },
+    {
+      "epoch": 1.5106148331954783,
+      "grad_norm": 2.228351593017578,
+      "learning_rate": 9.784714119019837e-05,
+      "loss": 3.8726768493652344,
+      "step": 1370
+    },
+    {
+      "epoch": 1.5216432313206507,
+      "grad_norm": 3.0888657569885254,
+      "learning_rate": 9.778879813302218e-05,
+      "loss": 3.872690963745117,
+      "step": 1380
+    },
+    {
+      "epoch": 1.5326716294458231,
+      "grad_norm": 2.0078868865966797,
+      "learning_rate": 9.773045507584599e-05,
+      "loss": 3.8612388610839843,
+      "step": 1390
+    },
+    {
+      "epoch": 1.5437000275709953,
+      "grad_norm": 2.1966569423675537,
+      "learning_rate": 9.767211201866978e-05,
+      "loss": 3.8649852752685545,
+      "step": 1400
+    },
+    {
+      "epoch": 1.5547284256961675,
+      "grad_norm": 2.1047487258911133,
+      "learning_rate": 9.761376896149359e-05,
+      "loss": 3.8632328033447267,
+      "step": 1410
+    },
+    {
+      "epoch": 1.56575682382134,
+      "grad_norm": 1.9347233772277832,
+      "learning_rate": 9.755542590431739e-05,
+      "loss": 3.8362571716308596,
+      "step": 1420
+    },
+    {
+      "epoch": 1.5767852219465124,
+      "grad_norm": 1.7961437702178955,
+      "learning_rate": 9.74970828471412e-05,
+      "loss": 3.8461585998535157,
+      "step": 1430
+    },
+    {
+      "epoch": 1.5878136200716846,
+      "grad_norm": 2.4657342433929443,
+      "learning_rate": 9.743873978996499e-05,
+      "loss": 3.842551040649414,
+      "step": 1440
+    },
+    {
+      "epoch": 1.5988420181968568,
+      "grad_norm": 2.043138027191162,
+      "learning_rate": 9.73803967327888e-05,
+      "loss": 3.8387855529785155,
+      "step": 1450
+    },
+    {
+      "epoch": 1.6098704163220292,
+      "grad_norm": 3.732532262802124,
+      "learning_rate": 9.732205367561261e-05,
+      "loss": 3.8399681091308593,
+      "step": 1460
+    },
+    {
+      "epoch": 1.6208988144472016,
+      "grad_norm": 2.43684720993042,
+      "learning_rate": 9.726371061843642e-05,
+      "loss": 3.8324966430664062,
+      "step": 1470
+    },
+    {
+      "epoch": 1.6319272125723738,
+      "grad_norm": 2.4433460235595703,
+      "learning_rate": 9.720536756126023e-05,
+      "loss": 3.817783737182617,
+      "step": 1480
+    },
+    {
+      "epoch": 1.642955610697546,
+      "grad_norm": 2.1049606800079346,
+      "learning_rate": 9.714702450408402e-05,
+      "loss": 3.804280090332031,
+      "step": 1490
+    },
+    {
+      "epoch": 1.6539840088227185,
+      "grad_norm": 3.529686450958252,
+      "learning_rate": 9.708868144690783e-05,
+      "loss": 3.805449295043945,
+      "step": 1500
+    },
+    {
+      "epoch": 1.6650124069478909,
+      "grad_norm": 2.0984089374542236,
+      "learning_rate": 9.703033838973162e-05,
+      "loss": 3.788246917724609,
+      "step": 1510
+    },
+    {
+      "epoch": 1.6760408050730633,
+      "grad_norm": 1.9434291124343872,
+      "learning_rate": 9.697199533255543e-05,
+      "loss": 3.7875442504882812,
+      "step": 1520
+    },
+    {
+      "epoch": 1.6870692031982355,
+      "grad_norm": 1.99173903465271,
+      "learning_rate": 9.691365227537923e-05,
+      "loss": 3.7807193756103517,
+      "step": 1530
+    },
+    {
+      "epoch": 1.6980976013234077,
+      "grad_norm": 2.5006911754608154,
+      "learning_rate": 9.685530921820304e-05,
+      "loss": 3.744763946533203,
+      "step": 1540
+    },
+    {
+      "epoch": 1.7091259994485801,
+      "grad_norm": 2.1816165447235107,
+      "learning_rate": 9.679696616102685e-05,
+      "loss": 3.760245513916016,
+      "step": 1550
+    },
+    {
+      "epoch": 1.7201543975737525,
+      "grad_norm": 2.123291492462158,
+      "learning_rate": 9.673862310385064e-05,
+      "loss": 3.738916778564453,
+      "step": 1560
+    },
+    {
+      "epoch": 1.7311827956989247,
+      "grad_norm": 2.378187894821167,
+      "learning_rate": 9.668028004667445e-05,
+      "loss": 3.734139251708984,
+      "step": 1570
+    },
+    {
+      "epoch": 1.742211193824097,
+      "grad_norm": 2.54819393157959,
+      "learning_rate": 9.662193698949825e-05,
+      "loss": 3.715302276611328,
+      "step": 1580
+    },
+    {
+      "epoch": 1.7532395919492694,
+      "grad_norm": 4.285822868347168,
+      "learning_rate": 9.656359393232206e-05,
+      "loss": 3.72213134765625,
+      "step": 1590
+    },
+    {
+      "epoch": 1.7642679900744418,
+      "grad_norm": 1.8676700592041016,
+      "learning_rate": 9.650525087514586e-05,
+      "loss": 3.7252479553222657,
+      "step": 1600
+    },
+    {
+      "epoch": 1.775296388199614,
+      "grad_norm": 1.6977792978286743,
+      "learning_rate": 9.644690781796967e-05,
+      "loss": 3.704994964599609,
+      "step": 1610
+    },
+    {
+      "epoch": 1.7863247863247862,
+      "grad_norm": 1.8334232568740845,
+      "learning_rate": 9.638856476079347e-05,
+      "loss": 3.6980815887451173,
+      "step": 1620
+    },
+    {
+      "epoch": 1.7973531844499586,
+      "grad_norm": 2.6574559211730957,
+      "learning_rate": 9.633022170361728e-05,
+      "loss": 3.683759307861328,
+      "step": 1630
+    },
+    {
+      "epoch": 1.808381582575131,
+      "grad_norm": 2.085084915161133,
+      "learning_rate": 9.627187864644109e-05,
+      "loss": 3.67755126953125,
+      "step": 1640
+    },
+    {
+      "epoch": 1.8194099807003032,
+      "grad_norm": 1.685441017150879,
+      "learning_rate": 9.621353558926488e-05,
+      "loss": 3.656099319458008,
+      "step": 1650
+    },
+    {
+      "epoch": 1.8304383788254754,
+      "grad_norm": 2.4462475776672363,
+      "learning_rate": 9.615519253208869e-05,
+      "loss": 3.668656921386719,
+      "step": 1660
+    },
+    {
+      "epoch": 1.8414667769506479,
+      "grad_norm": 1.54155433177948,
+      "learning_rate": 9.609684947491249e-05,
+      "loss": 3.66968994140625,
+      "step": 1670
+    },
+    {
+      "epoch": 1.8524951750758203,
+      "grad_norm": 3.862130880355835,
+      "learning_rate": 9.60385064177363e-05,
+      "loss": 3.6412506103515625,
+      "step": 1680
+    },
+    {
+      "epoch": 1.8635235732009927,
+      "grad_norm": 1.7317070960998535,
+      "learning_rate": 9.598016336056009e-05,
+      "loss": 3.639806365966797,
+      "step": 1690
+    },
+    {
+      "epoch": 1.874551971326165,
+      "grad_norm": 2.2640931606292725,
+      "learning_rate": 9.59218203033839e-05,
+      "loss": 3.6341064453125,
+      "step": 1700
+    },
+    {
+      "epoch": 1.8855803694513371,
+      "grad_norm": 3.653146743774414,
+      "learning_rate": 9.586347724620771e-05,
+      "loss": 3.6380882263183594,
+      "step": 1710
+    },
+    {
+      "epoch": 1.8966087675765095,
+      "grad_norm": 1.8987306356430054,
+      "learning_rate": 9.58051341890315e-05,
+      "loss": 3.6405975341796877,
+      "step": 1720
+    },
+    {
+      "epoch": 1.907637165701682,
+      "grad_norm": 2.202659845352173,
+      "learning_rate": 9.574679113185531e-05,
+      "loss": 3.6375991821289064,
+      "step": 1730
+    },
+    {
+      "epoch": 1.9186655638268542,
+      "grad_norm": 1.5091872215270996,
+      "learning_rate": 9.568844807467912e-05,
+      "loss": 3.6208465576171873,
+      "step": 1740
+    },
+    {
+      "epoch": 1.9296939619520264,
+      "grad_norm": 1.9811325073242188,
+      "learning_rate": 9.563010501750293e-05,
+      "loss": 3.600755310058594,
+      "step": 1750
+    },
+    {
+      "epoch": 1.9407223600771988,
+      "grad_norm": 3.184499979019165,
+      "learning_rate": 9.557176196032673e-05,
+      "loss": 3.6109405517578126,
+      "step": 1760
+    },
+    {
+      "epoch": 1.9517507582023712,
+      "grad_norm": 2.340125322341919,
+      "learning_rate": 9.551341890315054e-05,
+      "loss": 3.6129817962646484,
+      "step": 1770
+    },
+    {
+      "epoch": 1.9627791563275434,
+      "grad_norm": 1.7258495092391968,
+      "learning_rate": 9.545507584597433e-05,
+      "loss": 3.590809631347656,
+      "step": 1780
+    },
+    {
+      "epoch": 1.9738075544527156,
+      "grad_norm": 1.6129754781723022,
+      "learning_rate": 9.539673278879814e-05,
+      "loss": 3.5866302490234374,
+      "step": 1790
+    },
+    {
+      "epoch": 1.984835952577888,
+      "grad_norm": 2.7458667755126953,
+      "learning_rate": 9.533838973162195e-05,
+      "loss": 3.596644973754883,
+      "step": 1800
+    },
+    {
+      "epoch": 1.9958643507030605,
+      "grad_norm": 2.258280038833618,
+      "learning_rate": 9.528004667444574e-05,
+      "loss": 3.5881332397460937,
+      "step": 1810
+    },
+    {
+      "epoch": 2.0066170388751035,
+      "grad_norm": 2.1228580474853516,
+      "learning_rate": 9.522170361726955e-05,
+      "loss": 3.5709766387939452,
+      "step": 1820
+    },
+    {
+      "epoch": 2.017645437000276,
+      "grad_norm": 1.588876485824585,
+      "learning_rate": 9.516336056009335e-05,
+      "loss": 3.5627593994140625,
+      "step": 1830
+    },
+    {
+      "epoch": 2.028673835125448,
+      "grad_norm": 2.451474189758301,
+      "learning_rate": 9.510501750291716e-05,
+      "loss": 3.5535301208496093,
+      "step": 1840
+    },
+    {
+      "epoch": 2.0397022332506203,
+      "grad_norm": 2.0007503032684326,
+      "learning_rate": 9.504667444574095e-05,
+      "loss": 3.553875732421875,
+      "step": 1850
+    },
+    {
+      "epoch": 2.0507306313757927,
+      "grad_norm": 1.4410080909729004,
+      "learning_rate": 9.498833138856476e-05,
+      "loss": 3.550189971923828,
+      "step": 1860
+    },
+    {
+      "epoch": 2.061759029500965,
+      "grad_norm": 2.062835216522217,
+      "learning_rate": 9.492998833138857e-05,
+      "loss": 3.5456893920898436,
+      "step": 1870
+    },
+    {
+      "epoch": 2.072787427626137,
+      "grad_norm": 2.4534783363342285,
+      "learning_rate": 9.487164527421238e-05,
+      "loss": 3.536829376220703,
+      "step": 1880
+    },
+    {
+      "epoch": 2.0838158257513095,
+      "grad_norm": 2.2788970470428467,
+      "learning_rate": 9.481330221703619e-05,
+      "loss": 3.5525283813476562,
+      "step": 1890
+    },
+    {
+      "epoch": 2.094844223876482,
+      "grad_norm": 1.4259227514266968,
+      "learning_rate": 9.475495915985998e-05,
+      "loss": 3.5479995727539064,
+      "step": 1900
+    },
+    {
+      "epoch": 2.1058726220016544,
+      "grad_norm": 2.672534465789795,
+      "learning_rate": 9.469661610268379e-05,
+      "loss": 3.5359420776367188,
+      "step": 1910
+    },
+    {
+      "epoch": 2.116901020126827,
+      "grad_norm": 2.0648045539855957,
+      "learning_rate": 9.463827304550759e-05,
+      "loss": 3.5452896118164063,
+      "step": 1920
+    },
+    {
+      "epoch": 2.1279294182519988,
+      "grad_norm": 1.6846543550491333,
+      "learning_rate": 9.45799299883314e-05,
+      "loss": 3.5434345245361327,
+      "step": 1930
+    },
+    {
+      "epoch": 2.138957816377171,
+      "grad_norm": 1.9105942249298096,
+      "learning_rate": 9.452158693115519e-05,
+      "loss": 3.5351535797119142,
+      "step": 1940
+    },
+    {
+      "epoch": 2.1499862145023436,
+      "grad_norm": 1.8230890035629272,
+      "learning_rate": 9.4463243873979e-05,
+      "loss": 3.5190963745117188,
+      "step": 1950
+    },
+    {
+      "epoch": 2.161014612627516,
+      "grad_norm": 1.6383274793624878,
+      "learning_rate": 9.440490081680281e-05,
+      "loss": 3.5228431701660154,
+      "step": 1960
+    },
+    {
+      "epoch": 2.172043010752688,
+      "grad_norm": 1.7378439903259277,
+      "learning_rate": 9.43465577596266e-05,
+      "loss": 3.520981216430664,
+      "step": 1970
+    },
+    {
+      "epoch": 2.1830714088778604,
+      "grad_norm": 1.941454529762268,
+      "learning_rate": 9.428821470245041e-05,
+      "loss": 3.519342803955078,
+      "step": 1980
+    },
+    {
+      "epoch": 2.194099807003033,
+      "grad_norm": 1.8295516967773438,
+      "learning_rate": 9.422987164527421e-05,
+      "loss": 3.5412979125976562,
+      "step": 1990
+    },
+    {
+      "epoch": 2.2051282051282053,
+      "grad_norm": 1.8052620887756348,
+      "learning_rate": 9.417152858809802e-05,
+      "loss": 3.5153289794921876,
+      "step": 2000
+    },
+    {
+      "epoch": 2.2161566032533773,
+      "grad_norm": 2.1949570178985596,
+      "learning_rate": 9.411318553092183e-05,
+      "loss": 3.521608352661133,
+      "step": 2010
+    },
+    {
+      "epoch": 2.2271850013785497,
+      "grad_norm": 1.746172308921814,
+      "learning_rate": 9.405484247374564e-05,
+      "loss": 3.5008296966552734,
+      "step": 2020
+    },
+    {
+      "epoch": 2.238213399503722,
+      "grad_norm": 2.5374276638031006,
+      "learning_rate": 9.399649941656943e-05,
+      "loss": 3.5140228271484375,
+      "step": 2030
+    },
+    {
+      "epoch": 2.2492417976288945,
+      "grad_norm": 1.7763218879699707,
+      "learning_rate": 9.393815635939324e-05,
+      "loss": 3.510652542114258,
+      "step": 2040
+    },
+    {
+      "epoch": 2.2602701957540665,
+      "grad_norm": 1.6599587202072144,
+      "learning_rate": 9.387981330221705e-05,
+      "loss": 3.5122325897216795,
+      "step": 2050
+    },
+    {
+      "epoch": 2.271298593879239,
+      "grad_norm": 2.1496078968048096,
+      "learning_rate": 9.382147024504085e-05,
+      "loss": 3.5139747619628907,
+      "step": 2060
+    },
+    {
+      "epoch": 2.2823269920044114,
+      "grad_norm": 1.64266836643219,
+      "learning_rate": 9.376312718786465e-05,
+      "loss": 3.507743072509766,
+      "step": 2070
+    },
+    {
+      "epoch": 2.293355390129584,
+      "grad_norm": 2.1241567134857178,
+      "learning_rate": 9.370478413068845e-05,
+      "loss": 3.5162708282470705,
+      "step": 2080
+    },
+    {
+      "epoch": 2.304383788254756,
+      "grad_norm": 1.8391071557998657,
+      "learning_rate": 9.364644107351226e-05,
+      "loss": 3.4955375671386717,
+      "step": 2090
+    },
+    {
+      "epoch": 2.315412186379928,
+      "grad_norm": 2.7478973865509033,
+      "learning_rate": 9.358809801633605e-05,
+      "loss": 3.497519302368164,
+      "step": 2100
+    },
+    {
+      "epoch": 2.3264405845051006,
+      "grad_norm": 1.938588261604309,
+      "learning_rate": 9.352975495915986e-05,
+      "loss": 3.490141677856445,
+      "step": 2110
+    },
+    {
+      "epoch": 2.337468982630273,
+      "grad_norm": 1.5637104511260986,
+      "learning_rate": 9.347141190198366e-05,
+      "loss": 3.499908447265625,
+      "step": 2120
+    },
+    {
+      "epoch": 2.3484973807554455,
+      "grad_norm": 1.882504940032959,
+      "learning_rate": 9.341306884480747e-05,
+      "loss": 3.491979217529297,
+      "step": 2130
+    },
+    {
+      "epoch": 2.3595257788806174,
+      "grad_norm": 1.8528521060943604,
+      "learning_rate": 9.335472578763128e-05,
+      "loss": 3.4961143493652345,
+      "step": 2140
+    },
+    {
+      "epoch": 2.37055417700579,
+      "grad_norm": 1.8050177097320557,
+      "learning_rate": 9.329638273045509e-05,
+      "loss": 3.4948150634765627,
+      "step": 2150
+    },
+    {
+      "epoch": 2.3815825751309623,
+      "grad_norm": 1.816784381866455,
+      "learning_rate": 9.32380396732789e-05,
+      "loss": 3.4910873413085937,
+      "step": 2160
+    },
+    {
+      "epoch": 2.3926109732561347,
+      "grad_norm": 1.9779244661331177,
+      "learning_rate": 9.317969661610269e-05,
+      "loss": 3.492570495605469,
+      "step": 2170
+    },
+    {
+      "epoch": 2.4036393713813067,
+      "grad_norm": 1.8939772844314575,
+      "learning_rate": 9.31213535589265e-05,
+      "loss": 3.473868560791016,
+      "step": 2180
+    },
+    {
+      "epoch": 2.414667769506479,
+      "grad_norm": 2.1493656635284424,
+      "learning_rate": 9.30630105017503e-05,
+      "loss": 3.494515228271484,
+      "step": 2190
+    },
+    {
+      "epoch": 2.4256961676316515,
+      "grad_norm": 1.8989397287368774,
+      "learning_rate": 9.30046674445741e-05,
+      "loss": 3.487537384033203,
+      "step": 2200
+    },
+    {
+      "epoch": 2.436724565756824,
+      "grad_norm": 1.881856918334961,
+      "learning_rate": 9.294632438739791e-05,
+      "loss": 3.475904083251953,
+      "step": 2210
+    },
+    {
+      "epoch": 2.447752963881996,
+      "grad_norm": 1.9463883638381958,
+      "learning_rate": 9.288798133022171e-05,
+      "loss": 3.4829254150390625,
+      "step": 2220
+    },
+    {
+      "epoch": 2.4587813620071683,
+      "grad_norm": 2.01379656791687,
+      "learning_rate": 9.282963827304552e-05,
+      "loss": 3.472850036621094,
+      "step": 2230
+    },
+    {
+      "epoch": 2.4698097601323408,
+      "grad_norm": 2.442741632461548,
+      "learning_rate": 9.277129521586931e-05,
+      "loss": 3.47030029296875,
+      "step": 2240
+    },
+    {
+      "epoch": 2.480838158257513,
+      "grad_norm": 1.5051734447479248,
+      "learning_rate": 9.271295215869312e-05,
+      "loss": 3.489413833618164,
+      "step": 2250
+    },
+    {
+      "epoch": 2.4918665563826856,
+      "grad_norm": 1.9489309787750244,
+      "learning_rate": 9.265460910151692e-05,
+      "loss": 3.464769744873047,
+      "step": 2260
+    },
+    {
+      "epoch": 2.5028949545078576,
+      "grad_norm": 2.319654941558838,
+      "learning_rate": 9.259626604434072e-05,
+      "loss": 3.469140625,
+      "step": 2270
+    },
+    {
+      "epoch": 2.51392335263303,
+      "grad_norm": 1.7984129190444946,
+      "learning_rate": 9.253792298716453e-05,
+      "loss": 3.466594696044922,
+      "step": 2280
+    },
+    {
+      "epoch": 2.5249517507582024,
+      "grad_norm": 1.640869379043579,
+      "learning_rate": 9.247957992998833e-05,
+      "loss": 3.463022994995117,
+      "step": 2290
+    },
+    {
+      "epoch": 2.5359801488833744,
+      "grad_norm": 1.6698195934295654,
+      "learning_rate": 9.242123687281214e-05,
+      "loss": 3.4695220947265626,
+      "step": 2300
+    },
+    {
+      "epoch": 2.547008547008547,
+      "grad_norm": 2.2945683002471924,
+      "learning_rate": 9.236289381563595e-05,
+      "loss": 3.469150924682617,
+      "step": 2310
+    },
+    {
+      "epoch": 2.5580369451337193,
+      "grad_norm": 1.7678370475769043,
+      "learning_rate": 9.230455075845976e-05,
+      "loss": 3.470307159423828,
+      "step": 2320
+    },
+    {
+      "epoch": 2.5690653432588917,
+      "grad_norm": 1.8386255502700806,
+      "learning_rate": 9.224620770128355e-05,
+      "loss": 3.4638832092285154,
+      "step": 2330
+    },
+    {
+      "epoch": 2.580093741384064,
+      "grad_norm": 2.0348527431488037,
+      "learning_rate": 9.218786464410736e-05,
+      "loss": 3.460480880737305,
+      "step": 2340
+    },
+    {
+      "epoch": 2.5911221395092365,
+      "grad_norm": 1.845974326133728,
+      "learning_rate": 9.212952158693116e-05,
+      "loss": 3.4529083251953123,
+      "step": 2350
+    },
+    {
+      "epoch": 2.6021505376344085,
+      "grad_norm": 2.0843095779418945,
+      "learning_rate": 9.207117852975496e-05,
+      "loss": 3.4576786041259764,
+      "step": 2360
+    },
+    {
+      "epoch": 2.613178935759581,
+      "grad_norm": 1.7627031803131104,
+      "learning_rate": 9.201283547257876e-05,
+      "loss": 3.4450752258300783,
+      "step": 2370
+    },
+    {
+      "epoch": 2.6242073338847534,
+      "grad_norm": 1.371972918510437,
+      "learning_rate": 9.195449241540257e-05,
+      "loss": 3.464734649658203,
+      "step": 2380
+    },
+    {
+      "epoch": 2.6352357320099253,
+      "grad_norm": 1.6781940460205078,
+      "learning_rate": 9.189614935822638e-05,
+      "loss": 3.444991683959961,
+      "step": 2390
+    },
+    {
+      "epoch": 2.6462641301350978,
+      "grad_norm": 1.8782585859298706,
+      "learning_rate": 9.183780630105017e-05,
+      "loss": 3.4558509826660155,
+      "step": 2400
+    },
+    {
+      "epoch": 2.65729252826027,
+      "grad_norm": 1.942812204360962,
+      "learning_rate": 9.177946324387398e-05,
+      "loss": 3.4555503845214846,
+      "step": 2410
+    },
+    {
+      "epoch": 2.6683209263854426,
+      "grad_norm": 1.404680609703064,
+      "learning_rate": 9.172112018669778e-05,
+      "loss": 3.438182830810547,
+      "step": 2420
+    },
+    {
+      "epoch": 2.679349324510615,
+      "grad_norm": 1.7656677961349487,
+      "learning_rate": 9.166277712952159e-05,
+      "loss": 3.4622947692871096,
+      "step": 2430
+    },
+    {
+      "epoch": 2.690377722635787,
+      "grad_norm": 1.8348901271820068,
+      "learning_rate": 9.16044340723454e-05,
+      "loss": 3.438182830810547,
+      "step": 2440
+    },
+    {
+      "epoch": 2.7014061207609594,
+      "grad_norm": 2.0641167163848877,
+      "learning_rate": 9.15460910151692e-05,
+      "loss": 3.441473388671875,
+      "step": 2450
+    },
+    {
+      "epoch": 2.712434518886132,
+      "grad_norm": 1.726035475730896,
+      "learning_rate": 9.148774795799301e-05,
+      "loss": 3.441991424560547,
+      "step": 2460
+    },
+    {
+      "epoch": 2.7234629170113043,
+      "grad_norm": 1.854658603668213,
+      "learning_rate": 9.142940490081681e-05,
+      "loss": 3.4441551208496093,
+      "step": 2470
+    },
+    {
+      "epoch": 2.7344913151364763,
+      "grad_norm": 1.8229296207427979,
+      "learning_rate": 9.137106184364062e-05,
+      "loss": 3.441034698486328,
+      "step": 2480
+    },
+    {
+      "epoch": 2.7455197132616487,
+      "grad_norm": 1.6627975702285767,
+      "learning_rate": 9.131271878646441e-05,
+      "loss": 3.4399124145507813,
+      "step": 2490
+    },
+    {
+      "epoch": 2.756548111386821,
+      "grad_norm": 1.4111251831054688,
+      "learning_rate": 9.125437572928822e-05,
+      "loss": 3.4374462127685548,
+      "step": 2500
+    },
+    {
+      "epoch": 2.7675765095119935,
+      "grad_norm": 2.015869379043579,
+      "learning_rate": 9.119603267211202e-05,
+      "loss": 3.4262016296386717,
+      "step": 2510
+    },
+    {
+      "epoch": 2.778604907637166,
+      "grad_norm": 2.2818591594696045,
+      "learning_rate": 9.113768961493583e-05,
+      "loss": 3.446285629272461,
+      "step": 2520
+    },
+    {
+      "epoch": 2.789633305762338,
+      "grad_norm": 1.8643262386322021,
+      "learning_rate": 9.107934655775962e-05,
+      "loss": 3.4362293243408204,
+      "step": 2530
+    },
+    {
+      "epoch": 2.8006617038875103,
+      "grad_norm": 1.248988151550293,
+      "learning_rate": 9.102100350058343e-05,
+      "loss": 3.441702651977539,
+      "step": 2540
+    },
+    {
+      "epoch": 2.8116901020126828,
+      "grad_norm": 1.5247464179992676,
+      "learning_rate": 9.096266044340724e-05,
+      "loss": 3.4388256072998047,
+      "step": 2550
+    },
+    {
+      "epoch": 2.8227185001378547,
+      "grad_norm": 1.9120620489120483,
+      "learning_rate": 9.090431738623103e-05,
+      "loss": 3.4206756591796874,
+      "step": 2560
+    },
+    {
+      "epoch": 2.833746898263027,
+      "grad_norm": 1.4591054916381836,
+      "learning_rate": 9.084597432905484e-05,
+      "loss": 3.4229709625244142,
+      "step": 2570
+    },
+    {
+      "epoch": 2.8447752963881996,
+      "grad_norm": 2.24849796295166,
+      "learning_rate": 9.078763127187865e-05,
+      "loss": 3.426911163330078,
+      "step": 2580
+    },
+    {
+      "epoch": 2.855803694513372,
+      "grad_norm": 1.5658804178237915,
+      "learning_rate": 9.072928821470246e-05,
+      "loss": 3.445120620727539,
+      "step": 2590
+    },
+    {
+      "epoch": 2.8668320926385444,
+      "grad_norm": 1.483583688735962,
+      "learning_rate": 9.067094515752626e-05,
+      "loss": 3.430312728881836,
+      "step": 2600
+    },
+    {
+      "epoch": 2.8778604907637164,
+      "grad_norm": 1.5759658813476562,
+      "learning_rate": 9.061260210035007e-05,
+      "loss": 3.4178386688232423,
+      "step": 2610
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": 1.9259848594665527,
+      "learning_rate": 9.055425904317386e-05,
+      "loss": 3.430949401855469,
+      "step": 2620
+    },
+    {
+      "epoch": 2.8999172870140613,
+      "grad_norm": 1.470717191696167,
+      "learning_rate": 9.049591598599767e-05,
+      "loss": 3.439757537841797,
+      "step": 2630
+    },
+    {
+      "epoch": 2.9109456851392337,
+      "grad_norm": 1.8934212923049927,
+      "learning_rate": 9.043757292882148e-05,
+      "loss": 3.430719757080078,
+      "step": 2640
+    },
+    {
+      "epoch": 2.9219740832644057,
+      "grad_norm": 1.6267489194869995,
+      "learning_rate": 9.037922987164527e-05,
+      "loss": 3.4224998474121096,
+      "step": 2650
+    },
+    {
+      "epoch": 2.933002481389578,
+      "grad_norm": 1.6213353872299194,
+      "learning_rate": 9.032088681446908e-05,
+      "loss": 3.4213233947753907,
+      "step": 2660
+    },
+    {
+      "epoch": 2.9440308795147505,
+      "grad_norm": 1.961879849433899,
+      "learning_rate": 9.026254375729288e-05,
+      "loss": 3.4108352661132812,
+      "step": 2670
+    },
+    {
+      "epoch": 2.955059277639923,
+      "grad_norm": 1.7363910675048828,
+      "learning_rate": 9.020420070011669e-05,
+      "loss": 3.423554229736328,
+      "step": 2680
+    },
+    {
+      "epoch": 2.9660876757650954,
+      "grad_norm": 1.6161952018737793,
+      "learning_rate": 9.014585764294048e-05,
+      "loss": 3.418962860107422,
+      "step": 2690
+    },
+    {
+      "epoch": 2.9771160738902673,
+      "grad_norm": 1.8065682649612427,
+      "learning_rate": 9.008751458576429e-05,
+      "loss": 3.4218765258789063,
+      "step": 2700
+    },
+    {
+      "epoch": 2.9881444720154398,
+      "grad_norm": 1.4285337924957275,
+      "learning_rate": 9.00291715285881e-05,
+      "loss": 3.413957214355469,
+      "step": 2710
+    },
+    {
+      "epoch": 2.999172870140612,
+      "grad_norm": 1.30274498462677,
+      "learning_rate": 8.997082847141191e-05,
+      "loss": 3.4176124572753905,
+      "step": 2720
+    },
+    {
+      "epoch": 3.009925558312655,
+      "grad_norm": 1.5460416078567505,
+      "learning_rate": 8.991248541423572e-05,
+      "loss": 3.388013458251953,
+      "step": 2730
+    },
+    {
+      "epoch": 3.0209539564378276,
+      "grad_norm": 1.5832446813583374,
+      "learning_rate": 8.985414235705951e-05,
+      "loss": 3.3929378509521486,
+      "step": 2740
+    },
+    {
+      "epoch": 3.0319823545629996,
+      "grad_norm": 1.6086630821228027,
+      "learning_rate": 8.979579929988332e-05,
+      "loss": 3.3940502166748048,
+      "step": 2750
+    },
+    {
+      "epoch": 3.043010752688172,
+      "grad_norm": 1.6624842882156372,
+      "learning_rate": 8.973745624270712e-05,
+      "loss": 3.388884353637695,
+      "step": 2760
+    },
+    {
+      "epoch": 3.0540391508133444,
+      "grad_norm": 1.7352933883666992,
+      "learning_rate": 8.967911318553093e-05,
+      "loss": 3.409127426147461,
+      "step": 2770
+    },
+    {
+      "epoch": 3.065067548938517,
+      "grad_norm": 1.45657217502594,
+      "learning_rate": 8.962077012835472e-05,
+      "loss": 3.389351654052734,
+      "step": 2780
+    },
+    {
+      "epoch": 3.076095947063689,
+      "grad_norm": 1.4969090223312378,
+      "learning_rate": 8.956242707117853e-05,
+      "loss": 3.3988433837890626,
+      "step": 2790
+    },
+    {
+      "epoch": 3.0871243451888613,
+      "grad_norm": 1.710800051689148,
+      "learning_rate": 8.950408401400234e-05,
+      "loss": 3.395826721191406,
+      "step": 2800
+    },
+    {
+      "epoch": 3.0981527433140337,
+      "grad_norm": 1.6347870826721191,
+      "learning_rate": 8.944574095682614e-05,
+      "loss": 3.391011047363281,
+      "step": 2810
+    },
+    {
+      "epoch": 3.109181141439206,
+      "grad_norm": 1.4630122184753418,
+      "learning_rate": 8.938739789964995e-05,
+      "loss": 3.401841735839844,
+      "step": 2820
+    },
+    {
+      "epoch": 3.120209539564378,
+      "grad_norm": 1.547430157661438,
+      "learning_rate": 8.932905484247374e-05,
+      "loss": 3.3979782104492187,
+      "step": 2830
+    },
+    {
+      "epoch": 3.1312379376895505,
+      "grad_norm": 1.5614186525344849,
+      "learning_rate": 8.927071178529755e-05,
+      "loss": 3.3884544372558594,
+      "step": 2840
+    },
+    {
+      "epoch": 3.142266335814723,
+      "grad_norm": 1.4073251485824585,
+      "learning_rate": 8.921236872812136e-05,
+      "loss": 3.3886154174804686,
+      "step": 2850
+    },
+    {
+      "epoch": 3.1532947339398953,
+      "grad_norm": 1.3639475107192993,
+      "learning_rate": 8.915402567094517e-05,
+      "loss": 3.383074951171875,
+      "step": 2860
+    },
+    {
+      "epoch": 3.1643231320650678,
+      "grad_norm": 2.3929882049560547,
+      "learning_rate": 8.909568261376896e-05,
+      "loss": 3.3788246154785155,
+      "step": 2870
+    },
+    {
+      "epoch": 3.1753515301902397,
+      "grad_norm": 1.7196829319000244,
+      "learning_rate": 8.903733955659277e-05,
+      "loss": 3.3822708129882812,
+      "step": 2880
+    },
+    {
+      "epoch": 3.186379928315412,
+      "grad_norm": 1.526293396949768,
+      "learning_rate": 8.897899649941658e-05,
+      "loss": 3.381543731689453,
+      "step": 2890
+    },
+    {
+      "epoch": 3.1974083264405846,
+      "grad_norm": 1.2336128950119019,
+      "learning_rate": 8.892065344224038e-05,
+      "loss": 3.3975807189941407,
+      "step": 2900
+    },
+    {
+      "epoch": 3.208436724565757,
+      "grad_norm": 1.4868130683898926,
+      "learning_rate": 8.886231038506419e-05,
+      "loss": 3.3970687866210936,
+      "step": 2910
+    },
+    {
+      "epoch": 3.219465122690929,
+      "grad_norm": 1.5349540710449219,
+      "learning_rate": 8.880396732788798e-05,
+      "loss": 3.385994720458984,
+      "step": 2920
+    },
+    {
+      "epoch": 3.2304935208161014,
+      "grad_norm": 1.5333718061447144,
+      "learning_rate": 8.874562427071179e-05,
+      "loss": 3.362841796875,
+      "step": 2930
+    },
+    {
+      "epoch": 3.241521918941274,
+      "grad_norm": 1.514235258102417,
+      "learning_rate": 8.868728121353558e-05,
+      "loss": 3.3816680908203125,
+      "step": 2940
+    },
+    {
+      "epoch": 3.2525503170664463,
+      "grad_norm": 1.5870161056518555,
+      "learning_rate": 8.86289381563594e-05,
+      "loss": 3.3818199157714846,
+      "step": 2950
+    },
+    {
+      "epoch": 3.2635787151916182,
+      "grad_norm": 1.6295320987701416,
+      "learning_rate": 8.85705950991832e-05,
+      "loss": 3.379594421386719,
+      "step": 2960
+    },
+    {
+      "epoch": 3.2746071133167907,
+      "grad_norm": 1.533991813659668,
+      "learning_rate": 8.8512252042007e-05,
+      "loss": 3.387801742553711,
+      "step": 2970
+    },
+    {
+      "epoch": 3.285635511441963,
+      "grad_norm": 2.2125084400177,
+      "learning_rate": 8.845390898483081e-05,
+      "loss": 3.3856468200683594,
+      "step": 2980
+    },
+    {
+      "epoch": 3.2966639095671355,
+      "grad_norm": 1.800207495689392,
+      "learning_rate": 8.839556592765462e-05,
+      "loss": 3.3843597412109374,
+      "step": 2990
+    },
+    {
+      "epoch": 3.3076923076923075,
+      "grad_norm": 1.3071027994155884,
+      "learning_rate": 8.833722287047842e-05,
+      "loss": 3.3861888885498046,
+      "step": 3000
+    },
+    {
+      "epoch": 3.31872070581748,
+      "grad_norm": 1.7724641561508179,
+      "learning_rate": 8.827887981330222e-05,
+      "loss": 3.3929458618164063,
+      "step": 3010
+    },
+    {
+      "epoch": 3.3297491039426523,
+      "grad_norm": 1.3397877216339111,
+      "learning_rate": 8.822053675612603e-05,
+      "loss": 3.3785301208496095,
+      "step": 3020
+    },
+    {
+      "epoch": 3.3407775020678248,
+      "grad_norm": 1.352630376815796,
+      "learning_rate": 8.816219369894982e-05,
+      "loss": 3.3796306610107423,
+      "step": 3030
+    },
+    {
+      "epoch": 3.351805900192997,
+      "grad_norm": 1.5996475219726562,
+      "learning_rate": 8.810385064177363e-05,
+      "loss": 3.362406921386719,
+      "step": 3040
+    },
+    {
+      "epoch": 3.362834298318169,
+      "grad_norm": 1.6010814905166626,
+      "learning_rate": 8.804550758459744e-05,
+      "loss": 3.3811767578125,
+      "step": 3050
+    },
+    {
+      "epoch": 3.3738626964433416,
+      "grad_norm": 1.3276373147964478,
+      "learning_rate": 8.798716452742124e-05,
+      "loss": 3.3732643127441406,
+      "step": 3060
+    },
+    {
+      "epoch": 3.384891094568514,
+      "grad_norm": 1.7741515636444092,
+      "learning_rate": 8.792882147024505e-05,
+      "loss": 3.381968688964844,
+      "step": 3070
+    },
+    {
+      "epoch": 3.3959194926936864,
+      "grad_norm": 1.7820576429367065,
+      "learning_rate": 8.787047841306884e-05,
+      "loss": 3.358811950683594,
+      "step": 3080
+    },
+    {
+      "epoch": 3.4069478908188584,
+      "grad_norm": 1.389573574066162,
+      "learning_rate": 8.781213535589265e-05,
+      "loss": 3.36102180480957,
+      "step": 3090
+    },
+    {
+      "epoch": 3.417976288944031,
+      "grad_norm": 1.1910648345947266,
+      "learning_rate": 8.775379229871645e-05,
+      "loss": 3.3652645111083985,
+      "step": 3100
+    },
+    {
+      "epoch": 3.4290046870692032,
+      "grad_norm": 1.965219497680664,
+      "learning_rate": 8.769544924154026e-05,
+      "loss": 3.3735313415527344,
+      "step": 3110
+    },
+    {
+      "epoch": 3.4400330851943757,
+      "grad_norm": 1.5992330312728882,
+      "learning_rate": 8.763710618436406e-05,
+      "loss": 3.362974166870117,
+      "step": 3120
+    },
+    {
+      "epoch": 3.4510614833195477,
+      "grad_norm": 2.2293193340301514,
+      "learning_rate": 8.757876312718787e-05,
+      "loss": 3.3681709289550783,
+      "step": 3130
+    },
+    {
+      "epoch": 3.46208988144472,
+      "grad_norm": 1.2978801727294922,
+      "learning_rate": 8.752042007001168e-05,
+      "loss": 3.3776336669921876,
+      "step": 3140
+    },
+    {
+      "epoch": 3.4731182795698925,
+      "grad_norm": 1.227036714553833,
+      "learning_rate": 8.746207701283548e-05,
+      "loss": 3.3590301513671874,
+      "step": 3150
+    },
+    {
+      "epoch": 3.484146677695065,
+      "grad_norm": 1.8023360967636108,
+      "learning_rate": 8.740373395565929e-05,
+      "loss": 3.35421142578125,
+      "step": 3160
+    },
+    {
+      "epoch": 3.495175075820237,
+      "grad_norm": 1.6423453092575073,
+      "learning_rate": 8.734539089848308e-05,
+      "loss": 3.3748985290527345,
+      "step": 3170
+    },
+    {
+      "epoch": 3.5062034739454093,
+      "grad_norm": 1.3261916637420654,
+      "learning_rate": 8.728704784130689e-05,
+      "loss": 3.36380615234375,
+      "step": 3180
+    },
+    {
+      "epoch": 3.5172318720705817,
+      "grad_norm": 1.290014624595642,
+      "learning_rate": 8.722870478413069e-05,
+      "loss": 3.3596282958984376,
+      "step": 3190
+    },
+    {
+      "epoch": 3.528260270195754,
+      "grad_norm": 2.0481576919555664,
+      "learning_rate": 8.71703617269545e-05,
+      "loss": 3.358118438720703,
+      "step": 3200
+    },
+    {
+      "epoch": 3.5392886683209266,
+      "grad_norm": 1.4758331775665283,
+      "learning_rate": 8.71120186697783e-05,
+      "loss": 3.3536834716796875,
+      "step": 3210
+    },
+    {
+      "epoch": 3.5503170664460986,
+      "grad_norm": 1.4340440034866333,
+      "learning_rate": 8.70536756126021e-05,
+      "loss": 3.358259582519531,
+      "step": 3220
+    },
+    {
+      "epoch": 3.561345464571271,
+      "grad_norm": 1.6952699422836304,
+      "learning_rate": 8.699533255542591e-05,
+      "loss": 3.3730777740478515,
+      "step": 3230
+    },
+    {
+      "epoch": 3.5723738626964434,
+      "grad_norm": 1.9069234132766724,
+      "learning_rate": 8.69369894982497e-05,
+      "loss": 3.3552001953125,
+      "step": 3240
+    },
+    {
+      "epoch": 3.5834022608216154,
+      "grad_norm": 1.6194590330123901,
+      "learning_rate": 8.687864644107351e-05,
+      "loss": 3.3562744140625,
+      "step": 3250
+    },
+    {
+      "epoch": 3.594430658946788,
+      "grad_norm": 1.33975350856781,
+      "learning_rate": 8.682030338389732e-05,
+      "loss": 3.3622581481933596,
+      "step": 3260
+    },
+    {
+      "epoch": 3.6054590570719602,
+      "grad_norm": 1.3948160409927368,
+      "learning_rate": 8.676196032672113e-05,
+      "loss": 3.3645614624023437,
+      "step": 3270
+    },
+    {
+      "epoch": 3.6164874551971327,
+      "grad_norm": 1.4972363710403442,
+      "learning_rate": 8.670361726954493e-05,
+      "loss": 3.3713829040527346,
+      "step": 3280
+    },
+    {
+      "epoch": 3.627515853322305,
+      "grad_norm": 1.9456968307495117,
+      "learning_rate": 8.664527421236874e-05,
+      "loss": 3.3617935180664062,
+      "step": 3290
+    },
+    {
+      "epoch": 3.6385442514474775,
+      "grad_norm": 1.8050702810287476,
+      "learning_rate": 8.658693115519254e-05,
+      "loss": 3.359496307373047,
+      "step": 3300
+    },
+    {
+      "epoch": 3.6495726495726495,
+      "grad_norm": 1.294492244720459,
+      "learning_rate": 8.652858809801634e-05,
+      "loss": 3.361173629760742,
+      "step": 3310
+    },
+    {
+      "epoch": 3.660601047697822,
+      "grad_norm": 1.7897614240646362,
+      "learning_rate": 8.647024504084015e-05,
+      "loss": 3.3475852966308595,
+      "step": 3320
+    },
+    {
+      "epoch": 3.6716294458229943,
+      "grad_norm": 1.5647767782211304,
+      "learning_rate": 8.641190198366394e-05,
+      "loss": 3.3594207763671875,
+      "step": 3330
+    },
+    {
+      "epoch": 3.6826578439481663,
+      "grad_norm": 1.3839472532272339,
+      "learning_rate": 8.635355892648775e-05,
+      "loss": 3.361709976196289,
+      "step": 3340
+    },
+    {
+      "epoch": 3.6936862420733387,
+      "grad_norm": 1.543115258216858,
+      "learning_rate": 8.629521586931155e-05,
+      "loss": 3.349272918701172,
+      "step": 3350
+    },
+    {
+      "epoch": 3.704714640198511,
+      "grad_norm": 1.2722103595733643,
+      "learning_rate": 8.623687281213536e-05,
+      "loss": 3.3600040435791017,
+      "step": 3360
+    },
+    {
+      "epoch": 3.7157430383236836,
+      "grad_norm": 2.396493434906006,
+      "learning_rate": 8.617852975495915e-05,
+      "loss": 3.359762954711914,
+      "step": 3370
+    },
+    {
+      "epoch": 3.726771436448856,
+      "grad_norm": 1.3756037950515747,
+      "learning_rate": 8.612018669778296e-05,
+      "loss": 3.3409027099609374,
+      "step": 3380
+    },
+    {
+      "epoch": 3.737799834574028,
+      "grad_norm": 1.5124824047088623,
+      "learning_rate": 8.606184364060677e-05,
+      "loss": 3.346342849731445,
+      "step": 3390
+    },
+    {
+      "epoch": 3.7488282326992004,
+      "grad_norm": 1.3679585456848145,
+      "learning_rate": 8.600350058343058e-05,
+      "loss": 3.3478328704833986,
+      "step": 3400
+    },
+    {
+      "epoch": 3.759856630824373,
+      "grad_norm": 1.3470197916030884,
+      "learning_rate": 8.594515752625439e-05,
+      "loss": 3.352674865722656,
+      "step": 3410
+    },
+    {
+      "epoch": 3.770885028949545,
+      "grad_norm": 1.4775781631469727,
+      "learning_rate": 8.588681446907818e-05,
+      "loss": 3.3504791259765625,
+      "step": 3420
+    },
+    {
+      "epoch": 3.7819134270747172,
+      "grad_norm": 1.1987943649291992,
+      "learning_rate": 8.582847141190199e-05,
+      "loss": 3.3457687377929686,
+      "step": 3430
+    },
+    {
+      "epoch": 3.7929418251998896,
+      "grad_norm": 1.8007314205169678,
+      "learning_rate": 8.577012835472579e-05,
+      "loss": 3.3557716369628907,
+      "step": 3440
+    },
+    {
+      "epoch": 3.803970223325062,
+      "grad_norm": 1.4193800687789917,
+      "learning_rate": 8.57117852975496e-05,
+      "loss": 3.346666717529297,
+      "step": 3450
+    },
+    {
+      "epoch": 3.8149986214502345,
+      "grad_norm": 1.600216031074524,
+      "learning_rate": 8.56534422403734e-05,
+      "loss": 3.354322814941406,
+      "step": 3460
+    },
+    {
+      "epoch": 3.826027019575407,
+      "grad_norm": 1.6823015213012695,
+      "learning_rate": 8.55950991831972e-05,
+      "loss": 3.3344764709472656,
+      "step": 3470
+    },
+    {
+      "epoch": 3.837055417700579,
+      "grad_norm": 1.8002822399139404,
+      "learning_rate": 8.553675612602101e-05,
+      "loss": 3.338224411010742,
+      "step": 3480
+    },
+    {
+      "epoch": 3.8480838158257513,
+      "grad_norm": 1.019519567489624,
+      "learning_rate": 8.54784130688448e-05,
+      "loss": 3.342393493652344,
+      "step": 3490
+    },
+    {
+      "epoch": 3.8591122139509237,
+      "grad_norm": 1.4397176504135132,
+      "learning_rate": 8.542007001166861e-05,
+      "loss": 3.3416332244873046,
+      "step": 3500
+    },
+    {
+      "epoch": 3.8701406120760957,
+      "grad_norm": 1.398215889930725,
+      "learning_rate": 8.536172695449241e-05,
+      "loss": 3.3455711364746095,
+      "step": 3510
+    },
+    {
+      "epoch": 3.881169010201268,
+      "grad_norm": 1.431221604347229,
+      "learning_rate": 8.530338389731622e-05,
+      "loss": 3.3510116577148437,
+      "step": 3520
+    },
+    {
+      "epoch": 3.8921974083264406,
+      "grad_norm": 1.2339868545532227,
+      "learning_rate": 8.524504084014003e-05,
+      "loss": 3.333365631103516,
+      "step": 3530
+    },
+    {
+      "epoch": 3.903225806451613,
+      "grad_norm": 1.2564575672149658,
+      "learning_rate": 8.518669778296384e-05,
+      "loss": 3.355131912231445,
+      "step": 3540
+    },
+    {
+      "epoch": 3.9142542045767854,
+      "grad_norm": 1.44709050655365,
+      "learning_rate": 8.512835472578765e-05,
+      "loss": 3.352345275878906,
+      "step": 3550
+    },
+    {
+      "epoch": 3.9252826027019574,
+      "grad_norm": 1.0984286069869995,
+      "learning_rate": 8.507001166861144e-05,
+      "loss": 3.3399391174316406,
+      "step": 3560
+    },
+    {
+      "epoch": 3.93631100082713,
+      "grad_norm": 1.521567702293396,
+      "learning_rate": 8.501166861143525e-05,
+      "loss": 3.3333946228027345,
+      "step": 3570
+    },
+    {
+      "epoch": 3.9473393989523022,
+      "grad_norm": 1.3443926572799683,
+      "learning_rate": 8.495332555425905e-05,
+      "loss": 3.3321746826171874,
+      "step": 3580
+    },
+    {
+      "epoch": 3.9583677970774747,
+      "grad_norm": 1.539640188217163,
+      "learning_rate": 8.489498249708285e-05,
+      "loss": 3.335438537597656,
+      "step": 3590
+    },
+    {
+      "epoch": 3.9693961952026466,
+      "grad_norm": 1.123307466506958,
+      "learning_rate": 8.483663943990665e-05,
+      "loss": 3.3397190093994142,
+      "step": 3600
+    },
+    {
+      "epoch": 3.980424593327819,
+      "grad_norm": 1.6037691831588745,
+      "learning_rate": 8.477829638273046e-05,
+      "loss": 3.3357570648193358,
+      "step": 3610
+    },
+    {
+      "epoch": 3.9914529914529915,
+      "grad_norm": 1.6570971012115479,
+      "learning_rate": 8.471995332555425e-05,
+      "loss": 3.341298294067383,
+      "step": 3620
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 18140,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1444428795346944.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

output_qwen3_plain_ar/checkpoint-3628/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,760 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
@dataclass
class zero_model_state:
    """Values extracted from one model-states checkpoint file by ``parse_model_states``."""
    # The annotations below are types. They used to be ``dict()`` calls, i.e. empty dict
    # instances, which are not meaningful type hints.
    # buffer name -> buffer tensor, converted with ``.float()``
    buffers: dict
    # NOTE(review): other code in this file iterates this as a sequence of name -> shape dicts - confirm
    param_shapes: dict
    # list of ``[key, value]`` pairs taken from the checkpoint's "shared_params" mapping
    shared_params: list
    # NOTE(review): read with ``.get(DS_VERSION, None)``, so it can be None; the stored type is not visible here
    ds_version: int
    # read with ``.get(..., None)``, so None when the checkpoint has no frozen-parameter entry
    frozen_param_shapes: dict
    # read with ``.get(..., None)``, so None when the checkpoint has no frozen-parameter entry
    frozen_param_fragments: dict
# Diagnostic switch: any truthy value enables the extra prints guarded by ``if debug:``.
debug = 0
# Device handed to ``torch.load(..., map_location=device)``: everything is loaded on the CPU.
device = torch.device("cpu")
def atoi(text):
    """Return ``int(text)`` when ``text.isdigit()`` is true; otherwise return ``text`` unchanged."""
    if text.isdigit():
        return int(text)
    return text
def natural_keys(text):
    '''
    Sort key for "human" ordering: ``alist.sort(key=natural_keys)``.
    The text is split on runs of digits (the digit runs are kept) and every piece goes through ``atoi``.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model-states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory that holds the checkpoint files
        - ``zero_stage``: ZeRO stage of the checkpoint; it selects the expected file name

    Returns:
        path of the model-states file

    Raises:
        FileNotFoundError: ``checkpoint_dir`` is not a directory, or the expected file is missing
        ValueError: ``zero_stage`` is neither ``<= 2`` nor ``3``
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # Without this branch ``file`` stayed unbound and the check below raised UnboundLocalError.
        raise ValueError(f"unknown zero stage {zero_stage}")
    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")
    return file
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Return the files in ``checkpoint_dir`` that match ``glob_pattern``, ordered with ``natural_keys``.

    Raises:
        FileNotFoundError: nothing matches the pattern
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    matches = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
    matches.sort(key=natural_keys)
    if not matches:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
    return matches
def get_optim_files(checkpoint_dir):
    """Return the ``*_optim_states.pt`` files of ``checkpoint_dir`` via ``get_checkpoint_files``."""
    optim_glob = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_glob)
def get_model_state_files(checkpoint_dir):
    """Return the ``*_model_states.pt`` files of ``checkpoint_dir`` via ``get_checkpoint_files``."""
    model_glob = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_glob)
def parse_model_states(files):
    """
    Load every model-states file and keep the pieces needed to rebuild the weights.

    Args:
        - ``files``: paths of the model-states checkpoint files

    Returns:
        list with one ``zero_model_state`` per entry of ``files``, in the same order

    Raises:
        ValueError: a loaded file has no ``BUFFER_NAMES`` entry
    """
    zero_model_states = []
    for file in files:
        # SECURITY: ``weights_only=False`` unpickles arbitrary objects, so only load checkpoints
        # that come from a trusted source.
        state_dict = torch.load(file, map_location=device, weights_only=False)
        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)
        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]
        # frozen parameters are optional: both entries are None when the checkpoint has none
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
        ds_version = state_dict.get(DS_VERSION, None)
        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
        # (the list of parameter names that used to be collected here was never read, so it is gone)
        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)
    return zero_model_states
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer-state shards and pull out the flat fp32 groups of every rank.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` shards
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        tuple ``(zero_stage, world_size, fp32_flat_groups)`` with one ``fp32_flat_groups`` entry per shard

    Raises:
        ValueError: the first shard has no ZeRO stage entry, the number of shards differs from the
            recorded partition count, or the stage is neither ``<= 2`` nor ``3``
    """
    total_files = len(files)
    state_dicts = []
    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(shard)
    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]
    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)
    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )
    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")
    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in state_dicts]
    return zero_stage, world_size, fp32_flat_groups
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint
    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: forwarded to the stage-specific builder; when true the frozen
          parameters are not merged into the result
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
    model_files = get_model_state_files(ds_checkpoint_dir)
    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
    # pick the builder that matches the detected stage
    if zero_stage <= 2:
        build_state_dict = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        build_state_dict = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None
    return build_state_dict(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy the frozen parameter fragments of the first model state into ``state_dict`` (in place).

    Does nothing when that model state has no frozen parameter shapes.
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return
    frozen_param_shapes = rank0_state.frozen_param_shapes
    frozen_param_fragments = rank0_state.frozen_param_fragments
    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
    total_params = 0
    total_numel = 0
    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        # the fragment stored in the first model state is used as the full parameter
        state_dict[name] = frozen_param_fragments[name]
        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict`` (in place).

    For every param group the per-rank partitions are concatenated into one flat vector, which is then
    sliced by the shapes recorded in the first model state.

    Raises:
        ValueError: after alignment, the consumed element count differs from the available one
    """
    param_shapes = zero_model_states[0].param_shapes
    # Reconstruction protocol:
    #
    # XXX: document this
    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = [
        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
        for group_idx in range(num_param_groups)
    ]
    avail_numel = sum(flat_vector.numel() for flat_vector in merged_single_partition_of_fp32_groups)
    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")
    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = flat_vector.numel()
        for name, shape in shapes.items():
            # shapes that expose numel() are asked directly; anything else is multiplied out
            if _has_callable(shape, 'numel'):
                unpartitioned_numel = shape.numel()
            else:
                unpartitioned_numel = math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1
            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = flat_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel
        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")
        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)
        if debug:
            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the state dict of a ZeRO stage <= 2 checkpoint: buffers, then (optionally) frozen
    parameters, then trainable parameters, then the shared-parameter aliases.
    """
    rank0_state = zero_model_states[0]
    # buffers
    buffers = rank0_state.buffers
    state_dict = OrderedDict()
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")
    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)
    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        alias, source = pair[0], pair[1]
        if source in state_dict:
            state_dict[alias] = state_dict[source]
    return state_dict
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Return ``(partitioned_numel, padding_numel)`` for a parameter split over ``world_size`` ranks.

    ``partitioned_numel`` is ``ceil(unpartitioned_numel / world_size)``; ``padding_numel`` is the
    number of elements missing to reach the next multiple of ``world_size`` (0 when it divides evenly).
    """
    remainder = unpartitioned_numel % world_size
    if remainder:
        padding_numel = world_size - remainder
    else:
        padding_numel = 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """
    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict`` (in place).

    The fragments of every model state are concatenated, cut to the recorded element count and
    viewed with the recorded shape. Does nothing when the first model state has no frozen shapes.
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return
    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
        frozen_param_shapes = rank0_state.frozen_param_shapes
        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum([p.numel() for p in rank0_state.frozen_param_fragments.values()]) * world_size
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
    total_params = 0
    total_numel = 0
    for name, shape in rank0_state.frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        # one fragment per model state, joined in model-state order
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        merged = torch.cat(param_frags, 0)
        # drop whatever lies past the real element count, then restore the shape
        state_dict[name] = merged.narrow(0, 0, unpartitioned_numel).view(shape)
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )
    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    Args:
        - ``flat_groups``: one entry per rank, each a sequence of flat 1-D tensors (one per group)
        - ``flat_groups_offset``: cumulative start offsets of the groups, with the total length as last item
        - ``offset``: start of this parameter's partition in the concatenation of one rank's groups
        - ``partitioned_numel``: number of elements every rank holds for this parameter
        - ``shape``: full shape of the parameter; must provide ``numel()``
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns:
            tensor of shape ``self.shape`` built from the first ``self.shape.numel()`` gathered elements

        Raises:
            ValueError: the partition ``[offset, offset + partitioned_numel)`` is not covered by the groups
        """
        if self.shape.numel() == 0:
            # nothing to gather; the group lookup below cannot place an empty range on a group boundary
            return torch.empty(self.shape, dtype=self.dtype)
        end_idx = self.offset + self.partitioned_numel
        offsets = self.flat_groups_offset
        # The group range is identical for every rank, so it is located once. Only
        # len(offsets) - 1 groups exist: the last offset is the end sentinel.
        start_group_id = None
        end_group_id = None
        for group_id in range(len(offsets) - 1):
            if offsets[group_id] <= self.offset < offsets[group_id + 1]:
                start_group_id = group_id
            if offsets[group_id] < end_idx <= offsets[group_id + 1]:
                end_group_id = group_id
                break
        if start_group_id is None or end_group_id is None:
            raise ValueError(f"partition [{self.offset}, {end_idx}) is not covered by the flat groups")
        pad_flat_param_chunks = []
        for flat_groups_at_rank_i in self.flat_groups:
            # for each rank, we need to collect weights from related group/groups
            for group_id in range(start_group_id, end_group_id + 1):
                flat_tensor = flat_groups_at_rank_i[group_id]
                group_begin = offsets[group_id]
                # Clamp at 0: for every group after the first one the partition starts at the
                # beginning of the group. Without the clamp the start index became negative.
                start_offset = max(self.offset - group_begin, 0)
                end_offset = min(end_idx, offsets[group_id + 1]) - group_begin
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        # elements past shape.numel() are padding and get dropped
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Register every trainable parameter of a ZeRO stage 3 checkpoint in ``state_dict`` (in place).

    Each parameter is stored as a ``GatheredTensor``; the data is only gathered when its
    ``contiguous()`` method is called.

    Args:
        - ``state_dict``: mapping that receives ``name -> GatheredTensor``
        - ``world_size``: number of ranks the parameters were partitioned over
        - ``fp32_flat_groups``: one entry per rank, each a sequence of flat tensors (one per group)
        - ``zero_model_states``: parsed model states; the parameter shapes of the first one are used

    Raises:
        ValueError: the consumed element count differs from the available one
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any
    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
    if debug:
        # fp32_flat_groups[i] is a sequence of tensors, so the shapes are printed group by group;
        # asking the sequence itself for .shape / .numel() raised AttributeError
        for i in range(world_size):
            for j, flat_group in enumerate(fp32_flat_groups[i]):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")
        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    # start offset of every group inside one rank's concatenated groups, plus the total as last item
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )
        # memory efficient tensor
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel
    # offset counted one rank's share; scale it to all ranks before comparing
    offset *= world_size
    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+    Returns:
+        - pytorch ``state_dict``
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a consolidated fp32 ``state_dict`` written to ``output_dir``, which
+    can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    The weights go into one file, or into several shard files plus a JSON index when the split is sharded.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB.
+          Pass ``None`` to skip shard planning and write everything into a single file
+        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError``: ``safetensors`` or ``huggingface_hub`` is required by the chosen options but not installed
+    """
+    # Dependency pre-check: fail on a missing optional package before any conversion work starts
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict (lazy mode: entries are converted per shard further down)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        # yields "pytorch_model{suffix}.bin" or "model{suffix}.safetensors"
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on torch.empty placeholders that
+        # only carry each entry's shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # No sharding requested: stand-in object exposing only the two attributes read below
+        # when ``is_sharded`` is False
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # pick this shard's entries, then convert only those to torch tensors
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard: drop both the source entry and the converted tensor
+        # before moving on to the next shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+    # Save index if sharded: JSON file mapping each tensor name to the shard file that holds it
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

output_qwen3_plain_ar/checkpoint-4535/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "magel_chord_dropout_trigger_prob": 0.6,
+  "magel_num_audio_token": 16384,
+  "magel_structure_dropout_trigger_prob": 0.6,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.4.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 168056
+}

output_qwen3_plain_ar/checkpoint-4535/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.4.0"
+}

output_qwen3_plain_ar/checkpoint-4535/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step4535

output_qwen3_plain_ar/checkpoint-4535/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3205 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 500,
+  "global_step": 4535,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.011028398125172319,
+      "grad_norm": 435.2422180175781,
+      "learning_rate": 9e-07,
+      "loss": 20.84569549560547,
+      "step": 10
+    },
+    {
+      "epoch": 0.022056796250344637,
+      "grad_norm": 141.7341766357422,
+      "learning_rate": 1.9e-06,
+      "loss": 18.69615936279297,
+      "step": 20
+    },
+    {
+      "epoch": 0.033085194375516956,
+      "grad_norm": 74.42520904541016,
+      "learning_rate": 2.9e-06,
+      "loss": 16.079673767089844,
+      "step": 30
+    },
+    {
+      "epoch": 0.044113592500689275,
+      "grad_norm": 24.73248863220215,
+      "learning_rate": 3.9e-06,
+      "loss": 13.684315490722657,
+      "step": 40
+    },
+    {
+      "epoch": 0.055141990625861594,
+      "grad_norm": 7.049101829528809,
+      "learning_rate": 4.9000000000000005e-06,
+      "loss": 12.474874877929688,
+      "step": 50
+    },
+    {
+      "epoch": 0.06617038875103391,
+      "grad_norm": 2.3411474227905273,
+      "learning_rate": 5.9e-06,
+      "loss": 12.072142028808594,
+      "step": 60
+    },
+    {
+      "epoch": 0.07719878687620624,
+      "grad_norm": 1.126215934753418,
+      "learning_rate": 6.900000000000001e-06,
+      "loss": 11.938906860351562,
+      "step": 70
+    },
+    {
+      "epoch": 0.08822718500137855,
+      "grad_norm": 1.2050226926803589,
+      "learning_rate": 7.9e-06,
+      "loss": 11.81988296508789,
+      "step": 80
+    },
+    {
+      "epoch": 0.09925558312655088,
+      "grad_norm": 1.444793462753296,
+      "learning_rate": 8.9e-06,
+      "loss": 11.602033996582032,
+      "step": 90
+    },
+    {
+      "epoch": 0.11028398125172319,
+      "grad_norm": 5.791665077209473,
+      "learning_rate": 9.900000000000002e-06,
+      "loss": 11.201815032958985,
+      "step": 100
+    },
+    {
+      "epoch": 0.12131237937689551,
+      "grad_norm": 9.492277145385742,
+      "learning_rate": 1.09e-05,
+      "loss": 10.535708618164062,
+      "step": 110
+    },
+    {
+      "epoch": 0.13234077750206782,
+      "grad_norm": 2.7546133995056152,
+      "learning_rate": 1.19e-05,
+      "loss": 9.847169494628906,
+      "step": 120
+    },
+    {
+      "epoch": 0.14336917562724014,
+      "grad_norm": 1.0953313112258911,
+      "learning_rate": 1.29e-05,
+      "loss": 9.429026031494141,
+      "step": 130
+    },
+    {
+      "epoch": 0.15439757375241248,
+      "grad_norm": 0.7153559327125549,
+      "learning_rate": 1.3900000000000002e-05,
+      "loss": 9.266969299316406,
+      "step": 140
+    },
+    {
+      "epoch": 0.1654259718775848,
+      "grad_norm": 0.5888933539390564,
+      "learning_rate": 1.49e-05,
+      "loss": 9.1935546875,
+      "step": 150
+    },
+    {
+      "epoch": 0.1764543700027571,
+      "grad_norm": 0.4850365221500397,
+      "learning_rate": 1.59e-05,
+      "loss": 9.19604034423828,
+      "step": 160
+    },
+    {
+      "epoch": 0.1874827681279294,
+      "grad_norm": 0.5772538185119629,
+      "learning_rate": 1.69e-05,
+      "loss": 9.17010726928711,
+      "step": 170
+    },
+    {
+      "epoch": 0.19851116625310175,
+      "grad_norm": 0.4283920228481293,
+      "learning_rate": 1.79e-05,
+      "loss": 9.172830200195312,
+      "step": 180
+    },
+    {
+      "epoch": 0.20953956437827406,
+      "grad_norm": 0.8650698065757751,
+      "learning_rate": 1.8900000000000002e-05,
+      "loss": 9.154988098144532,
+      "step": 190
+    },
+    {
+      "epoch": 0.22056796250344637,
+      "grad_norm": 0.42017608880996704,
+      "learning_rate": 1.9900000000000003e-05,
+      "loss": 9.146849060058594,
+      "step": 200
+    },
+    {
+      "epoch": 0.23159636062861869,
+      "grad_norm": 0.9125994443893433,
+      "learning_rate": 2.09e-05,
+      "loss": 9.164442443847657,
+      "step": 210
+    },
+    {
+      "epoch": 0.24262475875379103,
+      "grad_norm": 0.6468876004219055,
+      "learning_rate": 2.19e-05,
+      "loss": 9.159596252441407,
+      "step": 220
+    },
+    {
+      "epoch": 0.25365315687896334,
+      "grad_norm": 0.4124819338321686,
+      "learning_rate": 2.29e-05,
+      "loss": 9.13860626220703,
+      "step": 230
+    },
+    {
+      "epoch": 0.26468155500413565,
+      "grad_norm": 1.990302562713623,
+      "learning_rate": 2.39e-05,
+      "loss": 9.145040893554688,
+      "step": 240
+    },
+    {
+      "epoch": 0.27570995312930796,
+      "grad_norm": 0.7875277400016785,
+      "learning_rate": 2.4900000000000002e-05,
+      "loss": 9.152925109863281,
+      "step": 250
+    },
+    {
+      "epoch": 0.2867383512544803,
+      "grad_norm": 0.8343706130981445,
+      "learning_rate": 2.5900000000000003e-05,
+      "loss": 9.132975769042968,
+      "step": 260
+    },
+    {
+      "epoch": 0.2977667493796526,
+      "grad_norm": 3.00996470451355,
+      "learning_rate": 2.6900000000000003e-05,
+      "loss": 9.097848510742187,
+      "step": 270
+    },
+    {
+      "epoch": 0.30879514750482495,
+      "grad_norm": 2.4282069206237793,
+      "learning_rate": 2.7900000000000004e-05,
+      "loss": 9.042235565185546,
+      "step": 280
+    },
+    {
+      "epoch": 0.31982354562999726,
+      "grad_norm": 4.171019554138184,
+      "learning_rate": 2.8899999999999998e-05,
+      "loss": 8.927298736572265,
+      "step": 290
+    },
+    {
+      "epoch": 0.3308519437551696,
+      "grad_norm": 2.197887659072876,
+      "learning_rate": 2.9900000000000002e-05,
+      "loss": 8.805252075195312,
+      "step": 300
+    },
+    {
+      "epoch": 0.3418803418803419,
+      "grad_norm": 10.306541442871094,
+      "learning_rate": 3.09e-05,
+      "loss": 8.673678588867187,
+      "step": 310
+    },
+    {
+      "epoch": 0.3529087400055142,
+      "grad_norm": 8.463860511779785,
+      "learning_rate": 3.19e-05,
+      "loss": 8.570347595214844,
+      "step": 320
+    },
+    {
+      "epoch": 0.3639371381306865,
+      "grad_norm": 3.999753475189209,
+      "learning_rate": 3.29e-05,
+      "loss": 8.429109191894531,
+      "step": 330
+    },
+    {
+      "epoch": 0.3749655362558588,
+      "grad_norm": 5.259007930755615,
+      "learning_rate": 3.3900000000000004e-05,
+      "loss": 8.334149169921876,
+      "step": 340
+    },
+    {
+      "epoch": 0.38599393438103113,
+      "grad_norm": 8.362598419189453,
+      "learning_rate": 3.49e-05,
+      "loss": 8.196139526367187,
+      "step": 350
+    },
+    {
+      "epoch": 0.3970223325062035,
+      "grad_norm": 10.273512840270996,
+      "learning_rate": 3.59e-05,
+      "loss": 8.040153503417969,
+      "step": 360
+    },
+    {
+      "epoch": 0.4080507306313758,
+      "grad_norm": 5.111108303070068,
+      "learning_rate": 3.69e-05,
+      "loss": 7.866473388671875,
+      "step": 370
+    },
+    {
+      "epoch": 0.4190791287565481,
+      "grad_norm": 9.192107200622559,
+      "learning_rate": 3.79e-05,
+      "loss": 7.695774841308594,
+      "step": 380
+    },
+    {
+      "epoch": 0.43010752688172044,
+      "grad_norm": 5.393336772918701,
+      "learning_rate": 3.8900000000000004e-05,
+      "loss": 7.498152160644532,
+      "step": 390
+    },
+    {
+      "epoch": 0.44113592500689275,
+      "grad_norm": 10.53490161895752,
+      "learning_rate": 3.99e-05,
+      "loss": 7.270246887207032,
+      "step": 400
+    },
+    {
+      "epoch": 0.45216432313206506,
+      "grad_norm": 6.174643516540527,
+      "learning_rate": 4.09e-05,
+      "loss": 7.127191162109375,
+      "step": 410
+    },
+    {
+      "epoch": 0.46319272125723737,
+      "grad_norm": 4.522936820983887,
+      "learning_rate": 4.19e-05,
+      "loss": 6.871500396728516,
+      "step": 420
+    },
+    {
+      "epoch": 0.4742211193824097,
+      "grad_norm": 4.3594207763671875,
+      "learning_rate": 4.29e-05,
+      "loss": 6.702586364746094,
+      "step": 430
+    },
+    {
+      "epoch": 0.48524951750758205,
+      "grad_norm": 5.950730323791504,
+      "learning_rate": 4.39e-05,
+      "loss": 6.493560791015625,
+      "step": 440
+    },
+    {
+      "epoch": 0.49627791563275436,
+      "grad_norm": 6.233413219451904,
+      "learning_rate": 4.49e-05,
+      "loss": 6.293489074707031,
+      "step": 450
+    },
+    {
+      "epoch": 0.5073063137579267,
+      "grad_norm": 7.656834125518799,
+      "learning_rate": 4.5900000000000004e-05,
+      "loss": 6.102347946166992,
+      "step": 460
+    },
+    {
+      "epoch": 0.518334711883099,
+      "grad_norm": 4.319094657897949,
+      "learning_rate": 4.69e-05,
+      "loss": 5.928083419799805,
+      "step": 470
+    },
+    {
+      "epoch": 0.5293631100082713,
+      "grad_norm": 5.585537433624268,
+      "learning_rate": 4.79e-05,
+      "loss": 5.77436637878418,
+      "step": 480
+    },
+    {
+      "epoch": 0.5403915081334436,
+      "grad_norm": 5.104014873504639,
+      "learning_rate": 4.89e-05,
+      "loss": 5.636859130859375,
+      "step": 490
+    },
+    {
+      "epoch": 0.5514199062586159,
+      "grad_norm": 5.453028202056885,
+      "learning_rate": 4.99e-05,
+      "loss": 5.507636260986328,
+      "step": 500
+    },
+    {
+      "epoch": 0.5624483043837882,
+      "grad_norm": 7.728854179382324,
+      "learning_rate": 5.0900000000000004e-05,
+      "loss": 5.411964416503906,
+      "step": 510
+    },
+    {
+      "epoch": 0.5734767025089605,
+      "grad_norm": 4.50288724899292,
+      "learning_rate": 5.19e-05,
+      "loss": 5.295291900634766,
+      "step": 520
+    },
+    {
+      "epoch": 0.5845051006341329,
+      "grad_norm": 4.245919704437256,
+      "learning_rate": 5.2900000000000005e-05,
+      "loss": 5.194162750244141,
+      "step": 530
+    },
+    {
+      "epoch": 0.5955334987593052,
+      "grad_norm": 6.278975963592529,
+      "learning_rate": 5.390000000000001e-05,
+      "loss": 5.113618087768555,
+      "step": 540
+    },
+    {
+      "epoch": 0.6065618968844775,
+      "grad_norm": 4.214662075042725,
+      "learning_rate": 5.4900000000000006e-05,
+      "loss": 5.038372039794922,
+      "step": 550
+    },
+    {
+      "epoch": 0.6175902950096499,
+      "grad_norm": 3.5404605865478516,
+      "learning_rate": 5.590000000000001e-05,
+      "loss": 4.935391235351562,
+      "step": 560
+    },
+    {
+      "epoch": 0.6286186931348222,
+      "grad_norm": 3.6460280418395996,
+      "learning_rate": 5.69e-05,
+      "loss": 4.896538543701172,
+      "step": 570
+    },
+    {
+      "epoch": 0.6396470912599945,
+      "grad_norm": 5.254800796508789,
+      "learning_rate": 5.79e-05,
+      "loss": 4.829419708251953,
+      "step": 580
+    },
+    {
+      "epoch": 0.6506754893851668,
+      "grad_norm": 5.132180690765381,
+      "learning_rate": 5.89e-05,
+      "loss": 4.793368148803711,
+      "step": 590
+    },
+    {
+      "epoch": 0.6617038875103392,
+      "grad_norm": 4.222960948944092,
+      "learning_rate": 5.99e-05,
+      "loss": 4.746239852905274,
+      "step": 600
+    },
+    {
+      "epoch": 0.6727322856355115,
+      "grad_norm": 4.070414066314697,
+      "learning_rate": 6.09e-05,
+      "loss": 4.688523864746093,
+      "step": 610
+    },
+    {
+      "epoch": 0.6837606837606838,
+      "grad_norm": 3.4652583599090576,
+      "learning_rate": 6.19e-05,
+      "loss": 4.692922973632813,
+      "step": 620
+    },
+    {
+      "epoch": 0.6947890818858561,
+      "grad_norm": 4.559128284454346,
+      "learning_rate": 6.29e-05,
+      "loss": 4.639920043945312,
+      "step": 630
+    },
+    {
+      "epoch": 0.7058174800110284,
+      "grad_norm": 3.197758436203003,
+      "learning_rate": 6.390000000000001e-05,
+      "loss": 4.601907348632812,
+      "step": 640
+    },
+    {
+      "epoch": 0.7168458781362007,
+      "grad_norm": 4.209578514099121,
+      "learning_rate": 6.49e-05,
+      "loss": 4.56639404296875,
+      "step": 650
+    },
+    {
+      "epoch": 0.727874276261373,
+      "grad_norm": 3.701484203338623,
+      "learning_rate": 6.59e-05,
+      "loss": 4.545608901977539,
+      "step": 660
+    },
+    {
+      "epoch": 0.7389026743865453,
+      "grad_norm": 3.951927900314331,
+      "learning_rate": 6.690000000000001e-05,
+      "loss": 4.493326187133789,
+      "step": 670
+    },
+    {
+      "epoch": 0.7499310725117176,
+      "grad_norm": 4.219130039215088,
+      "learning_rate": 6.790000000000001e-05,
+      "loss": 4.482691955566406,
+      "step": 680
+    },
+    {
+      "epoch": 0.76095947063689,
+      "grad_norm": 6.267204284667969,
+      "learning_rate": 6.89e-05,
+      "loss": 4.4599052429199215,
+      "step": 690
+    },
+    {
+      "epoch": 0.7719878687620623,
+      "grad_norm": 3.367382764816284,
+      "learning_rate": 6.99e-05,
+      "loss": 4.429808807373047,
+      "step": 700
+    },
+    {
+      "epoch": 0.7830162668872346,
+      "grad_norm": 3.8906455039978027,
+      "learning_rate": 7.09e-05,
+      "loss": 4.4144752502441404,
+      "step": 710
+    },
+    {
+      "epoch": 0.794044665012407,
+      "grad_norm": 6.759398460388184,
+      "learning_rate": 7.19e-05,
+      "loss": 4.385488891601563,
+      "step": 720
+    },
+    {
+      "epoch": 0.8050730631375793,
+      "grad_norm": 3.520167350769043,
+      "learning_rate": 7.29e-05,
+      "loss": 4.397706985473633,
+      "step": 730
+    },
+    {
+      "epoch": 0.8161014612627516,
+      "grad_norm": 2.7510974407196045,
+      "learning_rate": 7.390000000000001e-05,
+      "loss": 4.374617385864258,
+      "step": 740
+    },
+    {
+      "epoch": 0.8271298593879239,
+      "grad_norm": 4.395699977874756,
+      "learning_rate": 7.49e-05,
+      "loss": 4.3302146911621096,
+      "step": 750
+    },
+    {
+      "epoch": 0.8381582575130962,
+      "grad_norm": 3.277766704559326,
+      "learning_rate": 7.59e-05,
+      "loss": 4.313335418701172,
+      "step": 760
+    },
+    {
+      "epoch": 0.8491866556382686,
+      "grad_norm": 2.466207981109619,
+      "learning_rate": 7.69e-05,
+      "loss": 4.3226570129394535,
+      "step": 770
+    },
+    {
+      "epoch": 0.8602150537634409,
+      "grad_norm": 3.637355327606201,
+      "learning_rate": 7.790000000000001e-05,
+      "loss": 4.295929718017578,
+      "step": 780
+    },
+    {
+      "epoch": 0.8712434518886132,
+      "grad_norm": 3.155527353286743,
+      "learning_rate": 7.890000000000001e-05,
+      "loss": 4.287591552734375,
+      "step": 790
+    },
+    {
+      "epoch": 0.8822718500137855,
+      "grad_norm": 3.593884229660034,
+      "learning_rate": 7.99e-05,
+      "loss": 4.267314147949219,
+      "step": 800
+    },
+    {
+      "epoch": 0.8933002481389578,
+      "grad_norm": 2.361081123352051,
+      "learning_rate": 8.090000000000001e-05,
+      "loss": 4.265741348266602,
+      "step": 810
+    },
+    {
+      "epoch": 0.9043286462641301,
+      "grad_norm": 2.7084105014801025,
+      "learning_rate": 8.19e-05,
+      "loss": 4.261878204345703,
+      "step": 820
+    },
+    {
+      "epoch": 0.9153570443893024,
+      "grad_norm": 3.6093873977661133,
+      "learning_rate": 8.29e-05,
+      "loss": 4.211677551269531,
+      "step": 830
+    },
+    {
+      "epoch": 0.9263854425144747,
+      "grad_norm": 3.9739396572113037,
+      "learning_rate": 8.39e-05,
+      "loss": 4.224007034301758,
+      "step": 840
+    },
+    {
+      "epoch": 0.9374138406396471,
+      "grad_norm": 2.174050807952881,
+      "learning_rate": 8.49e-05,
+      "loss": 4.211782836914063,
+      "step": 850
+    },
+    {
+      "epoch": 0.9484422387648194,
+      "grad_norm": 2.7151405811309814,
+      "learning_rate": 8.59e-05,
+      "loss": 4.204391098022461,
+      "step": 860
+    },
+    {
+      "epoch": 0.9594706368899917,
+      "grad_norm": 3.7480661869049072,
+      "learning_rate": 8.69e-05,
+      "loss": 4.175582504272461,
+      "step": 870
+    },
+    {
+      "epoch": 0.9704990350151641,
+      "grad_norm": 3.1127700805664062,
+      "learning_rate": 8.790000000000001e-05,
+      "loss": 4.183733749389648,
+      "step": 880
+    },
+    {
+      "epoch": 0.9815274331403364,
+      "grad_norm": 2.750716209411621,
+      "learning_rate": 8.89e-05,
+      "loss": 4.167971801757813,
+      "step": 890
+    },
+    {
+      "epoch": 0.9925558312655087,
+      "grad_norm": 4.02509880065918,
+      "learning_rate": 8.99e-05,
+      "loss": 4.170472717285156,
+      "step": 900
+    },
+    {
+      "epoch": 1.0033085194375517,
+      "grad_norm": 3.0058505535125732,
+      "learning_rate": 9.090000000000001e-05,
+      "loss": 4.1449127197265625,
+      "step": 910
+    },
+    {
+      "epoch": 1.014336917562724,
+      "grad_norm": 2.553403377532959,
+      "learning_rate": 9.190000000000001e-05,
+      "loss": 4.1404258728027346,
+      "step": 920
+    },
+    {
+      "epoch": 1.0253653156878964,
+      "grad_norm": 2.8066084384918213,
+      "learning_rate": 9.290000000000001e-05,
+      "loss": 4.110780334472656,
+      "step": 930
+    },
+    {
+      "epoch": 1.0363937138130686,
+      "grad_norm": 3.904608726501465,
+      "learning_rate": 9.39e-05,
+      "loss": 4.134862899780273,
+      "step": 940
+    },
+    {
+      "epoch": 1.047422111938241,
+      "grad_norm": 2.217729330062866,
+      "learning_rate": 9.49e-05,
+      "loss": 4.112079620361328,
+      "step": 950
+    },
+    {
+      "epoch": 1.0584505100634134,
+      "grad_norm": 2.498760938644409,
+      "learning_rate": 9.59e-05,
+      "loss": 4.097566986083985,
+      "step": 960
+    },
+    {
+      "epoch": 1.0694789081885856,
+      "grad_norm": 3.577143907546997,
+      "learning_rate": 9.69e-05,
+      "loss": 4.081307220458984,
+      "step": 970
+    },
+    {
+      "epoch": 1.080507306313758,
+      "grad_norm": 3.283250570297241,
+      "learning_rate": 9.790000000000001e-05,
+      "loss": 4.103987503051758,
+      "step": 980
+    },
+    {
+      "epoch": 1.0915357044389302,
+      "grad_norm": 2.1897776126861572,
+      "learning_rate": 9.89e-05,
+      "loss": 4.084938812255859,
+      "step": 990
+    },
+    {
+      "epoch": 1.1025641025641026,
+      "grad_norm": 2.6925997734069824,
+      "learning_rate": 9.99e-05,
+      "loss": 4.058921051025391,
+      "step": 1000
+    },
+    {
+      "epoch": 1.1135925006892748,
+      "grad_norm": 3.4118456840515137,
+      "learning_rate": 9.994749124854142e-05,
+      "loss": 4.061585235595703,
+      "step": 1010
+    },
+    {
+      "epoch": 1.1246208988144473,
+      "grad_norm": 2.6139297485351562,
+      "learning_rate": 9.988914819136523e-05,
+      "loss": 4.070050048828125,
+      "step": 1020
+    },
+    {
+      "epoch": 1.1356492969396195,
+      "grad_norm": 1.8616399765014648,
+      "learning_rate": 9.983080513418903e-05,
+      "loss": 4.0413330078125,
+      "step": 1030
+    },
+    {
+      "epoch": 1.146677695064792,
+      "grad_norm": 2.361706018447876,
+      "learning_rate": 9.977246207701284e-05,
+      "loss": 4.023075866699219,
+      "step": 1040
+    },
+    {
+      "epoch": 1.157706093189964,
+      "grad_norm": 3.815014123916626,
+      "learning_rate": 9.971411901983664e-05,
+      "loss": 4.036756134033203,
+      "step": 1050
+    },
+    {
+      "epoch": 1.1687344913151365,
+      "grad_norm": 2.4410274028778076,
+      "learning_rate": 9.965577596266045e-05,
+      "loss": 4.020483779907226,
+      "step": 1060
+    },
+    {
+      "epoch": 1.1797628894403087,
+      "grad_norm": 2.768084764480591,
+      "learning_rate": 9.959743290548426e-05,
+      "loss": 4.021839141845703,
+      "step": 1070
+    },
+    {
+      "epoch": 1.1907912875654811,
+      "grad_norm": 1.9342570304870605,
+      "learning_rate": 9.953908984830806e-05,
+      "loss": 4.026360321044922,
+      "step": 1080
+    },
+    {
+      "epoch": 1.2018196856906533,
+      "grad_norm": 2.8184762001037598,
+      "learning_rate": 9.948074679113187e-05,
+      "loss": 4.007581329345703,
+      "step": 1090
+    },
+    {
+      "epoch": 1.2128480838158258,
+      "grad_norm": 3.2656188011169434,
+      "learning_rate": 9.942240373395566e-05,
+      "loss": 3.9965087890625,
+      "step": 1100
+    },
+    {
+      "epoch": 1.223876481940998,
+      "grad_norm": 2.4359538555145264,
+      "learning_rate": 9.936406067677947e-05,
+      "loss": 3.9959388732910157,
+      "step": 1110
+    },
+    {
+      "epoch": 1.2349048800661704,
+      "grad_norm": 1.9357632398605347,
+      "learning_rate": 9.930571761960327e-05,
+      "loss": 3.9851417541503906,
+      "step": 1120
+    },
+    {
+      "epoch": 1.2459332781913428,
+      "grad_norm": 2.1269352436065674,
+      "learning_rate": 9.924737456242708e-05,
+      "loss": 3.9773223876953123,
+      "step": 1130
+    },
+    {
+      "epoch": 1.256961676316515,
+      "grad_norm": 3.3491597175598145,
+      "learning_rate": 9.918903150525088e-05,
+      "loss": 3.9877471923828125,
+      "step": 1140
+    },
+    {
+      "epoch": 1.2679900744416872,
+      "grad_norm": 1.8646328449249268,
+      "learning_rate": 9.913068844807468e-05,
+      "loss": 3.9694965362548826,
+      "step": 1150
+    },
+    {
+      "epoch": 1.2790184725668596,
+      "grad_norm": 2.6204631328582764,
+      "learning_rate": 9.907234539089849e-05,
+      "loss": 3.9611881256103514,
+      "step": 1160
+    },
+    {
+      "epoch": 1.290046870692032,
+      "grad_norm": 1.872028112411499,
+      "learning_rate": 9.901400233372228e-05,
+      "loss": 3.964163970947266,
+      "step": 1170
+    },
+    {
+      "epoch": 1.3010752688172043,
+      "grad_norm": 3.490435838699341,
+      "learning_rate": 9.895565927654609e-05,
+      "loss": 3.959897994995117,
+      "step": 1180
+    },
+    {
+      "epoch": 1.3121036669423767,
+      "grad_norm": 2.862489700317383,
+      "learning_rate": 9.88973162193699e-05,
+      "loss": 3.9567939758300783,
+      "step": 1190
+    },
+    {
+      "epoch": 1.3231320650675489,
+      "grad_norm": 3.0570664405822754,
+      "learning_rate": 9.883897316219371e-05,
+      "loss": 3.9470645904541017,
+      "step": 1200
+    },
+    {
+      "epoch": 1.3341604631927213,
+      "grad_norm": 1.9254627227783203,
+      "learning_rate": 9.878063010501752e-05,
+      "loss": 3.9442317962646483,
+      "step": 1210
+    },
+    {
+      "epoch": 1.3451888613178935,
+      "grad_norm": 3.606224298477173,
+      "learning_rate": 9.872228704784131e-05,
+      "loss": 3.9380733489990236,
+      "step": 1220
+    },
+    {
+      "epoch": 1.356217259443066,
+      "grad_norm": 2.1184027194976807,
+      "learning_rate": 9.866394399066512e-05,
+      "loss": 3.9452835083007813,
+      "step": 1230
+    },
+    {
+      "epoch": 1.3672456575682381,
+      "grad_norm": 1.8997142314910889,
+      "learning_rate": 9.860560093348892e-05,
+      "loss": 3.9270603179931642,
+      "step": 1240
+    },
+    {
+      "epoch": 1.3782740556934105,
+      "grad_norm": 2.9672305583953857,
+      "learning_rate": 9.854725787631273e-05,
+      "loss": 3.9120155334472657,
+      "step": 1250
+    },
+    {
+      "epoch": 1.389302453818583,
+      "grad_norm": 1.9220951795578003,
+      "learning_rate": 9.848891481913652e-05,
+      "loss": 3.900279235839844,
+      "step": 1260
+    },
+    {
+      "epoch": 1.4003308519437552,
+      "grad_norm": 2.013521194458008,
+      "learning_rate": 9.843057176196033e-05,
+      "loss": 3.9147193908691404,
+      "step": 1270
+    },
+    {
+      "epoch": 1.4113592500689274,
+      "grad_norm": 1.451686143875122,
+      "learning_rate": 9.837222870478413e-05,
+      "loss": 3.906220245361328,
+      "step": 1280
+    },
+    {
+      "epoch": 1.4223876481940998,
+      "grad_norm": 4.606860637664795,
+      "learning_rate": 9.831388564760794e-05,
+      "loss": 3.905352020263672,
+      "step": 1290
+    },
+    {
+      "epoch": 1.4334160463192722,
+      "grad_norm": 1.779123306274414,
+      "learning_rate": 9.825554259043175e-05,
+      "loss": 3.9137496948242188,
+      "step": 1300
+    },
+    {
+      "epoch": 1.4444444444444444,
+      "grad_norm": 2.086585521697998,
+      "learning_rate": 9.819719953325554e-05,
+      "loss": 3.89554443359375,
+      "step": 1310
+    },
+    {
+      "epoch": 1.4554728425696168,
+      "grad_norm": 3.3514609336853027,
+      "learning_rate": 9.813885647607935e-05,
+      "loss": 3.8901123046875,
+      "step": 1320
+    },
+    {
+      "epoch": 1.466501240694789,
+      "grad_norm": 2.1145269870758057,
+      "learning_rate": 9.808051341890316e-05,
+      "loss": 3.8892486572265623,
+      "step": 1330
+    },
+    {
+      "epoch": 1.4775296388199615,
+      "grad_norm": 1.5503329038619995,
+      "learning_rate": 9.802217036172697e-05,
+      "loss": 3.8922355651855467,
+      "step": 1340
+    },
+    {
+      "epoch": 1.4885580369451337,
+      "grad_norm": 2.3014304637908936,
+      "learning_rate": 9.796382730455076e-05,
+      "loss": 3.8860099792480467,
+      "step": 1350
+    },
+    {
+      "epoch": 1.499586435070306,
+      "grad_norm": 1.9633557796478271,
+      "learning_rate": 9.790548424737457e-05,
+      "loss": 3.875183868408203,
+      "step": 1360
+    },
+    {
+      "epoch": 1.5106148331954783,
+      "grad_norm": 2.228351593017578,
+      "learning_rate": 9.784714119019837e-05,
+      "loss": 3.8726768493652344,
+      "step": 1370
+    },
+    {
+      "epoch": 1.5216432313206507,
+      "grad_norm": 3.0888657569885254,
+      "learning_rate": 9.778879813302218e-05,
+      "loss": 3.872690963745117,
+      "step": 1380
+    },
+    {
+      "epoch": 1.5326716294458231,
+      "grad_norm": 2.0078868865966797,
+      "learning_rate": 9.773045507584599e-05,
+      "loss": 3.8612388610839843,
+      "step": 1390
+    },
+    {
+      "epoch": 1.5437000275709953,
+      "grad_norm": 2.1966569423675537,
+      "learning_rate": 9.767211201866978e-05,
+      "loss": 3.8649852752685545,
+      "step": 1400
+    },
+    {
+      "epoch": 1.5547284256961675,
+      "grad_norm": 2.1047487258911133,
+      "learning_rate": 9.761376896149359e-05,
+      "loss": 3.8632328033447267,
+      "step": 1410
+    },
+    {
+      "epoch": 1.56575682382134,
+      "grad_norm": 1.9347233772277832,
+      "learning_rate": 9.755542590431739e-05,
+      "loss": 3.8362571716308596,
+      "step": 1420
+    },
+    {
+      "epoch": 1.5767852219465124,
+      "grad_norm": 1.7961437702178955,
+      "learning_rate": 9.74970828471412e-05,
+      "loss": 3.8461585998535157,
+      "step": 1430
+    },
+    {
+      "epoch": 1.5878136200716846,
+      "grad_norm": 2.4657342433929443,
+      "learning_rate": 9.743873978996499e-05,
+      "loss": 3.842551040649414,
+      "step": 1440
+    },
+    {
+      "epoch": 1.5988420181968568,
+      "grad_norm": 2.043138027191162,
+      "learning_rate": 9.73803967327888e-05,
+      "loss": 3.8387855529785155,
+      "step": 1450
+    },
+    {
+      "epoch": 1.6098704163220292,
+      "grad_norm": 3.732532262802124,
+      "learning_rate": 9.732205367561261e-05,
+      "loss": 3.8399681091308593,
+      "step": 1460
+    },
+    {
+      "epoch": 1.6208988144472016,
+      "grad_norm": 2.43684720993042,
+      "learning_rate": 9.726371061843642e-05,
+      "loss": 3.8324966430664062,
+      "step": 1470
+    },
+    {
+      "epoch": 1.6319272125723738,
+      "grad_norm": 2.4433460235595703,
+      "learning_rate": 9.720536756126023e-05,
+      "loss": 3.817783737182617,
+      "step": 1480
+    },
+    {
+      "epoch": 1.642955610697546,
+      "grad_norm": 2.1049606800079346,
+      "learning_rate": 9.714702450408402e-05,
+      "loss": 3.804280090332031,
+      "step": 1490
+    },
+    {
+      "epoch": 1.6539840088227185,
+      "grad_norm": 3.529686450958252,
+      "learning_rate": 9.708868144690783e-05,
+      "loss": 3.805449295043945,
+      "step": 1500
+    },
+    {
+      "epoch": 1.6650124069478909,
+      "grad_norm": 2.0984089374542236,
+      "learning_rate": 9.703033838973162e-05,
+      "loss": 3.788246917724609,
+      "step": 1510
+    },
+    {
+      "epoch": 1.6760408050730633,
+      "grad_norm": 1.9434291124343872,
+      "learning_rate": 9.697199533255543e-05,
+      "loss": 3.7875442504882812,
+      "step": 1520
+    },
+    {
+      "epoch": 1.6870692031982355,
+      "grad_norm": 1.99173903465271,
+      "learning_rate": 9.691365227537923e-05,
+      "loss": 3.7807193756103517,
+      "step": 1530
+    },
+    {
+      "epoch": 1.6980976013234077,
+      "grad_norm": 2.5006911754608154,
+      "learning_rate": 9.685530921820304e-05,
+      "loss": 3.744763946533203,
+      "step": 1540
+    },
+    {
+      "epoch": 1.7091259994485801,
+      "grad_norm": 2.1816165447235107,
+      "learning_rate": 9.679696616102685e-05,
+      "loss": 3.760245513916016,
+      "step": 1550
+    },
+    {
+      "epoch": 1.7201543975737525,
+      "grad_norm": 2.123291492462158,
+      "learning_rate": 9.673862310385064e-05,
+      "loss": 3.738916778564453,
+      "step": 1560
+    },
+    {
+      "epoch": 1.7311827956989247,
+      "grad_norm": 2.378187894821167,
+      "learning_rate": 9.668028004667445e-05,
+      "loss": 3.734139251708984,
+      "step": 1570
+    },
+    {
+      "epoch": 1.742211193824097,
+      "grad_norm": 2.54819393157959,
+      "learning_rate": 9.662193698949825e-05,
+      "loss": 3.715302276611328,
+      "step": 1580
+    },
+    {
+      "epoch": 1.7532395919492694,
+      "grad_norm": 4.285822868347168,
+      "learning_rate": 9.656359393232206e-05,
+      "loss": 3.72213134765625,
+      "step": 1590
+    },
+    {
+      "epoch": 1.7642679900744418,
+      "grad_norm": 1.8676700592041016,
+      "learning_rate": 9.650525087514586e-05,
+      "loss": 3.7252479553222657,
+      "step": 1600
+    },
+    {
+      "epoch": 1.775296388199614,
+      "grad_norm": 1.6977792978286743,
+      "learning_rate": 9.644690781796967e-05,
+      "loss": 3.704994964599609,
+      "step": 1610
+    },
+    {
+      "epoch": 1.7863247863247862,
+      "grad_norm": 1.8334232568740845,
+      "learning_rate": 9.638856476079347e-05,
+      "loss": 3.6980815887451173,
+      "step": 1620
+    },
+    {
+      "epoch": 1.7973531844499586,
+      "grad_norm": 2.6574559211730957,
+      "learning_rate": 9.633022170361728e-05,
+      "loss": 3.683759307861328,
+      "step": 1630
+    },
+    {
+      "epoch": 1.808381582575131,
+      "grad_norm": 2.085084915161133,
+      "learning_rate": 9.627187864644109e-05,
+      "loss": 3.67755126953125,
+      "step": 1640
+    },
+    {
+      "epoch": 1.8194099807003032,
+      "grad_norm": 1.685441017150879,
+      "learning_rate": 9.621353558926488e-05,
+      "loss": 3.656099319458008,
+      "step": 1650
+    },
+    {
+      "epoch": 1.8304383788254754,
+      "grad_norm": 2.4462475776672363,
+      "learning_rate": 9.615519253208869e-05,
+      "loss": 3.668656921386719,
+      "step": 1660
+    },
+    {
+      "epoch": 1.8414667769506479,
+      "grad_norm": 1.54155433177948,
+      "learning_rate": 9.609684947491249e-05,
+      "loss": 3.66968994140625,
+      "step": 1670
+    },
+    {
+      "epoch": 1.8524951750758203,
+      "grad_norm": 3.862130880355835,
+      "learning_rate": 9.60385064177363e-05,
+      "loss": 3.6412506103515625,
+      "step": 1680
+    },
+    {
+      "epoch": 1.8635235732009927,
+      "grad_norm": 1.7317070960998535,
+      "learning_rate": 9.598016336056009e-05,
+      "loss": 3.639806365966797,
+      "step": 1690
+    },
+    {
+      "epoch": 1.874551971326165,
+      "grad_norm": 2.2640931606292725,
+      "learning_rate": 9.59218203033839e-05,
+      "loss": 3.6341064453125,
+      "step": 1700
+    },
+    {
+      "epoch": 1.8855803694513371,
+      "grad_norm": 3.653146743774414,
+      "learning_rate": 9.586347724620771e-05,
+      "loss": 3.6380882263183594,
+      "step": 1710
+    },
+    {
+      "epoch": 1.8966087675765095,
+      "grad_norm": 1.8987306356430054,
+      "learning_rate": 9.58051341890315e-05,
+      "loss": 3.6405975341796877,
+      "step": 1720
+    },
+    {
+      "epoch": 1.907637165701682,
+      "grad_norm": 2.202659845352173,
+      "learning_rate": 9.574679113185531e-05,
+      "loss": 3.6375991821289064,
+      "step": 1730
+    },
+    {
+      "epoch": 1.9186655638268542,
+      "grad_norm": 1.5091872215270996,
+      "learning_rate": 9.568844807467912e-05,
+      "loss": 3.6208465576171873,
+      "step": 1740
+    },
+    {
+      "epoch": 1.9296939619520264,
+      "grad_norm": 1.9811325073242188,
+      "learning_rate": 9.563010501750293e-05,
+      "loss": 3.600755310058594,
+      "step": 1750
+    },
+    {
+      "epoch": 1.9407223600771988,
+      "grad_norm": 3.184499979019165,
+      "learning_rate": 9.557176196032673e-05,
+      "loss": 3.6109405517578126,
+      "step": 1760
+    },
+    {
+      "epoch": 1.9517507582023712,
+      "grad_norm": 2.340125322341919,
+      "learning_rate": 9.551341890315054e-05,
+      "loss": 3.6129817962646484,
+      "step": 1770
+    },
+    {
+      "epoch": 1.9627791563275434,
+      "grad_norm": 1.7258495092391968,
+      "learning_rate": 9.545507584597433e-05,
+      "loss": 3.590809631347656,
+      "step": 1780
+    },
+    {
+      "epoch": 1.9738075544527156,
+      "grad_norm": 1.6129754781723022,
+      "learning_rate": 9.539673278879814e-05,
+      "loss": 3.5866302490234374,
+      "step": 1790
+    },
+    {
+      "epoch": 1.984835952577888,
+      "grad_norm": 2.7458667755126953,
+      "learning_rate": 9.533838973162195e-05,
+      "loss": 3.596644973754883,
+      "step": 1800
+    },
+    {
+      "epoch": 1.9958643507030605,
+      "grad_norm": 2.258280038833618,
+      "learning_rate": 9.528004667444574e-05,
+      "loss": 3.5881332397460937,
+      "step": 1810
+    },
+    {
+      "epoch": 2.0066170388751035,
+      "grad_norm": 2.1228580474853516,
+      "learning_rate": 9.522170361726955e-05,
+      "loss": 3.5709766387939452,
+      "step": 1820
+    },
+    {
+      "epoch": 2.017645437000276,
+      "grad_norm": 1.588876485824585,
+      "learning_rate": 9.516336056009335e-05,
+      "loss": 3.5627593994140625,
+      "step": 1830
+    },
+    {
+      "epoch": 2.028673835125448,
+      "grad_norm": 2.451474189758301,
+      "learning_rate": 9.510501750291716e-05,
+      "loss": 3.5535301208496093,
+      "step": 1840
+    },
+    {
+      "epoch": 2.0397022332506203,
+      "grad_norm": 2.0007503032684326,
+      "learning_rate": 9.504667444574095e-05,
+      "loss": 3.553875732421875,
+      "step": 1850
+    },
+    {
+      "epoch": 2.0507306313757927,
+      "grad_norm": 1.4410080909729004,
+      "learning_rate": 9.498833138856476e-05,
+      "loss": 3.550189971923828,
+      "step": 1860
+    },
+    {
+      "epoch": 2.061759029500965,
+      "grad_norm": 2.062835216522217,
+      "learning_rate": 9.492998833138857e-05,
+      "loss": 3.5456893920898436,
+      "step": 1870
+    },
+    {
+      "epoch": 2.072787427626137,
+      "grad_norm": 2.4534783363342285,
+      "learning_rate": 9.487164527421238e-05,
+      "loss": 3.536829376220703,
+      "step": 1880
+    },
+    {
+      "epoch": 2.0838158257513095,
+      "grad_norm": 2.2788970470428467,
+      "learning_rate": 9.481330221703619e-05,
+      "loss": 3.5525283813476562,
+      "step": 1890
+    },
+    {
+      "epoch": 2.094844223876482,
+      "grad_norm": 1.4259227514266968,
+      "learning_rate": 9.475495915985998e-05,
+      "loss": 3.5479995727539064,
+      "step": 1900
+    },
+    {
+      "epoch": 2.1058726220016544,
+      "grad_norm": 2.672534465789795,
+      "learning_rate": 9.469661610268379e-05,
+      "loss": 3.5359420776367188,
+      "step": 1910
+    },
+    {
+      "epoch": 2.116901020126827,
+      "grad_norm": 2.0648045539855957,
+      "learning_rate": 9.463827304550759e-05,
+      "loss": 3.5452896118164063,
+      "step": 1920
+    },
+    {
+      "epoch": 2.1279294182519988,
+      "grad_norm": 1.6846543550491333,
+      "learning_rate": 9.45799299883314e-05,
+      "loss": 3.5434345245361327,
+      "step": 1930
+    },
+    {
+      "epoch": 2.138957816377171,
+      "grad_norm": 1.9105942249298096,
+      "learning_rate": 9.452158693115519e-05,
+      "loss": 3.5351535797119142,
+      "step": 1940
+    },
+    {
+      "epoch": 2.1499862145023436,
+      "grad_norm": 1.8230890035629272,
+      "learning_rate": 9.4463243873979e-05,
+      "loss": 3.5190963745117188,
+      "step": 1950
+    },
+    {
+      "epoch": 2.161014612627516,
+      "grad_norm": 1.6383274793624878,
+      "learning_rate": 9.440490081680281e-05,
+      "loss": 3.5228431701660154,
+      "step": 1960
+    },
+    {
+      "epoch": 2.172043010752688,
+      "grad_norm": 1.7378439903259277,
+      "learning_rate": 9.43465577596266e-05,
+      "loss": 3.520981216430664,
+      "step": 1970
+    },
+    {
+      "epoch": 2.1830714088778604,
+      "grad_norm": 1.941454529762268,
+      "learning_rate": 9.428821470245041e-05,
+      "loss": 3.519342803955078,
+      "step": 1980
+    },
+    {
+      "epoch": 2.194099807003033,
+      "grad_norm": 1.8295516967773438,
+      "learning_rate": 9.422987164527421e-05,
+      "loss": 3.5412979125976562,
+      "step": 1990
+    },
+    {
+      "epoch": 2.2051282051282053,
+      "grad_norm": 1.8052620887756348,
+      "learning_rate": 9.417152858809802e-05,
+      "loss": 3.5153289794921876,
+      "step": 2000
+    },
+    {
+      "epoch": 2.2161566032533773,
+      "grad_norm": 2.1949570178985596,
+      "learning_rate": 9.411318553092183e-05,
+      "loss": 3.521608352661133,
+      "step": 2010
+    },
+    {
+      "epoch": 2.2271850013785497,
+      "grad_norm": 1.746172308921814,
+      "learning_rate": 9.405484247374564e-05,
+      "loss": 3.5008296966552734,
+      "step": 2020
+    },
+    {
+      "epoch": 2.238213399503722,
+      "grad_norm": 2.5374276638031006,
+      "learning_rate": 9.399649941656943e-05,
+      "loss": 3.5140228271484375,
+      "step": 2030
+    },
+    {
+      "epoch": 2.2492417976288945,
+      "grad_norm": 1.7763218879699707,
+      "learning_rate": 9.393815635939324e-05,
+      "loss": 3.510652542114258,
+      "step": 2040
+    },
+    {
+      "epoch": 2.2602701957540665,
+      "grad_norm": 1.6599587202072144,
+      "learning_rate": 9.387981330221705e-05,
+      "loss": 3.5122325897216795,
+      "step": 2050
+    },
+    {
+      "epoch": 2.271298593879239,
+      "grad_norm": 2.1496078968048096,
+      "learning_rate": 9.382147024504085e-05,
+      "loss": 3.5139747619628907,
+      "step": 2060
+    },
+    {
+      "epoch": 2.2823269920044114,
+      "grad_norm": 1.64266836643219,
+      "learning_rate": 9.376312718786465e-05,
+      "loss": 3.507743072509766,
+      "step": 2070
+    },
+    {
+      "epoch": 2.293355390129584,
+      "grad_norm": 2.1241567134857178,
+      "learning_rate": 9.370478413068845e-05,
+      "loss": 3.5162708282470705,
+      "step": 2080
+    },
+    {
+      "epoch": 2.304383788254756,
+      "grad_norm": 1.8391071557998657,
+      "learning_rate": 9.364644107351226e-05,
+      "loss": 3.4955375671386717,
+      "step": 2090
+    },
+    {
+      "epoch": 2.315412186379928,
+      "grad_norm": 2.7478973865509033,
+      "learning_rate": 9.358809801633605e-05,
+      "loss": 3.497519302368164,
+      "step": 2100
+    },
+    {
+      "epoch": 2.3264405845051006,
+      "grad_norm": 1.938588261604309,
+      "learning_rate": 9.352975495915986e-05,
+      "loss": 3.490141677856445,
+      "step": 2110
+    },
+    {
+      "epoch": 2.337468982630273,
+      "grad_norm": 1.5637104511260986,
+      "learning_rate": 9.347141190198366e-05,
+      "loss": 3.499908447265625,
+      "step": 2120
+    },
+    {
+      "epoch": 2.3484973807554455,
+      "grad_norm": 1.882504940032959,
+      "learning_rate": 9.341306884480747e-05,
+      "loss": 3.491979217529297,
+      "step": 2130
+    },
+    {
+      "epoch": 2.3595257788806174,
+      "grad_norm": 1.8528521060943604,
+      "learning_rate": 9.335472578763128e-05,
+      "loss": 3.4961143493652345,
+      "step": 2140
+    },
+    {
+      "epoch": 2.37055417700579,
+      "grad_norm": 1.8050177097320557,
+      "learning_rate": 9.329638273045509e-05,
+      "loss": 3.4948150634765627,
+      "step": 2150
+    },
+    {
+      "epoch": 2.3815825751309623,
+      "grad_norm": 1.816784381866455,
+      "learning_rate": 9.32380396732789e-05,
+      "loss": 3.4910873413085937,
+      "step": 2160
+    },
+    {
+      "epoch": 2.3926109732561347,
+      "grad_norm": 1.9779244661331177,
+      "learning_rate": 9.317969661610269e-05,
+      "loss": 3.492570495605469,
+      "step": 2170
+    },
+    {
+      "epoch": 2.4036393713813067,
+      "grad_norm": 1.8939772844314575,
+      "learning_rate": 9.31213535589265e-05,
+      "loss": 3.473868560791016,
+      "step": 2180
+    },
+    {
+      "epoch": 2.414667769506479,
+      "grad_norm": 2.1493656635284424,
+      "learning_rate": 9.30630105017503e-05,
+      "loss": 3.494515228271484,
+      "step": 2190
+    },
+    {
+      "epoch": 2.4256961676316515,
+      "grad_norm": 1.8989397287368774,
+      "learning_rate": 9.30046674445741e-05,
+      "loss": 3.487537384033203,
+      "step": 2200
+    },
+    {
+      "epoch": 2.436724565756824,
+      "grad_norm": 1.881856918334961,
+      "learning_rate": 9.294632438739791e-05,
+      "loss": 3.475904083251953,
+      "step": 2210
+    },
+    {
+      "epoch": 2.447752963881996,
+      "grad_norm": 1.9463883638381958,
+      "learning_rate": 9.288798133022171e-05,
+      "loss": 3.4829254150390625,
+      "step": 2220
+    },
+    {
+      "epoch": 2.4587813620071683,
+      "grad_norm": 2.01379656791687,
+      "learning_rate": 9.282963827304552e-05,
+      "loss": 3.472850036621094,
+      "step": 2230
+    },
+    {
+      "epoch": 2.4698097601323408,
+      "grad_norm": 2.442741632461548,
+      "learning_rate": 9.277129521586931e-05,
+      "loss": 3.47030029296875,
+      "step": 2240
+    },
+    {
+      "epoch": 2.480838158257513,
+      "grad_norm": 1.5051734447479248,
+      "learning_rate": 9.271295215869312e-05,
+      "loss": 3.489413833618164,
+      "step": 2250
+    },
+    {
+      "epoch": 2.4918665563826856,
+      "grad_norm": 1.9489309787750244,
+      "learning_rate": 9.265460910151692e-05,
+      "loss": 3.464769744873047,
+      "step": 2260
+    },
+    {
+      "epoch": 2.5028949545078576,
+      "grad_norm": 2.319654941558838,
+      "learning_rate": 9.259626604434072e-05,
+      "loss": 3.469140625,
+      "step": 2270
+    },
+    {
+      "epoch": 2.51392335263303,
+      "grad_norm": 1.7984129190444946,
+      "learning_rate": 9.253792298716453e-05,
+      "loss": 3.466594696044922,
+      "step": 2280
+    },
+    {
+      "epoch": 2.5249517507582024,
+      "grad_norm": 1.640869379043579,
+      "learning_rate": 9.247957992998833e-05,
+      "loss": 3.463022994995117,
+      "step": 2290
+    },
+    {
+      "epoch": 2.5359801488833744,
+      "grad_norm": 1.6698195934295654,
+      "learning_rate": 9.242123687281214e-05,
+      "loss": 3.4695220947265626,
+      "step": 2300
+    },
+    {
+      "epoch": 2.547008547008547,
+      "grad_norm": 2.2945683002471924,
+      "learning_rate": 9.236289381563595e-05,
+      "loss": 3.469150924682617,
+      "step": 2310
+    },
+    {
+      "epoch": 2.5580369451337193,
+      "grad_norm": 1.7678370475769043,
+      "learning_rate": 9.230455075845976e-05,
+      "loss": 3.470307159423828,
+      "step": 2320
+    },
+    {
+      "epoch": 2.5690653432588917,
+      "grad_norm": 1.8386255502700806,
+      "learning_rate": 9.224620770128355e-05,
+      "loss": 3.4638832092285154,
+      "step": 2330
+    },
+    {
+      "epoch": 2.580093741384064,
+      "grad_norm": 2.0348527431488037,
+      "learning_rate": 9.218786464410736e-05,
+      "loss": 3.460480880737305,
+      "step": 2340
+    },
+    {
+      "epoch": 2.5911221395092365,
+      "grad_norm": 1.845974326133728,
+      "learning_rate": 9.212952158693116e-05,
+      "loss": 3.4529083251953123,
+      "step": 2350
+    },
+    {
+      "epoch": 2.6021505376344085,
+      "grad_norm": 2.0843095779418945,
+      "learning_rate": 9.207117852975496e-05,
+      "loss": 3.4576786041259764,
+      "step": 2360
+    },
+    {
+      "epoch": 2.613178935759581,
+      "grad_norm": 1.7627031803131104,
+      "learning_rate": 9.201283547257876e-05,
+      "loss": 3.4450752258300783,
+      "step": 2370
+    },
+    {
+      "epoch": 2.6242073338847534,
+      "grad_norm": 1.371972918510437,
+      "learning_rate": 9.195449241540257e-05,
+      "loss": 3.464734649658203,
+      "step": 2380
+    },
+    {
+      "epoch": 2.6352357320099253,
+      "grad_norm": 1.6781940460205078,
+      "learning_rate": 9.189614935822638e-05,
+      "loss": 3.444991683959961,
+      "step": 2390
+    },
+    {
+      "epoch": 2.6462641301350978,
+      "grad_norm": 1.8782585859298706,
+      "learning_rate": 9.183780630105017e-05,
+      "loss": 3.4558509826660155,
+      "step": 2400
+    },
+    {
+      "epoch": 2.65729252826027,
+      "grad_norm": 1.942812204360962,
+      "learning_rate": 9.177946324387398e-05,
+      "loss": 3.4555503845214846,
+      "step": 2410
+    },
+    {
+      "epoch": 2.6683209263854426,
+      "grad_norm": 1.404680609703064,
+      "learning_rate": 9.172112018669778e-05,
+      "loss": 3.438182830810547,
+      "step": 2420
+    },
+    {
+      "epoch": 2.679349324510615,
+      "grad_norm": 1.7656677961349487,
+      "learning_rate": 9.166277712952159e-05,
+      "loss": 3.4622947692871096,
+      "step": 2430
+    },
+    {
+      "epoch": 2.690377722635787,
+      "grad_norm": 1.8348901271820068,
+      "learning_rate": 9.16044340723454e-05,
+      "loss": 3.438182830810547,
+      "step": 2440
+    },
+    {
+      "epoch": 2.7014061207609594,
+      "grad_norm": 2.0641167163848877,
+      "learning_rate": 9.15460910151692e-05,
+      "loss": 3.441473388671875,
+      "step": 2450
+    },
+    {
+      "epoch": 2.712434518886132,
+      "grad_norm": 1.726035475730896,
+      "learning_rate": 9.148774795799301e-05,
+      "loss": 3.441991424560547,
+      "step": 2460
+    },
+    {
+      "epoch": 2.7234629170113043,
+      "grad_norm": 1.854658603668213,
+      "learning_rate": 9.142940490081681e-05,
+      "loss": 3.4441551208496093,
+      "step": 2470
+    },
+    {
+      "epoch": 2.7344913151364763,
+      "grad_norm": 1.8229296207427979,
+      "learning_rate": 9.137106184364062e-05,
+      "loss": 3.441034698486328,
+      "step": 2480
+    },
+    {
+      "epoch": 2.7455197132616487,
+      "grad_norm": 1.6627975702285767,
+      "learning_rate": 9.131271878646441e-05,
+      "loss": 3.4399124145507813,
+      "step": 2490
+    },
+    {
+      "epoch": 2.756548111386821,
+      "grad_norm": 1.4111251831054688,
+      "learning_rate": 9.125437572928822e-05,
+      "loss": 3.4374462127685548,
+      "step": 2500
+    },
+    {
+      "epoch": 2.7675765095119935,
+      "grad_norm": 2.015869379043579,
+      "learning_rate": 9.119603267211202e-05,
+      "loss": 3.4262016296386717,
+      "step": 2510
+    },
+    {
+      "epoch": 2.778604907637166,
+      "grad_norm": 2.2818591594696045,
+      "learning_rate": 9.113768961493583e-05,
+      "loss": 3.446285629272461,
+      "step": 2520
+    },
+    {
+      "epoch": 2.789633305762338,
+      "grad_norm": 1.8643262386322021,
+      "learning_rate": 9.107934655775962e-05,
+      "loss": 3.4362293243408204,
+      "step": 2530
+    },
+    {
+      "epoch": 2.8006617038875103,
+      "grad_norm": 1.248988151550293,
+      "learning_rate": 9.102100350058343e-05,
+      "loss": 3.441702651977539,
+      "step": 2540
+    },
+    {
+      "epoch": 2.8116901020126828,
+      "grad_norm": 1.5247464179992676,
+      "learning_rate": 9.096266044340724e-05,
+      "loss": 3.4388256072998047,
+      "step": 2550
+    },
+    {
+      "epoch": 2.8227185001378547,
+      "grad_norm": 1.9120620489120483,
+      "learning_rate": 9.090431738623103e-05,
+      "loss": 3.4206756591796874,
+      "step": 2560
+    },
+    {
+      "epoch": 2.833746898263027,
+      "grad_norm": 1.4591054916381836,
+      "learning_rate": 9.084597432905484e-05,
+      "loss": 3.4229709625244142,
+      "step": 2570
+    },
+    {
+      "epoch": 2.8447752963881996,
+      "grad_norm": 2.24849796295166,
+      "learning_rate": 9.078763127187865e-05,
+      "loss": 3.426911163330078,
+      "step": 2580
+    },
+    {
+      "epoch": 2.855803694513372,
+      "grad_norm": 1.5658804178237915,
+      "learning_rate": 9.072928821470246e-05,
+      "loss": 3.445120620727539,
+      "step": 2590
+    },
+    {
+      "epoch": 2.8668320926385444,
+      "grad_norm": 1.483583688735962,
+      "learning_rate": 9.067094515752626e-05,
+      "loss": 3.430312728881836,
+      "step": 2600
+    },
+    {
+      "epoch": 2.8778604907637164,
+      "grad_norm": 1.5759658813476562,
+      "learning_rate": 9.061260210035007e-05,
+      "loss": 3.4178386688232423,
+      "step": 2610
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": 1.9259848594665527,
+      "learning_rate": 9.055425904317386e-05,
+      "loss": 3.430949401855469,
+      "step": 2620
+    },
+    {
+      "epoch": 2.8999172870140613,
+      "grad_norm": 1.470717191696167,
+      "learning_rate": 9.049591598599767e-05,
+      "loss": 3.439757537841797,
+      "step": 2630
+    },
+    {
+      "epoch": 2.9109456851392337,
+      "grad_norm": 1.8934212923049927,
+      "learning_rate": 9.043757292882148e-05,
+      "loss": 3.430719757080078,
+      "step": 2640
+    },
+    {
+      "epoch": 2.9219740832644057,
+      "grad_norm": 1.6267489194869995,
+      "learning_rate": 9.037922987164527e-05,
+      "loss": 3.4224998474121096,
+      "step": 2650
+    },
+    {
+      "epoch": 2.933002481389578,
+      "grad_norm": 1.6213353872299194,
+      "learning_rate": 9.032088681446908e-05,
+      "loss": 3.4213233947753907,
+      "step": 2660
+    },
+    {
+      "epoch": 2.9440308795147505,
+      "grad_norm": 1.961879849433899,
+      "learning_rate": 9.026254375729288e-05,
+      "loss": 3.4108352661132812,
+      "step": 2670
+    },
+    {
+      "epoch": 2.955059277639923,
+      "grad_norm": 1.7363910675048828,
+      "learning_rate": 9.020420070011669e-05,
+      "loss": 3.423554229736328,
+      "step": 2680
+    },
+    {
+      "epoch": 2.9660876757650954,
+      "grad_norm": 1.6161952018737793,
+      "learning_rate": 9.014585764294048e-05,
+      "loss": 3.418962860107422,
+      "step": 2690
+    },
+    {
+      "epoch": 2.9771160738902673,
+      "grad_norm": 1.8065682649612427,
+      "learning_rate": 9.008751458576429e-05,
+      "loss": 3.4218765258789063,
+      "step": 2700
+    },
+    {
+      "epoch": 2.9881444720154398,
+      "grad_norm": 1.4285337924957275,
+      "learning_rate": 9.00291715285881e-05,
+      "loss": 3.413957214355469,
+      "step": 2710
+    },
+    {
+      "epoch": 2.999172870140612,
+      "grad_norm": 1.30274498462677,
+      "learning_rate": 8.997082847141191e-05,
+      "loss": 3.4176124572753905,
+      "step": 2720
+    },
+    {
+      "epoch": 3.009925558312655,
+      "grad_norm": 1.5460416078567505,
+      "learning_rate": 8.991248541423572e-05,
+      "loss": 3.388013458251953,
+      "step": 2730
+    },
+    {
+      "epoch": 3.0209539564378276,
+      "grad_norm": 1.5832446813583374,
+      "learning_rate": 8.985414235705951e-05,
+      "loss": 3.3929378509521486,
+      "step": 2740
+    },
+    {
+      "epoch": 3.0319823545629996,
+      "grad_norm": 1.6086630821228027,
+      "learning_rate": 8.979579929988332e-05,
+      "loss": 3.3940502166748048,
+      "step": 2750
+    },
+    {
+      "epoch": 3.043010752688172,
+      "grad_norm": 1.6624842882156372,
+      "learning_rate": 8.973745624270712e-05,
+      "loss": 3.388884353637695,
+      "step": 2760
+    },
+    {
+      "epoch": 3.0540391508133444,
+      "grad_norm": 1.7352933883666992,
+      "learning_rate": 8.967911318553093e-05,
+      "loss": 3.409127426147461,
+      "step": 2770
+    },
+    {
+      "epoch": 3.065067548938517,
+      "grad_norm": 1.45657217502594,
+      "learning_rate": 8.962077012835472e-05,
+      "loss": 3.389351654052734,
+      "step": 2780
+    },
+    {
+      "epoch": 3.076095947063689,
+      "grad_norm": 1.4969090223312378,
+      "learning_rate": 8.956242707117853e-05,
+      "loss": 3.3988433837890626,
+      "step": 2790
+    },
+    {
+      "epoch": 3.0871243451888613,
+      "grad_norm": 1.710800051689148,
+      "learning_rate": 8.950408401400234e-05,
+      "loss": 3.395826721191406,
+      "step": 2800
+    },
+    {
+      "epoch": 3.0981527433140337,
+      "grad_norm": 1.6347870826721191,
+      "learning_rate": 8.944574095682614e-05,
+      "loss": 3.391011047363281,
+      "step": 2810
+    },
+    {
+      "epoch": 3.109181141439206,
+      "grad_norm": 1.4630122184753418,
+      "learning_rate": 8.938739789964995e-05,
+      "loss": 3.401841735839844,
+      "step": 2820
+    },
+    {
+      "epoch": 3.120209539564378,
+      "grad_norm": 1.547430157661438,
+      "learning_rate": 8.932905484247374e-05,
+      "loss": 3.3979782104492187,
+      "step": 2830
+    },
+    {
+      "epoch": 3.1312379376895505,
+      "grad_norm": 1.5614186525344849,
+      "learning_rate": 8.927071178529755e-05,
+      "loss": 3.3884544372558594,
+      "step": 2840
+    },
+    {
+      "epoch": 3.142266335814723,
+      "grad_norm": 1.4073251485824585,
+      "learning_rate": 8.921236872812136e-05,
+      "loss": 3.3886154174804686,
+      "step": 2850
+    },
+    {
+      "epoch": 3.1532947339398953,
+      "grad_norm": 1.3639475107192993,
+      "learning_rate": 8.915402567094517e-05,
+      "loss": 3.383074951171875,
+      "step": 2860
+    },
+    {
+      "epoch": 3.1643231320650678,
+      "grad_norm": 2.3929882049560547,
+      "learning_rate": 8.909568261376896e-05,
+      "loss": 3.3788246154785155,
+      "step": 2870
+    },
+    {
+      "epoch": 3.1753515301902397,
+      "grad_norm": 1.7196829319000244,
+      "learning_rate": 8.903733955659277e-05,
+      "loss": 3.3822708129882812,
+      "step": 2880
+    },
+    {
+      "epoch": 3.186379928315412,
+      "grad_norm": 1.526293396949768,
+      "learning_rate": 8.897899649941658e-05,
+      "loss": 3.381543731689453,
+      "step": 2890
+    },
+    {
+      "epoch": 3.1974083264405846,
+      "grad_norm": 1.2336128950119019,
+      "learning_rate": 8.892065344224038e-05,
+      "loss": 3.3975807189941407,
+      "step": 2900
+    },
+    {
+      "epoch": 3.208436724565757,
+      "grad_norm": 1.4868130683898926,
+      "learning_rate": 8.886231038506419e-05,
+      "loss": 3.3970687866210936,
+      "step": 2910
+    },
+    {
+      "epoch": 3.219465122690929,
+      "grad_norm": 1.5349540710449219,
+      "learning_rate": 8.880396732788798e-05,
+      "loss": 3.385994720458984,
+      "step": 2920
+    },
+    {
+      "epoch": 3.2304935208161014,
+      "grad_norm": 1.5333718061447144,
+      "learning_rate": 8.874562427071179e-05,
+      "loss": 3.362841796875,
+      "step": 2930
+    },
+    {
+      "epoch": 3.241521918941274,
+      "grad_norm": 1.514235258102417,
+      "learning_rate": 8.868728121353558e-05,
+      "loss": 3.3816680908203125,
+      "step": 2940
+    },
+    {
+      "epoch": 3.2525503170664463,
+      "grad_norm": 1.5870161056518555,
+      "learning_rate": 8.86289381563594e-05,
+      "loss": 3.3818199157714846,
+      "step": 2950
+    },
+    {
+      "epoch": 3.2635787151916182,
+      "grad_norm": 1.6295320987701416,
+      "learning_rate": 8.85705950991832e-05,
+      "loss": 3.379594421386719,
+      "step": 2960
+    },
+    {
+      "epoch": 3.2746071133167907,
+      "grad_norm": 1.533991813659668,
+      "learning_rate": 8.8512252042007e-05,
+      "loss": 3.387801742553711,
+      "step": 2970
+    },
+    {
+      "epoch": 3.285635511441963,
+      "grad_norm": 2.2125084400177,
+      "learning_rate": 8.845390898483081e-05,
+      "loss": 3.3856468200683594,
+      "step": 2980
+    },
+    {
+      "epoch": 3.2966639095671355,
+      "grad_norm": 1.800207495689392,
+      "learning_rate": 8.839556592765462e-05,
+      "loss": 3.3843597412109374,
+      "step": 2990
+    },
+    {
+      "epoch": 3.3076923076923075,
+      "grad_norm": 1.3071027994155884,
+      "learning_rate": 8.833722287047842e-05,
+      "loss": 3.3861888885498046,
+      "step": 3000
+    },
+    {
+      "epoch": 3.31872070581748,
+      "grad_norm": 1.7724641561508179,
+      "learning_rate": 8.827887981330222e-05,
+      "loss": 3.3929458618164063,
+      "step": 3010
+    },
+    {
+      "epoch": 3.3297491039426523,
+      "grad_norm": 1.3397877216339111,
+      "learning_rate": 8.822053675612603e-05,
+      "loss": 3.3785301208496095,
+      "step": 3020
+    },
+    {
+      "epoch": 3.3407775020678248,
+      "grad_norm": 1.352630376815796,
+      "learning_rate": 8.816219369894982e-05,
+      "loss": 3.3796306610107423,
+      "step": 3030
+    },
+    {
+      "epoch": 3.351805900192997,
+      "grad_norm": 1.5996475219726562,
+      "learning_rate": 8.810385064177363e-05,
+      "loss": 3.362406921386719,
+      "step": 3040
+    },
+    {
+      "epoch": 3.362834298318169,
+      "grad_norm": 1.6010814905166626,
+      "learning_rate": 8.804550758459744e-05,
+      "loss": 3.3811767578125,
+      "step": 3050
+    },
+    {
+      "epoch": 3.3738626964433416,
+      "grad_norm": 1.3276373147964478,
+      "learning_rate": 8.798716452742124e-05,
+      "loss": 3.3732643127441406,
+      "step": 3060
+    },
+    {
+      "epoch": 3.384891094568514,
+      "grad_norm": 1.7741515636444092,
+      "learning_rate": 8.792882147024505e-05,
+      "loss": 3.381968688964844,
+      "step": 3070
+    },
+    {
+      "epoch": 3.3959194926936864,
+      "grad_norm": 1.7820576429367065,
+      "learning_rate": 8.787047841306884e-05,
+      "loss": 3.358811950683594,
+      "step": 3080
+    },
+    {
+      "epoch": 3.4069478908188584,
+      "grad_norm": 1.389573574066162,
+      "learning_rate": 8.781213535589265e-05,
+      "loss": 3.36102180480957,
+      "step": 3090
+    },
+    {
+      "epoch": 3.417976288944031,
+      "grad_norm": 1.1910648345947266,
+      "learning_rate": 8.775379229871645e-05,
+      "loss": 3.3652645111083985,
+      "step": 3100
+    },
+    {
+      "epoch": 3.4290046870692032,
+      "grad_norm": 1.965219497680664,
+      "learning_rate": 8.769544924154026e-05,
+      "loss": 3.3735313415527344,
+      "step": 3110
+    },
+    {
+      "epoch": 3.4400330851943757,
+      "grad_norm": 1.5992330312728882,
+      "learning_rate": 8.763710618436406e-05,
+      "loss": 3.362974166870117,
+      "step": 3120
+    },
+    {
+      "epoch": 3.4510614833195477,
+      "grad_norm": 2.2293193340301514,
+      "learning_rate": 8.757876312718787e-05,
+      "loss": 3.3681709289550783,
+      "step": 3130
+    },
+    {
+      "epoch": 3.46208988144472,
+      "grad_norm": 1.2978801727294922,
+      "learning_rate": 8.752042007001168e-05,
+      "loss": 3.3776336669921876,
+      "step": 3140
+    },
+    {
+      "epoch": 3.4731182795698925,
+      "grad_norm": 1.227036714553833,
+      "learning_rate": 8.746207701283548e-05,
+      "loss": 3.3590301513671874,
+      "step": 3150
+    },
+    {
+      "epoch": 3.484146677695065,
+      "grad_norm": 1.8023360967636108,
+      "learning_rate": 8.740373395565929e-05,
+      "loss": 3.35421142578125,
+      "step": 3160
+    },
+    {
+      "epoch": 3.495175075820237,
+      "grad_norm": 1.6423453092575073,
+      "learning_rate": 8.734539089848308e-05,
+      "loss": 3.3748985290527345,
+      "step": 3170
+    },
+    {
+      "epoch": 3.5062034739454093,
+      "grad_norm": 1.3261916637420654,
+      "learning_rate": 8.728704784130689e-05,
+      "loss": 3.36380615234375,
+      "step": 3180
+    },
+    {
+      "epoch": 3.5172318720705817,
+      "grad_norm": 1.290014624595642,
+      "learning_rate": 8.722870478413069e-05,
+      "loss": 3.3596282958984376,
+      "step": 3190
+    },
+    {
+      "epoch": 3.528260270195754,
+      "grad_norm": 2.0481576919555664,
+      "learning_rate": 8.71703617269545e-05,
+      "loss": 3.358118438720703,
+      "step": 3200
+    },
+    {
+      "epoch": 3.5392886683209266,
+      "grad_norm": 1.4758331775665283,
+      "learning_rate": 8.71120186697783e-05,
+      "loss": 3.3536834716796875,
+      "step": 3210
+    },
+    {
+      "epoch": 3.5503170664460986,
+      "grad_norm": 1.4340440034866333,
+      "learning_rate": 8.70536756126021e-05,
+      "loss": 3.358259582519531,
+      "step": 3220
+    },
+    {
+      "epoch": 3.561345464571271,
+      "grad_norm": 1.6952699422836304,
+      "learning_rate": 8.699533255542591e-05,
+      "loss": 3.3730777740478515,
+      "step": 3230
+    },
+    {
+      "epoch": 3.5723738626964434,
+      "grad_norm": 1.9069234132766724,
+      "learning_rate": 8.69369894982497e-05,
+      "loss": 3.3552001953125,
+      "step": 3240
+    },
+    {
+      "epoch": 3.5834022608216154,
+      "grad_norm": 1.6194590330123901,
+      "learning_rate": 8.687864644107351e-05,
+      "loss": 3.3562744140625,
+      "step": 3250
+    },
+    {
+      "epoch": 3.594430658946788,
+      "grad_norm": 1.33975350856781,
+      "learning_rate": 8.682030338389732e-05,
+      "loss": 3.3622581481933596,
+      "step": 3260
+    },
+    {
+      "epoch": 3.6054590570719602,
+      "grad_norm": 1.3948160409927368,
+      "learning_rate": 8.676196032672113e-05,
+      "loss": 3.3645614624023437,
+      "step": 3270
+    },
+    {
+      "epoch": 3.6164874551971327,
+      "grad_norm": 1.4972363710403442,
+      "learning_rate": 8.670361726954493e-05,
+      "loss": 3.3713829040527346,
+      "step": 3280
+    },
+    {
+      "epoch": 3.627515853322305,
+      "grad_norm": 1.9456968307495117,
+      "learning_rate": 8.664527421236874e-05,
+      "loss": 3.3617935180664062,
+      "step": 3290
+    },
+    {
+      "epoch": 3.6385442514474775,
+      "grad_norm": 1.8050702810287476,
+      "learning_rate": 8.658693115519254e-05,
+      "loss": 3.359496307373047,
+      "step": 3300
+    },
+    {
+      "epoch": 3.6495726495726495,
+      "grad_norm": 1.294492244720459,
+      "learning_rate": 8.652858809801634e-05,
+      "loss": 3.361173629760742,
+      "step": 3310
+    },
+    {
+      "epoch": 3.660601047697822,
+      "grad_norm": 1.7897614240646362,
+      "learning_rate": 8.647024504084015e-05,
+      "loss": 3.3475852966308595,
+      "step": 3320
+    },
+    {
+      "epoch": 3.6716294458229943,
+      "grad_norm": 1.5647767782211304,
+      "learning_rate": 8.641190198366394e-05,
+      "loss": 3.3594207763671875,
+      "step": 3330
+    },
+    {
+      "epoch": 3.6826578439481663,
+      "grad_norm": 1.3839472532272339,
+      "learning_rate": 8.635355892648775e-05,
+      "loss": 3.361709976196289,
+      "step": 3340
+    },
+    {
+      "epoch": 3.6936862420733387,
+      "grad_norm": 1.543115258216858,
+      "learning_rate": 8.629521586931155e-05,
+      "loss": 3.349272918701172,
+      "step": 3350
+    },
+    {
+      "epoch": 3.704714640198511,
+      "grad_norm": 1.2722103595733643,
+      "learning_rate": 8.623687281213536e-05,
+      "loss": 3.3600040435791017,
+      "step": 3360
+    },
+    {
+      "epoch": 3.7157430383236836,
+      "grad_norm": 2.396493434906006,
+      "learning_rate": 8.617852975495915e-05,
+      "loss": 3.359762954711914,
+      "step": 3370
+    },
+    {
+      "epoch": 3.726771436448856,
+      "grad_norm": 1.3756037950515747,
+      "learning_rate": 8.612018669778296e-05,
+      "loss": 3.3409027099609374,
+      "step": 3380
+    },
+    {
+      "epoch": 3.737799834574028,
+      "grad_norm": 1.5124824047088623,
+      "learning_rate": 8.606184364060677e-05,
+      "loss": 3.346342849731445,
+      "step": 3390
+    },
+    {
+      "epoch": 3.7488282326992004,
+      "grad_norm": 1.3679585456848145,
+      "learning_rate": 8.600350058343058e-05,
+      "loss": 3.3478328704833986,
+      "step": 3400
+    },
+    {
+      "epoch": 3.759856630824373,
+      "grad_norm": 1.3470197916030884,
+      "learning_rate": 8.594515752625439e-05,
+      "loss": 3.352674865722656,
+      "step": 3410
+    },
+    {
+      "epoch": 3.770885028949545,
+      "grad_norm": 1.4775781631469727,
+      "learning_rate": 8.588681446907818e-05,
+      "loss": 3.3504791259765625,
+      "step": 3420
+    },
+    {
+      "epoch": 3.7819134270747172,
+      "grad_norm": 1.1987943649291992,
+      "learning_rate": 8.582847141190199e-05,
+      "loss": 3.3457687377929686,
+      "step": 3430
+    },
+    {
+      "epoch": 3.7929418251998896,
+      "grad_norm": 1.8007314205169678,
+      "learning_rate": 8.577012835472579e-05,
+      "loss": 3.3557716369628907,
+      "step": 3440
+    },
+    {
+      "epoch": 3.803970223325062,
+      "grad_norm": 1.4193800687789917,
+      "learning_rate": 8.57117852975496e-05,
+      "loss": 3.346666717529297,
+      "step": 3450
+    },
+    {
+      "epoch": 3.8149986214502345,
+      "grad_norm": 1.600216031074524,
+      "learning_rate": 8.56534422403734e-05,
+      "loss": 3.354322814941406,
+      "step": 3460
+    },
+    {
+      "epoch": 3.826027019575407,
+      "grad_norm": 1.6823015213012695,
+      "learning_rate": 8.55950991831972e-05,
+      "loss": 3.3344764709472656,
+      "step": 3470
+    },
+    {
+      "epoch": 3.837055417700579,
+      "grad_norm": 1.8002822399139404,
+      "learning_rate": 8.553675612602101e-05,
+      "loss": 3.338224411010742,
+      "step": 3480
+    },
+    {
+      "epoch": 3.8480838158257513,
+      "grad_norm": 1.019519567489624,
+      "learning_rate": 8.54784130688448e-05,
+      "loss": 3.342393493652344,
+      "step": 3490
+    },
+    {
+      "epoch": 3.8591122139509237,
+      "grad_norm": 1.4397176504135132,
+      "learning_rate": 8.542007001166861e-05,
+      "loss": 3.3416332244873046,
+      "step": 3500
+    },
+    {
+      "epoch": 3.8701406120760957,
+      "grad_norm": 1.398215889930725,
+      "learning_rate": 8.536172695449241e-05,
+      "loss": 3.3455711364746095,
+      "step": 3510
+    },
+    {
+      "epoch": 3.881169010201268,
+      "grad_norm": 1.431221604347229,
+      "learning_rate": 8.530338389731622e-05,
+      "loss": 3.3510116577148437,
+      "step": 3520
+    },
+    {
+      "epoch": 3.8921974083264406,
+      "grad_norm": 1.2339868545532227,
+      "learning_rate": 8.524504084014003e-05,
+      "loss": 3.333365631103516,
+      "step": 3530
+    },
+    {
+      "epoch": 3.903225806451613,
+      "grad_norm": 1.2564575672149658,
+      "learning_rate": 8.518669778296384e-05,
+      "loss": 3.355131912231445,
+      "step": 3540
+    },
+    {
+      "epoch": 3.9142542045767854,
+      "grad_norm": 1.44709050655365,
+      "learning_rate": 8.512835472578765e-05,
+      "loss": 3.352345275878906,
+      "step": 3550
+    },
+    {
+      "epoch": 3.9252826027019574,
+      "grad_norm": 1.0984286069869995,
+      "learning_rate": 8.507001166861144e-05,
+      "loss": 3.3399391174316406,
+      "step": 3560
+    },
+    {
+      "epoch": 3.93631100082713,
+      "grad_norm": 1.521567702293396,
+      "learning_rate": 8.501166861143525e-05,
+      "loss": 3.3333946228027345,
+      "step": 3570
+    },
+    {
+      "epoch": 3.9473393989523022,
+      "grad_norm": 1.3443926572799683,
+      "learning_rate": 8.495332555425905e-05,
+      "loss": 3.3321746826171874,
+      "step": 3580
+    },
+    {
+      "epoch": 3.9583677970774747,
+      "grad_norm": 1.539640188217163,
+      "learning_rate": 8.489498249708285e-05,
+      "loss": 3.335438537597656,
+      "step": 3590
+    },
+    {
+      "epoch": 3.9693961952026466,
+      "grad_norm": 1.123307466506958,
+      "learning_rate": 8.483663943990665e-05,
+      "loss": 3.3397190093994142,
+      "step": 3600
+    },
+    {
+      "epoch": 3.980424593327819,
+      "grad_norm": 1.6037691831588745,
+      "learning_rate": 8.477829638273046e-05,
+      "loss": 3.3357570648193358,
+      "step": 3610
+    },
+    {
+      "epoch": 3.9914529914529915,
+      "grad_norm": 1.6570971012115479,
+      "learning_rate": 8.471995332555425e-05,
+      "loss": 3.341298294067383,
+      "step": 3620
+    },
+    {
+      "epoch": 4.0022056796250345,
+      "grad_norm": 1.4301789999008179,
+      "learning_rate": 8.466161026837806e-05,
+      "loss": 3.3353721618652346,
+      "step": 3630
+    },
+    {
+      "epoch": 4.013234077750207,
+      "grad_norm": 1.539963722229004,
+      "learning_rate": 8.460326721120187e-05,
+      "loss": 3.3291671752929686,
+      "step": 3640
+    },
+    {
+      "epoch": 4.024262475875379,
+      "grad_norm": 1.5195462703704834,
+      "learning_rate": 8.454492415402567e-05,
+      "loss": 3.3193031311035157,
+      "step": 3650
+    },
+    {
+      "epoch": 4.035290874000552,
+      "grad_norm": 1.423514485359192,
+      "learning_rate": 8.448658109684948e-05,
+      "loss": 3.316299057006836,
+      "step": 3660
+    },
+    {
+      "epoch": 4.046319272125724,
+      "grad_norm": 1.4557220935821533,
+      "learning_rate": 8.442823803967328e-05,
+      "loss": 3.310700607299805,
+      "step": 3670
+    },
+    {
+      "epoch": 4.057347670250896,
+      "grad_norm": 1.6277695894241333,
+      "learning_rate": 8.43698949824971e-05,
+      "loss": 3.3296432495117188,
+      "step": 3680
+    },
+    {
+      "epoch": 4.068376068376068,
+      "grad_norm": 1.4026418924331665,
+      "learning_rate": 8.431155192532089e-05,
+      "loss": 3.316411590576172,
+      "step": 3690
+    },
+    {
+      "epoch": 4.079404466501241,
+      "grad_norm": 1.3620136976242065,
+      "learning_rate": 8.42532088681447e-05,
+      "loss": 3.3130718231201173,
+      "step": 3700
+    },
+    {
+      "epoch": 4.090432864626413,
+      "grad_norm": 1.4140877723693848,
+      "learning_rate": 8.419486581096851e-05,
+      "loss": 3.321166229248047,
+      "step": 3710
+    },
+    {
+      "epoch": 4.101461262751585,
+      "grad_norm": 1.3145273923873901,
+      "learning_rate": 8.41365227537923e-05,
+      "loss": 3.3267845153808593,
+      "step": 3720
+    },
+    {
+      "epoch": 4.112489660876758,
+      "grad_norm": 1.1830849647521973,
+      "learning_rate": 8.407817969661611e-05,
+      "loss": 3.315142059326172,
+      "step": 3730
+    },
+    {
+      "epoch": 4.12351805900193,
+      "grad_norm": 1.4326401948928833,
+      "learning_rate": 8.401983663943991e-05,
+      "loss": 3.313446807861328,
+      "step": 3740
+    },
+    {
+      "epoch": 4.134546457127103,
+      "grad_norm": 1.2179306745529175,
+      "learning_rate": 8.396149358226372e-05,
+      "loss": 3.3100254058837892,
+      "step": 3750
+    },
+    {
+      "epoch": 4.145574855252274,
+      "grad_norm": 1.3347259759902954,
+      "learning_rate": 8.390315052508751e-05,
+      "loss": 3.3180007934570312,
+      "step": 3760
+    },
+    {
+      "epoch": 4.156603253377447,
+      "grad_norm": 1.4468998908996582,
+      "learning_rate": 8.384480746791132e-05,
+      "loss": 3.307207489013672,
+      "step": 3770
+    },
+    {
+      "epoch": 4.167631651502619,
+      "grad_norm": 1.5258162021636963,
+      "learning_rate": 8.378646441073512e-05,
+      "loss": 3.3205909729003906,
+      "step": 3780
+    },
+    {
+      "epoch": 4.1786600496277915,
+      "grad_norm": 1.4104669094085693,
+      "learning_rate": 8.372812135355892e-05,
+      "loss": 3.309407424926758,
+      "step": 3790
+    },
+    {
+      "epoch": 4.189688447752964,
+      "grad_norm": 1.4369711875915527,
+      "learning_rate": 8.366977829638273e-05,
+      "loss": 3.3081008911132814,
+      "step": 3800
+    },
+    {
+      "epoch": 4.200716845878136,
+      "grad_norm": 1.2004350423812866,
+      "learning_rate": 8.361143523920654e-05,
+      "loss": 3.3130638122558596,
+      "step": 3810
+    },
+    {
+      "epoch": 4.211745244003309,
+      "grad_norm": 1.2577087879180908,
+      "learning_rate": 8.355309218203035e-05,
+      "loss": 3.312338638305664,
+      "step": 3820
+    },
+    {
+      "epoch": 4.222773642128481,
+      "grad_norm": 1.3649225234985352,
+      "learning_rate": 8.349474912485415e-05,
+      "loss": 3.323046875,
+      "step": 3830
+    },
+    {
+      "epoch": 4.233802040253654,
+      "grad_norm": 1.3110648393630981,
+      "learning_rate": 8.343640606767796e-05,
+      "loss": 3.3168025970458985,
+      "step": 3840
+    },
+    {
+      "epoch": 4.244830438378825,
+      "grad_norm": 1.493674635887146,
+      "learning_rate": 8.337806301050175e-05,
+      "loss": 3.320719909667969,
+      "step": 3850
+    },
+    {
+      "epoch": 4.2558588365039975,
+      "grad_norm": 1.283460259437561,
+      "learning_rate": 8.331971995332556e-05,
+      "loss": 3.3127769470214843,
+      "step": 3860
+    },
+    {
+      "epoch": 4.26688723462917,
+      "grad_norm": 1.4842219352722168,
+      "learning_rate": 8.326137689614936e-05,
+      "loss": 3.3042266845703123,
+      "step": 3870
+    },
+    {
+      "epoch": 4.277915632754342,
+      "grad_norm": 1.1820423603057861,
+      "learning_rate": 8.320303383897316e-05,
+      "loss": 3.3116954803466796,
+      "step": 3880
+    },
+    {
+      "epoch": 4.288944030879515,
+      "grad_norm": 1.5040090084075928,
+      "learning_rate": 8.314469078179697e-05,
+      "loss": 3.310440444946289,
+      "step": 3890
+    },
+    {
+      "epoch": 4.299972429004687,
+      "grad_norm": 1.1614471673965454,
+      "learning_rate": 8.308634772462077e-05,
+      "loss": 3.3075687408447267,
+      "step": 3900
+    },
+    {
+      "epoch": 4.31100082712986,
+      "grad_norm": 1.5577434301376343,
+      "learning_rate": 8.302800466744458e-05,
+      "loss": 3.312149429321289,
+      "step": 3910
+    },
+    {
+      "epoch": 4.322029225255032,
+      "grad_norm": 1.6462024450302124,
+      "learning_rate": 8.296966161026837e-05,
+      "loss": 3.321173095703125,
+      "step": 3920
+    },
+    {
+      "epoch": 4.333057623380204,
+      "grad_norm": 1.302138090133667,
+      "learning_rate": 8.291131855309218e-05,
+      "loss": 3.3210208892822264,
+      "step": 3930
+    },
+    {
+      "epoch": 4.344086021505376,
+      "grad_norm": 1.6717387437820435,
+      "learning_rate": 8.285297549591599e-05,
+      "loss": 3.3135406494140627,
+      "step": 3940
+    },
+    {
+      "epoch": 4.3551144196305485,
+      "grad_norm": 1.5899906158447266,
+      "learning_rate": 8.27946324387398e-05,
+      "loss": 3.31378059387207,
+      "step": 3950
+    },
+    {
+      "epoch": 4.366142817755721,
+      "grad_norm": 1.2071844339370728,
+      "learning_rate": 8.273628938156361e-05,
+      "loss": 3.3018829345703127,
+      "step": 3960
+    },
+    {
+      "epoch": 4.377171215880893,
+      "grad_norm": 1.8953418731689453,
+      "learning_rate": 8.26779463243874e-05,
+      "loss": 3.3119953155517576,
+      "step": 3970
+    },
+    {
+      "epoch": 4.388199614006066,
+      "grad_norm": 1.7741807699203491,
+      "learning_rate": 8.261960326721121e-05,
+      "loss": 3.3027114868164062,
+      "step": 3980
+    },
+    {
+      "epoch": 4.399228012131238,
+      "grad_norm": 1.3921217918395996,
+      "learning_rate": 8.256126021003501e-05,
+      "loss": 3.317920684814453,
+      "step": 3990
+    },
+    {
+      "epoch": 4.410256410256411,
+      "grad_norm": 1.1690531969070435,
+      "learning_rate": 8.250291715285882e-05,
+      "loss": 3.29705810546875,
+      "step": 4000
+    },
+    {
+      "epoch": 4.421284808381582,
+      "grad_norm": 1.3882209062576294,
+      "learning_rate": 8.244457409568261e-05,
+      "loss": 3.304886245727539,
+      "step": 4010
+    },
+    {
+      "epoch": 4.4323132065067545,
+      "grad_norm": 2.1946423053741455,
+      "learning_rate": 8.238623103850642e-05,
+      "loss": 3.3152816772460936,
+      "step": 4020
+    },
+    {
+      "epoch": 4.443341604631927,
+      "grad_norm": 1.517082929611206,
+      "learning_rate": 8.232788798133022e-05,
+      "loss": 3.3114837646484374,
+      "step": 4030
+    },
+    {
+      "epoch": 4.454370002757099,
+      "grad_norm": 1.2431399822235107,
+      "learning_rate": 8.226954492415403e-05,
+      "loss": 3.306407165527344,
+      "step": 4040
+    },
+    {
+      "epoch": 4.465398400882272,
+      "grad_norm": 1.5142467021942139,
+      "learning_rate": 8.221120186697783e-05,
+      "loss": 3.3055789947509764,
+      "step": 4050
+    },
+    {
+      "epoch": 4.476426799007444,
+      "grad_norm": 1.1361483335494995,
+      "learning_rate": 8.215285880980163e-05,
+      "loss": 3.3048805236816405,
+      "step": 4060
+    },
+    {
+      "epoch": 4.487455197132617,
+      "grad_norm": 1.1522105932235718,
+      "learning_rate": 8.209451575262544e-05,
+      "loss": 3.2948539733886717,
+      "step": 4070
+    },
+    {
+      "epoch": 4.498483595257789,
+      "grad_norm": 1.1002084016799927,
+      "learning_rate": 8.203617269544925e-05,
+      "loss": 3.306837463378906,
+      "step": 4080
+    },
+    {
+      "epoch": 4.5095119933829615,
+      "grad_norm": 1.4114456176757812,
+      "learning_rate": 8.197782963827306e-05,
+      "loss": 3.3008705139160157,
+      "step": 4090
+    },
+    {
+      "epoch": 4.520540391508133,
+      "grad_norm": 1.3177834749221802,
+      "learning_rate": 8.191948658109685e-05,
+      "loss": 3.3015769958496093,
+      "step": 4100
+    },
+    {
+      "epoch": 4.5315687896333054,
+      "grad_norm": 1.2859690189361572,
+      "learning_rate": 8.186114352392066e-05,
+      "loss": 3.3012962341308594,
+      "step": 4110
+    },
+    {
+      "epoch": 4.542597187758478,
+      "grad_norm": 1.149977445602417,
+      "learning_rate": 8.180280046674446e-05,
+      "loss": 3.2883323669433593,
+      "step": 4120
+    },
+    {
+      "epoch": 4.55362558588365,
+      "grad_norm": 1.1980609893798828,
+      "learning_rate": 8.174445740956827e-05,
+      "loss": 3.305525207519531,
+      "step": 4130
+    },
+    {
+      "epoch": 4.564653984008823,
+      "grad_norm": 1.2316346168518066,
+      "learning_rate": 8.168611435239207e-05,
+      "loss": 3.296540069580078,
+      "step": 4140
+    },
+    {
+      "epoch": 4.575682382133995,
+      "grad_norm": 1.456752896308899,
+      "learning_rate": 8.162777129521587e-05,
+      "loss": 3.301634979248047,
+      "step": 4150
+    },
+    {
+      "epoch": 4.586710780259168,
+      "grad_norm": 1.5025802850723267,
+      "learning_rate": 8.156942823803968e-05,
+      "loss": 3.303561782836914,
+      "step": 4160
+    },
+    {
+      "epoch": 4.59773917838434,
+      "grad_norm": 1.3021212816238403,
+      "learning_rate": 8.151108518086347e-05,
+      "loss": 3.302195739746094,
+      "step": 4170
+    },
+    {
+      "epoch": 4.608767576509512,
+      "grad_norm": 1.758484125137329,
+      "learning_rate": 8.145274212368728e-05,
+      "loss": 3.298839569091797,
+      "step": 4180
+    },
+    {
+      "epoch": 4.619795974634684,
+      "grad_norm": 1.034860372543335,
+      "learning_rate": 8.139439906651108e-05,
+      "loss": 3.308152770996094,
+      "step": 4190
+    },
+    {
+      "epoch": 4.630824372759856,
+      "grad_norm": 1.233070969581604,
+      "learning_rate": 8.133605600933489e-05,
+      "loss": 3.297984313964844,
+      "step": 4200
+    },
+    {
+      "epoch": 4.641852770885029,
+      "grad_norm": 1.7277765274047852,
+      "learning_rate": 8.12777129521587e-05,
+      "loss": 3.3045902252197266,
+      "step": 4210
+    },
+    {
+      "epoch": 4.652881169010201,
+      "grad_norm": 1.2869057655334473,
+      "learning_rate": 8.12193698949825e-05,
+      "loss": 3.3063819885253904,
+      "step": 4220
+    },
+    {
+      "epoch": 4.663909567135374,
+      "grad_norm": 1.1411103010177612,
+      "learning_rate": 8.116102683780631e-05,
+      "loss": 3.2905479431152345,
+      "step": 4230
+    },
+    {
+      "epoch": 4.674937965260546,
+      "grad_norm": 1.342445969581604,
+      "learning_rate": 8.110268378063011e-05,
+      "loss": 3.2918365478515623,
+      "step": 4240
+    },
+    {
+      "epoch": 4.6859663633857185,
+      "grad_norm": 1.206933617591858,
+      "learning_rate": 8.104434072345392e-05,
+      "loss": 3.303221893310547,
+      "step": 4250
+    },
+    {
+      "epoch": 4.696994761510891,
+      "grad_norm": 1.3959113359451294,
+      "learning_rate": 8.098599766627771e-05,
+      "loss": 3.3067909240722657,
+      "step": 4260
+    },
+    {
+      "epoch": 4.708023159636063,
+      "grad_norm": 1.9725914001464844,
+      "learning_rate": 8.092765460910152e-05,
+      "loss": 3.300902557373047,
+      "step": 4270
+    },
+    {
+      "epoch": 4.719051557761235,
+      "grad_norm": 1.3540401458740234,
+      "learning_rate": 8.086931155192532e-05,
+      "loss": 3.3051612854003904,
+      "step": 4280
+    },
+    {
+      "epoch": 4.730079955886407,
+      "grad_norm": 1.2321784496307373,
+      "learning_rate": 8.081096849474913e-05,
+      "loss": 3.2939830780029298,
+      "step": 4290
+    },
+    {
+      "epoch": 4.74110835401158,
+      "grad_norm": 1.2586874961853027,
+      "learning_rate": 8.075262543757294e-05,
+      "loss": 3.301250457763672,
+      "step": 4300
+    },
+    {
+      "epoch": 4.752136752136752,
+      "grad_norm": 1.1622635126113892,
+      "learning_rate": 8.069428238039673e-05,
+      "loss": 3.29620361328125,
+      "step": 4310
+    },
+    {
+      "epoch": 4.7631651502619246,
+      "grad_norm": 1.204060673713684,
+      "learning_rate": 8.063593932322054e-05,
+      "loss": 3.301993179321289,
+      "step": 4320
+    },
+    {
+      "epoch": 4.774193548387097,
+      "grad_norm": 1.2462209463119507,
+      "learning_rate": 8.057759626604434e-05,
+      "loss": 3.294866180419922,
+      "step": 4330
+    },
+    {
+      "epoch": 4.785221946512269,
+      "grad_norm": 1.086969256401062,
+      "learning_rate": 8.051925320886814e-05,
+      "loss": 3.290216827392578,
+      "step": 4340
+    },
+    {
+      "epoch": 4.796250344637441,
+      "grad_norm": 1.6685938835144043,
+      "learning_rate": 8.046091015169195e-05,
+      "loss": 3.291016387939453,
+      "step": 4350
+    },
+    {
+      "epoch": 4.807278742762613,
+      "grad_norm": 1.279870867729187,
+      "learning_rate": 8.040256709451576e-05,
+      "loss": 3.2892833709716798,
+      "step": 4360
+    },
+    {
+      "epoch": 4.818307140887786,
+      "grad_norm": 1.083748459815979,
+      "learning_rate": 8.034422403733956e-05,
+      "loss": 3.298921585083008,
+      "step": 4370
+    },
+    {
+      "epoch": 4.829335539012958,
+      "grad_norm": 1.215922474861145,
+      "learning_rate": 8.028588098016337e-05,
+      "loss": 3.290033721923828,
+      "step": 4380
+    },
+    {
+      "epoch": 4.840363937138131,
+      "grad_norm": 1.4302809238433838,
+      "learning_rate": 8.022753792298718e-05,
+      "loss": 3.289847564697266,
+      "step": 4390
+    },
+    {
+      "epoch": 4.851392335263303,
+      "grad_norm": 1.2112072706222534,
+      "learning_rate": 8.016919486581097e-05,
+      "loss": 3.289959716796875,
+      "step": 4400
+    },
+    {
+      "epoch": 4.8624207333884755,
+      "grad_norm": 1.5671532154083252,
+      "learning_rate": 8.011085180863478e-05,
+      "loss": 3.2878067016601564,
+      "step": 4410
+    },
+    {
+      "epoch": 4.873449131513648,
+      "grad_norm": 1.6009515523910522,
+      "learning_rate": 8.005250875145858e-05,
+      "loss": 3.286448669433594,
+      "step": 4420
+    },
+    {
+      "epoch": 4.88447752963882,
+      "grad_norm": 1.324246883392334,
+      "learning_rate": 7.999416569428238e-05,
+      "loss": 3.2920913696289062,
+      "step": 4430
+    },
+    {
+      "epoch": 4.895505927763992,
+      "grad_norm": 1.2959766387939453,
+      "learning_rate": 7.993582263710618e-05,
+      "loss": 3.3023696899414063,
+      "step": 4440
+    },
+    {
+      "epoch": 4.906534325889164,
+      "grad_norm": 1.0577853918075562,
+      "learning_rate": 7.987747957992999e-05,
+      "loss": 3.2800533294677736,
+      "step": 4450
+    },
+    {
+      "epoch": 4.917562724014337,
+      "grad_norm": 1.5235346555709839,
+      "learning_rate": 7.98191365227538e-05,
+      "loss": 3.2923828125,
+      "step": 4460
+    },
+    {
+      "epoch": 4.928591122139509,
+      "grad_norm": 1.4431898593902588,
+      "learning_rate": 7.97607934655776e-05,
+      "loss": 3.2987926483154295,
+      "step": 4470
+    },
+    {
+      "epoch": 4.9396195202646815,
+      "grad_norm": 1.6988770961761475,
+      "learning_rate": 7.97024504084014e-05,
+      "loss": 3.2937850952148438,
+      "step": 4480
+    },
+    {
+      "epoch": 4.950647918389854,
+      "grad_norm": 1.3248101472854614,
+      "learning_rate": 7.964410735122521e-05,
+      "loss": 3.284267044067383,
+      "step": 4490
+    },
+    {
+      "epoch": 4.961676316515026,
+      "grad_norm": 1.3350111246109009,
+      "learning_rate": 7.958576429404902e-05,
+      "loss": 3.2882057189941407,
+      "step": 4500
+    },
+    {
+      "epoch": 4.972704714640199,
+      "grad_norm": 1.434801459312439,
+      "learning_rate": 7.952742123687282e-05,
+      "loss": 3.28388671875,
+      "step": 4510
+    },
+    {
+      "epoch": 4.983733112765371,
+      "grad_norm": 1.0145658254623413,
+      "learning_rate": 7.946907817969662e-05,
+      "loss": 3.288136291503906,
+      "step": 4520
+    },
+    {
+      "epoch": 4.994761510890543,
+      "grad_norm": 1.1575376987457275,
+      "learning_rate": 7.941073512252042e-05,
+      "loss": 3.2970806121826173,
+      "step": 4530
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 18140,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1805561643270144.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

output_qwen3_plain_ar/checkpoint-4535/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,760 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# set to a truthy value to make the helpers in this file print extra diagnostics
+debug = 0
+# load to cpu (passed as map_location to torch.load)
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict`` (in place).
+    Args:
+        - ``state_dict``: mapping that receives one tensor view per parameter name
+        - ``world_size``: number of partitions; also used for the alignment sanity check
+        - ``fp32_flat_groups``: indexed as ``[rank][param group]``, each entry a flat fp32 tensor
+        - ``zero_model_states``: parsed model states; only the first one's ``param_shapes`` is read
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed element count of a group differs from what is available
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    # for every param group, concatenate that group's partition from each rank into one flat vector
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset walks through the merged vector of this group, one parameter at a time
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shape may be an object with a numel() method or a plain sequence of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # slice this parameter's elements out of the merged vector and reshape them
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    The real tensor is only built when ``contiguous()`` is called.
+    """
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        # flat_groups is indexed as [rank][group]; each entry is a flat tensor
+        self.flat_groups = flat_groups
+        # start offset of every group, plus one trailing entry marking the end of the last group
+        self.flat_groups_offset = flat_groups_offset
+        # position of this parameter's partition, counted across the groups of one rank
+        self.offset = offset
+        # number of elements each rank holds for this parameter
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        # dtype is taken from the first group of the first rank
+        self.dtype = self.flat_groups[0][0].dtype
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        Returns a tensor of shape ``self.shape``.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            # find the group containing the first element and the group containing the last element
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            # NOTE(review): start_offset is relative to each visited group, so it would be negative for any
+            # group after start_group_id - presumably a partition never spans groups; TODO confirm
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        # keep only the first shape.numel() elements, dropping whatever follows, then reshape
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+    Returns:
+        - pytorch ``state_dict``
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    The weights are written into ``output_dir``, one file per shard; when the result is sharded a
+    JSON index mapping tensor names to shard files is written next to them.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` disables sharding and writes a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError``: ``safetensors`` / ``huggingface_hub`` is required by the chosen options but not installed
+    """
+    # Dependency pre-check
+    # Fail before any checkpoint is loaded if an optional package needed by the options is missing.
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    # lazy_mode=True returns the reconstructed dict without running to_torch_tensor() on it;
+    # values are materialized shard by shard in the save loop below.
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        # "{suffix}" is inserted before the extension of whichever weights_name is in use
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # an memory-efficient approach for sharding
+        # plan the shards on torch.empty placeholders that only carry each entry's shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # Unsharded case: minimal stand-in exposing only the two attributes read below
+        # (``is_sharded`` and ``filename_to_tensors``), with every key assigned to one file.
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # materialize only the tensors that belong to the current shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        # entries are dropped from both dicts so that nothing written so far stays referenced here
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    Rebuild the fp32 weights stored in a ZeRO 2 or 3 checkpoint and load them into ``model``.
+    The steps are:
+    1. Reconstruct a single fp32 consolidated ``state_dict`` from the checkpoint
+    2. Move the provided model to cpu
+    3. Load the ``state_dict`` into it (non-strict, so mismatching keys do not raise)
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model``: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    # strict=False: keys missing from either side are tolerated instead of raising
+    cpu_model.load_state_dict(fp32_weights, strict=False)
+    return cpu_model
+if __name__ == "__main__":
+    # Command-line entry point: convert the checkpoint found in ``checkpoint_dir`` into fp32
+    # weight file(s) under ``output_dir``.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    # Adjacent string literals are concatenated verbatim, so each fragment ends with a space.
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    # rebinds the module-level ``debug`` flag that the helper functions consult
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

output_qwen3_plain_ar/checkpoint-5442/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "magel_chord_dropout_trigger_prob": 0.6,
+  "magel_num_audio_token": 16384,
+  "magel_structure_dropout_trigger_prob": 0.6,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.4.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 168056
+}

output_qwen3_plain_ar/checkpoint-5442/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.4.0"
+}

output_qwen3_plain_ar/checkpoint-5442/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step5442

output_qwen3_plain_ar/checkpoint-5442/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

output_qwen3_plain_ar/checkpoint-5442/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,760 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    """
+    Subset of one ``*_model_states.pt`` file that the fp32 reconstruction needs.
+    Instances are created by ``parse_model_states``, one per model states file.
+    """
+    # buffers picked out of the saved module state and converted to fp32, keyed by name
+    buffers: dict
+    # trainable parameter shapes; consumed as a sequence of ``{name: shape}`` dicts
+    param_shapes: list
+    # ``[alias_name, source_name]`` pairs used to re-create shared parameters
+    shared_params: list
+    # NOTE(review): value comes from ``state_dict.get(DS_VERSION, None)``, so it can be None - confirm the real type
+    ds_version: int
+    # ``{name: shape}`` of frozen parameters; None when the checkpoint has no such entry
+    frozen_param_shapes: dict
+    # ``{name: tensor fragment}`` of frozen parameters; None when the checkpoint has no such entry
+    frozen_param_fragments: dict
+# Module-wide verbosity switch: the helper functions below print extra diagnostics when it is truthy.
+debug = 0
+# load to cpu
+# (passed as ``map_location`` to every ``torch.load`` call in this script)
+device = torch.device('cpu')
+def atoi(text):
+    """Return ``int(text)`` when ``text`` consists solely of digits, otherwise return ``text`` unchanged."""
+    if text.isdigit():
+        return int(text)
+    return text
+def natural_keys(text):
+    '''
+    Sort key for human ("natural") ordering: ``alist.sort(key=natural_keys)``.
+    The text is split around its digit runs and every piece is passed through ``atoi``, so
+    numeric pieces compare as integers rather than as strings.
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    pieces = re.split(r'(\d+)', text)
+    return list(map(atoi, pieces))
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the rank-0 model states file inside ``checkpoint_dir``.
+    Args:
+        - ``checkpoint_dir``: directory expected to contain the model states file
+        - ``zero_stage``: ZeRO stage of the checkpoint; it selects the expected file name
+    Returns:
+        - path of the existing model states file
+    Raises:
+        - ``FileNotFoundError``: the directory or the expected file does not exist
+        - ``ValueError``: ``zero_stage`` is above 2 and is not 3
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    else:
+        # without this branch ``file`` stays unbound and the check below dies with UnboundLocalError
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) order.
+    Raises ``FileNotFoundError`` when nothing matches.
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = glob.glob(os.path.join(checkpoint_dir, glob_pattern))
+    ckpt_files.sort(key=natural_keys)
+    if not ckpt_files:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
+    optim_pattern = "*_optim_states.pt"
+    return get_checkpoint_files(checkpoint_dir, optim_pattern)
+def get_model_state_files(checkpoint_dir):
+    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
+    model_pattern = "*_model_states.pt"
+    return get_checkpoint_files(checkpoint_dir, model_pattern)
+def parse_model_states(files):
+    """
+    Load every model states file and keep only the pieces needed to rebuild the fp32 weights.
+    Args:
+        - ``files``: paths of the ``*_model_states.pt`` files to read
+    Returns:
+        - list of ``zero_model_state``, one per entry of ``files`` and in the same order
+    Raises:
+        - ``ValueError``: a file has no ``BUFFER_NAMES`` entry, i.e. it is not a model state checkpoint
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE(review): weights_only=False unpickles arbitrary objects - only run this on trusted checkpoints
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # frozen parameters are optional: both entries are None when the checkpoint has none
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer state files and extract the flat fp32 weight groups of every rank.
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` files, one per rank
+        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message
+    Returns:
+        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per file
+    Raises:
+        - ``ValueError``: the first file is not a zero checkpoint, the number of files differs from the
+          recorded partition count, or the zero stage is unknown
+    """
+    total_files = len(files)
+    loaded = []
+    for path in tqdm(files, desc='Loading checkpoint shards'):
+        shard = torch.load(path, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        loaded.append(shard)
+    rank0_optim_state = loaded[0][OPTIMIZER_STATE_DICT]
+    if ZERO_STAGE not in rank0_optim_state:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = rank0_optim_state[ZERO_STAGE]
+    world_size = rank0_optim_state[PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in loaded]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Rebuild the fp32 state_dict stored in a deepspeed checkpoint folder.
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: forwarded to the stage-specific merge routine
+    Returns:
+        - the state_dict produced by the zero2 (stage <= 2) or zero3 (stage == 3) merge routine
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
+                                                                  ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    # pick the merge routine matching the detected stage
+    if zero_stage <= 2:
+        merge = _get_fp32_state_dict_from_zero2_checkpoint
+    elif zero_stage == 3:
+        merge = _get_fp32_state_dict_from_zero3_checkpoint
+    else:
+        return None
+    return merge(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameters recorded by rank 0 into ``state_dict`` (modified in place).
+    Does nothing when rank 0 recorded no frozen parameter shapes.
+    """
+    rank0_state = zero_model_states[0]
+    frozen_param_shapes = rank0_state.frozen_param_shapes
+    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
+        return
+    frozen_param_fragments = rank0_state.frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for total_params, (name, shape) in enumerate(frozen_param_shapes.items(), start=1):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # the fragment saved by rank 0 is stored as-is, no concatenation across ranks
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    """Tell whether ``obj`` has an attribute named ``fn`` that can be called."""
+    return callable(getattr(obj, fn, None))
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict``.
+    Args:
+        - ``state_dict``: output mapping, modified in place (``name -> tensor view``)
+        - ``world_size``: number of ranks; used for the debug dump and the alignment sanity check
+        - ``fp32_flat_groups``: per rank, a sequence of flat fp32 partitions indexed by param group
+        - ``zero_model_states``: parsed model states; only rank 0's ``param_shapes`` is read
+    Raises:
+        - ``ValueError``: after alignment, the consumed element count of a group differs from what is available
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    # For each param group, concatenate the partitions of all ranks into one flat vector.
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    # param_shapes and the merged vectors are walked in lockstep: one shapes dict per param group
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        # from here on avail_numel is the size of the current group only
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shape may expose ``numel()`` or be a plain sequence of ints
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # slice the parameter out of the flat vector; narrow/view do not copy the data
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the full state_dict of a ZeRO stage <= 2 checkpoint: rank-0 buffers, frozen parameters
+    (unless excluded), trainable parameters, then aliases for shared parameters.
+    """
+    merged = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    merged.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(merged, zero_model_states)
+    _zero2_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    # each pair is [alias, source]; the alias is bound to the very same object as its source
+    for alias, source in zero_model_states[0].shared_params:
+        if source in merged:
+            merged[alias] = merged[source]
+    return merged
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Return ``(partitioned_numel, padding_numel)`` for a parameter split evenly across ``world_size`` ranks:
+    the per-rank element count (rounded up) and the number of padding elements that make the
+    total a multiple of ``world_size``.
+    """
+    leftover = unpartitioned_numel % world_size
+    if leftover:
+        padding_numel = world_size - leftover
+    else:
+        padding_numel = 0
+    return math.ceil(unpartitioned_numel / world_size), padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO-3 checkpoint into ``state_dict`` (modified in place).
+    For every frozen parameter the fragments of all model states are concatenated, trimmed to the
+    parameter's element count and viewed in its shape. Does nothing when rank 0 recorded no
+    frozen parameter shapes.
+    """
+    rank0_shapes = zero_model_states[0].frozen_param_shapes
+    if rank0_shapes is None or len(rank0_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        rank0_fragments = zero_model_states[0].frozen_param_fragments
+        avail_numel = sum(p.numel() for p in rank0_fragments.values()) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        # stitch the per-rank fragments together, then cut off whatever exceeds the real size
+        fragments = [model_state.frozen_param_fragments[name] for model_state in zero_model_states]
+        stitched = torch.cat(fragments, 0)
+        state_dict[name] = stitched.narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    Only ``shape``, ``dtype`` and ``contiguous()`` are provided; the real tensor is built on each
+    ``contiguous()`` call and is not cached on the instance.
+    """
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        """
+        Args:
+            - ``flat_groups``: per rank, a sequence of flat tensors indexed by group
+            - ``flat_groups_offset``: boundaries of the groups inside one rank's flattened data;
+              presumably cumulative offsets with one more entry than there are groups - confirm with the caller
+            - ``offset``: start of this parameter's partition inside one rank's flattened data
+            - ``partitioned_numel``: number of elements each rank holds for this parameter
+            - ``shape``: full shape of the parameter; must provide ``numel()``
+        """
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        # dtype is taken from rank 0's first group
+        self.dtype = self.flat_groups[0][0].dtype
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        Returns the slices of all ranks concatenated in rank order, trimmed to
+        ``shape.numel()`` elements and viewed as ``shape``.
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            # find the group containing the first element and the group containing the last one;
+            # indexing with group_id + 1 relies on the loop breaking before the final offsets entry
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            # NOTE(review): start_offset is derived from self.offset for every group in the range, so it
+            # would be negative for any group after the first one - confirm a partition never spans groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        # drop the trailing elements beyond the real parameter size before reshaping
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register every trainable parameter of a ZeRO-3 checkpoint in ``state_dict`` as a lazy
+    ``GatheredTensor`` (nothing is concatenated here).
+    Args:
+        - ``state_dict``: output mapping, modified in place (``name -> GatheredTensor``)
+        - ``world_size``: number of ranks the parameters were partitioned over
+        - ``fp32_flat_groups``: per rank, a list of flat fp32 tensors (one per param group)
+        - ``zero_model_states``: parsed model states; only rank 0's ``param_shapes`` is read
+    Raises:
+        - ``ValueError``: the partitions consumed do not add up to the elements available
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        # every rank holds a list of flat tensors, so the shapes are reported per group
+        # (the list itself has neither ``.shape`` nor ``.numel()``)
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[i])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # cumulative group boundaries inside one rank's data: [0, n0, n0 + n1, ...]
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        # offset advances in per-rank units; it is scaled to all ranks after the loop
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the full state_dict of a ZeRO-3 checkpoint: rank-0 buffers, frozen parameters
+    (unless excluded), trainable parameters, then aliases for shared parameters.
+    """
+    merged = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    merged.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(merged, world_size, zero_model_states)
+    _zero3_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    # each pair is [alias, source]; the alias is bound to the very same object as its source
+    for alias, source in zero_model_states[0].shared_params:
+        if source in merged:
+            merged[alias] = merged[source]
+    return merged
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    Entries that are the same object in ``state_dict`` (shared tensors) are converted once and the
+    resulting tensor is reused under every name. With ``return_empty_tensor=True`` each entry is
+    replaced by a ``torch.empty`` tensor of the same shape and dtype instead of being materialized.
+    """
+    converted = {}
+    first_name_by_id = {}
+    for name, tensor in state_dict.items():
+        key = id(tensor)
+        if key in first_name_by_id:  # shared tensors
+            converted[name] = converted[first_name_by_id[key]]
+            continue
+        first_name_by_id[key] = name
+        if return_empty_tensor:
+            converted[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+        else:
+            converted[name] = tensor.contiguous()
+    return converted
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor by calling ``.contiguous()``
+    Returns:
+        - pytorch ``state_dict``
+    Raises:
+        - ``ValueError``: if ``tag`` is not given and no 'latest' file exists in ``checkpoint_dir``
+        - ``FileNotFoundError``: if the ``checkpoint_dir/tag`` directory doesn't exist
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    # without an explicit tag, read it from the 'latest' file inside the checkpoint folder
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    # the tag names the sub-folder of checkpoint_dir that is handed to the reconstruction helper
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    When ``max_shard_size`` is set the output may be split into several shard files plus an index
+    json; with ``max_shard_size=None`` a single file is written.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError``: re-raised when ``safetensors`` / ``huggingface_hub`` is required by the
+          chosen options but cannot be imported
+    """
+    # Dependency pre-check
+    # both optional imports happen before any checkpoint is read, so a missing package fails early
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    # lazy_mode=True returns the unconverted state_dict; entries are converted per shard below
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding
+        # the split is planned on empty placeholder tensors that only carry shape and dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # minimal stand-in exposing only the two attributes read below when nothing is sharded;
+        # ``metadata`` / ``tensor_to_filename`` are only accessed when ``is_sharded`` is true
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        # convert only the entries that belong to this shard
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model``: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    # the state dict is extracted first (step 2 above); the model is moved to cpu only afterwards
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    # NOTE(review): the load is non-strict (strict=False) and its return value is discarded —
+    # confirm that ignoring key mismatches is intended
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

output_qwen3_plain_ar/checkpoint-6349/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "magel_chord_dropout_trigger_prob": 0.6,
+  "magel_num_audio_token": 16384,
+  "magel_structure_dropout_trigger_prob": 0.6,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.4.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 168056
+}

output_qwen3_plain_ar/checkpoint-6349/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.4.0"
+}

output_qwen3_plain_ar/checkpoint-6349/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step6349

output_qwen3_plain_ar/checkpoint-6349/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

output_qwen3_plain_ar/checkpoint-6349/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,760 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    """
+    Record of the fields extracted from one ``*_model_states.pt`` file by ``parse_model_states``.
+    """
+    # NOTE(review): the ``dict()`` annotations below are dict instances rather than the ``dict`` type
+    # buffers: name -> buffer, converted with ``.float()`` by ``parse_model_states``
+    buffers: dict()
+    # param_shapes: value stored under ``PARAM_SHAPES``; iterated as a sequence of name -> shape dicts
+    param_shapes: dict()
+    # shared_params: list of ``[key, value]`` pairs taken from the checkpoint's "shared_params" dict
+    shared_params: list
+    # ds_version: value stored under ``DS_VERSION``, or None when the key is absent
+    ds_version: int
+    # frozen_param_shapes / frozen_param_fragments: values stored under ``FROZEN_PARAM_SHAPES`` /
+    # ``FROZEN_PARAM_FRAGMENTS``, or None when the key is absent
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# module-level verbosity flag read by the helpers below; the command-line entry point of this
+# script overwrites it from ``--debug``
+debug = 0
+# load to cpu
+device = torch.device('cpu')
+def atoi(text):
+    """Return ``int(text)`` when ``text`` consists only of digits, otherwise ``text`` unchanged."""
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    Splits ``text`` on runs of digits (keeping the runs) and returns the pieces as a list in
+    which each digit run has been converted to ``int`` by ``atoi``.
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """
+    Return the path of the model-states file expected in ``checkpoint_dir`` for ``zero_stage``.
+    Raises:
+        - ``FileNotFoundError``: if ``checkpoint_dir`` is not a directory or the file doesn't exist
+    """
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    # NOTE(review): a ``zero_stage`` that is neither <= 2 nor == 3 leaves ``file`` unassigned
+    # when it is read on the next line
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """
+    Return the paths in ``checkpoint_dir`` matching ``glob_pattern``, sorted with ``natural_keys``.
+    Raises:
+        - ``FileNotFoundError``: if nothing matches
+    """
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    """Return the naturally sorted ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    """Return the naturally sorted ``*_model_states.pt`` files found in ``checkpoint_dir``."""
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    """
+    Load every model-states file and collect the fields needed for reconstruction.
+    Args:
+        - ``files``: paths of the model-states files to load
+    Returns:
+        - list with one ``zero_model_state`` per file, in the order of ``files``
+    Raises:
+        - ``ValueError``: if a loaded file has no ``BUFFER_NAMES`` entry
+    """
+    zero_model_states = []
+    for file in files:
+        # NOTE: weights_only=False unpickles arbitrary objects, so only load trusted checkpoints
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        # NOTE(review): ``param_names`` is built here and extended below but is not read again
+        # in this function
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    """
+    Load the optimizer-state files and extract the fp32 weight groups from them.
+    Args:
+        - ``files``: paths of the ``*_optim_states.pt`` files
+        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message
+    Returns:
+        - tuple ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` holds
+          one entry per loaded file
+    Raises:
+        - ``ValueError``: if the first file has no ``ZERO_STAGE`` entry, if the number of files
+          differs from the recorded partition count, or if the stage is greater than 3
+    """
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        # NOTE: weights_only=False unpickles arbitrary objects, so only load trusted checkpoints
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    # stage and partition count are read from the first file only
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: forwarded to the stage-specific reconstruction helper
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    # dispatch on the stage; parse_optim_states has already rejected stages greater than 3
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """
+    Copy the frozen parameters of the first model state into ``state_dict`` (ZeRO stage <= 2).
+    Does nothing when the first model state has no frozen parameter shapes. Each fragment is
+    stored as-is under its parameter name; no concatenation across model states is performed.
+    """
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        # diagnostics only: compare the element count required by the shapes with what is available
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.
+    For every param group the per-rank partitions are concatenated into one flat vector, which is
+    then cut into consecutive views according to the shapes recorded in the first model state.
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed element count of a group differs from
+          the size of its flat vector
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # group i of every rank, concatenated in rank order
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        # from here on avail_numel refers to the current group only
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shapes without a callable ``numel`` are treated as plain sequences of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # a view into the merged vector, not a copy
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the state dict for a ZeRO stage <= 2 checkpoint.
+    The result is seeded with the buffers of the first model state, then frozen parameters
+    (unless excluded) and trainable parameters are merged in, and finally shared parameters
+    are aliased to the entries they share storage with.
+    Returns:
+        - ``OrderedDict`` keyed by buffer / parameter name
+    """
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    # each pair is [key, value] from the checkpoint's "shared_params" dict; the key becomes an
+    # alias of the value's entry, and pairs whose value is absent from state_dict are skipped
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Compute the per-partition size and total padding for splitting a parameter across ranks.
+    Returns:
+        - tuple ``(partitioned_numel, padding_numel)``: ``ceil(unpartitioned_numel / world_size)``
+          and the number of elements needed to round ``unpartitioned_numel`` up to a multiple of
+          ``world_size`` (0 when it already divides evenly)
+    """
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict``.
+    Does nothing when the first model state has no frozen parameter shapes. Otherwise, for each
+    frozen parameter the fragments of all model states are concatenated, truncated to the
+    parameter's element count and viewed with its recorded shape.
+    """
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        # diagnostics only
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        # narrow() drops whatever trails the real data after concatenation (presumably padding)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        # computed for the debug message only
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    It only records where the parameter lives inside the per-rank flat groups; the actual tensor
+    is built when ``contiguous()`` is called.
+    """
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        """
+        Args:
+            - ``flat_groups``: indexed as ``flat_groups[rank][group]`` to obtain a flat tensor
+            - ``flat_groups_offset``: boundaries of the groups; group ``g`` covers
+              ``[flat_groups_offset[g], flat_groups_offset[g + 1])``
+            - ``offset``: start of this parameter's partition in that coordinate system
+            - ``partitioned_numel``: number of elements taken per rank
+            - ``shape``: shape of the full parameter; must provide ``numel()``
+        """
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        # dtype is taken from the first group of the first rank
+        self.dtype = self.flat_groups[0][0].dtype
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        Returns:
+            - a contiguous tensor of ``self.shape`` built from the slices of every rank
+        """
+        end_idx = self.offset + self.partitioned_numel
+        world_size = len(self.flat_groups)
+        pad_flat_param_chunks = []
+        for rank_i in range(world_size):
+            # for each rank, we need to collect weights from related group/groups
+            flat_groups_at_rank_i = self.flat_groups[rank_i]
+            start_group_id = None
+            end_group_id = None
+            # locate the group containing ``offset`` and the group containing ``end_idx``
+            for group_id in range(len(self.flat_groups_offset)):
+                if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
+                    start_group_id = group_id
+                if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
+                    end_group_id = group_id
+                    break
+            # collect weights from related group/groups
+            # NOTE(review): for a group after ``start_group_id`` the ``start_offset`` below is
+            # negative, since ``self.offset`` lies before that group's start — confirm whether a
+            # partition can ever span more than one group
+            for group_id in range(start_group_id, end_group_id + 1):
+                flat_tensor = flat_groups_at_rank_i[group_id]
+                start_offset = self.offset - self.flat_groups_offset[group_id]
+                end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
+                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        # keep only the first shape.numel() elements; anything beyond is discarded
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Convert state_dict of GatheredTensor to torch tensor
+    """
+    torch_state_dict = {}
+    converted_tensors = {}
+    for name, tensor in state_dict.items():
+        tensor_id = id(tensor)
+        if tensor_id in converted_tensors:  # shared tensors
+            shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
+            torch_state_dict[name] = shared_tensor
+        else:
+            converted_tensors[tensor_id] = name
+            if return_empty_tensor:
+                torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
+            else:
+                torch_state_dict[name] = tensor.contiguous()
+    return torch_state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
+          Convert the pesduo tensor to torch tensor by ``.contiguous()``
+    Returns:
+        - pytorch ``state_dict``
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.item():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into consolidated fp32 ``state_dict`` file(s) written to ``output_dir``.
+    The weights are written shard by shard; when the result is sharded an index json file mapping
+    each tensor name to its shard file is written as well.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files (created if missing)
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes a single file
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError`` if ``safetensors`` / ``huggingface_hub`` is required by the options but not installed
+    """
+    # Dependency pre-check: fail before any checkpoint file is read
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Convert zero checkpoint to state_dict
+    # lazy_mode=True returns the state_dict unconverted; each shard is converted just before it is saved
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+    # Shard the model if it is too big.
+    weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+    if max_shard_size is not None:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # a memory-efficient approach for sharding: plan the split on empty tensors of the same shape/dtype
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    else:
+        # no sharding requested: build a stand-in split result with one file holding every tensor
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    # Save the model by shard
+    os.makedirs(output_dir, exist_ok=True)
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
+        # convert only the tensors of the current shard
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        # release the memory of current shard
+        for tensor_name in list(shard_state_dict.keys()):
+            del state_dict[tensor_name]
+            del shard_state_dict[tensor_name]
+        del shard_state_dict
+        gc.collect()
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        save_index_file = os.path.join(output_dir, save_index_file)
+        with open(save_index_file, "w", encoding="utf-8") as f:
+            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+            f.write(content)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files"
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

output_qwen3_plain_ar/checkpoint-7256/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "magel_chord_dropout_trigger_prob": 0.6,
+  "magel_num_audio_token": 16384,
+  "magel_structure_dropout_trigger_prob": 0.6,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.4.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 168056
+}

output_qwen3_plain_ar/checkpoint-7256/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.4.0"
+}

output_qwen3_plain_ar/checkpoint-7256/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step7256

output_qwen3_plain_ar/checkpoint-7256/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

output_qwen3_plain_ar/checkpoint-7256/zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,760 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example:
+#   python zero_to_fp32.py . output_dir/
+#   or
+#   python zero_to_fp32.py . output_dir/ --safe_serialization
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+import gc
+import json
+import numpy as np
+from tqdm import tqdm
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# when truthy, the ``if debug:`` branches in this script print extra diagnostics
+debug = 0
+# load to cpu: passed as ``map_location`` to ``torch.load`` when checkpoint files are read
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device, weights_only=False)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in tqdm(files, desc='Loading checkpoint shards'):
+        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint and store them in ``state_dict`` (in place).
+    For every param group the per-rank partitions are concatenated into one flat vector, which is then
+    cut into the individual parameters following ``zero_model_states[0].param_shapes``.
+    Args:
+        - ``state_dict``: dict updated in place with parameter name -> tensor view
+        - ``world_size``: used for the debug output and for the alignment of the sanity check
+        - ``fp32_flat_groups``: flat fp32 tensors indexed as ``fp32_flat_groups[rank][group]``
+        - ``zero_model_states``: parsed model states; only entry 0 is read
+    Raises:
+        - ``ValueError`` if, after alignment, the consumed and available element counts of a group differ
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # concatenate group i of every rank into one flat vector
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset and avail_numel are tracked per group from here on
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shapes without a numel() method are treated as plain sequences of dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # a view into the merged vector, no copy is made here
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            # round x up to the next multiple of align_to
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the consolidated fp32 state dict for a ZeRO-2 checkpoint.
+    The result is filled in this order: buffers of ``zero_model_states[0]``, frozen parameters
+    (skipped when ``exclude_frozen_parameters`` is true), trainable parameters, and finally the
+    shared-parameter aliases listed in ``zero_model_states[0].shared_params``.
+    Returns:
+        - an ``OrderedDict`` mapping names to the merged entries
+    """
+    merged = OrderedDict()
+    # buffers come from the first model state only
+    first_state_buffers = zero_model_states[0].buffers
+    merged.update(first_state_buffers)
+    if debug:
+        print(f"added {len(first_state_buffers)} buffers")
+    # frozen params are optional, trainable params are always merged
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(merged, zero_model_states)
+    _zero2_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)
+    # re-link shared parameters: pair[0] becomes an alias of pair[1] when the latter was merged
+    for pair in zero_model_states[0].shared_params:
+        alias_name, source_name = pair[0], pair[1]
+        if source_name in merged:
+            merged[alias_name] = merged[source_name]
+    return merged
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """
+    Compute how a parameter of ``unpartitioned_numel`` elements is split across ``world_size`` ranks.
+    Returns:
+        - ``(partitioned_numel, padding_numel)``: the per-rank element count (rounded up) and the
+          number of padding elements needed to make the total divisible by ``world_size``
+    """
+    per_rank_numel = math.ceil(unpartitioned_numel / world_size)
+    leftover = unpartitioned_numel % world_size
+    if leftover:
+        pad_numel = world_size - leftover
+    else:
+        pad_numel = 0
+    return per_rank_numel, pad_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """
+    Rebuild every frozen parameter of a ZeRO-3 checkpoint and store it in ``state_dict``.
+    For each name in ``zero_model_states[0].frozen_param_shapes`` the per-state fragments are
+    concatenated, trimmed to the unpartitioned element count (dropping trailing padding) and
+    viewed with the recorded shape. Does nothing when no frozen shapes are recorded.
+    """
+    shapes_by_name = zero_model_states[0].frozen_param_shapes
+    if shapes_by_name is None or len(shapes_by_name) == 0:
+        return
+    if debug:
+        for rank in range(world_size):
+            rank_numel = sum(frag.numel() for frag in zero_model_states[rank].frozen_param_fragments.values())
+            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {rank_numel}')
+        wanted_params = len(shapes_by_name)
+        wanted_numel = sum(shp.numel() for shp in shapes_by_name.values())
+        # the first state's fragment total is scaled by world_size to estimate what is available
+        avail_numel = sum([frag.numel() for frag in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in shapes_by_name.items():
+        total_params += 1
+        full_numel = shape.numel()
+        total_numel += full_numel
+        # one fragment per model state, concatenated in list order
+        fragments = tuple(one_state.frozen_param_fragments[name] for one_state in zero_model_states)
+        gathered = torch.cat(fragments, 0)
+        state_dict[name] = gathered.narrow(0, 0, full_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(full_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+class GatheredTensor:
+    """
+    A pseudo tensor that collects partitioned weights.
+    It is more memory efficient when there are multiple groups.
+    The real tensor is only materialized when ``contiguous()`` is called.
+    """
+    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
+        """
+        Args:
+            - ``flat_groups``: per-rank sequence; each entry is a sequence of flat tensors (one per group)
+            - ``flat_groups_offset``: cumulative element offsets of the groups, starting with 0
+            - ``offset``: start of this parameter's partition within the concatenated groups of a rank
+            - ``partitioned_numel``: number of elements each rank holds for this parameter
+            - ``shape``: full (unpartitioned) shape; must provide ``numel()``
+        """
+        self.flat_groups = flat_groups
+        self.flat_groups_offset = flat_groups_offset
+        self.offset = offset
+        self.partitioned_numel = partitioned_numel
+        self.shape = shape
+        self.dtype = self.flat_groups[0][0].dtype
+    def contiguous(self):
+        """
+        Merge partitioned weights from flat_groups into a single tensor.
+        Returns:
+            - a tensor of ``self.shape`` built from every rank's partition, with trailing padding dropped
+        Raises:
+            - ``ValueError`` if the partition cannot be located inside ``flat_groups_offset``
+        """
+        # A zero-element parameter owns no slice of any group; build the empty tensor directly
+        # instead of searching for a group range that cannot be matched.
+        if self.partitioned_numel == 0 and self.shape.numel() == 0:
+            return self.flat_groups[0][0].new_empty(self.shape)
+        end_idx = self.offset + self.partitioned_numel
+        # The group range depends only on the offsets, so resolve it once instead of once per rank.
+        start_group_id = None
+        end_group_id = None
+        for group_id in range(len(self.flat_groups_offset) - 1):
+            group_begin = self.flat_groups_offset[group_id]
+            group_end = self.flat_groups_offset[group_id + 1]
+            if group_begin <= self.offset < group_end:
+                start_group_id = group_id
+            if group_begin < end_idx <= group_end:
+                end_group_id = group_id
+                break
+        if start_group_id is None or end_group_id is None:
+            raise ValueError(f"cannot locate partition [{self.offset}, {end_idx}) within flat_groups_offset")
+        pad_flat_param_chunks = []
+        for flat_groups_at_rank_i in self.flat_groups:
+            # for each rank, we need to collect weights from related group/groups
+            for group_id in range(start_group_id, end_group_id + 1):
+                group_begin = self.flat_groups_offset[group_id]
+                group_end = self.flat_groups_offset[group_id + 1]
+                # Clamp both ends to the group so that a partition spanning several groups starts
+                # at index 0 in every group after the first (a negative start would slice from the end).
+                start_offset = max(self.offset, group_begin) - group_begin
+                end_offset = min(end_idx, group_end) - group_begin
+                pad_flat_param_chunks.append(flat_groups_at_rank_i[group_id][start_offset:end_offset])
+        # collect weights from all ranks
+        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
+        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
+        return param
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Register every trainable parameter of a ZeRO-3 checkpoint in ``state_dict`` as a lazy
+    ``GatheredTensor``.
+    Args:
+        - ``state_dict``: dict updated in place, one entry per parameter name
+        - ``world_size``: number of ranks the checkpoint was partitioned over
+        - ``fp32_flat_groups``: per-rank list of flat fp32 tensors (one tensor per group)
+        - ``zero_model_states``: model states; only ``zero_model_states[0].param_shapes`` is read
+    Raises:
+        - ``ValueError`` if the consumed element count does not match the available count
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # available elements: rank 0's flat group total multiplied by world_size
+    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            # fp32_flat_groups[i] is a list of flat tensors, so report the shape of each group
+            # (the list itself has no ``.shape``/``.numel()``)
+            group_shapes = [tuple(flat_group.shape) for flat_group in fp32_flat_groups[i]]
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={group_shapes}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    # cumulative start offset of each group inside a rank's concatenated flat groups
+    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
+    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # memory efficient tensor
+        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
+        state_dict[name] = tensor
+        offset += partitioned_numel
+    # offset counted per-rank elements; scale it to compare against the all-rank total
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """
+    Assemble the consolidated fp32 state dict for a ZeRO-3 checkpoint.
+    The result is filled in this order: buffers of ``zero_model_states[0]``, frozen parameters
+    (skipped when ``exclude_frozen_parameters`` is true), trainable parameters, and finally the
+    shared-parameter aliases listed in ``zero_model_states[0].shared_params``.
+    Returns:
+        - an ``OrderedDict`` mapping names to the merged entries
+    """
+    merged = OrderedDict()
+    # buffers come from the first model state only
+    first_state_buffers = zero_model_states[0].buffers
+    merged.update(first_state_buffers)
+    if debug:
+        print(f"added {len(first_state_buffers)} buffers")
+    # frozen params are optional, trainable params are always merged
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(merged, world_size, zero_model_states)
+    _zero3_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)
+    # re-link shared parameters: pair[0] becomes an alias of pair[1] when the latter was merged
+    for pair in zero_model_states[0].shared_params:
+        alias_name, source_name = pair[0], pair[1]
+        if source_name in merged:
+            merged[alias_name] = merged[source_name]
+    return merged
+def to_torch_tensor(state_dict, return_empty_tensor=False):
+    """
+    Materialize a state dict whose values expose ``shape``, ``dtype`` and ``contiguous()``
+    (e.g. GatheredTensor) into a plain dict of torch tensors.
+    Values that are the same object under several names are converted once and the resulting
+    tensor is reused for every alias. With ``return_empty_tensor=True`` each value is replaced
+    by an uninitialized tensor of the same shape and dtype instead of being materialized.
+    """
+    materialized = {}
+    first_name_by_id = {}
+    for name, value in state_dict.items():
+        value_id = id(value)
+        if value_id in first_name_by_id:
+            # shared tensors: reuse what was produced for the first name
+            materialized[name] = materialized[first_name_by_id[value_id]]
+            continue
+        first_name_by_id[value_id] = name
+        if not return_empty_tensor:
+            materialized[name] = value.contiguous()
+        else:
+            materialized[name] = torch.empty(value.shape, dtype=value.dtype)
+    return materialized
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                             tag=None,
+                                             exclude_frozen_parameters=False,
+                                             lazy_mode=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+        - ``lazy_mode``: get state_dict in lazy mode. The state_dict is returned without materializing its
+          entries, so it may hold pseudo tensors instead of torch tensors, which is more memory efficient.
+          Convert a pseudo tensor to a torch tensor with ``.contiguous()``
+    Returns:
+        - pytorch ``state_dict``
+    Raises:
+        - ``ValueError`` if ``tag`` is None and no ``latest`` file exists in ``checkpoint_dir``
+        - ``FileNotFoundError`` if the ``checkpoint_dir/tag`` directory does not exist
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
+    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint. Or you can load state_dict in lazy mode ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    # resolve the tag from the 'latest' file when the caller did not give one
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if not os.path.isfile(latest_path):
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+        with open(latest_path, 'r') as fd:
+            tag = fd.read().strip()
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    # lazy mode hands back the entries untouched; otherwise materialize them as torch tensors
+    return state_dict if lazy_mode else to_torch_tensor(state_dict)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) written to
+    ``output_dir``; the weights are split into several shard files plus an index file when
+    sharding is requested and the splitter reports the result as sharded.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` disables sharding
+        - ``safe_serialization``:  whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    Raises:
+        - ``ImportError`` if ``safetensors`` / ``huggingface_hub`` is required by the options but not installed
+    """
+    # Check optional dependencies up front so nothing is loaded before a missing package is noticed
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+    # Load the checkpoint lazily; entries are materialized shard by shard below
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+    if safe_serialization:
+        weights_name = "model.safetensors"
+    else:
+        weights_name = "pytorch_model.bin"
+    # Plan which tensor goes into which file
+    if max_shard_size is None:
+        # no sharding requested: a single file holding every tensor
+        from collections import namedtuple
+        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+        state_dict_split = StateDictSplit(is_sharded=False,
+                                          filename_to_tensors={weights_name: list(state_dict.keys())})
+    else:
+        filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+        # the splitter is fed empty tensors of matching shape/dtype, so nothing is materialized for planning
+        empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+        state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
+                                                              filename_pattern=filename_pattern,
+                                                              max_shard_size=max_shard_size)
+    # Write one file per planned shard
+    os.makedirs(output_dir, exist_ok=True)
+    planned_files = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in tqdm(planned_files, desc="Saving checkpoint shards"):
+        shard_state_dict = to_torch_tensor({tensor_name: state_dict[tensor_name] for tensor_name in tensors})
+        output_path = os.path.join(output_dir, shard_file)
+        if not safe_serialization:
+            torch.save(shard_state_dict, output_path)
+        else:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        # drop this shard's entries from both dicts before moving on to the next shard
+        for tensor_name in tuple(shard_state_dict.keys()):
+            state_dict.pop(tensor_name)
+            shard_state_dict.pop(tensor_name)
+        del shard_state_dict
+        gc.collect()
+    # An index mapping tensors to files is only written for sharded output
+    if state_dict_split.is_sharded:
+        index_name = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as f:
+            f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    2. Put the provided model to cpu
+    3. Load the ``state_dict`` into the provided model (non-strict, i.e. ``strict=False``)
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model``: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info("Extracting fp32 weights")
+    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info("Overwriting model with fp32 weights")
+    cpu_model = model.cpu()
+    # strict=False: key mismatches between the checkpoint and the model do not raise
+    cpu_model.load_state_dict(fp32_weights, strict=False)
+    return cpu_model
+# Command-line entry point: parse the arguments and convert the checkpoint in ``checkpoint_dir``
+# into fp32 state_dict file(s) under ``output_dir``.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    # Adjacent string literals are concatenated verbatim, so each piece carries its own separating space.
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. path/checkpoint-12-output/)")
+    parser.add_argument(
+        "--max_shard_size",
+        type=str,
+        default="5GB",
+        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
+        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
+        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+        "without CPU OOM issues.")
+    parser.add_argument(
+        "--safe_serialization",
+        default=False,
+        action='store_true',
+        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    # module-level flag read by the merge helpers to enable their diagnostic prints
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_dir,
+                                               max_shard_size=args.max_shard_size,
+                                               safe_serialization=args.safe_serialization,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)

output_qwen3_plain_ar/checkpoint-8163/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "magel_chord_dropout_trigger_prob": 0.6,
+  "magel_num_audio_token": 16384,
+  "magel_structure_dropout_trigger_prob": 0.6,
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.4.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 168056
+}

output_qwen3_plain_ar/checkpoint-8163/generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "5.4.0"
+}

output_qwen3_plain_ar/checkpoint-8163/latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step8163

output_qwen3_plain_ar/checkpoint-8163/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff