Spaces:
Build error
Build error
Upload 21 files
Browse files- display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
- display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/latest +1 -0
- display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/zero_to_fp32.py +483 -0
- display_v3/2023-04-14_16-59-07/hparams.yaml +23 -0
- display_v3/2023-04-14_16-59-07/metrics.csv +255 -0
- display_v3/2023-04-14_16-59-07/yes.txt +0 -0
- display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
- display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/latest +1 -0
- display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/zero_to_fp32.py +483 -0
- display_v3/2023-04-14_17-06-18/hparams.yaml +23 -0
- display_v3/2023-04-14_17-06-18/metrics.csv +38 -0
- display_v3/2023-04-14_17-06-18/yes.txt +0 -0
- display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
- display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
- display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/latest +1 -0
- display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/zero_to_fp32.py +483 -0
- display_v3/2023-04-14_17-59-45/hparams.yaml +23 -0
- display_v3/2023-04-14_17-59-45/metrics.csv +193 -0
- display_v3/2023-04-14_17-59-45/yes.txt +0 -0
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9921f36370ead56c25c58b8409cb71175b00e3c5ad5105f5fc49666915361ce
|
| 3 |
+
size 220228915
|
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4f3ae9d4195c5adab49e78a909cfe95ae8d21c7a2ffc90eed224e122f51eabae
|
| 3 |
+
size 1320918341
|
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/latest
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
checkpoint
|
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/zero_to_fp32.py
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
'''Copyright The Microsoft DeepSpeed Team'''
|
| 3 |
+
|
| 4 |
+
# This script extracts fp32 consolidated weights from ZeRO stage 2 and 3 DeepSpeed checkpoints. It gets
|
| 5 |
+
# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
|
| 6 |
+
# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
|
| 7 |
+
# application.
|
| 8 |
+
#
|
| 9 |
+
# example: python zero_to_fp32.py . pytorch_model.bin
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import torch
|
| 13 |
+
import glob
|
| 14 |
+
import math
|
| 15 |
+
import os
|
| 16 |
+
import re
|
| 17 |
+
from collections import OrderedDict
|
| 18 |
+
|
| 19 |
+
# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
|
| 20 |
+
# DeepSpeed data structures it has to be available in the current python environment.
|
| 21 |
+
from deepspeed.utils import logger
|
| 22 |
+
from deepspeed.checkpoint.constants import (DS_VERSION,
|
| 23 |
+
OPTIMIZER_STATE_DICT,
|
| 24 |
+
SINGLE_PARTITION_OF_FP32_GROUPS,
|
| 25 |
+
FP32_FLAT_GROUPS,
|
| 26 |
+
ZERO_STAGE,
|
| 27 |
+
PARTITION_COUNT,
|
| 28 |
+
PARAM_SHAPES,
|
| 29 |
+
BUFFER_NAMES)
|
| 30 |
+
|
| 31 |
+
debug = 0
|
| 32 |
+
|
| 33 |
+
# load to cpu
|
| 34 |
+
device = torch.device('cpu')
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def atoi(text):
    """
    Convert a purely numeric string to ``int`` and return anything else unchanged.

    ``str.isdecimal`` is used instead of ``str.isdigit`` because ``isdigit`` also accepts
    characters such as the superscript two (U+00B2), which ``int()`` rejects with ``ValueError``.

    Args:
        - ``text``: the string fragment to convert

    Returns:
        - ``int(text)`` when every character is a decimal digit, otherwise ``text`` itself
    """
    return int(text) if text.isdecimal() else text
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def natural_keys(text):
    '''
    Sort key for human ("natural") ordering: ``alist.sort(key=natural_keys)``.

    The text is split on runs of digits (the capturing group keeps the runs in the result)
    and every piece is passed through ``atoi``.

    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the model-states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory that holds the checkpoint files
        - ``zero_stage``: ZeRO stage of the checkpoint; only 2 and 3 are handled

    Returns:
        - path to the model-states file expected for ``zero_stage``

    Raises:
        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
        - ``ValueError``: ``zero_stage`` is neither 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage == 2:
        file_name = "mp_rank_00_model_states.pt"
    elif zero_stage == 3:
        file_name = "zero_pp_rank_0_mp_rank_00_model_states.pt"
    else:
        # without this branch the path was never bound and the lookup below
        # failed with an UnboundLocalError instead of a meaningful message
        raise ValueError(f"unknown zero stage {zero_stage}")

    model_file = os.path.join(checkpoint_dir, file_name)
    if not os.path.exists(model_file):
        raise FileNotFoundError(f"can't find model states file at '{model_file}'")

    return model_file
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_optim_files(checkpoint_dir):
    """
    Collect the ``*_optim_states.pt`` files found directly in ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory to search

    Returns:
        - list of matching paths, ordered with the ``natural_keys`` sort key

    Raises:
        - ``FileNotFoundError``: no file matches the pattern
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, "*_optim_states.pt")
    optim_files = glob.glob(pattern)
    optim_files.sort(key=natural_keys)

    if not optim_files:
        raise FileNotFoundError(
            f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")

    return optim_files
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def parse_model_state(file):
    """
    Load a model-states file and pull out what the fp32 reconstruction needs.

    Args:
        - ``file``: path to the model-states file

    Returns:
        - tuple ``(buffers, param_shapes, ds_version)``: the buffers converted with ``.float()``,
          the value stored under ``PARAM_SHAPES``, and the value stored under ``DS_VERSION``
          (``None`` when that key is absent)

    Raises:
        - ``ValueError``: the loaded object has no ``BUFFER_NAMES`` entry
    """
    # NOTE(review): torch.load unpickles the file - only use it on checkpoints you trust
    checkpoint = torch.load(file, map_location=device)

    if BUFFER_NAMES not in checkpoint:
        raise ValueError(f"{file} is not a model state checkpoint")
    buffer_names = checkpoint[BUFFER_NAMES]
    if debug:
        print("Found buffers:", buffer_names)

    # recover just the buffers while restoring them to fp32 if they were saved in fp16
    buffers = {}
    for name, tensor in checkpoint["module"].items():
        if name in buffer_names:
            buffers[name] = tensor.float()

    param_shapes = checkpoint[PARAM_SHAPES]
    ds_version = checkpoint.get(DS_VERSION, None)

    return buffers, param_shapes, ds_version
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load every optimizer-states file and extract the flat fp32 partitions.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files, one per saved partition
        - ``ds_checkpoint_dir``: checkpoint folder, used only in error messages

    Returns:
        - tuple ``(zero_stage, world_size, fp32_flat_groups)``. For stage 2 each entry of
          ``fp32_flat_groups`` is the per-file value stored under ``SINGLE_PARTITION_OF_FP32_GROUPS``;
          for stage 3 each entry is the concatenation of the per-file ``FP32_FLAT_GROUPS`` tensors.

    Raises:
        - ``ValueError``: no files were given, the first file is not a zero checkpoint, the number
          of files does not match the recorded partition count, or the stage is not 2 or 3
    """
    if not files:
        # fail with a clear message instead of an IndexError on state_dicts[0] below
        raise ValueError(f"no '*_optim_states.pt' files were given for '{ds_checkpoint_dir}'")

    total_files = len(files)
    # NOTE(review): torch.load unpickles the files - only use it on checkpoints you trust
    state_dicts = [torch.load(f, map_location=device) for f in files]

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage == 2:
        fp32_flat_groups = [
            sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts
        ]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [
            torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts
        ]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    The optimizer files are parsed first (they carry the zero stage and the world size), then
    the model-states file, and finally the stage-specific reconstruction routine is called.

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
    buffers, param_shapes, ds_version = parse_model_state(model_file)
    print(f'Parsing checkpoint created by deepspeed=={ds_version}')

    if zero_stage == 2:
        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        # any other stage falls through without a result, as before
        return None

    return rebuild(world_size, param_shapes, fp32_flat_groups, buffers)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild a consolidated fp32 ``state_dict`` from ZeRO stage 2 partitions.

    For every param group the per-file flat tensors are concatenated into one vector, and each
    parameter is then carved out of that vector with ``narrow(...).view(shape)`` in the order
    given by ``param_shapes``.

    Args:
        - ``world_size``: number of partitions; also used to derive the alignment of the sanity check
        - ``param_shapes``: sequence with one ``name -> shape`` mapping per param group
        - ``fp32_flat_groups``: indexed as ``[file][param_group]``, each entry a flat tensor
        - ``buffers``: mapping of buffers, copied into the result before the parameters

    Returns:
        - ``OrderedDict`` with the buffers followed by the reconstructed parameters

    Raises:
        - ``ValueError``: after alignment, the numels consumed for a param group do not match
          the numels available for it
    """
    # Reconstruction protocol:
    #
    # XXX: document this

    num_param_groups = len(fp32_flat_groups[0])

    if debug:
        for i in range(world_size):
            for j in range(num_param_groups):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    merged_single_partition_of_fp32_groups = [
        torch.cat([partitions[group_idx] for partitions in fp32_flat_groups], 0)
        for group_idx in range(num_param_groups)
    ]

    if debug:
        avail_numel = sum(vector.numel() for vector in merged_single_partition_of_fp32_groups)
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(
            sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        # round up to a multiple of align_to using exact integer arithmetic
        return align_to * -(-x // align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():
            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # offset and avail_numel are per-group values, so the check runs once per param group
        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(
                f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")

    return state_dict
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how one parameter is split across ``world_size`` equal partitions.

    ``divmod`` keeps the computation in exact integers; a float-based ceiling would round
    incorrectly once the element count exceeds 2**53.

    Args:
        - ``unpartitioned_numel``: number of elements of the full parameter
        - ``world_size``: number of partitions

    Returns:
        - tuple ``(partitioned_numel, padding_numel)``: elements per partition (rounded up) and
          the number of padding elements needed to make the total a multiple of ``world_size``
    """
    full_chunks, remainder = divmod(unpartitioned_numel, world_size)
    if remainder:
        return full_chunks + 1, world_size - remainder
    return full_chunks, 0
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild a consolidated fp32 ``state_dict`` from ZeRO stage 3 partitions.

    Args:
        - ``world_size``: number of partitions
        - ``param_shapes``: sequence of ``name -> shape`` mappings, merged here in order
        - ``fp32_flat_groups``: one flat tensor per partition
        - ``buffers``: mapping of buffers, copied into the result before the parameters

    Returns:
        - ``OrderedDict`` with the buffers followed by the reconstructed parameters

    Raises:
        - ``ValueError``: the numels consumed do not match the numels available
    """
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    avail_numel = fp32_flat_groups[0].numel() * world_size

    # merge list of dicts, preserving order
    merged_shapes = {}
    for group_shapes in param_shapes:
        merged_shapes.update(group_shapes)
    param_shapes = merged_shapes

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take this param's slice from every partition, glue the slices together, then drop
        # the trailing padding before restoring the original shape
        rank_slices = tuple(
            fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size))
        glued = torch.cat(rank_slices, 0)
        state_dict[name] = glued.narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(
            f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")

    return state_dict
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` was not given and the 'latest' file is missing or empty
        - ``FileNotFoundError``: the folder ``checkpoint_dir/tag`` does not exist

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()
        if not tag:
            # an empty tag would make os.path.join() resolve to checkpoint_dir itself and
            # the lookup would silently continue in the wrong folder
            raise ValueError(f"'latest' file at {latest_path} is empty, please pass the tag explicitly")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Extract the consolidated fp32 ``state_dict`` from a ZeRO 2 or 3 checkpoint and write it to
    ``output_file`` with ``torch.save``. The resulting file can be read back with
    ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
    """
    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    print(f"Saving fp32 state dict to {output_file}")
    torch.save(fp32_state_dict, output_file)
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    2. Put the provided model to cpu
    3. Load the ``state_dict`` into the provided model (with ``strict=False``)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    model.load_state_dict(fp32_weights, strict=False)

    return model
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
if __name__ == "__main__":
    # Command-line entry point: zero_to_fp32.py <checkpoint_dir> <output_file> [-d]
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument("output_file",
                        type=str,
                        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # rebinding the module-level flag switches on the extra prints in the functions above
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
|
display_v3/2023-04-14_16-59-07/hparams.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amp: true
|
| 2 |
+
batch_size: 16
|
| 3 |
+
cls_target: hd
|
| 4 |
+
deepspeed: true
|
| 5 |
+
dev_data_file: ''
|
| 6 |
+
downsample_data: true
|
| 7 |
+
early_dropout: null
|
| 8 |
+
epochs: 10
|
| 9 |
+
freeze_encoder: false
|
| 10 |
+
just_test: false
|
| 11 |
+
log_fold: ./logs
|
| 12 |
+
log_step: 10
|
| 13 |
+
lr: 5.0e-05
|
| 14 |
+
model_name: bert-base-uncased
|
| 15 |
+
positive_ratio: 0.4
|
| 16 |
+
pretrained_model_fold: ./pretrained_model
|
| 17 |
+
rdrop: null
|
| 18 |
+
running time: 0:06:52
|
| 19 |
+
share_encoder: false
|
| 20 |
+
test_data_file: ''
|
| 21 |
+
train_data_file: ''
|
| 22 |
+
train_ratio: 0.8
|
| 23 |
+
version: structure cmp
|
display_v3/2023-04-14_16-59-07/metrics.csv
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
train_loss,epoch,step,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_macro_f1,train_accuracy,train_precision,train_recall,train_f1,train_macro_f1,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_macro_f1
|
| 2 |
+
0.611328125,0,9,,,,,,,,,,,,,,,,,
|
| 3 |
+
0.5078125,0,19,,,,,,,,,,,,,,,,,
|
| 4 |
+
0.7041015625,0,29,,,,,,,,,,,,,,,,,
|
| 5 |
+
0.7119140625,0,39,,,,,,,,,,,,,,,,,
|
| 6 |
+
0.398193359375,0,49,,,,,,,,,,,,,,,,,
|
| 7 |
+
0.461181640625,0,59,,,,,,,,,,,,,,,,,
|
| 8 |
+
0.386474609375,0,69,,,,,,,,,,,,,,,,,
|
| 9 |
+
0.4326171875,0,79,,,,,,,,,,,,,,,,,
|
| 10 |
+
0.66748046875,0,89,,,,,,,,,,,,,,,,,
|
| 11 |
+
0.33544921875,0,99,,,,,,,,,,,,,,,,,
|
| 12 |
+
0.30810546875,0,109,,,,,,,,,,,,,,,,,
|
| 13 |
+
0.55615234375,0,119,,,,,,,,,,,,,,,,,
|
| 14 |
+
0.2861328125,0,129,,,,,,,,,,,,,,,,,
|
| 15 |
+
0.60400390625,0,139,,,,,,,,,,,,,,,,,
|
| 16 |
+
0.623046875,0,149,,,,,,,,,,,,,,,,,
|
| 17 |
+
0.53076171875,0,159,,,,,,,,,,,,,,,,,
|
| 18 |
+
0.418701171875,0,169,,,,,,,,,,,,,,,,,
|
| 19 |
+
0.495361328125,0,179,,,,,,,,,,,,,,,,,
|
| 20 |
+
0.40625,0,189,,,,,,,,,,,,,,,,,
|
| 21 |
+
0.448974609375,0,199,,,,,,,,,,,,,,,,,
|
| 22 |
+
0.295654296875,0,209,,,,,,,,,,,,,,,,,
|
| 23 |
+
0.6376953125,0,219,,,,,,,,,,,,,,,,,
|
| 24 |
+
0.6279296875,0,229,,,,,,,,,,,,,,,,,
|
| 25 |
+
,0,232,0.3514973521232605,0.8561705946922302,0.33702337741851807,0.7425474524497986,0.46362099051475525,0.46362099051475525,,,,,,,,,,,
|
| 26 |
+
,0,232,,,,,,,0.7682795524597168,0.7229344844818115,0.6821236610412598,0.7019363641738892,0.7019363641738892,,,,,,
|
| 27 |
+
0.2568359375,1,239,,,,,,,,,,,,,,,,,
|
| 28 |
+
0.229248046875,1,249,,,,,,,,,,,,,,,,,
|
| 29 |
+
0.066650390625,1,259,,,,,,,,,,,,,,,,,
|
| 30 |
+
0.3134765625,1,269,,,,,,,,,,,,,,,,,
|
| 31 |
+
0.1636962890625,1,279,,,,,,,,,,,,,,,,,
|
| 32 |
+
0.36279296875,1,289,,,,,,,,,,,,,,,,,
|
| 33 |
+
0.150146484375,1,299,,,,,,,,,,,,,,,,,
|
| 34 |
+
0.246337890625,1,309,,,,,,,,,,,,,,,,,
|
| 35 |
+
0.3505859375,1,319,,,,,,,,,,,,,,,,,
|
| 36 |
+
0.329345703125,1,329,,,,,,,,,,,,,,,,,
|
| 37 |
+
0.181640625,1,339,,,,,,,,,,,,,,,,,
|
| 38 |
+
0.2763671875,1,349,,,,,,,,,,,,,,,,,
|
| 39 |
+
0.241455078125,1,359,,,,,,,,,,,,,,,,,
|
| 40 |
+
0.302490234375,1,369,,,,,,,,,,,,,,,,,
|
| 41 |
+
0.255859375,1,379,,,,,,,,,,,,,,,,,
|
| 42 |
+
0.24267578125,1,389,,,,,,,,,,,,,,,,,
|
| 43 |
+
0.465576171875,1,399,,,,,,,,,,,,,,,,,
|
| 44 |
+
0.484619140625,1,409,,,,,,,,,,,,,,,,,
|
| 45 |
+
0.262451171875,1,419,,,,,,,,,,,,,,,,,
|
| 46 |
+
0.271728515625,1,429,,,,,,,,,,,,,,,,,
|
| 47 |
+
0.190673828125,1,439,,,,,,,,,,,,,,,,,
|
| 48 |
+
0.474853515625,1,449,,,,,,,,,,,,,,,,,
|
| 49 |
+
0.56201171875,1,459,,,,,,,,,,,,,,,,,
|
| 50 |
+
,1,465,0.3760688304901123,0.8321233987808228,0.30616509914398193,0.794037938117981,0.4419306218624115,0.4419306218624115,,,,,,,,,,,
|
| 51 |
+
,1,465,,,,,,,0.875268816947937,0.8464140892028809,0.8407257795333862,0.8435603380203247,0.8435603380203247,,,,,,
|
| 52 |
+
0.07574462890625,2,469,,,,,,,,,,,,,,,,,
|
| 53 |
+
0.206787109375,2,479,,,,,,,,,,,,,,,,,
|
| 54 |
+
0.033966064453125,2,489,,,,,,,,,,,,,,,,,
|
| 55 |
+
0.1544189453125,2,499,,,,,,,,,,,,,,,,,
|
| 56 |
+
0.13671875,2,509,,,,,,,,,,,,,,,,,
|
| 57 |
+
0.07061767578125,2,519,,,,,,,,,,,,,,,,,
|
| 58 |
+
0.0055084228515625,2,529,,,,,,,,,,,,,,,,,
|
| 59 |
+
0.2064208984375,2,539,,,,,,,,,,,,,,,,,
|
| 60 |
+
0.020599365234375,2,549,,,,,,,,,,,,,,,,,
|
| 61 |
+
0.0143585205078125,2,559,,,,,,,,,,,,,,,,,
|
| 62 |
+
0.0257720947265625,2,569,,,,,,,,,,,,,,,,,
|
| 63 |
+
0.052337646484375,2,579,,,,,,,,,,,,,,,,,
|
| 64 |
+
0.0276031494140625,2,589,,,,,,,,,,,,,,,,,
|
| 65 |
+
0.044097900390625,2,599,,,,,,,,,,,,,,,,,
|
| 66 |
+
0.06817626953125,2,609,,,,,,,,,,,,,,,,,
|
| 67 |
+
0.1556396484375,2,619,,,,,,,,,,,,,,,,,
|
| 68 |
+
0.32763671875,2,629,,,,,,,,,,,,,,,,,
|
| 69 |
+
0.05426025390625,2,639,,,,,,,,,,,,,,,,,
|
| 70 |
+
0.0640869140625,2,649,,,,,,,,,,,,,,,,,
|
| 71 |
+
0.0293426513671875,2,659,,,,,,,,,,,,,,,,,
|
| 72 |
+
0.2349853515625,2,669,,,,,,,,,,,,,,,,,
|
| 73 |
+
0.0736083984375,2,679,,,,,,,,,,,,,,,,,
|
| 74 |
+
0.0474853515625,2,689,,,,,,,,,,,,,,,,,
|
| 75 |
+
,2,698,1.0874371528625488,0.7009981870651245,0.2065553516149521,0.9051490426063538,0.33635449409484863,0.33635449409484863,,,,,,,,,,,
|
| 76 |
+
,2,698,,,,,,,0.9575268626213074,0.9421542286872864,0.9522849321365356,0.9471924901008606,0.9471924901008606,,,,,,
|
| 77 |
+
0.0280609130859375,3,699,,,,,,,,,,,,,,,,,
|
| 78 |
+
0.09930419921875,3,709,,,,,,,,,,,,,,,,,
|
| 79 |
+
0.0268096923828125,3,719,,,,,,,,,,,,,,,,,
|
| 80 |
+
0.03912353515625,3,729,,,,,,,,,,,,,,,,,
|
| 81 |
+
0.00539398193359375,3,739,,,,,,,,,,,,,,,,,
|
| 82 |
+
0.0030689239501953125,3,749,,,,,,,,,,,,,,,,,
|
| 83 |
+
0.00466156005859375,3,759,,,,,,,,,,,,,,,,,
|
| 84 |
+
0.0199432373046875,3,769,,,,,,,,,,,,,,,,,
|
| 85 |
+
0.0164337158203125,3,779,,,,,,,,,,,,,,,,,
|
| 86 |
+
0.09100341796875,3,789,,,,,,,,,,,,,,,,,
|
| 87 |
+
0.0152435302734375,3,799,,,,,,,,,,,,,,,,,
|
| 88 |
+
0.0019006729125976562,3,809,,,,,,,,,,,,,,,,,
|
| 89 |
+
0.00083160400390625,3,819,,,,,,,,,,,,,,,,,
|
| 90 |
+
0.0223541259765625,3,829,,,,,,,,,,,,,,,,,
|
| 91 |
+
0.1595458984375,3,839,,,,,,,,,,,,,,,,,
|
| 92 |
+
0.004375457763671875,3,849,,,,,,,,,,,,,,,,,
|
| 93 |
+
0.01349639892578125,3,859,,,,,,,,,,,,,,,,,
|
| 94 |
+
0.040191650390625,3,869,,,,,,,,,,,,,,,,,
|
| 95 |
+
0.031494140625,3,879,,,,,,,,,,,,,,,,,
|
| 96 |
+
0.01474761962890625,3,889,,,,,,,,,,,,,,,,,
|
| 97 |
+
0.022308349609375,3,899,,,,,,,,,,,,,,,,,
|
| 98 |
+
0.035919189453125,3,909,,,,,,,,,,,,,,,,,
|
| 99 |
+
0.107177734375,3,919,,,,,,,,,,,,,,,,,
|
| 100 |
+
0.04888916015625,3,929,,,,,,,,,,,,,,,,,
|
| 101 |
+
,3,931,0.452526330947876,0.8439201712608337,0.31168830394744873,0.7154471278190613,0.43421053886413574,0.43421053886413574,,,,,,,,,,,
|
| 102 |
+
,3,931,,,,,,,0.9806451797485352,0.974530816078186,0.977150559425354,0.9758388996124268,0.9758388996124268,,,,,,
|
| 103 |
+
0.0186004638671875,4,939,,,,,,,,,,,,,,,,,
|
| 104 |
+
0.0024738311767578125,4,949,,,,,,,,,,,,,,,,,
|
| 105 |
+
0.0012655258178710938,4,959,,,,,,,,,,,,,,,,,
|
| 106 |
+
0.001422882080078125,4,969,,,,,,,,,,,,,,,,,
|
| 107 |
+
0.0029392242431640625,4,979,,,,,,,,,,,,,,,,,
|
| 108 |
+
0.1710205078125,4,989,,,,,,,,,,,,,,,,,
|
| 109 |
+
0.12115478515625,4,999,,,,,,,,,,,,,,,,,
|
| 110 |
+
0.00638580322265625,4,1009,,,,,,,,,,,,,,,,,
|
| 111 |
+
0.00469207763671875,4,1019,,,,,,,,,,,,,,,,,
|
| 112 |
+
0.013702392578125,4,1029,,,,,,,,,,,,,,,,,
|
| 113 |
+
0.0222625732421875,4,1039,,,,,,,,,,,,,,,,,
|
| 114 |
+
0.045074462890625,4,1049,,,,,,,,,,,,,,,,,
|
| 115 |
+
0.00867462158203125,4,1059,,,,,,,,,,,,,,,,,
|
| 116 |
+
0.003887176513671875,4,1069,,,,,,,,,,,,,,,,,
|
| 117 |
+
0.029052734375,4,1079,,,,,,,,,,,,,,,,,
|
| 118 |
+
0.0028285980224609375,4,1089,,,,,,,,,,,,,,,,,
|
| 119 |
+
0.00045561790466308594,4,1099,,,,,,,,,,,,,,,,,
|
| 120 |
+
0.0133209228515625,4,1109,,,,,,,,,,,,,,,,,
|
| 121 |
+
0.304443359375,4,1119,,,,,,,,,,,,,,,,,
|
| 122 |
+
0.002223968505859375,4,1129,,,,,,,,,,,,,,,,,
|
| 123 |
+
0.0014781951904296875,4,1139,,,,,,,,,,,,,,,,,
|
| 124 |
+
0.005718231201171875,4,1149,,,,,,,,,,,,,,,,,
|
| 125 |
+
0.0115966796875,4,1159,,,,,,,,,,,,,,,,,
|
| 126 |
+
,4,1164,0.7260090708732605,0.7958257794380188,0.2644188106060028,0.8075881004333496,0.39839571714401245,0.39839571714401245,,,,,,,,,,,
|
| 127 |
+
,4,1164,,,,,,,0.9889785051345825,0.9852448105812073,0.9872311949729919,0.9862369894981384,0.9862369894981384,,,,,,
|
| 128 |
+
0.005008697509765625,5,1169,,,,,,,,,,,,,,,,,
|
| 129 |
+
0.007183074951171875,5,1179,,,,,,,,,,,,,,,,,
|
| 130 |
+
0.1400146484375,5,1189,,,,,,,,,,,,,,,,,
|
| 131 |
+
0.0007276535034179688,5,1199,,,,,,,,,,,,,,,,,
|
| 132 |
+
0.1689453125,5,1209,,,,,,,,,,,,,,,,,
|
| 133 |
+
0.00457763671875,5,1219,,,,,,,,,,,,,,,,,
|
| 134 |
+
0.035430908203125,5,1229,,,,,,,,,,,,,,,,,
|
| 135 |
+
0.0012540817260742188,5,1239,,,,,,,,,,,,,,,,,
|
| 136 |
+
0.0225372314453125,5,1249,,,,,,,,,,,,,,,,,
|
| 137 |
+
0.0008778572082519531,5,1259,,,,,,,,,,,,,,,,,
|
| 138 |
+
0.01336669921875,5,1269,,,,,,,,,,,,,,,,,
|
| 139 |
+
0.00044846534729003906,5,1279,,,,,,,,,,,,,,,,,
|
| 140 |
+
0.00408172607421875,5,1289,,,,,,,,,,,,,,,,,
|
| 141 |
+
0.00037980079650878906,5,1299,,,,,,,,,,,,,,,,,
|
| 142 |
+
0.0004723072052001953,5,1309,,,,,,,,,,,,,,,,,
|
| 143 |
+
0.01436614990234375,5,1319,,,,,,,,,,,,,,,,,
|
| 144 |
+
0.0670166015625,5,1329,,,,,,,,,,,,,,,,,
|
| 145 |
+
0.07574462890625,5,1339,,,,,,,,,,,,,,,,,
|
| 146 |
+
0.01025390625,5,1349,,,,,,,,,,,,,,,,,
|
| 147 |
+
0.10150146484375,5,1359,,,,,,,,,,,,,,,,,
|
| 148 |
+
0.0014791488647460938,5,1369,,,,,,,,,,,,,,,,,
|
| 149 |
+
0.003528594970703125,5,1379,,,,,,,,,,,,,,,,,
|
| 150 |
+
0.002532958984375,5,1389,,,,,,,,,,,,,,,,,
|
| 151 |
+
,5,1397,1.1667187213897705,0.7558983564376831,0.2332075536251068,0.8373983502388,0.3648169934749603,0.3648169934749603,,,,,,,,,,,
|
| 152 |
+
,5,1397,,,,,,,0.9905914068222046,0.9892255663871765,0.9872311949729919,0.988227367401123,0.988227367401123,,,,,,
|
| 153 |
+
0.004306793212890625,6,1399,,,,,,,,,,,,,,,,,
|
| 154 |
+
0.010406494140625,6,1409,,,,,,,,,,,,,,,,,
|
| 155 |
+
0.00102996826171875,6,1419,,,,,,,,,,,,,,,,,
|
| 156 |
+
0.00447845458984375,6,1429,,,,,,,,,,,,,,,,,
|
| 157 |
+
0.0005435943603515625,6,1439,,,,,,,,,,,,,,,,,
|
| 158 |
+
0.155029296875,6,1449,,,,,,,,,,,,,,,,,
|
| 159 |
+
0.00028395652770996094,6,1459,,,,,,,,,,,,,,,,,
|
| 160 |
+
0.0070037841796875,6,1469,,,,,,,,,,,,,,,,,
|
| 161 |
+
0.009063720703125,6,1479,,,,,,,,,,,,,,,,,
|
| 162 |
+
0.07452392578125,6,1489,,,,,,,,,,,,,,,,,
|
| 163 |
+
0.005832672119140625,6,1499,,,,,,,,,,,,,,,,,
|
| 164 |
+
0.0043487548828125,6,1509,,,,,,,,,,,,,,,,,
|
| 165 |
+
0.00695037841796875,6,1519,,,,,,,,,,,,,,,,,
|
| 166 |
+
0.06646728515625,6,1529,,,,,,,,,,,,,,,,,
|
| 167 |
+
0.001789093017578125,6,1539,,,,,,,,,,,,,,,,,
|
| 168 |
+
0.00323486328125,6,1549,,,,,,,,,,,,,,,,,
|
| 169 |
+
0.0006985664367675781,6,1559,,,,,,,,,,,,,,,,,
|
| 170 |
+
0.0648193359375,6,1569,,,,,,,,,,,,,,,,,
|
| 171 |
+
0.01558685302734375,6,1579,,,,,,,,,,,,,,,,,
|
| 172 |
+
0.00103759765625,6,1589,,,,,,,,,,,,,,,,,
|
| 173 |
+
0.001270294189453125,6,1599,,,,,,,,,,,,,,,,,
|
| 174 |
+
0.0002396106719970703,6,1609,,,,,,,,,,,,,,,,,
|
| 175 |
+
0.0003399848937988281,6,1619,,,,,,,,,,,,,,,,,
|
| 176 |
+
0.04937744140625,6,1629,,,,,,,,,,,,,,,,,
|
| 177 |
+
,6,1630,0.5342352986335754,0.8849818706512451,0.38383838534355164,0.6178861856460571,0.4735202491283417,0.4735202491283417,,,,,,,,,,,
|
| 178 |
+
,6,1630,,,,,,,0.9916666746139526,0.987943708896637,0.9912634491920471,0.9896007776260376,0.9896007776260376,,,,,,
|
| 179 |
+
0.0019931793212890625,7,1639,,,,,,,,,,,,,,,,,
|
| 180 |
+
0.0010328292846679688,7,1649,,,,,,,,,,,,,,,,,
|
| 181 |
+
0.002391815185546875,7,1659,,,,,,,,,,,,,,,,,
|
| 182 |
+
0.025543212890625,7,1669,,,,,,,,,,,,,,,,,
|
| 183 |
+
0.0016775131225585938,7,1679,,,,,,,,,,,,,,,,,
|
| 184 |
+
0.035919189453125,7,1689,,,,,,,,,,,,,,,,,
|
| 185 |
+
0.00547027587890625,7,1699,,,,,,,,,,,,,,,,,
|
| 186 |
+
0.0006341934204101562,7,1709,,,,,,,,,,,,,,,,,
|
| 187 |
+
0.0009632110595703125,7,1719,,,,,,,,,,,,,,,,,
|
| 188 |
+
0.00418853759765625,7,1729,,,,,,,,,,,,,,,,,
|
| 189 |
+
0.0033130645751953125,7,1739,,,,,,,,,,,,,,,,,
|
| 190 |
+
0.001251220703125,7,1749,,,,,,,,,,,,,,,,,
|
| 191 |
+
0.00024580955505371094,7,1759,,,,,,,,,,,,,,,,,
|
| 192 |
+
0.0007381439208984375,7,1769,,,,,,,,,,,,,,,,,
|
| 193 |
+
0.00131988525390625,7,1779,,,,,,,,,,,,,,,,,
|
| 194 |
+
0.00652313232421875,7,1789,,,,,,,,,,,,,,,,,
|
| 195 |
+
0.00263214111328125,7,1799,,,,,,,,,,,,,,,,,
|
| 196 |
+
0.0014677047729492188,7,1809,,,,,,,,,,,,,,,,,
|
| 197 |
+
0.0016336441040039062,7,1819,,,,,,,,,,,,,,,,,
|
| 198 |
+
0.0007638931274414062,7,1829,,,,,,,,,,,,,,,,,
|
| 199 |
+
0.00135040283203125,7,1839,,,,,,,,,,,,,,,,,
|
| 200 |
+
0.002391815185546875,7,1849,,,,,,,,,,,,,,,,,
|
| 201 |
+
0.0011796951293945312,7,1859,,,,,,,,,,,,,,,,,
|
| 202 |
+
,7,1863,1.4027303457260132,0.7279945611953735,0.22370173037052155,0.9105691313743591,0.3591662347316742,0.3591662347316742,,,,,,,,,,,
|
| 203 |
+
,7,1863,,,,,,,0.9935483932495117,0.9925975799560547,0.9912634491920471,0.9919300675392151,0.9919300675392151,,,,,,
|
| 204 |
+
0.00128936767578125,8,1869,,,,,,,,,,,,,,,,,
|
| 205 |
+
0.00234222412109375,8,1879,,,,,,,,,,,,,,,,,
|
| 206 |
+
0.00522613525390625,8,1889,,,,,,,,,,,,,,,,,
|
| 207 |
+
0.1265869140625,8,1899,,,,,,,,,,,,,,,,,
|
| 208 |
+
0.0026607513427734375,8,1909,,,,,,,,,,,,,,,,,
|
| 209 |
+
0.0024738311767578125,8,1919,,,,,,,,,,,,,,,,,
|
| 210 |
+
0.0029926300048828125,8,1929,,,,,,,,,,,,,,,,,
|
| 211 |
+
0.0010385513305664062,8,1939,,,,,,,,,,,,,,,,,
|
| 212 |
+
0.0003845691680908203,8,1949,,,,,,,,,,,,,,,,,
|
| 213 |
+
0.0232086181640625,8,1959,,,,,,,,,,,,,,,,,
|
| 214 |
+
0.00035262107849121094,8,1969,,,,,,,,,,,,,,,,,
|
| 215 |
+
0.00084686279296875,8,1979,,,,,,,,,,,,,,,,,
|
| 216 |
+
0.0023326873779296875,8,1989,,,,,,,,,,,,,,,,,
|
| 217 |
+
0.0024738311767578125,8,1999,,,,,,,,,,,,,,,,,
|
| 218 |
+
0.0016727447509765625,8,2009,,,,,,,,,,,,,,,,,
|
| 219 |
+
0.0006160736083984375,8,2019,,,,,,,,,,,,,,,,,
|
| 220 |
+
0.0017137527465820312,8,2029,,,,,,,,,,,,,,,,,
|
| 221 |
+
0.145751953125,8,2039,,,,,,,,,,,,,,,,,
|
| 222 |
+
0.000591278076171875,8,2049,,,,,,,,,,,,,,,,,
|
| 223 |
+
0.001270294189453125,8,2059,,,,,,,,,,,,,,,,,
|
| 224 |
+
0.00011342763900756836,8,2069,,,,,,,,,,,,,,,,,
|
| 225 |
+
0.0004799365997314453,8,2079,,,,,,,,,,,,,,,,,
|
| 226 |
+
0.00064849853515625,8,2089,,,,,,,,,,,,,,,,,
|
| 227 |
+
,8,2096,1.368309736251831,0.7813067436218262,0.2531120479106903,0.8265582919120789,0.38754764199256897,0.38754764199256897,,,,,,,,,,,
|
| 228 |
+
,8,2096,,,,,,,0.9967741966247559,0.9953020215034485,0.9966397881507874,0.9959704279899597,0.9959704279899597,,,,,,
|
| 229 |
+
0.00011461973190307617,9,2099,,,,,,,,,,,,,,,,,
|
| 230 |
+
0.00010311603546142578,9,2109,,,,,,,,,,,,,,,,,
|
| 231 |
+
8.416175842285156e-05,9,2119,,,,,,,,,,,,,,,,,
|
| 232 |
+
0.00021708011627197266,9,2129,,,,,,,,,,,,,,,,,
|
| 233 |
+
0.0002524852752685547,9,2139,,,,,,,,,,,,,,,,,
|
| 234 |
+
8.296966552734375e-05,9,2149,,,,,,,,,,,,,,,,,
|
| 235 |
+
0.0005345344543457031,9,2159,,,,,,,,,,,,,,,,,
|
| 236 |
+
0.0016202926635742188,9,2169,,,,,,,,,,,,,,,,,
|
| 237 |
+
0.0078582763671875,9,2179,,,,,,,,,,,,,,,,,
|
| 238 |
+
0.00012177228927612305,9,2189,,,,,,,,,,,,,,,,,
|
| 239 |
+
0.15673828125,9,2199,,,,,,,,,,,,,,,,,
|
| 240 |
+
0.00012803077697753906,9,2209,,,,,,,,,,,,,,,,,
|
| 241 |
+
8.165836334228516e-05,9,2219,,,,,,,,,,,,,,,,,
|
| 242 |
+
7.212162017822266e-05,9,2229,,,,,,,,,,,,,,,,,
|
| 243 |
+
0.0003197193145751953,9,2239,,,,,,,,,,,,,,,,,
|
| 244 |
+
0.09149169921875,9,2249,,,,,,,,,,,,,,,,,
|
| 245 |
+
0.001117706298828125,9,2259,,,,,,,,,,,,,,,,,
|
| 246 |
+
0.00513458251953125,9,2269,,,,,,,,,,,,,,,,,
|
| 247 |
+
0.021209716796875,9,2279,,,,,,,,,,,,,,,,,
|
| 248 |
+
0.0097503662109375,9,2289,,,,,,,,,,,,,,,,,
|
| 249 |
+
0.0246429443359375,9,2299,,,,,,,,,,,,,,,,,
|
| 250 |
+
0.0011377334594726562,9,2309,,,,,,,,,,,,,,,,,
|
| 251 |
+
0.001354217529296875,9,2319,,,,,,,,,,,,,,,,,
|
| 252 |
+
0.00032639503479003906,9,2329,,,,,,,,,,,,,,,,,
|
| 253 |
+
,9,2329,1.1365300416946411,0.7631579041481018,0.242562934756279,0.8617886304855347,0.37857142090797424,0.37857142090797424,,,,,,,,,,,
|
| 254 |
+
,9,2329,,,,,,,0.9940860271453857,0.9912868738174438,0.9939516186714172,0.9926174283027649,0.9926174283027649,,,,,,
|
| 255 |
+
,10,2330,,,,,,,,,,,,0.5662358999252319,0.8833031058311462,0.3981233239173889,0.6048879623413086,0.4801940321922302,0.4801940321922302
|
display_v3/2023-04-14_16-59-07/yes.txt
ADDED
|
File without changes
|
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0fc4b81c7a5f3c433e5c8de5b92730a3c76c4a32c2dc8b60ad349d236a1c0697
|
| 3 |
+
size 220228915
|
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1bacc5807e8789eb3a6853e0d7a148cf921a0737988170b3dfcd7136f205cc60
|
| 3 |
+
size 1320918341
|
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/latest
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
checkpoint
|
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/zero_to_fp32.py
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
'''Copyright The Microsoft DeepSpeed Team'''
|
| 3 |
+
|
| 4 |
+
# This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
|
| 5 |
+
# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
|
| 6 |
+
# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
|
| 7 |
+
# application.
|
| 8 |
+
#
|
| 9 |
+
# example: python zero_to_fp32.py . pytorch_model.bin
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import torch
|
| 13 |
+
import glob
|
| 14 |
+
import math
|
| 15 |
+
import os
|
| 16 |
+
import re
|
| 17 |
+
from collections import OrderedDict
|
| 18 |
+
|
| 19 |
+
# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
|
| 20 |
+
# DeepSpeed data structures it has to be available in the current python environment.
|
| 21 |
+
from deepspeed.utils import logger
|
| 22 |
+
from deepspeed.checkpoint.constants import (DS_VERSION,
|
| 23 |
+
OPTIMIZER_STATE_DICT,
|
| 24 |
+
SINGLE_PARTITION_OF_FP32_GROUPS,
|
| 25 |
+
FP32_FLAT_GROUPS,
|
| 26 |
+
ZERO_STAGE,
|
| 27 |
+
PARTITION_COUNT,
|
| 28 |
+
PARAM_SHAPES,
|
| 29 |
+
BUFFER_NAMES)
|
| 30 |
+
|
| 31 |
+
# Set to a truthy value to enable the verbose diagnostic prints guarded by ``if debug:`` below.
debug = 0

# Checkpoint files are always mapped onto the CPU when loaded.
device = torch.device("cpu")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def atoi(text):
    """Convert ``text`` to an ``int`` when it is made up solely of decimal digits.

    Args:
        - ``text``: a string chunk, typically one piece produced by ``re.split(r'(\\d+)', ...)``

    Returns:
        - ``int(text)`` if every character is a decimal digit, otherwise ``text`` unchanged
          (the empty string is returned as is)
    """
    # ``str.isdigit`` also accepts characters such as superscript digits ("²") which ``int`` rejects
    # with a ValueError; ``str.isdecimal`` only accepts characters that ``int`` can parse.
    return int(text) if text.isdecimal() else text
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def natural_keys(text):
    '''
    Build a sort key that orders strings in human ("natural") order, i.e. embedded runs of
    digits are compared as numbers: ``alist.sort(key=natural_keys)``

    The string is split on digit runs and every piece is passed through ``atoi``.

    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the model states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory that holds the checkpoint files
        - ``zero_stage``: ZeRO stage of the checkpoint; only 2 and 3 are supported

    Returns:
        - path to the model states file for the given stage

    Raises:
        - ``FileNotFoundError``: if ``checkpoint_dir`` is not a directory or the expected file is missing
        - ``ValueError``: if ``zero_stage`` is neither 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage == 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch ``file`` stays unbound and the check below fails with an
        # UnboundLocalError instead of a meaningful message
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_optim_files(checkpoint_dir):
    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir``, in natural sort order.

    Args:
        - ``checkpoint_dir``: directory to search (non-recursive)

    Raises:
        - ``FileNotFoundError``: if no matching file is found
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, "*_optim_states.pt")
    optim_files = glob.glob(pattern)
    optim_files.sort(key=natural_keys)

    if not optim_files:
        raise FileNotFoundError(
            f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")

    return optim_files
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def parse_model_state(file):
    """Load a model states file and pull out what the fp32 reconstruction needs.

    Args:
        - ``file``: path to the model states checkpoint file

    Returns:
        - ``(buffers, param_shapes, ds_version)`` where ``buffers`` maps each buffer name to its
          tensor converted with ``.float()``, ``param_shapes`` is the checkpoint's ``PARAM_SHAPES``
          entry and ``ds_version`` is the ``DS_VERSION`` entry or ``None`` when absent

    Raises:
        - ``ValueError``: if the file has no ``BUFFER_NAMES`` entry
    """
    # NOTE: torch.load unpickles the file - only use it on checkpoints from a trusted source
    checkpoint = torch.load(file, map_location=device)

    if BUFFER_NAMES not in checkpoint:
        raise ValueError(f"{file} is not a model state checkpoint")

    buffer_names = checkpoint[BUFFER_NAMES]
    if debug:
        print("Found buffers:", buffer_names)

    # recover just the buffers while restoring them to fp32 if they were saved in fp16
    buffers = {}
    for key, tensor in checkpoint["module"].items():
        if key in buffer_names:
            buffers[key] = tensor.float()

    return buffers, checkpoint[PARAM_SHAPES], checkpoint.get(DS_VERSION, None)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def parse_optim_states(files, ds_checkpoint_dir):
    """Load all optimizer state files and extract the flat fp32 groups of each one.

    Args:
        - ``files``: list of ``*_optim_states.pt`` paths, one per partition
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)``; ``fp32_flat_groups`` has one entry per
          file - for stage 2 the stored list of group tensors, for stage 3 a single tensor made by
          concatenating that file's group tensors

    Raises:
        - ``ValueError``: if the first file is not a zero checkpoint, the number of files doesn't
          match the partition count, or the zero stage is neither 2 nor 3
    """
    total_files = len(files)
    state_dicts = [torch.load(f, map_location=device) for f in files]

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage == 2:
        fp32_flat_groups = [
            sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts
        ]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [
            torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts
        ]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Rebuild the fp32 state_dict from the files of a ds checkpoint folder

    The optimizer state files are parsed first (they determine the zero stage and world size), then
    the model states file, and finally the stage-specific reconstruction routine is run.

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
    buffers, param_shapes, ds_version = parse_model_state(model_file)
    print(f'Parsing checkpoint created by deepspeed=={ds_version}')

    if zero_stage == 2:
        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        # no reconstruction routine for any other stage
        return None

    return rebuild(world_size, param_shapes, fp32_flat_groups, buffers)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """Rebuild the fp32 state_dict of a ZeRO-2 checkpoint.

    For every param group the per-rank partitions are concatenated into one flat vector, which is
    then sliced param by param according to ``param_shapes``.

    Args:
        - ``world_size``: number of partitions
        - ``param_shapes``: one ``{name: shape}`` mapping per param group
        - ``fp32_flat_groups``: per rank, the list of flat fp32 tensors (one per param group)
        - ``buffers``: ``{name: tensor}`` mapping copied into the result first

    Returns:
        - ``OrderedDict`` with the buffers followed by the reconstructed params

    Raises:
        - ``ValueError``: if, after alignment, the consumed element count of a group doesn't match
          the available one
    """
    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for rank in range(world_size):
            for group_idx in range(len(fp32_flat_groups[0])):
                print(
                    f"{FP32_FLAT_GROUPS}[{rank}][{group_idx}].shape={fp32_flat_groups[rank][group_idx].shape}")

    # XXX: memory usage doubles here (zero2)
    merged_groups = [
        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
        for group_idx in range(len(fp32_flat_groups[0]))
    ]
    avail_numel = sum(flat.numel() for flat in merged_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(shape.numel() for shapes in param_shapes for shape in shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # buffers
    state_dict = OrderedDict()
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat in zip(param_shapes, merged_groups):
        offset = 0
        avail_numel = flat.numel()
        for name, shape in shapes.items():
            numel = shape.numel()
            total_numel += numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {numel} ")

            state_dict[name] = flat.narrow(0, offset, numel).view(shape)
            offset += numel

        # NOTE(review): the alignment + sanity check is taken to run once per param group (offset
        # and avail_numel are both reset per group) - confirm against the upstream script
        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(
                f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")

    return state_dict
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Compute how a param of ``unpartitioned_numel`` elements is split over ``world_size`` ranks.

    Args:
        - ``unpartitioned_numel``: number of elements of the full param
        - ``world_size``: number of partitions

    Returns:
        - ``(partitioned_numel, padding_numel)``: the per-rank element count (rounded up) and the
          number of padding elements needed to make the total a multiple of ``world_size``
    """
    # Pure integer arithmetic: ``math.ceil(a / b)`` goes through a float and silently loses
    # precision once the element count exceeds 2**53.
    partitioned_numel, remainder = divmod(unpartitioned_numel, world_size)
    if remainder:
        partitioned_numel += 1
        padding_numel = world_size - remainder
    else:
        padding_numel = 0
    return partitioned_numel, padding_numel
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """Rebuild the fp32 state_dict of a ZeRO-3 checkpoint.

    Args:
        - ``world_size``: number of partitions
        - ``param_shapes``: list of ``{name: shape}`` mappings, merged here into a single mapping
        - ``fp32_flat_groups``: one flat fp32 tensor per rank
        - ``buffers``: ``{name: tensor}`` mapping copied into the result first

    Returns:
        - ``OrderedDict`` with the buffers followed by the reconstructed params

    Raises:
        - ``ValueError``: if the consumed element count doesn't match the available one
    """
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    avail_numel = fp32_flat_groups[0].numel() * world_size

    # merge list of dicts, preserving order
    shapes_by_name = {}
    for group_shapes in param_shapes:
        shapes_by_name.update(group_shapes)

    # NOTE(review): the summary prints below are treated as debug-only, like in the zero2
    # counterpart - confirm against the upstream script
    if debug:
        for rank in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{rank}].shape={fp32_flat_groups[rank].shape}")

        wanted_params = len(shapes_by_name)
        wanted_numel = sum(shape.numel() for shape in shapes_by_name.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # buffers
    state_dict = OrderedDict()
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    ranks = range(world_size)
    for name, shape in shapes_by_name.items():
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take this param's slice from every rank, glue the slices together, then drop the tail
        # padding before restoring the original shape
        slices = tuple(fp32_flat_groups[rank].narrow(0, offset, partitioned_numel) for rank in ranks)
        state_dict[name] = torch.cat(slices, 0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset was tracked per rank - scale it up to compare against the total over all ranks
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(
            f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")

    return state_dict
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Build a single consolidated fp32 ``state_dict`` out of a ZeRO 2 or 3 checkpoint. The result can
    be fed to ``load_state_dict()`` and used for training without DeepSpeed, or shared with others,
    for example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: name of the sub-folder of ``checkpoint_dir`` holding the checkpoint, e.g.
          ``global_step14``. When omitted, it is read from the ``latest`` file in ``checkpoint_dir``.

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: no ``tag`` was given and ``checkpoint_dir`` contains no ``latest`` file
        - ``FileNotFoundError``: ``checkpoint_dir/tag`` is not an existing directory

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # no explicit tag: fall back to the one recorded in the 'latest' file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Consolidate a ZeRO 2 or 3 checkpoint into one fp32 ``state_dict`` and write it to ``output_file``.
    The saved file can be read back with ``torch.load(file)`` + ``load_state_dict()`` and used for
    training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
    """
    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    print(f"Saving fp32 state dict to {output_file}")
    torch.save(consolidated, output_file)
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    Consolidate a ZeRO 2 or 3 checkpoint into a single fp32 ``state_dict``, move the provided model
    to cpu and load the consolidated weights into it.

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    # non-strict load: the key sets of the model and of the state dict are not required to match
    cpu_model.load_state_dict(fp32_weights, strict=False)

    return cpu_model
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
if __name__ == "__main__":
    # Command line: zero_to_fp32.py [-d] checkpoint_dir output_file
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument("output_file",
                        type=str,
                        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level flag tested by the `if debug:` guards in this file
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
|
display_v3/2023-04-14_17-06-18/hparams.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amp: true
|
| 2 |
+
batch_size: 16
|
| 3 |
+
cls_target: cv
|
| 4 |
+
deepspeed: true
|
| 5 |
+
dev_data_file: ''
|
| 6 |
+
downsample_data: true
|
| 7 |
+
early_dropout: null
|
| 8 |
+
epochs: 10
|
| 9 |
+
freeze_encoder: false
|
| 10 |
+
just_test: false
|
| 11 |
+
log_fold: ./logs
|
| 12 |
+
log_step: 10
|
| 13 |
+
lr: 5.0e-05
|
| 14 |
+
model_name: bert-base-uncased
|
| 15 |
+
positive_ratio: 0.4
|
| 16 |
+
pretrained_model_fold: ./pretrained_model
|
| 17 |
+
rdrop: null
|
| 18 |
+
running time: 0:01:56
|
| 19 |
+
share_encoder: false
|
| 20 |
+
test_data_file: ''
|
| 21 |
+
train_data_file: ''
|
| 22 |
+
train_ratio: 0.8
|
| 23 |
+
version: structure cmp
|
display_v3/2023-04-14_17-06-18/metrics.csv
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
train_loss,epoch,step,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_macro_f1,train_accuracy,train_precision,train_recall,train_f1,train_macro_f1,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_macro_f1
|
| 2 |
+
0.70458984375,0,9,,,,,,,,,,,,,,,,,
|
| 3 |
+
,0,15,0.5909940004348755,0.6669691205024719,0.014864864759147167,0.6875,0.029100529849529266,0.029100529849529266,,,,,,,,,,,
|
| 4 |
+
,0,15,,,,,,,0.6275303363800049,0.7692307829856873,0.10101009905338287,0.1785714328289032,0.1785714328289032,,,,,,
|
| 5 |
+
0.4501953125,1,19,,,,,,,,,,,,,,,,,
|
| 6 |
+
0.361328125,1,29,,,,,,,,,,,,,,,,,
|
| 7 |
+
,1,31,0.23693892359733582,0.9108439087867737,0.045340050011873245,0.5625,0.0839160829782486,0.0839160829782486,,,,,,,,,,,
|
| 8 |
+
,1,31,,,,,,,0.8825910687446594,0.8500000238418579,0.8585858345031738,0.8542713522911072,0.8542713522911072,,,,,,
|
| 9 |
+
0.08734130859375,2,39,,,,,,,,,,,,,,,,,
|
| 10 |
+
,2,47,0.6708253026008606,0.7488657236099243,0.023914968594908714,0.84375,0.04651162773370743,0.04651162773370743,,,,,,,,,,,
|
| 11 |
+
,2,47,,,,,,,0.9473684430122375,0.9479166865348816,0.9191918969154358,0.9333333373069763,0.9333333373069763,,,,,,
|
| 12 |
+
0.078369140625,3,49,,,,,,,,,,,,,,,,,
|
| 13 |
+
0.0077056884765625,3,59,,,,,,,,,,,,,,,,,
|
| 14 |
+
,3,63,0.16590917110443115,0.9494101405143738,0.06787330657243729,0.46875,0.11857707798480988,0.11857707798480988,,,,,,,,,,,
|
| 15 |
+
,3,63,,,,,,,0.9959514141082764,0.9900000095367432,1.0,0.9949748516082764,0.9949748516082764,,,,,,
|
| 16 |
+
0.004322052001953125,4,69,,,,,,,,,,,,,,,,,
|
| 17 |
+
0.00762176513671875,4,79,,,,,,,,,,,,,,,,,
|
| 18 |
+
,4,79,0.48889032006263733,0.8593466281890869,0.03627760335803032,0.71875,0.06906907260417938,0.06906907260417938,,,,,,,,,,,
|
| 19 |
+
,4,79,,,,,,,0.9919028282165527,0.9898989796638489,0.9898989796638489,0.9898989796638489,0.9898989796638489,,,,,,
|
| 20 |
+
0.078125,5,89,,,,,,,,,,,,,,,,,
|
| 21 |
+
,5,95,1.2922793626785278,0.7039473652839661,0.021068472415208817,0.875,0.04114621505141258,0.04114621505141258,,,,,,,,,,,
|
| 22 |
+
,5,95,,,,,,,0.9919028282165527,0.9898989796638489,0.9898989796638489,0.9898989796638489,0.9898989796638489,,,,,,
|
| 23 |
+
0.015960693359375,6,99,,,,,,,,,,,,,,,,,
|
| 24 |
+
0.003612518310546875,6,109,,,,,,,,,,,,,,,,,
|
| 25 |
+
,6,111,0.8841056227684021,0.7894737124443054,0.028421051800251007,0.84375,0.05498981848359108,0.05498981848359108,,,,,,,,,,,
|
| 26 |
+
,6,111,,,,,,,0.9959514141082764,0.9900000095367432,1.0,0.9949748516082764,0.9949748516082764,,,,,,
|
| 27 |
+
0.0019989013671875,7,119,,,,,,,,,,,,,,,,,
|
| 28 |
+
,7,127,0.69402015209198,0.8504990935325623,0.039647575467824936,0.84375,0.07573632895946503,0.07573632895946503,,,,,,,,,,,
|
| 29 |
+
,7,127,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,
|
| 30 |
+
0.0010175704956054688,8,129,,,,,,,,,,,,,,,,,
|
| 31 |
+
0.0007581710815429688,8,139,,,,,,,,,,,,,,,,,
|
| 32 |
+
,8,143,0.6454178094863892,0.8666061758995056,0.042763158679008484,0.8125,0.08124999701976776,0.08124999701976776,,,,,,,,,,,
|
| 33 |
+
,8,143,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,
|
| 34 |
+
0.0009255409240722656,9,149,,,,,,,,,,,,,,,,,
|
| 35 |
+
0.00054931640625,9,159,,,,,,,,,,,,,,,,,
|
| 36 |
+
,9,159,0.6520931720733643,0.8681941628456116,0.04326122999191284,0.8125,0.08214849978685379,0.08214849978685379,,,,,,,,,,,
|
| 37 |
+
,9,159,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,
|
| 38 |
+
,10,160,,,,,,,,,,,,0.1696726232767105,0.9466424584388733,0.0439189188182354,0.5416666865348816,0.08124999701976776,0.08124999701976776
|
display_v3/2023-04-14_17-06-18/yes.txt
ADDED
|
File without changes
|
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/mp_rank_00_model_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42d544e1553c822e187b976e75cb402dc7a351855b355913ad28b7ed8e97e4e8
|
| 3 |
+
size 220228915
|
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64eb30cf079871506af468afdfbf83a06d02247526ba2b092ab84f001e57929b
|
| 3 |
+
size 1320918341
|
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/latest
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
checkpoint
|
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/zero_to_fp32.py
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
'''Copyright The Microsoft DeepSpeed Team'''
|
| 3 |
+
|
| 4 |
+
# This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
|
| 5 |
+
# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
|
| 6 |
+
# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
|
| 7 |
+
# application.
|
| 8 |
+
#
|
| 9 |
+
# example: python zero_to_fp32.py . pytorch_model.bin
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import torch
|
| 13 |
+
import glob
|
| 14 |
+
import math
|
| 15 |
+
import os
|
| 16 |
+
import re
|
| 17 |
+
from collections import OrderedDict
|
| 18 |
+
|
| 19 |
+
# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
|
| 20 |
+
# DeepSpeed data structures it has to be available in the current python environment.
|
| 21 |
+
from deepspeed.utils import logger
|
| 22 |
+
from deepspeed.checkpoint.constants import (DS_VERSION,
|
| 23 |
+
OPTIMIZER_STATE_DICT,
|
| 24 |
+
SINGLE_PARTITION_OF_FP32_GROUPS,
|
| 25 |
+
FP32_FLAT_GROUPS,
|
| 26 |
+
ZERO_STAGE,
|
| 27 |
+
PARTITION_COUNT,
|
| 28 |
+
PARAM_SHAPES,
|
| 29 |
+
BUFFER_NAMES)
|
| 30 |
+
|
| 31 |
+
# Diagnostic switch: a truthy value enables the extra prints behind the `if debug:` guards below.
debug = 0

# load to cpu - passed as ``map_location`` to every ``torch.load`` call in this file
device = torch.device('cpu')
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def atoi(text):
    """
    Convert ``text`` to ``int`` when it consists solely of decimal digits; otherwise return it unchanged.

    ``str.isdecimal()`` is used rather than ``str.isdigit()``: the latter also accepts characters
    such as superscript digits ("²") which ``int()`` rejects with ``ValueError``.
    """
    return int(text) if text.isdecimal() else text
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def natural_keys(text):
    '''
    Sort key for "human" ordering: use as alist.sort(key=natural_keys).
    The returned list alternates non-digit pieces of ``text`` (kept as str) with its digit runs
    (converted by ``atoi``).
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    # the capturing group makes re.split keep the digit runs in the result
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the model-states file inside ``checkpoint_dir`` for the given ZeRO stage.

    Args:
        - ``checkpoint_dir``: folder that holds the checkpoint files
        - ``zero_stage``: ZeRO stage of the checkpoint, ``2`` or ``3``

    Returns:
        - path to the model states file

    Raises:
        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
        - ``ValueError``: ``zero_stage`` is neither 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage == 2:
        file_name = "mp_rank_00_model_states.pt"
    elif zero_stage == 3:
        file_name = "zero_pp_rank_0_mp_rank_00_model_states.pt"
    else:
        # fail explicitly instead of hitting an unbound local further down
        raise ValueError(f"unknown zero stage {zero_stage}")

    file = os.path.join(checkpoint_dir, file_name)
    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_optim_files(checkpoint_dir):
    """
    Return the ``*_optim_states.pt`` files found directly in ``checkpoint_dir``, in natural order.

    Raises:
        - ``FileNotFoundError``: no such file exists in ``checkpoint_dir``
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, "*_optim_states.pt")
    optim_files = glob.glob(pattern)
    optim_files.sort(key=natural_keys)

    if not optim_files:
        raise FileNotFoundError(
            f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")

    return optim_files
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def parse_model_state(file):
    """
    Load a model-states checkpoint file and extract what is needed for the fp32 reconstruction.

    Args:
        - ``file``: path to the model states file

    Returns:
        - ``(buffers, param_shapes, ds_version)``: the module's buffers converted to fp32, the stored
          param shapes, and the stored DeepSpeed version (``None`` when the file does not record one)

    Raises:
        - ``ValueError``: the file holds no buffer-names entry, i.e. it is not a model state checkpoint
    """
    # NOTE: torch.load unpickles the file - only use it on checkpoints from a trusted source
    state_dict = torch.load(file, map_location=device)

    if BUFFER_NAMES not in state_dict:
        raise ValueError(f"{file} is not a model state checkpoint")
    buffer_names = state_dict[BUFFER_NAMES]
    if debug:
        print("Found buffers:", buffer_names)

    # recover just the buffers while restoring them to fp32 if they were saved in fp16
    buffers = {}
    for key, tensor in state_dict["module"].items():
        if key in buffer_names:
            buffers[key] = tensor.float()

    return buffers, state_dict[PARAM_SHAPES], state_dict.get(DS_VERSION, None)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load every optimizer-states file and pull out the fp32 weight groups.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files
        - ``ds_checkpoint_dir``: folder the files came from (only used in an error message)

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` holds one entry
          per loaded file

    Raises:
        - ``ValueError``: the first file is not a zero checkpoint, the number of files differs from
          the recorded partition count, or the recorded zero stage is neither 2 nor 3
    """
    total_files = len(files)
    # NOTE: torch.load unpickles the files - only use it on checkpoints from a trusted source
    state_dicts = [torch.load(path, map_location=device) for path in files]

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage == 2:
        fp32_flat_groups = [
            sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts
        ]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [
            torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts
        ]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Reconstruct the fp32 state_dict from the files of one deepspeed checkpoint folder.

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    Returns:
        - the state_dict built by the stage 2 or stage 3 reconstruction routine

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    zero_stage, world_size, fp32_flat_groups = parse_optim_states(
        get_optim_files(ds_checkpoint_dir), ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    buffers, param_shapes, ds_version = parse_model_state(
        get_model_state_file(ds_checkpoint_dir, zero_stage))
    print(f'Parsing checkpoint created by deepspeed=={ds_version}')

    # pick the reconstruction routine matching the detected stage
    if zero_stage == 2:
        rebuild = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        rebuild = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None

    return rebuild(world_size, param_shapes, fp32_flat_groups, buffers)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild the fp32 state_dict from ZeRO stage 2 partitions.

    Args:
        - ``world_size``: number of partitions; also determines the alignment used in the sanity check
        - ``param_shapes``: one ``name -> shape`` mapping per param group
        - ``fp32_flat_groups``: one entry per rank, each indexable by param group and yielding a flat tensor
        - ``buffers``: ``name -> tensor`` mapping copied into the result unchanged

    Returns:
        - ``OrderedDict`` holding the buffers followed by every param viewed in its full shape

    Raises:
        - ``ValueError``: for some param group the aligned number of consumed elements differs from
          the aligned number of available elements
    """

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(
                    f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    # for each param group, concatenate that group's partition from every rank into one flat vector
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    # total across all groups; only reported in the debug output below
    avail_numel = sum([
        full_single_fp32_vector.numel()
        for full_single_fp32_vector in merged_single_partition_of_fp32_groups
    ])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum(
            [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        # offset and avail_numel are tracked per param group from here on
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(
                    f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
                )
            # slice this param's elements out of the group's flat vector and give them the full shape
            state_dict[name] = full_single_fp32_vector.narrow(
                0,
                offset,
                unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            # round x up to the next multiple of align_to
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(
                f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how one param of ``unpartitioned_numel`` elements is split across ``world_size`` ranks.

    Args:
        - ``unpartitioned_numel``: number of elements of the full param
        - ``world_size``: number of partitions

    Returns:
        - ``(partitioned_numel, padding_numel)``: elements per partition (rounded up) and the number
          of padding elements needed to make the total a multiple of ``world_size``
    """
    # exact integer arithmetic; float division would lose precision for very large counts
    partitioned_numel, remainder = divmod(unpartitioned_numel, world_size)
    if remainder:
        return partitioned_numel + 1, world_size - remainder
    return partitioned_numel, 0
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild the fp32 state_dict from ZeRO stage 3 partitions.

    Args:
        - ``world_size``: number of partitions
        - ``param_shapes``: list of ``name -> shape`` mappings; they are merged into one mapping here
        - ``fp32_flat_groups``: one flat tensor per rank
        - ``buffers``: ``name -> tensor`` mapping copied into the result unchanged

    Returns:
        - ``OrderedDict`` holding the buffers followed by every param viewed in its full shape

    Raises:
        - ``ValueError``: the number of consumed elements differs from the number available
    """

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # rank 0's flat tensor size times world_size; compared against the consumed count at the end
    avail_numel = fp32_flat_groups[0].numel() * world_size
    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    # offset is the read position inside each rank's flat tensor (the same position for all ranks)
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take this param's slice from every rank, concatenate in rank order, drop the trailing
        # padding and give the result the full shape
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0,
                                             offset,
                                             partitioned_numel)
                  for i in range(world_size)),
            0).narrow(0,
                      0,
                      unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # per-rank consumed count -> total across all ranks, to compare with avail_numel
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(
            f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Build a single consolidated fp32 ``state_dict`` out of a ZeRO 2 or 3 checkpoint. The result can
    be fed to ``load_state_dict()`` and used for training without DeepSpeed, or shared with others,
    for example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: name of the sub-folder of ``checkpoint_dir`` holding the checkpoint, e.g.
          ``global_step14``. When omitted, it is read from the ``latest`` file in ``checkpoint_dir``.

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: no ``tag`` was given and ``checkpoint_dir`` contains no ``latest`` file
        - ``FileNotFoundError``: ``checkpoint_dir/tag`` is not an existing directory

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # no explicit tag: fall back to the one recorded in the 'latest' file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Consolidate a ZeRO 2 or 3 checkpoint into one fp32 ``state_dict`` file.

    The resulting file can be read back with ``torch.load(file)`` and applied with
    ``load_state_dict()``, so the weights can be used without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder (the one that contains the
          tag-folder, like ``global_step14``)
        - ``output_file``: path of the pytorch fp32 state_dict file to write
          (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. When omitted,
          the tag is looked up in the file named ``latest`` inside the checkpoint folder,
          e.g., ``global_step14``
    """
    # Tag resolution and directory validation are delegated to the loader.
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    print(f"Saving fp32 state dict to {output_file}")
    torch.save(fp32_weights, output_file)
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    Load consolidated fp32 weights from a ZeRO 2 or 3 checkpoint into ``model``.

    Steps performed:

    1. Convert the ZeRO checkpoint into a single fp32 consolidated ``state_dict``
    2. Move the provided model to cpu
    3. Load the ``state_dict`` into the model (non-strict)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder (the one that contains the
          tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for the checkpoint. When omitted,
          the tag is looked up in the file named ``latest`` inside the checkpoint folder,
          e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure plenty of CPU memory is available before calling this function. If there is not
    enough, use the ``zero_to_fp32.py`` utility to do the conversion instead; it is placed in
    the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note that once this has run, the ``model`` will no longer be usable in the deepspeed context
    of the same application, i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    # strict=False: keys that do not line up between model and checkpoint are tolerated.
    cpu_model.load_state_dict(fp32_weights, strict=False)
    return cpu_model
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
if __name__ == "__main__":
    # Command-line entry point: consolidate a checkpoint folder into one fp32 state_dict file.
    parser = argparse.ArgumentParser()

    # Both positional arguments are plain strings; registration order fixes the CLI order.
    for positional_name, positional_help in (
        ("checkpoint_dir",
         "path to the desired checkpoint folder, e.g., path/checkpoint-12"),
        ("output_file",
         "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"),
    ):
        parser.add_argument(positional_name, type=str, help=positional_help)

    parser.add_argument("-d", "--debug", help="enable debug", action='store_true')
    args = parser.parse_args()

    # Binds the module-level ``debug`` name; presumably read by helpers earlier in
    # this file - TODO confirm against the full script.
    debug = args.debug

    # No tag is passed, so the tag is resolved from the ``latest`` file.
    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
|
display_v3/2023-04-14_17-59-45/hparams.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
amp: true
|
| 2 |
+
batch_size: 16
|
| 3 |
+
cls_target: vo
|
| 4 |
+
deepspeed: true
|
| 5 |
+
dev_data_file: ''
|
| 6 |
+
downsample_data: true
|
| 7 |
+
early_dropout: null
|
| 8 |
+
epochs: 10
|
| 9 |
+
freeze_encoder: false
|
| 10 |
+
just_test: false
|
| 11 |
+
log_fold: ./logs
|
| 12 |
+
log_step: 10
|
| 13 |
+
lr: 5.0e-05
|
| 14 |
+
model_name: bert-base-uncased
|
| 15 |
+
positive_ratio: 0.4
|
| 16 |
+
pretrained_model_fold: ./pretrained_model
|
| 17 |
+
rdrop: null
|
| 18 |
+
running time: 0:05:34
|
| 19 |
+
share_encoder: false
|
| 20 |
+
test_data_file: ''
|
| 21 |
+
train_data_file: ''
|
| 22 |
+
train_ratio: 0.8
|
| 23 |
+
version: structure cmp
|
display_v3/2023-04-14_17-59-45/metrics.csv
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
train_loss,epoch,step,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_macro_f1,train_accuracy,train_precision,train_recall,train_f1,train_macro_f1,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_macro_f1
|
| 2 |
+
0.55810546875,0,9,,,,,,,,,,,,,,,,,
|
| 3 |
+
0.46435546875,0,19,,,,,,,,,,,,,,,,,
|
| 4 |
+
0.34228515625,0,29,,,,,,,,,,,,,,,,,
|
| 5 |
+
0.27783203125,0,39,,,,,,,,,,,,,,,,,
|
| 6 |
+
0.59716796875,0,49,,,,,,,,,,,,,,,,,
|
| 7 |
+
0.316650390625,0,59,,,,,,,,,,,,,,,,,
|
| 8 |
+
0.5478515625,0,69,,,,,,,,,,,,,,,,,
|
| 9 |
+
0.173828125,0,79,,,,,,,,,,,,,,,,,
|
| 10 |
+
0.501953125,0,89,,,,,,,,,,,,,,,,,
|
| 11 |
+
0.38623046875,0,99,,,,,,,,,,,,,,,,,
|
| 12 |
+
0.36279296875,0,109,,,,,,,,,,,,,,,,,
|
| 13 |
+
0.2105712890625,0,119,,,,,,,,,,,,,,,,,
|
| 14 |
+
0.2744140625,0,129,,,,,,,,,,,,,,,,,
|
| 15 |
+
0.44677734375,0,139,,,,,,,,,,,,,,,,,
|
| 16 |
+
0.6474609375,0,149,,,,,,,,,,,,,,,,,
|
| 17 |
+
0.81298828125,0,159,,,,,,,,,,,,,,,,,
|
| 18 |
+
0.60888671875,0,169,,,,,,,,,,,,,,,,,
|
| 19 |
+
,0,170,0.6008952260017395,0.7477313876152039,0.18957704305648804,0.865517258644104,0.31102851033210754,0.31102851033210754,,,,,,,,,,,
|
| 20 |
+
,0,170,,,,,,,0.7950036525726318,0.777429461479187,0.6831955909729004,0.7272727489471436,0.7272727489471436,,,,,,
|
| 21 |
+
0.305419921875,1,179,,,,,,,,,,,,,,,,,
|
| 22 |
+
0.390869140625,1,189,,,,,,,,,,,,,,,,,
|
| 23 |
+
0.2052001953125,1,199,,,,,,,,,,,,,,,,,
|
| 24 |
+
0.0948486328125,1,209,,,,,,,,,,,,,,,,,
|
| 25 |
+
0.38427734375,1,219,,,,,,,,,,,,,,,,,
|
| 26 |
+
0.1968994140625,1,229,,,,,,,,,,,,,,,,,
|
| 27 |
+
0.078369140625,1,239,,,,,,,,,,,,,,,,,
|
| 28 |
+
0.356689453125,1,249,,,,,,,,,,,,,,,,,
|
| 29 |
+
0.43505859375,1,259,,,,,,,,,,,,,,,,,
|
| 30 |
+
0.485107421875,1,269,,,,,,,,,,,,,,,,,
|
| 31 |
+
0.1243896484375,1,279,,,,,,,,,,,,,,,,,
|
| 32 |
+
0.05401611328125,1,289,,,,,,,,,,,,,,,,,
|
| 33 |
+
0.35595703125,1,299,,,,,,,,,,,,,,,,,
|
| 34 |
+
0.0572509765625,1,309,,,,,,,,,,,,,,,,,
|
| 35 |
+
0.1417236328125,1,319,,,,,,,,,,,,,,,,,
|
| 36 |
+
0.2315673828125,1,329,,,,,,,,,,,,,,,,,
|
| 37 |
+
0.232421875,1,339,,,,,,,,,,,,,,,,,
|
| 38 |
+
,1,341,0.24903717637062073,0.9004083275794983,0.36767318844795227,0.7137930989265442,0.48534584045410156,0.48534584045410156,,,,,,,,,,,
|
| 39 |
+
,1,341,,,,,,,0.8934606909751892,0.8779565095901489,0.8521579504013062,0.8648648858070374,0.8648648858070374,,,,,,
|
| 40 |
+
0.1656494140625,2,349,,,,,,,,,,,,,,,,,
|
| 41 |
+
0.336181640625,2,359,,,,,,,,,,,,,,,,,
|
| 42 |
+
0.1632080078125,2,369,,,,,,,,,,,,,,,,,
|
| 43 |
+
0.042724609375,2,379,,,,,,,,,,,,,,,,,
|
| 44 |
+
0.352783203125,2,389,,,,,,,,,,,,,,,,,
|
| 45 |
+
0.0268096923828125,2,399,,,,,,,,,,,,,,,,,
|
| 46 |
+
0.01428985595703125,2,409,,,,,,,,,,,,,,,,,
|
| 47 |
+
0.1790771484375,2,419,,,,,,,,,,,,,,,,,
|
| 48 |
+
0.0181427001953125,2,429,,,,,,,,,,,,,,,,,
|
| 49 |
+
0.04736328125,2,439,,,,,,,,,,,,,,,,,
|
| 50 |
+
0.2493896484375,2,449,,,,,,,,,,,,,,,,,
|
| 51 |
+
0.08538818359375,2,459,,,,,,,,,,,,,,,,,
|
| 52 |
+
0.583984375,2,469,,,,,,,,,,,,,,,,,
|
| 53 |
+
0.0457763671875,2,479,,,,,,,,,,,,,,,,,
|
| 54 |
+
0.1326904296875,2,489,,,,,,,,,,,,,,,,,
|
| 55 |
+
0.156494140625,2,499,,,,,,,,,,,,,,,,,
|
| 56 |
+
0.1724853515625,2,509,,,,,,,,,,,,,,,,,
|
| 57 |
+
,2,512,0.5341271758079529,0.7738203406333923,0.2016877681016922,0.8241379261016846,0.3240678012371063,0.3240678012371063,,,,,,,,,,,
|
| 58 |
+
,2,512,,,,,,,0.9566495418548584,0.9491211771965027,0.942148745059967,0.9456221461296082,0.9456221461296082,,,,,,
|
| 59 |
+
0.056976318359375,3,519,,,,,,,,,,,,,,,,,
|
| 60 |
+
0.1407470703125,3,529,,,,,,,,,,,,,,,,,
|
| 61 |
+
0.048370361328125,3,539,,,,,,,,,,,,,,,,,
|
| 62 |
+
0.004375457763671875,3,549,,,,,,,,,,,,,,,,,
|
| 63 |
+
0.01050567626953125,3,559,,,,,,,,,,,,,,,,,
|
| 64 |
+
0.08062744140625,3,569,,,,,,,,,,,,,,,,,
|
| 65 |
+
0.376953125,3,579,,,,,,,,,,,,,,,,,
|
| 66 |
+
0.04742431640625,3,589,,,,,,,,,,,,,,,,,
|
| 67 |
+
0.143798828125,3,599,,,,,,,,,,,,,,,,,
|
| 68 |
+
0.030059814453125,3,609,,,,,,,,,,,,,,,,,
|
| 69 |
+
0.040374755859375,3,619,,,,,,,,,,,,,,,,,
|
| 70 |
+
0.19873046875,3,629,,,,,,,,,,,,,,,,,
|
| 71 |
+
0.031402587890625,3,639,,,,,,,,,,,,,,,,,
|
| 72 |
+
0.00252532958984375,3,649,,,,,,,,,,,,,,,,,
|
| 73 |
+
0.0867919921875,3,659,,,,,,,,,,,,,,,,,
|
| 74 |
+
0.040191650390625,3,669,,,,,,,,,,,,,,,,,
|
| 75 |
+
0.0982666015625,3,679,,,,,,,,,,,,,,,,,
|
| 76 |
+
,3,683,0.4307195246219635,0.8677404522895813,0.2984869182109833,0.748275876045227,0.42674532532691956,0.42674532532691956,,,,,,,,,,,
|
| 77 |
+
,3,683,,,,,,,0.9742836356163025,0.9670027494430542,0.968778669834137,0.9678899049758911,0.9678899049758911,,,,,,
|
| 78 |
+
0.003635406494140625,4,689,,,,,,,,,,,,,,,,,
|
| 79 |
+
0.0293426513671875,4,699,,,,,,,,,,,,,,,,,
|
| 80 |
+
0.001384735107421875,4,709,,,,,,,,,,,,,,,,,
|
| 81 |
+
0.003826141357421875,4,719,,,,,,,,,,,,,,,,,
|
| 82 |
+
0.0015773773193359375,4,729,,,,,,,,,,,,,,,,,
|
| 83 |
+
0.0043182373046875,4,739,,,,,,,,,,,,,,,,,
|
| 84 |
+
0.001644134521484375,4,749,,,,,,,,,,,,,,,,,
|
| 85 |
+
0.03326416015625,4,759,,,,,,,,,,,,,,,,,
|
| 86 |
+
0.001827239990234375,4,769,,,,,,,,,,,,,,,,,
|
| 87 |
+
0.0008625984191894531,4,779,,,,,,,,,,,,,,,,,
|
| 88 |
+
0.0024814605712890625,4,789,,,,,,,,,,,,,,,,,
|
| 89 |
+
0.04608154296875,4,799,,,,,,,,,,,,,,,,,
|
| 90 |
+
0.030364990234375,4,809,,,,,,,,,,,,,,,,,
|
| 91 |
+
0.04498291015625,4,819,,,,,,,,,,,,,,,,,
|
| 92 |
+
0.0010805130004882812,4,829,,,,,,,,,,,,,,,,,
|
| 93 |
+
0.007289886474609375,4,839,,,,,,,,,,,,,,,,,
|
| 94 |
+
0.7626953125,4,849,,,,,,,,,,,,,,,,,
|
| 95 |
+
,4,854,1.0994356870651245,0.7813067436218262,0.2074652761220932,0.8241379261016846,0.3314840495586395,0.3314840495586395,,,,,,,,,,,
|
| 96 |
+
,4,854,,,,,,,0.9911829829216003,0.989880383014679,0.9880624413490295,0.9889705777168274,0.9889705777168274,,,,,,
|
| 97 |
+
0.07330322265625,5,859,,,,,,,,,,,,,,,,,
|
| 98 |
+
0.007152557373046875,5,869,,,,,,,,,,,,,,,,,
|
| 99 |
+
0.0017251968383789062,5,879,,,,,,,,,,,,,,,,,
|
| 100 |
+
0.00966644287109375,5,889,,,,,,,,,,,,,,,,,
|
| 101 |
+
0.014617919921875,5,899,,,,,,,,,,,,,,,,,
|
| 102 |
+
0.00643157958984375,5,909,,,,,,,,,,,,,,,,,
|
| 103 |
+
0.1793212890625,5,919,,,,,,,,,,,,,,,,,
|
| 104 |
+
0.0158843994140625,5,929,,,,,,,,,,,,,,,,,
|
| 105 |
+
0.01483917236328125,5,939,,,,,,,,,,,,,,,,,
|
| 106 |
+
0.0116424560546875,5,949,,,,,,,,,,,,,,,,,
|
| 107 |
+
0.046630859375,5,959,,,,,,,,,,,,,,,,,
|
| 108 |
+
0.01290130615234375,5,969,,,,,,,,,,,,,,,,,
|
| 109 |
+
0.01458740234375,5,979,,,,,,,,,,,,,,,,,
|
| 110 |
+
0.031829833984375,5,989,,,,,,,,,,,,,,,,,
|
| 111 |
+
0.036346435546875,5,999,,,,,,,,,,,,,,,,,
|
| 112 |
+
0.0152435302734375,5,1009,,,,,,,,,,,,,,,,,
|
| 113 |
+
0.00556182861328125,5,1019,,,,,,,,,,,,,,,,,
|
| 114 |
+
,5,1025,0.37797266244888306,0.9088021516799927,0.39147287607192993,0.6965517401695251,0.5012406706809998,0.5012406706809998,,,,,,,,,,,
|
| 115 |
+
,5,1025,,,,,,,0.9819985032081604,0.9753199219703674,0.9797979593276978,0.9775538444519043,0.9775538444519043,,,,,,
|
| 116 |
+
0.02069091796875,6,1029,,,,,,,,,,,,,,,,,
|
| 117 |
+
0.205078125,6,1039,,,,,,,,,,,,,,,,,
|
| 118 |
+
0.01212310791015625,6,1049,,,,,,,,,,,,,,,,,
|
| 119 |
+
0.016571044921875,6,1059,,,,,,,,,,,,,,,,,
|
| 120 |
+
0.053070068359375,6,1069,,,,,,,,,,,,,,,,,
|
| 121 |
+
0.0028896331787109375,6,1079,,,,,,,,,,,,,,,,,
|
| 122 |
+
0.1202392578125,6,1089,,,,,,,,,,,,,,,,,
|
| 123 |
+
0.00884246826171875,6,1099,,,,,,,,,,,,,,,,,
|
| 124 |
+
0.002231597900390625,6,1109,,,,,,,,,,,,,,,,,
|
| 125 |
+
0.00974273681640625,6,1119,,,,,,,,,,,,,,,,,
|
| 126 |
+
0.0335693359375,6,1129,,,,,,,,,,,,,,,,,
|
| 127 |
+
0.004673004150390625,6,1139,,,,,,,,,,,,,,,,,
|
| 128 |
+
0.287109375,6,1149,,,,,,,,,,,,,,,,,
|
| 129 |
+
0.08795166015625,6,1159,,,,,,,,,,,,,,,,,
|
| 130 |
+
0.00901031494140625,6,1169,,,,,,,,,,,,,,,,,
|
| 131 |
+
0.01025390625,6,1179,,,,,,,,,,,,,,,,,
|
| 132 |
+
0.050384521484375,6,1189,,,,,,,,,,,,,,,,,
|
| 133 |
+
,6,1196,0.5810363292694092,0.8743194341659546,0.3053097426891327,0.7137930989265442,0.42768594622612,0.42768594622612,,,,,,,,,,,
|
| 134 |
+
,6,1196,,,,,,,0.9911829829216003,0.9880843162536621,0.9898989796638489,0.988990843296051,0.988990843296051,,,,,,
|
| 135 |
+
0.005062103271484375,7,1199,,,,,,,,,,,,,,,,,
|
| 136 |
+
0.0116119384765625,7,1209,,,,,,,,,,,,,,,,,
|
| 137 |
+
0.0006499290466308594,7,1219,,,,,,,,,,,,,,,,,
|
| 138 |
+
0.009674072265625,7,1229,,,,,,,,,,,,,,,,,
|
| 139 |
+
0.004718780517578125,7,1239,,,,,,,,,,,,,,,,,
|
| 140 |
+
0.0006432533264160156,7,1249,,,,,,,,,,,,,,,,,
|
| 141 |
+
0.0006594657897949219,7,1259,,,,,,,,,,,,,,,,,
|
| 142 |
+
0.0006146430969238281,7,1269,,,,,,,,,,,,,,,,,
|
| 143 |
+
0.0008616447448730469,7,1279,,,,,,,,,,,,,,,,,
|
| 144 |
+
0.0004451274871826172,7,1289,,,,,,,,,,,,,,,,,
|
| 145 |
+
0.0004410743713378906,7,1299,,,,,,,,,,,,,,,,,
|
| 146 |
+
0.0035152435302734375,7,1309,,,,,,,,,,,,,,,,,
|
| 147 |
+
0.0005249977111816406,7,1319,,,,,,,,,,,,,,,,,
|
| 148 |
+
0.0005850791931152344,7,1329,,,,,,,,,,,,,,,,,
|
| 149 |
+
0.1878662109375,7,1339,,,,,,,,,,,,,,,,,
|
| 150 |
+
0.016326904296875,7,1349,,,,,,,,,,,,,,,,,
|
| 151 |
+
0.0019102096557617188,7,1359,,,,,,,,,,,,,,,,,
|
| 152 |
+
,7,1367,0.49221912026405334,0.8886116147041321,0.3327786922454834,0.6896551847457886,0.44893378019332886,0.44893378019332886,,,,,,,,,,,
|
| 153 |
+
,7,1367,,,,,,,0.9966936111450195,0.9963235259056091,0.9954086542129517,0.9958658814430237,0.9958658814430237,,,,,,
|
| 154 |
+
0.0009813308715820312,8,1369,,,,,,,,,,,,,,,,,
|
| 155 |
+
0.00127410888671875,8,1379,,,,,,,,,,,,,,,,,
|
| 156 |
+
0.0015325546264648438,8,1389,,,,,,,,,,,,,,,,,
|
| 157 |
+
0.00914764404296875,8,1399,,,,,,,,,,,,,,,,,
|
| 158 |
+
0.0008273124694824219,8,1409,,,,,,,,,,,,,,,,,
|
| 159 |
+
0.0011987686157226562,8,1419,,,,,,,,,,,,,,,,,
|
| 160 |
+
0.0003414154052734375,8,1429,,,,,,,,,,,,,,,,,
|
| 161 |
+
0.0031108856201171875,8,1439,,,,,,,,,,,,,,,,,
|
| 162 |
+
0.0004572868347167969,8,1449,,,,,,,,,,,,,,,,,
|
| 163 |
+
0.0006923675537109375,8,1459,,,,,,,,,,,,,,,,,
|
| 164 |
+
0.0003120899200439453,8,1469,,,,,,,,,,,,,,,,,
|
| 165 |
+
0.003658294677734375,8,1479,,,,,,,,,,,,,,,,,
|
| 166 |
+
0.00034880638122558594,8,1489,,,,,,,,,,,,,,,,,
|
| 167 |
+
0.000492095947265625,8,1499,,,,,,,,,,,,,,,,,
|
| 168 |
+
0.0002238750457763672,8,1509,,,,,,,,,,,,,,,,,
|
| 169 |
+
0.0003514289855957031,8,1519,,,,,,,,,,,,,,,,,
|
| 170 |
+
0.380126953125,8,1529,,,,,,,,,,,,,,,,,
|
| 171 |
+
,8,1538,0.47979027032852173,0.909709632396698,0.3897959291934967,0.6586207151412964,0.48974359035491943,0.48974359035491943,,,,,,,,,,,
|
| 172 |
+
,8,1538,,,,,,,0.997061014175415,0.9981566667556763,0.994490385055542,0.9963201284408569,0.9963201284408569,,,,,,
|
| 173 |
+
0.0142364501953125,9,1539,,,,,,,,,,,,,,,,,
|
| 174 |
+
0.002254486083984375,9,1549,,,,,,,,,,,,,,,,,
|
| 175 |
+
0.0031986236572265625,9,1559,,,,,,,,,,,,,,,,,
|
| 176 |
+
0.000621795654296875,9,1569,,,,,,,,,,,,,,,,,
|
| 177 |
+
0.0004968643188476562,9,1579,,,,,,,,,,,,,,,,,
|
| 178 |
+
0.0007233619689941406,9,1589,,,,,,,,,,,,,,,,,
|
| 179 |
+
0.0003178119659423828,9,1599,,,,,,,,,,,,,,,,,
|
| 180 |
+
0.0004620552062988281,9,1609,,,,,,,,,,,,,,,,,
|
| 181 |
+
0.0002932548522949219,9,1619,,,,,,,,,,,,,,,,,
|
| 182 |
+
0.001575469970703125,9,1629,,,,,,,,,,,,,,,,,
|
| 183 |
+
0.0008373260498046875,9,1639,,,,,,,,,,,,,,,,,
|
| 184 |
+
0.011688232421875,9,1649,,,,,,,,,,,,,,,,,
|
| 185 |
+
0.0089111328125,9,1659,,,,,,,,,,,,,,,,,
|
| 186 |
+
0.00579071044921875,9,1669,,,,,,,,,,,,,,,,,
|
| 187 |
+
0.0009889602661132812,9,1679,,,,,,,,,,,,,,,,,
|
| 188 |
+
0.043060302734375,9,1689,,,,,,,,,,,,,,,,,
|
| 189 |
+
0.0005102157592773438,9,1699,,,,,,,,,,,,,,,,,
|
| 190 |
+
0.0002346038818359375,9,1709,,,,,,,,,,,,,,,,,
|
| 191 |
+
,9,1709,0.6371660232543945,0.8752268552780151,0.30882352590560913,0.7241379022598267,0.4329896867275238,0.4329896867275238,,,,,,,,,,,
|
| 192 |
+
,9,1709,,,,,,,0.9933872222900391,0.9899359345436096,0.9935720562934875,0.9917507171630859,0.9917507171630859,,,,,,
|
| 193 |
+
,10,1710,,,,,,,,,,,,0.40940672159194946,0.8987295627593994,0.36324167251586914,0.6802167892456055,0.473584920167923,0.473584920167923
|
display_v3/2023-04-14_17-59-45/yes.txt
ADDED
|
File without changes
|