Zpwang-AI commited on
Commit
51cf79c
·
1 Parent(s): ece213d

Upload 21 files

Browse files
Files changed (21) hide show
  1. display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  2. display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  3. display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/latest +1 -0
  4. display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/zero_to_fp32.py +483 -0
  5. display_v3/2023-04-14_16-59-07/hparams.yaml +23 -0
  6. display_v3/2023-04-14_16-59-07/metrics.csv +255 -0
  7. display_v3/2023-04-14_16-59-07/yes.txt +0 -0
  8. display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  9. display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  10. display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/latest +1 -0
  11. display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/zero_to_fp32.py +483 -0
  12. display_v3/2023-04-14_17-06-18/hparams.yaml +23 -0
  13. display_v3/2023-04-14_17-06-18/metrics.csv +38 -0
  14. display_v3/2023-04-14_17-06-18/yes.txt +0 -0
  15. display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  16. display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  17. display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/latest +1 -0
  18. display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/zero_to_fp32.py +483 -0
  19. display_v3/2023-04-14_17-59-45/hparams.yaml +23 -0
  20. display_v3/2023-04-14_17-59-45/metrics.csv +193 -0
  21. display_v3/2023-04-14_17-59-45/yes.txt +0 -0
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9921f36370ead56c25c58b8409cb71175b00e3c5ad5105f5fc49666915361ce
3
+ size 220228915
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f3ae9d4195c5adab49e78a909cfe95ae8d21c7a2ffc90eed224e122f51eabae
3
+ size 1320918341
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ checkpoint
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ '''Copyright The Microsoft DeepSpeed Team'''
3
+
4
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
5
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
6
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
7
+ # application.
8
+ #
9
+ # example: python zero_to_fp32.py . pytorch_model.bin
10
+
11
+ import argparse
12
+ import torch
13
+ import glob
14
+ import math
15
+ import os
16
+ import re
17
+ from collections import OrderedDict
18
+
19
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
20
+ # DeepSpeed data structures it has to be available in the current python environment.
21
+ from deepspeed.utils import logger
22
+ from deepspeed.checkpoint.constants import (DS_VERSION,
23
+ OPTIMIZER_STATE_DICT,
24
+ SINGLE_PARTITION_OF_FP32_GROUPS,
25
+ FP32_FLAT_GROUPS,
26
+ ZERO_STAGE,
27
+ PARTITION_COUNT,
28
+ PARAM_SHAPES,
29
+ BUFFER_NAMES)
30
+
31
+ debug = 0
32
+
33
+ # load to cpu
34
+ device = torch.device('cpu')
35
+
36
+
37
+ def atoi(text):
38
+ return int(text) if text.isdigit() else text
39
+
40
+
41
+ def natural_keys(text):
42
+ '''
43
+ alist.sort(key=natural_keys) sorts in human order
44
+ http://nedbatchelder.com/blog/200712/human_sorting.html
45
+ (See Toothy's implementation in the comments)
46
+ '''
47
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
48
+
49
+
50
+ def get_model_state_file(checkpoint_dir, zero_stage):
51
+ if not os.path.isdir(checkpoint_dir):
52
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
53
+
54
+ # there should be only one file
55
+ if zero_stage == 2:
56
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
57
+ elif zero_stage == 3:
58
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
59
+
60
+ if not os.path.exists(file):
61
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
62
+
63
+ return file
64
+
65
+
66
+ def get_optim_files(checkpoint_dir):
67
+ # XXX: need to test that this simple glob rule works for multi-node setup too
68
+ optim_files = sorted(glob.glob(os.path.join(checkpoint_dir,
69
+ "*_optim_states.pt")),
70
+ key=natural_keys)
71
+
72
+ if len(optim_files) == 0:
73
+ raise FileNotFoundError(
74
+ f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")
75
+
76
+ return optim_files
77
+
78
+
79
+ def parse_model_state(file):
80
+ state_dict = torch.load(file, map_location=device)
81
+
82
+ if BUFFER_NAMES not in state_dict:
83
+ raise ValueError(f"{file} is not a model state checkpoint")
84
+ buffer_names = state_dict[BUFFER_NAMES]
85
+ if debug:
86
+ print("Found buffers:", buffer_names)
87
+
88
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
89
+ buffers = {
90
+ k: v.float()
91
+ for k,
92
+ v in state_dict["module"].items() if k in buffer_names
93
+ }
94
+ param_shapes = state_dict[PARAM_SHAPES]
95
+
96
+ ds_version = state_dict.get(DS_VERSION, None)
97
+
98
+ return buffers, param_shapes, ds_version
99
+
100
+
101
+ def parse_optim_states(files, ds_checkpoint_dir):
102
+
103
+ total_files = len(files)
104
+ state_dicts = []
105
+ for f in files:
106
+ state_dicts.append(torch.load(f, map_location=device))
107
+
108
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
109
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
110
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
111
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
112
+
113
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
114
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
115
+ # use the max of the partition_count to get the dp world_size.
116
+
117
+ if type(world_size) is list:
118
+ world_size = max(world_size)
119
+
120
+ if world_size != total_files:
121
+ raise ValueError(
122
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
123
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
124
+ )
125
+
126
+ # the groups are named differently in each stage
127
+ if zero_stage == 2:
128
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
129
+ elif zero_stage == 3:
130
+ fp32_groups_key = FP32_FLAT_GROUPS
131
+ else:
132
+ raise ValueError(f"unknown zero stage {zero_stage}")
133
+
134
+ if zero_stage == 2:
135
+ fp32_flat_groups = [
136
+ state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key]
137
+ for i in range(len(state_dicts))
138
+ ]
139
+ elif zero_stage == 3:
140
+ # if there is more than one param group, there will be multiple flattened tensors - one
141
+ # flattened tensor per group - for simplicity merge them into a single tensor
142
+ #
143
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
144
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
145
+
146
+ fp32_flat_groups = [
147
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key],
148
+ 0) for i in range(len(state_dicts))
149
+ ]
150
+
151
+ return zero_stage, world_size, fp32_flat_groups
152
+
153
+
154
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
155
+ """
156
+ Returns fp32 state_dict reconstructed from ds checkpoint
157
+
158
+ Args:
159
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
160
+
161
+ """
162
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
163
+
164
+ optim_files = get_optim_files(ds_checkpoint_dir)
165
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
166
+ print(
167
+ f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
168
+
169
+ model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
170
+ buffers, param_shapes, ds_version = parse_model_state(model_file)
171
+ print(f'Parsing checkpoint created by deepspeed=={ds_version}')
172
+
173
+ if zero_stage == 2:
174
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size,
175
+ param_shapes,
176
+ fp32_flat_groups,
177
+ buffers)
178
+ elif zero_stage == 3:
179
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size,
180
+ param_shapes,
181
+ fp32_flat_groups,
182
+ buffers)
183
+
184
+
185
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
186
+ param_shapes,
187
+ fp32_flat_groups,
188
+ buffers):
189
+
190
+ # Reconstruction protocol:
191
+ #
192
+ # XXX: document this
193
+
194
+ if debug:
195
+ for i in range(world_size):
196
+ for j in range(len(fp32_flat_groups[0])):
197
+ print(
198
+ f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
199
+
200
+ # XXX: memory usage doubles here (zero2)
201
+ num_param_groups = len(fp32_flat_groups[0])
202
+ merged_single_partition_of_fp32_groups = []
203
+ for i in range(num_param_groups):
204
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
205
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
206
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
207
+ avail_numel = sum([
208
+ full_single_fp32_vector.numel()
209
+ for full_single_fp32_vector in merged_single_partition_of_fp32_groups
210
+ ])
211
+
212
+ if debug:
213
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
214
+ wanted_numel = sum(
215
+ [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
216
+ # not asserting if there is a mismatch due to possible padding
217
+ print(f"Have {avail_numel} numels to process.")
218
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
219
+
220
+ state_dict = OrderedDict()
221
+
222
+ # buffers
223
+ state_dict.update(buffers)
224
+ if debug:
225
+ print(f"added {len(buffers)} buffers")
226
+
227
+ # params
228
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
229
+ # out-of-core computing solution
230
+ total_numel = 0
231
+ total_params = 0
232
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
233
+ offset = 0
234
+ avail_numel = full_single_fp32_vector.numel()
235
+ for name, shape in shapes.items():
236
+
237
+ unpartitioned_numel = shape.numel()
238
+ total_numel += unpartitioned_numel
239
+ total_params += 1
240
+
241
+ if debug:
242
+ print(
243
+ f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
244
+ )
245
+ state_dict[name] = full_single_fp32_vector.narrow(
246
+ 0,
247
+ offset,
248
+ unpartitioned_numel).view(shape)
249
+ offset += unpartitioned_numel
250
+
251
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
252
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
253
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
254
+ # live optimizer object, so we are checking that the numbers are within the right range
255
+ align_to = 2 * world_size
256
+
257
+ def zero2_align(x):
258
+ return align_to * math.ceil(x / align_to)
259
+
260
+ if debug:
261
+ print(f"original offset={offset}, avail_numel={avail_numel}")
262
+
263
+ offset = zero2_align(offset)
264
+ avail_numel = zero2_align(avail_numel)
265
+
266
+ if debug:
267
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
268
+
269
+ # Sanity check
270
+ if offset != avail_numel:
271
+ raise ValueError(
272
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
273
+
274
+ print(
275
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
276
+ )
277
+
278
+ return state_dict
279
+
280
+
281
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
282
+ remainder = unpartitioned_numel % world_size
283
+ padding_numel = (world_size - remainder) if remainder else 0
284
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
285
+ return partitioned_numel, padding_numel
286
+
287
+
288
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
289
+ param_shapes,
290
+ fp32_flat_groups,
291
+ buffers):
292
+
293
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
294
+ # param, re-consolidating each param, while dealing with padding if any
295
+
296
+ avail_numel = fp32_flat_groups[0].numel() * world_size
297
+ # merge list of dicts, preserving order
298
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
299
+
300
+ if debug:
301
+ for i in range(world_size):
302
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
303
+
304
+ wanted_params = len(param_shapes)
305
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
306
+ # not asserting if there is a mismatch due to possible padding
307
+ print(f"Have {avail_numel} numels to process.")
308
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
309
+
310
+ state_dict = OrderedDict()
311
+
312
+ # buffers
313
+ state_dict.update(buffers)
314
+ if debug:
315
+ print(f"added {len(buffers)} buffers")
316
+
317
+ # params
318
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
319
+ # out-of-core computing solution
320
+ offset = 0
321
+ total_numel = 0
322
+ total_params = 0
323
+ for name, shape in param_shapes.items():
324
+
325
+ unpartitioned_numel = shape.numel()
326
+ total_numel += unpartitioned_numel
327
+ total_params += 1
328
+
329
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
330
+
331
+ if debug:
332
+ print(
333
+ f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
334
+ )
335
+
336
+ # XXX: memory usage doubles here
337
+ state_dict[name] = torch.cat(
338
+ tuple(fp32_flat_groups[i].narrow(0,
339
+ offset,
340
+ partitioned_numel)
341
+ for i in range(world_size)),
342
+ 0).narrow(0,
343
+ 0,
344
+ unpartitioned_numel).view(shape)
345
+ offset += partitioned_numel
346
+
347
+ offset *= world_size
348
+
349
+ # Sanity check
350
+ if offset != avail_numel:
351
+ raise ValueError(
352
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
353
+
354
+ print(
355
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
356
+ )
357
+
358
+ return state_dict
359
+
360
+
361
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
362
+ """
363
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
364
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
365
+ via a model hub.
366
+
367
+ Args:
368
+ - ``checkpoint_dir``: path to the desired checkpoint folder
369
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
370
+
371
+ Returns:
372
+ - pytorch ``state_dict``
373
+
374
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
375
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
376
+ the checkpoint.
377
+
378
+ A typical usage might be ::
379
+
380
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
381
+ # do the training and checkpoint saving
382
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
383
+ model = model.cpu() # move to cpu
384
+ model.load_state_dict(state_dict)
385
+ # submit to model hub or save the model to share with others
386
+
387
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
388
+ application. i.e. you will need to re-initialize the deepspeed engine, since
389
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
390
+
391
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
392
+
393
+ """
394
+ if tag is None:
395
+ latest_path = os.path.join(checkpoint_dir, 'latest')
396
+ if os.path.isfile(latest_path):
397
+ with open(latest_path, 'r') as fd:
398
+ tag = fd.read().strip()
399
+ else:
400
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
401
+
402
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
403
+
404
+ if not os.path.isdir(ds_checkpoint_dir):
405
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
406
+
407
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
408
+
409
+
410
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
411
+ """
412
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
413
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
414
+
415
+ Args:
416
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
417
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
418
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
419
+ """
420
+
421
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
422
+ print(f"Saving fp32 state dict to {output_file}")
423
+ torch.save(state_dict, output_file)
424
+
425
+
426
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
427
+ """
428
+ 1. Put the provided model to cpu
429
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
430
+ 3. Load it into the provided model
431
+
432
+ Args:
433
+ - ``model``: the model object to update
434
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
435
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
436
+
437
+ Returns:
438
+ - ``model`: modified model
439
+
440
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
441
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
442
+ conveniently placed for you in the checkpoint folder.
443
+
444
+ A typical usage might be ::
445
+
446
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
447
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
448
+ # submit to model hub or save the model to share with others
449
+
450
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
451
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
452
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
453
+
454
+ """
455
+ logger.info(f"Extracting fp32 weights")
456
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
457
+
458
+ logger.info(f"Overwriting model with fp32 weights")
459
+ model = model.cpu()
460
+ model.load_state_dict(state_dict, strict=False)
461
+
462
+ return model
463
+
464
+
465
+ if __name__ == "__main__":
466
+
467
+ parser = argparse.ArgumentParser()
468
+ parser.add_argument(
469
+ "checkpoint_dir",
470
+ type=str,
471
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
472
+ parser.add_argument(
473
+ "output_file",
474
+ type=str,
475
+ help=
476
+ "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
477
+ )
478
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
479
+ args = parser.parse_args()
480
+
481
+ debug = args.debug
482
+
483
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
display_v3/2023-04-14_16-59-07/hparams.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ amp: true
2
+ batch_size: 16
3
+ cls_target: hd
4
+ deepspeed: true
5
+ dev_data_file: ''
6
+ downsample_data: true
7
+ early_dropout: null
8
+ epochs: 10
9
+ freeze_encoder: false
10
+ just_test: false
11
+ log_fold: ./logs
12
+ log_step: 10
13
+ lr: 5.0e-05
14
+ model_name: bert-base-uncased
15
+ positive_ratio: 0.4
16
+ pretrained_model_fold: ./pretrained_model
17
+ rdrop: null
18
+ running time: 0:06:52
19
+ share_encoder: false
20
+ test_data_file: ''
21
+ train_data_file: ''
22
+ train_ratio: 0.8
23
+ version: structure cmp
display_v3/2023-04-14_16-59-07/metrics.csv ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ train_loss,epoch,step,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_macro_f1,train_accuracy,train_precision,train_recall,train_f1,train_macro_f1,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_macro_f1
2
+ 0.611328125,0,9,,,,,,,,,,,,,,,,,
3
+ 0.5078125,0,19,,,,,,,,,,,,,,,,,
4
+ 0.7041015625,0,29,,,,,,,,,,,,,,,,,
5
+ 0.7119140625,0,39,,,,,,,,,,,,,,,,,
6
+ 0.398193359375,0,49,,,,,,,,,,,,,,,,,
7
+ 0.461181640625,0,59,,,,,,,,,,,,,,,,,
8
+ 0.386474609375,0,69,,,,,,,,,,,,,,,,,
9
+ 0.4326171875,0,79,,,,,,,,,,,,,,,,,
10
+ 0.66748046875,0,89,,,,,,,,,,,,,,,,,
11
+ 0.33544921875,0,99,,,,,,,,,,,,,,,,,
12
+ 0.30810546875,0,109,,,,,,,,,,,,,,,,,
13
+ 0.55615234375,0,119,,,,,,,,,,,,,,,,,
14
+ 0.2861328125,0,129,,,,,,,,,,,,,,,,,
15
+ 0.60400390625,0,139,,,,,,,,,,,,,,,,,
16
+ 0.623046875,0,149,,,,,,,,,,,,,,,,,
17
+ 0.53076171875,0,159,,,,,,,,,,,,,,,,,
18
+ 0.418701171875,0,169,,,,,,,,,,,,,,,,,
19
+ 0.495361328125,0,179,,,,,,,,,,,,,,,,,
20
+ 0.40625,0,189,,,,,,,,,,,,,,,,,
21
+ 0.448974609375,0,199,,,,,,,,,,,,,,,,,
22
+ 0.295654296875,0,209,,,,,,,,,,,,,,,,,
23
+ 0.6376953125,0,219,,,,,,,,,,,,,,,,,
24
+ 0.6279296875,0,229,,,,,,,,,,,,,,,,,
25
+ ,0,232,0.3514973521232605,0.8561705946922302,0.33702337741851807,0.7425474524497986,0.46362099051475525,0.46362099051475525,,,,,,,,,,,
26
+ ,0,232,,,,,,,0.7682795524597168,0.7229344844818115,0.6821236610412598,0.7019363641738892,0.7019363641738892,,,,,,
27
+ 0.2568359375,1,239,,,,,,,,,,,,,,,,,
28
+ 0.229248046875,1,249,,,,,,,,,,,,,,,,,
29
+ 0.066650390625,1,259,,,,,,,,,,,,,,,,,
30
+ 0.3134765625,1,269,,,,,,,,,,,,,,,,,
31
+ 0.1636962890625,1,279,,,,,,,,,,,,,,,,,
32
+ 0.36279296875,1,289,,,,,,,,,,,,,,,,,
33
+ 0.150146484375,1,299,,,,,,,,,,,,,,,,,
34
+ 0.246337890625,1,309,,,,,,,,,,,,,,,,,
35
+ 0.3505859375,1,319,,,,,,,,,,,,,,,,,
36
+ 0.329345703125,1,329,,,,,,,,,,,,,,,,,
37
+ 0.181640625,1,339,,,,,,,,,,,,,,,,,
38
+ 0.2763671875,1,349,,,,,,,,,,,,,,,,,
39
+ 0.241455078125,1,359,,,,,,,,,,,,,,,,,
40
+ 0.302490234375,1,369,,,,,,,,,,,,,,,,,
41
+ 0.255859375,1,379,,,,,,,,,,,,,,,,,
42
+ 0.24267578125,1,389,,,,,,,,,,,,,,,,,
43
+ 0.465576171875,1,399,,,,,,,,,,,,,,,,,
44
+ 0.484619140625,1,409,,,,,,,,,,,,,,,,,
45
+ 0.262451171875,1,419,,,,,,,,,,,,,,,,,
46
+ 0.271728515625,1,429,,,,,,,,,,,,,,,,,
47
+ 0.190673828125,1,439,,,,,,,,,,,,,,,,,
48
+ 0.474853515625,1,449,,,,,,,,,,,,,,,,,
49
+ 0.56201171875,1,459,,,,,,,,,,,,,,,,,
50
+ ,1,465,0.3760688304901123,0.8321233987808228,0.30616509914398193,0.794037938117981,0.4419306218624115,0.4419306218624115,,,,,,,,,,,
51
+ ,1,465,,,,,,,0.875268816947937,0.8464140892028809,0.8407257795333862,0.8435603380203247,0.8435603380203247,,,,,,
52
+ 0.07574462890625,2,469,,,,,,,,,,,,,,,,,
53
+ 0.206787109375,2,479,,,,,,,,,,,,,,,,,
54
+ 0.033966064453125,2,489,,,,,,,,,,,,,,,,,
55
+ 0.1544189453125,2,499,,,,,,,,,,,,,,,,,
56
+ 0.13671875,2,509,,,,,,,,,,,,,,,,,
57
+ 0.07061767578125,2,519,,,,,,,,,,,,,,,,,
58
+ 0.0055084228515625,2,529,,,,,,,,,,,,,,,,,
59
+ 0.2064208984375,2,539,,,,,,,,,,,,,,,,,
60
+ 0.020599365234375,2,549,,,,,,,,,,,,,,,,,
61
+ 0.0143585205078125,2,559,,,,,,,,,,,,,,,,,
62
+ 0.0257720947265625,2,569,,,,,,,,,,,,,,,,,
63
+ 0.052337646484375,2,579,,,,,,,,,,,,,,,,,
64
+ 0.0276031494140625,2,589,,,,,,,,,,,,,,,,,
65
+ 0.044097900390625,2,599,,,,,,,,,,,,,,,,,
66
+ 0.06817626953125,2,609,,,,,,,,,,,,,,,,,
67
+ 0.1556396484375,2,619,,,,,,,,,,,,,,,,,
68
+ 0.32763671875,2,629,,,,,,,,,,,,,,,,,
69
+ 0.05426025390625,2,639,,,,,,,,,,,,,,,,,
70
+ 0.0640869140625,2,649,,,,,,,,,,,,,,,,,
71
+ 0.0293426513671875,2,659,,,,,,,,,,,,,,,,,
72
+ 0.2349853515625,2,669,,,,,,,,,,,,,,,,,
73
+ 0.0736083984375,2,679,,,,,,,,,,,,,,,,,
74
+ 0.0474853515625,2,689,,,,,,,,,,,,,,,,,
75
+ ,2,698,1.0874371528625488,0.7009981870651245,0.2065553516149521,0.9051490426063538,0.33635449409484863,0.33635449409484863,,,,,,,,,,,
76
+ ,2,698,,,,,,,0.9575268626213074,0.9421542286872864,0.9522849321365356,0.9471924901008606,0.9471924901008606,,,,,,
77
+ 0.0280609130859375,3,699,,,,,,,,,,,,,,,,,
78
+ 0.09930419921875,3,709,,,,,,,,,,,,,,,,,
79
+ 0.0268096923828125,3,719,,,,,,,,,,,,,,,,,
80
+ 0.03912353515625,3,729,,,,,,,,,,,,,,,,,
81
+ 0.00539398193359375,3,739,,,,,,,,,,,,,,,,,
82
+ 0.0030689239501953125,3,749,,,,,,,,,,,,,,,,,
83
+ 0.00466156005859375,3,759,,,,,,,,,,,,,,,,,
84
+ 0.0199432373046875,3,769,,,,,,,,,,,,,,,,,
85
+ 0.0164337158203125,3,779,,,,,,,,,,,,,,,,,
86
+ 0.09100341796875,3,789,,,,,,,,,,,,,,,,,
87
+ 0.0152435302734375,3,799,,,,,,,,,,,,,,,,,
88
+ 0.0019006729125976562,3,809,,,,,,,,,,,,,,,,,
89
+ 0.00083160400390625,3,819,,,,,,,,,,,,,,,,,
90
+ 0.0223541259765625,3,829,,,,,,,,,,,,,,,,,
91
+ 0.1595458984375,3,839,,,,,,,,,,,,,,,,,
92
+ 0.004375457763671875,3,849,,,,,,,,,,,,,,,,,
93
+ 0.01349639892578125,3,859,,,,,,,,,,,,,,,,,
94
+ 0.040191650390625,3,869,,,,,,,,,,,,,,,,,
95
+ 0.031494140625,3,879,,,,,,,,,,,,,,,,,
96
+ 0.01474761962890625,3,889,,,,,,,,,,,,,,,,,
97
+ 0.022308349609375,3,899,,,,,,,,,,,,,,,,,
98
+ 0.035919189453125,3,909,,,,,,,,,,,,,,,,,
99
+ 0.107177734375,3,919,,,,,,,,,,,,,,,,,
100
+ 0.04888916015625,3,929,,,,,,,,,,,,,,,,,
101
+ ,3,931,0.452526330947876,0.8439201712608337,0.31168830394744873,0.7154471278190613,0.43421053886413574,0.43421053886413574,,,,,,,,,,,
102
+ ,3,931,,,,,,,0.9806451797485352,0.974530816078186,0.977150559425354,0.9758388996124268,0.9758388996124268,,,,,,
103
+ 0.0186004638671875,4,939,,,,,,,,,,,,,,,,,
104
+ 0.0024738311767578125,4,949,,,,,,,,,,,,,,,,,
105
+ 0.0012655258178710938,4,959,,,,,,,,,,,,,,,,,
106
+ 0.001422882080078125,4,969,,,,,,,,,,,,,,,,,
107
+ 0.0029392242431640625,4,979,,,,,,,,,,,,,,,,,
108
+ 0.1710205078125,4,989,,,,,,,,,,,,,,,,,
109
+ 0.12115478515625,4,999,,,,,,,,,,,,,,,,,
110
+ 0.00638580322265625,4,1009,,,,,,,,,,,,,,,,,
111
+ 0.00469207763671875,4,1019,,,,,,,,,,,,,,,,,
112
+ 0.013702392578125,4,1029,,,,,,,,,,,,,,,,,
113
+ 0.0222625732421875,4,1039,,,,,,,,,,,,,,,,,
114
+ 0.045074462890625,4,1049,,,,,,,,,,,,,,,,,
115
+ 0.00867462158203125,4,1059,,,,,,,,,,,,,,,,,
116
+ 0.003887176513671875,4,1069,,,,,,,,,,,,,,,,,
117
+ 0.029052734375,4,1079,,,,,,,,,,,,,,,,,
118
+ 0.0028285980224609375,4,1089,,,,,,,,,,,,,,,,,
119
+ 0.00045561790466308594,4,1099,,,,,,,,,,,,,,,,,
120
+ 0.0133209228515625,4,1109,,,,,,,,,,,,,,,,,
121
+ 0.304443359375,4,1119,,,,,,,,,,,,,,,,,
122
+ 0.002223968505859375,4,1129,,,,,,,,,,,,,,,,,
123
+ 0.0014781951904296875,4,1139,,,,,,,,,,,,,,,,,
124
+ 0.005718231201171875,4,1149,,,,,,,,,,,,,,,,,
125
+ 0.0115966796875,4,1159,,,,,,,,,,,,,,,,,
126
+ ,4,1164,0.7260090708732605,0.7958257794380188,0.2644188106060028,0.8075881004333496,0.39839571714401245,0.39839571714401245,,,,,,,,,,,
127
+ ,4,1164,,,,,,,0.9889785051345825,0.9852448105812073,0.9872311949729919,0.9862369894981384,0.9862369894981384,,,,,,
128
+ 0.005008697509765625,5,1169,,,,,,,,,,,,,,,,,
129
+ 0.007183074951171875,5,1179,,,,,,,,,,,,,,,,,
130
+ 0.1400146484375,5,1189,,,,,,,,,,,,,,,,,
131
+ 0.0007276535034179688,5,1199,,,,,,,,,,,,,,,,,
132
+ 0.1689453125,5,1209,,,,,,,,,,,,,,,,,
133
+ 0.00457763671875,5,1219,,,,,,,,,,,,,,,,,
134
+ 0.035430908203125,5,1229,,,,,,,,,,,,,,,,,
135
+ 0.0012540817260742188,5,1239,,,,,,,,,,,,,,,,,
136
+ 0.0225372314453125,5,1249,,,,,,,,,,,,,,,,,
137
+ 0.0008778572082519531,5,1259,,,,,,,,,,,,,,,,,
138
+ 0.01336669921875,5,1269,,,,,,,,,,,,,,,,,
139
+ 0.00044846534729003906,5,1279,,,,,,,,,,,,,,,,,
140
+ 0.00408172607421875,5,1289,,,,,,,,,,,,,,,,,
141
+ 0.00037980079650878906,5,1299,,,,,,,,,,,,,,,,,
142
+ 0.0004723072052001953,5,1309,,,,,,,,,,,,,,,,,
143
+ 0.01436614990234375,5,1319,,,,,,,,,,,,,,,,,
144
+ 0.0670166015625,5,1329,,,,,,,,,,,,,,,,,
145
+ 0.07574462890625,5,1339,,,,,,,,,,,,,,,,,
146
+ 0.01025390625,5,1349,,,,,,,,,,,,,,,,,
147
+ 0.10150146484375,5,1359,,,,,,,,,,,,,,,,,
148
+ 0.0014791488647460938,5,1369,,,,,,,,,,,,,,,,,
149
+ 0.003528594970703125,5,1379,,,,,,,,,,,,,,,,,
150
+ 0.002532958984375,5,1389,,,,,,,,,,,,,,,,,
151
+ ,5,1397,1.1667187213897705,0.7558983564376831,0.2332075536251068,0.8373983502388,0.3648169934749603,0.3648169934749603,,,,,,,,,,,
152
+ ,5,1397,,,,,,,0.9905914068222046,0.9892255663871765,0.9872311949729919,0.988227367401123,0.988227367401123,,,,,,
153
+ 0.004306793212890625,6,1399,,,,,,,,,,,,,,,,,
154
+ 0.010406494140625,6,1409,,,,,,,,,,,,,,,,,
155
+ 0.00102996826171875,6,1419,,,,,,,,,,,,,,,,,
156
+ 0.00447845458984375,6,1429,,,,,,,,,,,,,,,,,
157
+ 0.0005435943603515625,6,1439,,,,,,,,,,,,,,,,,
158
+ 0.155029296875,6,1449,,,,,,,,,,,,,,,,,
159
+ 0.00028395652770996094,6,1459,,,,,,,,,,,,,,,,,
160
+ 0.0070037841796875,6,1469,,,,,,,,,,,,,,,,,
161
+ 0.009063720703125,6,1479,,,,,,,,,,,,,,,,,
162
+ 0.07452392578125,6,1489,,,,,,,,,,,,,,,,,
163
+ 0.005832672119140625,6,1499,,,,,,,,,,,,,,,,,
164
+ 0.0043487548828125,6,1509,,,,,,,,,,,,,,,,,
165
+ 0.00695037841796875,6,1519,,,,,,,,,,,,,,,,,
166
+ 0.06646728515625,6,1529,,,,,,,,,,,,,,,,,
167
+ 0.001789093017578125,6,1539,,,,,,,,,,,,,,,,,
168
+ 0.00323486328125,6,1549,,,,,,,,,,,,,,,,,
169
+ 0.0006985664367675781,6,1559,,,,,,,,,,,,,,,,,
170
+ 0.0648193359375,6,1569,,,,,,,,,,,,,,,,,
171
+ 0.01558685302734375,6,1579,,,,,,,,,,,,,,,,,
172
+ 0.00103759765625,6,1589,,,,,,,,,,,,,,,,,
173
+ 0.001270294189453125,6,1599,,,,,,,,,,,,,,,,,
174
+ 0.0002396106719970703,6,1609,,,,,,,,,,,,,,,,,
175
+ 0.0003399848937988281,6,1619,,,,,,,,,,,,,,,,,
176
+ 0.04937744140625,6,1629,,,,,,,,,,,,,,,,,
177
+ ,6,1630,0.5342352986335754,0.8849818706512451,0.38383838534355164,0.6178861856460571,0.4735202491283417,0.4735202491283417,,,,,,,,,,,
178
+ ,6,1630,,,,,,,0.9916666746139526,0.987943708896637,0.9912634491920471,0.9896007776260376,0.9896007776260376,,,,,,
179
+ 0.0019931793212890625,7,1639,,,,,,,,,,,,,,,,,
180
+ 0.0010328292846679688,7,1649,,,,,,,,,,,,,,,,,
181
+ 0.002391815185546875,7,1659,,,,,,,,,,,,,,,,,
182
+ 0.025543212890625,7,1669,,,,,,,,,,,,,,,,,
183
+ 0.0016775131225585938,7,1679,,,,,,,,,,,,,,,,,
184
+ 0.035919189453125,7,1689,,,,,,,,,,,,,,,,,
185
+ 0.00547027587890625,7,1699,,,,,,,,,,,,,,,,,
186
+ 0.0006341934204101562,7,1709,,,,,,,,,,,,,,,,,
187
+ 0.0009632110595703125,7,1719,,,,,,,,,,,,,,,,,
188
+ 0.00418853759765625,7,1729,,,,,,,,,,,,,,,,,
189
+ 0.0033130645751953125,7,1739,,,,,,,,,,,,,,,,,
190
+ 0.001251220703125,7,1749,,,,,,,,,,,,,,,,,
191
+ 0.00024580955505371094,7,1759,,,,,,,,,,,,,,,,,
192
+ 0.0007381439208984375,7,1769,,,,,,,,,,,,,,,,,
193
+ 0.00131988525390625,7,1779,,,,,,,,,,,,,,,,,
194
+ 0.00652313232421875,7,1789,,,,,,,,,,,,,,,,,
195
+ 0.00263214111328125,7,1799,,,,,,,,,,,,,,,,,
196
+ 0.0014677047729492188,7,1809,,,,,,,,,,,,,,,,,
197
+ 0.0016336441040039062,7,1819,,,,,,,,,,,,,,,,,
198
+ 0.0007638931274414062,7,1829,,,,,,,,,,,,,,,,,
199
+ 0.00135040283203125,7,1839,,,,,,,,,,,,,,,,,
200
+ 0.002391815185546875,7,1849,,,,,,,,,,,,,,,,,
201
+ 0.0011796951293945312,7,1859,,,,,,,,,,,,,,,,,
202
+ ,7,1863,1.4027303457260132,0.7279945611953735,0.22370173037052155,0.9105691313743591,0.3591662347316742,0.3591662347316742,,,,,,,,,,,
203
+ ,7,1863,,,,,,,0.9935483932495117,0.9925975799560547,0.9912634491920471,0.9919300675392151,0.9919300675392151,,,,,,
204
+ 0.00128936767578125,8,1869,,,,,,,,,,,,,,,,,
205
+ 0.00234222412109375,8,1879,,,,,,,,,,,,,,,,,
206
+ 0.00522613525390625,8,1889,,,,,,,,,,,,,,,,,
207
+ 0.1265869140625,8,1899,,,,,,,,,,,,,,,,,
208
+ 0.0026607513427734375,8,1909,,,,,,,,,,,,,,,,,
209
+ 0.0024738311767578125,8,1919,,,,,,,,,,,,,,,,,
210
+ 0.0029926300048828125,8,1929,,,,,,,,,,,,,,,,,
211
+ 0.0010385513305664062,8,1939,,,,,,,,,,,,,,,,,
212
+ 0.0003845691680908203,8,1949,,,,,,,,,,,,,,,,,
213
+ 0.0232086181640625,8,1959,,,,,,,,,,,,,,,,,
214
+ 0.00035262107849121094,8,1969,,,,,,,,,,,,,,,,,
215
+ 0.00084686279296875,8,1979,,,,,,,,,,,,,,,,,
216
+ 0.0023326873779296875,8,1989,,,,,,,,,,,,,,,,,
217
+ 0.0024738311767578125,8,1999,,,,,,,,,,,,,,,,,
218
+ 0.0016727447509765625,8,2009,,,,,,,,,,,,,,,,,
219
+ 0.0006160736083984375,8,2019,,,,,,,,,,,,,,,,,
220
+ 0.0017137527465820312,8,2029,,,,,,,,,,,,,,,,,
221
+ 0.145751953125,8,2039,,,,,,,,,,,,,,,,,
222
+ 0.000591278076171875,8,2049,,,,,,,,,,,,,,,,,
223
+ 0.001270294189453125,8,2059,,,,,,,,,,,,,,,,,
224
+ 0.00011342763900756836,8,2069,,,,,,,,,,,,,,,,,
225
+ 0.0004799365997314453,8,2079,,,,,,,,,,,,,,,,,
226
+ 0.00064849853515625,8,2089,,,,,,,,,,,,,,,,,
227
+ ,8,2096,1.368309736251831,0.7813067436218262,0.2531120479106903,0.8265582919120789,0.38754764199256897,0.38754764199256897,,,,,,,,,,,
228
+ ,8,2096,,,,,,,0.9967741966247559,0.9953020215034485,0.9966397881507874,0.9959704279899597,0.9959704279899597,,,,,,
229
+ 0.00011461973190307617,9,2099,,,,,,,,,,,,,,,,,
230
+ 0.00010311603546142578,9,2109,,,,,,,,,,,,,,,,,
231
+ 8.416175842285156e-05,9,2119,,,,,,,,,,,,,,,,,
232
+ 0.00021708011627197266,9,2129,,,,,,,,,,,,,,,,,
233
+ 0.0002524852752685547,9,2139,,,,,,,,,,,,,,,,,
234
+ 8.296966552734375e-05,9,2149,,,,,,,,,,,,,,,,,
235
+ 0.0005345344543457031,9,2159,,,,,,,,,,,,,,,,,
236
+ 0.0016202926635742188,9,2169,,,,,,,,,,,,,,,,,
237
+ 0.0078582763671875,9,2179,,,,,,,,,,,,,,,,,
238
+ 0.00012177228927612305,9,2189,,,,,,,,,,,,,,,,,
239
+ 0.15673828125,9,2199,,,,,,,,,,,,,,,,,
240
+ 0.00012803077697753906,9,2209,,,,,,,,,,,,,,,,,
241
+ 8.165836334228516e-05,9,2219,,,,,,,,,,,,,,,,,
242
+ 7.212162017822266e-05,9,2229,,,,,,,,,,,,,,,,,
243
+ 0.0003197193145751953,9,2239,,,,,,,,,,,,,,,,,
244
+ 0.09149169921875,9,2249,,,,,,,,,,,,,,,,,
245
+ 0.001117706298828125,9,2259,,,,,,,,,,,,,,,,,
246
+ 0.00513458251953125,9,2269,,,,,,,,,,,,,,,,,
247
+ 0.021209716796875,9,2279,,,,,,,,,,,,,,,,,
248
+ 0.0097503662109375,9,2289,,,,,,,,,,,,,,,,,
249
+ 0.0246429443359375,9,2299,,,,,,,,,,,,,,,,,
250
+ 0.0011377334594726562,9,2309,,,,,,,,,,,,,,,,,
251
+ 0.001354217529296875,9,2319,,,,,,,,,,,,,,,,,
252
+ 0.00032639503479003906,9,2329,,,,,,,,,,,,,,,,,
253
+ ,9,2329,1.1365300416946411,0.7631579041481018,0.242562934756279,0.8617886304855347,0.37857142090797424,0.37857142090797424,,,,,,,,,,,
254
+ ,9,2329,,,,,,,0.9940860271453857,0.9912868738174438,0.9939516186714172,0.9926174283027649,0.9926174283027649,,,,,,
255
+ ,10,2330,,,,,,,,,,,,0.5662358999252319,0.8833031058311462,0.3981233239173889,0.6048879623413086,0.4801940321922302,0.4801940321922302
display_v3/2023-04-14_16-59-07/yes.txt ADDED
File without changes
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fc4b81c7a5f3c433e5c8de5b92730a3c76c4a32c2dc8b60ad349d236a1c0697
3
+ size 220228915
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bacc5807e8789eb3a6853e0d7a148cf921a0737988170b3dfcd7136f205cc60
3
+ size 1320918341
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ checkpoint
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ '''Copyright The Microsoft DeepSpeed Team'''
3
+
4
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
5
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
6
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
7
+ # application.
8
+ #
9
+ # example: python zero_to_fp32.py . pytorch_model.bin
10
+
11
+ import argparse
12
+ import torch
13
+ import glob
14
+ import math
15
+ import os
16
+ import re
17
+ from collections import OrderedDict
18
+
19
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
20
+ # DeepSpeed data structures it has to be available in the current python environment.
21
+ from deepspeed.utils import logger
22
+ from deepspeed.checkpoint.constants import (DS_VERSION,
23
+ OPTIMIZER_STATE_DICT,
24
+ SINGLE_PARTITION_OF_FP32_GROUPS,
25
+ FP32_FLAT_GROUPS,
26
+ ZERO_STAGE,
27
+ PARTITION_COUNT,
28
+ PARAM_SHAPES,
29
+ BUFFER_NAMES)
30
+
31
+ debug = 0
32
+
33
+ # load to cpu
34
+ device = torch.device('cpu')
35
+
36
+
37
def atoi(text):
    """Convert a digit-only string to an ``int``; leave any other string unchanged."""
    if text.isdigit():
        return int(text)
    return text


def natural_keys(text):
    """Sort key that orders strings the way a human would ("natural" order).

    ``alist.sort(key=natural_keys)`` sorts in human order, so e.g. ``rank_2``
    comes before ``rank_10``.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    """
    return [atoi(chunk) for chunk in re.split(r'(\d+)', text)]
48
+
49
+
50
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the single model-states file inside *checkpoint_dir*.

    Args:
        checkpoint_dir: directory holding the per-rank checkpoint files.
        zero_stage: ZeRO stage the checkpoint was written with (2 or 3);
            the two stages use different model-states file names.

    Returns:
        Path to the model-states ``.pt`` file.

    Raises:
        FileNotFoundError: if the directory or the expected file is missing.
        ValueError: if *zero_stage* is neither 2 nor 3.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage == 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # Fix: the original fell through here and crashed with an
        # UnboundLocalError on `file`; fail fast with a clear message instead.
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
64
+
65
+
66
def get_optim_files(checkpoint_dir):
    """Return every '*_optim_states.pt' file in *checkpoint_dir*, naturally sorted.

    Raises:
        FileNotFoundError: if no optimizer-states file is present.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, "*_optim_states.pt")
    optim_files = sorted(glob.glob(pattern), key=natural_keys)

    if not optim_files:
        raise FileNotFoundError(
            f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")

    return optim_files
77
+
78
+
79
def parse_model_state(file):
    """Load a model-states checkpoint and extract what fp32 reconstruction needs.

    Args:
        file: path to a DeepSpeed ``*_model_states.pt`` file.

    Returns:
        Tuple ``(buffers, param_shapes, ds_version)`` where *buffers* maps
        buffer names to fp32 tensors, *param_shapes* is the saved shape
        metadata, and *ds_version* is the writing deepspeed version (or None).

    Raises:
        ValueError: if *file* is not a model-states checkpoint.
    """
    state_dict = torch.load(file, map_location=device)

    if BUFFER_NAMES not in state_dict:
        raise ValueError(f"{file} is not a model state checkpoint")

    buffer_names = state_dict[BUFFER_NAMES]
    if debug:
        print("Found buffers:", buffer_names)

    # Buffers may have been saved in fp16; promote them back to fp32 here.
    module_state = state_dict["module"]
    buffers = {name: tensor.float()
               for name, tensor in module_state.items()
               if name in buffer_names}

    param_shapes = state_dict[PARAM_SHAPES]
    ds_version = state_dict.get(DS_VERSION, None)

    return buffers, param_shapes, ds_version
99
+
100
+
101
def parse_optim_states(files, ds_checkpoint_dir):
    """Load all per-rank optimizer-states files and pull out the fp32 shards.

    Args:
        files: per-rank ``*_optim_states.pt`` paths (naturally sorted).
        ds_checkpoint_dir: checkpoint directory, used only in error messages.

    Returns:
        Tuple ``(zero_stage, world_size, fp32_flat_groups)``; for stage 3 the
        per-group flat tensors of each rank are concatenated into one tensor.

    Raises:
        ValueError: if the files are not a zero checkpoint, the file count
            does not match the saved world size, or the stage is unknown.
    """
    state_dicts = [torch.load(f, map_location=device) for f in files]

    first_optim = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim[ZERO_STAGE]
    world_size = first_optim[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != len(files):
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {len(files)} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage == 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
        fp32_flat_groups = [
            sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts
        ]
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [
            torch.cat(sd[OPTIMIZER_STATE_DICT][fp32_groups_key], 0)
            for sd in state_dicts
        ]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
152
+
153
+
154
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """Return the fp32 state_dict reconstructed from a ds checkpoint folder.

    Args:
        ds_checkpoint_dir: path to the deepspeed checkpoint folder (where the
            optimizer files are).
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(
        f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
    buffers, param_shapes, ds_version = parse_model_state(model_file)
    print(f'Parsing checkpoint created by deepspeed=={ds_version}')

    # Dispatch to the stage-specific reconstruction routine.
    reconstructors = {
        2: _get_fp32_state_dict_from_zero2_checkpoint,
        3: _get_fp32_state_dict_from_zero3_checkpoint,
    }
    reconstruct = reconstructors.get(zero_stage)
    if reconstruct is None:
        # mirrors the original's implicit None for any other stage
        # (parse_optim_states has already rejected unknown stages anyway)
        return None
    return reconstruct(world_size, param_shapes, fp32_flat_groups, buffers)
183
+
184
+
185
def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """Rebuild a consolidated fp32 state_dict from ZeRO stage-2 shards.

    Args:
        world_size: number of ranks (== number of optim-states files).
        param_shapes: per-param-group dicts mapping param name -> shape.
        fp32_flat_groups: one list per rank, each holding one flattened fp32
            partition tensor per param group.
        buffers: fp32 buffers recovered by ``parse_model_state``.

    Returns:
        ``OrderedDict`` mapping buffer/param names to full fp32 tensors.
    """
    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(
                    f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    # Concatenate the per-rank partitions of each param group into one full
    # flat vector per group.
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum([
        full_single_fp32_vector.numel()
        for full_single_fp32_vector in merged_single_partition_of_fp32_groups
    ])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum(
            [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(
                    f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
                )
            # Slice this param's elements out of the flat vector and restore
            # its original shape (narrow returns a view; no copy here).
            state_dict[name] = full_single_fp32_vector.narrow(
                0,
                offset,
                unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            # round x up to the nearest multiple of align_to
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(
                f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
279
+
280
+
281
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for a ZeRO-3 param.

    *partitioned_numel* is how many elements each rank holds for a param of
    *unpartitioned_numel* total elements; *padding_numel* is how many zero
    elements were appended so the total divides evenly across ranks.
    """
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    remainder = unpartitioned_numel % world_size
    padding_numel = 0 if remainder == 0 else world_size - remainder
    return partitioned_numel, padding_numel
286
+
287
+
288
def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """Rebuild a consolidated fp32 state_dict from ZeRO stage-3 shards.

    Args:
        world_size: number of ranks (== number of optim-states files).
        param_shapes: list of dicts mapping param name -> shape (one dict per
            param group; merged below).
        fp32_flat_groups: one flattened fp32 tensor per rank (the per-group
            tensors were already concatenated in ``parse_optim_states``).
        buffers: fp32 buffers recovered by ``parse_model_state``.

    Returns:
        ``OrderedDict`` mapping buffer/param names to full fp32 tensors.
    """
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    avail_numel = fp32_flat_groups[0].numel() * world_size
    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    print(f"Have {avail_numel} numels to process.")
    print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # Take each rank's slice of this param, concatenate them, then drop
        # the trailing padding elements before restoring the original shape.
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0,
                                             offset,
                                             partitioned_numel)
                  for i in range(world_size)),
            0).narrow(0,
                      0,
                      unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset advanced by the per-rank element count; scale to total elements
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(
            f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
359
+
360
+
361
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """Convert a ZeRO 2/3 checkpoint into a single consolidated fp32 state_dict.

    The result can be loaded with ``load_state_dict()`` and used for training
    without DeepSpeed, or shared with others (e.g. via a model hub).

    Args:
        checkpoint_dir: path to the desired checkpoint folder.
        tag: checkpoint tag uniquely identifying the checkpoint, e.g.
            ``global_step14``. When omitted, the tag is read from the
            ``latest`` file inside *checkpoint_dir*.

    Returns:
        A pytorch ``state_dict`` (already on CPU).

    Note: this approach may not work if your application doesn't have
    sufficient free CPU memory; in that case use the offline ``zero_to_fp32.py``
    script that is saved with the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)  # already on cpu
        model = model.cpu()  # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    After ``model.load_state_dict(state_dict)`` the model is stripped of its
    DeepSpeed wrapping — re-initialize the deepspeed engine if you need it
    again. If you want it all done for you, use
    ``load_state_dict_from_zero_checkpoint`` instead.
    """
    if tag is None:
        # resolve the tag from the 'latest' marker file DeepSpeed writes
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
408
+
409
+
410
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """Write a ZeRO 2/3 checkpoint out as one fp32 consolidated state_dict file.

    The output can be loaded with ``torch.load(file)`` + ``load_state_dict()``
    and used for training without DeepSpeed.

    Args:
        checkpoint_dir: path to the checkpoint folder (the one containing the
            tag-folder, like ``global_step14``).
        output_file: path of the pytorch fp32 state_dict output file
            (e.g. ``path/pytorch_model.bin``).
        tag: checkpoint tag uniquely identifying the checkpoint; when omitted
            it is read from the ``latest`` file in the checkpoint folder.
    """
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(state_dict, output_file)
424
+
425
+
426
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """Overwrite *model* (moved to CPU) with consolidated fp32 ZeRO weights.

    1. Put the provided model on cpu.
    2. Convert the ZeRO 2/3 checkpoint into a single fp32 ``state_dict``.
    3. Load it into the provided model.

    Args:
        model: the model object to update.
        checkpoint_dir: path to the checkpoint folder (the one containing the
            tag-folder, like ``global_step14``).
        tag: checkpoint tag uniquely identifying the checkpoint; when omitted
            it is read from the ``latest`` file in the checkpoint folder.

    Returns:
        The modified model.

    Make sure you have plenty of CPU memory available before calling this;
    otherwise use the ``zero_to_fp32.py`` utility conveniently placed in the
    checkpoint folder. Note that after this call the model is no longer usable
    in the deepspeed context of the same application — re-initialize the
    deepspeed engine, since ``model.load_state_dict(state_dict)`` removes all
    the deepspeed magic from it.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others
    """
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: names present in the model but absent from the
    # reconstructed state_dict are left untouched
    model.load_state_dict(state_dict, strict=False)

    return model
463
+
464
+
465
if __name__ == "__main__":
    # CLI entry point: zero_to_fp32.py <checkpoint_dir> <output_file> [-d]
    cli = argparse.ArgumentParser()
    cli.add_argument("checkpoint_dir",
                     type=str,
                     help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    cli.add_argument(
        "output_file",
        type=str,
        help=
        "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
    )
    cli.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = cli.parse_args()

    # flip the module-level debug switch before doing any work
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
display_v3/2023-04-14_17-06-18/hparams.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ amp: true
2
+ batch_size: 16
3
+ cls_target: cv
4
+ deepspeed: true
5
+ dev_data_file: ''
6
+ downsample_data: true
7
+ early_dropout: null
8
+ epochs: 10
9
+ freeze_encoder: false
10
+ just_test: false
11
+ log_fold: ./logs
12
+ log_step: 10
13
+ lr: 5.0e-05
14
+ model_name: bert-base-uncased
15
+ positive_ratio: 0.4
16
+ pretrained_model_fold: ./pretrained_model
17
+ rdrop: null
18
+ running time: 0:01:56
19
+ share_encoder: false
20
+ test_data_file: ''
21
+ train_data_file: ''
22
+ train_ratio: 0.8
23
+ version: structure cmp
display_v3/2023-04-14_17-06-18/metrics.csv ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ train_loss,epoch,step,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_macro_f1,train_accuracy,train_precision,train_recall,train_f1,train_macro_f1,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_macro_f1
2
+ 0.70458984375,0,9,,,,,,,,,,,,,,,,,
3
+ ,0,15,0.5909940004348755,0.6669691205024719,0.014864864759147167,0.6875,0.029100529849529266,0.029100529849529266,,,,,,,,,,,
4
+ ,0,15,,,,,,,0.6275303363800049,0.7692307829856873,0.10101009905338287,0.1785714328289032,0.1785714328289032,,,,,,
5
+ 0.4501953125,1,19,,,,,,,,,,,,,,,,,
6
+ 0.361328125,1,29,,,,,,,,,,,,,,,,,
7
+ ,1,31,0.23693892359733582,0.9108439087867737,0.045340050011873245,0.5625,0.0839160829782486,0.0839160829782486,,,,,,,,,,,
8
+ ,1,31,,,,,,,0.8825910687446594,0.8500000238418579,0.8585858345031738,0.8542713522911072,0.8542713522911072,,,,,,
9
+ 0.08734130859375,2,39,,,,,,,,,,,,,,,,,
10
+ ,2,47,0.6708253026008606,0.7488657236099243,0.023914968594908714,0.84375,0.04651162773370743,0.04651162773370743,,,,,,,,,,,
11
+ ,2,47,,,,,,,0.9473684430122375,0.9479166865348816,0.9191918969154358,0.9333333373069763,0.9333333373069763,,,,,,
12
+ 0.078369140625,3,49,,,,,,,,,,,,,,,,,
13
+ 0.0077056884765625,3,59,,,,,,,,,,,,,,,,,
14
+ ,3,63,0.16590917110443115,0.9494101405143738,0.06787330657243729,0.46875,0.11857707798480988,0.11857707798480988,,,,,,,,,,,
15
+ ,3,63,,,,,,,0.9959514141082764,0.9900000095367432,1.0,0.9949748516082764,0.9949748516082764,,,,,,
16
+ 0.004322052001953125,4,69,,,,,,,,,,,,,,,,,
17
+ 0.00762176513671875,4,79,,,,,,,,,,,,,,,,,
18
+ ,4,79,0.48889032006263733,0.8593466281890869,0.03627760335803032,0.71875,0.06906907260417938,0.06906907260417938,,,,,,,,,,,
19
+ ,4,79,,,,,,,0.9919028282165527,0.9898989796638489,0.9898989796638489,0.9898989796638489,0.9898989796638489,,,,,,
20
+ 0.078125,5,89,,,,,,,,,,,,,,,,,
21
+ ,5,95,1.2922793626785278,0.7039473652839661,0.021068472415208817,0.875,0.04114621505141258,0.04114621505141258,,,,,,,,,,,
22
+ ,5,95,,,,,,,0.9919028282165527,0.9898989796638489,0.9898989796638489,0.9898989796638489,0.9898989796638489,,,,,,
23
+ 0.015960693359375,6,99,,,,,,,,,,,,,,,,,
24
+ 0.003612518310546875,6,109,,,,,,,,,,,,,,,,,
25
+ ,6,111,0.8841056227684021,0.7894737124443054,0.028421051800251007,0.84375,0.05498981848359108,0.05498981848359108,,,,,,,,,,,
26
+ ,6,111,,,,,,,0.9959514141082764,0.9900000095367432,1.0,0.9949748516082764,0.9949748516082764,,,,,,
27
+ 0.0019989013671875,7,119,,,,,,,,,,,,,,,,,
28
+ ,7,127,0.69402015209198,0.8504990935325623,0.039647575467824936,0.84375,0.07573632895946503,0.07573632895946503,,,,,,,,,,,
29
+ ,7,127,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,
30
+ 0.0010175704956054688,8,129,,,,,,,,,,,,,,,,,
31
+ 0.0007581710815429688,8,139,,,,,,,,,,,,,,,,,
32
+ ,8,143,0.6454178094863892,0.8666061758995056,0.042763158679008484,0.8125,0.08124999701976776,0.08124999701976776,,,,,,,,,,,
33
+ ,8,143,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,
34
+ 0.0009255409240722656,9,149,,,,,,,,,,,,,,,,,
35
+ 0.00054931640625,9,159,,,,,,,,,,,,,,,,,
36
+ ,9,159,0.6520931720733643,0.8681941628456116,0.04326122999191284,0.8125,0.08214849978685379,0.08214849978685379,,,,,,,,,,,
37
+ ,9,159,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,
38
+ ,10,160,,,,,,,,,,,,0.1696726232767105,0.9466424584388733,0.0439189188182354,0.5416666865348816,0.08124999701976776,0.08124999701976776
display_v3/2023-04-14_17-06-18/yes.txt ADDED
File without changes
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d544e1553c822e187b976e75cb402dc7a351855b355913ad28b7ed8e97e4e8
3
+ size 220228915
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64eb30cf079871506af468afdfbf83a06d02247526ba2b092ab84f001e57929b
3
+ size 1320918341
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ checkpoint
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ '''Copyright The Microsoft DeepSpeed Team'''
3
+
4
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
5
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
6
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
7
+ # application.
8
+ #
9
+ # example: python zero_to_fp32.py . pytorch_model.bin
10
+
11
+ import argparse
12
+ import torch
13
+ import glob
14
+ import math
15
+ import os
16
+ import re
17
+ from collections import OrderedDict
18
+
19
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
20
+ # DeepSpeed data structures it has to be available in the current python environment.
21
+ from deepspeed.utils import logger
22
+ from deepspeed.checkpoint.constants import (DS_VERSION,
23
+ OPTIMIZER_STATE_DICT,
24
+ SINGLE_PARTITION_OF_FP32_GROUPS,
25
+ FP32_FLAT_GROUPS,
26
+ ZERO_STAGE,
27
+ PARTITION_COUNT,
28
+ PARAM_SHAPES,
29
+ BUFFER_NAMES)
30
+
31
+ debug = 0
32
+
33
+ # load to cpu
34
+ device = torch.device('cpu')
35
+
36
+
37
+ def atoi(text):
38
+ return int(text) if text.isdigit() else text
39
+
40
+
41
+ def natural_keys(text):
42
+ '''
43
+ alist.sort(key=natural_keys) sorts in human order
44
+ http://nedbatchelder.com/blog/200712/human_sorting.html
45
+ (See Toothy's implementation in the comments)
46
+ '''
47
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
48
+
49
+
50
+ def get_model_state_file(checkpoint_dir, zero_stage):
51
+ if not os.path.isdir(checkpoint_dir):
52
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
53
+
54
+ # there should be only one file
55
+ if zero_stage == 2:
56
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
57
+ elif zero_stage == 3:
58
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
59
+
60
+ if not os.path.exists(file):
61
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
62
+
63
+ return file
64
+
65
+
66
+ def get_optim_files(checkpoint_dir):
67
+ # XXX: need to test that this simple glob rule works for multi-node setup too
68
+ optim_files = sorted(glob.glob(os.path.join(checkpoint_dir,
69
+ "*_optim_states.pt")),
70
+ key=natural_keys)
71
+
72
+ if len(optim_files) == 0:
73
+ raise FileNotFoundError(
74
+ f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")
75
+
76
+ return optim_files
77
+
78
+
79
+ def parse_model_state(file):
80
+ state_dict = torch.load(file, map_location=device)
81
+
82
+ if BUFFER_NAMES not in state_dict:
83
+ raise ValueError(f"{file} is not a model state checkpoint")
84
+ buffer_names = state_dict[BUFFER_NAMES]
85
+ if debug:
86
+ print("Found buffers:", buffer_names)
87
+
88
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
89
+ buffers = {
90
+ k: v.float()
91
+ for k,
92
+ v in state_dict["module"].items() if k in buffer_names
93
+ }
94
+ param_shapes = state_dict[PARAM_SHAPES]
95
+
96
+ ds_version = state_dict.get(DS_VERSION, None)
97
+
98
+ return buffers, param_shapes, ds_version
99
+
100
+
101
+ def parse_optim_states(files, ds_checkpoint_dir):
102
+
103
+ total_files = len(files)
104
+ state_dicts = []
105
+ for f in files:
106
+ state_dicts.append(torch.load(f, map_location=device))
107
+
108
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
109
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
110
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
111
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
112
+
113
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
114
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
115
+ # use the max of the partition_count to get the dp world_size.
116
+
117
+ if type(world_size) is list:
118
+ world_size = max(world_size)
119
+
120
+ if world_size != total_files:
121
+ raise ValueError(
122
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
123
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
124
+ )
125
+
126
+ # the groups are named differently in each stage
127
+ if zero_stage == 2:
128
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
129
+ elif zero_stage == 3:
130
+ fp32_groups_key = FP32_FLAT_GROUPS
131
+ else:
132
+ raise ValueError(f"unknown zero stage {zero_stage}")
133
+
134
+ if zero_stage == 2:
135
+ fp32_flat_groups = [
136
+ state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key]
137
+ for i in range(len(state_dicts))
138
+ ]
139
+ elif zero_stage == 3:
140
+ # if there is more than one param group, there will be multiple flattened tensors - one
141
+ # flattened tensor per group - for simplicity merge them into a single tensor
142
+ #
143
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
144
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
145
+
146
+ fp32_flat_groups = [
147
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key],
148
+ 0) for i in range(len(state_dicts))
149
+ ]
150
+
151
+ return zero_stage, world_size, fp32_flat_groups
152
+
153
+
154
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
155
+ """
156
+ Returns fp32 state_dict reconstructed from ds checkpoint
157
+
158
+ Args:
159
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
160
+
161
+ """
162
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
163
+
164
+ optim_files = get_optim_files(ds_checkpoint_dir)
165
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
166
+ print(
167
+ f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
168
+
169
+ model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
170
+ buffers, param_shapes, ds_version = parse_model_state(model_file)
171
+ print(f'Parsing checkpoint created by deepspeed=={ds_version}')
172
+
173
+ if zero_stage == 2:
174
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size,
175
+ param_shapes,
176
+ fp32_flat_groups,
177
+ buffers)
178
+ elif zero_stage == 3:
179
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size,
180
+ param_shapes,
181
+ fp32_flat_groups,
182
+ buffers)
183
+
184
+
185
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
186
+ param_shapes,
187
+ fp32_flat_groups,
188
+ buffers):
189
+
190
+ # Reconstruction protocol:
191
+ #
192
+ # XXX: document this
193
+
194
+ if debug:
195
+ for i in range(world_size):
196
+ for j in range(len(fp32_flat_groups[0])):
197
+ print(
198
+ f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
199
+
200
+ # XXX: memory usage doubles here (zero2)
201
+ num_param_groups = len(fp32_flat_groups[0])
202
+ merged_single_partition_of_fp32_groups = []
203
+ for i in range(num_param_groups):
204
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
205
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
206
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
207
+ avail_numel = sum([
208
+ full_single_fp32_vector.numel()
209
+ for full_single_fp32_vector in merged_single_partition_of_fp32_groups
210
+ ])
211
+
212
+ if debug:
213
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
214
+ wanted_numel = sum(
215
+ [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
216
+ # not asserting if there is a mismatch due to possible padding
217
+ print(f"Have {avail_numel} numels to process.")
218
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
219
+
220
+ state_dict = OrderedDict()
221
+
222
+ # buffers
223
+ state_dict.update(buffers)
224
+ if debug:
225
+ print(f"added {len(buffers)} buffers")
226
+
227
+ # params
228
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
229
+ # out-of-core computing solution
230
+ total_numel = 0
231
+ total_params = 0
232
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
233
+ offset = 0
234
+ avail_numel = full_single_fp32_vector.numel()
235
+ for name, shape in shapes.items():
236
+
237
+ unpartitioned_numel = shape.numel()
238
+ total_numel += unpartitioned_numel
239
+ total_params += 1
240
+
241
+ if debug:
242
+ print(
243
+ f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
244
+ )
245
+ state_dict[name] = full_single_fp32_vector.narrow(
246
+ 0,
247
+ offset,
248
+ unpartitioned_numel).view(shape)
249
+ offset += unpartitioned_numel
250
+
251
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
252
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
253
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
254
+ # live optimizer object, so we are checking that the numbers are within the right range
255
+ align_to = 2 * world_size
256
+
257
+ def zero2_align(x):
258
+ return align_to * math.ceil(x / align_to)
259
+
260
+ if debug:
261
+ print(f"original offset={offset}, avail_numel={avail_numel}")
262
+
263
+ offset = zero2_align(offset)
264
+ avail_numel = zero2_align(avail_numel)
265
+
266
+ if debug:
267
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
268
+
269
+ # Sanity check
270
+ if offset != avail_numel:
271
+ raise ValueError(
272
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
273
+
274
+ print(
275
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
276
+ )
277
+
278
+ return state_dict
279
+
280
+
281
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
282
+ remainder = unpartitioned_numel % world_size
283
+ padding_numel = (world_size - remainder) if remainder else 0
284
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
285
+ return partitioned_numel, padding_numel
286
+
287
+
288
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
289
+ param_shapes,
290
+ fp32_flat_groups,
291
+ buffers):
292
+
293
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
294
+ # param, re-consolidating each param, while dealing with padding if any
295
+
296
+ avail_numel = fp32_flat_groups[0].numel() * world_size
297
+ # merge list of dicts, preserving order
298
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
299
+
300
+ if debug:
301
+ for i in range(world_size):
302
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
303
+
304
+ wanted_params = len(param_shapes)
305
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
306
+ # not asserting if there is a mismatch due to possible padding
307
+ print(f"Have {avail_numel} numels to process.")
308
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
309
+
310
+ state_dict = OrderedDict()
311
+
312
+ # buffers
313
+ state_dict.update(buffers)
314
+ if debug:
315
+ print(f"added {len(buffers)} buffers")
316
+
317
+ # params
318
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
319
+ # out-of-core computing solution
320
+ offset = 0
321
+ total_numel = 0
322
+ total_params = 0
323
+ for name, shape in param_shapes.items():
324
+
325
+ unpartitioned_numel = shape.numel()
326
+ total_numel += unpartitioned_numel
327
+ total_params += 1
328
+
329
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
330
+
331
+ if debug:
332
+ print(
333
+ f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
334
+ )
335
+
336
+ # XXX: memory usage doubles here
337
+ state_dict[name] = torch.cat(
338
+ tuple(fp32_flat_groups[i].narrow(0,
339
+ offset,
340
+ partitioned_numel)
341
+ for i in range(world_size)),
342
+ 0).narrow(0,
343
+ 0,
344
+ unpartitioned_numel).view(shape)
345
+ offset += partitioned_numel
346
+
347
+ offset *= world_size
348
+
349
+ # Sanity check
350
+ if offset != avail_numel:
351
+ raise ValueError(
352
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
353
+
354
+ print(
355
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
356
+ )
357
+
358
+ return state_dict
359
+
360
+
361
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
362
+ """
363
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
364
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
365
+ via a model hub.
366
+
367
+ Args:
368
+ - ``checkpoint_dir``: path to the desired checkpoint folder
369
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
370
+
371
+ Returns:
372
+ - pytorch ``state_dict``
373
+
374
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
375
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
376
+ the checkpoint.
377
+
378
+ A typical usage might be ::
379
+
380
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
381
+ # do the training and checkpoint saving
382
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
383
+ model = model.cpu() # move to cpu
384
+ model.load_state_dict(state_dict)
385
+ # submit to model hub or save the model to share with others
386
+
387
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
388
+ application. i.e. you will need to re-initialize the deepspeed engine, since
389
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
390
+
391
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
392
+
393
+ """
394
+ if tag is None:
395
+ latest_path = os.path.join(checkpoint_dir, 'latest')
396
+ if os.path.isfile(latest_path):
397
+ with open(latest_path, 'r') as fd:
398
+ tag = fd.read().strip()
399
+ else:
400
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
401
+
402
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
403
+
404
+ if not os.path.isdir(ds_checkpoint_dir):
405
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
406
+
407
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
408
+
409
+
410
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
411
+ """
412
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
413
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
414
+
415
+ Args:
416
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
417
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
418
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
419
+ """
420
+
421
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
422
+ print(f"Saving fp32 state dict to {output_file}")
423
+ torch.save(state_dict, output_file)
424
+
425
+
426
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
427
+ """
428
+ 1. Put the provided model to cpu
429
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
430
+ 3. Load it into the provided model
431
+
432
+ Args:
433
+ - ``model``: the model object to update
434
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
435
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
436
+
437
+ Returns:
438
+ - ``model`: modified model
439
+
440
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
441
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
442
+ conveniently placed for you in the checkpoint folder.
443
+
444
+ A typical usage might be ::
445
+
446
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
447
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
448
+ # submit to model hub or save the model to share with others
449
+
450
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
451
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
452
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
453
+
454
+ """
455
+ logger.info(f"Extracting fp32 weights")
456
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
457
+
458
+ logger.info(f"Overwriting model with fp32 weights")
459
+ model = model.cpu()
460
+ model.load_state_dict(state_dict, strict=False)
461
+
462
+ return model
463
+
464
+
465
+ if __name__ == "__main__":
466
+
467
+ parser = argparse.ArgumentParser()
468
+ parser.add_argument(
469
+ "checkpoint_dir",
470
+ type=str,
471
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
472
+ parser.add_argument(
473
+ "output_file",
474
+ type=str,
475
+ help=
476
+ "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
477
+ )
478
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
479
+ args = parser.parse_args()
480
+
481
+ debug = args.debug
482
+
483
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
display_v3/2023-04-14_17-59-45/hparams.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ amp: true
2
+ batch_size: 16
3
+ cls_target: vo
4
+ deepspeed: true
5
+ dev_data_file: ''
6
+ downsample_data: true
7
+ early_dropout: null
8
+ epochs: 10
9
+ freeze_encoder: false
10
+ just_test: false
11
+ log_fold: ./logs
12
+ log_step: 10
13
+ lr: 5.0e-05
14
+ model_name: bert-base-uncased
15
+ positive_ratio: 0.4
16
+ pretrained_model_fold: ./pretrained_model
17
+ rdrop: null
18
+ running time: 0:05:34
19
+ share_encoder: false
20
+ test_data_file: ''
21
+ train_data_file: ''
22
+ train_ratio: 0.8
23
+ version: structure cmp
display_v3/2023-04-14_17-59-45/metrics.csv ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ train_loss,epoch,step,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_macro_f1,train_accuracy,train_precision,train_recall,train_f1,train_macro_f1,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_macro_f1
2
+ 0.55810546875,0,9,,,,,,,,,,,,,,,,,
3
+ 0.46435546875,0,19,,,,,,,,,,,,,,,,,
4
+ 0.34228515625,0,29,,,,,,,,,,,,,,,,,
5
+ 0.27783203125,0,39,,,,,,,,,,,,,,,,,
6
+ 0.59716796875,0,49,,,,,,,,,,,,,,,,,
7
+ 0.316650390625,0,59,,,,,,,,,,,,,,,,,
8
+ 0.5478515625,0,69,,,,,,,,,,,,,,,,,
9
+ 0.173828125,0,79,,,,,,,,,,,,,,,,,
10
+ 0.501953125,0,89,,,,,,,,,,,,,,,,,
11
+ 0.38623046875,0,99,,,,,,,,,,,,,,,,,
12
+ 0.36279296875,0,109,,,,,,,,,,,,,,,,,
13
+ 0.2105712890625,0,119,,,,,,,,,,,,,,,,,
14
+ 0.2744140625,0,129,,,,,,,,,,,,,,,,,
15
+ 0.44677734375,0,139,,,,,,,,,,,,,,,,,
16
+ 0.6474609375,0,149,,,,,,,,,,,,,,,,,
17
+ 0.81298828125,0,159,,,,,,,,,,,,,,,,,
18
+ 0.60888671875,0,169,,,,,,,,,,,,,,,,,
19
+ ,0,170,0.6008952260017395,0.7477313876152039,0.18957704305648804,0.865517258644104,0.31102851033210754,0.31102851033210754,,,,,,,,,,,
20
+ ,0,170,,,,,,,0.7950036525726318,0.777429461479187,0.6831955909729004,0.7272727489471436,0.7272727489471436,,,,,,
21
+ 0.305419921875,1,179,,,,,,,,,,,,,,,,,
22
+ 0.390869140625,1,189,,,,,,,,,,,,,,,,,
23
+ 0.2052001953125,1,199,,,,,,,,,,,,,,,,,
24
+ 0.0948486328125,1,209,,,,,,,,,,,,,,,,,
25
+ 0.38427734375,1,219,,,,,,,,,,,,,,,,,
26
+ 0.1968994140625,1,229,,,,,,,,,,,,,,,,,
27
+ 0.078369140625,1,239,,,,,,,,,,,,,,,,,
28
+ 0.356689453125,1,249,,,,,,,,,,,,,,,,,
29
+ 0.43505859375,1,259,,,,,,,,,,,,,,,,,
30
+ 0.485107421875,1,269,,,,,,,,,,,,,,,,,
31
+ 0.1243896484375,1,279,,,,,,,,,,,,,,,,,
32
+ 0.05401611328125,1,289,,,,,,,,,,,,,,,,,
33
+ 0.35595703125,1,299,,,,,,,,,,,,,,,,,
34
+ 0.0572509765625,1,309,,,,,,,,,,,,,,,,,
35
+ 0.1417236328125,1,319,,,,,,,,,,,,,,,,,
36
+ 0.2315673828125,1,329,,,,,,,,,,,,,,,,,
37
+ 0.232421875,1,339,,,,,,,,,,,,,,,,,
38
+ ,1,341,0.24903717637062073,0.9004083275794983,0.36767318844795227,0.7137930989265442,0.48534584045410156,0.48534584045410156,,,,,,,,,,,
39
+ ,1,341,,,,,,,0.8934606909751892,0.8779565095901489,0.8521579504013062,0.8648648858070374,0.8648648858070374,,,,,,
40
+ 0.1656494140625,2,349,,,,,,,,,,,,,,,,,
41
+ 0.336181640625,2,359,,,,,,,,,,,,,,,,,
42
+ 0.1632080078125,2,369,,,,,,,,,,,,,,,,,
43
+ 0.042724609375,2,379,,,,,,,,,,,,,,,,,
44
+ 0.352783203125,2,389,,,,,,,,,,,,,,,,,
45
+ 0.0268096923828125,2,399,,,,,,,,,,,,,,,,,
46
+ 0.01428985595703125,2,409,,,,,,,,,,,,,,,,,
47
+ 0.1790771484375,2,419,,,,,,,,,,,,,,,,,
48
+ 0.0181427001953125,2,429,,,,,,,,,,,,,,,,,
49
+ 0.04736328125,2,439,,,,,,,,,,,,,,,,,
50
+ 0.2493896484375,2,449,,,,,,,,,,,,,,,,,
51
+ 0.08538818359375,2,459,,,,,,,,,,,,,,,,,
52
+ 0.583984375,2,469,,,,,,,,,,,,,,,,,
53
+ 0.0457763671875,2,479,,,,,,,,,,,,,,,,,
54
+ 0.1326904296875,2,489,,,,,,,,,,,,,,,,,
55
+ 0.156494140625,2,499,,,,,,,,,,,,,,,,,
56
+ 0.1724853515625,2,509,,,,,,,,,,,,,,,,,
57
+ ,2,512,0.5341271758079529,0.7738203406333923,0.2016877681016922,0.8241379261016846,0.3240678012371063,0.3240678012371063,,,,,,,,,,,
58
+ ,2,512,,,,,,,0.9566495418548584,0.9491211771965027,0.942148745059967,0.9456221461296082,0.9456221461296082,,,,,,
59
+ 0.056976318359375,3,519,,,,,,,,,,,,,,,,,
60
+ 0.1407470703125,3,529,,,,,,,,,,,,,,,,,
61
+ 0.048370361328125,3,539,,,,,,,,,,,,,,,,,
62
+ 0.004375457763671875,3,549,,,,,,,,,,,,,,,,,
63
+ 0.01050567626953125,3,559,,,,,,,,,,,,,,,,,
64
+ 0.08062744140625,3,569,,,,,,,,,,,,,,,,,
65
+ 0.376953125,3,579,,,,,,,,,,,,,,,,,
66
+ 0.04742431640625,3,589,,,,,,,,,,,,,,,,,
67
+ 0.143798828125,3,599,,,,,,,,,,,,,,,,,
68
+ 0.030059814453125,3,609,,,,,,,,,,,,,,,,,
69
+ 0.040374755859375,3,619,,,,,,,,,,,,,,,,,
70
+ 0.19873046875,3,629,,,,,,,,,,,,,,,,,
71
+ 0.031402587890625,3,639,,,,,,,,,,,,,,,,,
72
+ 0.00252532958984375,3,649,,,,,,,,,,,,,,,,,
73
+ 0.0867919921875,3,659,,,,,,,,,,,,,,,,,
74
+ 0.040191650390625,3,669,,,,,,,,,,,,,,,,,
75
+ 0.0982666015625,3,679,,,,,,,,,,,,,,,,,
76
+ ,3,683,0.4307195246219635,0.8677404522895813,0.2984869182109833,0.748275876045227,0.42674532532691956,0.42674532532691956,,,,,,,,,,,
77
+ ,3,683,,,,,,,0.9742836356163025,0.9670027494430542,0.968778669834137,0.9678899049758911,0.9678899049758911,,,,,,
78
+ 0.003635406494140625,4,689,,,,,,,,,,,,,,,,,
79
+ 0.0293426513671875,4,699,,,,,,,,,,,,,,,,,
80
+ 0.001384735107421875,4,709,,,,,,,,,,,,,,,,,
81
+ 0.003826141357421875,4,719,,,,,,,,,,,,,,,,,
82
+ 0.0015773773193359375,4,729,,,,,,,,,,,,,,,,,
83
+ 0.0043182373046875,4,739,,,,,,,,,,,,,,,,,
84
+ 0.001644134521484375,4,749,,,,,,,,,,,,,,,,,
85
+ 0.03326416015625,4,759,,,,,,,,,,,,,,,,,
86
+ 0.001827239990234375,4,769,,,,,,,,,,,,,,,,,
87
+ 0.0008625984191894531,4,779,,,,,,,,,,,,,,,,,
88
+ 0.0024814605712890625,4,789,,,,,,,,,,,,,,,,,
89
+ 0.04608154296875,4,799,,,,,,,,,,,,,,,,,
90
+ 0.030364990234375,4,809,,,,,,,,,,,,,,,,,
91
+ 0.04498291015625,4,819,,,,,,,,,,,,,,,,,
92
+ 0.0010805130004882812,4,829,,,,,,,,,,,,,,,,,
93
+ 0.007289886474609375,4,839,,,,,,,,,,,,,,,,,
94
+ 0.7626953125,4,849,,,,,,,,,,,,,,,,,
95
+ ,4,854,1.0994356870651245,0.7813067436218262,0.2074652761220932,0.8241379261016846,0.3314840495586395,0.3314840495586395,,,,,,,,,,,
96
+ ,4,854,,,,,,,0.9911829829216003,0.989880383014679,0.9880624413490295,0.9889705777168274,0.9889705777168274,,,,,,
97
+ 0.07330322265625,5,859,,,,,,,,,,,,,,,,,
98
+ 0.007152557373046875,5,869,,,,,,,,,,,,,,,,,
99
+ 0.0017251968383789062,5,879,,,,,,,,,,,,,,,,,
100
+ 0.00966644287109375,5,889,,,,,,,,,,,,,,,,,
101
+ 0.014617919921875,5,899,,,,,,,,,,,,,,,,,
102
+ 0.00643157958984375,5,909,,,,,,,,,,,,,,,,,
103
+ 0.1793212890625,5,919,,,,,,,,,,,,,,,,,
104
+ 0.0158843994140625,5,929,,,,,,,,,,,,,,,,,
105
+ 0.01483917236328125,5,939,,,,,,,,,,,,,,,,,
106
+ 0.0116424560546875,5,949,,,,,,,,,,,,,,,,,
107
+ 0.046630859375,5,959,,,,,,,,,,,,,,,,,
108
+ 0.01290130615234375,5,969,,,,,,,,,,,,,,,,,
109
+ 0.01458740234375,5,979,,,,,,,,,,,,,,,,,
110
+ 0.031829833984375,5,989,,,,,,,,,,,,,,,,,
111
+ 0.036346435546875,5,999,,,,,,,,,,,,,,,,,
112
+ 0.0152435302734375,5,1009,,,,,,,,,,,,,,,,,
113
+ 0.00556182861328125,5,1019,,,,,,,,,,,,,,,,,
114
+ ,5,1025,0.37797266244888306,0.9088021516799927,0.39147287607192993,0.6965517401695251,0.5012406706809998,0.5012406706809998,,,,,,,,,,,
115
+ ,5,1025,,,,,,,0.9819985032081604,0.9753199219703674,0.9797979593276978,0.9775538444519043,0.9775538444519043,,,,,,
116
+ 0.02069091796875,6,1029,,,,,,,,,,,,,,,,,
117
+ 0.205078125,6,1039,,,,,,,,,,,,,,,,,
118
+ 0.01212310791015625,6,1049,,,,,,,,,,,,,,,,,
119
+ 0.016571044921875,6,1059,,,,,,,,,,,,,,,,,
120
+ 0.053070068359375,6,1069,,,,,,,,,,,,,,,,,
121
+ 0.0028896331787109375,6,1079,,,,,,,,,,,,,,,,,
122
+ 0.1202392578125,6,1089,,,,,,,,,,,,,,,,,
123
+ 0.00884246826171875,6,1099,,,,,,,,,,,,,,,,,
124
+ 0.002231597900390625,6,1109,,,,,,,,,,,,,,,,,
125
+ 0.00974273681640625,6,1119,,,,,,,,,,,,,,,,,
126
+ 0.0335693359375,6,1129,,,,,,,,,,,,,,,,,
127
+ 0.004673004150390625,6,1139,,,,,,,,,,,,,,,,,
128
+ 0.287109375,6,1149,,,,,,,,,,,,,,,,,
129
+ 0.08795166015625,6,1159,,,,,,,,,,,,,,,,,
130
+ 0.00901031494140625,6,1169,,,,,,,,,,,,,,,,,
131
+ 0.01025390625,6,1179,,,,,,,,,,,,,,,,,
132
+ 0.050384521484375,6,1189,,,,,,,,,,,,,,,,,
133
+ ,6,1196,0.5810363292694092,0.8743194341659546,0.3053097426891327,0.7137930989265442,0.42768594622612,0.42768594622612,,,,,,,,,,,
134
+ ,6,1196,,,,,,,0.9911829829216003,0.9880843162536621,0.9898989796638489,0.988990843296051,0.988990843296051,,,,,,
135
+ 0.005062103271484375,7,1199,,,,,,,,,,,,,,,,,
136
+ 0.0116119384765625,7,1209,,,,,,,,,,,,,,,,,
137
+ 0.0006499290466308594,7,1219,,,,,,,,,,,,,,,,,
138
+ 0.009674072265625,7,1229,,,,,,,,,,,,,,,,,
139
+ 0.004718780517578125,7,1239,,,,,,,,,,,,,,,,,
140
+ 0.0006432533264160156,7,1249,,,,,,,,,,,,,,,,,
141
+ 0.0006594657897949219,7,1259,,,,,,,,,,,,,,,,,
142
+ 0.0006146430969238281,7,1269,,,,,,,,,,,,,,,,,
143
+ 0.0008616447448730469,7,1279,,,,,,,,,,,,,,,,,
144
+ 0.0004451274871826172,7,1289,,,,,,,,,,,,,,,,,
145
+ 0.0004410743713378906,7,1299,,,,,,,,,,,,,,,,,
146
+ 0.0035152435302734375,7,1309,,,,,,,,,,,,,,,,,
147
+ 0.0005249977111816406,7,1319,,,,,,,,,,,,,,,,,
148
+ 0.0005850791931152344,7,1329,,,,,,,,,,,,,,,,,
149
+ 0.1878662109375,7,1339,,,,,,,,,,,,,,,,,
150
+ 0.016326904296875,7,1349,,,,,,,,,,,,,,,,,
151
+ 0.0019102096557617188,7,1359,,,,,,,,,,,,,,,,,
152
+ ,7,1367,0.49221912026405334,0.8886116147041321,0.3327786922454834,0.6896551847457886,0.44893378019332886,0.44893378019332886,,,,,,,,,,,
153
+ ,7,1367,,,,,,,0.9966936111450195,0.9963235259056091,0.9954086542129517,0.9958658814430237,0.9958658814430237,,,,,,
154
+ 0.0009813308715820312,8,1369,,,,,,,,,,,,,,,,,
155
+ 0.00127410888671875,8,1379,,,,,,,,,,,,,,,,,
156
+ 0.0015325546264648438,8,1389,,,,,,,,,,,,,,,,,
157
+ 0.00914764404296875,8,1399,,,,,,,,,,,,,,,,,
158
+ 0.0008273124694824219,8,1409,,,,,,,,,,,,,,,,,
159
+ 0.0011987686157226562,8,1419,,,,,,,,,,,,,,,,,
160
+ 0.0003414154052734375,8,1429,,,,,,,,,,,,,,,,,
161
+ 0.0031108856201171875,8,1439,,,,,,,,,,,,,,,,,
162
+ 0.0004572868347167969,8,1449,,,,,,,,,,,,,,,,,
163
+ 0.0006923675537109375,8,1459,,,,,,,,,,,,,,,,,
164
+ 0.0003120899200439453,8,1469,,,,,,,,,,,,,,,,,
165
+ 0.003658294677734375,8,1479,,,,,,,,,,,,,,,,,
166
+ 0.00034880638122558594,8,1489,,,,,,,,,,,,,,,,,
167
+ 0.000492095947265625,8,1499,,,,,,,,,,,,,,,,,
168
+ 0.0002238750457763672,8,1509,,,,,,,,,,,,,,,,,
169
+ 0.0003514289855957031,8,1519,,,,,,,,,,,,,,,,,
170
+ 0.380126953125,8,1529,,,,,,,,,,,,,,,,,
171
+ ,8,1538,0.47979027032852173,0.909709632396698,0.3897959291934967,0.6586207151412964,0.48974359035491943,0.48974359035491943,,,,,,,,,,,
172
+ ,8,1538,,,,,,,0.997061014175415,0.9981566667556763,0.994490385055542,0.9963201284408569,0.9963201284408569,,,,,,
173
+ 0.0142364501953125,9,1539,,,,,,,,,,,,,,,,,
174
+ 0.002254486083984375,9,1549,,,,,,,,,,,,,,,,,
175
+ 0.0031986236572265625,9,1559,,,,,,,,,,,,,,,,,
176
+ 0.000621795654296875,9,1569,,,,,,,,,,,,,,,,,
177
+ 0.0004968643188476562,9,1579,,,,,,,,,,,,,,,,,
178
+ 0.0007233619689941406,9,1589,,,,,,,,,,,,,,,,,
179
+ 0.0003178119659423828,9,1599,,,,,,,,,,,,,,,,,
180
+ 0.0004620552062988281,9,1609,,,,,,,,,,,,,,,,,
181
+ 0.0002932548522949219,9,1619,,,,,,,,,,,,,,,,,
182
+ 0.001575469970703125,9,1629,,,,,,,,,,,,,,,,,
183
+ 0.0008373260498046875,9,1639,,,,,,,,,,,,,,,,,
184
+ 0.011688232421875,9,1649,,,,,,,,,,,,,,,,,
185
+ 0.0089111328125,9,1659,,,,,,,,,,,,,,,,,
186
+ 0.00579071044921875,9,1669,,,,,,,,,,,,,,,,,
187
+ 0.0009889602661132812,9,1679,,,,,,,,,,,,,,,,,
188
+ 0.043060302734375,9,1689,,,,,,,,,,,,,,,,,
189
+ 0.0005102157592773438,9,1699,,,,,,,,,,,,,,,,,
190
+ 0.0002346038818359375,9,1709,,,,,,,,,,,,,,,,,
191
+ ,9,1709,0.6371660232543945,0.8752268552780151,0.30882352590560913,0.7241379022598267,0.4329896867275238,0.4329896867275238,,,,,,,,,,,
192
+ ,9,1709,,,,,,,0.9933872222900391,0.9899359345436096,0.9935720562934875,0.9917507171630859,0.9917507171630859,,,,,,
193
+ ,10,1710,,,,,,,,,,,,0.40940672159194946,0.8987295627593994,0.36324167251586914,0.6802167892456055,0.473584920167923,0.473584920167923
display_v3/2023-04-14_17-59-45/yes.txt ADDED
File without changes