Zpwang-AI commited on
Commit
51cf79c
·
1 Parent(s): ece213d

Upload 21 files

Browse files
Files changed (21) hide show
  1. display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  2. display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  3. display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/latest +1 -0
  4. display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/zero_to_fp32.py +483 -0
  5. display_v3/2023-04-14_16-59-07/hparams.yaml +23 -0
  6. display_v3/2023-04-14_16-59-07/metrics.csv +255 -0
  7. display_v3/2023-04-14_16-59-07/yes.txt +0 -0
  8. display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  9. display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  10. display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/latest +1 -0
  11. display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/zero_to_fp32.py +483 -0
  12. display_v3/2023-04-14_17-06-18/hparams.yaml +23 -0
  13. display_v3/2023-04-14_17-06-18/metrics.csv +38 -0
  14. display_v3/2023-04-14_17-06-18/yes.txt +0 -0
  15. display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/mp_rank_00_model_states.pt +3 -0
  16. display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  17. display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/latest +1 -0
  18. display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/zero_to_fp32.py +483 -0
  19. display_v3/2023-04-14_17-59-45/hparams.yaml +23 -0
  20. display_v3/2023-04-14_17-59-45/metrics.csv +193 -0
  21. display_v3/2023-04-14_17-59-45/yes.txt +0 -0
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9921f36370ead56c25c58b8409cb71175b00e3c5ad5105f5fc49666915361ce
3
+ size 220228915
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f3ae9d4195c5adab49e78a909cfe95ae8d21c7a2ffc90eed224e122f51eabae
3
+ size 1320918341
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ checkpoint
display_v3/2023-04-14_16-59-07/epoch6-f1score0.47.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ '''Copyright The Microsoft DeepSpeed Team'''
3
+
4
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
5
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
6
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
7
+ # application.
8
+ #
9
+ # example: python zero_to_fp32.py . pytorch_model.bin
10
+
11
+ import argparse
12
+ import torch
13
+ import glob
14
+ import math
15
+ import os
16
+ import re
17
+ from collections import OrderedDict
18
+
19
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
20
+ # DeepSpeed data structures it has to be available in the current python environment.
21
+ from deepspeed.utils import logger
22
+ from deepspeed.checkpoint.constants import (DS_VERSION,
23
+ OPTIMIZER_STATE_DICT,
24
+ SINGLE_PARTITION_OF_FP32_GROUPS,
25
+ FP32_FLAT_GROUPS,
26
+ ZERO_STAGE,
27
+ PARTITION_COUNT,
28
+ PARAM_SHAPES,
29
+ BUFFER_NAMES)
30
+
31
+ debug = 0
32
+
33
+ # load to cpu
34
+ device = torch.device('cpu')
35
+
36
+
37
+ def atoi(text):
38
+ return int(text) if text.isdigit() else text
39
+
40
+
41
+ def natural_keys(text):
42
+ '''
43
+ alist.sort(key=natural_keys) sorts in human order
44
+ http://nedbatchelder.com/blog/200712/human_sorting.html
45
+ (See Toothy's implementation in the comments)
46
+ '''
47
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
48
+
49
+
50
+ def get_model_state_file(checkpoint_dir, zero_stage):
51
+ if not os.path.isdir(checkpoint_dir):
52
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
53
+
54
+ # there should be only one file
55
+ if zero_stage == 2:
56
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
57
+ elif zero_stage == 3:
58
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
59
+
60
+ if not os.path.exists(file):
61
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
62
+
63
+ return file
64
+
65
+
66
+ def get_optim_files(checkpoint_dir):
67
+ # XXX: need to test that this simple glob rule works for multi-node setup too
68
+ optim_files = sorted(glob.glob(os.path.join(checkpoint_dir,
69
+ "*_optim_states.pt")),
70
+ key=natural_keys)
71
+
72
+ if len(optim_files) == 0:
73
+ raise FileNotFoundError(
74
+ f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")
75
+
76
+ return optim_files
77
+
78
+
79
+ def parse_model_state(file):
80
+ state_dict = torch.load(file, map_location=device)
81
+
82
+ if BUFFER_NAMES not in state_dict:
83
+ raise ValueError(f"{file} is not a model state checkpoint")
84
+ buffer_names = state_dict[BUFFER_NAMES]
85
+ if debug:
86
+ print("Found buffers:", buffer_names)
87
+
88
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
89
+ buffers = {
90
+ k: v.float()
91
+ for k,
92
+ v in state_dict["module"].items() if k in buffer_names
93
+ }
94
+ param_shapes = state_dict[PARAM_SHAPES]
95
+
96
+ ds_version = state_dict.get(DS_VERSION, None)
97
+
98
+ return buffers, param_shapes, ds_version
99
+
100
+
101
+ def parse_optim_states(files, ds_checkpoint_dir):
102
+
103
+ total_files = len(files)
104
+ state_dicts = []
105
+ for f in files:
106
+ state_dicts.append(torch.load(f, map_location=device))
107
+
108
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
109
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
110
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
111
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
112
+
113
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
114
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
115
+ # use the max of the partition_count to get the dp world_size.
116
+
117
+ if type(world_size) is list:
118
+ world_size = max(world_size)
119
+
120
+ if world_size != total_files:
121
+ raise ValueError(
122
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
123
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
124
+ )
125
+
126
+ # the groups are named differently in each stage
127
+ if zero_stage == 2:
128
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
129
+ elif zero_stage == 3:
130
+ fp32_groups_key = FP32_FLAT_GROUPS
131
+ else:
132
+ raise ValueError(f"unknown zero stage {zero_stage}")
133
+
134
+ if zero_stage == 2:
135
+ fp32_flat_groups = [
136
+ state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key]
137
+ for i in range(len(state_dicts))
138
+ ]
139
+ elif zero_stage == 3:
140
+ # if there is more than one param group, there will be multiple flattened tensors - one
141
+ # flattened tensor per group - for simplicity merge them into a single tensor
142
+ #
143
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
144
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
145
+
146
+ fp32_flat_groups = [
147
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key],
148
+ 0) for i in range(len(state_dicts))
149
+ ]
150
+
151
+ return zero_stage, world_size, fp32_flat_groups
152
+
153
+
154
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
155
+ """
156
+ Returns fp32 state_dict reconstructed from ds checkpoint
157
+
158
+ Args:
159
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
160
+
161
+ """
162
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
163
+
164
+ optim_files = get_optim_files(ds_checkpoint_dir)
165
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
166
+ print(
167
+ f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
168
+
169
+ model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
170
+ buffers, param_shapes, ds_version = parse_model_state(model_file)
171
+ print(f'Parsing checkpoint created by deepspeed=={ds_version}')
172
+
173
+ if zero_stage == 2:
174
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size,
175
+ param_shapes,
176
+ fp32_flat_groups,
177
+ buffers)
178
+ elif zero_stage == 3:
179
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size,
180
+ param_shapes,
181
+ fp32_flat_groups,
182
+ buffers)
183
+
184
+
185
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
186
+ param_shapes,
187
+ fp32_flat_groups,
188
+ buffers):
189
+
190
+ # Reconstruction protocol:
191
+ #
192
+ # XXX: document this
193
+
194
+ if debug:
195
+ for i in range(world_size):
196
+ for j in range(len(fp32_flat_groups[0])):
197
+ print(
198
+ f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
199
+
200
+ # XXX: memory usage doubles here (zero2)
201
+ num_param_groups = len(fp32_flat_groups[0])
202
+ merged_single_partition_of_fp32_groups = []
203
+ for i in range(num_param_groups):
204
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
205
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
206
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
207
+ avail_numel = sum([
208
+ full_single_fp32_vector.numel()
209
+ for full_single_fp32_vector in merged_single_partition_of_fp32_groups
210
+ ])
211
+
212
+ if debug:
213
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
214
+ wanted_numel = sum(
215
+ [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
216
+ # not asserting if there is a mismatch due to possible padding
217
+ print(f"Have {avail_numel} numels to process.")
218
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
219
+
220
+ state_dict = OrderedDict()
221
+
222
+ # buffers
223
+ state_dict.update(buffers)
224
+ if debug:
225
+ print(f"added {len(buffers)} buffers")
226
+
227
+ # params
228
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
229
+ # out-of-core computing solution
230
+ total_numel = 0
231
+ total_params = 0
232
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
233
+ offset = 0
234
+ avail_numel = full_single_fp32_vector.numel()
235
+ for name, shape in shapes.items():
236
+
237
+ unpartitioned_numel = shape.numel()
238
+ total_numel += unpartitioned_numel
239
+ total_params += 1
240
+
241
+ if debug:
242
+ print(
243
+ f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
244
+ )
245
+ state_dict[name] = full_single_fp32_vector.narrow(
246
+ 0,
247
+ offset,
248
+ unpartitioned_numel).view(shape)
249
+ offset += unpartitioned_numel
250
+
251
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
252
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
253
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
254
+ # live optimizer object, so we are checking that the numbers are within the right range
255
+ align_to = 2 * world_size
256
+
257
+ def zero2_align(x):
258
+ return align_to * math.ceil(x / align_to)
259
+
260
+ if debug:
261
+ print(f"original offset={offset}, avail_numel={avail_numel}")
262
+
263
+ offset = zero2_align(offset)
264
+ avail_numel = zero2_align(avail_numel)
265
+
266
+ if debug:
267
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
268
+
269
+ # Sanity check
270
+ if offset != avail_numel:
271
+ raise ValueError(
272
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
273
+
274
+ print(
275
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
276
+ )
277
+
278
+ return state_dict
279
+
280
+
281
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
282
+ remainder = unpartitioned_numel % world_size
283
+ padding_numel = (world_size - remainder) if remainder else 0
284
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
285
+ return partitioned_numel, padding_numel
286
+
287
+
288
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
289
+ param_shapes,
290
+ fp32_flat_groups,
291
+ buffers):
292
+
293
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
294
+ # param, re-consolidating each param, while dealing with padding if any
295
+
296
+ avail_numel = fp32_flat_groups[0].numel() * world_size
297
+ # merge list of dicts, preserving order
298
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
299
+
300
+ if debug:
301
+ for i in range(world_size):
302
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
303
+
304
+ wanted_params = len(param_shapes)
305
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
306
+ # not asserting if there is a mismatch due to possible padding
307
+ print(f"Have {avail_numel} numels to process.")
308
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
309
+
310
+ state_dict = OrderedDict()
311
+
312
+ # buffers
313
+ state_dict.update(buffers)
314
+ if debug:
315
+ print(f"added {len(buffers)} buffers")
316
+
317
+ # params
318
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
319
+ # out-of-core computing solution
320
+ offset = 0
321
+ total_numel = 0
322
+ total_params = 0
323
+ for name, shape in param_shapes.items():
324
+
325
+ unpartitioned_numel = shape.numel()
326
+ total_numel += unpartitioned_numel
327
+ total_params += 1
328
+
329
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
330
+
331
+ if debug:
332
+ print(
333
+ f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
334
+ )
335
+
336
+ # XXX: memory usage doubles here
337
+ state_dict[name] = torch.cat(
338
+ tuple(fp32_flat_groups[i].narrow(0,
339
+ offset,
340
+ partitioned_numel)
341
+ for i in range(world_size)),
342
+ 0).narrow(0,
343
+ 0,
344
+ unpartitioned_numel).view(shape)
345
+ offset += partitioned_numel
346
+
347
+ offset *= world_size
348
+
349
+ # Sanity check
350
+ if offset != avail_numel:
351
+ raise ValueError(
352
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
353
+
354
+ print(
355
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
356
+ )
357
+
358
+ return state_dict
359
+
360
+
361
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
362
+ """
363
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
364
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
365
+ via a model hub.
366
+
367
+ Args:
368
+ - ``checkpoint_dir``: path to the desired checkpoint folder
369
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
370
+
371
+ Returns:
372
+ - pytorch ``state_dict``
373
+
374
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
375
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
376
+ the checkpoint.
377
+
378
+ A typical usage might be ::
379
+
380
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
381
+ # do the training and checkpoint saving
382
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
383
+ model = model.cpu() # move to cpu
384
+ model.load_state_dict(state_dict)
385
+ # submit to model hub or save the model to share with others
386
+
387
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
388
+ application. i.e. you will need to re-initialize the deepspeed engine, since
389
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
390
+
391
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
392
+
393
+ """
394
+ if tag is None:
395
+ latest_path = os.path.join(checkpoint_dir, 'latest')
396
+ if os.path.isfile(latest_path):
397
+ with open(latest_path, 'r') as fd:
398
+ tag = fd.read().strip()
399
+ else:
400
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
401
+
402
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
403
+
404
+ if not os.path.isdir(ds_checkpoint_dir):
405
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
406
+
407
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
408
+
409
+
410
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
411
+ """
412
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
413
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
414
+
415
+ Args:
416
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
417
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
418
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
419
+ """
420
+
421
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
422
+ print(f"Saving fp32 state dict to {output_file}")
423
+ torch.save(state_dict, output_file)
424
+
425
+
426
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
427
+ """
428
+ 1. Put the provided model to cpu
429
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
430
+ 3. Load it into the provided model
431
+
432
+ Args:
433
+ - ``model``: the model object to update
434
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
435
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
436
+
437
+ Returns:
438
+ - ``model`: modified model
439
+
440
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
441
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
442
+ conveniently placed for you in the checkpoint folder.
443
+
444
+ A typical usage might be ::
445
+
446
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
447
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
448
+ # submit to model hub or save the model to share with others
449
+
450
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
451
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
452
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
453
+
454
+ """
455
+ logger.info(f"Extracting fp32 weights")
456
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
457
+
458
+ logger.info(f"Overwriting model with fp32 weights")
459
+ model = model.cpu()
460
+ model.load_state_dict(state_dict, strict=False)
461
+
462
+ return model
463
+
464
+
465
+ if __name__ == "__main__":
466
+
467
+ parser = argparse.ArgumentParser()
468
+ parser.add_argument(
469
+ "checkpoint_dir",
470
+ type=str,
471
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
472
+ parser.add_argument(
473
+ "output_file",
474
+ type=str,
475
+ help=
476
+ "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
477
+ )
478
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
479
+ args = parser.parse_args()
480
+
481
+ debug = args.debug
482
+
483
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
display_v3/2023-04-14_16-59-07/hparams.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ amp: true
2
+ batch_size: 16
3
+ cls_target: hd
4
+ deepspeed: true
5
+ dev_data_file: ''
6
+ downsample_data: true
7
+ early_dropout: null
8
+ epochs: 10
9
+ freeze_encoder: false
10
+ just_test: false
11
+ log_fold: ./logs
12
+ log_step: 10
13
+ lr: 5.0e-05
14
+ model_name: bert-base-uncased
15
+ positive_ratio: 0.4
16
+ pretrained_model_fold: ./pretrained_model
17
+ rdrop: null
18
+ running time: 0:06:52
19
+ share_encoder: false
20
+ test_data_file: ''
21
+ train_data_file: ''
22
+ train_ratio: 0.8
23
+ version: structure cmp
display_v3/2023-04-14_16-59-07/metrics.csv ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ train_loss,epoch,step,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_macro_f1,train_accuracy,train_precision,train_recall,train_f1,train_macro_f1,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_macro_f1
2
+ 0.611328125,0,9,,,,,,,,,,,,,,,,,
3
+ 0.5078125,0,19,,,,,,,,,,,,,,,,,
4
+ 0.7041015625,0,29,,,,,,,,,,,,,,,,,
5
+ 0.7119140625,0,39,,,,,,,,,,,,,,,,,
6
+ 0.398193359375,0,49,,,,,,,,,,,,,,,,,
7
+ 0.461181640625,0,59,,,,,,,,,,,,,,,,,
8
+ 0.386474609375,0,69,,,,,,,,,,,,,,,,,
9
+ 0.4326171875,0,79,,,,,,,,,,,,,,,,,
10
+ 0.66748046875,0,89,,,,,,,,,,,,,,,,,
11
+ 0.33544921875,0,99,,,,,,,,,,,,,,,,,
12
+ 0.30810546875,0,109,,,,,,,,,,,,,,,,,
13
+ 0.55615234375,0,119,,,,,,,,,,,,,,,,,
14
+ 0.2861328125,0,129,,,,,,,,,,,,,,,,,
15
+ 0.60400390625,0,139,,,,,,,,,,,,,,,,,
16
+ 0.623046875,0,149,,,,,,,,,,,,,,,,,
17
+ 0.53076171875,0,159,,,,,,,,,,,,,,,,,
18
+ 0.418701171875,0,169,,,,,,,,,,,,,,,,,
19
+ 0.495361328125,0,179,,,,,,,,,,,,,,,,,
20
+ 0.40625,0,189,,,,,,,,,,,,,,,,,
21
+ 0.448974609375,0,199,,,,,,,,,,,,,,,,,
22
+ 0.295654296875,0,209,,,,,,,,,,,,,,,,,
23
+ 0.6376953125,0,219,,,,,,,,,,,,,,,,,
24
+ 0.6279296875,0,229,,,,,,,,,,,,,,,,,
25
+ ,0,232,0.3514973521232605,0.8561705946922302,0.33702337741851807,0.7425474524497986,0.46362099051475525,0.46362099051475525,,,,,,,,,,,
26
+ ,0,232,,,,,,,0.7682795524597168,0.7229344844818115,0.6821236610412598,0.7019363641738892,0.7019363641738892,,,,,,
27
+ 0.2568359375,1,239,,,,,,,,,,,,,,,,,
28
+ 0.229248046875,1,249,,,,,,,,,,,,,,,,,
29
+ 0.066650390625,1,259,,,,,,,,,,,,,,,,,
30
+ 0.3134765625,1,269,,,,,,,,,,,,,,,,,
31
+ 0.1636962890625,1,279,,,,,,,,,,,,,,,,,
32
+ 0.36279296875,1,289,,,,,,,,,,,,,,,,,
33
+ 0.150146484375,1,299,,,,,,,,,,,,,,,,,
34
+ 0.246337890625,1,309,,,,,,,,,,,,,,,,,
35
+ 0.3505859375,1,319,,,,,,,,,,,,,,,,,
36
+ 0.329345703125,1,329,,,,,,,,,,,,,,,,,
37
+ 0.181640625,1,339,,,,,,,,,,,,,,,,,
38
+ 0.2763671875,1,349,,,,,,,,,,,,,,,,,
39
+ 0.241455078125,1,359,,,,,,,,,,,,,,,,,
40
+ 0.302490234375,1,369,,,,,,,,,,,,,,,,,
41
+ 0.255859375,1,379,,,,,,,,,,,,,,,,,
42
+ 0.24267578125,1,389,,,,,,,,,,,,,,,,,
43
+ 0.465576171875,1,399,,,,,,,,,,,,,,,,,
44
+ 0.484619140625,1,409,,,,,,,,,,,,,,,,,
45
+ 0.262451171875,1,419,,,,,,,,,,,,,,,,,
46
+ 0.271728515625,1,429,,,,,,,,,,,,,,,,,
47
+ 0.190673828125,1,439,,,,,,,,,,,,,,,,,
48
+ 0.474853515625,1,449,,,,,,,,,,,,,,,,,
49
+ 0.56201171875,1,459,,,,,,,,,,,,,,,,,
50
+ ,1,465,0.3760688304901123,0.8321233987808228,0.30616509914398193,0.794037938117981,0.4419306218624115,0.4419306218624115,,,,,,,,,,,
51
+ ,1,465,,,,,,,0.875268816947937,0.8464140892028809,0.8407257795333862,0.8435603380203247,0.8435603380203247,,,,,,
52
+ 0.07574462890625,2,469,,,,,,,,,,,,,,,,,
53
+ 0.206787109375,2,479,,,,,,,,,,,,,,,,,
54
+ 0.033966064453125,2,489,,,,,,,,,,,,,,,,,
55
+ 0.1544189453125,2,499,,,,,,,,,,,,,,,,,
56
+ 0.13671875,2,509,,,,,,,,,,,,,,,,,
57
+ 0.07061767578125,2,519,,,,,,,,,,,,,,,,,
58
+ 0.0055084228515625,2,529,,,,,,,,,,,,,,,,,
59
+ 0.2064208984375,2,539,,,,,,,,,,,,,,,,,
60
+ 0.020599365234375,2,549,,,,,,,,,,,,,,,,,
61
+ 0.0143585205078125,2,559,,,,,,,,,,,,,,,,,
62
+ 0.0257720947265625,2,569,,,,,,,,,,,,,,,,,
63
+ 0.052337646484375,2,579,,,,,,,,,,,,,,,,,
64
+ 0.0276031494140625,2,589,,,,,,,,,,,,,,,,,
65
+ 0.044097900390625,2,599,,,,,,,,,,,,,,,,,
66
+ 0.06817626953125,2,609,,,,,,,,,,,,,,,,,
67
+ 0.1556396484375,2,619,,,,,,,,,,,,,,,,,
68
+ 0.32763671875,2,629,,,,,,,,,,,,,,,,,
69
+ 0.05426025390625,2,639,,,,,,,,,,,,,,,,,
70
+ 0.0640869140625,2,649,,,,,,,,,,,,,,,,,
71
+ 0.0293426513671875,2,659,,,,,,,,,,,,,,,,,
72
+ 0.2349853515625,2,669,,,,,,,,,,,,,,,,,
73
+ 0.0736083984375,2,679,,,,,,,,,,,,,,,,,
74
+ 0.0474853515625,2,689,,,,,,,,,,,,,,,,,
75
+ ,2,698,1.0874371528625488,0.7009981870651245,0.2065553516149521,0.9051490426063538,0.33635449409484863,0.33635449409484863,,,,,,,,,,,
76
+ ,2,698,,,,,,,0.9575268626213074,0.9421542286872864,0.9522849321365356,0.9471924901008606,0.9471924901008606,,,,,,
77
+ 0.0280609130859375,3,699,,,,,,,,,,,,,,,,,
78
+ 0.09930419921875,3,709,,,,,,,,,,,,,,,,,
79
+ 0.0268096923828125,3,719,,,,,,,,,,,,,,,,,
80
+ 0.03912353515625,3,729,,,,,,,,,,,,,,,,,
81
+ 0.00539398193359375,3,739,,,,,,,,,,,,,,,,,
82
+ 0.0030689239501953125,3,749,,,,,,,,,,,,,,,,,
83
+ 0.00466156005859375,3,759,,,,,,,,,,,,,,,,,
84
+ 0.0199432373046875,3,769,,,,,,,,,,,,,,,,,
85
+ 0.0164337158203125,3,779,,,,,,,,,,,,,,,,,
86
+ 0.09100341796875,3,789,,,,,,,,,,,,,,,,,
87
+ 0.0152435302734375,3,799,,,,,,,,,,,,,,,,,
88
+ 0.0019006729125976562,3,809,,,,,,,,,,,,,,,,,
89
+ 0.00083160400390625,3,819,,,,,,,,,,,,,,,,,
90
+ 0.0223541259765625,3,829,,,,,,,,,,,,,,,,,
91
+ 0.1595458984375,3,839,,,,,,,,,,,,,,,,,
92
+ 0.004375457763671875,3,849,,,,,,,,,,,,,,,,,
93
+ 0.01349639892578125,3,859,,,,,,,,,,,,,,,,,
94
+ 0.040191650390625,3,869,,,,,,,,,,,,,,,,,
95
+ 0.031494140625,3,879,,,,,,,,,,,,,,,,,
96
+ 0.01474761962890625,3,889,,,,,,,,,,,,,,,,,
97
+ 0.022308349609375,3,899,,,,,,,,,,,,,,,,,
98
+ 0.035919189453125,3,909,,,,,,,,,,,,,,,,,
99
+ 0.107177734375,3,919,,,,,,,,,,,,,,,,,
100
+ 0.04888916015625,3,929,,,,,,,,,,,,,,,,,
101
+ ,3,931,0.452526330947876,0.8439201712608337,0.31168830394744873,0.7154471278190613,0.43421053886413574,0.43421053886413574,,,,,,,,,,,
102
+ ,3,931,,,,,,,0.9806451797485352,0.974530816078186,0.977150559425354,0.9758388996124268,0.9758388996124268,,,,,,
103
+ 0.0186004638671875,4,939,,,,,,,,,,,,,,,,,
104
+ 0.0024738311767578125,4,949,,,,,,,,,,,,,,,,,
105
+ 0.0012655258178710938,4,959,,,,,,,,,,,,,,,,,
106
+ 0.001422882080078125,4,969,,,,,,,,,,,,,,,,,
107
+ 0.0029392242431640625,4,979,,,,,,,,,,,,,,,,,
108
+ 0.1710205078125,4,989,,,,,,,,,,,,,,,,,
109
+ 0.12115478515625,4,999,,,,,,,,,,,,,,,,,
110
+ 0.00638580322265625,4,1009,,,,,,,,,,,,,,,,,
111
+ 0.00469207763671875,4,1019,,,,,,,,,,,,,,,,,
112
+ 0.013702392578125,4,1029,,,,,,,,,,,,,,,,,
113
+ 0.0222625732421875,4,1039,,,,,,,,,,,,,,,,,
114
+ 0.045074462890625,4,1049,,,,,,,,,,,,,,,,,
115
+ 0.00867462158203125,4,1059,,,,,,,,,,,,,,,,,
116
+ 0.003887176513671875,4,1069,,,,,,,,,,,,,,,,,
117
+ 0.029052734375,4,1079,,,,,,,,,,,,,,,,,
118
+ 0.0028285980224609375,4,1089,,,,,,,,,,,,,,,,,
119
+ 0.00045561790466308594,4,1099,,,,,,,,,,,,,,,,,
120
+ 0.0133209228515625,4,1109,,,,,,,,,,,,,,,,,
121
+ 0.304443359375,4,1119,,,,,,,,,,,,,,,,,
122
+ 0.002223968505859375,4,1129,,,,,,,,,,,,,,,,,
123
+ 0.0014781951904296875,4,1139,,,,,,,,,,,,,,,,,
124
+ 0.005718231201171875,4,1149,,,,,,,,,,,,,,,,,
125
+ 0.0115966796875,4,1159,,,,,,,,,,,,,,,,,
126
+ ,4,1164,0.7260090708732605,0.7958257794380188,0.2644188106060028,0.8075881004333496,0.39839571714401245,0.39839571714401245,,,,,,,,,,,
127
+ ,4,1164,,,,,,,0.9889785051345825,0.9852448105812073,0.9872311949729919,0.9862369894981384,0.9862369894981384,,,,,,
128
+ 0.005008697509765625,5,1169,,,,,,,,,,,,,,,,,
129
+ 0.007183074951171875,5,1179,,,,,,,,,,,,,,,,,
130
+ 0.1400146484375,5,1189,,,,,,,,,,,,,,,,,
131
+ 0.0007276535034179688,5,1199,,,,,,,,,,,,,,,,,
132
+ 0.1689453125,5,1209,,,,,,,,,,,,,,,,,
133
+ 0.00457763671875,5,1219,,,,,,,,,,,,,,,,,
134
+ 0.035430908203125,5,1229,,,,,,,,,,,,,,,,,
135
+ 0.0012540817260742188,5,1239,,,,,,,,,,,,,,,,,
136
+ 0.0225372314453125,5,1249,,,,,,,,,,,,,,,,,
137
+ 0.0008778572082519531,5,1259,,,,,,,,,,,,,,,,,
138
+ 0.01336669921875,5,1269,,,,,,,,,,,,,,,,,
139
+ 0.00044846534729003906,5,1279,,,,,,,,,,,,,,,,,
140
+ 0.00408172607421875,5,1289,,,,,,,,,,,,,,,,,
141
+ 0.00037980079650878906,5,1299,,,,,,,,,,,,,,,,,
142
+ 0.0004723072052001953,5,1309,,,,,,,,,,,,,,,,,
143
+ 0.01436614990234375,5,1319,,,,,,,,,,,,,,,,,
144
+ 0.0670166015625,5,1329,,,,,,,,,,,,,,,,,
145
+ 0.07574462890625,5,1339,,,,,,,,,,,,,,,,,
146
+ 0.01025390625,5,1349,,,,,,,,,,,,,,,,,
147
+ 0.10150146484375,5,1359,,,,,,,,,,,,,,,,,
148
+ 0.0014791488647460938,5,1369,,,,,,,,,,,,,,,,,
149
+ 0.003528594970703125,5,1379,,,,,,,,,,,,,,,,,
150
+ 0.002532958984375,5,1389,,,,,,,,,,,,,,,,,
151
+ ,5,1397,1.1667187213897705,0.7558983564376831,0.2332075536251068,0.8373983502388,0.3648169934749603,0.3648169934749603,,,,,,,,,,,
152
+ ,5,1397,,,,,,,0.9905914068222046,0.9892255663871765,0.9872311949729919,0.988227367401123,0.988227367401123,,,,,,
153
+ 0.004306793212890625,6,1399,,,,,,,,,,,,,,,,,
154
+ 0.010406494140625,6,1409,,,,,,,,,,,,,,,,,
155
+ 0.00102996826171875,6,1419,,,,,,,,,,,,,,,,,
156
+ 0.00447845458984375,6,1429,,,,,,,,,,,,,,,,,
157
+ 0.0005435943603515625,6,1439,,,,,,,,,,,,,,,,,
158
+ 0.155029296875,6,1449,,,,,,,,,,,,,,,,,
159
+ 0.00028395652770996094,6,1459,,,,,,,,,,,,,,,,,
160
+ 0.0070037841796875,6,1469,,,,,,,,,,,,,,,,,
161
+ 0.009063720703125,6,1479,,,,,,,,,,,,,,,,,
162
+ 0.07452392578125,6,1489,,,,,,,,,,,,,,,,,
163
+ 0.005832672119140625,6,1499,,,,,,,,,,,,,,,,,
164
+ 0.0043487548828125,6,1509,,,,,,,,,,,,,,,,,
165
+ 0.00695037841796875,6,1519,,,,,,,,,,,,,,,,,
166
+ 0.06646728515625,6,1529,,,,,,,,,,,,,,,,,
167
+ 0.001789093017578125,6,1539,,,,,,,,,,,,,,,,,
168
+ 0.00323486328125,6,1549,,,,,,,,,,,,,,,,,
169
+ 0.0006985664367675781,6,1559,,,,,,,,,,,,,,,,,
170
+ 0.0648193359375,6,1569,,,,,,,,,,,,,,,,,
171
+ 0.01558685302734375,6,1579,,,,,,,,,,,,,,,,,
172
+ 0.00103759765625,6,1589,,,,,,,,,,,,,,,,,
173
+ 0.001270294189453125,6,1599,,,,,,,,,,,,,,,,,
174
+ 0.0002396106719970703,6,1609,,,,,,,,,,,,,,,,,
175
+ 0.0003399848937988281,6,1619,,,,,,,,,,,,,,,,,
176
+ 0.04937744140625,6,1629,,,,,,,,,,,,,,,,,
177
+ ,6,1630,0.5342352986335754,0.8849818706512451,0.38383838534355164,0.6178861856460571,0.4735202491283417,0.4735202491283417,,,,,,,,,,,
178
+ ,6,1630,,,,,,,0.9916666746139526,0.987943708896637,0.9912634491920471,0.9896007776260376,0.9896007776260376,,,,,,
179
+ 0.0019931793212890625,7,1639,,,,,,,,,,,,,,,,,
180
+ 0.0010328292846679688,7,1649,,,,,,,,,,,,,,,,,
181
+ 0.002391815185546875,7,1659,,,,,,,,,,,,,,,,,
182
+ 0.025543212890625,7,1669,,,,,,,,,,,,,,,,,
183
+ 0.0016775131225585938,7,1679,,,,,,,,,,,,,,,,,
184
+ 0.035919189453125,7,1689,,,,,,,,,,,,,,,,,
185
+ 0.00547027587890625,7,1699,,,,,,,,,,,,,,,,,
186
+ 0.0006341934204101562,7,1709,,,,,,,,,,,,,,,,,
187
+ 0.0009632110595703125,7,1719,,,,,,,,,,,,,,,,,
188
+ 0.00418853759765625,7,1729,,,,,,,,,,,,,,,,,
189
+ 0.0033130645751953125,7,1739,,,,,,,,,,,,,,,,,
190
+ 0.001251220703125,7,1749,,,,,,,,,,,,,,,,,
191
+ 0.00024580955505371094,7,1759,,,,,,,,,,,,,,,,,
192
+ 0.0007381439208984375,7,1769,,,,,,,,,,,,,,,,,
193
+ 0.00131988525390625,7,1779,,,,,,,,,,,,,,,,,
194
+ 0.00652313232421875,7,1789,,,,,,,,,,,,,,,,,
195
+ 0.00263214111328125,7,1799,,,,,,,,,,,,,,,,,
196
+ 0.0014677047729492188,7,1809,,,,,,,,,,,,,,,,,
197
+ 0.0016336441040039062,7,1819,,,,,,,,,,,,,,,,,
198
+ 0.0007638931274414062,7,1829,,,,,,,,,,,,,,,,,
199
+ 0.00135040283203125,7,1839,,,,,,,,,,,,,,,,,
200
+ 0.002391815185546875,7,1849,,,,,,,,,,,,,,,,,
201
+ 0.0011796951293945312,7,1859,,,,,,,,,,,,,,,,,
202
+ ,7,1863,1.4027303457260132,0.7279945611953735,0.22370173037052155,0.9105691313743591,0.3591662347316742,0.3591662347316742,,,,,,,,,,,
203
+ ,7,1863,,,,,,,0.9935483932495117,0.9925975799560547,0.9912634491920471,0.9919300675392151,0.9919300675392151,,,,,,
204
+ 0.00128936767578125,8,1869,,,,,,,,,,,,,,,,,
205
+ 0.00234222412109375,8,1879,,,,,,,,,,,,,,,,,
206
+ 0.00522613525390625,8,1889,,,,,,,,,,,,,,,,,
207
+ 0.1265869140625,8,1899,,,,,,,,,,,,,,,,,
208
+ 0.0026607513427734375,8,1909,,,,,,,,,,,,,,,,,
209
+ 0.0024738311767578125,8,1919,,,,,,,,,,,,,,,,,
210
+ 0.0029926300048828125,8,1929,,,,,,,,,,,,,,,,,
211
+ 0.0010385513305664062,8,1939,,,,,,,,,,,,,,,,,
212
+ 0.0003845691680908203,8,1949,,,,,,,,,,,,,,,,,
213
+ 0.0232086181640625,8,1959,,,,,,,,,,,,,,,,,
214
+ 0.00035262107849121094,8,1969,,,,,,,,,,,,,,,,,
215
+ 0.00084686279296875,8,1979,,,,,,,,,,,,,,,,,
216
+ 0.0023326873779296875,8,1989,,,,,,,,,,,,,,,,,
217
+ 0.0024738311767578125,8,1999,,,,,,,,,,,,,,,,,
218
+ 0.0016727447509765625,8,2009,,,,,,,,,,,,,,,,,
219
+ 0.0006160736083984375,8,2019,,,,,,,,,,,,,,,,,
220
+ 0.0017137527465820312,8,2029,,,,,,,,,,,,,,,,,
221
+ 0.145751953125,8,2039,,,,,,,,,,,,,,,,,
222
+ 0.000591278076171875,8,2049,,,,,,,,,,,,,,,,,
223
+ 0.001270294189453125,8,2059,,,,,,,,,,,,,,,,,
224
+ 0.00011342763900756836,8,2069,,,,,,,,,,,,,,,,,
225
+ 0.0004799365997314453,8,2079,,,,,,,,,,,,,,,,,
226
+ 0.00064849853515625,8,2089,,,,,,,,,,,,,,,,,
227
+ ,8,2096,1.368309736251831,0.7813067436218262,0.2531120479106903,0.8265582919120789,0.38754764199256897,0.38754764199256897,,,,,,,,,,,
228
+ ,8,2096,,,,,,,0.9967741966247559,0.9953020215034485,0.9966397881507874,0.9959704279899597,0.9959704279899597,,,,,,
229
+ 0.00011461973190307617,9,2099,,,,,,,,,,,,,,,,,
230
+ 0.00010311603546142578,9,2109,,,,,,,,,,,,,,,,,
231
+ 8.416175842285156e-05,9,2119,,,,,,,,,,,,,,,,,
232
+ 0.00021708011627197266,9,2129,,,,,,,,,,,,,,,,,
233
+ 0.0002524852752685547,9,2139,,,,,,,,,,,,,,,,,
234
+ 8.296966552734375e-05,9,2149,,,,,,,,,,,,,,,,,
235
+ 0.0005345344543457031,9,2159,,,,,,,,,,,,,,,,,
236
+ 0.0016202926635742188,9,2169,,,,,,,,,,,,,,,,,
237
+ 0.0078582763671875,9,2179,,,,,,,,,,,,,,,,,
238
+ 0.00012177228927612305,9,2189,,,,,,,,,,,,,,,,,
239
+ 0.15673828125,9,2199,,,,,,,,,,,,,,,,,
240
+ 0.00012803077697753906,9,2209,,,,,,,,,,,,,,,,,
241
+ 8.165836334228516e-05,9,2219,,,,,,,,,,,,,,,,,
242
+ 7.212162017822266e-05,9,2229,,,,,,,,,,,,,,,,,
243
+ 0.0003197193145751953,9,2239,,,,,,,,,,,,,,,,,
244
+ 0.09149169921875,9,2249,,,,,,,,,,,,,,,,,
245
+ 0.001117706298828125,9,2259,,,,,,,,,,,,,,,,,
246
+ 0.00513458251953125,9,2269,,,,,,,,,,,,,,,,,
247
+ 0.021209716796875,9,2279,,,,,,,,,,,,,,,,,
248
+ 0.0097503662109375,9,2289,,,,,,,,,,,,,,,,,
249
+ 0.0246429443359375,9,2299,,,,,,,,,,,,,,,,,
250
+ 0.0011377334594726562,9,2309,,,,,,,,,,,,,,,,,
251
+ 0.001354217529296875,9,2319,,,,,,,,,,,,,,,,,
252
+ 0.00032639503479003906,9,2329,,,,,,,,,,,,,,,,,
253
+ ,9,2329,1.1365300416946411,0.7631579041481018,0.242562934756279,0.8617886304855347,0.37857142090797424,0.37857142090797424,,,,,,,,,,,
254
+ ,9,2329,,,,,,,0.9940860271453857,0.9912868738174438,0.9939516186714172,0.9926174283027649,0.9926174283027649,,,,,,
255
+ ,10,2330,,,,,,,,,,,,0.5662358999252319,0.8833031058311462,0.3981233239173889,0.6048879623413086,0.4801940321922302,0.4801940321922302
display_v3/2023-04-14_16-59-07/yes.txt ADDED
File without changes
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fc4b81c7a5f3c433e5c8de5b92730a3c76c4a32c2dc8b60ad349d236a1c0697
3
+ size 220228915
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bacc5807e8789eb3a6853e0d7a148cf921a0737988170b3dfcd7136f205cc60
3
+ size 1320918341
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ checkpoint
display_v3/2023-04-14_17-06-18/epoch3-f1score0.12.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ '''Copyright The Microsoft DeepSpeed Team'''
3
+
4
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
5
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
6
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
7
+ # application.
8
+ #
9
+ # example: python zero_to_fp32.py . pytorch_model.bin
10
+
11
+ import argparse
12
+ import torch
13
+ import glob
14
+ import math
15
+ import os
16
+ import re
17
+ from collections import OrderedDict
18
+
19
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
20
+ # DeepSpeed data structures it has to be available in the current python environment.
21
+ from deepspeed.utils import logger
22
+ from deepspeed.checkpoint.constants import (DS_VERSION,
23
+ OPTIMIZER_STATE_DICT,
24
+ SINGLE_PARTITION_OF_FP32_GROUPS,
25
+ FP32_FLAT_GROUPS,
26
+ ZERO_STAGE,
27
+ PARTITION_COUNT,
28
+ PARAM_SHAPES,
29
+ BUFFER_NAMES)
30
+
31
+ debug = 0
32
+
33
+ # load to cpu
34
+ device = torch.device('cpu')
35
+
36
+
37
def atoi(text):
    """Convert a digit-only string to an ``int``; leave any other string unchanged."""
    if text.isdigit():
        return int(text)
    return text


def natural_keys(text):
    """Sort key that orders strings the way a human would ("natural" order).

    ``alist.sort(key=natural_keys)`` sorts in human order, so e.g. ``rank_2``
    comes before ``rank_10``.
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    """
    return [atoi(chunk) for chunk in re.split(r'(\d+)', text)]
48
+
49
+
50
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the single model-states file inside *checkpoint_dir*.

    Args:
        checkpoint_dir: directory holding the per-rank checkpoint files.
        zero_stage: ZeRO stage the checkpoint was written with (2 or 3);
            the two stages use different model-states file names.

    Returns:
        Path to the model-states ``.pt`` file.

    Raises:
        FileNotFoundError: if the directory or the expected file is missing.
        ValueError: if *zero_stage* is neither 2 nor 3.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage == 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # Fix: the original fell through here and crashed with an
        # UnboundLocalError on `file`; fail fast with a clear message instead.
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
64
+
65
+
66
def get_optim_files(checkpoint_dir):
    """Return every '*_optim_states.pt' file in *checkpoint_dir*, naturally sorted.

    Raises:
        FileNotFoundError: if no optimizer-states file is present.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, "*_optim_states.pt")
    optim_files = sorted(glob.glob(pattern), key=natural_keys)

    if not optim_files:
        raise FileNotFoundError(
            f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")

    return optim_files
77
+
78
+
79
def parse_model_state(file):
    """Load a model-states checkpoint and extract what fp32 reconstruction needs.

    Args:
        file: path to a DeepSpeed ``*_model_states.pt`` file.

    Returns:
        Tuple ``(buffers, param_shapes, ds_version)`` where *buffers* maps
        buffer names to fp32 tensors, *param_shapes* is the saved shape
        metadata, and *ds_version* is the writing deepspeed version (or None).

    Raises:
        ValueError: if *file* is not a model-states checkpoint.
    """
    state_dict = torch.load(file, map_location=device)

    if BUFFER_NAMES not in state_dict:
        raise ValueError(f"{file} is not a model state checkpoint")

    buffer_names = state_dict[BUFFER_NAMES]
    if debug:
        print("Found buffers:", buffer_names)

    # Buffers may have been saved in fp16; promote them back to fp32 here.
    module_state = state_dict["module"]
    buffers = {name: tensor.float()
               for name, tensor in module_state.items()
               if name in buffer_names}

    param_shapes = state_dict[PARAM_SHAPES]
    ds_version = state_dict.get(DS_VERSION, None)

    return buffers, param_shapes, ds_version
99
+
100
+
101
def parse_optim_states(files, ds_checkpoint_dir):
    """Load all per-rank optimizer-states files and pull out the fp32 shards.

    Args:
        files: per-rank ``*_optim_states.pt`` paths (naturally sorted).
        ds_checkpoint_dir: checkpoint directory, used only in error messages.

    Returns:
        Tuple ``(zero_stage, world_size, fp32_flat_groups)``; for stage 3 the
        per-group flat tensors of each rank are concatenated into one tensor.

    Raises:
        ValueError: if the files are not a zero checkpoint, the file count
            does not match the saved world size, or the stage is unknown.
    """
    state_dicts = [torch.load(f, map_location=device) for f in files]

    first_optim = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim[ZERO_STAGE]
    world_size = first_optim[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != len(files):
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {len(files)} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage == 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
        fp32_flat_groups = [
            sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts
        ]
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [
            torch.cat(sd[OPTIMIZER_STATE_DICT][fp32_groups_key], 0)
            for sd in state_dicts
        ]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
152
+
153
+
154
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """Return the fp32 state_dict reconstructed from a ds checkpoint folder.

    Args:
        ds_checkpoint_dir: path to the deepspeed checkpoint folder (where the
            optimizer files are).
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(
        f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
    buffers, param_shapes, ds_version = parse_model_state(model_file)
    print(f'Parsing checkpoint created by deepspeed=={ds_version}')

    # Dispatch to the stage-specific reconstruction routine.
    reconstructors = {
        2: _get_fp32_state_dict_from_zero2_checkpoint,
        3: _get_fp32_state_dict_from_zero3_checkpoint,
    }
    reconstruct = reconstructors.get(zero_stage)
    if reconstruct is None:
        # mirrors the original's implicit None for any other stage
        # (parse_optim_states has already rejected unknown stages anyway)
        return None
    return reconstruct(world_size, param_shapes, fp32_flat_groups, buffers)
183
+
184
+
185
def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """Rebuild a consolidated fp32 state_dict from ZeRO stage-2 shards.

    Args:
        world_size: number of ranks (== number of optim-states files).
        param_shapes: per-param-group dicts mapping param name -> shape.
        fp32_flat_groups: one list per rank, each holding one flattened fp32
            partition tensor per param group.
        buffers: fp32 buffers recovered by ``parse_model_state``.

    Returns:
        ``OrderedDict`` mapping buffer/param names to full fp32 tensors.
    """
    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(
                    f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    # Concatenate the per-rank partitions of each param group into one full
    # flat vector per group.
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum([
        full_single_fp32_vector.numel()
        for full_single_fp32_vector in merged_single_partition_of_fp32_groups
    ])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum(
            [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(
                    f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
                )
            # Slice this param's elements out of the flat vector and restore
            # its original shape (narrow returns a view; no copy here).
            state_dict[name] = full_single_fp32_vector.narrow(
                0,
                offset,
                unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            # round x up to the nearest multiple of align_to
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(
                f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
279
+
280
+
281
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for a ZeRO-3 param.

    *partitioned_numel* is how many elements each rank holds for a param of
    *unpartitioned_numel* total elements; *padding_numel* is how many zero
    elements were appended so the total divides evenly across ranks.
    """
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    remainder = unpartitioned_numel % world_size
    padding_numel = 0 if remainder == 0 else world_size - remainder
    return partitioned_numel, padding_numel
286
+
287
+
288
def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """Rebuild a consolidated fp32 state_dict from ZeRO stage-3 shards.

    Args:
        world_size: number of ranks (== number of optim-states files).
        param_shapes: list of dicts mapping param name -> shape (one dict per
            param group; merged below).
        fp32_flat_groups: one flattened fp32 tensor per rank (the per-group
            tensors were already concatenated in ``parse_optim_states``).
        buffers: fp32 buffers recovered by ``parse_model_state``.

    Returns:
        ``OrderedDict`` mapping buffer/param names to full fp32 tensors.
    """
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    avail_numel = fp32_flat_groups[0].numel() * world_size
    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    print(f"Have {avail_numel} numels to process.")
    print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # Take each rank's slice of this param, concatenate them, then drop
        # the trailing padding elements before restoring the original shape.
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0,
                                             offset,
                                             partitioned_numel)
                  for i in range(world_size)),
            0).narrow(0,
                      0,
                      unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset advanced by the per-rank element count; scale to total elements
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(
            f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
359
+
360
+
361
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """Convert a ZeRO 2/3 checkpoint into a single consolidated fp32 state_dict.

    The result can be loaded with ``load_state_dict()`` and used for training
    without DeepSpeed, or shared with others (e.g. via a model hub).

    Args:
        checkpoint_dir: path to the desired checkpoint folder.
        tag: checkpoint tag uniquely identifying the checkpoint, e.g.
            ``global_step14``. When omitted, the tag is read from the
            ``latest`` file inside *checkpoint_dir*.

    Returns:
        A pytorch ``state_dict`` (already on CPU).

    Note: this approach may not work if your application doesn't have
    sufficient free CPU memory; in that case use the offline ``zero_to_fp32.py``
    script that is saved with the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)  # already on cpu
        model = model.cpu()  # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    After ``model.load_state_dict(state_dict)`` the model is stripped of its
    DeepSpeed wrapping — re-initialize the deepspeed engine if you need it
    again. If you want it all done for you, use
    ``load_state_dict_from_zero_checkpoint`` instead.
    """
    if tag is None:
        # resolve the tag from the 'latest' marker file DeepSpeed writes
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
408
+
409
+
410
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """Write a ZeRO 2/3 checkpoint out as one fp32 consolidated state_dict file.

    The output can be loaded with ``torch.load(file)`` + ``load_state_dict()``
    and used for training without DeepSpeed.

    Args:
        checkpoint_dir: path to the checkpoint folder (the one containing the
            tag-folder, like ``global_step14``).
        output_file: path of the pytorch fp32 state_dict output file
            (e.g. ``path/pytorch_model.bin``).
        tag: checkpoint tag uniquely identifying the checkpoint; when omitted
            it is read from the ``latest`` file in the checkpoint folder.
    """
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(state_dict, output_file)
424
+
425
+
426
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """Overwrite *model* (moved to CPU) with consolidated fp32 ZeRO weights.

    1. Put the provided model on cpu.
    2. Convert the ZeRO 2/3 checkpoint into a single fp32 ``state_dict``.
    3. Load it into the provided model.

    Args:
        model: the model object to update.
        checkpoint_dir: path to the checkpoint folder (the one containing the
            tag-folder, like ``global_step14``).
        tag: checkpoint tag uniquely identifying the checkpoint; when omitted
            it is read from the ``latest`` file in the checkpoint folder.

    Returns:
        The modified model.

    Make sure you have plenty of CPU memory available before calling this;
    otherwise use the ``zero_to_fp32.py`` utility conveniently placed in the
    checkpoint folder. Note that after this call the model is no longer usable
    in the deepspeed context of the same application — re-initialize the
    deepspeed engine, since ``model.load_state_dict(state_dict)`` removes all
    the deepspeed magic from it.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others
    """
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: names present in the model but absent from the
    # reconstructed state_dict are left untouched
    model.load_state_dict(state_dict, strict=False)

    return model
463
+
464
+
465
if __name__ == "__main__":
    # CLI entry point: zero_to_fp32.py <checkpoint_dir> <output_file> [-d]
    cli = argparse.ArgumentParser()
    cli.add_argument("checkpoint_dir",
                     type=str,
                     help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    cli.add_argument(
        "output_file",
        type=str,
        help=
        "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
    )
    cli.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = cli.parse_args()

    # flip the module-level debug switch before doing any work
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
display_v3/2023-04-14_17-06-18/hparams.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ amp: true
2
+ batch_size: 16
3
+ cls_target: cv
4
+ deepspeed: true
5
+ dev_data_file: ''
6
+ downsample_data: true
7
+ early_dropout: null
8
+ epochs: 10
9
+ freeze_encoder: false
10
+ just_test: false
11
+ log_fold: ./logs
12
+ log_step: 10
13
+ lr: 5.0e-05
14
+ model_name: bert-base-uncased
15
+ positive_ratio: 0.4
16
+ pretrained_model_fold: ./pretrained_model
17
+ rdrop: null
18
+ running time: 0:01:56
19
+ share_encoder: false
20
+ test_data_file: ''
21
+ train_data_file: ''
22
+ train_ratio: 0.8
23
+ version: structure cmp
display_v3/2023-04-14_17-06-18/metrics.csv ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ train_loss,epoch,step,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_macro_f1,train_accuracy,train_precision,train_recall,train_f1,train_macro_f1,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_macro_f1
2
+ 0.70458984375,0,9,,,,,,,,,,,,,,,,,
3
+ ,0,15,0.5909940004348755,0.6669691205024719,0.014864864759147167,0.6875,0.029100529849529266,0.029100529849529266,,,,,,,,,,,
4
+ ,0,15,,,,,,,0.6275303363800049,0.7692307829856873,0.10101009905338287,0.1785714328289032,0.1785714328289032,,,,,,
5
+ 0.4501953125,1,19,,,,,,,,,,,,,,,,,
6
+ 0.361328125,1,29,,,,,,,,,,,,,,,,,
7
+ ,1,31,0.23693892359733582,0.9108439087867737,0.045340050011873245,0.5625,0.0839160829782486,0.0839160829782486,,,,,,,,,,,
8
+ ,1,31,,,,,,,0.8825910687446594,0.8500000238418579,0.8585858345031738,0.8542713522911072,0.8542713522911072,,,,,,
9
+ 0.08734130859375,2,39,,,,,,,,,,,,,,,,,
10
+ ,2,47,0.6708253026008606,0.7488657236099243,0.023914968594908714,0.84375,0.04651162773370743,0.04651162773370743,,,,,,,,,,,
11
+ ,2,47,,,,,,,0.9473684430122375,0.9479166865348816,0.9191918969154358,0.9333333373069763,0.9333333373069763,,,,,,
12
+ 0.078369140625,3,49,,,,,,,,,,,,,,,,,
13
+ 0.0077056884765625,3,59,,,,,,,,,,,,,,,,,
14
+ ,3,63,0.16590917110443115,0.9494101405143738,0.06787330657243729,0.46875,0.11857707798480988,0.11857707798480988,,,,,,,,,,,
15
+ ,3,63,,,,,,,0.9959514141082764,0.9900000095367432,1.0,0.9949748516082764,0.9949748516082764,,,,,,
16
+ 0.004322052001953125,4,69,,,,,,,,,,,,,,,,,
17
+ 0.00762176513671875,4,79,,,,,,,,,,,,,,,,,
18
+ ,4,79,0.48889032006263733,0.8593466281890869,0.03627760335803032,0.71875,0.06906907260417938,0.06906907260417938,,,,,,,,,,,
19
+ ,4,79,,,,,,,0.9919028282165527,0.9898989796638489,0.9898989796638489,0.9898989796638489,0.9898989796638489,,,,,,
20
+ 0.078125,5,89,,,,,,,,,,,,,,,,,
21
+ ,5,95,1.2922793626785278,0.7039473652839661,0.021068472415208817,0.875,0.04114621505141258,0.04114621505141258,,,,,,,,,,,
22
+ ,5,95,,,,,,,0.9919028282165527,0.9898989796638489,0.9898989796638489,0.9898989796638489,0.9898989796638489,,,,,,
23
+ 0.015960693359375,6,99,,,,,,,,,,,,,,,,,
24
+ 0.003612518310546875,6,109,,,,,,,,,,,,,,,,,
25
+ ,6,111,0.8841056227684021,0.7894737124443054,0.028421051800251007,0.84375,0.05498981848359108,0.05498981848359108,,,,,,,,,,,
26
+ ,6,111,,,,,,,0.9959514141082764,0.9900000095367432,1.0,0.9949748516082764,0.9949748516082764,,,,,,
27
+ 0.0019989013671875,7,119,,,,,,,,,,,,,,,,,
28
+ ,7,127,0.69402015209198,0.8504990935325623,0.039647575467824936,0.84375,0.07573632895946503,0.07573632895946503,,,,,,,,,,,
29
+ ,7,127,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,
30
+ 0.0010175704956054688,8,129,,,,,,,,,,,,,,,,,
31
+ 0.0007581710815429688,8,139,,,,,,,,,,,,,,,,,
32
+ ,8,143,0.6454178094863892,0.8666061758995056,0.042763158679008484,0.8125,0.08124999701976776,0.08124999701976776,,,,,,,,,,,
33
+ ,8,143,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,
34
+ 0.0009255409240722656,9,149,,,,,,,,,,,,,,,,,
35
+ 0.00054931640625,9,159,,,,,,,,,,,,,,,,,
36
+ ,9,159,0.6520931720733643,0.8681941628456116,0.04326122999191284,0.8125,0.08214849978685379,0.08214849978685379,,,,,,,,,,,
37
+ ,9,159,,,,,,,1.0,1.0,1.0,1.0,1.0,,,,,,
38
+ ,10,160,,,,,,,,,,,,0.1696726232767105,0.9466424584388733,0.0439189188182354,0.5416666865348816,0.08124999701976776,0.08124999701976776
display_v3/2023-04-14_17-06-18/yes.txt ADDED
File without changes
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d544e1553c822e187b976e75cb402dc7a351855b355913ad28b7ed8e97e4e8
3
+ size 220228915
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64eb30cf079871506af468afdfbf83a06d02247526ba2b092ab84f001e57929b
3
+ size 1320918341
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ checkpoint
display_v3/2023-04-14_17-59-45/epoch5-f1score0.50.ckpt/zero_to_fp32.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ '''Copyright The Microsoft DeepSpeed Team'''
3
+
4
+ # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets
5
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
6
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
7
+ # application.
8
+ #
9
+ # example: python zero_to_fp32.py . pytorch_model.bin
10
+
11
+ import argparse
12
+ import torch
13
+ import glob
14
+ import math
15
+ import os
16
+ import re
17
+ from collections import OrderedDict
18
+
19
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
20
+ # DeepSpeed data structures it has to be available in the current python environment.
21
+ from deepspeed.utils import logger
22
+ from deepspeed.checkpoint.constants import (DS_VERSION,
23
+ OPTIMIZER_STATE_DICT,
24
+ SINGLE_PARTITION_OF_FP32_GROUPS,
25
+ FP32_FLAT_GROUPS,
26
+ ZERO_STAGE,
27
+ PARTITION_COUNT,
28
+ PARAM_SHAPES,
29
+ BUFFER_NAMES)
30
+
31
+ debug = 0
32
+
33
+ # load to cpu
34
+ device = torch.device('cpu')
35
+
36
+
37
+ def atoi(text):
38
+ return int(text) if text.isdigit() else text
39
+
40
+
41
+ def natural_keys(text):
42
+ '''
43
+ alist.sort(key=natural_keys) sorts in human order
44
+ http://nedbatchelder.com/blog/200712/human_sorting.html
45
+ (See Toothy's implementation in the comments)
46
+ '''
47
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
48
+
49
+
50
+ def get_model_state_file(checkpoint_dir, zero_stage):
51
+ if not os.path.isdir(checkpoint_dir):
52
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
53
+
54
+ # there should be only one file
55
+ if zero_stage == 2:
56
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
57
+ elif zero_stage == 3:
58
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
59
+
60
+ if not os.path.exists(file):
61
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
62
+
63
+ return file
64
+
65
+
66
+ def get_optim_files(checkpoint_dir):
67
+ # XXX: need to test that this simple glob rule works for multi-node setup too
68
+ optim_files = sorted(glob.glob(os.path.join(checkpoint_dir,
69
+ "*_optim_states.pt")),
70
+ key=natural_keys)
71
+
72
+ if len(optim_files) == 0:
73
+ raise FileNotFoundError(
74
+ f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")
75
+
76
+ return optim_files
77
+
78
+
79
+ def parse_model_state(file):
80
+ state_dict = torch.load(file, map_location=device)
81
+
82
+ if BUFFER_NAMES not in state_dict:
83
+ raise ValueError(f"{file} is not a model state checkpoint")
84
+ buffer_names = state_dict[BUFFER_NAMES]
85
+ if debug:
86
+ print("Found buffers:", buffer_names)
87
+
88
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
89
+ buffers = {
90
+ k: v.float()
91
+ for k,
92
+ v in state_dict["module"].items() if k in buffer_names
93
+ }
94
+ param_shapes = state_dict[PARAM_SHAPES]
95
+
96
+ ds_version = state_dict.get(DS_VERSION, None)
97
+
98
+ return buffers, param_shapes, ds_version
99
+
100
+
101
+ def parse_optim_states(files, ds_checkpoint_dir):
102
+
103
+ total_files = len(files)
104
+ state_dicts = []
105
+ for f in files:
106
+ state_dicts.append(torch.load(f, map_location=device))
107
+
108
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
109
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
110
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
111
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
112
+
113
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
114
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
115
+ # use the max of the partition_count to get the dp world_size.
116
+
117
+ if type(world_size) is list:
118
+ world_size = max(world_size)
119
+
120
+ if world_size != total_files:
121
+ raise ValueError(
122
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
123
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
124
+ )
125
+
126
+ # the groups are named differently in each stage
127
+ if zero_stage == 2:
128
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
129
+ elif zero_stage == 3:
130
+ fp32_groups_key = FP32_FLAT_GROUPS
131
+ else:
132
+ raise ValueError(f"unknown zero stage {zero_stage}")
133
+
134
+ if zero_stage == 2:
135
+ fp32_flat_groups = [
136
+ state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key]
137
+ for i in range(len(state_dicts))
138
+ ]
139
+ elif zero_stage == 3:
140
+ # if there is more than one param group, there will be multiple flattened tensors - one
141
+ # flattened tensor per group - for simplicity merge them into a single tensor
142
+ #
143
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
144
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
145
+
146
+ fp32_flat_groups = [
147
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key],
148
+ 0) for i in range(len(state_dicts))
149
+ ]
150
+
151
+ return zero_stage, world_size, fp32_flat_groups
152
+
153
+
154
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
155
+ """
156
+ Returns fp32 state_dict reconstructed from ds checkpoint
157
+
158
+ Args:
159
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
160
+
161
+ """
162
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
163
+
164
+ optim_files = get_optim_files(ds_checkpoint_dir)
165
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
166
+ print(
167
+ f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
168
+
169
+ model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
170
+ buffers, param_shapes, ds_version = parse_model_state(model_file)
171
+ print(f'Parsing checkpoint created by deepspeed=={ds_version}')
172
+
173
+ if zero_stage == 2:
174
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size,
175
+ param_shapes,
176
+ fp32_flat_groups,
177
+ buffers)
178
+ elif zero_stage == 3:
179
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size,
180
+ param_shapes,
181
+ fp32_flat_groups,
182
+ buffers)
183
+
184
+
185
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
186
+ param_shapes,
187
+ fp32_flat_groups,
188
+ buffers):
189
+
190
+ # Reconstruction protocol:
191
+ #
192
+ # XXX: document this
193
+
194
+ if debug:
195
+ for i in range(world_size):
196
+ for j in range(len(fp32_flat_groups[0])):
197
+ print(
198
+ f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
199
+
200
+ # XXX: memory usage doubles here (zero2)
201
+ num_param_groups = len(fp32_flat_groups[0])
202
+ merged_single_partition_of_fp32_groups = []
203
+ for i in range(num_param_groups):
204
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
205
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
206
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
207
+ avail_numel = sum([
208
+ full_single_fp32_vector.numel()
209
+ for full_single_fp32_vector in merged_single_partition_of_fp32_groups
210
+ ])
211
+
212
+ if debug:
213
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
214
+ wanted_numel = sum(
215
+ [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
216
+ # not asserting if there is a mismatch due to possible padding
217
+ print(f"Have {avail_numel} numels to process.")
218
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
219
+
220
+ state_dict = OrderedDict()
221
+
222
+ # buffers
223
+ state_dict.update(buffers)
224
+ if debug:
225
+ print(f"added {len(buffers)} buffers")
226
+
227
+ # params
228
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
229
+ # out-of-core computing solution
230
+ total_numel = 0
231
+ total_params = 0
232
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
233
+ offset = 0
234
+ avail_numel = full_single_fp32_vector.numel()
235
+ for name, shape in shapes.items():
236
+
237
+ unpartitioned_numel = shape.numel()
238
+ total_numel += unpartitioned_numel
239
+ total_params += 1
240
+
241
+ if debug:
242
+ print(
243
+ f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
244
+ )
245
+ state_dict[name] = full_single_fp32_vector.narrow(
246
+ 0,
247
+ offset,
248
+ unpartitioned_numel).view(shape)
249
+ offset += unpartitioned_numel
250
+
251
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
252
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
253
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
254
+ # live optimizer object, so we are checking that the numbers are within the right range
255
+ align_to = 2 * world_size
256
+
257
+ def zero2_align(x):
258
+ return align_to * math.ceil(x / align_to)
259
+
260
+ if debug:
261
+ print(f"original offset={offset}, avail_numel={avail_numel}")
262
+
263
+ offset = zero2_align(offset)
264
+ avail_numel = zero2_align(avail_numel)
265
+
266
+ if debug:
267
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
268
+
269
+ # Sanity check
270
+ if offset != avail_numel:
271
+ raise ValueError(
272
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
273
+
274
+ print(
275
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
276
+ )
277
+
278
+ return state_dict
279
+
280
+
281
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
282
+ remainder = unpartitioned_numel % world_size
283
+ padding_numel = (world_size - remainder) if remainder else 0
284
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
285
+ return partitioned_numel, padding_numel
286
+
287
+
288
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
289
+ param_shapes,
290
+ fp32_flat_groups,
291
+ buffers):
292
+
293
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
294
+ # param, re-consolidating each param, while dealing with padding if any
295
+
296
+ avail_numel = fp32_flat_groups[0].numel() * world_size
297
+ # merge list of dicts, preserving order
298
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
299
+
300
+ if debug:
301
+ for i in range(world_size):
302
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
303
+
304
+ wanted_params = len(param_shapes)
305
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
306
+ # not asserting if there is a mismatch due to possible padding
307
+ print(f"Have {avail_numel} numels to process.")
308
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
309
+
310
+ state_dict = OrderedDict()
311
+
312
+ # buffers
313
+ state_dict.update(buffers)
314
+ if debug:
315
+ print(f"added {len(buffers)} buffers")
316
+
317
+ # params
318
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
319
+ # out-of-core computing solution
320
+ offset = 0
321
+ total_numel = 0
322
+ total_params = 0
323
+ for name, shape in param_shapes.items():
324
+
325
+ unpartitioned_numel = shape.numel()
326
+ total_numel += unpartitioned_numel
327
+ total_params += 1
328
+
329
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
330
+
331
+ if debug:
332
+ print(
333
+ f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
334
+ )
335
+
336
+ # XXX: memory usage doubles here
337
+ state_dict[name] = torch.cat(
338
+ tuple(fp32_flat_groups[i].narrow(0,
339
+ offset,
340
+ partitioned_numel)
341
+ for i in range(world_size)),
342
+ 0).narrow(0,
343
+ 0,
344
+ unpartitioned_numel).view(shape)
345
+ offset += partitioned_numel
346
+
347
+ offset *= world_size
348
+
349
+ # Sanity check
350
+ if offset != avail_numel:
351
+ raise ValueError(
352
+ f"consumed {offset} numels out of {avail_numel} - something is wrong")
353
+
354
+ print(
355
+ f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
356
+ )
357
+
358
+ return state_dict
359
+
360
+
361
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
362
+ """
363
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
364
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
365
+ via a model hub.
366
+
367
+ Args:
368
+ - ``checkpoint_dir``: path to the desired checkpoint folder
369
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
370
+
371
+ Returns:
372
+ - pytorch ``state_dict``
373
+
374
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
375
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
376
+ the checkpoint.
377
+
378
+ A typical usage might be ::
379
+
380
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
381
+ # do the training and checkpoint saving
382
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
383
+ model = model.cpu() # move to cpu
384
+ model.load_state_dict(state_dict)
385
+ # submit to model hub or save the model to share with others
386
+
387
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
388
+ application. i.e. you will need to re-initialize the deepspeed engine, since
389
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
390
+
391
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
392
+
393
+ """
394
+ if tag is None:
395
+ latest_path = os.path.join(checkpoint_dir, 'latest')
396
+ if os.path.isfile(latest_path):
397
+ with open(latest_path, 'r') as fd:
398
+ tag = fd.read().strip()
399
+ else:
400
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
401
+
402
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
403
+
404
+ if not os.path.isdir(ds_checkpoint_dir):
405
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
406
+
407
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
408
+
409
+
410
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
411
+ """
412
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
413
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
414
+
415
+ Args:
416
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
417
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
418
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
419
+ """
420
+
421
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
422
+ print(f"Saving fp32 state dict to {output_file}")
423
+ torch.save(state_dict, output_file)
424
+
425
+
426
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
427
+ """
428
+ 1. Put the provided model to cpu
429
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
430
+ 3. Load it into the provided model
431
+
432
+ Args:
433
+ - ``model``: the model object to update
434
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
435
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
436
+
437
+ Returns:
438
+ - ``model`: modified model
439
+
440
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
441
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
442
+ conveniently placed for you in the checkpoint folder.
443
+
444
+ A typical usage might be ::
445
+
446
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
447
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
448
+ # submit to model hub or save the model to share with others
449
+
450
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
451
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
452
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
453
+
454
+ """
455
+ logger.info(f"Extracting fp32 weights")
456
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
457
+
458
+ logger.info(f"Overwriting model with fp32 weights")
459
+ model = model.cpu()
460
+ model.load_state_dict(state_dict, strict=False)
461
+
462
+ return model
463
+
464
+
465
+ if __name__ == "__main__":
466
+
467
+ parser = argparse.ArgumentParser()
468
+ parser.add_argument(
469
+ "checkpoint_dir",
470
+ type=str,
471
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
472
+ parser.add_argument(
473
+ "output_file",
474
+ type=str,
475
+ help=
476
+ "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)"
477
+ )
478
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
479
+ args = parser.parse_args()
480
+
481
+ debug = args.debug
482
+
483
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
display_v3/2023-04-14_17-59-45/hparams.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ amp: true
2
+ batch_size: 16
3
+ cls_target: vo
4
+ deepspeed: true
5
+ dev_data_file: ''
6
+ downsample_data: true
7
+ early_dropout: null
8
+ epochs: 10
9
+ freeze_encoder: false
10
+ just_test: false
11
+ log_fold: ./logs
12
+ log_step: 10
13
+ lr: 5.0e-05
14
+ model_name: bert-base-uncased
15
+ positive_ratio: 0.4
16
+ pretrained_model_fold: ./pretrained_model
17
+ rdrop: null
18
+ running time: 0:05:34
19
+ share_encoder: false
20
+ test_data_file: ''
21
+ train_data_file: ''
22
+ train_ratio: 0.8
23
+ version: structure cmp
display_v3/2023-04-14_17-59-45/metrics.csv ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ train_loss,epoch,step,val_loss,val_accuracy,val_precision,val_recall,val_f1,val_macro_f1,train_accuracy,train_precision,train_recall,train_f1,train_macro_f1,test_loss,test_accuracy,test_precision,test_recall,test_f1,test_macro_f1
2
+ 0.55810546875,0,9,,,,,,,,,,,,,,,,,
3
+ 0.46435546875,0,19,,,,,,,,,,,,,,,,,
4
+ 0.34228515625,0,29,,,,,,,,,,,,,,,,,
5
+ 0.27783203125,0,39,,,,,,,,,,,,,,,,,
6
+ 0.59716796875,0,49,,,,,,,,,,,,,,,,,
7
+ 0.316650390625,0,59,,,,,,,,,,,,,,,,,
8
+ 0.5478515625,0,69,,,,,,,,,,,,,,,,,
9
+ 0.173828125,0,79,,,,,,,,,,,,,,,,,
10
+ 0.501953125,0,89,,,,,,,,,,,,,,,,,
11
+ 0.38623046875,0,99,,,,,,,,,,,,,,,,,
12
+ 0.36279296875,0,109,,,,,,,,,,,,,,,,,
13
+ 0.2105712890625,0,119,,,,,,,,,,,,,,,,,
14
+ 0.2744140625,0,129,,,,,,,,,,,,,,,,,
15
+ 0.44677734375,0,139,,,,,,,,,,,,,,,,,
16
+ 0.6474609375,0,149,,,,,,,,,,,,,,,,,
17
+ 0.81298828125,0,159,,,,,,,,,,,,,,,,,
18
+ 0.60888671875,0,169,,,,,,,,,,,,,,,,,
19
+ ,0,170,0.6008952260017395,0.7477313876152039,0.18957704305648804,0.865517258644104,0.31102851033210754,0.31102851033210754,,,,,,,,,,,
20
+ ,0,170,,,,,,,0.7950036525726318,0.777429461479187,0.6831955909729004,0.7272727489471436,0.7272727489471436,,,,,,
21
+ 0.305419921875,1,179,,,,,,,,,,,,,,,,,
22
+ 0.390869140625,1,189,,,,,,,,,,,,,,,,,
23
+ 0.2052001953125,1,199,,,,,,,,,,,,,,,,,
24
+ 0.0948486328125,1,209,,,,,,,,,,,,,,,,,
25
+ 0.38427734375,1,219,,,,,,,,,,,,,,,,,
26
+ 0.1968994140625,1,229,,,,,,,,,,,,,,,,,
27
+ 0.078369140625,1,239,,,,,,,,,,,,,,,,,
28
+ 0.356689453125,1,249,,,,,,,,,,,,,,,,,
29
+ 0.43505859375,1,259,,,,,,,,,,,,,,,,,
30
+ 0.485107421875,1,269,,,,,,,,,,,,,,,,,
31
+ 0.1243896484375,1,279,,,,,,,,,,,,,,,,,
32
+ 0.05401611328125,1,289,,,,,,,,,,,,,,,,,
33
+ 0.35595703125,1,299,,,,,,,,,,,,,,,,,
34
+ 0.0572509765625,1,309,,,,,,,,,,,,,,,,,
35
+ 0.1417236328125,1,319,,,,,,,,,,,,,,,,,
36
+ 0.2315673828125,1,329,,,,,,,,,,,,,,,,,
37
+ 0.232421875,1,339,,,,,,,,,,,,,,,,,
38
+ ,1,341,0.24903717637062073,0.9004083275794983,0.36767318844795227,0.7137930989265442,0.48534584045410156,0.48534584045410156,,,,,,,,,,,
39
+ ,1,341,,,,,,,0.8934606909751892,0.8779565095901489,0.8521579504013062,0.8648648858070374,0.8648648858070374,,,,,,
40
+ 0.1656494140625,2,349,,,,,,,,,,,,,,,,,
41
+ 0.336181640625,2,359,,,,,,,,,,,,,,,,,
42
+ 0.1632080078125,2,369,,,,,,,,,,,,,,,,,
43
+ 0.042724609375,2,379,,,,,,,,,,,,,,,,,
44
+ 0.352783203125,2,389,,,,,,,,,,,,,,,,,
45
+ 0.0268096923828125,2,399,,,,,,,,,,,,,,,,,
46
+ 0.01428985595703125,2,409,,,,,,,,,,,,,,,,,
47
+ 0.1790771484375,2,419,,,,,,,,,,,,,,,,,
48
+ 0.0181427001953125,2,429,,,,,,,,,,,,,,,,,
49
+ 0.04736328125,2,439,,,,,,,,,,,,,,,,,
50
+ 0.2493896484375,2,449,,,,,,,,,,,,,,,,,
51
+ 0.08538818359375,2,459,,,,,,,,,,,,,,,,,
52
+ 0.583984375,2,469,,,,,,,,,,,,,,,,,
53
+ 0.0457763671875,2,479,,,,,,,,,,,,,,,,,
54
+ 0.1326904296875,2,489,,,,,,,,,,,,,,,,,
55
+ 0.156494140625,2,499,,,,,,,,,,,,,,,,,
56
+ 0.1724853515625,2,509,,,,,,,,,,,,,,,,,
57
+ ,2,512,0.5341271758079529,0.7738203406333923,0.2016877681016922,0.8241379261016846,0.3240678012371063,0.3240678012371063,,,,,,,,,,,
58
+ ,2,512,,,,,,,0.9566495418548584,0.9491211771965027,0.942148745059967,0.9456221461296082,0.9456221461296082,,,,,,
59
+ 0.056976318359375,3,519,,,,,,,,,,,,,,,,,
60
+ 0.1407470703125,3,529,,,,,,,,,,,,,,,,,
61
+ 0.048370361328125,3,539,,,,,,,,,,,,,,,,,
62
+ 0.004375457763671875,3,549,,,,,,,,,,,,,,,,,
63
+ 0.01050567626953125,3,559,,,,,,,,,,,,,,,,,
64
+ 0.08062744140625,3,569,,,,,,,,,,,,,,,,,
65
+ 0.376953125,3,579,,,,,,,,,,,,,,,,,
66
+ 0.04742431640625,3,589,,,,,,,,,,,,,,,,,
67
+ 0.143798828125,3,599,,,,,,,,,,,,,,,,,
68
+ 0.030059814453125,3,609,,,,,,,,,,,,,,,,,
69
+ 0.040374755859375,3,619,,,,,,,,,,,,,,,,,
70
+ 0.19873046875,3,629,,,,,,,,,,,,,,,,,
71
+ 0.031402587890625,3,639,,,,,,,,,,,,,,,,,
72
+ 0.00252532958984375,3,649,,,,,,,,,,,,,,,,,
73
+ 0.0867919921875,3,659,,,,,,,,,,,,,,,,,
74
+ 0.040191650390625,3,669,,,,,,,,,,,,,,,,,
75
+ 0.0982666015625,3,679,,,,,,,,,,,,,,,,,
76
+ ,3,683,0.4307195246219635,0.8677404522895813,0.2984869182109833,0.748275876045227,0.42674532532691956,0.42674532532691956,,,,,,,,,,,
77
+ ,3,683,,,,,,,0.9742836356163025,0.9670027494430542,0.968778669834137,0.9678899049758911,0.9678899049758911,,,,,,
78
+ 0.003635406494140625,4,689,,,,,,,,,,,,,,,,,
79
+ 0.0293426513671875,4,699,,,,,,,,,,,,,,,,,
80
+ 0.001384735107421875,4,709,,,,,,,,,,,,,,,,,
81
+ 0.003826141357421875,4,719,,,,,,,,,,,,,,,,,
82
+ 0.0015773773193359375,4,729,,,,,,,,,,,,,,,,,
83
+ 0.0043182373046875,4,739,,,,,,,,,,,,,,,,,
84
+ 0.001644134521484375,4,749,,,,,,,,,,,,,,,,,
85
+ 0.03326416015625,4,759,,,,,,,,,,,,,,,,,
86
+ 0.001827239990234375,4,769,,,,,,,,,,,,,,,,,
87
+ 0.0008625984191894531,4,779,,,,,,,,,,,,,,,,,
88
+ 0.0024814605712890625,4,789,,,,,,,,,,,,,,,,,
89
+ 0.04608154296875,4,799,,,,,,,,,,,,,,,,,
90
+ 0.030364990234375,4,809,,,,,,,,,,,,,,,,,
91
+ 0.04498291015625,4,819,,,,,,,,,,,,,,,,,
92
+ 0.0010805130004882812,4,829,,,,,,,,,,,,,,,,,
93
+ 0.007289886474609375,4,839,,,,,,,,,,,,,,,,,
94
+ 0.7626953125,4,849,,,,,,,,,,,,,,,,,
95
+ ,4,854,1.0994356870651245,0.7813067436218262,0.2074652761220932,0.8241379261016846,0.3314840495586395,0.3314840495586395,,,,,,,,,,,
96
+ ,4,854,,,,,,,0.9911829829216003,0.989880383014679,0.9880624413490295,0.9889705777168274,0.9889705777168274,,,,,,
97
+ 0.07330322265625,5,859,,,,,,,,,,,,,,,,,
98
+ 0.007152557373046875,5,869,,,,,,,,,,,,,,,,,
99
+ 0.0017251968383789062,5,879,,,,,,,,,,,,,,,,,
100
+ 0.00966644287109375,5,889,,,,,,,,,,,,,,,,,
101
+ 0.014617919921875,5,899,,,,,,,,,,,,,,,,,
102
+ 0.00643157958984375,5,909,,,,,,,,,,,,,,,,,
103
+ 0.1793212890625,5,919,,,,,,,,,,,,,,,,,
104
+ 0.0158843994140625,5,929,,,,,,,,,,,,,,,,,
105
+ 0.01483917236328125,5,939,,,,,,,,,,,,,,,,,
106
+ 0.0116424560546875,5,949,,,,,,,,,,,,,,,,,
107
+ 0.046630859375,5,959,,,,,,,,,,,,,,,,,
108
+ 0.01290130615234375,5,969,,,,,,,,,,,,,,,,,
109
+ 0.01458740234375,5,979,,,,,,,,,,,,,,,,,
110
+ 0.031829833984375,5,989,,,,,,,,,,,,,,,,,
111
+ 0.036346435546875,5,999,,,,,,,,,,,,,,,,,
112
+ 0.0152435302734375,5,1009,,,,,,,,,,,,,,,,,
113
+ 0.00556182861328125,5,1019,,,,,,,,,,,,,,,,,
114
+ ,5,1025,0.37797266244888306,0.9088021516799927,0.39147287607192993,0.6965517401695251,0.5012406706809998,0.5012406706809998,,,,,,,,,,,
115
+ ,5,1025,,,,,,,0.9819985032081604,0.9753199219703674,0.9797979593276978,0.9775538444519043,0.9775538444519043,,,,,,
116
+ 0.02069091796875,6,1029,,,,,,,,,,,,,,,,,
117
+ 0.205078125,6,1039,,,,,,,,,,,,,,,,,
118
+ 0.01212310791015625,6,1049,,,,,,,,,,,,,,,,,
119
+ 0.016571044921875,6,1059,,,,,,,,,,,,,,,,,
120
+ 0.053070068359375,6,1069,,,,,,,,,,,,,,,,,
121
+ 0.0028896331787109375,6,1079,,,,,,,,,,,,,,,,,
122
+ 0.1202392578125,6,1089,,,,,,,,,,,,,,,,,
123
+ 0.00884246826171875,6,1099,,,,,,,,,,,,,,,,,
124
+ 0.002231597900390625,6,1109,,,,,,,,,,,,,,,,,
125
+ 0.00974273681640625,6,1119,,,,,,,,,,,,,,,,,
126
+ 0.0335693359375,6,1129,,,,,,,,,,,,,,,,,
127
+ 0.004673004150390625,6,1139,,,,,,,,,,,,,,,,,
128
+ 0.287109375,6,1149,,,,,,,,,,,,,,,,,
129
+ 0.08795166015625,6,1159,,,,,,,,,,,,,,,,,
130
+ 0.00901031494140625,6,1169,,,,,,,,,,,,,,,,,
131
+ 0.01025390625,6,1179,,,,,,,,,,,,,,,,,
132
+ 0.050384521484375,6,1189,,,,,,,,,,,,,,,,,
133
+ ,6,1196,0.5810363292694092,0.8743194341659546,0.3053097426891327,0.7137930989265442,0.42768594622612,0.42768594622612,,,,,,,,,,,
134
+ ,6,1196,,,,,,,0.9911829829216003,0.9880843162536621,0.9898989796638489,0.988990843296051,0.988990843296051,,,,,,
135
+ 0.005062103271484375,7,1199,,,,,,,,,,,,,,,,,
136
+ 0.0116119384765625,7,1209,,,,,,,,,,,,,,,,,
137
+ 0.0006499290466308594,7,1219,,,,,,,,,,,,,,,,,
138
+ 0.009674072265625,7,1229,,,,,,,,,,,,,,,,,
139
+ 0.004718780517578125,7,1239,,,,,,,,,,,,,,,,,
140
+ 0.0006432533264160156,7,1249,,,,,,,,,,,,,,,,,
141
+ 0.0006594657897949219,7,1259,,,,,,,,,,,,,,,,,
142
+ 0.0006146430969238281,7,1269,,,,,,,,,,,,,,,,,
143
+ 0.0008616447448730469,7,1279,,,,,,,,,,,,,,,,,
144
+ 0.0004451274871826172,7,1289,,,,,,,,,,,,,,,,,
145
+ 0.0004410743713378906,7,1299,,,,,,,,,,,,,,,,,
146
+ 0.0035152435302734375,7,1309,,,,,,,,,,,,,,,,,
147
+ 0.0005249977111816406,7,1319,,,,,,,,,,,,,,,,,
148
+ 0.0005850791931152344,7,1329,,,,,,,,,,,,,,,,,
149
+ 0.1878662109375,7,1339,,,,,,,,,,,,,,,,,
150
+ 0.016326904296875,7,1349,,,,,,,,,,,,,,,,,
151
+ 0.0019102096557617188,7,1359,,,,,,,,,,,,,,,,,
152
+ ,7,1367,0.49221912026405334,0.8886116147041321,0.3327786922454834,0.6896551847457886,0.44893378019332886,0.44893378019332886,,,,,,,,,,,
153
+ ,7,1367,,,,,,,0.9966936111450195,0.9963235259056091,0.9954086542129517,0.9958658814430237,0.9958658814430237,,,,,,
154
+ 0.0009813308715820312,8,1369,,,,,,,,,,,,,,,,,
155
+ 0.00127410888671875,8,1379,,,,,,,,,,,,,,,,,
156
+ 0.0015325546264648438,8,1389,,,,,,,,,,,,,,,,,
157
+ 0.00914764404296875,8,1399,,,,,,,,,,,,,,,,,
158
+ 0.0008273124694824219,8,1409,,,,,,,,,,,,,,,,,
159
+ 0.0011987686157226562,8,1419,,,,,,,,,,,,,,,,,
160
+ 0.0003414154052734375,8,1429,,,,,,,,,,,,,,,,,
161
+ 0.0031108856201171875,8,1439,,,,,,,,,,,,,,,,,
162
+ 0.0004572868347167969,8,1449,,,,,,,,,,,,,,,,,
163
+ 0.0006923675537109375,8,1459,,,,,,,,,,,,,,,,,
164
+ 0.0003120899200439453,8,1469,,,,,,,,,,,,,,,,,
165
+ 0.003658294677734375,8,1479,,,,,,,,,,,,,,,,,
166
+ 0.00034880638122558594,8,1489,,,,,,,,,,,,,,,,,
167
+ 0.000492095947265625,8,1499,,,,,,,,,,,,,,,,,
168
+ 0.0002238750457763672,8,1509,,,,,,,,,,,,,,,,,
169
+ 0.0003514289855957031,8,1519,,,,,,,,,,,,,,,,,
170
+ 0.380126953125,8,1529,,,,,,,,,,,,,,,,,
171
+ ,8,1538,0.47979027032852173,0.909709632396698,0.3897959291934967,0.6586207151412964,0.48974359035491943,0.48974359035491943,,,,,,,,,,,
172
+ ,8,1538,,,,,,,0.997061014175415,0.9981566667556763,0.994490385055542,0.9963201284408569,0.9963201284408569,,,,,,
173
+ 0.0142364501953125,9,1539,,,,,,,,,,,,,,,,,
174
+ 0.002254486083984375,9,1549,,,,,,,,,,,,,,,,,
175
+ 0.0031986236572265625,9,1559,,,,,,,,,,,,,,,,,
176
+ 0.000621795654296875,9,1569,,,,,,,,,,,,,,,,,
177
+ 0.0004968643188476562,9,1579,,,,,,,,,,,,,,,,,
178
+ 0.0007233619689941406,9,1589,,,,,,,,,,,,,,,,,
179
+ 0.0003178119659423828,9,1599,,,,,,,,,,,,,,,,,
180
+ 0.0004620552062988281,9,1609,,,,,,,,,,,,,,,,,
181
+ 0.0002932548522949219,9,1619,,,,,,,,,,,,,,,,,
182
+ 0.001575469970703125,9,1629,,,,,,,,,,,,,,,,,
183
+ 0.0008373260498046875,9,1639,,,,,,,,,,,,,,,,,
184
+ 0.011688232421875,9,1649,,,,,,,,,,,,,,,,,
185
+ 0.0089111328125,9,1659,,,,,,,,,,,,,,,,,
186
+ 0.00579071044921875,9,1669,,,,,,,,,,,,,,,,,
187
+ 0.0009889602661132812,9,1679,,,,,,,,,,,,,,,,,
188
+ 0.043060302734375,9,1689,,,,,,,,,,,,,,,,,
189
+ 0.0005102157592773438,9,1699,,,,,,,,,,,,,,,,,
190
+ 0.0002346038818359375,9,1709,,,,,,,,,,,,,,,,,
191
+ ,9,1709,0.6371660232543945,0.8752268552780151,0.30882352590560913,0.7241379022598267,0.4329896867275238,0.4329896867275238,,,,,,,,,,,
192
+ ,9,1709,,,,,,,0.9933872222900391,0.9899359345436096,0.9935720562934875,0.9917507171630859,0.9917507171630859,,,,,,
193
+ ,10,1710,,,,,,,,,,,,0.40940672159194946,0.8987295627593994,0.36324167251586914,0.6802167892456055,0.473584920167923,0.473584920167923
display_v3/2023-04-14_17-59-45/yes.txt ADDED
File without changes