Buckets:
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Pre-Training on SlimPajama-6B (Azure A100)\n", | |
| "\n", | |
| "**Model**: 154M Decoder-only Transformer (GQA + REPO-Attention + Flash-Attention) \n", | |
| "**Dataset**: SlimPajama-6B via Oxen \n", | |
| "**Tracking**: Weights & Biases" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Uncomment on Azure if needed\n", | |
| "# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121\n", | |
| "# !pip install oxen wandb transformers tokenizers pyarrow" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "PyTorch: 2.5.1+cu121\n", | |
| "CUDA: True\n", | |
| "GPU: NVIDIA A100 80GB PCIe\n", | |
| "VRAM: 85.0 GB\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import os, sys, time, math\n", | |
| "import torch\n", | |
| "import torch.nn as nn\n", | |
| "from torch.amp import autocast, GradScaler\n", | |
| "from torch.utils.data import DataLoader\n", | |
| "from datetime import datetime\n", | |
| "import wandb\n", | |
| "\n", | |
| "PROJECT_ROOT = os.path.abspath(\".\")\n", | |
| "if not os.path.exists(os.path.join(PROJECT_ROOT, \"transformer\")) and os.path.exists(os.path.join(PROJECT_ROOT, \"Transformers\", \"transformer\")):\n", | |
| " PROJECT_ROOT = os.path.join(PROJECT_ROOT, \"Transformers\")\n", | |
| "TRAIN_DIR = os.path.join(PROJECT_ROOT, \"train\")\n", | |
| "sys.path.insert(0, PROJECT_ROOT)\n", | |
| "sys.path.insert(0, TRAIN_DIR)\n", | |
| "\n", | |
| "from transformer.build_transformer import build_transformer\n", | |
| "from dataset_define import SlimPajamaDataset\n", | |
| "from save_checkpoint import save_checkpoint\n", | |
| "from tokenizer import tokenizer\n", | |
| "\n", | |
| "print(f\"PyTorch: {torch.__version__}\")\n", | |
| "print(f\"CUDA: {torch.cuda.is_available()}\")\n", | |
| "if torch.cuda.is_available():\n", | |
| " print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n", | |
| " print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB\")\n", | |
| "\n", | |
| "# Silence the '> 1024' warning — our model handles 2048 tokens\n", | |
| "tokenizer.model_max_length = 2048\n", | |
| "\n", | |
| "\n", | |
| "# Suppress HuggingFace 'sequence longer than' warning — our dataset chunks correctly\n", | |
| "import logging\n", | |
| "logging.getLogger('transformers.tokenization_utils_base').setLevel(logging.ERROR)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Vocab: 50260, Pad ID: 50257\n", | |
| "Tokens/step: 1,441,792\n", | |
| "Est steps/epoch: 4,161\n", | |
| "Total steps: 4,161\n", | |
| "Warmup: 200 steps\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== CONFIG ========================\n", | |
| "DATASET_DIR = os.path.join(PROJECT_ROOT, \"SlimPajama-6B\")\n", | |
| "CHECKPOINT_DIR = os.path.join(PROJECT_ROOT, \"checkpoints\")\n", | |
| "\n", | |
| "# Model\n", | |
| "D_MODEL = 768\n", | |
| "NUM_LAYERS = 12\n", | |
| "NUM_HEADS = 12\n", | |
| "KV_HEADS = 4\n", | |
| "D_FF = 3072\n", | |
| "DROPOUT = 0.1\n", | |
| "MAX_SEQ_LEN = 2048\n", | |
| "USE_REPO = True\n", | |
| "USE_FLASH = True\n", | |
| "\n", | |
| "# Training\n", | |
| "EPOCHS = 1\n", | |
| "BATCH_SIZE = 22\n", | |
| "GRAD_ACCUM = 32 # 4x larger effective batch\n", | |
| "LEARNING_RATE = 2e-4\n", | |
| "MIN_LR = 2e-5 # 10% of peak\n", | |
| "WEIGHT_DECAY = 0.01\n", | |
| "MAX_GRAD_NORM = 1.0\n", | |
| "WARMUP_STEPS = 200 # Short warmup so model starts learning fast\n", | |
| "\n", | |
| "# Estimated steps (for cosine schedule)\n", | |
| "TOKENS_IN_DATASET = 6_000_000_000\n", | |
| "TOKENS_PER_STEP = BATCH_SIZE * GRAD_ACCUM * MAX_SEQ_LEN\n", | |
| "EST_STEPS_PER_EPOCH = TOKENS_IN_DATASET // TOKENS_PER_STEP\n", | |
| "TOTAL_STEPS = EST_STEPS_PER_EPOCH * EPOCHS\n", | |
| "\n", | |
| "# WandB\n", | |
| "WANDB_PROJECT = \"Spedrox_llm\"\n", | |
| "USE_WANDB = True\n", | |
| "\n", | |
| "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", | |
| "VOCAB_SIZE = len(tokenizer)\n", | |
| "PAD_ID = tokenizer.pad_token_id\n", | |
| "\n", | |
| "print(f\"Vocab: {VOCAB_SIZE}, Pad ID: {PAD_ID}\")\n", | |
| "print(f\"Tokens/step: {TOKENS_PER_STEP:,}\")\n", | |
| "print(f\"Est steps/epoch: {EST_STEPS_PER_EPOCH:,}\")\n", | |
| "print(f\"Total steps: {TOTAL_STEPS:,}\")\n", | |
| "print(f\"Warmup: {WARMUP_STEPS} steps\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Dataset exists at /home/spedrox/Transformers/SlimPajama-6B\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== CLONE DATASET ========================\n", | |
| "import oxen\n", | |
| "\n", | |
| "if not os.path.exists(DATASET_DIR):\n", | |
| " print(\"Cloning SlimPajama-6B...\")\n", | |
| " oxen.clone(\"https://hub.oxen.ai/datasets/SlimPajama-6B\", DATASET_DIR)\n", | |
| " print(\"Done!\")\n", | |
| "else:\n", | |
| " print(f\"Dataset exists at {DATASET_DIR}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Parameters: 154,582,996 (154.6M)\n", | |
| "REPO: ON, Flash: ON\n", | |
| "[OK] RMSNorm gamma parameters are non-zero\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== BUILD MODEL ========================\n", | |
| "model = build_transformer(\n", | |
| " src_vocab_size=VOCAB_SIZE, tgt_vocab_size=VOCAB_SIZE,\n", | |
| " src_seq_len=MAX_SEQ_LEN, tgt_seq_len=MAX_SEQ_LEN,\n", | |
| " d_model=D_MODEL, N=NUM_LAYERS, h=NUM_HEADS, kv_h=KV_HEADS,\n", | |
| " dropout=DROPOUT, d_ff=D_FF, use_repo=USE_REPO, use_flash=USE_FLASH,\n", | |
| ")\n", | |
| "model = model.to(device)\n", | |
| "\n", | |
| "total_params = sum(p.numel() for p in model.parameters())\n", | |
| "print(f\"Parameters: {total_params:,} ({total_params/1e6:.1f}M)\")\n", | |
| "print(f\"REPO: {'ON' if USE_REPO else 'OFF'}, Flash: {'ON' if USE_FLASH else 'OFF'}\")\n", | |
| "\n", | |
| "# Verify RMSNorm gamma is NOT zero\n", | |
| "for name, p in model.named_parameters():\n", | |
| " if 'gamma' in name:\n", | |
| " assert p.abs().sum() > 0, f\"FATAL: {name} is all zeros!\"\n", | |
| "print(\"[OK] RMSNorm gamma parameters are non-zero\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Running sanity check...\n", | |
| "Output shape: torch.Size([2, 128, 50260])\n", | |
| "Output range: [-0.9149, 0.9226]\n", | |
| "Output std: 0.1738\n", | |
| "[OK] Sanity check passed\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== SANITY CHECK ========================\n", | |
| "print(\"Running sanity check...\")\n", | |
| "model.train()\n", | |
| "dummy = torch.randint(0, VOCAB_SIZE, (2, 128), device=device)\n", | |
| "\n", | |
| "with torch.no_grad():\n", | |
| " x = model.tgt_embed(dummy)\n", | |
| " for layer in model.decoder.layers:\n", | |
| " x, _ = layer(x, tgt_mask=None, use_cache=False)\n", | |
| " x = model.decoder.norm(x)\n", | |
| " logits = model.project(x)\n", | |
| "\n", | |
| "print(f\"Output shape: {logits.shape}\")\n", | |
| "print(f\"Output range: [{logits.min().item():.4f}, {logits.max().item():.4f}]\")\n", | |
| "print(f\"Output std: {logits.std().item():.4f}\")\n", | |
| "assert logits.std().item() > 0.01, \"FATAL: Model output has near-zero variance!\"\n", | |
| "print(\"[OK] Sanity check passed\")\n", | |
| "\n", | |
| "del dummy, x, logits\n", | |
| "torch.cuda.empty_cache()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Found 50 parquet files in /home/spedrox/Transformers/SlimPajama-6B\n", | |
| "DataLoader ready (batch=22, workers=16)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== DATASET & DATALOADER ========================\n", | |
| "train_dataset = SlimPajamaDataset(\n", | |
| " data_dir=DATASET_DIR, tokenizer=tokenizer, max_length=MAX_SEQ_LEN,\n", | |
| ")\n", | |
| "train_loader = DataLoader(\n", | |
| " train_dataset, batch_size=BATCH_SIZE,\n", | |
| " num_workers=16, pin_memory=True, prefetch_factor=2,\n", | |
| ")\n", | |
| "print(f\"DataLoader ready (batch={BATCH_SIZE}, workers=16)\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n", | |
| "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n", | |
| "\u001b[34m\u001b[1mwandb\u001b[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.\n", | |
| "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /home/spedrox/.netrc\n", | |
| "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mdinmaybrahmaofficial\u001b[0m (\u001b[33mdinmaybrahmaofficial-indian-institute-of-technology\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "Tracking run with wandb version 0.25.1" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "Run data is saved locally in <code>/home/spedrox/wandb/run-20260403_113443-4f6hux31</code>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "Syncing run <strong><a href='https://wandb.ai/dinmaybrahmaofficial-indian-institute-of-technology/Spedrox_llm/runs/4f6hux31' target=\"_blank\">slimpajama_0403_1134</a></strong> to <a href='https://wandb.ai/dinmaybrahmaofficial-indian-institute-of-technology/Spedrox_llm' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| " View project at <a href='https://wandb.ai/dinmaybrahmaofficial-indian-institute-of-technology/Spedrox_llm' target=\"_blank\">https://wandb.ai/dinmaybrahmaofficial-indian-institute-of-technology/Spedrox_llm</a>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| " View run at <a href='https://wandb.ai/dinmaybrahmaofficial-indian-institute-of-technology/Spedrox_llm/runs/4f6hux31' target=\"_blank\">https://wandb.ai/dinmaybrahmaofficial-indian-institute-of-technology/Spedrox_llm/runs/4f6hux31</a>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "WandB: https://wandb.ai/dinmaybrahmaofficial-indian-institute-of-technology/Spedrox_llm/runs/4f6hux31\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== WANDB ========================\n", | |
| "wandb.login(key=\"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\")\n", | |
| "\n", | |
| "if USE_WANDB:\n", | |
| " wandb.init(\n", | |
| " project=WANDB_PROJECT,\n", | |
| " name=f\"slimpajama_{datetime.now().strftime('%m%d_%H%M')}\",\n", | |
| " config={\n", | |
| " \"model\": \"decoder_only_transformer\",\n", | |
| " \"params\": total_params,\n", | |
| " \"d_model\": D_MODEL, \"layers\": NUM_LAYERS,\n", | |
| " \"heads\": NUM_HEADS, \"kv_heads\": KV_HEADS,\n", | |
| " \"d_ff\": D_FF, \"seq_len\": MAX_SEQ_LEN,\n", | |
| " \"batch_size\": BATCH_SIZE, \"grad_accum\": GRAD_ACCUM,\n", | |
| " \"effective_batch\": BATCH_SIZE * GRAD_ACCUM,\n", | |
| " \"lr\": LEARNING_RATE, \"min_lr\": MIN_LR,\n", | |
| " \"warmup\": WARMUP_STEPS, \"total_steps\": TOTAL_STEPS,\n", | |
| " \"dataset\": \"SlimPajama-6B\",\n", | |
| " \"features\": [\"GQA\", \"REPO-Attention\", \"Flash-Attention\", \"RMSNorm\"],\n", | |
| " },\n", | |
| " tags=[\"pre-training\", \"slimpajama\", \"a100\"]\n", | |
| " )\n", | |
| " wandb.watch(model, log=\"all\", log_freq=200)\n", | |
| " print(f\"WandB: {wandb.run.url}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Original schedule (end):\n", | |
| " Step 3961: LR = 0.00002113\n", | |
| " Step 4061: LR = 0.00002028\n", | |
| " Step 4160: LR = 0.00002000\n", | |
| "\n", | |
| "Warm restart schedule:\n", | |
| " Step 4161: LR = 0.00006000\n", | |
| " Step 4186: LR = 0.00059992\n", | |
| " Step 4211: LR = 0.00059946\n", | |
| " Step 4361: LR = 0.00058794\n", | |
| " Step 4661: LR = 0.00052317\n", | |
| " Step 5161: LR = 0.00033213\n", | |
| " Step 5661: LR = 0.00013984\n", | |
| " Step 6161: LR = 0.00006000\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== LR SCHEDULE (Cosine Warm Restart) ========================\n", | |
| "# Phase 1 was the original cosine schedule (already done).\n", | |
| "# Phase 2: Warm-restart with a new mini-cycle to break through the 3.3 plateau.\n", | |
| "\n", | |
| "RESTART_LR = 6e-4 # Pushed harder (was 1e-4) # Lower than original 2e-4 since model is partially trained\n", | |
| "RESTART_MIN = 6e-5 # Pushed harder floor # Decay down to this\n", | |
| "RESTART_WARMUP = 10 # Fast ramp-up # Quick warmup (model is already warmed)\n", | |
| "RESTART_CYCLE = 2000 # New cosine cycle length\n", | |
| "\n", | |
| "def get_lr(step):\n", | |
| " \"\"\"Cosine warm restart schedule.\"\"\"\n", | |
| " if step < WARMUP_STEPS:\n", | |
| " # Original warmup (won't be used on resume, but kept for safety)\n", | |
| " return LEARNING_RATE * (step + 1) / WARMUP_STEPS\n", | |
| "\n", | |
| " # If we're past the original schedule, use warm restart\n", | |
| " if step >= TOTAL_STEPS:\n", | |
| " restart_step = step - TOTAL_STEPS\n", | |
| " if restart_step < RESTART_WARMUP:\n", | |
| " return RESTART_LR * (restart_step + 1) / RESTART_WARMUP\n", | |
| " progress = (restart_step - RESTART_WARMUP) / max(1, RESTART_CYCLE - RESTART_WARMUP)\n", | |
| " progress = min(progress, 1.0)\n", | |
| " cosine = 0.5 * (1.0 + math.cos(math.pi * progress))\n", | |
| " return RESTART_MIN + (RESTART_LR - RESTART_MIN) * cosine\n", | |
| "\n", | |
| " # Original cosine decay\n", | |
| " progress = (step - WARMUP_STEPS) / max(1, TOTAL_STEPS - WARMUP_STEPS)\n", | |
| " progress = min(progress, 1.0)\n", | |
| " cosine = 0.5 * (1.0 + math.cos(math.pi * progress))\n", | |
| " return MIN_LR + (LEARNING_RATE - MIN_LR) * cosine\n", | |
| "\n", | |
| "# Preview the restart schedule\n", | |
| "print(\"Original schedule (end):\")\n", | |
| "for s in [TOTAL_STEPS - 200, TOTAL_STEPS - 100, TOTAL_STEPS - 1]:\n", | |
| " print(f\" Step {s:>6}: LR = {get_lr(s):.8f}\")\n", | |
| "print(\"\\nWarm restart schedule:\")\n", | |
| "for s in [TOTAL_STEPS, TOTAL_STEPS + 25, TOTAL_STEPS + 50, TOTAL_STEPS + 200, TOTAL_STEPS + 500, TOTAL_STEPS + 1000, TOTAL_STEPS + 1500, TOTAL_STEPS + 2000]:\n", | |
| " print(f\" Step {s:>6}: LR = {get_lr(s):.8f}\")\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Params with weight decay : 115,867,776\n", | |
| "Params without weight decay: 38,715,220\n", | |
| "Mixed precision: BF16\n", | |
| "GradScaler: OFF (BF16)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== OPTIMIZER ========================\n", | |
| "\n", | |
| "os.makedirs(CHECKPOINT_DIR, exist_ok=True)\n", | |
| "\n", | |
| "# Separate param groups: NO weight decay on norms, biases, embeddings\n", | |
| "decay_params = []\n", | |
| "no_decay_params = []\n", | |
| "for name, p in model.named_parameters():\n", | |
| " if not p.requires_grad:\n", | |
| " continue\n", | |
| " if p.dim() == 1 or \"norm\" in name or \"gamma\" in name or \"bias\" in name or \"embed\" in name:\n", | |
| " no_decay_params.append(p)\n", | |
| " else:\n", | |
| " decay_params.append(p)\n", | |
| "\n", | |
| "print(f\"Params with weight decay : {sum(p.numel() for p in decay_params):,}\")\n", | |
| "print(f\"Params without weight decay: {sum(p.numel() for p in no_decay_params):,}\")\n", | |
| "\n", | |
| "optimizer = torch.optim.AdamW([\n", | |
| " {\"params\": decay_params, \"weight_decay\": WEIGHT_DECAY},\n", | |
| " {\"params\": no_decay_params, \"weight_decay\": 0.0},\n", | |
| "], lr=LEARNING_RATE, betas=(0.9, 0.95))\n", | |
| "\n", | |
| "# BF16 on A100 (no GradScaler needed for BF16)\n", | |
| "use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()\n", | |
| "amp_dtype = torch.bfloat16 if use_bf16 else torch.float16\n", | |
| "scaler = GradScaler(enabled=(not use_bf16)) # Only needed for FP16\n", | |
| "\n", | |
| "print(f\"Mixed precision: {'BF16' if use_bf16 else 'FP16'}\")\n", | |
| "print(f\"GradScaler: {'OFF (BF16)' if use_bf16 else 'ON (FP16)'}\")\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Available checkpoints:\n", | |
| " auto_epoch1_step9906.pt (1855 MB)\n", | |
| " auto_epoch1_step8256.pt (1855 MB)\n", | |
| " auto_epoch1_step6605.pt (1855 MB)\n", | |
| " auto_epoch1_step4954.pt (1855 MB)\n", | |
| " auto_epoch1_step3302.pt (1855 MB)\n", | |
| "\n", | |
| "Resuming from: /home/spedrox/Transformers/checkpoints/auto_epoch1_step9906.pt\n", | |
| "Model weights restored. Starting WARM RESTART.\n", | |
| "global_step = 4161 (= TOTAL_STEPS, triggers restart)\n", | |
| "LR = 0.00006000 (beginning of restart warmup)\n", | |
| "Restart cycle: 2000 steps\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== RESUME FROM CHECKPOINT ========================\n", | |
| "import glob\n", | |
| "\n", | |
| "ckpt_files = glob.glob(os.path.join(CHECKPOINT_DIR, '*.pt'))\n", | |
| "if ckpt_files:\n", | |
| " ckpt_files.sort(key=os.path.getmtime, reverse=True)\n", | |
| " print('Available checkpoints:')\n", | |
| " for f in ckpt_files[:5]:\n", | |
| " print(f' {os.path.basename(f)} ({os.path.getsize(f)/1e6:.0f} MB)')\n", | |
| "\n", | |
| " RESUME_PATH = ckpt_files[0]\n", | |
| " print(f'\\nResuming from: {RESUME_PATH}')\n", | |
| "\n", | |
| " ckpt = torch.load(RESUME_PATH, map_location=device, weights_only=False)\n", | |
| " model.load_state_dict(ckpt['model_state_dict'])\n", | |
| " optimizer.load_state_dict(ckpt['optimizer_state_dict'])\n", | |
| " best_loss = ckpt.get('best_loss', float('inf'))\n", | |
| "\n", | |
| " # FORCE warm restart: set global_step to exactly TOTAL_STEPS\n", | |
| " # so get_lr() sees restart_step=0 and begins the fresh cosine cycle\n", | |
| " global_step = TOTAL_STEPS\n", | |
| " lr = get_lr(global_step)\n", | |
| " for pg in optimizer.param_groups:\n", | |
| " pg['lr'] = lr\n", | |
| "\n", | |
| " print(f'Model weights restored. Starting WARM RESTART.')\n", | |
| " print(f'global_step = {global_step} (= TOTAL_STEPS, triggers restart)')\n", | |
| " print(f'LR = {lr:.8f} (beginning of restart warmup)')\n", | |
| " print(f'Restart cycle: {RESTART_CYCLE} steps')\n", | |
| " del ckpt\n", | |
| " torch.cuda.empty_cache()\n", | |
| "else:\n", | |
| " print('No checkpoints found!')\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Continuing from step 4161...\n", | |
| "\n", | |
| "Starting training...\n", | |
| " Epochs: 1, Batch: 22, Accum: 32\n", | |
| " Effective batch: 704\n", | |
| " Peak LR: 0.0002, Warmup: 200 steps\n", | |
| " Est total steps: 4,161\n", | |
| "\n", | |
| "E1 B 0 | loss=3.2393 | lr=0.000060 | step=4161 | 7s | 12.0GB\n", | |
| "E1 B 20 | loss=3.4637 | lr=0.000060 | step=4161 | 19s | 12.0GB\n", | |
| "E1 B 40 | loss=3.8487 | lr=0.000120 | step=4162 | 30s | 12.0GB\n", | |
| "E1 B 60 | loss=3.2247 | lr=0.000120 | step=4162 | 41s | 12.0GB\n", | |
| "E1 B 80 | loss=3.9864 | lr=0.000180 | step=4163 | 52s | 12.0GB\n", | |
| "E1 B 100 | loss=3.7597 | lr=0.000240 | step=4164 | 62s | 12.0GB\n", | |
| "E1 B 120 | loss=3.8237 | lr=0.000240 | step=4164 | 74s | 12.0GB\n", | |
| "E1 B 140 | loss=3.5014 | lr=0.000300 | step=4165 | 84s | 12.0GB\n", | |
| "E1 B 160 | loss=3.3559 | lr=0.000360 | step=4166 | 95s | 12.0GB\n", | |
| "E1 B 180 | loss=3.5242 | lr=0.000360 | step=4166 | 106s | 12.0GB\n", | |
| "E1 B 200 | loss=3.7209 | lr=0.000420 | step=4167 | 117s | 12.0GB\n", | |
| "E1 B 220 | loss=3.7207 | lr=0.000420 | step=4167 | 128s | 12.0GB\n", | |
| "E1 B 240 | loss=3.5746 | lr=0.000480 | step=4168 | 139s | 12.0GB\n", | |
| "E1 B 260 | loss=3.8666 | lr=0.000540 | step=4169 | 150s | 12.0GB\n", | |
| "E1 B 280 | loss=3.7142 | lr=0.000540 | step=4169 | 160s | 12.0GB\n", | |
| "E1 B 300 | loss=4.0608 | lr=0.000600 | step=4170 | 171s | 12.0GB\n", | |
| "E1 B 320 | loss=4.1418 | lr=0.000600 | step=4171 | 182s | 12.0GB\n", | |
| "E1 B 340 | loss=3.6602 | lr=0.000600 | step=4171 | 193s | 12.0GB\n", | |
| "E1 B 360 | loss=3.6876 | lr=0.000600 | step=4172 | 204s | 12.0GB\n", | |
| "E1 B 380 | loss=3.9618 | lr=0.000600 | step=4172 | 215s | 12.0GB\n", | |
| "E1 B 400 | loss=3.6377 | lr=0.000600 | step=4173 | 225s | 12.0GB\n", | |
| "E1 B 420 | loss=3.5309 | lr=0.000600 | step=4174 | 237s | 12.0GB\n", | |
| "E1 B 440 | loss=3.8527 | lr=0.000600 | step=4174 | 247s | 12.0GB\n", | |
| "E1 B 460 | loss=3.6355 | lr=0.000600 | step=4175 | 258s | 12.0GB\n", | |
| "E1 B 480 | loss=4.1892 | lr=0.000600 | step=4176 | 269s | 12.0GB\n", | |
| "E1 B 500 | loss=4.0167 | lr=0.000600 | step=4176 | 280s | 12.0GB\n", | |
| "E1 B 520 | loss=4.2543 | lr=0.000600 | step=4177 | 291s | 12.0GB\n", | |
| "E1 B 540 | loss=4.3476 | lr=0.000600 | step=4177 | 302s | 12.0GB\n", | |
| "E1 B 560 | loss=3.7142 | lr=0.000600 | step=4178 | 313s | 12.0GB\n", | |
| "E1 B 580 | loss=4.1953 | lr=0.000600 | step=4179 | 323s | 12.0GB\n", | |
| "E1 B 600 | loss=3.8647 | lr=0.000600 | step=4179 | 334s | 12.0GB\n", | |
| "E1 B 620 | loss=3.9434 | lr=0.000600 | step=4180 | 345s | 12.0GB\n", | |
| "E1 B 640 | loss=3.9713 | lr=0.000600 | step=4181 | 356s | 12.0GB\n", | |
| "E1 B 660 | loss=4.1566 | lr=0.000600 | step=4181 | 367s | 12.0GB\n", | |
| "E1 B 680 | loss=3.5517 | lr=0.000600 | step=4182 | 378s | 12.0GB\n", | |
| "E1 B 700 | loss=3.8292 | lr=0.000600 | step=4182 | 388s | 12.0GB\n", | |
| "E1 B 720 | loss=4.0472 | lr=0.000600 | step=4183 | 399s | 12.0GB\n", | |
| "E1 B 740 | loss=3.2511 | lr=0.000600 | step=4184 | 410s | 12.0GB\n", | |
| "E1 B 760 | loss=3.8495 | lr=0.000600 | step=4184 | 421s | 12.0GB\n", | |
| "E1 B 780 | loss=3.6649 | lr=0.000600 | step=4185 | 432s | 12.0GB\n", | |
| "E1 B 800 | loss=4.3933 | lr=0.000600 | step=4186 | 443s | 12.0GB\n", | |
| "E1 B 820 | loss=4.1908 | lr=0.000600 | step=4186 | 454s | 12.0GB\n", | |
| "E1 B 840 | loss=3.5818 | lr=0.000600 | step=4187 | 464s | 12.0GB\n", | |
| "E1 B 860 | loss=4.0422 | lr=0.000600 | step=4187 | 475s | 12.0GB\n", | |
| "E1 B 880 | loss=3.6796 | lr=0.000600 | step=4188 | 486s | 12.0GB\n", | |
| "E1 B 900 | loss=3.7878 | lr=0.000600 | step=4189 | 497s | 12.0GB\n", | |
| "E1 B 920 | loss=3.7404 | lr=0.000600 | step=4189 | 508s | 12.0GB\n", | |
| "E1 B 940 | loss=3.6401 | lr=0.000600 | step=4190 | 519s | 12.0GB\n", | |
| "E1 B 960 | loss=3.4780 | lr=0.000600 | step=4191 | 530s | 12.0GB\n", | |
| "E1 B 980 | loss=3.5547 | lr=0.000600 | step=4191 | 541s | 12.0GB\n", | |
| "E1 B 1000 | loss=3.8773 | lr=0.000600 | step=4192 | 551s | 12.0GB\n", | |
| "E1 B 1020 | loss=4.2421 | lr=0.000600 | step=4192 | 562s | 12.0GB\n", | |
| "E1 B 1040 | loss=3.3708 | lr=0.000600 | step=4193 | 573s | 12.0GB\n", | |
| "E1 B 1060 | loss=4.0438 | lr=0.000600 | step=4194 | 584s | 12.0GB\n", | |
| "E1 B 1080 | loss=3.5210 | lr=0.000600 | step=4194 | 595s | 12.0GB\n", | |
| "E1 B 1100 | loss=4.0428 | lr=0.000600 | step=4195 | 606s | 12.0GB\n", | |
| "E1 B 1120 | loss=3.4459 | lr=0.000600 | step=4196 | 617s | 12.0GB\n", | |
| "E1 B 1140 | loss=3.4632 | lr=0.000600 | step=4196 | 627s | 12.0GB\n", | |
| "E1 B 1160 | loss=3.9588 | lr=0.000600 | step=4197 | 638s | 12.0GB\n", | |
| "E1 B 1180 | loss=3.6564 | lr=0.000600 | step=4197 | 649s | 12.0GB\n", | |
| "E1 B 1200 | loss=3.4086 | lr=0.000600 | step=4198 | 660s | 12.0GB\n", | |
| "E1 B 1220 | loss=3.7761 | lr=0.000600 | step=4199 | 671s | 12.0GB\n", | |
| "E1 B 1240 | loss=3.6578 | lr=0.000600 | step=4199 | 681s | 12.0GB\n", | |
| "E1 B 1260 | loss=3.9621 | lr=0.000600 | step=4200 | 692s | 12.0GB\n", | |
| "E1 B 1280 | loss=3.8570 | lr=0.000600 | step=4201 | 703s | 12.0GB\n", | |
| "E1 B 1300 | loss=3.6209 | lr=0.000600 | step=4201 | 714s | 12.0GB\n", | |
| "E1 B 1320 | loss=3.9640 | lr=0.000600 | step=4202 | 725s | 12.0GB\n", | |
| "E1 B 1340 | loss=3.7827 | lr=0.000600 | step=4202 | 736s | 12.0GB\n", | |
| "E1 B 1360 | loss=3.8991 | lr=0.000600 | step=4203 | 747s | 12.0GB\n", | |
| "E1 B 1380 | loss=3.6718 | lr=0.000600 | step=4204 | 757s | 12.0GB\n", | |
| "E1 B 1400 | loss=3.4916 | lr=0.000600 | step=4204 | 768s | 12.0GB\n", | |
| "E1 B 1420 | loss=3.8652 | lr=0.000600 | step=4205 | 779s | 12.0GB\n", | |
| "E1 B 1440 | loss=3.7964 | lr=0.000600 | step=4206 | 790s | 12.0GB\n", | |
| "E1 B 1460 | loss=3.6250 | lr=0.000600 | step=4206 | 801s | 12.0GB\n", | |
| "E1 B 1480 | loss=3.6291 | lr=0.000600 | step=4207 | 812s | 12.0GB\n", | |
| "E1 B 1500 | loss=3.8778 | lr=0.000600 | step=4207 | 823s | 12.0GB\n", | |
| "E1 B 1520 | loss=3.7780 | lr=0.000600 | step=4208 | 834s | 12.0GB\n", | |
| "E1 B 1540 | loss=3.7162 | lr=0.000600 | step=4209 | 844s | 12.0GB\n", | |
| "E1 B 1560 | loss=3.9384 | lr=0.000600 | step=4209 | 855s | 12.0GB\n", | |
| "E1 B 1580 | loss=4.1989 | lr=0.000599 | step=4210 | 866s | 12.0GB\n", | |
| "E1 B 1600 | loss=3.5106 | lr=0.000599 | step=4211 | 877s | 12.0GB\n", | |
| "E1 B 1620 | loss=3.7452 | lr=0.000599 | step=4211 | 888s | 12.0GB\n", | |
| "E1 B 1640 | loss=3.9518 | lr=0.000599 | step=4212 | 899s | 12.0GB\n", | |
| "E1 B 1660 | loss=3.4435 | lr=0.000599 | step=4212 | 910s | 12.0GB\n", | |
| "E1 B 1680 | loss=3.6866 | lr=0.000599 | step=4213 | 920s | 12.0GB\n", | |
| "E1 B 1700 | loss=3.8041 | lr=0.000599 | step=4214 | 931s | 12.0GB\n", | |
| "E1 B 1720 | loss=3.9648 | lr=0.000599 | step=4214 | 942s | 12.0GB\n", | |
| "E1 B 1740 | loss=3.4477 | lr=0.000599 | step=4215 | 953s | 12.0GB\n", | |
| "E1 B 1760 | loss=4.0275 | lr=0.000599 | step=4216 | 964s | 12.0GB\n", | |
| "E1 B 1780 | loss=3.4429 | lr=0.000599 | step=4216 | 975s | 12.0GB\n", | |
| "E1 B 1800 | loss=4.1383 | lr=0.000599 | step=4217 | 986s | 12.0GB\n", | |
| "E1 B 1820 | loss=3.5344 | lr=0.000599 | step=4217 | 997s | 12.0GB\n", | |
| "E1 B 1840 | loss=3.7222 | lr=0.000599 | step=4218 | 1008s | 12.0GB\n", | |
| "E1 B 1860 | loss=3.4233 | lr=0.000599 | step=4219 | 1019s | 12.0GB\n", | |
| "E1 B 1880 | loss=3.9298 | lr=0.000599 | step=4219 | 1029s | 12.0GB\n", | |
| "E1 B 1900 | loss=3.8500 | lr=0.000599 | step=4220 | 1040s | 12.0GB\n", | |
| "E1 B 1920 | loss=4.2440 | lr=0.000599 | step=4221 | 1051s | 12.0GB\n", | |
| "E1 B 1940 | loss=3.5291 | lr=0.000599 | step=4221 | 1062s | 12.0GB\n", | |
| "E1 B 1960 | loss=3.9739 | lr=0.000599 | step=4222 | 1073s | 12.0GB\n", | |
| "E1 B 1980 | loss=3.5322 | lr=0.000599 | step=4222 | 1084s | 12.0GB\n", | |
| "E1 B 2000 | loss=3.9469 | lr=0.000599 | step=4223 | 1094s | 12.0GB\n", | |
| "E1 B 2020 | loss=2.7852 | lr=0.000599 | step=4224 | 1105s | 12.0GB\n", | |
| "E1 B 2040 | loss=3.0960 | lr=0.000599 | step=4224 | 1116s | 12.0GB\n", | |
| "E1 B 2060 | loss=3.8916 | lr=0.000599 | step=4225 | 1127s | 12.0GB\n", | |
| "E1 B 2080 | loss=3.2364 | lr=0.000599 | step=4226 | 1138s | 12.0GB\n", | |
| "E1 B 2100 | loss=3.3543 | lr=0.000599 | step=4226 | 1149s | 12.0GB\n", | |
| "E1 B 2120 | loss=3.6049 | lr=0.000599 | step=4227 | 1160s | 12.0GB\n", | |
| "E1 B 2140 | loss=3.5488 | lr=0.000599 | step=4227 | 1171s | 12.0GB\n", | |
| "E1 B 2160 | loss=3.7135 | lr=0.000599 | step=4228 | 1182s | 12.0GB\n", | |
| "E1 B 2180 | loss=3.9285 | lr=0.000599 | step=4229 | 1192s | 12.0GB\n", | |
| "E1 B 2200 | loss=3.5403 | lr=0.000599 | step=4229 | 1203s | 12.0GB\n", | |
| "E1 B 2220 | loss=3.7862 | lr=0.000599 | step=4230 | 1215s | 12.0GB\n", | |
| "E1 B 2240 | loss=3.6874 | lr=0.000599 | step=4231 | 1225s | 12.0GB\n", | |
| "E1 B 2260 | loss=3.1078 | lr=0.000599 | step=4231 | 1237s | 12.0GB\n", | |
| "E1 B 2280 | loss=4.0781 | lr=0.000599 | step=4232 | 1247s | 12.0GB\n", | |
| "E1 B 2300 | loss=3.8102 | lr=0.000599 | step=4232 | 1258s | 12.0GB\n", | |
| "E1 B 2320 | loss=3.7388 | lr=0.000599 | step=4233 | 1269s | 12.0GB\n", | |
| "E1 B 2340 | loss=3.2348 | lr=0.000599 | step=4234 | 1280s | 12.0GB\n", | |
| "E1 B 2360 | loss=3.8346 | lr=0.000599 | step=4234 | 1291s | 12.0GB\n", | |
| "E1 B 2380 | loss=3.7181 | lr=0.000599 | step=4235 | 1302s | 12.0GB\n", | |
| "E1 B 2400 | loss=3.9881 | lr=0.000599 | step=4236 | 1313s | 12.0GB\n", | |
| "E1 B 2420 | loss=3.5285 | lr=0.000599 | step=4236 | 1324s | 12.0GB\n", | |
| "E1 B 2440 | loss=4.1913 | lr=0.000599 | step=4237 | 1334s | 12.0GB\n", | |
| "E1 B 2460 | loss=3.2704 | lr=0.000599 | step=4237 | 1345s | 12.0GB\n", | |
| "E1 B 2480 | loss=3.8462 | lr=0.000598 | step=4238 | 1356s | 12.0GB\n", | |
| "E1 B 2500 | loss=2.7529 | lr=0.000598 | step=4239 | 1367s | 12.0GB\n", | |
| "E1 B 2520 | loss=3.7160 | lr=0.000598 | step=4239 | 1378s | 12.0GB\n", | |
| "E1 B 2540 | loss=3.7637 | lr=0.000598 | step=4240 | 1389s | 12.0GB\n", | |
| "E1 B 2560 | loss=3.2431 | lr=0.000598 | step=4241 | 1400s | 12.0GB\n", | |
| "E1 B 2580 | loss=4.1330 | lr=0.000598 | step=4241 | 1410s | 12.0GB\n", | |
| "E1 B 2600 | loss=3.6478 | lr=0.000598 | step=4242 | 1421s | 12.0GB\n", | |
| "E1 B 3380 | loss=4.2498 | lr=0.000597 | step=4266 | 1847s | 12.0GB\n", | |
| "E1 B 3400 | loss=3.2623 | lr=0.000597 | step=4267 | 1857s | 12.0GB\n", | |
| "E1 B 3420 | loss=4.0463 | lr=0.000597 | step=4267 | 1868s | 12.0GB\n", | |
| "E1 B 3440 | loss=3.7635 | lr=0.000597 | step=4268 | 1879s | 12.0GB\n", | |
| "E1 B 3460 | loss=3.8957 | lr=0.000597 | step=4269 | 1890s | 12.0GB\n", | |
| "E1 B 3480 | loss=3.1938 | lr=0.000597 | step=4269 | 1901s | 12.0GB\n", | |
| "E1 B 3500 | loss=3.6709 | lr=0.000597 | step=4270 | 1912s | 12.0GB\n", | |
| "E1 B 3520 | loss=3.7140 | lr=0.000597 | step=4271 | 1923s | 12.0GB\n", | |
| "E1 B 3540 | loss=4.1297 | lr=0.000597 | step=4271 | 1934s | 12.0GB\n", | |
| "E1 B 3560 | loss=3.4378 | lr=0.000597 | step=4272 | 1945s | 12.0GB\n", | |
| "E1 B 3580 | loss=3.1454 | lr=0.000597 | step=4272 | 1955s | 12.0GB\n", | |
| "E1 B 3600 | loss=3.6922 | lr=0.000597 | step=4273 | 1966s | 12.0GB\n", | |
| "E1 B 3620 | loss=3.2862 | lr=0.000596 | step=4274 | 1977s | 12.0GB\n", | |
| "E1 B 3640 | loss=3.8259 | lr=0.000596 | step=4274 | 1988s | 12.0GB\n", | |
| "E1 B 3660 | loss=3.9424 | lr=0.000596 | step=4275 | 1999s | 12.0GB\n", | |
| "E1 B 3680 | loss=3.7049 | lr=0.000596 | step=4276 | 2010s | 12.0GB\n", | |
| "E1 B 3700 | loss=3.9472 | lr=0.000596 | step=4276 | 2021s | 12.0GB\n", | |
| "E1 B 3720 | loss=3.5500 | lr=0.000596 | step=4277 | 2032s | 12.0GB\n", | |
| "E1 B 3740 | loss=3.9042 | lr=0.000596 | step=4277 | 2042s | 12.0GB\n", | |
| "E1 B 3760 | loss=3.7923 | lr=0.000596 | step=4278 | 2053s | 12.0GB\n", | |
| "E1 B 3780 | loss=3.6776 | lr=0.000596 | step=4279 | 2064s | 12.0GB\n", | |
| "E1 B 3800 | loss=3.9467 | lr=0.000596 | step=4279 | 2075s | 12.0GB\n", | |
| "E1 B 3820 | loss=3.2125 | lr=0.000596 | step=4280 | 2086s | 12.0GB\n", | |
| "E1 B 3840 | loss=3.9384 | lr=0.000596 | step=4281 | 2097s | 12.0GB\n", | |
| "E1 B 3860 | loss=3.7973 | lr=0.000596 | step=4281 | 2108s | 12.0GB\n", | |
| "E1 B 3880 | loss=4.1037 | lr=0.000596 | step=4282 | 2119s | 12.0GB\n", | |
| "E1 B 3900 | loss=3.8967 | lr=0.000596 | step=4282 | 2129s | 12.0GB\n", | |
| "E1 B 3920 | loss=3.7767 | lr=0.000596 | step=4283 | 2140s | 12.0GB\n", | |
| "E1 B 3940 | loss=4.1352 | lr=0.000596 | step=4284 | 2151s | 12.0GB\n", | |
| "E1 B 3960 | loss=3.7326 | lr=0.000596 | step=4284 | 2162s | 12.0GB\n", | |
| "E1 B 3980 | loss=3.9891 | lr=0.000596 | step=4285 | 2173s | 12.0GB\n", | |
| "E1 B 4000 | loss=3.4817 | lr=0.000596 | step=4286 | 2184s | 12.0GB\n", | |
| "E1 B 4020 | loss=3.5175 | lr=0.000596 | step=4286 | 2195s | 12.0GB\n", | |
| "E1 B 4040 | loss=4.1090 | lr=0.000595 | step=4287 | 2206s | 12.0GB\n", | |
| "E1 B 4060 | loss=3.4153 | lr=0.000595 | step=4287 | 2217s | 12.0GB\n", | |
| "E1 B 4080 | loss=3.7458 | lr=0.000595 | step=4288 | 2227s | 12.0GB\n", | |
| "E1 B 4100 | loss=3.2143 | lr=0.000595 | step=4289 | 2238s | 12.0GB\n", | |
| "E1 B 4120 | loss=3.7533 | lr=0.000595 | step=4289 | 2249s | 12.0GB\n", | |
| "E1 B 4140 | loss=3.1566 | lr=0.000595 | step=4290 | 2260s | 12.0GB\n", | |
| "E1 B 4160 | loss=3.7368 | lr=0.000595 | step=4291 | 2271s | 12.0GB\n", | |
| "E1 B 4180 | loss=3.2834 | lr=0.000595 | step=4291 | 2282s | 12.0GB\n", | |
| "E1 B 4200 | loss=3.6542 | lr=0.000595 | step=4292 | 2293s | 12.0GB\n", | |
| "E1 B 4220 | loss=4.0738 | lr=0.000595 | step=4292 | 2304s | 12.0GB\n", | |
| "E1 B 4240 | loss=4.2073 | lr=0.000595 | step=4293 | 2315s | 12.0GB\n", | |
| "E1 B 4260 | loss=3.8717 | lr=0.000595 | step=4294 | 2326s | 12.0GB\n", | |
| "E1 B 4280 | loss=3.5919 | lr=0.000595 | step=4294 | 2337s | 12.0GB\n", | |
| "E1 B 4300 | loss=4.0324 | lr=0.000595 | step=4295 | 2347s | 12.0GB\n", | |
| "E1 B 4320 | loss=3.8404 | lr=0.000595 | step=4296 | 2359s | 12.0GB\n", | |
| "E1 B 4340 | loss=3.4561 | lr=0.000595 | step=4296 | 2369s | 12.0GB\n", | |
| "E1 B 4360 | loss=3.4775 | lr=0.000595 | step=4297 | 2380s | 12.0GB\n", | |
| "E1 B 4380 | loss=4.1283 | lr=0.000595 | step=4297 | 2391s | 12.0GB\n", | |
| "E1 B 4400 | loss=3.7408 | lr=0.000595 | step=4298 | 2402s | 12.0GB\n", | |
| "E1 B 4420 | loss=3.5278 | lr=0.000595 | step=4299 | 2413s | 12.0GB\n", | |
| "E1 B 4440 | loss=3.7521 | lr=0.000595 | step=4299 | 2424s | 12.0GB\n", | |
| "E1 B 4460 | loss=4.2358 | lr=0.000594 | step=4300 | 2435s | 12.0GB\n", | |
| "E1 B 4480 | loss=3.8559 | lr=0.000594 | step=4301 | 2446s | 12.0GB\n", | |
| "E1 B 4500 | loss=3.5122 | lr=0.000594 | step=4301 | 2456s | 12.0GB\n", | |
| "E1 B 4520 | loss=3.8501 | lr=0.000594 | step=4302 | 2467s | 12.0GB\n", | |
| "E1 B 4540 | loss=3.6822 | lr=0.000594 | step=4302 | 2478s | 12.0GB\n", | |
| "E1 B 4560 | loss=3.3621 | lr=0.000594 | step=4303 | 2489s | 12.0GB\n", | |
| "E1 B 4580 | loss=3.5062 | lr=0.000594 | step=4304 | 2500s | 12.0GB\n", | |
| "E1 B 4600 | loss=3.0757 | lr=0.000594 | step=4304 | 2511s | 12.0GB\n", | |
| "E1 B 4620 | loss=4.0975 | lr=0.000594 | step=4305 | 2522s | 12.0GB\n", | |
| "E1 B 4640 | loss=3.6449 | lr=0.000594 | step=4306 | 2533s | 12.0GB\n", | |
| "E1 B 4660 | loss=3.5956 | lr=0.000594 | step=4306 | 2544s | 12.0GB\n", | |
| "E1 B 4680 | loss=3.4892 | lr=0.000594 | step=4307 | 2555s | 12.0GB\n", | |
| "E1 B 4700 | loss=3.9820 | lr=0.000594 | step=4307 | 2565s | 12.0GB\n", | |
| "E1 B 4720 | loss=3.4963 | lr=0.000594 | step=4308 | 2577s | 12.0GB\n", | |
| "E1 B 4740 | loss=3.8315 | lr=0.000594 | step=4309 | 2587s | 12.0GB\n", | |
| "E1 B 4760 | loss=3.4831 | lr=0.000594 | step=4309 | 2599s | 12.0GB\n", | |
| "E1 B 4780 | loss=3.8785 | lr=0.000594 | step=4310 | 2609s | 12.0GB\n", | |
| "E1 B 4800 | loss=3.3087 | lr=0.000593 | step=4311 | 2620s | 12.0GB\n", | |
| "E1 B 4820 | loss=3.5024 | lr=0.000593 | step=4311 | 2631s | 12.0GB\n", | |
| "E1 B 4840 | loss=3.8783 | lr=0.000593 | step=4312 | 2642s | 12.0GB\n", | |
| "E1 B 4860 | loss=4.0501 | lr=0.000593 | step=4312 | 2653s | 12.0GB\n", | |
| "E1 B 4880 | loss=3.6705 | lr=0.000593 | step=4313 | 2664s | 12.0GB\n", | |
| "E1 B 4900 | loss=3.8526 | lr=0.000593 | step=4314 | 2675s | 12.0GB\n", | |
| "E1 B 4920 | loss=3.8669 | lr=0.000593 | step=4314 | 2686s | 12.0GB\n", | |
| "E1 B 4940 | loss=4.1182 | lr=0.000593 | step=4315 | 2696s | 12.0GB\n", | |
| "E1 B 4960 | loss=3.8719 | lr=0.000593 | step=4316 | 2708s | 12.0GB\n", | |
| "E1 B 4980 | loss=3.8207 | lr=0.000593 | step=4316 | 2718s | 12.0GB\n", | |
| "E1 B 5000 | loss=3.5777 | lr=0.000593 | step=4317 | 2729s | 12.0GB\n", | |
| "E1 B 5020 | loss=3.2576 | lr=0.000593 | step=4317 | 2740s | 12.0GB\n", | |
| "E1 B 5040 | loss=3.8911 | lr=0.000593 | step=4318 | 2751s | 12.0GB\n", | |
| "E1 B 5060 | loss=4.1871 | lr=0.000593 | step=4319 | 2762s | 12.0GB\n", | |
| "E1 B 5080 | loss=3.0519 | lr=0.000593 | step=4319 | 2773s | 12.0GB\n", | |
| "E1 B 5100 | loss=3.7473 | lr=0.000593 | step=4320 | 2784s | 12.0GB\n", | |
| "E1 B 5120 | loss=3.4271 | lr=0.000592 | step=4321 | 2794s | 12.0GB\n", | |
| "E1 B 5140 | loss=3.3310 | lr=0.000592 | step=4321 | 2805s | 12.0GB\n", | |
| "E1 B 5160 | loss=4.0783 | lr=0.000592 | step=4322 | 2816s | 12.0GB\n", | |
| "E1 B 5180 | loss=3.7565 | lr=0.000592 | step=4322 | 2827s | 12.0GB\n", | |
| "E1 B 5200 | loss=3.5564 | lr=0.000592 | step=4323 | 2838s | 12.0GB\n", | |
| "E1 B 5220 | loss=3.7110 | lr=0.000592 | step=4324 | 2849s | 12.0GB\n", | |
| "E1 B 5240 | loss=3.2367 | lr=0.000592 | step=4324 | 2860s | 12.0GB\n", | |
| "E1 B 5260 | loss=3.5480 | lr=0.000592 | step=4325 | 2871s | 12.0GB\n", | |
| "E1 B 5280 | loss=4.1523 | lr=0.000592 | step=4326 | 2881s | 12.0GB\n", | |
| "E1 B 5300 | loss=3.5962 | lr=0.000592 | step=4326 | 2892s | 12.0GB\n", | |
| "E1 B 5320 | loss=3.5437 | lr=0.000592 | step=4327 | 2903s | 12.0GB\n", | |
| "E1 B 5340 | loss=3.3924 | lr=0.000592 | step=4327 | 2914s | 12.0GB\n", | |
| "E1 B 5360 | loss=3.5345 | lr=0.000592 | step=4328 | 2925s | 12.0GB\n", | |
| "E1 B 5380 | loss=3.5430 | lr=0.000592 | step=4329 | 2935s | 12.0GB\n", | |
| "E1 B 5400 | loss=3.6471 | lr=0.000592 | step=4329 | 2946s | 12.0GB\n", | |
| "E1 B 5420 | loss=3.4325 | lr=0.000592 | step=4330 | 2957s | 12.0GB\n", | |
| "E1 B 5440 | loss=3.7222 | lr=0.000591 | step=4331 | 2968s | 12.0GB\n", | |
| "E1 B 5460 | loss=3.8542 | lr=0.000591 | step=4331 | 2979s | 12.0GB\n", | |
| "E1 B 5480 | loss=3.5362 | lr=0.000591 | step=4332 | 2990s | 12.0GB\n", | |
| "E1 B 5500 | loss=2.8639 | lr=0.000591 | step=4332 | 3001s | 12.0GB\n", | |
| "E1 B 5520 | loss=4.1180 | lr=0.000591 | step=4333 | 3012s | 12.0GB\n", | |
| "E1 B 5540 | loss=3.4848 | lr=0.000591 | step=4334 | 3022s | 12.0GB\n", | |
| "E1 B 5560 | loss=3.7433 | lr=0.000591 | step=4334 | 3034s | 12.0GB\n", | |
| "E1 B 5580 | loss=3.5411 | lr=0.000591 | step=4335 | 3044s | 12.0GB\n", | |
| "E1 B 5600 | loss=3.8508 | lr=0.000591 | step=4336 | 3055s | 12.0GB\n", | |
| "E1 B 5620 | loss=3.7803 | lr=0.000591 | step=4336 | 3067s | 12.0GB\n", | |
| "E1 B 5640 | loss=3.7531 | lr=0.000591 | step=4337 | 3077s | 12.0GB\n", | |
| "E1 B 5660 | loss=3.2870 | lr=0.000591 | step=4337 | 3088s | 12.0GB\n", | |
| "E1 B 5680 | loss=3.6641 | lr=0.000591 | step=4338 | 3099s | 12.0GB\n", | |
| "E1 B 5700 | loss=3.6535 | lr=0.000591 | step=4339 | 3110s | 12.0GB\n", | |
| "E1 B 5720 | loss=3.2108 | lr=0.000591 | step=4339 | 3121s | 12.0GB\n", | |
| "E1 B 5740 | loss=3.4020 | lr=0.000590 | step=4340 | 3132s | 12.0GB\n", | |
| "E1 B 5760 | loss=3.9905 | lr=0.000590 | step=4341 | 3143s | 12.0GB\n", | |
| "E1 B 5780 | loss=4.0137 | lr=0.000590 | step=4341 | 3154s | 12.0GB\n", | |
| "E1 B 5800 | loss=3.3318 | lr=0.000590 | step=4342 | 3164s | 12.0GB\n", | |
| "E1 B 5820 | loss=3.2309 | lr=0.000590 | step=4342 | 3176s | 12.0GB\n", | |
| "E1 B 5840 | loss=3.5400 | lr=0.000590 | step=4343 | 3186s | 12.0GB\n", | |
| "E1 B 5860 | loss=3.6860 | lr=0.000590 | step=4344 | 3197s | 12.0GB\n", | |
| "E1 B 5880 | loss=3.9270 | lr=0.000590 | step=4344 | 3208s | 12.0GB\n", | |
| "E1 B 5900 | loss=3.7669 | lr=0.000590 | step=4345 | 3219s | 12.0GB\n", | |
| "E1 B 5920 | loss=3.9128 | lr=0.000590 | step=4346 | 3230s | 12.0GB\n", | |
| "E1 B 5940 | loss=3.5685 | lr=0.000590 | step=4346 | 3241s | 12.0GB\n", | |
| "E1 B 5960 | loss=3.6749 | lr=0.000590 | step=4347 | 3252s | 12.0GB\n", | |
| "E1 B 5980 | loss=3.7940 | lr=0.000590 | step=4347 | 3263s | 12.0GB\n", | |
| "E1 B 6000 | loss=4.0832 | lr=0.000590 | step=4348 | 3274s | 12.0GB\n", | |
| "E1 B 6020 | loss=2.9997 | lr=0.000589 | step=4349 | 3285s | 12.0GB\n", | |
| "E1 B 6040 | loss=3.8771 | lr=0.000589 | step=4349 | 3296s | 12.0GB\n", | |
| "E1 B 6060 | loss=3.8635 | lr=0.000589 | step=4350 | 3307s | 12.0GB\n", | |
| "E1 B 6080 | loss=4.0735 | lr=0.000589 | step=4351 | 3317s | 12.0GB\n", | |
| "E1 B 6100 | loss=3.8305 | lr=0.000589 | step=4351 | 3328s | 12.0GB\n", | |
| "E1 B 6120 | loss=4.2192 | lr=0.000589 | step=4352 | 3339s | 12.0GB\n", | |
| "E1 B 6140 | loss=3.6033 | lr=0.000589 | step=4352 | 3350s | 12.0GB\n", | |
| "E1 B 6160 | loss=3.9481 | lr=0.000589 | step=4353 | 3361s | 12.0GB\n", | |
| "E1 B 6180 | loss=3.4794 | lr=0.000589 | step=4354 | 3372s | 12.0GB\n", | |
| "E1 B 6200 | loss=3.6994 | lr=0.000589 | step=4354 | 3383s | 12.0GB\n", | |
| "E1 B 6220 | loss=3.1112 | lr=0.000589 | step=4355 | 3394s | 12.0GB\n", | |
| "E1 B 6240 | loss=3.8113 | lr=0.000589 | step=4356 | 3404s | 12.0GB\n", | |
| "E1 B 6260 | loss=3.7428 | lr=0.000589 | step=4356 | 3415s | 12.0GB\n", | |
| "E1 B 6280 | loss=3.3264 | lr=0.000588 | step=4357 | 3426s | 12.0GB\n", | |
| "E1 B 6300 | loss=3.5052 | lr=0.000588 | step=4357 | 3437s | 12.0GB\n", | |
| "E1 B 6320 | loss=4.0709 | lr=0.000588 | step=4358 | 3448s | 12.0GB\n", | |
| "E1 B 6340 | loss=3.5083 | lr=0.000588 | step=4359 | 3459s | 12.0GB\n", | |
| "E1 B 6360 | loss=3.6742 | lr=0.000588 | step=4359 | 3469s | 12.0GB\n", | |
| "E1 B 6380 | loss=3.9251 | lr=0.000588 | step=4360 | 3480s | 12.0GB\n", | |
| "E1 B 6400 | loss=3.4903 | lr=0.000588 | step=4361 | 3491s | 12.0GB\n", | |
| "E1 B 6420 | loss=3.2723 | lr=0.000588 | step=4361 | 3502s | 12.0GB\n", | |
| "E1 B 6440 | loss=3.3902 | lr=0.000588 | step=4362 | 3513s | 12.0GB\n", | |
| "E1 B 6460 | loss=3.0574 | lr=0.000588 | step=4362 | 3524s | 12.0GB\n", | |
| "E1 B 6480 | loss=4.1356 | lr=0.000588 | step=4363 | 3534s | 12.0GB\n", | |
| "E1 B 6500 | loss=3.7635 | lr=0.000588 | step=4364 | 3545s | 12.0GB\n", | |
| "E1 B 6520 | loss=3.9686 | lr=0.000588 | step=4364 | 3556s | 12.0GB\n", | |
| "E1 B 6540 | loss=3.9279 | lr=0.000587 | step=4365 | 3567s | 12.0GB\n", | |
| "E1 B 6560 | loss=3.4935 | lr=0.000587 | step=4366 | 3578s | 12.0GB\n", | |
| "E1 B 7460 | loss=3.7588 | lr=0.000583 | step=4394 | 4066s | 12.0GB\n", | |
| "E1 B 7480 | loss=3.9924 | lr=0.000583 | step=4394 | 4076s | 12.0GB\n", | |
| "E1 B 7500 | loss=3.5432 | lr=0.000583 | step=4395 | 4087s | 12.0GB\n", | |
| "E1 B 7520 | loss=3.9018 | lr=0.000583 | step=4396 | 4098s | 12.0GB\n", | |
| "E1 B 7540 | loss=3.5246 | lr=0.000583 | step=4396 | 4109s | 12.0GB\n", | |
| "E1 B 7560 | loss=3.9392 | lr=0.000583 | step=4397 | 4120s | 12.0GB\n", | |
| "E1 B 7580 | loss=3.7607 | lr=0.000583 | step=4397 | 4131s | 12.0GB\n", | |
| "E1 B 7600 | loss=3.9575 | lr=0.000583 | step=4398 | 4142s | 12.0GB\n", | |
| "E1 B 7620 | loss=4.0790 | lr=0.000583 | step=4399 | 4153s | 12.0GB\n", | |
| "E1 B 7640 | loss=3.4059 | lr=0.000583 | step=4399 | 4163s | 12.0GB\n", | |
| "E1 B 7660 | loss=3.4406 | lr=0.000583 | step=4400 | 4174s | 12.0GB\n", | |
| "E1 B 7680 | loss=4.1068 | lr=0.000582 | step=4401 | 4185s | 12.0GB\n", | |
| "E1 B 7700 | loss=3.9717 | lr=0.000582 | step=4401 | 4196s | 12.0GB\n", | |
| "E1 B 7720 | loss=3.5703 | lr=0.000582 | step=4402 | 4207s | 12.0GB\n", | |
| "E1 B 7740 | loss=4.0311 | lr=0.000582 | step=4402 | 4217s | 12.0GB\n", | |
| "E1 B 7760 | loss=3.9605 | lr=0.000582 | step=4403 | 4228s | 12.0GB\n", | |
| "E1 B 7780 | loss=3.6978 | lr=0.000582 | step=4404 | 4239s | 12.0GB\n", | |
| "E1 B 7800 | loss=3.6814 | lr=0.000582 | step=4404 | 4250s | 12.0GB\n", | |
| "E1 B 7820 | loss=3.9002 | lr=0.000582 | step=4405 | 4261s | 12.0GB\n", | |
| "E1 B 7840 | loss=3.3289 | lr=0.000582 | step=4406 | 4272s | 12.0GB\n", | |
| "E1 B 7860 | loss=3.2967 | lr=0.000582 | step=4406 | 4283s | 12.0GB\n", | |
| "E1 B 7880 | loss=3.6585 | lr=0.000581 | step=4407 | 4294s | 12.0GB\n", | |
| "E1 B 7900 | loss=3.7935 | lr=0.000581 | step=4407 | 4304s | 12.0GB\n", | |
| "E1 B 7920 | loss=4.0240 | lr=0.000581 | step=4408 | 4315s | 12.0GB\n", | |
| "E1 B 7940 | loss=3.5756 | lr=0.000581 | step=4409 | 4326s | 12.0GB\n", | |
| "E1 B 7960 | loss=3.9151 | lr=0.000581 | step=4409 | 4337s | 12.0GB\n", | |
| "E1 B 7980 | loss=3.4314 | lr=0.000581 | step=4410 | 4348s | 12.0GB\n", | |
| "E1 B 8000 | loss=3.5642 | lr=0.000581 | step=4411 | 4359s | 12.0GB\n", | |
| "E1 B 8020 | loss=3.3443 | lr=0.000581 | step=4411 | 4370s | 12.0GB\n", | |
| "E1 B 8040 | loss=3.9221 | lr=0.000581 | step=4412 | 4380s | 12.0GB\n", | |
| "E1 B 8060 | loss=3.9750 | lr=0.000581 | step=4412 | 4391s | 12.0GB\n", | |
| "E1 B 8080 | loss=3.6784 | lr=0.000581 | step=4413 | 4402s | 12.0GB\n", | |
| "E1 B 8100 | loss=2.8846 | lr=0.000580 | step=4414 | 4413s | 12.0GB\n", | |
| "E1 B 8120 | loss=3.3296 | lr=0.000580 | step=4414 | 4424s | 12.0GB\n", | |
| "E1 B 8140 | loss=3.9181 | lr=0.000580 | step=4415 | 4435s | 12.0GB\n", | |
| "E1 B 8160 | loss=3.5268 | lr=0.000580 | step=4416 | 4446s | 12.0GB\n", | |
| "E1 B 8180 | loss=3.7826 | lr=0.000580 | step=4416 | 4456s | 12.0GB\n", | |
| "E1 B 8200 | loss=3.6549 | lr=0.000580 | step=4417 | 4467s | 12.0GB\n", | |
| "E1 B 8220 | loss=3.2850 | lr=0.000580 | step=4417 | 4478s | 12.0GB\n", | |
| "E1 B 8240 | loss=4.1135 | lr=0.000580 | step=4418 | 4489s | 12.0GB\n", | |
| "E1 B 8260 | loss=3.9221 | lr=0.000580 | step=4419 | 4500s | 12.0GB\n", | |
| "E1 B 8280 | loss=3.7016 | lr=0.000580 | step=4419 | 4510s | 12.0GB\n", | |
| "E1 B 8300 | loss=3.6920 | lr=0.000579 | step=4420 | 4521s | 12.0GB\n", | |
| "E1 B 8320 | loss=3.5037 | lr=0.000579 | step=4421 | 4532s | 12.0GB\n", | |
| "E1 B 8340 | loss=3.3619 | lr=0.000579 | step=4421 | 4543s | 12.0GB\n", | |
| "E1 B 8360 | loss=3.5319 | lr=0.000579 | step=4422 | 4554s | 12.0GB\n", | |
| "E1 B 8380 | loss=3.1474 | lr=0.000579 | step=4422 | 4564s | 12.0GB\n", | |
| "E1 B 8400 | loss=3.4665 | lr=0.000579 | step=4423 | 4575s | 12.0GB\n", | |
| "E1 B 8420 | loss=3.2620 | lr=0.000579 | step=4424 | 4586s | 12.0GB\n", | |
| "E1 B 8440 | loss=3.7977 | lr=0.000579 | step=4424 | 4597s | 12.0GB\n", | |
| "E1 B 8460 | loss=3.6890 | lr=0.000579 | step=4425 | 4608s | 12.0GB\n", | |
| "E1 B 8480 | loss=3.5342 | lr=0.000578 | step=4426 | 4619s | 12.0GB\n", | |
| "E1 B 8500 | loss=4.1854 | lr=0.000578 | step=4426 | 4630s | 12.0GB\n", | |
| "E1 B 8520 | loss=3.8791 | lr=0.000578 | step=4427 | 4641s | 12.0GB\n", | |
| "E1 B 8540 | loss=3.7094 | lr=0.000578 | step=4427 | 4651s | 12.0GB\n", | |
| "E1 B 8560 | loss=3.7675 | lr=0.000578 | step=4428 | 4662s | 12.0GB\n", | |
| "E1 B 8580 | loss=3.6762 | lr=0.000578 | step=4429 | 4673s | 12.0GB\n", | |
| "E1 B 8600 | loss=3.6483 | lr=0.000578 | step=4429 | 4684s | 12.0GB\n", | |
| "E1 B 8620 | loss=3.7616 | lr=0.000578 | step=4430 | 4695s | 12.0GB\n", | |
| "E1 B 8640 | loss=3.8960 | lr=0.000578 | step=4431 | 4706s | 12.0GB\n", | |
| "E1 B 8660 | loss=3.2143 | lr=0.000578 | step=4431 | 4717s | 12.0GB\n", | |
| "E1 B 8680 | loss=3.7283 | lr=0.000577 | step=4432 | 4727s | 12.0GB\n", | |
| "E1 B 8700 | loss=3.6771 | lr=0.000577 | step=4432 | 4738s | 12.0GB\n", | |
| "E1 B 8720 | loss=3.4654 | lr=0.000577 | step=4433 | 4749s | 12.0GB\n", | |
| "E1 B 8740 | loss=3.9070 | lr=0.000577 | step=4434 | 4760s | 12.0GB\n", | |
| "E1 B 8760 | loss=3.7033 | lr=0.000577 | step=4434 | 4771s | 12.0GB\n", | |
| "E1 B 8780 | loss=4.1702 | lr=0.000577 | step=4435 | 4781s | 12.0GB\n", | |
| "E1 B 8800 | loss=3.9153 | lr=0.000577 | step=4436 | 4792s | 12.0GB\n", | |
| "E1 B 8820 | loss=3.9761 | lr=0.000577 | step=4436 | 4803s | 12.0GB\n", | |
| "E1 B 8840 | loss=3.6060 | lr=0.000577 | step=4437 | 4814s | 12.0GB\n", | |
| "E1 B 8860 | loss=3.8920 | lr=0.000577 | step=4437 | 4825s | 12.0GB\n", | |
| "E1 B 8880 | loss=3.5466 | lr=0.000576 | step=4438 | 4836s | 12.0GB\n", | |
| "E1 B 8900 | loss=3.5015 | lr=0.000576 | step=4439 | 4846s | 12.0GB\n", | |
| "E1 B 8920 | loss=3.9832 | lr=0.000576 | step=4439 | 4857s | 12.0GB\n", | |
| "E1 B 8940 | loss=3.5292 | lr=0.000576 | step=4440 | 4868s | 12.0GB\n", | |
| "E1 B 8960 | loss=3.2885 | lr=0.000576 | step=4441 | 4879s | 12.0GB\n", | |
| "E1 B 8980 | loss=4.1046 | lr=0.000576 | step=4441 | 4890s | 12.0GB\n", | |
| "E1 B 9000 | loss=3.7715 | lr=0.000576 | step=4442 | 4901s | 12.0GB\n", | |
| "E1 B 9020 | loss=3.6860 | lr=0.000576 | step=4442 | 4912s | 12.0GB\n", | |
| "E1 B 9040 | loss=3.6556 | lr=0.000575 | step=4443 | 4923s | 12.0GB\n", | |
| "E1 B 9060 | loss=3.8672 | lr=0.000575 | step=4444 | 4934s | 12.0GB\n", | |
| "E1 B 9080 | loss=3.3887 | lr=0.000575 | step=4444 | 4944s | 12.0GB\n", | |
| "E1 B 9100 | loss=4.0118 | lr=0.000575 | step=4445 | 4955s | 12.0GB\n", | |
| "E1 B 9120 | loss=4.1592 | lr=0.000575 | step=4446 | 4966s | 12.0GB\n", | |
| "E1 B 9140 | loss=3.5818 | lr=0.000575 | step=4446 | 4977s | 12.0GB\n", | |
| "E1 B 9160 | loss=3.3381 | lr=0.000575 | step=4447 | 4988s | 12.0GB\n", | |
| "E1 B 9180 | loss=3.7957 | lr=0.000575 | step=4447 | 4998s | 12.0GB\n", | |
| "E1 B 9200 | loss=3.4947 | lr=0.000575 | step=4448 | 5009s | 12.0GB\n", | |
| "E1 B 9220 | loss=3.6440 | lr=0.000574 | step=4449 | 5020s | 12.0GB\n", | |
| "E1 B 9240 | loss=3.9519 | lr=0.000574 | step=4449 | 5031s | 12.0GB\n", | |
| "E1 B 9260 | loss=3.7230 | lr=0.000574 | step=4450 | 5042s | 12.0GB\n", | |
| "E1 B 9280 | loss=4.0447 | lr=0.000574 | step=4451 | 5053s | 12.0GB\n", | |
| "E1 B 9300 | loss=3.9662 | lr=0.000574 | step=4451 | 5063s | 12.0GB\n", | |
| "E1 B 9320 | loss=3.5119 | lr=0.000574 | step=4452 | 5074s | 12.0GB\n", | |
| "E1 B 9340 | loss=3.8177 | lr=0.000574 | step=4452 | 5085s | 12.0GB\n", | |
| "E1 B 9360 | loss=3.6362 | lr=0.000574 | step=4453 | 5096s | 12.0GB\n", | |
| "E1 B 9380 | loss=3.8890 | lr=0.000573 | step=4454 | 5107s | 12.0GB\n", | |
| "E1 B 9400 | loss=3.5255 | lr=0.000573 | step=4454 | 5118s | 12.0GB\n", | |
| "E1 B 9420 | loss=4.0151 | lr=0.000573 | step=4455 | 5129s | 12.0GB\n", | |
| "E1 B 9440 | loss=2.8805 | lr=0.000573 | step=4456 | 5140s | 12.0GB\n", | |
| "E1 B 9460 | loss=3.5653 | lr=0.000573 | step=4456 | 5151s | 12.0GB\n", | |
| "E1 B 9480 | loss=3.3855 | lr=0.000573 | step=4457 | 5161s | 12.0GB\n", | |
| "E1 B 9500 | loss=4.1694 | lr=0.000573 | step=4457 | 5172s | 12.0GB\n", | |
| "E1 B 9520 | loss=3.5008 | lr=0.000573 | step=4458 | 5183s | 12.0GB\n", | |
| "E1 B 9540 | loss=3.7059 | lr=0.000573 | step=4459 | 5194s | 12.0GB\n", | |
| "E1 B 9560 | loss=3.7016 | lr=0.000573 | step=4459 | 5205s | 12.0GB\n", | |
| "E1 B 9580 | loss=3.6644 | lr=0.000572 | step=4460 | 5216s | 12.0GB\n", | |
| "E1 B 9600 | loss=3.8400 | lr=0.000572 | step=4461 | 5226s | 12.0GB\n", | |
| "E1 B 9620 | loss=3.5602 | lr=0.000572 | step=4461 | 5237s | 12.0GB\n", | |
| "E1 B 9640 | loss=3.4154 | lr=0.000572 | step=4462 | 5248s | 12.0GB\n", | |
| "E1 B 9660 | loss=3.5660 | lr=0.000572 | step=4462 | 5259s | 12.0GB\n", | |
| "E1 B 9680 | loss=3.7336 | lr=0.000572 | step=4463 | 5270s | 12.0GB\n", | |
| "E1 B 9700 | loss=3.4588 | lr=0.000572 | step=4464 | 5281s | 12.0GB\n", | |
| "E1 B 9720 | loss=3.0297 | lr=0.000572 | step=4464 | 5292s | 12.0GB\n", | |
| "E1 B 9740 | loss=3.6946 | lr=0.000571 | step=4465 | 5302s | 12.0GB\n", | |
| "E1 B 9760 | loss=4.0037 | lr=0.000571 | step=4466 | 5313s | 12.0GB\n", | |
| "E1 B 9780 | loss=3.6218 | lr=0.000571 | step=4466 | 5324s | 12.0GB\n", | |
| "E1 B 9800 | loss=4.2324 | lr=0.000571 | step=4467 | 5335s | 12.0GB\n", | |
| "E1 B 9820 | loss=3.9094 | lr=0.000571 | step=4467 | 5346s | 12.0GB\n", | |
| "E1 B 9840 | loss=3.9099 | lr=0.000571 | step=4468 | 5356s | 12.0GB\n", | |
| "E1 B 9860 | loss=3.7706 | lr=0.000571 | step=4469 | 5367s | 12.0GB\n", | |
| "E1 B 9880 | loss=3.2210 | lr=0.000571 | step=4469 | 5378s | 12.0GB\n", | |
| "E1 B 9900 | loss=3.6080 | lr=0.000570 | step=4470 | 5389s | 12.0GB\n", | |
| "E1 B 9920 | loss=4.0966 | lr=0.000570 | step=4471 | 5400s | 12.0GB\n", | |
| "E1 B 9940 | loss=3.2607 | lr=0.000570 | step=4471 | 5410s | 12.0GB\n", | |
| "E1 B 9960 | loss=3.7564 | lr=0.000570 | step=4472 | 5421s | 12.0GB\n", | |
| "E1 B 9980 | loss=3.3734 | lr=0.000570 | step=4472 | 5432s | 12.0GB\n", | |
| "E1 B10000 | loss=3.9126 | lr=0.000570 | step=4473 | 5443s | 12.0GB\n", | |
| "E1 B10020 | loss=3.9910 | lr=0.000570 | step=4474 | 5454s | 12.0GB\n", | |
| "E1 B10040 | loss=3.5968 | lr=0.000570 | step=4474 | 5465s | 12.0GB\n", | |
| "E1 B10060 | loss=3.6759 | lr=0.000569 | step=4475 | 5476s | 12.0GB\n", | |
| "E1 B10080 | loss=3.5678 | lr=0.000569 | step=4476 | 5486s | 12.0GB\n", | |
| "E1 B10100 | loss=3.1702 | lr=0.000569 | step=4476 | 5497s | 12.0GB\n", | |
| "E1 B10120 | loss=3.5416 | lr=0.000569 | step=4477 | 5508s | 12.0GB\n", | |
| "E1 B10140 | loss=4.3872 | lr=0.000569 | step=4477 | 5519s | 12.0GB\n", | |
| "E1 B10160 | loss=4.1150 | lr=0.000569 | step=4478 | 5530s | 12.0GB\n", | |
| "E1 B10180 | loss=3.4371 | lr=0.000569 | step=4479 | 5540s | 12.0GB\n", | |
| "E1 B10200 | loss=3.7154 | lr=0.000569 | step=4479 | 5551s | 12.0GB\n", | |
| "E1 B10220 | loss=3.5275 | lr=0.000569 | step=4480 | 5562s | 12.0GB\n", | |
| "E1 B10240 | loss=3.8369 | lr=0.000568 | step=4481 | 5573s | 12.0GB\n", | |
| "E1 B10260 | loss=3.7127 | lr=0.000568 | step=4481 | 5584s | 12.0GB\n", | |
| "E1 B10280 | loss=3.3610 | lr=0.000568 | step=4482 | 5595s | 12.0GB\n", | |
| "E1 B10300 | loss=3.6494 | lr=0.000568 | step=4482 | 5606s | 12.0GB\n", | |
| "E1 B10320 | loss=3.3435 | lr=0.000568 | step=4483 | 5617s | 12.0GB\n", | |
| "E1 B10340 | loss=3.3789 | lr=0.000568 | step=4484 | 5627s | 12.0GB\n", | |
| "E1 B10360 | loss=3.1983 | lr=0.000568 | step=4484 | 5638s | 12.0GB\n", | |
| "E1 B10380 | loss=4.0545 | lr=0.000568 | step=4485 | 5649s | 12.0GB\n", | |
| "E1 B10400 | loss=3.8453 | lr=0.000567 | step=4486 | 5660s | 12.0GB\n", | |
| "E1 B10420 | loss=3.8194 | lr=0.000567 | step=4486 | 5671s | 12.0GB\n", | |
| "E1 B10440 | loss=3.4792 | lr=0.000567 | step=4487 | 5682s | 12.0GB\n", | |
| "E1 B10460 | loss=3.3315 | lr=0.000567 | step=4487 | 5693s | 12.0GB\n", | |
| "E1 B10480 | loss=3.6889 | lr=0.000567 | step=4488 | 5703s | 12.0GB\n", | |
| "E1 B10500 | loss=3.2926 | lr=0.000567 | step=4489 | 5714s | 12.0GB\n", | |
| "E1 B10520 | loss=3.4131 | lr=0.000567 | step=4489 | 5725s | 12.0GB\n", | |
| "E1 B10540 | loss=3.7499 | lr=0.000566 | step=4490 | 5736s | 12.0GB\n", | |
| "E1 B10560 | loss=3.9906 | lr=0.000566 | step=4491 | 5747s | 12.0GB\n", | |
| "E1 B10580 | loss=3.6390 | lr=0.000566 | step=4491 | 5757s | 12.0GB\n", | |
| "E1 B10600 | loss=3.5183 | lr=0.000566 | step=4492 | 5768s | 12.0GB\n", | |
| "E1 B10620 | loss=2.9833 | lr=0.000566 | step=4492 | 5779s | 12.0GB\n", | |
| "E1 B10640 | loss=3.4111 | lr=0.000566 | step=4493 | 5790s | 12.0GB\n", | |
| "E1 B10660 | loss=3.1319 | lr=0.000566 | step=4494 | 5801s | 12.0GB\n", | |
| "E1 B10680 | loss=4.1349 | lr=0.000566 | step=4494 | 5812s | 12.0GB\n", | |
| "E1 B10700 | loss=3.7414 | lr=0.000565 | step=4495 | 5823s | 12.0GB\n", | |
| "E1 B10720 | loss=3.9542 | lr=0.000565 | step=4496 | 5834s | 12.0GB\n", | |
| "E1 B10740 | loss=3.8533 | lr=0.000565 | step=4496 | 5844s | 12.0GB\n", | |
| "E1 B10760 | loss=3.5904 | lr=0.000565 | step=4497 | 5855s | 12.0GB\n", | |
| "E1 B10780 | loss=3.8577 | lr=0.000565 | step=4497 | 5866s | 12.0GB\n", | |
| "E1 B10800 | loss=3.8280 | lr=0.000565 | step=4498 | 5877s | 12.0GB\n", | |
| "E1 B10820 | loss=3.8997 | lr=0.000565 | step=4499 | 5888s | 12.0GB\n", | |
| "E1 B10840 | loss=3.4914 | lr=0.000565 | step=4499 | 5899s | 12.0GB\n", | |
| "E1 B10860 | loss=3.7780 | lr=0.000564 | step=4500 | 5910s | 12.0GB\n", | |
| "E1 B10880 | loss=3.3137 | lr=0.000564 | step=4501 | 5920s | 12.0GB\n", | |
| "E1 B10900 | loss=4.0723 | lr=0.000564 | step=4501 | 5931s | 12.0GB\n", | |
| "E1 B10920 | loss=3.2153 | lr=0.000564 | step=4502 | 5942s | 12.0GB\n", | |
| "E1 B10940 | loss=3.4515 | lr=0.000564 | step=4502 | 5953s | 12.0GB\n", | |
| "E1 B10960 | loss=3.0773 | lr=0.000564 | step=4503 | 5964s | 12.0GB\n", | |
| "E1 B10980 | loss=4.0201 | lr=0.000564 | step=4504 | 5975s | 12.0GB\n", | |
| "E1 B11000 | loss=3.9318 | lr=0.000564 | step=4504 | 5985s | 12.0GB\n", | |
| "E1 B11020 | loss=3.5208 | lr=0.000563 | step=4505 | 5996s | 12.0GB\n", | |
| "E1 B11040 | loss=3.9974 | lr=0.000563 | step=4506 | 6007s | 12.0GB\n", | |
| "E1 B11060 | loss=3.6317 | lr=0.000563 | step=4506 | 6018s | 12.0GB\n", | |
| "E1 B11080 | loss=3.8175 | lr=0.000563 | step=4507 | 6029s | 12.0GB\n", | |
| "E1 B11100 | loss=3.8171 | lr=0.000563 | step=4507 | 6040s | 12.0GB\n", | |
| "E1 B11120 | loss=3.6869 | lr=0.000563 | step=4508 | 6051s | 12.0GB\n", | |
| "E1 B11140 | loss=3.4689 | lr=0.000562 | step=4509 | 6061s | 12.0GB\n", | |
| "E1 B11160 | loss=3.7661 | lr=0.000562 | step=4509 | 6072s | 12.0GB\n", | |
| "E1 B11180 | loss=4.0698 | lr=0.000562 | step=4510 | 6083s | 12.0GB\n", | |
| "E1 B11200 | loss=3.5344 | lr=0.000562 | step=4511 | 6094s | 12.0GB\n", | |
| "E1 B11220 | loss=4.1427 | lr=0.000562 | step=4511 | 6105s | 12.0GB\n", | |
| "E1 B11240 | loss=3.1306 | lr=0.000562 | step=4512 | 6116s | 12.0GB\n", | |
| "E1 B11260 | loss=3.3947 | lr=0.000562 | step=4512 | 6127s | 12.0GB\n", | |
| "E1 B11280 | loss=3.4037 | lr=0.000562 | step=4513 | 6138s | 12.0GB\n", | |
| "E1 B11300 | loss=3.3066 | lr=0.000561 | step=4514 | 6148s | 12.0GB\n", | |
| "E1 B11320 | loss=3.6849 | lr=0.000561 | step=4514 | 6159s | 12.0GB\n", | |
| "E1 B11340 | loss=3.7419 | lr=0.000561 | step=4515 | 6170s | 12.0GB\n", | |
| "E1 B11360 | loss=3.9112 | lr=0.000561 | step=4516 | 6181s | 12.0GB\n", | |
| "E1 B11380 | loss=3.8776 | lr=0.000561 | step=4516 | 6192s | 12.0GB\n", | |
| "E1 B11400 | loss=3.7667 | lr=0.000561 | step=4517 | 6202s | 12.0GB\n", | |
| "E1 B11420 | loss=3.7207 | lr=0.000561 | step=4517 | 6213s | 12.0GB\n", | |
| "E1 B11440 | loss=3.4328 | lr=0.000560 | step=4518 | 6224s | 12.0GB\n", | |
| "E1 B11460 | loss=3.6496 | lr=0.000560 | step=4519 | 6235s | 12.0GB\n", | |
| "E1 B11480 | loss=3.9213 | lr=0.000560 | step=4519 | 6246s | 12.0GB\n", | |
| "E1 B11500 | loss=3.2339 | lr=0.000560 | step=4520 | 6257s | 12.0GB\n", | |
| "E1 B11520 | loss=3.7248 | lr=0.000560 | step=4521 | 6268s | 12.0GB\n", | |
| "E1 B11540 | loss=3.3425 | lr=0.000560 | step=4521 | 6278s | 12.0GB\n", | |
| "E1 B11560 | loss=3.0856 | lr=0.000560 | step=4522 | 6290s | 12.0GB\n", | |
| "E1 B11580 | loss=3.6616 | lr=0.000560 | step=4522 | 6300s | 12.0GB\n", | |
| "E1 B11600 | loss=3.7340 | lr=0.000559 | step=4523 | 6311s | 12.0GB\n", | |
| "E1 B11620 | loss=4.0090 | lr=0.000559 | step=4524 | 6322s | 12.0GB\n", | |
| "E1 B11640 | loss=3.7399 | lr=0.000559 | step=4524 | 6333s | 12.0GB\n", | |
| "E1 B11660 | loss=3.7283 | lr=0.000559 | step=4525 | 6344s | 12.0GB\n", | |
| "E1 B11680 | loss=3.4567 | lr=0.000559 | step=4526 | 6355s | 12.0GB\n", | |
| "E1 B11700 | loss=3.8698 | lr=0.000559 | step=4526 | 6365s | 12.0GB\n", | |
| "E1 B11720 | loss=3.7220 | lr=0.000558 | step=4527 | 6376s | 12.0GB\n", | |
| "E1 B11740 | loss=2.9580 | lr=0.000558 | step=4527 | 6387s | 12.0GB\n", | |
| "E1 B11760 | loss=3.5134 | lr=0.000558 | step=4528 | 6398s | 12.0GB\n", | |
| "E1 B11780 | loss=3.5830 | lr=0.000558 | step=4529 | 6409s | 12.0GB\n", | |
| "E1 B11800 | loss=3.3416 | lr=0.000558 | step=4529 | 6420s | 12.0GB\n", | |
| "E1 B11820 | loss=3.2925 | lr=0.000558 | step=4530 | 6431s | 12.0GB\n", | |
| "E1 B11840 | loss=3.8109 | lr=0.000558 | step=4531 | 6441s | 12.0GB\n", | |
| "E1 B11860 | loss=3.9211 | lr=0.000558 | step=4531 | 6452s | 12.0GB\n", | |
| "E1 B11880 | loss=3.3420 | lr=0.000557 | step=4532 | 6463s | 12.0GB\n", | |
| "E1 B11900 | loss=2.8897 | lr=0.000557 | step=4532 | 6474s | 12.0GB\n", | |
| "E1 B11920 | loss=3.8599 | lr=0.000557 | step=4533 | 6485s | 12.0GB\n", | |
| "E1 B11940 | loss=3.9627 | lr=0.000557 | step=4534 | 6496s | 12.0GB\n", | |
| "E1 B11960 | loss=3.3775 | lr=0.000557 | step=4534 | 6507s | 12.0GB\n", | |
| "E1 B11980 | loss=3.0813 | lr=0.000557 | step=4535 | 6517s | 12.0GB\n", | |
| "E1 B12000 | loss=3.3710 | lr=0.000556 | step=4536 | 6528s | 12.0GB\n", | |
| "E1 B12020 | loss=4.0220 | lr=0.000556 | step=4536 | 6539s | 12.0GB\n", | |
| "E1 B12040 | loss=3.6183 | lr=0.000556 | step=4537 | 6550s | 12.0GB\n", | |
| "E1 B12060 | loss=3.4753 | lr=0.000556 | step=4537 | 6561s | 12.0GB\n", | |
| "E1 B12080 | loss=3.7946 | lr=0.000556 | step=4538 | 6572s | 12.0GB\n", | |
| "E1 B12100 | loss=3.8959 | lr=0.000556 | step=4539 | 6582s | 12.0GB\n", | |
| "E1 B12120 | loss=3.4846 | lr=0.000556 | step=4539 | 6593s | 12.0GB\n", | |
| "E1 B12140 | loss=3.3113 | lr=0.000555 | step=4540 | 6604s | 12.0GB\n", | |
| "E1 B12160 | loss=3.4144 | lr=0.000555 | step=4541 | 6615s | 12.0GB\n", | |
| "E1 B12180 | loss=3.5783 | lr=0.000555 | step=4541 | 6626s | 12.0GB\n", | |
| "E1 B12200 | loss=3.3012 | lr=0.000555 | step=4542 | 6637s | 12.0GB\n", | |
| "E1 B12220 | loss=3.3818 | lr=0.000555 | step=4542 | 6648s | 12.0GB\n", | |
| "E1 B12240 | loss=3.7957 | lr=0.000555 | step=4543 | 6658s | 12.0GB\n", | |
| "E1 B12260 | loss=3.8215 | lr=0.000555 | step=4544 | 6669s | 12.0GB\n", | |
| "E1 B12280 | loss=3.5549 | lr=0.000555 | step=4544 | 6680s | 12.0GB\n", | |
| "E1 B12300 | loss=3.3651 | lr=0.000554 | step=4545 | 6691s | 12.0GB\n", | |
| "E1 B12320 | loss=3.3444 | lr=0.000554 | step=4546 | 6702s | 12.0GB\n", | |
| "E1 B12340 | loss=3.9622 | lr=0.000554 | step=4546 | 6712s | 12.0GB\n", | |
| "E1 B12360 | loss=3.4421 | lr=0.000554 | step=4547 | 6724s | 12.0GB\n", | |
| "E1 B12380 | loss=3.4991 | lr=0.000554 | step=4547 | 6734s | 12.0GB\n", | |
| "E1 B12400 | loss=3.8970 | lr=0.000554 | step=4548 | 6745s | 12.0GB\n", | |
| "E1 B12420 | loss=3.0477 | lr=0.000553 | step=4549 | 6756s | 12.0GB\n", | |
| "E1 B12440 | loss=3.5214 | lr=0.000553 | step=4549 | 6767s | 12.0GB\n", | |
| "E1 B12460 | loss=3.0554 | lr=0.000553 | step=4550 | 6778s | 12.0GB\n", | |
| "E1 B12480 | loss=3.4414 | lr=0.000553 | step=4551 | 6789s | 12.0GB\n", | |
| "E1 B12500 | loss=4.2476 | lr=0.000553 | step=4551 | 6799s | 12.0GB\n", | |
| "E1 B12520 | loss=3.6529 | lr=0.000553 | step=4552 | 6810s | 12.0GB\n", | |
| "E1 B12540 | loss=3.5521 | lr=0.000553 | step=4552 | 6821s | 12.0GB\n", | |
| "E1 B12560 | loss=3.9918 | lr=0.000552 | step=4553 | 6832s | 12.0GB\n", | |
| "E1 B12580 | loss=3.5520 | lr=0.000552 | step=4554 | 6843s | 12.0GB\n", | |
| "E1 B12600 | loss=3.3276 | lr=0.000552 | step=4554 | 6854s | 12.0GB\n", | |
| "E1 B12620 | loss=3.4012 | lr=0.000552 | step=4555 | 6865s | 12.0GB\n", | |
| "E1 B12640 | loss=3.5206 | lr=0.000552 | step=4556 | 6875s | 12.0GB\n", | |
| "E1 B12660 | loss=2.8729 | lr=0.000552 | step=4556 | 6886s | 12.0GB\n", | |
| "E1 B12680 | loss=3.3694 | lr=0.000551 | step=4557 | 6897s | 12.0GB\n", | |
| "E1 B12700 | loss=3.8004 | lr=0.000551 | step=4557 | 6908s | 12.0GB\n", | |
| "E1 B12720 | loss=3.5845 | lr=0.000551 | step=4558 | 6919s | 12.0GB\n", | |
| "E1 B12740 | loss=3.4681 | lr=0.000551 | step=4559 | 6930s | 12.0GB\n", | |
| "E1 B12760 | loss=3.5470 | lr=0.000551 | step=4559 | 6941s | 12.0GB\n", | |
| "E1 B12780 | loss=3.4184 | lr=0.000551 | step=4560 | 6951s | 12.0GB\n", | |
| "E1 B12800 | loss=3.0710 | lr=0.000550 | step=4561 | 6962s | 12.0GB\n", | |
| "E1 B12820 | loss=3.7679 | lr=0.000550 | step=4561 | 6973s | 12.0GB\n", | |
| "E1 B12840 | loss=3.2458 | lr=0.000550 | step=4562 | 6984s | 12.0GB\n", | |
| "E1 B12860 | loss=3.4125 | lr=0.000550 | step=4562 | 6995s | 12.0GB\n", | |
| "E1 B12880 | loss=3.6963 | lr=0.000550 | step=4563 | 7006s | 12.0GB\n", | |
| "E1 B12900 | loss=3.5222 | lr=0.000550 | step=4564 | 7016s | 12.0GB\n", | |
| "E1 B12920 | loss=3.6076 | lr=0.000550 | step=4564 | 7027s | 12.0GB\n", | |
| "E1 B12940 | loss=3.4708 | lr=0.000549 | step=4565 | 7038s | 12.0GB\n", | |
| "E1 B12960 | loss=3.4640 | lr=0.000549 | step=4566 | 7049s | 12.0GB\n", | |
| "E1 B12980 | loss=3.4967 | lr=0.000549 | step=4566 | 7060s | 12.0GB\n", | |
| "E1 B13000 | loss=4.0417 | lr=0.000549 | step=4567 | 7071s | 12.0GB\n", | |
| "E1 B13020 | loss=3.4071 | lr=0.000549 | step=4567 | 7082s | 12.0GB\n", | |
| "E1 B13040 | loss=3.9623 | lr=0.000549 | step=4568 | 7092s | 12.0GB\n", | |
| "E1 B13060 | loss=3.8871 | lr=0.000548 | step=4569 | 7103s | 12.0GB\n", | |
| "E1 B13080 | loss=3.8245 | lr=0.000548 | step=4569 | 7114s | 12.0GB\n", | |
| "E1 B13100 | loss=4.0772 | lr=0.000548 | step=4570 | 7125s | 12.0GB\n", | |
| "E1 B13120 | loss=3.3284 | lr=0.000548 | step=4571 | 7136s | 12.0GB\n", | |
| "E1 B13140 | loss=3.8306 | lr=0.000548 | step=4571 | 7147s | 12.0GB\n", | |
| "E1 B13160 | loss=3.5510 | lr=0.000548 | step=4572 | 7158s | 12.0GB\n", | |
| "E1 B13180 | loss=3.8633 | lr=0.000548 | step=4572 | 7168s | 12.0GB\n", | |
| "E1 B13200 | loss=3.1338 | lr=0.000547 | step=4573 | 7179s | 12.0GB\n", | |
| "E1 B13220 | loss=3.7669 | lr=0.000547 | step=4574 | 7190s | 12.0GB\n", | |
| "[SAVED] Checkpoint saved to: /home/spedrox/Transformers/checkpoints/auto_epoch1_step4574.pt\n", | |
| "E1 B13240 | loss=3.4762 | lr=0.000547 | step=4574 | 7202s | 12.0GB\n", | |
| "E1 B13260 | loss=3.7858 | lr=0.000547 | step=4575 | 7213s | 12.0GB\n", | |
| "E1 B13280 | loss=3.7659 | lr=0.000547 | step=4576 | 7224s | 12.0GB\n", | |
| "E1 B13300 | loss=3.6765 | lr=0.000547 | step=4576 | 7235s | 12.0GB\n", | |
| "E1 B13320 | loss=3.0681 | lr=0.000546 | step=4577 | 7246s | 12.0GB\n", | |
| "E1 B13340 | loss=3.9373 | lr=0.000546 | step=4577 | 7256s | 12.0GB\n", | |
| "E1 B13360 | loss=3.5668 | lr=0.000546 | step=4578 | 7267s | 12.0GB\n", | |
| "E1 B13380 | loss=3.4079 | lr=0.000546 | step=4579 | 7278s | 12.0GB\n", | |
| "E1 B13400 | loss=3.2671 | lr=0.000546 | step=4579 | 7289s | 12.0GB\n", | |
| "E1 B13420 | loss=3.7897 | lr=0.000546 | step=4580 | 7300s | 12.0GB\n", | |
| "E1 B13440 | loss=3.7164 | lr=0.000545 | step=4581 | 7310s | 12.0GB\n", | |
| "E1 B13460 | loss=3.4427 | lr=0.000545 | step=4581 | 7322s | 12.0GB\n", | |
| "E1 B13480 | loss=3.7896 | lr=0.000545 | step=4582 | 7332s | 12.0GB\n", | |
| "E1 B13500 | loss=3.5399 | lr=0.000545 | step=4582 | 7343s | 12.0GB\n", | |
| "E1 B13520 | loss=3.7662 | lr=0.000545 | step=4583 | 7354s | 12.0GB\n", | |
| "E1 B13540 | loss=3.4838 | lr=0.000545 | step=4584 | 7365s | 12.0GB\n", | |
| "E1 B13560 | loss=2.9369 | lr=0.000545 | step=4584 | 7376s | 12.0GB\n", | |
| "E1 B13580 | loss=4.0582 | lr=0.000544 | step=4585 | 7386s | 12.0GB\n", | |
| "E1 B13600 | loss=3.8900 | lr=0.000544 | step=4586 | 7397s | 12.0GB\n", | |
| "E1 B13620 | loss=3.5219 | lr=0.000544 | step=4586 | 7408s | 12.0GB\n", | |
| "E1 B13640 | loss=4.0037 | lr=0.000544 | step=4587 | 7419s | 12.0GB\n", | |
| "E1 B13660 | loss=3.6090 | lr=0.000544 | step=4587 | 7430s | 12.0GB\n", | |
| "E1 B13680 | loss=3.1603 | lr=0.000544 | step=4588 | 7441s | 12.0GB\n", | |
| "E1 B13700 | loss=3.7120 | lr=0.000543 | step=4589 | 7451s | 12.0GB\n", | |
| "E1 B13720 | loss=3.6829 | lr=0.000543 | step=4589 | 7462s | 12.0GB\n", | |
| "E1 B13740 | loss=3.7867 | lr=0.000543 | step=4590 | 7473s | 12.0GB\n", | |
| "E1 B13760 | loss=3.7865 | lr=0.000543 | step=4591 | 7484s | 12.0GB\n", | |
| "E1 B13780 | loss=3.5623 | lr=0.000543 | step=4591 | 7495s | 12.0GB\n", | |
| "E1 B13800 | loss=3.6049 | lr=0.000543 | step=4592 | 7506s | 12.0GB\n", | |
| "E1 B13820 | loss=3.4885 | lr=0.000543 | step=4592 | 7517s | 12.0GB\n", | |
| "E1 B13840 | loss=3.3832 | lr=0.000542 | step=4593 | 7527s | 12.0GB\n", | |
| "E1 B13860 | loss=3.9758 | lr=0.000542 | step=4594 | 7538s | 12.0GB\n", | |
| "E1 B13880 | loss=4.2042 | lr=0.000542 | step=4594 | 7549s | 12.0GB\n", | |
| "E1 B13900 | loss=3.8671 | lr=0.000542 | step=4595 | 7560s | 12.0GB\n", | |
| "E1 B13920 | loss=3.5813 | lr=0.000541 | step=4596 | 7571s | 12.0GB\n", | |
| "E1 B13940 | loss=3.7433 | lr=0.000541 | step=4596 | 7581s | 12.0GB\n", | |
| "E1 B13960 | loss=3.2050 | lr=0.000541 | step=4597 | 7592s | 12.0GB\n", | |
| "E1 B13980 | loss=3.6970 | lr=0.000541 | step=4597 | 7603s | 12.0GB\n", | |
| "E1 B14000 | loss=3.4994 | lr=0.000541 | step=4598 | 7614s | 12.0GB\n", | |
| "E1 B14020 | loss=3.7733 | lr=0.000541 | step=4599 | 7625s | 12.0GB\n", | |
| "E1 B14040 | loss=3.5705 | lr=0.000541 | step=4599 | 7636s | 12.0GB\n", | |
| "E1 B14060 | loss=3.7267 | lr=0.000540 | step=4600 | 7647s | 12.0GB\n", | |
| "E1 B14080 | loss=3.4656 | lr=0.000540 | step=4601 | 7657s | 12.0GB\n", | |
| "E1 B14100 | loss=3.9683 | lr=0.000540 | step=4601 | 7668s | 12.0GB\n", | |
| "E1 B14120 | loss=3.4433 | lr=0.000540 | step=4602 | 7679s | 12.0GB\n", | |
| "E1 B14140 | loss=3.2508 | lr=0.000540 | step=4602 | 7690s | 12.0GB\n", | |
| "E1 B14160 | loss=3.2970 | lr=0.000540 | step=4603 | 7701s | 12.0GB\n", | |
| "E1 B14180 | loss=3.7326 | lr=0.000539 | step=4604 | 7711s | 12.0GB\n", | |
| "E1 B14200 | loss=3.3949 | lr=0.000539 | step=4604 | 7722s | 12.0GB\n", | |
| "E1 B14220 | loss=2.9957 | lr=0.000539 | step=4605 | 7733s | 12.0GB\n", | |
| "E1 B14240 | loss=3.3279 | lr=0.000539 | step=4606 | 7744s | 12.0GB\n", | |
| "E1 B14260 | loss=3.7099 | lr=0.000539 | step=4606 | 7755s | 12.0GB\n", | |
| "E1 B14280 | loss=3.3056 | lr=0.000539 | step=4607 | 7766s | 12.0GB\n", | |
| "E1 B14300 | loss=3.7645 | lr=0.000539 | step=4607 | 7776s | 12.0GB\n", | |
| "E1 B14320 | loss=4.0314 | lr=0.000538 | step=4608 | 7787s | 12.0GB\n", | |
| "E1 B14340 | loss=3.5340 | lr=0.000538 | step=4609 | 7798s | 12.0GB\n", | |
| "E1 B14360 | loss=3.2238 | lr=0.000538 | step=4609 | 7809s | 12.0GB\n", | |
| "E1 B14380 | loss=3.5954 | lr=0.000538 | step=4610 | 7820s | 12.0GB\n", | |
| "E1 B14400 | loss=3.4484 | lr=0.000537 | step=4611 | 7831s | 12.0GB\n", | |
| "E1 B14420 | loss=3.5944 | lr=0.000537 | step=4611 | 7842s | 12.0GB\n", | |
| "E1 B14440 | loss=3.8301 | lr=0.000537 | step=4612 | 7853s | 12.0GB\n", | |
| "E1 B14460 | loss=3.5630 | lr=0.000537 | step=4612 | 7864s | 12.0GB\n", | |
| "E1 B14480 | loss=3.5284 | lr=0.000537 | step=4613 | 7874s | 12.0GB\n", | |
| "E1 B14500 | loss=3.5871 | lr=0.000537 | step=4614 | 7885s | 12.0GB\n", | |
| "E1 B14520 | loss=3.4255 | lr=0.000537 | step=4614 | 7896s | 12.0GB\n", | |
| "E1 B14540 | loss=3.8356 | lr=0.000536 | step=4615 | 7907s | 12.0GB\n", | |
| "E1 B14560 | loss=3.5553 | lr=0.000536 | step=4616 | 7918s | 12.0GB\n", | |
| "E1 B14580 | loss=3.7851 | lr=0.000536 | step=4616 | 7928s | 12.0GB\n", | |
| "E1 B14600 | loss=3.4283 | lr=0.000536 | step=4617 | 7939s | 12.0GB\n", | |
| "E1 B14620 | loss=3.5296 | lr=0.000536 | step=4617 | 7950s | 12.0GB\n", | |
| "E1 B14640 | loss=3.7185 | lr=0.000536 | step=4618 | 7961s | 12.0GB\n", | |
| "E1 B14660 | loss=3.7297 | lr=0.000535 | step=4619 | 7972s | 12.0GB\n", | |
| "E1 B14680 | loss=3.3585 | lr=0.000535 | step=4619 | 7983s | 12.0GB\n", | |
| "E1 B14700 | loss=3.6645 | lr=0.000535 | step=4620 | 7993s | 12.0GB\n", | |
| "E1 B14720 | loss=3.6405 | lr=0.000535 | step=4621 | 8004s | 12.0GB\n", | |
| "E1 B14740 | loss=3.4520 | lr=0.000535 | step=4621 | 8015s | 12.0GB\n", | |
| "E1 B14760 | loss=3.3239 | lr=0.000534 | step=4622 | 8026s | 12.0GB\n", | |
| "E1 B14780 | loss=3.6989 | lr=0.000534 | step=4622 | 8037s | 12.0GB\n", | |
| "E1 B14800 | loss=3.2144 | lr=0.000534 | step=4623 | 8048s | 12.0GB\n", | |
| "E1 B14820 | loss=3.6017 | lr=0.000534 | step=4624 | 8059s | 12.0GB\n", | |
| "E1 B14840 | loss=3.8878 | lr=0.000534 | step=4624 | 8070s | 12.0GB\n", | |
| "E1 B14860 | loss=3.6501 | lr=0.000534 | step=4625 | 8081s | 12.0GB\n", | |
| "E1 B14880 | loss=3.5632 | lr=0.000533 | step=4626 | 8091s | 12.0GB\n", | |
| "E1 B14900 | loss=3.8801 | lr=0.000533 | step=4626 | 8102s | 12.0GB\n", | |
| "E1 B14920 | loss=3.5604 | lr=0.000533 | step=4627 | 8113s | 12.0GB\n", | |
| "E1 B14940 | loss=3.6093 | lr=0.000533 | step=4627 | 8124s | 12.0GB\n", | |
| "E1 B14960 | loss=3.8432 | lr=0.000533 | step=4628 | 8135s | 12.0GB\n", | |
| "E1 B14980 | loss=3.6731 | lr=0.000532 | step=4629 | 8146s | 12.0GB\n", | |
| "E1 B15000 | loss=3.7453 | lr=0.000532 | step=4629 | 8156s | 12.0GB\n", | |
| "E1 B15020 | loss=3.1902 | lr=0.000532 | step=4630 | 8167s | 12.0GB\n", | |
| "E1 B15040 | loss=3.9759 | lr=0.000532 | step=4631 | 8178s | 12.0GB\n", | |
| "E1 B15060 | loss=3.5921 | lr=0.000532 | step=4631 | 8189s | 12.0GB\n", | |
| "E1 B15080 | loss=3.7955 | lr=0.000532 | step=4632 | 8200s | 12.0GB\n", | |
| "E1 B15100 | loss=3.1859 | lr=0.000532 | step=4632 | 8211s | 12.0GB\n", | |
| "E1 B15120 | loss=3.3266 | lr=0.000531 | step=4633 | 8222s | 12.0GB\n", | |
| "E1 B15140 | loss=2.9951 | lr=0.000531 | step=4634 | 8232s | 12.0GB\n", | |
| "E1 B15160 | loss=3.1925 | lr=0.000531 | step=4634 | 8243s | 12.0GB\n", | |
| "E1 B15180 | loss=3.7554 | lr=0.000531 | step=4635 | 8254s | 12.0GB\n", | |
| "E1 B15200 | loss=3.6881 | lr=0.000530 | step=4636 | 8265s | 12.0GB\n", | |
| "E1 B15220 | loss=4.0078 | lr=0.000530 | step=4636 | 8276s | 12.0GB\n", | |
| "E1 B15240 | loss=3.9151 | lr=0.000530 | step=4637 | 8286s | 12.0GB\n", | |
| "E1 B15260 | loss=3.6669 | lr=0.000530 | step=4637 | 8297s | 12.0GB\n", | |
| "E1 B15280 | loss=3.7740 | lr=0.000530 | step=4638 | 8308s | 12.0GB\n", | |
| "E1 B15300 | loss=3.5239 | lr=0.000530 | step=4639 | 8319s | 12.0GB\n", | |
| "E1 B15320 | loss=3.3195 | lr=0.000530 | step=4639 | 8330s | 12.0GB\n", | |
| "E1 B15340 | loss=3.7336 | lr=0.000529 | step=4640 | 8341s | 12.0GB\n", | |
| "E1 B15360 | loss=3.9771 | lr=0.000529 | step=4641 | 8352s | 12.0GB\n", | |
| "E1 B15380 | loss=3.6891 | lr=0.000529 | step=4641 | 8362s | 12.0GB\n", | |
| "E1 B15400 | loss=3.7304 | lr=0.000529 | step=4642 | 8373s | 12.0GB\n", | |
| "E1 B15420 | loss=3.6676 | lr=0.000529 | step=4642 | 8384s | 12.0GB\n", | |
| "E1 B15440 | loss=3.7816 | lr=0.000528 | step=4643 | 8395s | 12.0GB\n", | |
| "E1 B15460 | loss=4.0167 | lr=0.000528 | step=4644 | 8406s | 12.0GB\n", | |
| "E1 B15480 | loss=3.5876 | lr=0.000528 | step=4644 | 8416s | 12.0GB\n", | |
| "E1 B15500 | loss=3.3342 | lr=0.000528 | step=4645 | 8427s | 12.0GB\n", | |
| "E1 B15520 | loss=4.1262 | lr=0.000528 | step=4646 | 8438s | 12.0GB\n", | |
| "E1 B15540 | loss=3.9095 | lr=0.000528 | step=4646 | 8449s | 12.0GB\n", | |
| "E1 B15560 | loss=3.9194 | lr=0.000527 | step=4647 | 8460s | 12.0GB\n", | |
| "E1 B15580 | loss=2.6783 | lr=0.000527 | step=4647 | 8470s | 12.0GB\n", | |
| "E1 B15600 | loss=3.6020 | lr=0.000527 | step=4648 | 8481s | 12.0GB\n", | |
| "E1 B15620 | loss=3.4268 | lr=0.000527 | step=4649 | 8492s | 12.0GB\n", | |
| "E1 B15640 | loss=3.5878 | lr=0.000527 | step=4649 | 8503s | 12.0GB\n", | |
| "E1 B15660 | loss=3.9185 | lr=0.000526 | step=4650 | 8514s | 12.0GB\n", | |
| "E1 B15680 | loss=3.5568 | lr=0.000526 | step=4651 | 8524s | 12.0GB\n", | |
| "E1 B15700 | loss=3.5793 | lr=0.000526 | step=4651 | 8535s | 12.0GB\n", | |
| "E1 B15720 | loss=3.9349 | lr=0.000526 | step=4652 | 8546s | 12.0GB\n", | |
| "E1 B15740 | loss=3.7211 | lr=0.000526 | step=4652 | 8557s | 12.0GB\n", | |
| "E1 B15760 | loss=3.7580 | lr=0.000526 | step=4653 | 8568s | 12.0GB\n", | |
| "E1 B15780 | loss=3.8156 | lr=0.000525 | step=4654 | 8578s | 12.0GB\n", | |
| "E1 B15800 | loss=3.8089 | lr=0.000525 | step=4654 | 8589s | 12.0GB\n", | |
| "E1 B15820 | loss=3.7674 | lr=0.000525 | step=4655 | 8600s | 12.0GB\n", | |
| "E1 B15840 | loss=3.5427 | lr=0.000525 | step=4656 | 8611s | 12.0GB\n", | |
| "E1 B15860 | loss=3.9465 | lr=0.000525 | step=4656 | 8622s | 12.0GB\n", | |
| "E1 B15880 | loss=3.2502 | lr=0.000524 | step=4657 | 8633s | 12.0GB\n", | |
| "E1 B15900 | loss=3.6790 | lr=0.000524 | step=4657 | 8643s | 12.0GB\n", | |
| "E1 B15920 | loss=3.9152 | lr=0.000524 | step=4658 | 8654s | 12.0GB\n", | |
| "E1 B15940 | loss=3.7068 | lr=0.000524 | step=4659 | 8665s | 12.0GB\n", | |
| "E1 B15960 | loss=3.6129 | lr=0.000524 | step=4659 | 8676s | 12.0GB\n", | |
| "E1 B15980 | loss=3.8030 | lr=0.000523 | step=4660 | 8687s | 12.0GB\n", | |
| "E1 B16000 | loss=3.9211 | lr=0.000523 | step=4661 | 8698s | 12.0GB\n", | |
| "E1 B16020 | loss=3.9647 | lr=0.000523 | step=4661 | 8709s | 12.0GB\n", | |
| "E1 B16040 | loss=3.7150 | lr=0.000523 | step=4662 | 8719s | 12.0GB\n", | |
| "E1 B16060 | loss=3.6997 | lr=0.000523 | step=4662 | 8730s | 12.0GB\n", | |
| "E1 B16080 | loss=3.3745 | lr=0.000523 | step=4663 | 8741s | 12.0GB\n", | |
| "E1 B16100 | loss=3.7085 | lr=0.000522 | step=4664 | 8752s | 12.0GB\n", | |
| "E1 B16120 | loss=3.4774 | lr=0.000522 | step=4664 | 8763s | 12.0GB\n", | |
| "E1 B16140 | loss=3.8498 | lr=0.000522 | step=4665 | 8774s | 12.0GB\n", | |
| "E1 B16160 | loss=2.9288 | lr=0.000522 | step=4666 | 8785s | 12.0GB\n", | |
| "E1 B16180 | loss=3.8713 | lr=0.000522 | step=4666 | 8795s | 12.0GB\n", | |
| "E1 B16200 | loss=3.7599 | lr=0.000521 | step=4667 | 8806s | 12.0GB\n", | |
| "E1 B16220 | loss=3.3343 | lr=0.000521 | step=4667 | 8817s | 12.0GB\n", | |
| "E1 B16240 | loss=3.5703 | lr=0.000521 | step=4668 | 8828s | 12.0GB\n", | |
| "E1 B16260 | loss=3.2515 | lr=0.000521 | step=4669 | 8839s | 12.0GB\n", | |
| "E1 B16280 | loss=3.9524 | lr=0.000521 | step=4669 | 8850s | 12.0GB\n", | |
| "E1 B16300 | loss=3.7848 | lr=0.000520 | step=4670 | 8860s | 12.0GB\n", | |
| "E1 B16320 | loss=3.7893 | lr=0.000520 | step=4671 | 8872s | 12.0GB\n", | |
| "E1 B16340 | loss=3.4424 | lr=0.000520 | step=4671 | 8882s | 12.0GB\n", | |
| "E1 B16360 | loss=3.7894 | lr=0.000520 | step=4672 | 8893s | 12.0GB\n", | |
| "E1 B16380 | loss=3.4307 | lr=0.000520 | step=4672 | 8904s | 12.0GB\n", | |
| "E1 B16400 | loss=3.4072 | lr=0.000520 | step=4673 | 8915s | 12.0GB\n", | |
| "E1 B16420 | loss=4.0177 | lr=0.000519 | step=4674 | 8926s | 12.0GB\n", | |
| "E1 B16440 | loss=3.6548 | lr=0.000519 | step=4674 | 8936s | 12.0GB\n", | |
| "E1 B16460 | loss=3.5178 | lr=0.000519 | step=4675 | 8947s | 12.0GB\n", | |
| "E1 B16480 | loss=3.3491 | lr=0.000519 | step=4676 | 8958s | 12.0GB\n", | |
| "E1 B16500 | loss=3.2887 | lr=0.000519 | step=4676 | 8969s | 12.0GB\n", | |
| "E1 B16520 | loss=3.8528 | lr=0.000518 | step=4677 | 8980s | 12.0GB\n", | |
| "E1 B16540 | loss=3.7825 | lr=0.000518 | step=4677 | 8991s | 12.0GB\n", | |
| "E1 B16560 | loss=4.0140 | lr=0.000518 | step=4678 | 9002s | 12.0GB\n", | |
| "E1 B16580 | loss=3.5878 | lr=0.000518 | step=4679 | 9012s | 12.0GB\n", | |
| "E1 B16600 | loss=3.7900 | lr=0.000518 | step=4679 | 9023s | 12.0GB\n", | |
| "E1 B16620 | loss=3.6624 | lr=0.000517 | step=4680 | 9034s | 12.0GB\n", | |
| "E1 B16640 | loss=2.9227 | lr=0.000517 | step=4681 | 9045s | 12.0GB\n", | |
| "E1 B16660 | loss=3.0362 | lr=0.000517 | step=4681 | 9056s | 12.0GB\n", | |
| "E1 B16680 | loss=3.0783 | lr=0.000517 | step=4682 | 9067s | 12.0GB\n", | |
| "E1 B16700 | loss=3.2623 | lr=0.000517 | step=4682 | 9077s | 12.0GB\n", | |
| "E1 B16720 | loss=3.6685 | lr=0.000516 | step=4683 | 9088s | 12.0GB\n", | |
| "E1 B16740 | loss=3.8044 | lr=0.000516 | step=4684 | 9099s | 12.0GB\n", | |
| "E1 B16760 | loss=3.6746 | lr=0.000516 | step=4684 | 9110s | 12.0GB\n", | |
| "E1 B16780 | loss=3.9200 | lr=0.000516 | step=4685 | 9121s | 12.0GB\n", | |
| "E1 B16800 | loss=3.3981 | lr=0.000516 | step=4686 | 9131s | 12.0GB\n", | |
| "E1 B16820 | loss=3.6064 | lr=0.000516 | step=4686 | 9142s | 12.0GB\n", | |
| "E1 B16840 | loss=3.9160 | lr=0.000515 | step=4687 | 9153s | 12.0GB\n", | |
| "E1 B16860 | loss=2.8895 | lr=0.000515 | step=4687 | 9164s | 12.0GB\n", | |
| "E1 B16880 | loss=3.8388 | lr=0.000515 | step=4688 | 9175s | 12.0GB\n", | |
| "E1 B16900 | loss=3.3849 | lr=0.000515 | step=4689 | 9186s | 12.0GB\n", | |
| "E1 B16920 | loss=3.8158 | lr=0.000515 | step=4689 | 9197s | 12.0GB\n", | |
| "E1 B16940 | loss=3.8618 | lr=0.000514 | step=4690 | 9207s | 12.0GB\n", | |
| "E1 B16960 | loss=3.4646 | lr=0.000514 | step=4691 | 9218s | 12.0GB\n", | |
| "E1 B16980 | loss=3.4437 | lr=0.000514 | step=4691 | 9229s | 12.0GB\n", | |
| "E1 B17000 | loss=3.4719 | lr=0.000514 | step=4692 | 9240s | 12.0GB\n", | |
| "E1 B17020 | loss=4.0562 | lr=0.000514 | step=4692 | 9251s | 12.0GB\n", | |
| "E1 B17040 | loss=3.9825 | lr=0.000513 | step=4693 | 9262s | 12.0GB\n", | |
| "E1 B17060 | loss=3.3620 | lr=0.000513 | step=4694 | 9273s | 12.0GB\n", | |
| "E1 B17080 | loss=3.6819 | lr=0.000513 | step=4694 | 9283s | 12.0GB\n", | |
| "E1 B17100 | loss=3.4374 | lr=0.000513 | step=4695 | 9294s | 12.0GB\n", | |
| "E1 B17120 | loss=3.2610 | lr=0.000512 | step=4696 | 9305s | 12.0GB\n", | |
| "E1 B17140 | loss=3.0374 | lr=0.000512 | step=4696 | 9316s | 12.0GB\n", | |
| "E1 B17160 | loss=3.3523 | lr=0.000512 | step=4697 | 9327s | 12.0GB\n", | |
| "E1 B17180 | loss=3.1935 | lr=0.000512 | step=4697 | 9337s | 12.0GB\n", | |
| "E1 B17200 | loss=2.7538 | lr=0.000512 | step=4698 | 9348s | 12.0GB\n", | |
| "E1 B17220 | loss=3.7147 | lr=0.000512 | step=4699 | 9359s | 12.0GB\n", | |
| "E1 B17240 | loss=3.6338 | lr=0.000512 | step=4699 | 9370s | 12.0GB\n", | |
| "E1 B17260 | loss=3.9150 | lr=0.000511 | step=4700 | 9381s | 12.0GB\n", | |
| "E1 B17280 | loss=3.7103 | lr=0.000511 | step=4701 | 9392s | 12.0GB\n", | |
| "E1 B17300 | loss=3.4421 | lr=0.000511 | step=4701 | 9402s | 12.0GB\n", | |
| "E1 B17320 | loss=3.8645 | lr=0.000511 | step=4702 | 9413s | 12.0GB\n", | |
| "E1 B17340 | loss=3.6876 | lr=0.000511 | step=4702 | 9424s | 12.0GB\n", | |
| "E1 B17360 | loss=3.6183 | lr=0.000510 | step=4703 | 9435s | 12.0GB\n", | |
| "E1 B17380 | loss=3.1357 | lr=0.000510 | step=4704 | 9446s | 12.0GB\n", | |
| "E1 B17400 | loss=3.5974 | lr=0.000510 | step=4704 | 9457s | 12.0GB\n", | |
| "E1 B17420 | loss=3.4442 | lr=0.000510 | step=4705 | 9468s | 12.0GB\n", | |
| "E1 B17440 | loss=3.1941 | lr=0.000509 | step=4706 | 9478s | 12.0GB\n", | |
| "E1 B17460 | loss=3.6786 | lr=0.000509 | step=4706 | 9489s | 12.0GB\n", | |
| "E1 B17480 | loss=3.8438 | lr=0.000509 | step=4707 | 9500s | 12.0GB\n", | |
| "E1 B17500 | loss=3.7240 | lr=0.000509 | step=4707 | 9511s | 12.0GB\n", | |
| "E1 B17520 | loss=2.8875 | lr=0.000509 | step=4708 | 9522s | 12.0GB\n", | |
| "E1 B17540 | loss=3.5598 | lr=0.000508 | step=4709 | 9533s | 12.0GB\n", | |
| "E1 B17560 | loss=4.0223 | lr=0.000508 | step=4709 | 9544s | 12.0GB\n", | |
| "E1 B17580 | loss=4.0608 | lr=0.000508 | step=4710 | 9554s | 12.0GB\n", | |
| "E1 B17600 | loss=3.6853 | lr=0.000508 | step=4711 | 9565s | 12.0GB\n", | |
| "E1 B17620 | loss=3.8094 | lr=0.000508 | step=4711 | 9576s | 12.0GB\n", | |
| "E1 B17640 | loss=3.5969 | lr=0.000507 | step=4712 | 9587s | 12.0GB\n", | |
| "E1 B17660 | loss=3.3221 | lr=0.000507 | step=4712 | 9598s | 12.0GB\n", | |
| "E1 B17680 | loss=3.4366 | lr=0.000507 | step=4713 | 9609s | 12.0GB\n", | |
| "E1 B17700 | loss=3.2076 | lr=0.000507 | step=4714 | 9620s | 12.0GB\n", | |
| "E1 B17720 | loss=3.4754 | lr=0.000507 | step=4714 | 9631s | 12.0GB\n", | |
| "E1 B17740 | loss=3.8241 | lr=0.000506 | step=4715 | 9641s | 12.0GB\n", | |
| "E1 B17760 | loss=3.6548 | lr=0.000506 | step=4716 | 9652s | 12.0GB\n", | |
| "E1 B17780 | loss=3.9470 | lr=0.000506 | step=4716 | 9663s | 12.0GB\n", | |
| "E1 B17800 | loss=3.9560 | lr=0.000506 | step=4717 | 9674s | 12.0GB\n", | |
| "E1 B17820 | loss=3.4773 | lr=0.000506 | step=4717 | 9685s | 12.0GB\n", | |
| "E1 B17840 | loss=3.5450 | lr=0.000505 | step=4718 | 9695s | 12.0GB\n", | |
| "E1 B17860 | loss=3.9347 | lr=0.000505 | step=4719 | 9706s | 12.0GB\n", | |
| "E1 B17880 | loss=3.2176 | lr=0.000505 | step=4719 | 9717s | 12.0GB\n", | |
| "E1 B17900 | loss=3.5696 | lr=0.000505 | step=4720 | 9728s | 12.0GB\n", | |
| "E1 B17920 | loss=3.7430 | lr=0.000504 | step=4721 | 9739s | 12.0GB\n", | |
| "E1 B17940 | loss=3.1883 | lr=0.000504 | step=4721 | 9750s | 12.0GB\n", | |
| "E1 B17960 | loss=3.6833 | lr=0.000504 | step=4722 | 9761s | 12.0GB\n", | |
| "E1 B17980 | loss=3.5830 | lr=0.000504 | step=4722 | 9771s | 12.0GB\n", | |
| "E1 B18000 | loss=3.4364 | lr=0.000504 | step=4723 | 9782s | 12.0GB\n", | |
| "E1 B18020 | loss=3.6262 | lr=0.000503 | step=4724 | 9793s | 12.0GB\n", | |
| "E1 B18040 | loss=3.8005 | lr=0.000503 | step=4724 | 9804s | 12.0GB\n", | |
| "E1 B18060 | loss=4.0659 | lr=0.000503 | step=4725 | 9815s | 12.0GB\n", | |
| "E1 B18080 | loss=3.5156 | lr=0.000503 | step=4726 | 9825s | 12.0GB\n", | |
| "E1 B18100 | loss=3.4394 | lr=0.000503 | step=4726 | 9836s | 12.0GB\n", | |
| "E1 B18120 | loss=4.0423 | lr=0.000502 | step=4727 | 9847s | 12.0GB\n", | |
| "E1 B18140 | loss=4.1539 | lr=0.000502 | step=4727 | 9858s | 12.0GB\n", | |
| "E1 B18160 | loss=3.9463 | lr=0.000502 | step=4728 | 9869s | 12.0GB\n", | |
| "E1 B18180 | loss=3.9627 | lr=0.000502 | step=4729 | 9879s | 12.0GB\n", | |
| "E1 B18200 | loss=4.0079 | lr=0.000502 | step=4729 | 9890s | 12.0GB\n", | |
| "E1 B18220 | loss=3.9469 | lr=0.000502 | step=4730 | 9901s | 12.0GB\n", | |
| "E1 B18240 | loss=3.5010 | lr=0.000501 | step=4731 | 9912s | 12.0GB\n", | |
| "E1 B18260 | loss=3.8297 | lr=0.000501 | step=4731 | 9923s | 12.0GB\n", | |
| "E1 B18280 | loss=3.9131 | lr=0.000501 | step=4732 | 9934s | 12.0GB\n", | |
| "E1 B18300 | loss=3.3401 | lr=0.000501 | step=4732 | 9944s | 12.0GB\n", | |
| "E1 B18320 | loss=3.6218 | lr=0.000501 | step=4733 | 9955s | 12.0GB\n", | |
| "E1 B18340 | loss=3.8618 | lr=0.000500 | step=4734 | 9966s | 12.0GB\n", | |
| "E1 B18360 | loss=4.0161 | lr=0.000500 | step=4734 | 9977s | 12.0GB\n", | |
| "E1 B18380 | loss=4.0321 | lr=0.000500 | step=4735 | 9988s | 12.0GB\n", | |
| "E1 B18400 | loss=3.5492 | lr=0.000500 | step=4736 | 9999s | 12.0GB\n", | |
| "E1 B18420 | loss=2.2225 | lr=0.000500 | step=4736 | 10010s | 12.0GB\n", | |
| "E1 B18440 | loss=3.7971 | lr=0.000499 | step=4737 | 10020s | 12.0GB\n", | |
| "E1 B18460 | loss=4.0066 | lr=0.000499 | step=4737 | 10031s | 12.0GB\n", | |
| "E1 B18480 | loss=3.9193 | lr=0.000499 | step=4738 | 10042s | 12.0GB\n", | |
| "E1 B18500 | loss=3.1617 | lr=0.000499 | step=4739 | 10053s | 12.0GB\n", | |
| "E1 B18520 | loss=3.4578 | lr=0.000499 | step=4739 | 10064s | 12.0GB\n", | |
| "E1 B18540 | loss=3.8322 | lr=0.000498 | step=4740 | 10075s | 12.0GB\n", | |
| "E1 B18560 | loss=3.8193 | lr=0.000498 | step=4741 | 10086s | 12.0GB\n", | |
| "E1 B18580 | loss=3.5207 | lr=0.000498 | step=4741 | 10096s | 12.0GB\n", | |
| "E1 B18600 | loss=3.5149 | lr=0.000498 | step=4742 | 10107s | 12.0GB\n", | |
| "E1 B18620 | loss=3.3524 | lr=0.000498 | step=4742 | 10118s | 12.0GB\n", | |
| "E1 B18640 | loss=3.3334 | lr=0.000497 | step=4743 | 10129s | 12.0GB\n", | |
| "E1 B18660 | loss=3.2191 | lr=0.000497 | step=4744 | 10140s | 12.0GB\n", | |
| "E1 B18680 | loss=3.3109 | lr=0.000497 | step=4744 | 10151s | 12.0GB\n", | |
| "E1 B18700 | loss=3.6123 | lr=0.000497 | step=4745 | 10161s | 12.0GB\n", | |
| "E1 B18720 | loss=3.2261 | lr=0.000496 | step=4746 | 10172s | 12.0GB\n", | |
| "E1 B18740 | loss=3.3636 | lr=0.000496 | step=4746 | 10183s | 12.0GB\n", | |
| "E1 B18760 | loss=3.3872 | lr=0.000496 | step=4747 | 10194s | 12.0GB\n", | |
| "E1 B18780 | loss=3.8045 | lr=0.000496 | step=4747 | 10205s | 12.0GB\n", | |
| "E1 B18800 | loss=3.7654 | lr=0.000496 | step=4748 | 10216s | 12.0GB\n", | |
| "E1 B18820 | loss=3.7371 | lr=0.000495 | step=4749 | 10227s | 12.0GB\n", | |
| "E1 B18840 | loss=3.7708 | lr=0.000495 | step=4749 | 10237s | 12.0GB\n", | |
| "E1 B18860 | loss=3.4715 | lr=0.000495 | step=4750 | 10248s | 12.0GB\n", | |
| "E1 B18880 | loss=2.9569 | lr=0.000495 | step=4751 | 10259s | 12.0GB\n", | |
| "E1 B18900 | loss=2.9004 | lr=0.000495 | step=4751 | 10270s | 12.0GB\n", | |
| "E1 B18920 | loss=3.6417 | lr=0.000494 | step=4752 | 10281s | 12.0GB\n", | |
| "E1 B18940 | loss=4.0751 | lr=0.000494 | step=4752 | 10292s | 12.0GB\n", | |
| "E1 B18960 | loss=3.9682 | lr=0.000494 | step=4753 | 10303s | 12.0GB\n", | |
| "E1 B18980 | loss=3.3080 | lr=0.000493 | step=4754 | 10313s | 12.0GB\n", | |
| "E1 B19000 | loss=3.6964 | lr=0.000493 | step=4754 | 10324s | 12.0GB\n", | |
| "E1 B19020 | loss=3.6574 | lr=0.000493 | step=4755 | 10335s | 12.0GB\n", | |
| "E1 B19040 | loss=3.6207 | lr=0.000493 | step=4756 | 10346s | 12.0GB\n", | |
| "E1 B19060 | loss=3.5495 | lr=0.000493 | step=4756 | 10357s | 12.0GB\n", | |
| "E1 B19080 | loss=3.3664 | lr=0.000492 | step=4757 | 10368s | 12.0GB\n", | |
| "E1 B19100 | loss=3.3639 | lr=0.000492 | step=4757 | 10378s | 12.0GB\n", | |
| "E1 B19120 | loss=3.9101 | lr=0.000492 | step=4758 | 10389s | 12.0GB\n", | |
| "E1 B19140 | loss=3.4268 | lr=0.000492 | step=4759 | 10400s | 12.0GB\n", | |
| "E1 B19160 | loss=3.6607 | lr=0.000492 | step=4759 | 10411s | 12.0GB\n", | |
| "E1 B19180 | loss=3.4684 | lr=0.000491 | step=4760 | 10422s | 12.0GB\n", | |
| "E1 B19200 | loss=3.3371 | lr=0.000491 | step=4761 | 10433s | 12.0GB\n", | |
| "E1 B19220 | loss=3.4770 | lr=0.000491 | step=4761 | 10444s | 12.0GB\n", | |
| "E1 B19240 | loss=3.8563 | lr=0.000491 | step=4762 | 10454s | 12.0GB\n", | |
| "E1 B19260 | loss=3.2565 | lr=0.000491 | step=4762 | 10465s | 12.0GB\n", | |
| "E1 B19280 | loss=3.5451 | lr=0.000490 | step=4763 | 10476s | 12.0GB\n", | |
| "E1 B19300 | loss=3.7681 | lr=0.000490 | step=4764 | 10487s | 12.0GB\n", | |
| "E1 B19320 | loss=3.8859 | lr=0.000490 | step=4764 | 10498s | 12.0GB\n", | |
| "E1 B19340 | loss=3.7048 | lr=0.000490 | step=4765 | 10509s | 12.0GB\n", | |
| "E1 B19360 | loss=3.6960 | lr=0.000489 | step=4766 | 10520s | 12.0GB\n", | |
| "E1 B19380 | loss=3.2113 | lr=0.000489 | step=4766 | 10530s | 12.0GB\n", | |
| "E1 B19400 | loss=3.8300 | lr=0.000489 | step=4767 | 10541s | 12.0GB\n", | |
| "E1 B19420 | loss=2.9761 | lr=0.000489 | step=4767 | 10552s | 12.0GB\n", | |
| "E1 B19440 | loss=3.3190 | lr=0.000489 | step=4768 | 10563s | 12.0GB\n", | |
| "E1 B19460 | loss=3.7198 | lr=0.000488 | step=4769 | 10574s | 12.0GB\n", | |
| "E1 B19480 | loss=3.2659 | lr=0.000488 | step=4769 | 10585s | 12.0GB\n", | |
| "E1 B19500 | loss=3.7874 | lr=0.000488 | step=4770 | 10596s | 12.0GB\n", | |
| "E1 B19520 | loss=3.9013 | lr=0.000488 | step=4771 | 10607s | 12.0GB\n", | |
| "E1 B19540 | loss=3.4400 | lr=0.000488 | step=4771 | 10617s | 12.0GB\n", | |
| "E1 B19560 | loss=3.7815 | lr=0.000487 | step=4772 | 10628s | 12.0GB\n", | |
| "E1 B19580 | loss=4.0152 | lr=0.000487 | step=4772 | 10639s | 12.0GB\n", | |
| "E1 B19600 | loss=3.3567 | lr=0.000487 | step=4773 | 10650s | 12.0GB\n", | |
| "E1 B19620 | loss=4.0636 | lr=0.000487 | step=4774 | 10661s | 12.0GB\n", | |
| "E1 B19640 | loss=3.8267 | lr=0.000487 | step=4774 | 10672s | 12.0GB\n", | |
| "E1 B19660 | loss=3.0816 | lr=0.000486 | step=4775 | 10683s | 12.0GB\n", | |
| "E1 B19680 | loss=3.6697 | lr=0.000486 | step=4776 | 10693s | 12.0GB\n", | |
| "E1 B19700 | loss=3.2949 | lr=0.000486 | step=4776 | 10704s | 12.0GB\n", | |
| "E1 B19720 | loss=3.4744 | lr=0.000486 | step=4777 | 10715s | 12.0GB\n", | |
| "E1 B19740 | loss=3.7729 | lr=0.000486 | step=4777 | 10726s | 12.0GB\n", | |
| "E1 B19760 | loss=3.4864 | lr=0.000485 | step=4778 | 10737s | 12.0GB\n", | |
| "E1 B19780 | loss=2.6309 | lr=0.000485 | step=4779 | 10748s | 12.0GB\n", | |
| "E1 B19800 | loss=3.4228 | lr=0.000485 | step=4779 | 10758s | 12.0GB\n", | |
| "E1 B19820 | loss=3.6884 | lr=0.000485 | step=4780 | 10769s | 12.0GB\n", | |
| "E1 B19840 | loss=3.5250 | lr=0.000484 | step=4781 | 10780s | 12.0GB\n", | |
| "E1 B19860 | loss=3.4740 | lr=0.000484 | step=4781 | 10791s | 12.0GB\n", | |
| "E1 B19880 | loss=3.7028 | lr=0.000484 | step=4782 | 10802s | 12.0GB\n", | |
| "E1 B19900 | loss=4.0438 | lr=0.000484 | step=4782 | 10813s | 12.0GB\n", | |
| "E1 B19920 | loss=3.8486 | lr=0.000483 | step=4783 | 10824s | 12.0GB\n", | |
| "E1 B19940 | loss=3.8473 | lr=0.000483 | step=4784 | 10834s | 12.0GB\n", | |
| "E1 B19960 | loss=3.4150 | lr=0.000483 | step=4784 | 10845s | 12.0GB\n", | |
| "E1 B19980 | loss=3.6953 | lr=0.000483 | step=4785 | 10856s | 12.0GB\n", | |
| "E1 B20000 | loss=4.1786 | lr=0.000482 | step=4786 | 10867s | 12.0GB\n", | |
| "E1 B20020 | loss=3.3390 | lr=0.000482 | step=4786 | 10878s | 12.0GB\n", | |
| "E1 B20040 | loss=3.7157 | lr=0.000482 | step=4787 | 10889s | 12.0GB\n", | |
| "E1 B20060 | loss=3.7897 | lr=0.000482 | step=4787 | 10900s | 12.0GB\n", | |
| "E1 B20080 | loss=2.7745 | lr=0.000482 | step=4788 | 10911s | 12.0GB\n", | |
| "E1 B20100 | loss=3.4621 | lr=0.000481 | step=4789 | 10921s | 12.0GB\n", | |
| "E1 B20120 | loss=3.7668 | lr=0.000481 | step=4789 | 10932s | 12.0GB\n", | |
| "E1 B20140 | loss=3.5815 | lr=0.000481 | step=4790 | 10943s | 12.0GB\n", | |
| "E1 B20160 | loss=3.8193 | lr=0.000481 | step=4791 | 10954s | 12.0GB\n", | |
| "E1 B20180 | loss=2.7656 | lr=0.000481 | step=4791 | 10965s | 12.0GB\n", | |
| "E1 B20200 | loss=3.7449 | lr=0.000480 | step=4792 | 10975s | 12.0GB\n", | |
| "E1 B20220 | loss=3.6068 | lr=0.000480 | step=4792 | 10986s | 12.0GB\n", | |
| "E1 B20240 | loss=3.7449 | lr=0.000480 | step=4793 | 10997s | 12.0GB\n", | |
| "E1 B20260 | loss=3.4059 | lr=0.000480 | step=4794 | 11008s | 12.0GB\n", | |
| "E1 B20280 | loss=3.8344 | lr=0.000480 | step=4794 | 11019s | 12.0GB\n", | |
| "E1 B20300 | loss=3.8879 | lr=0.000479 | step=4795 | 11029s | 12.0GB\n", | |
| "E1 B20320 | loss=3.2521 | lr=0.000479 | step=4796 | 11040s | 12.0GB\n", | |
| "E1 B20340 | loss=3.1142 | lr=0.000479 | step=4796 | 11051s | 12.0GB\n", | |
| "E1 B20360 | loss=3.2035 | lr=0.000479 | step=4797 | 11062s | 12.0GB\n", | |
| "E1 B20380 | loss=3.8140 | lr=0.000479 | step=4797 | 11073s | 12.0GB\n", | |
| "E1 B20400 | loss=3.3051 | lr=0.000478 | step=4798 | 11084s | 12.0GB\n", | |
| "E1 B20420 | loss=4.0647 | lr=0.000478 | step=4799 | 11095s | 12.0GB\n", | |
| "E1 B20440 | loss=3.5170 | lr=0.000478 | step=4799 | 11105s | 12.0GB\n", | |
| "E1 B20460 | loss=3.3191 | lr=0.000477 | step=4800 | 11116s | 12.0GB\n", | |
| "E1 B20480 | loss=3.7245 | lr=0.000477 | step=4801 | 11127s | 12.0GB\n", | |
| "E1 B20500 | loss=4.2356 | lr=0.000477 | step=4801 | 11138s | 12.0GB\n", | |
| "E1 B20520 | loss=3.4636 | lr=0.000477 | step=4802 | 11149s | 12.0GB\n", | |
| "E1 B20540 | loss=3.8208 | lr=0.000477 | step=4802 | 11160s | 12.0GB\n", | |
| "E1 B20560 | loss=3.1483 | lr=0.000476 | step=4803 | 11171s | 12.0GB\n", | |
| "E1 B20580 | loss=3.6152 | lr=0.000476 | step=4804 | 11181s | 12.0GB\n", | |
| "E1 B20600 | loss=3.7210 | lr=0.000476 | step=4804 | 11192s | 12.0GB\n", | |
| "E1 B20620 | loss=3.7620 | lr=0.000476 | step=4805 | 11203s | 12.0GB\n", | |
| "E1 B20640 | loss=3.7058 | lr=0.000475 | step=4806 | 11214s | 12.0GB\n", | |
| "E1 B20660 | loss=3.3306 | lr=0.000475 | step=4806 | 11225s | 12.0GB\n", | |
| "E1 B20680 | loss=3.8335 | lr=0.000475 | step=4807 | 11236s | 12.0GB\n", | |
| "E1 B20700 | loss=3.5726 | lr=0.000475 | step=4807 | 11246s | 12.0GB\n", | |
| "E1 B20720 | loss=3.5849 | lr=0.000475 | step=4808 | 11258s | 12.0GB\n", | |
| "E1 B20740 | loss=3.9291 | lr=0.000474 | step=4809 | 11268s | 12.0GB\n", | |
| "E1 B20760 | loss=3.8323 | lr=0.000474 | step=4809 | 11279s | 12.0GB\n", | |
| "E1 B20780 | loss=3.8389 | lr=0.000474 | step=4810 | 11290s | 12.0GB\n", | |
| "E1 B20800 | loss=3.7930 | lr=0.000474 | step=4811 | 11301s | 12.0GB\n", | |
| "E1 B20820 | loss=3.3067 | lr=0.000474 | step=4811 | 11312s | 12.0GB\n", | |
| "E1 B20840 | loss=3.5891 | lr=0.000473 | step=4812 | 11322s | 12.0GB\n", | |
| "E1 B20860 | loss=3.6297 | lr=0.000473 | step=4812 | 11333s | 12.0GB\n", | |
| "E1 B20880 | loss=3.4881 | lr=0.000473 | step=4813 | 11344s | 12.0GB\n", | |
| "E1 B20900 | loss=3.2608 | lr=0.000472 | step=4814 | 11355s | 12.0GB\n", | |
| "E1 B20920 | loss=3.8812 | lr=0.000472 | step=4814 | 11366s | 12.0GB\n", | |
| "E1 B20940 | loss=3.8893 | lr=0.000472 | step=4815 | 11377s | 12.0GB\n", | |
| "E1 B20960 | loss=3.6312 | lr=0.000472 | step=4816 | 11388s | 12.0GB\n", | |
| "E1 B20980 | loss=3.9429 | lr=0.000472 | step=4816 | 11398s | 12.0GB\n", | |
| "E1 B21000 | loss=3.6469 | lr=0.000471 | step=4817 | 11409s | 12.0GB\n", | |
| "E1 B21020 | loss=3.8276 | lr=0.000471 | step=4817 | 11420s | 12.0GB\n", | |
| "E1 B21040 | loss=3.1819 | lr=0.000471 | step=4818 | 11431s | 12.0GB\n", | |
| "E1 B21060 | loss=3.5652 | lr=0.000471 | step=4819 | 11442s | 12.0GB\n", | |
| "E1 B21080 | loss=3.7116 | lr=0.000471 | step=4819 | 11453s | 12.0GB\n", | |
| "E1 B21100 | loss=3.5899 | lr=0.000470 | step=4820 | 11463s | 12.0GB\n", | |
| "E1 B21120 | loss=3.6385 | lr=0.000470 | step=4821 | 11474s | 12.0GB\n", | |
| "E1 B21140 | loss=3.6116 | lr=0.000470 | step=4821 | 11485s | 12.0GB\n", | |
| "E1 B21160 | loss=3.4876 | lr=0.000470 | step=4822 | 11496s | 12.0GB\n", | |
| "E1 B21180 | loss=2.7928 | lr=0.000470 | step=4822 | 11507s | 12.0GB\n", | |
| "E1 B21200 | loss=3.2959 | lr=0.000469 | step=4823 | 11518s | 12.0GB\n", | |
| "E1 B21220 | loss=3.2488 | lr=0.000469 | step=4824 | 11529s | 12.0GB\n", | |
| "E1 B21240 | loss=3.9810 | lr=0.000469 | step=4824 | 11539s | 12.0GB\n", | |
| "E1 B21260 | loss=3.4409 | lr=0.000468 | step=4825 | 11551s | 12.0GB\n", | |
| "E1 B21280 | loss=3.3919 | lr=0.000468 | step=4826 | 11561s | 12.0GB\n", | |
| "E1 B21300 | loss=3.6066 | lr=0.000468 | step=4826 | 11572s | 12.0GB\n", | |
| "E1 B21320 | loss=3.7456 | lr=0.000468 | step=4827 | 11583s | 12.0GB\n", | |
| "E1 B21340 | loss=3.7623 | lr=0.000468 | step=4827 | 11594s | 12.0GB\n", | |
| "E1 B21360 | loss=3.5647 | lr=0.000467 | step=4828 | 11605s | 12.0GB\n", | |
| "E1 B21380 | loss=3.3589 | lr=0.000467 | step=4829 | 11615s | 12.0GB\n", | |
| "E1 B21400 | loss=3.5778 | lr=0.000467 | step=4829 | 11626s | 12.0GB\n", | |
| "E1 B21420 | loss=3.4602 | lr=0.000467 | step=4830 | 11637s | 12.0GB\n", | |
| "E1 B21440 | loss=3.6518 | lr=0.000466 | step=4831 | 11648s | 12.0GB\n", | |
| "E1 B21460 | loss=3.4086 | lr=0.000466 | step=4831 | 11659s | 12.0GB\n", | |
| "E1 B21480 | loss=3.6213 | lr=0.000466 | step=4832 | 11670s | 12.0GB\n", | |
| "E1 B21500 | loss=3.2806 | lr=0.000466 | step=4832 | 11680s | 12.0GB\n", | |
| "E1 B21520 | loss=3.4391 | lr=0.000465 | step=4833 | 11691s | 12.0GB\n", | |
| "E1 B21540 | loss=3.6801 | lr=0.000465 | step=4834 | 11702s | 12.0GB\n", | |
| "E1 B21560 | loss=3.5161 | lr=0.000465 | step=4834 | 11713s | 12.0GB\n", | |
| "E1 B21580 | loss=3.6287 | lr=0.000465 | step=4835 | 11724s | 12.0GB\n", | |
| "E1 B21600 | loss=3.4723 | lr=0.000464 | step=4836 | 11734s | 12.0GB\n", | |
| "E1 B21620 | loss=3.6654 | lr=0.000464 | step=4836 | 11745s | 12.0GB\n", | |
| "E1 B21640 | loss=3.4930 | lr=0.000464 | step=4837 | 11756s | 12.0GB\n", | |
| "E1 B21660 | loss=3.2791 | lr=0.000464 | step=4837 | 11767s | 12.0GB\n", | |
| "E1 B21680 | loss=3.2304 | lr=0.000464 | step=4838 | 11778s | 12.0GB\n", | |
| "E1 B21700 | loss=3.5780 | lr=0.000463 | step=4839 | 11788s | 12.0GB\n", | |
| "E1 B21720 | loss=3.3952 | lr=0.000463 | step=4839 | 11799s | 12.0GB\n", | |
| "E1 B21740 | loss=3.0119 | lr=0.000463 | step=4840 | 11810s | 12.0GB\n", | |
| "E1 B21760 | loss=3.7743 | lr=0.000463 | step=4841 | 11821s | 12.0GB\n", | |
| "E1 B21780 | loss=3.7173 | lr=0.000463 | step=4841 | 11832s | 12.0GB\n", | |
| "E1 B21800 | loss=3.7074 | lr=0.000462 | step=4842 | 11843s | 12.0GB\n", | |
| "E1 B21820 | loss=3.6044 | lr=0.000462 | step=4842 | 11854s | 12.0GB\n", | |
| "E1 B21840 | loss=3.3934 | lr=0.000462 | step=4843 | 11864s | 12.0GB\n", | |
| "E1 B21860 | loss=3.8649 | lr=0.000461 | step=4844 | 11875s | 12.0GB\n", | |
| "E1 B21880 | loss=3.6675 | lr=0.000461 | step=4844 | 11886s | 12.0GB\n", | |
| "E1 B21900 | loss=3.2675 | lr=0.000461 | step=4845 | 11897s | 12.0GB\n", | |
| "E1 B21920 | loss=3.2880 | lr=0.000461 | step=4846 | 11908s | 12.0GB\n", | |
| "E1 B21940 | loss=3.4379 | lr=0.000461 | step=4846 | 11919s | 12.0GB\n", | |
| "E1 B21960 | loss=3.2820 | lr=0.000460 | step=4847 | 11930s | 12.0GB\n", | |
| "E1 B21980 | loss=3.6073 | lr=0.000460 | step=4847 | 11940s | 12.0GB\n", | |
| "E1 B22000 | loss=3.6944 | lr=0.000460 | step=4848 | 11951s | 12.0GB\n", | |
| "E1 B22020 | loss=3.4143 | lr=0.000460 | step=4849 | 11962s | 12.0GB\n", | |
| "E1 B22040 | loss=3.6798 | lr=0.000460 | step=4849 | 11973s | 12.0GB\n", | |
| "E1 B22060 | loss=3.2871 | lr=0.000459 | step=4850 | 11984s | 12.0GB\n", | |
| "E1 B22080 | loss=3.3731 | lr=0.000459 | step=4851 | 11995s | 12.0GB\n", | |
| "E1 B22100 | loss=3.2190 | lr=0.000459 | step=4851 | 12005s | 12.0GB\n", | |
| "E1 B22120 | loss=3.5056 | lr=0.000458 | step=4852 | 12016s | 12.0GB\n", | |
| "E1 B22140 | loss=3.5595 | lr=0.000458 | step=4852 | 12027s | 12.0GB\n", | |
| "E1 B22160 | loss=3.2776 | lr=0.000458 | step=4853 | 12038s | 12.0GB\n", | |
| "E1 B22180 | loss=3.6260 | lr=0.000458 | step=4854 | 12049s | 12.0GB\n", | |
| "E1 B22200 | loss=2.9745 | lr=0.000458 | step=4854 | 12060s | 12.0GB\n", | |
| "E1 B22220 | loss=3.3694 | lr=0.000457 | step=4855 | 12071s | 12.0GB\n", | |
| "E1 B22240 | loss=3.5799 | lr=0.000457 | step=4856 | 12081s | 12.0GB\n", | |
| "E1 B22260 | loss=3.2559 | lr=0.000457 | step=4856 | 12092s | 12.0GB\n", | |
| "E1 B22280 | loss=3.6499 | lr=0.000457 | step=4857 | 12103s | 12.0GB\n", | |
| "E1 B22300 | loss=3.1695 | lr=0.000457 | step=4857 | 12114s | 12.0GB\n", | |
| "E1 B22320 | loss=3.1934 | lr=0.000456 | step=4858 | 12125s | 12.0GB\n", | |
| "E1 B22340 | loss=3.8149 | lr=0.000456 | step=4859 | 12135s | 12.0GB\n", | |
| "E1 B22360 | loss=3.4963 | lr=0.000456 | step=4859 | 12146s | 12.0GB\n", | |
| "E1 B22380 | loss=3.7786 | lr=0.000455 | step=4860 | 12157s | 12.0GB\n", | |
| "E1 B22400 | loss=2.8336 | lr=0.000455 | step=4861 | 12168s | 12.0GB\n", | |
| "E1 B22420 | loss=3.1371 | lr=0.000455 | step=4861 | 12179s | 12.0GB\n", | |
| "E1 B22440 | loss=3.1434 | lr=0.000455 | step=4862 | 12189s | 12.0GB\n", | |
| "E1 B22460 | loss=4.0221 | lr=0.000455 | step=4862 | 12200s | 12.0GB\n", | |
| "E1 B22480 | loss=3.3503 | lr=0.000454 | step=4863 | 12211s | 12.0GB\n", | |
| "E1 B22500 | loss=3.5887 | lr=0.000454 | step=4864 | 12222s | 12.0GB\n", | |
| "E1 B22520 | loss=3.8658 | lr=0.000454 | step=4864 | 12233s | 12.0GB\n", | |
| "E1 B22540 | loss=3.9582 | lr=0.000454 | step=4865 | 12244s | 12.0GB\n", | |
| "E1 B22560 | loss=3.6130 | lr=0.000453 | step=4866 | 12255s | 12.0GB\n", | |
| "E1 B22580 | loss=3.9035 | lr=0.000453 | step=4866 | 12265s | 12.0GB\n", | |
| "E1 B22600 | loss=3.4495 | lr=0.000453 | step=4867 | 12276s | 12.0GB\n", | |
| "E1 B22620 | loss=3.7212 | lr=0.000453 | step=4867 | 12287s | 12.0GB\n", | |
| "E1 B22640 | loss=3.5402 | lr=0.000452 | step=4868 | 12298s | 12.0GB\n", | |
| "E1 B22660 | loss=3.3200 | lr=0.000452 | step=4869 | 12309s | 12.0GB\n", | |
| "E1 B22680 | loss=3.4649 | lr=0.000452 | step=4869 | 12320s | 12.0GB\n", | |
| "E1 B22700 | loss=2.9249 | lr=0.000452 | step=4870 | 12331s | 12.0GB\n", | |
| "E1 B22720 | loss=3.2887 | lr=0.000451 | step=4871 | 12342s | 12.0GB\n", | |
| "E1 B22740 | loss=3.6601 | lr=0.000451 | step=4871 | 12352s | 12.0GB\n", | |
| "E1 B22760 | loss=3.4427 | lr=0.000451 | step=4872 | 12363s | 12.0GB\n", | |
| "E1 B22780 | loss=3.3118 | lr=0.000451 | step=4872 | 12374s | 12.0GB\n", | |
| "E1 B22800 | loss=2.8072 | lr=0.000450 | step=4873 | 12385s | 12.0GB\n", | |
| "E1 B22820 | loss=3.4987 | lr=0.000450 | step=4874 | 12396s | 12.0GB\n", | |
| "E1 B22840 | loss=3.8054 | lr=0.000450 | step=4874 | 12407s | 12.0GB\n", | |
| "E1 B22860 | loss=3.1587 | lr=0.000450 | step=4875 | 12418s | 12.0GB\n", | |
| "E1 B22880 | loss=3.8781 | lr=0.000449 | step=4876 | 12428s | 12.0GB\n", | |
| "E1 B22900 | loss=3.3794 | lr=0.000449 | step=4876 | 12439s | 12.0GB\n", | |
| "E1 B22920 | loss=3.2177 | lr=0.000449 | step=4877 | 12450s | 12.0GB\n", | |
| "E1 B22940 | loss=3.8462 | lr=0.000449 | step=4877 | 12461s | 12.0GB\n", | |
| "E1 B22960 | loss=2.8065 | lr=0.000449 | step=4878 | 12472s | 12.0GB\n", | |
| "E1 B22980 | loss=3.7444 | lr=0.000448 | step=4879 | 12482s | 12.0GB\n", | |
| "E1 B23000 | loss=3.0406 | lr=0.000448 | step=4879 | 12493s | 12.0GB\n", | |
| "E1 B23020 | loss=3.6982 | lr=0.000448 | step=4880 | 12504s | 12.0GB\n", | |
| "E1 B23040 | loss=3.1486 | lr=0.000447 | step=4881 | 12515s | 12.0GB\n", | |
| "E1 B23060 | loss=3.4180 | lr=0.000447 | step=4881 | 12526s | 12.0GB\n", | |
| "E1 B23080 | loss=3.1856 | lr=0.000447 | step=4882 | 12537s | 12.0GB\n", | |
| "E1 B23100 | loss=3.9107 | lr=0.000447 | step=4882 | 12547s | 12.0GB\n", | |
| "E1 B23120 | loss=3.6783 | lr=0.000447 | step=4883 | 12558s | 12.0GB\n", | |
| "E1 B23140 | loss=3.5179 | lr=0.000446 | step=4884 | 12569s | 12.0GB\n", | |
| "E1 B23160 | loss=3.1287 | lr=0.000446 | step=4884 | 12580s | 12.0GB\n", | |
| "E1 B23180 | loss=3.8437 | lr=0.000446 | step=4885 | 12591s | 12.0GB\n", | |
| "E1 B23200 | loss=3.3440 | lr=0.000446 | step=4886 | 12602s | 12.0GB\n", | |
| "E1 B23220 | loss=3.9070 | lr=0.000446 | step=4886 | 12613s | 12.0GB\n", | |
| "E1 B23240 | loss=3.8997 | lr=0.000445 | step=4887 | 12623s | 12.0GB\n", | |
| "E1 B23260 | loss=3.3088 | lr=0.000445 | step=4887 | 12634s | 12.0GB\n", | |
| "E1 B23280 | loss=3.8936 | lr=0.000445 | step=4888 | 12645s | 12.0GB\n", | |
| "E1 B23300 | loss=3.4948 | lr=0.000444 | step=4889 | 12656s | 12.0GB\n", | |
| "E1 B23320 | loss=3.4849 | lr=0.000444 | step=4889 | 12667s | 12.0GB\n", | |
| "E1 B23340 | loss=3.3783 | lr=0.000444 | step=4890 | 12678s | 12.0GB\n", | |
| "E1 B23360 | loss=3.3134 | lr=0.000444 | step=4891 | 12689s | 12.0GB\n", | |
| "E1 B23380 | loss=2.9131 | lr=0.000444 | step=4891 | 12699s | 12.0GB\n", | |
| "E1 B23400 | loss=3.6583 | lr=0.000443 | step=4892 | 12710s | 12.0GB\n", | |
| "E1 B23420 | loss=3.7176 | lr=0.000443 | step=4892 | 12721s | 12.0GB\n", | |
| "E1 B23440 | loss=3.7055 | lr=0.000443 | step=4893 | 12732s | 12.0GB\n", | |
| "E1 B23460 | loss=3.4473 | lr=0.000442 | step=4894 | 12743s | 12.0GB\n", | |
| "E1 B23480 | loss=3.4606 | lr=0.000442 | step=4894 | 12754s | 12.0GB\n", | |
| "E1 B23500 | loss=3.6305 | lr=0.000442 | step=4895 | 12764s | 12.0GB\n", | |
| "E1 B23520 | loss=3.3946 | lr=0.000442 | step=4896 | 12775s | 12.0GB\n", | |
| "E1 B23540 | loss=3.3780 | lr=0.000442 | step=4896 | 12786s | 12.0GB\n", | |
| "E1 B23560 | loss=3.4992 | lr=0.000441 | step=4897 | 12797s | 12.0GB\n", | |
| "E1 B23580 | loss=3.8806 | lr=0.000441 | step=4897 | 12808s | 12.0GB\n", | |
| "E1 B23600 | loss=3.3802 | lr=0.000441 | step=4898 | 12819s | 12.0GB\n", | |
| "E1 B23620 | loss=3.5861 | lr=0.000440 | step=4899 | 12830s | 12.0GB\n", | |
| "E1 B23640 | loss=3.8803 | lr=0.000440 | step=4899 | 12840s | 12.0GB\n", | |
| "E1 B23660 | loss=3.4512 | lr=0.000440 | step=4900 | 12851s | 12.0GB\n", | |
| "E1 B23680 | loss=4.0879 | lr=0.000440 | step=4901 | 12862s | 12.0GB\n", | |
| "E1 B23700 | loss=3.5736 | lr=0.000440 | step=4901 | 12873s | 12.0GB\n", | |
| "E1 B23720 | loss=3.6083 | lr=0.000439 | step=4902 | 12884s | 12.0GB\n", | |
| "E1 B23740 | loss=3.5070 | lr=0.000439 | step=4902 | 12894s | 12.0GB\n", | |
| "E1 B23760 | loss=3.6224 | lr=0.000439 | step=4903 | 12905s | 12.0GB\n", | |
| "E1 B23780 | loss=3.0983 | lr=0.000439 | step=4904 | 12916s | 12.0GB\n", | |
| "E1 B23800 | loss=3.8892 | lr=0.000439 | step=4904 | 12927s | 12.0GB\n", | |
| "E1 B23820 | loss=4.0243 | lr=0.000438 | step=4905 | 12938s | 12.0GB\n", | |
| "E1 B23840 | loss=3.4785 | lr=0.000438 | step=4906 | 12949s | 12.0GB\n", | |
| "E1 B23860 | loss=3.5346 | lr=0.000438 | step=4906 | 12960s | 12.0GB\n", | |
| "E1 B23880 | loss=3.4360 | lr=0.000437 | step=4907 | 12970s | 12.0GB\n", | |
| "E1 B23900 | loss=3.8723 | lr=0.000437 | step=4907 | 12981s | 12.0GB\n", | |
| "E1 B23920 | loss=3.3388 | lr=0.000437 | step=4908 | 12992s | 12.0GB\n", | |
| "E1 B23940 | loss=3.5175 | lr=0.000437 | step=4909 | 13003s | 12.0GB\n", | |
| "E1 B23960 | loss=3.2382 | lr=0.000437 | step=4909 | 13014s | 12.0GB\n", | |
| "E1 B23980 | loss=3.4159 | lr=0.000436 | step=4910 | 13025s | 12.0GB\n", | |
| "E1 B24000 | loss=3.5781 | lr=0.000436 | step=4911 | 13035s | 12.0GB\n", | |
| "E1 B24020 | loss=3.3608 | lr=0.000436 | step=4911 | 13046s | 12.0GB\n", | |
| "E1 B24040 | loss=3.5169 | lr=0.000435 | step=4912 | 13057s | 12.0GB\n", | |
| "E1 B24060 | loss=3.5106 | lr=0.000435 | step=4912 | 13068s | 12.0GB\n", | |
| "E1 B24080 | loss=3.4944 | lr=0.000435 | step=4913 | 13079s | 12.0GB\n", | |
| "E1 B24100 | loss=4.0368 | lr=0.000435 | step=4914 | 13090s | 12.0GB\n", | |
| "E1 B24120 | loss=3.3850 | lr=0.000435 | step=4914 | 13101s | 12.0GB\n", | |
| "E1 B24140 | loss=3.6751 | lr=0.000434 | step=4915 | 13111s | 12.0GB\n", | |
| "E1 B24160 | loss=3.3746 | lr=0.000434 | step=4916 | 13122s | 12.0GB\n", | |
| "E1 B24180 | loss=3.6394 | lr=0.000434 | step=4916 | 13133s | 12.0GB\n", | |
| "E1 B24200 | loss=3.7715 | lr=0.000433 | step=4917 | 13144s | 12.0GB\n", | |
| "E1 B24220 | loss=3.4295 | lr=0.000433 | step=4917 | 13155s | 12.0GB\n", | |
| "E1 B24240 | loss=4.0320 | lr=0.000433 | step=4918 | 13166s | 12.0GB\n", | |
| "E1 B24260 | loss=3.8062 | lr=0.000433 | step=4919 | 13177s | 12.0GB\n", | |
| "E1 B24280 | loss=3.8035 | lr=0.000433 | step=4919 | 13187s | 12.0GB\n", | |
| "E1 B24300 | loss=3.5507 | lr=0.000432 | step=4920 | 13198s | 12.0GB\n", | |
| "E1 B24320 | loss=3.3794 | lr=0.000432 | step=4921 | 13209s | 12.0GB\n", | |
| "E1 B24340 | loss=4.0198 | lr=0.000432 | step=4921 | 13220s | 12.0GB\n", | |
| "E1 B24360 | loss=3.8569 | lr=0.000431 | step=4922 | 13231s | 12.0GB\n", | |
| "E1 B24380 | loss=3.7611 | lr=0.000431 | step=4922 | 13241s | 12.0GB\n", | |
| "E1 B24400 | loss=3.7705 | lr=0.000431 | step=4923 | 13252s | 12.0GB\n", | |
| "E1 B24420 | loss=3.0190 | lr=0.000431 | step=4924 | 13263s | 12.0GB\n", | |
| "E1 B24440 | loss=3.7499 | lr=0.000431 | step=4924 | 13274s | 12.0GB\n", | |
| "E1 B24460 | loss=3.6518 | lr=0.000430 | step=4925 | 13285s | 12.0GB\n", | |
| "E1 B24480 | loss=3.7865 | lr=0.000430 | step=4926 | 13296s | 12.0GB\n", | |
| "E1 B24500 | loss=3.1010 | lr=0.000430 | step=4926 | 13306s | 12.0GB\n", | |
| "E1 B24520 | loss=3.4612 | lr=0.000429 | step=4927 | 13317s | 12.0GB\n", | |
| "E1 B24540 | loss=3.4903 | lr=0.000429 | step=4927 | 13328s | 12.0GB\n", | |
| "E1 B24560 | loss=3.2525 | lr=0.000429 | step=4928 | 13339s | 12.0GB\n", | |
| "E1 B24580 | loss=3.5318 | lr=0.000429 | step=4929 | 13350s | 12.0GB\n", | |
| "E1 B24600 | loss=3.4591 | lr=0.000429 | step=4929 | 13361s | 12.0GB\n", | |
| "E1 B24620 | loss=3.6028 | lr=0.000428 | step=4930 | 13372s | 12.0GB\n", | |
| "E1 B24640 | loss=3.1059 | lr=0.000428 | step=4931 | 13383s | 12.0GB\n", | |
| "E1 B24660 | loss=3.8259 | lr=0.000428 | step=4931 | 13394s | 12.0GB\n", | |
| "E1 B24680 | loss=3.6998 | lr=0.000427 | step=4932 | 13404s | 12.0GB\n", | |
| "E1 B24700 | loss=3.1745 | lr=0.000427 | step=4932 | 13415s | 12.0GB\n", | |
| "E1 B24720 | loss=3.8418 | lr=0.000427 | step=4933 | 13426s | 12.0GB\n", | |
| "E1 B24740 | loss=3.1091 | lr=0.000427 | step=4934 | 13437s | 12.0GB\n", | |
| "E1 B24760 | loss=3.5076 | lr=0.000427 | step=4934 | 13448s | 12.0GB\n", | |
| "E1 B24780 | loss=3.1879 | lr=0.000426 | step=4935 | 13458s | 12.0GB\n", | |
| "E1 B24800 | loss=3.5779 | lr=0.000426 | step=4936 | 13469s | 12.0GB\n", | |
| "E1 B24820 | loss=3.1780 | lr=0.000426 | step=4936 | 13480s | 12.0GB\n", | |
| "E1 B24840 | loss=3.9145 | lr=0.000425 | step=4937 | 13491s | 12.0GB\n", | |
| "E1 B24860 | loss=3.5436 | lr=0.000425 | step=4937 | 13502s | 12.0GB\n", | |
| "E1 B24880 | loss=3.3128 | lr=0.000425 | step=4938 | 13513s | 12.0GB\n", | |
| "E1 B24900 | loss=3.5493 | lr=0.000425 | step=4939 | 13524s | 12.0GB\n", | |
| "E1 B24920 | loss=3.5766 | lr=0.000425 | step=4939 | 13535s | 12.0GB\n", | |
| "E1 B24940 | loss=3.7863 | lr=0.000424 | step=4940 | 13545s | 12.0GB\n", | |
| "E1 B24960 | loss=3.6315 | lr=0.000424 | step=4941 | 13556s | 12.0GB\n", | |
| "E1 B24980 | loss=2.7098 | lr=0.000424 | step=4941 | 13567s | 12.0GB\n", | |
| "E1 B25000 | loss=3.5038 | lr=0.000424 | step=4942 | 13578s | 12.0GB\n", | |
| "E1 B25020 | loss=3.0940 | lr=0.000424 | step=4942 | 13589s | 12.0GB\n", | |
| "E1 B25040 | loss=3.4241 | lr=0.000423 | step=4943 | 13600s | 12.0GB\n", | |
| "E1 B25060 | loss=3.5914 | lr=0.000423 | step=4944 | 13611s | 12.0GB\n", | |
| "E1 B25080 | loss=3.2504 | lr=0.000423 | step=4944 | 13621s | 12.0GB\n", | |
| "E1 B25100 | loss=3.9381 | lr=0.000422 | step=4945 | 13632s | 12.0GB\n", | |
| "E1 B25120 | loss=3.7746 | lr=0.000422 | step=4946 | 13643s | 12.0GB\n", | |
| "E1 B25140 | loss=2.9835 | lr=0.000422 | step=4946 | 13654s | 12.0GB\n", | |
| "E1 B25160 | loss=3.3647 | lr=0.000421 | step=4947 | 13665s | 12.0GB\n", | |
| "E1 B25180 | loss=3.9059 | lr=0.000421 | step=4947 | 13676s | 12.0GB\n", | |
| "E1 B25200 | loss=3.8741 | lr=0.000421 | step=4948 | 13686s | 12.0GB\n", | |
| "E1 B25220 | loss=2.8788 | lr=0.000421 | step=4949 | 13697s | 12.0GB\n", | |
| "E1 B25240 | loss=3.6051 | lr=0.000421 | step=4949 | 13708s | 12.0GB\n", | |
| "E1 B25260 | loss=3.3991 | lr=0.000420 | step=4950 | 13719s | 12.0GB\n", | |
| "E1 B25280 | loss=3.6056 | lr=0.000420 | step=4951 | 13730s | 12.0GB\n", | |
| "E1 B25300 | loss=2.9274 | lr=0.000420 | step=4951 | 13740s | 12.0GB\n", | |
| "E1 B25320 | loss=3.2301 | lr=0.000419 | step=4952 | 13751s | 12.0GB\n", | |
| "E1 B25340 | loss=3.5280 | lr=0.000419 | step=4952 | 13762s | 12.0GB\n", | |
| "E1 B25360 | loss=3.8888 | lr=0.000419 | step=4953 | 13773s | 12.0GB\n", | |
| "E1 B25380 | loss=2.8609 | lr=0.000419 | step=4954 | 13784s | 12.0GB\n", | |
| "E1 B25400 | loss=3.4338 | lr=0.000419 | step=4954 | 13795s | 12.0GB\n", | |
| "E1 B25420 | loss=3.6740 | lr=0.000418 | step=4955 | 13806s | 12.0GB\n", | |
| "E1 B25440 | loss=3.8806 | lr=0.000418 | step=4956 | 13816s | 12.0GB\n", | |
| "E1 B25460 | loss=3.8368 | lr=0.000418 | step=4956 | 13827s | 12.0GB\n", | |
| "E1 B25480 | loss=3.7533 | lr=0.000417 | step=4957 | 13838s | 12.0GB\n", | |
| "E1 B25500 | loss=3.5869 | lr=0.000417 | step=4957 | 13849s | 12.0GB\n", | |
| "E1 B25520 | loss=3.8248 | lr=0.000417 | step=4958 | 13860s | 12.0GB\n", | |
| "E1 B25540 | loss=2.7628 | lr=0.000417 | step=4959 | 13871s | 12.0GB\n", | |
| "E1 B25560 | loss=3.9692 | lr=0.000417 | step=4959 | 13882s | 12.0GB\n", | |
| "E1 B25580 | loss=3.2583 | lr=0.000416 | step=4960 | 13892s | 12.0GB\n", | |
| "E1 B25600 | loss=3.4334 | lr=0.000416 | step=4961 | 13903s | 12.0GB\n", | |
| "E1 B25620 | loss=3.6134 | lr=0.000416 | step=4961 | 13914s | 12.0GB\n", | |
| "E1 B25640 | loss=3.1456 | lr=0.000415 | step=4962 | 13925s | 12.0GB\n", | |
| "E1 B25660 | loss=3.8615 | lr=0.000415 | step=4962 | 13936s | 12.0GB\n", | |
| "E1 B25680 | loss=3.5333 | lr=0.000415 | step=4963 | 13947s | 12.0GB\n", | |
| "E1 B25700 | loss=3.6401 | lr=0.000415 | step=4964 | 13957s | 12.0GB\n", | |
| "E1 B25720 | loss=2.7807 | lr=0.000415 | step=4964 | 13968s | 12.0GB\n", | |
| "E1 B25740 | loss=3.6467 | lr=0.000414 | step=4965 | 13979s | 12.0GB\n", | |
| "E1 B25760 | loss=3.2870 | lr=0.000414 | step=4966 | 13990s | 12.0GB\n", | |
| "E1 B25780 | loss=3.2872 | lr=0.000414 | step=4966 | 14001s | 12.0GB\n", | |
| "E1 B25800 | loss=3.7470 | lr=0.000413 | step=4967 | 14012s | 12.0GB\n", | |
| "E1 B25820 | loss=3.8406 | lr=0.000413 | step=4967 | 14023s | 12.0GB\n", | |
| "E1 B25840 | loss=3.5743 | lr=0.000413 | step=4968 | 14034s | 12.0GB\n", | |
| "E1 B25860 | loss=3.4428 | lr=0.000413 | step=4969 | 14045s | 12.0GB\n", | |
| "E1 B25880 | loss=3.3368 | lr=0.000413 | step=4969 | 14055s | 12.0GB\n", | |
| "E1 B25900 | loss=3.3002 | lr=0.000412 | step=4970 | 14066s | 12.0GB\n", | |
| "E1 B25920 | loss=3.7617 | lr=0.000412 | step=4971 | 14077s | 12.0GB\n", | |
| "E1 B25940 | loss=2.8183 | lr=0.000412 | step=4971 | 14088s | 12.0GB\n", | |
| "E1 B25960 | loss=2.7850 | lr=0.000411 | step=4972 | 14099s | 12.0GB\n", | |
| "E1 B25980 | loss=3.6429 | lr=0.000411 | step=4972 | 14109s | 12.0GB\n", | |
| "E1 B26000 | loss=3.6248 | lr=0.000411 | step=4973 | 14120s | 12.0GB\n", | |
| "E1 B26020 | loss=3.9476 | lr=0.000411 | step=4974 | 14131s | 12.0GB\n", | |
| "E1 B26040 | loss=3.7221 | lr=0.000411 | step=4974 | 14142s | 12.0GB\n", | |
| "E1 B26060 | loss=4.0401 | lr=0.000410 | step=4975 | 14153s | 12.0GB\n", | |
| "E1 B26080 | loss=3.2853 | lr=0.000410 | step=4976 | 14164s | 12.0GB\n", | |
| "E1 B26100 | loss=3.5873 | lr=0.000410 | step=4976 | 14174s | 12.0GB\n", | |
| "E1 B26120 | loss=3.3819 | lr=0.000409 | step=4977 | 14185s | 12.0GB\n", | |
| "E1 B26140 | loss=2.7990 | lr=0.000409 | step=4977 | 14196s | 12.0GB\n", | |
| "E1 B26160 | loss=3.5141 | lr=0.000409 | step=4978 | 14207s | 12.0GB\n", | |
| "E1 B26180 | loss=2.7465 | lr=0.000409 | step=4979 | 14218s | 12.0GB\n", | |
| "E1 B26200 | loss=3.7604 | lr=0.000409 | step=4979 | 14229s | 12.0GB\n", | |
| "E1 B26220 | loss=3.8415 | lr=0.000408 | step=4980 | 14240s | 12.0GB\n", | |
| "E1 B26240 | loss=2.8629 | lr=0.000408 | step=4981 | 14250s | 12.0GB\n", | |
| "E1 B26260 | loss=3.9403 | lr=0.000408 | step=4981 | 14261s | 12.0GB\n", | |
| "E1 B26280 | loss=3.1647 | lr=0.000407 | step=4982 | 14272s | 12.0GB\n", | |
| "E1 B26300 | loss=3.6963 | lr=0.000407 | step=4982 | 14283s | 12.0GB\n", | |
| "E1 B26320 | loss=3.4216 | lr=0.000407 | step=4983 | 14294s | 12.0GB\n", | |
| "E1 B26340 | loss=3.2970 | lr=0.000407 | step=4984 | 14304s | 12.0GB\n", | |
| "E1 B26360 | loss=3.8007 | lr=0.000407 | step=4984 | 14315s | 12.0GB\n", | |
| "E1 B26380 | loss=3.6792 | lr=0.000406 | step=4985 | 14326s | 12.0GB\n", | |
| "E1 B26400 | loss=3.3375 | lr=0.000406 | step=4986 | 14337s | 12.0GB\n", | |
| "E1 B26420 | loss=3.4695 | lr=0.000406 | step=4986 | 14348s | 12.0GB\n", | |
| "E1 B26440 | loss=3.5424 | lr=0.000405 | step=4987 | 14358s | 12.0GB\n", | |
| "E1 B26460 | loss=3.9141 | lr=0.000405 | step=4987 | 14369s | 12.0GB\n", | |
| "E1 B26480 | loss=3.3546 | lr=0.000405 | step=4988 | 14380s | 12.0GB\n", | |
| "E1 B26500 | loss=3.4930 | lr=0.000404 | step=4989 | 14391s | 12.0GB\n", | |
| "E1 B26520 | loss=3.7345 | lr=0.000404 | step=4989 | 14402s | 12.0GB\n", | |
| "[SAVED] Checkpoint saved to: /home/spedrox/Transformers/checkpoints/auto_epoch1_step4989.pt\n", | |
| "E1 B26540 | loss=3.8095 | lr=0.000404 | step=4990 | 14414s | 12.0GB\n", | |
| "E1 B26560 | loss=3.3759 | lr=0.000404 | step=4991 | 14425s | 12.0GB\n", | |
| "E1 B26580 | loss=3.5695 | lr=0.000404 | step=4991 | 14435s | 12.0GB\n", | |
| "E1 B26600 | loss=3.8147 | lr=0.000403 | step=4992 | 14446s | 12.0GB\n", | |
| "E1 B26620 | loss=3.4743 | lr=0.000403 | step=4992 | 14457s | 12.0GB\n", | |
| "E1 B26640 | loss=3.3927 | lr=0.000403 | step=4993 | 14468s | 12.0GB\n", | |
| "E1 B26660 | loss=2.9684 | lr=0.000402 | step=4994 | 14479s | 12.0GB\n", | |
| "E1 B26680 | loss=3.9439 | lr=0.000402 | step=4994 | 14490s | 12.0GB\n", | |
| "E1 B26700 | loss=3.3368 | lr=0.000402 | step=4995 | 14501s | 12.0GB\n", | |
| "E1 B26720 | loss=3.7026 | lr=0.000402 | step=4996 | 14512s | 12.0GB\n", | |
| "E1 B26740 | loss=3.4886 | lr=0.000402 | step=4996 | 14522s | 12.0GB\n", | |
| "E1 B26760 | loss=3.7736 | lr=0.000401 | step=4997 | 14533s | 12.0GB\n", | |
| "E1 B26780 | loss=3.6057 | lr=0.000401 | step=4997 | 14544s | 12.0GB\n", | |
| "E1 B26800 | loss=3.8810 | lr=0.000401 | step=4998 | 14555s | 12.0GB\n", | |
| "E1 B26820 | loss=3.3142 | lr=0.000400 | step=4999 | 14566s | 12.0GB\n", | |
| "E1 B26840 | loss=3.8771 | lr=0.000400 | step=4999 | 14577s | 12.0GB\n", | |
| "E1 B26860 | loss=3.0084 | lr=0.000400 | step=5000 | 14588s | 12.0GB\n", | |
| "E1 B26880 | loss=4.0164 | lr=0.000400 | step=5001 | 14599s | 12.0GB\n", | |
| "E1 B26900 | loss=3.1404 | lr=0.000400 | step=5001 | 14609s | 12.0GB\n", | |
| "E1 B26920 | loss=4.1475 | lr=0.000399 | step=5002 | 14620s | 12.0GB\n", | |
| "E1 B26940 | loss=3.5440 | lr=0.000399 | step=5002 | 14631s | 12.0GB\n", | |
| "E1 B26960 | loss=3.3280 | lr=0.000399 | step=5003 | 14642s | 12.0GB\n", | |
| "E1 B26980 | loss=3.8527 | lr=0.000398 | step=5004 | 14652s | 12.0GB\n", | |
| "E1 B27000 | loss=3.8424 | lr=0.000398 | step=5004 | 14663s | 12.0GB\n", | |
| "E1 B27020 | loss=3.6947 | lr=0.000398 | step=5005 | 14674s | 12.0GB\n", | |
| "E1 B27040 | loss=3.0392 | lr=0.000397 | step=5006 | 14685s | 12.0GB\n", | |
| "E1 B27060 | loss=3.6312 | lr=0.000397 | step=5006 | 14696s | 12.0GB\n", | |
| "E1 B27080 | loss=3.7321 | lr=0.000397 | step=5007 | 14707s | 12.0GB\n", | |
| "E1 B27100 | loss=3.8801 | lr=0.000397 | step=5007 | 14717s | 12.0GB\n", | |
| "E1 B27120 | loss=2.7253 | lr=0.000397 | step=5008 | 14728s | 12.0GB\n", | |
| "E1 B27140 | loss=3.5252 | lr=0.000396 | step=5009 | 14739s | 12.0GB\n", | |
| "E1 B27160 | loss=3.9129 | lr=0.000396 | step=5009 | 14750s | 12.0GB\n", | |
| "E1 B27180 | loss=3.2345 | lr=0.000396 | step=5010 | 14761s | 12.0GB\n", | |
| "E1 B27200 | loss=3.5554 | lr=0.000395 | step=5011 | 14772s | 12.0GB\n", | |
| "E1 B27220 | loss=3.6058 | lr=0.000395 | step=5011 | 14783s | 12.0GB\n", | |
| "E1 B27240 | loss=4.0371 | lr=0.000395 | step=5012 | 14794s | 12.0GB\n", | |
| "E1 B27260 | loss=3.3209 | lr=0.000395 | step=5012 | 14805s | 12.0GB\n", | |
| "E1 B27280 | loss=3.8953 | lr=0.000395 | step=5013 | 14815s | 12.0GB\n", | |
| "E1 B27300 | loss=3.8048 | lr=0.000394 | step=5014 | 14826s | 12.0GB\n", | |
| "E1 B27320 | loss=3.9074 | lr=0.000394 | step=5014 | 14837s | 12.0GB\n", | |
| "E1 B27340 | loss=3.8233 | lr=0.000394 | step=5015 | 14848s | 12.0GB\n", | |
| "E1 B27360 | loss=3.7038 | lr=0.000393 | step=5016 | 14859s | 12.0GB\n", | |
| "E1 B27380 | loss=3.2882 | lr=0.000393 | step=5016 | 14870s | 12.0GB\n", | |
| "E1 B27400 | loss=4.1671 | lr=0.000393 | step=5017 | 14880s | 12.0GB\n", | |
| "E1 B27420 | loss=3.6806 | lr=0.000393 | step=5017 | 14891s | 12.0GB\n", | |
| "E1 B27440 | loss=3.5987 | lr=0.000393 | step=5018 | 14902s | 12.0GB\n", | |
| "E1 B27460 | loss=3.3280 | lr=0.000392 | step=5019 | 14913s | 12.0GB\n", | |
| "E1 B27480 | loss=3.7397 | lr=0.000392 | step=5019 | 14924s | 12.0GB\n", | |
| "E1 B27500 | loss=3.9335 | lr=0.000392 | step=5020 | 14934s | 12.0GB\n", | |
| "E1 B27520 | loss=3.5258 | lr=0.000391 | step=5021 | 14945s | 12.0GB\n", | |
| "E1 B27540 | loss=3.8988 | lr=0.000391 | step=5021 | 14956s | 12.0GB\n", | |
| "E1 B27560 | loss=3.3827 | lr=0.000391 | step=5022 | 14967s | 12.0GB\n", | |
| "E1 B27580 | loss=3.7979 | lr=0.000391 | step=5022 | 14978s | 12.0GB\n", | |
| "E1 B27600 | loss=3.7189 | lr=0.000390 | step=5023 | 14989s | 12.0GB\n", | |
| "E1 B27620 | loss=3.6412 | lr=0.000390 | step=5024 | 15000s | 12.0GB\n", | |
| "E1 B27640 | loss=3.8602 | lr=0.000390 | step=5024 | 15010s | 12.0GB\n", | |
| "E1 B27660 | loss=3.2110 | lr=0.000390 | step=5025 | 15021s | 12.0GB\n", | |
| "E1 B27680 | loss=3.2444 | lr=0.000389 | step=5026 | 15032s | 12.0GB\n", | |
| "E1 B27700 | loss=3.3729 | lr=0.000389 | step=5026 | 15043s | 12.0GB\n", | |
| "E1 B27720 | loss=3.9324 | lr=0.000389 | step=5027 | 15054s | 12.0GB\n", | |
| "E1 B27740 | loss=3.6677 | lr=0.000389 | step=5027 | 15065s | 12.0GB\n", | |
| "E1 B27760 | loss=3.2831 | lr=0.000388 | step=5028 | 15076s | 12.0GB\n", | |
| "E1 B27780 | loss=3.6353 | lr=0.000388 | step=5029 | 15086s | 12.0GB\n", | |
| "E1 B27800 | loss=3.7980 | lr=0.000388 | step=5029 | 15097s | 12.0GB\n", | |
| "E1 B27820 | loss=3.2172 | lr=0.000388 | step=5030 | 15108s | 12.0GB\n", | |
| "E1 B27840 | loss=3.5407 | lr=0.000387 | step=5031 | 15119s | 12.0GB\n", | |
| "E1 B27860 | loss=2.9529 | lr=0.000387 | step=5031 | 15130s | 12.0GB\n", | |
| "E1 B27880 | loss=3.3911 | lr=0.000387 | step=5032 | 15141s | 12.0GB\n", | |
| "E1 B27900 | loss=3.3645 | lr=0.000387 | step=5032 | 15151s | 12.0GB\n", | |
| "E1 B27920 | loss=3.3011 | lr=0.000386 | step=5033 | 15162s | 12.0GB\n", | |
| "E1 B27940 | loss=3.5460 | lr=0.000386 | step=5034 | 15173s | 12.0GB\n", | |
| "E1 B27960 | loss=3.8915 | lr=0.000386 | step=5034 | 15184s | 12.0GB\n", | |
| "E1 B27980 | loss=3.0708 | lr=0.000385 | step=5035 | 15195s | 12.0GB\n", | |
| "E1 B28000 | loss=2.7979 | lr=0.000385 | step=5036 | 15205s | 12.0GB\n", | |
| "E1 B28020 | loss=3.7306 | lr=0.000385 | step=5036 | 15216s | 12.0GB\n", | |
| "E1 B28040 | loss=3.8111 | lr=0.000385 | step=5037 | 15227s | 12.0GB\n", | |
| "E1 B28060 | loss=3.2395 | lr=0.000385 | step=5037 | 15238s | 12.0GB\n", | |
| "E1 B28080 | loss=3.3460 | lr=0.000384 | step=5038 | 15249s | 12.0GB\n", | |
| "E1 B28100 | loss=3.3409 | lr=0.000384 | step=5039 | 15260s | 12.0GB\n", | |
| "E1 B28120 | loss=3.6009 | lr=0.000384 | step=5039 | 15271s | 12.0GB\n", | |
| "E1 B28140 | loss=3.7609 | lr=0.000383 | step=5040 | 15281s | 12.0GB\n", | |
| "E1 B28160 | loss=2.9209 | lr=0.000383 | step=5041 | 15293s | 12.0GB\n", | |
| "E1 B28180 | loss=2.8932 | lr=0.000383 | step=5041 | 15303s | 12.0GB\n", | |
| "E1 B28200 | loss=3.9660 | lr=0.000383 | step=5042 | 15314s | 12.0GB\n", | |
| "E1 B28220 | loss=3.2723 | lr=0.000383 | step=5042 | 15325s | 12.0GB\n", | |
| "E1 B28240 | loss=3.3144 | lr=0.000382 | step=5043 | 15336s | 12.0GB\n", | |
| "E1 B28260 | loss=3.4275 | lr=0.000382 | step=5044 | 15347s | 12.0GB\n", | |
| "E1 B28280 | loss=3.9336 | lr=0.000382 | step=5044 | 15358s | 12.0GB\n", | |
| "E1 B28300 | loss=3.2864 | lr=0.000381 | step=5045 | 15368s | 12.0GB\n", | |
| "E1 B28320 | loss=3.4334 | lr=0.000381 | step=5046 | 15379s | 12.0GB\n", | |
| "E1 B28340 | loss=3.2147 | lr=0.000381 | step=5046 | 15390s | 12.0GB\n", | |
| "E1 B28360 | loss=3.5784 | lr=0.000380 | step=5047 | 15401s | 12.0GB\n", | |
| "E1 B28380 | loss=3.6441 | lr=0.000380 | step=5047 | 15412s | 12.0GB\n", | |
| "E1 B28400 | loss=3.2938 | lr=0.000380 | step=5048 | 15423s | 12.0GB\n", | |
| "E1 B28420 | loss=3.9342 | lr=0.000380 | step=5049 | 15434s | 12.0GB\n", | |
| "E1 B28440 | loss=4.0055 | lr=0.000380 | step=5049 | 15444s | 12.0GB\n", | |
| "E1 B28460 | loss=3.0819 | lr=0.000379 | step=5050 | 15455s | 12.0GB\n", | |
| "E1 B28480 | loss=3.2738 | lr=0.000379 | step=5051 | 15466s | 12.0GB\n", | |
| "E1 B28500 | loss=3.4871 | lr=0.000379 | step=5051 | 15477s | 12.0GB\n", | |
| "E1 B28520 | loss=3.3662 | lr=0.000378 | step=5052 | 15488s | 12.0GB\n", | |
| "E1 B28540 | loss=3.4072 | lr=0.000378 | step=5052 | 15498s | 12.0GB\n", | |
| "E1 B28560 | loss=3.3289 | lr=0.000378 | step=5053 | 15509s | 12.0GB\n", | |
| "E1 B28580 | loss=3.5751 | lr=0.000377 | step=5054 | 15520s | 12.0GB\n", | |
| "E1 B28600 | loss=4.0007 | lr=0.000377 | step=5054 | 15531s | 12.0GB\n", | |
| "E1 B28620 | loss=3.7902 | lr=0.000377 | step=5055 | 15542s | 12.0GB\n", | |
| "E1 B28640 | loss=3.3766 | lr=0.000377 | step=5056 | 15553s | 12.0GB\n", | |
| "E1 B28660 | loss=3.3543 | lr=0.000377 | step=5056 | 15564s | 12.0GB\n", | |
| "E1 B28680 | loss=3.1487 | lr=0.000376 | step=5057 | 15575s | 12.0GB\n", | |
| "E1 B28700 | loss=3.4571 | lr=0.000376 | step=5057 | 15585s | 12.0GB\n", | |
| "E1 B28720 | loss=3.9504 | lr=0.000376 | step=5058 | 15596s | 12.0GB\n", | |
| "E1 B28740 | loss=4.2288 | lr=0.000375 | step=5059 | 15607s | 12.0GB\n", | |
| "E1 B28760 | loss=4.1612 | lr=0.000375 | step=5059 | 15618s | 12.0GB\n", | |
| "E1 B28780 | loss=3.2038 | lr=0.000375 | step=5060 | 15629s | 12.0GB\n", | |
| "E1 B28800 | loss=3.4873 | lr=0.000375 | step=5061 | 15640s | 12.0GB\n", | |
| "E1 B28820 | loss=3.3439 | lr=0.000375 | step=5061 | 15651s | 12.0GB\n", | |
| "E1 B28840 | loss=3.4705 | lr=0.000374 | step=5062 | 15661s | 12.0GB\n", | |
| "E1 B28860 | loss=3.8846 | lr=0.000374 | step=5062 | 15672s | 12.0GB\n", | |
| "E1 B28880 | loss=3.5027 | lr=0.000374 | step=5063 | 15683s | 12.0GB\n", | |
| "E1 B28900 | loss=3.2639 | lr=0.000373 | step=5064 | 15694s | 12.0GB\n", | |
| "E1 B28920 | loss=3.3081 | lr=0.000373 | step=5064 | 15705s | 12.0GB\n", | |
| "E1 B28940 | loss=3.8983 | lr=0.000373 | step=5065 | 15716s | 12.0GB\n", | |
| "E1 B28960 | loss=3.6940 | lr=0.000372 | step=5066 | 15727s | 12.0GB\n", | |
| "E1 B28980 | loss=3.6517 | lr=0.000372 | step=5066 | 15737s | 12.0GB\n", | |
| "E1 B29000 | loss=3.9069 | lr=0.000372 | step=5067 | 15748s | 12.0GB\n", | |
| "E1 B29020 | loss=3.2380 | lr=0.000372 | step=5067 | 15759s | 12.0GB\n", | |
| "E1 B29040 | loss=3.8585 | lr=0.000372 | step=5068 | 15770s | 12.0GB\n", | |
| "E1 B29060 | loss=3.6463 | lr=0.000371 | step=5069 | 15781s | 12.0GB\n", | |
| "E1 B29080 | loss=3.9509 | lr=0.000371 | step=5069 | 15792s | 12.0GB\n", | |
| "E1 B29100 | loss=2.9151 | lr=0.000371 | step=5070 | 15802s | 12.0GB\n", | |
| "E1 B29120 | loss=3.6782 | lr=0.000370 | step=5071 | 15813s | 12.0GB\n", | |
| "E1 B29140 | loss=3.4876 | lr=0.000370 | step=5071 | 15824s | 12.0GB\n", | |
| "E1 B29160 | loss=3.0929 | lr=0.000370 | step=5072 | 15835s | 12.0GB\n", | |
| "E1 B29180 | loss=3.4309 | lr=0.000370 | step=5072 | 15846s | 12.0GB\n", | |
| "E1 B29200 | loss=3.8218 | lr=0.000369 | step=5073 | 15857s | 12.0GB\n", | |
| "E1 B29220 | loss=3.4776 | lr=0.000369 | step=5074 | 15868s | 12.0GB\n", | |
| "E1 B29240 | loss=3.5979 | lr=0.000369 | step=5074 | 15879s | 12.0GB\n", | |
| "E1 B29260 | loss=3.7141 | lr=0.000369 | step=5075 | 15890s | 12.0GB\n", | |
| "E1 B29280 | loss=3.3557 | lr=0.000368 | step=5076 | 15900s | 12.0GB\n", | |
| "E1 B29300 | loss=3.5840 | lr=0.000368 | step=5076 | 15911s | 12.0GB\n", | |
| "E1 B29320 | loss=3.7102 | lr=0.000368 | step=5077 | 15922s | 12.0GB\n", | |
| "E1 B29340 | loss=3.3457 | lr=0.000368 | step=5077 | 15933s | 12.0GB\n", | |
| "E1 B29360 | loss=3.7696 | lr=0.000367 | step=5078 | 15944s | 12.0GB\n", | |
| "E1 B29380 | loss=3.5044 | lr=0.000367 | step=5079 | 15954s | 12.0GB\n", | |
| "E1 B29400 | loss=3.6796 | lr=0.000367 | step=5079 | 15965s | 12.0GB\n", | |
| "E1 B29420 | loss=3.6739 | lr=0.000367 | step=5080 | 15976s | 12.0GB\n", | |
| "E1 B29440 | loss=3.3947 | lr=0.000366 | step=5081 | 15987s | 12.0GB\n", | |
| "E1 B29460 | loss=3.5062 | lr=0.000366 | step=5081 | 15998s | 12.0GB\n", | |
| "E1 B29480 | loss=3.2264 | lr=0.000366 | step=5082 | 16009s | 12.0GB\n", | |
| "E1 B29500 | loss=3.3014 | lr=0.000366 | step=5082 | 16019s | 12.0GB\n", | |
| "E1 B29520 | loss=2.9197 | lr=0.000365 | step=5083 | 16030s | 12.0GB\n", | |
| "E1 B29540 | loss=3.6012 | lr=0.000365 | step=5084 | 16041s | 12.0GB\n", | |
| "E1 B29560 | loss=3.2362 | lr=0.000365 | step=5084 | 16052s | 12.0GB\n", | |
| "E1 B29580 | loss=3.0337 | lr=0.000364 | step=5085 | 16063s | 12.0GB\n", | |
| "E1 B29600 | loss=2.9766 | lr=0.000364 | step=5086 | 16074s | 12.0GB\n", | |
| "E1 B29620 | loss=4.0326 | lr=0.000364 | step=5086 | 16085s | 12.0GB\n", | |
| "E1 B29640 | loss=3.4412 | lr=0.000364 | step=5087 | 16095s | 12.0GB\n", | |
| "E1 B29660 | loss=3.6379 | lr=0.000364 | step=5087 | 16106s | 12.0GB\n", | |
| "E1 B29680 | loss=3.3010 | lr=0.000363 | step=5088 | 16117s | 12.0GB\n", | |
| "E1 B29700 | loss=3.4794 | lr=0.000363 | step=5089 | 16128s | 12.0GB\n", | |
| "E1 B29720 | loss=3.8773 | lr=0.000363 | step=5089 | 16139s | 12.0GB\n", | |
| "E1 B29740 | loss=3.3558 | lr=0.000362 | step=5090 | 16149s | 12.0GB\n", | |
| "E1 B29760 | loss=3.2116 | lr=0.000362 | step=5091 | 16160s | 12.0GB\n", | |
| "E1 B29780 | loss=3.3962 | lr=0.000362 | step=5091 | 16171s | 12.0GB\n", | |
| "E1 B29800 | loss=3.6255 | lr=0.000361 | step=5092 | 16182s | 12.0GB\n", | |
| "E1 B29820 | loss=2.6882 | lr=0.000361 | step=5092 | 16193s | 12.0GB\n", | |
| "E1 B29840 | loss=3.6510 | lr=0.000361 | step=5093 | 16204s | 12.0GB\n", | |
| "E1 B29860 | loss=3.6430 | lr=0.000361 | step=5094 | 16215s | 12.0GB\n", | |
| "E1 B29880 | loss=3.0693 | lr=0.000361 | step=5094 | 16225s | 12.0GB\n", | |
| "E1 B29900 | loss=3.0187 | lr=0.000360 | step=5095 | 16236s | 12.0GB\n", | |
| "E1 B29920 | loss=3.6686 | lr=0.000360 | step=5096 | 16247s | 12.0GB\n", | |
| "E1 B29940 | loss=3.6890 | lr=0.000360 | step=5096 | 16258s | 12.0GB\n", | |
| "E1 B29960 | loss=3.4463 | lr=0.000359 | step=5097 | 16269s | 12.0GB\n", | |
| "E1 B29980 | loss=3.9382 | lr=0.000359 | step=5097 | 16280s | 12.0GB\n", | |
| "E1 B30000 | loss=3.8128 | lr=0.000359 | step=5098 | 16290s | 12.0GB\n", | |
| "E1 B30020 | loss=3.2378 | lr=0.000359 | step=5099 | 16302s | 12.0GB\n", | |
| "E1 B30040 | loss=3.4392 | lr=0.000359 | step=5099 | 16312s | 12.0GB\n", | |
| "E1 B30060 | loss=3.5596 | lr=0.000358 | step=5100 | 16323s | 12.0GB\n", | |
| "E1 B30080 | loss=3.4917 | lr=0.000358 | step=5101 | 16334s | 12.0GB\n", | |
| "E1 B30100 | loss=3.2715 | lr=0.000358 | step=5101 | 16345s | 12.0GB\n", | |
| "E1 B30120 | loss=3.7950 | lr=0.000357 | step=5102 | 16356s | 12.0GB\n", | |
| "E1 B30140 | loss=3.5755 | lr=0.000357 | step=5102 | 16366s | 12.0GB\n", | |
| "E1 B30160 | loss=3.5234 | lr=0.000357 | step=5103 | 16377s | 12.0GB\n", | |
| "E1 B30180 | loss=3.2620 | lr=0.000356 | step=5104 | 16388s | 12.0GB\n", | |
| "E1 B30200 | loss=3.6228 | lr=0.000356 | step=5104 | 16399s | 12.0GB\n", | |
| "E1 B30220 | loss=3.2647 | lr=0.000356 | step=5105 | 16410s | 12.0GB\n", | |
| "E1 B30240 | loss=3.3555 | lr=0.000356 | step=5106 | 16421s | 12.0GB\n", | |
| "E1 B30260 | loss=3.5605 | lr=0.000356 | step=5106 | 16432s | 12.0GB\n", | |
| "E1 B30280 | loss=3.9218 | lr=0.000355 | step=5107 | 16442s | 12.0GB\n", | |
| "E1 B30300 | loss=3.2349 | lr=0.000355 | step=5107 | 16453s | 12.0GB\n", | |
| "E1 B30320 | loss=3.4205 | lr=0.000355 | step=5108 | 16464s | 12.0GB\n", | |
| "E1 B30340 | loss=3.7548 | lr=0.000354 | step=5109 | 16475s | 12.0GB\n", | |
| "E1 B30360 | loss=3.3915 | lr=0.000354 | step=5109 | 16486s | 12.0GB\n", | |
| "E1 B30380 | loss=3.6721 | lr=0.000354 | step=5110 | 16497s | 12.0GB\n", | |
| "E1 B30400 | loss=3.7512 | lr=0.000353 | step=5111 | 16508s | 12.0GB\n", | |
| "E1 B30420 | loss=3.6690 | lr=0.000353 | step=5111 | 16519s | 12.0GB\n", | |
| "E1 B30440 | loss=3.9831 | lr=0.000353 | step=5112 | 16529s | 12.0GB\n", | |
| "E1 B30460 | loss=3.5402 | lr=0.000353 | step=5112 | 16540s | 12.0GB\n", | |
| "E1 B30480 | loss=3.2308 | lr=0.000353 | step=5113 | 16551s | 12.0GB\n", | |
| "E1 B30500 | loss=3.5922 | lr=0.000352 | step=5114 | 16562s | 12.0GB\n", | |
| "E1 B30520 | loss=3.7384 | lr=0.000352 | step=5114 | 16573s | 12.0GB\n", | |
| "E1 B30540 | loss=3.2319 | lr=0.000352 | step=5115 | 16584s | 12.0GB\n", | |
| "E1 B30560 | loss=3.3892 | lr=0.000351 | step=5116 | 16595s | 12.0GB\n", | |
| "E1 B30580 | loss=4.0624 | lr=0.000351 | step=5116 | 16605s | 12.0GB\n", | |
| "E1 B30600 | loss=3.7533 | lr=0.000351 | step=5117 | 16616s | 12.0GB\n", | |
| "E1 B30620 | loss=3.0953 | lr=0.000351 | step=5117 | 16627s | 12.0GB\n", | |
| "E1 B30640 | loss=2.6819 | lr=0.000350 | step=5118 | 16638s | 12.0GB\n", | |
| "E1 B30660 | loss=3.6394 | lr=0.000350 | step=5119 | 16649s | 12.0GB\n", | |
| "E1 B30680 | loss=2.9590 | lr=0.000350 | step=5119 | 16660s | 12.0GB\n", | |
| "E1 B30700 | loss=3.9846 | lr=0.000350 | step=5120 | 16670s | 12.0GB\n", | |
| "E1 B30720 | loss=3.8289 | lr=0.000349 | step=5121 | 16681s | 12.0GB\n", | |
| "E1 B30740 | loss=3.3920 | lr=0.000349 | step=5121 | 16692s | 12.0GB\n", | |
| "E1 B30760 | loss=3.4980 | lr=0.000349 | step=5122 | 16703s | 12.0GB\n", | |
| "E1 B30780 | loss=3.1782 | lr=0.000349 | step=5122 | 16714s | 12.0GB\n", | |
| "E1 B30800 | loss=3.1211 | lr=0.000348 | step=5123 | 16724s | 12.0GB\n", | |
| "E1 B30820 | loss=3.4772 | lr=0.000348 | step=5124 | 16735s | 12.0GB\n", | |
| "E1 B30840 | loss=3.1390 | lr=0.000348 | step=5124 | 16746s | 12.0GB\n", | |
| "E1 B30860 | loss=3.6477 | lr=0.000347 | step=5125 | 16757s | 12.0GB\n", | |
| "E1 B30880 | loss=3.4252 | lr=0.000347 | step=5126 | 16768s | 12.0GB\n", | |
| "E1 B30900 | loss=3.7798 | lr=0.000347 | step=5126 | 16779s | 12.0GB\n", | |
| "E1 B30920 | loss=2.9573 | lr=0.000347 | step=5127 | 16790s | 12.0GB\n", | |
| "E1 B30940 | loss=3.9935 | lr=0.000347 | step=5127 | 16800s | 12.0GB\n", | |
| "E1 B30960 | loss=3.4221 | lr=0.000346 | step=5128 | 16811s | 12.0GB\n", | |
| "E1 B30980 | loss=3.8883 | lr=0.000346 | step=5129 | 16822s | 12.0GB\n", | |
| "E1 B31000 | loss=3.4045 | lr=0.000346 | step=5129 | 16833s | 12.0GB\n", | |
| "E1 B31020 | loss=3.5196 | lr=0.000345 | step=5130 | 16844s | 12.0GB\n", | |
| "E1 B31040 | loss=3.7383 | lr=0.000345 | step=5131 | 16855s | 12.0GB\n", | |
| "E1 B31060 | loss=3.3740 | lr=0.000345 | step=5131 | 16866s | 12.0GB\n", | |
| "E1 B31080 | loss=3.8066 | lr=0.000344 | step=5132 | 16877s | 12.0GB\n", | |
| "E1 B31100 | loss=3.5984 | lr=0.000344 | step=5132 | 16887s | 12.0GB\n", | |
| "E1 B31120 | loss=3.9609 | lr=0.000344 | step=5133 | 16898s | 12.0GB\n", | |
| "E1 B31140 | loss=3.7073 | lr=0.000344 | step=5134 | 16909s | 12.0GB\n", | |
| "E1 B31160 | loss=3.3976 | lr=0.000344 | step=5134 | 16920s | 12.0GB\n", | |
| "E1 B31180 | loss=3.6763 | lr=0.000343 | step=5135 | 16931s | 12.0GB\n", | |
| "E1 B31200 | loss=3.0848 | lr=0.000343 | step=5136 | 16941s | 12.0GB\n", | |
| "E1 B31220 | loss=3.3935 | lr=0.000343 | step=5136 | 16952s | 12.0GB\n", | |
| "E1 B31240 | loss=3.1853 | lr=0.000342 | step=5137 | 16963s | 12.0GB\n", | |
| "E1 B31260 | loss=3.2642 | lr=0.000342 | step=5137 | 16974s | 12.0GB\n", | |
| "E1 B31280 | loss=3.4655 | lr=0.000342 | step=5138 | 16985s | 12.0GB\n", | |
| "E1 B31300 | loss=3.6589 | lr=0.000342 | step=5139 | 16996s | 12.0GB\n", | |
| "E1 B31320 | loss=3.4965 | lr=0.000342 | step=5139 | 17007s | 12.0GB\n", | |
| "E1 B31340 | loss=3.4022 | lr=0.000341 | step=5140 | 17017s | 12.0GB\n", | |
| "E1 B31360 | loss=3.3143 | lr=0.000341 | step=5141 | 17028s | 12.0GB\n", | |
| "E1 B31380 | loss=3.6565 | lr=0.000341 | step=5141 | 17039s | 12.0GB\n", | |
| "E1 B31400 | loss=4.0138 | lr=0.000340 | step=5142 | 17050s | 12.0GB\n", | |
| "E1 B31420 | loss=3.4836 | lr=0.000340 | step=5142 | 17061s | 12.0GB\n", | |
| "E1 B31440 | loss=3.4289 | lr=0.000340 | step=5143 | 17072s | 12.0GB\n", | |
| "E1 B31460 | loss=3.5784 | lr=0.000339 | step=5144 | 17083s | 12.0GB\n", | |
| "E1 B31480 | loss=3.7952 | lr=0.000339 | step=5144 | 17093s | 12.0GB\n", | |
| "E1 B31500 | loss=3.9563 | lr=0.000339 | step=5145 | 17104s | 12.0GB\n", | |
| "E1 B31520 | loss=3.4458 | lr=0.000339 | step=5146 | 17115s | 12.0GB\n", | |
| "E1 B31540 | loss=3.5722 | lr=0.000339 | step=5146 | 17126s | 12.0GB\n", | |
| "E1 B31560 | loss=3.7430 | lr=0.000338 | step=5147 | 17137s | 12.0GB\n", | |
| "E1 B31580 | loss=3.7160 | lr=0.000338 | step=5147 | 17148s | 12.0GB\n", | |
| "E1 B31600 | loss=3.4265 | lr=0.000338 | step=5148 | 17158s | 12.0GB\n", | |
| "E1 B31620 | loss=3.4560 | lr=0.000337 | step=5149 | 17169s | 12.0GB\n", | |
| "E1 B31640 | loss=2.9494 | lr=0.000337 | step=5149 | 17180s | 12.0GB\n", | |
| "E1 B31660 | loss=3.4210 | lr=0.000337 | step=5150 | 17191s | 12.0GB\n", | |
| "E1 B31680 | loss=3.2652 | lr=0.000336 | step=5151 | 17202s | 12.0GB\n", | |
| "E1 B31700 | loss=3.1650 | lr=0.000336 | step=5151 | 17213s | 12.0GB\n", | |
| "E1 B31720 | loss=3.8392 | lr=0.000336 | step=5152 | 17224s | 12.0GB\n", | |
| "E1 B31740 | loss=3.4646 | lr=0.000336 | step=5152 | 17234s | 12.0GB\n", | |
| "E1 B31760 | loss=3.8756 | lr=0.000336 | step=5153 | 17246s | 12.0GB\n", | |
| "E1 B31780 | loss=3.2961 | lr=0.000335 | step=5154 | 17256s | 12.0GB\n", | |
| "E1 B31800 | loss=3.7959 | lr=0.000335 | step=5154 | 17267s | 12.0GB\n", | |
| "E1 B31820 | loss=3.0390 | lr=0.000335 | step=5155 | 17278s | 12.0GB\n", | |
| "E1 B31840 | loss=3.7117 | lr=0.000334 | step=5156 | 17289s | 12.0GB\n", | |
| "E1 B31860 | loss=3.5349 | lr=0.000334 | step=5156 | 17300s | 12.0GB\n", | |
| "E1 B31880 | loss=3.4139 | lr=0.000334 | step=5157 | 17311s | 12.0GB\n", | |
| "E1 B31900 | loss=3.3302 | lr=0.000334 | step=5157 | 17321s | 12.0GB\n", | |
| "E1 B31920 | loss=3.8232 | lr=0.000333 | step=5158 | 17332s | 12.0GB\n", | |
| "E1 B31940 | loss=3.1530 | lr=0.000333 | step=5159 | 17343s | 12.0GB\n", | |
| "E1 B31960 | loss=3.6944 | lr=0.000333 | step=5159 | 17354s | 12.0GB\n", | |
| "E1 B31980 | loss=3.5010 | lr=0.000333 | step=5160 | 17365s | 12.0GB\n", | |
| "E1 B32000 | loss=3.6479 | lr=0.000332 | step=5161 | 17376s | 12.0GB\n", | |
| "E1 B32020 | loss=3.3536 | lr=0.000332 | step=5161 | 17387s | 12.0GB\n", | |
| "E1 B32040 | loss=3.8754 | lr=0.000332 | step=5162 | 17397s | 12.0GB\n", | |
| "E1 B32060 | loss=3.6921 | lr=0.000332 | step=5162 | 17408s | 12.0GB\n", | |
| "E1 B32080 | loss=3.3234 | lr=0.000331 | step=5163 | 17419s | 12.0GB\n", | |
| "E1 B32100 | loss=3.8497 | lr=0.000331 | step=5164 | 17430s | 12.0GB\n", | |
| "E1 B32120 | loss=3.6441 | lr=0.000331 | step=5164 | 17441s | 12.0GB\n", | |
| "E1 B32140 | loss=3.1910 | lr=0.000330 | step=5165 | 17451s | 12.0GB\n", | |
| "E1 B32160 | loss=3.1812 | lr=0.000330 | step=5166 | 17462s | 12.0GB\n", | |
| "E1 B32180 | loss=4.0027 | lr=0.000330 | step=5166 | 17473s | 12.0GB\n", | |
| "E1 B32200 | loss=3.5971 | lr=0.000330 | step=5167 | 17484s | 12.0GB\n", | |
| "E1 B32220 | loss=3.0657 | lr=0.000330 | step=5167 | 17495s | 12.0GB\n", | |
| "E1 B32240 | loss=3.0604 | lr=0.000329 | step=5168 | 17506s | 12.0GB\n", | |
| "E1 B32260 | loss=3.9363 | lr=0.000329 | step=5169 | 17517s | 12.0GB\n", | |
| "E1 B32280 | loss=3.5211 | lr=0.000329 | step=5169 | 17527s | 12.0GB\n", | |
| "E1 B32300 | loss=3.4431 | lr=0.000328 | step=5170 | 17538s | 12.0GB\n", | |
| "E1 B32320 | loss=3.2701 | lr=0.000328 | step=5171 | 17549s | 12.0GB\n", | |
| "E1 B32340 | loss=3.4917 | lr=0.000328 | step=5171 | 17560s | 12.0GB\n", | |
| "E1 B32360 | loss=3.8927 | lr=0.000327 | step=5172 | 17571s | 12.0GB\n", | |
| "E1 B32380 | loss=3.3332 | lr=0.000327 | step=5172 | 17582s | 12.0GB\n", | |
| "E1 B32400 | loss=3.4945 | lr=0.000327 | step=5173 | 17593s | 12.0GB\n", | |
| "E1 B32420 | loss=3.4342 | lr=0.000327 | step=5174 | 17604s | 12.0GB\n", | |
| "E1 B32440 | loss=3.6375 | lr=0.000327 | step=5174 | 17614s | 12.0GB\n", | |
| "E1 B32460 | loss=3.8395 | lr=0.000326 | step=5175 | 17625s | 12.0GB\n", | |
| "E1 B32480 | loss=3.7280 | lr=0.000326 | step=5176 | 17636s | 12.0GB\n", | |
| "E1 B32500 | loss=3.6951 | lr=0.000326 | step=5176 | 17647s | 12.0GB\n", | |
| "E1 B32520 | loss=3.4682 | lr=0.000325 | step=5177 | 17658s | 12.0GB\n", | |
| "E1 B32540 | loss=2.8396 | lr=0.000325 | step=5177 | 17668s | 12.0GB\n", | |
| "E1 B32560 | loss=3.0266 | lr=0.000325 | step=5178 | 17679s | 12.0GB\n", | |
| "E1 B32580 | loss=3.6402 | lr=0.000324 | step=5179 | 17690s | 12.0GB\n", | |
| "E1 B32600 | loss=3.6749 | lr=0.000324 | step=5179 | 17701s | 12.0GB\n", | |
| "E1 B32620 | loss=3.2880 | lr=0.000324 | step=5180 | 17712s | 12.0GB\n", | |
| "E1 B32640 | loss=3.8583 | lr=0.000324 | step=5181 | 17723s | 12.0GB\n", | |
| "E1 B32660 | loss=3.5358 | lr=0.000324 | step=5181 | 17734s | 12.0GB\n", | |
| "E1 B32680 | loss=3.3803 | lr=0.000323 | step=5182 | 17744s | 12.0GB\n", | |
| "E1 B32700 | loss=3.4898 | lr=0.000323 | step=5182 | 17755s | 12.0GB\n", | |
| "E1 B32720 | loss=3.4683 | lr=0.000323 | step=5183 | 17766s | 12.0GB\n", | |
| "E1 B32740 | loss=3.4809 | lr=0.000322 | step=5184 | 17777s | 12.0GB\n", | |
| "E1 B32760 | loss=3.4807 | lr=0.000322 | step=5184 | 17788s | 12.0GB\n", | |
| "E1 B32780 | loss=3.8324 | lr=0.000322 | step=5185 | 17798s | 12.0GB\n", | |
| "E1 B32800 | loss=3.0022 | lr=0.000321 | step=5186 | 17809s | 12.0GB\n", | |
| "E1 B32820 | loss=2.8884 | lr=0.000321 | step=5186 | 17820s | 12.0GB\n", | |
| "E1 B32840 | loss=3.4288 | lr=0.000321 | step=5187 | 17831s | 12.0GB\n", | |
| "E1 B32860 | loss=3.2077 | lr=0.000321 | step=5187 | 17842s | 12.0GB\n", | |
| "E1 B32880 | loss=3.6348 | lr=0.000321 | step=5188 | 17853s | 12.0GB\n", | |
| "E1 B32900 | loss=3.6701 | lr=0.000320 | step=5189 | 17864s | 12.0GB\n", | |
| "E1 B32920 | loss=3.4878 | lr=0.000320 | step=5189 | 17875s | 12.0GB\n", | |
| "E1 B32940 | loss=3.4674 | lr=0.000320 | step=5190 | 17885s | 12.0GB\n", | |
| "E1 B32960 | loss=3.2019 | lr=0.000319 | step=5191 | 17896s | 12.0GB\n", | |
| "E1 B32980 | loss=3.0448 | lr=0.000319 | step=5191 | 17907s | 12.0GB\n", | |
| "E1 B33000 | loss=3.7245 | lr=0.000319 | step=5192 | 17918s | 12.0GB\n", | |
| "E1 B33020 | loss=3.5850 | lr=0.000319 | step=5192 | 17929s | 12.0GB\n", | |
| "E1 B33040 | loss=3.5758 | lr=0.000318 | step=5193 | 17940s | 12.0GB\n", | |
| "E1 B33060 | loss=3.4459 | lr=0.000318 | step=5194 | 17951s | 12.0GB\n", | |
| "E1 B33080 | loss=3.2107 | lr=0.000318 | step=5194 | 17962s | 12.0GB\n", | |
| "E1 B33100 | loss=3.5245 | lr=0.000318 | step=5195 | 17972s | 12.0GB\n", | |
| "E1 B33120 | loss=3.2601 | lr=0.000317 | step=5196 | 17983s | 12.0GB\n", | |
| "E1 B33140 | loss=3.8080 | lr=0.000317 | step=5196 | 17994s | 12.0GB\n", | |
| "E1 B33160 | loss=3.6375 | lr=0.000317 | step=5197 | 18005s | 12.0GB\n", | |
| "E1 B33180 | loss=2.9046 | lr=0.000317 | step=5197 | 18016s | 12.0GB\n", | |
| "E1 B33200 | loss=3.2773 | lr=0.000316 | step=5198 | 18027s | 12.0GB\n", | |
| "E1 B33220 | loss=3.7153 | lr=0.000316 | step=5199 | 18038s | 12.0GB\n", | |
| "E1 B33240 | loss=3.7549 | lr=0.000316 | step=5199 | 18048s | 12.0GB\n", | |
| "E1 B33260 | loss=3.5923 | lr=0.000316 | step=5200 | 18059s | 12.0GB\n", | |
| "E1 B33280 | loss=3.3030 | lr=0.000315 | step=5201 | 18070s | 12.0GB\n", | |
| "E1 B33300 | loss=3.7354 | lr=0.000315 | step=5201 | 18081s | 12.0GB\n", | |
| "E1 B33320 | loss=3.9771 | lr=0.000315 | step=5202 | 18092s | 12.0GB\n", | |
| "E1 B33340 | loss=3.1174 | lr=0.000315 | step=5202 | 18102s | 12.0GB\n", | |
| "E1 B33360 | loss=3.7415 | lr=0.000314 | step=5203 | 18113s | 12.0GB\n", | |
| "E1 B33380 | loss=3.4105 | lr=0.000314 | step=5204 | 18124s | 12.0GB\n", | |
| "E1 B33400 | loss=3.6626 | lr=0.000314 | step=5204 | 18135s | 12.0GB\n", | |
| "E1 B33420 | loss=3.3601 | lr=0.000313 | step=5205 | 18146s | 12.0GB\n", | |
| "E1 B33440 | loss=3.6837 | lr=0.000313 | step=5206 | 18157s | 12.0GB\n", | |
| "E1 B33460 | loss=3.9853 | lr=0.000313 | step=5206 | 18168s | 12.0GB\n", | |
| "E1 B33480 | loss=3.3920 | lr=0.000313 | step=5207 | 18178s | 12.0GB\n", | |
| "E1 B33500 | loss=3.5004 | lr=0.000313 | step=5207 | 18189s | 12.0GB\n", | |
| "E1 B33520 | loss=3.3933 | lr=0.000312 | step=5208 | 18200s | 12.0GB\n", | |
| "E1 B33540 | loss=3.1927 | lr=0.000312 | step=5209 | 18211s | 12.0GB\n", | |
| "E1 B33560 | loss=3.3750 | lr=0.000312 | step=5209 | 18222s | 12.0GB\n", | |
| "E1 B33580 | loss=3.5924 | lr=0.000311 | step=5210 | 18233s | 12.0GB\n", | |
| "E1 B33600 | loss=3.2127 | lr=0.000311 | step=5211 | 18243s | 12.0GB\n", | |
| "E1 B33620 | loss=3.2320 | lr=0.000311 | step=5211 | 18254s | 12.0GB\n", | |
| "E1 B33640 | loss=3.7991 | lr=0.000310 | step=5212 | 18265s | 12.0GB\n", | |
| "E1 B33660 | loss=3.4198 | lr=0.000310 | step=5212 | 18276s | 12.0GB\n", | |
| "E1 B33680 | loss=3.6935 | lr=0.000310 | step=5213 | 18287s | 12.0GB\n", | |
| "E1 B33700 | loss=3.2064 | lr=0.000310 | step=5214 | 18298s | 12.0GB\n", | |
| "E1 B33720 | loss=3.5440 | lr=0.000310 | step=5214 | 18309s | 12.0GB\n", | |
| "E1 B33740 | loss=3.0891 | lr=0.000309 | step=5215 | 18319s | 12.0GB\n", | |
| "E1 B33760 | loss=3.2807 | lr=0.000309 | step=5216 | 18330s | 12.0GB\n", | |
| "E1 B33780 | loss=3.9300 | lr=0.000309 | step=5216 | 18341s | 12.0GB\n", | |
| "E1 B33800 | loss=3.3691 | lr=0.000308 | step=5217 | 18352s | 12.0GB\n", | |
| "E1 B33820 | loss=3.1102 | lr=0.000308 | step=5217 | 18363s | 12.0GB\n", | |
| "E1 B33840 | loss=3.4534 | lr=0.000308 | step=5218 | 18374s | 12.0GB\n", | |
| "E1 B33860 | loss=3.6736 | lr=0.000307 | step=5219 | 18385s | 12.0GB\n", | |
| "E1 B33880 | loss=3.5132 | lr=0.000307 | step=5219 | 18396s | 12.0GB\n", | |
| "E1 B33900 | loss=3.4907 | lr=0.000307 | step=5220 | 18406s | 12.0GB\n", | |
| "E1 B33920 | loss=3.6840 | lr=0.000307 | step=5221 | 18417s | 12.0GB\n", | |
| "E1 B33940 | loss=3.5443 | lr=0.000307 | step=5221 | 18428s | 12.0GB\n", | |
| "E1 B33960 | loss=3.5106 | lr=0.000306 | step=5222 | 18439s | 12.0GB\n", | |
| "E1 B33980 | loss=3.4446 | lr=0.000306 | step=5222 | 18450s | 12.0GB\n", | |
| "E1 B34000 | loss=2.9789 | lr=0.000306 | step=5223 | 18461s | 12.0GB\n", | |
| "E1 B34020 | loss=3.7297 | lr=0.000305 | step=5224 | 18472s | 12.0GB\n", | |
| "E1 B34040 | loss=3.1365 | lr=0.000305 | step=5224 | 18482s | 12.0GB\n", | |
| "E1 B34060 | loss=3.5688 | lr=0.000305 | step=5225 | 18493s | 12.0GB\n", | |
| "E1 B34080 | loss=3.9344 | lr=0.000304 | step=5226 | 18504s | 12.0GB\n", | |
| "E1 B34100 | loss=3.4828 | lr=0.000304 | step=5226 | 18515s | 12.0GB\n", | |
| "E1 B34120 | loss=3.5875 | lr=0.000304 | step=5227 | 18526s | 12.0GB\n", | |
| "E1 B34140 | loss=3.6305 | lr=0.000304 | step=5227 | 18537s | 12.0GB\n", | |
| "E1 B34160 | loss=3.2945 | lr=0.000304 | step=5228 | 18548s | 12.0GB\n", | |
| "E1 B34180 | loss=3.1983 | lr=0.000303 | step=5229 | 18558s | 12.0GB\n", | |
| "E1 B34200 | loss=2.8199 | lr=0.000303 | step=5229 | 18569s | 12.0GB\n", | |
| "E1 B34220 | loss=3.3696 | lr=0.000303 | step=5230 | 18580s | 12.0GB\n", | |
| "E1 B34240 | loss=3.8264 | lr=0.000302 | step=5231 | 18591s | 12.0GB\n", | |
| "E1 B34260 | loss=3.3119 | lr=0.000302 | step=5231 | 18602s | 12.0GB\n", | |
| "E1 B34280 | loss=3.1872 | lr=0.000302 | step=5232 | 18613s | 12.0GB\n", | |
| "E1 B34300 | loss=3.3126 | lr=0.000302 | step=5232 | 18623s | 12.0GB\n", | |
| "E1 B34320 | loss=3.2036 | lr=0.000301 | step=5233 | 18635s | 12.0GB\n", | |
| "E1 B34340 | loss=3.0260 | lr=0.000301 | step=5234 | 18645s | 12.0GB\n", | |
| "E1 B34360 | loss=3.4337 | lr=0.000301 | step=5234 | 18656s | 12.0GB\n", | |
| "E1 B34380 | loss=3.6881 | lr=0.000301 | step=5235 | 18667s | 12.0GB\n", | |
| "E1 B34400 | loss=3.5469 | lr=0.000300 | step=5236 | 18678s | 12.0GB\n", | |
| "E1 B34420 | loss=3.0098 | lr=0.000300 | step=5236 | 18689s | 12.0GB\n", | |
| "E1 B34440 | loss=3.3193 | lr=0.000300 | step=5237 | 18700s | 12.0GB\n", | |
| "E1 B34460 | loss=3.5947 | lr=0.000300 | step=5237 | 18711s | 12.0GB\n", | |
| "E1 B34480 | loss=3.7235 | lr=0.000299 | step=5238 | 18721s | 12.0GB\n", | |
| "E1 B34500 | loss=3.2365 | lr=0.000299 | step=5239 | 18732s | 12.0GB\n", | |
| "E1 B34520 | loss=3.4069 | lr=0.000299 | step=5239 | 18743s | 12.0GB\n", | |
| "E1 B34540 | loss=3.3951 | lr=0.000299 | step=5240 | 18754s | 12.0GB\n", | |
| "E1 B34560 | loss=4.1664 | lr=0.000298 | step=5241 | 18765s | 12.0GB\n", | |
| "E1 B34580 | loss=3.7328 | lr=0.000298 | step=5241 | 18775s | 12.0GB\n", | |
| "E1 B34600 | loss=3.5229 | lr=0.000298 | step=5242 | 18786s | 12.0GB\n", | |
| "E1 B34620 | loss=3.7139 | lr=0.000298 | step=5242 | 18797s | 12.0GB\n", | |
| "E1 B34640 | loss=3.2190 | lr=0.000297 | step=5243 | 18808s | 12.0GB\n", | |
| "E1 B34660 | loss=3.4113 | lr=0.000297 | step=5244 | 18819s | 12.0GB\n", | |
| "E1 B34680 | loss=3.4539 | lr=0.000297 | step=5244 | 18830s | 12.0GB\n", | |
| "E1 B34700 | loss=3.4468 | lr=0.000296 | step=5245 | 18840s | 12.0GB\n", | |
| "E1 B34720 | loss=3.7184 | lr=0.000296 | step=5246 | 18851s | 12.0GB\n", | |
| "E1 B34740 | loss=3.2855 | lr=0.000296 | step=5246 | 18862s | 12.0GB\n", | |
| "E1 B34760 | loss=3.6530 | lr=0.000296 | step=5247 | 18873s | 12.0GB\n", | |
| "E1 B34780 | loss=3.6512 | lr=0.000296 | step=5247 | 18884s | 12.0GB\n", | |
| "E1 B34800 | loss=3.4023 | lr=0.000295 | step=5248 | 18894s | 12.0GB\n", | |
| "E1 B34820 | loss=3.2032 | lr=0.000295 | step=5249 | 18905s | 12.0GB\n", | |
| "E1 B34840 | loss=3.4380 | lr=0.000295 | step=5249 | 18916s | 12.0GB\n", | |
| "E1 B34860 | loss=3.4082 | lr=0.000294 | step=5250 | 18927s | 12.0GB\n", | |
| "E1 B34880 | loss=3.2422 | lr=0.000294 | step=5251 | 18938s | 12.0GB\n", | |
| "E1 B34900 | loss=3.5386 | lr=0.000294 | step=5251 | 18949s | 12.0GB\n", | |
| "E1 B34920 | loss=3.7222 | lr=0.000293 | step=5252 | 18960s | 12.0GB\n", | |
| "E1 B34940 | loss=3.4974 | lr=0.000293 | step=5252 | 18970s | 12.0GB\n", | |
| "E1 B34960 | loss=2.9927 | lr=0.000293 | step=5253 | 18981s | 12.0GB\n", | |
| "E1 B34980 | loss=3.5785 | lr=0.000293 | step=5254 | 18992s | 12.0GB\n", | |
| "E1 B35000 | loss=3.8642 | lr=0.000293 | step=5254 | 19003s | 12.0GB\n", | |
| "E1 B35020 | loss=3.5255 | lr=0.000292 | step=5255 | 19014s | 12.0GB\n", | |
| "E1 B35040 | loss=3.4349 | lr=0.000292 | step=5256 | 19024s | 12.0GB\n", | |
| "E1 B35060 | loss=3.8442 | lr=0.000292 | step=5256 | 19035s | 12.0GB\n", | |
| "E1 B35080 | loss=2.9881 | lr=0.000291 | step=5257 | 19046s | 12.0GB\n", | |
| "E1 B35100 | loss=3.6151 | lr=0.000291 | step=5257 | 19057s | 12.0GB\n", | |
| "E1 B35120 | loss=3.8709 | lr=0.000291 | step=5258 | 19068s | 12.0GB\n", | |
| "E1 B35140 | loss=3.0029 | lr=0.000291 | step=5259 | 19078s | 12.0GB\n", | |
| "E1 B35160 | loss=2.6568 | lr=0.000291 | step=5259 | 19089s | 12.0GB\n", | |
| "E1 B35180 | loss=3.8542 | lr=0.000290 | step=5260 | 19100s | 12.0GB\n", | |
| "E1 B35200 | loss=3.6133 | lr=0.000290 | step=5261 | 19111s | 12.0GB\n", | |
| "E1 B35220 | loss=3.1443 | lr=0.000290 | step=5261 | 19122s | 12.0GB\n", | |
| "E1 B35240 | loss=3.6259 | lr=0.000289 | step=5262 | 19133s | 12.0GB\n", | |
| "E1 B35260 | loss=3.4646 | lr=0.000289 | step=5262 | 19144s | 12.0GB\n", | |
| "E1 B35280 | loss=3.0274 | lr=0.000289 | step=5263 | 19155s | 12.0GB\n", | |
| "E1 B35300 | loss=3.3368 | lr=0.000288 | step=5264 | 19165s | 12.0GB\n", | |
| "E1 B35320 | loss=3.3145 | lr=0.000288 | step=5264 | 19176s | 12.0GB\n", | |
| "E1 B35340 | loss=3.6095 | lr=0.000288 | step=5265 | 19187s | 12.0GB\n", | |
| "E1 B35360 | loss=3.8292 | lr=0.000288 | step=5266 | 19198s | 12.0GB\n", | |
| "E1 B35380 | loss=3.2421 | lr=0.000288 | step=5266 | 19209s | 12.0GB\n", | |
| "E1 B35400 | loss=3.3306 | lr=0.000287 | step=5267 | 19220s | 12.0GB\n", | |
| "E1 B35420 | loss=3.5127 | lr=0.000287 | step=5267 | 19231s | 12.0GB\n", | |
| "E1 B35440 | loss=3.6502 | lr=0.000287 | step=5268 | 19241s | 12.0GB\n", | |
| "E1 B35460 | loss=3.5052 | lr=0.000286 | step=5269 | 19252s | 12.0GB\n", | |
| "E1 B35480 | loss=3.6994 | lr=0.000286 | step=5269 | 19263s | 12.0GB\n", | |
| "E1 B35500 | loss=3.5257 | lr=0.000286 | step=5270 | 19274s | 12.0GB\n", | |
| "E1 B35520 | loss=3.8089 | lr=0.000285 | step=5271 | 19285s | 12.0GB\n", | |
| "E1 B35540 | loss=3.4303 | lr=0.000285 | step=5271 | 19296s | 12.0GB\n", | |
| "E1 B35560 | loss=3.1203 | lr=0.000285 | step=5272 | 19307s | 12.0GB\n", | |
| "E1 B35580 | loss=3.6674 | lr=0.000285 | step=5272 | 19317s | 12.0GB\n", | |
| "E1 B35600 | loss=3.7049 | lr=0.000285 | step=5273 | 19328s | 12.0GB\n", | |
| "E1 B35620 | loss=3.8061 | lr=0.000284 | step=5274 | 19339s | 12.0GB\n", | |
| "E1 B35640 | loss=3.4406 | lr=0.000284 | step=5274 | 19350s | 12.0GB\n", | |
| "E1 B35660 | loss=4.0125 | lr=0.000284 | step=5275 | 19361s | 12.0GB\n", | |
| "E1 B35680 | loss=3.6131 | lr=0.000283 | step=5276 | 19372s | 12.0GB\n", | |
| "E1 B35700 | loss=2.6319 | lr=0.000283 | step=5276 | 19382s | 12.0GB\n", | |
| "E1 B35720 | loss=3.9020 | lr=0.000283 | step=5277 | 19393s | 12.0GB\n", | |
| "E1 B35740 | loss=3.9964 | lr=0.000283 | step=5277 | 19404s | 12.0GB\n", | |
| "E1 B35760 | loss=3.7687 | lr=0.000283 | step=5278 | 19415s | 12.0GB\n", | |
| "E1 B35780 | loss=3.1358 | lr=0.000282 | step=5279 | 19426s | 12.0GB\n", | |
| "E1 B35800 | loss=3.6551 | lr=0.000282 | step=5279 | 19436s | 12.0GB\n", | |
| "E1 B35820 | loss=3.5898 | lr=0.000282 | step=5280 | 19448s | 12.0GB\n", | |
| "E1 B35840 | loss=3.8250 | lr=0.000281 | step=5281 | 19458s | 12.0GB\n", | |
| "E1 B35860 | loss=3.6884 | lr=0.000281 | step=5281 | 19469s | 12.0GB\n", | |
| "E1 B35880 | loss=3.9341 | lr=0.000281 | step=5282 | 19480s | 12.0GB\n", | |
| "E1 B35900 | loss=3.8614 | lr=0.000281 | step=5282 | 19491s | 12.0GB\n", | |
| "E1 B35920 | loss=3.0373 | lr=0.000280 | step=5283 | 19502s | 12.0GB\n", | |
| "E1 B35940 | loss=3.2366 | lr=0.000280 | step=5284 | 19512s | 12.0GB\n", | |
| "E1 B35960 | loss=3.6519 | lr=0.000280 | step=5284 | 19523s | 12.0GB\n", | |
| "E1 B35980 | loss=3.4759 | lr=0.000280 | step=5285 | 19534s | 12.0GB\n", | |
| "E1 B36000 | loss=3.6050 | lr=0.000279 | step=5286 | 19545s | 12.0GB\n", | |
| "E1 B36020 | loss=3.2001 | lr=0.000279 | step=5286 | 19556s | 12.0GB\n", | |
| "E1 B36040 | loss=3.1117 | lr=0.000279 | step=5287 | 19567s | 12.0GB\n", | |
| "E1 B36060 | loss=3.5468 | lr=0.000279 | step=5287 | 19578s | 12.0GB\n", | |
| "E1 B36080 | loss=3.6994 | lr=0.000278 | step=5288 | 19588s | 12.0GB\n", | |
| "E1 B36100 | loss=3.5568 | lr=0.000278 | step=5289 | 19599s | 12.0GB\n", | |
| "E1 B36120 | loss=3.7048 | lr=0.000278 | step=5289 | 19610s | 12.0GB\n", | |
| "E1 B36140 | loss=3.4077 | lr=0.000277 | step=5290 | 19621s | 12.0GB\n", | |
| "E1 B36160 | loss=3.4949 | lr=0.000277 | step=5291 | 19632s | 12.0GB\n", | |
| "E1 B36180 | loss=3.5637 | lr=0.000277 | step=5291 | 19642s | 12.0GB\n", | |
| "E1 B36200 | loss=2.9662 | lr=0.000277 | step=5292 | 19653s | 12.0GB\n", | |
| "E1 B36220 | loss=3.2112 | lr=0.000277 | step=5292 | 19664s | 12.0GB\n", | |
| "E1 B36240 | loss=3.8873 | lr=0.000276 | step=5293 | 19675s | 12.0GB\n", | |
| "E1 B36260 | loss=3.6678 | lr=0.000276 | step=5294 | 19686s | 12.0GB\n", | |
| "E1 B36280 | loss=3.3917 | lr=0.000276 | step=5294 | 19697s | 12.0GB\n", | |
| "E1 B36300 | loss=3.7717 | lr=0.000275 | step=5295 | 19707s | 12.0GB\n", | |
| "E1 B36320 | loss=3.4981 | lr=0.000275 | step=5296 | 19718s | 12.0GB\n", | |
| "E1 B36340 | loss=3.3784 | lr=0.000275 | step=5296 | 19729s | 12.0GB\n", | |
| "E1 B36360 | loss=3.4541 | lr=0.000275 | step=5297 | 19740s | 12.0GB\n", | |
| "E1 B36380 | loss=3.6218 | lr=0.000275 | step=5297 | 19751s | 12.0GB\n", | |
| "E1 B36400 | loss=3.6317 | lr=0.000274 | step=5298 | 19762s | 12.0GB\n", | |
| "E1 B36420 | loss=3.3185 | lr=0.000274 | step=5299 | 19773s | 12.0GB\n", | |
| "E1 B36440 | loss=3.4515 | lr=0.000274 | step=5299 | 19783s | 12.0GB\n", | |
| "E1 B36460 | loss=3.7896 | lr=0.000273 | step=5300 | 19794s | 12.0GB\n", | |
| "E1 B36480 | loss=3.6837 | lr=0.000273 | step=5301 | 19805s | 12.0GB\n", | |
| "E1 B36500 | loss=3.8207 | lr=0.000273 | step=5301 | 19816s | 12.0GB\n", | |
| "E1 B36520 | loss=2.8555 | lr=0.000272 | step=5302 | 19827s | 12.0GB\n", | |
| "E1 B36540 | loss=3.2924 | lr=0.000272 | step=5302 | 19837s | 12.0GB\n", | |
| "E1 B36560 | loss=3.7286 | lr=0.000272 | step=5303 | 19848s | 12.0GB\n", | |
| "E1 B36580 | loss=3.7447 | lr=0.000272 | step=5304 | 19859s | 12.0GB\n", | |
| "E1 B36600 | loss=2.7712 | lr=0.000272 | step=5304 | 19870s | 12.0GB\n", | |
| "E1 B36620 | loss=3.7412 | lr=0.000271 | step=5305 | 19881s | 12.0GB\n", | |
| "E1 B36640 | loss=3.3220 | lr=0.000271 | step=5306 | 19892s | 12.0GB\n", | |
| "E1 B36660 | loss=3.3316 | lr=0.000271 | step=5306 | 19903s | 12.0GB\n", | |
| "E1 B36680 | loss=3.0927 | lr=0.000270 | step=5307 | 19913s | 12.0GB\n", | |
| "E1 B36700 | loss=3.0771 | lr=0.000270 | step=5307 | 19924s | 12.0GB\n", | |
| "E1 B36720 | loss=3.5380 | lr=0.000270 | step=5308 | 19935s | 12.0GB\n", | |
| "E1 B36740 | loss=3.2336 | lr=0.000270 | step=5309 | 19946s | 12.0GB\n", | |
| "E1 B36760 | loss=3.2664 | lr=0.000270 | step=5309 | 19957s | 12.0GB\n", | |
| "E1 B36780 | loss=3.5680 | lr=0.000269 | step=5310 | 19968s | 12.0GB\n", | |
| "E1 B36800 | loss=3.3772 | lr=0.000269 | step=5311 | 19978s | 12.0GB\n", | |
| "E1 B36820 | loss=3.8156 | lr=0.000269 | step=5311 | 19989s | 12.0GB\n", | |
| "E1 B36840 | loss=3.4270 | lr=0.000268 | step=5312 | 20000s | 12.0GB\n", | |
| "E1 B36860 | loss=3.8598 | lr=0.000268 | step=5312 | 20011s | 12.0GB\n", | |
| "E1 B36880 | loss=3.7456 | lr=0.000268 | step=5313 | 20022s | 12.0GB\n", | |
| "E1 B36900 | loss=3.3930 | lr=0.000267 | step=5314 | 20033s | 12.0GB\n", | |
| "E1 B36920 | loss=3.4098 | lr=0.000267 | step=5314 | 20044s | 12.0GB\n", | |
| "E1 B36940 | loss=3.7624 | lr=0.000267 | step=5315 | 20054s | 12.0GB\n", | |
| "E1 B36960 | loss=3.6373 | lr=0.000267 | step=5316 | 20065s | 12.0GB\n", | |
| "E1 B36980 | loss=3.5859 | lr=0.000267 | step=5316 | 20076s | 12.0GB\n", | |
| "E1 B37000 | loss=3.6466 | lr=0.000266 | step=5317 | 20087s | 12.0GB\n", | |
| "E1 B37020 | loss=3.3120 | lr=0.000266 | step=5317 | 20098s | 12.0GB\n", | |
| "E1 B37040 | loss=3.6111 | lr=0.000266 | step=5318 | 20109s | 12.0GB\n", | |
| "E1 B37060 | loss=4.0343 | lr=0.000265 | step=5319 | 20120s | 12.0GB\n", | |
| "E1 B37080 | loss=3.8327 | lr=0.000265 | step=5319 | 20130s | 12.0GB\n", | |
| "E1 B37100 | loss=3.5405 | lr=0.000265 | step=5320 | 20141s | 12.0GB\n", | |
| "E1 B37120 | loss=3.6769 | lr=0.000265 | step=5321 | 20152s | 12.0GB\n", | |
| "E1 B37140 | loss=3.4299 | lr=0.000265 | step=5321 | 20163s | 12.0GB\n", | |
| "E1 B37160 | loss=3.8798 | lr=0.000264 | step=5322 | 20174s | 12.0GB\n", | |
| "E1 B37180 | loss=3.0131 | lr=0.000264 | step=5322 | 20185s | 12.0GB\n", | |
| "E1 B37200 | loss=3.5730 | lr=0.000264 | step=5323 | 20195s | 12.0GB\n", | |
| "E1 B37220 | loss=3.6217 | lr=0.000263 | step=5324 | 20206s | 12.0GB\n", | |
| "E1 B37240 | loss=3.7855 | lr=0.000263 | step=5324 | 20217s | 12.0GB\n", | |
| "E1 B37260 | loss=3.3359 | lr=0.000263 | step=5325 | 20228s | 12.0GB\n", | |
| "E1 B37280 | loss=3.5552 | lr=0.000263 | step=5326 | 20239s | 12.0GB\n", | |
| "E1 B37300 | loss=2.4886 | lr=0.000263 | step=5326 | 20250s | 12.0GB\n", | |
| "E1 B37320 | loss=3.9733 | lr=0.000262 | step=5327 | 20261s | 12.0GB\n", | |
| "E1 B37340 | loss=3.3662 | lr=0.000262 | step=5327 | 20271s | 12.0GB\n", | |
| "E1 B37360 | loss=3.9401 | lr=0.000262 | step=5328 | 20282s | 12.0GB\n", | |
| "E1 B37380 | loss=3.5721 | lr=0.000261 | step=5329 | 20293s | 12.0GB\n", | |
| "E1 B37400 | loss=3.7174 | lr=0.000261 | step=5329 | 20304s | 12.0GB\n", | |
| "E1 B37420 | loss=3.4538 | lr=0.000261 | step=5330 | 20315s | 12.0GB\n", | |
| "E1 B37440 | loss=3.8217 | lr=0.000260 | step=5331 | 20325s | 12.0GB\n", | |
| "E1 B37460 | loss=3.4167 | lr=0.000260 | step=5331 | 20336s | 12.0GB\n", | |
| "E1 B37480 | loss=2.9459 | lr=0.000260 | step=5332 | 20347s | 12.0GB\n", | |
| "E1 B37500 | loss=3.7515 | lr=0.000260 | step=5332 | 20358s | 12.0GB\n", | |
| "E1 B37520 | loss=3.8459 | lr=0.000260 | step=5333 | 20369s | 12.0GB\n", | |
| "E1 B37540 | loss=3.5853 | lr=0.000259 | step=5334 | 20380s | 12.0GB\n", | |
| "E1 B37560 | loss=2.9558 | lr=0.000259 | step=5334 | 20391s | 12.0GB\n", | |
| "E1 B37580 | loss=3.7609 | lr=0.000259 | step=5335 | 20401s | 12.0GB\n", | |
| "E1 B37600 | loss=3.2636 | lr=0.000258 | step=5336 | 20412s | 12.0GB\n", | |
| "E1 B37620 | loss=3.2442 | lr=0.000258 | step=5336 | 20423s | 12.0GB\n", | |
| "E1 B37640 | loss=3.3707 | lr=0.000258 | step=5337 | 20434s | 12.0GB\n", | |
| "E1 B37660 | loss=3.3732 | lr=0.000258 | step=5337 | 20445s | 12.0GB\n", | |
| "E1 B37680 | loss=3.7993 | lr=0.000258 | step=5338 | 20455s | 12.0GB\n", | |
| "E1 B37700 | loss=3.3523 | lr=0.000257 | step=5339 | 20466s | 12.0GB\n", | |
| "E1 B37720 | loss=3.3059 | lr=0.000257 | step=5339 | 20477s | 12.0GB\n", | |
| "E1 B37740 | loss=3.2976 | lr=0.000257 | step=5340 | 20488s | 12.0GB\n", | |
| "E1 B37760 | loss=2.7305 | lr=0.000256 | step=5341 | 20499s | 12.0GB\n", | |
| "E1 B37780 | loss=3.7620 | lr=0.000256 | step=5341 | 20509s | 12.0GB\n", | |
| "E1 B37800 | loss=3.0444 | lr=0.000256 | step=5342 | 20520s | 12.0GB\n", | |
| "E1 B37820 | loss=3.7213 | lr=0.000256 | step=5342 | 20531s | 12.0GB\n", | |
| "E1 B37840 | loss=3.6728 | lr=0.000256 | step=5343 | 20542s | 12.0GB\n", | |
| "E1 B37860 | loss=3.7177 | lr=0.000255 | step=5344 | 20553s | 12.0GB\n", | |
| "E1 B37880 | loss=2.8857 | lr=0.000255 | step=5344 | 20563s | 12.0GB\n", | |
| "E1 B37900 | loss=3.2180 | lr=0.000255 | step=5345 | 20574s | 12.0GB\n", | |
| "E1 B37920 | loss=3.8378 | lr=0.000254 | step=5346 | 20585s | 12.0GB\n", | |
| "E1 B37940 | loss=3.6831 | lr=0.000254 | step=5346 | 20596s | 12.0GB\n", | |
| "E1 B37960 | loss=3.1581 | lr=0.000254 | step=5347 | 20607s | 12.0GB\n", | |
| "E1 B37980 | loss=3.2254 | lr=0.000254 | step=5347 | 20618s | 12.0GB\n", | |
| "E1 B38000 | loss=3.2753 | lr=0.000253 | step=5348 | 20629s | 12.0GB\n", | |
| "E1 B38020 | loss=3.3482 | lr=0.000253 | step=5349 | 20640s | 12.0GB\n", | |
| "E1 B38040 | loss=3.9132 | lr=0.000253 | step=5349 | 20650s | 12.0GB\n", | |
| "E1 B38060 | loss=3.6252 | lr=0.000253 | step=5350 | 20661s | 12.0GB\n", | |
| "E1 B38080 | loss=3.3603 | lr=0.000252 | step=5351 | 20672s | 12.0GB\n", | |
| "E1 B38100 | loss=3.6257 | lr=0.000252 | step=5351 | 20683s | 12.0GB\n", | |
| "E1 B38120 | loss=3.6196 | lr=0.000252 | step=5352 | 20694s | 12.0GB\n", | |
| "E1 B38140 | loss=3.5829 | lr=0.000252 | step=5352 | 20705s | 12.0GB\n", | |
| "E1 B38160 | loss=3.6941 | lr=0.000251 | step=5353 | 20716s | 12.0GB\n", | |
| "E1 B38180 | loss=3.2810 | lr=0.000251 | step=5354 | 20726s | 12.0GB\n", | |
| "E1 B38200 | loss=3.9177 | lr=0.000251 | step=5354 | 20737s | 12.0GB\n", | |
| "E1 B38220 | loss=3.5354 | lr=0.000251 | step=5355 | 20748s | 12.0GB\n", | |
| "E1 B38240 | loss=3.1905 | lr=0.000250 | step=5356 | 20759s | 12.0GB\n", | |
| "E1 B38260 | loss=3.1084 | lr=0.000250 | step=5356 | 20770s | 12.0GB\n", | |
| "E1 B38280 | loss=3.3726 | lr=0.000250 | step=5357 | 20781s | 12.0GB\n", | |
| "E1 B38300 | loss=3.7658 | lr=0.000250 | step=5357 | 20791s | 12.0GB\n", | |
| "E1 B38320 | loss=3.8325 | lr=0.000249 | step=5358 | 20802s | 12.0GB\n", | |
| "E1 B38340 | loss=3.3632 | lr=0.000249 | step=5359 | 20813s | 12.0GB\n", | |
| "E1 B38360 | loss=2.9547 | lr=0.000249 | step=5359 | 20824s | 12.0GB\n", | |
| "E1 B38380 | loss=3.5351 | lr=0.000249 | step=5360 | 20835s | 12.0GB\n", | |
| "E1 B38400 | loss=3.7818 | lr=0.000248 | step=5361 | 20846s | 12.0GB\n", | |
| "E1 B38420 | loss=3.6468 | lr=0.000248 | step=5361 | 20857s | 12.0GB\n", | |
| "E1 B38440 | loss=3.1102 | lr=0.000248 | step=5362 | 20867s | 12.0GB\n", | |
| "E1 B38460 | loss=3.7033 | lr=0.000248 | step=5362 | 20878s | 12.0GB\n", | |
| "E1 B38480 | loss=2.8785 | lr=0.000247 | step=5363 | 20889s | 12.0GB\n", | |
| "E1 B38500 | loss=3.5959 | lr=0.000247 | step=5364 | 20900s | 12.0GB\n", | |
| "E1 B38520 | loss=3.3142 | lr=0.000247 | step=5364 | 20911s | 12.0GB\n", | |
| "E1 B38540 | loss=3.5375 | lr=0.000247 | step=5365 | 20921s | 12.0GB\n", | |
| "E1 B38560 | loss=4.1182 | lr=0.000246 | step=5366 | 20932s | 12.0GB\n", | |
| "E1 B38580 | loss=3.3002 | lr=0.000246 | step=5366 | 20943s | 12.0GB\n", | |
| "E1 B38600 | loss=3.3290 | lr=0.000246 | step=5367 | 20954s | 12.0GB\n", | |
| "E1 B38620 | loss=3.5244 | lr=0.000246 | step=5367 | 20965s | 12.0GB\n", | |
| "E1 B38640 | loss=3.7054 | lr=0.000245 | step=5368 | 20976s | 12.0GB\n", | |
| "E1 B38660 | loss=3.6105 | lr=0.000245 | step=5369 | 20987s | 12.0GB\n", | |
| "E1 B38680 | loss=2.9456 | lr=0.000245 | step=5369 | 20997s | 12.0GB\n", | |
| "E1 B38700 | loss=3.3495 | lr=0.000245 | step=5370 | 21008s | 12.0GB\n", | |
| "E1 B38720 | loss=3.5166 | lr=0.000244 | step=5371 | 21019s | 12.0GB\n", | |
| "E1 B38740 | loss=3.4393 | lr=0.000244 | step=5371 | 21030s | 12.0GB\n", | |
| "E1 B38760 | loss=3.5909 | lr=0.000244 | step=5372 | 21041s | 12.0GB\n", | |
| "E1 B38780 | loss=3.3982 | lr=0.000244 | step=5372 | 21051s | 12.0GB\n", | |
| "E1 B38800 | loss=3.5715 | lr=0.000243 | step=5373 | 21062s | 12.0GB\n", | |
| "E1 B38820 | loss=3.7405 | lr=0.000243 | step=5374 | 21073s | 12.0GB\n", | |
| "E1 B38840 | loss=3.4955 | lr=0.000243 | step=5374 | 21084s | 12.0GB\n", | |
| "E1 B38860 | loss=3.9911 | lr=0.000243 | step=5375 | 21095s | 12.0GB\n", | |
| "E1 B38880 | loss=3.4674 | lr=0.000242 | step=5376 | 21106s | 12.0GB\n", | |
| "E1 B38900 | loss=3.1128 | lr=0.000242 | step=5376 | 21117s | 12.0GB\n", | |
| "E1 B38920 | loss=3.6652 | lr=0.000242 | step=5377 | 21128s | 12.0GB\n", | |
| "E1 B38940 | loss=3.8105 | lr=0.000242 | step=5377 | 21138s | 12.0GB\n", | |
| "E1 B38960 | loss=3.7549 | lr=0.000241 | step=5378 | 21149s | 12.0GB\n", | |
| "E1 B38980 | loss=3.3070 | lr=0.000241 | step=5379 | 21160s | 12.0GB\n", | |
| "E1 B39000 | loss=3.6276 | lr=0.000241 | step=5379 | 21171s | 12.0GB\n", | |
| "E1 B39020 | loss=3.6632 | lr=0.000241 | step=5380 | 21182s | 12.0GB\n", | |
| "E1 B39040 | loss=3.6880 | lr=0.000240 | step=5381 | 21193s | 12.0GB\n", | |
| "E1 B39060 | loss=2.9014 | lr=0.000240 | step=5381 | 21204s | 12.0GB\n", | |
| "E1 B39080 | loss=3.2696 | lr=0.000240 | step=5382 | 21214s | 12.0GB\n", | |
| "E1 B39100 | loss=3.2919 | lr=0.000240 | step=5382 | 21225s | 12.0GB\n", | |
| "E1 B39120 | loss=3.7564 | lr=0.000239 | step=5383 | 21236s | 12.0GB\n", | |
| "E1 B39140 | loss=3.1074 | lr=0.000239 | step=5384 | 21247s | 12.0GB\n", | |
| "E1 B39160 | loss=3.7878 | lr=0.000239 | step=5384 | 21258s | 12.0GB\n", | |
| "E1 B39180 | loss=3.1210 | lr=0.000239 | step=5385 | 21269s | 12.0GB\n", | |
| "E1 B39200 | loss=3.5709 | lr=0.000238 | step=5386 | 21279s | 12.0GB\n", | |
| "E1 B39220 | loss=2.9338 | lr=0.000238 | step=5386 | 21291s | 12.0GB\n", | |
| "E1 B39240 | loss=2.9845 | lr=0.000238 | step=5387 | 21301s | 12.0GB\n", | |
| "E1 B39260 | loss=3.2936 | lr=0.000238 | step=5387 | 21312s | 12.0GB\n", | |
| "E1 B39280 | loss=3.3649 | lr=0.000237 | step=5388 | 21323s | 12.0GB\n", | |
| "E1 B39300 | loss=3.7680 | lr=0.000237 | step=5389 | 21334s | 12.0GB\n", | |
| "E1 B39320 | loss=3.3632 | lr=0.000237 | step=5389 | 21345s | 12.0GB\n", | |
| "E1 B39340 | loss=3.9010 | lr=0.000236 | step=5390 | 21355s | 12.0GB\n", | |
| "E1 B39360 | loss=3.5927 | lr=0.000236 | step=5391 | 21366s | 12.0GB\n", | |
| "E1 B39380 | loss=3.7821 | lr=0.000236 | step=5391 | 21377s | 12.0GB\n", | |
| "E1 B39400 | loss=3.4154 | lr=0.000236 | step=5392 | 21388s | 12.0GB\n", | |
| "E1 B39420 | loss=3.5459 | lr=0.000236 | step=5392 | 21399s | 12.0GB\n", | |
| "E1 B39440 | loss=3.3340 | lr=0.000235 | step=5393 | 21410s | 12.0GB\n", | |
| "E1 B39460 | loss=3.8098 | lr=0.000235 | step=5394 | 21421s | 12.0GB\n", | |
| "E1 B39480 | loss=3.1746 | lr=0.000235 | step=5394 | 21431s | 12.0GB\n", | |
| "E1 B39500 | loss=3.6814 | lr=0.000235 | step=5395 | 21442s | 12.0GB\n", | |
| "E1 B39520 | loss=3.6751 | lr=0.000234 | step=5396 | 21453s | 12.0GB\n", | |
| "E1 B39540 | loss=3.7042 | lr=0.000234 | step=5396 | 21464s | 12.0GB\n", | |
| "E1 B39560 | loss=3.3944 | lr=0.000234 | step=5397 | 21475s | 12.0GB\n", | |
| "E1 B39580 | loss=3.8849 | lr=0.000234 | step=5397 | 21486s | 12.0GB\n", | |
| "E1 B39600 | loss=3.7461 | lr=0.000233 | step=5398 | 21496s | 12.0GB\n", | |
| "E1 B39620 | loss=3.8156 | lr=0.000233 | step=5399 | 21507s | 12.0GB\n", | |
| "E1 B39640 | loss=3.2901 | lr=0.000233 | step=5399 | 21518s | 12.0GB\n", | |
| "E1 B39660 | loss=3.8295 | lr=0.000233 | step=5400 | 21529s | 12.0GB\n", | |
| "E1 B39680 | loss=3.3929 | lr=0.000232 | step=5401 | 21540s | 12.0GB\n", | |
| "E1 B39700 | loss=2.8815 | lr=0.000232 | step=5401 | 21551s | 12.0GB\n", | |
| "E1 B39720 | loss=3.9067 | lr=0.000232 | step=5402 | 21562s | 12.0GB\n", | |
| "E1 B39740 | loss=3.5439 | lr=0.000232 | step=5402 | 21572s | 12.0GB\n", | |
| "E1 B39760 | loss=3.7676 | lr=0.000231 | step=5403 | 21583s | 12.0GB\n", | |
| "E1 B39780 | loss=3.6051 | lr=0.000231 | step=5404 | 21594s | 12.0GB\n", | |
| "[SAVED] Checkpoint saved to: /home/spedrox/Transformers/checkpoints/auto_epoch1_step5404.pt\n", | |
| "E1 B39800 | loss=4.0102 | lr=0.000231 | step=5404 | 21606s | 12.0GB\n", | |
| "E1 B39820 | loss=3.4380 | lr=0.000231 | step=5405 | 21617s | 12.0GB\n", | |
| "E1 B39840 | loss=3.3908 | lr=0.000230 | step=5406 | 21628s | 12.0GB\n", | |
| "E1 B39860 | loss=3.0876 | lr=0.000230 | step=5406 | 21639s | 12.0GB\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# ======================== TRAINING LOOP ========================\n", | |
| "model.train()\n", | |
| "if 'global_step' not in globals():\n", | |
| " global_step = global_step if \"global_step\" in dir() and global_step > 0 else 0\n", | |
| "else:\n", | |
| " print(f'Continuing from step {global_step}...')\n", | |
| "if 'best_loss' not in globals():\n", | |
| " best_loss = best_loss if \"best_loss\" in dir() and best_loss < float(\"inf\") else float(\"inf\")\n", | |
| "last_ckpt_time = time.time()\n", | |
| "\n", | |
| "print(f\"\\nStarting training...\")\n", | |
| "print(f\" Epochs: {EPOCHS}, Batch: {BATCH_SIZE}, Accum: {GRAD_ACCUM}\")\n", | |
| "print(f\" Effective batch: {BATCH_SIZE * GRAD_ACCUM}\")\n", | |
| "print(f\" Peak LR: {LEARNING_RATE}, Warmup: {WARMUP_STEPS} steps\")\n", | |
| "print(f\" Est total steps: {TOTAL_STEPS:,}\")\n", | |
| "print()\n", | |
| "\n", | |
| "for epoch in range(EPOCHS):\n", | |
| " total_loss = 0.0\n", | |
| " batch_count = 0\n", | |
| " micro_count = 0\n", | |
| " epoch_start = time.time()\n", | |
| "\n", | |
| " optimizer.zero_grad(set_to_none=True)\n", | |
| "\n", | |
| " for i, batch in enumerate(train_loader):\n", | |
| " # Auto-checkpoint every 2 hours\n", | |
| " if time.time() - last_ckpt_time >= 7200:\n", | |
| " avg = total_loss / max(batch_count, 1)\n", | |
| " save_checkpoint(model, optimizer, epoch, global_step, avg, best_loss,\n", | |
| " CHECKPOINT_DIR, f\"auto_epoch{epoch+1}_step{global_step}.pt\")\n", | |
| " last_ckpt_time = time.time()\n", | |
| "\n", | |
| " input_ids = batch[\"input_ids\"].to(device, non_blocking=True)\n", | |
| " labels = batch[\"labels\"].to(device, non_blocking=True)\n", | |
| "\n", | |
| " # Forward\n", | |
| " with autocast(device_type=\"cuda\", dtype=amp_dtype):\n", | |
| " x = model.tgt_embed(input_ids)\n", | |
| " for layer in model.decoder.layers:\n", | |
| " x, _ = layer(x, tgt_mask=None, use_cache=False)\n", | |
| " x = model.decoder.norm(x)\n", | |
| " logits = model.project(x)\n", | |
| "\n", | |
| " shift_logits = logits[..., :-1, :].contiguous()\n", | |
| " shift_labels = labels[..., 1:].contiguous()\n", | |
| " loss = nn.CrossEntropyLoss(ignore_index=-100)(\n", | |
| " shift_logits.view(-1, shift_logits.size(-1)),\n", | |
| " shift_labels.view(-1)\n", | |
| " )\n", | |
| " loss = loss / GRAD_ACCUM\n", | |
| "\n", | |
| " # Skip bad batches\n", | |
| " if not torch.isfinite(loss):\n", | |
| " print(f\"[WARN] Non-finite loss at batch {i}, skipping\")\n", | |
| " optimizer.zero_grad(set_to_none=True)\n", | |
| " micro_count = 0\n", | |
| " continue\n", | |
| "\n", | |
| " # Backward\n", | |
| " if scaler.is_enabled():\n", | |
| " scaler.scale(loss).backward()\n", | |
| " else:\n", | |
| " loss.backward()\n", | |
| "\n", | |
| " real_loss = loss.item() * GRAD_ACCUM\n", | |
| " total_loss += real_loss\n", | |
| " batch_count += 1\n", | |
| " micro_count += 1\n", | |
| "\n", | |
| " # Optimizer step every GRAD_ACCUM micro-batches\n", | |
| " if micro_count >= GRAD_ACCUM:\n", | |
| " if scaler.is_enabled():\n", | |
| " scaler.unscale_(optimizer)\n", | |
| "\n", | |
| " grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)\n", | |
| "\n", | |
| " if scaler.is_enabled():\n", | |
| " scaler.step(optimizer)\n", | |
| " scaler.update()\n", | |
| " else:\n", | |
| " optimizer.step()\n", | |
| "\n", | |
| " optimizer.zero_grad(set_to_none=True)\n", | |
| " global_step += 1\n", | |
| " micro_count = 0\n", | |
| "\n", | |
| " # Update LR\n", | |
| " lr = get_lr(global_step)\n", | |
| " for pg in optimizer.param_groups:\n", | |
| " pg['lr'] = lr\n", | |
| "\n", | |
| " # Log to WandB\n", | |
| " if USE_WANDB:\n", | |
| " log_dict = {\n", | |
| " \"train/loss\": real_loss,\n", | |
| " \"train/lr\": lr,\n", | |
| " \"train/grad_norm\": float(grad_norm),\n", | |
| " \"train/step\": global_step,\n", | |
| " \"train/epoch\": epoch + 1,\n", | |
| " }\n", | |
| " if torch.cuda.is_available():\n", | |
| " log_dict[\"system/gpu_gb\"] = torch.cuda.memory_allocated() / 1e9\n", | |
| " wandb.log(log_dict, step=global_step)\n", | |
| "\n", | |
| " # Print progress\n", | |
| " if i % 20 == 0:\n", | |
| " elapsed = time.time() - epoch_start\n", | |
| " lr_now = optimizer.param_groups[0]['lr']\n", | |
| " gpu_gb = torch.cuda.memory_allocated() / 1e9 if torch.cuda.is_available() else 0\n", | |
| " print(f\"E{epoch+1} B{i:>5} | loss={real_loss:.4f} | lr={lr_now:.6f} | \"\n", | |
| " f\"step={global_step} | {elapsed:.0f}s | {gpu_gb:.1f}GB\")\n", | |
| "\n", | |
| " if i % 50 == 0 and torch.cuda.is_available():\n", | |
| " torch.cuda.empty_cache()\n", | |
| "\n", | |
| " # Flush leftover micro-batches\n", | |
| " if micro_count > 0:\n", | |
| " if scaler.is_enabled():\n", | |
| " scaler.unscale_(optimizer)\n", | |
| " torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)\n", | |
| " if scaler.is_enabled():\n", | |
| " scaler.step(optimizer)\n", | |
| " scaler.update()\n", | |
| " else:\n", | |
| " optimizer.step()\n", | |
| " optimizer.zero_grad(set_to_none=True)\n", | |
| " global_step += 1\n", | |
| "\n", | |
| " # End of epoch\n", | |
| " avg_loss = total_loss / max(batch_count, 1)\n", | |
| " duration = time.time() - epoch_start\n", | |
| " print(f\"\\nEpoch {epoch+1}/{EPOCHS} done | avg_loss={avg_loss:.4f} | {duration:.0f}s\")\n", | |
| "\n", | |
| " if USE_WANDB:\n", | |
| " wandb.log({\"epoch/avg_loss\": avg_loss, \"epoch/duration\": duration}, step=global_step)\n", | |
| "\n", | |
| " if avg_loss < best_loss:\n", | |
| " best_loss = avg_loss\n", | |
| " save_checkpoint(model, optimizer, epoch, global_step, avg_loss, best_loss,\n", | |
| " CHECKPOINT_DIR, \"best_model.pt\")\n", | |
| " if USE_WANDB:\n", | |
| " wandb.log({\"train/best_loss\": best_loss}, step=global_step)\n", | |
| "\n", | |
| " save_checkpoint(model, optimizer, epoch, global_step, avg_loss, best_loss,\n", | |
| " CHECKPOINT_DIR, f\"epoch_{epoch+1}.pt\")\n", | |
| "\n", | |
| "# Final\n", | |
| "print(\"\\nTraining complete!\")\n", | |
| "save_checkpoint(model, optimizer, EPOCHS-1, global_step, avg_loss, best_loss,\n", | |
| " CHECKPOINT_DIR, \"final_model.pt\")\n", | |
| "if USE_WANDB:\n", | |
| " wandb.finish()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Emergency save — model is still in GPU memory!\n", | |
| "save_checkpoint(\n", | |
| " model, optimizer, 0, global_step, 0.0, best_loss,\n", | |
| " CHECKPOINT_DIR, f\"manual_step{global_step}.pt\"\n", | |
| ")\n", | |
| "print(f\"Saved at step {global_step}!\")\n" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } | |
Xet Storage Details
- Size:
- 179 kB
- Xet hash:
- ffd65da4a74926639b44627b45b476e6824a02375bf61dc1a208e5e6cac9764f
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.