diff --git a/New/REG/README.md b/New/REG/README.md new file mode 100644 index 0000000000000000000000000000000000000000..789cd399f528ec8055e8c58beccb7bbb6f04c5c5 --- /dev/null +++ b/New/REG/README.md @@ -0,0 +1,156 @@ +

+

Representation Entanglement for Generation: Training Diffusion Transformers Is Much Easier Than You Think (NeurIPS 2025 Oral) +

+

+ Ge Wu$^{1}$, Shen Zhang$^{3}$, Ruijing Shi$^{1}$, Shanghua Gao$^{4}$, Zhenyuan Chen$^{1}$, Lei Wang$^{1}$, Zhaowei Chen$^{3}$, Hongcheng Gao$^{5}$, Yao Tang$^{3}$, Jian Yang$^{1}$, Ming-Ming Cheng$^{1,2}$, Xiang Li$^{1,2*}$

+ $^{1}$ VCIP, CS, Nankai University, $^{2}$ NKIARI, Shenzhen Futian, $^{3}$ JIIOV Technology, + $^{4}$ Harvard University, $^{5}$ University of Chinese Academy of Sciences +

+

+ + + +
+

+

+

+

+
+
+## 🚩 Overview
+
+![overview](fig/reg.png)
+
+REPA and its variants effectively mitigate training challenges in diffusion models by incorporating external visual representations from pretrained models, aligning the noisy hidden projections of the denoising network with clean image representations from a pretrained foundation model.
+We argue that this external alignment, which is absent during the entire denoising inference process, falls short of fully harnessing the potential of discriminative representations.
+
+In this work, we propose a straightforward method called Representation Entanglement for Generation (REG), which entangles low-level image latents with a single high-level class token from pretrained foundation models for denoising.
+REG learns to produce coherent image-class pairs directly from pure noise,
+substantially improving both generation quality and training efficiency.
+This is accomplished with negligible additional inference overhead, **requiring only a single additional token for denoising (<0.5\% increase in FLOPs and latency).**
+The inference process concurrently reconstructs both the image latents and their corresponding global semantics, and the acquired semantic knowledge actively guides and enhances the image generation process.
+
+On ImageNet $256{\times}256$, SiT-XL/2 + REG demonstrates remarkable convergence acceleration, **achieving $\textbf{63}\times$ and $\textbf{23}\times$ faster training than SiT-XL/2 and SiT-XL/2 + REPA, respectively.**
+More impressively, SiT-L/2 + REG trained for merely 400K iterations outperforms SiT-XL/2 + REPA trained for 4M iterations ($\textbf{10}\times$ longer).
+
+
+## 📰 News
+
+- **[2025.08.05]** We have released the pre-trained weights of REG + SiT-XL/2 at 4M iterations (800 epochs).
+
+
+## 📝 Results
+
+- FID = 1.36 on ImageNet $256{\times}256$, achieved by introducing a single class token.
+- $\textbf{63}\times$ and $\textbf{23}\times$ faster training than SiT-XL/2 and SiT-XL/2 + REPA.
+
+Results +
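The entanglement itself is simple to state in code. The following is a rough editorial sketch, not the repository's implementation (names and shapes are illustrative); it mirrors the linear interpolant and v-prediction targets used in `loss.py` later in this diff, noising a class token on the same schedule as the image latents so the network denoises both jointly:

```python
import torch

def entangled_inputs(latents, cls_token, t):
    """Noise image latents and a class token with a shared timestep t.

    latents:   (B, C, H, W) clean VAE latents
    cls_token: (B, D) class token from a pretrained encoder (e.g., DINOv2)
    t:         (B,) timesteps in [0, 1]
    """
    alpha, sigma = 1 - t, t                       # linear path: x_t = (1 - t) * x_0 + t * noise
    noise_img = torch.randn_like(latents)
    noise_cls = torch.randn_like(cls_token)
    x_t = alpha.view(-1, 1, 1, 1) * latents + sigma.view(-1, 1, 1, 1) * noise_img
    c_t = alpha.view(-1, 1) * cls_token + sigma.view(-1, 1) * noise_cls
    # v-prediction targets on the linear path: d/dt x_t = noise - x_0
    v_img = noise_img - latents
    v_cls = noise_cls - cls_token
    return x_t, c_t, v_img, v_cls
```

At inference, the latents and the class token both start from pure noise and are denoised together, which is why only one extra token (and <0.5% extra FLOPs) is needed.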
+
+
+## 📋 Plan
+- More training steps on ImageNet 256 & 512 and T2I.
+
+
+## 👊 Usage
+
+### 1. Environment setup
+
+```bash
+conda create -n reg python=3.10.16 -y
+conda activate reg
+pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1
+pip install -r requirements.txt
+```
+
+### 2. Dataset
+
+#### Dataset download
+
+Currently, we provide experiments for ImageNet. Place the data wherever you like and specify its location via the `--data-dir` argument in the training scripts.
+
+#### Preprocessing data
+Please refer to the preprocessing guide. Alternatively, you can directly download our processed data: the ImageNet data [link](https://huggingface.co/WindATree/ImageNet-256-VAE/tree/main), and the ImageNet data after the VAE encoder [link](https://huggingface.co/WindATree/vae-sd/tree/main).
+
+### 3. Training
+Run train.sh:
+```bash
+bash train.sh
+```
+
+train.sh contains the following content.
+```bash
+# Note: SiT-L/XL use --encoder-depth=8, SiT-B uses 4.
+accelerate launch --multi_gpu --num_processes $NUM_GPUS train.py \
+  --report-to="wandb" \
+  --allow-tf32 \
+  --mixed-precision="fp16" \
+  --seed=0 \
+  --path-type="linear" \
+  --prediction="v" \
+  --weighting="uniform" \
+  --model="SiT-B/2" \
+  --enc-type="dinov2-vit-b" \
+  --proj-coeff=0.5 \
+  --encoder-depth=4 \
+  --output-dir="your_path" \
+  --exp-name="linear-dinov2-b-enc4" \
+  --batch-size=256 \
+  --data-dir="data_path/imagenet_vae" \
+  --cls=0.03
+```
+
+The script will then automatically create a folder under `--output-dir` to save logs and checkpoints. You can adjust the following options:
+
+- `--model`: `[SiT-B/2, SiT-L/2, SiT-XL/2]`
+- `--enc-type`: `[dinov2-vit-b, clip-vit-L]`
+- `--proj-coeff`: Any value larger than 0
+- `--encoder-depth`: Any integer between 1 and the depth of the model
+- `--output-dir`: Any directory where you want to save checkpoints and logs
+- `--exp-name`: Any string name (the folder will be created under `output-dir`)
+- `--cls`: Weight coefficient of the REG loss
+
+
+### 4. Generate images and evaluation
+You can generate images and get the final results with the following script.
+The pre-trained weights of REG can be found at this [link](https://pan.baidu.com/s/1QX2p3ybh1KfNU7wsp5McWw?pwd=khpp) or on [HF](https://huggingface.co/Martinser/REG/tree/main).
+
+```bash
+bash eval.sh
+```
+
+
+## Citation
+If you find our work, this repository, or the pretrained models useful, please consider giving a star and a citation.
+```
+@article{wu2025representation,
+  title={Representation Entanglement for Generation: Training Diffusion Transformers Is Much Easier Than You Think},
+  author={Wu, Ge and Zhang, Shen and Shi, Ruijing and Gao, Shanghua and Chen, Zhenyuan and Wang, Lei and Chen, Zhaowei and Gao, Hongcheng and Tang, Yao and Yang, Jian and others},
+  journal={arXiv preprint arXiv:2507.01467},
+  year={2025}
+}
+```
+
+## Contact
+If you have any questions, please create an issue on this repository, contact us at gewu.nku@gmail.com, or reach out via WeChat (wg1158848).
+
+
+## Acknowledgements
+
+Our code is based on [REPA](https://github.com/sihyun-yu/REPA), along with the [SiT](https://github.com/willisma/SiT), [DINOv2](https://github.com/facebookresearch/dinov2), [ADM](https://github.com/openai/guided-diffusion) and [U-ViT](https://github.com/baofff/U-ViT) repositories. We thank the authors for releasing their code. If you use our model and code, please consider citing these works as well.
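One orientation note on the downloaded weights: per `generate.py` below, checkpoints saved by `train.py` contain an `'ema'` entry, which is what the sampler loads. A minimal sanity-check sketch (the path is illustrative, not a file shipped with the repo):

```python
import torch

# Illustrative path; point this at the downloaded REG checkpoint.
ckpt = torch.load("your_path/checkpoints/4000000.pt", map_location="cpu")
print(list(ckpt.keys()))       # expect an 'ema' entry, as read by generate.py
ema_state = ckpt["ema"]        # EMA weights used for sampling
print(f"{len(ema_state)} tensors in the EMA state dict")
```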
+ + + diff --git a/New/REG/eval.sh b/New/REG/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..4dc30ccb256d66edd3479eecfd420b6edad2fb53 --- /dev/null +++ b/New/REG/eval.sh @@ -0,0 +1,52 @@ + +random_number=$((RANDOM % 100 + 1200)) +NUM_GPUS=8 +STEP="4000000" +SAVE_PATH="your_path/reg_xlarge_dinov2_base_align_8_cls/linear-dinov2-b-enc8" +VAE_PATH="your_vae_path/" +NUM_STEP=250 +MODEL_SIZE='XL' +CFG_SCALE=2.3 +CLS_CFG_SCALE=2.3 +GH=0.85 + +export NCCL_P2P_DISABLE=1 + +python -m torch.distributed.launch --master_port=$random_number --nproc_per_node=$NUM_GPUS generate.py \ + --model SiT-XL/2 \ + --num-fid-samples 50000 \ + --ckpt ${SAVE_PATH}/checkpoints/${STEP}.pt \ + --path-type=linear \ + --encoder-depth=8 \ + --projector-embed-dims=768 \ + --per-proc-batch-size=64 \ + --mode=sde \ + --num-steps=${NUM_STEP} \ + --cfg-scale=${CFG_SCALE} \ + --cls-cfg-scale=${CLS_CFG_SCALE} \ + --guidance-high=${GH} \ + --sample-dir ${SAVE_PATH}/checkpoints \ + --cls=768 + + +python ./evaluations/evaluator.py \ + --ref_batch your_path/VIRTUAL_imagenet256_labeled.npz \ + --sample_batch ${SAVE_PATH}/checkpoints/SiT-${MODEL_SIZE}-2-${STEP}-size-256-vae-ema-cfg-${CFG_SCALE}-seed-0-sde-${GH}-${CLS_CFG_SCALE}.npz \ + --save_path ${SAVE_PATH}/checkpoints \ + --cfg_cond 1 \ + --step ${STEP} \ + --num_steps ${NUM_STEP} \ + --cfg ${CFG_SCALE} \ + --cls_cfg ${CLS_CFG_SCALE} \ + --gh ${GH} + + + + + + + + + + + diff --git a/New/REG/generate.py b/New/REG/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..57d9db3cca5e51918c62bc50bd889d3c66f1d663 --- /dev/null +++ b/New/REG/generate.py @@ -0,0 +1,253 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +""" +Samples a large number of images from a pre-trained SiT model using DDP. +Subsequently saves a .npz file that can be used to compute FID and other +evaluation metrics via the ADM repo: https://github.com/openai/guided-diffusion/tree/main/evaluations + +For a simple single-GPU/CPU sampling script, see sample.py. +""" +import torch +import torch.distributed as dist +from models.sit import SiT_models +from diffusers.models import AutoencoderKL +from tqdm import tqdm +import os +from PIL import Image +import numpy as np +import math +import argparse +from samplers import euler_maruyama_sampler, euler_sampler +from utils import load_legacy_checkpoints, download_model + + +def create_npz_from_sample_folder(sample_dir, num=50_000): + """ + Builds a single .npz file from a folder of .png samples. + """ + samples = [] + for i in tqdm(range(num), desc="Building .npz file from samples"): + sample_pil = Image.open(f"{sample_dir}/{i:06d}.png") + sample_np = np.asarray(sample_pil).astype(np.uint8) + samples.append(sample_np) + samples = np.stack(samples) + assert samples.shape == (num, samples.shape[1], samples.shape[2], 3) + npz_path = f"{sample_dir}.npz" + np.savez(npz_path, arr_0=samples) + print(f"Saved .npz file to {npz_path} [shape={samples.shape}].") + return npz_path + + +def main(args): + """ + Run sampling. + """ + torch.backends.cuda.matmul.allow_tf32 = args.tf32 # True: fast but may lead to some small numerical differences + assert torch.cuda.is_available(), "Sampling with DDP requires at least one GPU. 
sample.py supports CPU-only usage"
+    torch.set_grad_enabled(False)
+
+    # Setup DDP:
+    dist.init_process_group("nccl")
+    rank = dist.get_rank()
+    device = rank % torch.cuda.device_count()
+    seed = args.global_seed * dist.get_world_size() + rank
+    torch.manual_seed(seed)
+    torch.cuda.set_device(device)
+    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")
+
+    # Load model:
+    block_kwargs = {"fused_attn": args.fused_attn, "qk_norm": args.qk_norm}
+    latent_size = args.resolution // 8
+    model = SiT_models[args.model](
+        input_size=latent_size,
+        num_classes=args.num_classes,
+        use_cfg=True,
+        z_dims=[int(z_dim) for z_dim in args.projector_embed_dims.split(',')],
+        encoder_depth=args.encoder_depth,
+        **block_kwargs,
+    ).to(device)
+    # Auto-download a pre-trained model or load a custom SiT checkpoint from train.py:
+    ckpt_path = args.ckpt
+
+    if ckpt_path is None:
+        # No path given: fall back to the released SiT-XL/2 checkpoint.
+        args.ckpt = 'SiT-XL-2-256x256.pt'
+        assert args.model == 'SiT-XL/2'
+        assert len(args.projector_embed_dims.split(',')) == 1
+        assert int(args.projector_embed_dims.split(',')[0]) == 768
+        state_dict = download_model('last.pt')
+    else:
+        # train.py checkpoints bundle several state dicts; sample from the EMA weights.
+        state_dict = torch.load(ckpt_path, map_location=f'cuda:{device}')['ema']
+
+    if args.legacy:
+        state_dict = load_legacy_checkpoints(
+            state_dict=state_dict, encoder_depth=args.encoder_depth
+        )
+    model.load_state_dict(state_dict)
+
+    model.eval()  # important!
+    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)
+    # vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path="your_local_path/weight/").to(device)
+
+    # Create folder to save samples:
+    model_string_name = args.model.replace("/", "-")
+    ckpt_string_name = os.path.basename(args.ckpt).replace(".pt", "") if args.ckpt else "pretrained"
+    folder_name = f"{model_string_name}-{ckpt_string_name}-size-{args.resolution}-vae-{args.vae}-" \
+                  f"cfg-{args.cfg_scale}-seed-{args.global_seed}-{args.mode}-{args.guidance_high}-{args.cls_cfg_scale}"
+    sample_folder_dir = f"{args.sample_dir}/{folder_name}"
+    if rank == 0:
+        os.makedirs(sample_folder_dir, exist_ok=True)
+        print(f"Saving .png samples at {sample_folder_dir}")
+    dist.barrier()
+
+    # Figure out how many samples we need to generate on each GPU and how many iterations we need to run:
+    n = args.per_proc_batch_size
+    global_batch_size = n * dist.get_world_size()
+    # To make things evenly-divisible, we'll sample a bit more than we need and then discard the extra samples:
+    total_samples = int(math.ceil(args.num_fid_samples / global_batch_size) * global_batch_size)
+    if rank == 0:
+        print(f"Total number of images that will be sampled: {total_samples}")
+        print(f"SiT Parameters: {sum(p.numel() for p in model.parameters()):,}")
+        print(f"projector Parameters: {sum(p.numel() for p in model.projectors.parameters()):,}")
+    assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size"
+    samples_needed_this_gpu = int(total_samples // dist.get_world_size())
+    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
+    iterations = int(samples_needed_this_gpu // n)
+    pbar = range(iterations)
+    pbar = tqdm(pbar) if rank == 0 else pbar
+    fixed_noise = None
+    if args.fixed_noise_file:
+        try:
+            fixed_noise = torch.load(args.fixed_noise_file, map_location="cpu", weights_only=True)
+        except TypeError:
+            fixed_noise =
torch.load(args.fixed_noise_file, map_location="cpu") + for k in ("z", "y", "cls_z"): + if k not in fixed_noise: + raise KeyError(f"fixed noise file missing key: {k}") + if int(fixed_noise["z"].shape[0]) < total_samples: + raise ValueError( + f"fixed noise size={fixed_noise['z'].shape[0]} < required total_samples={total_samples}" + ) + if rank == 0: + print(f"Using fixed noise file: {args.fixed_noise_file}") + + total = 0 + for _ in pbar: + if fixed_noise is not None: + idx = torch.arange(total + rank, total + global_batch_size, dist.get_world_size(), dtype=torch.long) + z = fixed_noise["z"][idx].to(device=device) + y = fixed_noise["y"][idx].to(device=device) + cls_z = fixed_noise["cls_z"][idx].to(device=device) + else: + z = torch.randn(n, model.in_channels, latent_size, latent_size, device=device) + y = torch.randint(0, args.num_classes, (n,), device=device) + cls_z = torch.randn(n, args.cls, device=device) + + # Sample images: + sampling_kwargs = dict( + model=model, + latents=z, + y=y, + num_steps=args.num_steps, + heun=args.heun, + cfg_scale=args.cfg_scale, + guidance_low=args.guidance_low, + guidance_high=args.guidance_high, + path_type=args.path_type, + cls_latents=cls_z, + args=args + ) + with torch.no_grad(): + if args.mode == "sde": + samples = euler_maruyama_sampler(**sampling_kwargs).to(torch.float32) + elif args.mode == "ode": + samples = euler_sampler(**sampling_kwargs).to(torch.float32) + else: + raise NotImplementedError() + + latents_scale = torch.tensor( + [0.18215, 0.18215, 0.18215, 0.18215, ] + ).view(1, 4, 1, 1).to(device) + latents_bias = -torch.tensor( + [0., 0., 0., 0.,] + ).view(1, 4, 1, 1).to(device) + samples = vae.decode((samples - latents_bias) / latents_scale).sample + samples = (samples + 1) / 2. + samples = torch.clamp( + 255. * samples, 0, 255 + ).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy() + + # Save samples to disk as individual .png files + for i, sample in enumerate(samples): + index = i * dist.get_world_size() + rank + total + Image.fromarray(sample).save(f"{sample_folder_dir}/{index:06d}.png") + total += global_batch_size + + # Make sure all processes have finished saving their samples before attempting to convert to .npz + dist.barrier() + if rank == 0: + create_npz_from_sample_folder(sample_folder_dir, args.num_fid_samples) + print("Done.") + dist.barrier() + dist.destroy_process_group() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # seed + parser.add_argument("--global-seed", type=int, default=0) + + # precision + parser.add_argument("--tf32", action=argparse.BooleanOptionalAction, default=True, + help="By default, use TF32 matmuls. 
This massively accelerates sampling on Ampere GPUs.")
+
+    # logging/saving:
+    parser.add_argument("--ckpt", type=str, default=None, help="Optional path to a SiT checkpoint.")
+    parser.add_argument("--sample-dir", type=str, default="samples")
+
+    # model
+    parser.add_argument("--model", type=str, choices=list(SiT_models.keys()), default="SiT-XL/2")
+    parser.add_argument("--num-classes", type=int, default=1000)
+    parser.add_argument("--encoder-depth", type=int, default=8)
+    parser.add_argument("--resolution", type=int, choices=[256, 512], default=256)
+    parser.add_argument("--fused-attn", action=argparse.BooleanOptionalAction, default=False)
+    parser.add_argument("--qk-norm", action=argparse.BooleanOptionalAction, default=False)
+
+    # vae
+    parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")
+
+    # number of samples
+    parser.add_argument("--per-proc-batch-size", type=int, default=32)
+    parser.add_argument("--num-fid-samples", type=int, default=50_000)
+
+    # sampling related hyperparameters
+    parser.add_argument("--mode", type=str, default="ode")
+    parser.add_argument("--cfg-scale", type=float, default=1.5)
+    parser.add_argument("--cls-cfg-scale", type=float, default=1.5)
+    parser.add_argument("--projector-embed-dims", type=str, default="768,1024")
+    parser.add_argument("--path-type", type=str, default="linear", choices=["linear", "cosine"])
+    parser.add_argument("--num-steps", type=int, default=50)
+    parser.add_argument("--heun", action=argparse.BooleanOptionalAction, default=False)  # only for ode
+    parser.add_argument("--guidance-low", type=float, default=0.)
+    parser.add_argument("--guidance-high", type=float, default=1.)
+    parser.add_argument('--local-rank', default=-1, type=int)
+    parser.add_argument('--cls', default=768, type=int)
+    # will be deprecated
+    parser.add_argument("--legacy", action=argparse.BooleanOptionalAction, default=False)  # legacy checkpoint loading
+    parser.add_argument(
+        "--fixed-noise-file",
+        type=str,
+        default=None,
+        help="Optional .pt with keys z/y/cls_z to force identical initial states across runs.",
+    )
+
+    args = parser.parse_args()
+    main(args)
diff --git a/New/REG/loss.py b/New/REG/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..43bf202aa7d533ec21cfe076e31ab6b43c5c5bd1
--- /dev/null
+++ b/New/REG/loss.py
@@ -0,0 +1,102 @@
+import torch
+import numpy as np
+import torch.nn.functional as F
+
+def mean_flat(x):
+    """
+    Take the mean over all non-batch dimensions.
+    """
+    return torch.mean(x, dim=list(range(1, len(x.size()))))
+
+def sum_flat(x):
+    """
+    Take the sum over all non-batch dimensions.
+    """
+    return torch.sum(x, dim=list(range(1, len(x.size()))))
+
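For concreteness, both helpers reduce every dimension except the batch one, yielding one scalar per sample. A tiny illustrative usage sketch (assumes the definitions above are in scope):

```python
import torch

# assumes mean_flat / sum_flat from loss.py are in scope
x = torch.randn(4, 3, 8, 8)   # (B, C, H, W)
print(mean_flat(x).shape)     # torch.Size([4]) -- one loss term per sample
print(sum_flat(x).shape)      # torch.Size([4])
```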
+ """ + return torch.sum(x, dim=list(range(1, len(x.size())))) + +class SILoss: + def __init__( + self, + prediction='v', + path_type="linear", + weighting="uniform", + encoders=[], + accelerator=None, + latents_scale=None, + latents_bias=None, + ): + self.prediction = prediction + self.weighting = weighting + self.path_type = path_type + self.encoders = encoders + self.accelerator = accelerator + self.latents_scale = latents_scale + self.latents_bias = latents_bias + + def interpolant(self, t): + if self.path_type == "linear": + alpha_t = 1 - t + sigma_t = t + d_alpha_t = -1 + d_sigma_t = 1 + elif self.path_type == "cosine": + alpha_t = torch.cos(t * np.pi / 2) + sigma_t = torch.sin(t * np.pi / 2) + d_alpha_t = -np.pi / 2 * torch.sin(t * np.pi / 2) + d_sigma_t = np.pi / 2 * torch.cos(t * np.pi / 2) + else: + raise NotImplementedError() + + return alpha_t, sigma_t, d_alpha_t, d_sigma_t + + def __call__(self, model, images, model_kwargs=None, zs=None, cls_token=None, + time_input=None, noises=None,): + if model_kwargs == None: + model_kwargs = {} + # sample timesteps + if time_input is None: + if self.weighting == "uniform": + time_input = torch.rand((images.shape[0], 1, 1, 1)) + elif self.weighting == "lognormal": + # sample timestep according to log-normal distribution of sigmas following EDM + rnd_normal = torch.randn((images.shape[0], 1 ,1, 1)) + sigma = rnd_normal.exp() + if self.path_type == "linear": + time_input = sigma / (1 + sigma) + elif self.path_type == "cosine": + time_input = 2 / np.pi * torch.atan(sigma) + + time_input = time_input.to(device=images.device, dtype=images.dtype) + + if noises is None: + noises = torch.randn_like(images) + noises_cls = torch.randn_like(cls_token) + + alpha_t, sigma_t, d_alpha_t, d_sigma_t = self.interpolant(time_input) + + model_input = alpha_t * images + sigma_t * noises + cls_input = alpha_t.squeeze(-1).squeeze(-1) * cls_token + sigma_t.squeeze(-1).squeeze(-1) * noises_cls + if self.prediction == 'v': + model_target = d_alpha_t * images + d_sigma_t * noises + cls_target = d_alpha_t * cls_token + d_sigma_t * noises_cls + else: + raise NotImplementedError() + + model_output, zs_tilde, cls_output = model(model_input, time_input.flatten(), **model_kwargs, + cls_token=cls_input) + + #denoising_loss + denoising_loss = mean_flat((model_output - model_target) ** 2) + denoising_loss_cls = mean_flat((cls_output - cls_target) ** 2) + + # projection loss + proj_loss = 0. 
+ bsz = zs[0].shape[0] + for i, (z, z_tilde) in enumerate(zip(zs, zs_tilde)): + for j, (z_j, z_tilde_j) in enumerate(zip(z, z_tilde)): + z_tilde_j = torch.nn.functional.normalize(z_tilde_j, dim=-1) + z_j = torch.nn.functional.normalize(z_j, dim=-1) + proj_loss += mean_flat(-(z_j * z_tilde_j).sum(dim=-1)) + proj_loss /= (len(zs) * bsz) + + return denoising_loss, proj_loss, time_input, noises, denoising_loss_cls diff --git a/New/REG/requirements.txt b/New/REG/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..38b75c7be1f02fad99e40cf23fb19cc817505a84 --- /dev/null +++ b/New/REG/requirements.txt @@ -0,0 +1,97 @@ + - pip: + absl-py==2.2.2 + accelerate==1.2.1 + aiohappyeyeballs==2.6.1 + aiohttp==3.11.16 + aiosignal==1.3.2 + astunparse==1.6.3 + async-timeout==5.0.1 + attrs==25.3.0 + certifi==2022.12.7 + charset-normalizer==2.1.1 + click==8.1.8 + datasets==2.20.0 + diffusers==0.32.1 + dill==0.3.8 + docker-pycreds==0.4.0 + einops==0.8.1 + filelock==3.13.1 + flatbuffers==25.2.10 + frozenlist==1.5.0 + fsspec==2024.5.0 + ftfy==6.3.1 + gast==0.6.0 + gitdb==4.0.12 + gitpython==3.1.44 + google-pasta==0.2.0 + grpcio==1.71.0 + h5py==3.13.0 + huggingface-hub==0.27.1 + idna==3.4 + importlib-metadata==8.6.1 + jinja2==3.1.4 + joblib==1.4.2 + keras==3.9.2 + libclang==18.1.1 + markdown==3.8 + markdown-it-py==3.0.0 + markupsafe==2.1.5 + mdurl==0.1.2 + ml-dtypes==0.3.2 + mpmath==1.3.0 + multidict==6.4.3 + multiprocess==0.70.16 + namex==0.0.8 + networkx==3.3 + numpy==1.26.4 + opt-einsum==3.4.0 + optree==0.15.0 + packaging==24.2 + pandas==2.2.3 + pillow==11.0.0 + platformdirs==4.3.7 + propcache==0.3.1 + protobuf==4.25.6 + psutil==7.0.0 + pyarrow==19.0.1 + pyarrow-hotfix==0.6 + pygments==2.19.1 + python-dateutil==2.9.0.post0 + pytz==2025.2 + pyyaml==6.0.2 + regex==2024.11.6 + requests==2.32.3 + rich==14.0.0 + safetensors==0.5.3 + scikit-learn==1.5.1 + scipy==1.15.2 + sentry-sdk==2.26.1 + setproctitle==1.3.5 + six==1.17.0 + smmap==5.0.2 + sympy==1.13.1 + tensorboard==2.16.1 + tensorboard-data-server==0.7.2 + tensorflow==2.16.1 + tensorflow-io-gcs-filesystem==0.37.1 + termcolor==3.0.1 + tf-keras==2.16.0 + threadpoolctl==3.6.0 + timm==1.0.12 + tokenizers==0.21.0 + tqdm==4.67.1 + transformers==4.47.0 + triton==2.1.0 + typing-extensions==4.12.2 + tzdata==2025.2 + urllib3==1.26.13 + wandb==0.17.6 + wcwidth==0.2.13 + werkzeug==3.1.3 + wrapt==1.17.2 + xformer==1.0.1 + xformers==0.0.23 + xxhash==3.5.0 + yarl==1.20.0 + zipp==3.21.0 + diff --git a/back/evaluations/README.md b/back/evaluations/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6ad0ab6c0b3982ad60950df7ffa9af5662d31b2b --- /dev/null +++ b/back/evaluations/README.md @@ -0,0 +1,72 @@ +# Evaluations + +To compare different generative models, we use FID, sFID, Precision, Recall, and Inception Score. These metrics can all be calculated using batches of samples, which we store in `.npz` (numpy) files. + +# Download batches + +We provide pre-computed sample batches for the reference datasets, our diffusion models, and several baselines we compare against. These are all stored in `.npz` format. + +Reference dataset batches contain pre-computed statistics over the whole dataset, as well as 10,000 images for computing Precision and Recall. All other batches contain 50,000 images which can be used to compute statistics and Precision/Recall. 
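Since sample batches are plain `.npz` files (the sampling script in this repo stores images under the key `arr_0`, and reference batches may additionally carry precomputed `mu`/`sigma` statistics), a quick sanity check before running the evaluator might look like this (file name illustrative):

```python
import numpy as np

batch = np.load("VIRTUAL_imagenet256_labeled.npz")
print(batch.files)           # e.g. ['arr_0', ...]; reference batches may also hold mu/sigma stats
arr = batch["arr_0"]
print(arr.shape, arr.dtype)  # expect (N, H, W, 3) uint8 images with values in [0, 255]
```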
+ +Here are links to download all of the sample and reference batches: + + * LSUN + * LSUN bedroom: [reference batch](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/bedroom/VIRTUAL_lsun_bedroom256.npz) + * [ADM (dropout)](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/bedroom/admnet_dropout_lsun_bedroom.npz) + * [DDPM](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/bedroom/ddpm_lsun_bedroom.npz) + * [IDDPM](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/bedroom/iddpm_lsun_bedroom.npz) + * [StyleGAN](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/bedroom/stylegan_lsun_bedroom.npz) + * LSUN cat: [reference batch](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/cat/VIRTUAL_lsun_cat256.npz) + * [ADM (dropout)](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/cat/admnet_dropout_lsun_cat.npz) + * [StyleGAN2](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/cat/stylegan2_lsun_cat.npz) + * LSUN horse: [reference batch](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/horse/VIRTUAL_lsun_horse256.npz) + * [ADM (dropout)](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/horse/admnet_dropout_lsun_horse.npz) + * [ADM](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/lsun/horse/admnet_lsun_horse.npz) + + * ImageNet + * ImageNet 64x64: [reference batch](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/64/VIRTUAL_imagenet64_labeled.npz) + * [ADM](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/64/admnet_imagenet64.npz) + * [IDDPM](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/64/iddpm_imagenet64.npz) + * [BigGAN](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/64/biggan_deep_imagenet64.npz) + * ImageNet 128x128: [reference batch](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/128/VIRTUAL_imagenet128_labeled.npz) + * [ADM](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/128/admnet_imagenet128.npz) + * [ADM-G](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/128/admnet_guided_imagenet128.npz) + * [ADM-G, 25 steps](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/128/admnet_guided_25step_imagenet128.npz) + * [BigGAN-deep (trunc=1.0)](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/128/biggan_deep_trunc1_imagenet128.npz) + * ImageNet 256x256: [reference batch](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/VIRTUAL_imagenet256_labeled.npz) + * [ADM](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/admnet_imagenet256.npz) + * [ADM-G](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/admnet_guided_imagenet256.npz) + * [ADM-G, 25 step](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/admnet_guided_25step_imagenet256.npz) + * [ADM-G + ADM-U](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/admnet_guided_upsampled_imagenet256.npz) + * 
[ADM-U](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/admnet_upsampled_imagenet256.npz)
+    * [BigGAN-deep (trunc=1.0)](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/256/biggan_deep_trunc1_imagenet256.npz)
+  * ImageNet 512x512: [reference batch](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/512/VIRTUAL_imagenet512.npz)
+    * [ADM](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/512/admnet_imagenet512.npz)
+    * [ADM-G](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/512/admnet_guided_imagenet512.npz)
+    * [ADM-G, 25 step](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/512/admnet_guided_25step_imagenet512.npz)
+    * [ADM-G + ADM-U](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/512/admnet_guided_upsampled_imagenet512.npz)
+    * [ADM-U](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/512/admnet_upsampled_imagenet512.npz)
+    * [BigGAN-deep (trunc=1.0)](https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/imagenet/512/biggan_deep_trunc1_imagenet512.npz)
+
+# Run evaluations
+
+First, generate or download a batch of samples and download the corresponding reference batch for the given dataset. For this example, we'll use ImageNet 256x256, so the reference batch is `VIRTUAL_imagenet256_labeled.npz` and we can use the sample batch `admnet_guided_upsampled_imagenet256.npz`.
+
+Next, run the `evaluator.py` script. The requirements of this script can be found in [requirements.txt](requirements.txt). Pass two arguments to the script: the reference batch and the sample batch. The script will download the InceptionV3 model used for evaluations into the current working directory (if it is not already present). This file is roughly 100MB.
+
+The output of the script will look something like this, where the first `...` is a bunch of verbose TensorFlow logging:
+
+```
+$ python evaluator.py VIRTUAL_imagenet256_labeled.npz admnet_guided_upsampled_imagenet256.npz
+...
+computing reference batch activations...
+computing/reading reference batch statistics...
+computing sample batch activations...
+computing/reading sample batch statistics...
+Computing evaluations...
+Inception Score: 215.8370361328125 +FID: 3.9425574129223264 +sFID: 6.140433703346162 +Precision: 0.8265 +Recall: 0.5309 +``` diff --git a/back/evaluations/evaluator.py b/back/evaluations/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..8c4375b989eec9d8141fb145e7882bd3dfa0f211 --- /dev/null +++ b/back/evaluations/evaluator.py @@ -0,0 +1,679 @@ +import argparse +import io +import os +import random +import warnings +import zipfile +from abc import ABC, abstractmethod +from contextlib import contextmanager +from functools import partial +from multiprocessing import cpu_count +from multiprocessing.pool import ThreadPool +from typing import Iterable, Optional, Tuple + +import numpy as np +import requests +import tensorflow.compat.v1 as tf +from scipy import linalg +from tqdm.auto import tqdm + +INCEPTION_V3_URL = "https://openaipublic.blob.core.windows.net/diffusion/jul-2021/ref_batches/classify_image_graph_def.pb" +INCEPTION_V3_PATH = "classify_image_graph_def.pb" + +FID_POOL_NAME = "pool_3:0" +FID_SPATIAL_NAME = "mixed_6/conv:0" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--ref_batch", help="path to reference batch npz file") + parser.add_argument("--sample_batch", help="path to sample batch npz file") + parser.add_argument("--save_path", help="path to sample batch npz file") + parser.add_argument("--cfg_cond", default=1, type=int) + parser.add_argument("--step", default=1, type=int) + parser.add_argument("--cfg", default=1.0, type=float) + parser.add_argument("--cls_cfg", default=1.0, type=float) + parser.add_argument("--gh", default=1.0, type=float) + parser.add_argument("--num_steps", default=250, type=int) + args = parser.parse_args() + + if not os.path.exists(args.save_path): + os.mkdir(args.save_path) + + + config = tf.ConfigProto( + allow_soft_placement=True # allows DecodeJpeg to run on CPU in Inception graph + ) + config.gpu_options.allow_growth = True + evaluator = Evaluator(tf.Session(config=config)) + + print("warming up TensorFlow...") + # This will cause TF to print a bunch of verbose stuff now rather + # than after the next print(), to help prevent confusion. 
+ evaluator.warmup() + + print("computing reference batch activations...") + ref_acts = evaluator.read_activations(args.ref_batch) + print("computing/reading reference batch statistics...") + ref_stats, ref_stats_spatial = evaluator.read_statistics(args.ref_batch, ref_acts) + + print("computing sample batch activations...") + sample_acts = evaluator.read_activations(args.sample_batch) + print("computing/reading sample batch statistics...") + sample_stats, sample_stats_spatial = evaluator.read_statistics(args.sample_batch, sample_acts) + + print("Computing evaluations...") + Inception_Score = evaluator.compute_inception_score(sample_acts[0]) + FID = sample_stats.frechet_distance(ref_stats) + sFID = sample_stats_spatial.frechet_distance(ref_stats_spatial) + prec, recall = evaluator.compute_prec_recall(ref_acts[0], sample_acts[0]) + + print("Inception Score:", Inception_Score) + print("FID:", FID) + print("sFID:", sFID) + print("Precision:", prec) + print("Recall:", recall) + + if args.cfg_cond: + file_path = args.save_path + str(args.num_steps) + str(args.step) + str(args.cfg) + str(args.gh) + str(args.cls_cfg)+ "cfg_cond_true.txt" + else: + file_path = args.save_path + str(args.num_steps) + str(args.step) + str(args.cfg) + str(args.gh) + str(args.cls_cfg)+ "cfg_cond_false.txt" + with open(file_path, "w") as file: + file.write("Inception Score: {}\n".format(Inception_Score)) + file.write("FID: {}\n".format(FID)) + file.write("sFID: {}\n".format(sFID)) + file.write("Precision: {}\n".format(prec)) + file.write("Recall: {}\n".format(recall)) + + +class InvalidFIDException(Exception): + pass + + +class FIDStatistics: + def __init__(self, mu: np.ndarray, sigma: np.ndarray): + self.mu = mu + self.sigma = sigma + + def frechet_distance(self, other, eps=1e-6): + """ + Compute the Frechet distance between two sets of statistics. 
+ """ + # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L132 + mu1, sigma1 = self.mu, self.sigma + mu2, sigma2 = other.mu, other.sigma + + mu1 = np.atleast_1d(mu1) + mu2 = np.atleast_1d(mu2) + + sigma1 = np.atleast_2d(sigma1) + sigma2 = np.atleast_2d(sigma2) + + assert ( + mu1.shape == mu2.shape + ), f"Training and test mean vectors have different lengths: {mu1.shape}, {mu2.shape}" + assert ( + sigma1.shape == sigma2.shape + ), f"Training and test covariances have different dimensions: {sigma1.shape}, {sigma2.shape}" + + diff = mu1 - mu2 + + # product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + msg = ( + "fid calculation produces singular product; adding %s to diagonal of cov estimates" + % eps + ) + warnings.warn(msg) + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + m = np.max(np.abs(covmean.imag)) + raise ValueError("Imaginary component {}".format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean + + +class Evaluator: + def __init__( + self, + session, + batch_size=64, + softmax_batch_size=512, + ): + self.sess = session + self.batch_size = batch_size + self.softmax_batch_size = softmax_batch_size + self.manifold_estimator = ManifoldEstimator(session) + with self.sess.graph.as_default(): + self.image_input = tf.placeholder(tf.float32, shape=[None, None, None, 3]) + self.softmax_input = tf.placeholder(tf.float32, shape=[None, 2048]) + self.pool_features, self.spatial_features = _create_feature_graph(self.image_input) + self.softmax = _create_softmax_graph(self.softmax_input) + + def warmup(self): + self.compute_activations(np.zeros([1, 8, 64, 64, 3])) + + def read_activations(self, npz_path: str) -> Tuple[np.ndarray, np.ndarray]: + with open_npz_array(npz_path, "arr_0") as reader: + return self.compute_activations(reader.read_batches(self.batch_size)) + + def compute_activations(self, batches: Iterable[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]: + """ + Compute image features for downstream evals. + + :param batches: a iterator over NHWC numpy arrays in [0, 255]. + :return: a tuple of numpy arrays of shape [N x X], where X is a feature + dimension. The tuple is (pool_3, spatial). 
+ """ + preds = [] + spatial_preds = [] + for batch in tqdm(batches): + batch = batch.astype(np.float32) + pred, spatial_pred = self.sess.run( + [self.pool_features, self.spatial_features], {self.image_input: batch} + ) + preds.append(pred.reshape([pred.shape[0], -1])) + spatial_preds.append(spatial_pred.reshape([spatial_pred.shape[0], -1])) + return ( + np.concatenate(preds, axis=0), + np.concatenate(spatial_preds, axis=0), + ) + + def read_statistics( + self, npz_path: str, activations: Tuple[np.ndarray, np.ndarray] + ) -> Tuple[FIDStatistics, FIDStatistics]: + obj = np.load(npz_path) + if "mu" in list(obj.keys()): + return FIDStatistics(obj["mu"], obj["sigma"]), FIDStatistics( + obj["mu_s"], obj["sigma_s"] + ) + return tuple(self.compute_statistics(x) for x in activations) + + def compute_statistics(self, activations: np.ndarray) -> FIDStatistics: + mu = np.mean(activations, axis=0) + sigma = np.cov(activations, rowvar=False) + return FIDStatistics(mu, sigma) + + def compute_inception_score(self, activations: np.ndarray, split_size: int = 5000) -> float: + softmax_out = [] + for i in range(0, len(activations), self.softmax_batch_size): + acts = activations[i : i + self.softmax_batch_size] + softmax_out.append(self.sess.run(self.softmax, feed_dict={self.softmax_input: acts})) + preds = np.concatenate(softmax_out, axis=0) + # https://github.com/openai/improved-gan/blob/4f5d1ec5c16a7eceb206f42bfc652693601e1d5c/inception_score/model.py#L46 + scores = [] + for i in range(0, len(preds), split_size): + part = preds[i : i + split_size] + kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0))) + kl = np.mean(np.sum(kl, 1)) + scores.append(np.exp(kl)) + return float(np.mean(scores)) + + def compute_prec_recall( + self, activations_ref: np.ndarray, activations_sample: np.ndarray + ) -> Tuple[float, float]: + radii_1 = self.manifold_estimator.manifold_radii(activations_ref) + radii_2 = self.manifold_estimator.manifold_radii(activations_sample) + pr = self.manifold_estimator.evaluate_pr( + activations_ref, radii_1, activations_sample, radii_2 + ) + return (float(pr[0][0]), float(pr[1][0])) + + +class ManifoldEstimator: + """ + A helper for comparing manifolds of feature vectors. + + Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L57 + """ + + def __init__( + self, + session, + row_batch_size=10000, + col_batch_size=10000, + nhood_sizes=(3,), + clamp_to_percentile=None, + eps=1e-5, + ): + """ + Estimate the manifold of given feature vectors. + + :param session: the TensorFlow session. + :param row_batch_size: row batch size to compute pairwise distances + (parameter to trade-off between memory usage and performance). + :param col_batch_size: column batch size to compute pairwise distances. + :param nhood_sizes: number of neighbors used to estimate the manifold. + :param clamp_to_percentile: prune hyperspheres that have radius larger than + the given percentile. + :param eps: small number for numerical stability. 
+ """ + self.distance_block = DistanceBlock(session) + self.row_batch_size = row_batch_size + self.col_batch_size = col_batch_size + self.nhood_sizes = nhood_sizes + self.num_nhoods = len(nhood_sizes) + self.clamp_to_percentile = clamp_to_percentile + self.eps = eps + + def warmup(self): + feats, radii = ( + np.zeros([1, 2048], dtype=np.float32), + np.zeros([1, 1], dtype=np.float32), + ) + self.evaluate_pr(feats, radii, feats, radii) + + def manifold_radii(self, features: np.ndarray) -> np.ndarray: + num_images = len(features) + + # Estimate manifold of features by calculating distances to k-NN of each sample. + radii = np.zeros([num_images, self.num_nhoods], dtype=np.float32) + distance_batch = np.zeros([self.row_batch_size, num_images], dtype=np.float32) + seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32) + + for begin1 in range(0, num_images, self.row_batch_size): + end1 = min(begin1 + self.row_batch_size, num_images) + row_batch = features[begin1:end1] + + for begin2 in range(0, num_images, self.col_batch_size): + end2 = min(begin2 + self.col_batch_size, num_images) + col_batch = features[begin2:end2] + + # Compute distances between batches. + distance_batch[ + 0 : end1 - begin1, begin2:end2 + ] = self.distance_block.pairwise_distances(row_batch, col_batch) + + # Find the k-nearest neighbor from the current batch. + radii[begin1:end1, :] = np.concatenate( + [ + x[:, self.nhood_sizes] + for x in _numpy_partition(distance_batch[0 : end1 - begin1, :], seq, axis=1) + ], + axis=0, + ) + + if self.clamp_to_percentile is not None: + max_distances = np.percentile(radii, self.clamp_to_percentile, axis=0) + radii[radii > max_distances] = 0 + return radii + + def evaluate(self, features: np.ndarray, radii: np.ndarray, eval_features: np.ndarray): + """ + Evaluate if new feature vectors are at the manifold. + """ + num_eval_images = eval_features.shape[0] + num_ref_images = radii.shape[0] + distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32) + batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32) + max_realism_score = np.zeros([num_eval_images], dtype=np.float32) + nearest_indices = np.zeros([num_eval_images], dtype=np.int32) + + for begin1 in range(0, num_eval_images, self.row_batch_size): + end1 = min(begin1 + self.row_batch_size, num_eval_images) + feature_batch = eval_features[begin1:end1] + + for begin2 in range(0, num_ref_images, self.col_batch_size): + end2 = min(begin2 + self.col_batch_size, num_ref_images) + ref_batch = features[begin2:end2] + + distance_batch[ + 0 : end1 - begin1, begin2:end2 + ] = self.distance_block.pairwise_distances(feature_batch, ref_batch) + + # From the minibatch of new feature vectors, determine if they are in the estimated manifold. + # If a feature vector is inside a hypersphere of some reference sample, then + # the new sample lies at the estimated manifold. + # The radii of the hyperspheres are determined from distances of neighborhood size k. 
+ samples_in_manifold = distance_batch[0 : end1 - begin1, :, None] <= radii + batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32) + + max_realism_score[begin1:end1] = np.max( + radii[:, 0] / (distance_batch[0 : end1 - begin1, :] + self.eps), axis=1 + ) + nearest_indices[begin1:end1] = np.argmin(distance_batch[0 : end1 - begin1, :], axis=1) + + return { + "fraction": float(np.mean(batch_predictions)), + "batch_predictions": batch_predictions, + "max_realisim_score": max_realism_score, + "nearest_indices": nearest_indices, + } + + def evaluate_pr( + self, + features_1: np.ndarray, + radii_1: np.ndarray, + features_2: np.ndarray, + radii_2: np.ndarray, + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Evaluate precision and recall efficiently. + + :param features_1: [N1 x D] feature vectors for reference batch. + :param radii_1: [N1 x K1] radii for reference vectors. + :param features_2: [N2 x D] feature vectors for the other batch. + :param radii_2: [N x K2] radii for other vectors. + :return: a tuple of arrays for (precision, recall): + - precision: an np.ndarray of length K1 + - recall: an np.ndarray of length K2 + """ + features_1_status = np.zeros([len(features_1), radii_2.shape[1]], dtype=np.bool_) + features_2_status = np.zeros([len(features_2), radii_1.shape[1]], dtype=np.bool_) + for begin_1 in range(0, len(features_1), self.row_batch_size): + end_1 = begin_1 + self.row_batch_size + batch_1 = features_1[begin_1:end_1] + for begin_2 in range(0, len(features_2), self.col_batch_size): + end_2 = begin_2 + self.col_batch_size + batch_2 = features_2[begin_2:end_2] + batch_1_in, batch_2_in = self.distance_block.less_thans( + batch_1, radii_1[begin_1:end_1], batch_2, radii_2[begin_2:end_2] + ) + features_1_status[begin_1:end_1] |= batch_1_in + features_2_status[begin_2:end_2] |= batch_2_in + return ( + np.mean(features_2_status.astype(np.float64), axis=0), + np.mean(features_1_status.astype(np.float64), axis=0), + ) + + +class DistanceBlock: + """ + Calculate pairwise distances between vectors. + + Adapted from https://github.com/kynkaat/improved-precision-and-recall-metric/blob/f60f25e5ad933a79135c783fcda53de30f42c9b9/precision_recall.py#L34 + """ + + def __init__(self, session): + self.session = session + + # Initialize TF graph to calculate pairwise distances. + with session.graph.as_default(): + self._features_batch1 = tf.placeholder(tf.float32, shape=[None, None]) + self._features_batch2 = tf.placeholder(tf.float32, shape=[None, None]) + distance_block_16 = _batch_pairwise_distances( + tf.cast(self._features_batch1, tf.float16), + tf.cast(self._features_batch2, tf.float16), + ) + self.distance_block = tf.cond( + tf.reduce_all(tf.math.is_finite(distance_block_16)), + lambda: tf.cast(distance_block_16, tf.float32), + lambda: _batch_pairwise_distances(self._features_batch1, self._features_batch2), + ) + + # Extra logic for less thans. + self._radii1 = tf.placeholder(tf.float32, shape=[None, None]) + self._radii2 = tf.placeholder(tf.float32, shape=[None, None]) + dist32 = tf.cast(self.distance_block, tf.float32)[..., None] + self._batch_1_in = tf.math.reduce_any(dist32 <= self._radii2, axis=1) + self._batch_2_in = tf.math.reduce_any(dist32 <= self._radii1[:, None], axis=0) + + def pairwise_distances(self, U, V): + """ + Evaluate pairwise distances between two batches of feature vectors. 
+ """ + return self.session.run( + self.distance_block, + feed_dict={self._features_batch1: U, self._features_batch2: V}, + ) + + def less_thans(self, batch_1, radii_1, batch_2, radii_2): + return self.session.run( + [self._batch_1_in, self._batch_2_in], + feed_dict={ + self._features_batch1: batch_1, + self._features_batch2: batch_2, + self._radii1: radii_1, + self._radii2: radii_2, + }, + ) + + +def _batch_pairwise_distances(U, V): + """ + Compute pairwise distances between two batches of feature vectors. + """ + with tf.variable_scope("pairwise_dist_block"): + # Squared norms of each row in U and V. + norm_u = tf.reduce_sum(tf.square(U), 1) + norm_v = tf.reduce_sum(tf.square(V), 1) + + # norm_u as a column and norm_v as a row vectors. + norm_u = tf.reshape(norm_u, [-1, 1]) + norm_v = tf.reshape(norm_v, [1, -1]) + + # Pairwise squared Euclidean distances. + D = tf.maximum(norm_u - 2 * tf.matmul(U, V, False, True) + norm_v, 0.0) + + return D + + +class NpzArrayReader(ABC): + @abstractmethod + def read_batch(self, batch_size: int) -> Optional[np.ndarray]: + pass + + @abstractmethod + def remaining(self) -> int: + pass + + def read_batches(self, batch_size: int) -> Iterable[np.ndarray]: + def gen_fn(): + while True: + batch = self.read_batch(batch_size) + if batch is None: + break + yield batch + + rem = self.remaining() + num_batches = rem // batch_size + int(rem % batch_size != 0) + return BatchIterator(gen_fn, num_batches) + + +class BatchIterator: + def __init__(self, gen_fn, length): + self.gen_fn = gen_fn + self.length = length + + def __len__(self): + return self.length + + def __iter__(self): + return self.gen_fn() + + +class StreamingNpzArrayReader(NpzArrayReader): + def __init__(self, arr_f, shape, dtype): + self.arr_f = arr_f + self.shape = shape + self.dtype = dtype + self.idx = 0 + + def read_batch(self, batch_size: int) -> Optional[np.ndarray]: + if self.idx >= self.shape[0]: + return None + + bs = min(batch_size, self.shape[0] - self.idx) + self.idx += bs + + if self.dtype.itemsize == 0: + return np.ndarray([bs, *self.shape[1:]], dtype=self.dtype) + + read_count = bs * np.prod(self.shape[1:]) + read_size = int(read_count * self.dtype.itemsize) + data = _read_bytes(self.arr_f, read_size, "array data") + return np.frombuffer(data, dtype=self.dtype).reshape([bs, *self.shape[1:]]) + + def remaining(self) -> int: + return max(0, self.shape[0] - self.idx) + + +class MemoryNpzArrayReader(NpzArrayReader): + def __init__(self, arr): + self.arr = arr + self.idx = 0 + + @classmethod + def load(cls, path: str, arr_name: str): + with open(path, "rb") as f: + arr = np.load(f)[arr_name] + return cls(arr) + + def read_batch(self, batch_size: int) -> Optional[np.ndarray]: + if self.idx >= self.arr.shape[0]: + return None + + res = self.arr[self.idx : self.idx + batch_size] + self.idx += batch_size + return res + + def remaining(self) -> int: + return max(0, self.arr.shape[0] - self.idx) + + +@contextmanager +def open_npz_array(path: str, arr_name: str) -> NpzArrayReader: + with _open_npy_file(path, arr_name) as arr_f: + version = np.lib.format.read_magic(arr_f) + if version == (1, 0): + header = np.lib.format.read_array_header_1_0(arr_f) + elif version == (2, 0): + header = np.lib.format.read_array_header_2_0(arr_f) + else: + yield MemoryNpzArrayReader.load(path, arr_name) + return + shape, fortran, dtype = header + if fortran or dtype.hasobject: + yield MemoryNpzArrayReader.load(path, arr_name) + else: + yield StreamingNpzArrayReader(arr_f, shape, dtype) + + +def _read_bytes(fp, size, 
error_template="ran out of data"): + """ + Copied from: https://github.com/numpy/numpy/blob/fb215c76967739268de71aa4bda55dd1b062bc2e/numpy/lib/format.py#L788-L886 + + Read from file-like object until size bytes are read. + Raises ValueError if not EOF is encountered before size bytes are read. + Non-blocking objects only supported if they derive from io objects. + Required as e.g. ZipExtFile in python 2.6 can return less data than + requested. + """ + data = bytes() + while True: + # io files (default in python3) return None or raise on + # would-block, python2 file will truncate, probably nothing can be + # done about that. note that regular files can't be non-blocking + try: + r = fp.read(size - len(data)) + data += r + if len(r) == 0 or len(data) == size: + break + except io.BlockingIOError: + pass + if len(data) != size: + msg = "EOF: reading %s, expected %d bytes got %d" + raise ValueError(msg % (error_template, size, len(data))) + else: + return data + + +@contextmanager +def _open_npy_file(path: str, arr_name: str): + with open(path, "rb") as f: + with zipfile.ZipFile(f, "r") as zip_f: + if f"{arr_name}.npy" not in zip_f.namelist(): + raise ValueError(f"missing {arr_name} in npz file") + with zip_f.open(f"{arr_name}.npy", "r") as arr_f: + yield arr_f + + +def _download_inception_model(): + if os.path.exists(INCEPTION_V3_PATH): + return + print("downloading InceptionV3 model...") + with requests.get(INCEPTION_V3_URL, stream=True) as r: + r.raise_for_status() + tmp_path = INCEPTION_V3_PATH + ".tmp" + with open(tmp_path, "wb") as f: + for chunk in tqdm(r.iter_content(chunk_size=8192)): + f.write(chunk) + os.rename(tmp_path, INCEPTION_V3_PATH) + + +def _create_feature_graph(input_batch): + _download_inception_model() + prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}" + with open(INCEPTION_V3_PATH, "rb") as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + pool3, spatial = tf.import_graph_def( + graph_def, + input_map={f"ExpandDims:0": input_batch}, + return_elements=[FID_POOL_NAME, FID_SPATIAL_NAME], + name=prefix, + ) + _update_shapes(pool3) + spatial = spatial[..., :7] + return pool3, spatial + + +def _create_softmax_graph(input_batch): + _download_inception_model() + prefix = f"{random.randrange(2**32)}_{random.randrange(2**32)}" + with open(INCEPTION_V3_PATH, "rb") as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + (matmul,) = tf.import_graph_def( + graph_def, return_elements=[f"softmax/logits/MatMul"], name=prefix + ) + w = matmul.inputs[1] + logits = tf.matmul(input_batch, w) + return tf.nn.softmax(logits) + + +def _update_shapes(pool3): + # https://github.com/bioinf-jku/TTUR/blob/73ab375cdf952a12686d9aa7978567771084da42/fid.py#L50-L63 + ops = pool3.graph.get_operations() + for op in ops: + for o in op.outputs: + shape = o.get_shape() + if shape._dims is not None: # pylint: disable=protected-access + # shape = [s.value for s in shape] TF 1.x + shape = [s for s in shape] # TF 2.x + new_shape = [] + for j, s in enumerate(shape): + if s == 1 and j == 0: + new_shape.append(None) + else: + new_shape.append(s) + o.__dict__["_shape_val"] = tf.TensorShape(new_shape) + return pool3 + + +def _numpy_partition(arr, kth, **kwargs): + num_workers = min(cpu_count(), len(arr)) + chunk_size = len(arr) // num_workers + extra = len(arr) % num_workers + + start_idx = 0 + batches = [] + for i in range(num_workers): + size = chunk_size + (1 if i < extra else 0) + batches.append(arr[start_idx : start_idx + size]) + start_idx += size + + with 
ThreadPool(num_workers) as pool: + return list(pool.map(partial(np.partition, kth=kth, **kwargs), batches)) + + +if __name__ == "__main__": + main() diff --git a/back/evaluations/requirements.txt b/back/evaluations/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc6df305a4169b13bcfab5e238e4ff1c97b6baaa --- /dev/null +++ b/back/evaluations/requirements.txt @@ -0,0 +1,4 @@ +tensorflow-gpu>=2.0 +scipy +requests +tqdm \ No newline at end of file diff --git a/back/models/__pycache__/mocov3_vit.cpython-310.pyc b/back/models/__pycache__/mocov3_vit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f42dc274d9be052a4103e51bb5c8cf1553543775 Binary files /dev/null and b/back/models/__pycache__/mocov3_vit.cpython-310.pyc differ diff --git a/back/models/__pycache__/mocov3_vit.cpython-312.pyc b/back/models/__pycache__/mocov3_vit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fd5f3e5a7e1bf22c75d208b2b92b24973bc7ae3 Binary files /dev/null and b/back/models/__pycache__/mocov3_vit.cpython-312.pyc differ diff --git a/back/models/__pycache__/sit.cpython-310.pyc b/back/models/__pycache__/sit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..137892194ba1b446d00abd3bdb323b2c4bf39cc6 Binary files /dev/null and b/back/models/__pycache__/sit.cpython-310.pyc differ diff --git a/back/models/__pycache__/sit.cpython-312.pyc b/back/models/__pycache__/sit.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..123f798b03f83111f9d5dade94cf959120a924bb Binary files /dev/null and b/back/models/__pycache__/sit.cpython-312.pyc differ diff --git a/back/models/clip_vit.py b/back/models/clip_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..299e3c08633bffb5b3a500ebf9c98a5019109191 --- /dev/null +++ b/back/models/clip_vit.py @@ -0,0 +1,426 @@ +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +import clip + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.relu1 = nn.ReLU(inplace=True) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu2 = nn.ReLU(inplace=True) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu3 = nn.ReLU(inplace=True) + + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential(OrderedDict([ + ("-1", nn.AvgPool2d(stride)), + ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)), + ("1", nn.BatchNorm2d(planes * self.expansion)) + ])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu1(self.bn1(self.conv1(x))) + out = self.relu2(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu3(out) + return out + + +class AttentionPool2d(nn.Module): + def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x[:1], key=x, value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False + ) + return x.squeeze(0) + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, layers, output_dim, heads, input_resolution=224, width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.relu1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.relu2 = nn.ReLU(inplace=True) + self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.relu3 = nn.ReLU(inplace=True) + self.avgpool = nn.AvgPool2d(2) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + def stem(x): + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential(OrderedDict([ + ("c_fc", nn.Linear(d_model, d_model * 4)), + ("gelu", QuickGELU()), + ("c_proj", nn.Linear(d_model * 4, d_model)) + ])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None + return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) + + def forward(self, x: torch.Tensor): + return 
self.resblocks(x) + + +class UpdatedVisionTransformer(nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, x: torch.Tensor): + x = self.model.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x = torch.cat([self.model.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.model.positional_embedding.to(x.dtype) + x = self.model.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.model.transformer(x) + x = x.permute(1, 0, 2)[:, 1:] # LND -> NLD + + # x = self.ln_post(x[:, 0, :]) + + # if self.proj is not None: + # x = x @ self.proj + + return x + + +class CLIP(nn.Module): + def __init__(self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int + ): + super().__init__() + + self.context_length = context_length + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width + ) + else: + vision_heads = vision_width // 64 + self.visual = UpdatedVisionTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim + ) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask() + ) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features ** -0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]: + for name, param in resnet_block.named_parameters(): + if name.endswith("bn3.weight"): + nn.init.zeros_(param) + + proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5) + attn_std = self.transformer.width ** -0.5 + fc_std = (2 * self.transformer.width) ** -0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, 
std=proj_std) + + if self.text_projection is not None: + nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model] + + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm(dim=1, keepdim=True) + text_features = text_features / text_features.norm(dim=1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logits_per_image.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(l): + if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)): + l.weight.data = l.weight.data.half() + if l.bias is not None: + l.bias.data = l.bias.data.half() + + if isinstance(l, nn.MultiheadAttention): + for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]: + tensor = getattr(l, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ["text_projection", "proj"]: + if hasattr(l, name): + attr = getattr(l, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(state_dict: dict): + vit = "visual.proj" in state_dict + + if vit: + vision_width = state_dict["visual.conv1.weight"].shape[0] + vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")]) + vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] + grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5) + image_resolution = vision_patch_size * grid_size + else: + counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]] + vision_layers = tuple(counts) + vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0] + output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5) + vision_patch_size = None + assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0] + image_resolution = output_width * 32 + + embed_dim = state_dict["text_projection"].shape[1] + context_length = state_dict["positional_embedding"].shape[0] + vocab_size = 
state_dict["token_embedding.weight"].shape[0] + transformer_width = state_dict["ln_final.weight"].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks"))) + + model = CLIP( + embed_dim, + image_resolution, vision_layers, vision_width, vision_patch_size, + context_length, vocab_size, transformer_width, transformer_heads, transformer_layers + ) + + for key in ["input_resolution", "context_length", "vocab_size"]: + if key in state_dict: + del state_dict[key] + + convert_weights(model) + model.load_state_dict(state_dict) + return model.eval() \ No newline at end of file diff --git a/back/models/jepa.py b/back/models/jepa.py new file mode 100644 index 0000000000000000000000000000000000000000..0d8b2205172ff6d7501898ce3568ed1426e35a23 --- /dev/null +++ b/back/models/jepa.py @@ -0,0 +1,547 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# + +import math +from functools import partial +import numpy as np + +import torch +import torch.nn as nn + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. 
+ tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def repeat_interleave_batch(x, B, repeat): + N = len(x) // B + x = torch.cat([ + torch.cat([x[i*B:(i+1)*B] for _ in range(repeat)], dim=0) + for i in range(N) + ], dim=0) + return x + +def apply_masks(x, masks): + """ + :param x: tensor of shape [B (batch-size), N (num-patches), D (feature-dim)] + :param masks: list of tensors containing indices of patches in [N] to keep + """ + all_x = [] + for m in masks: + mask_keep = m.unsqueeze(-1).repeat(1, 1, x.size(-1)) + all_x += [torch.gather(x, dim=1, index=mask_keep)] + return torch.cat(all_x, dim=0) + +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=float) + grid_w = np.arange(grid_size, dtype=float) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid length + return: + pos_embed: [grid_size, embed_dim] or [1+grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid = np.arange(grid_size, dtype=float) + pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=float) + omega /= embed_dim / 2. + omega = 1. / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +def drop_path(x, drop_prob: float = 0., training: bool = False): + if drop_prob == 0. 
or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class MLP(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x, attn + + +class Block(nn.Module): + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = MLP(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x, return_attention=False): + y, attn = self.attn(self.norm1(x)) + if return_attention: + return attn + x = x + self.drop_path(y) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + num_patches = (img_size // patch_size) * (img_size // patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +class ConvEmbed(nn.Module): + """ + 3x3 Convolution stems for ViT following ViTC models + """ + + def __init__(self, channels, strides, img_size=224, in_chans=3, batch_norm=True): + super().__init__() + # Build the stems + stem = [] + channels = [in_chans] + channels + for i in range(len(channels) - 2): + stem += [nn.Conv2d(channels[i], channels[i+1], kernel_size=3, + stride=strides[i], padding=1, bias=(not batch_norm))] + if batch_norm: + stem += [nn.BatchNorm2d(channels[i+1])] + stem += [nn.ReLU(inplace=True)] + stem += [nn.Conv2d(channels[-2], channels[-1], kernel_size=1, stride=strides[-1])] + self.stem = nn.Sequential(*stem) + + # Comptute the number of patches + stride_prod = int(np.prod(strides)) + self.num_patches = (img_size[0] // stride_prod)**2 + + def forward(self, x): + p = self.stem(x) + return p.flatten(2).transpose(1, 2) + + +class VisionTransformerPredictor(nn.Module): + """ Vision Transformer """ + def __init__( + self, + num_patches, + embed_dim=768, + predictor_embed_dim=384, + depth=6, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + norm_layer=nn.LayerNorm, + init_std=0.02, + **kwargs + ): + super().__init__() + self.predictor_embed = nn.Linear(embed_dim, predictor_embed_dim, bias=True) + self.mask_token = nn.Parameter(torch.zeros(1, 1, predictor_embed_dim)) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + # -- + self.predictor_pos_embed = nn.Parameter(torch.zeros(1, num_patches, predictor_embed_dim), + requires_grad=False) + predictor_pos_embed = get_2d_sincos_pos_embed(self.predictor_pos_embed.shape[-1], + int(num_patches**.5), + cls_token=False) + self.predictor_pos_embed.data.copy_(torch.from_numpy(predictor_pos_embed).float().unsqueeze(0)) + # -- + self.predictor_blocks = nn.ModuleList([ + Block( + dim=predictor_embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + self.predictor_norm = norm_layer(predictor_embed_dim) + self.predictor_proj = nn.Linear(predictor_embed_dim, embed_dim, bias=True) + # ------ + self.init_std = init_std + trunc_normal_(self.mask_token, std=self.init_std) + self.apply(self._init_weights) + self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.predictor_blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + 
rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=self.init_std) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + trunc_normal_(m.weight, std=self.init_std) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x, masks_x, masks): + assert (masks is not None) and (masks_x is not None), 'Cannot run predictor without mask indices' + + if not isinstance(masks_x, list): + masks_x = [masks_x] + + if not isinstance(masks, list): + masks = [masks] + + # -- Batch Size + B = len(x) // len(masks_x) + + # -- map from encoder-dim to pedictor-dim + x = self.predictor_embed(x) + + # -- add positional embedding to x tokens + x_pos_embed = self.predictor_pos_embed.repeat(B, 1, 1) + x += apply_masks(x_pos_embed, masks_x) + + _, N_ctxt, D = x.shape + + # -- concat mask tokens to x + pos_embs = self.predictor_pos_embed.repeat(B, 1, 1) + pos_embs = apply_masks(pos_embs, masks) + pos_embs = repeat_interleave_batch(pos_embs, B, repeat=len(masks_x)) + # -- + pred_tokens = self.mask_token.repeat(pos_embs.size(0), pos_embs.size(1), 1) + # -- + pred_tokens += pos_embs + x = x.repeat(len(masks), 1, 1) + x = torch.cat([x, pred_tokens], dim=1) + + # -- fwd prop + for blk in self.predictor_blocks: + x = blk(x) + x = self.predictor_norm(x) + + # -- return preds for mask tokens + x = x[:, N_ctxt:] + x = self.predictor_proj(x) + + return x + + +class VisionTransformer(nn.Module): + """ Vision Transformer """ + def __init__( + self, + img_size=[224], + patch_size=16, + in_chans=3, + embed_dim=768, + predictor_embed_dim=384, + depth=12, + predictor_depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + norm_layer=nn.LayerNorm, + init_std=0.02, + **kwargs + ): + super().__init__() + self.num_features = self.embed_dim = embed_dim + self.num_heads = num_heads + # -- + self.patch_embed = PatchEmbed( + img_size=img_size[0], + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + # -- + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim), requires_grad=False) + pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], + int(self.patch_embed.num_patches**.5), + cls_token=False) + self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) + # -- + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + # ------ + self.init_std = init_std + self.apply(self._init_weights) + self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=self.init_std) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif 
isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + trunc_normal_(m.weight, std=self.init_std) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x, masks=None): + if masks is not None: + if not isinstance(masks, list): + masks = [masks] + + # -- patchify x + x = self.patch_embed(x) + B, N, D = x.shape + + # -- add positional embedding to x + pos_embed = self.interpolate_pos_encoding(x, self.pos_embed) + x = x + pos_embed + + # -- mask x + if masks is not None: + x = apply_masks(x, masks) + + # -- fwd prop + for i, blk in enumerate(self.blocks): + x = blk(x) + + if self.norm is not None: + x = self.norm(x) + + return x + + def interpolate_pos_encoding(self, x, pos_embed): + npatch = x.shape[1] - 1 + N = pos_embed.shape[1] - 1 + if npatch == N: + return pos_embed + class_emb = pos_embed[:, 0] + pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + pos_embed = nn.functional.interpolate( + pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2), + scale_factor=math.sqrt(npatch / N), + mode='bicubic', + ) + pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_emb.unsqueeze(0), pos_embed), dim=1) + + +def vit_predictor(**kwargs): + model = VisionTransformerPredictor( + mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), + **kwargs) + return model + + +def vit_tiny(patch_size=16, **kwargs): + model = VisionTransformer( + patch_size=patch_size, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, + qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_small(patch_size=16, **kwargs): + model = VisionTransformer( + patch_size=patch_size, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, + qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_base(patch_size=16, **kwargs): + model = VisionTransformer( + patch_size=patch_size, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, + qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_large(patch_size=16, **kwargs): + model = VisionTransformer( + patch_size=patch_size, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, + qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_huge(patch_size=16, **kwargs): + model = VisionTransformer( + patch_size=patch_size, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, + qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_giant(patch_size=16, **kwargs): + model = VisionTransformer( + patch_size=patch_size, embed_dim=1408, depth=40, num_heads=16, mlp_ratio=48/11, + qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +VIT_EMBED_DIMS = { + 'vit_tiny': 192, + 'vit_small': 384, + 'vit_base': 768, + 'vit_large': 1024, + 'vit_huge': 1280, + 'vit_giant': 1408, +} \ No newline at end of file diff --git a/back/models/mae_vit.py b/back/models/mae_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..f391f5317cf7025324d79a3ab5368724810842d6 --- /dev/null +++ b/back/models/mae_vit.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
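+
+# MAE-style ViT encoder. Note that forward_features below drops the class
+# token and returns only the patch tokens (x[:, 1:, :]).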
+# -------------------------------------------------------- +# References: +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- + +from functools import partial + +import torch +import torch.nn as nn + +import timm.models.vision_transformer + + +class VisionTransformer(timm.models.vision_transformer.VisionTransformer): + """ Vision Transformer with support for global average pooling + """ + def __init__(self, global_pool=False, **kwargs): + super(VisionTransformer, self).__init__(**kwargs) + + self.global_pool = global_pool + if self.global_pool: + norm_layer = kwargs['norm_layer'] + embed_dim = kwargs['embed_dim'] + self.fc_norm = norm_layer(embed_dim) + + del self.norm # remove the original norm + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = x[:, 1:, :] #.mean(dim=1) # global pool without cls token + + return x + + +def vit_base_patch16(**kwargs): + model = VisionTransformer( + num_classes=0, + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_large_patch16(**kwargs): + model = VisionTransformer( + num_classes=0, + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model + + +def vit_huge_patch14(**kwargs): + model = VisionTransformer( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return model \ No newline at end of file diff --git a/back/models/mocov3_vit.py b/back/models/mocov3_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..7136596d204769e5d45852bfb0c0ad1c8c183a3b --- /dev/null +++ b/back/models/mocov3_vit.py @@ -0,0 +1,207 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
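+
+# MoCo v3 ViT backbones (vit_small/base/large plus conv-stem variants) with a
+# fixed, non-learnable 2D sin-cos position embedding built at init time.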
+ +import math +import torch +import torch.nn as nn +from functools import partial, reduce +from operator import mul + +from timm.layers.helpers import to_2tuple +from timm.models.vision_transformer import VisionTransformer, _cfg +from timm.models.vision_transformer import PatchEmbed + +__all__ = [ + 'vit_small', + 'vit_base', + 'vit_large', + 'vit_conv_small', + 'vit_conv_base', +] + + +def patchify_avg(input_tensor, patch_size): + # Ensure input tensor is 4D: (batch_size, channels, height, width) + if input_tensor.dim() != 4: + raise ValueError("Input tensor must be 4D (batch_size, channels, height, width)") + + # Get input tensor dimensions + batch_size, channels, height, width = input_tensor.shape + + # Ensure patch_size is valid + patch_height, patch_width = patch_size, patch_size + if height % patch_height != 0 or width % patch_width != 0: + raise ValueError("Input tensor dimensions must be divisible by patch_size") + + # Use unfold to create patches + patches = input_tensor.unfold(2, patch_height, patch_height).unfold(3, patch_width, patch_width) + + # Reshape patches to desired format: (batch_size, num_patches, channels) + patches = patches.contiguous().view( + batch_size, channels, -1, patch_height, patch_width + ).mean(dim=-1).mean(dim=-1) + patches = patches.permute(0, 2, 1).contiguous() + + return patches + + + +class VisionTransformerMoCo(VisionTransformer): + def __init__(self, stop_grad_conv1=False, **kwargs): + super().__init__(**kwargs) + # Use fixed 2D sin-cos position embedding + self.build_2d_sincos_position_embedding() + + # weight initialization + for name, m in self.named_modules(): + if isinstance(m, nn.Linear): + if 'qkv' in name: + # treat the weights of Q, K, V separately + val = math.sqrt(6. / float(m.weight.shape[0] // 3 + m.weight.shape[1])) + nn.init.uniform_(m.weight, -val, val) + else: + nn.init.xavier_uniform_(m.weight) + nn.init.zeros_(m.bias) + nn.init.normal_(self.cls_token, std=1e-6) + + if isinstance(self.patch_embed, PatchEmbed): + # xavier_uniform initialization + val = math.sqrt(6. / float(3 * reduce(mul, self.patch_embed.patch_size, 1) + self.embed_dim)) + nn.init.uniform_(self.patch_embed.proj.weight, -val, val) + nn.init.zeros_(self.patch_embed.proj.bias) + + if stop_grad_conv1: + self.patch_embed.proj.weight.requires_grad = False + self.patch_embed.proj.bias.requires_grad = False + + def build_2d_sincos_position_embedding(self, temperature=10000.): + h = self.patch_embed.img_size[0] // self.patch_embed.patch_size[0] + w = self.patch_embed.img_size[1] // self.patch_embed.patch_size[1] + grid_w = torch.arange(w, dtype=torch.float32) + grid_h = torch.arange(h, dtype=torch.float32) + grid_w, grid_h = torch.meshgrid(grid_w, grid_h) + assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = self.embed_dim // 4 + omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim + omega = 1. 
/ (temperature**omega) + out_w = torch.einsum('m,d->md', [grid_w.flatten(), omega]) + out_h = torch.einsum('m,d->md', [grid_h.flatten(), omega]) + pos_emb = torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], dim=1)[None, :, :] + + # assert self.num_tokens == 1, 'Assuming one and only one token, [cls]' + pe_token = torch.zeros([1, 1, self.embed_dim], dtype=torch.float32) + self.pos_embed = nn.Parameter(torch.cat([pe_token, pos_emb], dim=1)) + self.pos_embed.requires_grad = False + + def forward_diffusion_output(self, x): + x = x.reshape(*x.shape[0:2], -1).permute(0, 2, 1) + x = self._pos_embed(x) + x = self.patch_drop(x) + x = self.norm_pre(x) + x = self.blocks(x) + x = self.norm(x) + return x + +class ConvStem(nn.Module): + """ + ConvStem, from Early Convolutions Help Transformers See Better, Tete et al. https://arxiv.org/abs/2106.14881 + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): + super().__init__() + + assert patch_size == 16, 'ConvStem only supports patch size of 16' + assert embed_dim % 8 == 0, 'Embed dimension must be divisible by 8 for ConvStem' + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + # build stem, similar to the design in https://arxiv.org/abs/2106.14881 + stem = [] + input_dim, output_dim = 3, embed_dim // 8 + for l in range(4): + stem.append(nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=2, padding=1, bias=False)) + stem.append(nn.BatchNorm2d(output_dim)) + stem.append(nn.ReLU(inplace=True)) + input_dim = output_dim + output_dim *= 2 + stem.append(nn.Conv2d(input_dim, embed_dim, kernel_size=1)) + self.proj = nn.Sequential(*stem) + + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
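+        # The four stride-2 convs in the stem downsample by 16x overall,
+        # matching the patch size of 16 asserted in __init__.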
+ x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x + + +def vit_small(**kwargs): + model = VisionTransformerMoCo( + img_size=256, + patch_size=16, embed_dim=384, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model.default_cfg = _cfg() + return model + +def vit_base(**kwargs): + model = VisionTransformerMoCo( + img_size=256, + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model.default_cfg = _cfg() + return model + +def vit_large(**kwargs): + model = VisionTransformerMoCo( + img_size=256, + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model.default_cfg = _cfg() + return model + +def vit_conv_small(**kwargs): + # minus one ViT block + model = VisionTransformerMoCo( + patch_size=16, embed_dim=384, depth=11, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), embed_layer=ConvStem, **kwargs) + model.default_cfg = _cfg() + return model + +def vit_conv_base(**kwargs): + # minus one ViT block + model = VisionTransformerMoCo( + patch_size=16, embed_dim=768, depth=11, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), embed_layer=ConvStem, **kwargs) + model.default_cfg = _cfg() + return model + +def build_mlp(num_layers, input_dim, mlp_dim, output_dim, last_bn=True): + mlp = [] + for l in range(num_layers): + dim1 = input_dim if l == 0 else mlp_dim + dim2 = output_dim if l == num_layers - 1 else mlp_dim + + mlp.append(nn.Linear(dim1, dim2, bias=False)) + + if l < num_layers - 1: + mlp.append(nn.BatchNorm1d(dim2)) + mlp.append(nn.ReLU(inplace=True)) + elif last_bn: + # follow SimCLR's design: https://github.com/google-research/simclr/blob/master/model_util.py#L157 + # for simplicity, we further removed gamma in BN + mlp.append(nn.BatchNorm1d(dim2, affine=False)) + + return nn.Sequential(*mlp) \ No newline at end of file diff --git a/back/models/sit.py b/back/models/sit.py new file mode 100644 index 0000000000000000000000000000000000000000..f7a88609ce901b956a9e03049f7389b5c819d7bf --- /dev/null +++ b/back/models/sit.py @@ -0,0 +1,420 @@ +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# -------------------------------------------------------- +# References: +# GLIDE: https://github.com/openai/glide-text2im +# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py +# -------------------------------------------------------- + +import torch +import torch.nn as nn +import numpy as np +import math +from timm.models.vision_transformer import PatchEmbed, Attention, Mlp + + +def build_mlp(hidden_size, projector_dim, z_dim): + return nn.Sequential( + nn.Linear(hidden_size, projector_dim), + nn.SiLU(), + nn.Linear(projector_dim, projector_dim), + nn.SiLU(), + nn.Linear(projector_dim, z_dim), + ) + +def modulate(x, shift, scale): + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + +################################################################################# +# Embedding Layers for Timesteps and Class Labels # +################################################################################# +class TimestepEmbedder(nn.Module): + """ + Embeds scalar timesteps into vector representations. 
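+
+    Timesteps are expanded into frequency_embedding_size sinusoidal features
+    (GLIDE-style) and then projected to hidden_size by a two-layer SiLU MLP.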
+ """ + def __init__(self, hidden_size, frequency_embedding_size=256): + super().__init__() + self.mlp = nn.Sequential( + nn.Linear(frequency_embedding_size, hidden_size, bias=True), + nn.SiLU(), + nn.Linear(hidden_size, hidden_size, bias=True), + ) + self.frequency_embedding_size = frequency_embedding_size + + @staticmethod + def positional_embedding(t, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings. + :param t: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an (N, D) Tensor of positional embeddings. + """ + # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half + ).to(device=t.device) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + def forward(self, t): + self.timestep_embedding = self.positional_embedding + t_freq = self.timestep_embedding(t, dim=self.frequency_embedding_size).to(t.dtype) + t_emb = self.mlp(t_freq) + return t_emb + + +class LabelEmbedder(nn.Module): + """ + Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. + """ + def __init__(self, num_classes, hidden_size, dropout_prob): + super().__init__() + use_cfg_embedding = dropout_prob > 0 + self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size) + self.num_classes = num_classes + self.dropout_prob = dropout_prob + + def token_drop(self, labels, force_drop_ids=None): + """ + Drops labels to enable classifier-free guidance. + """ + if force_drop_ids is None: + drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob + else: + drop_ids = force_drop_ids == 1 + labels = torch.where(drop_ids, self.num_classes, labels) + return labels + + def forward(self, labels, train, force_drop_ids=None): + use_dropout = self.dropout_prob > 0 + if (train and use_dropout) or (force_drop_ids is not None): + labels = self.token_drop(labels, force_drop_ids) + embeddings = self.embedding_table(labels) + return embeddings + + +################################################################################# +# Core SiT Model # +################################################################################# + +class SiTBlock(nn.Module): + """ + A SiT block with adaptive layer norm zero (adaLN-Zero) conditioning. 
+ """ + def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs): + super().__init__() + self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.attn = Attention( + hidden_size, num_heads=num_heads, qkv_bias=True, qk_norm=block_kwargs["qk_norm"] + ) + if "fused_attn" in block_kwargs.keys(): + self.attn.fused_attn = block_kwargs["fused_attn"] + self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + mlp_hidden_dim = int(hidden_size * mlp_ratio) + approx_gelu = lambda: nn.GELU(approximate="tanh") + self.mlp = Mlp( + in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0 + ) + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + nn.Linear(hidden_size, 6 * hidden_size, bias=True) + ) + + def forward(self, x, c): + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.adaLN_modulation(c).chunk(6, dim=-1) + ) + x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa)) + x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp)) + + return x + + +class FinalLayer(nn.Module): + """ + The final layer of SiT. + """ + def __init__(self, hidden_size, patch_size, out_channels, cls_token_dim): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) + self.linear_cls = nn.Linear(hidden_size, cls_token_dim, bias=True) + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + nn.Linear(hidden_size, 2 * hidden_size, bias=True) + ) + + def forward(self, x, c, cls=None): + shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1) + x = modulate(self.norm_final(x), shift, scale) + + if cls is None: + x = self.linear(x) + return x, None + else: + cls_token = self.linear_cls(x[:, 0]).unsqueeze(1) + x = self.linear(x[:, 1:]) + return x, cls_token.squeeze(1) + + +class SiT(nn.Module): + """ + Diffusion model with a Transformer backbone. 
+ """ + def __init__( + self, + path_type='edm', + input_size=32, + patch_size=2, + in_channels=4, + hidden_size=1152, + decoder_hidden_size=768, + encoder_depth=8, + depth=28, + num_heads=16, + mlp_ratio=4.0, + class_dropout_prob=0.1, + num_classes=1000, + use_cfg=False, + z_dims=[768], + projector_dim=2048, + cls_token_dim=768, + **block_kwargs # fused_attn + ): + super().__init__() + self.path_type = path_type + self.in_channels = in_channels + self.out_channels = in_channels + self.patch_size = patch_size + self.num_heads = num_heads + self.use_cfg = use_cfg + self.num_classes = num_classes + self.z_dims = z_dims + self.encoder_depth = encoder_depth + + self.x_embedder = PatchEmbed( + input_size, patch_size, in_channels, hidden_size, bias=True + ) + self.t_embedder = TimestepEmbedder(hidden_size) # timestep embedding type + self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob) + num_patches = self.x_embedder.num_patches + # Will use fixed sin-cos embedding: + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches+1, hidden_size), requires_grad=False) + + self.blocks = nn.ModuleList([ + SiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, **block_kwargs) for _ in range(depth) + ]) + self.projectors = nn.ModuleList([ + build_mlp(hidden_size, projector_dim, z_dim) for z_dim in z_dims + ]) + + z_dim = self.z_dims[0] + cls_token_dim = z_dim + self.final_layer = FinalLayer(decoder_hidden_size, patch_size, self.out_channels, cls_token_dim) + + + self.cls_projectors2 = nn.Linear(in_features=cls_token_dim, out_features=hidden_size, bias=True) + self.wg_norm = nn.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6) + + self.initialize_weights() + + def initialize_weights(self): + # Initialize transformer layers: + def _basic_init(module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + self.apply(_basic_init) + + # Initialize (and freeze) pos_embed by sin-cos embedding: + pos_embed = get_2d_sincos_pos_embed( + self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5), cls_token=1, extra_tokens=1 + ) + self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) + + # Initialize patch_embed like nn.Linear (instead of nn.Conv2d): + w = self.x_embedder.proj.weight.data + nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + nn.init.constant_(self.x_embedder.proj.bias, 0) + + # Initialize label embedding table: + nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02) + + # Initialize timestep embedding MLP: + nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02) + nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02) + + # Zero-out adaLN modulation layers in SiT blocks: + for block in self.blocks: + nn.init.constant_(block.adaLN_modulation[-1].weight, 0) + nn.init.constant_(block.adaLN_modulation[-1].bias, 0) + + # Zero-out output layers: + nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0) + nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0) + nn.init.constant_(self.final_layer.linear.weight, 0) + nn.init.constant_(self.final_layer.linear.bias, 0) + nn.init.constant_(self.final_layer.linear_cls.weight, 0) + nn.init.constant_(self.final_layer.linear_cls.bias, 0) + + def unpatchify(self, x, patch_size=None): + """ + x: (N, T, patch_size**2 * C) + imgs: (N, C, H, W) + """ + c = self.out_channels + p = self.x_embedder.patch_size[0] if patch_size is None else patch_size + h = w = int(x.shape[1] ** 0.5) + 
assert h * w == x.shape[1] + + x = x.reshape(shape=(x.shape[0], h, w, p, p, c)) + x = torch.einsum('nhwpqc->nchpwq', x) + imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p)) + return imgs + + def forward(self, x, t, y, return_logvar=False, cls_token=None): + """ + Forward pass of SiT. + x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) + t: (N,) tensor of diffusion timesteps + y: (N,) tensor of class labels + """ + + #cat with cls_token + x = self.x_embedder(x) # (N, T, D), where T = H * W / patch_size ** 2 + if cls_token is not None: + cls_token = self.cls_projectors2(cls_token) + cls_token = self.wg_norm(cls_token) + cls_token = cls_token.unsqueeze(1) # [b, length, d] + x = torch.cat((cls_token, x), dim=1) + x = x + self.pos_embed + else: + exit() + N, T, D = x.shape + + # timestep and class embedding + t_embed = self.t_embedder(t) # (N, D) + y = self.y_embedder(y, self.training) # (N, D) + c = t_embed + y + + for i, block in enumerate(self.blocks): + x = block(x, c) + if (i + 1) == self.encoder_depth: + zs = [projector(x.reshape(-1, D)).reshape(N, T, -1) for projector in self.projectors] + + x, cls_token = self.final_layer(x, c, cls=cls_token) + x = self.unpatchify(x) + + return x, zs, cls_token + + +################################################################################# +# Sine/Cosine Positional Embedding Functions # +################################################################################# +# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py + +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +################################################################################# +# SiT Configs # +################################################################################# + +def SiT_XL_2(**kwargs): + return SiT(depth=28, hidden_size=1152, decoder_hidden_size=1152, patch_size=2, num_heads=16, **kwargs) + +def SiT_XL_4(**kwargs): + return SiT(depth=28, hidden_size=1152, decoder_hidden_size=1152, patch_size=4, num_heads=16, **kwargs) + +def SiT_XL_8(**kwargs): + return SiT(depth=28, hidden_size=1152, decoder_hidden_size=1152, patch_size=8, num_heads=16, **kwargs) + +def SiT_L_2(**kwargs): + return SiT(depth=24, hidden_size=1024, decoder_hidden_size=1024, patch_size=2, num_heads=16, **kwargs) + +def SiT_L_4(**kwargs): + return SiT(depth=24, hidden_size=1024, decoder_hidden_size=1024, patch_size=4, num_heads=16, **kwargs) + +def SiT_L_8(**kwargs): + return SiT(depth=24, hidden_size=1024, decoder_hidden_size=1024, patch_size=8, num_heads=16, **kwargs) + +def SiT_B_2(**kwargs): + return SiT(depth=12, hidden_size=768, decoder_hidden_size=768, patch_size=2, num_heads=12, **kwargs) + +def SiT_B_4(**kwargs): + return SiT(depth=12, hidden_size=768, decoder_hidden_size=768, patch_size=4, num_heads=12, **kwargs) + +def SiT_B_8(**kwargs): + return SiT(depth=12, hidden_size=768, decoder_hidden_size=768, patch_size=8, num_heads=12, **kwargs) + +def SiT_S_2(**kwargs): + return SiT(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs) + +def SiT_S_4(**kwargs): + return SiT(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs) + +def SiT_S_8(**kwargs): + return SiT(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs) + + +SiT_models = { + 'SiT-XL/2': SiT_XL_2, 'SiT-XL/4': SiT_XL_4, 'SiT-XL/8': SiT_XL_8, + 'SiT-L/2': SiT_L_2, 'SiT-L/4': SiT_L_4, 'SiT-L/8': SiT_L_8, + 'SiT-B/2': SiT_B_2, 'SiT-B/4': SiT_B_4, 'SiT-B/8': SiT_B_8, + 'SiT-S/2': SiT_S_2, 'SiT-S/4': SiT_S_4, 'SiT-S/8': SiT_S_8, +} + diff --git a/back/preprocessing/README.md b/back/preprocessing/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bfe3be25b5391acc4bf63b921f792bffea65b368 --- /dev/null +++ b/back/preprocessing/README.md @@ -0,0 +1,25 @@ +

+# Preprocessing Guide
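+
+This guide converts raw ImageNet into 256x256 (or 512x512) center crops and
+then into stable diffusion VAE latents for training. For orientation, the
+snippet below is a minimal sketch of the latent-encoding convention described
+in the next section; the `diffusers` `AutoencoderKL` and the
+`stabilityai/sd-vae-ft-mse` weights are assumptions made purely for
+illustration, while the actual scripts use this repository's
+`StabilityVAEEncoder` wrapper.
+
+```python
+# Illustrative sketch only; the real pipeline lives in the two scripts below.
+import torch
+from diffusers import AutoencoderKL
+
+vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").eval()
+
+@torch.no_grad()
+def encode_crop(img_uint8: torch.Tensor) -> torch.Tensor:
+    """(N, 3, 256, 256) uint8 crops -> (N, 4, 32, 32) latents."""
+    x = img_uint8.float() / 127.5 - 1.0  # [0, 255] -> [-1, 1], not edm2's [0, 1]
+    return vae.encode(x).latent_dist.sample()
+```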

+
+#### Dataset download
+
+We follow the preprocessing code used in [edm2](https://github.com/NVlabs/edm2), with several edits: (1) we removed the parts unnecessary here, since this code is used only for preprocessing, (2) we feed the stable diffusion VAE inputs in the [-1, 1] range (as in DiT and SiT), unlike edm2, which uses the [0, 1] range, and (3) we preprocess to 256x256 resolution (or 512x512 resolution).
+
+After downloading ImageNet, please run the following scripts (replace 256x256 with 512x512 if you want to run experiments at 512x512 resolution):
+
+Convert raw ImageNet data to a ZIP archive at 256x256 resolution:
+```bash
+bash dataset_prepare_encode.sh
+```
+
+Convert the pixel data to VAE latents:
+
+```bash
+bash dataset_prepare_convert.sh
+```
+
+Here, `YOUR_DOWNLOAD_PATH` is the directory where you downloaded the dataset, and `TARGET_PATH` is the directory where the preprocessed images and their compressed latent vectors will be saved. This directory will be used by your experiment scripts.
+
+## Acknowledgement
+
+This code is mainly built upon the [edm2](https://github.com/NVlabs/edm2) repository.
diff --git a/back/preprocessing/dataset_image_encoder.py b/back/preprocessing/dataset_image_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..35992acbdaf5543722250c181b4056c0b60a6fcc
--- /dev/null
+++ b/back/preprocessing/dataset_image_encoder.py
@@ -0,0 +1,353 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# This work is licensed under a Creative Commons
+# Attribution-NonCommercial-ShareAlike 4.0 International License.
+# You should have received a copy of the license along with this
+# work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/
+
+"""Tool for creating ZIP/PNG based datasets."""
+
+from collections.abc import Iterator
+from dataclasses import dataclass
+import functools
+import io
+import json
+import os
+import re
+import zipfile
+from pathlib import Path
+from typing import Callable, Optional, Tuple, Union
+import click
+import numpy as np
+import PIL.Image
+import torch
+from tqdm import tqdm
+
+from encoders import StabilityVAEEncoder
+from utils import load_encoders
+from torchvision.transforms import Normalize
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+CLIP_DEFAULT_MEAN = (0.48145466, 0.4578275, 0.40821073)
+CLIP_DEFAULT_STD = (0.26862954, 0.26130258, 0.27577711)
+
+def preprocess_raw_image(x, enc_type):
+    resolution = x.shape[-1]
+    if 'clip' in enc_type:
+        x = x / 255.
+        x = torch.nn.functional.interpolate(x, 224 * (resolution // 256), mode='bicubic')
+        x = Normalize(CLIP_DEFAULT_MEAN, CLIP_DEFAULT_STD)(x)
+    elif 'mocov3' in enc_type or 'mae' in enc_type:
+        x = x / 255.
+        x = Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)(x)
+    elif 'dinov2' in enc_type:
+        x = x / 255.
+        x = Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)(x)
+        x = torch.nn.functional.interpolate(x, 224 * (resolution // 256), mode='bicubic')
+    elif 'dinov1' in enc_type:
+        x = x / 255.
+        x = Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)(x)
+    elif 'jepa' in enc_type:
+        x = x / 255.
+ x = Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)(x) + x = torch.nn.functional.interpolate(x, 224 * (resolution // 256), mode='bicubic') + + return x + + +#---------------------------------------------------------------------------- + +@dataclass +class ImageEntry: + img: np.ndarray + label: Optional[int] + +#---------------------------------------------------------------------------- +# Parse a 'M,N' or 'MxN' integer tuple. +# Example: '4x2' returns (4,2) + +def parse_tuple(s: str) -> Tuple[int, int]: + m = re.match(r'^(\d+)[x,](\d+)$', s) + if m: + return int(m.group(1)), int(m.group(2)) + raise click.ClickException(f'cannot parse tuple {s}') + +#---------------------------------------------------------------------------- + +def maybe_min(a: int, b: Optional[int]) -> int: + if b is not None: + return min(a, b) + return a + +#---------------------------------------------------------------------------- + +def file_ext(name: Union[str, Path]) -> str: + return str(name).split('.')[-1] + +#---------------------------------------------------------------------------- + +def is_image_ext(fname: Union[str, Path]) -> bool: + ext = file_ext(fname).lower() + return f'.{ext}' in PIL.Image.EXTENSION + +#---------------------------------------------------------------------------- + +def open_image_folder(source_dir, *, max_images: Optional[int]) -> tuple[int, Iterator[ImageEntry]]: + input_images = [] + def _recurse_dirs(root: str): # workaround Path().rglob() slowness + with os.scandir(root) as it: + for e in it: + if e.is_file(): + input_images.append(os.path.join(root, e.name)) + elif e.is_dir(): + _recurse_dirs(os.path.join(root, e.name)) + _recurse_dirs(source_dir) + input_images = sorted([f for f in input_images if is_image_ext(f)]) + + arch_fnames = {fname: os.path.relpath(fname, source_dir).replace('\\', '/') for fname in input_images} + max_idx = maybe_min(len(input_images), max_images) + + # Load labels. + labels = dict() + meta_fname = os.path.join(source_dir, 'dataset.json') + if os.path.isfile(meta_fname): + with open(meta_fname, 'r') as file: + data = json.load(file)['labels'] + if data is not None: + labels = {x[0]: x[1] for x in data} + + # No labels available => determine from top-level directory names. + if len(labels) == 0: + toplevel_names = {arch_fname: arch_fname.split('/')[0] if '/' in arch_fname else '' for arch_fname in arch_fnames.values()} + toplevel_indices = {toplevel_name: idx for idx, toplevel_name in enumerate(sorted(set(toplevel_names.values())))} + if len(toplevel_indices) > 1: + labels = {arch_fname: toplevel_indices[toplevel_name] for arch_fname, toplevel_name in toplevel_names.items()} + + def iterate_images(): + for idx, fname in enumerate(input_images): + img = np.array(PIL.Image.open(fname).convert('RGB'))#.transpose(2, 0, 1) + yield ImageEntry(img=img, label=labels.get(arch_fnames[fname])) + if idx >= max_idx - 1: + break + return max_idx, iterate_images() + +#---------------------------------------------------------------------------- + +def open_image_zip(source, *, max_images: Optional[int]) -> tuple[int, Iterator[ImageEntry]]: + with zipfile.ZipFile(source, mode='r') as z: + input_images = [str(f) for f in sorted(z.namelist()) if is_image_ext(f)] + max_idx = maybe_min(len(input_images), max_images) + + # Load labels. 
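+        # dataset.json, when present, stores 'labels' as a list of
+        # [filename, label] pairs.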
+ labels = dict() + if 'dataset.json' in z.namelist(): + with z.open('dataset.json', 'r') as file: + data = json.load(file)['labels'] + if data is not None: + labels = {x[0]: x[1] for x in data} + + def iterate_images(): + with zipfile.ZipFile(source, mode='r') as z: + for idx, fname in enumerate(input_images): + with z.open(fname, 'r') as file: + img = np.array(PIL.Image.open(file).convert('RGB')) + yield ImageEntry(img=img, label=labels.get(fname)) + if idx >= max_idx - 1: + break + return max_idx, iterate_images() + +#---------------------------------------------------------------------------- + +def make_transform( + transform: Optional[str], + output_width: Optional[int], + output_height: Optional[int] +) -> Callable[[np.ndarray], Optional[np.ndarray]]: + def scale(width, height, img): + w = img.shape[1] + h = img.shape[0] + if width == w and height == h: + return img + img = PIL.Image.fromarray(img, 'RGB') + ww = width if width is not None else w + hh = height if height is not None else h + img = img.resize((ww, hh), PIL.Image.Resampling.LANCZOS) + return np.array(img) + + def center_crop(width, height, img): + crop = np.min(img.shape[:2]) + img = img[(img.shape[0] - crop) // 2 : (img.shape[0] + crop) // 2, (img.shape[1] - crop) // 2 : (img.shape[1] + crop) // 2] + img = PIL.Image.fromarray(img, 'RGB') + img = img.resize((width, height), PIL.Image.Resampling.LANCZOS) + return np.array(img) + + def center_crop_wide(width, height, img): + ch = int(np.round(width * img.shape[0] / img.shape[1])) + if img.shape[1] < width or ch < height: + return None + + img = img[(img.shape[0] - ch) // 2 : (img.shape[0] + ch) // 2] + img = PIL.Image.fromarray(img, 'RGB') + img = img.resize((width, height), PIL.Image.Resampling.LANCZOS) + img = np.array(img) + + canvas = np.zeros([width, width, 3], dtype=np.uint8) + canvas[(width - height) // 2 : (width + height) // 2, :] = img + return canvas + + def center_crop_imagenet(image_size: int, arr: np.ndarray): + """ + Center cropping implementation from ADM. 
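+        Repeatedly halves the image with box filtering, then resizes so the short side
+        equals image_size (bicubic) and takes a center crop. Reference: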
+        https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
+        """
+        pil_image = PIL.Image.fromarray(arr)
+        while min(*pil_image.size) >= 2 * image_size:
+            new_size = tuple(x // 2 for x in pil_image.size)
+            assert len(new_size) == 2
+            pil_image = pil_image.resize(new_size, resample=PIL.Image.Resampling.BOX)
+
+        scale = image_size / min(*pil_image.size)
+        new_size = tuple(round(x * scale) for x in pil_image.size)
+        assert len(new_size) == 2
+        pil_image = pil_image.resize(new_size, resample=PIL.Image.Resampling.BICUBIC)
+
+        arr = np.array(pil_image)
+        crop_y = (arr.shape[0] - image_size) // 2
+        crop_x = (arr.shape[1] - image_size) // 2
+        return arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size]
+
+    if transform is None:
+        return functools.partial(scale, output_width, output_height)
+    if transform == 'center-crop':
+        if output_width is None or output_height is None:
+            raise click.ClickException('must specify --resolution=WxH when using ' + transform + ' transform')
+        return functools.partial(center_crop, output_width, output_height)
+    if transform == 'center-crop-wide':
+        if output_width is None or output_height is None:
+            raise click.ClickException('must specify --resolution=WxH when using ' + transform + ' transform')
+        return functools.partial(center_crop_wide, output_width, output_height)
+    if transform == 'center-crop-dhariwal':
+        if output_width is None or output_height is None:
+            raise click.ClickException('must specify --resolution=WxH when using ' + transform + ' transform')
+        if output_width != output_height:
+            raise click.ClickException('width and height must match in --resolution=WxH when using ' + transform + ' transform')
+        return functools.partial(center_crop_imagenet, output_width)
+    assert False, 'unknown transform'
+
+#----------------------------------------------------------------------------
+
+def open_dataset(source, *, max_images: Optional[int]):
+    if os.path.isdir(source):
+        return open_image_folder(source, max_images=max_images)
+    elif os.path.isfile(source):
+        if file_ext(source) == 'zip':
+            return open_image_zip(source, max_images=max_images)
+        else:
+            raise click.ClickException(f'Only zip archives are supported: {source}')
+    else:
+        raise click.ClickException(f'Missing input file or directory: {source}')
+
+#----------------------------------------------------------------------------
+
+def open_dest(dest: str) -> Tuple[str, Callable[[str, Union[bytes, str]], None], Callable[[], None]]:
+    dest_ext = file_ext(dest)
+
+    if dest_ext == 'zip':
+        if os.path.dirname(dest) != '':
+            os.makedirs(os.path.dirname(dest), exist_ok=True)
+        zf = zipfile.ZipFile(file=dest, mode='w', compression=zipfile.ZIP_STORED)
+        def zip_write_bytes(fname: str, data: Union[bytes, str]):
+            zf.writestr(fname, data)
+        return '', zip_write_bytes, zf.close
+    else:
+        # If the output folder already exists, check that it is
+        # empty.
+        #
+        # Note: creating the output directory is not strictly
+        # necessary as folder_write_bytes() also mkdirs, but it's better
+        # to give an error message earlier in case the dest folder
+        # somehow cannot be created.
+        if os.path.isdir(dest) and len(os.listdir(dest)) != 0:
+            raise click.ClickException('--dest folder must be empty')
+        os.makedirs(dest, exist_ok=True)
+
+        def folder_write_bytes(fname: str, data: Union[bytes, str]):
+            os.makedirs(os.path.dirname(fname), exist_ok=True)
+            with open(fname, 'wb') as fout:
+                if isinstance(data, str):
+                    data = data.encode('utf8')
+                fout.write(data)
+        return dest, folder_write_bytes, lambda: None
+
+#----------------------------------------------------------------------------
+
+@click.group()
+def cmdline():
+    '''Dataset processing tool for dataset image data conversion and VAE encode/decode preprocessing.'''
+    if os.environ.get('WORLD_SIZE', '1') != '1':
+        raise click.ClickException('Distributed execution is not supported.')
+
+#----------------------------------------------------------------------------
+
+@cmdline.command()
+@click.option('--source',     help='Input directory or archive name', metavar='PATH', type=str, required=True)
+@click.option('--dest',       help='Output directory or archive name', metavar='PATH', type=str, required=True)
+@click.option('--max-images', help='Maximum number of images to output', metavar='INT', type=int)
+@click.option('--enc-type',   help='Pretrained visual encoder type', metavar='STR', type=str, default='dinov2-vit-b')
+@click.option('--resolution', help='Input image resolution', metavar='INT', type=int, default=256)
+
+def encode(
+    source: str,
+    dest: str,
+    max_images: Optional[int],
+    enc_type,
+    resolution
+):
+    """Encode pixel data to pretrained-encoder features."""
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    encoder, encoder_type, architectures = load_encoders(enc_type, device, resolution)
+    encoder, encoder_type, architectures = encoder[0], encoder_type[0], architectures[0]
+    print('Encoder loaded.')
+
+    PIL.Image.init()
+    if dest == '':
+        raise click.ClickException('--dest output filename or directory must not be an empty string')
+
+    num_files, input_iter = open_dataset(source, max_images=max_images)
+    archive_root_dir, save_bytes, close_dest = open_dest(dest)
+    print('Dataset opened.')
+    labels = []
+
+    for idx, image in tqdm(enumerate(input_iter), total=num_files):
+        with torch.no_grad():
+            img_tensor = torch.tensor(image.img).to(device).permute(2, 0, 1).unsqueeze(0)
+            raw_image_ = preprocess_raw_image(img_tensor, encoder_type)
+            z = encoder.forward_features(raw_image_)
+            if 'dinov2' in encoder_type: z = z['x_norm_patchtokens']
+            z = z.detach().cpu().numpy()
+
+        idx_str = f'{idx:08d}'
+        archive_fname = f'{idx_str[:5]}/img-feature-{idx_str}.npy'
+
+        f = io.BytesIO()
+        np.save(f, z)
+        save_bytes(os.path.join(archive_root_dir, archive_fname), f.getvalue())
+        labels.append([archive_fname, image.label] if image.label is not None else None)
+
+    metadata = {'labels': labels if all(x is not None for x in labels) else None}
+    save_bytes(os.path.join(archive_root_dir, 'dataset.json'), json.dumps(metadata))
+    close_dest()
+
+if __name__ == "__main__":
+    cmdline()
+
+#----------------------------------------------------------------------------
diff --git a/back/preprocessing/dataset_prepare_convert.sh b/back/preprocessing/dataset_prepare_convert.sh
new file mode 100644
index 0000000000000000000000000000000000000000..778520424a0720ff52f99f6b325021a5f6768ca8
--- /dev/null
+++ b/back/preprocessing/dataset_prepare_convert.sh
@@ -0,0 +1,11 @@
+
+
+
+
+
+#256
+python preprocessing/dataset_tools.py convert \
+ 
--source=/home/share/imagenet/train \ + --dest=/home/share/imagenet_vae/imagenet_256_vae \ + --resolution=256x256 \ + --transform=center-crop-dhariwal \ No newline at end of file diff --git a/back/preprocessing/dataset_prepare_encode.sh b/back/preprocessing/dataset_prepare_encode.sh new file mode 100644 index 0000000000000000000000000000000000000000..63272ce1a372ade6ab330aef7b926b5236c65542 --- /dev/null +++ b/back/preprocessing/dataset_prepare_encode.sh @@ -0,0 +1,9 @@ + + + + + +#256 +python preprocessing/dataset_tools.py encode \ + --source=/home/share/imagenet_vae/imagenet_256_vae \ + --dest=/home/share/imagenet_vae/vae-sd-256 \ No newline at end of file diff --git a/back/preprocessing/dataset_tools.py b/back/preprocessing/dataset_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..2b0b5287aa994bb1440f945a3a4308efc518d1a6 --- /dev/null +++ b/back/preprocessing/dataset_tools.py @@ -0,0 +1,422 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# This work is licensed under a Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International License. +# You should have received a copy of the license along with this +# work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/ + +"""Tool for creating ZIP/PNG based datasets.""" + +from collections.abc import Iterator +from dataclasses import dataclass +import functools +import io +import json +import os +import re +import zipfile +from pathlib import Path +from typing import Callable, Optional, Tuple, Union +import click +import numpy as np +import PIL.Image +import torch +from tqdm import tqdm + +from encoders import StabilityVAEEncoder + +#---------------------------------------------------------------------------- + +@dataclass +class ImageEntry: + img: np.ndarray + label: Optional[int] + +#---------------------------------------------------------------------------- +# Parse a 'M,N' or 'MxN' integer tuple. +# Example: '4x2' returns (4,2) + +def parse_tuple(s: str) -> Tuple[int, int]: + m = re.match(r'^(\d+)[x,](\d+)$', s) + if m: + return int(m.group(1)), int(m.group(2)) + raise click.ClickException(f'cannot parse tuple {s}') + +#---------------------------------------------------------------------------- + +def maybe_min(a: int, b: Optional[int]) -> int: + if b is not None: + return min(a, b) + return a + +#---------------------------------------------------------------------------- + +def file_ext(name: Union[str, Path]) -> str: + return str(name).split('.')[-1] + +#---------------------------------------------------------------------------- + +def is_image_ext(fname: Union[str, Path]) -> bool: + ext = file_ext(fname).lower() + return f'.{ext}' in PIL.Image.EXTENSION + +#---------------------------------------------------------------------------- + +def open_image_folder(source_dir, *, max_images: Optional[int]) -> tuple[int, Iterator[ImageEntry]]: + input_images = [] + def _recurse_dirs(root: str): # workaround Path().rglob() slowness + with os.scandir(root) as it: + for e in it: + if e.is_file(): + input_images.append(os.path.join(root, e.name)) + elif e.is_dir(): + _recurse_dirs(os.path.join(root, e.name)) + _recurse_dirs(source_dir) + input_images = sorted([f for f in input_images if is_image_ext(f)]) + + arch_fnames = {fname: os.path.relpath(fname, source_dir).replace('\\', '/') for fname in input_images} + max_idx = maybe_min(len(input_images), max_images) + + # Load labels. 
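+    # Prefer explicit labels from dataset.json; otherwise fall back to top-level directory names below.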
+ labels = dict() + meta_fname = os.path.join(source_dir, 'dataset.json') + if os.path.isfile(meta_fname): + with open(meta_fname, 'r') as file: + data = json.load(file)['labels'] + if data is not None: + labels = {x[0]: x[1] for x in data} + + # No labels available => determine from top-level directory names. + if len(labels) == 0: + toplevel_names = {arch_fname: arch_fname.split('/')[0] if '/' in arch_fname else '' for arch_fname in arch_fnames.values()} + toplevel_indices = {toplevel_name: idx for idx, toplevel_name in enumerate(sorted(set(toplevel_names.values())))} + if len(toplevel_indices) > 1: + labels = {arch_fname: toplevel_indices[toplevel_name] for arch_fname, toplevel_name in toplevel_names.items()} + + def iterate_images(): + for idx, fname in enumerate(input_images): + img = np.array(PIL.Image.open(fname).convert('RGB')) + yield ImageEntry(img=img, label=labels.get(arch_fnames[fname])) + if idx >= max_idx - 1: + break + return max_idx, iterate_images() + +#---------------------------------------------------------------------------- + +def open_image_zip(source, *, max_images: Optional[int]) -> tuple[int, Iterator[ImageEntry]]: + with zipfile.ZipFile(source, mode='r') as z: + input_images = [str(f) for f in sorted(z.namelist()) if is_image_ext(f)] + max_idx = maybe_min(len(input_images), max_images) + + # Load labels. + labels = dict() + if 'dataset.json' in z.namelist(): + with z.open('dataset.json', 'r') as file: + data = json.load(file)['labels'] + if data is not None: + labels = {x[0]: x[1] for x in data} + + def iterate_images(): + with zipfile.ZipFile(source, mode='r') as z: + for idx, fname in enumerate(input_images): + with z.open(fname, 'r') as file: + img = np.array(PIL.Image.open(file).convert('RGB')) + yield ImageEntry(img=img, label=labels.get(fname)) + if idx >= max_idx - 1: + break + return max_idx, iterate_images() + +#---------------------------------------------------------------------------- + +def make_transform( + transform: Optional[str], + output_width: Optional[int], + output_height: Optional[int] +) -> Callable[[np.ndarray], Optional[np.ndarray]]: + def scale(width, height, img): + w = img.shape[1] + h = img.shape[0] + if width == w and height == h: + return img + img = PIL.Image.fromarray(img, 'RGB') + ww = width if width is not None else w + hh = height if height is not None else h + img = img.resize((ww, hh), PIL.Image.Resampling.LANCZOS) + return np.array(img) + + def center_crop(width, height, img): + crop = np.min(img.shape[:2]) + img = img[(img.shape[0] - crop) // 2 : (img.shape[0] + crop) // 2, (img.shape[1] - crop) // 2 : (img.shape[1] + crop) // 2] + img = PIL.Image.fromarray(img, 'RGB') + img = img.resize((width, height), PIL.Image.Resampling.LANCZOS) + return np.array(img) + + def center_crop_wide(width, height, img): + ch = int(np.round(width * img.shape[0] / img.shape[1])) + if img.shape[1] < width or ch < height: + return None + + img = img[(img.shape[0] - ch) // 2 : (img.shape[0] + ch) // 2] + img = PIL.Image.fromarray(img, 'RGB') + img = img.resize((width, height), PIL.Image.Resampling.LANCZOS) + img = np.array(img) + + canvas = np.zeros([width, width, 3], dtype=np.uint8) + canvas[(width - height) // 2 : (width + height) // 2, :] = img + return canvas + + def center_crop_imagenet(image_size: int, arr: np.ndarray): + """ + Center cropping implementation from ADM. 
+        https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
+        """
+        pil_image = PIL.Image.fromarray(arr)
+        while min(*pil_image.size) >= 2 * image_size:
+            new_size = tuple(x // 2 for x in pil_image.size)
+            assert len(new_size) == 2
+            pil_image = pil_image.resize(new_size, resample=PIL.Image.Resampling.BOX)
+
+        scale = image_size / min(*pil_image.size)
+        new_size = tuple(round(x * scale) for x in pil_image.size)
+        assert len(new_size) == 2
+        pil_image = pil_image.resize(new_size, resample=PIL.Image.Resampling.BICUBIC)
+
+        arr = np.array(pil_image)
+        crop_y = (arr.shape[0] - image_size) // 2
+        crop_x = (arr.shape[1] - image_size) // 2
+        return arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size]
+
+    if transform is None:
+        return functools.partial(scale, output_width, output_height)
+    if transform == 'center-crop':
+        if output_width is None or output_height is None:
+            raise click.ClickException('must specify --resolution=WxH when using ' + transform + ' transform')
+        return functools.partial(center_crop, output_width, output_height)
+    if transform == 'center-crop-wide':
+        if output_width is None or output_height is None:
+            raise click.ClickException('must specify --resolution=WxH when using ' + transform + ' transform')
+        return functools.partial(center_crop_wide, output_width, output_height)
+    if transform == 'center-crop-dhariwal':
+        if output_width is None or output_height is None:
+            raise click.ClickException('must specify --resolution=WxH when using ' + transform + ' transform')
+        if output_width != output_height:
+            raise click.ClickException('width and height must match in --resolution=WxH when using ' + transform + ' transform')
+        return functools.partial(center_crop_imagenet, output_width)
+    assert False, 'unknown transform'
+
+#----------------------------------------------------------------------------
+
+def open_dataset(source, *, max_images: Optional[int]):
+    if os.path.isdir(source):
+        return open_image_folder(source, max_images=max_images)
+    elif os.path.isfile(source):
+        if file_ext(source) == 'zip':
+            return open_image_zip(source, max_images=max_images)
+        else:
+            raise click.ClickException(f'Only zip archives are supported: {source}')
+    else:
+        raise click.ClickException(f'Missing input file or directory: {source}')
+
+#----------------------------------------------------------------------------
+
+def open_dest(dest: str) -> Tuple[str, Callable[[str, Union[bytes, str]], None], Callable[[], None]]:
+    dest_ext = file_ext(dest)
+
+    if dest_ext == 'zip':
+        if os.path.dirname(dest) != '':
+            os.makedirs(os.path.dirname(dest), exist_ok=True)
+        zf = zipfile.ZipFile(file=dest, mode='w', compression=zipfile.ZIP_STORED)
+        def zip_write_bytes(fname: str, data: Union[bytes, str]):
+            zf.writestr(fname, data)
+        return '', zip_write_bytes, zf.close
+    else:
+        # If the output folder already exists, check that it is
+        # empty.
+        #
+        # Note: creating the output directory is not strictly
+        # necessary as folder_write_bytes() also mkdirs, but it's better
+        # to give an error message earlier in case the dest folder
+        # somehow cannot be created.
+        if os.path.isdir(dest) and len(os.listdir(dest)) != 0:
+            raise click.ClickException('--dest folder must be empty')
+        os.makedirs(dest, exist_ok=True)
+
+        def folder_write_bytes(fname: str, data: Union[bytes, str]):
+            os.makedirs(os.path.dirname(fname), exist_ok=True)
+            with open(fname, 'wb') as fout:
+                if isinstance(data, str):
+                    data = data.encode('utf8')
+                fout.write(data)
+        return dest, folder_write_bytes, lambda: None
+
+#----------------------------------------------------------------------------
+
+@click.group()
+def cmdline():
+    '''Dataset processing tool for dataset image data conversion and VAE encode/decode preprocessing.'''
+    if os.environ.get('WORLD_SIZE', '1') != '1':
+        raise click.ClickException('Distributed execution is not supported.')
+
+#----------------------------------------------------------------------------
+
+@cmdline.command()
+@click.option('--source',     help='Input directory or archive name', metavar='PATH', type=str, required=True)
+@click.option('--dest',       help='Output directory or archive name', metavar='PATH', type=str, required=True)
+@click.option('--max-images', help='Maximum number of images to output', metavar='INT', type=int)
+@click.option('--transform',  help='Input crop/resize mode', metavar='MODE', type=click.Choice(['center-crop', 'center-crop-wide', 'center-crop-dhariwal']))
+@click.option('--resolution', help='Output resolution (e.g., 512x512)', metavar='WxH', type=parse_tuple)
+
+def convert(
+    source: str,
+    dest: str,
+    max_images: Optional[int],
+    transform: Optional[str],
+    resolution: Optional[Tuple[int, int]]
+):
+    """Convert an image dataset into archive format for training.
+
+    Specifying the input images:
+
+    \b
+    --source path/                  Recursively load all images from path/
+    --source dataset.zip            Load all images from dataset.zip
+
+    Specifying the output format and path:
+
+    \b
+    --dest /path/to/dir             Save output files under /path/to/dir
+    --dest /path/to/dataset.zip     Save output files into /path/to/dataset.zip
+
+    The output dataset format can be either an image folder or an uncompressed zip archive.
+    Zip archives make it easier to move datasets around file servers and clusters, and may
+    offer better training performance on network file systems.
+
+    Images within the dataset archive will be stored as uncompressed PNG.
+    Uncompressed PNGs can be efficiently decoded in the training loop.
+
+    Class labels are stored in a file called 'dataset.json' that is stored at the
+    dataset root folder. This file has the following structure:
+
+    \b
+    {
+        "labels": [
+            ["00000/img00000000.png",6],
+            ["00000/img00000001.png",9],
+            ... repeated for every image in the dataset
+            ["00049/img00049999.png",1]
+        ]
+    }
+
+    If the 'dataset.json' file cannot be found, class labels are determined from
+    top-level directory names.
+
+    Image scale/crop and resolution requirements:
+
+    Output images must be square-shaped and they must all have the same power-of-two
+    dimensions.
+
+    To scale arbitrary input image sizes to a specific width and height, use the
+    --resolution option. Output resolution will be either the original
+    input resolution (if resolution was not specified) or the one specified with
+    the --resolution option.
+
+    The --transform=center-crop-dhariwal option selects a crop/rescale mode that is intended
+    to exactly match the results obtained for ImageNet in the common diffusion model literature:
+
+    \b
+    python dataset_tools.py convert --source=downloads/imagenet/ILSVRC/Data/CLS-LOC/train \\
+        --dest=datasets/img64.zip --resolution=64x64 --transform=center-crop-dhariwal
+    """
+    PIL.Image.init()
+    if dest == '':
+        raise click.ClickException('--dest output filename or directory must not be an empty string')
+    num_files, input_iter = open_dataset(source, max_images=max_images)
+    archive_root_dir, save_bytes, close_dest = open_dest(dest)
+    print(f'Converting {num_files} images from {source} to {dest} ...')
+    transform_image = make_transform(transform, *resolution if resolution is not None else (None, None))
+    dataset_attrs = None
+
+    labels = []
+    for idx, image in tqdm(enumerate(input_iter), total=num_files):
+        idx_str = f'{idx:08d}'
+        archive_fname = f'{idx_str[:5]}/img{idx_str}.png'
+
+        # Apply crop and resize.
+        img = transform_image(image.img)
+        if img is None:
+            continue
+
+        # Error check to require uniform image attributes across
+        # the whole dataset.
+        assert img.ndim == 3
+        cur_image_attrs = {'width': img.shape[1], 'height': img.shape[0]}
+        if dataset_attrs is None:
+            dataset_attrs = cur_image_attrs
+            width = dataset_attrs['width']
+            height = dataset_attrs['height']
+            if width != height:
+                raise click.ClickException(f'Image dimensions after scale and crop are required to be square. Got {width}x{height}')
+            if width != 2 ** int(np.floor(np.log2(width))):
+                raise click.ClickException('Image width/height after scale and crop are required to be power-of-two')
+        elif dataset_attrs != cur_image_attrs:
+            err = [f'  dataset {k}/cur image {k}: {dataset_attrs[k]}/{cur_image_attrs[k]}' for k in dataset_attrs.keys()]
+            raise click.ClickException(f'Image {archive_fname} attributes must be equal across all images of the dataset. Got:\n' + '\n'.join(err))
+
+        # Save the image as an uncompressed PNG.
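+        # compress_level=0 writes a valid but uncompressed PNG, keeping decode cost low during training.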
+        img = PIL.Image.fromarray(img)
+        image_bits = io.BytesIO()
+        img.save(image_bits, format='png', compress_level=0, optimize=False)
+        save_bytes(os.path.join(archive_root_dir, archive_fname), image_bits.getbuffer())
+        labels.append([archive_fname, image.label] if image.label is not None else None)
+
+    metadata = {'labels': labels if all(x is not None for x in labels) else None}
+    save_bytes(os.path.join(archive_root_dir, 'dataset.json'), json.dumps(metadata))
+    close_dest()
+
+#----------------------------------------------------------------------------
+
+@cmdline.command()
+@click.option('--model-url',  help='VAE encoder model', metavar='URL', type=str, default='stabilityai/sd-vae-ft-mse', show_default=True)
+@click.option('--source',     help='Input directory or archive name', metavar='PATH', type=str, required=True)
+@click.option('--dest',       help='Output directory or archive name', metavar='PATH', type=str, required=True)
+@click.option('--max-images', help='Maximum number of images to output', metavar='INT', type=int)
+
+def encode(
+    model_url: str,
+    source: str,
+    dest: str,
+    max_images: Optional[int],
+):
+    """Encode pixel data to VAE latents."""
+    PIL.Image.init()
+    if dest == '':
+        raise click.ClickException('--dest output filename or directory must not be an empty string')
+
+    vae = StabilityVAEEncoder(vae_name=model_url, batch_size=1)
+    print('VAE loaded.')
+    num_files, input_iter = open_dataset(source, max_images=max_images)
+    archive_root_dir, save_bytes, close_dest = open_dest(dest)
+    print('Dataset opened.')
+    labels = []
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    for idx, image in tqdm(enumerate(input_iter), total=num_files):
+        img_tensor = torch.tensor(image.img).to(device).permute(2, 0, 1).unsqueeze(0)
+        # Store the latent-distribution mean and std so training can re-sample latents on the fly.
+        mean_std = vae.encode_pixels(img_tensor)[0].cpu()
+        idx_str = f'{idx:08d}'
+        archive_fname = f'{idx_str[:5]}/img-mean-std-{idx_str}.npy'
+
+        f = io.BytesIO()
+        np.save(f, mean_std)
+        save_bytes(os.path.join(archive_root_dir, archive_fname), f.getvalue())
+        labels.append([archive_fname, image.label] if image.label is not None else None)
+
+    metadata = {'labels': labels if all(x is not None for x in labels) else None}
+    save_bytes(os.path.join(archive_root_dir, 'dataset.json'), json.dumps(metadata))
+    close_dest()
+
+if __name__ == "__main__":
+    cmdline()
+
+#----------------------------------------------------------------------------
\ No newline at end of file
diff --git a/back/preprocessing/dnnlib/__init__.py b/back/preprocessing/dnnlib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6205366fc03b4ce3a99ab265e7975c9d3656c27
--- /dev/null
+++ b/back/preprocessing/dnnlib/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# This work is licensed under a Creative Commons
+# Attribution-NonCommercial-ShareAlike 4.0 International License.
+# You should have received a copy of the license along with this
+# work. 
If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/ + +from .util import EasyDict, make_cache_dir_path diff --git a/back/preprocessing/dnnlib/__pycache__/__init__.cpython-312.pyc b/back/preprocessing/dnnlib/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b17f88251ae584f8093d365594af6fb100402aa0 Binary files /dev/null and b/back/preprocessing/dnnlib/__pycache__/__init__.cpython-312.pyc differ diff --git a/back/preprocessing/dnnlib/__pycache__/util.cpython-312.pyc b/back/preprocessing/dnnlib/__pycache__/util.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da7bbee135dc4f9d3d330aab2c12183dd149ef4a Binary files /dev/null and b/back/preprocessing/dnnlib/__pycache__/util.cpython-312.pyc differ diff --git a/back/preprocessing/dnnlib/util.py b/back/preprocessing/dnnlib/util.py new file mode 100644 index 0000000000000000000000000000000000000000..05389c5ef04c303288f4f8919d8f72f3260f6c5d --- /dev/null +++ b/back/preprocessing/dnnlib/util.py @@ -0,0 +1,485 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# This work is licensed under a Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International License. +# You should have received a copy of the license along with this +# work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/ + +"""Miscellaneous utility classes and functions.""" + +import ctypes +import fnmatch +import importlib +import inspect +import numpy as np +import os +import shutil +import sys +import types +import io +import pickle +import re +import requests +import html +import hashlib +import glob +import tempfile +import urllib +import urllib.parse +import uuid + +from typing import Any, Callable, BinaryIO, List, Tuple, Union, Optional + +# Util classes +# ------------------------------------------------------------------------------------------ + + +class EasyDict(dict): + """Convenience class that behaves like a dict but allows access with the attribute syntax.""" + + def __getattr__(self, name: str) -> Any: + try: + return self[name] + except KeyError: + raise AttributeError(name) + + def __setattr__(self, name: str, value: Any) -> None: + self[name] = value + + def __delattr__(self, name: str) -> None: + del self[name] + + +class Logger(object): + """Redirect stderr to stdout, optionally print stdout to a file, and optionally force flushing on both stdout and the file.""" + + def __init__(self, file_name: Optional[str] = None, file_mode: str = "w", should_flush: bool = True): + self.file = None + + if file_name is not None: + self.file = open(file_name, file_mode) + + self.should_flush = should_flush + self.stdout = sys.stdout + self.stderr = sys.stderr + + sys.stdout = self + sys.stderr = self + + def __enter__(self) -> "Logger": + return self + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + self.close() + + def write(self, text: Union[str, bytes]) -> None: + """Write text to stdout (and a file) and optionally flush.""" + if isinstance(text, bytes): + text = text.decode() + if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash + return + + if self.file is not None: + self.file.write(text) + + self.stdout.write(text) + + if self.should_flush: + self.flush() + + def flush(self) -> None: + """Flush written text to both stdout and a file, if open.""" + if self.file is not None: + self.file.flush() + + self.stdout.flush() + + def 
close(self) -> None: + """Flush, close possible files, and remove stdout/stderr mirroring.""" + self.flush() + + # if using multiple loggers, prevent closing in wrong order + if sys.stdout is self: + sys.stdout = self.stdout + if sys.stderr is self: + sys.stderr = self.stderr + + if self.file is not None: + self.file.close() + self.file = None + + +# Cache directories +# ------------------------------------------------------------------------------------------ + +_dnnlib_cache_dir = None + +def set_cache_dir(path: str) -> None: + global _dnnlib_cache_dir + _dnnlib_cache_dir = path + +def make_cache_dir_path(*paths: str) -> str: + if _dnnlib_cache_dir is not None: + return os.path.join(_dnnlib_cache_dir, *paths) + if 'DNNLIB_CACHE_DIR' in os.environ: + return os.path.join(os.environ['DNNLIB_CACHE_DIR'], *paths) + if 'HOME' in os.environ: + return os.path.join(os.environ['HOME'], '.cache', 'dnnlib', *paths) + if 'USERPROFILE' in os.environ: + return os.path.join(os.environ['USERPROFILE'], '.cache', 'dnnlib', *paths) + return os.path.join(tempfile.gettempdir(), '.cache', 'dnnlib', *paths) + +# Small util functions +# ------------------------------------------------------------------------------------------ + + +def format_time(seconds: Union[int, float]) -> str: + """Convert the seconds to human readable string with days, hours, minutes and seconds.""" + s = int(np.rint(seconds)) + + if s < 60: + return "{0}s".format(s) + elif s < 60 * 60: + return "{0}m {1:02}s".format(s // 60, s % 60) + elif s < 24 * 60 * 60: + return "{0}h {1:02}m {2:02}s".format(s // (60 * 60), (s // 60) % 60, s % 60) + else: + return "{0}d {1:02}h {2:02}m".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24, (s // 60) % 60) + + +def format_time_brief(seconds: Union[int, float]) -> str: + """Convert the seconds to human readable string with days, hours, minutes and seconds.""" + s = int(np.rint(seconds)) + + if s < 60: + return "{0}s".format(s) + elif s < 60 * 60: + return "{0}m {1:02}s".format(s // 60, s % 60) + elif s < 24 * 60 * 60: + return "{0}h {1:02}m".format(s // (60 * 60), (s // 60) % 60) + else: + return "{0}d {1:02}h".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24) + + +def tuple_product(t: Tuple) -> Any: + """Calculate the product of the tuple elements.""" + result = 1 + + for v in t: + result *= v + + return result + + +_str_to_ctype = { + "uint8": ctypes.c_ubyte, + "uint16": ctypes.c_uint16, + "uint32": ctypes.c_uint32, + "uint64": ctypes.c_uint64, + "int8": ctypes.c_byte, + "int16": ctypes.c_int16, + "int32": ctypes.c_int32, + "int64": ctypes.c_int64, + "float32": ctypes.c_float, + "float64": ctypes.c_double +} + + +def get_dtype_and_ctype(type_obj: Any) -> Tuple[np.dtype, Any]: + """Given a type name string (or an object having a __name__ attribute), return matching Numpy and ctypes types that have the same size in bytes.""" + type_str = None + + if isinstance(type_obj, str): + type_str = type_obj + elif hasattr(type_obj, "__name__"): + type_str = type_obj.__name__ + elif hasattr(type_obj, "name"): + type_str = type_obj.name + else: + raise RuntimeError("Cannot infer type name from input") + + assert type_str in _str_to_ctype.keys() + + my_dtype = np.dtype(type_str) + my_ctype = _str_to_ctype[type_str] + + assert my_dtype.itemsize == ctypes.sizeof(my_ctype) + + return my_dtype, my_ctype + + +def is_pickleable(obj: Any) -> bool: + try: + with io.BytesIO() as stream: + pickle.dump(obj, stream) + return True + except: + return False + + +# Functionality to import modules/objects by name, and call functions 
by name +# ------------------------------------------------------------------------------------------ + +def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, str]: + """Searches for the underlying module behind the name to some python object. + Returns the module and the object name (original name with module part removed).""" + + # allow convenience shorthands, substitute them by full names + obj_name = re.sub("^np.", "numpy.", obj_name) + obj_name = re.sub("^tf.", "tensorflow.", obj_name) + + # list alternatives for (module_name, local_obj_name) + parts = obj_name.split(".") + name_pairs = [(".".join(parts[:i]), ".".join(parts[i:])) for i in range(len(parts), 0, -1)] + + # try each alternative in turn + for module_name, local_obj_name in name_pairs: + try: + module = importlib.import_module(module_name) # may raise ImportError + get_obj_from_module(module, local_obj_name) # may raise AttributeError + return module, local_obj_name + except: + pass + + # maybe some of the modules themselves contain errors? + for module_name, _local_obj_name in name_pairs: + try: + importlib.import_module(module_name) # may raise ImportError + except ImportError: + if not str(sys.exc_info()[1]).startswith("No module named '" + module_name + "'"): + raise + + # maybe the requested attribute is missing? + for module_name, local_obj_name in name_pairs: + try: + module = importlib.import_module(module_name) # may raise ImportError + get_obj_from_module(module, local_obj_name) # may raise AttributeError + except ImportError: + pass + + # we are out of luck, but we have no idea why + raise ImportError(obj_name) + + +def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any: + """Traverses the object name and returns the last (rightmost) python object.""" + if obj_name == '': + return module + obj = module + for part in obj_name.split("."): + obj = getattr(obj, part) + return obj + + +def get_obj_by_name(name: str) -> Any: + """Finds the python object with the given name.""" + module, obj_name = get_module_from_obj_name(name) + return get_obj_from_module(module, obj_name) + + +def call_func_by_name(*args, func_name: Union[str, Callable], **kwargs) -> Any: + """Finds the python object with the given name and calls it as a function.""" + assert func_name is not None + func_obj = get_obj_by_name(func_name) if isinstance(func_name, str) else func_name + assert callable(func_obj) + return func_obj(*args, **kwargs) + + +def construct_class_by_name(*args, class_name: Union[str, type], **kwargs) -> Any: + """Finds the python class with the given name and constructs it with the given arguments.""" + return call_func_by_name(*args, func_name=class_name, **kwargs) + + +def get_module_dir_by_obj_name(obj_name: str) -> str: + """Get the directory path of the module containing the given object name.""" + module, _ = get_module_from_obj_name(obj_name) + return os.path.dirname(inspect.getfile(module)) + + +def is_top_level_function(obj: Any) -> bool: + """Determine whether the given object is a top-level function, i.e., defined at module scope using 'def'.""" + return callable(obj) and obj.__name__ in sys.modules[obj.__module__].__dict__ + + +def get_top_level_function_name(obj: Any) -> str: + """Return the fully-qualified name of a top-level function.""" + assert is_top_level_function(obj) + module = obj.__module__ + if module == '__main__': + fname = sys.modules[module].__file__ + assert fname is not None + module = os.path.splitext(os.path.basename(fname))[0] + return module + "." 
+ obj.__name__
+
+
+# File system helpers
+# ------------------------------------------------------------------------------------------
+
+def list_dir_recursively_with_ignore(dir_path: str, ignores: Optional[List[str]] = None, add_base_to_relative: bool = False) -> List[Tuple[str, str]]:
+    """List all files recursively in a given directory while ignoring given file and directory names.
+    Returns list of tuples containing both absolute and relative paths."""
+    assert os.path.isdir(dir_path)
+    base_name = os.path.basename(os.path.normpath(dir_path))
+
+    if ignores is None:
+        ignores = []
+
+    result = []
+
+    for root, dirs, files in os.walk(dir_path, topdown=True):
+        for ignore_ in ignores:
+            dirs_to_remove = [d for d in dirs if fnmatch.fnmatch(d, ignore_)]
+
+            # dirs need to be edited in-place
+            for d in dirs_to_remove:
+                dirs.remove(d)
+
+            files = [f for f in files if not fnmatch.fnmatch(f, ignore_)]
+
+        absolute_paths = [os.path.join(root, f) for f in files]
+        relative_paths = [os.path.relpath(p, dir_path) for p in absolute_paths]
+
+        if add_base_to_relative:
+            relative_paths = [os.path.join(base_name, p) for p in relative_paths]
+
+        assert len(absolute_paths) == len(relative_paths)
+        result += zip(absolute_paths, relative_paths)
+
+    return result
+
+
+def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None:
+    """Takes in a list of tuples of (src, dst) paths and copies files.
+    Will create all necessary directories."""
+    for file in files:
+        target_dir_name = os.path.dirname(file[1])
+
+        # will create all intermediate-level directories
+        os.makedirs(target_dir_name, exist_ok=True)
+        shutil.copyfile(file[0], file[1])
+
+
+# URL helpers
+# ------------------------------------------------------------------------------------------
+
+def is_url(obj: Any, allow_file_urls: bool = False) -> bool:
+    """Determine whether the given object is a valid URL string."""
+    if not isinstance(obj, str) or not "://" in obj:
+        return False
+    if allow_file_urls and obj.startswith('file://'):
+        return True
+    try:
+        res = urllib.parse.urlparse(obj)
+        if not res.scheme or not res.netloc or not "." in res.netloc:
+            return False
+        res = urllib.parse.urlparse(urllib.parse.urljoin(obj, "/"))
+        if not res.scheme or not res.netloc or not "." in res.netloc:
+            return False
+    except:
+        return False
+    return True
+
+# Note on static typing: a better API would be to split 'open_url' into 'open_url' and
+# 'download_url' with separate return types (BinaryIO, str). As the `return_filename=True`
+# case is somewhat uncommon, we just pretend like this function never returns a string
+# and type ignore return value for those cases.
+def open_url(url: str, cache_dir: Optional[str] = None, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False, cache: bool = True) -> BinaryIO:
+    """Download the given URL and return a binary-mode file object to access the data."""
+    assert num_attempts >= 1
+    assert not (return_filename and (not cache))
+
+    # Doesn't look like a URL scheme, so interpret it as a local filename.
+    if not re.match('^[a-z]+://', url):
+        return url if return_filename else open(url, "rb") # type: ignore
+
+    # Handle file URLs. This code handles unusual file:// patterns that
+    # arise on Windows:
+    #
+    #     file:///c:/foo.txt
+    #
+    # which would translate to a local '/c:/foo.txt' filename that's
+    # invalid. Drop the forward slash for such pathnames.
+    #
+    # If you touch this code path, you should test it on both Linux and
+    # Windows. 
+ # + # Some internet resources suggest using urllib.request.url2pathname() + # but that converts forward slashes to backslashes and this causes + # its own set of problems. + if url.startswith('file://'): + filename = urllib.parse.urlparse(url).path + if re.match(r'^/[a-zA-Z]:', filename): + filename = filename[1:] + return filename if return_filename else open(filename, "rb") # type: ignore + + assert is_url(url) + + # Lookup from cache. + if cache_dir is None: + cache_dir = make_cache_dir_path('downloads') + + url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest() + if cache: + cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*")) + if len(cache_files) == 1: + filename = cache_files[0] + return filename if return_filename else open(filename, "rb") # type: ignore + + # Download. + url_name = None + url_data = None + with requests.Session() as session: + if verbose: + print("Downloading %s ..." % url, end="", flush=True) + for attempts_left in reversed(range(num_attempts)): + try: + with session.get(url) as res: + res.raise_for_status() + if len(res.content) == 0: + raise IOError("No data received") + + if len(res.content) < 8192: + content_str = res.content.decode("utf-8") + if "download_warning" in res.headers.get("Set-Cookie", ""): + links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link] + if len(links) == 1: + url = urllib.parse.urljoin(url, links[0]) + raise IOError("Google Drive virus checker nag") + if "Google Drive - Quota exceeded" in content_str: + raise IOError("Google Drive download quota exceeded -- please try again later") + + match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", "")) + url_name = match[1] if match else url + url_data = res.content + if verbose: + print(" done") + break + except KeyboardInterrupt: + raise + except: + if not attempts_left: + if verbose: + print(" failed") + raise + if verbose: + print(".", end="", flush=True) + + assert url_data is not None + + # Save to cache. + if cache: + assert url_name is not None + safe_name = re.sub(r"[^0-9a-zA-Z-._]", "_", url_name) + safe_name = safe_name[:min(len(safe_name), 128)] + cache_file = os.path.join(cache_dir, url_md5 + "_" + safe_name) + temp_file = os.path.join(cache_dir, "tmp_" + uuid.uuid4().hex + "_" + url_md5 + "_" + safe_name) + os.makedirs(cache_dir, exist_ok=True) + with open(temp_file, "wb") as f: + f.write(url_data) + os.replace(temp_file, cache_file) # atomic + if return_filename: + return cache_file # type: ignore + + # Return data as file object. + assert not return_filename + return io.BytesIO(url_data) diff --git a/back/preprocessing/encoders.py b/back/preprocessing/encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..53c742bdd25ac8a00f034453596540010f8d87c9 --- /dev/null +++ b/back/preprocessing/encoders.py @@ -0,0 +1,103 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# This work is licensed under a Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International License. +# You should have received a copy of the license along with this +# work. 
If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/ + +"""Converting between pixel and latent representations of image data.""" + +import os +import warnings +import numpy as np +import torch +from torch_utils import persistence +from torch_utils import misc + +warnings.filterwarnings('ignore', 'torch.utils._pytree._register_pytree_node is deprecated.') +warnings.filterwarnings('ignore', '`resume_download` is deprecated') + +#---------------------------------------------------------------------------- +# Abstract base class for encoders/decoders that convert back and forth +# between pixel and latent representations of image data. +# +# Logically, "raw pixels" are first encoded into "raw latents" that are +# then further encoded into "final latents". Decoding, on the other hand, +# goes directly from the final latents to raw pixels. The final latents are +# used as inputs and outputs of the model, whereas the raw latents are +# stored in the dataset. This separation provides added flexibility in terms +# of performing just-in-time adjustments, such as data whitening, without +# having to construct a new dataset. +# +# All image data is represented as PyTorch tensors in NCHW order. +# Raw pixels are represented as 3-channel uint8. + +@persistence.persistent_class +class Encoder: + def __init__(self): + pass + + def init(self, device): # force lazy init to happen now + pass + + def __getstate__(self): + return self.__dict__ + + def encode_pixels(self, x): # raw pixels => raw latents + raise NotImplementedError # to be overridden by subclass +#---------------------------------------------------------------------------- +# Pre-trained VAE encoder from Stability AI. + +@persistence.persistent_class +class StabilityVAEEncoder(Encoder): + def __init__(self, + vae_name = 'stabilityai/sd-vae-ft-mse', # Name of the VAE to use. + batch_size = 8, # Batch size to use when running the VAE. + ): + super().__init__() + self.vae_name = vae_name + self.batch_size = int(batch_size) + self._vae = None + + def init(self, device): # force lazy init to happen now + super().init(device) + if self._vae is None: + self._vae = load_stability_vae(self.vae_name, device=device) + else: + self._vae.to(device) + + def __getstate__(self): + return dict(super().__getstate__(), _vae=None) # do not pickle the vae + + def _run_vae_encoder(self, x): + d = self._vae.encode(x)['latent_dist'] + return torch.cat([d.mean, d.std], dim=1) + + def encode_pixels(self, x): # raw pixels => raw latents + self.init(x.device) + x = x.to(torch.float32) / 127.5 - 1 + x = torch.cat([self._run_vae_encoder(batch) for batch in x.split(self.batch_size)]) + return x + +#---------------------------------------------------------------------------- + +def load_stability_vae(vae_name='stabilityai/sd-vae-ft-mse', device=torch.device('cpu')): + import dnnlib + cache_dir = dnnlib.make_cache_dir_path('diffusers') + os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1' + os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1' + os.environ['HF_HOME'] = cache_dir + + + import diffusers # pip install diffusers # pyright: ignore [reportMissingImports] + try: + # First try with local_files_only to avoid consulting tfhub metadata if the model is already in cache. + vae = diffusers.models.AutoencoderKL.from_pretrained( + vae_name, cache_dir=cache_dir, local_files_only=True + ) + except: + # Could not load the model from cache; try without local_files_only. 
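+        # This path may download the weights from the Hugging Face Hub on first use.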
+ vae = diffusers.models.AutoencoderKL.from_pretrained(vae_name, cache_dir=cache_dir) + return vae.eval().requires_grad_(False).to(device) + +#---------------------------------------------------------------------------- \ No newline at end of file diff --git a/back/preprocessing/torch_utils/__init__.py b/back/preprocessing/torch_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..85c9500c68b5789d0244a9a7a9cc0924129b0308 --- /dev/null +++ b/back/preprocessing/torch_utils/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# This work is licensed under a Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International License. +# You should have received a copy of the license along with this +# work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/ + +# empty diff --git a/back/preprocessing/torch_utils/distributed.py b/back/preprocessing/torch_utils/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..682f650770eefd5b64f9298e3b10c869af6bbe76 --- /dev/null +++ b/back/preprocessing/torch_utils/distributed.py @@ -0,0 +1,140 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# This work is licensed under a Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International License. +# You should have received a copy of the license along with this +# work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/ + +import os +import re +import socket +import torch +import torch.distributed +from . import training_stats + +_sync_device = None + +#---------------------------------------------------------------------------- + +def init(): + global _sync_device + + if not torch.distributed.is_initialized(): + # Setup some reasonable defaults for env-based distributed init if + # not set by the running environment. 
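+        # These defaults let a single-process run initialize without a torchrun-style launcher.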
+ if 'MASTER_ADDR' not in os.environ: + os.environ['MASTER_ADDR'] = 'localhost' + if 'MASTER_PORT' not in os.environ: + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(('', 0)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + os.environ['MASTER_PORT'] = str(s.getsockname()[1]) + s.close() + if 'RANK' not in os.environ: + os.environ['RANK'] = '0' + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = '0' + if 'WORLD_SIZE' not in os.environ: + os.environ['WORLD_SIZE'] = '1' + backend = 'gloo' if os.name == 'nt' else 'nccl' + torch.distributed.init_process_group(backend=backend, init_method='env://') + torch.cuda.set_device(int(os.environ.get('LOCAL_RANK', '0'))) + + _sync_device = torch.device('cuda') if get_world_size() > 1 else None + training_stats.init_multiprocessing(rank=get_rank(), sync_device=_sync_device) + +#---------------------------------------------------------------------------- + +def get_rank(): + return torch.distributed.get_rank() if torch.distributed.is_initialized() else 0 + +#---------------------------------------------------------------------------- + +def get_world_size(): + return torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1 + +#---------------------------------------------------------------------------- + +def should_stop(): + return False + +#---------------------------------------------------------------------------- + +def should_suspend(): + return False + +#---------------------------------------------------------------------------- + +def request_suspend(): + pass + +#---------------------------------------------------------------------------- + +def update_progress(cur, total): + pass + +#---------------------------------------------------------------------------- + +def print0(*args, **kwargs): + if get_rank() == 0: + print(*args, **kwargs) + +#---------------------------------------------------------------------------- + +class CheckpointIO: + def __init__(self, **kwargs): + self._state_objs = kwargs + + def save(self, pt_path, verbose=True): + if verbose: + print0(f'Saving {pt_path} ... ', end='', flush=True) + data = dict() + for name, obj in self._state_objs.items(): + if obj is None: + data[name] = None + elif isinstance(obj, dict): + data[name] = obj + elif hasattr(obj, 'state_dict'): + data[name] = obj.state_dict() + elif hasattr(obj, '__getstate__'): + data[name] = obj.__getstate__() + elif hasattr(obj, '__dict__'): + data[name] = obj.__dict__ + else: + raise ValueError(f'Invalid state object of type {type(obj).__name__}') + if get_rank() == 0: + torch.save(data, pt_path) + if verbose: + print0('done') + + def load(self, pt_path, verbose=True): + if verbose: + print0(f'Loading {pt_path} ... 
', end='', flush=True) + data = torch.load(pt_path, map_location=torch.device('cpu')) + for name, obj in self._state_objs.items(): + if obj is None: + pass + elif isinstance(obj, dict): + obj.clear() + obj.update(data[name]) + elif hasattr(obj, 'load_state_dict'): + obj.load_state_dict(data[name]) + elif hasattr(obj, '__setstate__'): + obj.__setstate__(data[name]) + elif hasattr(obj, '__dict__'): + obj.__dict__.clear() + obj.__dict__.update(data[name]) + else: + raise ValueError(f'Invalid state object of type {type(obj).__name__}') + if verbose: + print0('done') + + def load_latest(self, run_dir, pattern=r'training-state-(\d+).pt', verbose=True): + fnames = [entry.name for entry in os.scandir(run_dir) if entry.is_file() and re.fullmatch(pattern, entry.name)] + if len(fnames) == 0: + return None + pt_path = os.path.join(run_dir, max(fnames, key=lambda x: float(re.fullmatch(pattern, x).group(1)))) + self.load(pt_path, verbose=verbose) + return pt_path + +#---------------------------------------------------------------------------- diff --git a/back/preprocessing/torch_utils/misc.py b/back/preprocessing/torch_utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..78bc59a37464602d96f3375f13665559bbdbc789 --- /dev/null +++ b/back/preprocessing/torch_utils/misc.py @@ -0,0 +1,277 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# This work is licensed under a Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International License. +# You should have received a copy of the license along with this +# work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/ + +import re +import contextlib +import functools +import numpy as np +import torch +import warnings +import dnnlib + +#---------------------------------------------------------------------------- +# Re-seed torch & numpy random generators based on the given arguments. + +def set_random_seed(*args): + seed = hash(args) % (1 << 31) + torch.manual_seed(seed) + np.random.seed(seed) + +#---------------------------------------------------------------------------- +# Cached construction of constant tensors. Avoids CPU=>GPU copy when the +# same constant is used multiple times. + +_constant_cache = dict() + +def constant(value, shape=None, dtype=None, device=None, memory_format=None): + value = np.asarray(value) + if shape is not None: + shape = tuple(shape) + if dtype is None: + dtype = torch.get_default_dtype() + if device is None: + device = torch.device('cpu') + if memory_format is None: + memory_format = torch.contiguous_format + + key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format) + tensor = _constant_cache.get(key, None) + if tensor is None: + tensor = torch.as_tensor(value.copy(), dtype=dtype, device=device) + if shape is not None: + tensor, _ = torch.broadcast_tensors(tensor, torch.empty(shape)) + tensor = tensor.contiguous(memory_format=memory_format) + _constant_cache[key] = tensor + return tensor + +#---------------------------------------------------------------------------- +# Variant of constant() that inherits dtype and device from the given +# reference tensor by default. 
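+# Example: const_like(x, 0.5) returns a scalar tensor with x's dtype and device.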
+ +def const_like(ref, value, shape=None, dtype=None, device=None, memory_format=None): + if dtype is None: + dtype = ref.dtype + if device is None: + device = ref.device + return constant(value, shape=shape, dtype=dtype, device=device, memory_format=memory_format) + +#---------------------------------------------------------------------------- +# Cached construction of temporary tensors in pinned CPU memory. + +@functools.lru_cache(None) +def pinned_buf(shape, dtype): + return torch.empty(shape, dtype=dtype).pin_memory() + +#---------------------------------------------------------------------------- +# Symbolic assert. + +try: + symbolic_assert = torch._assert # 1.8.0a0 # pylint: disable=protected-access +except AttributeError: + symbolic_assert = torch.Assert # 1.7.0 + +#---------------------------------------------------------------------------- +# Context manager to temporarily suppress known warnings in torch.jit.trace(). +# Note: Cannot use catch_warnings because of https://bugs.python.org/issue29672 + +@contextlib.contextmanager +def suppress_tracer_warnings(): + flt = ('ignore', None, torch.jit.TracerWarning, None, 0) + warnings.filters.insert(0, flt) + yield + warnings.filters.remove(flt) + +#---------------------------------------------------------------------------- +# Assert that the shape of a tensor matches the given list of integers. +# None indicates that the size of a dimension is allowed to vary. +# Performs symbolic assertion when used in torch.jit.trace(). + +def assert_shape(tensor, ref_shape): + if tensor.ndim != len(ref_shape): + raise AssertionError(f'Wrong number of dimensions: got {tensor.ndim}, expected {len(ref_shape)}') + for idx, (size, ref_size) in enumerate(zip(tensor.shape, ref_shape)): + if ref_size is None: + pass + elif isinstance(ref_size, torch.Tensor): + with suppress_tracer_warnings(): # as_tensor results are registered as constants + symbolic_assert(torch.equal(torch.as_tensor(size), ref_size), f'Wrong size for dimension {idx}') + elif isinstance(size, torch.Tensor): + with suppress_tracer_warnings(): # as_tensor results are registered as constants + symbolic_assert(torch.equal(size, torch.as_tensor(ref_size)), f'Wrong size for dimension {idx}: expected {ref_size}') + elif size != ref_size: + raise AssertionError(f'Wrong size for dimension {idx}: got {size}, expected {ref_size}') + +#---------------------------------------------------------------------------- +# Function decorator that calls torch.autograd.profiler.record_function(). + +def profiled_function(fn): + def decorator(*args, **kwargs): + with torch.autograd.profiler.record_function(fn.__name__): + return fn(*args, **kwargs) + decorator.__name__ = fn.__name__ + return decorator + +#---------------------------------------------------------------------------- +# Sampler for torch.utils.data.DataLoader that loops over the dataset +# indefinitely, shuffling items as it goes. 
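+# Each of the `num_replicas` ranks walks a strided slice of a per-epoch permutation,
+# so items are partitioned across processes without extra coordination.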
+ +class InfiniteSampler(torch.utils.data.Sampler): + def __init__(self, dataset, rank=0, num_replicas=1, shuffle=True, seed=0, start_idx=0): + assert len(dataset) > 0 + assert num_replicas > 0 + assert 0 <= rank < num_replicas + warnings.filterwarnings('ignore', '`data_source` argument is not used and will be removed') + super().__init__(dataset) + self.dataset_size = len(dataset) + self.start_idx = start_idx + rank + self.stride = num_replicas + self.shuffle = shuffle + self.seed = seed + + def __iter__(self): + idx = self.start_idx + epoch = None + while True: + if epoch != idx // self.dataset_size: + epoch = idx // self.dataset_size + order = np.arange(self.dataset_size) + if self.shuffle: + np.random.RandomState(hash((self.seed, epoch)) % (1 << 31)).shuffle(order) + yield int(order[idx % self.dataset_size]) + idx += self.stride + +#---------------------------------------------------------------------------- +# Utilities for operating with torch.nn.Module parameters and buffers. + +def params_and_buffers(module): + assert isinstance(module, torch.nn.Module) + return list(module.parameters()) + list(module.buffers()) + +def named_params_and_buffers(module): + assert isinstance(module, torch.nn.Module) + return list(module.named_parameters()) + list(module.named_buffers()) + +@torch.no_grad() +def copy_params_and_buffers(src_module, dst_module, require_all=False): + assert isinstance(src_module, torch.nn.Module) + assert isinstance(dst_module, torch.nn.Module) + src_tensors = dict(named_params_and_buffers(src_module)) + for name, tensor in named_params_and_buffers(dst_module): + assert (name in src_tensors) or (not require_all) + if name in src_tensors: + tensor.copy_(src_tensors[name]) + +#---------------------------------------------------------------------------- +# Context manager for easily enabling/disabling DistributedDataParallel +# synchronization. + +@contextlib.contextmanager +def ddp_sync(module, sync): + assert isinstance(module, torch.nn.Module) + if sync or not isinstance(module, torch.nn.parallel.DistributedDataParallel): + yield + else: + with module.no_sync(): + yield + +#---------------------------------------------------------------------------- +# Check DistributedDataParallel consistency across processes. + +def check_ddp_consistency(module, ignore_regex=None): + assert isinstance(module, torch.nn.Module) + for name, tensor in named_params_and_buffers(module): + fullname = type(module).__name__ + '.' + name + if ignore_regex is not None and re.fullmatch(ignore_regex, fullname): + continue + tensor = tensor.detach() + if tensor.is_floating_point(): + tensor = torch.nan_to_num(tensor) + other = tensor.clone() + torch.distributed.broadcast(tensor=other, src=0) + assert (tensor == other).all(), fullname + +#---------------------------------------------------------------------------- +# Print summary table of module hierarchy. + +@torch.no_grad() +def print_module_summary(module, inputs, max_nesting=3, skip_redundant=True): + assert isinstance(module, torch.nn.Module) + assert not isinstance(module, torch.jit.ScriptModule) + assert isinstance(inputs, (tuple, list)) + + # Register hooks. 
+ entries = [] + nesting = [0] + def pre_hook(_mod, _inputs): + nesting[0] += 1 + def post_hook(mod, _inputs, outputs): + nesting[0] -= 1 + if nesting[0] <= max_nesting: + outputs = list(outputs) if isinstance(outputs, (tuple, list)) else [outputs] + outputs = [t for t in outputs if isinstance(t, torch.Tensor)] + entries.append(dnnlib.EasyDict(mod=mod, outputs=outputs)) + hooks = [mod.register_forward_pre_hook(pre_hook) for mod in module.modules()] + hooks += [mod.register_forward_hook(post_hook) for mod in module.modules()] + + # Run module. + outputs = module(*inputs) + for hook in hooks: + hook.remove() + + # Identify unique outputs, parameters, and buffers. + tensors_seen = set() + for e in entries: + e.unique_params = [t for t in e.mod.parameters() if id(t) not in tensors_seen] + e.unique_buffers = [t for t in e.mod.buffers() if id(t) not in tensors_seen] + e.unique_outputs = [t for t in e.outputs if id(t) not in tensors_seen] + tensors_seen |= {id(t) for t in e.unique_params + e.unique_buffers + e.unique_outputs} + + # Filter out redundant entries. + if skip_redundant: + entries = [e for e in entries if len(e.unique_params) or len(e.unique_buffers) or len(e.unique_outputs)] + + # Construct table. + rows = [[type(module).__name__, 'Parameters', 'Buffers', 'Output shape', 'Datatype']] + rows += [['---'] * len(rows[0])] + param_total = 0 + buffer_total = 0 + submodule_names = {mod: name for name, mod in module.named_modules()} + for e in entries: + name = '' if e.mod is module else submodule_names[e.mod] + param_size = sum(t.numel() for t in e.unique_params) + buffer_size = sum(t.numel() for t in e.unique_buffers) + output_shapes = [str(list(t.shape)) for t in e.outputs] + output_dtypes = [str(t.dtype).split('.')[-1] for t in e.outputs] + rows += [[ + name + (':0' if len(e.outputs) >= 2 else ''), + str(param_size) if param_size else '-', + str(buffer_size) if buffer_size else '-', + (output_shapes + ['-'])[0], + (output_dtypes + ['-'])[0], + ]] + for idx in range(1, len(e.outputs)): + rows += [[name + f':{idx}', '-', '-', output_shapes[idx], output_dtypes[idx]]] + param_total += param_size + buffer_total += buffer_size + rows += [['---'] * len(rows[0])] + rows += [['Total', str(param_total), str(buffer_total), '-', '-']] + + # Print table. + widths = [max(len(cell) for cell in column) for column in zip(*rows)] + print() + for row in rows: + print(' '.join(cell + ' ' * (width - len(cell)) for cell, width in zip(row, widths))) + print() + +#---------------------------------------------------------------------------- +# Tile a batch of images into a 2D grid. + +def tile_images(x, w, h): + assert x.ndim == 4 # NCHW => CHW + return x.reshape(h, w, *x.shape[1:]).permute(2, 0, 3, 1, 4).reshape(x.shape[1], h * x.shape[2], w * x.shape[3]) + +#---------------------------------------------------------------------------- diff --git a/back/preprocessing/torch_utils/persistence.py b/back/preprocessing/torch_utils/persistence.py new file mode 100644 index 0000000000000000000000000000000000000000..0b81e212a149890b1caaf7ae3d9e3b3cfddb05c7 --- /dev/null +++ b/back/preprocessing/torch_utils/persistence.py @@ -0,0 +1,257 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# This work is licensed under a Creative Commons +# Attribution-NonCommercial-ShareAlike 4.0 International License. +# You should have received a copy of the license along with this +# work. 
If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/ + +"""Facilities for pickling Python code alongside other data. + +The pickled code is automatically imported into a separate Python module +during unpickling. This way, any previously exported pickles will remain +usable even if the original code is no longer available, or if the current +version of the code is not consistent with what was originally pickled.""" + +import sys +import pickle +import io +import inspect +import copy +import uuid +import types +import functools +import dnnlib + +#---------------------------------------------------------------------------- + +_version = 6 # internal version number +_decorators = set() # {decorator_class, ...} +_import_hooks = [] # [hook_function, ...] +_module_to_src_dict = dict() # {module: src, ...} +_src_to_module_dict = dict() # {src: module, ...} + +#---------------------------------------------------------------------------- + +def persistent_class(orig_class): + r"""Class decorator that extends a given class to save its source code + when pickled. + + Example: + + from torch_utils import persistence + + @persistence.persistent_class + class MyNetwork(torch.nn.Module): + def __init__(self, num_inputs, num_outputs): + super().__init__() + self.fc = MyLayer(num_inputs, num_outputs) + ... + + @persistence.persistent_class + class MyLayer(torch.nn.Module): + ... + + When pickled, any instance of `MyNetwork` and `MyLayer` will save its + source code alongside other internal state (e.g., parameters, buffers, + and submodules). This way, any previously exported pickle will remain + usable even if the class definitions have been modified or are no + longer available. + + The decorator saves the source code of the entire Python module + containing the decorated class. It does *not* save the source code of + any imported modules. Thus, the imported modules must be available + during unpickling, also including `torch_utils.persistence` itself. + + It is ok to call functions defined in the same module from the + decorated class. However, if the decorated class depends on other + classes defined in the same module, they must be decorated as well. + This is illustrated in the above example in the case of `MyLayer`. + + It is also possible to employ the decorator just-in-time before + calling the constructor. For example: + + cls = MyLayer + if want_to_make_it_persistent: + cls = persistence.persistent_class(cls) + layer = cls(num_inputs, num_outputs) + + As an additional feature, the decorator also keeps track of the + arguments that were used to construct each instance of the decorated + class. The arguments can be queried via `obj.init_args` and + `obj.init_kwargs`, and they are automatically pickled alongside other + object state. This feature can be disabled on a per-instance basis + by setting `self._record_init_args = False` in the constructor. 
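+
+    For example (illustrative; `StatelessLayer` is a hypothetical class):
+
+        @persistence.persistent_class
+        class StatelessLayer(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self._record_init_args = False # skip recording init args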
+
+    A typical use case is to first unpickle a previous instance of a
+    persistent class, and then upgrade it to use the latest version of
+    the source code:
+
+        with open('old_pickle.pkl', 'rb') as f:
+            old_net = pickle.load(f)
+        new_net = MyNetwork(*old_net.init_args, **old_net.init_kwargs)
+        misc.copy_params_and_buffers(old_net, new_net, require_all=True)
+    """
+    assert isinstance(orig_class, type)
+    if is_persistent(orig_class):
+        return orig_class
+
+    assert orig_class.__module__ in sys.modules
+    orig_module = sys.modules[orig_class.__module__]
+    orig_module_src = _module_to_src(orig_module)
+
+    @functools.wraps(orig_class, updated=())
+    class Decorator(orig_class):
+        _orig_module_src = orig_module_src
+        _orig_class_name = orig_class.__name__
+
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            record_init_args = getattr(self, '_record_init_args', True)
+            self._init_args = copy.deepcopy(args) if record_init_args else None
+            self._init_kwargs = copy.deepcopy(kwargs) if record_init_args else None
+            assert orig_class.__name__ in orig_module.__dict__
+            _check_pickleable(self.__reduce__())
+
+        @property
+        def init_args(self):
+            assert self._init_args is not None
+            return copy.deepcopy(self._init_args)
+
+        @property
+        def init_kwargs(self):
+            assert self._init_kwargs is not None
+            return dnnlib.EasyDict(copy.deepcopy(self._init_kwargs))
+
+        def __reduce__(self):
+            fields = list(super().__reduce__())
+            fields += [None] * max(3 - len(fields), 0)
+            if fields[0] is not _reconstruct_persistent_obj:
+                meta = dict(type='class', version=_version, module_src=self._orig_module_src, class_name=self._orig_class_name, state=fields[2])
+                fields[0] = _reconstruct_persistent_obj # reconstruct func
+                fields[1] = (meta,) # reconstruct args
+                fields[2] = None # state dict
+            return tuple(fields)
+
+    _decorators.add(Decorator)
+    return Decorator
+
+#----------------------------------------------------------------------------
+
+def is_persistent(obj):
+    r"""Test whether the given object or class is persistent, i.e.,
+    whether it will save its source code when pickled.
+    """
+    try:
+        if obj in _decorators:
+            return True
+    except TypeError:
+        pass
+    return type(obj) in _decorators # pylint: disable=unidiomatic-typecheck
+
+#----------------------------------------------------------------------------
+
+def import_hook(hook):
+    r"""Register an import hook that is called whenever a persistent object
+    is being unpickled. A typical use case is to patch the pickled source
+    code to avoid errors and inconsistencies when the API of some imported
+    module has changed.
+
+    The hook should have the following signature:
+
+        hook(meta) -> modified meta
+
+    `meta` is an instance of `dnnlib.EasyDict` with the following fields:
+
+        type:        Type of the persistent object, e.g. `'class'`.
+        version:     Internal version number of `torch_utils.persistence`.
+        module_src:  Original source code of the Python module.
+        class_name:  Class name in the original Python module.
+        state:       Internal state of the object.
+
+    Example:
+
+        @persistence.import_hook
+        def wreck_my_network(meta):
+            if meta.class_name == 'MyNetwork':
+                print('MyNetwork is being imported. I will wreck it!')
+                meta.module_src = meta.module_src.replace("True", "False")
+            return meta
+    """
+    assert callable(hook)
+    _import_hooks.append(hook)
+
+#----------------------------------------------------------------------------
+
+def _reconstruct_persistent_obj(meta):
+    r"""Hook that is called internally by the `pickle` module to unpickle
+    a persistent object.
+    """
+    meta = dnnlib.EasyDict(meta)
+    meta.state = dnnlib.EasyDict(meta.state)
+    for hook in _import_hooks:
+        meta = hook(meta)
+        assert meta is not None
+
+    assert meta.version == _version
+    module = _src_to_module(meta.module_src)
+
+    assert meta.type == 'class'
+    orig_class = module.__dict__[meta.class_name]
+    decorator_class = persistent_class(orig_class)
+    obj = decorator_class.__new__(decorator_class)
+
+    setstate = getattr(obj, '__setstate__', None)
+    if callable(setstate):
+        setstate(meta.state) # pylint: disable=not-callable
+    else:
+        obj.__dict__.update(meta.state)
+    return obj
+
+#----------------------------------------------------------------------------
+
+def _module_to_src(module):
+    r"""Query the source code of a given Python module.
+    """
+    src = _module_to_src_dict.get(module, None)
+    if src is None:
+        src = inspect.getsource(module)
+        _module_to_src_dict[module] = src
+        _src_to_module_dict[src] = module
+    return src
+
+def _src_to_module(src):
+    r"""Get or create a Python module for the given source code.
+    """
+    module = _src_to_module_dict.get(src, None)
+    if module is None:
+        module_name = "_imported_module_" + uuid.uuid4().hex
+        module = types.ModuleType(module_name)
+        sys.modules[module_name] = module
+        _module_to_src_dict[module] = src
+        _src_to_module_dict[src] = module
+        exec(src, module.__dict__) # pylint: disable=exec-used
+    return module
+
+#----------------------------------------------------------------------------
+
+def _check_pickleable(obj):
+    r"""Check that the given object is pickleable, raising an exception if
+    it is not. This function is expected to be considerably more efficient
+    than actually pickling the object.
+    """
+    def recurse(obj):
+        if isinstance(obj, (list, tuple, set)):
+            return [recurse(x) for x in obj]
+        if isinstance(obj, dict):
+            return [[recurse(x), recurse(y)] for x, y in obj.items()]
+        if isinstance(obj, (str, int, float, bool, bytes, bytearray)):
+            return None # Python primitive types are pickleable.
+        if f'{type(obj).__module__}.{type(obj).__name__}' in ['numpy.ndarray', 'torch.Tensor', 'torch.nn.parameter.Parameter']:
+            return None # NumPy arrays and PyTorch tensors are pickleable.
+        if is_persistent(obj):
+            return None # Persistent objects are pickleable, by virtue of the constructor check.
+        return obj
+    with io.BytesIO() as f:
+        pickle.dump(recurse(obj), f)
+
+#----------------------------------------------------------------------------
diff --git a/back/preprocessing/torch_utils/training_stats.py b/back/preprocessing/torch_utils/training_stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b7ffc26ed028d6e2bf4485146dd2fb6b60ed0a3
--- /dev/null
+++ b/back/preprocessing/torch_utils/training_stats.py
@@ -0,0 +1,283 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# This work is licensed under a Creative Commons
+# Attribution-NonCommercial-ShareAlike 4.0 International License.
+# You should have received a copy of the license along with this
+# work.
If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/ + +"""Facilities for reporting and collecting training statistics across +multiple processes and devices. The interface is designed to minimize +synchronization overhead as well as the amount of boilerplate in user +code.""" + +import re +import numpy as np +import torch +import dnnlib + +from . import misc + +#---------------------------------------------------------------------------- + +_num_moments = 3 # [num_scalars, sum_of_scalars, sum_of_squares] +_reduce_dtype = torch.float32 # Data type to use for initial per-tensor reduction. +_counter_dtype = torch.float64 # Data type to use for the internal counters. +_rank = 0 # Rank of the current process. +_sync_device = None # Device to use for multiprocess communication. None = single-process. +_sync_called = False # Has _sync() been called yet? +_counters = dict() # Running counters on each device, updated by report(): name => device => torch.Tensor +_cumulative = dict() # Cumulative counters on the CPU, updated by _sync(): name => torch.Tensor + +#---------------------------------------------------------------------------- + +def init_multiprocessing(rank, sync_device): + r"""Initializes `torch_utils.training_stats` for collecting statistics + across multiple processes. + + This function must be called after + `torch.distributed.init_process_group()` and before `Collector.update()`. + The call is not necessary if multi-process collection is not needed. + + Args: + rank: Rank of the current process. + sync_device: PyTorch device to use for inter-process + communication, or None to disable multi-process + collection. Typically `torch.device('cuda', rank)`. + """ + global _rank, _sync_device + assert not _sync_called + _rank = rank + _sync_device = sync_device + +#---------------------------------------------------------------------------- + +@misc.profiled_function +def report(name, value): + r"""Broadcasts the given set of scalars to all interested instances of + `Collector`, across device and process boundaries. NaNs and Infs are + ignored. + + This function is expected to be extremely cheap and can be safely + called from anywhere in the training loop, loss function, or inside a + `torch.nn.Module`. + + Warning: The current implementation expects the set of unique names to + be consistent across processes. Please make sure that `report()` is + called at least once for each unique name by each process, and in the + same order. If a given process has no scalars to broadcast, it can do + `report(name, [])` (empty list). + + Args: + name: Arbitrary string specifying the name of the statistic. + Averages are accumulated separately for each unique name. + value: Arbitrary set of scalars. Can be a list, tuple, + NumPy array, PyTorch tensor, or Python scalar. + + Returns: + The same `value` that was passed in. 
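+
+    Example (illustrative; `loss` is any scalar tensor):
+
+        training_stats.report('Loss/total', loss) # report on every rank
+        training_stats.report('Loss/extra', [])   # nothing to report this step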
+ """ + if name not in _counters: + _counters[name] = dict() + + elems = torch.as_tensor(value) + if elems.numel() == 0: + return value + + elems = elems.detach().flatten().to(_reduce_dtype) + square = elems.square() + finite = square.isfinite() + moments = torch.stack([ + finite.sum(dtype=_reduce_dtype), + torch.where(finite, elems, 0).sum(), + torch.where(finite, square, 0).sum(), + ]) + assert moments.ndim == 1 and moments.shape[0] == _num_moments + moments = moments.to(_counter_dtype) + + device = moments.device + if device not in _counters[name]: + _counters[name][device] = torch.zeros_like(moments) + _counters[name][device].add_(moments) + return value + +#---------------------------------------------------------------------------- + +def report0(name, value): + r"""Broadcasts the given set of scalars by the first process (`rank = 0`), + but ignores any scalars provided by the other processes. + See `report()` for further details. + """ + report(name, value if _rank == 0 else []) + return value + +#---------------------------------------------------------------------------- + +class Collector: + r"""Collects the scalars broadcasted by `report()` and `report0()` and + computes their long-term averages (mean and standard deviation) over + user-defined periods of time. + + The averages are first collected into internal counters that are not + directly visible to the user. They are then copied to the user-visible + state as a result of calling `update()` and can then be queried using + `mean()`, `std()`, `as_dict()`, etc. Calling `update()` also resets the + internal counters for the next round, so that the user-visible state + effectively reflects averages collected between the last two calls to + `update()`. + + Args: + regex: Regular expression defining which statistics to + collect. The default is to collect everything. + keep_previous: Whether to retain the previous averages if no + scalars were collected on a given round + (default: False). + """ + def __init__(self, regex='.*', keep_previous=False): + self._regex = re.compile(regex) + self._keep_previous = keep_previous + self._cumulative = dict() + self._moments = dict() + self.update() + self._moments.clear() + + def names(self): + r"""Returns the names of all statistics broadcasted so far that + match the regular expression specified at construction time. + """ + return [name for name in _counters if self._regex.fullmatch(name)] + + def update(self): + r"""Copies current values of the internal counters to the + user-visible state and resets them for the next round. + + If `keep_previous=True` was specified at construction time, the + operation is skipped for statistics that have received no scalars + since the last update, retaining their previous averages. + + This method performs a number of GPU-to-CPU transfers and one + `torch.distributed.all_reduce()`. It is intended to be called + periodically in the main training loop, typically once every + N training steps. + """ + if not self._keep_previous: + self._moments.clear() + for name, cumulative in _sync(self.names()): + if name not in self._cumulative: + self._cumulative[name] = torch.zeros([_num_moments], dtype=_counter_dtype) + delta = cumulative - self._cumulative[name] + self._cumulative[name].copy_(cumulative) + if float(delta[0]) != 0: + self._moments[name] = delta + + def _get_delta(self, name): + r"""Returns the raw moments that were accumulated for the given + statistic between the last two calls to `update()`, or zero if + no scalars were collected. 
+        """
+        assert self._regex.fullmatch(name)
+        if name not in self._moments:
+            self._moments[name] = torch.zeros([_num_moments], dtype=_counter_dtype)
+        return self._moments[name]
+
+    def num(self, name):
+        r"""Returns the number of scalars that were accumulated for the given
+        statistic between the last two calls to `update()`, or zero if
+        no scalars were collected.
+        """
+        delta = self._get_delta(name)
+        return int(delta[0])
+
+    def mean(self, name):
+        r"""Returns the mean of the scalars that were accumulated for the
+        given statistic between the last two calls to `update()`, or NaN if
+        no scalars were collected.
+        """
+        delta = self._get_delta(name)
+        if int(delta[0]) == 0:
+            return float('nan')
+        return float(delta[1] / delta[0])
+
+    def std(self, name):
+        r"""Returns the standard deviation of the scalars that were
+        accumulated for the given statistic between the last two calls to
+        `update()`, or NaN if no scalars were collected.
+        """
+        delta = self._get_delta(name)
+        if int(delta[0]) == 0 or not np.isfinite(float(delta[1])):
+            return float('nan')
+        if int(delta[0]) == 1:
+            return float(0)
+        mean = float(delta[1] / delta[0])
+        raw_var = float(delta[2] / delta[0])
+        return np.sqrt(max(raw_var - np.square(mean), 0))
+
+    def as_dict(self):
+        r"""Returns the averages accumulated between the last two calls to
+        `update()` as a `dnnlib.EasyDict`. The contents are as follows:
+
+            dnnlib.EasyDict(
+                NAME = dnnlib.EasyDict(num=FLOAT, mean=FLOAT, std=FLOAT),
+                ...
+            )
+        """
+        stats = dnnlib.EasyDict()
+        for name in self.names():
+            stats[name] = dnnlib.EasyDict(num=self.num(name), mean=self.mean(name), std=self.std(name))
+        return stats
+
+    def __getitem__(self, name):
+        r"""Convenience getter.
+        `collector[name]` is a synonym for `collector.mean(name)`.
+        """
+        return self.mean(name)
+
+#----------------------------------------------------------------------------
+
+def _sync(names):
+    r"""Synchronize the global cumulative counters across devices and
+    processes. Called internally by `Collector.update()`.
+    """
+    if len(names) == 0:
+        return []
+    global _sync_called
+    _sync_called = True
+
+    # Check that all ranks have the same set of names.
+    if _sync_device is not None:
+        value = hash(tuple(tuple(ord(char) for char in name) for name in names))
+        other = torch.as_tensor(value, dtype=torch.int64, device=_sync_device)
+        torch.distributed.broadcast(tensor=other, src=0)
+        if value != int(other.cpu()):
+            raise ValueError('Training statistics are inconsistent between ranks')
+
+    # Collect deltas within current rank.
+    deltas = []
+    device = _sync_device if _sync_device is not None else torch.device('cpu')
+    for name in names:
+        delta = torch.zeros([_num_moments], dtype=_counter_dtype, device=device)
+        for counter in _counters[name].values():
+            delta.add_(counter.to(device))
+            counter.copy_(torch.zeros_like(counter))
+        deltas.append(delta)
+    deltas = torch.stack(deltas)
+
+    # Sum deltas across ranks.
+    if _sync_device is not None:
+        torch.distributed.all_reduce(deltas)
+
+    # Update cumulative values.
+    deltas = deltas.cpu()
+    for idx, name in enumerate(names):
+        if name not in _cumulative:
+            _cumulative[name] = torch.zeros([_num_moments], dtype=_counter_dtype)
+        _cumulative[name].add_(deltas[idx])
+
+    # Return name-value pairs.
+    return [(name, _cumulative[name]) for name in names]
+
+#----------------------------------------------------------------------------
+# Convenience.
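+#
+# Typical pattern (illustrative):
+#
+#   training_stats.report('Loss/total', loss)      # call on every rank, every step
+#   if step % 100 == 0:
+#       default_collector.update()                 # one all_reduce across ranks
+#       print(default_collector.mean('Loss/total'))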
+
+default_collector = Collector()
+
+#----------------------------------------------------------------------------
diff --git a/back/wandb/debug-internal.log b/back/wandb/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..2484b13253b230d8b8997b5a63e356b295b653e3
--- /dev/null
+++ b/back/wandb/debug-internal.log
@@ -0,0 +1,19 @@
+{"time":"2026-03-23T13:58:41.647788404+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
+{"time":"2026-03-23T13:58:42.578470875+08:00","level":"INFO","msg":"stream: created new stream","id":"w9holkos"}
+{"time":"2026-03-23T13:58:42.578676113+08:00","level":"INFO","msg":"handler: started","stream_id":"w9holkos"}
+{"time":"2026-03-23T13:58:42.579473589+08:00","level":"INFO","msg":"stream: started","id":"w9holkos"}
+{"time":"2026-03-23T13:58:42.57951741+08:00","level":"INFO","msg":"sender: started","stream_id":"w9holkos"}
+{"time":"2026-03-23T13:58:42.579478227+08:00","level":"INFO","msg":"writer: started","stream_id":"w9holkos"}
+{"time":"2026-03-23T14:49:13.568442881+08:00","level":"INFO","msg":"api: retrying HTTP error","status":408,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"[408 Request Timeout HTML error page omitted]"}
+{"time":"2026-03-23T14:52:15.597652411+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2026-03-23T14:52:26.072213509+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": write tcp 172.20.98.27:52324->35.186.228.49:443: write: broken pipe"}
+{"time":"2026-03-23T17:02:52.905542765+08:00","level":"INFO","msg":"api: retrying HTTP error","status":408,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"[408 Request Timeout HTML error page omitted]"}
+{"time":"2026-03-23T17:05:55.176103762+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2026-03-23T17:06:10.164453104+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": unexpected EOF"}
+{"time":"2026-03-23T22:05:06.25355716+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": read tcp 172.20.98.27:44154->35.186.228.49:443: read: connection reset by peer"}
+{"time":"2026-03-23T22:05:20.791067182+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": read tcp 172.20.98.27:40392->35.186.228.49:443: read: connection reset by peer"}
+{"time":"2026-03-24T02:18:38.770696332+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"[502 Server Error HTML error page omitted]"}
+{"time":"2026-03-24T06:25:41.879737278+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"[502 Server Error HTML error page omitted]"}
+{"time":"2026-03-24T06:30:14.989373032+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"[502 Server Error HTML error page omitted]"}
+{"time":"2026-03-24T09:05:02.85908394+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": read tcp 172.20.98.27:46722->35.186.228.49:443: read: connection reset by peer"}
+{"time":"2026-03-25T04:41:04.741907157+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": unexpected EOF"}
diff --git a/back/wandb/debug.log b/back/wandb/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..64360b6d725bb9039e8cce4fc50ca91a6d72ced5
--- /dev/null
+++ b/back/wandb/debug.log
@@ -0,0 +1,20 @@
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_setup.py:_flush():81] Configure stats pid to 400275
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260323_135841-w9holkos/logs/debug.log
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260323_135841-w9holkos/logs/debug-internal.log
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_init.py:init():844] calling init triggers
+2026-03-23 13:58:41,344 INFO MainThread:400275 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-03-23 13:58:41,344 INFO MainThread:400275 [wandb_init.py:init():892] starting backend
+2026-03-23 13:58:41,630 INFO MainThread:400275 [wandb_init.py:init():895] sending inform_init request
+2026-03-23 13:58:41,643 INFO MainThread:400275 [wandb_init.py:init():903] backend started and connected
+2026-03-23 13:58:41,646 INFO MainThread:400275 [wandb_init.py:init():973] updated telemetry
+2026-03-23 13:58:41,659 INFO MainThread:400275 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-03-23 13:58:43,108 INFO MainThread:400275 [wandb_init.py:init():1042] starting run threads in backend
+2026-03-23 13:58:43,201 INFO MainThread:400275 [wandb_run.py:_console_start():2524] atexit reg
+2026-03-23 13:58:43,201 INFO MainThread:400275 [wandb_run.py:_redirect():2373] redirect: wrap_raw
+2026-03-23 13:58:43,201 INFO MainThread:400275 [wandb_run.py:_redirect():2442] Wrapping output streams.
+2026-03-23 13:58:43,202 INFO MainThread:400275 [wandb_run.py:_redirect():2465] Redirects installed.
+2026-03-23 13:58:43,209 INFO MainThread:400275 [wandb_init.py:init():1082] run started, returning control to user process +2026-03-23 13:58:43,210 INFO MainThread:400275 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment-0.75', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 2000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.05, 't_c': 0.75, 'ot_cls': True} diff --git a/back/wandb/run-20260322_141726-2yw08kz9/files/config.yaml b/back/wandb/run-20260322_141726-2yw08kz9/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8d61d7933543518fd896d036839b51ca42a0e8e --- /dev/null +++ b/back/wandb/run-20260322_141726-2yw08kz9/files/config.yaml @@ -0,0 +1,203 @@ +_wandb: + value: + cli_version: 0.25.0 + e: + 257k9ot60u1bv0aiwlacsvutj9c72h7y: + args: + - --report-to + - wandb + - --allow-tf32 + - --mixed-precision + - bf16 + - --seed + - "0" + - --path-type + - linear + - --prediction + - v + - --weighting + - uniform + - --model + - SiT-XL/2 + - --enc-type + - dinov2-vit-b + - --encoder-depth + - "8" + - --proj-coeff + - "0.5" + - --output-dir + - exps + - --exp-name + - jsflow-experiment + - --batch-size + - "256" + - --data-dir + - /gemini/space/zhaozy/dataset/Imagenet/imagenet_256 + - --semantic-features-dir + - /gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0 + - --learning-rate + - "0.00005" + - --t-c + - "0.5" + - --cls + - "0.2" + - --ot-cls + codePath: train.py + codePathLocal: train.py + cpu_count: 96 + cpu_count_logical: 192 + cudaVersion: "13.0" + disk: + /: + total: "3838880616448" + used: "357556633600" + email: 2365972933@qq.com + executable: /gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python + git: + commit: 021ea2e50c38c5803bd9afff16316958a01fbd1d + remote: https://github.com/Martinser/REG.git + gpu: NVIDIA H100 80GB HBM3 + gpu_count: 4 + gpu_nvidia: + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-a09f2421-99e6-a72e-63bd-fd7452510758 + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e + host: 24c964746905d416ce09d045f9a06f23-taskrole1-0 + memory: + total: "2164115296256" + os: Linux-5.15.0-94-generic-x86_64-with-glibc2.35 + program: 
/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py + python: CPython 3.12.9 + root: /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG + startedAt: "2026-03-22T06:17:26.670763Z" + writerId: 257k9ot60u1bv0aiwlacsvutj9c72h7y + m: [] + python_version: 3.12.9 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 53 + - 63 + - 71 + - 83 + - 98 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 53 + - 63 + - 71 + - 83 + - 98 + "3": + - 13 + - 61 + "4": 3.12.9 + "5": 0.25.0 + "6": 4.53.2 + "12": 0.25.0 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0 +allow_tf32: + value: true +batch_size: + value: 256 +cfg_prob: + value: 0.1 +checkpointing_steps: + value: 10000 +cls: + value: 0.2 +data_dir: + value: /gemini/space/zhaozy/dataset/Imagenet/imagenet_256 +enc_type: + value: dinov2-vit-b +encoder_depth: + value: 8 +epochs: + value: 1400 +exp_name: + value: jsflow-experiment +fused_attn: + value: true +gradient_accumulation_steps: + value: 1 +learning_rate: + value: 5e-05 +legacy: + value: false +logging_dir: + value: logs +max_grad_norm: + value: 1 +max_train_steps: + value: 1000000 +mixed_precision: + value: bf16 +model: + value: SiT-XL/2 +num_classes: + value: 1000 +num_workers: + value: 4 +ops_head: + value: 16 +ot_cls: + value: true +output_dir: + value: exps +path_type: + value: linear +prediction: + value: v +proj_coeff: + value: 0.5 +qk_norm: + value: false +report_to: + value: wandb +resolution: + value: 256 +resume_step: + value: 0 +sampling_steps: + value: 10000 +seed: + value: 0 +semantic_features_dir: + value: /gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0 +t_c: + value: 0.5 +weighting: + value: uniform diff --git a/back/wandb/run-20260322_141726-2yw08kz9/files/output.log b/back/wandb/run-20260322_141726-2yw08kz9/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..3a5a98aa93b20204d2fa008f18ef436b946ebf84 --- /dev/null +++ b/back/wandb/run-20260322_141726-2yw08kz9/files/output.log @@ -0,0 +1,27 @@ +Steps: 0%| | 1/1000000 [00:02<614:34:39, 2.21s/it][2026-03-22 14:17:31] Generating EMA samples done. 
+[2026-03-22 14:17:31] Step: 1, Training Logs: loss_final: 3.278940, loss_mean: 1.706308, proj_loss: 0.001541, loss_mean_cls: 1.571091, grad_norm: 1.481672 +Steps: 0%| | 2/1000000 [00:02<289:06:04, 1.04s/it, grad_norm=1.48, loss_final=3.28, loss_mean=1.71, loss_mean_cls=1.57, proj_loss=0.001[2026-03-22 14:17:31] Step: 2, Training Logs: loss_final: 3.211831, loss_mean: 1.688932, proj_loss: -0.010287, loss_mean_cls: 1.533185, grad_norm: 1.055476 +Steps: 0%| | 3/1000000 [00:02<187:48:39, 1.48it/s, grad_norm=1.06, loss_final=3.21, loss_mean=1.69, loss_mean_cls=1.53, proj_loss=-0.01[2026-03-22 14:17:31] Step: 3, Training Logs: loss_final: 3.201248, loss_mean: 1.663205, proj_loss: -0.019184, loss_mean_cls: 1.557227, grad_norm: 1.116387 +Steps: 0%| | 4/1000000 [00:02<140:12:43, 1.98it/s, grad_norm=1.12, loss_final=3.2, loss_mean=1.66, loss_mean_cls=1.56, proj_loss=-0.019[2026-03-22 14:17:32] Step: 4, Training Logs: loss_final: 3.198367, loss_mean: 1.682051, proj_loss: -0.026376, loss_mean_cls: 1.542691, grad_norm: 0.722294 +Steps: 0%| | 5/1000000 [00:03<113:52:43, 2.44it/s, grad_norm=0.722, loss_final=3.2, loss_mean=1.68, loss_mean_cls=1.54, proj_loss=-0.02[2026-03-22 14:17:32] Step: 5, Training Logs: loss_final: 3.140483, loss_mean: 1.679105, proj_loss: -0.034564, loss_mean_cls: 1.495943, grad_norm: 0.811589 +Steps: 0%| | 6/1000000 [00:03<97:59:40, 2.83it/s, grad_norm=0.812, loss_final=3.14, loss_mean=1.68, loss_mean_cls=1.5, proj_loss=-0.034[2026-03-22 14:17:32] Step: 6, Training Logs: loss_final: 2.988440, loss_mean: 1.682339, proj_loss: -0.039506, loss_mean_cls: 1.345606, grad_norm: 0.931524 +Steps: 0%| | 7/1000000 [00:03<87:55:00, 3.16it/s, grad_norm=0.932, loss_final=2.99, loss_mean=1.68, loss_mean_cls=1.35, proj_loss=-0.03[2026-03-22 14:17:32] Step: 7, Training Logs: loss_final: 3.111949, loss_mean: 1.690802, proj_loss: -0.042757, loss_mean_cls: 1.463904, grad_norm: 0.830852 +Steps: 0%| | 8/1000000 [00:03<81:19:20, 3.42it/s, grad_norm=0.831, loss_final=3.11, loss_mean=1.69, loss_mean_cls=1.46, proj_loss=-0.04[2026-03-22 14:17:33] Step: 8, Training Logs: loss_final: 3.278931, loss_mean: 1.660797, proj_loss: -0.045011, loss_mean_cls: 1.663145, grad_norm: 0.847438 +Steps: 0%| | 9/1000000 [00:04<76:56:10, 3.61it/s, grad_norm=0.847, loss_final=3.28, loss_mean=1.66, loss_mean_cls=1.66, proj_loss=-0.04[2026-03-22 14:17:33] Step: 9, Training Logs: loss_final: 3.221569, loss_mean: 1.658834, proj_loss: -0.046031, loss_mean_cls: 1.608767, grad_norm: 0.909827 +Steps: 0%| | 10/1000000 [00:04<73:57:18, 3.76it/s, grad_norm=0.91, loss_final=3.22, loss_mean=1.66, loss_mean_cls=1.61, proj_loss=-0.04[2026-03-22 14:17:33] Step: 10, Training Logs: loss_final: 3.216744, loss_mean: 1.665229, proj_loss: -0.047761, loss_mean_cls: 1.599277, grad_norm: 1.014574 +Steps: 0%| | 11/1000000 [00:04<71:52:01, 3.87it/s, grad_norm=1.01, loss_final=3.22, loss_mean=1.67, loss_mean_cls=1.6, proj_loss=-0.047[2026-03-22 14:17:33] Step: 11, Training Logs: loss_final: 3.216658, loss_mean: 1.649915, proj_loss: -0.049347, loss_mean_cls: 1.616090, grad_norm: 1.028789 +Steps: 0%| | 12/1000000 [00:04<70:26:20, 3.94it/s, grad_norm=1.03, loss_final=3.22, loss_mean=1.65, loss_mean_cls=1.62, proj_loss=-0.04[2026-03-22 14:17:34] Step: 12, Training Logs: loss_final: 3.155676, loss_mean: 1.624463, proj_loss: -0.049856, loss_mean_cls: 1.581069, grad_norm: 1.231291 +Steps: 0%| | 13/1000000 [00:05<69:25:29, 4.00it/s, grad_norm=1.23, loss_final=3.16, loss_mean=1.62, loss_mean_cls=1.58, proj_loss=-0.04Traceback (most recent call last): + File 
"/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 527, in + main(args) + File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 415, in main + "loss_final": accelerator.gather(loss).mean().detach().item(), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +KeyboardInterrupt +[rank0]: Traceback (most recent call last): +[rank0]: File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 527, in +[rank0]: main(args) +[rank0]: File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 415, in main +[rank0]: "loss_final": accelerator.gather(loss).mean().detach().item(), +[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +[rank0]: KeyboardInterrupt diff --git a/back/wandb/run-20260322_141726-2yw08kz9/files/requirements.txt b/back/wandb/run-20260322_141726-2yw08kz9/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0235910d0d99b7dee69b9a7f2f90012c8b711cc --- /dev/null +++ b/back/wandb/run-20260322_141726-2yw08kz9/files/requirements.txt @@ -0,0 +1,168 @@ +dill==0.3.8 +mkl-service==2.4.0 +mpmath==1.3.0 +typing_extensions==4.12.2 +urllib3==2.3.0 +torch==2.5.1 +ptyprocess==0.7.0 +traitlets==5.14.3 +pyasn1==0.6.1 +opencv-python-headless==4.12.0.88 +nest-asyncio==1.6.0 +kiwisolver==1.4.8 +click==8.2.1 +fire==0.7.1 +diffusers==0.35.1 +accelerate==1.7.0 +ipykernel==6.29.5 +peft==0.17.1 +attrs==24.3.0 +six==1.17.0 +numpy==2.0.1 +yarl==1.18.0 +huggingface_hub==0.34.4 +Bottleneck==1.4.2 +numexpr==2.11.0 +dataclasses==0.6 +typing-inspection==0.4.1 +safetensors==0.5.3 +pyparsing==3.2.3 +psutil==7.0.0 +imageio==2.37.0 +debugpy==1.8.14 +cycler==0.12.1 +pyasn1_modules==0.4.2 +matplotlib-inline==0.1.7 +matplotlib==3.10.3 +jedi==0.19.2 +tokenizers==0.21.2 +seaborn==0.13.2 +timm==1.0.15 +aiohappyeyeballs==2.6.1 +hf-xet==1.1.8 +multidict==6.1.0 +tqdm==4.67.1 +wheel==0.45.1 +simsimd==6.5.1 +sentencepiece==0.2.1 +grpcio==1.74.0 +asttokens==3.0.0 +absl-py==2.3.1 +stack-data==0.6.3 +pandas==2.3.0 +importlib_metadata==8.7.0 +pytorch-image-generation-metrics==0.6.1 +frozenlist==1.5.0 +MarkupSafe==3.0.2 +setuptools==78.1.1 +multiprocess==0.70.15 +pip==25.1 +requests==2.32.3 +mkl_random==1.2.8 +tensorboard-plugin-wit==1.8.1 +ExifRead-nocycle==3.0.1 +webdataset==0.2.111 +threadpoolctl==3.6.0 +pyarrow==21.0.0 +executing==2.2.0 +decorator==5.2.1 +contourpy==1.3.2 +annotated-types==0.7.0 +scikit-learn==1.7.1 +jupyter_client==8.6.3 +albumentations==1.4.24 +wandb==0.25.0 +certifi==2025.8.3 +idna==3.7 +xxhash==3.5.0 +Jinja2==3.1.6 +python-dateutil==2.9.0.post0 +aiosignal==1.4.0 +triton==3.1.0 +torchvision==0.20.1 +stringzilla==3.12.6 +pure_eval==0.2.3 +braceexpand==0.1.7 +zipp==3.22.0 +oauthlib==3.3.1 +Markdown==3.8.2 +fsspec==2025.3.0 +fonttools==4.58.2 +comm==0.2.2 +ipython==9.3.0 +img2dataset==1.47.0 +networkx==3.4.2 +PySocks==1.7.1 +tzdata==2025.2 +smmap==5.0.2 +mkl_fft==1.3.11 +sentry-sdk==2.29.1 +Pygments==2.19.1 +pexpect==4.9.0 +ftfy==6.3.1 +einops==0.8.1 +requests-oauthlib==2.0.0 +gitdb==4.0.12 +albucore==0.0.23 +torchdiffeq==0.2.5 +GitPython==3.1.44 +bitsandbytes==0.47.0 +pytorch-fid==0.3.0 +clean-fid==0.1.35 +pytorch-gan-metrics==0.5.4 +Brotli==1.0.9 +charset-normalizer==3.3.2 +gmpy2==2.2.1 +pillow==11.1.0 +PyYAML==6.0.2 +tornado==6.5.1 +termcolor==3.1.0 +setproctitle==1.3.6 +scipy==1.15.3 +regex==2024.11.6 +protobuf==6.31.1 +platformdirs==4.3.8 +joblib==1.5.1 
+cachetools==4.2.4 +ipython_pygments_lexers==1.1.1 +google-auth==1.35.0 +transformers==4.53.2 +torch-fidelity==0.3.0 +tensorboard==2.4.0 +filelock==3.17.0 +packaging==25.0 +propcache==0.3.1 +pytz==2025.2 +aiohttp==3.11.10 +wcwidth==0.2.13 +clip==0.2.0 +Werkzeug==3.1.3 +tensorboard-data-server==0.6.1 +sympy==1.13.1 +pyzmq==26.4.0 +pydantic_core==2.33.2 +prompt_toolkit==3.0.51 +parso==0.8.4 +docker-pycreds==0.4.0 +rsa==4.9.1 +pydantic==2.11.5 +jupyter_core==5.8.1 +google-auth-oauthlib==0.4.6 +datasets==4.0.0 +torch-tb-profiler==0.4.3 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +more-itertools==10.3.0 +packaging==24.2 +platformdirs==4.2.2 +typeguard==4.3.0 +inflect==7.3.1 +jaraco.text==3.12.1 +tomli==2.0.1 +typing_extensions==4.12.2 +wheel==0.45.1 +zipp==3.19.2 diff --git a/back/wandb/run-20260322_141726-2yw08kz9/files/wandb-metadata.json b/back/wandb/run-20260322_141726-2yw08kz9/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b96ef5d556d91f95f04f077800bbba110069d09f --- /dev/null +++ b/back/wandb/run-20260322_141726-2yw08kz9/files/wandb-metadata.json @@ -0,0 +1,101 @@ +{ + "os": "Linux-5.15.0-94-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.9", + "startedAt": "2026-03-22T06:17:26.670763Z", + "args": [ + "--report-to", + "wandb", + "--allow-tf32", + "--mixed-precision", + "bf16", + "--seed", + "0", + "--path-type", + "linear", + "--prediction", + "v", + "--weighting", + "uniform", + "--model", + "SiT-XL/2", + "--enc-type", + "dinov2-vit-b", + "--encoder-depth", + "8", + "--proj-coeff", + "0.5", + "--output-dir", + "exps", + "--exp-name", + "jsflow-experiment", + "--batch-size", + "256", + "--data-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256", + "--semantic-features-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0", + "--learning-rate", + "0.00005", + "--t-c", + "0.5", + "--cls", + "0.2", + "--ot-cls" + ], + "program": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "git": { + "remote": "https://github.com/Martinser/REG.git", + "commit": "021ea2e50c38c5803bd9afff16316958a01fbd1d" + }, + "email": "2365972933@qq.com", + "root": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG", + "host": "24c964746905d416ce09d045f9a06f23-taskrole1-0", + "executable": "/gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 4, + "disk": { + "/": { + "total": "3838880616448", + "used": "357556633600" + } + }, + "memory": { + "total": "2164115296256" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-a09f2421-99e6-a72e-63bd-fd7452510758" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": 
"GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e" + } + ], + "cudaVersion": "13.0", + "writerId": "257k9ot60u1bv0aiwlacsvutj9c72h7y" +} \ No newline at end of file diff --git a/back/wandb/run-20260322_141726-2yw08kz9/files/wandb-summary.json b/back/wandb/run-20260322_141726-2yw08kz9/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..0c4d27dbf4ac566ce3a1e7d152919c750ec0d917 --- /dev/null +++ b/back/wandb/run-20260322_141726-2yw08kz9/files/wandb-summary.json @@ -0,0 +1 @@ +{"loss_mean_cls":1.5810688734054565,"_timestamp":1.7741602540511734e+09,"_runtime":5.247627056,"loss_mean":1.6244629621505737,"proj_loss":-0.04985573887825012,"grad_norm":1.2312908172607422,"_wandb":{"runtime":5},"_step":12,"loss_final":3.1556761264801025} \ No newline at end of file diff --git a/back/wandb/run-20260322_141726-2yw08kz9/logs/debug-internal.log b/back/wandb/run-20260322_141726-2yw08kz9/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e624c9f2985d5ecbcc1b975325ec011ef0b3b72d --- /dev/null +++ b/back/wandb/run-20260322_141726-2yw08kz9/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2026-03-22T14:17:27.013311984+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"} +{"time":"2026-03-22T14:17:28.347732261+08:00","level":"INFO","msg":"stream: created new stream","id":"2yw08kz9"} +{"time":"2026-03-22T14:17:28.347960938+08:00","level":"INFO","msg":"handler: started","stream_id":"2yw08kz9"} +{"time":"2026-03-22T14:17:28.348671928+08:00","level":"INFO","msg":"stream: started","id":"2yw08kz9"} +{"time":"2026-03-22T14:17:28.348731034+08:00","level":"INFO","msg":"sender: started","stream_id":"2yw08kz9"} +{"time":"2026-03-22T14:17:28.348748525+08:00","level":"INFO","msg":"writer: started","stream_id":"2yw08kz9"} +{"time":"2026-03-22T14:17:34.316421629+08:00","level":"INFO","msg":"stream: closing","id":"2yw08kz9"} diff --git a/back/wandb/run-20260322_141726-2yw08kz9/logs/debug.log b/back/wandb/run-20260322_141726-2yw08kz9/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..1d1e39e2e9d69aab1ce034ae3eaf5956942cb10f --- /dev/null +++ b/back/wandb/run-20260322_141726-2yw08kz9/logs/debug.log @@ -0,0 +1,22 @@ +2026-03-22 14:17:26,691 INFO MainThread:316313 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0 +2026-03-22 14:17:26,691 INFO MainThread:316313 [wandb_setup.py:_flush():81] Configure stats pid to 316313 +2026-03-22 14:17:26,691 INFO MainThread:316313 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-22 14:17:26,691 INFO MainThread:316313 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_141726-2yw08kz9/logs/debug.log +2026-03-22 14:17:26,691 INFO MainThread:316313 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_141726-2yw08kz9/logs/debug-internal.log +2026-03-22 14:17:26,691 INFO MainThread:316313 [wandb_init.py:init():844] calling init triggers +2026-03-22 14:17:26,691 INFO MainThread:316313 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-03-22 14:17:26,691 INFO MainThread:316313 [wandb_init.py:init():892] starting backend +2026-03-22 14:17:26,994 INFO MainThread:316313 [wandb_init.py:init():895] sending inform_init request +2026-03-22 14:17:27,008 INFO 
MainThread:316313 [wandb_init.py:init():903] backend started and connected +2026-03-22 14:17:27,011 INFO MainThread:316313 [wandb_init.py:init():973] updated telemetry +2026-03-22 14:17:27,025 INFO MainThread:316313 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-03-22 14:17:29,067 INFO MainThread:316313 [wandb_init.py:init():1042] starting run threads in backend +2026-03-22 14:17:29,158 INFO MainThread:316313 [wandb_run.py:_console_start():2524] atexit reg +2026-03-22 14:17:29,158 INFO MainThread:316313 [wandb_run.py:_redirect():2373] redirect: wrap_raw +2026-03-22 14:17:29,158 INFO MainThread:316313 [wandb_run.py:_redirect():2442] Wrapping output streams. +2026-03-22 14:17:29,159 INFO MainThread:316313 [wandb_run.py:_redirect():2465] Redirects installed. +2026-03-22 14:17:29,163 INFO MainThread:316313 [wandb_init.py:init():1082] run started, returning control to user process +2026-03-22 14:17:29,163 INFO MainThread:316313 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 10000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.2, 't_c': 0.5, 'ot_cls': True} +2026-03-22 14:17:34,316 INFO wandb-AsyncioManager-main:316313 [service_client.py:_forward_responses():134] Reached EOF. +2026-03-22 14:17:34,316 INFO wandb-AsyncioManager-main:316313 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles. diff --git a/back/wandb/run-20260322_141726-2yw08kz9/run-2yw08kz9.wandb b/back/wandb/run-20260322_141726-2yw08kz9/run-2yw08kz9.wandb new file mode 100644 index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32 Binary files /dev/null and b/back/wandb/run-20260322_141726-2yw08kz9/run-2yw08kz9.wandb differ diff --git a/back/wandb/run-20260322_141833-vm0y8t9t/files/output.log b/back/wandb/run-20260322_141833-vm0y8t9t/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..6f81fb21903b31703cde53f59cc5bc1e405cf3bf --- /dev/null +++ b/back/wandb/run-20260322_141833-vm0y8t9t/files/output.log @@ -0,0 +1,3089 @@ +Steps: 0%| | 1/1000000 [00:01<552:16:34, 1.99s/it][2026-03-22 14:18:38] Generating EMA samples done. 
+[2026-03-22 14:18:38] Step: 1, Training Logs: loss_final: 3.278940, loss_mean: 1.706308, proj_loss: 0.001541, loss_mean_cls: 1.571091, grad_norm: 1.481672
+[2026-03-22 14:18:39] Step: 2, Training Logs: loss_final: 3.211831, loss_mean: 1.688932, proj_loss: -0.010287, loss_mean_cls: 1.533185, grad_norm: 1.055476
+[2026-03-22 14:18:39] Step: 3, Training Logs: loss_final: 3.201248, loss_mean: 1.663205, proj_loss: -0.019184, loss_mean_cls: 1.557227, grad_norm: 1.116388
+[... 3089 lines in total: per-step "Training Logs" records continue in this format through step 240 and beyond, with interleaved tqdm progress-bar redraws ("Steps: 0%| ...") omitted here; throughput holds near 4.1 it/s and loss_final falls from ~3.28 at step 1 to ~2.45 by step 240 ...]
loss_mean: 0.995916, proj_loss: -0.059266, loss_mean_cls: 1.516889, grad_norm: 2.805066 +Steps: 0%| | 241/1000000 [01:00<68:26:38, 4.06it/s, grad_norm=2.81, loss_final=2.45, loss_mean=0.996, loss_mean_cls=1.52, proj_loss=-0.0593][2026-03-22 14:19:37] Step: 241, Training Logs: loss_final: 2.361762, loss_mean: 0.995088, proj_loss: -0.057465, loss_mean_cls: 1.424138, grad_norm: 2.223625 +Steps: 0%| | 242/1000000 [01:01<68:18:55, 4.07it/s, grad_norm=2.22, loss_final=2.36, loss_mean=0.995, loss_mean_cls=1.42, proj_loss=-0.0575][2026-03-22 14:19:38] Step: 242, Training Logs: loss_final: 2.323387, loss_mean: 1.003063, proj_loss: -0.056781, loss_mean_cls: 1.377105, grad_norm: 2.503578 +Steps: 0%| | 243/1000000 [01:01<68:14:42, 4.07it/s, grad_norm=2.5, loss_final=2.32, loss_mean=1, loss_mean_cls=1.38, proj_loss=-0.0568][2026-03-22 14:19:38] Step: 243, Training Logs: loss_final: 2.414788, loss_mean: 1.010746, proj_loss: -0.057683, loss_mean_cls: 1.461725, grad_norm: 1.514548 +Steps: 0%| | 244/1000000 [01:01<68:13:20, 4.07it/s, grad_norm=1.51, loss_final=2.41, loss_mean=1.01, loss_mean_cls=1.46, proj_loss=-0.0577][2026-03-22 14:19:38] Step: 244, Training Logs: loss_final: 2.422950, loss_mean: 1.008679, proj_loss: -0.057921, loss_mean_cls: 1.472192, grad_norm: 3.362099 +Steps: 0%| | 245/1000000 [01:01<68:11:49, 4.07it/s, grad_norm=3.36, loss_final=2.42, loss_mean=1.01, loss_mean_cls=1.47, proj_loss=-0.0579][2026-03-22 14:19:38] Step: 245, Training Logs: loss_final: 2.416363, loss_mean: 0.993499, proj_loss: -0.056928, loss_mean_cls: 1.479792, grad_norm: 2.605845 +Steps: 0%| | 246/1000000 [01:02<68:09:41, 4.07it/s, grad_norm=2.61, loss_final=2.42, loss_mean=0.993, loss_mean_cls=1.48, proj_loss=-0.0569][2026-03-22 14:19:39] Step: 246, Training Logs: loss_final: 2.343595, loss_mean: 1.008899, proj_loss: -0.058427, loss_mean_cls: 1.393122, grad_norm: 2.531854 +Steps: 0%| | 247/1000000 [01:02<68:07:25, 4.08it/s, grad_norm=2.53, loss_final=2.34, loss_mean=1.01, loss_mean_cls=1.39, proj_loss=-0.0584][2026-03-22 14:19:39] Step: 247, Training Logs: loss_final: 2.351943, loss_mean: 0.998563, proj_loss: -0.059909, loss_mean_cls: 1.413290, grad_norm: 3.113782 +Steps: 0%| | 248/1000000 [01:02<68:07:06, 4.08it/s, grad_norm=3.11, loss_final=2.35, loss_mean=0.999, loss_mean_cls=1.41, proj_loss=-0.0599][2026-03-22 14:19:39] Step: 248, Training Logs: loss_final: 2.212756, loss_mean: 1.030522, proj_loss: -0.057964, loss_mean_cls: 1.240197, grad_norm: 2.526502 +Steps: 0%| | 249/1000000 [01:02<68:04:43, 4.08it/s, grad_norm=2.53, loss_final=2.21, loss_mean=1.03, loss_mean_cls=1.24, proj_loss=-0.058][2026-03-22 14:19:39] Step: 249, Training Logs: loss_final: 2.203023, loss_mean: 1.055844, proj_loss: -0.056694, loss_mean_cls: 1.203872, grad_norm: 3.054946 +Steps: 0%| | 250/1000000 [01:03<68:01:20, 4.08it/s, grad_norm=3.05, loss_final=2.2, loss_mean=1.06, loss_mean_cls=1.2, proj_loss=-0.0567][2026-03-22 14:19:40] Step: 250, Training Logs: loss_final: 2.363709, loss_mean: 1.002349, proj_loss: -0.057456, loss_mean_cls: 1.418816, grad_norm: 2.879371 +Steps: 0%| | 251/1000000 [01:03<68:00:39, 4.08it/s, grad_norm=2.88, loss_final=2.36, loss_mean=1, loss_mean_cls=1.42, proj_loss=-0.0575][2026-03-22 14:19:40] Step: 251, Training Logs: loss_final: 2.397712, loss_mean: 1.016627, proj_loss: -0.057054, loss_mean_cls: 1.438139, grad_norm: 2.226318 +Steps: 0%| | 252/1000000 [01:03<67:59:05, 4.08it/s, grad_norm=2.23, loss_final=2.4, loss_mean=1.02, loss_mean_cls=1.44, proj_loss=-0.0571][2026-03-22 14:19:40] Step: 252, Training Logs: 
loss_final: 2.351789, loss_mean: 1.022730, proj_loss: -0.059416, loss_mean_cls: 1.388475, grad_norm: 2.127080 +Steps: 0%| | 253/1000000 [01:03<67:56:51, 4.09it/s, grad_norm=2.13, loss_final=2.35, loss_mean=1.02, loss_mean_cls=1.39, proj_loss=-0.0594][2026-03-22 14:19:40] Step: 253, Training Logs: loss_final: 2.390544, loss_mean: 1.000972, proj_loss: -0.060216, loss_mean_cls: 1.449788, grad_norm: 2.974514 +Steps: 0%| | 254/1000000 [01:04<67:55:02, 4.09it/s, grad_norm=2.97, loss_final=2.39, loss_mean=1, loss_mean_cls=1.45, proj_loss=-0.0602][2026-03-22 14:19:41] Step: 254, Training Logs: loss_final: 2.349092, loss_mean: 1.001086, proj_loss: -0.057487, loss_mean_cls: 1.405493, grad_norm: 2.134796 +Steps: 0%| | 255/1000000 [01:04<67:55:26, 4.09it/s, grad_norm=2.13, loss_final=2.35, loss_mean=1, loss_mean_cls=1.41, proj_loss=-0.0575][2026-03-22 14:19:41] Step: 255, Training Logs: loss_final: 2.427955, loss_mean: 1.017070, proj_loss: -0.058159, loss_mean_cls: 1.469044, grad_norm: 3.860640 +Steps: 0%| | 256/1000000 [01:04<67:54:08, 4.09it/s, grad_norm=3.86, loss_final=2.43, loss_mean=1.02, loss_mean_cls=1.47, proj_loss=-0.0582][2026-03-22 14:19:41] Step: 256, Training Logs: loss_final: 2.400475, loss_mean: 1.007476, proj_loss: -0.058751, loss_mean_cls: 1.451750, grad_norm: 3.476752 +Steps: 0%| | 257/1000000 [01:04<67:54:07, 4.09it/s, grad_norm=3.48, loss_final=2.4, loss_mean=1.01, loss_mean_cls=1.45, proj_loss=-0.0588][2026-03-22 14:19:41] Step: 257, Training Logs: loss_final: 2.270208, loss_mean: 1.018039, proj_loss: -0.058131, loss_mean_cls: 1.310300, grad_norm: 2.966115 +Steps: 0%| | 258/1000000 [01:05<67:54:50, 4.09it/s, grad_norm=2.97, loss_final=2.27, loss_mean=1.02, loss_mean_cls=1.31, proj_loss=-0.0581][2026-03-22 14:19:42] Step: 258, Training Logs: loss_final: 2.382706, loss_mean: 1.014374, proj_loss: -0.058149, loss_mean_cls: 1.426482, grad_norm: 3.447179 +Steps: 0%| | 259/1000000 [01:05<67:54:09, 4.09it/s, grad_norm=3.45, loss_final=2.38, loss_mean=1.01, loss_mean_cls=1.43, proj_loss=-0.0581][2026-03-22 14:19:42] Step: 259, Training Logs: loss_final: 2.429102, loss_mean: 1.002499, proj_loss: -0.055362, loss_mean_cls: 1.481965, grad_norm: 3.000974 +Steps: 0%| | 260/1000000 [01:05<67:54:25, 4.09it/s, grad_norm=3, loss_final=2.43, loss_mean=1, loss_mean_cls=1.48, proj_loss=-0.0554][2026-03-22 14:19:42] Step: 260, Training Logs: loss_final: 2.396965, loss_mean: 1.013649, proj_loss: -0.055120, loss_mean_cls: 1.438436, grad_norm: 2.995438 +Steps: 0%| | 261/1000000 [01:05<67:55:50, 4.09it/s, grad_norm=3, loss_final=2.4, loss_mean=1.01, loss_mean_cls=1.44, proj_loss=-0.0551][2026-03-22 14:19:42] Step: 261, Training Logs: loss_final: 2.336987, loss_mean: 1.002369, proj_loss: -0.057848, loss_mean_cls: 1.392467, grad_norm: 2.192955 +Steps: 0%| | 262/1000000 [01:06<67:55:46, 4.09it/s, grad_norm=2.19, loss_final=2.34, loss_mean=1, loss_mean_cls=1.39, proj_loss=-0.0578][2026-03-22 14:19:43] Step: 262, Training Logs: loss_final: 2.438083, loss_mean: 1.001337, proj_loss: -0.057323, loss_mean_cls: 1.494070, grad_norm: 2.790765 +Steps: 0%| | 263/1000000 [01:06<67:55:15, 4.09it/s, grad_norm=2.79, loss_final=2.44, loss_mean=1, loss_mean_cls=1.49, proj_loss=-0.0573][2026-03-22 14:19:43] Step: 263, Training Logs: loss_final: 2.431377, loss_mean: 0.990921, proj_loss: -0.056653, loss_mean_cls: 1.497110, grad_norm: 3.174301 +Steps: 0%| | 264/1000000 [01:06<68:14:15, 4.07it/s, grad_norm=3.17, loss_final=2.43, loss_mean=0.991, loss_mean_cls=1.5, proj_loss=-0.0567][2026-03-22 14:19:43] Step: 264, Training Logs: 
loss_final: 2.320493, loss_mean: 1.025649, proj_loss: -0.057949, loss_mean_cls: 1.352793, grad_norm: 3.097336 +Steps: 0%| | 265/1000000 [01:06<68:39:03, 4.05it/s, grad_norm=3.1, loss_final=2.32, loss_mean=1.03, loss_mean_cls=1.35, proj_loss=-0.0579][2026-03-22 14:19:43] Step: 265, Training Logs: loss_final: 2.306084, loss_mean: 1.014024, proj_loss: -0.059594, loss_mean_cls: 1.351654, grad_norm: 2.227662 +Steps: 0%| | 266/1000000 [01:07<68:24:42, 4.06it/s, grad_norm=2.23, loss_final=2.31, loss_mean=1.01, loss_mean_cls=1.35, proj_loss=-0.0596][2026-03-22 14:19:43] Step: 266, Training Logs: loss_final: 2.376215, loss_mean: 0.993122, proj_loss: -0.060688, loss_mean_cls: 1.443781, grad_norm: 2.954054 +Steps: 0%| | 267/1000000 [01:07<68:58:39, 4.03it/s, grad_norm=2.95, loss_final=2.38, loss_mean=0.993, loss_mean_cls=1.44, proj_loss=-0.0607][2026-03-22 14:19:44] Step: 267, Training Logs: loss_final: 2.269247, loss_mean: 1.030942, proj_loss: -0.058567, loss_mean_cls: 1.296872, grad_norm: 2.344165 +Steps: 0%| | 268/1000000 [01:07<68:40:56, 4.04it/s, grad_norm=2.34, loss_final=2.27, loss_mean=1.03, loss_mean_cls=1.3, proj_loss=-0.0586][2026-03-22 14:19:44] Step: 268, Training Logs: loss_final: 2.456507, loss_mean: 0.976977, proj_loss: -0.056921, loss_mean_cls: 1.536451, grad_norm: 2.023528 +Steps: 0%| | 269/1000000 [01:07<68:26:45, 4.06it/s, grad_norm=2.02, loss_final=2.46, loss_mean=0.977, loss_mean_cls=1.54, proj_loss=-0.0569][2026-03-22 14:19:44] Step: 269, Training Logs: loss_final: 2.354723, loss_mean: 0.986322, proj_loss: -0.058110, loss_mean_cls: 1.426511, grad_norm: 3.180856 +Steps: 0%| | 270/1000000 [01:08<68:16:38, 4.07it/s, grad_norm=3.18, loss_final=2.35, loss_mean=0.986, loss_mean_cls=1.43, proj_loss=-0.0581][2026-03-22 14:19:44] Step: 270, Training Logs: loss_final: 2.280562, loss_mean: 1.005617, proj_loss: -0.059017, loss_mean_cls: 1.333962, grad_norm: 2.684496 +Steps: 0%| | 271/1000000 [01:08<68:14:37, 4.07it/s, grad_norm=2.68, loss_final=2.28, loss_mean=1.01, loss_mean_cls=1.33, proj_loss=-0.059][2026-03-22 14:19:45] Step: 271, Training Logs: loss_final: 2.317871, loss_mean: 1.012842, proj_loss: -0.058912, loss_mean_cls: 1.363941, grad_norm: 3.036946 +Steps: 0%| | 272/1000000 [01:08<68:08:45, 4.08it/s, grad_norm=3.04, loss_final=2.32, loss_mean=1.01, loss_mean_cls=1.36, proj_loss=-0.0589][2026-03-22 14:19:45] Step: 272, Training Logs: loss_final: 2.385137, loss_mean: 1.000553, proj_loss: -0.059777, loss_mean_cls: 1.444361, grad_norm: 3.539759 +Steps: 0%| | 273/1000000 [01:08<68:07:35, 4.08it/s, grad_norm=3.54, loss_final=2.39, loss_mean=1, loss_mean_cls=1.44, proj_loss=-0.0598][2026-03-22 14:19:45] Step: 273, Training Logs: loss_final: 2.367022, loss_mean: 0.994028, proj_loss: -0.056444, loss_mean_cls: 1.429439, grad_norm: 2.172052 +Steps: 0%| | 274/1000000 [01:09<68:13:37, 4.07it/s, grad_norm=2.17, loss_final=2.37, loss_mean=0.994, loss_mean_cls=1.43, proj_loss=-0.0564][2026-03-22 14:19:45] Step: 274, Training Logs: loss_final: 2.203403, loss_mean: 1.002820, proj_loss: -0.057796, loss_mean_cls: 1.258380, grad_norm: 2.657309 +Steps: 0%| | 275/1000000 [01:09<68:24:01, 4.06it/s, grad_norm=2.66, loss_final=2.2, loss_mean=1, loss_mean_cls=1.26, proj_loss=-0.0578][2026-03-22 14:19:46] Step: 275, Training Logs: loss_final: 2.383395, loss_mean: 0.976105, proj_loss: -0.057104, loss_mean_cls: 1.464394, grad_norm: 2.328879 +Steps: 0%| | 276/1000000 [01:09<68:19:26, 4.06it/s, grad_norm=2.33, loss_final=2.38, loss_mean=0.976, loss_mean_cls=1.46, proj_loss=-0.0571][2026-03-22 14:19:46] Step: 
276, Training Logs: loss_final: 2.280921, loss_mean: 1.005401, proj_loss: -0.058190, loss_mean_cls: 1.333710, grad_norm: 2.776584 +Steps: 0%| | 277/1000000 [01:09<68:14:25, 4.07it/s, grad_norm=2.78, loss_final=2.28, loss_mean=1.01, loss_mean_cls=1.33, proj_loss=-0.0582][2026-03-22 14:19:46] Step: 277, Training Logs: loss_final: 2.292071, loss_mean: 1.007384, proj_loss: -0.058793, loss_mean_cls: 1.343479, grad_norm: 2.823737 +Steps: 0%| | 278/1000000 [01:09<68:10:58, 4.07it/s, grad_norm=2.82, loss_final=2.29, loss_mean=1.01, loss_mean_cls=1.34, proj_loss=-0.0588][2026-03-22 14:19:46] Step: 278, Training Logs: loss_final: 2.310614, loss_mean: 1.000927, proj_loss: -0.057527, loss_mean_cls: 1.367213, grad_norm: 4.344402 +Steps: 0%| | 279/1000000 [01:10<68:07:45, 4.08it/s, grad_norm=4.34, loss_final=2.31, loss_mean=1, loss_mean_cls=1.37, proj_loss=-0.0575][2026-03-22 14:19:47] Step: 279, Training Logs: loss_final: 2.314987, loss_mean: 1.010582, proj_loss: -0.058157, loss_mean_cls: 1.362563, grad_norm: 3.820344 +Steps: 0%| | 280/1000000 [01:10<68:07:45, 4.08it/s, grad_norm=3.82, loss_final=2.31, loss_mean=1.01, loss_mean_cls=1.36, proj_loss=-0.0582][2026-03-22 14:19:47] Step: 280, Training Logs: loss_final: 2.535098, loss_mean: 0.972374, proj_loss: -0.056190, loss_mean_cls: 1.618913, grad_norm: 3.632533 +Steps: 0%| | 281/1000000 [01:10<68:04:56, 4.08it/s, grad_norm=3.63, loss_final=2.54, loss_mean=0.972, loss_mean_cls=1.62, proj_loss=-0.0562][2026-03-22 14:19:47] Step: 281, Training Logs: loss_final: 2.372003, loss_mean: 0.994459, proj_loss: -0.056867, loss_mean_cls: 1.434410, grad_norm: 2.173905 +Steps: 0%| | 282/1000000 [01:10<68:01:02, 4.08it/s, grad_norm=2.17, loss_final=2.37, loss_mean=0.994, loss_mean_cls=1.43, proj_loss=-0.0569][2026-03-22 14:19:47] Step: 282, Training Logs: loss_final: 2.322843, loss_mean: 0.993267, proj_loss: -0.056794, loss_mean_cls: 1.386369, grad_norm: 3.607821 +Steps: 0%| | 283/1000000 [01:11<67:56:48, 4.09it/s, grad_norm=3.61, loss_final=2.32, loss_mean=0.993, loss_mean_cls=1.39, proj_loss=-0.0568][2026-03-22 14:19:48] Step: 283, Training Logs: loss_final: 2.340357, loss_mean: 1.018130, proj_loss: -0.058078, loss_mean_cls: 1.380305, grad_norm: 2.773349 +Steps: 0%| | 284/1000000 [01:11<67:55:37, 4.09it/s, grad_norm=2.77, loss_final=2.34, loss_mean=1.02, loss_mean_cls=1.38, proj_loss=-0.0581][2026-03-22 14:19:48] Step: 284, Training Logs: loss_final: 2.395195, loss_mean: 0.988437, proj_loss: -0.058098, loss_mean_cls: 1.464856, grad_norm: 3.215780 +Steps: 0%| | 285/1000000 [01:11<67:52:57, 4.09it/s, grad_norm=3.22, loss_final=2.4, loss_mean=0.988, loss_mean_cls=1.46, proj_loss=-0.0581][2026-03-22 14:19:48] Step: 285, Training Logs: loss_final: 2.393924, loss_mean: 0.991772, proj_loss: -0.056639, loss_mean_cls: 1.458791, grad_norm: 2.378118 +Steps: 0%| | 286/1000000 [01:11<67:52:40, 4.09it/s, grad_norm=2.38, loss_final=2.39, loss_mean=0.992, loss_mean_cls=1.46, proj_loss=-0.0566][2026-03-22 14:19:48] Step: 286, Training Logs: loss_final: 2.278171, loss_mean: 1.024379, proj_loss: -0.057191, loss_mean_cls: 1.310983, grad_norm: 3.103330 +Steps: 0%| | 287/1000000 [01:12<67:53:51, 4.09it/s, grad_norm=3.1, loss_final=2.28, loss_mean=1.02, loss_mean_cls=1.31, proj_loss=-0.0572][2026-03-22 14:19:49] Step: 287, Training Logs: loss_final: 2.262352, loss_mean: 1.008808, proj_loss: -0.059562, loss_mean_cls: 1.313106, grad_norm: 2.913167 +Steps: 0%| | 288/1000000 [01:12<68:03:31, 4.08it/s, grad_norm=2.91, loss_final=2.26, loss_mean=1.01, loss_mean_cls=1.31, 
proj_loss=-0.0596][2026-03-22 14:19:49] Step: 288, Training Logs: loss_final: 2.325700, loss_mean: 0.993052, proj_loss: -0.058921, loss_mean_cls: 1.391568, grad_norm: 2.902373 +Steps: 0%| | 289/1000000 [01:12<68:01:06, 4.08it/s, grad_norm=2.9, loss_final=2.33, loss_mean=0.993, loss_mean_cls=1.39, proj_loss=-0.0589][2026-03-22 14:19:49] Step: 289, Training Logs: loss_final: 2.482008, loss_mean: 1.002765, proj_loss: -0.058991, loss_mean_cls: 1.538233, grad_norm: 4.948828 +Steps: 0%| | 290/1000000 [01:12<67:57:34, 4.09it/s, grad_norm=4.95, loss_final=2.48, loss_mean=1, loss_mean_cls=1.54, proj_loss=-0.059][2026-03-22 14:19:49] Step: 290, Training Logs: loss_final: 2.289652, loss_mean: 0.999615, proj_loss: -0.058107, loss_mean_cls: 1.348145, grad_norm: 3.820845 +Steps: 0%| | 291/1000000 [01:13<67:54:46, 4.09it/s, grad_norm=3.82, loss_final=2.29, loss_mean=1, loss_mean_cls=1.35, proj_loss=-0.0581][2026-03-22 14:19:50] Step: 291, Training Logs: loss_final: 2.299533, loss_mean: 1.008915, proj_loss: -0.056736, loss_mean_cls: 1.347354, grad_norm: 3.051347 +Steps: 0%| | 292/1000000 [01:13<67:53:32, 4.09it/s, grad_norm=3.05, loss_final=2.3, loss_mean=1.01, loss_mean_cls=1.35, proj_loss=-0.0567][2026-03-22 14:19:50] Step: 292, Training Logs: loss_final: 2.278351, loss_mean: 0.998127, proj_loss: -0.058568, loss_mean_cls: 1.338792, grad_norm: 2.038716 +Steps: 0%| | 293/1000000 [01:13<67:54:25, 4.09it/s, grad_norm=2.04, loss_final=2.28, loss_mean=0.998, loss_mean_cls=1.34, proj_loss=-0.0586][2026-03-22 14:19:50] Step: 293, Training Logs: loss_final: 2.347316, loss_mean: 1.012432, proj_loss: -0.057342, loss_mean_cls: 1.392226, grad_norm: 5.837279 +Steps: 0%| | 294/1000000 [01:13<67:55:51, 4.09it/s, grad_norm=5.84, loss_final=2.35, loss_mean=1.01, loss_mean_cls=1.39, proj_loss=-0.0573][2026-03-22 14:19:50] Step: 294, Training Logs: loss_final: 2.319334, loss_mean: 1.021003, proj_loss: -0.057105, loss_mean_cls: 1.355437, grad_norm: 3.573577 +Steps: 0%| | 295/1000000 [01:14<68:02:13, 4.08it/s, grad_norm=3.57, loss_final=2.32, loss_mean=1.02, loss_mean_cls=1.36, proj_loss=-0.0571][2026-03-22 14:19:51] Step: 295, Training Logs: loss_final: 2.328225, loss_mean: 1.034366, proj_loss: -0.058078, loss_mean_cls: 1.351937, grad_norm: 7.205377 +Steps: 0%| | 296/1000000 [01:14<68:01:02, 4.08it/s, grad_norm=7.21, loss_final=2.33, loss_mean=1.03, loss_mean_cls=1.35, proj_loss=-0.0581][2026-03-22 14:19:51] Step: 296, Training Logs: loss_final: 2.216763, loss_mean: 1.029431, proj_loss: -0.058705, loss_mean_cls: 1.246038, grad_norm: 5.000008 +Steps: 0%| | 297/1000000 [01:14<67:56:24, 4.09it/s, grad_norm=5, loss_final=2.22, loss_mean=1.03, loss_mean_cls=1.25, proj_loss=-0.0587][2026-03-22 14:19:51] Step: 297, Training Logs: loss_final: 2.211247, loss_mean: 1.033921, proj_loss: -0.056669, loss_mean_cls: 1.233996, grad_norm: 5.174049 +Steps: 0%| | 298/1000000 [01:14<67:54:34, 4.09it/s, grad_norm=5.17, loss_final=2.21, loss_mean=1.03, loss_mean_cls=1.23, proj_loss=-0.0567][2026-03-22 14:19:51] Step: 298, Training Logs: loss_final: 2.321795, loss_mean: 1.023356, proj_loss: -0.058767, loss_mean_cls: 1.357206, grad_norm: 4.293063 +Steps: 0%| | 299/1000000 [01:15<84:08:37, 3.30it/s, grad_norm=4.29, loss_final=2.32, loss_mean=1.02, loss_mean_cls=1.36, proj_loss=-0.0588][2026-03-22 14:19:52] Step: 299, Training Logs: loss_final: 2.228771, loss_mean: 1.017856, proj_loss: -0.058224, loss_mean_cls: 1.269140, grad_norm: 2.614931 +Steps: 0%| | 300/1000000 [01:15<80:10:00, 3.46it/s, grad_norm=2.61, loss_final=2.23, loss_mean=1.02, 
loss_mean_cls=1.27, proj_loss=-0.0582][2026-03-22 14:19:52] Step: 300, Training Logs: loss_final: 2.346102, loss_mean: 1.000178, proj_loss: -0.056717, loss_mean_cls: 1.402641, grad_norm: 4.080683 +Steps: 0%| | 301/1000000 [01:15<76:29:20, 3.63it/s, grad_norm=4.08, loss_final=2.35, loss_mean=1, loss_mean_cls=1.4, proj_loss=-0.0567][2026-03-22 14:19:52] Step: 301, Training Logs: loss_final: 2.251483, loss_mean: 1.025164, proj_loss: -0.059692, loss_mean_cls: 1.286011, grad_norm: 3.032129 +Steps: 0%| | 302/1000000 [01:16<74:10:55, 3.74it/s, grad_norm=3.03, loss_final=2.25, loss_mean=1.03, loss_mean_cls=1.29, proj_loss=-0.0597][2026-03-22 14:19:53] Step: 302, Training Logs: loss_final: 2.288393, loss_mean: 1.017370, proj_loss: -0.056415, loss_mean_cls: 1.327437, grad_norm: 3.804932 +Steps: 0%| | 303/1000000 [01:16<72:16:33, 3.84it/s, grad_norm=3.8, loss_final=2.29, loss_mean=1.02, loss_mean_cls=1.33, proj_loss=-0.0564][2026-03-22 14:19:53] Step: 303, Training Logs: loss_final: 2.239770, loss_mean: 1.006860, proj_loss: -0.057436, loss_mean_cls: 1.290346, grad_norm: 3.158314 +Steps: 0%| | 304/1000000 [01:16<70:59:04, 3.91it/s, grad_norm=3.16, loss_final=2.24, loss_mean=1.01, loss_mean_cls=1.29, proj_loss=-0.0574][2026-03-22 14:19:53] Step: 304, Training Logs: loss_final: 2.365096, loss_mean: 0.995754, proj_loss: -0.056953, loss_mean_cls: 1.426295, grad_norm: 2.791248 +Steps: 0%| | 305/1000000 [01:16<70:01:11, 3.97it/s, grad_norm=2.79, loss_final=2.37, loss_mean=0.996, loss_mean_cls=1.43, proj_loss=-0.057][2026-03-22 14:19:53] Step: 305, Training Logs: loss_final: 2.225269, loss_mean: 1.002861, proj_loss: -0.061507, loss_mean_cls: 1.283915, grad_norm: 3.372386 +Steps: 0%| | 306/1000000 [01:17<69:22:36, 4.00it/s, grad_norm=3.37, loss_final=2.23, loss_mean=1, loss_mean_cls=1.28, proj_loss=-0.0615][2026-03-22 14:19:53] Step: 306, Training Logs: loss_final: 2.287671, loss_mean: 1.003590, proj_loss: -0.057052, loss_mean_cls: 1.341133, grad_norm: 2.397296 +Steps: 0%| | 307/1000000 [01:17<68:54:38, 4.03it/s, grad_norm=2.4, loss_final=2.29, loss_mean=1, loss_mean_cls=1.34, proj_loss=-0.0571][2026-03-22 14:19:54] Step: 307, Training Logs: loss_final: 2.280365, loss_mean: 1.008361, proj_loss: -0.055503, loss_mean_cls: 1.327507, grad_norm: 2.913342 +Steps: 0%| | 308/1000000 [01:17<68:33:57, 4.05it/s, grad_norm=2.91, loss_final=2.28, loss_mean=1.01, loss_mean_cls=1.33, proj_loss=-0.0555][2026-03-22 14:19:54] Step: 308, Training Logs: loss_final: 2.336535, loss_mean: 1.002016, proj_loss: -0.057126, loss_mean_cls: 1.391645, grad_norm: 2.155728 +Steps: 0%| | 309/1000000 [01:17<68:21:23, 4.06it/s, grad_norm=2.16, loss_final=2.34, loss_mean=1, loss_mean_cls=1.39, proj_loss=-0.0571][2026-03-22 14:19:54] Step: 309, Training Logs: loss_final: 2.396993, loss_mean: 0.970719, proj_loss: -0.057805, loss_mean_cls: 1.484079, grad_norm: 2.728538 +Steps: 0%| | 310/1000000 [01:18<68:19:46, 4.06it/s, grad_norm=2.73, loss_final=2.4, loss_mean=0.971, loss_mean_cls=1.48, proj_loss=-0.0578][2026-03-22 14:19:54] Step: 310, Training Logs: loss_final: 2.212172, loss_mean: 1.025991, proj_loss: -0.058993, loss_mean_cls: 1.245174, grad_norm: 2.509374 +Steps: 0%| | 311/1000000 [01:18<68:10:41, 4.07it/s, grad_norm=2.51, loss_final=2.21, loss_mean=1.03, loss_mean_cls=1.25, proj_loss=-0.059][2026-03-22 14:19:55] Step: 311, Training Logs: loss_final: 2.262700, loss_mean: 1.008095, proj_loss: -0.059353, loss_mean_cls: 1.313958, grad_norm: 3.918103 +Steps: 0%| | 312/1000000 [01:18<68:05:19, 4.08it/s, grad_norm=3.92, loss_final=2.26, 
loss_mean=1.01, loss_mean_cls=1.31, proj_loss=-0.0594][2026-03-22 14:19:55] Step: 312, Training Logs: loss_final: 2.335941, loss_mean: 0.995164, proj_loss: -0.058907, loss_mean_cls: 1.399683, grad_norm: 2.803473 +Steps: 0%| | 313/1000000 [01:18<68:19:34, 4.06it/s, grad_norm=2.8, loss_final=2.34, loss_mean=0.995, loss_mean_cls=1.4, proj_loss=-0.0589][2026-03-22 14:19:55] Step: 313, Training Logs: loss_final: 2.336644, loss_mean: 1.014890, proj_loss: -0.059322, loss_mean_cls: 1.381076, grad_norm: 2.513654 +Steps: 0%| | 314/1000000 [01:19<68:11:39, 4.07it/s, grad_norm=2.51, loss_final=2.34, loss_mean=1.01, loss_mean_cls=1.38, proj_loss=-0.0593][2026-03-22 14:19:55] Step: 314, Training Logs: loss_final: 2.444101, loss_mean: 0.968513, proj_loss: -0.057992, loss_mean_cls: 1.533580, grad_norm: 2.897161 +Steps: 0%| | 315/1000000 [01:19<68:06:36, 4.08it/s, grad_norm=2.9, loss_final=2.44, loss_mean=0.969, loss_mean_cls=1.53, proj_loss=-0.058][2026-03-22 14:19:56] Step: 315, Training Logs: loss_final: 2.232019, loss_mean: 1.049551, proj_loss: -0.056831, loss_mean_cls: 1.239298, grad_norm: 5.832131 +Steps: 0%| | 316/1000000 [01:19<68:02:02, 4.08it/s, grad_norm=5.83, loss_final=2.23, loss_mean=1.05, loss_mean_cls=1.24, proj_loss=-0.0568][2026-03-22 14:19:56] Step: 316, Training Logs: loss_final: 2.431592, loss_mean: 0.985503, proj_loss: -0.059202, loss_mean_cls: 1.505290, grad_norm: 3.469964 +Steps: 0%| | 317/1000000 [01:19<68:57:16, 4.03it/s, grad_norm=3.47, loss_final=2.43, loss_mean=0.986, loss_mean_cls=1.51, proj_loss=-0.0592][2026-03-22 14:19:56] Step: 317, Training Logs: loss_final: 2.200737, loss_mean: 1.027445, proj_loss: -0.057263, loss_mean_cls: 1.230556, grad_norm: 5.898205 +Steps: 0%| | 318/1000000 [01:19<68:36:49, 4.05it/s, grad_norm=5.9, loss_final=2.2, loss_mean=1.03, loss_mean_cls=1.23, proj_loss=-0.0573][2026-03-22 14:19:56] Step: 318, Training Logs: loss_final: 2.386936, loss_mean: 1.001455, proj_loss: -0.060526, loss_mean_cls: 1.446006, grad_norm: 5.647823 +Steps: 0%| | 319/1000000 [01:20<68:27:45, 4.06it/s, grad_norm=5.65, loss_final=2.39, loss_mean=1, loss_mean_cls=1.45, proj_loss=-0.0605][2026-03-22 14:19:57] Step: 319, Training Logs: loss_final: 2.332057, loss_mean: 0.991405, proj_loss: -0.055957, loss_mean_cls: 1.396610, grad_norm: 3.541842 +Steps: 0%| | 320/1000000 [01:20<68:31:33, 4.05it/s, grad_norm=3.54, loss_final=2.33, loss_mean=0.991, loss_mean_cls=1.4, proj_loss=-0.056][2026-03-22 14:19:57] Step: 320, Training Logs: loss_final: 2.367045, loss_mean: 0.980793, proj_loss: -0.058760, loss_mean_cls: 1.445013, grad_norm: 4.161117 +Steps: 0%| | 321/1000000 [01:20<68:18:59, 4.06it/s, grad_norm=4.16, loss_final=2.37, loss_mean=0.981, loss_mean_cls=1.45, proj_loss=-0.0588][2026-03-22 14:19:57] Step: 321, Training Logs: loss_final: 2.473254, loss_mean: 0.968888, proj_loss: -0.056004, loss_mean_cls: 1.560370, grad_norm: 4.145257 +Steps: 0%| | 322/1000000 [01:20<68:23:54, 4.06it/s, grad_norm=4.15, loss_final=2.47, loss_mean=0.969, loss_mean_cls=1.56, proj_loss=-0.056][2026-03-22 14:19:57] Step: 322, Training Logs: loss_final: 2.301666, loss_mean: 0.985842, proj_loss: -0.058058, loss_mean_cls: 1.373882, grad_norm: 3.224638 +Steps: 0%| | 323/1000000 [01:21<68:30:35, 4.05it/s, grad_norm=3.22, loss_final=2.3, loss_mean=0.986, loss_mean_cls=1.37, proj_loss=-0.0581][2026-03-22 14:19:58] Step: 323, Training Logs: loss_final: 2.215894, loss_mean: 0.998158, proj_loss: -0.055524, loss_mean_cls: 1.273260, grad_norm: 4.259904 +Steps: 0%| | 324/1000000 [01:21<68:19:10, 4.06it/s, grad_norm=4.26, 
loss_final=2.22, loss_mean=0.998, loss_mean_cls=1.27, proj_loss=-0.0555][2026-03-22 14:19:58] Step: 324, Training Logs: loss_final: 2.318767, loss_mean: 0.996455, proj_loss: -0.058972, loss_mean_cls: 1.381285, grad_norm: 3.615956 +Steps: 0%| | 325/1000000 [01:21<68:10:37, 4.07it/s, grad_norm=3.62, loss_final=2.32, loss_mean=0.996, loss_mean_cls=1.38, proj_loss=-0.059][2026-03-22 14:19:58] Step: 325, Training Logs: loss_final: 2.330206, loss_mean: 0.994062, proj_loss: -0.057910, loss_mean_cls: 1.394055, grad_norm: 4.636720 +Steps: 0%| | 326/1000000 [01:21<68:05:18, 4.08it/s, grad_norm=4.64, loss_final=2.33, loss_mean=0.994, loss_mean_cls=1.39, proj_loss=-0.0579][2026-03-22 14:19:58] Step: 326, Training Logs: loss_final: 2.360142, loss_mean: 0.991292, proj_loss: -0.058925, loss_mean_cls: 1.427775, grad_norm: 2.939858 +Steps: 0%| | 327/1000000 [01:22<68:00:15, 4.08it/s, grad_norm=2.94, loss_final=2.36, loss_mean=0.991, loss_mean_cls=1.43, proj_loss=-0.0589][2026-03-22 14:19:59] Step: 327, Training Logs: loss_final: 2.219021, loss_mean: 1.009099, proj_loss: -0.057898, loss_mean_cls: 1.267821, grad_norm: 4.148186 +Steps: 0%| | 328/1000000 [01:22<67:59:16, 4.08it/s, grad_norm=4.15, loss_final=2.22, loss_mean=1.01, loss_mean_cls=1.27, proj_loss=-0.0579][2026-03-22 14:19:59] Step: 328, Training Logs: loss_final: 2.337511, loss_mean: 0.988247, proj_loss: -0.057530, loss_mean_cls: 1.406793, grad_norm: 3.598891 +Steps: 0%| | 329/1000000 [01:22<67:56:02, 4.09it/s, grad_norm=3.6, loss_final=2.34, loss_mean=0.988, loss_mean_cls=1.41, proj_loss=-0.0575][2026-03-22 14:19:59] Step: 329, Training Logs: loss_final: 2.266762, loss_mean: 0.990178, proj_loss: -0.059707, loss_mean_cls: 1.336292, grad_norm: 3.699545 +Steps: 0%| | 330/1000000 [01:22<67:56:09, 4.09it/s, grad_norm=3.7, loss_final=2.27, loss_mean=0.99, loss_mean_cls=1.34, proj_loss=-0.0597][2026-03-22 14:19:59] Step: 330, Training Logs: loss_final: 2.347367, loss_mean: 0.982749, proj_loss: -0.057294, loss_mean_cls: 1.421912, grad_norm: 3.925313 +Steps: 0%| | 331/1000000 [01:23<67:54:03, 4.09it/s, grad_norm=3.93, loss_final=2.35, loss_mean=0.983, loss_mean_cls=1.42, proj_loss=-0.0573][2026-03-22 14:20:00] Step: 331, Training Logs: loss_final: 2.331422, loss_mean: 0.980784, proj_loss: -0.058702, loss_mean_cls: 1.409341, grad_norm: 3.162480 +Steps: 0%| | 332/1000000 [01:23<67:54:44, 4.09it/s, grad_norm=3.16, loss_final=2.33, loss_mean=0.981, loss_mean_cls=1.41, proj_loss=-0.0587][2026-03-22 14:20:00] Step: 332, Training Logs: loss_final: 2.464796, loss_mean: 0.996701, proj_loss: -0.056373, loss_mean_cls: 1.524467, grad_norm: 5.255553 +Steps: 0%| | 333/1000000 [01:23<67:53:49, 4.09it/s, grad_norm=5.26, loss_final=2.46, loss_mean=0.997, loss_mean_cls=1.52, proj_loss=-0.0564][2026-03-22 14:20:00] Step: 333, Training Logs: loss_final: 2.344169, loss_mean: 1.023771, proj_loss: -0.058466, loss_mean_cls: 1.378864, grad_norm: 4.616671 +Steps: 0%| | 334/1000000 [01:23<67:52:24, 4.09it/s, grad_norm=4.62, loss_final=2.34, loss_mean=1.02, loss_mean_cls=1.38, proj_loss=-0.0585][2026-03-22 14:20:00] Step: 334, Training Logs: loss_final: 2.355217, loss_mean: 0.993954, proj_loss: -0.060414, loss_mean_cls: 1.421678, grad_norm: 4.004363 +Steps: 0%| | 335/1000000 [01:24<67:50:00, 4.09it/s, grad_norm=4, loss_final=2.36, loss_mean=0.994, loss_mean_cls=1.42, proj_loss=-0.0604][2026-03-22 14:20:01] Step: 335, Training Logs: loss_final: 2.255792, loss_mean: 1.038731, proj_loss: -0.061038, loss_mean_cls: 1.278099, grad_norm: 3.962729 +Steps: 0%| | 336/1000000 [01:24<67:52:32, 
4.09it/s, grad_norm=3.96, loss_final=2.26, loss_mean=1.04, loss_mean_cls=1.28, proj_loss=-0.061][2026-03-22 14:20:01] Step: 336, Training Logs: loss_final: 2.379284, loss_mean: 1.004497, proj_loss: -0.058064, loss_mean_cls: 1.432851, grad_norm: 4.164021 +Steps: 0%| | 337/1000000 [01:24<67:53:17, 4.09it/s, grad_norm=4.16, loss_final=2.38, loss_mean=1, loss_mean_cls=1.43, proj_loss=-0.0581][2026-03-22 14:20:01] Step: 337, Training Logs: loss_final: 2.293793, loss_mean: 1.014095, proj_loss: -0.057547, loss_mean_cls: 1.337245, grad_norm: 3.843891 +Steps: 0%| | 338/1000000 [01:24<67:55:07, 4.09it/s, grad_norm=3.84, loss_final=2.29, loss_mean=1.01, loss_mean_cls=1.34, proj_loss=-0.0575][2026-03-22 14:20:01] Step: 338, Training Logs: loss_final: 2.340591, loss_mean: 1.020793, proj_loss: -0.059370, loss_mean_cls: 1.379168, grad_norm: 5.069335 +Steps: 0%| | 339/1000000 [01:25<67:56:02, 4.09it/s, grad_norm=5.07, loss_final=2.34, loss_mean=1.02, loss_mean_cls=1.38, proj_loss=-0.0594][2026-03-22 14:20:02] Step: 339, Training Logs: loss_final: 2.314798, loss_mean: 0.999987, proj_loss: -0.059062, loss_mean_cls: 1.373873, grad_norm: 4.319807 +Steps: 0%| | 340/1000000 [01:25<67:58:55, 4.08it/s, grad_norm=4.32, loss_final=2.31, loss_mean=1, loss_mean_cls=1.37, proj_loss=-0.0591][2026-03-22 14:20:02] Step: 340, Training Logs: loss_final: 2.286497, loss_mean: 1.015856, proj_loss: -0.057056, loss_mean_cls: 1.327698, grad_norm: 4.498700 +Steps: 0%| | 341/1000000 [01:25<68:01:05, 4.08it/s, grad_norm=4.5, loss_final=2.29, loss_mean=1.02, loss_mean_cls=1.33, proj_loss=-0.0571][2026-03-22 14:20:02] Step: 341, Training Logs: loss_final: 2.231526, loss_mean: 1.010389, proj_loss: -0.058065, loss_mean_cls: 1.279203, grad_norm: 4.208199 +Steps: 0%| | 342/1000000 [01:25<68:01:46, 4.08it/s, grad_norm=4.21, loss_final=2.23, loss_mean=1.01, loss_mean_cls=1.28, proj_loss=-0.0581][2026-03-22 14:20:02] Step: 342, Training Logs: loss_final: 2.238235, loss_mean: 1.011418, proj_loss: -0.055820, loss_mean_cls: 1.282637, grad_norm: 3.504606 +Steps: 0%| | 343/1000000 [01:26<67:56:51, 4.09it/s, grad_norm=3.5, loss_final=2.24, loss_mean=1.01, loss_mean_cls=1.28, proj_loss=-0.0558][2026-03-22 14:20:03] Step: 343, Training Logs: loss_final: 2.337071, loss_mean: 0.979552, proj_loss: -0.059280, loss_mean_cls: 1.416799, grad_norm: 2.866694 +Steps: 0%| | 344/1000000 [01:26<67:56:13, 4.09it/s, grad_norm=2.87, loss_final=2.34, loss_mean=0.98, loss_mean_cls=1.42, proj_loss=-0.0593][2026-03-22 14:20:03] Step: 344, Training Logs: loss_final: 2.282130, loss_mean: 0.999257, proj_loss: -0.057402, loss_mean_cls: 1.340275, grad_norm: 3.516690 +Steps: 0%| | 345/1000000 [01:26<67:53:46, 4.09it/s, grad_norm=3.52, loss_final=2.28, loss_mean=0.999, loss_mean_cls=1.34, proj_loss=-0.0574][2026-03-22 14:20:03] Step: 345, Training Logs: loss_final: 2.182292, loss_mean: 1.015092, proj_loss: -0.060132, loss_mean_cls: 1.227332, grad_norm: 2.809801 +Steps: 0%| | 346/1000000 [01:26<67:51:55, 4.09it/s, grad_norm=2.81, loss_final=2.18, loss_mean=1.02, loss_mean_cls=1.23, proj_loss=-0.0601][2026-03-22 14:20:03] Step: 346, Training Logs: loss_final: 2.301151, loss_mean: 0.988171, proj_loss: -0.055866, loss_mean_cls: 1.368846, grad_norm: 4.718043 +Steps: 0%| | 347/1000000 [01:27<67:51:07, 4.09it/s, grad_norm=4.72, loss_final=2.3, loss_mean=0.988, loss_mean_cls=1.37, proj_loss=-0.0559][2026-03-22 14:20:04] Step: 347, Training Logs: loss_final: 2.322074, loss_mean: 1.015082, proj_loss: -0.057425, loss_mean_cls: 1.364418, grad_norm: 3.245585 +Steps: 0%| | 348/1000000 
[01:27<67:50:50, 4.09it/s, grad_norm=3.25, loss_final=2.32, loss_mean=1.02, loss_mean_cls=1.36, proj_loss=-0.0574][2026-03-22 14:20:04] Step: 348, Training Logs: loss_final: 2.294516, loss_mean: 1.013337, proj_loss: -0.059819, loss_mean_cls: 1.340997, grad_norm: 4.094025 +Steps: 0%| | 349/1000000 [01:27<67:50:48, 4.09it/s, grad_norm=4.09, loss_final=2.29, loss_mean=1.01, loss_mean_cls=1.34, proj_loss=-0.0598][2026-03-22 14:20:04] Step: 349, Training Logs: loss_final: 2.422338, loss_mean: 0.993739, proj_loss: -0.057873, loss_mean_cls: 1.486472, grad_norm: 4.767544 +Steps: 0%| | 350/1000000 [01:27<67:49:40, 4.09it/s, grad_norm=4.77, loss_final=2.42, loss_mean=0.994, loss_mean_cls=1.49, proj_loss=-0.0579][2026-03-22 14:20:04] Step: 350, Training Logs: loss_final: 2.409947, loss_mean: 0.964686, proj_loss: -0.057379, loss_mean_cls: 1.502640, grad_norm: 2.970408 +Steps: 0%| | 351/1000000 [01:28<67:48:58, 4.09it/s, grad_norm=2.97, loss_final=2.41, loss_mean=0.965, loss_mean_cls=1.5, proj_loss=-0.0574][2026-03-22 14:20:05] Step: 351, Training Logs: loss_final: 2.189398, loss_mean: 1.010357, proj_loss: -0.056696, loss_mean_cls: 1.235737, grad_norm: 4.322056 +Steps: 0%| | 352/1000000 [01:28<67:52:36, 4.09it/s, grad_norm=4.32, loss_final=2.19, loss_mean=1.01, loss_mean_cls=1.24, proj_loss=-0.0567][2026-03-22 14:20:05] Step: 352, Training Logs: loss_final: 2.392604, loss_mean: 0.965816, proj_loss: -0.058932, loss_mean_cls: 1.485720, grad_norm: 4.565329 +Steps: 0%| | 353/1000000 [01:28<67:51:37, 4.09it/s, grad_norm=4.57, loss_final=2.39, loss_mean=0.966, loss_mean_cls=1.49, proj_loss=-0.0589][2026-03-22 14:20:05] Step: 353, Training Logs: loss_final: 2.228235, loss_mean: 0.980087, proj_loss: -0.058622, loss_mean_cls: 1.306770, grad_norm: 3.266721 +Steps: 0%| | 354/1000000 [01:28<67:50:20, 4.09it/s, grad_norm=3.27, loss_final=2.23, loss_mean=0.98, loss_mean_cls=1.31, proj_loss=-0.0586][2026-03-22 14:20:05] Step: 354, Training Logs: loss_final: 2.279156, loss_mean: 0.993569, proj_loss: -0.056600, loss_mean_cls: 1.342188, grad_norm: 3.829586 +Steps: 0%| | 355/1000000 [01:29<67:49:24, 4.09it/s, grad_norm=3.83, loss_final=2.28, loss_mean=0.994, loss_mean_cls=1.34, proj_loss=-0.0566][2026-03-22 14:20:06] Step: 355, Training Logs: loss_final: 2.271208, loss_mean: 0.972936, proj_loss: -0.057624, loss_mean_cls: 1.355896, grad_norm: 3.526128 +Steps: 0%| | 356/1000000 [01:29<67:51:30, 4.09it/s, grad_norm=3.53, loss_final=2.27, loss_mean=0.973, loss_mean_cls=1.36, proj_loss=-0.0576][2026-03-22 14:20:06] Step: 356, Training Logs: loss_final: 2.283756, loss_mean: 1.003642, proj_loss: -0.056658, loss_mean_cls: 1.336772, grad_norm: 3.572510 +Steps: 0%| | 357/1000000 [01:29<67:51:44, 4.09it/s, grad_norm=3.57, loss_final=2.28, loss_mean=1, loss_mean_cls=1.34, proj_loss=-0.0567][2026-03-22 14:20:06] Step: 357, Training Logs: loss_final: 2.348462, loss_mean: 0.990212, proj_loss: -0.058600, loss_mean_cls: 1.416850, grad_norm: 4.575479 +Steps: 0%| | 358/1000000 [01:29<67:51:31, 4.09it/s, grad_norm=4.58, loss_final=2.35, loss_mean=0.99, loss_mean_cls=1.42, proj_loss=-0.0586][2026-03-22 14:20:06] Step: 358, Training Logs: loss_final: 2.234488, loss_mean: 0.996059, proj_loss: -0.058124, loss_mean_cls: 1.296553, grad_norm: 3.107389 +Steps: 0%| | 359/1000000 [01:30<67:51:23, 4.09it/s, grad_norm=3.11, loss_final=2.23, loss_mean=0.996, loss_mean_cls=1.3, proj_loss=-0.0581][2026-03-22 14:20:06] Step: 359, Training Logs: loss_final: 2.198285, loss_mean: 0.995328, proj_loss: -0.058346, loss_mean_cls: 1.261302, grad_norm: 3.341925 
+Steps: 0%| | 360/1000000 [01:30<67:51:17, 4.09it/s, grad_norm=3.34, loss_final=2.2, loss_mean=0.995, loss_mean_cls=1.26, proj_loss=-0.0583][2026-03-22 14:20:07] Step: 360, Training Logs: loss_final: 2.361061, loss_mean: 0.984533, proj_loss: -0.056721, loss_mean_cls: 1.433248, grad_norm: 3.979322 +Steps: 0%| | 361/1000000 [01:30<67:53:18, 4.09it/s, grad_norm=3.98, loss_final=2.36, loss_mean=0.985, loss_mean_cls=1.43, proj_loss=-0.0567][2026-03-22 14:20:07] Step: 361, Training Logs: loss_final: 2.336386, loss_mean: 0.964445, proj_loss: -0.059054, loss_mean_cls: 1.430995, grad_norm: 4.457042 +Steps: 0%| | 362/1000000 [01:30<67:51:31, 4.09it/s, grad_norm=4.46, loss_final=2.34, loss_mean=0.964, loss_mean_cls=1.43, proj_loss=-0.0591][2026-03-22 14:20:07] Step: 362, Training Logs: loss_final: 2.311689, loss_mean: 0.976058, proj_loss: -0.060536, loss_mean_cls: 1.396167, grad_norm: 3.171746 +Steps: 0%| | 363/1000000 [01:30<67:53:02, 4.09it/s, grad_norm=3.17, loss_final=2.31, loss_mean=0.976, loss_mean_cls=1.4, proj_loss=-0.0605][2026-03-22 14:20:07] Step: 363, Training Logs: loss_final: 2.281297, loss_mean: 0.989302, proj_loss: -0.056693, loss_mean_cls: 1.348688, grad_norm: 4.793407 +Steps: 0%| | 364/1000000 [01:31<67:50:40, 4.09it/s, grad_norm=4.79, loss_final=2.28, loss_mean=0.989, loss_mean_cls=1.35, proj_loss=-0.0567][2026-03-22 14:20:08] Step: 364, Training Logs: loss_final: 2.316994, loss_mean: 0.992018, proj_loss: -0.057872, loss_mean_cls: 1.382848, grad_norm: 3.040039 +Steps: 0%| | 365/1000000 [01:31<67:56:21, 4.09it/s, grad_norm=3.04, loss_final=2.32, loss_mean=0.992, loss_mean_cls=1.38, proj_loss=-0.0579][2026-03-22 14:20:08] Step: 365, Training Logs: loss_final: 2.307911, loss_mean: 0.981108, proj_loss: -0.057569, loss_mean_cls: 1.384372, grad_norm: 4.158062 +Steps: 0%| | 366/1000000 [01:31<67:54:18, 4.09it/s, grad_norm=4.16, loss_final=2.31, loss_mean=0.981, loss_mean_cls=1.38, proj_loss=-0.0576][2026-03-22 14:20:08] Step: 366, Training Logs: loss_final: 2.406893, loss_mean: 0.976786, proj_loss: -0.057538, loss_mean_cls: 1.487645, grad_norm: 4.538278 +Steps: 0%| | 367/1000000 [01:31<67:52:50, 4.09it/s, grad_norm=4.54, loss_final=2.41, loss_mean=0.977, loss_mean_cls=1.49, proj_loss=-0.0575][2026-03-22 14:20:08] Step: 367, Training Logs: loss_final: 2.116904, loss_mean: 1.018823, proj_loss: -0.056448, loss_mean_cls: 1.154529, grad_norm: 2.767322 +Steps: 0%| | 368/1000000 [01:32<68:45:18, 4.04it/s, grad_norm=2.77, loss_final=2.12, loss_mean=1.02, loss_mean_cls=1.15, proj_loss=-0.0564][2026-03-22 14:20:09] Step: 368, Training Logs: loss_final: 2.216955, loss_mean: 1.016231, proj_loss: -0.057681, loss_mean_cls: 1.258405, grad_norm: 3.432180 +Steps: 0%| | 369/1000000 [01:32<68:28:17, 4.06it/s, grad_norm=3.43, loss_final=2.22, loss_mean=1.02, loss_mean_cls=1.26, proj_loss=-0.0577][2026-03-22 14:20:09] Step: 369, Training Logs: loss_final: 2.166255, loss_mean: 1.006527, proj_loss: -0.058946, loss_mean_cls: 1.218674, grad_norm: 2.566114 +Steps: 0%| | 370/1000000 [01:32<68:20:58, 4.06it/s, grad_norm=2.57, loss_final=2.17, loss_mean=1.01, loss_mean_cls=1.22, proj_loss=-0.0589][2026-03-22 14:20:09] Step: 370, Training Logs: loss_final: 2.275909, loss_mean: 0.987904, proj_loss: -0.060456, loss_mean_cls: 1.348461, grad_norm: 3.137770 +Steps: 0%| | 371/1000000 [01:32<68:11:26, 4.07it/s, grad_norm=3.14, loss_final=2.28, loss_mean=0.988, loss_mean_cls=1.35, proj_loss=-0.0605][2026-03-22 14:20:09] Step: 371, Training Logs: loss_final: 2.275592, loss_mean: 0.994606, proj_loss: -0.056635, loss_mean_cls: 
1.337620, grad_norm: 3.367281 +Steps: 0%| | 372/1000000 [01:33<68:04:23, 4.08it/s, grad_norm=3.37, loss_final=2.28, loss_mean=0.995, loss_mean_cls=1.34, proj_loss=-0.0566][2026-03-22 14:20:10] Step: 372, Training Logs: loss_final: 2.278903, loss_mean: 0.990156, proj_loss: -0.057787, loss_mean_cls: 1.346534, grad_norm: 2.903803 +Steps: 0%| | 373/1000000 [01:33<68:00:56, 4.08it/s, grad_norm=2.9, loss_final=2.28, loss_mean=0.99, loss_mean_cls=1.35, proj_loss=-0.0578][2026-03-22 14:20:10] Step: 373, Training Logs: loss_final: 2.255970, loss_mean: 0.979886, proj_loss: -0.057197, loss_mean_cls: 1.333280, grad_norm: 4.811442 +Steps: 0%| | 374/1000000 [01:33<67:59:35, 4.08it/s, grad_norm=4.81, loss_final=2.26, loss_mean=0.98, loss_mean_cls=1.33, proj_loss=-0.0572][2026-03-22 14:20:10] Step: 374, Training Logs: loss_final: 2.340236, loss_mean: 0.976071, proj_loss: -0.056502, loss_mean_cls: 1.420667, grad_norm: 2.987112 +Steps: 0%| | 375/1000000 [01:33<67:55:34, 4.09it/s, grad_norm=2.99, loss_final=2.34, loss_mean=0.976, loss_mean_cls=1.42, proj_loss=-0.0565][2026-03-22 14:20:10] Step: 375, Training Logs: loss_final: 2.232835, loss_mean: 0.984552, proj_loss: -0.059110, loss_mean_cls: 1.307393, grad_norm: 3.864259 +Steps: 0%| | 376/1000000 [01:34<67:55:37, 4.09it/s, grad_norm=3.86, loss_final=2.23, loss_mean=0.985, loss_mean_cls=1.31, proj_loss=-0.0591][2026-03-22 14:20:11] Step: 376, Training Logs: loss_final: 2.319957, loss_mean: 0.984081, proj_loss: -0.058786, loss_mean_cls: 1.394662, grad_norm: 2.687956 +Steps: 0%| | 377/1000000 [01:34<67:54:11, 4.09it/s, grad_norm=2.69, loss_final=2.32, loss_mean=0.984, loss_mean_cls=1.39, proj_loss=-0.0588][2026-03-22 14:20:11] Step: 377, Training Logs: loss_final: 2.274205, loss_mean: 0.980135, proj_loss: -0.058385, loss_mean_cls: 1.352455, grad_norm: 5.215763 +Steps: 0%| | 378/1000000 [01:35<102:17:04, 2.71it/s, grad_norm=5.22, loss_final=2.27, loss_mean=0.98, loss_mean_cls=1.35, proj_loss=-0.0584][2026-03-22 14:20:12] Step: 378, Training Logs: loss_final: 2.314647, loss_mean: 0.980593, proj_loss: -0.058186, loss_mean_cls: 1.392240, grad_norm: 4.649534 +Steps: 0%| | 379/1000000 [01:35<91:59:25, 3.02it/s, grad_norm=4.65, loss_final=2.31, loss_mean=0.981, loss_mean_cls=1.39, proj_loss=-0.0582] [2026-03-22 14:20:12] Step: 379, Training Logs: loss_final: 2.295524, loss_mean: 0.983185, proj_loss: -0.057559, loss_mean_cls: 1.369899, grad_norm: 3.556038 +Steps: 0%| | 380/1000000 [01:35<84:44:49, 3.28it/s, grad_norm=3.56, loss_final=2.3, loss_mean=0.983, loss_mean_cls=1.37, proj_loss=-0.0576][2026-03-22 14:20:12] Step: 380, Training Logs: loss_final: 2.324325, loss_mean: 0.968764, proj_loss: -0.059361, loss_mean_cls: 1.414923, grad_norm: 4.204075 +Steps: 0%| | 381/1000000 [01:35<79:38:25, 3.49it/s, grad_norm=4.2, loss_final=2.32, loss_mean=0.969, loss_mean_cls=1.41, proj_loss=-0.0594][2026-03-22 14:20:12] Step: 381, Training Logs: loss_final: 2.182228, loss_mean: 1.011492, proj_loss: -0.059437, loss_mean_cls: 1.230173, grad_norm: 2.763878 +Steps: 0%| | 382/1000000 [01:36<76:05:28, 3.65it/s, grad_norm=2.76, loss_final=2.18, loss_mean=1.01, loss_mean_cls=1.23, proj_loss=-0.0594][2026-03-22 14:20:13] Step: 382, Training Logs: loss_final: 2.321783, loss_mean: 0.986025, proj_loss: -0.060315, loss_mean_cls: 1.396073, grad_norm: 3.465212 +Steps: 0%| | 383/1000000 [01:36<73:35:35, 3.77it/s, grad_norm=3.47, loss_final=2.32, loss_mean=0.986, loss_mean_cls=1.4, proj_loss=-0.0603][2026-03-22 14:20:13] Step: 383, Training Logs: loss_final: 2.253924, loss_mean: 0.982041, 
proj_loss: -0.058440, loss_mean_cls: 1.330322, grad_norm: 2.450527 +Steps: 0%| | 384/1000000 [01:36<71:50:40, 3.86it/s, grad_norm=2.45, loss_final=2.25, loss_mean=0.982, loss_mean_cls=1.33, proj_loss=-0.0584][2026-03-22 14:20:13] Step: 384, Training Logs: loss_final: 2.109679, loss_mean: 1.029902, proj_loss: -0.058684, loss_mean_cls: 1.138461, grad_norm: 3.462537 +Steps: 0%| | 385/1000000 [01:36<70:39:12, 3.93it/s, grad_norm=3.46, loss_final=2.11, loss_mean=1.03, loss_mean_cls=1.14, proj_loss=-0.0587][2026-03-22 14:20:13] Step: 385, Training Logs: loss_final: 2.186038, loss_mean: 0.996259, proj_loss: -0.057033, loss_mean_cls: 1.246813, grad_norm: 2.991550 +Steps: 0%| | 386/1000000 [01:37<69:47:16, 3.98it/s, grad_norm=2.99, loss_final=2.19, loss_mean=0.996, loss_mean_cls=1.25, proj_loss=-0.057][2026-03-22 14:20:14] Step: 386, Training Logs: loss_final: 2.308272, loss_mean: 1.000530, proj_loss: -0.060192, loss_mean_cls: 1.367934, grad_norm: 4.691333 +Steps: 0%| | 387/1000000 [01:37<69:12:25, 4.01it/s, grad_norm=4.69, loss_final=2.31, loss_mean=1, loss_mean_cls=1.37, proj_loss=-0.0602][2026-03-22 14:20:14] Step: 387, Training Logs: loss_final: 2.231615, loss_mean: 0.999004, proj_loss: -0.061751, loss_mean_cls: 1.294363, grad_norm: 4.262595 +Steps: 0%| | 388/1000000 [01:37<68:45:58, 4.04it/s, grad_norm=4.26, loss_final=2.23, loss_mean=0.999, loss_mean_cls=1.29, proj_loss=-0.0618][2026-03-22 14:20:14] Step: 388, Training Logs: loss_final: 2.330231, loss_mean: 0.977314, proj_loss: -0.058210, loss_mean_cls: 1.411127, grad_norm: 3.307407 +Steps: 0%| | 389/1000000 [01:37<68:27:55, 4.06it/s, grad_norm=3.31, loss_final=2.33, loss_mean=0.977, loss_mean_cls=1.41, proj_loss=-0.0582][2026-03-22 14:20:14] Step: 389, Training Logs: loss_final: 2.235628, loss_mean: 1.031994, proj_loss: -0.058002, loss_mean_cls: 1.261637, grad_norm: 3.893300 +Steps: 0%| | 390/1000000 [01:38<87:15:30, 3.18it/s, grad_norm=3.89, loss_final=2.24, loss_mean=1.03, loss_mean_cls=1.26, proj_loss=-0.058][2026-03-22 14:20:15] Step: 390, Training Logs: loss_final: 2.285347, loss_mean: 0.978773, proj_loss: -0.058241, loss_mean_cls: 1.364816, grad_norm: 3.327995 +Steps: 0%| | 391/1000000 [01:38<103:38:22, 2.68it/s, grad_norm=3.33, loss_final=2.29, loss_mean=0.979, loss_mean_cls=1.36, proj_loss=-0.0582][2026-03-22 14:20:15] Step: 391, Training Logs: loss_final: 2.227756, loss_mean: 0.989495, proj_loss: -0.057726, loss_mean_cls: 1.295987, grad_norm: 3.112076 +Steps: 0%| | 392/1000000 [01:39<92:53:24, 2.99it/s, grad_norm=3.11, loss_final=2.23, loss_mean=0.989, loss_mean_cls=1.3, proj_loss=-0.0577] [2026-03-22 14:20:15] Step: 392, Training Logs: loss_final: 2.269423, loss_mean: 0.996295, proj_loss: -0.060300, loss_mean_cls: 1.333427, grad_norm: 6.674192 +Steps: 0%| | 393/1000000 [01:39<85:29:08, 3.25it/s, grad_norm=6.67, loss_final=2.27, loss_mean=0.996, loss_mean_cls=1.33, proj_loss=-0.0603][2026-03-22 14:20:16] Step: 393, Training Logs: loss_final: 2.281091, loss_mean: 0.988479, proj_loss: -0.057888, loss_mean_cls: 1.350500, grad_norm: 3.924334 +Steps: 0%| | 394/1000000 [01:39<80:11:06, 3.46it/s, grad_norm=3.92, loss_final=2.28, loss_mean=0.988, loss_mean_cls=1.35, proj_loss=-0.0579][2026-03-22 14:20:16] Step: 394, Training Logs: loss_final: 2.185519, loss_mean: 1.005672, proj_loss: -0.058024, loss_mean_cls: 1.237870, grad_norm: 4.112461 +Steps: 0%| | 395/1000000 [01:39<76:28:48, 3.63it/s, grad_norm=4.11, loss_final=2.19, loss_mean=1.01, loss_mean_cls=1.24, proj_loss=-0.058][2026-03-22 14:20:16] Step: 395, Training Logs: loss_final: 
Representative training-log output (steps 396–645 of 1,000,000, at roughly 4.1 it/s). Each step reports `loss_final`, `loss_mean`, `proj_loss`, `loss_mean_cls`, and `grad_norm`, for example:

`Step: 400, Training Logs: loss_final: 2.201959, loss_mean: 1.006708, proj_loss: -0.057858, loss_mean_cls: 1.253109, grad_norm: 4.786574`
Step: 645, Training Logs: loss_final: 2.169838, loss_mean: 0.979741, proj_loss: -0.073838, loss_mean_cls: 1.263935, grad_norm: 3.418840 +Steps: 0%| | 646/1000000 [02:41<67:46:23, 4.10it/s, grad_norm=3.42, loss_final=2.17, loss_mean=0.98, loss_mean_cls=1.26, proj_loss=-0.0738][2026-03-22 14:21:18] Step: 646, Training Logs: loss_final: 2.093252, loss_mean: 0.955875, proj_loss: -0.068899, loss_mean_cls: 1.206276, grad_norm: 3.170529 +Steps: 0%| | 647/1000000 [02:41<67:47:12, 4.10it/s, grad_norm=3.17, loss_final=2.09, loss_mean=0.956, loss_mean_cls=1.21, proj_loss=-0.0689][2026-03-22 14:21:18] Step: 647, Training Logs: loss_final: 2.087992, loss_mean: 0.998753, proj_loss: -0.071562, loss_mean_cls: 1.160801, grad_norm: 3.394355 +Steps: 0%| | 648/1000000 [02:41<67:46:44, 4.10it/s, grad_norm=3.39, loss_final=2.09, loss_mean=0.999, loss_mean_cls=1.16, proj_loss=-0.0716][2026-03-22 14:21:18] Step: 648, Training Logs: loss_final: 2.236397, loss_mean: 0.957460, proj_loss: -0.067678, loss_mean_cls: 1.346615, grad_norm: 3.617621 +Steps: 0%| | 649/1000000 [02:41<67:46:33, 4.10it/s, grad_norm=3.62, loss_final=2.24, loss_mean=0.957, loss_mean_cls=1.35, proj_loss=-0.0677][2026-03-22 14:21:18] Step: 649, Training Logs: loss_final: 2.102829, loss_mean: 0.972611, proj_loss: -0.072656, loss_mean_cls: 1.202874, grad_norm: 3.835155 +Steps: 0%| | 650/1000000 [02:42<67:47:04, 4.10it/s, grad_norm=3.84, loss_final=2.1, loss_mean=0.973, loss_mean_cls=1.2, proj_loss=-0.0727][2026-03-22 14:21:19] Step: 650, Training Logs: loss_final: 2.163689, loss_mean: 0.973210, proj_loss: -0.066700, loss_mean_cls: 1.257179, grad_norm: 5.462162 +Steps: 0%| | 651/1000000 [02:42<67:53:29, 4.09it/s, grad_norm=5.46, loss_final=2.16, loss_mean=0.973, loss_mean_cls=1.26, proj_loss=-0.0667][2026-03-22 14:21:19] Step: 651, Training Logs: loss_final: 2.038178, loss_mean: 0.960535, proj_loss: -0.074830, loss_mean_cls: 1.152473, grad_norm: 2.260548 +Steps: 0%| | 652/1000000 [02:42<67:52:23, 4.09it/s, grad_norm=2.26, loss_final=2.04, loss_mean=0.961, loss_mean_cls=1.15, proj_loss=-0.0748][2026-03-22 14:21:19] Step: 652, Training Logs: loss_final: 2.202062, loss_mean: 0.969158, proj_loss: -0.071172, loss_mean_cls: 1.304075, grad_norm: 5.171290 +Steps: 0%| | 653/1000000 [02:42<67:51:11, 4.09it/s, grad_norm=5.17, loss_final=2.2, loss_mean=0.969, loss_mean_cls=1.3, proj_loss=-0.0712][2026-03-22 14:21:19] Step: 653, Training Logs: loss_final: 2.239961, loss_mean: 0.959558, proj_loss: -0.070175, loss_mean_cls: 1.350578, grad_norm: 4.329452 +Steps: 0%| | 654/1000000 [02:43<67:49:41, 4.09it/s, grad_norm=4.33, loss_final=2.24, loss_mean=0.96, loss_mean_cls=1.35, proj_loss=-0.0702][2026-03-22 14:21:20] Step: 654, Training Logs: loss_final: 2.390363, loss_mean: 0.913247, proj_loss: -0.071204, loss_mean_cls: 1.548320, grad_norm: 2.966028 +Steps: 0%| | 655/1000000 [02:43<67:50:23, 4.09it/s, grad_norm=2.97, loss_final=2.39, loss_mean=0.913, loss_mean_cls=1.55, proj_loss=-0.0712][2026-03-22 14:21:20] Step: 655, Training Logs: loss_final: 2.190274, loss_mean: 0.962882, proj_loss: -0.068814, loss_mean_cls: 1.296206, grad_norm: 2.874873 +Steps: 0%| | 656/1000000 [02:43<67:51:34, 4.09it/s, grad_norm=2.87, loss_final=2.19, loss_mean=0.963, loss_mean_cls=1.3, proj_loss=-0.0688][2026-03-22 14:21:20] Step: 656, Training Logs: loss_final: 2.123530, loss_mean: 0.973800, proj_loss: -0.071246, loss_mean_cls: 1.220976, grad_norm: 8.065234 +Steps: 0%| | 657/1000000 [02:43<67:52:47, 4.09it/s, grad_norm=8.07, loss_final=2.12, loss_mean=0.974, loss_mean_cls=1.22, 
proj_loss=-0.0712][2026-03-22 14:21:20] Step: 657, Training Logs: loss_final: 2.312509, loss_mean: 0.933418, proj_loss: -0.072970, loss_mean_cls: 1.452060, grad_norm: 6.007413 +Steps: 0%| | 658/1000000 [02:44<67:52:55, 4.09it/s, grad_norm=6.01, loss_final=2.31, loss_mean=0.933, loss_mean_cls=1.45, proj_loss=-0.073][2026-03-22 14:21:21] Step: 658, Training Logs: loss_final: 2.174653, loss_mean: 0.986598, proj_loss: -0.074693, loss_mean_cls: 1.262747, grad_norm: 9.090040 +Steps: 0%| | 659/1000000 [02:44<67:59:44, 4.08it/s, grad_norm=9.09, loss_final=2.17, loss_mean=0.987, loss_mean_cls=1.26, proj_loss=-0.0747][2026-03-22 14:21:21] Step: 659, Training Logs: loss_final: 2.316578, loss_mean: 0.965186, proj_loss: -0.073651, loss_mean_cls: 1.425044, grad_norm: 8.551615 +Steps: 0%| | 660/1000000 [02:44<67:57:32, 4.08it/s, grad_norm=8.55, loss_final=2.32, loss_mean=0.965, loss_mean_cls=1.43, proj_loss=-0.0737][2026-03-22 14:21:21] Step: 660, Training Logs: loss_final: 2.149529, loss_mean: 1.007122, proj_loss: -0.073310, loss_mean_cls: 1.215717, grad_norm: 7.703752 +Steps: 0%| | 661/1000000 [02:44<67:56:27, 4.09it/s, grad_norm=7.7, loss_final=2.15, loss_mean=1.01, loss_mean_cls=1.22, proj_loss=-0.0733][2026-03-22 14:21:21] Step: 661, Training Logs: loss_final: 2.076893, loss_mean: 0.998469, proj_loss: -0.070680, loss_mean_cls: 1.149105, grad_norm: 5.904031 +Steps: 0%| | 662/1000000 [02:45<67:54:44, 4.09it/s, grad_norm=5.9, loss_final=2.08, loss_mean=0.998, loss_mean_cls=1.15, proj_loss=-0.0707][2026-03-22 14:21:22] Step: 662, Training Logs: loss_final: 2.190420, loss_mean: 0.970560, proj_loss: -0.072423, loss_mean_cls: 1.292283, grad_norm: 5.244567 +Steps: 0%| | 663/1000000 [02:45<67:59:21, 4.08it/s, grad_norm=5.24, loss_final=2.19, loss_mean=0.971, loss_mean_cls=1.29, proj_loss=-0.0724][2026-03-22 14:21:22] Step: 663, Training Logs: loss_final: 2.188117, loss_mean: 0.984837, proj_loss: -0.070608, loss_mean_cls: 1.273889, grad_norm: 7.076348 +Steps: 0%| | 664/1000000 [02:45<67:58:37, 4.08it/s, grad_norm=7.08, loss_final=2.19, loss_mean=0.985, loss_mean_cls=1.27, proj_loss=-0.0706][2026-03-22 14:21:22] Step: 664, Training Logs: loss_final: 2.101690, loss_mean: 0.980684, proj_loss: -0.070801, loss_mean_cls: 1.191807, grad_norm: 5.530171 +Steps: 0%| | 665/1000000 [02:45<67:56:13, 4.09it/s, grad_norm=5.53, loss_final=2.1, loss_mean=0.981, loss_mean_cls=1.19, proj_loss=-0.0708][2026-03-22 14:21:22] Step: 665, Training Logs: loss_final: 2.024759, loss_mean: 0.982579, proj_loss: -0.075889, loss_mean_cls: 1.118069, grad_norm: 3.736725 +Steps: 0%| | 666/1000000 [02:46<67:56:45, 4.09it/s, grad_norm=3.74, loss_final=2.02, loss_mean=0.983, loss_mean_cls=1.12, proj_loss=-0.0759][2026-03-22 14:21:23] Step: 666, Training Logs: loss_final: 2.248340, loss_mean: 0.949678, proj_loss: -0.073098, loss_mean_cls: 1.371760, grad_norm: 4.724831 +Steps: 0%| | 667/1000000 [02:46<67:54:35, 4.09it/s, grad_norm=4.72, loss_final=2.25, loss_mean=0.95, loss_mean_cls=1.37, proj_loss=-0.0731][2026-03-22 14:21:23] Step: 667, Training Logs: loss_final: 2.056572, loss_mean: 0.980350, proj_loss: -0.073906, loss_mean_cls: 1.150129, grad_norm: 2.162504 +Steps: 0%| | 668/1000000 [02:46<67:53:31, 4.09it/s, grad_norm=2.16, loss_final=2.06, loss_mean=0.98, loss_mean_cls=1.15, proj_loss=-0.0739][2026-03-22 14:21:23] Step: 668, Training Logs: loss_final: 2.194027, loss_mean: 0.949207, proj_loss: -0.071165, loss_mean_cls: 1.315985, grad_norm: 3.636473 +Steps: 0%| | 669/1000000 [02:46<67:51:47, 4.09it/s, grad_norm=3.64, loss_final=2.19, 
loss_mean=0.949, loss_mean_cls=1.32, proj_loss=-0.0712][2026-03-22 14:21:23] Step: 669, Training Logs: loss_final: 2.033893, loss_mean: 0.959361, proj_loss: -0.072823, loss_mean_cls: 1.147354, grad_norm: 3.449106 +Steps: 0%| | 670/1000000 [02:47<67:53:59, 4.09it/s, grad_norm=3.45, loss_final=2.03, loss_mean=0.959, loss_mean_cls=1.15, proj_loss=-0.0728][2026-03-22 14:21:24] Step: 670, Training Logs: loss_final: 2.017012, loss_mean: 0.971517, proj_loss: -0.072534, loss_mean_cls: 1.118030, grad_norm: 3.736691 +Steps: 0%| | 671/1000000 [02:47<67:52:47, 4.09it/s, grad_norm=3.74, loss_final=2.02, loss_mean=0.972, loss_mean_cls=1.12, proj_loss=-0.0725][2026-03-22 14:21:24] Step: 671, Training Logs: loss_final: 2.076915, loss_mean: 0.969285, proj_loss: -0.073543, loss_mean_cls: 1.181173, grad_norm: 4.633058 +Steps: 0%| | 672/1000000 [02:47<67:54:23, 4.09it/s, grad_norm=4.63, loss_final=2.08, loss_mean=0.969, loss_mean_cls=1.18, proj_loss=-0.0735][2026-03-22 14:21:24] Step: 672, Training Logs: loss_final: 2.109717, loss_mean: 0.974036, proj_loss: -0.071178, loss_mean_cls: 1.206859, grad_norm: 4.400041 +Steps: 0%| | 673/1000000 [02:47<67:53:16, 4.09it/s, grad_norm=4.4, loss_final=2.11, loss_mean=0.974, loss_mean_cls=1.21, proj_loss=-0.0712][2026-03-22 14:21:24] Step: 673, Training Logs: loss_final: 2.231132, loss_mean: 0.984155, proj_loss: -0.073164, loss_mean_cls: 1.320140, grad_norm: 6.994076 +Steps: 0%| | 674/1000000 [02:48<67:52:56, 4.09it/s, grad_norm=6.99, loss_final=2.23, loss_mean=0.984, loss_mean_cls=1.32, proj_loss=-0.0732][2026-03-22 14:21:25] Step: 674, Training Logs: loss_final: 2.187077, loss_mean: 0.985099, proj_loss: -0.072650, loss_mean_cls: 1.274628, grad_norm: 6.856577 +Steps: 0%| | 675/1000000 [02:48<67:52:34, 4.09it/s, grad_norm=6.86, loss_final=2.19, loss_mean=0.985, loss_mean_cls=1.27, proj_loss=-0.0727][2026-03-22 14:21:25] Step: 675, Training Logs: loss_final: 2.153089, loss_mean: 0.968705, proj_loss: -0.072275, loss_mean_cls: 1.256659, grad_norm: 4.855498 +Steps: 0%| | 676/1000000 [02:48<67:53:55, 4.09it/s, grad_norm=4.86, loss_final=2.15, loss_mean=0.969, loss_mean_cls=1.26, proj_loss=-0.0723][2026-03-22 14:21:25] Step: 676, Training Logs: loss_final: 2.130665, loss_mean: 1.012787, proj_loss: -0.072921, loss_mean_cls: 1.190799, grad_norm: 5.679408 +Steps: 0%| | 677/1000000 [02:48<67:54:02, 4.09it/s, grad_norm=5.68, loss_final=2.13, loss_mean=1.01, loss_mean_cls=1.19, proj_loss=-0.0729][2026-03-22 14:21:25] Step: 677, Training Logs: loss_final: 2.256723, loss_mean: 0.990281, proj_loss: -0.072828, loss_mean_cls: 1.339270, grad_norm: 7.328655 +Steps: 0%| | 678/1000000 [02:49<67:53:26, 4.09it/s, grad_norm=7.33, loss_final=2.26, loss_mean=0.99, loss_mean_cls=1.34, proj_loss=-0.0728][2026-03-22 14:21:25] Step: 678, Training Logs: loss_final: 2.143439, loss_mean: 0.981841, proj_loss: -0.074412, loss_mean_cls: 1.236010, grad_norm: 5.377297 +Steps: 0%| | 679/1000000 [02:49<67:53:44, 4.09it/s, grad_norm=5.38, loss_final=2.14, loss_mean=0.982, loss_mean_cls=1.24, proj_loss=-0.0744][2026-03-22 14:21:26] Step: 679, Training Logs: loss_final: 2.202073, loss_mean: 0.970854, proj_loss: -0.071461, loss_mean_cls: 1.302680, grad_norm: 3.232244 +Steps: 0%| | 680/1000000 [02:49<67:53:58, 4.09it/s, grad_norm=3.23, loss_final=2.2, loss_mean=0.971, loss_mean_cls=1.3, proj_loss=-0.0715][2026-03-22 14:21:26] Step: 680, Training Logs: loss_final: 2.114711, loss_mean: 0.989823, proj_loss: -0.073814, loss_mean_cls: 1.198702, grad_norm: 8.277411 +Steps: 0%| | 681/1000000 [02:49<67:54:57, 4.09it/s, 
grad_norm=8.28, loss_final=2.11, loss_mean=0.99, loss_mean_cls=1.2, proj_loss=-0.0738][2026-03-22 14:21:26] Step: 681, Training Logs: loss_final: 2.115553, loss_mean: 0.992001, proj_loss: -0.073988, loss_mean_cls: 1.197540, grad_norm: 8.127829 +Steps: 0%| | 682/1000000 [02:50<67:54:48, 4.09it/s, grad_norm=8.13, loss_final=2.12, loss_mean=0.992, loss_mean_cls=1.2, proj_loss=-0.074][2026-03-22 14:21:26] Step: 682, Training Logs: loss_final: 1.972790, loss_mean: 0.997240, proj_loss: -0.074814, loss_mean_cls: 1.050364, grad_norm: 3.542120 +Steps: 0%| | 683/1000000 [02:50<67:53:17, 4.09it/s, grad_norm=3.54, loss_final=1.97, loss_mean=0.997, loss_mean_cls=1.05, proj_loss=-0.0748][2026-03-22 14:21:27] Step: 683, Training Logs: loss_final: 2.167662, loss_mean: 0.983247, proj_loss: -0.071511, loss_mean_cls: 1.255926, grad_norm: 4.837854 +Steps: 0%| | 684/1000000 [02:50<67:53:15, 4.09it/s, grad_norm=4.84, loss_final=2.17, loss_mean=0.983, loss_mean_cls=1.26, proj_loss=-0.0715][2026-03-22 14:21:27] Step: 684, Training Logs: loss_final: 2.204380, loss_mean: 0.954049, proj_loss: -0.071895, loss_mean_cls: 1.322226, grad_norm: 5.667681 +Steps: 0%| | 685/1000000 [02:50<67:51:52, 4.09it/s, grad_norm=5.67, loss_final=2.2, loss_mean=0.954, loss_mean_cls=1.32, proj_loss=-0.0719][2026-03-22 14:21:27] Step: 685, Training Logs: loss_final: 2.096167, loss_mean: 0.997044, proj_loss: -0.071772, loss_mean_cls: 1.170895, grad_norm: 4.550072 +Steps: 0%| | 686/1000000 [02:50<67:52:17, 4.09it/s, grad_norm=4.55, loss_final=2.1, loss_mean=0.997, loss_mean_cls=1.17, proj_loss=-0.0718][2026-03-22 14:21:27] Step: 686, Training Logs: loss_final: 2.153662, loss_mean: 0.970218, proj_loss: -0.071469, loss_mean_cls: 1.254913, grad_norm: 3.212667 +Steps: 0%| | 687/1000000 [02:51<67:50:11, 4.09it/s, grad_norm=3.21, loss_final=2.15, loss_mean=0.97, loss_mean_cls=1.25, proj_loss=-0.0715][2026-03-22 14:21:28] Step: 687, Training Logs: loss_final: 2.093323, loss_mean: 0.978456, proj_loss: -0.071239, loss_mean_cls: 1.186105, grad_norm: 3.596979 +Steps: 0%| | 688/1000000 [02:51<67:49:58, 4.09it/s, grad_norm=3.6, loss_final=2.09, loss_mean=0.978, loss_mean_cls=1.19, proj_loss=-0.0712][2026-03-22 14:21:28] Step: 688, Training Logs: loss_final: 2.058494, loss_mean: 0.976337, proj_loss: -0.073783, loss_mean_cls: 1.155940, grad_norm: 2.643276 +Steps: 0%| | 689/1000000 [02:51<67:49:32, 4.09it/s, grad_norm=2.64, loss_final=2.06, loss_mean=0.976, loss_mean_cls=1.16, proj_loss=-0.0738][2026-03-22 14:21:28] Step: 689, Training Logs: loss_final: 2.124714, loss_mean: 0.961631, proj_loss: -0.073543, loss_mean_cls: 1.236625, grad_norm: 3.715477 +Steps: 0%| | 690/1000000 [02:51<67:49:16, 4.09it/s, grad_norm=3.72, loss_final=2.12, loss_mean=0.962, loss_mean_cls=1.24, proj_loss=-0.0735][2026-03-22 14:21:28] Step: 690, Training Logs: loss_final: 2.307211, loss_mean: 0.944340, proj_loss: -0.073789, loss_mean_cls: 1.436660, grad_norm: 5.409539 +Steps: 0%| | 691/1000000 [02:52<67:48:00, 4.09it/s, grad_norm=5.41, loss_final=2.31, loss_mean=0.944, loss_mean_cls=1.44, proj_loss=-0.0738][2026-03-22 14:21:29] Step: 691, Training Logs: loss_final: 2.130095, loss_mean: 0.969899, proj_loss: -0.073146, loss_mean_cls: 1.233341, grad_norm: 4.143322 +Steps: 0%| | 692/1000000 [02:52<67:47:02, 4.10it/s, grad_norm=4.14, loss_final=2.13, loss_mean=0.97, loss_mean_cls=1.23, proj_loss=-0.0731][2026-03-22 14:21:29] Step: 692, Training Logs: loss_final: 2.114479, loss_mean: 0.976590, proj_loss: -0.076061, loss_mean_cls: 1.213950, grad_norm: 4.319047 +Steps: 0%| | 693/1000000 
[02:52<67:47:30, 4.09it/s, grad_norm=4.32, loss_final=2.11, loss_mean=0.977, loss_mean_cls=1.21, proj_loss=-0.0761][2026-03-22 14:21:29] Step: 693, Training Logs: loss_final: 2.128447, loss_mean: 0.960071, proj_loss: -0.077676, loss_mean_cls: 1.246052, grad_norm: 3.856296 +Steps: 0%| | 694/1000000 [02:52<67:47:32, 4.09it/s, grad_norm=3.86, loss_final=2.13, loss_mean=0.96, loss_mean_cls=1.25, proj_loss=-0.0777][2026-03-22 14:21:29] Step: 694, Training Logs: loss_final: 2.283683, loss_mean: 0.941871, proj_loss: -0.071775, loss_mean_cls: 1.413587, grad_norm: 5.876876 +Steps: 0%| | 695/1000000 [02:53<67:46:09, 4.10it/s, grad_norm=5.88, loss_final=2.28, loss_mean=0.942, loss_mean_cls=1.41, proj_loss=-0.0718][2026-03-22 14:21:30] Step: 695, Training Logs: loss_final: 2.163754, loss_mean: 0.962224, proj_loss: -0.074077, loss_mean_cls: 1.275607, grad_norm: 3.273260 +Steps: 0%| | 696/1000000 [02:53<67:46:40, 4.10it/s, grad_norm=3.27, loss_final=2.16, loss_mean=0.962, loss_mean_cls=1.28, proj_loss=-0.0741][2026-03-22 14:21:30] Step: 696, Training Logs: loss_final: 2.178030, loss_mean: 0.977744, proj_loss: -0.072686, loss_mean_cls: 1.272971, grad_norm: 7.109209 +Steps: 0%| | 697/1000000 [02:53<67:47:01, 4.10it/s, grad_norm=7.11, loss_final=2.18, loss_mean=0.978, loss_mean_cls=1.27, proj_loss=-0.0727][2026-03-22 14:21:30] Step: 697, Training Logs: loss_final: 2.275244, loss_mean: 0.998510, proj_loss: -0.074707, loss_mean_cls: 1.351441, grad_norm: 6.447857 +Steps: 0%| | 698/1000000 [02:53<67:47:59, 4.09it/s, grad_norm=6.45, loss_final=2.28, loss_mean=0.999, loss_mean_cls=1.35, proj_loss=-0.0747][2026-03-22 14:21:30] Step: 698, Training Logs: loss_final: 2.192010, loss_mean: 0.978807, proj_loss: -0.072544, loss_mean_cls: 1.285747, grad_norm: 5.079141 +Steps: 0%| | 699/1000000 [02:54<67:47:06, 4.10it/s, grad_norm=5.08, loss_final=2.19, loss_mean=0.979, loss_mean_cls=1.29, proj_loss=-0.0725][2026-03-22 14:21:31] Step: 699, Training Logs: loss_final: 2.112041, loss_mean: 0.992527, proj_loss: -0.076057, loss_mean_cls: 1.195571, grad_norm: 4.924953 +Steps: 0%| | 700/1000000 [02:54<67:50:50, 4.09it/s, grad_norm=4.92, loss_final=2.11, loss_mean=0.993, loss_mean_cls=1.2, proj_loss=-0.0761][2026-03-22 14:21:31] Step: 700, Training Logs: loss_final: 2.231241, loss_mean: 0.968876, proj_loss: -0.074425, loss_mean_cls: 1.336790, grad_norm: 5.967338 +Steps: 0%| | 701/1000000 [02:54<67:49:33, 4.09it/s, grad_norm=5.97, loss_final=2.23, loss_mean=0.969, loss_mean_cls=1.34, proj_loss=-0.0744][2026-03-22 14:21:31] Step: 701, Training Logs: loss_final: 2.196524, loss_mean: 0.983769, proj_loss: -0.071350, loss_mean_cls: 1.284105, grad_norm: 3.530261 +Steps: 0%| | 702/1000000 [02:54<67:48:31, 4.09it/s, grad_norm=3.53, loss_final=2.2, loss_mean=0.984, loss_mean_cls=1.28, proj_loss=-0.0713][2026-03-22 14:21:31] Step: 702, Training Logs: loss_final: 2.161765, loss_mean: 0.959085, proj_loss: -0.071409, loss_mean_cls: 1.274089, grad_norm: 3.786325 +Steps: 0%| | 703/1000000 [02:55<67:48:57, 4.09it/s, grad_norm=3.79, loss_final=2.16, loss_mean=0.959, loss_mean_cls=1.27, proj_loss=-0.0714][2026-03-22 14:21:32] Step: 703, Training Logs: loss_final: 2.150702, loss_mean: 0.989297, proj_loss: -0.076895, loss_mean_cls: 1.238300, grad_norm: 4.503509 +Steps: 0%| | 704/1000000 [02:55<67:50:19, 4.09it/s, grad_norm=4.5, loss_final=2.15, loss_mean=0.989, loss_mean_cls=1.24, proj_loss=-0.0769][2026-03-22 14:21:32] Step: 704, Training Logs: loss_final: 2.129473, loss_mean: 0.965510, proj_loss: -0.076549, loss_mean_cls: 1.240512, grad_norm: 
3.080023 +Steps: 0%| | 705/1000000 [02:55<67:51:29, 4.09it/s, grad_norm=3.08, loss_final=2.13, loss_mean=0.966, loss_mean_cls=1.24, proj_loss=-0.0765][2026-03-22 14:21:32] Step: 705, Training Logs: loss_final: 2.218551, loss_mean: 0.951262, proj_loss: -0.077259, loss_mean_cls: 1.344548, grad_norm: 3.242213 +Steps: 0%| | 706/1000000 [02:55<67:50:57, 4.09it/s, grad_norm=3.24, loss_final=2.22, loss_mean=0.951, loss_mean_cls=1.34, proj_loss=-0.0773][2026-03-22 14:21:32] Step: 706, Training Logs: loss_final: 2.147186, loss_mean: 0.958186, proj_loss: -0.072647, loss_mean_cls: 1.261646, grad_norm: 3.281579 +Steps: 0%| | 707/1000000 [02:56<67:50:01, 4.09it/s, grad_norm=3.28, loss_final=2.15, loss_mean=0.958, loss_mean_cls=1.26, proj_loss=-0.0726][2026-03-22 14:21:33] Step: 707, Training Logs: loss_final: 2.127967, loss_mean: 0.971554, proj_loss: -0.076988, loss_mean_cls: 1.233401, grad_norm: 4.747355 +Steps: 0%| | 708/1000000 [02:56<68:00:19, 4.08it/s, grad_norm=4.75, loss_final=2.13, loss_mean=0.972, loss_mean_cls=1.23, proj_loss=-0.077][2026-03-22 14:21:33] Step: 708, Training Logs: loss_final: 1.981470, loss_mean: 0.964095, proj_loss: -0.077228, loss_mean_cls: 1.094602, grad_norm: 4.136192 +Steps: 0%| | 709/1000000 [02:56<68:03:47, 4.08it/s, grad_norm=4.14, loss_final=1.98, loss_mean=0.964, loss_mean_cls=1.09, proj_loss=-0.0772][2026-03-22 14:21:33] Step: 709, Training Logs: loss_final: 2.196379, loss_mean: 0.961497, proj_loss: -0.073046, loss_mean_cls: 1.307929, grad_norm: 3.816951 +Steps: 0%| | 710/1000000 [02:56<68:01:01, 4.08it/s, grad_norm=3.82, loss_final=2.2, loss_mean=0.961, loss_mean_cls=1.31, proj_loss=-0.073][2026-03-22 14:21:33] Step: 710, Training Logs: loss_final: 2.081352, loss_mean: 0.972363, proj_loss: -0.073581, loss_mean_cls: 1.182571, grad_norm: 5.299668 +Steps: 0%| | 711/1000000 [02:57<67:57:55, 4.08it/s, grad_norm=5.3, loss_final=2.08, loss_mean=0.972, loss_mean_cls=1.18, proj_loss=-0.0736][2026-03-22 14:21:34] Step: 711, Training Logs: loss_final: 2.146814, loss_mean: 0.965630, proj_loss: -0.073639, loss_mean_cls: 1.254824, grad_norm: 4.937774 +Steps: 0%| | 712/1000000 [02:57<67:57:36, 4.08it/s, grad_norm=4.94, loss_final=2.15, loss_mean=0.966, loss_mean_cls=1.25, proj_loss=-0.0736][2026-03-22 14:21:34] Step: 712, Training Logs: loss_final: 2.138308, loss_mean: 0.946127, proj_loss: -0.076371, loss_mean_cls: 1.268552, grad_norm: 4.172966 +Steps: 0%| | 713/1000000 [02:57<67:56:12, 4.09it/s, grad_norm=4.17, loss_final=2.14, loss_mean=0.946, loss_mean_cls=1.27, proj_loss=-0.0764][2026-03-22 14:21:34] Step: 713, Training Logs: loss_final: 2.140288, loss_mean: 0.961059, proj_loss: -0.075995, loss_mean_cls: 1.255224, grad_norm: 3.954709 +Steps: 0%| | 714/1000000 [02:57<67:55:31, 4.09it/s, grad_norm=3.95, loss_final=2.14, loss_mean=0.961, loss_mean_cls=1.26, proj_loss=-0.076][2026-03-22 14:21:34] Step: 714, Training Logs: loss_final: 2.129333, loss_mean: 0.974343, proj_loss: -0.073525, loss_mean_cls: 1.228514, grad_norm: 5.889304 +Steps: 0%| | 715/1000000 [02:58<67:54:08, 4.09it/s, grad_norm=5.89, loss_final=2.13, loss_mean=0.974, loss_mean_cls=1.23, proj_loss=-0.0735][2026-03-22 14:21:35] Step: 715, Training Logs: loss_final: 2.114133, loss_mean: 0.970782, proj_loss: -0.073824, loss_mean_cls: 1.217175, grad_norm: 4.233385 +Steps: 0%| | 716/1000000 [02:58<67:55:54, 4.09it/s, grad_norm=4.23, loss_final=2.11, loss_mean=0.971, loss_mean_cls=1.22, proj_loss=-0.0738][2026-03-22 14:21:35] Step: 716, Training Logs: loss_final: 2.118987, loss_mean: 0.957172, proj_loss: -0.079722, 
loss_mean_cls: 1.241537, grad_norm: 4.460123 +Steps: 0%| | 717/1000000 [02:58<67:55:16, 4.09it/s, grad_norm=4.46, loss_final=2.12, loss_mean=0.957, loss_mean_cls=1.24, proj_loss=-0.0797][2026-03-22 14:21:35] Step: 717, Training Logs: loss_final: 2.056143, loss_mean: 0.993793, proj_loss: -0.075314, loss_mean_cls: 1.137665, grad_norm: 3.957570 +Steps: 0%| | 718/1000000 [02:58<67:53:03, 4.09it/s, grad_norm=3.96, loss_final=2.06, loss_mean=0.994, loss_mean_cls=1.14, proj_loss=-0.0753][2026-03-22 14:21:35] Step: 718, Training Logs: loss_final: 2.127090, loss_mean: 0.968277, proj_loss: -0.076237, loss_mean_cls: 1.235050, grad_norm: 4.046544 +Steps: 0%| | 719/1000000 [02:59<67:52:57, 4.09it/s, grad_norm=4.05, loss_final=2.13, loss_mean=0.968, loss_mean_cls=1.24, proj_loss=-0.0762][2026-03-22 14:21:36] Step: 719, Training Logs: loss_final: 2.139192, loss_mean: 0.953130, proj_loss: -0.075836, loss_mean_cls: 1.261898, grad_norm: 3.764421 +Steps: 0%| | 720/1000000 [02:59<67:54:31, 4.09it/s, grad_norm=3.76, loss_final=2.14, loss_mean=0.953, loss_mean_cls=1.26, proj_loss=-0.0758][2026-03-22 14:21:36] Step: 720, Training Logs: loss_final: 2.140062, loss_mean: 0.966494, proj_loss: -0.078804, loss_mean_cls: 1.252372, grad_norm: 4.384556 +Steps: 0%| | 721/1000000 [02:59<67:54:54, 4.09it/s, grad_norm=4.38, loss_final=2.14, loss_mean=0.966, loss_mean_cls=1.25, proj_loss=-0.0788][2026-03-22 14:21:36] Step: 721, Training Logs: loss_final: 2.246375, loss_mean: 0.948907, proj_loss: -0.073090, loss_mean_cls: 1.370558, grad_norm: 3.405907 +Steps: 0%| | 722/1000000 [02:59<67:54:14, 4.09it/s, grad_norm=3.41, loss_final=2.25, loss_mean=0.949, loss_mean_cls=1.37, proj_loss=-0.0731][2026-03-22 14:21:36] Step: 722, Training Logs: loss_final: 2.110936, loss_mean: 0.972921, proj_loss: -0.077670, loss_mean_cls: 1.215685, grad_norm: 5.028756 +Steps: 0%| | 723/1000000 [03:00<67:53:45, 4.09it/s, grad_norm=5.03, loss_final=2.11, loss_mean=0.973, loss_mean_cls=1.22, proj_loss=-0.0777][2026-03-22 14:21:36] Step: 723, Training Logs: loss_final: 2.125809, loss_mean: 0.982598, proj_loss: -0.075631, loss_mean_cls: 1.218842, grad_norm: 3.104342 +Steps: 0%| | 724/1000000 [03:00<67:51:48, 4.09it/s, grad_norm=3.1, loss_final=2.13, loss_mean=0.983, loss_mean_cls=1.22, proj_loss=-0.0756][2026-03-22 14:21:37] Step: 724, Training Logs: loss_final: 2.163781, loss_mean: 0.957594, proj_loss: -0.076887, loss_mean_cls: 1.283073, grad_norm: 3.150593 +Steps: 0%| | 725/1000000 [03:00<67:50:17, 4.09it/s, grad_norm=3.15, loss_final=2.16, loss_mean=0.958, loss_mean_cls=1.28, proj_loss=-0.0769][2026-03-22 14:21:37] Step: 725, Training Logs: loss_final: 2.209868, loss_mean: 0.950752, proj_loss: -0.073700, loss_mean_cls: 1.332816, grad_norm: 3.076938 +Steps: 0%| | 726/1000000 [03:00<67:48:59, 4.09it/s, grad_norm=3.08, loss_final=2.21, loss_mean=0.951, loss_mean_cls=1.33, proj_loss=-0.0737][2026-03-22 14:21:37] Step: 726, Training Logs: loss_final: 2.177760, loss_mean: 0.952258, proj_loss: -0.076659, loss_mean_cls: 1.302162, grad_norm: 3.814282 +Steps: 0%| | 727/1000000 [03:01<67:48:42, 4.09it/s, grad_norm=3.81, loss_final=2.18, loss_mean=0.952, loss_mean_cls=1.3, proj_loss=-0.0767][2026-03-22 14:21:37] Step: 727, Training Logs: loss_final: 1.990267, loss_mean: 0.967493, proj_loss: -0.076671, loss_mean_cls: 1.099445, grad_norm: 3.253520 +Steps: 0%| | 728/1000000 [03:01<67:48:22, 4.09it/s, grad_norm=3.25, loss_final=1.99, loss_mean=0.967, loss_mean_cls=1.1, proj_loss=-0.0767][2026-03-22 14:21:38] Step: 728, Training Logs: loss_final: 2.165018, loss_mean: 
0.956352, proj_loss: -0.075472, loss_mean_cls: 1.284138, grad_norm: 3.806964 +Steps: 0%| | 729/1000000 [03:01<67:51:04, 4.09it/s, grad_norm=3.81, loss_final=2.17, loss_mean=0.956, loss_mean_cls=1.28, proj_loss=-0.0755][2026-03-22 14:21:38] Step: 729, Training Logs: loss_final: 2.030330, loss_mean: 0.981945, proj_loss: -0.075810, loss_mean_cls: 1.124195, grad_norm: 4.138080 +Steps: 0%| | 730/1000000 [03:01<67:50:51, 4.09it/s, grad_norm=4.14, loss_final=2.03, loss_mean=0.982, loss_mean_cls=1.12, proj_loss=-0.0758][2026-03-22 14:21:38] Step: 730, Training Logs: loss_final: 2.090206, loss_mean: 0.961550, proj_loss: -0.079215, loss_mean_cls: 1.207871, grad_norm: 4.092400 +Steps: 0%| | 731/1000000 [03:01<67:49:03, 4.09it/s, grad_norm=4.09, loss_final=2.09, loss_mean=0.962, loss_mean_cls=1.21, proj_loss=-0.0792][2026-03-22 14:21:38] Step: 731, Training Logs: loss_final: 2.093230, loss_mean: 0.977331, proj_loss: -0.076630, loss_mean_cls: 1.192530, grad_norm: 4.857824 +Steps: 0%| | 732/1000000 [03:02<67:49:30, 4.09it/s, grad_norm=4.86, loss_final=2.09, loss_mean=0.977, loss_mean_cls=1.19, proj_loss=-0.0766][2026-03-22 14:21:39] Step: 732, Training Logs: loss_final: 2.078781, loss_mean: 0.940616, proj_loss: -0.077090, loss_mean_cls: 1.215255, grad_norm: 4.426333 +Steps: 0%| | 733/1000000 [03:02<67:49:44, 4.09it/s, grad_norm=4.43, loss_final=2.08, loss_mean=0.941, loss_mean_cls=1.22, proj_loss=-0.0771][2026-03-22 14:21:39] Step: 733, Training Logs: loss_final: 2.006384, loss_mean: 0.981071, proj_loss: -0.078026, loss_mean_cls: 1.103339, grad_norm: 4.152643 +Steps: 0%| | 734/1000000 [03:02<67:49:25, 4.09it/s, grad_norm=4.15, loss_final=2.01, loss_mean=0.981, loss_mean_cls=1.1, proj_loss=-0.078][2026-03-22 14:21:39] Step: 734, Training Logs: loss_final: 2.180797, loss_mean: 0.940472, proj_loss: -0.078574, loss_mean_cls: 1.318898, grad_norm: 3.070439 +Steps: 0%| | 735/1000000 [03:02<67:49:34, 4.09it/s, grad_norm=3.07, loss_final=2.18, loss_mean=0.94, loss_mean_cls=1.32, proj_loss=-0.0786][2026-03-22 14:21:39] Step: 735, Training Logs: loss_final: 2.152138, loss_mean: 0.944302, proj_loss: -0.080290, loss_mean_cls: 1.288127, grad_norm: 2.453887 +Steps: 0%| | 736/1000000 [03:03<67:50:47, 4.09it/s, grad_norm=2.45, loss_final=2.15, loss_mean=0.944, loss_mean_cls=1.29, proj_loss=-0.0803][2026-03-22 14:21:40] Step: 736, Training Logs: loss_final: 2.039553, loss_mean: 0.985318, proj_loss: -0.078632, loss_mean_cls: 1.132867, grad_norm: 3.771309 +Steps: 0%| | 737/1000000 [03:03<67:49:34, 4.09it/s, grad_norm=3.77, loss_final=2.04, loss_mean=0.985, loss_mean_cls=1.13, proj_loss=-0.0786][2026-03-22 14:21:40] Step: 737, Training Logs: loss_final: 2.142627, loss_mean: 0.961491, proj_loss: -0.074055, loss_mean_cls: 1.255191, grad_norm: 2.645691 +Steps: 0%| | 738/1000000 [03:03<67:50:55, 4.09it/s, grad_norm=2.65, loss_final=2.14, loss_mean=0.961, loss_mean_cls=1.26, proj_loss=-0.0741][2026-03-22 14:21:40] Step: 738, Training Logs: loss_final: 2.055294, loss_mean: 0.962025, proj_loss: -0.075889, loss_mean_cls: 1.169158, grad_norm: 3.663109 +Steps: 0%| | 739/1000000 [03:03<67:50:17, 4.09it/s, grad_norm=3.66, loss_final=2.06, loss_mean=0.962, loss_mean_cls=1.17, proj_loss=-0.0759][2026-03-22 14:21:40] Step: 739, Training Logs: loss_final: 2.094603, loss_mean: 0.951881, proj_loss: -0.076665, loss_mean_cls: 1.219387, grad_norm: 4.146465 +Steps: 0%| | 740/1000000 [03:04<67:50:38, 4.09it/s, grad_norm=4.15, loss_final=2.09, loss_mean=0.952, loss_mean_cls=1.22, proj_loss=-0.0767][2026-03-22 14:21:41] Step: 740, Training Logs: 
loss_final: 2.034490, loss_mean: 0.958788, proj_loss: -0.079862, loss_mean_cls: 1.155564, grad_norm: 2.266011 +Steps: 0%| | 741/1000000 [03:04<67:48:46, 4.09it/s, grad_norm=2.27, loss_final=2.03, loss_mean=0.959, loss_mean_cls=1.16, proj_loss=-0.0799][2026-03-22 14:21:41] Step: 741, Training Logs: loss_final: 2.064063, loss_mean: 0.989329, proj_loss: -0.080465, loss_mean_cls: 1.155198, grad_norm: 5.295407 +Steps: 0%| | 742/1000000 [03:04<67:49:46, 4.09it/s, grad_norm=5.3, loss_final=2.06, loss_mean=0.989, loss_mean_cls=1.16, proj_loss=-0.0805][2026-03-22 14:21:41] Step: 742, Training Logs: loss_final: 2.038242, loss_mean: 0.982801, proj_loss: -0.076564, loss_mean_cls: 1.132004, grad_norm: 3.931806 +Steps: 0%| | 743/1000000 [03:04<67:48:21, 4.09it/s, grad_norm=3.93, loss_final=2.04, loss_mean=0.983, loss_mean_cls=1.13, proj_loss=-0.0766][2026-03-22 14:21:41] Step: 743, Training Logs: loss_final: 2.097609, loss_mean: 0.968587, proj_loss: -0.080275, loss_mean_cls: 1.209297, grad_norm: 4.173186 +Steps: 0%| | 744/1000000 [03:05<67:50:47, 4.09it/s, grad_norm=4.17, loss_final=2.1, loss_mean=0.969, loss_mean_cls=1.21, proj_loss=-0.0803][2026-03-22 14:21:42] Step: 744, Training Logs: loss_final: 2.060798, loss_mean: 0.971172, proj_loss: -0.079511, loss_mean_cls: 1.169137, grad_norm: 4.974647 +Steps: 0%| | 745/1000000 [03:05<67:49:26, 4.09it/s, grad_norm=4.97, loss_final=2.06, loss_mean=0.971, loss_mean_cls=1.17, proj_loss=-0.0795][2026-03-22 14:21:42] Step: 745, Training Logs: loss_final: 2.045893, loss_mean: 0.968805, proj_loss: -0.077565, loss_mean_cls: 1.154652, grad_norm: 3.926522 +Steps: 0%| | 746/1000000 [03:05<67:49:39, 4.09it/s, grad_norm=3.93, loss_final=2.05, loss_mean=0.969, loss_mean_cls=1.15, proj_loss=-0.0776][2026-03-22 14:21:42] Step: 746, Training Logs: loss_final: 2.061150, loss_mean: 0.965649, proj_loss: -0.081957, loss_mean_cls: 1.177458, grad_norm: 3.140280 +Steps: 0%| | 747/1000000 [03:05<67:47:51, 4.09it/s, grad_norm=3.14, loss_final=2.06, loss_mean=0.966, loss_mean_cls=1.18, proj_loss=-0.082][2026-03-22 14:21:42] Step: 747, Training Logs: loss_final: 2.127262, loss_mean: 0.964007, proj_loss: -0.080160, loss_mean_cls: 1.243415, grad_norm: 3.476040 +Steps: 0%| | 748/1000000 [03:06<67:48:27, 4.09it/s, grad_norm=3.48, loss_final=2.13, loss_mean=0.964, loss_mean_cls=1.24, proj_loss=-0.0802][2026-03-22 14:21:43] Step: 748, Training Logs: loss_final: 2.137519, loss_mean: 0.969788, proj_loss: -0.081210, loss_mean_cls: 1.248941, grad_norm: 4.347077 +Steps: 0%| | 749/1000000 [03:06<67:58:14, 4.08it/s, grad_norm=4.35, loss_final=2.14, loss_mean=0.97, loss_mean_cls=1.25, proj_loss=-0.0812][2026-03-22 14:21:43] Step: 749, Training Logs: loss_final: 2.156271, loss_mean: 0.964935, proj_loss: -0.076827, loss_mean_cls: 1.268163, grad_norm: 3.428306 +Steps: 0%| | 750/1000000 [03:06<67:54:07, 4.09it/s, grad_norm=3.43, loss_final=2.16, loss_mean=0.965, loss_mean_cls=1.27, proj_loss=-0.0768][2026-03-22 14:21:43] Step: 750, Training Logs: loss_final: 2.036618, loss_mean: 0.980715, proj_loss: -0.080174, loss_mean_cls: 1.136078, grad_norm: 3.763176 +Steps: 0%| | 751/1000000 [03:06<67:52:52, 4.09it/s, grad_norm=3.76, loss_final=2.04, loss_mean=0.981, loss_mean_cls=1.14, proj_loss=-0.0802][2026-03-22 14:21:43] Step: 751, Training Logs: loss_final: 2.169140, loss_mean: 0.946604, proj_loss: -0.082237, loss_mean_cls: 1.304773, grad_norm: 4.555301 +Steps: 0%| | 752/1000000 [03:07<67:51:32, 4.09it/s, grad_norm=4.56, loss_final=2.17, loss_mean=0.947, loss_mean_cls=1.3, proj_loss=-0.0822][2026-03-22 
14:21:44] Step: 752, Training Logs: loss_final: 2.093178, loss_mean: 0.953881, proj_loss: -0.082109, loss_mean_cls: 1.221405, grad_norm: 2.679973 +Steps: 0%| | 753/1000000 [03:07<68:30:13, 4.05it/s, grad_norm=2.68, loss_final=2.09, loss_mean=0.954, loss_mean_cls=1.22, proj_loss=-0.0821][2026-03-22 14:21:44] Step: 753, Training Logs: loss_final: 2.024928, loss_mean: 0.997641, proj_loss: -0.081599, loss_mean_cls: 1.108886, grad_norm: 4.680913 +Steps: 0%| | 754/1000000 [03:07<68:30:49, 4.05it/s, grad_norm=4.68, loss_final=2.02, loss_mean=0.998, loss_mean_cls=1.11, proj_loss=-0.0816][2026-03-22 14:21:44] Step: 754, Training Logs: loss_final: 1.946205, loss_mean: 0.990580, proj_loss: -0.084465, loss_mean_cls: 1.040091, grad_norm: 3.221764 +Steps: 0%| | 755/1000000 [03:07<69:28:19, 4.00it/s, grad_norm=3.22, loss_final=1.95, loss_mean=0.991, loss_mean_cls=1.04, proj_loss=-0.0845][2026-03-22 14:21:44] Step: 755, Training Logs: loss_final: 2.086390, loss_mean: 0.952850, proj_loss: -0.082882, loss_mean_cls: 1.216422, grad_norm: 2.996629 +Steps: 0%| | 756/1000000 [03:08<68:59:42, 4.02it/s, grad_norm=3, loss_final=2.09, loss_mean=0.953, loss_mean_cls=1.22, proj_loss=-0.0829][2026-03-22 14:21:45] Step: 756, Training Logs: loss_final: 2.098153, loss_mean: 0.984476, proj_loss: -0.081680, loss_mean_cls: 1.195357, grad_norm: 6.519622 +Steps: 0%| | 757/1000000 [03:08<68:38:27, 4.04it/s, grad_norm=6.52, loss_final=2.1, loss_mean=0.984, loss_mean_cls=1.2, proj_loss=-0.0817][2026-03-22 14:21:45] Step: 757, Training Logs: loss_final: 1.977382, loss_mean: 0.990188, proj_loss: -0.082457, loss_mean_cls: 1.069651, grad_norm: 4.507527 +Steps: 0%| | 758/1000000 [03:08<68:21:53, 4.06it/s, grad_norm=4.51, loss_final=1.98, loss_mean=0.99, loss_mean_cls=1.07, proj_loss=-0.0825][2026-03-22 14:21:45] Step: 758, Training Logs: loss_final: 2.109636, loss_mean: 0.983170, proj_loss: -0.082280, loss_mean_cls: 1.208746, grad_norm: 5.347106 +Steps: 0%| | 759/1000000 [03:08<68:13:15, 4.07it/s, grad_norm=5.35, loss_final=2.11, loss_mean=0.983, loss_mean_cls=1.21, proj_loss=-0.0823][2026-03-22 14:21:45] Step: 759, Training Logs: loss_final: 2.037825, loss_mean: 0.991533, proj_loss: -0.083377, loss_mean_cls: 1.129670, grad_norm: 5.760731 +Steps: 0%| | 760/1000000 [03:09<68:06:05, 4.08it/s, grad_norm=5.76, loss_final=2.04, loss_mean=0.992, loss_mean_cls=1.13, proj_loss=-0.0834][2026-03-22 14:21:46] Step: 760, Training Logs: loss_final: 2.127241, loss_mean: 0.952974, proj_loss: -0.081061, loss_mean_cls: 1.255327, grad_norm: 3.351038 +Steps: 0%| | 761/1000000 [03:09<68:01:17, 4.08it/s, grad_norm=3.35, loss_final=2.13, loss_mean=0.953, loss_mean_cls=1.26, proj_loss=-0.0811][2026-03-22 14:21:46] Step: 761, Training Logs: loss_final: 2.045956, loss_mean: 0.962397, proj_loss: -0.084792, loss_mean_cls: 1.168352, grad_norm: 3.220066 +Steps: 0%| | 762/1000000 [03:09<67:58:15, 4.08it/s, grad_norm=3.22, loss_final=2.05, loss_mean=0.962, loss_mean_cls=1.17, proj_loss=-0.0848][2026-03-22 14:21:46] Step: 762, Training Logs: loss_final: 2.065463, loss_mean: 0.966644, proj_loss: -0.087263, loss_mean_cls: 1.186082, grad_norm: 4.436079 +Steps: 0%| | 763/1000000 [03:09<67:56:21, 4.09it/s, grad_norm=4.44, loss_final=2.07, loss_mean=0.967, loss_mean_cls=1.19, proj_loss=-0.0873][2026-03-22 14:21:46] Step: 763, Training Logs: loss_final: 2.096690, loss_mean: 0.962959, proj_loss: -0.086541, loss_mean_cls: 1.220273, grad_norm: 3.414900 +Steps: 0%| | 764/1000000 [03:10<68:35:54, 4.05it/s, grad_norm=3.41, loss_final=2.1, loss_mean=0.963, loss_mean_cls=1.22, 
proj_loss=-0.0865][2026-03-22 14:21:47] Step: 764, Training Logs: loss_final: 2.098323, loss_mean: 0.969684, proj_loss: -0.083416, loss_mean_cls: 1.212055, grad_norm: 5.406796 +Steps: 0%| | 765/1000000 [03:10<68:20:18, 4.06it/s, grad_norm=5.41, loss_final=2.1, loss_mean=0.97, loss_mean_cls=1.21, proj_loss=-0.0834][2026-03-22 14:21:47] Step: 765, Training Logs: loss_final: 2.017545, loss_mean: 0.967299, proj_loss: -0.087842, loss_mean_cls: 1.138089, grad_norm: 3.689546 +Steps: 0%| | 766/1000000 [03:10<68:11:38, 4.07it/s, grad_norm=3.69, loss_final=2.02, loss_mean=0.967, loss_mean_cls=1.14, proj_loss=-0.0878][2026-03-22 14:21:47] Step: 766, Training Logs: loss_final: 1.991996, loss_mean: 0.978144, proj_loss: -0.085273, loss_mean_cls: 1.099126, grad_norm: 4.146812 +Steps: 0%| | 767/1000000 [03:10<68:04:58, 4.08it/s, grad_norm=4.15, loss_final=1.99, loss_mean=0.978, loss_mean_cls=1.1, proj_loss=-0.0853][2026-03-22 14:21:47] Step: 767, Training Logs: loss_final: 2.026615, loss_mean: 0.981765, proj_loss: -0.086002, loss_mean_cls: 1.130852, grad_norm: 3.919865 +Steps: 0%| | 768/1000000 [03:11<68:04:26, 4.08it/s, grad_norm=3.92, loss_final=2.03, loss_mean=0.982, loss_mean_cls=1.13, proj_loss=-0.086][2026-03-22 14:21:48] Step: 768, Training Logs: loss_final: 2.193405, loss_mean: 0.938825, proj_loss: -0.088275, loss_mean_cls: 1.342855, grad_norm: 3.062545 +Steps: 0%| | 769/1000000 [03:11<68:00:14, 4.08it/s, grad_norm=3.06, loss_final=2.19, loss_mean=0.939, loss_mean_cls=1.34, proj_loss=-0.0883][2026-03-22 14:21:48] Step: 769, Training Logs: loss_final: 2.132828, loss_mean: 0.960900, proj_loss: -0.088710, loss_mean_cls: 1.260638, grad_norm: 6.332769 +Steps: 0%| | 770/1000000 [03:11<67:57:42, 4.08it/s, grad_norm=6.33, loss_final=2.13, loss_mean=0.961, loss_mean_cls=1.26, proj_loss=-0.0887][2026-03-22 14:21:48] Step: 770, Training Logs: loss_final: 2.087386, loss_mean: 0.964220, proj_loss: -0.087804, loss_mean_cls: 1.210970, grad_norm: 5.549887 +Steps: 0%| | 771/1000000 [03:11<67:54:45, 4.09it/s, grad_norm=5.55, loss_final=2.09, loss_mean=0.964, loss_mean_cls=1.21, proj_loss=-0.0878][2026-03-22 14:21:48] Step: 771, Training Logs: loss_final: 1.987191, loss_mean: 0.999363, proj_loss: -0.088703, loss_mean_cls: 1.076531, grad_norm: 3.782216 +Steps: 0%| | 772/1000000 [03:12<67:56:05, 4.09it/s, grad_norm=3.78, loss_final=1.99, loss_mean=0.999, loss_mean_cls=1.08, proj_loss=-0.0887][2026-03-22 14:21:49] Step: 772, Training Logs: loss_final: 2.241617, loss_mean: 0.952382, proj_loss: -0.087846, loss_mean_cls: 1.377080, grad_norm: 4.634291 +Steps: 0%| | 773/1000000 [03:12<67:55:12, 4.09it/s, grad_norm=4.63, loss_final=2.24, loss_mean=0.952, loss_mean_cls=1.38, proj_loss=-0.0878][2026-03-22 14:21:49] Step: 773, Training Logs: loss_final: 2.166540, loss_mean: 0.948957, proj_loss: -0.087263, loss_mean_cls: 1.304845, grad_norm: 5.136911 +Steps: 0%| | 774/1000000 [03:12<67:52:03, 4.09it/s, grad_norm=5.14, loss_final=2.17, loss_mean=0.949, loss_mean_cls=1.3, proj_loss=-0.0873][2026-03-22 14:21:49] Step: 774, Training Logs: loss_final: 2.097072, loss_mean: 0.967849, proj_loss: -0.092662, loss_mean_cls: 1.221885, grad_norm: 3.352931 +Steps: 0%| | 775/1000000 [03:12<67:49:58, 4.09it/s, grad_norm=3.35, loss_final=2.1, loss_mean=0.968, loss_mean_cls=1.22, proj_loss=-0.0927][2026-03-22 14:21:49] Step: 775, Training Logs: loss_final: 2.176714, loss_mean: 0.955863, proj_loss: -0.087815, loss_mean_cls: 1.308666, grad_norm: 5.390338 +Steps: 0%| | 776/1000000 [03:13<67:51:05, 4.09it/s, grad_norm=5.39, loss_final=2.18, 
loss_mean=0.956, loss_mean_cls=1.31, proj_loss=-0.0878][2026-03-22 14:21:49] Step: 776, Training Logs: loss_final: 2.063912, loss_mean: 0.971337, proj_loss: -0.094997, loss_mean_cls: 1.187572, grad_norm: 5.571620 +Steps: 0%| | 777/1000000 [03:13<67:51:06, 4.09it/s, grad_norm=5.57, loss_final=2.06, loss_mean=0.971, loss_mean_cls=1.19, proj_loss=-0.095][2026-03-22 14:21:50] Step: 777, Training Logs: loss_final: 2.098864, loss_mean: 0.973858, proj_loss: -0.090820, loss_mean_cls: 1.215826, grad_norm: 5.223228 +Steps: 0%| | 778/1000000 [03:13<67:50:54, 4.09it/s, grad_norm=5.22, loss_final=2.1, loss_mean=0.974, loss_mean_cls=1.22, proj_loss=-0.0908][2026-03-22 14:21:50] Step: 778, Training Logs: loss_final: 1.983098, loss_mean: 0.977991, proj_loss: -0.095582, loss_mean_cls: 1.100689, grad_norm: 4.823508 +Steps: 0%| | 779/1000000 [03:13<67:54:53, 4.09it/s, grad_norm=4.82, loss_final=1.98, loss_mean=0.978, loss_mean_cls=1.1, proj_loss=-0.0956][2026-03-22 14:21:50] Step: 779, Training Logs: loss_final: 2.146504, loss_mean: 0.963254, proj_loss: -0.090002, loss_mean_cls: 1.273252, grad_norm: 6.723526 +Steps: 0%| | 780/1000000 [03:14<68:00:17, 4.08it/s, grad_norm=6.72, loss_final=2.15, loss_mean=0.963, loss_mean_cls=1.27, proj_loss=-0.09][2026-03-22 14:21:50] Step: 780, Training Logs: loss_final: 2.123104, loss_mean: 0.960381, proj_loss: -0.095155, loss_mean_cls: 1.257878, grad_norm: 3.432437 +Steps: 0%| | 781/1000000 [03:14<68:02:23, 4.08it/s, grad_norm=3.43, loss_final=2.12, loss_mean=0.96, loss_mean_cls=1.26, proj_loss=-0.0952][2026-03-22 14:21:51] Step: 781, Training Logs: loss_final: 2.105645, loss_mean: 0.944464, proj_loss: -0.092772, loss_mean_cls: 1.253953, grad_norm: 6.483830 +Steps: 0%| | 782/1000000 [03:14<68:03:18, 4.08it/s, grad_norm=6.48, loss_final=2.11, loss_mean=0.944, loss_mean_cls=1.25, proj_loss=-0.0928][2026-03-22 14:21:51] Step: 782, Training Logs: loss_final: 2.143785, loss_mean: 0.980985, proj_loss: -0.096286, loss_mean_cls: 1.259085, grad_norm: 6.377752 +Steps: 0%| | 783/1000000 [03:14<68:03:17, 4.08it/s, grad_norm=6.38, loss_final=2.14, loss_mean=0.981, loss_mean_cls=1.26, proj_loss=-0.0963][2026-03-22 14:21:51] Step: 783, Training Logs: loss_final: 2.247288, loss_mean: 0.943151, proj_loss: -0.091749, loss_mean_cls: 1.395885, grad_norm: 4.521032 +Steps: 0%| | 784/1000000 [03:14<68:06:15, 4.08it/s, grad_norm=4.52, loss_final=2.25, loss_mean=0.943, loss_mean_cls=1.4, proj_loss=-0.0917][2026-03-22 14:21:51] Step: 784, Training Logs: loss_final: 2.195061, loss_mean: 0.942621, proj_loss: -0.095389, loss_mean_cls: 1.347829, grad_norm: 4.747560 +Steps: 0%| | 785/1000000 [03:15<68:05:17, 4.08it/s, grad_norm=4.75, loss_final=2.2, loss_mean=0.943, loss_mean_cls=1.35, proj_loss=-0.0954][2026-03-22 14:21:52] Step: 785, Training Logs: loss_final: 2.106625, loss_mean: 0.982044, proj_loss: -0.099441, loss_mean_cls: 1.224022, grad_norm: 4.509027 +Steps: 0%| | 786/1000000 [03:15<68:08:56, 4.07it/s, grad_norm=4.51, loss_final=2.11, loss_mean=0.982, loss_mean_cls=1.22, proj_loss=-0.0994][2026-03-22 14:21:52] Step: 786, Training Logs: loss_final: 2.014539, loss_mean: 0.980734, proj_loss: -0.099146, loss_mean_cls: 1.132951, grad_norm: 5.158790 +Steps: 0%| | 787/1000000 [03:15<68:03:10, 4.08it/s, grad_norm=5.16, loss_final=2.01, loss_mean=0.981, loss_mean_cls=1.13, proj_loss=-0.0991][2026-03-22 14:21:52] Step: 787, Training Logs: loss_final: 2.035773, loss_mean: 0.965986, proj_loss: -0.100102, loss_mean_cls: 1.169889, grad_norm: 4.829082 +Steps: 0%| | 788/1000000 [03:15<67:59:40, 4.08it/s, 
grad_norm=4.83, loss_final=2.04, loss_mean=0.966, loss_mean_cls=1.17, proj_loss=-0.1][2026-03-22 14:21:52] Step: 788, Training Logs: loss_final: 2.105659, loss_mean: 0.948965, proj_loss: -0.101982, loss_mean_cls: 1.258676, grad_norm: 3.994235 +Steps: 0%| | 789/1000000 [03:16<67:56:20, 4.09it/s, grad_norm=3.99, loss_final=2.11, loss_mean=0.949, loss_mean_cls=1.26, proj_loss=-0.102][2026-03-22 14:21:53] Step: 789, Training Logs: loss_final: 2.195401, loss_mean: 0.947180, proj_loss: -0.098363, loss_mean_cls: 1.346584, grad_norm: 4.402639 +Steps: 0%| | 790/1000000 [03:16<67:53:48, 4.09it/s, grad_norm=4.4, loss_final=2.2, loss_mean=0.947, loss_mean_cls=1.35, proj_loss=-0.0984][2026-03-22 14:21:53] Step: 790, Training Logs: loss_final: 2.040124, loss_mean: 0.980214, proj_loss: -0.102485, loss_mean_cls: 1.162395, grad_norm: 4.354618 +Steps: 0%| | 791/1000000 [03:16<67:54:55, 4.09it/s, grad_norm=4.35, loss_final=2.04, loss_mean=0.98, loss_mean_cls=1.16, proj_loss=-0.102][2026-03-22 14:21:53] Step: 791, Training Logs: loss_final: 2.179226, loss_mean: 0.944505, proj_loss: -0.097487, loss_mean_cls: 1.332209, grad_norm: 3.860151 +Steps: 0%| | 792/1000000 [03:16<67:49:33, 4.09it/s, grad_norm=3.86, loss_final=2.18, loss_mean=0.945, loss_mean_cls=1.33, proj_loss=-0.0975][2026-03-22 14:21:53] Step: 792, Training Logs: loss_final: 2.050296, loss_mean: 0.963929, proj_loss: -0.103783, loss_mean_cls: 1.190150, grad_norm: 4.204041 +Steps: 0%| | 793/1000000 [03:17<67:54:47, 4.09it/s, grad_norm=4.2, loss_final=2.05, loss_mean=0.964, loss_mean_cls=1.19, proj_loss=-0.104][2026-03-22 14:21:54] Step: 793, Training Logs: loss_final: 2.104850, loss_mean: 0.958489, proj_loss: -0.100422, loss_mean_cls: 1.246783, grad_norm: 4.462892 +Steps: 0%| | 794/1000000 [03:17<67:54:47, 4.09it/s, grad_norm=4.46, loss_final=2.1, loss_mean=0.958, loss_mean_cls=1.25, proj_loss=-0.1][2026-03-22 14:21:54] Step: 794, Training Logs: loss_final: 1.910276, loss_mean: 1.008103, proj_loss: -0.107404, loss_mean_cls: 1.009577, grad_norm: 3.363417 +Steps: 0%| | 795/1000000 [03:17<67:56:48, 4.08it/s, grad_norm=3.36, loss_final=1.91, loss_mean=1.01, loss_mean_cls=1.01, proj_loss=-0.107][2026-03-22 14:21:54] Step: 795, Training Logs: loss_final: 2.088057, loss_mean: 0.950988, proj_loss: -0.108278, loss_mean_cls: 1.245347, grad_norm: 3.746659 +Steps: 0%| | 796/1000000 [03:17<67:59:19, 4.08it/s, grad_norm=3.75, loss_final=2.09, loss_mean=0.951, loss_mean_cls=1.25, proj_loss=-0.108][2026-03-22 14:21:54] Step: 796, Training Logs: loss_final: 2.050619, loss_mean: 0.980202, proj_loss: -0.108499, loss_mean_cls: 1.178915, grad_norm: 4.743355 +Steps: 0%| | 797/1000000 [03:18<68:00:19, 4.08it/s, grad_norm=4.74, loss_final=2.05, loss_mean=0.98, loss_mean_cls=1.18, proj_loss=-0.108][2026-03-22 14:21:55] Step: 797, Training Logs: loss_final: 2.044584, loss_mean: 0.971928, proj_loss: -0.109983, loss_mean_cls: 1.182639, grad_norm: 5.733259 +Steps: 0%| | 798/1000000 [03:18<68:01:00, 4.08it/s, grad_norm=5.73, loss_final=2.04, loss_mean=0.972, loss_mean_cls=1.18, proj_loss=-0.11][2026-03-22 14:21:55] Step: 798, Training Logs: loss_final: 2.105774, loss_mean: 0.933882, proj_loss: -0.110648, loss_mean_cls: 1.282541, grad_norm: 3.438566 +Steps: 0%| | 799/1000000 [03:18<68:00:16, 4.08it/s, grad_norm=3.44, loss_final=2.11, loss_mean=0.934, loss_mean_cls=1.28, proj_loss=-0.111][2026-03-22 14:21:55] Step: 799, Training Logs: loss_final: 2.062822, loss_mean: 0.991748, proj_loss: -0.110448, loss_mean_cls: 1.181522, grad_norm: 7.009576 +Steps: 0%| | 800/1000000 
During training, the script prints a tqdm progress bar with per-step metrics (about 4.1 it/s in this run, out of 1M total steps). Each step logs `loss_final`, `loss_mean` (the denoising term), `loss_mean_cls` (the REG class-token term), `proj_loss` (the representation-alignment term, which grows more negative as the hidden projections align with the pretrained representations), and `grad_norm`. In these logs the components sum exactly: `loss_final = loss_mean + loss_mean_cls + proj_loss`. A representative excerpt from the first 1K steps:

```
[2026-03-22 14:21:55] Step: 800, Training Logs: loss_final: 2.056200, loss_mean: 0.974655, proj_loss: -0.114912, loss_mean_cls: 1.196457, grad_norm: 5.366398
[2026-03-22 14:22:20] Step: 900, Training Logs: loss_final: 2.108838, loss_mean: 0.962547, proj_loss: -0.207706, loss_mean_cls: 1.353996, grad_norm: 6.012200
[2026-03-22 14:22:44] Step: 1000, Training Logs: loss_final: 1.828048, loss_mean: 0.966554, proj_loss: -0.269552, loss_mean_cls: 1.131047, grad_norm: 3.546944
```

Over steps 800–1050, `proj_loss` falls steadily from about -0.11 to -0.29 while `loss_final` drifts down from roughly 2.06 to 1.84, so representation alignment improves quickly even this early in training.
loss_final=1.84, loss_mean=0.97, loss_mean_cls=1.15, proj_loss=-0.286][2026-03-22 14:22:57] Step: 1050, Training Logs: loss_final: 1.958815, loss_mean: 0.961375, proj_loss: -0.273897, loss_mean_cls: 1.271336, grad_norm: 4.410818 +Steps: 0%| | 1051/1000000 [04:20<67:46:29, 4.09it/s, grad_norm=4.41, loss_final=1.96, loss_mean=0.961, loss_mean_cls=1.27, proj_loss=-0.274][2026-03-22 14:22:57] Step: 1051, Training Logs: loss_final: 1.917031, loss_mean: 0.974009, proj_loss: -0.281066, loss_mean_cls: 1.224088, grad_norm: 4.691284 +Steps: 0%| | 1052/1000000 [04:20<67:47:56, 4.09it/s, grad_norm=4.69, loss_final=1.92, loss_mean=0.974, loss_mean_cls=1.22, proj_loss=-0.281][2026-03-22 14:22:57] Step: 1052, Training Logs: loss_final: 1.850797, loss_mean: 0.969906, proj_loss: -0.280254, loss_mean_cls: 1.161145, grad_norm: 3.369816 +Steps: 0%| | 1053/1000000 [04:20<67:48:09, 4.09it/s, grad_norm=3.37, loss_final=1.85, loss_mean=0.97, loss_mean_cls=1.16, proj_loss=-0.28][2026-03-22 14:22:57] Step: 1053, Training Logs: loss_final: 1.810939, loss_mean: 0.981444, proj_loss: -0.284868, loss_mean_cls: 1.114363, grad_norm: 3.506300 +Steps: 0%| | 1054/1000000 [04:21<67:47:49, 4.09it/s, grad_norm=3.51, loss_final=1.81, loss_mean=0.981, loss_mean_cls=1.11, proj_loss=-0.285][2026-03-22 14:22:58] Step: 1054, Training Logs: loss_final: 1.710563, loss_mean: 0.964761, proj_loss: -0.291063, loss_mean_cls: 1.036865, grad_norm: 2.255666 +Steps: 0%| | 1055/1000000 [04:21<67:47:46, 4.09it/s, grad_norm=2.26, loss_final=1.71, loss_mean=0.965, loss_mean_cls=1.04, proj_loss=-0.291][2026-03-22 14:22:58] Step: 1055, Training Logs: loss_final: 1.866554, loss_mean: 0.962539, proj_loss: -0.284870, loss_mean_cls: 1.188886, grad_norm: 3.198633 +Steps: 0%| | 1056/1000000 [04:21<67:48:41, 4.09it/s, grad_norm=3.2, loss_final=1.87, loss_mean=0.963, loss_mean_cls=1.19, proj_loss=-0.285][2026-03-22 14:22:58] Step: 1056, Training Logs: loss_final: 1.940348, loss_mean: 0.952442, proj_loss: -0.279127, loss_mean_cls: 1.267034, grad_norm: 3.027682 +Steps: 0%| | 1057/1000000 [04:21<67:48:38, 4.09it/s, grad_norm=3.03, loss_final=1.94, loss_mean=0.952, loss_mean_cls=1.27, proj_loss=-0.279][2026-03-22 14:22:58] Step: 1057, Training Logs: loss_final: 1.818961, loss_mean: 0.954805, proj_loss: -0.286818, loss_mean_cls: 1.150974, grad_norm: 2.561054 +Steps: 0%| | 1058/1000000 [04:22<67:46:38, 4.09it/s, grad_norm=2.56, loss_final=1.82, loss_mean=0.955, loss_mean_cls=1.15, proj_loss=-0.287][2026-03-22 14:22:58] Step: 1058, Training Logs: loss_final: 1.862569, loss_mean: 0.948526, proj_loss: -0.284180, loss_mean_cls: 1.198223, grad_norm: 3.636090 +Steps: 0%| | 1059/1000000 [04:22<67:45:37, 4.10it/s, grad_norm=3.64, loss_final=1.86, loss_mean=0.949, loss_mean_cls=1.2, proj_loss=-0.284][2026-03-22 14:22:59] Step: 1059, Training Logs: loss_final: 1.879918, loss_mean: 0.951368, proj_loss: -0.283310, loss_mean_cls: 1.211860, grad_norm: 2.850493 +Steps: 0%| | 1060/1000000 [04:22<68:36:47, 4.04it/s, grad_norm=2.85, loss_final=1.88, loss_mean=0.951, loss_mean_cls=1.21, proj_loss=-0.283][2026-03-22 14:22:59] Step: 1060, Training Logs: loss_final: 1.838698, loss_mean: 0.969715, proj_loss: -0.289927, loss_mean_cls: 1.158910, grad_norm: 2.352147 +Steps: 0%| | 1061/1000000 [04:22<68:13:27, 4.07it/s, grad_norm=2.35, loss_final=1.84, loss_mean=0.97, loss_mean_cls=1.16, proj_loss=-0.29][2026-03-22 14:22:59] Step: 1061, Training Logs: loss_final: 1.862671, loss_mean: 0.941427, proj_loss: -0.285703, loss_mean_cls: 1.206948, grad_norm: 3.062054 +Steps: 0%| | 1062/1000000 
[04:23<68:07:42, 4.07it/s, grad_norm=3.06, loss_final=1.86, loss_mean=0.941, loss_mean_cls=1.21, proj_loss=-0.286][2026-03-22 14:22:59] Step: 1062, Training Logs: loss_final: 1.961954, loss_mean: 0.933646, proj_loss: -0.269715, loss_mean_cls: 1.298024, grad_norm: 3.089479 +Steps: 0%| | 1063/1000000 [04:23<68:33:43, 4.05it/s, grad_norm=3.09, loss_final=1.96, loss_mean=0.934, loss_mean_cls=1.3, proj_loss=-0.27][2026-03-22 14:23:00] Step: 1063, Training Logs: loss_final: 1.905295, loss_mean: 0.971367, proj_loss: -0.278212, loss_mean_cls: 1.212139, grad_norm: 3.366829 +Steps: 0%| | 1064/1000000 [04:23<68:19:20, 4.06it/s, grad_norm=3.37, loss_final=1.91, loss_mean=0.971, loss_mean_cls=1.21, proj_loss=-0.278][2026-03-22 14:23:00] Step: 1064, Training Logs: loss_final: 1.781446, loss_mean: 0.956354, proj_loss: -0.280933, loss_mean_cls: 1.106025, grad_norm: 3.047469 +Steps: 0%| | 1065/1000000 [04:23<68:09:44, 4.07it/s, grad_norm=3.05, loss_final=1.78, loss_mean=0.956, loss_mean_cls=1.11, proj_loss=-0.281][2026-03-22 14:23:00] Step: 1065, Training Logs: loss_final: 1.857992, loss_mean: 0.963771, proj_loss: -0.283494, loss_mean_cls: 1.177714, grad_norm: 2.480014 +Steps: 0%| | 1066/1000000 [04:24<68:01:28, 4.08it/s, grad_norm=2.48, loss_final=1.86, loss_mean=0.964, loss_mean_cls=1.18, proj_loss=-0.283][2026-03-22 14:23:00] Step: 1066, Training Logs: loss_final: 1.715978, loss_mean: 0.982298, proj_loss: -0.292794, loss_mean_cls: 1.026474, grad_norm: 3.674762 +Steps: 0%| | 1067/1000000 [04:24<67:57:13, 4.08it/s, grad_norm=3.67, loss_final=1.72, loss_mean=0.982, loss_mean_cls=1.03, proj_loss=-0.293][2026-03-22 14:23:01] Step: 1067, Training Logs: loss_final: 1.888247, loss_mean: 0.949244, proj_loss: -0.285220, loss_mean_cls: 1.224222, grad_norm: 3.627043 +Steps: 0%| | 1068/1000000 [04:24<67:52:38, 4.09it/s, grad_norm=3.63, loss_final=1.89, loss_mean=0.949, loss_mean_cls=1.22, proj_loss=-0.285][2026-03-22 14:23:01] Step: 1068, Training Logs: loss_final: 1.882249, loss_mean: 0.961205, proj_loss: -0.287298, loss_mean_cls: 1.208342, grad_norm: 4.405542 +Steps: 0%| | 1069/1000000 [04:24<67:50:49, 4.09it/s, grad_norm=4.41, loss_final=1.88, loss_mean=0.961, loss_mean_cls=1.21, proj_loss=-0.287][2026-03-22 14:23:01] Step: 1069, Training Logs: loss_final: 2.032710, loss_mean: 0.926261, proj_loss: -0.274067, loss_mean_cls: 1.380516, grad_norm: 4.707780 +Steps: 0%| | 1070/1000000 [04:24<67:51:44, 4.09it/s, grad_norm=4.71, loss_final=2.03, loss_mean=0.926, loss_mean_cls=1.38, proj_loss=-0.274][2026-03-22 14:23:01] Step: 1070, Training Logs: loss_final: 1.824550, loss_mean: 0.953830, proj_loss: -0.288405, loss_mean_cls: 1.159125, grad_norm: 2.588824 +Steps: 0%| | 1071/1000000 [04:25<67:52:09, 4.09it/s, grad_norm=2.59, loss_final=1.82, loss_mean=0.954, loss_mean_cls=1.16, proj_loss=-0.288][2026-03-22 14:23:02] Step: 1071, Training Logs: loss_final: 1.744807, loss_mean: 0.948603, proj_loss: -0.289524, loss_mean_cls: 1.085729, grad_norm: 2.453203 +Steps: 0%| | 1072/1000000 [04:25<67:53:47, 4.09it/s, grad_norm=2.45, loss_final=1.74, loss_mean=0.949, loss_mean_cls=1.09, proj_loss=-0.29][2026-03-22 14:23:02] Step: 1072, Training Logs: loss_final: 1.823947, loss_mean: 0.950786, proj_loss: -0.292210, loss_mean_cls: 1.165371, grad_norm: 3.306567 +Steps: 0%| | 1073/1000000 [04:25<67:52:19, 4.09it/s, grad_norm=3.31, loss_final=1.82, loss_mean=0.951, loss_mean_cls=1.17, proj_loss=-0.292][2026-03-22 14:23:02] Step: 1073, Training Logs: loss_final: 1.896544, loss_mean: 0.927081, proj_loss: -0.285010, loss_mean_cls: 1.254473, 
grad_norm: 2.534081 +Steps: 0%| | 1074/1000000 [04:25<67:50:56, 4.09it/s, grad_norm=2.53, loss_final=1.9, loss_mean=0.927, loss_mean_cls=1.25, proj_loss=-0.285][2026-03-22 14:23:02] Step: 1074, Training Logs: loss_final: 1.844604, loss_mean: 0.936409, proj_loss: -0.289250, loss_mean_cls: 1.197445, grad_norm: 3.841146 +Steps: 0%| | 1075/1000000 [04:26<67:49:27, 4.09it/s, grad_norm=3.84, loss_final=1.84, loss_mean=0.936, loss_mean_cls=1.2, proj_loss=-0.289][2026-03-22 14:23:03] Step: 1075, Training Logs: loss_final: 1.896246, loss_mean: 0.932813, proj_loss: -0.288450, loss_mean_cls: 1.251883, grad_norm: 2.097380 +Steps: 0%| | 1076/1000000 [04:26<67:48:35, 4.09it/s, grad_norm=2.1, loss_final=1.9, loss_mean=0.933, loss_mean_cls=1.25, proj_loss=-0.288][2026-03-22 14:23:03] Step: 1076, Training Logs: loss_final: 1.770725, loss_mean: 0.949943, proj_loss: -0.292571, loss_mean_cls: 1.113353, grad_norm: 3.722975 +Steps: 0%| | 1077/1000000 [04:26<67:47:55, 4.09it/s, grad_norm=3.72, loss_final=1.77, loss_mean=0.95, loss_mean_cls=1.11, proj_loss=-0.293][2026-03-22 14:23:03] Step: 1077, Training Logs: loss_final: 1.776622, loss_mean: 0.957672, proj_loss: -0.284642, loss_mean_cls: 1.103592, grad_norm: 2.260784 +Steps: 0%| | 1078/1000000 [04:26<67:46:23, 4.09it/s, grad_norm=2.26, loss_final=1.78, loss_mean=0.958, loss_mean_cls=1.1, proj_loss=-0.285][2026-03-22 14:23:03] Step: 1078, Training Logs: loss_final: 1.707800, loss_mean: 0.955332, proj_loss: -0.291525, loss_mean_cls: 1.043993, grad_norm: 4.538267 +Steps: 0%| | 1079/1000000 [04:27<67:47:26, 4.09it/s, grad_norm=4.54, loss_final=1.71, loss_mean=0.955, loss_mean_cls=1.04, proj_loss=-0.292][2026-03-22 14:23:04] Step: 1079, Training Logs: loss_final: 1.914079, loss_mean: 0.942935, proj_loss: -0.287780, loss_mean_cls: 1.258924, grad_norm: 4.328535 +Steps: 0%| | 1080/1000000 [04:27<67:48:31, 4.09it/s, grad_norm=4.33, loss_final=1.91, loss_mean=0.943, loss_mean_cls=1.26, proj_loss=-0.288][2026-03-22 14:23:04] Step: 1080, Training Logs: loss_final: 1.787886, loss_mean: 0.937629, proj_loss: -0.291115, loss_mean_cls: 1.141371, grad_norm: 2.524561 +Steps: 0%| | 1081/1000000 [04:27<67:48:12, 4.09it/s, grad_norm=2.52, loss_final=1.79, loss_mean=0.938, loss_mean_cls=1.14, proj_loss=-0.291][2026-03-22 14:23:04] Step: 1081, Training Logs: loss_final: 1.766326, loss_mean: 0.970077, proj_loss: -0.292141, loss_mean_cls: 1.088389, grad_norm: 3.544532 +Steps: 0%| | 1082/1000000 [04:27<67:47:58, 4.09it/s, grad_norm=3.54, loss_final=1.77, loss_mean=0.97, loss_mean_cls=1.09, proj_loss=-0.292][2026-03-22 14:23:04] Step: 1082, Training Logs: loss_final: 1.760164, loss_mean: 0.951433, proj_loss: -0.288243, loss_mean_cls: 1.096974, grad_norm: 3.088391 +Steps: 0%| | 1083/1000000 [04:28<67:47:21, 4.09it/s, grad_norm=3.09, loss_final=1.76, loss_mean=0.951, loss_mean_cls=1.1, proj_loss=-0.288][2026-03-22 14:23:05] Step: 1083, Training Logs: loss_final: 1.848181, loss_mean: 0.926183, proj_loss: -0.296443, loss_mean_cls: 1.218442, grad_norm: 2.897361 +Steps: 0%| | 1084/1000000 [04:28<67:49:02, 4.09it/s, grad_norm=2.9, loss_final=1.85, loss_mean=0.926, loss_mean_cls=1.22, proj_loss=-0.296][2026-03-22 14:23:05] Step: 1084, Training Logs: loss_final: 1.943888, loss_mean: 0.923795, proj_loss: -0.284067, loss_mean_cls: 1.304160, grad_norm: 3.961891 +Steps: 0%| | 1085/1000000 [04:28<67:48:28, 4.09it/s, grad_norm=3.96, loss_final=1.94, loss_mean=0.924, loss_mean_cls=1.3, proj_loss=-0.284][2026-03-22 14:23:05] Step: 1085, Training Logs: loss_final: 1.822563, loss_mean: 0.965230, proj_loss: 
-0.293898, loss_mean_cls: 1.151231, grad_norm: 3.427469 +Steps: 0%| | 1086/1000000 [04:28<67:47:33, 4.09it/s, grad_norm=3.43, loss_final=1.82, loss_mean=0.965, loss_mean_cls=1.15, proj_loss=-0.294][2026-03-22 14:23:05] Step: 1086, Training Logs: loss_final: 1.841355, loss_mean: 0.957491, proj_loss: -0.288106, loss_mean_cls: 1.171970, grad_norm: 3.757615 +Steps: 0%| | 1087/1000000 [04:29<67:45:45, 4.09it/s, grad_norm=3.76, loss_final=1.84, loss_mean=0.957, loss_mean_cls=1.17, proj_loss=-0.288][2026-03-22 14:23:06] Step: 1087, Training Logs: loss_final: 1.864559, loss_mean: 0.964263, proj_loss: -0.292090, loss_mean_cls: 1.192386, grad_norm: 3.285673 +Steps: 0%| | 1088/1000000 [04:29<67:47:05, 4.09it/s, grad_norm=3.29, loss_final=1.86, loss_mean=0.964, loss_mean_cls=1.19, proj_loss=-0.292][2026-03-22 14:23:06] Step: 1088, Training Logs: loss_final: 1.943669, loss_mean: 0.935135, proj_loss: -0.287918, loss_mean_cls: 1.296453, grad_norm: 3.423045 +Steps: 0%| | 1089/1000000 [04:29<67:47:23, 4.09it/s, grad_norm=3.42, loss_final=1.94, loss_mean=0.935, loss_mean_cls=1.3, proj_loss=-0.288][2026-03-22 14:23:06] Step: 1089, Training Logs: loss_final: 1.758385, loss_mean: 0.957230, proj_loss: -0.290456, loss_mean_cls: 1.091612, grad_norm: 3.087879 +Steps: 0%| | 1090/1000000 [04:29<67:46:38, 4.09it/s, grad_norm=3.09, loss_final=1.76, loss_mean=0.957, loss_mean_cls=1.09, proj_loss=-0.29][2026-03-22 14:23:06] Step: 1090, Training Logs: loss_final: 1.723512, loss_mean: 0.976143, proj_loss: -0.303097, loss_mean_cls: 1.050466, grad_norm: 2.830979 +Steps: 0%| | 1091/1000000 [04:30<67:46:13, 4.09it/s, grad_norm=2.83, loss_final=1.72, loss_mean=0.976, loss_mean_cls=1.05, proj_loss=-0.303][2026-03-22 14:23:07] Step: 1091, Training Logs: loss_final: 1.816505, loss_mean: 0.945568, proj_loss: -0.293156, loss_mean_cls: 1.164093, grad_norm: 3.168502 +Steps: 0%| | 1092/1000000 [04:30<67:46:29, 4.09it/s, grad_norm=3.17, loss_final=1.82, loss_mean=0.946, loss_mean_cls=1.16, proj_loss=-0.293][2026-03-22 14:23:07] Step: 1092, Training Logs: loss_final: 1.806454, loss_mean: 0.947914, proj_loss: -0.291497, loss_mean_cls: 1.150036, grad_norm: 2.464971 +Steps: 0%| | 1093/1000000 [04:30<67:47:11, 4.09it/s, grad_norm=2.46, loss_final=1.81, loss_mean=0.948, loss_mean_cls=1.15, proj_loss=-0.291][2026-03-22 14:23:07] Step: 1093, Training Logs: loss_final: 1.817385, loss_mean: 0.944553, proj_loss: -0.294051, loss_mean_cls: 1.166883, grad_norm: 2.859681 +Steps: 0%| | 1094/1000000 [04:30<67:47:01, 4.09it/s, grad_norm=2.86, loss_final=1.82, loss_mean=0.945, loss_mean_cls=1.17, proj_loss=-0.294][2026-03-22 14:23:07] Step: 1094, Training Logs: loss_final: 1.773586, loss_mean: 0.943628, proj_loss: -0.292399, loss_mean_cls: 1.122357, grad_norm: 3.250970 +Steps: 0%| | 1095/1000000 [04:31<67:48:10, 4.09it/s, grad_norm=3.25, loss_final=1.77, loss_mean=0.944, loss_mean_cls=1.12, proj_loss=-0.292][2026-03-22 14:23:08] Step: 1095, Training Logs: loss_final: 1.803022, loss_mean: 0.945735, proj_loss: -0.294579, loss_mean_cls: 1.151866, grad_norm: 2.685385 +Steps: 0%| | 1096/1000000 [04:31<67:48:32, 4.09it/s, grad_norm=2.69, loss_final=1.8, loss_mean=0.946, loss_mean_cls=1.15, proj_loss=-0.295][2026-03-22 14:23:08] Step: 1096, Training Logs: loss_final: 1.791841, loss_mean: 0.941796, proj_loss: -0.291457, loss_mean_cls: 1.141502, grad_norm: 3.554270 +Steps: 0%| | 1097/1000000 [04:31<67:47:21, 4.09it/s, grad_norm=3.55, loss_final=1.79, loss_mean=0.942, loss_mean_cls=1.14, proj_loss=-0.291][2026-03-22 14:23:08] Step: 1097, Training Logs: loss_final: 
1.956817, loss_mean: 0.932580, proj_loss: -0.285355, loss_mean_cls: 1.309593, grad_norm: 2.863244 +Steps: 0%| | 1098/1000000 [04:31<67:48:39, 4.09it/s, grad_norm=2.86, loss_final=1.96, loss_mean=0.933, loss_mean_cls=1.31, proj_loss=-0.285][2026-03-22 14:23:08] Step: 1098, Training Logs: loss_final: 1.795945, loss_mean: 0.953687, proj_loss: -0.290292, loss_mean_cls: 1.132550, grad_norm: 2.684406 +Steps: 0%| | 1099/1000000 [04:32<67:46:46, 4.09it/s, grad_norm=2.68, loss_final=1.8, loss_mean=0.954, loss_mean_cls=1.13, proj_loss=-0.29][2026-03-22 14:23:09] Step: 1099, Training Logs: loss_final: 1.835364, loss_mean: 0.942163, proj_loss: -0.300646, loss_mean_cls: 1.193847, grad_norm: 4.100742 +Steps: 0%| | 1100/1000000 [04:32<67:50:02, 4.09it/s, grad_norm=4.1, loss_final=1.84, loss_mean=0.942, loss_mean_cls=1.19, proj_loss=-0.301][2026-03-22 14:23:09] Step: 1100, Training Logs: loss_final: 1.869519, loss_mean: 0.954970, proj_loss: -0.289515, loss_mean_cls: 1.204065, grad_norm: 3.561254 +Steps: 0%| | 1101/1000000 [04:32<67:47:34, 4.09it/s, grad_norm=3.56, loss_final=1.87, loss_mean=0.955, loss_mean_cls=1.2, proj_loss=-0.29][2026-03-22 14:23:09] Step: 1101, Training Logs: loss_final: 1.852451, loss_mean: 0.943706, proj_loss: -0.288283, loss_mean_cls: 1.197029, grad_norm: 2.481205 +Steps: 0%| | 1102/1000000 [04:32<67:47:32, 4.09it/s, grad_norm=2.48, loss_final=1.85, loss_mean=0.944, loss_mean_cls=1.2, proj_loss=-0.288][2026-03-22 14:23:09] Step: 1102, Training Logs: loss_final: 1.834404, loss_mean: 0.949688, proj_loss: -0.297495, loss_mean_cls: 1.182211, grad_norm: 4.720915 +Steps: 0%| | 1103/1000000 [04:33<67:45:57, 4.09it/s, grad_norm=4.72, loss_final=1.83, loss_mean=0.95, loss_mean_cls=1.18, proj_loss=-0.297][2026-03-22 14:23:10] Step: 1103, Training Logs: loss_final: 1.826876, loss_mean: 0.955258, proj_loss: -0.293469, loss_mean_cls: 1.165087, grad_norm: 3.658348 +Steps: 0%| | 1104/1000000 [04:33<67:47:41, 4.09it/s, grad_norm=3.66, loss_final=1.83, loss_mean=0.955, loss_mean_cls=1.17, proj_loss=-0.293][2026-03-22 14:23:10] Step: 1104, Training Logs: loss_final: 1.757542, loss_mean: 0.954733, proj_loss: -0.299570, loss_mean_cls: 1.102379, grad_norm: 2.746080 +Steps: 0%| | 1105/1000000 [04:33<67:47:21, 4.09it/s, grad_norm=2.75, loss_final=1.76, loss_mean=0.955, loss_mean_cls=1.1, proj_loss=-0.3][2026-03-22 14:23:10] Step: 1105, Training Logs: loss_final: 1.703868, loss_mean: 0.943911, proj_loss: -0.301700, loss_mean_cls: 1.061658, grad_norm: 3.201198 +Steps: 0%| | 1106/1000000 [04:33<67:48:42, 4.09it/s, grad_norm=3.2, loss_final=1.7, loss_mean=0.944, loss_mean_cls=1.06, proj_loss=-0.302][2026-03-22 14:23:10] Step: 1106, Training Logs: loss_final: 1.887186, loss_mean: 0.932713, proj_loss: -0.291688, loss_mean_cls: 1.246161, grad_norm: 2.829004 +Steps: 0%| | 1107/1000000 [04:34<67:48:17, 4.09it/s, grad_norm=2.83, loss_final=1.89, loss_mean=0.933, loss_mean_cls=1.25, proj_loss=-0.292][2026-03-22 14:23:10] Step: 1107, Training Logs: loss_final: 1.741190, loss_mean: 0.965882, proj_loss: -0.293991, loss_mean_cls: 1.069300, grad_norm: 3.058988 +Steps: 0%| | 1108/1000000 [04:34<67:49:43, 4.09it/s, grad_norm=3.06, loss_final=1.74, loss_mean=0.966, loss_mean_cls=1.07, proj_loss=-0.294][2026-03-22 14:23:11] Step: 1108, Training Logs: loss_final: 1.797125, loss_mean: 0.948414, proj_loss: -0.296774, loss_mean_cls: 1.145486, grad_norm: 2.294533 +Steps: 0%| | 1109/1000000 [04:34<67:48:45, 4.09it/s, grad_norm=2.29, loss_final=1.8, loss_mean=0.948, loss_mean_cls=1.15, proj_loss=-0.297][2026-03-22 14:23:11] Step: 
1109, Training Logs: loss_final: 1.854430, loss_mean: 0.947069, proj_loss: -0.290994, loss_mean_cls: 1.198355, grad_norm: 3.789261 +Steps: 0%| | 1110/1000000 [04:34<67:49:15, 4.09it/s, grad_norm=3.79, loss_final=1.85, loss_mean=0.947, loss_mean_cls=1.2, proj_loss=-0.291][2026-03-22 14:23:11] Step: 1110, Training Logs: loss_final: 1.865216, loss_mean: 0.959918, proj_loss: -0.289527, loss_mean_cls: 1.194825, grad_norm: 3.877531 +Steps: 0%| | 1111/1000000 [04:35<67:50:04, 4.09it/s, grad_norm=3.88, loss_final=1.87, loss_mean=0.96, loss_mean_cls=1.19, proj_loss=-0.29][2026-03-22 14:23:11] Step: 1111, Training Logs: loss_final: 1.768759, loss_mean: 0.951813, proj_loss: -0.297738, loss_mean_cls: 1.114684, grad_norm: 2.894841 +Steps: 0%| | 1112/1000000 [04:35<67:51:06, 4.09it/s, grad_norm=2.89, loss_final=1.77, loss_mean=0.952, loss_mean_cls=1.11, proj_loss=-0.298][2026-03-22 14:23:12] Step: 1112, Training Logs: loss_final: 1.881774, loss_mean: 0.931876, proj_loss: -0.294887, loss_mean_cls: 1.244785, grad_norm: 3.775355 +Steps: 0%| | 1113/1000000 [04:35<67:50:35, 4.09it/s, grad_norm=3.78, loss_final=1.88, loss_mean=0.932, loss_mean_cls=1.24, proj_loss=-0.295][2026-03-22 14:23:12] Step: 1113, Training Logs: loss_final: 1.793103, loss_mean: 0.933473, proj_loss: -0.294668, loss_mean_cls: 1.154298, grad_norm: 2.512551 +Steps: 0%| | 1114/1000000 [04:35<67:49:56, 4.09it/s, grad_norm=2.51, loss_final=1.79, loss_mean=0.933, loss_mean_cls=1.15, proj_loss=-0.295][2026-03-22 14:23:12] Step: 1114, Training Logs: loss_final: 1.853438, loss_mean: 0.944751, proj_loss: -0.298462, loss_mean_cls: 1.207149, grad_norm: 2.442463 +Steps: 0%| | 1115/1000000 [04:35<67:47:35, 4.09it/s, grad_norm=2.44, loss_final=1.85, loss_mean=0.945, loss_mean_cls=1.21, proj_loss=-0.298][2026-03-22 14:23:12] Step: 1115, Training Logs: loss_final: 1.825060, loss_mean: 0.956386, proj_loss: -0.298525, loss_mean_cls: 1.167199, grad_norm: 3.831409 +Steps: 0%| | 1116/1000000 [04:36<67:47:51, 4.09it/s, grad_norm=3.83, loss_final=1.83, loss_mean=0.956, loss_mean_cls=1.17, proj_loss=-0.299][2026-03-22 14:23:13] Step: 1116, Training Logs: loss_final: 1.991725, loss_mean: 0.920156, proj_loss: -0.286696, loss_mean_cls: 1.358265, grad_norm: 3.893149 +Steps: 0%| | 1117/1000000 [04:36<67:47:08, 4.09it/s, grad_norm=3.89, loss_final=1.99, loss_mean=0.92, loss_mean_cls=1.36, proj_loss=-0.287][2026-03-22 14:23:13] Step: 1117, Training Logs: loss_final: 1.784711, loss_mean: 0.959043, proj_loss: -0.298947, loss_mean_cls: 1.124614, grad_norm: 3.503432 +Steps: 0%| | 1118/1000000 [04:36<67:45:49, 4.09it/s, grad_norm=3.5, loss_final=1.78, loss_mean=0.959, loss_mean_cls=1.12, proj_loss=-0.299][2026-03-22 14:23:13] Step: 1118, Training Logs: loss_final: 1.803153, loss_mean: 0.948442, proj_loss: -0.299788, loss_mean_cls: 1.154500, grad_norm: 3.018397 +Steps: 0%| | 1119/1000000 [04:36<67:45:57, 4.09it/s, grad_norm=3.02, loss_final=1.8, loss_mean=0.948, loss_mean_cls=1.15, proj_loss=-0.3][2026-03-22 14:23:13] Step: 1119, Training Logs: loss_final: 1.900707, loss_mean: 0.951208, proj_loss: -0.292922, loss_mean_cls: 1.242421, grad_norm: 5.110574 +Steps: 0%| | 1120/1000000 [04:37<67:47:42, 4.09it/s, grad_norm=5.11, loss_final=1.9, loss_mean=0.951, loss_mean_cls=1.24, proj_loss=-0.293][2026-03-22 14:23:14] Step: 1120, Training Logs: loss_final: 1.936113, loss_mean: 0.939654, proj_loss: -0.285172, loss_mean_cls: 1.281631, grad_norm: 3.633196 +Steps: 0%| | 1121/1000000 [04:37<67:47:17, 4.09it/s, grad_norm=3.63, loss_final=1.94, loss_mean=0.94, loss_mean_cls=1.28, 
proj_loss=-0.285][2026-03-22 14:23:14] Step: 1121, Training Logs: loss_final: 1.839893, loss_mean: 0.956886, proj_loss: -0.299127, loss_mean_cls: 1.182133, grad_norm: 4.622831 +Steps: 0%| | 1122/1000000 [04:37<67:47:32, 4.09it/s, grad_norm=4.62, loss_final=1.84, loss_mean=0.957, loss_mean_cls=1.18, proj_loss=-0.299][2026-03-22 14:23:14] Step: 1122, Training Logs: loss_final: 1.764102, loss_mean: 0.992452, proj_loss: -0.299491, loss_mean_cls: 1.071140, grad_norm: 4.827521 +Steps: 0%| | 1123/1000000 [04:37<67:47:00, 4.09it/s, grad_norm=4.83, loss_final=1.76, loss_mean=0.992, loss_mean_cls=1.07, proj_loss=-0.299][2026-03-22 14:23:14] Step: 1123, Training Logs: loss_final: 1.883460, loss_mean: 0.942739, proj_loss: -0.298548, loss_mean_cls: 1.239270, grad_norm: 4.488664 +Steps: 0%| | 1124/1000000 [04:38<67:48:09, 4.09it/s, grad_norm=4.49, loss_final=1.88, loss_mean=0.943, loss_mean_cls=1.24, proj_loss=-0.299][2026-03-22 14:23:15] Step: 1124, Training Logs: loss_final: 1.877362, loss_mean: 0.948316, proj_loss: -0.294423, loss_mean_cls: 1.223469, grad_norm: 5.424586 +Steps: 0%| | 1125/1000000 [04:38<67:48:20, 4.09it/s, grad_norm=5.42, loss_final=1.88, loss_mean=0.948, loss_mean_cls=1.22, proj_loss=-0.294][2026-03-22 14:23:15] Step: 1125, Training Logs: loss_final: 1.781129, loss_mean: 0.971762, proj_loss: -0.299347, loss_mean_cls: 1.108714, grad_norm: 4.043241 +Steps: 0%| | 1126/1000000 [04:38<67:46:25, 4.09it/s, grad_norm=4.04, loss_final=1.78, loss_mean=0.972, loss_mean_cls=1.11, proj_loss=-0.299][2026-03-22 14:23:15] Step: 1126, Training Logs: loss_final: 1.916839, loss_mean: 0.930260, proj_loss: -0.286485, loss_mean_cls: 1.273064, grad_norm: 4.264523 +Steps: 0%| | 1127/1000000 [04:38<67:45:12, 4.10it/s, grad_norm=4.26, loss_final=1.92, loss_mean=0.93, loss_mean_cls=1.27, proj_loss=-0.286][2026-03-22 14:23:15] Step: 1127, Training Logs: loss_final: 1.891945, loss_mean: 0.922219, proj_loss: -0.288743, loss_mean_cls: 1.258469, grad_norm: 3.286744 +Steps: 0%| | 1128/1000000 [04:39<67:45:48, 4.09it/s, grad_norm=3.29, loss_final=1.89, loss_mean=0.922, loss_mean_cls=1.26, proj_loss=-0.289][2026-03-22 14:23:16] Step: 1128, Training Logs: loss_final: 1.660298, loss_mean: 0.959692, proj_loss: -0.305775, loss_mean_cls: 1.006380, grad_norm: 4.072810 +Steps: 0%| | 1129/1000000 [04:39<67:44:24, 4.10it/s, grad_norm=4.07, loss_final=1.66, loss_mean=0.96, loss_mean_cls=1.01, proj_loss=-0.306][2026-03-22 14:23:16] Step: 1129, Training Logs: loss_final: 1.885886, loss_mean: 0.950908, proj_loss: -0.296581, loss_mean_cls: 1.231559, grad_norm: 4.112264 +Steps: 0%| | 1130/1000000 [04:39<67:46:22, 4.09it/s, grad_norm=4.11, loss_final=1.89, loss_mean=0.951, loss_mean_cls=1.23, proj_loss=-0.297][2026-03-22 14:23:16] Step: 1130, Training Logs: loss_final: 1.729251, loss_mean: 0.946225, proj_loss: -0.305597, loss_mean_cls: 1.088624, grad_norm: 2.053043 +Steps: 0%| | 1131/1000000 [04:39<67:45:19, 4.10it/s, grad_norm=2.05, loss_final=1.73, loss_mean=0.946, loss_mean_cls=1.09, proj_loss=-0.306][2026-03-22 14:23:16] Step: 1131, Training Logs: loss_final: 1.762199, loss_mean: 0.957724, proj_loss: -0.305177, loss_mean_cls: 1.109653, grad_norm: 2.528394 +Steps: 0%| | 1132/1000000 [04:40<67:47:10, 4.09it/s, grad_norm=2.53, loss_final=1.76, loss_mean=0.958, loss_mean_cls=1.11, proj_loss=-0.305][2026-03-22 14:23:17] Step: 1132, Training Logs: loss_final: 1.865804, loss_mean: 0.904517, proj_loss: -0.295888, loss_mean_cls: 1.257175, grad_norm: 1.413386 +Steps: 0%| | 1133/1000000 [04:40<67:46:16, 4.09it/s, grad_norm=1.41, 
loss_final=1.87, loss_mean=0.905, loss_mean_cls=1.26, proj_loss=-0.296][2026-03-22 14:23:17] Step: 1133, Training Logs: loss_final: 1.682604, loss_mean: 0.961043, proj_loss: -0.309454, loss_mean_cls: 1.031015, grad_norm: 2.626368 +Steps: 0%| | 1134/1000000 [04:40<67:49:18, 4.09it/s, grad_norm=2.63, loss_final=1.68, loss_mean=0.961, loss_mean_cls=1.03, proj_loss=-0.309][2026-03-22 14:23:17] Step: 1134, Training Logs: loss_final: 1.707529, loss_mean: 0.968168, proj_loss: -0.306943, loss_mean_cls: 1.046304, grad_norm: 3.459657 +Steps: 0%| | 1135/1000000 [04:40<67:47:43, 4.09it/s, grad_norm=3.46, loss_final=1.71, loss_mean=0.968, loss_mean_cls=1.05, proj_loss=-0.307][2026-03-22 14:23:17] Step: 1135, Training Logs: loss_final: 1.835307, loss_mean: 0.937956, proj_loss: -0.304571, loss_mean_cls: 1.201922, grad_norm: 2.933634 +Steps: 0%| | 1136/1000000 [04:41<67:50:27, 4.09it/s, grad_norm=2.93, loss_final=1.84, loss_mean=0.938, loss_mean_cls=1.2, proj_loss=-0.305][2026-03-22 14:23:18] Step: 1136, Training Logs: loss_final: 1.888677, loss_mean: 0.931886, proj_loss: -0.300304, loss_mean_cls: 1.257095, grad_norm: 2.181157 +Steps: 0%| | 1137/1000000 [04:41<67:49:10, 4.09it/s, grad_norm=2.18, loss_final=1.89, loss_mean=0.932, loss_mean_cls=1.26, proj_loss=-0.3][2026-03-22 14:23:18] Step: 1137, Training Logs: loss_final: 1.706767, loss_mean: 0.950022, proj_loss: -0.307878, loss_mean_cls: 1.064622, grad_norm: 2.033181 +Steps: 0%| | 1138/1000000 [04:41<67:47:40, 4.09it/s, grad_norm=2.03, loss_final=1.71, loss_mean=0.95, loss_mean_cls=1.06, proj_loss=-0.308][2026-03-22 14:23:18] Step: 1138, Training Logs: loss_final: 1.719054, loss_mean: 0.963196, proj_loss: -0.307614, loss_mean_cls: 1.063472, grad_norm: 2.832459 +Steps: 0%| | 1139/1000000 [04:41<67:45:01, 4.10it/s, grad_norm=2.83, loss_final=1.72, loss_mean=0.963, loss_mean_cls=1.06, proj_loss=-0.308][2026-03-22 14:23:18] Step: 1139, Training Logs: loss_final: 2.060929, loss_mean: 0.905150, proj_loss: -0.288037, loss_mean_cls: 1.443816, grad_norm: 2.420004 +Steps: 0%| | 1140/1000000 [04:42<67:47:13, 4.09it/s, grad_norm=2.42, loss_final=2.06, loss_mean=0.905, loss_mean_cls=1.44, proj_loss=-0.288][2026-03-22 14:23:19] Step: 1140, Training Logs: loss_final: 1.857386, loss_mean: 0.942110, proj_loss: -0.303625, loss_mean_cls: 1.218901, grad_norm: 3.090404 +Steps: 0%| | 1141/1000000 [04:42<67:48:20, 4.09it/s, grad_norm=3.09, loss_final=1.86, loss_mean=0.942, loss_mean_cls=1.22, proj_loss=-0.304][2026-03-22 14:23:19] Step: 1141, Training Logs: loss_final: 1.771210, loss_mean: 0.932573, proj_loss: -0.301822, loss_mean_cls: 1.140459, grad_norm: 2.654076 +Steps: 0%| | 1142/1000000 [04:42<67:49:04, 4.09it/s, grad_norm=2.65, loss_final=1.77, loss_mean=0.933, loss_mean_cls=1.14, proj_loss=-0.302][2026-03-22 14:23:19] Step: 1142, Training Logs: loss_final: 1.881113, loss_mean: 0.947924, proj_loss: -0.298272, loss_mean_cls: 1.231461, grad_norm: 4.777999 +Steps: 0%| | 1143/1000000 [04:42<67:49:46, 4.09it/s, grad_norm=4.78, loss_final=1.88, loss_mean=0.948, loss_mean_cls=1.23, proj_loss=-0.298][2026-03-22 14:23:19] Step: 1143, Training Logs: loss_final: 1.749643, loss_mean: 0.948676, proj_loss: -0.305619, loss_mean_cls: 1.106586, grad_norm: 3.612749 +Steps: 0%| | 1144/1000000 [04:43<67:50:30, 4.09it/s, grad_norm=3.61, loss_final=1.75, loss_mean=0.949, loss_mean_cls=1.11, proj_loss=-0.306][2026-03-22 14:23:20] Step: 1144, Training Logs: loss_final: 1.887999, loss_mean: 0.948054, proj_loss: -0.296923, loss_mean_cls: 1.236869, grad_norm: 3.601933 +Steps: 0%| | 1145/1000000 
[04:43<67:48:36, 4.09it/s, grad_norm=3.6, loss_final=1.89, loss_mean=0.948, loss_mean_cls=1.24, proj_loss=-0.297][2026-03-22 14:23:20] Step: 1145, Training Logs: loss_final: 1.776381, loss_mean: 0.937311, proj_loss: -0.300041, loss_mean_cls: 1.139111, grad_norm: 3.189273 +Steps: 0%| | 1146/1000000 [04:43<67:48:28, 4.09it/s, grad_norm=3.19, loss_final=1.78, loss_mean=0.937, loss_mean_cls=1.14, proj_loss=-0.3][2026-03-22 14:23:20] Step: 1146, Training Logs: loss_final: 1.796358, loss_mean: 0.926210, proj_loss: -0.304552, loss_mean_cls: 1.174700, grad_norm: 2.920329 +Steps: 0%| | 1147/1000000 [04:43<67:51:03, 4.09it/s, grad_norm=2.92, loss_final=1.8, loss_mean=0.926, loss_mean_cls=1.17, proj_loss=-0.305][2026-03-22 14:23:20] Step: 1147, Training Logs: loss_final: 1.905934, loss_mean: 0.937742, proj_loss: -0.299232, loss_mean_cls: 1.267424, grad_norm: 3.893075 +Steps: 0%| | 1148/1000000 [04:44<67:51:14, 4.09it/s, grad_norm=3.89, loss_final=1.91, loss_mean=0.938, loss_mean_cls=1.27, proj_loss=-0.299][2026-03-22 14:23:21] Step: 1148, Training Logs: loss_final: 1.797339, loss_mean: 0.944709, proj_loss: -0.305734, loss_mean_cls: 1.158364, grad_norm: 3.104636 +Steps: 0%| | 1149/1000000 [04:44<67:49:21, 4.09it/s, grad_norm=3.1, loss_final=1.8, loss_mean=0.945, loss_mean_cls=1.16, proj_loss=-0.306][2026-03-22 14:23:21] Step: 1149, Training Logs: loss_final: 1.807014, loss_mean: 0.946126, proj_loss: -0.302691, loss_mean_cls: 1.163580, grad_norm: 2.812555 +Steps: 0%| | 1150/1000000 [04:44<67:47:29, 4.09it/s, grad_norm=2.81, loss_final=1.81, loss_mean=0.946, loss_mean_cls=1.16, proj_loss=-0.303][2026-03-22 14:23:21] Step: 1150, Training Logs: loss_final: 1.712162, loss_mean: 0.976752, proj_loss: -0.316176, loss_mean_cls: 1.051586, grad_norm: 3.081000 +Steps: 0%| | 1151/1000000 [04:44<67:47:06, 4.09it/s, grad_norm=3.08, loss_final=1.71, loss_mean=0.977, loss_mean_cls=1.05, proj_loss=-0.316][2026-03-22 14:23:21] Step: 1151, Training Logs: loss_final: 1.785365, loss_mean: 0.945056, proj_loss: -0.305074, loss_mean_cls: 1.145382, grad_norm: 2.636243 +Steps: 0%| | 1152/1000000 [04:45<67:48:26, 4.09it/s, grad_norm=2.64, loss_final=1.79, loss_mean=0.945, loss_mean_cls=1.15, proj_loss=-0.305][2026-03-22 14:23:21] Step: 1152, Training Logs: loss_final: 1.794377, loss_mean: 0.947439, proj_loss: -0.305959, loss_mean_cls: 1.152897, grad_norm: 3.903947 +Steps: 0%| | 1153/1000000 [04:45<67:49:16, 4.09it/s, grad_norm=3.9, loss_final=1.79, loss_mean=0.947, loss_mean_cls=1.15, proj_loss=-0.306][2026-03-22 14:23:22] Step: 1153, Training Logs: loss_final: 1.936981, loss_mean: 0.924758, proj_loss: -0.298337, loss_mean_cls: 1.310560, grad_norm: 2.391515 +Steps: 0%| | 1154/1000000 [04:45<67:48:07, 4.09it/s, grad_norm=2.39, loss_final=1.94, loss_mean=0.925, loss_mean_cls=1.31, proj_loss=-0.298][2026-03-22 14:23:22] Step: 1154, Training Logs: loss_final: 1.680719, loss_mean: 0.962244, proj_loss: -0.313390, loss_mean_cls: 1.031865, grad_norm: 4.551323 +Steps: 0%| | 1155/1000000 [04:45<67:47:17, 4.09it/s, grad_norm=4.55, loss_final=1.68, loss_mean=0.962, loss_mean_cls=1.03, proj_loss=-0.313][2026-03-22 14:23:22] Step: 1155, Training Logs: loss_final: 1.901754, loss_mean: 0.933731, proj_loss: -0.302995, loss_mean_cls: 1.271018, grad_norm: 4.123012 +Steps: 0%| | 1156/1000000 [04:45<67:48:49, 4.09it/s, grad_norm=4.12, loss_final=1.9, loss_mean=0.934, loss_mean_cls=1.27, proj_loss=-0.303][2026-03-22 14:23:22] Step: 1156, Training Logs: loss_final: 1.829203, loss_mean: 0.935554, proj_loss: -0.302251, loss_mean_cls: 1.195900, 
grad_norm: 3.525926 +Steps: 0%| | 1157/1000000 [04:46<67:46:32, 4.09it/s, grad_norm=3.53, loss_final=1.83, loss_mean=0.936, loss_mean_cls=1.2, proj_loss=-0.302][2026-03-22 14:23:23] Step: 1157, Training Logs: loss_final: 1.889118, loss_mean: 0.955024, proj_loss: -0.302609, loss_mean_cls: 1.236703, grad_norm: 5.148920 +Steps: 0%| | 1158/1000000 [04:46<67:45:56, 4.09it/s, grad_norm=5.15, loss_final=1.89, loss_mean=0.955, loss_mean_cls=1.24, proj_loss=-0.303][2026-03-22 14:23:23] Step: 1158, Training Logs: loss_final: 1.765697, loss_mean: 0.940713, proj_loss: -0.313325, loss_mean_cls: 1.138309, grad_norm: 2.820875 +Steps: 0%| | 1159/1000000 [04:46<67:47:11, 4.09it/s, grad_norm=2.82, loss_final=1.77, loss_mean=0.941, loss_mean_cls=1.14, proj_loss=-0.313][2026-03-22 14:23:23] Step: 1159, Training Logs: loss_final: 1.742835, loss_mean: 0.953678, proj_loss: -0.308381, loss_mean_cls: 1.097537, grad_norm: 4.227281 +Steps: 0%| | 1160/1000000 [04:46<67:47:54, 4.09it/s, grad_norm=4.23, loss_final=1.74, loss_mean=0.954, loss_mean_cls=1.1, proj_loss=-0.308][2026-03-22 14:23:23] Step: 1160, Training Logs: loss_final: 1.844609, loss_mean: 0.940556, proj_loss: -0.302315, loss_mean_cls: 1.206369, grad_norm: 4.561705 +Steps: 0%| | 1161/1000000 [04:47<67:46:27, 4.09it/s, grad_norm=4.56, loss_final=1.84, loss_mean=0.941, loss_mean_cls=1.21, proj_loss=-0.302][2026-03-22 14:23:24] Step: 1161, Training Logs: loss_final: 1.740768, loss_mean: 0.933798, proj_loss: -0.320493, loss_mean_cls: 1.127463, grad_norm: 2.958712 +Steps: 0%| | 1162/1000000 [04:47<67:48:43, 4.09it/s, grad_norm=2.96, loss_final=1.74, loss_mean=0.934, loss_mean_cls=1.13, proj_loss=-0.32][2026-03-22 14:23:24] Step: 1162, Training Logs: loss_final: 1.794486, loss_mean: 0.955688, proj_loss: -0.302474, loss_mean_cls: 1.141271, grad_norm: 3.292614 +Steps: 0%| | 1163/1000000 [04:47<70:25:06, 3.94it/s, grad_norm=3.29, loss_final=1.79, loss_mean=0.956, loss_mean_cls=1.14, proj_loss=-0.302][2026-03-22 14:23:24] Step: 1163, Training Logs: loss_final: 1.768836, loss_mean: 0.949752, proj_loss: -0.302888, loss_mean_cls: 1.121973, grad_norm: 2.922616 +Steps: 0%| | 1164/1000000 [04:47<69:51:04, 3.97it/s, grad_norm=2.92, loss_final=1.77, loss_mean=0.95, loss_mean_cls=1.12, proj_loss=-0.303][2026-03-22 14:23:24] Step: 1164, Training Logs: loss_final: 1.902510, loss_mean: 0.933055, proj_loss: -0.297065, loss_mean_cls: 1.266521, grad_norm: 2.652225 +Steps: 0%| | 1165/1000000 [04:48<69:13:46, 4.01it/s, grad_norm=2.65, loss_final=1.9, loss_mean=0.933, loss_mean_cls=1.27, proj_loss=-0.297][2026-03-22 14:23:25] Step: 1165, Training Logs: loss_final: 1.720651, loss_mean: 0.953584, proj_loss: -0.308400, loss_mean_cls: 1.075467, grad_norm: 3.864665 +Steps: 0%| | 1166/1000000 [04:48<68:46:18, 4.03it/s, grad_norm=3.86, loss_final=1.72, loss_mean=0.954, loss_mean_cls=1.08, proj_loss=-0.308][2026-03-22 14:23:25] Step: 1166, Training Logs: loss_final: 1.856918, loss_mean: 0.949273, proj_loss: -0.307123, loss_mean_cls: 1.214768, grad_norm: 4.137567 +Steps: 0%| | 1167/1000000 [04:48<68:26:00, 4.05it/s, grad_norm=4.14, loss_final=1.86, loss_mean=0.949, loss_mean_cls=1.21, proj_loss=-0.307][2026-03-22 14:23:25] Step: 1167, Training Logs: loss_final: 1.694178, loss_mean: 0.953372, proj_loss: -0.314497, loss_mean_cls: 1.055302, grad_norm: 3.212618 +Steps: 0%| | 1168/1000000 [04:48<68:14:45, 4.07it/s, grad_norm=3.21, loss_final=1.69, loss_mean=0.953, loss_mean_cls=1.06, proj_loss=-0.314][2026-03-22 14:23:25] Step: 1168, Training Logs: loss_final: 1.835830, loss_mean: 0.944207, 
proj_loss: -0.303592, loss_mean_cls: 1.195215, grad_norm: 2.791528 +Steps: 0%| | 1169/1000000 [04:49<68:06:19, 4.07it/s, grad_norm=2.79, loss_final=1.84, loss_mean=0.944, loss_mean_cls=1.2, proj_loss=-0.304][2026-03-22 14:23:26] Step: 1169, Training Logs: loss_final: 1.738982, loss_mean: 0.944780, proj_loss: -0.311881, loss_mean_cls: 1.106083, grad_norm: 2.427335 +Steps: 0%| | 1170/1000000 [04:49<67:58:44, 4.08it/s, grad_norm=2.43, loss_final=1.74, loss_mean=0.945, loss_mean_cls=1.11, proj_loss=-0.312][2026-03-22 14:23:26] Step: 1170, Training Logs: loss_final: 1.900557, loss_mean: 0.934847, proj_loss: -0.302870, loss_mean_cls: 1.268579, grad_norm: 3.886341 +Steps: 0%| | 1171/1000000 [04:49<67:54:04, 4.09it/s, grad_norm=3.89, loss_final=1.9, loss_mean=0.935, loss_mean_cls=1.27, proj_loss=-0.303][2026-03-22 14:23:26] Step: 1171, Training Logs: loss_final: 1.825037, loss_mean: 0.939314, proj_loss: -0.304671, loss_mean_cls: 1.190393, grad_norm: 3.003208 +Steps: 0%| | 1172/1000000 [04:49<67:51:54, 4.09it/s, grad_norm=3, loss_final=1.83, loss_mean=0.939, loss_mean_cls=1.19, proj_loss=-0.305][2026-03-22 14:23:26] Step: 1172, Training Logs: loss_final: 1.822953, loss_mean: 0.940733, proj_loss: -0.298438, loss_mean_cls: 1.180658, grad_norm: 3.572948 +Steps: 0%| | 1173/1000000 [04:50<67:49:46, 4.09it/s, grad_norm=3.57, loss_final=1.82, loss_mean=0.941, loss_mean_cls=1.18, proj_loss=-0.298][2026-03-22 14:23:27] Step: 1173, Training Logs: loss_final: 1.755520, loss_mean: 0.936454, proj_loss: -0.313103, loss_mean_cls: 1.132169, grad_norm: 2.900073 +Steps: 0%| | 1174/1000000 [04:50<67:49:39, 4.09it/s, grad_norm=2.9, loss_final=1.76, loss_mean=0.936, loss_mean_cls=1.13, proj_loss=-0.313][2026-03-22 14:23:27] Step: 1174, Training Logs: loss_final: 1.743037, loss_mean: 0.937433, proj_loss: -0.313983, loss_mean_cls: 1.119588, grad_norm: 2.815353 +Steps: 0%| | 1175/1000000 [04:50<67:49:02, 4.09it/s, grad_norm=2.82, loss_final=1.74, loss_mean=0.937, loss_mean_cls=1.12, proj_loss=-0.314][2026-03-22 14:23:27] Step: 1175, Training Logs: loss_final: 1.937747, loss_mean: 0.921969, proj_loss: -0.302093, loss_mean_cls: 1.317871, grad_norm: 3.722590 +Steps: 0%| | 1176/1000000 [04:50<67:49:03, 4.09it/s, grad_norm=3.72, loss_final=1.94, loss_mean=0.922, loss_mean_cls=1.32, proj_loss=-0.302][2026-03-22 14:23:27] Step: 1176, Training Logs: loss_final: 1.851542, loss_mean: 0.929344, proj_loss: -0.304240, loss_mean_cls: 1.226437, grad_norm: 3.951726 +Steps: 0%| | 1177/1000000 [04:51<67:47:08, 4.09it/s, grad_norm=3.95, loss_final=1.85, loss_mean=0.929, loss_mean_cls=1.23, proj_loss=-0.304][2026-03-22 14:23:28] Step: 1177, Training Logs: loss_final: 1.683969, loss_mean: 0.941325, proj_loss: -0.312597, loss_mean_cls: 1.055241, grad_norm: 3.180809 +Steps: 0%| | 1178/1000000 [04:51<67:46:58, 4.09it/s, grad_norm=3.18, loss_final=1.68, loss_mean=0.941, loss_mean_cls=1.06, proj_loss=-0.313][2026-03-22 14:23:28] Step: 1178, Training Logs: loss_final: 1.817972, loss_mean: 0.954695, proj_loss: -0.308342, loss_mean_cls: 1.171619, grad_norm: 3.971528 +Steps: 0%| | 1179/1000000 [04:51<67:45:03, 4.10it/s, grad_norm=3.97, loss_final=1.82, loss_mean=0.955, loss_mean_cls=1.17, proj_loss=-0.308][2026-03-22 14:23:28] Step: 1179, Training Logs: loss_final: 1.749583, loss_mean: 0.952524, proj_loss: -0.314818, loss_mean_cls: 1.111877, grad_norm: 1.958656 +Steps: 0%| | 1180/1000000 [04:51<67:45:33, 4.09it/s, grad_norm=1.96, loss_final=1.75, loss_mean=0.953, loss_mean_cls=1.11, proj_loss=-0.315][2026-03-22 14:23:28] Step: 1180, Training Logs: 
loss_final: 1.781913, loss_mean: 0.929293, proj_loss: -0.312012, loss_mean_cls: 1.164632, grad_norm: 2.735970 +Steps: 0%| | 1181/1000000 [04:52<67:44:17, 4.10it/s, grad_norm=2.74, loss_final=1.78, loss_mean=0.929, loss_mean_cls=1.16, proj_loss=-0.312][2026-03-22 14:23:29] Step: 1181, Training Logs: loss_final: 1.901230, loss_mean: 0.940791, proj_loss: -0.305764, loss_mean_cls: 1.266203, grad_norm: 2.957419 +Steps: 0%| | 1182/1000000 [04:52<67:46:48, 4.09it/s, grad_norm=2.96, loss_final=1.9, loss_mean=0.941, loss_mean_cls=1.27, proj_loss=-0.306][2026-03-22 14:23:29] Step: 1182, Training Logs: loss_final: 1.777893, loss_mean: 0.946531, proj_loss: -0.313682, loss_mean_cls: 1.145045, grad_norm: 2.542526 +Steps: 0%| | 1183/1000000 [04:52<67:46:40, 4.09it/s, grad_norm=2.54, loss_final=1.78, loss_mean=0.947, loss_mean_cls=1.15, proj_loss=-0.314][2026-03-22 14:23:29] Step: 1183, Training Logs: loss_final: 1.770659, loss_mean: 0.966997, proj_loss: -0.313648, loss_mean_cls: 1.117310, grad_norm: 4.511147 +Steps: 0%| | 1184/1000000 [04:52<67:46:46, 4.09it/s, grad_norm=4.51, loss_final=1.77, loss_mean=0.967, loss_mean_cls=1.12, proj_loss=-0.314][2026-03-22 14:23:29] Step: 1184, Training Logs: loss_final: 1.763188, loss_mean: 0.958786, proj_loss: -0.313832, loss_mean_cls: 1.118234, grad_norm: 4.193927 +Steps: 0%| | 1185/1000000 [04:53<67:45:11, 4.09it/s, grad_norm=4.19, loss_final=1.76, loss_mean=0.959, loss_mean_cls=1.12, proj_loss=-0.314][2026-03-22 14:23:30] Step: 1185, Training Logs: loss_final: 1.723783, loss_mean: 0.943024, proj_loss: -0.317436, loss_mean_cls: 1.098195, grad_norm: 3.512090 +Steps: 0%| | 1186/1000000 [04:53<67:45:59, 4.09it/s, grad_norm=3.51, loss_final=1.72, loss_mean=0.943, loss_mean_cls=1.1, proj_loss=-0.317][2026-03-22 14:23:30] Step: 1186, Training Logs: loss_final: 1.864099, loss_mean: 0.949262, proj_loss: -0.301527, loss_mean_cls: 1.216363, grad_norm: 2.236271 +Steps: 0%| | 1187/1000000 [04:53<67:45:39, 4.09it/s, grad_norm=2.24, loss_final=1.86, loss_mean=0.949, loss_mean_cls=1.22, proj_loss=-0.302][2026-03-22 14:23:30] Step: 1187, Training Logs: loss_final: 1.720021, loss_mean: 0.957550, proj_loss: -0.315104, loss_mean_cls: 1.077575, grad_norm: 3.440147 +Steps: 0%| | 1188/1000000 [04:53<67:47:23, 4.09it/s, grad_norm=3.44, loss_final=1.72, loss_mean=0.958, loss_mean_cls=1.08, proj_loss=-0.315][2026-03-22 14:23:30] Step: 1188, Training Logs: loss_final: 1.757398, loss_mean: 0.960555, proj_loss: -0.314421, loss_mean_cls: 1.111265, grad_norm: 3.152963 +Steps: 0%| | 1189/1000000 [04:54<67:46:02, 4.09it/s, grad_norm=3.15, loss_final=1.76, loss_mean=0.961, loss_mean_cls=1.11, proj_loss=-0.314][2026-03-22 14:23:31] Step: 1189, Training Logs: loss_final: 1.920227, loss_mean: 0.920655, proj_loss: -0.305722, loss_mean_cls: 1.305294, grad_norm: 2.290668 +Steps: 0%| | 1190/1000000 [04:54<67:45:11, 4.09it/s, grad_norm=2.29, loss_final=1.92, loss_mean=0.921, loss_mean_cls=1.31, proj_loss=-0.306][2026-03-22 14:23:31] Step: 1190, Training Logs: loss_final: 1.963314, loss_mean: 0.929893, proj_loss: -0.301418, loss_mean_cls: 1.334838, grad_norm: 2.574378 +Steps: 0%| | 1191/1000000 [04:54<67:46:47, 4.09it/s, grad_norm=2.57, loss_final=1.96, loss_mean=0.93, loss_mean_cls=1.33, proj_loss=-0.301][2026-03-22 14:23:31] Step: 1191, Training Logs: loss_final: 1.762842, loss_mean: 0.950277, proj_loss: -0.315570, loss_mean_cls: 1.128135, grad_norm: 2.725083 +Steps: 0%| | 1192/1000000 [04:54<67:46:59, 4.09it/s, grad_norm=2.73, loss_final=1.76, loss_mean=0.95, loss_mean_cls=1.13, 
proj_loss=-0.316][2026-03-22 14:23:31] Step: 1192, Training Logs: loss_final: 1.864051, loss_mean: 0.958250, proj_loss: -0.302873, loss_mean_cls: 1.208674, grad_norm: 6.114379 +Steps: 0%| | 1193/1000000 [04:55<67:46:36, 4.09it/s, grad_norm=6.11, loss_final=1.86, loss_mean=0.958, loss_mean_cls=1.21, proj_loss=-0.303][2026-03-22 14:23:32] Step: 1193, Training Logs: loss_final: 2.096668, loss_mean: 0.907579, proj_loss: -0.304942, loss_mean_cls: 1.494031, grad_norm: 3.076366 +Steps: 0%| | 1194/1000000 [04:55<67:58:19, 4.08it/s, grad_norm=3.08, loss_final=2.1, loss_mean=0.908, loss_mean_cls=1.49, proj_loss=-0.305][2026-03-22 14:23:32] Step: 1194, Training Logs: loss_final: 1.745011, loss_mean: 0.953326, proj_loss: -0.316772, loss_mean_cls: 1.108457, grad_norm: 2.675573 +Steps: 0%| | 1195/1000000 [04:55<67:54:36, 4.09it/s, grad_norm=2.68, loss_final=1.75, loss_mean=0.953, loss_mean_cls=1.11, proj_loss=-0.317][2026-03-22 14:23:32] Step: 1195, Training Logs: loss_final: 1.817006, loss_mean: 0.956780, proj_loss: -0.309301, loss_mean_cls: 1.169526, grad_norm: 3.432609 +Steps: 0%| | 1196/1000000 [04:55<67:53:50, 4.09it/s, grad_norm=3.43, loss_final=1.82, loss_mean=0.957, loss_mean_cls=1.17, proj_loss=-0.309][2026-03-22 14:23:32] Step: 1196, Training Logs: loss_final: 1.949736, loss_mean: 0.950628, proj_loss: -0.311119, loss_mean_cls: 1.310226, grad_norm: 7.431603 +Steps: 0%| | 1197/1000000 [04:56<67:51:56, 4.09it/s, grad_norm=7.43, loss_final=1.95, loss_mean=0.951, loss_mean_cls=1.31, proj_loss=-0.311][2026-03-22 14:23:33] Step: 1197, Training Logs: loss_final: 1.882651, loss_mean: 0.942854, proj_loss: -0.312206, loss_mean_cls: 1.252003, grad_norm: 4.389934 +Steps: 0%| | 1198/1000000 [04:56<67:50:04, 4.09it/s, grad_norm=4.39, loss_final=1.88, loss_mean=0.943, loss_mean_cls=1.25, proj_loss=-0.312][2026-03-22 14:23:33] Step: 1198, Training Logs: loss_final: 1.920572, loss_mean: 0.926909, proj_loss: -0.310267, loss_mean_cls: 1.303930, grad_norm: 3.389154 +Steps: 0%| | 1199/1000000 [04:56<67:52:32, 4.09it/s, grad_norm=3.39, loss_final=1.92, loss_mean=0.927, loss_mean_cls=1.3, proj_loss=-0.31][2026-03-22 14:23:33] Step: 1199, Training Logs: loss_final: 1.796322, loss_mean: 0.938496, proj_loss: -0.316833, loss_mean_cls: 1.174658, grad_norm: 4.175150 +Steps: 0%| | 1200/1000000 [04:56<67:53:21, 4.09it/s, grad_norm=4.18, loss_final=1.8, loss_mean=0.938, loss_mean_cls=1.17, proj_loss=-0.317][2026-03-22 14:23:33] Step: 1200, Training Logs: loss_final: 1.798055, loss_mean: 0.939327, proj_loss: -0.317771, loss_mean_cls: 1.176498, grad_norm: 3.371943 +Steps: 0%| | 1201/1000000 [04:57<67:50:32, 4.09it/s, grad_norm=3.37, loss_final=1.8, loss_mean=0.939, loss_mean_cls=1.18, proj_loss=-0.318][2026-03-22 14:23:33] Step: 1201, Training Logs: loss_final: 1.942126, loss_mean: 0.933712, proj_loss: -0.307002, loss_mean_cls: 1.315416, grad_norm: 3.740316 +Steps: 0%| | 1202/1000000 [04:57<67:49:33, 4.09it/s, grad_norm=3.74, loss_final=1.94, loss_mean=0.934, loss_mean_cls=1.32, proj_loss=-0.307][2026-03-22 14:23:34] Step: 1202, Training Logs: loss_final: 1.866170, loss_mean: 0.928643, proj_loss: -0.313712, loss_mean_cls: 1.251239, grad_norm: 4.575485 +Steps: 0%| | 1203/1000000 [04:57<67:50:43, 4.09it/s, grad_norm=4.58, loss_final=1.87, loss_mean=0.929, loss_mean_cls=1.25, proj_loss=-0.314][2026-03-22 14:23:34] Step: 1203, Training Logs: loss_final: 1.848716, loss_mean: 0.945819, proj_loss: -0.304190, loss_mean_cls: 1.207087, grad_norm: 2.320541 +Steps: 0%| | 1204/1000000 [04:57<67:49:40, 4.09it/s, grad_norm=2.32, 
loss_final=1.85, loss_mean=0.946, loss_mean_cls=1.21, proj_loss=-0.304][2026-03-22 14:23:34] Step: 1204, Training Logs: loss_final: 1.788750, loss_mean: 0.957542, proj_loss: -0.317995, loss_mean_cls: 1.149203, grad_norm: 3.642763 +Steps: 0%| | 1205/1000000 [04:58<67:46:50, 4.09it/s, grad_norm=3.64, loss_final=1.79, loss_mean=0.958, loss_mean_cls=1.15, proj_loss=-0.318][2026-03-22 14:23:34] Step: 1205, Training Logs: loss_final: 1.782948, loss_mean: 0.962218, proj_loss: -0.325563, loss_mean_cls: 1.146293, grad_norm: 4.472889 +Steps: 0%| | 1206/1000000 [04:58<67:47:37, 4.09it/s, grad_norm=4.47, loss_final=1.78, loss_mean=0.962, loss_mean_cls=1.15, proj_loss=-0.326][2026-03-22 14:23:35] Step: 1206, Training Logs: loss_final: 1.853024, loss_mean: 0.953157, proj_loss: -0.314536, loss_mean_cls: 1.214403, grad_norm: 2.643104 +Steps: 0%| | 1207/1000000 [04:58<67:49:27, 4.09it/s, grad_norm=2.64, loss_final=1.85, loss_mean=0.953, loss_mean_cls=1.21, proj_loss=-0.315][2026-03-22 14:23:35] Step: 1207, Training Logs: loss_final: 1.834600, loss_mean: 0.933384, proj_loss: -0.314618, loss_mean_cls: 1.215835, grad_norm: 4.119919 +Steps: 0%| | 1208/1000000 [04:58<67:48:45, 4.09it/s, grad_norm=4.12, loss_final=1.83, loss_mean=0.933, loss_mean_cls=1.22, proj_loss=-0.315][2026-03-22 14:23:35] Step: 1208, Training Logs: loss_final: 1.767472, loss_mean: 0.956410, proj_loss: -0.319158, loss_mean_cls: 1.130220, grad_norm: 5.380687 +Steps: 0%| | 1209/1000000 [04:58<67:49:12, 4.09it/s, grad_norm=5.38, loss_final=1.77, loss_mean=0.956, loss_mean_cls=1.13, proj_loss=-0.319][2026-03-22 14:23:35] Step: 1209, Training Logs: loss_final: 1.916335, loss_mean: 0.927820, proj_loss: -0.304858, loss_mean_cls: 1.293374, grad_norm: 4.253114 +Steps: 0%| | 1210/1000000 [04:59<67:48:03, 4.09it/s, grad_norm=4.25, loss_final=1.92, loss_mean=0.928, loss_mean_cls=1.29, proj_loss=-0.305][2026-03-22 14:23:36] Step: 1210, Training Logs: loss_final: 1.846698, loss_mean: 0.942709, proj_loss: -0.314171, loss_mean_cls: 1.218160, grad_norm: 2.710320 +Steps: 0%| | 1211/1000000 [04:59<67:48:15, 4.09it/s, grad_norm=2.71, loss_final=1.85, loss_mean=0.943, loss_mean_cls=1.22, proj_loss=-0.314][2026-03-22 14:23:36] Step: 1211, Training Logs: loss_final: 1.805889, loss_mean: 0.946234, proj_loss: -0.318246, loss_mean_cls: 1.177902, grad_norm: 4.075829 +Steps: 0%| | 1212/1000000 [04:59<67:47:47, 4.09it/s, grad_norm=4.08, loss_final=1.81, loss_mean=0.946, loss_mean_cls=1.18, proj_loss=-0.318][2026-03-22 14:23:36] Step: 1212, Training Logs: loss_final: 1.860186, loss_mean: 0.940715, proj_loss: -0.306541, loss_mean_cls: 1.226012, grad_norm: 3.171114 +Steps: 0%| | 1213/1000000 [04:59<67:49:24, 4.09it/s, grad_norm=3.17, loss_final=1.86, loss_mean=0.941, loss_mean_cls=1.23, proj_loss=-0.307][2026-03-22 14:23:36] Step: 1213, Training Logs: loss_final: 1.740132, loss_mean: 0.953696, proj_loss: -0.314842, loss_mean_cls: 1.101278, grad_norm: 3.720135 +Steps: 0%| | 1214/1000000 [05:00<67:49:09, 4.09it/s, grad_norm=3.72, loss_final=1.74, loss_mean=0.954, loss_mean_cls=1.1, proj_loss=-0.315][2026-03-22 14:23:37] Step: 1214, Training Logs: loss_final: 1.828593, loss_mean: 0.944097, proj_loss: -0.318704, loss_mean_cls: 1.203201, grad_norm: 4.946072 +Steps: 0%| | 1215/1000000 [05:00<67:48:46, 4.09it/s, grad_norm=4.95, loss_final=1.83, loss_mean=0.944, loss_mean_cls=1.2, proj_loss=-0.319][2026-03-22 14:23:37] Step: 1215, Training Logs: loss_final: 1.833663, loss_mean: 0.919030, proj_loss: -0.311385, loss_mean_cls: 1.226018, grad_norm: 2.580941 +Steps: 0%| | 1216/1000000 
[2026-03-22 14:23:37] Step: 1216, Training Logs: loss_final: 1.846532, loss_mean: 0.945307, proj_loss: -0.311031, loss_mean_cls: 1.212255, grad_norm: 2.896205
[... steps 1217–1439 elided (repetitive per-step log lines at ≈4.09 it/s, ETA ≈68 h for 1,000,000 steps): loss_final ≈ 1.54–2.01, loss_mean ≈ 0.89–0.98, loss_mean_cls ≈ 0.92–1.45, proj_loss drifting from ≈ -0.31 to ≈ -0.35, grad_norm ≈ 1.5–6.2 ...]
[2026-03-22 14:24:32] Step: 1440, Training Logs: loss_final: 1.770981, loss_mean: 0.917709, proj_loss: -0.343922, loss_mean_cls: 1.197193, grad_norm: 3.810437
[05:55<67:56:43, 4.08it/s, grad_norm=3.81, loss_final=1.77, loss_mean=0.918, loss_mean_cls=1.2, proj_loss=-0.344][2026-03-22 14:24:32] Step: 1441, Training Logs: loss_final: 1.745531, loss_mean: 0.934162, proj_loss: -0.340332, loss_mean_cls: 1.151701, grad_norm: 3.261314 +Steps: 0%| | 1442/1000000 [05:56<67:54:17, 4.08it/s, grad_norm=3.26, loss_final=1.75, loss_mean=0.934, loss_mean_cls=1.15, proj_loss=-0.34][2026-03-22 14:24:33] Step: 1442, Training Logs: loss_final: 1.903999, loss_mean: 0.923041, proj_loss: -0.328683, loss_mean_cls: 1.309640, grad_norm: 3.734756 +Steps: 0%| | 1443/1000000 [05:56<67:51:15, 4.09it/s, grad_norm=3.73, loss_final=1.9, loss_mean=0.923, loss_mean_cls=1.31, proj_loss=-0.329][2026-03-22 14:24:33] Step: 1443, Training Logs: loss_final: 1.688359, loss_mean: 0.948230, proj_loss: -0.343881, loss_mean_cls: 1.084011, grad_norm: 2.711815 +Steps: 0%| | 1444/1000000 [05:56<67:50:11, 4.09it/s, grad_norm=2.71, loss_final=1.69, loss_mean=0.948, loss_mean_cls=1.08, proj_loss=-0.344][2026-03-22 14:24:33] Step: 1444, Training Logs: loss_final: 1.684334, loss_mean: 0.941145, proj_loss: -0.347786, loss_mean_cls: 1.090975, grad_norm: 4.022442 +Steps: 0%| | 1445/1000000 [05:56<67:48:17, 4.09it/s, grad_norm=4.02, loss_final=1.68, loss_mean=0.941, loss_mean_cls=1.09, proj_loss=-0.348][2026-03-22 14:24:33] Step: 1445, Training Logs: loss_final: 1.748541, loss_mean: 0.935925, proj_loss: -0.341130, loss_mean_cls: 1.153746, grad_norm: 3.600013 +Steps: 0%| | 1446/1000000 [05:57<67:48:24, 4.09it/s, grad_norm=3.6, loss_final=1.75, loss_mean=0.936, loss_mean_cls=1.15, proj_loss=-0.341][2026-03-22 14:24:33] Step: 1446, Training Logs: loss_final: 1.698770, loss_mean: 0.936513, proj_loss: -0.340813, loss_mean_cls: 1.103070, grad_norm: 3.073715 +Steps: 0%| | 1447/1000000 [05:57<67:48:46, 4.09it/s, grad_norm=3.07, loss_final=1.7, loss_mean=0.937, loss_mean_cls=1.1, proj_loss=-0.341][2026-03-22 14:24:34] Step: 1447, Training Logs: loss_final: 1.769682, loss_mean: 0.915995, proj_loss: -0.343955, loss_mean_cls: 1.197642, grad_norm: 3.248551 +Steps: 0%| | 1448/1000000 [05:57<67:49:47, 4.09it/s, grad_norm=3.25, loss_final=1.77, loss_mean=0.916, loss_mean_cls=1.2, proj_loss=-0.344][2026-03-22 14:24:34] Step: 1448, Training Logs: loss_final: 1.753075, loss_mean: 0.918574, proj_loss: -0.344721, loss_mean_cls: 1.179222, grad_norm: 4.959263 +Steps: 0%| | 1449/1000000 [05:57<67:48:16, 4.09it/s, grad_norm=4.96, loss_final=1.75, loss_mean=0.919, loss_mean_cls=1.18, proj_loss=-0.345][2026-03-22 14:24:34] Step: 1449, Training Logs: loss_final: 1.901393, loss_mean: 0.919279, proj_loss: -0.330663, loss_mean_cls: 1.312777, grad_norm: 4.810872 +Steps: 0%| | 1450/1000000 [05:58<67:46:54, 4.09it/s, grad_norm=4.81, loss_final=1.9, loss_mean=0.919, loss_mean_cls=1.31, proj_loss=-0.331][2026-03-22 14:24:34] Step: 1450, Training Logs: loss_final: 1.696392, loss_mean: 0.961262, proj_loss: -0.346999, loss_mean_cls: 1.082129, grad_norm: 3.195325 +Steps: 0%| | 1451/1000000 [05:58<67:48:31, 4.09it/s, grad_norm=3.2, loss_final=1.7, loss_mean=0.961, loss_mean_cls=1.08, proj_loss=-0.347][2026-03-22 14:24:35] Step: 1451, Training Logs: loss_final: 1.820456, loss_mean: 0.916044, proj_loss: -0.344889, loss_mean_cls: 1.249301, grad_norm: 3.379948 +Steps: 0%| | 1452/1000000 [05:58<67:48:35, 4.09it/s, grad_norm=3.38, loss_final=1.82, loss_mean=0.916, loss_mean_cls=1.25, proj_loss=-0.345][2026-03-22 14:24:35] Step: 1452, Training Logs: loss_final: 1.776486, loss_mean: 0.920899, proj_loss: -0.342595, loss_mean_cls: 1.198183, grad_norm: 
3.516138 +Steps: 0%| | 1453/1000000 [05:58<67:47:56, 4.09it/s, grad_norm=3.52, loss_final=1.78, loss_mean=0.921, loss_mean_cls=1.2, proj_loss=-0.343][2026-03-22 14:24:35] Step: 1453, Training Logs: loss_final: 1.710713, loss_mean: 0.940272, proj_loss: -0.351353, loss_mean_cls: 1.121793, grad_norm: 3.059931 +Steps: 0%| | 1454/1000000 [05:58<67:46:26, 4.09it/s, grad_norm=3.06, loss_final=1.71, loss_mean=0.94, loss_mean_cls=1.12, proj_loss=-0.351][2026-03-22 14:24:35] Step: 1454, Training Logs: loss_final: 1.595119, loss_mean: 0.957194, proj_loss: -0.355362, loss_mean_cls: 0.993287, grad_norm: 2.501424 +Steps: 0%| | 1455/1000000 [05:59<67:45:41, 4.09it/s, grad_norm=2.5, loss_final=1.6, loss_mean=0.957, loss_mean_cls=0.993, proj_loss=-0.355][2026-03-22 14:24:36] Step: 1455, Training Logs: loss_final: 1.760036, loss_mean: 0.931813, proj_loss: -0.342153, loss_mean_cls: 1.170377, grad_norm: 3.689324 +Steps: 0%| | 1456/1000000 [05:59<67:46:59, 4.09it/s, grad_norm=3.69, loss_final=1.76, loss_mean=0.932, loss_mean_cls=1.17, proj_loss=-0.342][2026-03-22 14:24:36] Step: 1456, Training Logs: loss_final: 1.640018, loss_mean: 0.946188, proj_loss: -0.350334, loss_mean_cls: 1.044165, grad_norm: 3.594187 +Steps: 0%| | 1457/1000000 [05:59<67:47:29, 4.09it/s, grad_norm=3.59, loss_final=1.64, loss_mean=0.946, loss_mean_cls=1.04, proj_loss=-0.35][2026-03-22 14:24:36] Step: 1457, Training Logs: loss_final: 1.697036, loss_mean: 0.944677, proj_loss: -0.346806, loss_mean_cls: 1.099165, grad_norm: 2.715035 +Steps: 0%| | 1458/1000000 [05:59<67:47:38, 4.09it/s, grad_norm=2.72, loss_final=1.7, loss_mean=0.945, loss_mean_cls=1.1, proj_loss=-0.347][2026-03-22 14:24:36] Step: 1458, Training Logs: loss_final: 1.786386, loss_mean: 0.919679, proj_loss: -0.335795, loss_mean_cls: 1.202502, grad_norm: 2.190312 +Steps: 0%| | 1459/1000000 [06:00<67:48:41, 4.09it/s, grad_norm=2.19, loss_final=1.79, loss_mean=0.92, loss_mean_cls=1.2, proj_loss=-0.336][2026-03-22 14:24:37] Step: 1459, Training Logs: loss_final: 1.717740, loss_mean: 0.927113, proj_loss: -0.349887, loss_mean_cls: 1.140514, grad_norm: 3.337408 +Steps: 0%| | 1460/1000000 [06:00<67:48:49, 4.09it/s, grad_norm=3.34, loss_final=1.72, loss_mean=0.927, loss_mean_cls=1.14, proj_loss=-0.35][2026-03-22 14:24:37] Step: 1460, Training Logs: loss_final: 1.669442, loss_mean: 0.957662, proj_loss: -0.347184, loss_mean_cls: 1.058964, grad_norm: 3.665614 +Steps: 0%| | 1461/1000000 [06:00<67:48:42, 4.09it/s, grad_norm=3.67, loss_final=1.67, loss_mean=0.958, loss_mean_cls=1.06, proj_loss=-0.347][2026-03-22 14:24:37] Step: 1461, Training Logs: loss_final: 1.870960, loss_mean: 0.921408, proj_loss: -0.327336, loss_mean_cls: 1.276888, grad_norm: 2.111585 +Steps: 0%| | 1462/1000000 [06:00<67:48:23, 4.09it/s, grad_norm=2.11, loss_final=1.87, loss_mean=0.921, loss_mean_cls=1.28, proj_loss=-0.327][2026-03-22 14:24:37] Step: 1462, Training Logs: loss_final: 1.664149, loss_mean: 0.942515, proj_loss: -0.349831, loss_mean_cls: 1.071465, grad_norm: 3.786124 +Steps: 0%| | 1463/1000000 [06:01<67:48:08, 4.09it/s, grad_norm=3.79, loss_final=1.66, loss_mean=0.943, loss_mean_cls=1.07, proj_loss=-0.35][2026-03-22 14:24:38] Step: 1463, Training Logs: loss_final: 1.648456, loss_mean: 0.930032, proj_loss: -0.344569, loss_mean_cls: 1.062993, grad_norm: 2.561708 +Steps: 0%| | 1464/1000000 [06:01<67:49:08, 4.09it/s, grad_norm=2.56, loss_final=1.65, loss_mean=0.93, loss_mean_cls=1.06, proj_loss=-0.345][2026-03-22 14:24:38] Step: 1464, Training Logs: loss_final: 1.681329, loss_mean: 0.923724, proj_loss: -0.348729, 
loss_mean_cls: 1.106333, grad_norm: 2.155669 +Steps: 0%| | 1465/1000000 [06:01<67:49:50, 4.09it/s, grad_norm=2.16, loss_final=1.68, loss_mean=0.924, loss_mean_cls=1.11, proj_loss=-0.349][2026-03-22 14:24:38] Step: 1465, Training Logs: loss_final: 1.758503, loss_mean: 0.927439, proj_loss: -0.343472, loss_mean_cls: 1.174536, grad_norm: 4.309298 +Steps: 0%| | 1466/1000000 [06:01<67:54:09, 4.08it/s, grad_norm=4.31, loss_final=1.76, loss_mean=0.927, loss_mean_cls=1.17, proj_loss=-0.343][2026-03-22 14:24:38] Step: 1466, Training Logs: loss_final: 1.785414, loss_mean: 0.936695, proj_loss: -0.342696, loss_mean_cls: 1.191415, grad_norm: 3.329139 +Steps: 0%| | 1467/1000000 [06:02<67:52:56, 4.09it/s, grad_norm=3.33, loss_final=1.79, loss_mean=0.937, loss_mean_cls=1.19, proj_loss=-0.343][2026-03-22 14:24:39] Step: 1467, Training Logs: loss_final: 1.672291, loss_mean: 0.941212, proj_loss: -0.348925, loss_mean_cls: 1.080004, grad_norm: 2.486570 +Steps: 0%| | 1468/1000000 [06:02<67:51:44, 4.09it/s, grad_norm=2.49, loss_final=1.67, loss_mean=0.941, loss_mean_cls=1.08, proj_loss=-0.349][2026-03-22 14:24:39] Step: 1468, Training Logs: loss_final: 1.785163, loss_mean: 0.927288, proj_loss: -0.338889, loss_mean_cls: 1.196764, grad_norm: 3.221417 +Steps: 0%| | 1469/1000000 [06:02<67:53:16, 4.09it/s, grad_norm=3.22, loss_final=1.79, loss_mean=0.927, loss_mean_cls=1.2, proj_loss=-0.339][2026-03-22 14:24:39] Step: 1469, Training Logs: loss_final: 1.715548, loss_mean: 0.940317, proj_loss: -0.341192, loss_mean_cls: 1.116423, grad_norm: 3.298194 +Steps: 0%| | 1470/1000000 [06:02<67:52:19, 4.09it/s, grad_norm=3.3, loss_final=1.72, loss_mean=0.94, loss_mean_cls=1.12, proj_loss=-0.341][2026-03-22 14:24:39] Step: 1470, Training Logs: loss_final: 1.750876, loss_mean: 0.908658, proj_loss: -0.336413, loss_mean_cls: 1.178630, grad_norm: 2.845670 +Steps: 0%| | 1471/1000000 [06:03<67:50:48, 4.09it/s, grad_norm=2.85, loss_final=1.75, loss_mean=0.909, loss_mean_cls=1.18, proj_loss=-0.336][2026-03-22 14:24:40] Step: 1471, Training Logs: loss_final: 1.763510, loss_mean: 0.921182, proj_loss: -0.338359, loss_mean_cls: 1.180687, grad_norm: 1.978478 +Steps: 0%| | 1472/1000000 [06:03<67:58:29, 4.08it/s, grad_norm=1.98, loss_final=1.76, loss_mean=0.921, loss_mean_cls=1.18, proj_loss=-0.338][2026-03-22 14:24:40] Step: 1472, Training Logs: loss_final: 1.619691, loss_mean: 0.973082, proj_loss: -0.355023, loss_mean_cls: 1.001632, grad_norm: 3.048279 +Steps: 0%| | 1473/1000000 [06:03<67:54:57, 4.08it/s, grad_norm=3.05, loss_final=1.62, loss_mean=0.973, loss_mean_cls=1, proj_loss=-0.355][2026-03-22 14:24:40] Step: 1473, Training Logs: loss_final: 1.843519, loss_mean: 0.918213, proj_loss: -0.333929, loss_mean_cls: 1.259236, grad_norm: 3.430544 +Steps: 0%| | 1474/1000000 [06:03<67:52:18, 4.09it/s, grad_norm=3.43, loss_final=1.84, loss_mean=0.918, loss_mean_cls=1.26, proj_loss=-0.334][2026-03-22 14:24:40] Step: 1474, Training Logs: loss_final: 1.739318, loss_mean: 0.941495, proj_loss: -0.342727, loss_mean_cls: 1.140550, grad_norm: 3.824240 +Steps: 0%| | 1475/1000000 [06:04<67:49:36, 4.09it/s, grad_norm=3.82, loss_final=1.74, loss_mean=0.941, loss_mean_cls=1.14, proj_loss=-0.343][2026-03-22 14:24:41] Step: 1475, Training Logs: loss_final: 1.685955, loss_mean: 0.935465, proj_loss: -0.346244, loss_mean_cls: 1.096734, grad_norm: 3.142507 +Steps: 0%| | 1476/1000000 [06:04<67:50:19, 4.09it/s, grad_norm=3.14, loss_final=1.69, loss_mean=0.935, loss_mean_cls=1.1, proj_loss=-0.346][2026-03-22 14:24:41] Step: 1476, Training Logs: loss_final: 1.804490, 
loss_mean: 0.909495, proj_loss: -0.339588, loss_mean_cls: 1.234583, grad_norm: 4.547951 +Steps: 0%| | 1477/1000000 [06:04<67:50:47, 4.09it/s, grad_norm=4.55, loss_final=1.8, loss_mean=0.909, loss_mean_cls=1.23, proj_loss=-0.34][2026-03-22 14:24:41] Step: 1477, Training Logs: loss_final: 1.739345, loss_mean: 0.920196, proj_loss: -0.347776, loss_mean_cls: 1.166925, grad_norm: 3.363498 +Steps: 0%| | 1478/1000000 [06:04<67:51:06, 4.09it/s, grad_norm=3.36, loss_final=1.74, loss_mean=0.92, loss_mean_cls=1.17, proj_loss=-0.348][2026-03-22 14:24:41] Step: 1478, Training Logs: loss_final: 1.808104, loss_mean: 0.905831, proj_loss: -0.340338, loss_mean_cls: 1.242610, grad_norm: 2.888258 +Steps: 0%| | 1479/1000000 [06:05<67:49:46, 4.09it/s, grad_norm=2.89, loss_final=1.81, loss_mean=0.906, loss_mean_cls=1.24, proj_loss=-0.34][2026-03-22 14:24:42] Step: 1479, Training Logs: loss_final: 1.775763, loss_mean: 0.915251, proj_loss: -0.348843, loss_mean_cls: 1.209355, grad_norm: 3.335517 +Steps: 0%| | 1480/1000000 [06:05<67:49:52, 4.09it/s, grad_norm=3.34, loss_final=1.78, loss_mean=0.915, loss_mean_cls=1.21, proj_loss=-0.349][2026-03-22 14:24:42] Step: 1480, Training Logs: loss_final: 1.662782, loss_mean: 0.955601, proj_loss: -0.347824, loss_mean_cls: 1.055006, grad_norm: 2.794362 +Steps: 0%| | 1481/1000000 [06:05<67:50:02, 4.09it/s, grad_norm=2.79, loss_final=1.66, loss_mean=0.956, loss_mean_cls=1.06, proj_loss=-0.348][2026-03-22 14:24:42] Step: 1481, Training Logs: loss_final: 1.782191, loss_mean: 0.934975, proj_loss: -0.342687, loss_mean_cls: 1.189903, grad_norm: 4.158588 +Steps: 0%| | 1482/1000000 [06:05<67:49:31, 4.09it/s, grad_norm=4.16, loss_final=1.78, loss_mean=0.935, loss_mean_cls=1.19, proj_loss=-0.343][2026-03-22 14:24:42] Step: 1482, Training Logs: loss_final: 1.733833, loss_mean: 0.931611, proj_loss: -0.351253, loss_mean_cls: 1.153475, grad_norm: 2.805516 +Steps: 0%| | 1483/1000000 [06:06<67:49:13, 4.09it/s, grad_norm=2.81, loss_final=1.73, loss_mean=0.932, loss_mean_cls=1.15, proj_loss=-0.351][2026-03-22 14:24:43] Step: 1483, Training Logs: loss_final: 1.759606, loss_mean: 0.929367, proj_loss: -0.345907, loss_mean_cls: 1.176147, grad_norm: 4.022744 +Steps: 0%| | 1484/1000000 [06:06<67:50:01, 4.09it/s, grad_norm=4.02, loss_final=1.76, loss_mean=0.929, loss_mean_cls=1.18, proj_loss=-0.346][2026-03-22 14:24:43] Step: 1484, Training Logs: loss_final: 1.701155, loss_mean: 0.930039, proj_loss: -0.351352, loss_mean_cls: 1.122468, grad_norm: 2.881047 +Steps: 0%| | 1485/1000000 [06:06<67:48:30, 4.09it/s, grad_norm=2.88, loss_final=1.7, loss_mean=0.93, loss_mean_cls=1.12, proj_loss=-0.351][2026-03-22 14:24:43] Step: 1485, Training Logs: loss_final: 1.677574, loss_mean: 0.947131, proj_loss: -0.354252, loss_mean_cls: 1.084696, grad_norm: 2.256243 +Steps: 0%| | 1486/1000000 [06:06<67:50:14, 4.09it/s, grad_norm=2.26, loss_final=1.68, loss_mean=0.947, loss_mean_cls=1.08, proj_loss=-0.354][2026-03-22 14:24:43] Step: 1486, Training Logs: loss_final: 1.848749, loss_mean: 0.906450, proj_loss: -0.341388, loss_mean_cls: 1.283687, grad_norm: 2.830824 +Steps: 0%| | 1487/1000000 [06:07<67:49:40, 4.09it/s, grad_norm=2.83, loss_final=1.85, loss_mean=0.906, loss_mean_cls=1.28, proj_loss=-0.341][2026-03-22 14:24:44] Step: 1487, Training Logs: loss_final: 1.709771, loss_mean: 0.932850, proj_loss: -0.342547, loss_mean_cls: 1.119468, grad_norm: 2.053018 +Steps: 0%| | 1488/1000000 [06:07<67:50:45, 4.09it/s, grad_norm=2.05, loss_final=1.71, loss_mean=0.933, loss_mean_cls=1.12, proj_loss=-0.343][2026-03-22 14:24:44] Step: 
1488, Training Logs: loss_final: 1.783750, loss_mean: 0.902092, proj_loss: -0.344961, loss_mean_cls: 1.226619, grad_norm: 4.024988 +Steps: 0%| | 1489/1000000 [06:07<67:50:56, 4.09it/s, grad_norm=4.02, loss_final=1.78, loss_mean=0.902, loss_mean_cls=1.23, proj_loss=-0.345][2026-03-22 14:24:44] Step: 1489, Training Logs: loss_final: 1.703304, loss_mean: 0.932386, proj_loss: -0.350630, loss_mean_cls: 1.121548, grad_norm: 2.852614 +Steps: 0%| | 1490/1000000 [06:07<67:52:01, 4.09it/s, grad_norm=2.85, loss_final=1.7, loss_mean=0.932, loss_mean_cls=1.12, proj_loss=-0.351][2026-03-22 14:24:44] Step: 1490, Training Logs: loss_final: 1.816993, loss_mean: 0.911706, proj_loss: -0.344403, loss_mean_cls: 1.249689, grad_norm: 3.055920 +Steps: 0%| | 1491/1000000 [06:08<67:49:49, 4.09it/s, grad_norm=3.06, loss_final=1.82, loss_mean=0.912, loss_mean_cls=1.25, proj_loss=-0.344][2026-03-22 14:24:45] Step: 1491, Training Logs: loss_final: 1.722197, loss_mean: 0.933080, proj_loss: -0.346543, loss_mean_cls: 1.135660, grad_norm: 2.032409 +Steps: 0%| | 1492/1000000 [06:08<67:50:21, 4.09it/s, grad_norm=2.03, loss_final=1.72, loss_mean=0.933, loss_mean_cls=1.14, proj_loss=-0.347][2026-03-22 14:24:45] Step: 1492, Training Logs: loss_final: 1.669232, loss_mean: 0.956145, proj_loss: -0.353064, loss_mean_cls: 1.066152, grad_norm: 2.960075 +Steps: 0%| | 1493/1000000 [06:08<67:48:41, 4.09it/s, grad_norm=2.96, loss_final=1.67, loss_mean=0.956, loss_mean_cls=1.07, proj_loss=-0.353][2026-03-22 14:24:45] Step: 1493, Training Logs: loss_final: 1.739594, loss_mean: 0.930220, proj_loss: -0.345375, loss_mean_cls: 1.154749, grad_norm: 3.935670 +Steps: 0%| | 1494/1000000 [06:08<67:47:16, 4.09it/s, grad_norm=3.94, loss_final=1.74, loss_mean=0.93, loss_mean_cls=1.15, proj_loss=-0.345][2026-03-22 14:24:45] Step: 1494, Training Logs: loss_final: 1.716275, loss_mean: 0.918681, proj_loss: -0.346981, loss_mean_cls: 1.144576, grad_norm: 2.869855 +Steps: 0%| | 1495/1000000 [06:09<67:47:42, 4.09it/s, grad_norm=2.87, loss_final=1.72, loss_mean=0.919, loss_mean_cls=1.14, proj_loss=-0.347][2026-03-22 14:24:45] Step: 1495, Training Logs: loss_final: 1.713073, loss_mean: 0.935481, proj_loss: -0.346820, loss_mean_cls: 1.124412, grad_norm: 2.538412 +Steps: 0%| | 1496/1000000 [06:09<68:24:40, 4.05it/s, grad_norm=2.54, loss_final=1.71, loss_mean=0.935, loss_mean_cls=1.12, proj_loss=-0.347][2026-03-22 14:24:46] Step: 1496, Training Logs: loss_final: 1.640932, loss_mean: 0.934873, proj_loss: -0.357771, loss_mean_cls: 1.063830, grad_norm: 2.628049 +Steps: 0%| | 1497/1000000 [06:09<68:12:46, 4.07it/s, grad_norm=2.63, loss_final=1.64, loss_mean=0.935, loss_mean_cls=1.06, proj_loss=-0.358][2026-03-22 14:24:46] Step: 1497, Training Logs: loss_final: 1.812789, loss_mean: 0.915670, proj_loss: -0.346552, loss_mean_cls: 1.243672, grad_norm: 3.554099 +Steps: 0%| | 1498/1000000 [06:09<68:05:25, 4.07it/s, grad_norm=3.55, loss_final=1.81, loss_mean=0.916, loss_mean_cls=1.24, proj_loss=-0.347][2026-03-22 14:24:46] Step: 1498, Training Logs: loss_final: 1.755622, loss_mean: 0.899862, proj_loss: -0.348669, loss_mean_cls: 1.204430, grad_norm: 2.960074 +Steps: 0%| | 1499/1000000 [06:10<68:01:56, 4.08it/s, grad_norm=2.96, loss_final=1.76, loss_mean=0.9, loss_mean_cls=1.2, proj_loss=-0.349][2026-03-22 14:24:46] Step: 1499, Training Logs: loss_final: 1.865455, loss_mean: 0.932722, proj_loss: -0.338826, loss_mean_cls: 1.271560, grad_norm: 3.435601 +Steps: 0%| | 1500/1000000 [06:10<68:02:32, 4.08it/s, grad_norm=3.44, loss_final=1.87, loss_mean=0.933, loss_mean_cls=1.27, 
proj_loss=-0.339][2026-03-22 14:24:47] Step: 1500, Training Logs: loss_final: 1.801630, loss_mean: 0.911498, proj_loss: -0.344018, loss_mean_cls: 1.234150, grad_norm: 3.335212 +Steps: 0%| | 1501/1000000 [06:10<67:58:01, 4.08it/s, grad_norm=3.34, loss_final=1.8, loss_mean=0.911, loss_mean_cls=1.23, proj_loss=-0.344][2026-03-22 14:24:47] Step: 1501, Training Logs: loss_final: 1.828490, loss_mean: 0.927346, proj_loss: -0.334890, loss_mean_cls: 1.236035, grad_norm: 2.161504 +Steps: 0%| | 1502/1000000 [06:10<67:55:04, 4.08it/s, grad_norm=2.16, loss_final=1.83, loss_mean=0.927, loss_mean_cls=1.24, proj_loss=-0.335][2026-03-22 14:24:47] Step: 1502, Training Logs: loss_final: 1.855282, loss_mean: 0.925494, proj_loss: -0.339164, loss_mean_cls: 1.268952, grad_norm: 4.631639 +Steps: 0%| | 1503/1000000 [06:10<68:00:58, 4.08it/s, grad_norm=4.63, loss_final=1.86, loss_mean=0.925, loss_mean_cls=1.27, proj_loss=-0.339][2026-03-22 14:24:47] Step: 1503, Training Logs: loss_final: 1.664078, loss_mean: 0.930480, proj_loss: -0.353715, loss_mean_cls: 1.087313, grad_norm: 1.764878 +Steps: 0%| | 1504/1000000 [06:11<67:58:03, 4.08it/s, grad_norm=1.76, loss_final=1.66, loss_mean=0.93, loss_mean_cls=1.09, proj_loss=-0.354][2026-03-22 14:24:48] Step: 1504, Training Logs: loss_final: 1.639790, loss_mean: 0.945657, proj_loss: -0.352546, loss_mean_cls: 1.046679, grad_norm: 4.306772 +Steps: 0%| | 1505/1000000 [06:11<67:55:08, 4.08it/s, grad_norm=4.31, loss_final=1.64, loss_mean=0.946, loss_mean_cls=1.05, proj_loss=-0.353][2026-03-22 14:24:48] Step: 1505, Training Logs: loss_final: 1.671120, loss_mean: 0.936583, proj_loss: -0.355253, loss_mean_cls: 1.089789, grad_norm: 3.224893 +Steps: 0%| | 1506/1000000 [06:11<67:54:05, 4.08it/s, grad_norm=3.22, loss_final=1.67, loss_mean=0.937, loss_mean_cls=1.09, proj_loss=-0.355][2026-03-22 14:24:48] Step: 1506, Training Logs: loss_final: 1.660668, loss_mean: 0.932313, proj_loss: -0.349924, loss_mean_cls: 1.078279, grad_norm: 2.156471 +Steps: 0%| | 1507/1000000 [06:11<67:51:08, 4.09it/s, grad_norm=2.16, loss_final=1.66, loss_mean=0.932, loss_mean_cls=1.08, proj_loss=-0.35][2026-03-22 14:24:48] Step: 1507, Training Logs: loss_final: 1.869651, loss_mean: 0.917650, proj_loss: -0.343609, loss_mean_cls: 1.295611, grad_norm: 4.703074 +Steps: 0%| | 1508/1000000 [06:12<67:50:33, 4.09it/s, grad_norm=4.7, loss_final=1.87, loss_mean=0.918, loss_mean_cls=1.3, proj_loss=-0.344][2026-03-22 14:24:49] Step: 1508, Training Logs: loss_final: 1.822348, loss_mean: 0.932816, proj_loss: -0.346360, loss_mean_cls: 1.235892, grad_norm: 4.771918 +Steps: 0%| | 1509/1000000 [06:12<67:48:45, 4.09it/s, grad_norm=4.77, loss_final=1.82, loss_mean=0.933, loss_mean_cls=1.24, proj_loss=-0.346][2026-03-22 14:24:49] Step: 1509, Training Logs: loss_final: 1.663655, loss_mean: 0.959773, proj_loss: -0.354544, loss_mean_cls: 1.058426, grad_norm: 2.898901 +Steps: 0%| | 1510/1000000 [06:12<67:48:29, 4.09it/s, grad_norm=2.9, loss_final=1.66, loss_mean=0.96, loss_mean_cls=1.06, proj_loss=-0.355][2026-03-22 14:24:49] Step: 1510, Training Logs: loss_final: 1.670935, loss_mean: 0.932556, proj_loss: -0.345986, loss_mean_cls: 1.084364, grad_norm: 2.658350 +Steps: 0%| | 1511/1000000 [06:12<67:47:38, 4.09it/s, grad_norm=2.66, loss_final=1.67, loss_mean=0.933, loss_mean_cls=1.08, proj_loss=-0.346][2026-03-22 14:24:49] Step: 1511, Training Logs: loss_final: 1.840092, loss_mean: 0.885918, proj_loss: -0.344878, loss_mean_cls: 1.299052, grad_norm: 4.544338 +Steps: 0%| | 1512/1000000 [06:13<67:49:19, 4.09it/s, grad_norm=4.54, 
loss_final=1.84, loss_mean=0.886, loss_mean_cls=1.3, proj_loss=-0.345][2026-03-22 14:24:50] Step: 1512, Training Logs: loss_final: 1.780769, loss_mean: 0.943219, proj_loss: -0.340286, loss_mean_cls: 1.177836, grad_norm: 3.745853 +Steps: 0%| | 1513/1000000 [06:13<68:34:41, 4.04it/s, grad_norm=3.75, loss_final=1.78, loss_mean=0.943, loss_mean_cls=1.18, proj_loss=-0.34][2026-03-22 14:24:50] Step: 1513, Training Logs: loss_final: 1.729316, loss_mean: 0.919737, proj_loss: -0.347271, loss_mean_cls: 1.156851, grad_norm: 2.771198 +Steps: 0%| | 1514/1000000 [06:13<68:21:18, 4.06it/s, grad_norm=2.77, loss_final=1.73, loss_mean=0.92, loss_mean_cls=1.16, proj_loss=-0.347][2026-03-22 14:24:50] Step: 1514, Training Logs: loss_final: 1.785243, loss_mean: 0.944652, proj_loss: -0.341425, loss_mean_cls: 1.182016, grad_norm: 3.002562 +Steps: 0%| | 1515/1000000 [06:13<68:09:43, 4.07it/s, grad_norm=3, loss_final=1.79, loss_mean=0.945, loss_mean_cls=1.18, proj_loss=-0.341][2026-03-22 14:24:50] Step: 1515, Training Logs: loss_final: 1.733733, loss_mean: 0.916696, proj_loss: -0.343643, loss_mean_cls: 1.160681, grad_norm: 1.788266 +Steps: 0%| | 1516/1000000 [06:14<68:02:29, 4.08it/s, grad_norm=1.79, loss_final=1.73, loss_mean=0.917, loss_mean_cls=1.16, proj_loss=-0.344][2026-03-22 14:24:51] Step: 1516, Training Logs: loss_final: 1.708834, loss_mean: 0.943142, proj_loss: -0.342268, loss_mean_cls: 1.107960, grad_norm: 3.081567 +Steps: 0%| | 1517/1000000 [06:14<67:58:42, 4.08it/s, grad_norm=3.08, loss_final=1.71, loss_mean=0.943, loss_mean_cls=1.11, proj_loss=-0.342][2026-03-22 14:24:51] Step: 1517, Training Logs: loss_final: 1.560595, loss_mean: 0.958666, proj_loss: -0.353170, loss_mean_cls: 0.955100, grad_norm: 1.484715 +Steps: 0%| | 1518/1000000 [06:14<67:55:21, 4.08it/s, grad_norm=1.48, loss_final=1.56, loss_mean=0.959, loss_mean_cls=0.955, proj_loss=-0.353][2026-03-22 14:24:51] Step: 1518, Training Logs: loss_final: 1.848190, loss_mean: 0.906096, proj_loss: -0.341419, loss_mean_cls: 1.283513, grad_norm: 3.583914 +Steps: 0%| | 1519/1000000 [06:14<67:52:43, 4.09it/s, grad_norm=3.58, loss_final=1.85, loss_mean=0.906, loss_mean_cls=1.28, proj_loss=-0.341][2026-03-22 14:24:51] Step: 1519, Training Logs: loss_final: 1.772635, loss_mean: 0.925569, proj_loss: -0.340738, loss_mean_cls: 1.187803, grad_norm: 3.074940 +Steps: 0%| | 1520/1000000 [06:15<68:07:10, 4.07it/s, grad_norm=3.07, loss_final=1.77, loss_mean=0.926, loss_mean_cls=1.19, proj_loss=-0.341][2026-03-22 14:24:52] Step: 1520, Training Logs: loss_final: 1.728146, loss_mean: 0.915994, proj_loss: -0.339253, loss_mean_cls: 1.151404, grad_norm: 2.109151 +Steps: 0%| | 1521/1000000 [06:15<68:05:43, 4.07it/s, grad_norm=2.11, loss_final=1.73, loss_mean=0.916, loss_mean_cls=1.15, proj_loss=-0.339][2026-03-22 14:24:52] Step: 1521, Training Logs: loss_final: 1.762693, loss_mean: 0.936937, proj_loss: -0.347961, loss_mean_cls: 1.173716, grad_norm: 3.333568 +Steps: 0%| | 1522/1000000 [06:15<68:04:38, 4.07it/s, grad_norm=3.33, loss_final=1.76, loss_mean=0.937, loss_mean_cls=1.17, proj_loss=-0.348][2026-03-22 14:24:52] Step: 1522, Training Logs: loss_final: 1.644622, loss_mean: 0.949722, proj_loss: -0.353894, loss_mean_cls: 1.048794, grad_norm: 3.049503 +Steps: 0%| | 1523/1000000 [06:15<68:02:30, 4.08it/s, grad_norm=3.05, loss_final=1.64, loss_mean=0.95, loss_mean_cls=1.05, proj_loss=-0.354][2026-03-22 14:24:52] Step: 1523, Training Logs: loss_final: 1.715042, loss_mean: 0.933911, proj_loss: -0.348840, loss_mean_cls: 1.129970, grad_norm: 2.395639 +Steps: 0%| | 1524/1000000 
[06:16<67:59:27, 4.08it/s, grad_norm=2.4, loss_final=1.72, loss_mean=0.934, loss_mean_cls=1.13, proj_loss=-0.349][2026-03-22 14:24:53] Step: 1524, Training Logs: loss_final: 1.791627, loss_mean: 0.907271, proj_loss: -0.341253, loss_mean_cls: 1.225609, grad_norm: 3.367114 +Steps: 0%| | 1525/1000000 [06:16<67:58:03, 4.08it/s, grad_norm=3.37, loss_final=1.79, loss_mean=0.907, loss_mean_cls=1.23, proj_loss=-0.341][2026-03-22 14:24:53] Step: 1525, Training Logs: loss_final: 1.780000, loss_mean: 0.920997, proj_loss: -0.344520, loss_mean_cls: 1.203523, grad_norm: 3.129656 +Steps: 0%| | 1526/1000000 [06:16<67:54:31, 4.08it/s, grad_norm=3.13, loss_final=1.78, loss_mean=0.921, loss_mean_cls=1.2, proj_loss=-0.345][2026-03-22 14:24:53] Step: 1526, Training Logs: loss_final: 1.802236, loss_mean: 0.927452, proj_loss: -0.343560, loss_mean_cls: 1.218344, grad_norm: 3.545235 +Steps: 0%| | 1527/1000000 [06:16<67:52:28, 4.09it/s, grad_norm=3.55, loss_final=1.8, loss_mean=0.927, loss_mean_cls=1.22, proj_loss=-0.344][2026-03-22 14:24:53] Step: 1527, Training Logs: loss_final: 1.749302, loss_mean: 0.930679, proj_loss: -0.348961, loss_mean_cls: 1.167584, grad_norm: 3.887058 +Steps: 0%| | 1528/1000000 [06:17<67:50:16, 4.09it/s, grad_norm=3.89, loss_final=1.75, loss_mean=0.931, loss_mean_cls=1.17, proj_loss=-0.349][2026-03-22 14:24:54] Step: 1528, Training Logs: loss_final: 1.680966, loss_mean: 0.931747, proj_loss: -0.349597, loss_mean_cls: 1.098815, grad_norm: 5.292727 +Steps: 0%| | 1529/1000000 [06:17<67:49:52, 4.09it/s, grad_norm=5.29, loss_final=1.68, loss_mean=0.932, loss_mean_cls=1.1, proj_loss=-0.35][2026-03-22 14:24:54] Step: 1529, Training Logs: loss_final: 1.760717, loss_mean: 0.937752, proj_loss: -0.346220, loss_mean_cls: 1.169184, grad_norm: 4.269207 +Steps: 0%| | 1530/1000000 [06:17<67:49:20, 4.09it/s, grad_norm=4.27, loss_final=1.76, loss_mean=0.938, loss_mean_cls=1.17, proj_loss=-0.346][2026-03-22 14:24:54] Step: 1530, Training Logs: loss_final: 1.601118, loss_mean: 0.944475, proj_loss: -0.352608, loss_mean_cls: 1.009251, grad_norm: 4.067022 +Steps: 0%| | 1531/1000000 [06:17<67:47:27, 4.09it/s, grad_norm=4.07, loss_final=1.6, loss_mean=0.944, loss_mean_cls=1.01, proj_loss=-0.353][2026-03-22 14:24:54] Step: 1531, Training Logs: loss_final: 1.851182, loss_mean: 0.943028, proj_loss: -0.342601, loss_mean_cls: 1.250755, grad_norm: 6.386578 +Steps: 0%| | 1532/1000000 [06:18<67:48:22, 4.09it/s, grad_norm=6.39, loss_final=1.85, loss_mean=0.943, loss_mean_cls=1.25, proj_loss=-0.343][2026-03-22 14:24:55] Step: 1532, Training Logs: loss_final: 1.641120, loss_mean: 0.941364, proj_loss: -0.361149, loss_mean_cls: 1.060905, grad_norm: 4.203162 +Steps: 0%| | 1533/1000000 [06:18<67:46:21, 4.09it/s, grad_norm=4.2, loss_final=1.64, loss_mean=0.941, loss_mean_cls=1.06, proj_loss=-0.361][2026-03-22 14:24:55] Step: 1533, Training Logs: loss_final: 1.768812, loss_mean: 0.934636, proj_loss: -0.342683, loss_mean_cls: 1.176859, grad_norm: 3.110876 +Steps: 0%| | 1534/1000000 [06:18<67:46:19, 4.09it/s, grad_norm=3.11, loss_final=1.77, loss_mean=0.935, loss_mean_cls=1.18, proj_loss=-0.343][2026-03-22 14:24:55] Step: 1534, Training Logs: loss_final: 1.754020, loss_mean: 0.927854, proj_loss: -0.349666, loss_mean_cls: 1.175832, grad_norm: 4.754613 +Steps: 0%| | 1535/1000000 [06:18<67:46:38, 4.09it/s, grad_norm=4.75, loss_final=1.75, loss_mean=0.928, loss_mean_cls=1.18, proj_loss=-0.35][2026-03-22 14:24:55] Step: 1535, Training Logs: loss_final: 1.879802, loss_mean: 0.922976, proj_loss: -0.338670, loss_mean_cls: 1.295496, 
grad_norm: 5.181825 +Steps: 0%| | 1536/1000000 [06:19<67:46:27, 4.09it/s, grad_norm=5.18, loss_final=1.88, loss_mean=0.923, loss_mean_cls=1.3, proj_loss=-0.339][2026-03-22 14:24:56] Step: 1536, Training Logs: loss_final: 1.673127, loss_mean: 0.963101, proj_loss: -0.350650, loss_mean_cls: 1.060676, grad_norm: 2.969548 +Steps: 0%| | 1537/1000000 [06:19<67:46:45, 4.09it/s, grad_norm=2.97, loss_final=1.67, loss_mean=0.963, loss_mean_cls=1.06, proj_loss=-0.351][2026-03-22 14:24:56] Step: 1537, Training Logs: loss_final: 1.729091, loss_mean: 0.923638, proj_loss: -0.348833, loss_mean_cls: 1.154286, grad_norm: 2.688619 +Steps: 0%| | 1538/1000000 [06:19<67:47:03, 4.09it/s, grad_norm=2.69, loss_final=1.73, loss_mean=0.924, loss_mean_cls=1.15, proj_loss=-0.349][2026-03-22 14:24:56] Step: 1538, Training Logs: loss_final: 1.671383, loss_mean: 0.947541, proj_loss: -0.353255, loss_mean_cls: 1.077097, grad_norm: 4.801860 +Steps: 0%| | 1539/1000000 [06:19<67:47:01, 4.09it/s, grad_norm=4.8, loss_final=1.67, loss_mean=0.948, loss_mean_cls=1.08, proj_loss=-0.353][2026-03-22 14:24:56] Step: 1539, Training Logs: loss_final: 1.761064, loss_mean: 0.943833, proj_loss: -0.339376, loss_mean_cls: 1.156607, grad_norm: 4.502411 +Steps: 0%| | 1540/1000000 [06:20<67:47:07, 4.09it/s, grad_norm=4.5, loss_final=1.76, loss_mean=0.944, loss_mean_cls=1.16, proj_loss=-0.339][2026-03-22 14:24:57] Step: 1540, Training Logs: loss_final: 1.752092, loss_mean: 0.937259, proj_loss: -0.352200, loss_mean_cls: 1.167033, grad_norm: 3.206554 +Steps: 0%| | 1541/1000000 [06:20<67:47:37, 4.09it/s, grad_norm=3.21, loss_final=1.75, loss_mean=0.937, loss_mean_cls=1.17, proj_loss=-0.352][2026-03-22 14:24:57] Step: 1541, Training Logs: loss_final: 1.695107, loss_mean: 0.946177, proj_loss: -0.349866, loss_mean_cls: 1.098797, grad_norm: 2.776465 +Steps: 0%| | 1542/1000000 [06:20<67:47:20, 4.09it/s, grad_norm=2.78, loss_final=1.7, loss_mean=0.946, loss_mean_cls=1.1, proj_loss=-0.35][2026-03-22 14:24:57] Step: 1542, Training Logs: loss_final: 1.868937, loss_mean: 0.922220, proj_loss: -0.341670, loss_mean_cls: 1.288386, grad_norm: 4.278832 +Steps: 0%| | 1543/1000000 [06:20<67:48:09, 4.09it/s, grad_norm=4.28, loss_final=1.87, loss_mean=0.922, loss_mean_cls=1.29, proj_loss=-0.342][2026-03-22 14:24:57] Step: 1543, Training Logs: loss_final: 1.677521, loss_mean: 0.945173, proj_loss: -0.351397, loss_mean_cls: 1.083745, grad_norm: 2.955155 +Steps: 0%| | 1544/1000000 [06:21<67:48:24, 4.09it/s, grad_norm=2.96, loss_final=1.68, loss_mean=0.945, loss_mean_cls=1.08, proj_loss=-0.351][2026-03-22 14:24:57] Step: 1544, Training Logs: loss_final: 1.788985, loss_mean: 0.927997, proj_loss: -0.344553, loss_mean_cls: 1.205541, grad_norm: 3.175794 +Steps: 0%| | 1545/1000000 [06:21<67:48:43, 4.09it/s, grad_norm=3.18, loss_final=1.79, loss_mean=0.928, loss_mean_cls=1.21, proj_loss=-0.345][2026-03-22 14:24:58] Step: 1545, Training Logs: loss_final: 1.692937, loss_mean: 0.952120, proj_loss: -0.350435, loss_mean_cls: 1.091252, grad_norm: 3.994453 +Steps: 0%| | 1546/1000000 [06:21<67:47:35, 4.09it/s, grad_norm=3.99, loss_final=1.69, loss_mean=0.952, loss_mean_cls=1.09, proj_loss=-0.35][2026-03-22 14:24:58] Step: 1546, Training Logs: loss_final: 1.781347, loss_mean: 0.931272, proj_loss: -0.342280, loss_mean_cls: 1.192355, grad_norm: 3.992438 +Steps: 0%| | 1547/1000000 [06:21<67:45:05, 4.09it/s, grad_norm=3.99, loss_final=1.78, loss_mean=0.931, loss_mean_cls=1.19, proj_loss=-0.342][2026-03-22 14:24:58] Step: 1547, Training Logs: loss_final: 1.822919, loss_mean: 0.934829, 
proj_loss: -0.350288, loss_mean_cls: 1.238379, grad_norm: 4.193960 +Steps: 0%| | 1548/1000000 [06:22<67:45:27, 4.09it/s, grad_norm=4.19, loss_final=1.82, loss_mean=0.935, loss_mean_cls=1.24, proj_loss=-0.35][2026-03-22 14:24:58] Step: 1548, Training Logs: loss_final: 1.686688, loss_mean: 0.940683, proj_loss: -0.351920, loss_mean_cls: 1.097926, grad_norm: 2.208272 +Steps: 0%| | 1549/1000000 [06:22<67:46:21, 4.09it/s, grad_norm=2.21, loss_final=1.69, loss_mean=0.941, loss_mean_cls=1.1, proj_loss=-0.352][2026-03-22 14:24:59] Step: 1549, Training Logs: loss_final: 1.734468, loss_mean: 0.928568, proj_loss: -0.350968, loss_mean_cls: 1.156868, grad_norm: 2.968140 +Steps: 0%| | 1550/1000000 [06:22<67:46:23, 4.09it/s, grad_norm=2.97, loss_final=1.73, loss_mean=0.929, loss_mean_cls=1.16, proj_loss=-0.351][2026-03-22 14:24:59] Step: 1550, Training Logs: loss_final: 1.722524, loss_mean: 0.943871, proj_loss: -0.354906, loss_mean_cls: 1.133558, grad_norm: 4.897509 +Steps: 0%| | 1551/1000000 [06:22<67:45:58, 4.09it/s, grad_norm=4.9, loss_final=1.72, loss_mean=0.944, loss_mean_cls=1.13, proj_loss=-0.355][2026-03-22 14:24:59] Step: 1551, Training Logs: loss_final: 1.843158, loss_mean: 0.922662, proj_loss: -0.340425, loss_mean_cls: 1.260921, grad_norm: 3.950800 +Steps: 0%| | 1552/1000000 [06:22<67:47:22, 4.09it/s, grad_norm=3.95, loss_final=1.84, loss_mean=0.923, loss_mean_cls=1.26, proj_loss=-0.34][2026-03-22 14:24:59] Step: 1552, Training Logs: loss_final: 1.786490, loss_mean: 0.934739, proj_loss: -0.350403, loss_mean_cls: 1.202154, grad_norm: 4.537816 +Steps: 0%| | 1553/1000000 [06:23<67:47:25, 4.09it/s, grad_norm=4.54, loss_final=1.79, loss_mean=0.935, loss_mean_cls=1.2, proj_loss=-0.35][2026-03-22 14:25:00] Step: 1553, Training Logs: loss_final: 1.738714, loss_mean: 0.933796, proj_loss: -0.349817, loss_mean_cls: 1.154735, grad_norm: 4.518197 +Steps: 0%| | 1554/1000000 [06:23<67:47:14, 4.09it/s, grad_norm=4.52, loss_final=1.74, loss_mean=0.934, loss_mean_cls=1.15, proj_loss=-0.35][2026-03-22 14:25:00] Step: 1554, Training Logs: loss_final: 1.689106, loss_mean: 0.945108, proj_loss: -0.353531, loss_mean_cls: 1.097529, grad_norm: 3.494847 +Steps: 0%| | 1555/1000000 [06:23<67:48:23, 4.09it/s, grad_norm=3.49, loss_final=1.69, loss_mean=0.945, loss_mean_cls=1.1, proj_loss=-0.354][2026-03-22 14:25:00] Step: 1555, Training Logs: loss_final: 1.700909, loss_mean: 0.940212, proj_loss: -0.353991, loss_mean_cls: 1.114689, grad_norm: 4.060174 +Steps: 0%| | 1556/1000000 [06:23<67:48:07, 4.09it/s, grad_norm=4.06, loss_final=1.7, loss_mean=0.94, loss_mean_cls=1.11, proj_loss=-0.354][2026-03-22 14:25:00] Step: 1556, Training Logs: loss_final: 1.822196, loss_mean: 0.917161, proj_loss: -0.346047, loss_mean_cls: 1.251083, grad_norm: 4.038782 +Steps: 0%| | 1557/1000000 [06:24<67:48:27, 4.09it/s, grad_norm=4.04, loss_final=1.82, loss_mean=0.917, loss_mean_cls=1.25, proj_loss=-0.346][2026-03-22 14:25:01] Step: 1557, Training Logs: loss_final: 1.695153, loss_mean: 0.949321, proj_loss: -0.352516, loss_mean_cls: 1.098349, grad_norm: 3.705662 +Steps: 0%| | 1558/1000000 [06:24<67:48:47, 4.09it/s, grad_norm=3.71, loss_final=1.7, loss_mean=0.949, loss_mean_cls=1.1, proj_loss=-0.353][2026-03-22 14:25:01] Step: 1558, Training Logs: loss_final: 1.866230, loss_mean: 0.906515, proj_loss: -0.340906, loss_mean_cls: 1.300621, grad_norm: 4.151407 +Steps: 0%| | 1559/1000000 [06:24<67:47:15, 4.09it/s, grad_norm=4.15, loss_final=1.87, loss_mean=0.907, loss_mean_cls=1.3, proj_loss=-0.341][2026-03-22 14:25:01] Step: 1559, Training Logs: 
loss_final: 1.650881, loss_mean: 0.939642, proj_loss: -0.361345, loss_mean_cls: 1.072583, grad_norm: 2.625775 +Steps: 0%| | 1560/1000000 [06:24<67:58:43, 4.08it/s, grad_norm=2.63, loss_final=1.65, loss_mean=0.94, loss_mean_cls=1.07, proj_loss=-0.361][2026-03-22 14:25:01] Step: 1560, Training Logs: loss_final: 1.688840, loss_mean: 0.938602, proj_loss: -0.358514, loss_mean_cls: 1.108752, grad_norm: 2.740416 +Steps: 0%| | 1561/1000000 [06:25<67:56:42, 4.08it/s, grad_norm=2.74, loss_final=1.69, loss_mean=0.939, loss_mean_cls=1.11, proj_loss=-0.359][2026-03-22 14:25:02] Step: 1561, Training Logs: loss_final: 1.846809, loss_mean: 0.897002, proj_loss: -0.344771, loss_mean_cls: 1.294579, grad_norm: 3.158029 +Steps: 0%| | 1562/1000000 [06:25<68:25:43, 4.05it/s, grad_norm=3.16, loss_final=1.85, loss_mean=0.897, loss_mean_cls=1.29, proj_loss=-0.345][2026-03-22 14:25:02] Step: 1562, Training Logs: loss_final: 1.617348, loss_mean: 0.936732, proj_loss: -0.358310, loss_mean_cls: 1.038925, grad_norm: 2.957936 +Steps: 0%| | 1563/1000000 [06:25<68:20:56, 4.06it/s, grad_norm=2.96, loss_final=1.62, loss_mean=0.937, loss_mean_cls=1.04, proj_loss=-0.358][2026-03-22 14:25:02] Step: 1563, Training Logs: loss_final: 1.784790, loss_mean: 0.928772, proj_loss: -0.346722, loss_mean_cls: 1.202740, grad_norm: 4.172043 +Steps: 0%| | 1564/1000000 [06:25<68:12:01, 4.07it/s, grad_norm=4.17, loss_final=1.78, loss_mean=0.929, loss_mean_cls=1.2, proj_loss=-0.347][2026-03-22 14:25:02] Step: 1564, Training Logs: loss_final: 1.675996, loss_mean: 0.951405, proj_loss: -0.357220, loss_mean_cls: 1.081811, grad_norm: 4.134042 +Steps: 0%| | 1565/1000000 [06:26<68:04:02, 4.07it/s, grad_norm=4.13, loss_final=1.68, loss_mean=0.951, loss_mean_cls=1.08, proj_loss=-0.357][2026-03-22 14:25:03] Step: 1565, Training Logs: loss_final: 1.712408, loss_mean: 0.932865, proj_loss: -0.348804, loss_mean_cls: 1.128348, grad_norm: 1.735431 +Steps: 0%| | 1566/1000000 [06:26<67:58:52, 4.08it/s, grad_norm=1.74, loss_final=1.71, loss_mean=0.933, loss_mean_cls=1.13, proj_loss=-0.349][2026-03-22 14:25:03] Step: 1566, Training Logs: loss_final: 1.765437, loss_mean: 0.937393, proj_loss: -0.343351, loss_mean_cls: 1.171396, grad_norm: 3.380391 +Steps: 0%| | 1567/1000000 [06:26<67:54:36, 4.08it/s, grad_norm=3.38, loss_final=1.77, loss_mean=0.937, loss_mean_cls=1.17, proj_loss=-0.343][2026-03-22 14:25:03] Step: 1567, Training Logs: loss_final: 1.786537, loss_mean: 0.929669, proj_loss: -0.350148, loss_mean_cls: 1.207016, grad_norm: 3.975205 +Steps: 0%| | 1568/1000000 [06:26<67:57:29, 4.08it/s, grad_norm=3.98, loss_final=1.79, loss_mean=0.93, loss_mean_cls=1.21, proj_loss=-0.35][2026-03-22 14:25:03] Step: 1568, Training Logs: loss_final: 1.601104, loss_mean: 0.959370, proj_loss: -0.353744, loss_mean_cls: 0.995478, grad_norm: 2.729351 +Steps: 0%| | 1569/1000000 [06:27<68:19:35, 4.06it/s, grad_norm=2.73, loss_final=1.6, loss_mean=0.959, loss_mean_cls=0.995, proj_loss=-0.354][2026-03-22 14:25:04] Step: 1569, Training Logs: loss_final: 1.737841, loss_mean: 0.935096, proj_loss: -0.346558, loss_mean_cls: 1.149303, grad_norm: 1.733097 +Steps: 0%| | 1570/1000000 [06:27<68:09:02, 4.07it/s, grad_norm=1.73, loss_final=1.74, loss_mean=0.935, loss_mean_cls=1.15, proj_loss=-0.347][2026-03-22 14:25:04] Step: 1570, Training Logs: loss_final: 1.811352, loss_mean: 0.901819, proj_loss: -0.341271, loss_mean_cls: 1.250803, grad_norm: 2.380784 +Steps: 0%| | 1571/1000000 [06:27<68:02:23, 4.08it/s, grad_norm=2.38, loss_final=1.81, loss_mean=0.902, loss_mean_cls=1.25, 
proj_loss=-0.341][2026-03-22 14:25:04] Step: 1571, Training Logs: loss_final: 1.697474, loss_mean: 0.943802, proj_loss: -0.351042, loss_mean_cls: 1.104714, grad_norm: 3.658701 +Steps: 0%| | 1572/1000000 [06:27<67:58:50, 4.08it/s, grad_norm=3.66, loss_final=1.7, loss_mean=0.944, loss_mean_cls=1.1, proj_loss=-0.351][2026-03-22 14:25:04] Step: 1572, Training Logs: loss_final: 1.593696, loss_mean: 0.940766, proj_loss: -0.358808, loss_mean_cls: 1.011738, grad_norm: 2.548829 +Steps: 0%| | 1573/1000000 [06:28<67:55:59, 4.08it/s, grad_norm=2.55, loss_final=1.59, loss_mean=0.941, loss_mean_cls=1.01, proj_loss=-0.359][2026-03-22 14:25:05] Step: 1573, Training Logs: loss_final: 1.820505, loss_mean: 0.909901, proj_loss: -0.351474, loss_mean_cls: 1.262078, grad_norm: 2.534802 +Steps: 0%| | 1574/1000000 [06:28<67:57:13, 4.08it/s, grad_norm=2.53, loss_final=1.82, loss_mean=0.91, loss_mean_cls=1.26, proj_loss=-0.351][2026-03-22 14:25:05] Step: 1574, Training Logs: loss_final: 1.735180, loss_mean: 0.916619, proj_loss: -0.351293, loss_mean_cls: 1.169854, grad_norm: 3.788594 +Steps: 0%| | 1575/1000000 [06:28<67:54:57, 4.08it/s, grad_norm=3.79, loss_final=1.74, loss_mean=0.917, loss_mean_cls=1.17, proj_loss=-0.351][2026-03-22 14:25:05] Step: 1575, Training Logs: loss_final: 1.706332, loss_mean: 0.943515, proj_loss: -0.359872, loss_mean_cls: 1.122690, grad_norm: 2.420875 +Steps: 0%| | 1576/1000000 [06:28<67:54:45, 4.08it/s, grad_norm=2.42, loss_final=1.71, loss_mean=0.944, loss_mean_cls=1.12, proj_loss=-0.36][2026-03-22 14:25:05] Step: 1576, Training Logs: loss_final: 1.774036, loss_mean: 0.926435, proj_loss: -0.351731, loss_mean_cls: 1.199333, grad_norm: 3.856115 +Steps: 0%| | 1577/1000000 [06:29<67:52:13, 4.09it/s, grad_norm=3.86, loss_final=1.77, loss_mean=0.926, loss_mean_cls=1.2, proj_loss=-0.352][2026-03-22 14:25:06] Step: 1577, Training Logs: loss_final: 1.872422, loss_mean: 0.922335, proj_loss: -0.346350, loss_mean_cls: 1.296437, grad_norm: 5.551814 +Steps: 0%| | 1578/1000000 [06:29<67:50:31, 4.09it/s, grad_norm=5.55, loss_final=1.87, loss_mean=0.922, loss_mean_cls=1.3, proj_loss=-0.346][2026-03-22 14:25:06] Step: 1578, Training Logs: loss_final: 1.794927, loss_mean: 0.920052, proj_loss: -0.347157, loss_mean_cls: 1.222032, grad_norm: 3.698366 +Steps: 0%| | 1579/1000000 [06:29<67:48:56, 4.09it/s, grad_norm=3.7, loss_final=1.79, loss_mean=0.92, loss_mean_cls=1.22, proj_loss=-0.347][2026-03-22 14:25:06] Step: 1579, Training Logs: loss_final: 1.748697, loss_mean: 0.920635, proj_loss: -0.347690, loss_mean_cls: 1.175752, grad_norm: 3.912803 +Steps: 0%| | 1580/1000000 [06:29<67:49:10, 4.09it/s, grad_norm=3.91, loss_final=1.75, loss_mean=0.921, loss_mean_cls=1.18, proj_loss=-0.348][2026-03-22 14:25:06] Step: 1580, Training Logs: loss_final: 1.769829, loss_mean: 0.931955, proj_loss: -0.352845, loss_mean_cls: 1.190719, grad_norm: 4.730816 +Steps: 0%| | 1581/1000000 [06:30<67:49:00, 4.09it/s, grad_norm=4.73, loss_final=1.77, loss_mean=0.932, loss_mean_cls=1.19, proj_loss=-0.353][2026-03-22 14:25:07] Step: 1581, Training Logs: loss_final: 1.706027, loss_mean: 0.920717, proj_loss: -0.358234, loss_mean_cls: 1.143545, grad_norm: 3.721624 +Steps: 0%| | 1582/1000000 [06:30<67:47:43, 4.09it/s, grad_norm=3.72, loss_final=1.71, loss_mean=0.921, loss_mean_cls=1.14, proj_loss=-0.358][2026-03-22 14:25:07] Step: 1582, Training Logs: loss_final: 1.816787, loss_mean: 0.905234, proj_loss: -0.342659, loss_mean_cls: 1.254212, grad_norm: 3.330268 +Steps: 0%| | 1583/1000000 [06:30<67:47:31, 4.09it/s, grad_norm=3.33, loss_final=1.82, 
loss_mean=0.905, loss_mean_cls=1.25, proj_loss=-0.343][2026-03-22 14:25:07] Step: 1583, Training Logs: loss_final: 1.716132, loss_mean: 0.934062, proj_loss: -0.357448, loss_mean_cls: 1.139518, grad_norm: 3.525544 +Steps: 0%| | 1584/1000000 [06:30<67:48:35, 4.09it/s, grad_norm=3.53, loss_final=1.72, loss_mean=0.934, loss_mean_cls=1.14, proj_loss=-0.357][2026-03-22 14:25:07] Step: 1584, Training Logs: loss_final: 1.657911, loss_mean: 0.933383, proj_loss: -0.355607, loss_mean_cls: 1.080136, grad_norm: 2.437551 +Steps: 0%| | 1585/1000000 [06:31<67:48:14, 4.09it/s, grad_norm=2.44, loss_final=1.66, loss_mean=0.933, loss_mean_cls=1.08, proj_loss=-0.356][2026-03-22 14:25:08] Step: 1585, Training Logs: loss_final: 1.733002, loss_mean: 0.928766, proj_loss: -0.359717, loss_mean_cls: 1.163953, grad_norm: 4.598446 +Steps: 0%| | 1586/1000000 [06:31<67:47:57, 4.09it/s, grad_norm=4.6, loss_final=1.73, loss_mean=0.929, loss_mean_cls=1.16, proj_loss=-0.36][2026-03-22 14:25:08] Step: 1586, Training Logs: loss_final: 1.630314, loss_mean: 0.951285, proj_loss: -0.365099, loss_mean_cls: 1.044128, grad_norm: 4.300289 +Steps: 0%| | 1587/1000000 [06:31<67:46:41, 4.09it/s, grad_norm=4.3, loss_final=1.63, loss_mean=0.951, loss_mean_cls=1.04, proj_loss=-0.365][2026-03-22 14:25:08] Step: 1587, Training Logs: loss_final: 1.829457, loss_mean: 0.913824, proj_loss: -0.342222, loss_mean_cls: 1.257855, grad_norm: 2.630697 +Steps: 0%| | 1588/1000000 [06:31<67:46:44, 4.09it/s, grad_norm=2.63, loss_final=1.83, loss_mean=0.914, loss_mean_cls=1.26, proj_loss=-0.342][2026-03-22 14:25:08] Step: 1588, Training Logs: loss_final: 1.697143, loss_mean: 0.929045, proj_loss: -0.356677, loss_mean_cls: 1.124775, grad_norm: 3.471369 +Steps: 0%| | 1589/1000000 [06:32<67:46:31, 4.09it/s, grad_norm=3.47, loss_final=1.7, loss_mean=0.929, loss_mean_cls=1.12, proj_loss=-0.357][2026-03-22 14:25:08] Step: 1589, Training Logs: loss_final: 1.809916, loss_mean: 0.920256, proj_loss: -0.340848, loss_mean_cls: 1.230508, grad_norm: 3.926652 +Steps: 0%| | 1590/1000000 [06:32<68:01:33, 4.08it/s, grad_norm=3.93, loss_final=1.81, loss_mean=0.92, loss_mean_cls=1.23, proj_loss=-0.341][2026-03-22 14:25:09] Step: 1590, Training Logs: loss_final: 1.744270, loss_mean: 0.940395, proj_loss: -0.353861, loss_mean_cls: 1.157736, grad_norm: 2.769077 +Steps: 0%| | 1591/1000000 [06:32<67:56:48, 4.08it/s, grad_norm=2.77, loss_final=1.74, loss_mean=0.94, loss_mean_cls=1.16, proj_loss=-0.354][2026-03-22 14:25:09] Step: 1591, Training Logs: loss_final: 1.802442, loss_mean: 0.908636, proj_loss: -0.347207, loss_mean_cls: 1.241013, grad_norm: 1.915669 +Steps: 0%| | 1592/1000000 [06:32<67:54:01, 4.08it/s, grad_norm=1.92, loss_final=1.8, loss_mean=0.909, loss_mean_cls=1.24, proj_loss=-0.347][2026-03-22 14:25:09] Step: 1592, Training Logs: loss_final: 1.681062, loss_mean: 0.936101, proj_loss: -0.360913, loss_mean_cls: 1.105875, grad_norm: 4.141538 +Steps: 0%| | 1593/1000000 [06:33<67:51:25, 4.09it/s, grad_norm=4.14, loss_final=1.68, loss_mean=0.936, loss_mean_cls=1.11, proj_loss=-0.361][2026-03-22 14:25:09] Step: 1593, Training Logs: loss_final: 1.689340, loss_mean: 0.933112, proj_loss: -0.361346, loss_mean_cls: 1.117574, grad_norm: 3.044300 +Steps: 0%| | 1594/1000000 [06:33<67:50:49, 4.09it/s, grad_norm=3.04, loss_final=1.69, loss_mean=0.933, loss_mean_cls=1.12, proj_loss=-0.361][2026-03-22 14:25:10] Step: 1594, Training Logs: loss_final: 1.691422, loss_mean: 0.927435, proj_loss: -0.362508, loss_mean_cls: 1.126495, grad_norm: 3.059313 +Steps: 0%| | 1595/1000000 [06:33<67:49:37, 
4.09it/s, grad_norm=3.06, loss_final=1.69, loss_mean=0.927, loss_mean_cls=1.13, proj_loss=-0.363][2026-03-22 14:25:10] Step: 1595, Training Logs: loss_final: 1.722838, loss_mean: 0.943102, proj_loss: -0.347108, loss_mean_cls: 1.126844, grad_norm: 4.680357 +Steps: 0%| | 1596/1000000 [06:33<67:48:21, 4.09it/s, grad_norm=4.68, loss_final=1.72, loss_mean=0.943, loss_mean_cls=1.13, proj_loss=-0.347][2026-03-22 14:25:10] Step: 1596, Training Logs: loss_final: 1.713534, loss_mean: 0.929577, proj_loss: -0.348994, loss_mean_cls: 1.132951, grad_norm: 2.523744 +Steps: 0%| | 1597/1000000 [06:33<67:47:58, 4.09it/s, grad_norm=2.52, loss_final=1.71, loss_mean=0.93, loss_mean_cls=1.13, proj_loss=-0.349][2026-03-22 14:25:10] Step: 1597, Training Logs: loss_final: 1.772629, loss_mean: 0.900951, proj_loss: -0.351628, loss_mean_cls: 1.223307, grad_norm: 3.275746 +Steps: 0%| | 1598/1000000 [06:34<67:48:42, 4.09it/s, grad_norm=3.28, loss_final=1.77, loss_mean=0.901, loss_mean_cls=1.22, proj_loss=-0.352][2026-03-22 14:25:11] Step: 1598, Training Logs: loss_final: 1.819108, loss_mean: 0.903959, proj_loss: -0.346052, loss_mean_cls: 1.261202, grad_norm: 3.333009 +Steps: 0%| | 1599/1000000 [06:34<67:47:35, 4.09it/s, grad_norm=3.33, loss_final=1.82, loss_mean=0.904, loss_mean_cls=1.26, proj_loss=-0.346][2026-03-22 14:25:11] Step: 1599, Training Logs: loss_final: 1.732919, loss_mean: 0.919937, proj_loss: -0.349738, loss_mean_cls: 1.162719, grad_norm: 2.848257 +Steps: 0%| | 1600/1000000 [06:34<67:50:43, 4.09it/s, grad_norm=2.85, loss_final=1.73, loss_mean=0.92, loss_mean_cls=1.16, proj_loss=-0.35][2026-03-22 14:25:11] Step: 1600, Training Logs: loss_final: 1.827307, loss_mean: 0.916912, proj_loss: -0.343378, loss_mean_cls: 1.253773, grad_norm: 2.525295 +Steps: 0%| | 1601/1000000 [06:34<67:52:26, 4.09it/s, grad_norm=2.53, loss_final=1.83, loss_mean=0.917, loss_mean_cls=1.25, proj_loss=-0.343][2026-03-22 14:25:11] Step: 1601, Training Logs: loss_final: 1.623127, loss_mean: 0.938142, proj_loss: -0.359468, loss_mean_cls: 1.044452, grad_norm: 3.464143 +Steps: 0%| | 1602/1000000 [06:35<67:51:09, 4.09it/s, grad_norm=3.46, loss_final=1.62, loss_mean=0.938, loss_mean_cls=1.04, proj_loss=-0.359][2026-03-22 14:25:12] Step: 1602, Training Logs: loss_final: 1.753913, loss_mean: 0.927687, proj_loss: -0.350009, loss_mean_cls: 1.176235, grad_norm: 3.673428 +Steps: 0%| | 1603/1000000 [06:35<67:49:15, 4.09it/s, grad_norm=3.67, loss_final=1.75, loss_mean=0.928, loss_mean_cls=1.18, proj_loss=-0.35][2026-03-22 14:25:12] Step: 1603, Training Logs: loss_final: 1.743784, loss_mean: 0.926512, proj_loss: -0.345927, loss_mean_cls: 1.163199, grad_norm: 2.220204 +Steps: 0%| | 1604/1000000 [06:35<67:50:55, 4.09it/s, grad_norm=2.22, loss_final=1.74, loss_mean=0.927, loss_mean_cls=1.16, proj_loss=-0.346][2026-03-22 14:25:12] Step: 1604, Training Logs: loss_final: 1.836296, loss_mean: 0.899161, proj_loss: -0.348739, loss_mean_cls: 1.285874, grad_norm: 4.179168 +Steps: 0%| | 1605/1000000 [06:35<67:50:18, 4.09it/s, grad_norm=4.18, loss_final=1.84, loss_mean=0.899, loss_mean_cls=1.29, proj_loss=-0.349][2026-03-22 14:25:12] Step: 1605, Training Logs: loss_final: 1.583164, loss_mean: 0.933243, proj_loss: -0.367419, loss_mean_cls: 1.017339, grad_norm: 2.491801 +Steps: 0%| | 1606/1000000 [06:36<67:48:58, 4.09it/s, grad_norm=2.49, loss_final=1.58, loss_mean=0.933, loss_mean_cls=1.02, proj_loss=-0.367][2026-03-22 14:25:13] Step: 1606, Training Logs: loss_final: 1.684806, loss_mean: 0.926852, proj_loss: -0.351811, loss_mean_cls: 1.109766, grad_norm: 2.937880 
+During training, per-step metrics are logged alongside the tqdm progress bar: the overall objective (`loss_final`), the diffusion loss (`loss_mean`), the REG class-token loss (`loss_mean_cls`, weighted by `--cls`), the representation-alignment loss (`proj_loss`, weighted by `--proj-coeff`), and the gradient norm (`grad_norm`). A short excerpt from an early SiT-B/2 run (about 4.09 it/s, a projected ~68 h for the full 1M-step schedule on this setup):
+
+```
+[2026-03-22 14:25:13] Step: 1607, Training Logs: loss_final: 1.617462, loss_mean: 0.933909, proj_loss: -0.362464, loss_mean_cls: 1.046017, grad_norm: 2.949747
+[2026-03-22 14:25:36] Step: 1700, Training Logs: loss_final: 1.725596, loss_mean: 0.946613, proj_loss: -0.358186, loss_mean_cls: 1.137169, grad_norm: 2.525111
+[2026-03-22 14:26:00] Step: 1800, Training Logs: loss_final: 1.679520, loss_mean: 0.924562, proj_loss: -0.367754, loss_mean_cls: 1.122713, grad_norm: 1.783705
+```
proj_loss: -0.373651, loss_mean_cls: 1.088362, grad_norm: 4.189360 +Steps: 0%| | 1856/1000000 [07:37<67:50:04, 4.09it/s, grad_norm=4.19, loss_final=1.64, loss_mean=0.925, loss_mean_cls=1.09, proj_loss=-0.374][2026-03-22 14:26:14] Step: 1856, Training Logs: loss_final: 1.664433, loss_mean: 0.927759, proj_loss: -0.371453, loss_mean_cls: 1.108126, grad_norm: 3.120404 +Steps: 0%| | 1857/1000000 [07:37<67:51:00, 4.09it/s, grad_norm=3.12, loss_final=1.66, loss_mean=0.928, loss_mean_cls=1.11, proj_loss=-0.371][2026-03-22 14:26:14] Step: 1857, Training Logs: loss_final: 1.616053, loss_mean: 0.932390, proj_loss: -0.367229, loss_mean_cls: 1.050892, grad_norm: 2.568983 +Steps: 0%| | 1858/1000000 [07:37<67:50:34, 4.09it/s, grad_norm=2.57, loss_final=1.62, loss_mean=0.932, loss_mean_cls=1.05, proj_loss=-0.367][2026-03-22 14:26:14] Step: 1858, Training Logs: loss_final: 1.677510, loss_mean: 0.937132, proj_loss: -0.378413, loss_mean_cls: 1.118791, grad_norm: 1.854146 +Steps: 0%| | 1859/1000000 [07:38<67:49:30, 4.09it/s, grad_norm=1.85, loss_final=1.68, loss_mean=0.937, loss_mean_cls=1.12, proj_loss=-0.378][2026-03-22 14:26:15] Step: 1859, Training Logs: loss_final: 1.755978, loss_mean: 0.911461, proj_loss: -0.364094, loss_mean_cls: 1.208610, grad_norm: 1.500546 +Steps: 0%| | 1860/1000000 [07:38<67:48:59, 4.09it/s, grad_norm=1.5, loss_final=1.76, loss_mean=0.911, loss_mean_cls=1.21, proj_loss=-0.364][2026-03-22 14:26:15] Step: 1860, Training Logs: loss_final: 1.579487, loss_mean: 0.925735, proj_loss: -0.379518, loss_mean_cls: 1.033270, grad_norm: 2.383069 +Steps: 0%| | 1861/1000000 [07:38<67:49:38, 4.09it/s, grad_norm=2.38, loss_final=1.58, loss_mean=0.926, loss_mean_cls=1.03, proj_loss=-0.38][2026-03-22 14:26:15] Step: 1861, Training Logs: loss_final: 1.689648, loss_mean: 0.917319, proj_loss: -0.367026, loss_mean_cls: 1.139354, grad_norm: 1.711200 +Steps: 0%| | 1862/1000000 [07:38<67:50:11, 4.09it/s, grad_norm=1.71, loss_final=1.69, loss_mean=0.917, loss_mean_cls=1.14, proj_loss=-0.367][2026-03-22 14:26:15] Step: 1862, Training Logs: loss_final: 1.547336, loss_mean: 0.946127, proj_loss: -0.376144, loss_mean_cls: 0.977354, grad_norm: 2.403530 +Steps: 0%| | 1863/1000000 [07:39<67:48:41, 4.09it/s, grad_norm=2.4, loss_final=1.55, loss_mean=0.946, loss_mean_cls=0.977, proj_loss=-0.376][2026-03-22 14:26:16] Step: 1863, Training Logs: loss_final: 1.691006, loss_mean: 0.916521, proj_loss: -0.370484, loss_mean_cls: 1.144969, grad_norm: 4.920528 +Steps: 0%| | 1864/1000000 [07:39<67:48:21, 4.09it/s, grad_norm=4.92, loss_final=1.69, loss_mean=0.917, loss_mean_cls=1.14, proj_loss=-0.37][2026-03-22 14:26:16] Step: 1864, Training Logs: loss_final: 1.616162, loss_mean: 0.945239, proj_loss: -0.370390, loss_mean_cls: 1.041313, grad_norm: 2.118362 +Steps: 0%| | 1865/1000000 [07:39<67:49:40, 4.09it/s, grad_norm=2.12, loss_final=1.62, loss_mean=0.945, loss_mean_cls=1.04, proj_loss=-0.37][2026-03-22 14:26:16] Step: 1865, Training Logs: loss_final: 1.716273, loss_mean: 0.920655, proj_loss: -0.370770, loss_mean_cls: 1.166389, grad_norm: 3.523682 +Steps: 0%| | 1866/1000000 [07:39<67:50:29, 4.09it/s, grad_norm=3.52, loss_final=1.72, loss_mean=0.921, loss_mean_cls=1.17, proj_loss=-0.371][2026-03-22 14:26:16] Step: 1866, Training Logs: loss_final: 1.667989, loss_mean: 0.925227, proj_loss: -0.371297, loss_mean_cls: 1.114058, grad_norm: 4.635818 +Steps: 0%| | 1867/1000000 [07:40<67:49:43, 4.09it/s, grad_norm=4.64, loss_final=1.67, loss_mean=0.925, loss_mean_cls=1.11, proj_loss=-0.371][2026-03-22 14:26:17] Step: 1867, Training Logs: 
loss_final: 1.653873, loss_mean: 0.927957, proj_loss: -0.372078, loss_mean_cls: 1.097994, grad_norm: 2.738409 +Steps: 0%| | 1868/1000000 [07:40<67:49:41, 4.09it/s, grad_norm=2.74, loss_final=1.65, loss_mean=0.928, loss_mean_cls=1.1, proj_loss=-0.372][2026-03-22 14:26:17] Step: 1868, Training Logs: loss_final: 1.628668, loss_mean: 0.915307, proj_loss: -0.375969, loss_mean_cls: 1.089330, grad_norm: 4.536571 +Steps: 0%| | 1869/1000000 [07:40<67:49:22, 4.09it/s, grad_norm=4.54, loss_final=1.63, loss_mean=0.915, loss_mean_cls=1.09, proj_loss=-0.376][2026-03-22 14:26:17] Step: 1869, Training Logs: loss_final: 1.714579, loss_mean: 0.951438, proj_loss: -0.367703, loss_mean_cls: 1.130844, grad_norm: 5.485612 +Steps: 0%| | 1870/1000000 [07:40<67:48:16, 4.09it/s, grad_norm=5.49, loss_final=1.71, loss_mean=0.951, loss_mean_cls=1.13, proj_loss=-0.368][2026-03-22 14:26:17] Step: 1870, Training Logs: loss_final: 1.757226, loss_mean: 0.926047, proj_loss: -0.361345, loss_mean_cls: 1.192524, grad_norm: 5.260156 +Steps: 0%| | 1871/1000000 [07:41<67:48:31, 4.09it/s, grad_norm=5.26, loss_final=1.76, loss_mean=0.926, loss_mean_cls=1.19, proj_loss=-0.361][2026-03-22 14:26:18] Step: 1871, Training Logs: loss_final: 1.683948, loss_mean: 0.920417, proj_loss: -0.368315, loss_mean_cls: 1.131846, grad_norm: 3.328832 +Steps: 0%| | 1872/1000000 [07:41<67:48:34, 4.09it/s, grad_norm=3.33, loss_final=1.68, loss_mean=0.92, loss_mean_cls=1.13, proj_loss=-0.368][2026-03-22 14:26:18] Step: 1872, Training Logs: loss_final: 1.614273, loss_mean: 0.939421, proj_loss: -0.375693, loss_mean_cls: 1.050545, grad_norm: 3.040115 +Steps: 0%| | 1873/1000000 [07:41<67:48:03, 4.09it/s, grad_norm=3.04, loss_final=1.61, loss_mean=0.939, loss_mean_cls=1.05, proj_loss=-0.376][2026-03-22 14:26:18] Step: 1873, Training Logs: loss_final: 1.693323, loss_mean: 0.916679, proj_loss: -0.362384, loss_mean_cls: 1.139028, grad_norm: 3.028145 +Steps: 0%| | 1874/1000000 [07:41<67:46:52, 4.09it/s, grad_norm=3.03, loss_final=1.69, loss_mean=0.917, loss_mean_cls=1.14, proj_loss=-0.362][2026-03-22 14:26:18] Step: 1874, Training Logs: loss_final: 1.449559, loss_mean: 0.952554, proj_loss: -0.384962, loss_mean_cls: 0.881968, grad_norm: 1.354460 +Steps: 0%| | 1875/1000000 [07:42<67:46:58, 4.09it/s, grad_norm=1.35, loss_final=1.45, loss_mean=0.953, loss_mean_cls=0.882, proj_loss=-0.385][2026-03-22 14:26:19] Step: 1875, Training Logs: loss_final: 1.665454, loss_mean: 0.924592, proj_loss: -0.371436, loss_mean_cls: 1.112299, grad_norm: 5.363807 +Steps: 0%| | 1876/1000000 [07:42<67:48:45, 4.09it/s, grad_norm=5.36, loss_final=1.67, loss_mean=0.925, loss_mean_cls=1.11, proj_loss=-0.371][2026-03-22 14:26:19] Step: 1876, Training Logs: loss_final: 1.650543, loss_mean: 0.925987, proj_loss: -0.372560, loss_mean_cls: 1.097115, grad_norm: 3.122636 +Steps: 0%| | 1877/1000000 [07:42<67:47:54, 4.09it/s, grad_norm=3.12, loss_final=1.65, loss_mean=0.926, loss_mean_cls=1.1, proj_loss=-0.373][2026-03-22 14:26:19] Step: 1877, Training Logs: loss_final: 1.875461, loss_mean: 0.909509, proj_loss: -0.356506, loss_mean_cls: 1.322457, grad_norm: 2.870954 +Steps: 0%| | 1878/1000000 [07:42<67:50:00, 4.09it/s, grad_norm=2.87, loss_final=1.88, loss_mean=0.91, loss_mean_cls=1.32, proj_loss=-0.357][2026-03-22 14:26:19] Step: 1878, Training Logs: loss_final: 1.700429, loss_mean: 0.922692, proj_loss: -0.364176, loss_mean_cls: 1.141913, grad_norm: 1.557704 +Steps: 0%| | 1879/1000000 [07:43<67:49:39, 4.09it/s, grad_norm=1.56, loss_final=1.7, loss_mean=0.923, loss_mean_cls=1.14, 
proj_loss=-0.364][2026-03-22 14:26:20] Step: 1879, Training Logs: loss_final: 1.686565, loss_mean: 0.920910, proj_loss: -0.366882, loss_mean_cls: 1.132537, grad_norm: 1.367439 +Steps: 0%| | 1880/1000000 [07:43<67:50:23, 4.09it/s, grad_norm=1.37, loss_final=1.69, loss_mean=0.921, loss_mean_cls=1.13, proj_loss=-0.367][2026-03-22 14:26:20] Step: 1880, Training Logs: loss_final: 1.688056, loss_mean: 0.904703, proj_loss: -0.370683, loss_mean_cls: 1.154037, grad_norm: 2.091176 +Steps: 0%| | 1881/1000000 [07:43<67:50:36, 4.09it/s, grad_norm=2.09, loss_final=1.69, loss_mean=0.905, loss_mean_cls=1.15, proj_loss=-0.371][2026-03-22 14:26:20] Step: 1881, Training Logs: loss_final: 1.586809, loss_mean: 0.920200, proj_loss: -0.370011, loss_mean_cls: 1.036620, grad_norm: 1.495798 +Steps: 0%| | 1882/1000000 [07:43<67:50:45, 4.09it/s, grad_norm=1.5, loss_final=1.59, loss_mean=0.92, loss_mean_cls=1.04, proj_loss=-0.37][2026-03-22 14:26:20] Step: 1882, Training Logs: loss_final: 1.629456, loss_mean: 0.923743, proj_loss: -0.376370, loss_mean_cls: 1.082083, grad_norm: 3.039310 +Steps: 0%| | 1883/1000000 [07:44<67:50:40, 4.09it/s, grad_norm=3.04, loss_final=1.63, loss_mean=0.924, loss_mean_cls=1.08, proj_loss=-0.376][2026-03-22 14:26:21] Step: 1883, Training Logs: loss_final: 1.700977, loss_mean: 0.925142, proj_loss: -0.370466, loss_mean_cls: 1.146301, grad_norm: 2.960581 +Steps: 0%| | 1884/1000000 [07:44<67:50:51, 4.09it/s, grad_norm=2.96, loss_final=1.7, loss_mean=0.925, loss_mean_cls=1.15, proj_loss=-0.37][2026-03-22 14:26:21] Step: 1884, Training Logs: loss_final: 1.701397, loss_mean: 0.929252, proj_loss: -0.371432, loss_mean_cls: 1.143576, grad_norm: 3.199494 +Steps: 0%| | 1885/1000000 [07:44<67:50:19, 4.09it/s, grad_norm=3.2, loss_final=1.7, loss_mean=0.929, loss_mean_cls=1.14, proj_loss=-0.371][2026-03-22 14:26:21] Step: 1885, Training Logs: loss_final: 1.571205, loss_mean: 0.940101, proj_loss: -0.375346, loss_mean_cls: 1.006450, grad_norm: 2.241360 +Steps: 0%| | 1886/1000000 [07:44<67:48:09, 4.09it/s, grad_norm=2.24, loss_final=1.57, loss_mean=0.94, loss_mean_cls=1.01, proj_loss=-0.375][2026-03-22 14:26:21] Step: 1886, Training Logs: loss_final: 1.620744, loss_mean: 0.932382, proj_loss: -0.367536, loss_mean_cls: 1.055899, grad_norm: 1.384861 +Steps: 0%| | 1887/1000000 [07:45<67:48:48, 4.09it/s, grad_norm=1.38, loss_final=1.62, loss_mean=0.932, loss_mean_cls=1.06, proj_loss=-0.368][2026-03-22 14:26:22] Step: 1887, Training Logs: loss_final: 1.721807, loss_mean: 0.903538, proj_loss: -0.370820, loss_mean_cls: 1.189089, grad_norm: 1.602617 +Steps: 0%| | 1888/1000000 [07:45<67:48:05, 4.09it/s, grad_norm=1.6, loss_final=1.72, loss_mean=0.904, loss_mean_cls=1.19, proj_loss=-0.371][2026-03-22 14:26:22] Step: 1888, Training Logs: loss_final: 1.618202, loss_mean: 0.944040, proj_loss: -0.375539, loss_mean_cls: 1.049701, grad_norm: 4.015816 +Steps: 0%| | 1889/1000000 [07:45<67:48:28, 4.09it/s, grad_norm=4.02, loss_final=1.62, loss_mean=0.944, loss_mean_cls=1.05, proj_loss=-0.376][2026-03-22 14:26:22] Step: 1889, Training Logs: loss_final: 1.701643, loss_mean: 0.921698, proj_loss: -0.375368, loss_mean_cls: 1.155313, grad_norm: 2.503081 +Steps: 0%| | 1890/1000000 [07:45<67:46:29, 4.09it/s, grad_norm=2.5, loss_final=1.7, loss_mean=0.922, loss_mean_cls=1.16, proj_loss=-0.375][2026-03-22 14:26:22] Step: 1890, Training Logs: loss_final: 1.613416, loss_mean: 0.942880, proj_loss: -0.379088, loss_mean_cls: 1.049624, grad_norm: 4.416511 +Steps: 0%| | 1891/1000000 [07:46<67:46:19, 4.09it/s, grad_norm=4.42, loss_final=1.61, 
loss_mean=0.943, loss_mean_cls=1.05, proj_loss=-0.379][2026-03-22 14:26:22] Step: 1891, Training Logs: loss_final: 1.659374, loss_mean: 0.928410, proj_loss: -0.372749, loss_mean_cls: 1.103713, grad_norm: 3.069381 +Steps: 0%| | 1892/1000000 [07:46<67:48:06, 4.09it/s, grad_norm=3.07, loss_final=1.66, loss_mean=0.928, loss_mean_cls=1.1, proj_loss=-0.373][2026-03-22 14:26:23] Step: 1892, Training Logs: loss_final: 1.627649, loss_mean: 0.904728, proj_loss: -0.373996, loss_mean_cls: 1.096917, grad_norm: 1.494565 +Steps: 0%| | 1893/1000000 [07:46<67:46:34, 4.09it/s, grad_norm=1.49, loss_final=1.63, loss_mean=0.905, loss_mean_cls=1.1, proj_loss=-0.374][2026-03-22 14:26:23] Step: 1893, Training Logs: loss_final: 1.811104, loss_mean: 0.897928, proj_loss: -0.364776, loss_mean_cls: 1.277952, grad_norm: 4.138304 +Steps: 0%| | 1894/1000000 [07:46<67:46:36, 4.09it/s, grad_norm=4.14, loss_final=1.81, loss_mean=0.898, loss_mean_cls=1.28, proj_loss=-0.365][2026-03-22 14:26:23] Step: 1894, Training Logs: loss_final: 1.664383, loss_mean: 0.936464, proj_loss: -0.364890, loss_mean_cls: 1.092810, grad_norm: 3.813292 +Steps: 0%| | 1895/1000000 [07:46<67:44:47, 4.09it/s, grad_norm=3.81, loss_final=1.66, loss_mean=0.936, loss_mean_cls=1.09, proj_loss=-0.365][2026-03-22 14:26:23] Step: 1895, Training Logs: loss_final: 1.725555, loss_mean: 0.922054, proj_loss: -0.363764, loss_mean_cls: 1.167266, grad_norm: 3.857882 +Steps: 0%| | 1896/1000000 [07:47<67:45:20, 4.09it/s, grad_norm=3.86, loss_final=1.73, loss_mean=0.922, loss_mean_cls=1.17, proj_loss=-0.364][2026-03-22 14:26:24] Step: 1896, Training Logs: loss_final: 1.755777, loss_mean: 0.901472, proj_loss: -0.365976, loss_mean_cls: 1.220281, grad_norm: 3.455415 +Steps: 0%| | 1897/1000000 [07:47<67:44:46, 4.09it/s, grad_norm=3.46, loss_final=1.76, loss_mean=0.901, loss_mean_cls=1.22, proj_loss=-0.366][2026-03-22 14:26:24] Step: 1897, Training Logs: loss_final: 1.690307, loss_mean: 0.922383, proj_loss: -0.367387, loss_mean_cls: 1.135311, grad_norm: 4.076333 +Steps: 0%| | 1898/1000000 [07:47<67:45:21, 4.09it/s, grad_norm=4.08, loss_final=1.69, loss_mean=0.922, loss_mean_cls=1.14, proj_loss=-0.367][2026-03-22 14:26:24] Step: 1898, Training Logs: loss_final: 1.666706, loss_mean: 0.940777, proj_loss: -0.368014, loss_mean_cls: 1.093944, grad_norm: 1.966761 +Steps: 0%| | 1899/1000000 [07:47<67:44:23, 4.09it/s, grad_norm=1.97, loss_final=1.67, loss_mean=0.941, loss_mean_cls=1.09, proj_loss=-0.368][2026-03-22 14:26:24] Step: 1899, Training Logs: loss_final: 1.727006, loss_mean: 0.898467, proj_loss: -0.364605, loss_mean_cls: 1.193145, grad_norm: 2.062581 +Steps: 0%| | 1900/1000000 [07:48<67:47:16, 4.09it/s, grad_norm=2.06, loss_final=1.73, loss_mean=0.898, loss_mean_cls=1.19, proj_loss=-0.365][2026-03-22 14:26:25] Step: 1900, Training Logs: loss_final: 1.637511, loss_mean: 0.928071, proj_loss: -0.379409, loss_mean_cls: 1.088848, grad_norm: 2.180321 +Steps: 0%| | 1901/1000000 [07:48<67:45:25, 4.09it/s, grad_norm=2.18, loss_final=1.64, loss_mean=0.928, loss_mean_cls=1.09, proj_loss=-0.379][2026-03-22 14:26:25] Step: 1901, Training Logs: loss_final: 1.688611, loss_mean: 0.927709, proj_loss: -0.368514, loss_mean_cls: 1.129416, grad_norm: 1.360577 +Steps: 0%| | 1902/1000000 [07:48<67:45:17, 4.09it/s, grad_norm=1.36, loss_final=1.69, loss_mean=0.928, loss_mean_cls=1.13, proj_loss=-0.369][2026-03-22 14:26:25] Step: 1902, Training Logs: loss_final: 1.788157, loss_mean: 0.921030, proj_loss: -0.365434, loss_mean_cls: 1.232561, grad_norm: 3.230417 +Steps: 0%| | 1903/1000000 [07:48<67:50:06, 
4.09it/s, grad_norm=3.23, loss_final=1.79, loss_mean=0.921, loss_mean_cls=1.23, proj_loss=-0.365][2026-03-22 14:26:25] Step: 1903, Training Logs: loss_final: 1.574359, loss_mean: 0.921098, proj_loss: -0.380085, loss_mean_cls: 1.033346, grad_norm: 3.642829 +Steps: 0%| | 1904/1000000 [07:49<67:48:58, 4.09it/s, grad_norm=3.64, loss_final=1.57, loss_mean=0.921, loss_mean_cls=1.03, proj_loss=-0.38][2026-03-22 14:26:26] Step: 1904, Training Logs: loss_final: 1.752578, loss_mean: 0.894141, proj_loss: -0.363525, loss_mean_cls: 1.221962, grad_norm: 2.428615 +Steps: 0%| | 1905/1000000 [07:49<67:48:14, 4.09it/s, grad_norm=2.43, loss_final=1.75, loss_mean=0.894, loss_mean_cls=1.22, proj_loss=-0.364][2026-03-22 14:26:26] Step: 1905, Training Logs: loss_final: 1.633110, loss_mean: 0.915524, proj_loss: -0.375452, loss_mean_cls: 1.093039, grad_norm: 3.987956 +Steps: 0%| | 1906/1000000 [07:49<67:48:03, 4.09it/s, grad_norm=3.99, loss_final=1.63, loss_mean=0.916, loss_mean_cls=1.09, proj_loss=-0.375][2026-03-22 14:26:26] Step: 1906, Training Logs: loss_final: 1.724621, loss_mean: 0.921173, proj_loss: -0.362724, loss_mean_cls: 1.166172, grad_norm: 3.293756 +Steps: 0%| | 1907/1000000 [07:49<67:47:02, 4.09it/s, grad_norm=3.29, loss_final=1.72, loss_mean=0.921, loss_mean_cls=1.17, proj_loss=-0.363][2026-03-22 14:26:26] Step: 1907, Training Logs: loss_final: 1.669258, loss_mean: 0.907919, proj_loss: -0.368287, loss_mean_cls: 1.129626, grad_norm: 2.901642 +Steps: 0%| | 1908/1000000 [07:50<67:46:50, 4.09it/s, grad_norm=2.9, loss_final=1.67, loss_mean=0.908, loss_mean_cls=1.13, proj_loss=-0.368][2026-03-22 14:26:27] Step: 1908, Training Logs: loss_final: 1.575489, loss_mean: 0.937467, proj_loss: -0.374578, loss_mean_cls: 1.012600, grad_norm: 2.610648 +Steps: 0%| | 1909/1000000 [07:50<67:46:34, 4.09it/s, grad_norm=2.61, loss_final=1.58, loss_mean=0.937, loss_mean_cls=1.01, proj_loss=-0.375][2026-03-22 14:26:27] Step: 1909, Training Logs: loss_final: 1.567028, loss_mean: 0.952513, proj_loss: -0.375793, loss_mean_cls: 0.990307, grad_norm: 4.175015 +Steps: 0%| | 1910/1000000 [07:50<67:48:00, 4.09it/s, grad_norm=4.18, loss_final=1.57, loss_mean=0.953, loss_mean_cls=0.99, proj_loss=-0.376][2026-03-22 14:26:27] Step: 1910, Training Logs: loss_final: 1.660472, loss_mean: 0.927964, proj_loss: -0.375055, loss_mean_cls: 1.107564, grad_norm: 2.010361 +Steps: 0%| | 1911/1000000 [07:50<67:46:24, 4.09it/s, grad_norm=2.01, loss_final=1.66, loss_mean=0.928, loss_mean_cls=1.11, proj_loss=-0.375][2026-03-22 14:26:27] Step: 1911, Training Logs: loss_final: 1.750147, loss_mean: 0.909570, proj_loss: -0.362233, loss_mean_cls: 1.202811, grad_norm: 1.973947 +Steps: 0%| | 1912/1000000 [07:51<67:47:32, 4.09it/s, grad_norm=1.97, loss_final=1.75, loss_mean=0.91, loss_mean_cls=1.2, proj_loss=-0.362][2026-03-22 14:26:28] Step: 1912, Training Logs: loss_final: 1.699127, loss_mean: 0.924069, proj_loss: -0.374316, loss_mean_cls: 1.149373, grad_norm: 3.070421 +Steps: 0%| | 1913/1000000 [07:51<67:45:34, 4.09it/s, grad_norm=3.07, loss_final=1.7, loss_mean=0.924, loss_mean_cls=1.15, proj_loss=-0.374][2026-03-22 14:26:28] Step: 1913, Training Logs: loss_final: 1.713810, loss_mean: 0.924461, proj_loss: -0.361594, loss_mean_cls: 1.150942, grad_norm: 2.232420 +Steps: 0%| | 1914/1000000 [07:51<67:43:55, 4.09it/s, grad_norm=2.23, loss_final=1.71, loss_mean=0.924, loss_mean_cls=1.15, proj_loss=-0.362][2026-03-22 14:26:28] Step: 1914, Training Logs: loss_final: 1.760152, loss_mean: 0.909683, proj_loss: -0.366272, loss_mean_cls: 1.216740, grad_norm: 3.053473 
+Steps: 0%| | 1915/1000000 [07:51<67:43:46, 4.09it/s, grad_norm=3.05, loss_final=1.76, loss_mean=0.91, loss_mean_cls=1.22, proj_loss=-0.366][2026-03-22 14:26:28] Step: 1915, Training Logs: loss_final: 1.675780, loss_mean: 0.922635, proj_loss: -0.371383, loss_mean_cls: 1.124528, grad_norm: 3.704199 +Steps: 0%| | 1916/1000000 [07:52<67:45:05, 4.09it/s, grad_norm=3.7, loss_final=1.68, loss_mean=0.923, loss_mean_cls=1.12, proj_loss=-0.371][2026-03-22 14:26:29] Step: 1916, Training Logs: loss_final: 1.617243, loss_mean: 0.935266, proj_loss: -0.376469, loss_mean_cls: 1.058447, grad_norm: 1.937967 +Steps: 0%| | 1917/1000000 [07:52<67:44:28, 4.09it/s, grad_norm=1.94, loss_final=1.62, loss_mean=0.935, loss_mean_cls=1.06, proj_loss=-0.376][2026-03-22 14:26:29] Step: 1917, Training Logs: loss_final: 1.735829, loss_mean: 0.923738, proj_loss: -0.365897, loss_mean_cls: 1.177989, grad_norm: 3.256024 +Steps: 0%| | 1918/1000000 [07:52<67:45:16, 4.09it/s, grad_norm=3.26, loss_final=1.74, loss_mean=0.924, loss_mean_cls=1.18, proj_loss=-0.366][2026-03-22 14:26:29] Step: 1918, Training Logs: loss_final: 1.731122, loss_mean: 0.920456, proj_loss: -0.366248, loss_mean_cls: 1.176914, grad_norm: 4.067166 +Steps: 0%| | 1919/1000000 [07:52<67:45:22, 4.09it/s, grad_norm=4.07, loss_final=1.73, loss_mean=0.92, loss_mean_cls=1.18, proj_loss=-0.366][2026-03-22 14:26:29] Step: 1919, Training Logs: loss_final: 1.715971, loss_mean: 0.921920, proj_loss: -0.366936, loss_mean_cls: 1.160988, grad_norm: 2.845765 +Steps: 0%| | 1920/1000000 [07:53<67:46:56, 4.09it/s, grad_norm=2.85, loss_final=1.72, loss_mean=0.922, loss_mean_cls=1.16, proj_loss=-0.367][2026-03-22 14:26:30] Step: 1920, Training Logs: loss_final: 1.588285, loss_mean: 0.938802, proj_loss: -0.369034, loss_mean_cls: 1.018518, grad_norm: 2.170050 +Steps: 0%| | 1921/1000000 [07:53<67:47:16, 4.09it/s, grad_norm=2.17, loss_final=1.59, loss_mean=0.939, loss_mean_cls=1.02, proj_loss=-0.369][2026-03-22 14:26:30] Step: 1921, Training Logs: loss_final: 1.674552, loss_mean: 0.935162, proj_loss: -0.377890, loss_mean_cls: 1.117281, grad_norm: 2.253970 +Steps: 0%| | 1922/1000000 [07:53<67:46:00, 4.09it/s, grad_norm=2.25, loss_final=1.67, loss_mean=0.935, loss_mean_cls=1.12, proj_loss=-0.378][2026-03-22 14:26:30] Step: 1922, Training Logs: loss_final: 1.631671, loss_mean: 0.930199, proj_loss: -0.374844, loss_mean_cls: 1.076316, grad_norm: 2.276007 +Steps: 0%| | 1923/1000000 [07:53<67:44:33, 4.09it/s, grad_norm=2.28, loss_final=1.63, loss_mean=0.93, loss_mean_cls=1.08, proj_loss=-0.375][2026-03-22 14:26:30] Step: 1923, Training Logs: loss_final: 1.589830, loss_mean: 0.933379, proj_loss: -0.375180, loss_mean_cls: 1.031631, grad_norm: 3.449755 +Steps: 0%| | 1924/1000000 [07:54<67:45:53, 4.09it/s, grad_norm=3.45, loss_final=1.59, loss_mean=0.933, loss_mean_cls=1.03, proj_loss=-0.375][2026-03-22 14:26:31] Step: 1924, Training Logs: loss_final: 1.593997, loss_mean: 0.949282, proj_loss: -0.376523, loss_mean_cls: 1.021238, grad_norm: 2.091160 +Steps: 0%| | 1925/1000000 [07:54<67:54:38, 4.08it/s, grad_norm=2.09, loss_final=1.59, loss_mean=0.949, loss_mean_cls=1.02, proj_loss=-0.377][2026-03-22 14:26:31] Step: 1925, Training Logs: loss_final: 1.710525, loss_mean: 0.893758, proj_loss: -0.367416, loss_mean_cls: 1.184183, grad_norm: 2.507951 +Steps: 0%| | 1926/1000000 [07:54<67:50:27, 4.09it/s, grad_norm=2.51, loss_final=1.71, loss_mean=0.894, loss_mean_cls=1.18, proj_loss=-0.367][2026-03-22 14:26:31] Step: 1926, Training Logs: loss_final: 1.717655, loss_mean: 0.919737, proj_loss: -0.369375, 
loss_mean_cls: 1.167293, grad_norm: 1.906495 +Steps: 0%| | 1927/1000000 [07:54<67:49:27, 4.09it/s, grad_norm=1.91, loss_final=1.72, loss_mean=0.92, loss_mean_cls=1.17, proj_loss=-0.369][2026-03-22 14:26:31] Step: 1927, Training Logs: loss_final: 1.630880, loss_mean: 0.940401, proj_loss: -0.371049, loss_mean_cls: 1.061528, grad_norm: 3.636186 +Steps: 0%| | 1928/1000000 [07:55<67:50:20, 4.09it/s, grad_norm=3.64, loss_final=1.63, loss_mean=0.94, loss_mean_cls=1.06, proj_loss=-0.371][2026-03-22 14:26:32] Step: 1928, Training Logs: loss_final: 1.724871, loss_mean: 0.925054, proj_loss: -0.370248, loss_mean_cls: 1.170065, grad_norm: 4.004975 +Steps: 0%| | 1929/1000000 [07:55<67:50:32, 4.09it/s, grad_norm=4, loss_final=1.72, loss_mean=0.925, loss_mean_cls=1.17, proj_loss=-0.37][2026-03-22 14:26:32] Step: 1929, Training Logs: loss_final: 1.645873, loss_mean: 0.911771, proj_loss: -0.373113, loss_mean_cls: 1.107215, grad_norm: 5.095534 +Steps: 0%| | 1930/1000000 [07:55<67:49:11, 4.09it/s, grad_norm=5.1, loss_final=1.65, loss_mean=0.912, loss_mean_cls=1.11, proj_loss=-0.373][2026-03-22 14:26:32] Step: 1930, Training Logs: loss_final: 1.766348, loss_mean: 0.907864, proj_loss: -0.360962, loss_mean_cls: 1.219446, grad_norm: 1.435547 +Steps: 0%| | 1931/1000000 [07:55<67:48:22, 4.09it/s, grad_norm=1.44, loss_final=1.77, loss_mean=0.908, loss_mean_cls=1.22, proj_loss=-0.361][2026-03-22 14:26:32] Step: 1931, Training Logs: loss_final: 1.539460, loss_mean: 0.934229, proj_loss: -0.381382, loss_mean_cls: 0.986613, grad_norm: 2.317702 +Steps: 0%| | 1932/1000000 [07:56<67:47:15, 4.09it/s, grad_norm=2.32, loss_final=1.54, loss_mean=0.934, loss_mean_cls=0.987, proj_loss=-0.381][2026-03-22 14:26:33] Step: 1932, Training Logs: loss_final: 1.729662, loss_mean: 0.894810, proj_loss: -0.374647, loss_mean_cls: 1.209499, grad_norm: 1.803753 +Steps: 0%| | 1933/1000000 [07:56<67:46:13, 4.09it/s, grad_norm=1.8, loss_final=1.73, loss_mean=0.895, loss_mean_cls=1.21, proj_loss=-0.375][2026-03-22 14:26:33] Step: 1933, Training Logs: loss_final: 1.690706, loss_mean: 0.916560, proj_loss: -0.368298, loss_mean_cls: 1.142443, grad_norm: 1.379255 +Steps: 0%| | 1934/1000000 [07:56<67:46:08, 4.09it/s, grad_norm=1.38, loss_final=1.69, loss_mean=0.917, loss_mean_cls=1.14, proj_loss=-0.368][2026-03-22 14:26:33] Step: 1934, Training Logs: loss_final: 1.615814, loss_mean: 0.931075, proj_loss: -0.373229, loss_mean_cls: 1.057967, grad_norm: 1.995730 +Steps: 0%| | 1935/1000000 [07:56<67:44:24, 4.09it/s, grad_norm=2, loss_final=1.62, loss_mean=0.931, loss_mean_cls=1.06, proj_loss=-0.373][2026-03-22 14:26:33] Step: 1935, Training Logs: loss_final: 1.829125, loss_mean: 0.902344, proj_loss: -0.352771, loss_mean_cls: 1.279552, grad_norm: 3.509620 +Steps: 0%| | 1936/1000000 [07:57<67:45:02, 4.09it/s, grad_norm=3.51, loss_final=1.83, loss_mean=0.902, loss_mean_cls=1.28, proj_loss=-0.353][2026-03-22 14:26:33] Step: 1936, Training Logs: loss_final: 1.684019, loss_mean: 0.906903, proj_loss: -0.370997, loss_mean_cls: 1.148113, grad_norm: 2.226502 +Steps: 0%| | 1937/1000000 [07:57<67:46:14, 4.09it/s, grad_norm=2.23, loss_final=1.68, loss_mean=0.907, loss_mean_cls=1.15, proj_loss=-0.371][2026-03-22 14:26:34] Step: 1937, Training Logs: loss_final: 1.701846, loss_mean: 0.915058, proj_loss: -0.375514, loss_mean_cls: 1.162302, grad_norm: 2.739184 +Steps: 0%| | 1938/1000000 [07:57<67:46:30, 4.09it/s, grad_norm=2.74, loss_final=1.7, loss_mean=0.915, loss_mean_cls=1.16, proj_loss=-0.376][2026-03-22 14:26:34] Step: 1938, Training Logs: loss_final: 1.788748, 
loss_mean: 0.916292, proj_loss: -0.358706, loss_mean_cls: 1.231162, grad_norm: 4.227179 +Steps: 0%| | 1939/1000000 [07:57<67:46:09, 4.09it/s, grad_norm=4.23, loss_final=1.79, loss_mean=0.916, loss_mean_cls=1.23, proj_loss=-0.359][2026-03-22 14:26:34] Step: 1939, Training Logs: loss_final: 1.673723, loss_mean: 0.917049, proj_loss: -0.370697, loss_mean_cls: 1.127370, grad_norm: 4.212477 +Steps: 0%| | 1940/1000000 [07:57<67:45:01, 4.09it/s, grad_norm=4.21, loss_final=1.67, loss_mean=0.917, loss_mean_cls=1.13, proj_loss=-0.371][2026-03-22 14:26:34] Step: 1940, Training Logs: loss_final: 1.687411, loss_mean: 0.931490, proj_loss: -0.369326, loss_mean_cls: 1.125247, grad_norm: 3.998763 +Steps: 0%| | 1941/1000000 [07:58<67:44:35, 4.09it/s, grad_norm=4, loss_final=1.69, loss_mean=0.931, loss_mean_cls=1.13, proj_loss=-0.369][2026-03-22 14:26:35] Step: 1941, Training Logs: loss_final: 1.769327, loss_mean: 0.896886, proj_loss: -0.365512, loss_mean_cls: 1.237953, grad_norm: 2.080075 +Steps: 0%| | 1942/1000000 [07:58<67:45:02, 4.09it/s, grad_norm=2.08, loss_final=1.77, loss_mean=0.897, loss_mean_cls=1.24, proj_loss=-0.366][2026-03-22 14:26:35] Step: 1942, Training Logs: loss_final: 1.700289, loss_mean: 0.916416, proj_loss: -0.371403, loss_mean_cls: 1.155276, grad_norm: 2.759376 +Steps: 0%| | 1943/1000000 [07:58<67:45:16, 4.09it/s, grad_norm=2.76, loss_final=1.7, loss_mean=0.916, loss_mean_cls=1.16, proj_loss=-0.371][2026-03-22 14:26:35] Step: 1943, Training Logs: loss_final: 1.713381, loss_mean: 0.920248, proj_loss: -0.369202, loss_mean_cls: 1.162334, grad_norm: 2.653618 +Steps: 0%| | 1944/1000000 [07:58<67:46:33, 4.09it/s, grad_norm=2.65, loss_final=1.71, loss_mean=0.92, loss_mean_cls=1.16, proj_loss=-0.369][2026-03-22 14:26:35] Step: 1944, Training Logs: loss_final: 1.810112, loss_mean: 0.905554, proj_loss: -0.360204, loss_mean_cls: 1.264762, grad_norm: 2.538574 +Steps: 0%| | 1945/1000000 [07:59<67:45:55, 4.09it/s, grad_norm=2.54, loss_final=1.81, loss_mean=0.906, loss_mean_cls=1.26, proj_loss=-0.36][2026-03-22 14:26:36] Step: 1945, Training Logs: loss_final: 1.739807, loss_mean: 0.903491, proj_loss: -0.359192, loss_mean_cls: 1.195508, grad_norm: 1.984574 +Steps: 0%| | 1946/1000000 [07:59<67:47:00, 4.09it/s, grad_norm=1.98, loss_final=1.74, loss_mean=0.903, loss_mean_cls=1.2, proj_loss=-0.359][2026-03-22 14:26:36] Step: 1946, Training Logs: loss_final: 1.729032, loss_mean: 0.906721, proj_loss: -0.369188, loss_mean_cls: 1.191498, grad_norm: 1.980661 +Steps: 0%| | 1947/1000000 [07:59<67:47:09, 4.09it/s, grad_norm=1.98, loss_final=1.73, loss_mean=0.907, loss_mean_cls=1.19, proj_loss=-0.369][2026-03-22 14:26:36] Step: 1947, Training Logs: loss_final: 1.750632, loss_mean: 0.913930, proj_loss: -0.367630, loss_mean_cls: 1.204332, grad_norm: 1.876948 +Steps: 0%| | 1948/1000000 [07:59<67:46:58, 4.09it/s, grad_norm=1.88, loss_final=1.75, loss_mean=0.914, loss_mean_cls=1.2, proj_loss=-0.368][2026-03-22 14:26:36] Step: 1948, Training Logs: loss_final: 1.655800, loss_mean: 0.895356, proj_loss: -0.376396, loss_mean_cls: 1.136840, grad_norm: 2.937264 +Steps: 0%| | 1949/1000000 [08:00<67:46:49, 4.09it/s, grad_norm=2.94, loss_final=1.66, loss_mean=0.895, loss_mean_cls=1.14, proj_loss=-0.376][2026-03-22 14:26:37] Step: 1949, Training Logs: loss_final: 1.645653, loss_mean: 0.932601, proj_loss: -0.370427, loss_mean_cls: 1.083479, grad_norm: 1.461069 +Steps: 0%| | 1950/1000000 [08:00<68:37:49, 4.04it/s, grad_norm=1.46, loss_final=1.65, loss_mean=0.933, loss_mean_cls=1.08, proj_loss=-0.37][2026-03-22 14:26:37] Step: 1950, 
Training Logs: loss_final: 1.665692, loss_mean: 0.936780, proj_loss: -0.373462, loss_mean_cls: 1.102373, grad_norm: 3.475120 +Steps: 0%| | 1951/1000000 [08:00<68:22:15, 4.05it/s, grad_norm=3.48, loss_final=1.67, loss_mean=0.937, loss_mean_cls=1.1, proj_loss=-0.373][2026-03-22 14:26:37] Step: 1951, Training Logs: loss_final: 1.627299, loss_mean: 0.934539, proj_loss: -0.375300, loss_mean_cls: 1.068061, grad_norm: 3.705708 +Steps: 0%| | 1952/1000000 [08:00<68:11:32, 4.07it/s, grad_norm=3.71, loss_final=1.63, loss_mean=0.935, loss_mean_cls=1.07, proj_loss=-0.375][2026-03-22 14:26:37] Step: 1952, Training Logs: loss_final: 1.691245, loss_mean: 0.914449, proj_loss: -0.375219, loss_mean_cls: 1.152015, grad_norm: 2.496103 +Steps: 0%| | 1953/1000000 [08:01<68:03:59, 4.07it/s, grad_norm=2.5, loss_final=1.69, loss_mean=0.914, loss_mean_cls=1.15, proj_loss=-0.375][2026-03-22 14:26:38] Step: 1953, Training Logs: loss_final: 1.540614, loss_mean: 0.942477, proj_loss: -0.382543, loss_mean_cls: 0.980679, grad_norm: 3.291043 +Steps: 0%| | 1954/1000000 [08:01<67:58:05, 4.08it/s, grad_norm=3.29, loss_final=1.54, loss_mean=0.942, loss_mean_cls=0.981, proj_loss=-0.383][2026-03-22 14:26:38] Step: 1954, Training Logs: loss_final: 1.728045, loss_mean: 0.924672, proj_loss: -0.368776, loss_mean_cls: 1.172149, grad_norm: 4.561461 +Steps: 0%| | 1955/1000000 [08:01<67:54:16, 4.08it/s, grad_norm=4.56, loss_final=1.73, loss_mean=0.925, loss_mean_cls=1.17, proj_loss=-0.369][2026-03-22 14:26:38] Step: 1955, Training Logs: loss_final: 1.723419, loss_mean: 0.914442, proj_loss: -0.370553, loss_mean_cls: 1.179530, grad_norm: 2.700295 +Steps: 0%| | 1956/1000000 [08:01<67:52:02, 4.08it/s, grad_norm=2.7, loss_final=1.72, loss_mean=0.914, loss_mean_cls=1.18, proj_loss=-0.371][2026-03-22 14:26:38] Step: 1956, Training Logs: loss_final: 1.678522, loss_mean: 0.911055, proj_loss: -0.368342, loss_mean_cls: 1.135809, grad_norm: 2.060745 +Steps: 0%| | 1957/1000000 [08:02<67:50:30, 4.09it/s, grad_norm=2.06, loss_final=1.68, loss_mean=0.911, loss_mean_cls=1.14, proj_loss=-0.368][2026-03-22 14:26:39] Step: 1957, Training Logs: loss_final: 1.611350, loss_mean: 0.936497, proj_loss: -0.374990, loss_mean_cls: 1.049843, grad_norm: 3.428355 +Steps: 0%| | 1958/1000000 [08:02<67:48:42, 4.09it/s, grad_norm=3.43, loss_final=1.61, loss_mean=0.936, loss_mean_cls=1.05, proj_loss=-0.375][2026-03-22 14:26:39] Step: 1958, Training Logs: loss_final: 1.874607, loss_mean: 0.902513, proj_loss: -0.358815, loss_mean_cls: 1.330909, grad_norm: 1.955856 +Steps: 0%| | 1959/1000000 [08:02<67:47:46, 4.09it/s, grad_norm=1.96, loss_final=1.87, loss_mean=0.903, loss_mean_cls=1.33, proj_loss=-0.359][2026-03-22 14:26:39] Step: 1959, Training Logs: loss_final: 1.617401, loss_mean: 0.928695, proj_loss: -0.376513, loss_mean_cls: 1.065220, grad_norm: 4.409269 +Steps: 0%| | 1960/1000000 [08:02<67:46:51, 4.09it/s, grad_norm=4.41, loss_final=1.62, loss_mean=0.929, loss_mean_cls=1.07, proj_loss=-0.377][2026-03-22 14:26:39] Step: 1960, Training Logs: loss_final: 1.610395, loss_mean: 0.934815, proj_loss: -0.379273, loss_mean_cls: 1.054854, grad_norm: 2.972611 +Steps: 0%| | 1961/1000000 [08:03<67:46:34, 4.09it/s, grad_norm=2.97, loss_final=1.61, loss_mean=0.935, loss_mean_cls=1.05, proj_loss=-0.379][2026-03-22 14:26:40] Step: 1961, Training Logs: loss_final: 1.557318, loss_mean: 0.927381, proj_loss: -0.380736, loss_mean_cls: 1.010673, grad_norm: 2.055224 +Steps: 0%| | 1962/1000000 [08:03<67:46:27, 4.09it/s, grad_norm=2.06, loss_final=1.56, loss_mean=0.927, loss_mean_cls=1.01, 
proj_loss=-0.381][2026-03-22 14:26:40] Step: 1962, Training Logs: loss_final: 1.763034, loss_mean: 0.906274, proj_loss: -0.364416, loss_mean_cls: 1.221177, grad_norm: 2.495755 +Steps: 0%| | 1963/1000000 [08:03<67:47:16, 4.09it/s, grad_norm=2.5, loss_final=1.76, loss_mean=0.906, loss_mean_cls=1.22, proj_loss=-0.364][2026-03-22 14:26:40] Step: 1963, Training Logs: loss_final: 1.646051, loss_mean: 0.919283, proj_loss: -0.375160, loss_mean_cls: 1.101927, grad_norm: 2.618007 +Steps: 0%| | 1964/1000000 [08:03<67:47:16, 4.09it/s, grad_norm=2.62, loss_final=1.65, loss_mean=0.919, loss_mean_cls=1.1, proj_loss=-0.375][2026-03-22 14:26:40] Step: 1964, Training Logs: loss_final: 1.802915, loss_mean: 0.898318, proj_loss: -0.365514, loss_mean_cls: 1.270111, grad_norm: 3.814665 +Steps: 0%| | 1965/1000000 [08:04<67:47:35, 4.09it/s, grad_norm=3.81, loss_final=1.8, loss_mean=0.898, loss_mean_cls=1.27, proj_loss=-0.366][2026-03-22 14:26:41] Step: 1965, Training Logs: loss_final: 1.732099, loss_mean: 0.929722, proj_loss: -0.369971, loss_mean_cls: 1.172348, grad_norm: 2.391046 +Steps: 0%| | 1966/1000000 [08:04<67:50:30, 4.09it/s, grad_norm=2.39, loss_final=1.73, loss_mean=0.93, loss_mean_cls=1.17, proj_loss=-0.37][2026-03-22 14:26:41] Step: 1966, Training Logs: loss_final: 1.663900, loss_mean: 0.917834, proj_loss: -0.375432, loss_mean_cls: 1.121499, grad_norm: 1.869851 +Steps: 0%| | 1967/1000000 [08:04<67:49:44, 4.09it/s, grad_norm=1.87, loss_final=1.66, loss_mean=0.918, loss_mean_cls=1.12, proj_loss=-0.375][2026-03-22 14:26:41] Step: 1967, Training Logs: loss_final: 1.817646, loss_mean: 0.882384, proj_loss: -0.357571, loss_mean_cls: 1.292833, grad_norm: 3.891469 +Steps: 0%| | 1968/1000000 [08:04<67:48:39, 4.09it/s, grad_norm=3.89, loss_final=1.82, loss_mean=0.882, loss_mean_cls=1.29, proj_loss=-0.358][2026-03-22 14:26:41] Step: 1968, Training Logs: loss_final: 1.649346, loss_mean: 0.934799, proj_loss: -0.373654, loss_mean_cls: 1.088201, grad_norm: 3.170703 +Steps: 0%| | 1969/1000000 [08:05<67:46:49, 4.09it/s, grad_norm=3.17, loss_final=1.65, loss_mean=0.935, loss_mean_cls=1.09, proj_loss=-0.374][2026-03-22 14:26:42] Step: 1969, Training Logs: loss_final: 1.640248, loss_mean: 0.918506, proj_loss: -0.374940, loss_mean_cls: 1.096682, grad_norm: 2.967689 +Steps: 0%| | 1970/1000000 [08:05<67:47:00, 4.09it/s, grad_norm=2.97, loss_final=1.64, loss_mean=0.919, loss_mean_cls=1.1, proj_loss=-0.375][2026-03-22 14:26:42] Step: 1970, Training Logs: loss_final: 1.796420, loss_mean: 0.888737, proj_loss: -0.364529, loss_mean_cls: 1.272213, grad_norm: 1.834792 +Steps: 0%| | 1971/1000000 [08:05<67:48:06, 4.09it/s, grad_norm=1.83, loss_final=1.8, loss_mean=0.889, loss_mean_cls=1.27, proj_loss=-0.365][2026-03-22 14:26:42] Step: 1971, Training Logs: loss_final: 1.669060, loss_mean: 0.915478, proj_loss: -0.375378, loss_mean_cls: 1.128959, grad_norm: 2.354810 +Steps: 0%| | 1972/1000000 [08:05<67:49:01, 4.09it/s, grad_norm=2.35, loss_final=1.67, loss_mean=0.915, loss_mean_cls=1.13, proj_loss=-0.375][2026-03-22 14:26:42] Step: 1972, Training Logs: loss_final: 1.683895, loss_mean: 0.916645, proj_loss: -0.369777, loss_mean_cls: 1.137026, grad_norm: 3.176129 +Steps: 0%| | 1973/1000000 [08:06<69:43:03, 3.98it/s, grad_norm=3.18, loss_final=1.68, loss_mean=0.917, loss_mean_cls=1.14, proj_loss=-0.37][2026-03-22 14:26:43] Step: 1973, Training Logs: loss_final: 1.737933, loss_mean: 0.930140, proj_loss: -0.368180, loss_mean_cls: 1.175972, grad_norm: 2.499186 +Steps: 0%| | 1974/1000000 [08:06<68:00:43, 4.08it/s, grad_norm=2.5, loss_final=1.74, 
loss_mean=0.93, loss_mean_cls=1.18, proj_loss=-0.368][2026-03-22 14:26:43] Step: 1974, Training Logs: loss_final: 1.569987, loss_mean: 0.946087, proj_loss: -0.384838, loss_mean_cls: 1.008738, grad_norm: 2.981635 +Steps: 0%| | 1975/1000000 [08:06<67:55:44, 4.08it/s, grad_norm=2.98, loss_final=1.57, loss_mean=0.946, loss_mean_cls=1.01, proj_loss=-0.385][2026-03-22 14:26:43] Step: 1975, Training Logs: loss_final: 1.574238, loss_mean: 0.939002, proj_loss: -0.380763, loss_mean_cls: 1.015999, grad_norm: 5.573771 +Steps: 0%| | 1976/1000000 [08:06<67:53:03, 4.08it/s, grad_norm=5.57, loss_final=1.57, loss_mean=0.939, loss_mean_cls=1.02, proj_loss=-0.381][2026-03-22 14:26:43] Step: 1976, Training Logs: loss_final: 1.616600, loss_mean: 0.931110, proj_loss: -0.373928, loss_mean_cls: 1.059418, grad_norm: 2.602767 +Steps: 0%| | 1977/1000000 [08:07<67:50:07, 4.09it/s, grad_norm=2.6, loss_final=1.62, loss_mean=0.931, loss_mean_cls=1.06, proj_loss=-0.374][2026-03-22 14:26:44] Step: 1977, Training Logs: loss_final: 1.572700, loss_mean: 0.926090, proj_loss: -0.380278, loss_mean_cls: 1.026887, grad_norm: 1.295105 +Steps: 0%| | 1978/1000000 [08:07<67:48:58, 4.09it/s, grad_norm=1.3, loss_final=1.57, loss_mean=0.926, loss_mean_cls=1.03, proj_loss=-0.38][2026-03-22 14:26:44] Step: 1978, Training Logs: loss_final: 1.549084, loss_mean: 0.935356, proj_loss: -0.382254, loss_mean_cls: 0.995981, grad_norm: 1.127538 +Steps: 0%| | 1979/1000000 [08:07<67:47:14, 4.09it/s, grad_norm=1.13, loss_final=1.55, loss_mean=0.935, loss_mean_cls=0.996, proj_loss=-0.382][2026-03-22 14:26:44] Step: 1979, Training Logs: loss_final: 1.676295, loss_mean: 0.902046, proj_loss: -0.369349, loss_mean_cls: 1.143598, grad_norm: 1.839269 +Steps: 0%| | 1980/1000000 [08:07<67:47:04, 4.09it/s, grad_norm=1.84, loss_final=1.68, loss_mean=0.902, loss_mean_cls=1.14, proj_loss=-0.369][2026-03-22 14:26:44] Step: 1980, Training Logs: loss_final: 1.573309, loss_mean: 0.933531, proj_loss: -0.375208, loss_mean_cls: 1.014987, grad_norm: 1.498756 +Steps: 0%| | 1981/1000000 [08:08<67:46:44, 4.09it/s, grad_norm=1.5, loss_final=1.57, loss_mean=0.934, loss_mean_cls=1.01, proj_loss=-0.375][2026-03-22 14:26:45] Step: 1981, Training Logs: loss_final: 1.764181, loss_mean: 0.900897, proj_loss: -0.371977, loss_mean_cls: 1.235261, grad_norm: 2.231945 +Steps: 0%| | 1982/1000000 [08:08<67:45:53, 4.09it/s, grad_norm=2.23, loss_final=1.76, loss_mean=0.901, loss_mean_cls=1.24, proj_loss=-0.372][2026-03-22 14:26:45] Step: 1982, Training Logs: loss_final: 1.713742, loss_mean: 0.915252, proj_loss: -0.373302, loss_mean_cls: 1.171792, grad_norm: 2.912394 +Steps: 0%| | 1983/1000000 [08:08<67:45:56, 4.09it/s, grad_norm=2.91, loss_final=1.71, loss_mean=0.915, loss_mean_cls=1.17, proj_loss=-0.373][2026-03-22 14:26:45] Step: 1983, Training Logs: loss_final: 1.616172, loss_mean: 0.923936, proj_loss: -0.378491, loss_mean_cls: 1.070726, grad_norm: 2.550599 +Steps: 0%| | 1984/1000000 [08:08<67:45:46, 4.09it/s, grad_norm=2.55, loss_final=1.62, loss_mean=0.924, loss_mean_cls=1.07, proj_loss=-0.378][2026-03-22 14:26:45] Step: 1984, Training Logs: loss_final: 1.552551, loss_mean: 0.933451, proj_loss: -0.381454, loss_mean_cls: 1.000554, grad_norm: 3.748269 +Steps: 0%| | 1985/1000000 [08:09<69:00:01, 4.02it/s, grad_norm=3.75, loss_final=1.55, loss_mean=0.933, loss_mean_cls=1, proj_loss=-0.381][2026-03-22 14:26:45] Step: 1985, Training Logs: loss_final: 1.723699, loss_mean: 0.919109, proj_loss: -0.373886, loss_mean_cls: 1.178475, grad_norm: 2.414619 +Steps: 0%| | 1986/1000000 [08:09<68:36:43, 
4.04it/s, grad_norm=2.41, loss_final=1.72, loss_mean=0.919, loss_mean_cls=1.18, proj_loss=-0.374][2026-03-22 14:26:46] Step: 1986, Training Logs: loss_final: 1.573992, loss_mean: 0.920288, proj_loss: -0.377843, loss_mean_cls: 1.031546, grad_norm: 1.695000 +Steps: 0%| | 1987/1000000 [08:09<68:20:41, 4.06it/s, grad_norm=1.7, loss_final=1.57, loss_mean=0.92, loss_mean_cls=1.03, proj_loss=-0.378][2026-03-22 14:26:46] Step: 1987, Training Logs: loss_final: 1.835143, loss_mean: 0.903629, proj_loss: -0.364866, loss_mean_cls: 1.296380, grad_norm: 4.202534 +Steps: 0%| | 1988/1000000 [08:09<68:09:32, 4.07it/s, grad_norm=4.2, loss_final=1.84, loss_mean=0.904, loss_mean_cls=1.3, proj_loss=-0.365][2026-03-22 14:26:46] Step: 1988, Training Logs: loss_final: 1.700533, loss_mean: 0.937012, proj_loss: -0.374046, loss_mean_cls: 1.137567, grad_norm: 3.906685 +Steps: 0%| | 1989/1000000 [08:10<68:01:40, 4.08it/s, grad_norm=3.91, loss_final=1.7, loss_mean=0.937, loss_mean_cls=1.14, proj_loss=-0.374][2026-03-22 14:26:46] Step: 1989, Training Logs: loss_final: 1.657473, loss_mean: 0.926272, proj_loss: -0.371483, loss_mean_cls: 1.102683, grad_norm: 2.923362 +Steps: 0%| | 1990/1000000 [08:10<67:55:09, 4.08it/s, grad_norm=2.92, loss_final=1.66, loss_mean=0.926, loss_mean_cls=1.1, proj_loss=-0.371][2026-03-22 14:26:47] Step: 1990, Training Logs: loss_final: 1.555924, loss_mean: 0.948200, proj_loss: -0.380542, loss_mean_cls: 0.988266, grad_norm: 4.361774 +Steps: 0%| | 1991/1000000 [08:10<67:52:45, 4.08it/s, grad_norm=4.36, loss_final=1.56, loss_mean=0.948, loss_mean_cls=0.988, proj_loss=-0.381][2026-03-22 14:26:47] Step: 1991, Training Logs: loss_final: 1.709618, loss_mean: 0.934074, proj_loss: -0.376340, loss_mean_cls: 1.151885, grad_norm: 4.639117 +Steps: 0%| | 1992/1000000 [08:10<67:50:30, 4.09it/s, grad_norm=4.64, loss_final=1.71, loss_mean=0.934, loss_mean_cls=1.15, proj_loss=-0.376][2026-03-22 14:26:47] Step: 1992, Training Logs: loss_final: 1.594804, loss_mean: 0.935193, proj_loss: -0.377901, loss_mean_cls: 1.037512, grad_norm: 4.846311 +Steps: 0%| | 1993/1000000 [08:10<67:48:41, 4.09it/s, grad_norm=4.85, loss_final=1.59, loss_mean=0.935, loss_mean_cls=1.04, proj_loss=-0.378][2026-03-22 14:26:47] Step: 1993, Training Logs: loss_final: 1.708784, loss_mean: 0.913621, proj_loss: -0.367918, loss_mean_cls: 1.163081, grad_norm: 2.214160 +Steps: 0%| | 1994/1000000 [08:11<107:40:58, 2.57it/s, grad_norm=2.21, loss_final=1.71, loss_mean=0.914, loss_mean_cls=1.16, proj_loss=-0.368][2026-03-22 14:26:48] Step: 1994, Training Logs: loss_final: 1.691137, loss_mean: 0.918980, proj_loss: -0.373609, loss_mean_cls: 1.145766, grad_norm: 3.889414 +Steps: 0%| | 1995/1000000 [08:11<95:41:07, 2.90it/s, grad_norm=3.89, loss_final=1.69, loss_mean=0.919, loss_mean_cls=1.15, proj_loss=-0.374] [2026-03-22 14:26:48] Step: 1995, Training Logs: loss_final: 1.616785, loss_mean: 0.938479, proj_loss: -0.377267, loss_mean_cls: 1.055574, grad_norm: 4.686396 +Steps: 0%| | 1996/1000000 [08:12<87:18:44, 3.18it/s, grad_norm=4.69, loss_final=1.62, loss_mean=0.938, loss_mean_cls=1.06, proj_loss=-0.377][2026-03-22 14:26:49] Step: 1996, Training Logs: loss_final: 1.746148, loss_mean: 0.917021, proj_loss: -0.366939, loss_mean_cls: 1.196065, grad_norm: 4.914531 +Steps: 0%| | 1997/1000000 [08:12<81:38:37, 3.40it/s, grad_norm=4.91, loss_final=1.75, loss_mean=0.917, loss_mean_cls=1.2, proj_loss=-0.367][2026-03-22 14:26:49] Step: 1997, Training Logs: loss_final: 1.538054, loss_mean: 0.943935, proj_loss: -0.388112, loss_mean_cls: 0.982230, grad_norm: 3.270077 
+Steps: 0%| | 1998/1000000 [08:12<77:26:38, 3.58it/s, grad_norm=3.27, loss_final=1.54, loss_mean=0.944, loss_mean_cls=0.982, proj_loss=-0.388][2026-03-22 14:26:49] Step: 1998, Training Logs: loss_final: 1.557389, loss_mean: 0.924060, proj_loss: -0.384459, loss_mean_cls: 1.017788, grad_norm: 2.198639 +Steps: 0%| | 1999/1000000 [08:12<74:30:28, 3.72it/s, grad_norm=2.2, loss_final=1.56, loss_mean=0.924, loss_mean_cls=1.02, proj_loss=-0.384][2026-03-22 14:26:49] Step: 1999, Training Logs: loss_final: 1.763090, loss_mean: 0.912017, proj_loss: -0.371197, loss_mean_cls: 1.222270, grad_norm: 2.077932 +Steps: 0%| | 2000/1000000 [08:13<72:30:02, 3.82it/s, grad_norm=2.08, loss_final=1.76, loss_mean=0.912, loss_mean_cls=1.22, proj_loss=-0.371][2026-03-22 14:26:50] Step: 2000, Training Logs: loss_final: 1.678610, loss_mean: 0.937399, proj_loss: -0.371083, loss_mean_cls: 1.112294, grad_norm: 3.551670 +Steps: 0%| | 2001/1000000 [08:13<71:05:15, 3.90it/s, grad_norm=3.55, loss_final=1.68, loss_mean=0.937, loss_mean_cls=1.11, proj_loss=-0.371][2026-03-22 14:26:50] Step: 2001, Training Logs: loss_final: 1.676289, loss_mean: 0.916159, proj_loss: -0.377482, loss_mean_cls: 1.137612, grad_norm: 2.857075 +Steps: 0%| | 2002/1000000 [08:13<70:02:07, 3.96it/s, grad_norm=2.86, loss_final=1.68, loss_mean=0.916, loss_mean_cls=1.14, proj_loss=-0.377][2026-03-22 14:26:50] Step: 2002, Training Logs: loss_final: 1.662346, loss_mean: 0.931945, proj_loss: -0.375913, loss_mean_cls: 1.106314, grad_norm: 2.288464 +Steps: 0%| | 2003/1000000 [08:13<69:20:01, 4.00it/s, grad_norm=2.29, loss_final=1.66, loss_mean=0.932, loss_mean_cls=1.11, proj_loss=-0.376][2026-03-22 14:26:50] Step: 2003, Training Logs: loss_final: 1.709447, loss_mean: 0.906162, proj_loss: -0.370742, loss_mean_cls: 1.174027, grad_norm: 2.314746 +Steps: 0%| | 2004/1000000 [08:14<68:49:37, 4.03it/s, grad_norm=2.31, loss_final=1.71, loss_mean=0.906, loss_mean_cls=1.17, proj_loss=-0.371][2026-03-22 14:26:51] Step: 2004, Training Logs: loss_final: 1.649928, loss_mean: 0.905982, proj_loss: -0.366775, loss_mean_cls: 1.110720, grad_norm: 1.895536 +Steps: 0%| | 2005/1000000 [08:14<68:30:14, 4.05it/s, grad_norm=1.9, loss_final=1.65, loss_mean=0.906, loss_mean_cls=1.11, proj_loss=-0.367][2026-03-22 14:26:51] Step: 2005, Training Logs: loss_final: 1.687928, loss_mean: 0.906238, proj_loss: -0.373386, loss_mean_cls: 1.155076, grad_norm: 2.328217 +Steps: 0%| | 2006/1000000 [08:14<68:13:33, 4.06it/s, grad_norm=2.33, loss_final=1.69, loss_mean=0.906, loss_mean_cls=1.16, proj_loss=-0.373][2026-03-22 14:26:51] Step: 2006, Training Logs: loss_final: 1.679448, loss_mean: 0.925927, proj_loss: -0.369341, loss_mean_cls: 1.122862, grad_norm: 2.873988 +Steps: 0%| | 2007/1000000 [08:14<68:03:12, 4.07it/s, grad_norm=2.87, loss_final=1.68, loss_mean=0.926, loss_mean_cls=1.12, proj_loss=-0.369][2026-03-22 14:26:51] Step: 2007, Training Logs: loss_final: 1.790534, loss_mean: 0.891627, proj_loss: -0.365372, loss_mean_cls: 1.264279, grad_norm: 3.626306 +Steps: 0%| | 2008/1000000 [08:15<67:56:26, 4.08it/s, grad_norm=3.63, loss_final=1.79, loss_mean=0.892, loss_mean_cls=1.26, proj_loss=-0.365][2026-03-22 14:26:52] Step: 2008, Training Logs: loss_final: 1.680488, loss_mean: 0.902983, proj_loss: -0.381310, loss_mean_cls: 1.158815, grad_norm: 2.257326 +Steps: 0%| | 2009/1000000 [08:15<68:02:04, 4.07it/s, grad_norm=2.26, loss_final=1.68, loss_mean=0.903, loss_mean_cls=1.16, proj_loss=-0.381][2026-03-22 14:26:52] Step: 2009, Training Logs: loss_final: 1.817103, loss_mean: 0.899746, proj_loss: -0.368839, 
loss_mean_cls: 1.286196, grad_norm: 1.581313
+[2026-03-22 14:26:52] Step: 2010, Training Logs: loss_final: 1.568243, loss_mean: 0.914643, proj_loss: -0.381672, loss_mean_cls: 1.035273, grad_norm: 2.082525
+[2026-03-22 14:26:52] Step: 2011, Training Logs: loss_final: 1.644325, loss_mean: 0.934529, proj_loss: -0.373284, loss_mean_cls: 1.083081, grad_norm: 1.363977
+[... progress-bar lines and per-step records for steps 2012-2257 omitted: each step emits the same record at roughly 4.0-4.1 it/s (occasional dips to ~2.9 it/s, ETA ~68 h for 1,000,000 steps), with loss_final in 1.47-1.91, loss_mean in 0.87-0.95, proj_loss in -0.39 to -0.36, loss_mean_cls in 0.94-1.37, and grad_norm in 1.2-7.6 ...]
+[2026-03-22 14:27:53] Step: 2258, Training Logs: 
loss_final: 1.663188, loss_mean: 0.920442, proj_loss: -0.385476, loss_mean_cls: 1.128222, grad_norm: 2.333840 +Steps: 0%| | 2259/1000000 [09:17<67:42:38, 4.09it/s, grad_norm=2.33, loss_final=1.66, loss_mean=0.92, loss_mean_cls=1.13, proj_loss=-0.385][2026-03-22 14:27:54] Step: 2259, Training Logs: loss_final: 1.582849, loss_mean: 0.911481, proj_loss: -0.386553, loss_mean_cls: 1.057922, grad_norm: 2.575856 +Steps: 0%| | 2260/1000000 [09:17<68:25:51, 4.05it/s, grad_norm=2.58, loss_final=1.58, loss_mean=0.911, loss_mean_cls=1.06, proj_loss=-0.387][2026-03-22 14:27:54] Step: 2260, Training Logs: loss_final: 1.666781, loss_mean: 0.908181, proj_loss: -0.382338, loss_mean_cls: 1.140939, grad_norm: 2.477918 +Steps: 0%| | 2261/1000000 [09:17<68:13:52, 4.06it/s, grad_norm=2.48, loss_final=1.67, loss_mean=0.908, loss_mean_cls=1.14, proj_loss=-0.382][2026-03-22 14:27:54] Step: 2261, Training Logs: loss_final: 1.585370, loss_mean: 0.922035, proj_loss: -0.384536, loss_mean_cls: 1.047870, grad_norm: 1.764007 +Steps: 0%| | 2262/1000000 [09:18<68:04:19, 4.07it/s, grad_norm=1.76, loss_final=1.59, loss_mean=0.922, loss_mean_cls=1.05, proj_loss=-0.385][2026-03-22 14:27:54] Step: 2262, Training Logs: loss_final: 1.698318, loss_mean: 0.901960, proj_loss: -0.382170, loss_mean_cls: 1.178529, grad_norm: 1.238159 +Steps: 0%| | 2263/1000000 [09:18<67:56:59, 4.08it/s, grad_norm=1.24, loss_final=1.7, loss_mean=0.902, loss_mean_cls=1.18, proj_loss=-0.382][2026-03-22 14:27:55] Step: 2263, Training Logs: loss_final: 1.620720, loss_mean: 0.900176, proj_loss: -0.388930, loss_mean_cls: 1.109475, grad_norm: 3.241017 +Steps: 0%| | 2264/1000000 [09:18<67:53:11, 4.08it/s, grad_norm=3.24, loss_final=1.62, loss_mean=0.9, loss_mean_cls=1.11, proj_loss=-0.389][2026-03-22 14:27:55] Step: 2264, Training Logs: loss_final: 1.747346, loss_mean: 0.882633, proj_loss: -0.372357, loss_mean_cls: 1.237071, grad_norm: 2.692417 +Steps: 0%| | 2265/1000000 [09:18<67:51:26, 4.08it/s, grad_norm=2.69, loss_final=1.75, loss_mean=0.883, loss_mean_cls=1.24, proj_loss=-0.372][2026-03-22 14:27:55] Step: 2265, Training Logs: loss_final: 1.727989, loss_mean: 0.887727, proj_loss: -0.374534, loss_mean_cls: 1.214796, grad_norm: 2.125393 +Steps: 0%| | 2266/1000000 [09:18<67:49:58, 4.09it/s, grad_norm=2.13, loss_final=1.73, loss_mean=0.888, loss_mean_cls=1.21, proj_loss=-0.375][2026-03-22 14:27:55] Step: 2266, Training Logs: loss_final: 1.584491, loss_mean: 0.910770, proj_loss: -0.387097, loss_mean_cls: 1.060818, grad_norm: 1.986650 +Steps: 0%| | 2267/1000000 [09:19<67:46:53, 4.09it/s, grad_norm=1.99, loss_final=1.58, loss_mean=0.911, loss_mean_cls=1.06, proj_loss=-0.387][2026-03-22 14:27:56] Step: 2267, Training Logs: loss_final: 1.711505, loss_mean: 0.898464, proj_loss: -0.382662, loss_mean_cls: 1.195704, grad_norm: 1.965559 +Steps: 0%| | 2268/1000000 [09:19<67:45:27, 4.09it/s, grad_norm=1.97, loss_final=1.71, loss_mean=0.898, loss_mean_cls=1.2, proj_loss=-0.383][2026-03-22 14:27:56] Step: 2268, Training Logs: loss_final: 1.670384, loss_mean: 0.905780, proj_loss: -0.382147, loss_mean_cls: 1.146751, grad_norm: 2.469500 +Steps: 0%| | 2269/1000000 [09:19<67:44:43, 4.09it/s, grad_norm=2.47, loss_final=1.67, loss_mean=0.906, loss_mean_cls=1.15, proj_loss=-0.382][2026-03-22 14:27:56] Step: 2269, Training Logs: loss_final: 1.514090, loss_mean: 0.919487, proj_loss: -0.385312, loss_mean_cls: 0.979915, grad_norm: 1.819770 +Steps: 0%| | 2270/1000000 [09:19<67:43:11, 4.09it/s, grad_norm=1.82, loss_final=1.51, loss_mean=0.919, loss_mean_cls=0.98, 
proj_loss=-0.385][2026-03-22 14:27:56] Step: 2270, Training Logs: loss_final: 1.631646, loss_mean: 0.912481, proj_loss: -0.382018, loss_mean_cls: 1.101183, grad_norm: 2.557100 +Steps: 0%| | 2271/1000000 [09:20<67:46:32, 4.09it/s, grad_norm=2.56, loss_final=1.63, loss_mean=0.912, loss_mean_cls=1.1, proj_loss=-0.382][2026-03-22 14:27:57] Step: 2271, Training Logs: loss_final: 1.674206, loss_mean: 0.910674, proj_loss: -0.376906, loss_mean_cls: 1.140438, grad_norm: 4.198688 +Steps: 0%| | 2272/1000000 [09:20<67:45:29, 4.09it/s, grad_norm=4.2, loss_final=1.67, loss_mean=0.911, loss_mean_cls=1.14, proj_loss=-0.377][2026-03-22 14:27:57] Step: 2272, Training Logs: loss_final: 1.666457, loss_mean: 0.914440, proj_loss: -0.387156, loss_mean_cls: 1.139173, grad_norm: 4.567917 +Steps: 0%| | 2273/1000000 [09:20<67:46:18, 4.09it/s, grad_norm=4.57, loss_final=1.67, loss_mean=0.914, loss_mean_cls=1.14, proj_loss=-0.387][2026-03-22 14:27:57] Step: 2273, Training Logs: loss_final: 1.655063, loss_mean: 0.886320, proj_loss: -0.381990, loss_mean_cls: 1.150733, grad_norm: 3.200830 +Steps: 0%| | 2274/1000000 [09:20<67:45:15, 4.09it/s, grad_norm=3.2, loss_final=1.66, loss_mean=0.886, loss_mean_cls=1.15, proj_loss=-0.382][2026-03-22 14:27:57] Step: 2274, Training Logs: loss_final: 1.569797, loss_mean: 0.922361, proj_loss: -0.392408, loss_mean_cls: 1.039843, grad_norm: 2.060638 +Steps: 0%| | 2275/1000000 [09:21<67:47:11, 4.09it/s, grad_norm=2.06, loss_final=1.57, loss_mean=0.922, loss_mean_cls=1.04, proj_loss=-0.392][2026-03-22 14:27:58] Step: 2275, Training Logs: loss_final: 1.549458, loss_mean: 0.921102, proj_loss: -0.386794, loss_mean_cls: 1.015150, grad_norm: 1.849244 +Steps: 0%| | 2276/1000000 [09:21<67:45:36, 4.09it/s, grad_norm=1.85, loss_final=1.55, loss_mean=0.921, loss_mean_cls=1.02, proj_loss=-0.387][2026-03-22 14:27:58] Step: 2276, Training Logs: loss_final: 1.701842, loss_mean: 0.907529, proj_loss: -0.379944, loss_mean_cls: 1.174256, grad_norm: 3.035604 +Steps: 0%| | 2277/1000000 [09:21<67:44:09, 4.09it/s, grad_norm=3.04, loss_final=1.7, loss_mean=0.908, loss_mean_cls=1.17, proj_loss=-0.38][2026-03-22 14:27:58] Step: 2277, Training Logs: loss_final: 1.671151, loss_mean: 0.891605, proj_loss: -0.380894, loss_mean_cls: 1.160440, grad_norm: 4.019522 +Steps: 0%| | 2278/1000000 [09:21<67:44:32, 4.09it/s, grad_norm=4.02, loss_final=1.67, loss_mean=0.892, loss_mean_cls=1.16, proj_loss=-0.381][2026-03-22 14:27:58] Step: 2278, Training Logs: loss_final: 1.623487, loss_mean: 0.897286, proj_loss: -0.381896, loss_mean_cls: 1.108097, grad_norm: 2.993692 +Steps: 0%| | 2279/1000000 [09:22<67:43:06, 4.09it/s, grad_norm=2.99, loss_final=1.62, loss_mean=0.897, loss_mean_cls=1.11, proj_loss=-0.382][2026-03-22 14:27:59] Step: 2279, Training Logs: loss_final: 1.690022, loss_mean: 0.897147, proj_loss: -0.387363, loss_mean_cls: 1.180238, grad_norm: 5.672110 +Steps: 0%| | 2280/1000000 [09:22<67:45:41, 4.09it/s, grad_norm=5.67, loss_final=1.69, loss_mean=0.897, loss_mean_cls=1.18, proj_loss=-0.387][2026-03-22 14:27:59] Step: 2280, Training Logs: loss_final: 1.576108, loss_mean: 0.936595, proj_loss: -0.385851, loss_mean_cls: 1.025363, grad_norm: 2.660637 +Steps: 0%| | 2281/1000000 [09:22<67:44:56, 4.09it/s, grad_norm=2.66, loss_final=1.58, loss_mean=0.937, loss_mean_cls=1.03, proj_loss=-0.386][2026-03-22 14:27:59] Step: 2281, Training Logs: loss_final: 1.606012, loss_mean: 0.918981, proj_loss: -0.390282, loss_mean_cls: 1.077313, grad_norm: 3.597172 +Steps: 0%| | 2282/1000000 [09:22<67:44:42, 4.09it/s, grad_norm=3.6, 
loss_final=1.61, loss_mean=0.919, loss_mean_cls=1.08, proj_loss=-0.39][2026-03-22 14:27:59] Step: 2282, Training Logs: loss_final: 1.757307, loss_mean: 0.890797, proj_loss: -0.375223, loss_mean_cls: 1.241732, grad_norm: 6.479735 +Steps: 0%| | 2283/1000000 [09:23<67:43:27, 4.09it/s, grad_norm=6.48, loss_final=1.76, loss_mean=0.891, loss_mean_cls=1.24, proj_loss=-0.375][2026-03-22 14:28:00] Step: 2283, Training Logs: loss_final: 1.572297, loss_mean: 0.910394, proj_loss: -0.396779, loss_mean_cls: 1.058682, grad_norm: 2.749410 +Steps: 0%| | 2284/1000000 [09:23<67:44:11, 4.09it/s, grad_norm=2.75, loss_final=1.57, loss_mean=0.91, loss_mean_cls=1.06, proj_loss=-0.397][2026-03-22 14:28:00] Step: 2284, Training Logs: loss_final: 1.669393, loss_mean: 0.928005, proj_loss: -0.385093, loss_mean_cls: 1.126481, grad_norm: 2.431445 +Steps: 0%| | 2285/1000000 [09:23<67:42:52, 4.09it/s, grad_norm=2.43, loss_final=1.67, loss_mean=0.928, loss_mean_cls=1.13, proj_loss=-0.385][2026-03-22 14:28:00] Step: 2285, Training Logs: loss_final: 1.584952, loss_mean: 0.936507, proj_loss: -0.393855, loss_mean_cls: 1.042300, grad_norm: 3.962764 +Steps: 0%| | 2286/1000000 [09:23<67:41:12, 4.09it/s, grad_norm=3.96, loss_final=1.58, loss_mean=0.937, loss_mean_cls=1.04, proj_loss=-0.394][2026-03-22 14:28:00] Step: 2286, Training Logs: loss_final: 1.657770, loss_mean: 0.927399, proj_loss: -0.387392, loss_mean_cls: 1.117763, grad_norm: 5.220345 +Steps: 0%| | 2287/1000000 [09:24<67:42:39, 4.09it/s, grad_norm=5.22, loss_final=1.66, loss_mean=0.927, loss_mean_cls=1.12, proj_loss=-0.387][2026-03-22 14:28:01] Step: 2287, Training Logs: loss_final: 1.731657, loss_mean: 0.912847, proj_loss: -0.376372, loss_mean_cls: 1.195182, grad_norm: 3.096843 +Steps: 0%| | 2288/1000000 [09:24<67:44:00, 4.09it/s, grad_norm=3.1, loss_final=1.73, loss_mean=0.913, loss_mean_cls=1.2, proj_loss=-0.376][2026-03-22 14:28:01] Step: 2288, Training Logs: loss_final: 1.557075, loss_mean: 0.915798, proj_loss: -0.394835, loss_mean_cls: 1.036111, grad_norm: 4.848583 +Steps: 0%| | 2289/1000000 [09:24<67:44:26, 4.09it/s, grad_norm=4.85, loss_final=1.56, loss_mean=0.916, loss_mean_cls=1.04, proj_loss=-0.395][2026-03-22 14:28:01] Step: 2289, Training Logs: loss_final: 1.579486, loss_mean: 0.923538, proj_loss: -0.387080, loss_mean_cls: 1.043028, grad_norm: 4.197685 +Steps: 0%| | 2290/1000000 [09:24<67:44:21, 4.09it/s, grad_norm=4.2, loss_final=1.58, loss_mean=0.924, loss_mean_cls=1.04, proj_loss=-0.387][2026-03-22 14:28:01] Step: 2290, Training Logs: loss_final: 1.522344, loss_mean: 0.913254, proj_loss: -0.395097, loss_mean_cls: 1.004187, grad_norm: 2.747334 +Steps: 0%| | 2291/1000000 [09:25<67:44:39, 4.09it/s, grad_norm=2.75, loss_final=1.52, loss_mean=0.913, loss_mean_cls=1, proj_loss=-0.395][2026-03-22 14:28:02] Step: 2291, Training Logs: loss_final: 1.619123, loss_mean: 0.919347, proj_loss: -0.387879, loss_mean_cls: 1.087654, grad_norm: 4.315301 +Steps: 0%| | 2292/1000000 [09:25<67:45:32, 4.09it/s, grad_norm=4.32, loss_final=1.62, loss_mean=0.919, loss_mean_cls=1.09, proj_loss=-0.388][2026-03-22 14:28:02] Step: 2292, Training Logs: loss_final: 1.669554, loss_mean: 0.918999, proj_loss: -0.381309, loss_mean_cls: 1.131864, grad_norm: 4.392646 +Steps: 0%| | 2293/1000000 [09:25<67:44:14, 4.09it/s, grad_norm=4.39, loss_final=1.67, loss_mean=0.919, loss_mean_cls=1.13, proj_loss=-0.381][2026-03-22 14:28:02] Step: 2293, Training Logs: loss_final: 1.694931, loss_mean: 0.915843, proj_loss: -0.379447, loss_mean_cls: 1.158535, grad_norm: 3.581850 +Steps: 0%| | 2294/1000000 
[09:25<67:43:03, 4.09it/s, grad_norm=3.58, loss_final=1.69, loss_mean=0.916, loss_mean_cls=1.16, proj_loss=-0.379][2026-03-22 14:28:02] Step: 2294, Training Logs: loss_final: 1.590949, loss_mean: 0.901988, proj_loss: -0.394592, loss_mean_cls: 1.083552, grad_norm: 2.222940 +Steps: 0%| | 2295/1000000 [09:26<67:42:41, 4.09it/s, grad_norm=2.22, loss_final=1.59, loss_mean=0.902, loss_mean_cls=1.08, proj_loss=-0.395][2026-03-22 14:28:03] Step: 2295, Training Logs: loss_final: 1.722435, loss_mean: 0.883247, proj_loss: -0.378447, loss_mean_cls: 1.217634, grad_norm: 2.892390 +Steps: 0%| | 2296/1000000 [09:26<67:44:53, 4.09it/s, grad_norm=2.89, loss_final=1.72, loss_mean=0.883, loss_mean_cls=1.22, proj_loss=-0.378][2026-03-22 14:28:03] Step: 2296, Training Logs: loss_final: 1.460481, loss_mean: 0.938337, proj_loss: -0.401124, loss_mean_cls: 0.923267, grad_norm: 2.599492 +Steps: 0%| | 2297/1000000 [09:26<67:44:18, 4.09it/s, grad_norm=2.6, loss_final=1.46, loss_mean=0.938, loss_mean_cls=0.923, proj_loss=-0.401][2026-03-22 14:28:03] Step: 2297, Training Logs: loss_final: 1.603526, loss_mean: 0.907548, proj_loss: -0.390631, loss_mean_cls: 1.086609, grad_norm: 2.059210 +Steps: 0%| | 2298/1000000 [09:26<67:43:26, 4.09it/s, grad_norm=2.06, loss_final=1.6, loss_mean=0.908, loss_mean_cls=1.09, proj_loss=-0.391][2026-03-22 14:28:03] Step: 2298, Training Logs: loss_final: 1.620669, loss_mean: 0.924862, proj_loss: -0.387969, loss_mean_cls: 1.083776, grad_norm: 2.496782 +Steps: 0%| | 2299/1000000 [09:27<67:44:26, 4.09it/s, grad_norm=2.5, loss_final=1.62, loss_mean=0.925, loss_mean_cls=1.08, proj_loss=-0.388][2026-03-22 14:28:04] Step: 2299, Training Logs: loss_final: 1.629084, loss_mean: 0.902164, proj_loss: -0.382058, loss_mean_cls: 1.108979, grad_norm: 3.501413 +Steps: 0%| | 2300/1000000 [09:27<67:45:28, 4.09it/s, grad_norm=3.5, loss_final=1.63, loss_mean=0.902, loss_mean_cls=1.11, proj_loss=-0.382][2026-03-22 14:28:04] Step: 2300, Training Logs: loss_final: 1.559266, loss_mean: 0.922031, proj_loss: -0.392255, loss_mean_cls: 1.029490, grad_norm: 4.253245 +Steps: 0%| | 2301/1000000 [09:27<67:44:46, 4.09it/s, grad_norm=4.25, loss_final=1.56, loss_mean=0.922, loss_mean_cls=1.03, proj_loss=-0.392][2026-03-22 14:28:04] Step: 2301, Training Logs: loss_final: 1.703511, loss_mean: 0.910932, proj_loss: -0.378990, loss_mean_cls: 1.171569, grad_norm: 2.218562 +Steps: 0%| | 2302/1000000 [09:27<67:43:13, 4.09it/s, grad_norm=2.22, loss_final=1.7, loss_mean=0.911, loss_mean_cls=1.17, proj_loss=-0.379][2026-03-22 14:28:04] Step: 2302, Training Logs: loss_final: 1.570122, loss_mean: 0.917312, proj_loss: -0.385888, loss_mean_cls: 1.038698, grad_norm: 2.760257 +Steps: 0%| | 2303/1000000 [09:28<67:41:29, 4.09it/s, grad_norm=2.76, loss_final=1.57, loss_mean=0.917, loss_mean_cls=1.04, proj_loss=-0.386][2026-03-22 14:28:04] Step: 2303, Training Logs: loss_final: 1.557180, loss_mean: 0.929333, proj_loss: -0.381101, loss_mean_cls: 1.008948, grad_norm: 3.298353 +Steps: 0%| | 2304/1000000 [09:28<67:42:16, 4.09it/s, grad_norm=3.3, loss_final=1.56, loss_mean=0.929, loss_mean_cls=1.01, proj_loss=-0.381][2026-03-22 14:28:05] Step: 2304, Training Logs: loss_final: 1.540137, loss_mean: 0.941813, proj_loss: -0.388195, loss_mean_cls: 0.986519, grad_norm: 2.261458 +Steps: 0%| | 2305/1000000 [09:28<67:42:48, 4.09it/s, grad_norm=2.26, loss_final=1.54, loss_mean=0.942, loss_mean_cls=0.987, proj_loss=-0.388][2026-03-22 14:28:05] Step: 2305, Training Logs: loss_final: 1.578400, loss_mean: 0.911899, proj_loss: -0.385343, loss_mean_cls: 1.051844, 
grad_norm: 1.661348 +Steps: 0%| | 2306/1000000 [09:28<67:43:28, 4.09it/s, grad_norm=1.66, loss_final=1.58, loss_mean=0.912, loss_mean_cls=1.05, proj_loss=-0.385][2026-03-22 14:28:05] Step: 2306, Training Logs: loss_final: 1.557003, loss_mean: 0.894409, proj_loss: -0.388635, loss_mean_cls: 1.051229, grad_norm: 2.655318 +Steps: 0%| | 2307/1000000 [09:29<67:42:19, 4.09it/s, grad_norm=2.66, loss_final=1.56, loss_mean=0.894, loss_mean_cls=1.05, proj_loss=-0.389][2026-03-22 14:28:05] Step: 2307, Training Logs: loss_final: 1.750784, loss_mean: 0.907507, proj_loss: -0.378621, loss_mean_cls: 1.221897, grad_norm: 1.501857 +Steps: 0%| | 2308/1000000 [09:29<67:43:04, 4.09it/s, grad_norm=1.5, loss_final=1.75, loss_mean=0.908, loss_mean_cls=1.22, proj_loss=-0.379][2026-03-22 14:28:06] Step: 2308, Training Logs: loss_final: 1.684878, loss_mean: 0.906496, proj_loss: -0.380677, loss_mean_cls: 1.159059, grad_norm: 2.748135 +Steps: 0%| | 2309/1000000 [09:29<67:42:26, 4.09it/s, grad_norm=2.75, loss_final=1.68, loss_mean=0.906, loss_mean_cls=1.16, proj_loss=-0.381][2026-03-22 14:28:06] Step: 2309, Training Logs: loss_final: 1.598864, loss_mean: 0.908881, proj_loss: -0.386942, loss_mean_cls: 1.076924, grad_norm: 3.137921 +Steps: 0%| | 2310/1000000 [09:29<67:42:03, 4.09it/s, grad_norm=3.14, loss_final=1.6, loss_mean=0.909, loss_mean_cls=1.08, proj_loss=-0.387][2026-03-22 14:28:06] Step: 2310, Training Logs: loss_final: 1.611457, loss_mean: 0.917591, proj_loss: -0.390005, loss_mean_cls: 1.083871, grad_norm: 3.228270 +Steps: 0%| | 2311/1000000 [09:29<68:15:21, 4.06it/s, grad_norm=3.23, loss_final=1.61, loss_mean=0.918, loss_mean_cls=1.08, proj_loss=-0.39][2026-03-22 14:28:06] Step: 2311, Training Logs: loss_final: 1.680414, loss_mean: 0.896087, proj_loss: -0.384887, loss_mean_cls: 1.169214, grad_norm: 2.062578 +Steps: 0%| | 2312/1000000 [09:30<68:03:51, 4.07it/s, grad_norm=2.06, loss_final=1.68, loss_mean=0.896, loss_mean_cls=1.17, proj_loss=-0.385][2026-03-22 14:28:07] Step: 2312, Training Logs: loss_final: 1.508396, loss_mean: 0.919397, proj_loss: -0.392876, loss_mean_cls: 0.981875, grad_norm: 2.893207 +Steps: 0%| | 2313/1000000 [09:30<67:58:46, 4.08it/s, grad_norm=2.89, loss_final=1.51, loss_mean=0.919, loss_mean_cls=0.982, proj_loss=-0.393][2026-03-22 14:28:07] Step: 2313, Training Logs: loss_final: 1.627699, loss_mean: 0.908490, proj_loss: -0.391390, loss_mean_cls: 1.110599, grad_norm: 4.280434 +Steps: 0%| | 2314/1000000 [09:30<67:52:49, 4.08it/s, grad_norm=4.28, loss_final=1.63, loss_mean=0.908, loss_mean_cls=1.11, proj_loss=-0.391][2026-03-22 14:28:07] Step: 2314, Training Logs: loss_final: 1.674996, loss_mean: 0.916622, proj_loss: -0.383912, loss_mean_cls: 1.142286, grad_norm: 4.432993 +Steps: 0%| | 2315/1000000 [09:30<67:48:05, 4.09it/s, grad_norm=4.43, loss_final=1.67, loss_mean=0.917, loss_mean_cls=1.14, proj_loss=-0.384][2026-03-22 14:28:07] Step: 2315, Training Logs: loss_final: 1.547137, loss_mean: 0.919370, proj_loss: -0.394346, loss_mean_cls: 1.022113, grad_norm: 1.510590 +Steps: 0%| | 2316/1000000 [09:31<67:47:21, 4.09it/s, grad_norm=1.51, loss_final=1.55, loss_mean=0.919, loss_mean_cls=1.02, proj_loss=-0.394][2026-03-22 14:28:08] Step: 2316, Training Logs: loss_final: 1.599247, loss_mean: 0.911605, proj_loss: -0.393248, loss_mean_cls: 1.080890, grad_norm: 2.021792 +Steps: 0%| | 2317/1000000 [09:31<67:44:42, 4.09it/s, grad_norm=2.02, loss_final=1.6, loss_mean=0.912, loss_mean_cls=1.08, proj_loss=-0.393][2026-03-22 14:28:08] Step: 2317, Training Logs: loss_final: 1.684928, loss_mean: 0.898400, 
proj_loss: -0.383273, loss_mean_cls: 1.169801, grad_norm: 3.245164 +Steps: 0%| | 2318/1000000 [09:31<67:43:20, 4.09it/s, grad_norm=3.25, loss_final=1.68, loss_mean=0.898, loss_mean_cls=1.17, proj_loss=-0.383][2026-03-22 14:28:08] Step: 2318, Training Logs: loss_final: 1.773042, loss_mean: 0.887824, proj_loss: -0.373001, loss_mean_cls: 1.258219, grad_norm: 1.563741 +Steps: 0%| | 2319/1000000 [09:31<67:41:48, 4.09it/s, grad_norm=1.56, loss_final=1.77, loss_mean=0.888, loss_mean_cls=1.26, proj_loss=-0.373][2026-03-22 14:28:08] Step: 2319, Training Logs: loss_final: 1.736305, loss_mean: 0.898413, proj_loss: -0.374833, loss_mean_cls: 1.212725, grad_norm: 1.902038 +Steps: 0%| | 2320/1000000 [09:32<67:43:57, 4.09it/s, grad_norm=1.9, loss_final=1.74, loss_mean=0.898, loss_mean_cls=1.21, proj_loss=-0.375][2026-03-22 14:28:09] Step: 2320, Training Logs: loss_final: 1.712367, loss_mean: 0.871298, proj_loss: -0.369168, loss_mean_cls: 1.210238, grad_norm: 2.597375 +Steps: 0%| | 2321/1000000 [09:32<67:42:45, 4.09it/s, grad_norm=2.6, loss_final=1.71, loss_mean=0.871, loss_mean_cls=1.21, proj_loss=-0.369][2026-03-22 14:28:09] Step: 2321, Training Logs: loss_final: 1.688531, loss_mean: 0.901240, proj_loss: -0.375287, loss_mean_cls: 1.162577, grad_norm: 2.065099 +Steps: 0%| | 2322/1000000 [09:32<67:41:00, 4.09it/s, grad_norm=2.07, loss_final=1.69, loss_mean=0.901, loss_mean_cls=1.16, proj_loss=-0.375][2026-03-22 14:28:09] Step: 2322, Training Logs: loss_final: 1.590237, loss_mean: 0.913675, proj_loss: -0.385608, loss_mean_cls: 1.062170, grad_norm: 2.005665 +Steps: 0%| | 2323/1000000 [09:32<67:40:11, 4.10it/s, grad_norm=2.01, loss_final=1.59, loss_mean=0.914, loss_mean_cls=1.06, proj_loss=-0.386][2026-03-22 14:28:09] Step: 2323, Training Logs: loss_final: 1.582793, loss_mean: 0.907715, proj_loss: -0.389893, loss_mean_cls: 1.064971, grad_norm: 3.469495 +Steps: 0%| | 2324/1000000 [09:33<67:41:18, 4.09it/s, grad_norm=3.47, loss_final=1.58, loss_mean=0.908, loss_mean_cls=1.06, proj_loss=-0.39][2026-03-22 14:28:10] Step: 2324, Training Logs: loss_final: 1.599871, loss_mean: 0.910816, proj_loss: -0.384782, loss_mean_cls: 1.073837, grad_norm: 5.729043 +Steps: 0%| | 2325/1000000 [09:33<67:40:28, 4.10it/s, grad_norm=5.73, loss_final=1.6, loss_mean=0.911, loss_mean_cls=1.07, proj_loss=-0.385][2026-03-22 14:28:10] Step: 2325, Training Logs: loss_final: 1.693417, loss_mean: 0.912593, proj_loss: -0.377449, loss_mean_cls: 1.158273, grad_norm: 3.204400 +Steps: 0%| | 2326/1000000 [09:33<67:40:51, 4.09it/s, grad_norm=3.2, loss_final=1.69, loss_mean=0.913, loss_mean_cls=1.16, proj_loss=-0.377][2026-03-22 14:28:10] Step: 2326, Training Logs: loss_final: 1.595000, loss_mean: 0.902500, proj_loss: -0.386126, loss_mean_cls: 1.078626, grad_norm: 2.694958 +Steps: 0%| | 2327/1000000 [09:33<67:42:14, 4.09it/s, grad_norm=2.69, loss_final=1.6, loss_mean=0.902, loss_mean_cls=1.08, proj_loss=-0.386][2026-03-22 14:28:10] Step: 2327, Training Logs: loss_final: 1.741978, loss_mean: 0.895132, proj_loss: -0.385277, loss_mean_cls: 1.232124, grad_norm: 7.486764 +Steps: 0%| | 2328/1000000 [09:34<67:42:21, 4.09it/s, grad_norm=7.49, loss_final=1.74, loss_mean=0.895, loss_mean_cls=1.23, proj_loss=-0.385][2026-03-22 14:28:11] Step: 2328, Training Logs: loss_final: 1.675408, loss_mean: 0.902658, proj_loss: -0.384806, loss_mean_cls: 1.157556, grad_norm: 4.553529 +Steps: 0%| | 2329/1000000 [09:34<67:41:22, 4.09it/s, grad_norm=4.55, loss_final=1.68, loss_mean=0.903, loss_mean_cls=1.16, proj_loss=-0.385][2026-03-22 14:28:11] Step: 2329, Training Logs: 
loss_final: 1.642900, loss_mean: 0.912381, proj_loss: -0.387775, loss_mean_cls: 1.118294, grad_norm: 5.333797 +Steps: 0%| | 2330/1000000 [09:34<67:41:05, 4.09it/s, grad_norm=5.33, loss_final=1.64, loss_mean=0.912, loss_mean_cls=1.12, proj_loss=-0.388][2026-03-22 14:28:11] Step: 2330, Training Logs: loss_final: 1.625640, loss_mean: 0.908670, proj_loss: -0.380840, loss_mean_cls: 1.097810, grad_norm: 1.440905 +Steps: 0%| | 2331/1000000 [09:34<67:39:29, 4.10it/s, grad_norm=1.44, loss_final=1.63, loss_mean=0.909, loss_mean_cls=1.1, proj_loss=-0.381][2026-03-22 14:28:11] Step: 2331, Training Logs: loss_final: 1.628349, loss_mean: 0.906430, proj_loss: -0.387608, loss_mean_cls: 1.109527, grad_norm: 5.549759 +Steps: 0%| | 2332/1000000 [09:35<67:42:50, 4.09it/s, grad_norm=5.55, loss_final=1.63, loss_mean=0.906, loss_mean_cls=1.11, proj_loss=-0.388][2026-03-22 14:28:12] Step: 2332, Training Logs: loss_final: 1.726693, loss_mean: 0.906020, proj_loss: -0.378828, loss_mean_cls: 1.199501, grad_norm: 4.992978 +Steps: 0%| | 2333/1000000 [09:35<70:25:04, 3.94it/s, grad_norm=4.99, loss_final=1.73, loss_mean=0.906, loss_mean_cls=1.2, proj_loss=-0.379][2026-03-22 14:28:12] Step: 2333, Training Logs: loss_final: 1.831652, loss_mean: 0.888643, proj_loss: -0.375369, loss_mean_cls: 1.318378, grad_norm: 2.041164 +Steps: 0%| | 2334/1000000 [09:35<69:42:20, 3.98it/s, grad_norm=2.04, loss_final=1.83, loss_mean=0.889, loss_mean_cls=1.32, proj_loss=-0.375][2026-03-22 14:28:12] Step: 2334, Training Logs: loss_final: 1.691328, loss_mean: 0.908153, proj_loss: -0.376146, loss_mean_cls: 1.159321, grad_norm: 2.031919 +Steps: 0%| | 2335/1000000 [09:35<69:05:41, 4.01it/s, grad_norm=2.03, loss_final=1.69, loss_mean=0.908, loss_mean_cls=1.16, proj_loss=-0.376][2026-03-22 14:28:12] Step: 2335, Training Logs: loss_final: 1.616839, loss_mean: 0.906230, proj_loss: -0.388853, loss_mean_cls: 1.099461, grad_norm: 3.760700 +Steps: 0%| | 2336/1000000 [09:36<68:39:46, 4.04it/s, grad_norm=3.76, loss_final=1.62, loss_mean=0.906, loss_mean_cls=1.1, proj_loss=-0.389][2026-03-22 14:28:13] Step: 2336, Training Logs: loss_final: 1.671410, loss_mean: 0.908623, proj_loss: -0.382763, loss_mean_cls: 1.145551, grad_norm: 3.509305 +Steps: 0%| | 2337/1000000 [09:36<68:21:53, 4.05it/s, grad_norm=3.51, loss_final=1.67, loss_mean=0.909, loss_mean_cls=1.15, proj_loss=-0.383][2026-03-22 14:28:13] Step: 2337, Training Logs: loss_final: 1.648028, loss_mean: 0.922267, proj_loss: -0.382492, loss_mean_cls: 1.108253, grad_norm: 2.282318 +Steps: 0%| | 2338/1000000 [09:36<68:09:09, 4.07it/s, grad_norm=2.28, loss_final=1.65, loss_mean=0.922, loss_mean_cls=1.11, proj_loss=-0.382][2026-03-22 14:28:13] Step: 2338, Training Logs: loss_final: 1.743208, loss_mean: 0.881482, proj_loss: -0.375038, loss_mean_cls: 1.236765, grad_norm: 3.792393 +Steps: 0%| | 2339/1000000 [09:36<67:59:20, 4.08it/s, grad_norm=3.79, loss_final=1.74, loss_mean=0.881, loss_mean_cls=1.24, proj_loss=-0.375][2026-03-22 14:28:13] Step: 2339, Training Logs: loss_final: 1.677480, loss_mean: 0.901511, proj_loss: -0.379204, loss_mean_cls: 1.155173, grad_norm: 1.999878 +Steps: 0%| | 2340/1000000 [09:37<67:53:40, 4.08it/s, grad_norm=2, loss_final=1.68, loss_mean=0.902, loss_mean_cls=1.16, proj_loss=-0.379][2026-03-22 14:28:14] Step: 2340, Training Logs: loss_final: 1.565257, loss_mean: 0.898456, proj_loss: -0.387944, loss_mean_cls: 1.054746, grad_norm: 3.281229 +Steps: 0%| | 2341/1000000 [09:37<67:49:58, 4.09it/s, grad_norm=3.28, loss_final=1.57, loss_mean=0.898, loss_mean_cls=1.05, 
proj_loss=-0.388][2026-03-22 14:28:14] Step: 2341, Training Logs: loss_final: 1.708417, loss_mean: 0.893277, proj_loss: -0.373920, loss_mean_cls: 1.189060, grad_norm: 1.512577 +Steps: 0%| | 2342/1000000 [09:37<67:46:59, 4.09it/s, grad_norm=1.51, loss_final=1.71, loss_mean=0.893, loss_mean_cls=1.19, proj_loss=-0.374][2026-03-22 14:28:14] Step: 2342, Training Logs: loss_final: 1.497375, loss_mean: 0.921408, proj_loss: -0.391538, loss_mean_cls: 0.967504, grad_norm: 1.751802 +Steps: 0%| | 2343/1000000 [09:37<67:44:26, 4.09it/s, grad_norm=1.75, loss_final=1.5, loss_mean=0.921, loss_mean_cls=0.968, proj_loss=-0.392][2026-03-22 14:28:14] Step: 2343, Training Logs: loss_final: 1.704354, loss_mean: 0.885983, proj_loss: -0.380224, loss_mean_cls: 1.198596, grad_norm: 2.717904 +Steps: 0%| | 2344/1000000 [09:38<67:45:19, 4.09it/s, grad_norm=2.72, loss_final=1.7, loss_mean=0.886, loss_mean_cls=1.2, proj_loss=-0.38][2026-03-22 14:28:15] Step: 2344, Training Logs: loss_final: 1.629661, loss_mean: 0.913637, proj_loss: -0.380178, loss_mean_cls: 1.096202, grad_norm: 2.074017 +Steps: 0%| | 2345/1000000 [09:38<67:43:48, 4.09it/s, grad_norm=2.07, loss_final=1.63, loss_mean=0.914, loss_mean_cls=1.1, proj_loss=-0.38][2026-03-22 14:28:15] Step: 2345, Training Logs: loss_final: 1.622995, loss_mean: 0.900373, proj_loss: -0.383228, loss_mean_cls: 1.105851, grad_norm: 5.733272 +Steps: 0%| | 2346/1000000 [09:38<67:43:46, 4.09it/s, grad_norm=5.73, loss_final=1.62, loss_mean=0.9, loss_mean_cls=1.11, proj_loss=-0.383][2026-03-22 14:28:15] Step: 2346, Training Logs: loss_final: 1.631514, loss_mean: 0.921702, proj_loss: -0.384567, loss_mean_cls: 1.094380, grad_norm: 4.518955 +Steps: 0%| | 2347/1000000 [09:38<67:42:28, 4.09it/s, grad_norm=4.52, loss_final=1.63, loss_mean=0.922, loss_mean_cls=1.09, proj_loss=-0.385][2026-03-22 14:28:15] Step: 2347, Training Logs: loss_final: 1.671813, loss_mean: 0.929412, proj_loss: -0.383081, loss_mean_cls: 1.125482, grad_norm: 2.628721 +Steps: 0%| | 2348/1000000 [09:39<67:41:53, 4.09it/s, grad_norm=2.63, loss_final=1.67, loss_mean=0.929, loss_mean_cls=1.13, proj_loss=-0.383][2026-03-22 14:28:16] Step: 2348, Training Logs: loss_final: 1.638458, loss_mean: 0.910097, proj_loss: -0.387143, loss_mean_cls: 1.115504, grad_norm: 1.657642 +Steps: 0%| | 2349/1000000 [09:39<67:41:04, 4.09it/s, grad_norm=1.66, loss_final=1.64, loss_mean=0.91, loss_mean_cls=1.12, proj_loss=-0.387][2026-03-22 14:28:16] Step: 2349, Training Logs: loss_final: 1.724969, loss_mean: 0.889055, proj_loss: -0.377408, loss_mean_cls: 1.213322, grad_norm: 2.591490 +Steps: 0%| | 2350/1000000 [09:39<67:41:11, 4.09it/s, grad_norm=2.59, loss_final=1.72, loss_mean=0.889, loss_mean_cls=1.21, proj_loss=-0.377][2026-03-22 14:28:16] Step: 2350, Training Logs: loss_final: 1.595299, loss_mean: 0.931599, proj_loss: -0.390638, loss_mean_cls: 1.054339, grad_norm: 2.741048 +Steps: 0%| | 2351/1000000 [09:39<67:38:39, 4.10it/s, grad_norm=2.74, loss_final=1.6, loss_mean=0.932, loss_mean_cls=1.05, proj_loss=-0.391][2026-03-22 14:28:16] Step: 2351, Training Logs: loss_final: 1.726551, loss_mean: 0.914002, proj_loss: -0.372594, loss_mean_cls: 1.185143, grad_norm: 5.801898 +Steps: 0%| | 2352/1000000 [09:40<67:40:09, 4.10it/s, grad_norm=5.8, loss_final=1.73, loss_mean=0.914, loss_mean_cls=1.19, proj_loss=-0.373][2026-03-22 14:28:16] Step: 2352, Training Logs: loss_final: 1.618204, loss_mean: 0.900877, proj_loss: -0.387192, loss_mean_cls: 1.104519, grad_norm: 3.188387 +Steps: 0%| | 2353/1000000 [09:40<67:40:56, 4.09it/s, grad_norm=3.19, loss_final=1.62, 
loss_mean=0.901, loss_mean_cls=1.1, proj_loss=-0.387][2026-03-22 14:28:17] Step: 2353, Training Logs: loss_final: 1.688520, loss_mean: 0.913199, proj_loss: -0.379568, loss_mean_cls: 1.154889, grad_norm: 2.149706 +Steps: 0%| | 2354/1000000 [09:40<67:41:14, 4.09it/s, grad_norm=2.15, loss_final=1.69, loss_mean=0.913, loss_mean_cls=1.15, proj_loss=-0.38][2026-03-22 14:28:17] Step: 2354, Training Logs: loss_final: 1.649840, loss_mean: 0.903497, proj_loss: -0.390673, loss_mean_cls: 1.137017, grad_norm: 2.665407 +Steps: 0%| | 2355/1000000 [09:40<67:40:23, 4.10it/s, grad_norm=2.67, loss_final=1.65, loss_mean=0.903, loss_mean_cls=1.14, proj_loss=-0.391][2026-03-22 14:28:17] Step: 2355, Training Logs: loss_final: 1.631788, loss_mean: 0.917099, proj_loss: -0.380447, loss_mean_cls: 1.095137, grad_norm: 2.110029 +Steps: 0%| | 2356/1000000 [09:41<67:40:00, 4.10it/s, grad_norm=2.11, loss_final=1.63, loss_mean=0.917, loss_mean_cls=1.1, proj_loss=-0.38][2026-03-22 14:28:17] Step: 2356, Training Logs: loss_final: 1.736019, loss_mean: 0.905727, proj_loss: -0.378829, loss_mean_cls: 1.209121, grad_norm: 2.219395 +Steps: 0%| | 2357/1000000 [09:41<67:41:32, 4.09it/s, grad_norm=2.22, loss_final=1.74, loss_mean=0.906, loss_mean_cls=1.21, proj_loss=-0.379][2026-03-22 14:28:18] Step: 2357, Training Logs: loss_final: 1.719589, loss_mean: 0.912745, proj_loss: -0.378181, loss_mean_cls: 1.185025, grad_norm: 3.841666 +Steps: 0%| | 2358/1000000 [09:41<67:41:26, 4.09it/s, grad_norm=3.84, loss_final=1.72, loss_mean=0.913, loss_mean_cls=1.19, proj_loss=-0.378][2026-03-22 14:28:18] Step: 2358, Training Logs: loss_final: 1.639682, loss_mean: 0.908085, proj_loss: -0.385325, loss_mean_cls: 1.116922, grad_norm: 2.067041 +Steps: 0%| | 2359/1000000 [09:41<67:41:05, 4.09it/s, grad_norm=2.07, loss_final=1.64, loss_mean=0.908, loss_mean_cls=1.12, proj_loss=-0.385][2026-03-22 14:28:18] Step: 2359, Training Logs: loss_final: 1.681077, loss_mean: 0.902621, proj_loss: -0.373490, loss_mean_cls: 1.151947, grad_norm: 2.597519 +Steps: 0%| | 2360/1000000 [09:41<67:41:55, 4.09it/s, grad_norm=2.6, loss_final=1.68, loss_mean=0.903, loss_mean_cls=1.15, proj_loss=-0.373][2026-03-22 14:28:18] Step: 2360, Training Logs: loss_final: 1.671836, loss_mean: 0.909386, proj_loss: -0.388259, loss_mean_cls: 1.150709, grad_norm: 5.562263 +Steps: 0%| | 2361/1000000 [09:42<67:43:33, 4.09it/s, grad_norm=5.56, loss_final=1.67, loss_mean=0.909, loss_mean_cls=1.15, proj_loss=-0.388][2026-03-22 14:28:19] Step: 2361, Training Logs: loss_final: 1.703080, loss_mean: 0.902395, proj_loss: -0.375094, loss_mean_cls: 1.175778, grad_norm: 3.804608 +Steps: 0%| | 2362/1000000 [09:42<67:42:26, 4.09it/s, grad_norm=3.8, loss_final=1.7, loss_mean=0.902, loss_mean_cls=1.18, proj_loss=-0.375][2026-03-22 14:28:19] Step: 2362, Training Logs: loss_final: 1.637842, loss_mean: 0.921770, proj_loss: -0.379187, loss_mean_cls: 1.095259, grad_norm: 3.062627 +Steps: 0%| | 2363/1000000 [09:42<67:41:35, 4.09it/s, grad_norm=3.06, loss_final=1.64, loss_mean=0.922, loss_mean_cls=1.1, proj_loss=-0.379][2026-03-22 14:28:19] Step: 2363, Training Logs: loss_final: 1.554004, loss_mean: 0.918141, proj_loss: -0.391968, loss_mean_cls: 1.027831, grad_norm: 4.542330 +Steps: 0%| | 2364/1000000 [09:42<67:42:54, 4.09it/s, grad_norm=4.54, loss_final=1.55, loss_mean=0.918, loss_mean_cls=1.03, proj_loss=-0.392][2026-03-22 14:28:19] Step: 2364, Training Logs: loss_final: 1.644528, loss_mean: 0.911794, proj_loss: -0.388394, loss_mean_cls: 1.121128, grad_norm: 3.010797 +Steps: 0%| | 2365/1000000 [09:43<67:42:44, 
4.09it/s, grad_norm=3.01, loss_final=1.64, loss_mean=0.912, loss_mean_cls=1.12, proj_loss=-0.388][2026-03-22 14:28:20] Step: 2365, Training Logs: loss_final: 1.689116, loss_mean: 0.918057, proj_loss: -0.387216, loss_mean_cls: 1.158275, grad_norm: 2.393630 +Steps: 0%| | 2366/1000000 [09:43<67:43:28, 4.09it/s, grad_norm=2.39, loss_final=1.69, loss_mean=0.918, loss_mean_cls=1.16, proj_loss=-0.387][2026-03-22 14:28:20] Step: 2366, Training Logs: loss_final: 1.684295, loss_mean: 0.902656, proj_loss: -0.387441, loss_mean_cls: 1.169080, grad_norm: 4.259973 +Steps: 0%| | 2367/1000000 [09:43<67:41:52, 4.09it/s, grad_norm=4.26, loss_final=1.68, loss_mean=0.903, loss_mean_cls=1.17, proj_loss=-0.387][2026-03-22 14:28:20] Step: 2367, Training Logs: loss_final: 1.760578, loss_mean: 0.893999, proj_loss: -0.374846, loss_mean_cls: 1.241426, grad_norm: 2.023045 +Steps: 0%| | 2368/1000000 [09:43<67:43:13, 4.09it/s, grad_norm=2.02, loss_final=1.76, loss_mean=0.894, loss_mean_cls=1.24, proj_loss=-0.375][2026-03-22 14:28:20] Step: 2368, Training Logs: loss_final: 1.666447, loss_mean: 0.918195, proj_loss: -0.379028, loss_mean_cls: 1.127280, grad_norm: 1.811153 +Steps: 0%| | 2369/1000000 [09:44<67:43:19, 4.09it/s, grad_norm=1.81, loss_final=1.67, loss_mean=0.918, loss_mean_cls=1.13, proj_loss=-0.379][2026-03-22 14:28:21] Step: 2369, Training Logs: loss_final: 1.659033, loss_mean: 0.908145, proj_loss: -0.381053, loss_mean_cls: 1.131942, grad_norm: 3.209867 +Steps: 0%| | 2370/1000000 [09:44<67:42:03, 4.09it/s, grad_norm=3.21, loss_final=1.66, loss_mean=0.908, loss_mean_cls=1.13, proj_loss=-0.381][2026-03-22 14:28:21] Step: 2370, Training Logs: loss_final: 1.648995, loss_mean: 0.910630, proj_loss: -0.381086, loss_mean_cls: 1.119451, grad_norm: 2.651581 +Steps: 0%| | 2371/1000000 [09:44<67:42:29, 4.09it/s, grad_norm=2.65, loss_final=1.65, loss_mean=0.911, loss_mean_cls=1.12, proj_loss=-0.381][2026-03-22 14:28:21] Step: 2371, Training Logs: loss_final: 1.708061, loss_mean: 0.895041, proj_loss: -0.383377, loss_mean_cls: 1.196398, grad_norm: 2.320032 +Steps: 0%| | 2372/1000000 [09:44<67:43:18, 4.09it/s, grad_norm=2.32, loss_final=1.71, loss_mean=0.895, loss_mean_cls=1.2, proj_loss=-0.383][2026-03-22 14:28:21] Step: 2372, Training Logs: loss_final: 1.533890, loss_mean: 0.910589, proj_loss: -0.402817, loss_mean_cls: 1.026118, grad_norm: 1.747440 +Steps: 0%| | 2373/1000000 [09:45<67:42:00, 4.09it/s, grad_norm=1.75, loss_final=1.53, loss_mean=0.911, loss_mean_cls=1.03, proj_loss=-0.403][2026-03-22 14:28:22] Step: 2373, Training Logs: loss_final: 1.595259, loss_mean: 0.904282, proj_loss: -0.394602, loss_mean_cls: 1.085579, grad_norm: 4.018057 +Steps: 0%| | 2374/1000000 [09:45<67:43:31, 4.09it/s, grad_norm=4.02, loss_final=1.6, loss_mean=0.904, loss_mean_cls=1.09, proj_loss=-0.395][2026-03-22 14:28:22] Step: 2374, Training Logs: loss_final: 1.846074, loss_mean: 0.883488, proj_loss: -0.368791, loss_mean_cls: 1.331378, grad_norm: 2.250490 +Steps: 0%| | 2375/1000000 [09:45<67:43:27, 4.09it/s, grad_norm=2.25, loss_final=1.85, loss_mean=0.883, loss_mean_cls=1.33, proj_loss=-0.369][2026-03-22 14:28:22] Step: 2375, Training Logs: loss_final: 1.680123, loss_mean: 0.903162, proj_loss: -0.388945, loss_mean_cls: 1.165906, grad_norm: 6.135964 +Steps: 0%| | 2376/1000000 [09:45<67:43:34, 4.09it/s, grad_norm=6.14, loss_final=1.68, loss_mean=0.903, loss_mean_cls=1.17, proj_loss=-0.389][2026-03-22 14:28:22] Step: 2376, Training Logs: loss_final: 1.509517, loss_mean: 0.932479, proj_loss: -0.391231, loss_mean_cls: 0.968270, grad_norm: 3.432312 
+Steps: 0%| | 2377/1000000 [09:46<67:42:16, 4.09it/s, grad_norm=3.43, loss_final=1.51, loss_mean=0.932, loss_mean_cls=0.968, proj_loss=-0.391][2026-03-22 14:28:23] Step: 2377, Training Logs: loss_final: 1.562109, loss_mean: 0.922912, proj_loss: -0.388718, loss_mean_cls: 1.027914, grad_norm: 2.447275 +Steps: 0%| | 2378/1000000 [09:46<67:43:44, 4.09it/s, grad_norm=2.45, loss_final=1.56, loss_mean=0.923, loss_mean_cls=1.03, proj_loss=-0.389][2026-03-22 14:28:23] Step: 2378, Training Logs: loss_final: 1.678486, loss_mean: 0.904982, proj_loss: -0.382138, loss_mean_cls: 1.155641, grad_norm: 2.066078 +Steps: 0%| | 2379/1000000 [09:46<67:41:13, 4.09it/s, grad_norm=2.07, loss_final=1.68, loss_mean=0.905, loss_mean_cls=1.16, proj_loss=-0.382][2026-03-22 14:28:23] Step: 2379, Training Logs: loss_final: 1.687328, loss_mean: 0.903206, proj_loss: -0.371100, loss_mean_cls: 1.155222, grad_norm: 1.775342 +Steps: 0%| | 2380/1000000 [09:46<67:44:25, 4.09it/s, grad_norm=1.78, loss_final=1.69, loss_mean=0.903, loss_mean_cls=1.16, proj_loss=-0.371][2026-03-22 14:28:23] Step: 2380, Training Logs: loss_final: 1.701119, loss_mean: 0.903247, proj_loss: -0.383879, loss_mean_cls: 1.181751, grad_norm: 3.118103 +Steps: 0%| | 2381/1000000 [09:47<67:43:22, 4.09it/s, grad_norm=3.12, loss_final=1.7, loss_mean=0.903, loss_mean_cls=1.18, proj_loss=-0.384][2026-03-22 14:28:24] Step: 2381, Training Logs: loss_final: 1.587843, loss_mean: 0.920478, proj_loss: -0.388433, loss_mean_cls: 1.055798, grad_norm: 3.180514 +Steps: 0%| | 2382/1000000 [09:47<69:28:34, 3.99it/s, grad_norm=3.18, loss_final=1.59, loss_mean=0.92, loss_mean_cls=1.06, proj_loss=-0.388][2026-03-22 14:28:24] Step: 2382, Training Logs: loss_final: 1.566630, loss_mean: 0.902914, proj_loss: -0.391584, loss_mean_cls: 1.055300, grad_norm: 2.374216 +Steps: 0%| | 2383/1000000 [09:47<69:07:54, 4.01it/s, grad_norm=2.37, loss_final=1.57, loss_mean=0.903, loss_mean_cls=1.06, proj_loss=-0.392][2026-03-22 14:28:24] Step: 2383, Training Logs: loss_final: 1.623546, loss_mean: 0.894984, proj_loss: -0.388823, loss_mean_cls: 1.117385, grad_norm: 3.296912 +Steps: 0%| | 2384/1000000 [09:47<68:44:18, 4.03it/s, grad_norm=3.3, loss_final=1.62, loss_mean=0.895, loss_mean_cls=1.12, proj_loss=-0.389][2026-03-22 14:28:24] Step: 2384, Training Logs: loss_final: 1.590265, loss_mean: 0.904433, proj_loss: -0.388383, loss_mean_cls: 1.074215, grad_norm: 1.312566 +Steps: 0%| | 2385/1000000 [09:48<68:24:55, 4.05it/s, grad_norm=1.31, loss_final=1.59, loss_mean=0.904, loss_mean_cls=1.07, proj_loss=-0.388][2026-03-22 14:28:25] Step: 2385, Training Logs: loss_final: 1.734797, loss_mean: 0.884944, proj_loss: -0.377235, loss_mean_cls: 1.227088, grad_norm: 5.580904 +Steps: 0%| | 2386/1000000 [09:48<68:13:31, 4.06it/s, grad_norm=5.58, loss_final=1.73, loss_mean=0.885, loss_mean_cls=1.23, proj_loss=-0.377][2026-03-22 14:28:25] Step: 2386, Training Logs: loss_final: 1.573033, loss_mean: 0.921036, proj_loss: -0.394671, loss_mean_cls: 1.046668, grad_norm: 2.664469 +Steps: 0%| | 2387/1000000 [09:48<68:04:14, 4.07it/s, grad_norm=2.66, loss_final=1.57, loss_mean=0.921, loss_mean_cls=1.05, proj_loss=-0.395][2026-03-22 14:28:25] Step: 2387, Training Logs: loss_final: 1.703816, loss_mean: 0.898644, proj_loss: -0.379502, loss_mean_cls: 1.184674, grad_norm: 2.016224 +Steps: 0%| | 2388/1000000 [09:48<67:58:26, 4.08it/s, grad_norm=2.02, loss_final=1.7, loss_mean=0.899, loss_mean_cls=1.18, proj_loss=-0.38][2026-03-22 14:28:25] Step: 2388, Training Logs: loss_final: 1.697865, loss_mean: 0.905227, proj_loss: -0.381411, 
loss_mean_cls: 1.174049, grad_norm: 5.611855 +Steps: 0%| | 2389/1000000 [09:49<67:53:55, 4.08it/s, grad_norm=5.61, loss_final=1.7, loss_mean=0.905, loss_mean_cls=1.17, proj_loss=-0.381][2026-03-22 14:28:26] Step: 2389, Training Logs: loss_final: 1.599943, loss_mean: 0.911539, proj_loss: -0.382582, loss_mean_cls: 1.070986, grad_norm: 2.819825 +Steps: 0%| | 2390/1000000 [09:49<67:49:48, 4.09it/s, grad_norm=2.82, loss_final=1.6, loss_mean=0.912, loss_mean_cls=1.07, proj_loss=-0.383][2026-03-22 14:28:26] Step: 2390, Training Logs: loss_final: 1.636003, loss_mean: 0.917617, proj_loss: -0.379204, loss_mean_cls: 1.097589, grad_norm: 1.688720 +Steps: 0%| | 2391/1000000 [09:49<67:46:45, 4.09it/s, grad_norm=1.69, loss_final=1.64, loss_mean=0.918, loss_mean_cls=1.1, proj_loss=-0.379][2026-03-22 14:28:26] Step: 2391, Training Logs: loss_final: 1.634804, loss_mean: 0.904248, proj_loss: -0.390103, loss_mean_cls: 1.120660, grad_norm: 4.418167 +Steps: 0%| | 2392/1000000 [09:49<67:44:52, 4.09it/s, grad_norm=4.42, loss_final=1.63, loss_mean=0.904, loss_mean_cls=1.12, proj_loss=-0.39][2026-03-22 14:28:26] Step: 2392, Training Logs: loss_final: 1.532171, loss_mean: 0.912205, proj_loss: -0.393819, loss_mean_cls: 1.013786, grad_norm: 4.641581 +Steps: 0%| | 2393/1000000 [09:50<67:43:55, 4.09it/s, grad_norm=4.64, loss_final=1.53, loss_mean=0.912, loss_mean_cls=1.01, proj_loss=-0.394][2026-03-22 14:28:27] Step: 2393, Training Logs: loss_final: 1.684926, loss_mean: 0.910570, proj_loss: -0.385265, loss_mean_cls: 1.159621, grad_norm: 5.594395 +Steps: 0%| | 2394/1000000 [09:50<67:45:08, 4.09it/s, grad_norm=5.59, loss_final=1.68, loss_mean=0.911, loss_mean_cls=1.16, proj_loss=-0.385][2026-03-22 14:28:27] Step: 2394, Training Logs: loss_final: 1.724207, loss_mean: 0.899280, proj_loss: -0.373815, loss_mean_cls: 1.198742, grad_norm: 3.270754 +Steps: 0%| | 2395/1000000 [09:50<67:44:10, 4.09it/s, grad_norm=3.27, loss_final=1.72, loss_mean=0.899, loss_mean_cls=1.2, proj_loss=-0.374][2026-03-22 14:28:27] Step: 2395, Training Logs: loss_final: 1.701502, loss_mean: 0.890405, proj_loss: -0.381552, loss_mean_cls: 1.192650, grad_norm: 1.229367 +Steps: 0%| | 2396/1000000 [09:50<67:45:07, 4.09it/s, grad_norm=1.23, loss_final=1.7, loss_mean=0.89, loss_mean_cls=1.19, proj_loss=-0.382][2026-03-22 14:28:27] Step: 2396, Training Logs: loss_final: 1.538924, loss_mean: 0.921882, proj_loss: -0.390753, loss_mean_cls: 1.007795, grad_norm: 3.209356 +Steps: 0%| | 2397/1000000 [09:51<67:45:02, 4.09it/s, grad_norm=3.21, loss_final=1.54, loss_mean=0.922, loss_mean_cls=1.01, proj_loss=-0.391][2026-03-22 14:28:28] Step: 2397, Training Logs: loss_final: 1.693634, loss_mean: 0.912277, proj_loss: -0.383054, loss_mean_cls: 1.164410, grad_norm: 3.382766 +Steps: 0%| | 2398/1000000 [09:51<67:43:41, 4.09it/s, grad_norm=3.38, loss_final=1.69, loss_mean=0.912, loss_mean_cls=1.16, proj_loss=-0.383][2026-03-22 14:28:28] Step: 2398, Training Logs: loss_final: 1.566162, loss_mean: 0.906040, proj_loss: -0.391341, loss_mean_cls: 1.051463, grad_norm: 2.050697 +Steps: 0%| | 2399/1000000 [09:51<67:42:25, 4.09it/s, grad_norm=2.05, loss_final=1.57, loss_mean=0.906, loss_mean_cls=1.05, proj_loss=-0.391][2026-03-22 14:28:28] Step: 2399, Training Logs: loss_final: 1.707364, loss_mean: 0.901373, proj_loss: -0.382112, loss_mean_cls: 1.188103, grad_norm: 4.477530 +Steps: 0%| | 2400/1000000 [09:51<67:45:22, 4.09it/s, grad_norm=4.48, loss_final=1.71, loss_mean=0.901, loss_mean_cls=1.19, proj_loss=-0.382][2026-03-22 14:28:28] Step: 2400, Training Logs: loss_final: 1.600446, 
loss_mean: 0.892489, proj_loss: -0.390260, loss_mean_cls: 1.098217, grad_norm: 4.170170 +Steps: 0%| | 2401/1000000 [09:52<67:42:30, 4.09it/s, grad_norm=4.17, loss_final=1.6, loss_mean=0.892, loss_mean_cls=1.1, proj_loss=-0.39][2026-03-22 14:28:28] Step: 2401, Training Logs: loss_final: 1.593149, loss_mean: 0.916980, proj_loss: -0.385216, loss_mean_cls: 1.061385, grad_norm: 5.518960 +Steps: 0%| | 2402/1000000 [09:52<67:42:27, 4.09it/s, grad_norm=5.52, loss_final=1.59, loss_mean=0.917, loss_mean_cls=1.06, proj_loss=-0.385][2026-03-22 14:28:29] Step: 2402, Training Logs: loss_final: 1.705055, loss_mean: 0.905038, proj_loss: -0.376677, loss_mean_cls: 1.176694, grad_norm: 3.936829 +Steps: 0%| | 2403/1000000 [09:52<67:42:26, 4.09it/s, grad_norm=3.94, loss_final=1.71, loss_mean=0.905, loss_mean_cls=1.18, proj_loss=-0.377][2026-03-22 14:28:29] Step: 2403, Training Logs: loss_final: 1.542416, loss_mean: 0.928020, proj_loss: -0.391841, loss_mean_cls: 1.006238, grad_norm: 1.902991 +Steps: 0%| | 2404/1000000 [09:52<67:43:38, 4.09it/s, grad_norm=1.9, loss_final=1.54, loss_mean=0.928, loss_mean_cls=1.01, proj_loss=-0.392][2026-03-22 14:28:29] Step: 2404, Training Logs: loss_final: 1.683499, loss_mean: 0.908025, proj_loss: -0.382930, loss_mean_cls: 1.158404, grad_norm: 4.932405 +Steps: 0%| | 2405/1000000 [09:53<67:42:21, 4.09it/s, grad_norm=4.93, loss_final=1.68, loss_mean=0.908, loss_mean_cls=1.16, proj_loss=-0.383][2026-03-22 14:28:29] Step: 2405, Training Logs: loss_final: 1.705906, loss_mean: 0.896628, proj_loss: -0.374536, loss_mean_cls: 1.183815, grad_norm: 3.502327 +Steps: 0%| | 2406/1000000 [09:53<67:42:01, 4.09it/s, grad_norm=3.5, loss_final=1.71, loss_mean=0.897, loss_mean_cls=1.18, proj_loss=-0.375][2026-03-22 14:28:30] Step: 2406, Training Logs: loss_final: 1.729737, loss_mean: 0.911803, proj_loss: -0.379459, loss_mean_cls: 1.197393, grad_norm: 2.425126 +Steps: 0%| | 2407/1000000 [09:53<67:41:04, 4.09it/s, grad_norm=2.43, loss_final=1.73, loss_mean=0.912, loss_mean_cls=1.2, proj_loss=-0.379][2026-03-22 14:28:30] Step: 2407, Training Logs: loss_final: 1.751036, loss_mean: 0.880290, proj_loss: -0.380292, loss_mean_cls: 1.251038, grad_norm: 4.366359 +Steps: 0%| | 2408/1000000 [09:53<67:44:20, 4.09it/s, grad_norm=4.37, loss_final=1.75, loss_mean=0.88, loss_mean_cls=1.25, proj_loss=-0.38][2026-03-22 14:28:30] Step: 2408, Training Logs: loss_final: 1.615691, loss_mean: 0.925350, proj_loss: -0.388228, loss_mean_cls: 1.078569, grad_norm: 3.056752 +Steps: 0%| | 2409/1000000 [09:53<67:43:05, 4.09it/s, grad_norm=3.06, loss_final=1.62, loss_mean=0.925, loss_mean_cls=1.08, proj_loss=-0.388][2026-03-22 14:28:30] Step: 2409, Training Logs: loss_final: 1.570615, loss_mean: 0.919396, proj_loss: -0.394649, loss_mean_cls: 1.045867, grad_norm: 1.405719 +Steps: 0%| | 2410/1000000 [09:54<67:41:11, 4.09it/s, grad_norm=1.41, loss_final=1.57, loss_mean=0.919, loss_mean_cls=1.05, proj_loss=-0.395][2026-03-22 14:28:31] Step: 2410, Training Logs: loss_final: 1.749915, loss_mean: 0.908057, proj_loss: -0.373037, loss_mean_cls: 1.214894, grad_norm: 3.511568 +Steps: 0%| | 2411/1000000 [09:54<67:39:31, 4.10it/s, grad_norm=3.51, loss_final=1.75, loss_mean=0.908, loss_mean_cls=1.21, proj_loss=-0.373][2026-03-22 14:28:31] Step: 2411, Training Logs: loss_final: 1.613419, loss_mean: 0.917413, proj_loss: -0.386396, loss_mean_cls: 1.082402, grad_norm: 3.985560 +Steps: 0%| | 2412/1000000 [09:54<67:40:04, 4.10it/s, grad_norm=3.99, loss_final=1.61, loss_mean=0.917, loss_mean_cls=1.08, proj_loss=-0.386][2026-03-22 14:28:31] Step: 2412, 
During training, each step logs the total objective (`loss_final`), its denoising (`loss_mean`), projection (`proj_loss`), and class-token (`loss_mean_cls`) components, together with the gradient norm and a tqdm progress bar (roughly 4.1 it/s toward the 1,000,000-step target). A representative excerpt:

```
[2026-03-22 14:28:31] Step: 2413, Training Logs: loss_final: 1.542189, loss_mean: 0.935320, proj_loss: -0.388383, loss_mean_cls: 0.995252, grad_norm: 2.205392
[2026-03-22 14:28:53] Step: 2500, Training Logs: loss_final: 1.704828, loss_mean: 0.911530, proj_loss: -0.380756, loss_mean_cls: 1.174055, grad_norm: 5.607831
Steps:   0%|          | 2500/1000000 [10:16<67:44:21, 4.09it/s, grad_norm=5.61, loss_final=1.7, loss_mean=0.912, loss_mean_cls=1.17, proj_loss=-0.381]
```
loss_final=1.69, loss_mean=0.861, loss_mean_cls=1.21, proj_loss=-0.382][2026-03-22 14:29:32] Step: 2661, Training Logs: loss_final: 1.778926, loss_mean: 0.880548, proj_loss: -0.376848, loss_mean_cls: 1.275227, grad_norm: 2.595365 +Steps: 0%| | 2662/1000000 [10:55<67:46:37, 4.09it/s, grad_norm=2.6, loss_final=1.78, loss_mean=0.881, loss_mean_cls=1.28, proj_loss=-0.377][2026-03-22 14:29:32] Step: 2662, Training Logs: loss_final: 1.660273, loss_mean: 0.903131, proj_loss: -0.390438, loss_mean_cls: 1.147580, grad_norm: 2.589584 +Steps: 0%| | 2663/1000000 [10:56<67:45:48, 4.09it/s, grad_norm=2.59, loss_final=1.66, loss_mean=0.903, loss_mean_cls=1.15, proj_loss=-0.39][2026-03-22 14:29:33] Step: 2663, Training Logs: loss_final: 1.637758, loss_mean: 0.907837, proj_loss: -0.390038, loss_mean_cls: 1.119960, grad_norm: 1.360816 +Steps: 0%| | 2664/1000000 [10:56<67:45:03, 4.09it/s, grad_norm=1.36, loss_final=1.64, loss_mean=0.908, loss_mean_cls=1.12, proj_loss=-0.39][2026-03-22 14:29:33] Step: 2664, Training Logs: loss_final: 1.512346, loss_mean: 0.913028, proj_loss: -0.397870, loss_mean_cls: 0.997188, grad_norm: 4.627115 +Steps: 0%| | 2665/1000000 [10:56<67:44:31, 4.09it/s, grad_norm=4.63, loss_final=1.51, loss_mean=0.913, loss_mean_cls=0.997, proj_loss=-0.398][2026-03-22 14:29:33] Step: 2665, Training Logs: loss_final: 1.729152, loss_mean: 0.905775, proj_loss: -0.388654, loss_mean_cls: 1.212031, grad_norm: 6.874381 +Steps: 0%| | 2666/1000000 [10:56<67:43:46, 4.09it/s, grad_norm=6.87, loss_final=1.73, loss_mean=0.906, loss_mean_cls=1.21, proj_loss=-0.389][2026-03-22 14:29:33] Step: 2666, Training Logs: loss_final: 1.571703, loss_mean: 0.908663, proj_loss: -0.394931, loss_mean_cls: 1.057971, grad_norm: 2.791098 +Steps: 0%| | 2667/1000000 [10:57<67:41:51, 4.09it/s, grad_norm=2.79, loss_final=1.57, loss_mean=0.909, loss_mean_cls=1.06, proj_loss=-0.395][2026-03-22 14:29:34] Step: 2667, Training Logs: loss_final: 1.619925, loss_mean: 0.893853, proj_loss: -0.391293, loss_mean_cls: 1.117365, grad_norm: 4.423941 +Steps: 0%| | 2668/1000000 [10:57<67:44:38, 4.09it/s, grad_norm=4.42, loss_final=1.62, loss_mean=0.894, loss_mean_cls=1.12, proj_loss=-0.391][2026-03-22 14:29:34] Step: 2668, Training Logs: loss_final: 1.638667, loss_mean: 0.878830, proj_loss: -0.393773, loss_mean_cls: 1.153610, grad_norm: 2.957823 +Steps: 0%| | 2669/1000000 [10:57<67:46:31, 4.09it/s, grad_norm=2.96, loss_final=1.64, loss_mean=0.879, loss_mean_cls=1.15, proj_loss=-0.394][2026-03-22 14:29:34] Step: 2669, Training Logs: loss_final: 1.645535, loss_mean: 0.893553, proj_loss: -0.394346, loss_mean_cls: 1.146328, grad_norm: 2.655113 +Steps: 0%| | 2670/1000000 [10:57<67:45:23, 4.09it/s, grad_norm=2.66, loss_final=1.65, loss_mean=0.894, loss_mean_cls=1.15, proj_loss=-0.394][2026-03-22 14:29:34] Step: 2670, Training Logs: loss_final: 1.571567, loss_mean: 0.906595, proj_loss: -0.398896, loss_mean_cls: 1.063868, grad_norm: 2.957549 +Steps: 0%| | 2671/1000000 [10:58<67:43:22, 4.09it/s, grad_norm=2.96, loss_final=1.57, loss_mean=0.907, loss_mean_cls=1.06, proj_loss=-0.399][2026-03-22 14:29:35] Step: 2671, Training Logs: loss_final: 1.681321, loss_mean: 0.880633, proj_loss: -0.387935, loss_mean_cls: 1.188622, grad_norm: 4.302058 +Steps: 0%| | 2672/1000000 [10:58<67:43:44, 4.09it/s, grad_norm=4.3, loss_final=1.68, loss_mean=0.881, loss_mean_cls=1.19, proj_loss=-0.388][2026-03-22 14:29:35] Step: 2672, Training Logs: loss_final: 1.531662, loss_mean: 0.942159, proj_loss: -0.405880, loss_mean_cls: 0.995383, grad_norm: 2.411522 +Steps: 0%| | 2673/1000000 
[10:58<67:42:42, 4.09it/s, grad_norm=2.41, loss_final=1.53, loss_mean=0.942, loss_mean_cls=0.995, proj_loss=-0.406][2026-03-22 14:29:35] Step: 2673, Training Logs: loss_final: 1.501664, loss_mean: 0.937892, proj_loss: -0.405929, loss_mean_cls: 0.969702, grad_norm: 3.531541 +Steps: 0%| | 2674/1000000 [10:58<67:41:34, 4.09it/s, grad_norm=3.53, loss_final=1.5, loss_mean=0.938, loss_mean_cls=0.97, proj_loss=-0.406][2026-03-22 14:29:35] Step: 2674, Training Logs: loss_final: 1.598009, loss_mean: 0.897936, proj_loss: -0.395452, loss_mean_cls: 1.095525, grad_norm: 3.362890 +Steps: 0%| | 2675/1000000 [10:59<67:41:52, 4.09it/s, grad_norm=3.36, loss_final=1.6, loss_mean=0.898, loss_mean_cls=1.1, proj_loss=-0.395][2026-03-22 14:29:36] Step: 2675, Training Logs: loss_final: 1.570510, loss_mean: 0.910289, proj_loss: -0.390072, loss_mean_cls: 1.050293, grad_norm: 1.797649 +Steps: 0%| | 2676/1000000 [10:59<67:43:15, 4.09it/s, grad_norm=1.8, loss_final=1.57, loss_mean=0.91, loss_mean_cls=1.05, proj_loss=-0.39][2026-03-22 14:29:36] Step: 2676, Training Logs: loss_final: 1.635680, loss_mean: 0.896576, proj_loss: -0.394947, loss_mean_cls: 1.134052, grad_norm: 2.970322 +Steps: 0%| | 2677/1000000 [10:59<67:44:21, 4.09it/s, grad_norm=2.97, loss_final=1.64, loss_mean=0.897, loss_mean_cls=1.13, proj_loss=-0.395][2026-03-22 14:29:36] Step: 2677, Training Logs: loss_final: 1.603381, loss_mean: 0.901126, proj_loss: -0.391902, loss_mean_cls: 1.094158, grad_norm: 4.817988 +Steps: 0%| | 2678/1000000 [10:59<67:45:11, 4.09it/s, grad_norm=4.82, loss_final=1.6, loss_mean=0.901, loss_mean_cls=1.09, proj_loss=-0.392][2026-03-22 14:29:36] Step: 2678, Training Logs: loss_final: 1.707599, loss_mean: 0.888771, proj_loss: -0.389518, loss_mean_cls: 1.208345, grad_norm: 5.980860 +Steps: 0%| | 2679/1000000 [11:00<67:45:06, 4.09it/s, grad_norm=5.98, loss_final=1.71, loss_mean=0.889, loss_mean_cls=1.21, proj_loss=-0.39][2026-03-22 14:29:37] Step: 2679, Training Logs: loss_final: 1.667078, loss_mean: 0.891262, proj_loss: -0.382116, loss_mean_cls: 1.157933, grad_norm: 2.091849 +Steps: 0%| | 2680/1000000 [11:00<67:43:49, 4.09it/s, grad_norm=2.09, loss_final=1.67, loss_mean=0.891, loss_mean_cls=1.16, proj_loss=-0.382][2026-03-22 14:29:37] Step: 2680, Training Logs: loss_final: 1.553604, loss_mean: 0.910839, proj_loss: -0.399841, loss_mean_cls: 1.042606, grad_norm: 4.327170 +Steps: 0%| | 2681/1000000 [11:00<67:43:31, 4.09it/s, grad_norm=4.33, loss_final=1.55, loss_mean=0.911, loss_mean_cls=1.04, proj_loss=-0.4][2026-03-22 14:29:37] Step: 2681, Training Logs: loss_final: 1.596073, loss_mean: 0.901571, proj_loss: -0.399332, loss_mean_cls: 1.093834, grad_norm: 3.523235 +Steps: 0%| | 2682/1000000 [11:00<67:43:47, 4.09it/s, grad_norm=3.52, loss_final=1.6, loss_mean=0.902, loss_mean_cls=1.09, proj_loss=-0.399][2026-03-22 14:29:37] Step: 2682, Training Logs: loss_final: 1.618979, loss_mean: 0.895985, proj_loss: -0.394249, loss_mean_cls: 1.117243, grad_norm: 2.203111 +Steps: 0%| | 2683/1000000 [11:01<67:43:37, 4.09it/s, grad_norm=2.2, loss_final=1.62, loss_mean=0.896, loss_mean_cls=1.12, proj_loss=-0.394][2026-03-22 14:29:38] Step: 2683, Training Logs: loss_final: 1.637326, loss_mean: 0.889898, proj_loss: -0.397390, loss_mean_cls: 1.144817, grad_norm: 3.198243 +Steps: 0%| | 2684/1000000 [11:01<67:44:52, 4.09it/s, grad_norm=3.2, loss_final=1.64, loss_mean=0.89, loss_mean_cls=1.14, proj_loss=-0.397][2026-03-22 14:29:38] Step: 2684, Training Logs: loss_final: 1.669407, loss_mean: 0.879820, proj_loss: -0.392598, loss_mean_cls: 1.182186, grad_norm: 
1.682490 +Steps: 0%| | 2685/1000000 [11:01<67:46:12, 4.09it/s, grad_norm=1.68, loss_final=1.67, loss_mean=0.88, loss_mean_cls=1.18, proj_loss=-0.393][2026-03-22 14:29:38] Step: 2685, Training Logs: loss_final: 1.786990, loss_mean: 0.911452, proj_loss: -0.380262, loss_mean_cls: 1.255800, grad_norm: 2.013376 +Steps: 0%| | 2686/1000000 [11:01<67:46:16, 4.09it/s, grad_norm=2.01, loss_final=1.79, loss_mean=0.911, loss_mean_cls=1.26, proj_loss=-0.38][2026-03-22 14:29:38] Step: 2686, Training Logs: loss_final: 1.652881, loss_mean: 0.887109, proj_loss: -0.387573, loss_mean_cls: 1.153345, grad_norm: 1.738778 +Steps: 0%| | 2687/1000000 [11:02<67:44:19, 4.09it/s, grad_norm=1.74, loss_final=1.65, loss_mean=0.887, loss_mean_cls=1.15, proj_loss=-0.388][2026-03-22 14:29:39] Step: 2687, Training Logs: loss_final: 1.597872, loss_mean: 0.908227, proj_loss: -0.395343, loss_mean_cls: 1.084987, grad_norm: 3.099589 +Steps: 0%| | 2688/1000000 [11:02<67:44:18, 4.09it/s, grad_norm=3.1, loss_final=1.6, loss_mean=0.908, loss_mean_cls=1.08, proj_loss=-0.395][2026-03-22 14:29:39] Step: 2688, Training Logs: loss_final: 1.670798, loss_mean: 0.923440, proj_loss: -0.391699, loss_mean_cls: 1.139058, grad_norm: 3.665051 +Steps: 0%| | 2689/1000000 [11:02<67:42:41, 4.09it/s, grad_norm=3.67, loss_final=1.67, loss_mean=0.923, loss_mean_cls=1.14, proj_loss=-0.392][2026-03-22 14:29:39] Step: 2689, Training Logs: loss_final: 1.674603, loss_mean: 0.901254, proj_loss: -0.388749, loss_mean_cls: 1.162098, grad_norm: 1.710396 +Steps: 0%| | 2690/1000000 [11:02<67:42:20, 4.09it/s, grad_norm=1.71, loss_final=1.67, loss_mean=0.901, loss_mean_cls=1.16, proj_loss=-0.389][2026-03-22 14:29:39] Step: 2690, Training Logs: loss_final: 1.494238, loss_mean: 0.908399, proj_loss: -0.398808, loss_mean_cls: 0.984647, grad_norm: 3.702771 +Steps: 0%| | 2691/1000000 [11:03<67:41:33, 4.09it/s, grad_norm=3.7, loss_final=1.49, loss_mean=0.908, loss_mean_cls=0.985, proj_loss=-0.399][2026-03-22 14:29:39] Step: 2691, Training Logs: loss_final: 1.648496, loss_mean: 0.916143, proj_loss: -0.390723, loss_mean_cls: 1.123077, grad_norm: 5.010060 +Steps: 0%| | 2692/1000000 [11:03<67:42:34, 4.09it/s, grad_norm=5.01, loss_final=1.65, loss_mean=0.916, loss_mean_cls=1.12, proj_loss=-0.391][2026-03-22 14:29:40] Step: 2692, Training Logs: loss_final: 1.563358, loss_mean: 0.921583, proj_loss: -0.396078, loss_mean_cls: 1.037853, grad_norm: 1.592208 +Steps: 0%| | 2693/1000000 [11:03<67:42:05, 4.09it/s, grad_norm=1.59, loss_final=1.56, loss_mean=0.922, loss_mean_cls=1.04, proj_loss=-0.396][2026-03-22 14:29:40] Step: 2693, Training Logs: loss_final: 1.580982, loss_mean: 0.884360, proj_loss: -0.399850, loss_mean_cls: 1.096472, grad_norm: 1.738789 +Steps: 0%| | 2694/1000000 [11:03<67:42:03, 4.09it/s, grad_norm=1.74, loss_final=1.58, loss_mean=0.884, loss_mean_cls=1.1, proj_loss=-0.4][2026-03-22 14:29:40] Step: 2694, Training Logs: loss_final: 1.550366, loss_mean: 0.906361, proj_loss: -0.398697, loss_mean_cls: 1.042702, grad_norm: 5.732833 +Steps: 0%| | 2695/1000000 [11:04<67:41:05, 4.09it/s, grad_norm=5.73, loss_final=1.55, loss_mean=0.906, loss_mean_cls=1.04, proj_loss=-0.399][2026-03-22 14:29:40] Step: 2695, Training Logs: loss_final: 1.538099, loss_mean: 0.921513, proj_loss: -0.397749, loss_mean_cls: 1.014334, grad_norm: 2.204522 +Steps: 0%| | 2696/1000000 [11:04<67:42:40, 4.09it/s, grad_norm=2.2, loss_final=1.54, loss_mean=0.922, loss_mean_cls=1.01, proj_loss=-0.398][2026-03-22 14:29:41] Step: 2696, Training Logs: loss_final: 1.634174, loss_mean: 0.900510, proj_loss: 
-0.389889, loss_mean_cls: 1.123553, grad_norm: 2.839262 +Steps: 0%| | 2697/1000000 [11:04<67:42:48, 4.09it/s, grad_norm=2.84, loss_final=1.63, loss_mean=0.901, loss_mean_cls=1.12, proj_loss=-0.39][2026-03-22 14:29:41] Step: 2697, Training Logs: loss_final: 1.595565, loss_mean: 0.896912, proj_loss: -0.395267, loss_mean_cls: 1.093919, grad_norm: 3.689483 +Steps: 0%| | 2698/1000000 [11:04<67:41:55, 4.09it/s, grad_norm=3.69, loss_final=1.6, loss_mean=0.897, loss_mean_cls=1.09, proj_loss=-0.395][2026-03-22 14:29:41] Step: 2698, Training Logs: loss_final: 1.531804, loss_mean: 0.910661, proj_loss: -0.395243, loss_mean_cls: 1.016386, grad_norm: 2.611021 +Steps: 0%| | 2699/1000000 [11:04<67:41:53, 4.09it/s, grad_norm=2.61, loss_final=1.53, loss_mean=0.911, loss_mean_cls=1.02, proj_loss=-0.395][2026-03-22 14:29:41] Step: 2699, Training Logs: loss_final: 1.691149, loss_mean: 0.882673, proj_loss: -0.382997, loss_mean_cls: 1.191473, grad_norm: 3.920368 +Steps: 0%| | 2700/1000000 [11:05<67:45:43, 4.09it/s, grad_norm=3.92, loss_final=1.69, loss_mean=0.883, loss_mean_cls=1.19, proj_loss=-0.383][2026-03-22 14:29:42] Step: 2700, Training Logs: loss_final: 1.586428, loss_mean: 0.930591, proj_loss: -0.396509, loss_mean_cls: 1.052346, grad_norm: 3.843321 +Steps: 0%| | 2701/1000000 [11:05<67:45:31, 4.09it/s, grad_norm=3.84, loss_final=1.59, loss_mean=0.931, loss_mean_cls=1.05, proj_loss=-0.397][2026-03-22 14:29:42] Step: 2701, Training Logs: loss_final: 1.506044, loss_mean: 0.925669, proj_loss: -0.400021, loss_mean_cls: 0.980395, grad_norm: 2.185460 +Steps: 0%| | 2702/1000000 [11:05<67:45:12, 4.09it/s, grad_norm=2.19, loss_final=1.51, loss_mean=0.926, loss_mean_cls=0.98, proj_loss=-0.4][2026-03-22 14:29:42] Step: 2702, Training Logs: loss_final: 1.755769, loss_mean: 0.886289, proj_loss: -0.385285, loss_mean_cls: 1.254766, grad_norm: 1.330602 +Steps: 0%| | 2703/1000000 [11:05<67:45:29, 4.09it/s, grad_norm=1.33, loss_final=1.76, loss_mean=0.886, loss_mean_cls=1.25, proj_loss=-0.385][2026-03-22 14:29:42] Step: 2703, Training Logs: loss_final: 1.569512, loss_mean: 0.913950, proj_loss: -0.391484, loss_mean_cls: 1.047046, grad_norm: 2.379223 +Steps: 0%| | 2704/1000000 [11:06<67:50:17, 4.08it/s, grad_norm=2.38, loss_final=1.57, loss_mean=0.914, loss_mean_cls=1.05, proj_loss=-0.391][2026-03-22 14:29:43] Step: 2704, Training Logs: loss_final: 1.744454, loss_mean: 0.893177, proj_loss: -0.382936, loss_mean_cls: 1.234214, grad_norm: 1.666008 +Steps: 0%| | 2705/1000000 [11:06<67:49:44, 4.08it/s, grad_norm=1.67, loss_final=1.74, loss_mean=0.893, loss_mean_cls=1.23, proj_loss=-0.383][2026-03-22 14:29:43] Step: 2705, Training Logs: loss_final: 1.528133, loss_mean: 0.913846, proj_loss: -0.397212, loss_mean_cls: 1.011499, grad_norm: 2.451152 +Steps: 0%| | 2706/1000000 [11:06<67:47:10, 4.09it/s, grad_norm=2.45, loss_final=1.53, loss_mean=0.914, loss_mean_cls=1.01, proj_loss=-0.397][2026-03-22 14:29:43] Step: 2706, Training Logs: loss_final: 1.660215, loss_mean: 0.899152, proj_loss: -0.390362, loss_mean_cls: 1.151425, grad_norm: 2.833068 +Steps: 0%| | 2707/1000000 [11:06<67:45:36, 4.09it/s, grad_norm=2.83, loss_final=1.66, loss_mean=0.899, loss_mean_cls=1.15, proj_loss=-0.39][2026-03-22 14:29:43] Step: 2707, Training Logs: loss_final: 1.617443, loss_mean: 0.901632, proj_loss: -0.398539, loss_mean_cls: 1.114350, grad_norm: 1.967954 +Steps: 0%| | 2708/1000000 [11:07<67:44:49, 4.09it/s, grad_norm=1.97, loss_final=1.62, loss_mean=0.902, loss_mean_cls=1.11, proj_loss=-0.399][2026-03-22 14:29:44] Step: 2708, Training Logs: loss_final: 
1.692648, loss_mean: 0.888448, proj_loss: -0.390365, loss_mean_cls: 1.194564, grad_norm: 2.479329 +Steps: 0%| | 2709/1000000 [11:07<67:45:44, 4.09it/s, grad_norm=2.48, loss_final=1.69, loss_mean=0.888, loss_mean_cls=1.19, proj_loss=-0.39][2026-03-22 14:29:44] Step: 2709, Training Logs: loss_final: 1.548581, loss_mean: 0.911254, proj_loss: -0.393918, loss_mean_cls: 1.031245, grad_norm: 2.722206 +Steps: 0%| | 2710/1000000 [11:07<67:44:13, 4.09it/s, grad_norm=2.72, loss_final=1.55, loss_mean=0.911, loss_mean_cls=1.03, proj_loss=-0.394][2026-03-22 14:29:44] Step: 2710, Training Logs: loss_final: 1.649547, loss_mean: 0.901692, proj_loss: -0.394563, loss_mean_cls: 1.142418, grad_norm: 6.164755 +Steps: 0%| | 2711/1000000 [11:07<67:43:18, 4.09it/s, grad_norm=6.16, loss_final=1.65, loss_mean=0.902, loss_mean_cls=1.14, proj_loss=-0.395][2026-03-22 14:29:44] Step: 2711, Training Logs: loss_final: 1.765363, loss_mean: 0.881687, proj_loss: -0.383752, loss_mean_cls: 1.267428, grad_norm: 2.223651 +Steps: 0%| | 2712/1000000 [11:08<67:44:23, 4.09it/s, grad_norm=2.22, loss_final=1.77, loss_mean=0.882, loss_mean_cls=1.27, proj_loss=-0.384][2026-03-22 14:29:45] Step: 2712, Training Logs: loss_final: 1.596998, loss_mean: 0.894281, proj_loss: -0.392138, loss_mean_cls: 1.094855, grad_norm: 3.316161 +Steps: 0%| | 2713/1000000 [11:08<67:43:44, 4.09it/s, grad_norm=3.32, loss_final=1.6, loss_mean=0.894, loss_mean_cls=1.09, proj_loss=-0.392][2026-03-22 14:29:45] Step: 2713, Training Logs: loss_final: 1.552718, loss_mean: 0.910979, proj_loss: -0.400990, loss_mean_cls: 1.042729, grad_norm: 1.018560 +Steps: 0%| | 2714/1000000 [11:08<67:44:23, 4.09it/s, grad_norm=1.02, loss_final=1.55, loss_mean=0.911, loss_mean_cls=1.04, proj_loss=-0.401][2026-03-22 14:29:45] Step: 2714, Training Logs: loss_final: 1.621362, loss_mean: 0.882993, proj_loss: -0.396619, loss_mean_cls: 1.134987, grad_norm: 2.336511 +Steps: 0%| | 2715/1000000 [11:08<67:42:21, 4.09it/s, grad_norm=2.34, loss_final=1.62, loss_mean=0.883, loss_mean_cls=1.13, proj_loss=-0.397][2026-03-22 14:29:45] Step: 2715, Training Logs: loss_final: 1.673984, loss_mean: 0.881819, proj_loss: -0.382275, loss_mean_cls: 1.174440, grad_norm: 1.169244 +Steps: 0%| | 2716/1000000 [11:09<67:41:31, 4.09it/s, grad_norm=1.17, loss_final=1.67, loss_mean=0.882, loss_mean_cls=1.17, proj_loss=-0.382][2026-03-22 14:29:46] Step: 2716, Training Logs: loss_final: 1.527992, loss_mean: 0.918157, proj_loss: -0.403779, loss_mean_cls: 1.013615, grad_norm: 1.847608 +Steps: 0%| | 2717/1000000 [11:09<67:41:37, 4.09it/s, grad_norm=1.85, loss_final=1.53, loss_mean=0.918, loss_mean_cls=1.01, proj_loss=-0.404][2026-03-22 14:29:46] Step: 2717, Training Logs: loss_final: 1.582002, loss_mean: 0.906980, proj_loss: -0.401782, loss_mean_cls: 1.076804, grad_norm: 2.026460 +Steps: 0%| | 2718/1000000 [11:09<67:41:16, 4.09it/s, grad_norm=2.03, loss_final=1.58, loss_mean=0.907, loss_mean_cls=1.08, proj_loss=-0.402][2026-03-22 14:29:46] Step: 2718, Training Logs: loss_final: 1.615703, loss_mean: 0.907873, proj_loss: -0.396812, loss_mean_cls: 1.104642, grad_norm: 1.757276 +Steps: 0%| | 2719/1000000 [11:09<67:41:07, 4.09it/s, grad_norm=1.76, loss_final=1.62, loss_mean=0.908, loss_mean_cls=1.1, proj_loss=-0.397][2026-03-22 14:29:46] Step: 2719, Training Logs: loss_final: 1.692801, loss_mean: 0.894194, proj_loss: -0.384605, loss_mean_cls: 1.183211, grad_norm: 1.772482 +Steps: 0%| | 2720/1000000 [11:10<67:41:38, 4.09it/s, grad_norm=1.77, loss_final=1.69, loss_mean=0.894, loss_mean_cls=1.18, proj_loss=-0.385][2026-03-22 
14:29:47] Step: 2720, Training Logs: loss_final: 1.519946, loss_mean: 0.873067, proj_loss: -0.397279, loss_mean_cls: 1.044158, grad_norm: 1.671437 +Steps: 0%| | 2721/1000000 [11:10<67:42:28, 4.09it/s, grad_norm=1.67, loss_final=1.52, loss_mean=0.873, loss_mean_cls=1.04, proj_loss=-0.397][2026-03-22 14:29:47] Step: 2721, Training Logs: loss_final: 1.563717, loss_mean: 0.917917, proj_loss: -0.398775, loss_mean_cls: 1.044574, grad_norm: 3.699408 +Steps: 0%| | 2722/1000000 [11:10<67:41:36, 4.09it/s, grad_norm=3.7, loss_final=1.56, loss_mean=0.918, loss_mean_cls=1.04, proj_loss=-0.399][2026-03-22 14:29:47] Step: 2722, Training Logs: loss_final: 1.462854, loss_mean: 0.915865, proj_loss: -0.398419, loss_mean_cls: 0.945408, grad_norm: 2.361029 +Steps: 0%| | 2723/1000000 [11:10<67:40:36, 4.09it/s, grad_norm=2.36, loss_final=1.46, loss_mean=0.916, loss_mean_cls=0.945, proj_loss=-0.398][2026-03-22 14:29:47] Step: 2723, Training Logs: loss_final: 1.649122, loss_mean: 0.898166, proj_loss: -0.391234, loss_mean_cls: 1.142190, grad_norm: 1.350878 +Steps: 0%| | 2724/1000000 [11:11<67:42:16, 4.09it/s, grad_norm=1.35, loss_final=1.65, loss_mean=0.898, loss_mean_cls=1.14, proj_loss=-0.391][2026-03-22 14:29:48] Step: 2724, Training Logs: loss_final: 1.440381, loss_mean: 0.919093, proj_loss: -0.400819, loss_mean_cls: 0.922107, grad_norm: 2.702924 +Steps: 0%| | 2725/1000000 [11:11<67:40:19, 4.09it/s, grad_norm=2.7, loss_final=1.44, loss_mean=0.919, loss_mean_cls=0.922, proj_loss=-0.401][2026-03-22 14:29:48] Step: 2725, Training Logs: loss_final: 1.701947, loss_mean: 0.895442, proj_loss: -0.383774, loss_mean_cls: 1.190279, grad_norm: 2.142574 +Steps: 0%| | 2726/1000000 [11:11<67:41:39, 4.09it/s, grad_norm=2.14, loss_final=1.7, loss_mean=0.895, loss_mean_cls=1.19, proj_loss=-0.384][2026-03-22 14:29:48] Step: 2726, Training Logs: loss_final: 1.547387, loss_mean: 0.914945, proj_loss: -0.394763, loss_mean_cls: 1.027206, grad_norm: 3.652964 +Steps: 0%| | 2727/1000000 [11:11<67:40:49, 4.09it/s, grad_norm=3.65, loss_final=1.55, loss_mean=0.915, loss_mean_cls=1.03, proj_loss=-0.395][2026-03-22 14:29:48] Step: 2727, Training Logs: loss_final: 1.597558, loss_mean: 0.915140, proj_loss: -0.395244, loss_mean_cls: 1.077661, grad_norm: 4.536752 +Steps: 0%| | 2728/1000000 [11:12<68:21:31, 4.05it/s, grad_norm=4.54, loss_final=1.6, loss_mean=0.915, loss_mean_cls=1.08, proj_loss=-0.395][2026-03-22 14:29:49] Step: 2728, Training Logs: loss_final: 1.732465, loss_mean: 0.885302, proj_loss: -0.382290, loss_mean_cls: 1.229453, grad_norm: 1.538378 +Steps: 0%| | 2729/1000000 [11:12<68:10:04, 4.06it/s, grad_norm=1.54, loss_final=1.73, loss_mean=0.885, loss_mean_cls=1.23, proj_loss=-0.382][2026-03-22 14:29:49] Step: 2729, Training Logs: loss_final: 1.601802, loss_mean: 0.905945, proj_loss: -0.390291, loss_mean_cls: 1.086149, grad_norm: 2.311883 +Steps: 0%| | 2730/1000000 [11:12<68:02:49, 4.07it/s, grad_norm=2.31, loss_final=1.6, loss_mean=0.906, loss_mean_cls=1.09, proj_loss=-0.39][2026-03-22 14:29:49] Step: 2730, Training Logs: loss_final: 1.670921, loss_mean: 0.899784, proj_loss: -0.385083, loss_mean_cls: 1.156219, grad_norm: 2.873561 +Steps: 0%| | 2731/1000000 [11:12<67:57:02, 4.08it/s, grad_norm=2.87, loss_final=1.67, loss_mean=0.9, loss_mean_cls=1.16, proj_loss=-0.385][2026-03-22 14:29:49] Step: 2731, Training Logs: loss_final: 1.713924, loss_mean: 0.878553, proj_loss: -0.383786, loss_mean_cls: 1.219157, grad_norm: 1.501628 +Steps: 0%| | 2732/1000000 [11:13<67:54:53, 4.08it/s, grad_norm=1.5, loss_final=1.71, loss_mean=0.879, 
loss_mean_cls=1.22, proj_loss=-0.384][2026-03-22 14:29:50] Step: 2732, Training Logs: loss_final: 1.652766, loss_mean: 0.892824, proj_loss: -0.382111, loss_mean_cls: 1.142053, grad_norm: 1.502863 +Steps: 0%| | 2733/1000000 [11:13<67:52:28, 4.08it/s, grad_norm=1.5, loss_final=1.65, loss_mean=0.893, loss_mean_cls=1.14, proj_loss=-0.382][2026-03-22 14:29:50] Step: 2733, Training Logs: loss_final: 1.501253, loss_mean: 0.898596, proj_loss: -0.403117, loss_mean_cls: 1.005774, grad_norm: 2.116030 +Steps: 0%| | 2734/1000000 [11:13<67:49:47, 4.08it/s, grad_norm=2.12, loss_final=1.5, loss_mean=0.899, loss_mean_cls=1.01, proj_loss=-0.403][2026-03-22 14:29:50] Step: 2734, Training Logs: loss_final: 1.597389, loss_mean: 0.891580, proj_loss: -0.392282, loss_mean_cls: 1.098091, grad_norm: 1.381264 +Steps: 0%| | 2735/1000000 [11:13<67:47:36, 4.09it/s, grad_norm=1.38, loss_final=1.6, loss_mean=0.892, loss_mean_cls=1.1, proj_loss=-0.392][2026-03-22 14:29:50] Step: 2735, Training Logs: loss_final: 1.580551, loss_mean: 0.907181, proj_loss: -0.394829, loss_mean_cls: 1.068199, grad_norm: 2.047082 +Steps: 0%| | 2736/1000000 [11:14<67:46:15, 4.09it/s, grad_norm=2.05, loss_final=1.58, loss_mean=0.907, loss_mean_cls=1.07, proj_loss=-0.395][2026-03-22 14:29:50] Step: 2736, Training Logs: loss_final: 1.664330, loss_mean: 0.889355, proj_loss: -0.388683, loss_mean_cls: 1.163658, grad_norm: 1.802206 +Steps: 0%| | 2737/1000000 [11:14<67:48:26, 4.09it/s, grad_norm=1.8, loss_final=1.66, loss_mean=0.889, loss_mean_cls=1.16, proj_loss=-0.389][2026-03-22 14:29:51] Step: 2737, Training Logs: loss_final: 1.483209, loss_mean: 0.911458, proj_loss: -0.402832, loss_mean_cls: 0.974583, grad_norm: 2.523506 +Steps: 0%| | 2738/1000000 [11:14<67:47:16, 4.09it/s, grad_norm=2.52, loss_final=1.48, loss_mean=0.911, loss_mean_cls=0.975, proj_loss=-0.403][2026-03-22 14:29:51] Step: 2738, Training Logs: loss_final: 1.783940, loss_mean: 0.900842, proj_loss: -0.372618, loss_mean_cls: 1.255716, grad_norm: 2.254493 +Steps: 0%| | 2739/1000000 [11:14<67:45:17, 4.09it/s, grad_norm=2.25, loss_final=1.78, loss_mean=0.901, loss_mean_cls=1.26, proj_loss=-0.373][2026-03-22 14:29:51] Step: 2739, Training Logs: loss_final: 1.596111, loss_mean: 0.896686, proj_loss: -0.397714, loss_mean_cls: 1.097139, grad_norm: 4.970145 +Steps: 0%| | 2740/1000000 [11:15<67:48:09, 4.09it/s, grad_norm=4.97, loss_final=1.6, loss_mean=0.897, loss_mean_cls=1.1, proj_loss=-0.398][2026-03-22 14:29:51] Step: 2740, Training Logs: loss_final: 1.518662, loss_mean: 0.915826, proj_loss: -0.392147, loss_mean_cls: 0.994983, grad_norm: 2.231397 +Steps: 0%| | 2741/1000000 [11:15<67:47:21, 4.09it/s, grad_norm=2.23, loss_final=1.52, loss_mean=0.916, loss_mean_cls=0.995, proj_loss=-0.392][2026-03-22 14:29:52] Step: 2741, Training Logs: loss_final: 1.675280, loss_mean: 0.884209, proj_loss: -0.388585, loss_mean_cls: 1.179656, grad_norm: 1.425585 +Steps: 0%| | 2742/1000000 [11:15<67:48:10, 4.09it/s, grad_norm=1.43, loss_final=1.68, loss_mean=0.884, loss_mean_cls=1.18, proj_loss=-0.389][2026-03-22 14:29:52] Step: 2742, Training Logs: loss_final: 1.607575, loss_mean: 0.904020, proj_loss: -0.400439, loss_mean_cls: 1.103994, grad_norm: 4.870441 +Steps: 0%| | 2743/1000000 [11:15<67:45:46, 4.09it/s, grad_norm=4.87, loss_final=1.61, loss_mean=0.904, loss_mean_cls=1.1, proj_loss=-0.4][2026-03-22 14:29:52] Step: 2743, Training Logs: loss_final: 1.683504, loss_mean: 0.896529, proj_loss: -0.384194, loss_mean_cls: 1.171169, grad_norm: 3.650839 +Steps: 0%| | 2744/1000000 [11:15<67:45:29, 4.09it/s, 
grad_norm=3.65, loss_final=1.68, loss_mean=0.897, loss_mean_cls=1.17, proj_loss=-0.384][2026-03-22 14:29:52] Step: 2744, Training Logs: loss_final: 1.615557, loss_mean: 0.919913, proj_loss: -0.396911, loss_mean_cls: 1.092555, grad_norm: 2.283148 +Steps: 0%| | 2745/1000000 [11:16<67:43:42, 4.09it/s, grad_norm=2.28, loss_final=1.62, loss_mean=0.92, loss_mean_cls=1.09, proj_loss=-0.397][2026-03-22 14:29:53] Step: 2745, Training Logs: loss_final: 1.647392, loss_mean: 0.914139, proj_loss: -0.397249, loss_mean_cls: 1.130502, grad_norm: 7.847515 +Steps: 0%| | 2746/1000000 [11:16<67:43:08, 4.09it/s, grad_norm=7.85, loss_final=1.65, loss_mean=0.914, loss_mean_cls=1.13, proj_loss=-0.397][2026-03-22 14:29:53] Step: 2746, Training Logs: loss_final: 1.659299, loss_mean: 0.903902, proj_loss: -0.402398, loss_mean_cls: 1.157795, grad_norm: 4.489799 +Steps: 0%| | 2747/1000000 [11:16<67:43:04, 4.09it/s, grad_norm=4.49, loss_final=1.66, loss_mean=0.904, loss_mean_cls=1.16, proj_loss=-0.402][2026-03-22 14:29:53] Step: 2747, Training Logs: loss_final: 1.725689, loss_mean: 0.900321, proj_loss: -0.386992, loss_mean_cls: 1.212359, grad_norm: 4.337436 +Steps: 0%| | 2748/1000000 [11:16<67:43:19, 4.09it/s, grad_norm=4.34, loss_final=1.73, loss_mean=0.9, loss_mean_cls=1.21, proj_loss=-0.387][2026-03-22 14:29:53] Step: 2748, Training Logs: loss_final: 1.647915, loss_mean: 0.910246, proj_loss: -0.386475, loss_mean_cls: 1.124144, grad_norm: 2.748954 +Steps: 0%| | 2749/1000000 [11:17<67:44:04, 4.09it/s, grad_norm=2.75, loss_final=1.65, loss_mean=0.91, loss_mean_cls=1.12, proj_loss=-0.386][2026-03-22 14:29:54] Step: 2749, Training Logs: loss_final: 1.666273, loss_mean: 0.905030, proj_loss: -0.395823, loss_mean_cls: 1.157066, grad_norm: 3.951804 +Steps: 0%| | 2750/1000000 [11:17<67:44:24, 4.09it/s, grad_norm=3.95, loss_final=1.67, loss_mean=0.905, loss_mean_cls=1.16, proj_loss=-0.396][2026-03-22 14:29:54] Step: 2750, Training Logs: loss_final: 1.637585, loss_mean: 0.908022, proj_loss: -0.389195, loss_mean_cls: 1.118757, grad_norm: 4.430974 +Steps: 0%| | 2751/1000000 [11:17<67:44:42, 4.09it/s, grad_norm=4.43, loss_final=1.64, loss_mean=0.908, loss_mean_cls=1.12, proj_loss=-0.389][2026-03-22 14:29:54] Step: 2751, Training Logs: loss_final: 1.642043, loss_mean: 0.913553, proj_loss: -0.394040, loss_mean_cls: 1.122531, grad_norm: 2.629252 +Steps: 0%| | 2752/1000000 [11:17<67:44:43, 4.09it/s, grad_norm=2.63, loss_final=1.64, loss_mean=0.914, loss_mean_cls=1.12, proj_loss=-0.394][2026-03-22 14:29:54] Step: 2752, Training Logs: loss_final: 1.648919, loss_mean: 0.901590, proj_loss: -0.391872, loss_mean_cls: 1.139201, grad_norm: 6.683944 +Steps: 0%| | 2753/1000000 [11:18<67:44:31, 4.09it/s, grad_norm=6.68, loss_final=1.65, loss_mean=0.902, loss_mean_cls=1.14, proj_loss=-0.392][2026-03-22 14:29:55] Step: 2753, Training Logs: loss_final: 1.696382, loss_mean: 0.895574, proj_loss: -0.389715, loss_mean_cls: 1.190523, grad_norm: 3.965141 +Steps: 0%| | 2754/1000000 [11:18<67:43:48, 4.09it/s, grad_norm=3.97, loss_final=1.7, loss_mean=0.896, loss_mean_cls=1.19, proj_loss=-0.39][2026-03-22 14:29:55] Step: 2754, Training Logs: loss_final: 1.678739, loss_mean: 0.898996, proj_loss: -0.384400, loss_mean_cls: 1.164143, grad_norm: 2.690085 +Steps: 0%| | 2755/1000000 [11:18<67:43:33, 4.09it/s, grad_norm=2.69, loss_final=1.68, loss_mean=0.899, loss_mean_cls=1.16, proj_loss=-0.384][2026-03-22 14:29:55] Step: 2755, Training Logs: loss_final: 1.729907, loss_mean: 0.885558, proj_loss: -0.393095, loss_mean_cls: 1.237443, grad_norm: 2.721503 +Steps: 0%| | 
2756/1000000 [11:18<67:43:12, 4.09it/s, grad_norm=2.72, loss_final=1.73, loss_mean=0.886, loss_mean_cls=1.24, proj_loss=-0.393][2026-03-22 14:29:55] Step: 2756, Training Logs: loss_final: 1.504094, loss_mean: 0.903997, proj_loss: -0.399172, loss_mean_cls: 0.999269, grad_norm: 1.844103 +Steps: 0%| | 2757/1000000 [11:19<67:44:24, 4.09it/s, grad_norm=1.84, loss_final=1.5, loss_mean=0.904, loss_mean_cls=0.999, proj_loss=-0.399][2026-03-22 14:29:56] Step: 2757, Training Logs: loss_final: 1.614989, loss_mean: 0.909592, proj_loss: -0.390887, loss_mean_cls: 1.096284, grad_norm: 2.276338 +Steps: 0%| | 2758/1000000 [11:19<67:44:42, 4.09it/s, grad_norm=2.28, loss_final=1.61, loss_mean=0.91, loss_mean_cls=1.1, proj_loss=-0.391][2026-03-22 14:29:56] Step: 2758, Training Logs: loss_final: 1.699923, loss_mean: 0.878957, proj_loss: -0.385104, loss_mean_cls: 1.206071, grad_norm: 2.220905 +Steps: 0%| | 2759/1000000 [11:19<67:44:57, 4.09it/s, grad_norm=2.22, loss_final=1.7, loss_mean=0.879, loss_mean_cls=1.21, proj_loss=-0.385][2026-03-22 14:29:56] Step: 2759, Training Logs: loss_final: 1.561216, loss_mean: 0.906289, proj_loss: -0.395301, loss_mean_cls: 1.050228, grad_norm: 1.927748 +Steps: 0%| | 2760/1000000 [11:19<67:45:07, 4.09it/s, grad_norm=1.93, loss_final=1.56, loss_mean=0.906, loss_mean_cls=1.05, proj_loss=-0.395][2026-03-22 14:29:56] Step: 2760, Training Logs: loss_final: 1.521225, loss_mean: 0.909024, proj_loss: -0.398953, loss_mean_cls: 1.011153, grad_norm: 1.811824 +Steps: 0%| | 2761/1000000 [11:20<68:31:48, 4.04it/s, grad_norm=1.81, loss_final=1.52, loss_mean=0.909, loss_mean_cls=1.01, proj_loss=-0.399][2026-03-22 14:29:57] Step: 2761, Training Logs: loss_final: 1.575048, loss_mean: 0.900992, proj_loss: -0.392241, loss_mean_cls: 1.066297, grad_norm: 2.822982 +Steps: 0%| | 2762/1000000 [11:20<68:18:09, 4.06it/s, grad_norm=2.82, loss_final=1.58, loss_mean=0.901, loss_mean_cls=1.07, proj_loss=-0.392][2026-03-22 14:29:57] Step: 2762, Training Logs: loss_final: 1.509919, loss_mean: 0.904182, proj_loss: -0.401745, loss_mean_cls: 1.007481, grad_norm: 2.767811 +Steps: 0%| | 2763/1000000 [11:20<68:07:59, 4.07it/s, grad_norm=2.77, loss_final=1.51, loss_mean=0.904, loss_mean_cls=1.01, proj_loss=-0.402][2026-03-22 14:29:57] Step: 2763, Training Logs: loss_final: 1.621142, loss_mean: 0.898407, proj_loss: -0.390239, loss_mean_cls: 1.112975, grad_norm: 2.728759 +Steps: 0%| | 2764/1000000 [11:20<68:00:42, 4.07it/s, grad_norm=2.73, loss_final=1.62, loss_mean=0.898, loss_mean_cls=1.11, proj_loss=-0.39][2026-03-22 14:29:57] Step: 2764, Training Logs: loss_final: 1.613883, loss_mean: 0.894355, proj_loss: -0.392528, loss_mean_cls: 1.112056, grad_norm: 1.312694 +Steps: 0%| | 2765/1000000 [11:21<67:55:08, 4.08it/s, grad_norm=1.31, loss_final=1.61, loss_mean=0.894, loss_mean_cls=1.11, proj_loss=-0.393][2026-03-22 14:29:58] Step: 2765, Training Logs: loss_final: 1.683511, loss_mean: 0.909177, proj_loss: -0.388164, loss_mean_cls: 1.162498, grad_norm: 4.308271 +Steps: 0%| | 2766/1000000 [11:21<67:52:18, 4.08it/s, grad_norm=4.31, loss_final=1.68, loss_mean=0.909, loss_mean_cls=1.16, proj_loss=-0.388][2026-03-22 14:29:58] Step: 2766, Training Logs: loss_final: 1.528274, loss_mean: 0.912319, proj_loss: -0.394615, loss_mean_cls: 1.010570, grad_norm: 1.691624 +Steps: 0%| | 2767/1000000 [11:21<67:49:08, 4.08it/s, grad_norm=1.69, loss_final=1.53, loss_mean=0.912, loss_mean_cls=1.01, proj_loss=-0.395][2026-03-22 14:29:58] Step: 2767, Training Logs: loss_final: 1.621225, loss_mean: 0.896875, proj_loss: -0.395912, loss_mean_cls: 
1.120262, grad_norm: 2.735106 +Steps: 0%| | 2768/1000000 [11:21<67:48:46, 4.08it/s, grad_norm=2.74, loss_final=1.62, loss_mean=0.897, loss_mean_cls=1.12, proj_loss=-0.396][2026-03-22 14:29:58] Step: 2768, Training Logs: loss_final: 1.622965, loss_mean: 0.888293, proj_loss: -0.396293, loss_mean_cls: 1.130965, grad_norm: 4.505151 +Steps: 0%| | 2769/1000000 [11:22<67:48:54, 4.08it/s, grad_norm=4.51, loss_final=1.62, loss_mean=0.888, loss_mean_cls=1.13, proj_loss=-0.396][2026-03-22 14:29:59] Step: 2769, Training Logs: loss_final: 1.650873, loss_mean: 0.897249, proj_loss: -0.391330, loss_mean_cls: 1.144954, grad_norm: 2.555569 +Steps: 0%| | 2770/1000000 [11:22<67:47:43, 4.09it/s, grad_norm=2.56, loss_final=1.65, loss_mean=0.897, loss_mean_cls=1.14, proj_loss=-0.391][2026-03-22 14:29:59] Step: 2770, Training Logs: loss_final: 1.664572, loss_mean: 0.881504, proj_loss: -0.388517, loss_mean_cls: 1.171586, grad_norm: 3.594521 +Steps: 0%| | 2771/1000000 [11:22<67:47:00, 4.09it/s, grad_norm=3.59, loss_final=1.66, loss_mean=0.882, loss_mean_cls=1.17, proj_loss=-0.389][2026-03-22 14:29:59] Step: 2771, Training Logs: loss_final: 1.522699, loss_mean: 0.901834, proj_loss: -0.406081, loss_mean_cls: 1.026946, grad_norm: 2.474200 +Steps: 0%| | 2772/1000000 [11:22<67:46:52, 4.09it/s, grad_norm=2.47, loss_final=1.52, loss_mean=0.902, loss_mean_cls=1.03, proj_loss=-0.406][2026-03-22 14:29:59] Step: 2772, Training Logs: loss_final: 1.519807, loss_mean: 0.895603, proj_loss: -0.400477, loss_mean_cls: 1.024680, grad_norm: 5.280921 +Steps: 0%| | 2773/1000000 [11:23<67:46:33, 4.09it/s, grad_norm=5.28, loss_final=1.52, loss_mean=0.896, loss_mean_cls=1.02, proj_loss=-0.4][2026-03-22 14:30:00] Step: 2773, Training Logs: loss_final: 1.551009, loss_mean: 0.899672, proj_loss: -0.406025, loss_mean_cls: 1.057363, grad_norm: 4.682408 +Steps: 0%| | 2774/1000000 [11:23<67:44:44, 4.09it/s, grad_norm=4.68, loss_final=1.55, loss_mean=0.9, loss_mean_cls=1.06, proj_loss=-0.406][2026-03-22 14:30:00] Step: 2774, Training Logs: loss_final: 1.635201, loss_mean: 0.871262, proj_loss: -0.389173, loss_mean_cls: 1.153113, grad_norm: 2.441766 +Steps: 0%| | 2775/1000000 [11:23<67:52:15, 4.08it/s, grad_norm=2.44, loss_final=1.64, loss_mean=0.871, loss_mean_cls=1.15, proj_loss=-0.389][2026-03-22 14:30:00] Step: 2775, Training Logs: loss_final: 1.707586, loss_mean: 0.885371, proj_loss: -0.385744, loss_mean_cls: 1.207959, grad_norm: 5.530255 +Steps: 0%| | 2776/1000000 [11:23<67:52:24, 4.08it/s, grad_norm=5.53, loss_final=1.71, loss_mean=0.885, loss_mean_cls=1.21, proj_loss=-0.386][2026-03-22 14:30:00] Step: 2776, Training Logs: loss_final: 1.498312, loss_mean: 0.909096, proj_loss: -0.408194, loss_mean_cls: 0.997410, grad_norm: 4.639260 +Steps: 0%| | 2777/1000000 [11:24<67:50:18, 4.08it/s, grad_norm=4.64, loss_final=1.5, loss_mean=0.909, loss_mean_cls=0.997, proj_loss=-0.408][2026-03-22 14:30:01] Step: 2777, Training Logs: loss_final: 1.350638, loss_mean: 0.925897, proj_loss: -0.416942, loss_mean_cls: 0.841683, grad_norm: 2.353380 +Steps: 0%| | 2778/1000000 [11:24<67:49:05, 4.08it/s, grad_norm=2.35, loss_final=1.35, loss_mean=0.926, loss_mean_cls=0.842, proj_loss=-0.417][2026-03-22 14:30:01] Step: 2778, Training Logs: loss_final: 1.643222, loss_mean: 0.903348, proj_loss: -0.393711, loss_mean_cls: 1.133585, grad_norm: 3.983236 +Steps: 0%| | 2779/1000000 [11:24<67:47:32, 4.09it/s, grad_norm=3.98, loss_final=1.64, loss_mean=0.903, loss_mean_cls=1.13, proj_loss=-0.394][2026-03-22 14:30:01] Step: 2779, Training Logs: loss_final: 1.581738, loss_mean: 
0.886741, proj_loss: -0.396114, loss_mean_cls: 1.091110, grad_norm: 1.801384 +Steps: 0%| | 2780/1000000 [11:24<67:47:57, 4.09it/s, grad_norm=1.8, loss_final=1.58, loss_mean=0.887, loss_mean_cls=1.09, proj_loss=-0.396][2026-03-22 14:30:01] Step: 2780, Training Logs: loss_final: 1.621933, loss_mean: 0.889602, proj_loss: -0.396411, loss_mean_cls: 1.128743, grad_norm: 3.079248 +Steps: 0%| | 2781/1000000 [11:25<67:47:57, 4.09it/s, grad_norm=3.08, loss_final=1.62, loss_mean=0.89, loss_mean_cls=1.13, proj_loss=-0.396][2026-03-22 14:30:02] Step: 2781, Training Logs: loss_final: 1.581578, loss_mean: 0.911598, proj_loss: -0.394598, loss_mean_cls: 1.064579, grad_norm: 3.622381 +Steps: 0%| | 2782/1000000 [11:25<68:04:20, 4.07it/s, grad_norm=3.62, loss_final=1.58, loss_mean=0.912, loss_mean_cls=1.06, proj_loss=-0.395][2026-03-22 14:30:02] Step: 2782, Training Logs: loss_final: 1.635189, loss_mean: 0.907433, proj_loss: -0.380211, loss_mean_cls: 1.107967, grad_norm: 3.845265 +Steps: 0%| | 2783/1000000 [11:25<67:58:49, 4.07it/s, grad_norm=3.85, loss_final=1.64, loss_mean=0.907, loss_mean_cls=1.11, proj_loss=-0.38][2026-03-22 14:30:02] Step: 2783, Training Logs: loss_final: 1.578666, loss_mean: 0.882676, proj_loss: -0.395846, loss_mean_cls: 1.091837, grad_norm: 3.227552 +Steps: 0%| | 2784/1000000 [11:25<67:54:49, 4.08it/s, grad_norm=3.23, loss_final=1.58, loss_mean=0.883, loss_mean_cls=1.09, proj_loss=-0.396][2026-03-22 14:30:02] Step: 2784, Training Logs: loss_final: 1.536763, loss_mean: 0.895470, proj_loss: -0.400370, loss_mean_cls: 1.041663, grad_norm: 2.829047 +Steps: 0%| | 2785/1000000 [11:26<67:50:07, 4.08it/s, grad_norm=2.83, loss_final=1.54, loss_mean=0.895, loss_mean_cls=1.04, proj_loss=-0.4][2026-03-22 14:30:02] Step: 2785, Training Logs: loss_final: 1.552542, loss_mean: 0.909912, proj_loss: -0.401100, loss_mean_cls: 1.043730, grad_norm: 3.754730 +Steps: 0%| | 2786/1000000 [11:26<67:48:17, 4.09it/s, grad_norm=3.75, loss_final=1.55, loss_mean=0.91, loss_mean_cls=1.04, proj_loss=-0.401][2026-03-22 14:30:03] Step: 2786, Training Logs: loss_final: 1.606927, loss_mean: 0.884021, proj_loss: -0.392041, loss_mean_cls: 1.114947, grad_norm: 1.978192 +Steps: 0%| | 2787/1000000 [11:26<67:45:55, 4.09it/s, grad_norm=1.98, loss_final=1.61, loss_mean=0.884, loss_mean_cls=1.11, proj_loss=-0.392][2026-03-22 14:30:03] Step: 2787, Training Logs: loss_final: 1.620050, loss_mean: 0.884526, proj_loss: -0.394938, loss_mean_cls: 1.130462, grad_norm: 2.106588 +Steps: 0%| | 2788/1000000 [11:26<67:45:00, 4.09it/s, grad_norm=2.11, loss_final=1.62, loss_mean=0.885, loss_mean_cls=1.13, proj_loss=-0.395][2026-03-22 14:30:03] Step: 2788, Training Logs: loss_final: 1.682011, loss_mean: 0.869703, proj_loss: -0.396627, loss_mean_cls: 1.208934, grad_norm: 3.511196 +Steps: 0%| | 2789/1000000 [11:27<67:46:09, 4.09it/s, grad_norm=3.51, loss_final=1.68, loss_mean=0.87, loss_mean_cls=1.21, proj_loss=-0.397][2026-03-22 14:30:03] Step: 2789, Training Logs: loss_final: 1.578398, loss_mean: 0.888333, proj_loss: -0.401271, loss_mean_cls: 1.091337, grad_norm: 1.487374 +Steps: 0%| | 2790/1000000 [11:27<67:45:33, 4.09it/s, grad_norm=1.49, loss_final=1.58, loss_mean=0.888, loss_mean_cls=1.09, proj_loss=-0.401][2026-03-22 14:30:04] Step: 2790, Training Logs: loss_final: 1.523649, loss_mean: 0.895346, proj_loss: -0.405030, loss_mean_cls: 1.033333, grad_norm: 2.656260 +Steps: 0%| | 2791/1000000 [11:27<67:44:10, 4.09it/s, grad_norm=2.66, loss_final=1.52, loss_mean=0.895, loss_mean_cls=1.03, proj_loss=-0.405][2026-03-22 14:30:04] Step: 2791, Training 
Logs: loss_final: 1.455330, loss_mean: 0.912817, proj_loss: -0.407039, loss_mean_cls: 0.949552, grad_norm: 3.125472 +Steps: 0%| | 2792/1000000 [11:27<67:44:19, 4.09it/s, grad_norm=3.13, loss_final=1.46, loss_mean=0.913, loss_mean_cls=0.95, proj_loss=-0.407][2026-03-22 14:30:04] Step: 2792, Training Logs: loss_final: 1.490144, loss_mean: 0.905120, proj_loss: -0.403449, loss_mean_cls: 0.988473, grad_norm: 1.320855 +Steps: 0%| | 2793/1000000 [11:27<67:45:37, 4.09it/s, grad_norm=1.32, loss_final=1.49, loss_mean=0.905, loss_mean_cls=0.988, proj_loss=-0.403][2026-03-22 14:30:04] Step: 2793, Training Logs: loss_final: 1.631366, loss_mean: 0.888223, proj_loss: -0.387443, loss_mean_cls: 1.130586, grad_norm: 1.424394 +Steps: 0%| | 2794/1000000 [11:28<67:45:01, 4.09it/s, grad_norm=1.42, loss_final=1.63, loss_mean=0.888, loss_mean_cls=1.13, proj_loss=-0.387][2026-03-22 14:30:05] Step: 2794, Training Logs: loss_final: 1.696991, loss_mean: 0.891636, proj_loss: -0.392207, loss_mean_cls: 1.197563, grad_norm: 7.919096 +Steps: 0%| | 2795/1000000 [11:28<67:44:20, 4.09it/s, grad_norm=7.92, loss_final=1.7, loss_mean=0.892, loss_mean_cls=1.2, proj_loss=-0.392][2026-03-22 14:30:05] Step: 2795, Training Logs: loss_final: 1.710801, loss_mean: 0.878835, proj_loss: -0.390750, loss_mean_cls: 1.222716, grad_norm: 4.355158 +Steps: 0%| | 2796/1000000 [11:28<67:44:27, 4.09it/s, grad_norm=4.36, loss_final=1.71, loss_mean=0.879, loss_mean_cls=1.22, proj_loss=-0.391][2026-03-22 14:30:05] Step: 2796, Training Logs: loss_final: 1.622210, loss_mean: 0.902197, proj_loss: -0.395834, loss_mean_cls: 1.115847, grad_norm: 2.085994 +Steps: 0%| | 2797/1000000 [11:28<67:43:45, 4.09it/s, grad_norm=2.09, loss_final=1.62, loss_mean=0.902, loss_mean_cls=1.12, proj_loss=-0.396][2026-03-22 14:30:05] Step: 2797, Training Logs: loss_final: 1.607694, loss_mean: 0.891823, proj_loss: -0.394140, loss_mean_cls: 1.110010, grad_norm: 3.346096 +Steps: 0%| | 2798/1000000 [11:29<67:44:29, 4.09it/s, grad_norm=3.35, loss_final=1.61, loss_mean=0.892, loss_mean_cls=1.11, proj_loss=-0.394][2026-03-22 14:30:06] Step: 2798, Training Logs: loss_final: 1.561845, loss_mean: 0.900960, proj_loss: -0.404116, loss_mean_cls: 1.065001, grad_norm: 4.721876 +Steps: 0%| | 2799/1000000 [11:29<67:43:31, 4.09it/s, grad_norm=4.72, loss_final=1.56, loss_mean=0.901, loss_mean_cls=1.07, proj_loss=-0.404][2026-03-22 14:30:06] Step: 2799, Training Logs: loss_final: 1.472054, loss_mean: 0.914100, proj_loss: -0.407156, loss_mean_cls: 0.965109, grad_norm: 2.844145 +Steps: 0%| | 2800/1000000 [11:29<67:46:35, 4.09it/s, grad_norm=2.84, loss_final=1.47, loss_mean=0.914, loss_mean_cls=0.965, proj_loss=-0.407][2026-03-22 14:30:06] Step: 2800, Training Logs: loss_final: 1.596550, loss_mean: 0.899352, proj_loss: -0.395495, loss_mean_cls: 1.092693, grad_norm: 3.125455 +Steps: 0%| | 2801/1000000 [11:29<67:44:16, 4.09it/s, grad_norm=3.13, loss_final=1.6, loss_mean=0.899, loss_mean_cls=1.09, proj_loss=-0.395][2026-03-22 14:30:06] Step: 2801, Training Logs: loss_final: 1.579950, loss_mean: 0.907204, proj_loss: -0.399193, loss_mean_cls: 1.071940, grad_norm: 2.978468 +Steps: 0%| | 2802/1000000 [11:30<67:44:07, 4.09it/s, grad_norm=2.98, loss_final=1.58, loss_mean=0.907, loss_mean_cls=1.07, proj_loss=-0.399][2026-03-22 14:30:07] Step: 2802, Training Logs: loss_final: 1.615431, loss_mean: 0.886783, proj_loss: -0.399916, loss_mean_cls: 1.128564, grad_norm: 2.954216 +Steps: 0%| | 2803/1000000 [11:30<67:43:11, 4.09it/s, grad_norm=2.95, loss_final=1.62, loss_mean=0.887, loss_mean_cls=1.13, 
proj_loss=-0.4][2026-03-22 14:30:07] Step: 2803, Training Logs: loss_final: 1.662155, loss_mean: 0.894510, proj_loss: -0.392861, loss_mean_cls: 1.160506, grad_norm: 5.184897 +Steps: 0%| | 2804/1000000 [11:30<67:43:37, 4.09it/s, grad_norm=5.18, loss_final=1.66, loss_mean=0.895, loss_mean_cls=1.16, proj_loss=-0.393][2026-03-22 14:30:07] Step: 2804, Training Logs: loss_final: 1.590859, loss_mean: 0.904238, proj_loss: -0.400992, loss_mean_cls: 1.087613, grad_norm: 5.208942 +Steps: 0%| | 2805/1000000 [11:30<67:42:44, 4.09it/s, grad_norm=5.21, loss_final=1.59, loss_mean=0.904, loss_mean_cls=1.09, proj_loss=-0.401][2026-03-22 14:30:07] Step: 2805, Training Logs: loss_final: 1.501381, loss_mean: 0.919097, proj_loss: -0.400810, loss_mean_cls: 0.983093, grad_norm: 3.563009 +Steps: 0%| | 2806/1000000 [11:31<67:43:50, 4.09it/s, grad_norm=3.56, loss_final=1.5, loss_mean=0.919, loss_mean_cls=0.983, proj_loss=-0.401][2026-03-22 14:30:08] Step: 2806, Training Logs: loss_final: 1.618113, loss_mean: 0.888790, proj_loss: -0.397985, loss_mean_cls: 1.127308, grad_norm: 4.756448 +Steps: 0%| | 2807/1000000 [11:31<67:43:18, 4.09it/s, grad_norm=4.76, loss_final=1.62, loss_mean=0.889, loss_mean_cls=1.13, proj_loss=-0.398][2026-03-22 14:30:08] Step: 2807, Training Logs: loss_final: 1.519580, loss_mean: 0.901150, proj_loss: -0.402808, loss_mean_cls: 1.021238, grad_norm: 1.400465 +Steps: 0%| | 2808/1000000 [11:31<67:43:40, 4.09it/s, grad_norm=1.4, loss_final=1.52, loss_mean=0.901, loss_mean_cls=1.02, proj_loss=-0.403][2026-03-22 14:30:08] Step: 2808, Training Logs: loss_final: 1.501620, loss_mean: 0.896956, proj_loss: -0.408357, loss_mean_cls: 1.013021, grad_norm: 1.424583 +Steps: 0%| | 2809/1000000 [11:31<67:43:39, 4.09it/s, grad_norm=1.42, loss_final=1.5, loss_mean=0.897, loss_mean_cls=1.01, proj_loss=-0.408][2026-03-22 14:30:08] Step: 2809, Training Logs: loss_final: 1.494685, loss_mean: 0.902234, proj_loss: -0.400834, loss_mean_cls: 0.993285, grad_norm: 1.146047 +Steps: 0%| | 2810/1000000 [11:32<68:20:02, 4.05it/s, grad_norm=1.15, loss_final=1.49, loss_mean=0.902, loss_mean_cls=0.993, proj_loss=-0.401][2026-03-22 14:30:09] Step: 2810, Training Logs: loss_final: 1.603705, loss_mean: 0.888786, proj_loss: -0.393450, loss_mean_cls: 1.108369, grad_norm: 2.153435 +Steps: 0%| | 2811/1000000 [11:32<68:09:01, 4.06it/s, grad_norm=2.15, loss_final=1.6, loss_mean=0.889, loss_mean_cls=1.11, proj_loss=-0.393][2026-03-22 14:30:09] Step: 2811, Training Logs: loss_final: 1.638694, loss_mean: 0.893053, proj_loss: -0.392950, loss_mean_cls: 1.138591, grad_norm: 1.614696 +Steps: 0%| | 2812/1000000 [11:32<68:00:56, 4.07it/s, grad_norm=1.61, loss_final=1.64, loss_mean=0.893, loss_mean_cls=1.14, proj_loss=-0.393][2026-03-22 14:30:09] Step: 2812, Training Logs: loss_final: 1.623474, loss_mean: 0.878696, proj_loss: -0.391865, loss_mean_cls: 1.136643, grad_norm: 2.083499 +Steps: 0%| | 2813/1000000 [11:32<67:55:17, 4.08it/s, grad_norm=2.08, loss_final=1.62, loss_mean=0.879, loss_mean_cls=1.14, proj_loss=-0.392][2026-03-22 14:30:09] Step: 2813, Training Logs: loss_final: 1.697981, loss_mean: 0.881268, proj_loss: -0.394087, loss_mean_cls: 1.210800, grad_norm: 4.403111 +Steps: 0%| | 2814/1000000 [11:33<67:50:36, 4.08it/s, grad_norm=4.4, loss_final=1.7, loss_mean=0.881, loss_mean_cls=1.21, proj_loss=-0.394][2026-03-22 14:30:10] Step: 2814, Training Logs: loss_final: 1.547240, loss_mean: 0.889486, proj_loss: -0.399637, loss_mean_cls: 1.057391, grad_norm: 3.943078 +Steps: 0%| | 2815/1000000 [11:33<67:47:05, 4.09it/s, grad_norm=3.94, 
[Training-log excerpt condensed for readability: steps 2815–3063 of 1,000,000 (~4.09 it/s, roughly 68 h estimated to completion) from a SiT + REG run. Each step appends one record of the form `[2026-03-22 14:30:10] Step: 2815, Training Logs: loss_final: 1.622744, loss_mean: 0.895862, proj_loss: -0.388756, loss_mean_cls: 1.115638, grad_norm: 2.605867` to a tqdm progress bar. Over this window, `loss_final` fluctuates within roughly 1.41–1.83, `loss_mean` within 0.86–0.94, `loss_mean_cls` within 0.91–1.34, `proj_loss` within −0.41 to −0.38, and `grad_norm` within 0.95–6.8; to within rounding, every record satisfies `loss_final = loss_mean + loss_mean_cls + proj_loss`.]
1.053032, grad_norm: 3.978237 +Steps: 0%| | 3064/1000000 [12:34<67:51:13, 4.08it/s, grad_norm=3.98, loss_final=1.55, loss_mean=0.898, loss_mean_cls=1.05, proj_loss=-0.399][2026-03-22 14:31:11] Step: 3064, Training Logs: loss_final: 1.506764, loss_mean: 0.919852, proj_loss: -0.398920, loss_mean_cls: 0.985831, grad_norm: 2.223338 +Steps: 0%| | 3065/1000000 [12:34<67:47:52, 4.08it/s, grad_norm=2.22, loss_final=1.51, loss_mean=0.92, loss_mean_cls=0.986, proj_loss=-0.399][2026-03-22 14:31:11] Step: 3065, Training Logs: loss_final: 1.548889, loss_mean: 0.909508, proj_loss: -0.403101, loss_mean_cls: 1.042482, grad_norm: 1.615456 +Steps: 0%| | 3066/1000000 [12:34<67:45:56, 4.09it/s, grad_norm=1.62, loss_final=1.55, loss_mean=0.91, loss_mean_cls=1.04, proj_loss=-0.403][2026-03-22 14:31:11] Step: 3066, Training Logs: loss_final: 1.619789, loss_mean: 0.880268, proj_loss: -0.393985, loss_mean_cls: 1.133507, grad_norm: 2.802741 +Steps: 0%| | 3067/1000000 [12:35<67:43:32, 4.09it/s, grad_norm=2.8, loss_final=1.62, loss_mean=0.88, loss_mean_cls=1.13, proj_loss=-0.394][2026-03-22 14:31:12] Step: 3067, Training Logs: loss_final: 1.522702, loss_mean: 0.922553, proj_loss: -0.402400, loss_mean_cls: 1.002548, grad_norm: 3.554003 +Steps: 0%| | 3068/1000000 [12:35<67:43:54, 4.09it/s, grad_norm=3.55, loss_final=1.52, loss_mean=0.923, loss_mean_cls=1, proj_loss=-0.402][2026-03-22 14:31:12] Step: 3068, Training Logs: loss_final: 1.661781, loss_mean: 0.891681, proj_loss: -0.382023, loss_mean_cls: 1.152123, grad_norm: 1.748165 +Steps: 0%| | 3069/1000000 [12:35<67:42:14, 4.09it/s, grad_norm=1.75, loss_final=1.66, loss_mean=0.892, loss_mean_cls=1.15, proj_loss=-0.382][2026-03-22 14:31:12] Step: 3069, Training Logs: loss_final: 1.514505, loss_mean: 0.893927, proj_loss: -0.402554, loss_mean_cls: 1.023132, grad_norm: 3.972091 +Steps: 0%| | 3070/1000000 [12:35<67:41:51, 4.09it/s, grad_norm=3.97, loss_final=1.51, loss_mean=0.894, loss_mean_cls=1.02, proj_loss=-0.403][2026-03-22 14:31:12] Step: 3070, Training Logs: loss_final: 1.664686, loss_mean: 0.887529, proj_loss: -0.395169, loss_mean_cls: 1.172326, grad_norm: 3.564639 +Steps: 0%| | 3071/1000000 [12:36<67:42:54, 4.09it/s, grad_norm=3.56, loss_final=1.66, loss_mean=0.888, loss_mean_cls=1.17, proj_loss=-0.395][2026-03-22 14:31:12] Step: 3071, Training Logs: loss_final: 1.572833, loss_mean: 0.885909, proj_loss: -0.409156, loss_mean_cls: 1.096080, grad_norm: 4.628316 +Steps: 0%| | 3072/1000000 [12:36<67:42:24, 4.09it/s, grad_norm=4.63, loss_final=1.57, loss_mean=0.886, loss_mean_cls=1.1, proj_loss=-0.409][2026-03-22 14:31:13] Step: 3072, Training Logs: loss_final: 1.654968, loss_mean: 0.878388, proj_loss: -0.386367, loss_mean_cls: 1.162947, grad_norm: 2.538913 +Steps: 0%| | 3073/1000000 [12:36<67:43:34, 4.09it/s, grad_norm=2.54, loss_final=1.65, loss_mean=0.878, loss_mean_cls=1.16, proj_loss=-0.386][2026-03-22 14:31:13] Step: 3073, Training Logs: loss_final: 1.476434, loss_mean: 0.916295, proj_loss: -0.408064, loss_mean_cls: 0.968204, grad_norm: 3.550389 +Steps: 0%| | 3074/1000000 [12:36<67:43:59, 4.09it/s, grad_norm=3.55, loss_final=1.48, loss_mean=0.916, loss_mean_cls=0.968, proj_loss=-0.408][2026-03-22 14:31:13] Step: 3074, Training Logs: loss_final: 1.573204, loss_mean: 0.904567, proj_loss: -0.394872, loss_mean_cls: 1.063509, grad_norm: 2.918546 +Steps: 0%| | 3075/1000000 [12:37<67:42:48, 4.09it/s, grad_norm=2.92, loss_final=1.57, loss_mean=0.905, loss_mean_cls=1.06, proj_loss=-0.395][2026-03-22 14:31:13] Step: 3075, Training Logs: loss_final: 1.581881, loss_mean: 
0.886146, proj_loss: -0.395137, loss_mean_cls: 1.090872, grad_norm: 1.798242 +Steps: 0%| | 3076/1000000 [12:37<67:42:16, 4.09it/s, grad_norm=1.8, loss_final=1.58, loss_mean=0.886, loss_mean_cls=1.09, proj_loss=-0.395][2026-03-22 14:31:14] Step: 3076, Training Logs: loss_final: 1.528240, loss_mean: 0.907104, proj_loss: -0.403910, loss_mean_cls: 1.025046, grad_norm: 2.915305 +Steps: 0%| | 3077/1000000 [12:37<67:41:58, 4.09it/s, grad_norm=2.92, loss_final=1.53, loss_mean=0.907, loss_mean_cls=1.03, proj_loss=-0.404][2026-03-22 14:31:14] Step: 3077, Training Logs: loss_final: 1.493506, loss_mean: 0.892215, proj_loss: -0.407111, loss_mean_cls: 1.008403, grad_norm: 1.629735 +Steps: 0%| | 3078/1000000 [12:37<67:40:42, 4.09it/s, grad_norm=1.63, loss_final=1.49, loss_mean=0.892, loss_mean_cls=1.01, proj_loss=-0.407][2026-03-22 14:31:14] Step: 3078, Training Logs: loss_final: 1.692800, loss_mean: 0.881908, proj_loss: -0.390699, loss_mean_cls: 1.201592, grad_norm: 2.590729 +Steps: 0%| | 3079/1000000 [12:37<67:39:39, 4.09it/s, grad_norm=2.59, loss_final=1.69, loss_mean=0.882, loss_mean_cls=1.2, proj_loss=-0.391][2026-03-22 14:31:14] Step: 3079, Training Logs: loss_final: 1.685677, loss_mean: 0.854206, proj_loss: -0.391308, loss_mean_cls: 1.222779, grad_norm: 4.077029 +Steps: 0%| | 3080/1000000 [12:38<67:39:59, 4.09it/s, grad_norm=4.08, loss_final=1.69, loss_mean=0.854, loss_mean_cls=1.22, proj_loss=-0.391][2026-03-22 14:31:15] Step: 3080, Training Logs: loss_final: 1.633669, loss_mean: 0.900235, proj_loss: -0.389422, loss_mean_cls: 1.122855, grad_norm: 4.524435 +Steps: 0%| | 3081/1000000 [12:38<67:39:32, 4.09it/s, grad_norm=4.52, loss_final=1.63, loss_mean=0.9, loss_mean_cls=1.12, proj_loss=-0.389][2026-03-22 14:31:15] Step: 3081, Training Logs: loss_final: 1.665377, loss_mean: 0.874015, proj_loss: -0.391452, loss_mean_cls: 1.182814, grad_norm: 4.090938 +Steps: 0%| | 3082/1000000 [12:38<67:38:35, 4.09it/s, grad_norm=4.09, loss_final=1.67, loss_mean=0.874, loss_mean_cls=1.18, proj_loss=-0.391][2026-03-22 14:31:15] Step: 3082, Training Logs: loss_final: 1.618487, loss_mean: 0.884217, proj_loss: -0.396658, loss_mean_cls: 1.130929, grad_norm: 1.900638 +Steps: 0%| | 3083/1000000 [12:38<67:38:23, 4.09it/s, grad_norm=1.9, loss_final=1.62, loss_mean=0.884, loss_mean_cls=1.13, proj_loss=-0.397][2026-03-22 14:31:15] Step: 3083, Training Logs: loss_final: 1.594508, loss_mean: 0.894717, proj_loss: -0.402983, loss_mean_cls: 1.102774, grad_norm: 5.723399 +Steps: 0%| | 3084/1000000 [12:39<67:39:27, 4.09it/s, grad_norm=5.72, loss_final=1.59, loss_mean=0.895, loss_mean_cls=1.1, proj_loss=-0.403][2026-03-22 14:31:16] Step: 3084, Training Logs: loss_final: 1.576474, loss_mean: 0.877427, proj_loss: -0.399244, loss_mean_cls: 1.098291, grad_norm: 3.725321 +Steps: 0%| | 3085/1000000 [12:39<67:39:53, 4.09it/s, grad_norm=3.73, loss_final=1.58, loss_mean=0.877, loss_mean_cls=1.1, proj_loss=-0.399][2026-03-22 14:31:16] Step: 3085, Training Logs: loss_final: 1.529237, loss_mean: 0.902502, proj_loss: -0.406761, loss_mean_cls: 1.033495, grad_norm: 4.395347 +Steps: 0%| | 3086/1000000 [12:39<67:40:55, 4.09it/s, grad_norm=4.4, loss_final=1.53, loss_mean=0.903, loss_mean_cls=1.03, proj_loss=-0.407][2026-03-22 14:31:16] Step: 3086, Training Logs: loss_final: 1.634997, loss_mean: 0.875624, proj_loss: -0.393660, loss_mean_cls: 1.153032, grad_norm: 1.583541 +Steps: 0%| | 3087/1000000 [12:39<67:44:23, 4.09it/s, grad_norm=1.58, loss_final=1.63, loss_mean=0.876, loss_mean_cls=1.15, proj_loss=-0.394][2026-03-22 14:31:16] Step: 3087, Training 
Logs: loss_final: 1.519686, loss_mean: 0.893659, proj_loss: -0.407493, loss_mean_cls: 1.033520, grad_norm: 3.248399 +Steps: 0%| | 3087/1000000 [12:39<67:44:23, 4.09it/s, grad_norm=3.25, loss_final=1.52, loss_mean=0.894, loss_mean_cls=1.03, proj_loss=-0.407] diff --git a/back/wandb/run-20260322_141833-vm0y8t9t/files/requirements.txt b/back/wandb/run-20260322_141833-vm0y8t9t/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0235910d0d99b7dee69b9a7f2f90012c8b711cc --- /dev/null +++ b/back/wandb/run-20260322_141833-vm0y8t9t/files/requirements.txt @@ -0,0 +1,168 @@ +dill==0.3.8 +mkl-service==2.4.0 +mpmath==1.3.0 +typing_extensions==4.12.2 +urllib3==2.3.0 +torch==2.5.1 +ptyprocess==0.7.0 +traitlets==5.14.3 +pyasn1==0.6.1 +opencv-python-headless==4.12.0.88 +nest-asyncio==1.6.0 +kiwisolver==1.4.8 +click==8.2.1 +fire==0.7.1 +diffusers==0.35.1 +accelerate==1.7.0 +ipykernel==6.29.5 +peft==0.17.1 +attrs==24.3.0 +six==1.17.0 +numpy==2.0.1 +yarl==1.18.0 +huggingface_hub==0.34.4 +Bottleneck==1.4.2 +numexpr==2.11.0 +dataclasses==0.6 +typing-inspection==0.4.1 +safetensors==0.5.3 +pyparsing==3.2.3 +psutil==7.0.0 +imageio==2.37.0 +debugpy==1.8.14 +cycler==0.12.1 +pyasn1_modules==0.4.2 +matplotlib-inline==0.1.7 +matplotlib==3.10.3 +jedi==0.19.2 +tokenizers==0.21.2 +seaborn==0.13.2 +timm==1.0.15 +aiohappyeyeballs==2.6.1 +hf-xet==1.1.8 +multidict==6.1.0 +tqdm==4.67.1 +wheel==0.45.1 +simsimd==6.5.1 +sentencepiece==0.2.1 +grpcio==1.74.0 +asttokens==3.0.0 +absl-py==2.3.1 +stack-data==0.6.3 +pandas==2.3.0 +importlib_metadata==8.7.0 +pytorch-image-generation-metrics==0.6.1 +frozenlist==1.5.0 +MarkupSafe==3.0.2 +setuptools==78.1.1 +multiprocess==0.70.15 +pip==25.1 +requests==2.32.3 +mkl_random==1.2.8 +tensorboard-plugin-wit==1.8.1 +ExifRead-nocycle==3.0.1 +webdataset==0.2.111 +threadpoolctl==3.6.0 +pyarrow==21.0.0 +executing==2.2.0 +decorator==5.2.1 +contourpy==1.3.2 +annotated-types==0.7.0 +scikit-learn==1.7.1 +jupyter_client==8.6.3 +albumentations==1.4.24 +wandb==0.25.0 +certifi==2025.8.3 +idna==3.7 +xxhash==3.5.0 +Jinja2==3.1.6 +python-dateutil==2.9.0.post0 +aiosignal==1.4.0 +triton==3.1.0 +torchvision==0.20.1 +stringzilla==3.12.6 +pure_eval==0.2.3 +braceexpand==0.1.7 +zipp==3.22.0 +oauthlib==3.3.1 +Markdown==3.8.2 +fsspec==2025.3.0 +fonttools==4.58.2 +comm==0.2.2 +ipython==9.3.0 +img2dataset==1.47.0 +networkx==3.4.2 +PySocks==1.7.1 +tzdata==2025.2 +smmap==5.0.2 +mkl_fft==1.3.11 +sentry-sdk==2.29.1 +Pygments==2.19.1 +pexpect==4.9.0 +ftfy==6.3.1 +einops==0.8.1 +requests-oauthlib==2.0.0 +gitdb==4.0.12 +albucore==0.0.23 +torchdiffeq==0.2.5 +GitPython==3.1.44 +bitsandbytes==0.47.0 +pytorch-fid==0.3.0 +clean-fid==0.1.35 +pytorch-gan-metrics==0.5.4 +Brotli==1.0.9 +charset-normalizer==3.3.2 +gmpy2==2.2.1 +pillow==11.1.0 +PyYAML==6.0.2 +tornado==6.5.1 +termcolor==3.1.0 +setproctitle==1.3.6 +scipy==1.15.3 +regex==2024.11.6 +protobuf==6.31.1 +platformdirs==4.3.8 +joblib==1.5.1 +cachetools==4.2.4 +ipython_pygments_lexers==1.1.1 +google-auth==1.35.0 +transformers==4.53.2 +torch-fidelity==0.3.0 +tensorboard==2.4.0 +filelock==3.17.0 +packaging==25.0 +propcache==0.3.1 +pytz==2025.2 +aiohttp==3.11.10 +wcwidth==0.2.13 +clip==0.2.0 +Werkzeug==3.1.3 +tensorboard-data-server==0.6.1 +sympy==1.13.1 +pyzmq==26.4.0 +pydantic_core==2.33.2 +prompt_toolkit==3.0.51 +parso==0.8.4 +docker-pycreds==0.4.0 +rsa==4.9.1 +pydantic==2.11.5 +jupyter_core==5.8.1 +google-auth-oauthlib==0.4.6 +datasets==4.0.0 +torch-tb-profiler==0.4.3 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 
+jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +more-itertools==10.3.0 +packaging==24.2 +platformdirs==4.2.2 +typeguard==4.3.0 +inflect==7.3.1 +jaraco.text==3.12.1 +tomli==2.0.1 +typing_extensions==4.12.2 +wheel==0.45.1 +zipp==3.19.2 diff --git a/back/wandb/run-20260322_141833-vm0y8t9t/files/wandb-metadata.json b/back/wandb/run-20260322_141833-vm0y8t9t/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b147231dfab96084d06832c34dbc80d477a89e72 --- /dev/null +++ b/back/wandb/run-20260322_141833-vm0y8t9t/files/wandb-metadata.json @@ -0,0 +1,101 @@ +{ + "os": "Linux-5.15.0-94-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.9", + "startedAt": "2026-03-22T06:18:33.208941Z", + "args": [ + "--report-to", + "wandb", + "--allow-tf32", + "--mixed-precision", + "bf16", + "--seed", + "0", + "--path-type", + "linear", + "--prediction", + "v", + "--weighting", + "uniform", + "--model", + "SiT-XL/2", + "--enc-type", + "dinov2-vit-b", + "--encoder-depth", + "8", + "--proj-coeff", + "0.5", + "--output-dir", + "exps", + "--exp-name", + "jsflow-experiment", + "--batch-size", + "256", + "--data-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256", + "--semantic-features-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0", + "--learning-rate", + "0.00005", + "--t-c", + "0.5", + "--cls", + "0.2", + "--ot-cls" + ], + "program": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "git": { + "remote": "https://github.com/Martinser/REG.git", + "commit": "021ea2e50c38c5803bd9afff16316958a01fbd1d" + }, + "email": "2365972933@qq.com", + "root": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG", + "host": "24c964746905d416ce09d045f9a06f23-taskrole1-0", + "executable": "/gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 4, + "disk": { + "/": { + "total": "3838880616448", + "used": "357556703232" + } + }, + "memory": { + "total": "2164115296256" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-a09f2421-99e6-a72e-63bd-fd7452510758" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e" + } + ], + "cudaVersion": "13.0", + "writerId": "gklxguwapb72cxij4696gj37bh1rbthi" +} \ No newline at end of file diff --git a/back/wandb/run-20260322_141833-vm0y8t9t/logs/debug-internal.log b/back/wandb/run-20260322_141833-vm0y8t9t/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ebe8da1c8c9c5daede2abb54d343ec141b1c89d0 --- /dev/null +++ b/back/wandb/run-20260322_141833-vm0y8t9t/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-03-22T14:18:33.472940651+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"} 
+{"time":"2026-03-22T14:18:35.380852704+08:00","level":"INFO","msg":"stream: created new stream","id":"vm0y8t9t"} +{"time":"2026-03-22T14:18:35.381056887+08:00","level":"INFO","msg":"handler: started","stream_id":"vm0y8t9t"} +{"time":"2026-03-22T14:18:35.382108345+08:00","level":"INFO","msg":"writer: started","stream_id":"vm0y8t9t"} +{"time":"2026-03-22T14:18:35.382119604+08:00","level":"INFO","msg":"stream: started","id":"vm0y8t9t"} +{"time":"2026-03-22T14:18:35.382161533+08:00","level":"INFO","msg":"sender: started","stream_id":"vm0y8t9t"} diff --git a/back/wandb/run-20260322_141833-vm0y8t9t/logs/debug.log b/back/wandb/run-20260322_141833-vm0y8t9t/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..d8feb36e3cad386547aa41310b66b1310a724dec --- /dev/null +++ b/back/wandb/run-20260322_141833-vm0y8t9t/logs/debug.log @@ -0,0 +1,20 @@ +2026-03-22 14:18:33,237 INFO MainThread:318585 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0 +2026-03-22 14:18:33,237 INFO MainThread:318585 [wandb_setup.py:_flush():81] Configure stats pid to 318585 +2026-03-22 14:18:33,237 INFO MainThread:318585 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-22 14:18:33,237 INFO MainThread:318585 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_141833-vm0y8t9t/logs/debug.log +2026-03-22 14:18:33,237 INFO MainThread:318585 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_141833-vm0y8t9t/logs/debug-internal.log +2026-03-22 14:18:33,237 INFO MainThread:318585 [wandb_init.py:init():844] calling init triggers +2026-03-22 14:18:33,237 INFO MainThread:318585 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-03-22 14:18:33,237 INFO MainThread:318585 [wandb_init.py:init():892] starting backend +2026-03-22 14:18:33,460 INFO MainThread:318585 [wandb_init.py:init():895] sending inform_init request +2026-03-22 14:18:33,470 INFO MainThread:318585 [wandb_init.py:init():903] backend started and connected +2026-03-22 14:18:33,472 INFO MainThread:318585 [wandb_init.py:init():973] updated telemetry +2026-03-22 14:18:33,485 INFO MainThread:318585 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-03-22 14:18:36,829 INFO MainThread:318585 [wandb_init.py:init():1042] starting run threads in backend +2026-03-22 14:18:36,920 INFO MainThread:318585 [wandb_run.py:_console_start():2524] atexit reg +2026-03-22 14:18:36,920 INFO MainThread:318585 [wandb_run.py:_redirect():2373] redirect: wrap_raw +2026-03-22 14:18:36,921 INFO MainThread:318585 [wandb_run.py:_redirect():2442] Wrapping output streams. +2026-03-22 14:18:36,921 INFO MainThread:318585 [wandb_run.py:_redirect():2465] Redirects installed. 
+2026-03-22 14:18:36,924 INFO MainThread:318585 [wandb_init.py:init():1082] run started, returning control to user process +2026-03-22 14:18:36,924 INFO MainThread:318585 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 10000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.2, 't_c': 0.5, 'ot_cls': True} diff --git a/back/wandb/run-20260322_150022-yhxc5cgu/files/config.yaml b/back/wandb/run-20260322_150022-yhxc5cgu/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18d81e248f09de45e3b302e45f73d8113362db04 --- /dev/null +++ b/back/wandb/run-20260322_150022-yhxc5cgu/files/config.yaml @@ -0,0 +1,202 @@ +_wandb: + value: + cli_version: 0.25.0 + e: + ucanic8s891x6sl28vnbha78lzoecw66: + args: + - --report-to + - wandb + - --allow-tf32 + - --mixed-precision + - bf16 + - --seed + - "0" + - --path-type + - linear + - --prediction + - v + - --weighting + - uniform + - --model + - SiT-XL/2 + - --enc-type + - dinov2-vit-b + - --encoder-depth + - "8" + - --proj-coeff + - "0.5" + - --output-dir + - exps + - --exp-name + - jsflow-experiment + - --batch-size + - "256" + - --data-dir + - /gemini/space/zhaozy/dataset/Imagenet/imagenet_256 + - --semantic-features-dir + - /gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0 + - --learning-rate + - "0.00005" + - --t-c + - "0.5" + - --cls + - "0.2" + - --ot-cls + codePath: train.py + codePathLocal: train.py + cpu_count: 96 + cpu_count_logical: 192 + cudaVersion: "13.0" + disk: + /: + total: "3838880616448" + used: "357557354496" + email: 2365972933@qq.com + executable: /gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python + git: + commit: 021ea2e50c38c5803bd9afff16316958a01fbd1d + remote: https://github.com/Martinser/REG.git + gpu: NVIDIA H100 80GB HBM3 + gpu_count: 4 + gpu_nvidia: + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-a09f2421-99e6-a72e-63bd-fd7452510758 + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e + host: 24c964746905d416ce09d045f9a06f23-taskrole1-0 + memory: + total: "2164115296256" + os: Linux-5.15.0-94-generic-x86_64-with-glibc2.35 + program: 
/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py + python: CPython 3.12.9 + root: /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG + startedAt: "2026-03-22T07:00:22.092510Z" + writerId: ucanic8s891x6sl28vnbha78lzoecw66 + m: [] + python_version: 3.12.9 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 53 + - 63 + - 71 + - 83 + - 98 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 53 + - 63 + - 71 + - 83 + - 98 + "3": + - 13 + "4": 3.12.9 + "5": 0.25.0 + "6": 4.53.2 + "12": 0.25.0 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0 +allow_tf32: + value: true +batch_size: + value: 256 +cfg_prob: + value: 0.1 +checkpointing_steps: + value: 10000 +cls: + value: 0.2 +data_dir: + value: /gemini/space/zhaozy/dataset/Imagenet/imagenet_256 +enc_type: + value: dinov2-vit-b +encoder_depth: + value: 8 +epochs: + value: 1400 +exp_name: + value: jsflow-experiment +fused_attn: + value: true +gradient_accumulation_steps: + value: 1 +learning_rate: + value: 5e-05 +legacy: + value: false +logging_dir: + value: logs +max_grad_norm: + value: 1 +max_train_steps: + value: 1000000 +mixed_precision: + value: bf16 +model: + value: SiT-XL/2 +num_classes: + value: 1000 +num_workers: + value: 4 +ops_head: + value: 16 +ot_cls: + value: true +output_dir: + value: exps +path_type: + value: linear +prediction: + value: v +proj_coeff: + value: 0.5 +qk_norm: + value: false +report_to: + value: wandb +resolution: + value: 256 +resume_step: + value: 0 +sampling_steps: + value: 2000 +seed: + value: 0 +semantic_features_dir: + value: /gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0 +t_c: + value: 0.5 +weighting: + value: uniform diff --git a/back/wandb/run-20260322_150022-yhxc5cgu/files/output.log b/back/wandb/run-20260322_150022-yhxc5cgu/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..113b7e8ca551a921fc4a58b90c37a5907ea8801d --- /dev/null +++ b/back/wandb/run-20260322_150022-yhxc5cgu/files/output.log @@ -0,0 +1,19 @@ +Steps: 0%| | 1/1000000 [00:02<652:30:07, 2.35s/it][2026-03-22 15:00:28] Generating EMA samples for evaluation (t=1→0 and t=0.5)... 
+Traceback (most recent call last):
+  File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 628, in <module>
+    main(args)
+  File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 425, in main
+    cls_init = torch.randn(n_samples, base_model.semantic_channels, device=device)
+               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/gemini/space/zhaozy/guzhenyu/envs/envs/SiT/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1931, in __getattr__
+    raise AttributeError(
+AttributeError: 'SiT' object has no attribute 'semantic_channels'
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 628, in <module>
+[rank0]:     main(args)
+[rank0]:   File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 425, in main
+[rank0]:     cls_init = torch.randn(n_samples, base_model.semantic_channels, device=device)
+[rank0]:                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank0]:   File "/gemini/space/zhaozy/guzhenyu/envs/envs/SiT/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1931, in __getattr__
+[rank0]:     raise AttributeError(
+[rank0]: AttributeError: 'SiT' object has no attribute 'semantic_channels'
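The AttributeError above is raised because train.py reads `base_model.semantic_channels` at line 425 while the SiT model in this checkout never defines that attribute. A minimal sketch of one defensive rewrite, assuming a 768-dim class token (the dinov2-vit-b width); the fallback value and the helper name `sample_cls_init` are illustrative, not taken from the repository:

```python
import torch

# Assumed fallback width for the class token; dinov2-vit-b produces 768-dim
# [CLS] embeddings, but the repository may use a different projection size.
SEMANTIC_CHANNELS_DEFAULT = 768

def sample_cls_init(base_model: torch.nn.Module, n_samples: int, device) -> torch.Tensor:
    # getattr() sidesteps nn.Module.__getattr__ raising AttributeError when
    # `semantic_channels` was never registered on the SiT instance.
    channels = getattr(base_model, "semantic_channels", SEMANTIC_CHANNELS_DEFAULT)
    return torch.randn(n_samples, channels, device=device)
```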
diff --git a/back/wandb/run-20260322_150022-yhxc5cgu/files/requirements.txt b/back/wandb/run-20260322_150022-yhxc5cgu/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0235910d0d99b7dee69b9a7f2f90012c8b711cc --- /dev/null +++ b/back/wandb/run-20260322_150022-yhxc5cgu/files/requirements.txt @@ -0,0 +1,168 @@ +[168 pinned packages, byte-identical to back/wandb/run-20260322_141833-vm0y8t9t/files/requirements.txt above (same blob d0235910); duplicate list omitted]
diff --git a/back/wandb/run-20260322_150022-yhxc5cgu/files/wandb-metadata.json b/back/wandb/run-20260322_150022-yhxc5cgu/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..d4272fad64662ee8f7fb05edd4772556c94827a4 --- /dev/null +++ b/back/wandb/run-20260322_150022-yhxc5cgu/files/wandb-metadata.json @@ -0,0 +1,101 @@ +{ + "os": "Linux-5.15.0-94-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.9", + "startedAt": "2026-03-22T07:00:22.092510Z", + "args": [ + "--report-to", + "wandb", + "--allow-tf32", + "--mixed-precision", + "bf16", + "--seed", + "0", + "--path-type", + "linear", + "--prediction", + "v", + "--weighting", + "uniform", + "--model", + "SiT-XL/2", + "--enc-type", + "dinov2-vit-b", + "--encoder-depth", + "8", + "--proj-coeff", + "0.5", + "--output-dir", + "exps", + "--exp-name", + "jsflow-experiment", + "--batch-size", + "256", + "--data-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256", + "--semantic-features-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0", + "--learning-rate", + "0.00005", + "--t-c", + "0.5", + "--cls", + "0.2", + "--ot-cls" + ], + "program": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "git": { + "remote": "https://github.com/Martinser/REG.git", + "commit": "021ea2e50c38c5803bd9afff16316958a01fbd1d" + }, + "email": "2365972933@qq.com", + "root": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG", + "host": "24c964746905d416ce09d045f9a06f23-taskrole1-0", + "executable": "/gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 4, + "disk": { + "/": { + "total": "3838880616448", + "used": "357557354496" + } + }, + "memory": { + "total": "2164115296256" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc" + }, + { + "name": "NVIDIA H100
80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-a09f2421-99e6-a72e-63bd-fd7452510758" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e" + } + ], + "cudaVersion": "13.0", + "writerId": "ucanic8s891x6sl28vnbha78lzoecw66" +} \ No newline at end of file diff --git a/back/wandb/run-20260322_150022-yhxc5cgu/files/wandb-summary.json b/back/wandb/run-20260322_150022-yhxc5cgu/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..8afb95f49483c85658a334253ad61c5e4b5851ef --- /dev/null +++ b/back/wandb/run-20260322_150022-yhxc5cgu/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":2},"_runtime":2} \ No newline at end of file diff --git a/back/wandb/run-20260322_150022-yhxc5cgu/logs/debug-internal.log b/back/wandb/run-20260322_150022-yhxc5cgu/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..f70fe84918f7f6562832b960fbfbaefa3e2a6c88 --- /dev/null +++ b/back/wandb/run-20260322_150022-yhxc5cgu/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2026-03-22T15:00:22.432399726+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"} +{"time":"2026-03-22T15:00:25.799578446+08:00","level":"INFO","msg":"stream: created new stream","id":"yhxc5cgu"} +{"time":"2026-03-22T15:00:25.799734466+08:00","level":"INFO","msg":"handler: started","stream_id":"yhxc5cgu"} +{"time":"2026-03-22T15:00:25.80075778+08:00","level":"INFO","msg":"stream: started","id":"yhxc5cgu"} +{"time":"2026-03-22T15:00:25.800786229+08:00","level":"INFO","msg":"writer: started","stream_id":"yhxc5cgu"} +{"time":"2026-03-22T15:00:25.800837858+08:00","level":"INFO","msg":"sender: started","stream_id":"yhxc5cgu"} +{"time":"2026-03-22T15:00:28.913273863+08:00","level":"INFO","msg":"stream: closing","id":"yhxc5cgu"} diff --git a/back/wandb/run-20260322_150022-yhxc5cgu/logs/debug.log b/back/wandb/run-20260322_150022-yhxc5cgu/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..5232531c86619e8d6326ec6063d44832a5860889 --- /dev/null +++ b/back/wandb/run-20260322_150022-yhxc5cgu/logs/debug.log @@ -0,0 +1,22 @@ +2026-03-22 15:00:22,124 INFO MainThread:323629 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0 +2026-03-22 15:00:22,124 INFO MainThread:323629 [wandb_setup.py:_flush():81] Configure stats pid to 323629 +2026-03-22 15:00:22,124 INFO MainThread:323629 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-22 15:00:22,124 INFO MainThread:323629 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_150022-yhxc5cgu/logs/debug.log +2026-03-22 15:00:22,124 INFO MainThread:323629 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_150022-yhxc5cgu/logs/debug-internal.log +2026-03-22 15:00:22,125 INFO MainThread:323629 [wandb_init.py:init():844] calling init triggers +2026-03-22 15:00:22,125 INFO MainThread:323629 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: 
{'_wandb': {}} +2026-03-22 15:00:22,125 INFO MainThread:323629 [wandb_init.py:init():892] starting backend +2026-03-22 15:00:22,416 INFO MainThread:323629 [wandb_init.py:init():895] sending inform_init request +2026-03-22 15:00:22,429 INFO MainThread:323629 [wandb_init.py:init():903] backend started and connected +2026-03-22 15:00:22,431 INFO MainThread:323629 [wandb_init.py:init():973] updated telemetry +2026-03-22 15:00:22,447 INFO MainThread:323629 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-03-22 15:00:26,403 INFO MainThread:323629 [wandb_init.py:init():1042] starting run threads in backend +2026-03-22 15:00:26,494 INFO MainThread:323629 [wandb_run.py:_console_start():2524] atexit reg +2026-03-22 15:00:26,494 INFO MainThread:323629 [wandb_run.py:_redirect():2373] redirect: wrap_raw +2026-03-22 15:00:26,494 INFO MainThread:323629 [wandb_run.py:_redirect():2442] Wrapping output streams. +2026-03-22 15:00:26,495 INFO MainThread:323629 [wandb_run.py:_redirect():2465] Redirects installed. +2026-03-22 15:00:26,500 INFO MainThread:323629 [wandb_init.py:init():1082] run started, returning control to user process +2026-03-22 15:00:26,500 INFO MainThread:323629 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 2000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.2, 't_c': 0.5, 'ot_cls': True} +2026-03-22 15:00:28,913 INFO wandb-AsyncioManager-main:323629 [service_client.py:_forward_responses():134] Reached EOF. +2026-03-22 15:00:28,913 INFO wandb-AsyncioManager-main:323629 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles. 
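The config_cb payload closing this debug.log is also serialised to files/config.yaml, with every user key nested under a `value:` field. Below is a small hedged helper for flattening such a file when triaging runs; the path is the one recorded in this diff, `load_run_config` is an illustrative name, and PyYAML is assumed available (it is pinned above as PyYAML==6.0.2):

```python
import yaml

def load_run_config(path: str) -> dict:
    # wandb stores user keys as {key: {"value": v}}; flatten them and
    # drop the _wandb bookkeeping block.
    with open(path) as f:
        cfg = yaml.safe_load(f)
    return {k: v["value"] for k, v in cfg.items() if k != "_wandb"}

cfg = load_run_config("back/wandb/run-20260322_150022-yhxc5cgu/files/config.yaml")
print(cfg["model"], cfg["sampling_steps"], cfg["cls"])  # SiT-XL/2 2000 0.2
```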
diff --git a/back/wandb/run-20260322_150022-yhxc5cgu/run-yhxc5cgu.wandb b/back/wandb/run-20260322_150022-yhxc5cgu/run-yhxc5cgu.wandb new file mode 100644 index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32 Binary files /dev/null and b/back/wandb/run-20260322_150022-yhxc5cgu/run-yhxc5cgu.wandb differ diff --git a/back/wandb/run-20260322_150443-e3yw9ii4/files/config.yaml b/back/wandb/run-20260322_150443-e3yw9ii4/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f4eaa7e50c422b426f8002ae4ec05ca05833a5c --- /dev/null +++ b/back/wandb/run-20260322_150443-e3yw9ii4/files/config.yaml @@ -0,0 +1,202 @@ +_wandb: + value: + cli_version: 0.25.0 + e: + q63x26q8nhayytv8q2rrmj9j9uy9kvub: + args: + - --report-to + - wandb + - --allow-tf32 + - --mixed-precision + - bf16 + - --seed + - "0" + - --path-type + - linear + - --prediction + - v + - --weighting + - uniform + - --model + - SiT-XL/2 + - --enc-type + - dinov2-vit-b + - --encoder-depth + - "8" + - --proj-coeff + - "0.5" + - --output-dir + - exps + - --exp-name + - jsflow-experiment + - --batch-size + - "256" + - --data-dir + - /gemini/space/zhaozy/dataset/Imagenet/imagenet_256 + - --semantic-features-dir + - /gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0 + - --learning-rate + - "0.00005" + - --t-c + - "0.5" + - --cls + - "0.2" + - --ot-cls + codePath: train.py + codePathLocal: train.py + cpu_count: 96 + cpu_count_logical: 192 + cudaVersion: "13.0" + disk: + /: + total: "3838880616448" + used: "357557714944" + email: 2365972933@qq.com + executable: /gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python + git: + commit: 021ea2e50c38c5803bd9afff16316958a01fbd1d + remote: https://github.com/Martinser/REG.git + gpu: NVIDIA H100 80GB HBM3 + gpu_count: 4 + gpu_nvidia: + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-a09f2421-99e6-a72e-63bd-fd7452510758 + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d + - architecture: Hopper + cudaCores: 16896 + memoryTotal: "85520809984" + name: NVIDIA H100 80GB HBM3 + uuid: GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e + host: 24c964746905d416ce09d045f9a06f23-taskrole1-0 + memory: + total: "2164115296256" + os: Linux-5.15.0-94-generic-x86_64-with-glibc2.35 + program: /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py + python: CPython 3.12.9 + root: /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG + startedAt: "2026-03-22T07:04:43.133739Z" + writerId: q63x26q8nhayytv8q2rrmj9j9uy9kvub + m: [] + python_version: 3.12.9 + t: + "1": + - 1 + - 5 + - 11 + - 41 + - 49 + - 53 + - 63 + - 71 + - 83 + - 98 + "2": + - 1 + - 5 + - 11 + - 41 + - 49 + - 53 + - 63 + - 71 + - 83 + - 98 + "3": + - 13 + "4": 3.12.9 + "5": 0.25.0 + "6": 4.53.2 + "12": 0.25.0 + "13": linux-x86_64 +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +adam_weight_decay: + value: 0 +allow_tf32: + value: true +batch_size: + value: 256 +cfg_prob: + value: 0.1 +checkpointing_steps: + value: 10000 +cls: + value: 0.2 +data_dir: + value: /gemini/space/zhaozy/dataset/Imagenet/imagenet_256 +enc_type: + value: dinov2-vit-b 
+encoder_depth: + value: 8 +epochs: + value: 1400 +exp_name: + value: jsflow-experiment +fused_attn: + value: true +gradient_accumulation_steps: + value: 1 +learning_rate: + value: 5e-05 +legacy: + value: false +logging_dir: + value: logs +max_grad_norm: + value: 1 +max_train_steps: + value: 1000000 +mixed_precision: + value: bf16 +model: + value: SiT-XL/2 +num_classes: + value: 1000 +num_workers: + value: 4 +ops_head: + value: 16 +ot_cls: + value: true +output_dir: + value: exps +path_type: + value: linear +prediction: + value: v +proj_coeff: + value: 0.5 +qk_norm: + value: false +report_to: + value: wandb +resolution: + value: 256 +resume_step: + value: 0 +sampling_steps: + value: 2000 +seed: + value: 0 +semantic_features_dir: + value: /gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0 +t_c: + value: 0.5 +weighting: + value: uniform
diff --git a/back/wandb/run-20260322_150443-e3yw9ii4/files/output.log b/back/wandb/run-20260322_150443-e3yw9ii4/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..8d8174d8731a3e269f8270a0f6ac601c62d4f255 --- /dev/null +++ b/back/wandb/run-20260322_150443-e3yw9ii4/files/output.log @@ -0,0 +1,15 @@
+Steps: 0%| | 1/1000000 [00:02<588:29:38, 2.12s/it][2026-03-22 15:04:48] Generating EMA samples for evaluation (SDE → t=0)...
+Traceback (most recent call last):
+  File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 572, in <module>
+    main(args)
+  File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 444, in main
+    if vae is not None:
+       ^^^
+NameError: name 'vae' is not defined
+[rank0]: Traceback (most recent call last):
+[rank0]:   File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 572, in <module>
+[rank0]:     main(args)
+[rank0]:   File "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", line 444, in main
+[rank0]:     if vae is not None:
+[rank0]:        ^^^
+[rank0]: NameError: name 'vae' is not defined
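This second failure is a plain scoping bug: train.py:444 tests `vae` before anything assigns it on this code path. A hedged sketch of the guard pattern, with the decoder kept optional; `decode_latents` is an illustrative helper, and the `sd-vae-ft-ema` checkpoint plus the 0.18215 scaling factor are the usual Stable Diffusion values, assumed rather than read from this diff:

```python
import torch
from diffusers import AutoencoderKL  # pinned above via diffusers==0.35.1

def decode_latents(latents: torch.Tensor, vae: AutoencoderKL | None = None) -> torch.Tensor:
    # Defining (or receiving) `vae` before the check makes the
    # `if vae is not None:` branch safe instead of a NameError.
    if vae is None:
        return latents
    return vae.decode(latents / 0.18215).sample

# vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema")  # assumed checkpoint
```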
diff --git a/back/wandb/run-20260322_150443-e3yw9ii4/files/requirements.txt b/back/wandb/run-20260322_150443-e3yw9ii4/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0235910d0d99b7dee69b9a7f2f90012c8b711cc --- /dev/null +++ b/back/wandb/run-20260322_150443-e3yw9ii4/files/requirements.txt @@ -0,0 +1,168 @@ +[168 pinned packages, byte-identical to back/wandb/run-20260322_141833-vm0y8t9t/files/requirements.txt above (same blob d0235910); duplicate list omitted]
diff --git a/back/wandb/run-20260322_150443-e3yw9ii4/files/wandb-metadata.json b/back/wandb/run-20260322_150443-e3yw9ii4/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8c79cb49af77ec4fd6a6e776bbd576f0cc4424cd --- /dev/null +++ b/back/wandb/run-20260322_150443-e3yw9ii4/files/wandb-metadata.json @@ -0,0 +1,101 @@ +{ + "os": "Linux-5.15.0-94-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.9", + "startedAt": "2026-03-22T07:04:43.133739Z", + "args": [ + "--report-to", + "wandb", + "--allow-tf32", + "--mixed-precision", + "bf16", + "--seed", + "0", + "--path-type", + "linear", + "--prediction", + "v", + "--weighting", + "uniform", + "--model", + "SiT-XL/2", + "--enc-type", + "dinov2-vit-b", + "--encoder-depth", + "8", + "--proj-coeff", + "0.5", + "--output-dir", + "exps", + "--exp-name", + "jsflow-experiment", + "--batch-size", + "256", + "--data-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256", + "--semantic-features-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0", + "--learning-rate", + "0.00005", + "--t-c", + "0.5", + "--cls", + "0.2", + "--ot-cls" + ], + "program": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "git": { + "remote": 
"https://github.com/Martinser/REG.git", + "commit": "021ea2e50c38c5803bd9afff16316958a01fbd1d" + }, + "email": "2365972933@qq.com", + "root": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG", + "host": "24c964746905d416ce09d045f9a06f23-taskrole1-0", + "executable": "/gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 4, + "disk": { + "/": { + "total": "3838880616448", + "used": "357557714944" + } + }, + "memory": { + "total": "2164115296256" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-a09f2421-99e6-a72e-63bd-fd7452510758" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e" + } + ], + "cudaVersion": "13.0", + "writerId": "q63x26q8nhayytv8q2rrmj9j9uy9kvub" +} \ No newline at end of file diff --git a/back/wandb/run-20260322_150443-e3yw9ii4/files/wandb-summary.json b/back/wandb/run-20260322_150443-e3yw9ii4/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..75631e6e43bf8471ee6ca35a1ab1286f569677cb --- /dev/null +++ b/back/wandb/run-20260322_150443-e3yw9ii4/files/wandb-summary.json @@ -0,0 +1 @@ +{"_wandb":{"runtime":3},"_runtime":3} \ No newline at end of file diff --git a/back/wandb/run-20260322_150443-e3yw9ii4/logs/debug-internal.log b/back/wandb/run-20260322_150443-e3yw9ii4/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..369b7d28b0265d2adcd0451796835668f71d33d8 --- /dev/null +++ b/back/wandb/run-20260322_150443-e3yw9ii4/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2026-03-22T15:04:43.390486873+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"} +{"time":"2026-03-22T15:04:44.970687851+08:00","level":"INFO","msg":"stream: created new stream","id":"e3yw9ii4"} +{"time":"2026-03-22T15:04:44.970802178+08:00","level":"INFO","msg":"handler: started","stream_id":"e3yw9ii4"} +{"time":"2026-03-22T15:04:44.971744065+08:00","level":"INFO","msg":"stream: started","id":"e3yw9ii4"} +{"time":"2026-03-22T15:04:44.97174913+08:00","level":"INFO","msg":"writer: started","stream_id":"e3yw9ii4"} +{"time":"2026-03-22T15:04:44.971758857+08:00","level":"INFO","msg":"sender: started","stream_id":"e3yw9ii4"} +{"time":"2026-03-22T15:04:50.286711145+08:00","level":"INFO","msg":"stream: closing","id":"e3yw9ii4"} diff --git a/back/wandb/run-20260322_150443-e3yw9ii4/logs/debug.log b/back/wandb/run-20260322_150443-e3yw9ii4/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..b7e9a15e7f483ee0e27e6f57f91f69e3bd59632c --- /dev/null +++ b/back/wandb/run-20260322_150443-e3yw9ii4/logs/debug.log @@ -0,0 +1,22 @@ +2026-03-22 15:04:43,155 INFO MainThread:326012 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0 +2026-03-22 15:04:43,156 INFO MainThread:326012 [wandb_setup.py:_flush():81] Configure stats pid to 326012 +2026-03-22 15:04:43,156 INFO 
MainThread:326012 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-22 15:04:43,156 INFO MainThread:326012 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_150443-e3yw9ii4/logs/debug.log +2026-03-22 15:04:43,156 INFO MainThread:326012 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_150443-e3yw9ii4/logs/debug-internal.log +2026-03-22 15:04:43,156 INFO MainThread:326012 [wandb_init.py:init():844] calling init triggers +2026-03-22 15:04:43,156 INFO MainThread:326012 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-03-22 15:04:43,156 INFO MainThread:326012 [wandb_init.py:init():892] starting backend +2026-03-22 15:04:43,378 INFO MainThread:326012 [wandb_init.py:init():895] sending inform_init request +2026-03-22 15:04:43,388 INFO MainThread:326012 [wandb_init.py:init():903] backend started and connected +2026-03-22 15:04:43,389 INFO MainThread:326012 [wandb_init.py:init():973] updated telemetry +2026-03-22 15:04:43,402 INFO MainThread:326012 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-03-22 15:04:46,450 INFO MainThread:326012 [wandb_init.py:init():1042] starting run threads in backend +2026-03-22 15:04:46,541 INFO MainThread:326012 [wandb_run.py:_console_start():2524] atexit reg +2026-03-22 15:04:46,541 INFO MainThread:326012 [wandb_run.py:_redirect():2373] redirect: wrap_raw +2026-03-22 15:04:46,541 INFO MainThread:326012 [wandb_run.py:_redirect():2442] Wrapping output streams. +2026-03-22 15:04:46,541 INFO MainThread:326012 [wandb_run.py:_redirect():2465] Redirects installed. +2026-03-22 15:04:46,545 INFO MainThread:326012 [wandb_init.py:init():1082] run started, returning control to user process +2026-03-22 15:04:46,545 INFO MainThread:326012 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 2000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.2, 't_c': 0.5, 'ot_cls': True} +2026-03-22 15:04:50,286 INFO wandb-AsyncioManager-main:326012 [service_client.py:_forward_responses():134] Reached EOF. +2026-03-22 15:04:50,286 INFO wandb-AsyncioManager-main:326012 [mailbox.py:close():155] Closing mailbox, abandoning 1 handles. 
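The wandb-metadata.json above records the full argument list for this run (the two runs that follow reuse it verbatim), and it exercises several options beyond the README's train.sh example: `--semantic-features-dir`, `--t-c`, `--ot-cls`, bf16 mixed precision, and `--cls=0.2` rather than `0.03`. Below is a minimal sketch of the equivalent launch command reconstructed from the recorded `args`; the `accelerate` options themselves are an assumption (the metadata records four H100s, hence `--num_processes 4`), since only the script arguments are captured.

```bash
# Reconstructed from wandb-metadata.json "args" (run e3yw9ii4).
# The accelerate flags are assumed from train.sh and the recorded 4x H100;
# they are not part of the metadata.
accelerate launch --multi_gpu --num_processes 4 train.py \
    --report-to="wandb" \
    --allow-tf32 \
    --mixed-precision="bf16" \
    --seed=0 \
    --path-type="linear" \
    --prediction="v" \
    --weighting="uniform" \
    --model="SiT-XL/2" \
    --enc-type="dinov2-vit-b" \
    --encoder-depth=8 \
    --proj-coeff=0.5 \
    --output-dir="exps" \
    --exp-name="jsflow-experiment" \
    --batch-size=256 \
    --data-dir="/gemini/space/zhaozy/dataset/Imagenet/imagenet_256" \
    --semantic-features-dir="/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0" \
    --learning-rate=0.00005 \
    --t-c=0.5 \
    --cls=0.2 \
    --ot-cls
```

Per the `config_cb` dict recorded in debug.log, this corresponds to `learning_rate: 5e-05`, `max_train_steps: 1000000`, `epochs: 1400`, and checkpoints every 10000 steps.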
diff --git a/back/wandb/run-20260322_150443-e3yw9ii4/run-e3yw9ii4.wandb b/back/wandb/run-20260322_150443-e3yw9ii4/run-e3yw9ii4.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..30977402b08eb894463b83d7de7a007255879a32
Binary files /dev/null and b/back/wandb/run-20260322_150443-e3yw9ii4/run-e3yw9ii4.wandb differ
diff --git a/back/wandb/run-20260322_150635-o2w3z8rq/files/requirements.txt b/back/wandb/run-20260322_150635-o2w3z8rq/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d0235910d0d99b7dee69b9a7f2f90012c8b711cc
[168-line requirements.txt, identical to back/wandb/run-20260322_150443-e3yw9ii4/files/requirements.txt above; duplicate contents omitted.]
diff --git a/back/wandb/run-20260322_150635-o2w3z8rq/files/wandb-metadata.json b/back/wandb/run-20260322_150635-o2w3z8rq/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..81eb6968e4867bdd89d94e8e71f28bc6bff971c4
[Identical to the e3yw9ii4 wandb-metadata.json above except "startedAt": "2026-03-22T07:06:35.344191Z", disk "used": "357557432320", and "writerId": "6lh393hm7it42ndjgn98l2heg0r7rqjq".]
diff --git a/back/wandb/run-20260322_150635-o2w3z8rq/logs/debug-internal.log b/back/wandb/run-20260322_150635-o2w3z8rq/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..97be62d16ca7d67011b61e384f68b75d7e520dbc
--- /dev/null
+++ b/back/wandb/run-20260322_150635-o2w3z8rq/logs/debug-internal.log
@@ -0,0 +1,6 @@
+{"time":"2026-03-22T15:06:35.605542246+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"} +{"time":"2026-03-22T15:06:37.585674571+08:00","level":"INFO","msg":"stream: created new stream","id":"o2w3z8rq"} +{"time":"2026-03-22T15:06:37.585934805+08:00","level":"INFO","msg":"handler: started","stream_id":"o2w3z8rq"} +{"time":"2026-03-22T15:06:37.586954142+08:00","level":"INFO","msg":"stream: started","id":"o2w3z8rq"} +{"time":"2026-03-22T15:06:37.587002572+08:00","level":"INFO","msg":"sender: started","stream_id":"o2w3z8rq"} +{"time":"2026-03-22T15:06:37.58696296+08:00","level":"INFO","msg":"writer: started","stream_id":"o2w3z8rq"} diff --git a/back/wandb/run-20260322_150635-o2w3z8rq/logs/debug.log b/back/wandb/run-20260322_150635-o2w3z8rq/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..03972c1000346afd23a195a60f845ddd10d15625 --- /dev/null +++ b/back/wandb/run-20260322_150635-o2w3z8rq/logs/debug.log @@ -0,0 +1,20 @@ +2026-03-22 15:06:35,364 INFO MainThread:328110 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0 +2026-03-22 15:06:35,364 INFO MainThread:328110 [wandb_setup.py:_flush():81] Configure stats pid to 328110 +2026-03-22 15:06:35,364 INFO MainThread:328110 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-22 15:06:35,364 INFO MainThread:328110 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_150635-o2w3z8rq/logs/debug.log +2026-03-22 15:06:35,364 INFO MainThread:328110 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_150635-o2w3z8rq/logs/debug-internal.log +2026-03-22 15:06:35,364 INFO MainThread:328110 [wandb_init.py:init():844] calling init triggers +2026-03-22 15:06:35,364 INFO MainThread:328110 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-03-22 15:06:35,364 INFO MainThread:328110 [wandb_init.py:init():892] starting backend +2026-03-22 15:06:35,590 INFO MainThread:328110 [wandb_init.py:init():895] sending inform_init request +2026-03-22 15:06:35,601 INFO MainThread:328110 [wandb_init.py:init():903] backend started and connected +2026-03-22 15:06:35,604 INFO MainThread:328110 [wandb_init.py:init():973] updated telemetry +2026-03-22 15:06:35,618 INFO MainThread:328110 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-03-22 15:06:38,166 INFO MainThread:328110 [wandb_init.py:init():1042] starting run threads in backend +2026-03-22 15:06:38,258 INFO MainThread:328110 [wandb_run.py:_console_start():2524] atexit reg +2026-03-22 15:06:38,258 INFO MainThread:328110 [wandb_run.py:_redirect():2373] redirect: wrap_raw +2026-03-22 15:06:38,259 INFO MainThread:328110 [wandb_run.py:_redirect():2442] Wrapping output streams. +2026-03-22 15:06:38,259 INFO MainThread:328110 [wandb_run.py:_redirect():2465] Redirects installed. 
+2026-03-22 15:06:38,262 INFO MainThread:328110 [wandb_init.py:init():1082] run started, returning control to user process
+2026-03-22 15:06:38,263 INFO MainThread:328110 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 2000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.2, 't_c': 0.5, 'ot_cls': True}
diff --git a/back/wandb/run-20260322_184925-8vorqr8l/files/requirements.txt b/back/wandb/run-20260322_184925-8vorqr8l/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d0235910d0d99b7dee69b9a7f2f90012c8b711cc
[168-line requirements.txt, identical (same blob d0235910) to the copies above; duplicate contents omitted.]
diff --git a/back/wandb/run-20260322_184925-8vorqr8l/files/wandb-metadata.json b/back/wandb/run-20260322_184925-8vorqr8l/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..47b0568a9896fa0f47ff6879023a1c29fc9ea8ee
[Identical to the e3yw9ii4 wandb-metadata.json above except "startedAt": "2026-03-22T10:49:25.039668Z", disk "used": "357558751232", and "writerId": "ke7vwu8djzzkth9qju4c1qgqhxppykoi".]
diff --git a/back/wandb/run-20260322_184925-8vorqr8l/logs/debug-internal.log b/back/wandb/run-20260322_184925-8vorqr8l/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..d1fad8c1eb1467c148d87fc374d3ebae5ddea8e4
--- /dev/null
+++ b/back/wandb/run-20260322_184925-8vorqr8l/logs/debug-internal.log
@@ -0,0 +1,6 @@
+{"time":"2026-03-22T18:49:25.449291001+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"}
+{"time":"2026-03-22T18:49:26.764171858+08:00","level":"INFO","msg":"stream: created new stream","id":"8vorqr8l"}
+{"time":"2026-03-22T18:49:26.764300091+08:00","level":"INFO","msg":"handler: started","stream_id":"8vorqr8l"}
+{"time":"2026-03-22T18:49:26.765315333+08:00","level":"INFO","msg":"stream: started","id":"8vorqr8l"}
+{"time":"2026-03-22T18:49:26.765351691+08:00","level":"INFO","msg":"writer: started","stream_id":"8vorqr8l"}
+{"time":"2026-03-22T18:49:26.765379743+08:00","level":"INFO","msg":"sender: started","stream_id":"8vorqr8l"}
diff --git a/back/wandb/run-20260322_184925-8vorqr8l/logs/debug.log b/back/wandb/run-20260322_184925-8vorqr8l/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..d6972e94937eede283a43ff2af1b5eaac1a5b9ae
--- /dev/null
+++ b/back/wandb/run-20260322_184925-8vorqr8l/logs/debug.log
@@ -0,0 +1,20 @@
+2026-03-22 18:49:25,066 INFO MainThread:344956 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
+2026-03-22 18:49:25,066 INFO MainThread:344956 [wandb_setup.py:_flush():81] Configure stats pid to 344956
+2026-03-22 18:49:25,066 INFO MainThread:344956 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-03-22 18:49:25,066 INFO MainThread:344956 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_184925-8vorqr8l/logs/debug.log
+2026-03-22 18:49:25,066 INFO MainThread:344956 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260322_184925-8vorqr8l/logs/debug-internal.log
+2026-03-22 18:49:25,066 INFO MainThread:344956 [wandb_init.py:init():844] calling init triggers
+2026-03-22 18:49:25,066 INFO MainThread:344956 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
config: {'_wandb': {}}
+2026-03-22 18:49:25,066 INFO MainThread:344956 [wandb_init.py:init():892] starting backend
+2026-03-22 18:49:25,430 INFO MainThread:344956 [wandb_init.py:init():895] sending inform_init request
+2026-03-22 18:49:25,445 INFO MainThread:344956 [wandb_init.py:init():903] backend started and connected
+2026-03-22 18:49:25,448 INFO MainThread:344956 [wandb_init.py:init():973] updated telemetry
+2026-03-22 18:49:25,462 INFO MainThread:344956 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-03-22 18:49:28,246 INFO MainThread:344956 [wandb_init.py:init():1042] starting run
threads in backend +2026-03-22 18:49:28,339 INFO MainThread:344956 [wandb_run.py:_console_start():2524] atexit reg +2026-03-22 18:49:28,339 INFO MainThread:344956 [wandb_run.py:_redirect():2373] redirect: wrap_raw +2026-03-22 18:49:28,339 INFO MainThread:344956 [wandb_run.py:_redirect():2442] Wrapping output streams. +2026-03-22 18:49:28,339 INFO MainThread:344956 [wandb_run.py:_redirect():2465] Redirects installed. +2026-03-22 18:49:28,346 INFO MainThread:344956 [wandb_init.py:init():1082] run started, returning control to user process +2026-03-22 18:49:28,347 INFO MainThread:344956 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 2000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.2, 't_c': 0.5, 'ot_cls': True} diff --git a/back/wandb/run-20260323_133616-b1ci8tv6/files/output.log b/back/wandb/run-20260323_133616-b1ci8tv6/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..1d8a55f41645853dfae61bda6ad343f96772abb0 --- /dev/null +++ b/back/wandb/run-20260323_133616-b1ci8tv6/files/output.log @@ -0,0 +1,4626 @@ +Steps: 0%| | 1/1000000 [00:02<581:44:51, 2.09s/it][2026-03-23 13:36:22] Generating EMA samples (ODE Euler, no diffusion noise; t≈0.5 → t=0)... 
+[2026-03-23 13:36:24] Step: 1, Training Logs: loss_final: 4.886707, loss_mean: 1.706308, proj_loss: 0.001541, loss_mean_cls: 3.178859, grad_norm: 1.484174 +Steps: 0%| | 2/1000000 [00:04<675:50:15, 2.43s/it, grad_norm=1.48, loss_final=4.89, loss_mean=1.71, loss_mean_cls=3.18, proj_loss=0.00154][2026-03-23 13:36:24] Step: 2, Training Logs: loss_final: 4.262999, loss_mean: 1.683633, proj_loss: -0.010278, loss_mean_cls: 2.589643, grad_norm: 1.072753 +Steps: 0%| | 3/1000000 [00:05<398:05:25, 1.43s/it, grad_norm=1.07, loss_final=4.26, loss_mean=1.68, loss_mean_cls=2.59, proj_loss=-0.0103][2026-03-23 13:36:24] Step: 3, Training Logs: loss_final: 4.837096, loss_mean: 1.666042, proj_loss: -0.019227, loss_mean_cls: 3.190281, grad_norm: 1.111961 +Steps: 0%| | 4/1000000 [00:05<267:32:14, 1.04it/s, grad_norm=1.11, loss_final=4.84, loss_mean=1.67, loss_mean_cls=3.19, proj_loss=-0.0192][2026-03-23 13:36:25] Step: 4, Training Logs: loss_final: 4.850037, loss_mean: 1.679780, proj_loss: -0.026350, loss_mean_cls: 3.196608, grad_norm: 0.764629 +Steps: 0%| | 5/1000000 [00:05<195:30:23, 1.42it/s, grad_norm=0.765, loss_final=4.85, loss_mean=1.68, loss_mean_cls=3.2, proj_loss=-0.0264][2026-03-23 13:36:25] Step: 5, Training Logs: loss_final: 4.416910, loss_mean: 1.680973, proj_loss: -0.034618, loss_mean_cls: 2.770555, grad_norm: 0.824600 +Steps: 0%| | 6/1000000 [00:05<151:58:47, 1.83it/s, grad_norm=0.825, loss_final=4.42, loss_mean=1.68, loss_mean_cls=2.77, proj_loss=-0.0346][2026-03-23 13:36:25] Step: 6, Training Logs: loss_final: 4.753337, loss_mean: 1.683668, proj_loss: -0.039482, loss_mean_cls: 3.109150, grad_norm: 0.968158 +Steps: 0%| | 7/1000000 [00:05<124:21:54, 2.23it/s, grad_norm=0.968, loss_final=4.75, loss_mean=1.68, loss_mean_cls=3.11, proj_loss=-0.0395][2026-03-23 13:36:25] Step: 7, Training Logs: loss_final: 4.761275, loss_mean: 1.688122, proj_loss: -0.042711, loss_mean_cls: 3.115864, grad_norm: 0.827768 +Steps: 0%| | 8/1000000 [00:06<106:20:05, 2.61it/s, grad_norm=0.828, loss_final=4.76, loss_mean=1.69, loss_mean_cls=3.12, proj_loss=-0.0427][2026-03-23 13:36:26] Step: 8, Training Logs: loss_final: 4.843365, loss_mean: 1.657367, proj_loss: -0.044991, loss_mean_cls: 3.230989, grad_norm: 0.857932 +Steps: 0%| | 9/1000000 [00:06<94:16:51, 2.95it/s, grad_norm=0.858, loss_final=4.84, loss_mean=1.66, loss_mean_cls=3.23, proj_loss=-0.045] [2026-03-23 13:36:26] Step: 9, Training Logs: loss_final: 4.787625, loss_mean: 1.657163, proj_loss: -0.046026, loss_mean_cls: 3.176489, grad_norm: 0.938965 +Steps: 0%| | 10/1000000 [00:06<86:01:11, 3.23it/s, grad_norm=0.939, loss_final=4.79, loss_mean=1.66, loss_mean_cls=3.18, proj_loss=-0.046][2026-03-23 13:36:26] Step: 10, Training Logs: loss_final: 4.966867, loss_mean: 1.666353, proj_loss: -0.047789, loss_mean_cls: 3.348303, grad_norm: 0.993417 +Steps: 0%| | 11/1000000 [00:06<80:23:41, 3.46it/s, grad_norm=0.993, loss_final=4.97, loss_mean=1.67, loss_mean_cls=3.35, proj_loss=-0.0478][2026-03-23 13:36:26] Step: 11, Training Logs: loss_final: 5.309743, loss_mean: 1.651147, proj_loss: -0.049359, loss_mean_cls: 3.707955, grad_norm: 0.967346 +Steps: 0%| | 12/1000000 [00:07<76:27:48, 3.63it/s, grad_norm=0.967, loss_final=5.31, loss_mean=1.65, loss_mean_cls=3.71, proj_loss=-0.0494][2026-03-23 13:36:27] Step: 12, Training Logs: loss_final: 5.112337, loss_mean: 1.627964, proj_loss: -0.049843, loss_mean_cls: 3.534215, grad_norm: 1.156597 +Steps: 0%| | 13/1000000 [00:07<73:50:15, 3.76it/s, grad_norm=1.16, loss_final=5.11, loss_mean=1.63, loss_mean_cls=3.53, 
proj_loss=-0.0498][2026-03-23 13:36:27] Step: 13, Training Logs: loss_final: 4.922093, loss_mean: 1.624116, proj_loss: -0.051420, loss_mean_cls: 3.349397, grad_norm: 1.361660 +Steps: 0%| | 14/1000000 [00:07<71:55:01, 3.86it/s, grad_norm=1.36, loss_final=4.92, loss_mean=1.62, loss_mean_cls=3.35, proj_loss=-0.0514][2026-03-23 13:36:27] Step: 14, Training Logs: loss_final: 4.321608, loss_mean: 1.598499, proj_loss: -0.051553, loss_mean_cls: 2.774662, grad_norm: 1.289792 +Steps: 0%| | 15/1000000 [00:07<70:43:14, 3.93it/s, grad_norm=1.29, loss_final=4.32, loss_mean=1.6, loss_mean_cls=2.77, proj_loss=-0.0516][2026-03-23 13:36:27] Step: 15, Training Logs: loss_final: 5.221568, loss_mean: 1.587925, proj_loss: -0.054821, loss_mean_cls: 3.688464, grad_norm: 1.193578 +Steps: 0%| | 16/1000000 [00:08<69:45:57, 3.98it/s, grad_norm=1.19, loss_final=5.22, loss_mean=1.59, loss_mean_cls=3.69, proj_loss=-0.0548][2026-03-23 13:36:28] Step: 16, Training Logs: loss_final: 4.676355, loss_mean: 1.617376, proj_loss: -0.052323, loss_mean_cls: 3.111302, grad_norm: 1.333052 +Steps: 0%| | 17/1000000 [00:08<69:08:07, 4.02it/s, grad_norm=1.33, loss_final=4.68, loss_mean=1.62, loss_mean_cls=3.11, proj_loss=-0.0523][2026-03-23 13:36:28] Step: 17, Training Logs: loss_final: 4.955086, loss_mean: 1.621716, proj_loss: -0.051674, loss_mean_cls: 3.385045, grad_norm: 1.920916 +Steps: 0%| | 18/1000000 [00:08<68:40:03, 4.05it/s, grad_norm=1.92, loss_final=4.96, loss_mean=1.62, loss_mean_cls=3.39, proj_loss=-0.0517][2026-03-23 13:36:28] Step: 18, Training Logs: loss_final: 4.340563, loss_mean: 1.571294, proj_loss: -0.054873, loss_mean_cls: 2.824142, grad_norm: 1.089570 +Steps: 0%| | 19/1000000 [00:08<68:22:42, 4.06it/s, grad_norm=1.09, loss_final=4.34, loss_mean=1.57, loss_mean_cls=2.82, proj_loss=-0.0549][2026-03-23 13:36:28] Step: 19, Training Logs: loss_final: 4.882733, loss_mean: 1.584449, proj_loss: -0.055189, loss_mean_cls: 3.353473, grad_norm: 1.569113 +Steps: 0%| | 20/1000000 [00:09<68:08:33, 4.08it/s, grad_norm=1.57, loss_final=4.88, loss_mean=1.58, loss_mean_cls=3.35, proj_loss=-0.0552][2026-03-23 13:36:29] Step: 20, Training Logs: loss_final: 4.675643, loss_mean: 1.588136, proj_loss: -0.053475, loss_mean_cls: 3.140982, grad_norm: 1.578210 +Steps: 0%| | 21/1000000 [00:09<67:58:32, 4.09it/s, grad_norm=1.58, loss_final=4.68, loss_mean=1.59, loss_mean_cls=3.14, proj_loss=-0.0535][2026-03-23 13:36:29] Step: 21, Training Logs: loss_final: 4.806199, loss_mean: 1.570678, proj_loss: -0.053972, loss_mean_cls: 3.289493, grad_norm: 1.116282 +Steps: 0%| | 22/1000000 [00:09<67:52:16, 4.09it/s, grad_norm=1.12, loss_final=4.81, loss_mean=1.57, loss_mean_cls=3.29, proj_loss=-0.054][2026-03-23 13:36:29] Step: 22, Training Logs: loss_final: 4.515004, loss_mean: 1.547593, proj_loss: -0.053569, loss_mean_cls: 3.020979, grad_norm: 1.064280 +Steps: 0%| | 23/1000000 [00:09<67:49:41, 4.10it/s, grad_norm=1.06, loss_final=4.52, loss_mean=1.55, loss_mean_cls=3.02, proj_loss=-0.0536][2026-03-23 13:36:29] Step: 23, Training Logs: loss_final: 4.365607, loss_mean: 1.570231, proj_loss: -0.051182, loss_mean_cls: 2.846559, grad_norm: 1.318048 +Steps: 0%| | 24/1000000 [00:10<67:43:14, 4.10it/s, grad_norm=1.32, loss_final=4.37, loss_mean=1.57, loss_mean_cls=2.85, proj_loss=-0.0512][2026-03-23 13:36:30] Step: 24, Training Logs: loss_final: 4.780019, loss_mean: 1.551885, proj_loss: -0.054443, loss_mean_cls: 3.282577, grad_norm: 1.209403 +Steps: 0%| | 25/1000000 [00:10<67:39:15, 4.11it/s, grad_norm=1.21, loss_final=4.78, loss_mean=1.55, loss_mean_cls=3.28, 
proj_loss=-0.0544][2026-03-23 13:36:30] Step: 25, Training Logs: loss_final: 5.108131, loss_mean: 1.541951, proj_loss: -0.056103, loss_mean_cls: 3.622283, grad_norm: 1.110538 +Steps: 0%| | 26/1000000 [00:10<67:36:52, 4.11it/s, grad_norm=1.11, loss_final=5.11, loss_mean=1.54, loss_mean_cls=3.62, proj_loss=-0.0561][2026-03-23 13:36:30] Step: 26, Training Logs: loss_final: 4.594624, loss_mean: 1.548291, proj_loss: -0.055673, loss_mean_cls: 3.102005, grad_norm: 1.053802 +Steps: 0%| | 27/1000000 [00:10<67:35:27, 4.11it/s, grad_norm=1.05, loss_final=4.59, loss_mean=1.55, loss_mean_cls=3.1, proj_loss=-0.0557][2026-03-23 13:36:30] Step: 27, Training Logs: loss_final: 4.380362, loss_mean: 1.537654, proj_loss: -0.055031, loss_mean_cls: 2.897738, grad_norm: 1.165141 +Steps: 0%| | 28/1000000 [00:11<67:34:10, 4.11it/s, grad_norm=1.17, loss_final=4.38, loss_mean=1.54, loss_mean_cls=2.9, proj_loss=-0.055][2026-03-23 13:36:31] Step: 28, Training Logs: loss_final: 4.471941, loss_mean: 1.537652, proj_loss: -0.054249, loss_mean_cls: 2.988538, grad_norm: 1.155479 +Steps: 0%| | 29/1000000 [00:11<67:33:03, 4.11it/s, grad_norm=1.16, loss_final=4.47, loss_mean=1.54, loss_mean_cls=2.99, proj_loss=-0.0542][2026-03-23 13:36:31] Step: 29, Training Logs: loss_final: 3.999491, loss_mean: 1.480217, proj_loss: -0.055131, loss_mean_cls: 2.574406, grad_norm: 1.072773 +Steps: 0%| | 30/1000000 [00:11<67:32:57, 4.11it/s, grad_norm=1.07, loss_final=4, loss_mean=1.48, loss_mean_cls=2.57, proj_loss=-0.0551][2026-03-23 13:36:31] Step: 30, Training Logs: loss_final: 4.276691, loss_mean: 1.480663, proj_loss: -0.055781, loss_mean_cls: 2.851809, grad_norm: 1.042456 +Steps: 0%| | 31/1000000 [00:11<67:32:43, 4.11it/s, grad_norm=1.04, loss_final=4.28, loss_mean=1.48, loss_mean_cls=2.85, proj_loss=-0.0558][2026-03-23 13:36:31] Step: 31, Training Logs: loss_final: 4.930573, loss_mean: 1.475251, proj_loss: -0.056231, loss_mean_cls: 3.511553, grad_norm: 1.196418 +Steps: 0%| | 32/1000000 [00:12<67:32:37, 4.11it/s, grad_norm=1.2, loss_final=4.93, loss_mean=1.48, loss_mean_cls=3.51, proj_loss=-0.0562][2026-03-23 13:36:32] Step: 32, Training Logs: loss_final: 4.872284, loss_mean: 1.459915, proj_loss: -0.053981, loss_mean_cls: 3.466351, grad_norm: 1.030007 +Steps: 0%| | 33/1000000 [00:12<67:31:59, 4.11it/s, grad_norm=1.03, loss_final=4.87, loss_mean=1.46, loss_mean_cls=3.47, proj_loss=-0.054][2026-03-23 13:36:32] Step: 33, Training Logs: loss_final: 5.134232, loss_mean: 1.459543, proj_loss: -0.058704, loss_mean_cls: 3.733393, grad_norm: 1.112776 +Steps: 0%| | 34/1000000 [00:12<67:32:23, 4.11it/s, grad_norm=1.11, loss_final=5.13, loss_mean=1.46, loss_mean_cls=3.73, proj_loss=-0.0587][2026-03-23 13:36:32] Step: 34, Training Logs: loss_final: 4.040306, loss_mean: 1.454316, proj_loss: -0.056937, loss_mean_cls: 2.642927, grad_norm: 0.892336 +Steps: 0%| | 35/1000000 [00:12<67:33:36, 4.11it/s, grad_norm=0.892, loss_final=4.04, loss_mean=1.45, loss_mean_cls=2.64, proj_loss=-0.0569][2026-03-23 13:36:32] Step: 35, Training Logs: loss_final: 4.252234, loss_mean: 1.441466, proj_loss: -0.052716, loss_mean_cls: 2.863483, grad_norm: 1.166586 +Steps: 0%| | 36/1000000 [00:13<67:33:45, 4.11it/s, grad_norm=1.17, loss_final=4.25, loss_mean=1.44, loss_mean_cls=2.86, proj_loss=-0.0527][2026-03-23 13:36:32] Step: 36, Training Logs: loss_final: 4.366521, loss_mean: 1.471327, proj_loss: -0.054384, loss_mean_cls: 2.949578, grad_norm: 1.782779 +Steps: 0%| | 37/1000000 [00:13<67:34:10, 4.11it/s, grad_norm=1.78, loss_final=4.37, loss_mean=1.47, loss_mean_cls=2.95, 
proj_loss=-0.0544][2026-03-23 13:36:33] Step: 37, Training Logs: loss_final: 4.423418, loss_mean: 1.438698, proj_loss: -0.057451, loss_mean_cls: 3.042170, grad_norm: 0.954798 +Steps: 0%| | 38/1000000 [00:13<67:32:16, 4.11it/s, grad_norm=0.955, loss_final=4.42, loss_mean=1.44, loss_mean_cls=3.04, proj_loss=-0.0575][2026-03-23 13:36:33] Step: 38, Training Logs: loss_final: 4.411051, loss_mean: 1.458385, proj_loss: -0.056669, loss_mean_cls: 3.009335, grad_norm: 1.584718 +Steps: 0%| | 39/1000000 [00:13<67:31:00, 4.11it/s, grad_norm=1.58, loss_final=4.41, loss_mean=1.46, loss_mean_cls=3.01, proj_loss=-0.0567][2026-03-23 13:36:33] Step: 39, Training Logs: loss_final: 4.525216, loss_mean: 1.437404, proj_loss: -0.055727, loss_mean_cls: 3.143538, grad_norm: 1.125282 +Steps: 0%| | 40/1000000 [00:14<67:30:53, 4.11it/s, grad_norm=1.13, loss_final=4.53, loss_mean=1.44, loss_mean_cls=3.14, proj_loss=-0.0557][2026-03-23 13:36:33] Step: 40, Training Logs: loss_final: 4.887799, loss_mean: 1.406875, proj_loss: -0.056381, loss_mean_cls: 3.537305, grad_norm: 1.122277 +Steps: 0%| | 41/1000000 [00:14<67:29:06, 4.12it/s, grad_norm=1.12, loss_final=4.89, loss_mean=1.41, loss_mean_cls=3.54, proj_loss=-0.0564][2026-03-23 13:36:34] Step: 41, Training Logs: loss_final: 4.987991, loss_mean: 1.399657, proj_loss: -0.056047, loss_mean_cls: 3.644382, grad_norm: 0.953737 +Steps: 0%| | 42/1000000 [00:14<67:28:24, 4.12it/s, grad_norm=0.954, loss_final=4.99, loss_mean=1.4, loss_mean_cls=3.64, proj_loss=-0.056][2026-03-23 13:36:34] Step: 42, Training Logs: loss_final: 4.219562, loss_mean: 1.434484, proj_loss: -0.054458, loss_mean_cls: 2.839536, grad_norm: 1.045523 +Steps: 0%| | 43/1000000 [00:14<67:29:09, 4.12it/s, grad_norm=1.05, loss_final=4.22, loss_mean=1.43, loss_mean_cls=2.84, proj_loss=-0.0545][2026-03-23 13:36:34] Step: 43, Training Logs: loss_final: 5.020249, loss_mean: 1.418020, proj_loss: -0.055626, loss_mean_cls: 3.657855, grad_norm: 0.914230 +Steps: 0%| | 44/1000000 [00:14<67:30:09, 4.11it/s, grad_norm=0.914, loss_final=5.02, loss_mean=1.42, loss_mean_cls=3.66, proj_loss=-0.0556][2026-03-23 13:36:34] Step: 44, Training Logs: loss_final: 5.015689, loss_mean: 1.389580, proj_loss: -0.055396, loss_mean_cls: 3.681504, grad_norm: 0.742577 +Steps: 0%| | 45/1000000 [00:15<67:29:39, 4.12it/s, grad_norm=0.743, loss_final=5.02, loss_mean=1.39, loss_mean_cls=3.68, proj_loss=-0.0554][2026-03-23 13:36:35] Step: 45, Training Logs: loss_final: 4.669159, loss_mean: 1.378762, proj_loss: -0.054933, loss_mean_cls: 3.345329, grad_norm: 0.873569 +Steps: 0%| | 46/1000000 [00:15<67:34:14, 4.11it/s, grad_norm=0.874, loss_final=4.67, loss_mean=1.38, loss_mean_cls=3.35, proj_loss=-0.0549][2026-03-23 13:36:35] Step: 46, Training Logs: loss_final: 4.430506, loss_mean: 1.387027, proj_loss: -0.056324, loss_mean_cls: 3.099802, grad_norm: 0.994348 +Steps: 0%| | 47/1000000 [00:15<67:33:24, 4.11it/s, grad_norm=0.994, loss_final=4.43, loss_mean=1.39, loss_mean_cls=3.1, proj_loss=-0.0563][2026-03-23 13:36:35] Step: 47, Training Logs: loss_final: 4.631330, loss_mean: 1.393778, proj_loss: -0.055982, loss_mean_cls: 3.293534, grad_norm: 0.706327 +Steps: 0%| | 48/1000000 [00:15<67:34:49, 4.11it/s, grad_norm=0.706, loss_final=4.63, loss_mean=1.39, loss_mean_cls=3.29, proj_loss=-0.056][2026-03-23 13:36:35] Step: 48, Training Logs: loss_final: 4.394535, loss_mean: 1.391383, proj_loss: -0.054929, loss_mean_cls: 3.058081, grad_norm: 0.717439 +Steps: 0%| | 49/1000000 [00:16<67:34:48, 4.11it/s, grad_norm=0.717, loss_final=4.39, loss_mean=1.39, 
loss_mean_cls=3.06, proj_loss=-0.0549][2026-03-23 13:36:36] Step: 49, Training Logs: loss_final: 4.914561, loss_mean: 1.368440, proj_loss: -0.055709, loss_mean_cls: 3.601830, grad_norm: 0.793065 +Steps: 0%| | 50/1000000 [00:16<67:35:11, 4.11it/s, grad_norm=0.793, loss_final=4.91, loss_mean=1.37, loss_mean_cls=3.6, proj_loss=-0.0557][2026-03-23 13:36:36] Step: 50, Training Logs: loss_final: 4.078529, loss_mean: 1.394648, proj_loss: -0.052747, loss_mean_cls: 2.736628, grad_norm: 0.808900 +Steps: 0%| | 51/1000000 [00:16<67:35:20, 4.11it/s, grad_norm=0.809, loss_final=4.08, loss_mean=1.39, loss_mean_cls=2.74, proj_loss=-0.0527][2026-03-23 13:36:36] Step: 51, Training Logs: loss_final: 4.720263, loss_mean: 1.369311, proj_loss: -0.054949, loss_mean_cls: 3.405901, grad_norm: 0.712579 +Steps: 0%| | 52/1000000 [00:16<67:36:12, 4.11it/s, grad_norm=0.713, loss_final=4.72, loss_mean=1.37, loss_mean_cls=3.41, proj_loss=-0.0549][2026-03-23 13:36:36] Step: 52, Training Logs: loss_final: 4.499831, loss_mean: 1.372577, proj_loss: -0.055997, loss_mean_cls: 3.183251, grad_norm: 0.666338 +Steps: 0%| | 53/1000000 [00:17<67:35:56, 4.11it/s, grad_norm=0.666, loss_final=4.5, loss_mean=1.37, loss_mean_cls=3.18, proj_loss=-0.056][2026-03-23 13:36:37] Step: 53, Training Logs: loss_final: 4.471875, loss_mean: 1.354121, proj_loss: -0.054967, loss_mean_cls: 3.172721, grad_norm: 0.688333 +Steps: 0%| | 54/1000000 [00:17<67:42:29, 4.10it/s, grad_norm=0.688, loss_final=4.47, loss_mean=1.35, loss_mean_cls=3.17, proj_loss=-0.055][2026-03-23 13:36:37] Step: 54, Training Logs: loss_final: 4.650017, loss_mean: 1.348148, proj_loss: -0.056815, loss_mean_cls: 3.358685, grad_norm: 0.698381 +Steps: 0%| | 55/1000000 [00:17<67:49:23, 4.10it/s, grad_norm=0.698, loss_final=4.65, loss_mean=1.35, loss_mean_cls=3.36, proj_loss=-0.0568][2026-03-23 13:36:37] Step: 55, Training Logs: loss_final: 4.689909, loss_mean: 1.338766, proj_loss: -0.056573, loss_mean_cls: 3.407716, grad_norm: 1.278330 +Steps: 0%| | 56/1000000 [00:17<67:46:37, 4.10it/s, grad_norm=1.28, loss_final=4.69, loss_mean=1.34, loss_mean_cls=3.41, proj_loss=-0.0566][2026-03-23 13:36:37] Step: 56, Training Logs: loss_final: 4.644912, loss_mean: 1.420033, proj_loss: -0.053640, loss_mean_cls: 3.278520, grad_norm: 3.657754 +Steps: 0%| | 57/1000000 [00:18<67:45:17, 4.10it/s, grad_norm=3.66, loss_final=4.64, loss_mean=1.42, loss_mean_cls=3.28, proj_loss=-0.0536][2026-03-23 13:36:38] Step: 57, Training Logs: loss_final: 4.717434, loss_mean: 1.385057, proj_loss: -0.055313, loss_mean_cls: 3.387690, grad_norm: 2.697698 +Steps: 0%| | 58/1000000 [00:18<67:41:34, 4.10it/s, grad_norm=2.7, loss_final=4.72, loss_mean=1.39, loss_mean_cls=3.39, proj_loss=-0.0553][2026-03-23 13:36:38] Step: 58, Training Logs: loss_final: 4.645914, loss_mean: 1.352183, proj_loss: -0.056301, loss_mean_cls: 3.350031, grad_norm: 1.410190 +Steps: 0%| | 59/1000000 [00:18<67:46:07, 4.10it/s, grad_norm=1.41, loss_final=4.65, loss_mean=1.35, loss_mean_cls=3.35, proj_loss=-0.0563][2026-03-23 13:36:38] Step: 59, Training Logs: loss_final: 4.311267, loss_mean: 1.330003, proj_loss: -0.054820, loss_mean_cls: 3.036084, grad_norm: 1.037639 +Steps: 0%| | 60/1000000 [00:18<67:43:44, 4.10it/s, grad_norm=1.04, loss_final=4.31, loss_mean=1.33, loss_mean_cls=3.04, proj_loss=-0.0548][2026-03-23 13:36:38] Step: 60, Training Logs: loss_final: 4.493752, loss_mean: 1.307462, proj_loss: -0.056244, loss_mean_cls: 3.242534, grad_norm: 0.946355 +Steps: 0%| | 61/1000000 [00:19<67:46:35, 4.10it/s, grad_norm=0.946, loss_final=4.49, loss_mean=1.31, 
loss_mean_cls=3.24, proj_loss=-0.0562][2026-03-23 13:36:39] Step: 61, Training Logs: loss_final: 4.361525, loss_mean: 1.337970, proj_loss: -0.054187, loss_mean_cls: 3.077741, grad_norm: 0.974182 +Steps: 0%| | 62/1000000 [00:19<67:47:18, 4.10it/s, grad_norm=0.974, loss_final=4.36, loss_mean=1.34, loss_mean_cls=3.08, proj_loss=-0.0542][2026-03-23 13:36:39] Step: 62, Training Logs: loss_final: 4.189270, loss_mean: 1.311800, proj_loss: -0.055131, loss_mean_cls: 2.932600, grad_norm: 0.868944 +Steps: 0%| | 63/1000000 [00:19<67:50:30, 4.09it/s, grad_norm=0.869, loss_final=4.19, loss_mean=1.31, loss_mean_cls=2.93, proj_loss=-0.0551][2026-03-23 13:36:39] Step: 63, Training Logs: loss_final: 4.340247, loss_mean: 1.309375, proj_loss: -0.054263, loss_mean_cls: 3.085135, grad_norm: 0.771930 +Steps: 0%| | 64/1000000 [00:19<67:50:32, 4.09it/s, grad_norm=0.772, loss_final=4.34, loss_mean=1.31, loss_mean_cls=3.09, proj_loss=-0.0543][2026-03-23 13:36:39] Step: 64, Training Logs: loss_final: 3.603880, loss_mean: 1.345889, proj_loss: -0.056353, loss_mean_cls: 2.314343, grad_norm: 0.674901 +Steps: 0%| | 65/1000000 [00:20<67:46:21, 4.10it/s, grad_norm=0.675, loss_final=3.6, loss_mean=1.35, loss_mean_cls=2.31, proj_loss=-0.0564][2026-03-23 13:36:40] Step: 65, Training Logs: loss_final: 3.807487, loss_mean: 1.331215, proj_loss: -0.057539, loss_mean_cls: 2.533812, grad_norm: 1.268465 +Steps: 0%| | 66/1000000 [00:20<67:42:11, 4.10it/s, grad_norm=1.27, loss_final=3.81, loss_mean=1.33, loss_mean_cls=2.53, proj_loss=-0.0575][2026-03-23 13:36:40] Step: 66, Training Logs: loss_final: 4.784114, loss_mean: 1.292676, proj_loss: -0.057269, loss_mean_cls: 3.548707, grad_norm: 1.406024 +Steps: 0%| | 67/1000000 [00:20<67:40:35, 4.10it/s, grad_norm=1.41, loss_final=4.78, loss_mean=1.29, loss_mean_cls=3.55, proj_loss=-0.0573][2026-03-23 13:36:40] Step: 67, Training Logs: loss_final: 4.604260, loss_mean: 1.293122, proj_loss: -0.055983, loss_mean_cls: 3.367121, grad_norm: 1.192854 +Steps: 0%| | 68/1000000 [00:20<67:45:43, 4.10it/s, grad_norm=1.19, loss_final=4.6, loss_mean=1.29, loss_mean_cls=3.37, proj_loss=-0.056][2026-03-23 13:36:40] Step: 68, Training Logs: loss_final: 3.852891, loss_mean: 1.283622, proj_loss: -0.055106, loss_mean_cls: 2.624375, grad_norm: 0.773356 +Steps: 0%| | 69/1000000 [00:21<67:42:20, 4.10it/s, grad_norm=0.773, loss_final=3.85, loss_mean=1.28, loss_mean_cls=2.62, proj_loss=-0.0551][2026-03-23 13:36:41] Step: 69, Training Logs: loss_final: 4.328906, loss_mean: 1.255382, proj_loss: -0.055554, loss_mean_cls: 3.129078, grad_norm: 0.878106 +Steps: 0%| | 70/1000000 [00:21<67:39:49, 4.10it/s, grad_norm=0.878, loss_final=4.33, loss_mean=1.26, loss_mean_cls=3.13, proj_loss=-0.0556][2026-03-23 13:36:41] Step: 70, Training Logs: loss_final: 5.168178, loss_mean: 1.246450, proj_loss: -0.055992, loss_mean_cls: 3.977720, grad_norm: 1.196400 +Steps: 0%| | 71/1000000 [00:21<67:38:59, 4.11it/s, grad_norm=1.2, loss_final=5.17, loss_mean=1.25, loss_mean_cls=3.98, proj_loss=-0.056][2026-03-23 13:36:41] Step: 71, Training Logs: loss_final: 4.204251, loss_mean: 1.259648, proj_loss: -0.058954, loss_mean_cls: 3.003557, grad_norm: 0.841489 +Steps: 0%| | 72/1000000 [00:21<67:44:52, 4.10it/s, grad_norm=0.841, loss_final=4.2, loss_mean=1.26, loss_mean_cls=3, proj_loss=-0.059][2026-03-23 13:36:41] Step: 72, Training Logs: loss_final: 4.913557, loss_mean: 1.232818, proj_loss: -0.054968, loss_mean_cls: 3.735708, grad_norm: 0.886124 +Steps: 0%| | 73/1000000 [00:22<67:42:07, 4.10it/s, grad_norm=0.886, loss_final=4.91, loss_mean=1.23, 
loss_mean_cls=3.74, proj_loss=-0.055][2026-03-23 13:36:42] Step: 73, Training Logs: loss_final: 4.108160, loss_mean: 1.247858, proj_loss: -0.054757, loss_mean_cls: 2.915059, grad_norm: 1.030328 +Steps: 0%| | 74/1000000 [00:22<67:39:34, 4.11it/s, grad_norm=1.03, loss_final=4.11, loss_mean=1.25, loss_mean_cls=2.92, proj_loss=-0.0548][2026-03-23 13:36:42] Step: 74, Training Logs: loss_final: 4.299341, loss_mean: 1.222750, proj_loss: -0.056083, loss_mean_cls: 3.132674, grad_norm: 0.967199 +Steps: 0%| | 75/1000000 [00:22<67:39:56, 4.10it/s, grad_norm=0.967, loss_final=4.3, loss_mean=1.22, loss_mean_cls=3.13, proj_loss=-0.0561][2026-03-23 13:36:42] Step: 75, Training Logs: loss_final: 4.699260, loss_mean: 1.256376, proj_loss: -0.054988, loss_mean_cls: 3.497873, grad_norm: 0.967953 +Steps: 0%| | 76/1000000 [00:22<67:43:17, 4.10it/s, grad_norm=0.968, loss_final=4.7, loss_mean=1.26, loss_mean_cls=3.5, proj_loss=-0.055][2026-03-23 13:36:42] Step: 76, Training Logs: loss_final: 4.797206, loss_mean: 1.245866, proj_loss: -0.058861, loss_mean_cls: 3.610201, grad_norm: 2.023631 +Steps: 0%| | 77/1000000 [00:23<67:42:52, 4.10it/s, grad_norm=2.02, loss_final=4.8, loss_mean=1.25, loss_mean_cls=3.61, proj_loss=-0.0589][2026-03-23 13:36:42] Step: 77, Training Logs: loss_final: 4.305351, loss_mean: 1.220361, proj_loss: -0.056511, loss_mean_cls: 3.141501, grad_norm: 1.768806 +Steps: 0%| | 78/1000000 [00:23<67:41:33, 4.10it/s, grad_norm=1.77, loss_final=4.31, loss_mean=1.22, loss_mean_cls=3.14, proj_loss=-0.0565][2026-03-23 13:36:43] Step: 78, Training Logs: loss_final: 4.421437, loss_mean: 1.295188, proj_loss: -0.057967, loss_mean_cls: 3.184215, grad_norm: 3.325398 +Steps: 0%| | 79/1000000 [00:23<67:40:13, 4.10it/s, grad_norm=3.33, loss_final=4.42, loss_mean=1.3, loss_mean_cls=3.18, proj_loss=-0.058][2026-03-23 13:36:43] Step: 79, Training Logs: loss_final: 4.319492, loss_mean: 1.269802, proj_loss: -0.057809, loss_mean_cls: 3.107499, grad_norm: 4.098631 +Steps: 0%| | 80/1000000 [00:23<67:39:10, 4.11it/s, grad_norm=4.1, loss_final=4.32, loss_mean=1.27, loss_mean_cls=3.11, proj_loss=-0.0578][2026-03-23 13:36:43] Step: 80, Training Logs: loss_final: 5.171796, loss_mean: 1.187868, proj_loss: -0.055299, loss_mean_cls: 4.039227, grad_norm: 2.000139 +Steps: 0%| | 81/1000000 [00:23<67:40:02, 4.10it/s, grad_norm=2, loss_final=5.17, loss_mean=1.19, loss_mean_cls=4.04, proj_loss=-0.0553][2026-03-23 13:36:43] Step: 81, Training Logs: loss_final: 4.155951, loss_mean: 1.235599, proj_loss: -0.057730, loss_mean_cls: 2.978083, grad_norm: 2.569617 +Steps: 0%| | 82/1000000 [00:24<67:38:22, 4.11it/s, grad_norm=2.57, loss_final=4.16, loss_mean=1.24, loss_mean_cls=2.98, proj_loss=-0.0577][2026-03-23 13:36:44] Step: 82, Training Logs: loss_final: 4.357609, loss_mean: 1.252133, proj_loss: -0.056178, loss_mean_cls: 3.161653, grad_norm: 3.458743 +Steps: 0%| | 83/1000000 [00:24<67:39:20, 4.11it/s, grad_norm=3.46, loss_final=4.36, loss_mean=1.25, loss_mean_cls=3.16, proj_loss=-0.0562][2026-03-23 13:36:44] Step: 83, Training Logs: loss_final: 4.057786, loss_mean: 1.254738, proj_loss: -0.057184, loss_mean_cls: 2.860233, grad_norm: 2.050188 +Steps: 0%| | 84/1000000 [00:24<67:40:27, 4.10it/s, grad_norm=2.05, loss_final=4.06, loss_mean=1.25, loss_mean_cls=2.86, proj_loss=-0.0572][2026-03-23 13:36:44] Step: 84, Training Logs: loss_final: 4.413987, loss_mean: 1.204479, proj_loss: -0.056045, loss_mean_cls: 3.265552, grad_norm: 1.733807 +Steps: 0%| | 85/1000000 [00:24<67:39:26, 4.11it/s, grad_norm=1.73, loss_final=4.41, loss_mean=1.2, 
loss_mean_cls=3.27, proj_loss=-0.056]
+[2026-03-23 13:36:44] Step: 85, Training Logs: loss_final: 4.315685, loss_mean: 1.207567, proj_loss: -0.055839, loss_mean_cls: 3.163958, grad_norm: 1.514510
+[2026-03-23 13:36:45] Step: 86, Training Logs: loss_final: 4.045437, loss_mean: 1.207488, proj_loss: -0.057399, loss_mean_cls: 2.895348, grad_norm: 1.610461
+[2026-03-23 13:36:45] Step: 87, Training Logs: loss_final: 4.307369, loss_mean: 1.199939, proj_loss: -0.055906, loss_mean_cls: 3.163336, grad_norm: 1.239633
+... (per-step entries continue in this format toward step 1,000,000 at roughly 4.1 it/s; by step ~334 loss_mean has drifted from ~1.21 down to ~1.01)
+[2026-03-23 13:37:46] Step: 334, Training Logs: loss_final: 3.681862, loss_mean: 1.005076, proj_loss: -0.060391, loss_mean_cls: 2.737178, grad_norm: 2.714313
+Steps: 0%| | 335/1000000 [01:26<67:49:00, 4.09it/s, grad_norm=2.71, loss_final=3.68, loss_mean=1.01, loss_mean_cls=2.74, proj_loss=-0.0604]
grad_norm: 3.202840 +Steps: 0%| | 336/1000000 [01:27<67:48:57, 4.09it/s, grad_norm=3.2, loss_final=4.21, loss_mean=0.983, loss_mean_cls=3.29, proj_loss=-0.0611][2026-03-23 13:37:47] Step: 336, Training Logs: loss_final: 3.724508, loss_mean: 0.996315, proj_loss: -0.058215, loss_mean_cls: 2.786409, grad_norm: 3.990045 +Steps: 0%| | 337/1000000 [01:27<67:47:09, 4.10it/s, grad_norm=3.99, loss_final=3.72, loss_mean=0.996, loss_mean_cls=2.79, proj_loss=-0.0582][2026-03-23 13:37:47] Step: 337, Training Logs: loss_final: 4.006943, loss_mean: 0.996930, proj_loss: -0.057548, loss_mean_cls: 3.067561, grad_norm: 2.959555 +Steps: 0%| | 338/1000000 [01:27<67:46:59, 4.10it/s, grad_norm=2.96, loss_final=4.01, loss_mean=0.997, loss_mean_cls=3.07, proj_loss=-0.0575][2026-03-23 13:37:47] Step: 338, Training Logs: loss_final: 4.259184, loss_mean: 1.001817, proj_loss: -0.059495, loss_mean_cls: 3.316862, grad_norm: 3.860257 +Steps: 0%| | 339/1000000 [01:27<67:48:50, 4.09it/s, grad_norm=3.86, loss_final=4.26, loss_mean=1, loss_mean_cls=3.32, proj_loss=-0.0595][2026-03-23 13:37:47] Step: 339, Training Logs: loss_final: 3.754354, loss_mean: 0.996577, proj_loss: -0.059152, loss_mean_cls: 2.816929, grad_norm: 3.631840 +Steps: 0%| | 340/1000000 [01:28<67:46:23, 4.10it/s, grad_norm=3.63, loss_final=3.75, loss_mean=0.997, loss_mean_cls=2.82, proj_loss=-0.0592][2026-03-23 13:37:48] Step: 340, Training Logs: loss_final: 3.576205, loss_mean: 1.012405, proj_loss: -0.057210, loss_mean_cls: 2.621010, grad_norm: 3.005658 +Steps: 0%| | 341/1000000 [01:28<67:47:46, 4.10it/s, grad_norm=3.01, loss_final=3.58, loss_mean=1.01, loss_mean_cls=2.62, proj_loss=-0.0572][2026-03-23 13:37:48] Step: 341, Training Logs: loss_final: 3.852180, loss_mean: 0.999421, proj_loss: -0.058261, loss_mean_cls: 2.911020, grad_norm: 4.224223 +Steps: 0%| | 342/1000000 [01:28<67:46:35, 4.10it/s, grad_norm=4.22, loss_final=3.85, loss_mean=0.999, loss_mean_cls=2.91, proj_loss=-0.0583][2026-03-23 13:37:48] Step: 342, Training Logs: loss_final: 4.287997, loss_mean: 1.002059, proj_loss: -0.055832, loss_mean_cls: 3.341770, grad_norm: 2.866695 +Steps: 0%| | 343/1000000 [01:28<67:48:22, 4.10it/s, grad_norm=2.87, loss_final=4.29, loss_mean=1, loss_mean_cls=3.34, proj_loss=-0.0558][2026-03-23 13:37:48] Step: 343, Training Logs: loss_final: 3.705786, loss_mean: 1.010663, proj_loss: -0.059344, loss_mean_cls: 2.754467, grad_norm: 2.286735 +Steps: 0%| | 344/1000000 [01:29<67:46:01, 4.10it/s, grad_norm=2.29, loss_final=3.71, loss_mean=1.01, loss_mean_cls=2.75, proj_loss=-0.0593][2026-03-23 13:37:49] Step: 344, Training Logs: loss_final: 3.719071, loss_mean: 1.023116, proj_loss: -0.057620, loss_mean_cls: 2.753574, grad_norm: 2.720178 +Steps: 0%| | 345/1000000 [01:29<67:44:27, 4.10it/s, grad_norm=2.72, loss_final=3.72, loss_mean=1.02, loss_mean_cls=2.75, proj_loss=-0.0576][2026-03-23 13:37:49] Step: 345, Training Logs: loss_final: 3.986015, loss_mean: 0.986666, proj_loss: -0.060065, loss_mean_cls: 3.059415, grad_norm: 4.063354 +Steps: 0%| | 346/1000000 [01:29<67:42:10, 4.10it/s, grad_norm=4.06, loss_final=3.99, loss_mean=0.987, loss_mean_cls=3.06, proj_loss=-0.0601][2026-03-23 13:37:49] Step: 346, Training Logs: loss_final: 3.953215, loss_mean: 1.010508, proj_loss: -0.056166, loss_mean_cls: 2.998873, grad_norm: 4.408749 +Steps: 0%| | 347/1000000 [01:29<67:43:44, 4.10it/s, grad_norm=4.41, loss_final=3.95, loss_mean=1.01, loss_mean_cls=3, proj_loss=-0.0562][2026-03-23 13:37:49] Step: 347, Training Logs: loss_final: 3.608379, loss_mean: 1.014834, proj_loss: -0.057683, 
loss_mean_cls: 2.651228, grad_norm: 3.444885 +Steps: 0%| | 348/1000000 [01:30<67:45:16, 4.10it/s, grad_norm=3.44, loss_final=3.61, loss_mean=1.01, loss_mean_cls=2.65, proj_loss=-0.0577][2026-03-23 13:37:50] Step: 348, Training Logs: loss_final: 4.125015, loss_mean: 1.013402, proj_loss: -0.059990, loss_mean_cls: 3.171602, grad_norm: 4.406186 +Steps: 0%| | 349/1000000 [01:30<67:45:10, 4.10it/s, grad_norm=4.41, loss_final=4.13, loss_mean=1.01, loss_mean_cls=3.17, proj_loss=-0.06][2026-03-23 13:37:50] Step: 349, Training Logs: loss_final: 4.201127, loss_mean: 1.000545, proj_loss: -0.057967, loss_mean_cls: 3.258548, grad_norm: 4.266675 +Steps: 0%| | 350/1000000 [01:30<67:43:34, 4.10it/s, grad_norm=4.27, loss_final=4.2, loss_mean=1, loss_mean_cls=3.26, proj_loss=-0.058][2026-03-23 13:37:50] Step: 350, Training Logs: loss_final: 4.032846, loss_mean: 0.985661, proj_loss: -0.057579, loss_mean_cls: 3.104764, grad_norm: 2.872370 +Steps: 0%| | 351/1000000 [01:30<67:44:14, 4.10it/s, grad_norm=2.87, loss_final=4.03, loss_mean=0.986, loss_mean_cls=3.1, proj_loss=-0.0576][2026-03-23 13:37:50] Step: 351, Training Logs: loss_final: 3.874955, loss_mean: 1.016733, proj_loss: -0.057095, loss_mean_cls: 2.915318, grad_norm: 3.647907 +Steps: 0%| | 352/1000000 [01:31<67:44:17, 4.10it/s, grad_norm=3.65, loss_final=3.87, loss_mean=1.02, loss_mean_cls=2.92, proj_loss=-0.0571][2026-03-23 13:37:51] Step: 352, Training Logs: loss_final: 3.785285, loss_mean: 1.005697, proj_loss: -0.059209, loss_mean_cls: 2.838797, grad_norm: 3.046071 +Steps: 0%| | 353/1000000 [01:31<67:42:34, 4.10it/s, grad_norm=3.05, loss_final=3.79, loss_mean=1.01, loss_mean_cls=2.84, proj_loss=-0.0592][2026-03-23 13:37:51] Step: 353, Training Logs: loss_final: 3.529530, loss_mean: 0.990240, proj_loss: -0.058699, loss_mean_cls: 2.597990, grad_norm: 2.936688 +Steps: 0%| | 354/1000000 [01:31<67:42:19, 4.10it/s, grad_norm=2.94, loss_final=3.53, loss_mean=0.99, loss_mean_cls=2.6, proj_loss=-0.0587][2026-03-23 13:37:51] Step: 354, Training Logs: loss_final: 4.211801, loss_mean: 0.979218, proj_loss: -0.056706, loss_mean_cls: 3.289289, grad_norm: 2.775118 +Steps: 0%| | 355/1000000 [01:31<67:42:02, 4.10it/s, grad_norm=2.78, loss_final=4.21, loss_mean=0.979, loss_mean_cls=3.29, proj_loss=-0.0567][2026-03-23 13:37:51] Step: 355, Training Logs: loss_final: 3.776501, loss_mean: 1.006654, proj_loss: -0.057815, loss_mean_cls: 2.827662, grad_norm: 3.468966 +Steps: 0%| | 356/1000000 [01:32<67:40:06, 4.10it/s, grad_norm=3.47, loss_final=3.78, loss_mean=1.01, loss_mean_cls=2.83, proj_loss=-0.0578][2026-03-23 13:37:52] Step: 356, Training Logs: loss_final: 3.551684, loss_mean: 1.000604, proj_loss: -0.056923, loss_mean_cls: 2.608003, grad_norm: 3.286696 +Steps: 0%| | 357/1000000 [01:32<67:39:50, 4.10it/s, grad_norm=3.29, loss_final=3.55, loss_mean=1, loss_mean_cls=2.61, proj_loss=-0.0569][2026-03-23 13:37:52] Step: 357, Training Logs: loss_final: 3.416160, loss_mean: 1.018969, proj_loss: -0.058944, loss_mean_cls: 2.456134, grad_norm: 2.896245 +Steps: 0%| | 358/1000000 [01:32<67:37:42, 4.11it/s, grad_norm=2.9, loss_final=3.42, loss_mean=1.02, loss_mean_cls=2.46, proj_loss=-0.0589][2026-03-23 13:37:52] Step: 358, Training Logs: loss_final: 3.929278, loss_mean: 0.990120, proj_loss: -0.058139, loss_mean_cls: 2.997298, grad_norm: 3.573277 +Steps: 0%| | 359/1000000 [01:32<67:38:26, 4.11it/s, grad_norm=3.57, loss_final=3.93, loss_mean=0.99, loss_mean_cls=3, proj_loss=-0.0581][2026-03-23 13:37:52] Step: 359, Training Logs: loss_final: 3.926426, loss_mean: 1.012582, proj_loss: 
-0.058543, loss_mean_cls: 2.972387, grad_norm: 4.717799 +Steps: 0%| | 360/1000000 [01:33<67:38:06, 4.11it/s, grad_norm=4.72, loss_final=3.93, loss_mean=1.01, loss_mean_cls=2.97, proj_loss=-0.0585][2026-03-23 13:37:53] Step: 360, Training Logs: loss_final: 3.734210, loss_mean: 0.987489, proj_loss: -0.056995, loss_mean_cls: 2.803716, grad_norm: 3.226545 +Steps: 0%| | 361/1000000 [01:33<67:39:02, 4.10it/s, grad_norm=3.23, loss_final=3.73, loss_mean=0.987, loss_mean_cls=2.8, proj_loss=-0.057][2026-03-23 13:37:53] Step: 361, Training Logs: loss_final: 3.270634, loss_mean: 1.021888, proj_loss: -0.059376, loss_mean_cls: 2.308122, grad_norm: 5.203046 +Steps: 0%| | 362/1000000 [01:33<67:38:14, 4.11it/s, grad_norm=5.2, loss_final=3.27, loss_mean=1.02, loss_mean_cls=2.31, proj_loss=-0.0594][2026-03-23 13:37:53] Step: 362, Training Logs: loss_final: 3.667138, loss_mean: 1.016119, proj_loss: -0.060708, loss_mean_cls: 2.711727, grad_norm: 4.466422 +Steps: 0%| | 363/1000000 [01:33<67:39:21, 4.10it/s, grad_norm=4.47, loss_final=3.67, loss_mean=1.02, loss_mean_cls=2.71, proj_loss=-0.0607][2026-03-23 13:37:53] Step: 363, Training Logs: loss_final: 3.734156, loss_mean: 1.000050, proj_loss: -0.056929, loss_mean_cls: 2.791035, grad_norm: 4.626692 +Steps: 0%| | 364/1000000 [01:34<67:45:45, 4.10it/s, grad_norm=4.63, loss_final=3.73, loss_mean=1, loss_mean_cls=2.79, proj_loss=-0.0569][2026-03-23 13:37:54] Step: 364, Training Logs: loss_final: 3.417872, loss_mean: 1.013145, proj_loss: -0.058018, loss_mean_cls: 2.462745, grad_norm: 4.699632 +Steps: 0%| | 365/1000000 [01:34<67:54:37, 4.09it/s, grad_norm=4.7, loss_final=3.42, loss_mean=1.01, loss_mean_cls=2.46, proj_loss=-0.058][2026-03-23 13:37:54] Step: 365, Training Logs: loss_final: 3.828597, loss_mean: 1.004641, proj_loss: -0.057841, loss_mean_cls: 2.881797, grad_norm: 4.537452 +Steps: 0%| | 366/1000000 [01:34<67:48:46, 4.09it/s, grad_norm=4.54, loss_final=3.83, loss_mean=1, loss_mean_cls=2.88, proj_loss=-0.0578][2026-03-23 13:37:54] Step: 366, Training Logs: loss_final: 3.705940, loss_mean: 1.033906, proj_loss: -0.057911, loss_mean_cls: 2.729945, grad_norm: 4.508230 +Steps: 0%| | 367/1000000 [01:34<67:45:21, 4.10it/s, grad_norm=4.51, loss_final=3.71, loss_mean=1.03, loss_mean_cls=2.73, proj_loss=-0.0579][2026-03-23 13:37:54] Step: 367, Training Logs: loss_final: 3.755884, loss_mean: 0.970517, proj_loss: -0.056380, loss_mean_cls: 2.841747, grad_norm: 3.251605 +Steps: 0%| | 368/1000000 [01:35<67:42:42, 4.10it/s, grad_norm=3.25, loss_final=3.76, loss_mean=0.971, loss_mean_cls=2.84, proj_loss=-0.0564][2026-03-23 13:37:54] Step: 368, Training Logs: loss_final: 4.283263, loss_mean: 0.990184, proj_loss: -0.057812, loss_mean_cls: 3.350891, grad_norm: 4.465212 +Steps: 0%| | 369/1000000 [01:35<67:41:29, 4.10it/s, grad_norm=4.47, loss_final=4.28, loss_mean=0.99, loss_mean_cls=3.35, proj_loss=-0.0578][2026-03-23 13:37:55] Step: 369, Training Logs: loss_final: 3.902206, loss_mean: 0.997576, proj_loss: -0.059062, loss_mean_cls: 2.963691, grad_norm: 3.681235 +Steps: 0%| | 370/1000000 [01:35<67:40:38, 4.10it/s, grad_norm=3.68, loss_final=3.9, loss_mean=0.998, loss_mean_cls=2.96, proj_loss=-0.0591][2026-03-23 13:37:55] Step: 370, Training Logs: loss_final: 3.806526, loss_mean: 0.996807, proj_loss: -0.060668, loss_mean_cls: 2.870388, grad_norm: 3.474685 +Steps: 0%| | 371/1000000 [01:35<68:00:17, 4.08it/s, grad_norm=3.47, loss_final=3.81, loss_mean=0.997, loss_mean_cls=2.87, proj_loss=-0.0607][2026-03-23 13:37:55] Step: 371, Training Logs: loss_final: 3.404234, loss_mean: 
1.026602, proj_loss: -0.056869, loss_mean_cls: 2.434502, grad_norm: 3.240109 +Steps: 0%| | 372/1000000 [01:36<68:08:56, 4.07it/s, grad_norm=3.24, loss_final=3.4, loss_mean=1.03, loss_mean_cls=2.43, proj_loss=-0.0569][2026-03-23 13:37:55] Step: 372, Training Logs: loss_final: 3.497092, loss_mean: 1.021417, proj_loss: -0.057944, loss_mean_cls: 2.533619, grad_norm: 3.723712 +Steps: 0%| | 373/1000000 [01:36<67:59:33, 4.08it/s, grad_norm=3.72, loss_final=3.5, loss_mean=1.02, loss_mean_cls=2.53, proj_loss=-0.0579][2026-03-23 13:37:56] Step: 373, Training Logs: loss_final: 4.127393, loss_mean: 0.981887, proj_loss: -0.057174, loss_mean_cls: 3.202680, grad_norm: 2.748789 +Steps: 0%| | 374/1000000 [01:36<67:52:26, 4.09it/s, grad_norm=2.75, loss_final=4.13, loss_mean=0.982, loss_mean_cls=3.2, proj_loss=-0.0572][2026-03-23 13:37:56] Step: 374, Training Logs: loss_final: 3.933258, loss_mean: 0.990646, proj_loss: -0.056828, loss_mean_cls: 2.999440, grad_norm: 3.638493 +Steps: 0%| | 375/1000000 [01:36<68:11:17, 4.07it/s, grad_norm=3.64, loss_final=3.93, loss_mean=0.991, loss_mean_cls=3, proj_loss=-0.0568][2026-03-23 13:37:56] Step: 375, Training Logs: loss_final: 4.355663, loss_mean: 0.977721, proj_loss: -0.059416, loss_mean_cls: 3.437358, grad_norm: 3.629993 +Steps: 0%| | 376/1000000 [01:36<68:02:10, 4.08it/s, grad_norm=3.63, loss_final=4.36, loss_mean=0.978, loss_mean_cls=3.44, proj_loss=-0.0594][2026-03-23 13:37:56] Step: 376, Training Logs: loss_final: 3.613506, loss_mean: 0.994713, proj_loss: -0.059096, loss_mean_cls: 2.677889, grad_norm: 4.163279 +Steps: 0%| | 377/1000000 [01:37<68:10:24, 4.07it/s, grad_norm=4.16, loss_final=3.61, loss_mean=0.995, loss_mean_cls=2.68, proj_loss=-0.0591][2026-03-23 13:37:57] Step: 377, Training Logs: loss_final: 3.826010, loss_mean: 0.985312, proj_loss: -0.058542, loss_mean_cls: 2.899240, grad_norm: 3.371379 +Steps: 0%| | 378/1000000 [01:37<68:20:31, 4.06it/s, grad_norm=3.37, loss_final=3.83, loss_mean=0.985, loss_mean_cls=2.9, proj_loss=-0.0585][2026-03-23 13:37:57] Step: 378, Training Logs: loss_final: 3.682867, loss_mean: 1.012500, proj_loss: -0.058385, loss_mean_cls: 2.728752, grad_norm: 4.518425 +Steps: 0%| | 379/1000000 [01:37<68:09:01, 4.07it/s, grad_norm=4.52, loss_final=3.68, loss_mean=1.01, loss_mean_cls=2.73, proj_loss=-0.0584][2026-03-23 13:37:57] Step: 379, Training Logs: loss_final: 3.622719, loss_mean: 1.015603, proj_loss: -0.057785, loss_mean_cls: 2.664901, grad_norm: 4.638657 +Steps: 0%| | 380/1000000 [01:37<68:00:30, 4.08it/s, grad_norm=4.64, loss_final=3.62, loss_mean=1.02, loss_mean_cls=2.66, proj_loss=-0.0578][2026-03-23 13:37:57] Step: 380, Training Logs: loss_final: 3.506017, loss_mean: 0.992923, proj_loss: -0.059878, loss_mean_cls: 2.572972, grad_norm: 3.673554 +Steps: 0%| | 381/1000000 [01:38<68:17:33, 4.07it/s, grad_norm=3.67, loss_final=3.51, loss_mean=0.993, loss_mean_cls=2.57, proj_loss=-0.0599][2026-03-23 13:37:58] Step: 381, Training Logs: loss_final: 4.020730, loss_mean: 0.998361, proj_loss: -0.059693, loss_mean_cls: 3.082062, grad_norm: 5.395801 +Steps: 0%| | 382/1000000 [01:38<68:27:35, 4.06it/s, grad_norm=5.4, loss_final=4.02, loss_mean=0.998, loss_mean_cls=3.08, proj_loss=-0.0597][2026-03-23 13:37:58] Step: 382, Training Logs: loss_final: 4.084125, loss_mean: 1.037431, proj_loss: -0.060547, loss_mean_cls: 3.107240, grad_norm: 6.453217 +Steps: 0%| | 383/1000000 [01:38<68:15:26, 4.07it/s, grad_norm=6.45, loss_final=4.08, loss_mean=1.04, loss_mean_cls=3.11, proj_loss=-0.0605][2026-03-23 13:37:58] Step: 383, Training Logs: loss_final: 
4.085636, loss_mean: 0.998921, proj_loss: -0.058699, loss_mean_cls: 3.145415, grad_norm: 4.368627 +Steps: 0%| | 384/1000000 [01:38<68:04:49, 4.08it/s, grad_norm=4.37, loss_final=4.09, loss_mean=0.999, loss_mean_cls=3.15, proj_loss=-0.0587][2026-03-23 13:37:58] Step: 384, Training Logs: loss_final: 3.918699, loss_mean: 0.981990, proj_loss: -0.058793, loss_mean_cls: 2.995502, grad_norm: 3.868781 +Steps: 0%| | 385/1000000 [01:39<68:17:07, 4.07it/s, grad_norm=3.87, loss_final=3.92, loss_mean=0.982, loss_mean_cls=3, proj_loss=-0.0588][2026-03-23 13:37:59] Step: 385, Training Logs: loss_final: 3.394302, loss_mean: 1.044443, proj_loss: -0.057135, loss_mean_cls: 2.406994, grad_norm: 3.792483 +Steps: 0%| | 386/1000000 [01:39<68:26:19, 4.06it/s, grad_norm=3.79, loss_final=3.39, loss_mean=1.04, loss_mean_cls=2.41, proj_loss=-0.0571][2026-03-23 13:37:59] Step: 386, Training Logs: loss_final: 3.872888, loss_mean: 0.992363, proj_loss: -0.060322, loss_mean_cls: 2.940847, grad_norm: 2.830643 +Steps: 0%| | 387/1000000 [01:39<68:13:41, 4.07it/s, grad_norm=2.83, loss_final=3.87, loss_mean=0.992, loss_mean_cls=2.94, proj_loss=-0.0603][2026-03-23 13:37:59] Step: 387, Training Logs: loss_final: 3.844979, loss_mean: 0.993630, proj_loss: -0.062177, loss_mean_cls: 2.913525, grad_norm: 4.151726 +Steps: 0%| | 388/1000000 [01:39<68:24:07, 4.06it/s, grad_norm=4.15, loss_final=3.84, loss_mean=0.994, loss_mean_cls=2.91, proj_loss=-0.0622][2026-03-23 13:37:59] Step: 388, Training Logs: loss_final: 3.907385, loss_mean: 0.988917, proj_loss: -0.058458, loss_mean_cls: 2.976925, grad_norm: 4.424355 +Steps: 0%| | 389/1000000 [01:40<68:26:35, 4.06it/s, grad_norm=4.42, loss_final=3.91, loss_mean=0.989, loss_mean_cls=2.98, proj_loss=-0.0585][2026-03-23 13:38:00] Step: 389, Training Logs: loss_final: 3.327323, loss_mean: 0.989985, proj_loss: -0.057957, loss_mean_cls: 2.395295, grad_norm: 3.095365 +Steps: 0%| | 390/1000000 [01:40<68:11:31, 4.07it/s, grad_norm=3.1, loss_final=3.33, loss_mean=0.99, loss_mean_cls=2.4, proj_loss=-0.058][2026-03-23 13:38:00] Step: 390, Training Logs: loss_final: 3.308805, loss_mean: 1.034614, proj_loss: -0.058628, loss_mean_cls: 2.332818, grad_norm: 4.913064 +Steps: 0%| | 391/1000000 [01:40<68:01:36, 4.08it/s, grad_norm=4.91, loss_final=3.31, loss_mean=1.03, loss_mean_cls=2.33, proj_loss=-0.0586][2026-03-23 13:38:00] Step: 391, Training Logs: loss_final: 3.713989, loss_mean: 1.001573, proj_loss: -0.057862, loss_mean_cls: 2.770278, grad_norm: 5.146685 +Steps: 0%| | 392/1000000 [01:40<67:56:08, 4.09it/s, grad_norm=5.15, loss_final=3.71, loss_mean=1, loss_mean_cls=2.77, proj_loss=-0.0579][2026-03-23 13:38:00] Step: 392, Training Logs: loss_final: 3.726823, loss_mean: 1.001923, proj_loss: -0.060552, loss_mean_cls: 2.785452, grad_norm: 3.354698 +Steps: 0%| | 393/1000000 [01:41<67:52:16, 4.09it/s, grad_norm=3.35, loss_final=3.73, loss_mean=1, loss_mean_cls=2.79, proj_loss=-0.0606][2026-03-23 13:38:01] Step: 393, Training Logs: loss_final: 3.799680, loss_mean: 1.019991, proj_loss: -0.058267, loss_mean_cls: 2.837956, grad_norm: 4.012832 +Steps: 0%| | 394/1000000 [01:41<67:47:26, 4.10it/s, grad_norm=4.01, loss_final=3.8, loss_mean=1.02, loss_mean_cls=2.84, proj_loss=-0.0583][2026-03-23 13:38:01] Step: 394, Training Logs: loss_final: 3.869999, loss_mean: 0.995879, proj_loss: -0.058217, loss_mean_cls: 2.932338, grad_norm: 3.583144 +Steps: 0%| | 395/1000000 [01:41<67:46:03, 4.10it/s, grad_norm=3.58, loss_final=3.87, loss_mean=0.996, loss_mean_cls=2.93, proj_loss=-0.0582][2026-03-23 13:38:01] Step: 395, Training 
Logs: loss_final: 3.301066, loss_mean: 1.017507, proj_loss: -0.057256, loss_mean_cls: 2.340815, grad_norm: 3.644650 +Steps: 0%| | 396/1000000 [01:41<67:43:44, 4.10it/s, grad_norm=3.64, loss_final=3.3, loss_mean=1.02, loss_mean_cls=2.34, proj_loss=-0.0573][2026-03-23 13:38:01] Step: 396, Training Logs: loss_final: 4.273154, loss_mean: 0.982387, proj_loss: -0.057120, loss_mean_cls: 3.347887, grad_norm: 4.111063 +Steps: 0%| | 397/1000000 [01:42<67:40:46, 4.10it/s, grad_norm=4.11, loss_final=4.27, loss_mean=0.982, loss_mean_cls=3.35, proj_loss=-0.0571][2026-03-23 13:38:02] Step: 397, Training Logs: loss_final: 4.027816, loss_mean: 1.027129, proj_loss: -0.057801, loss_mean_cls: 3.058488, grad_norm: 5.838287 +Steps: 0%| | 398/1000000 [01:42<67:38:30, 4.10it/s, grad_norm=5.84, loss_final=4.03, loss_mean=1.03, loss_mean_cls=3.06, proj_loss=-0.0578][2026-03-23 13:38:02] Step: 398, Training Logs: loss_final: 4.098645, loss_mean: 1.013320, proj_loss: -0.060827, loss_mean_cls: 3.146152, grad_norm: 4.467819 +Steps: 0%| | 399/1000000 [01:42<67:38:44, 4.10it/s, grad_norm=4.47, loss_final=4.1, loss_mean=1.01, loss_mean_cls=3.15, proj_loss=-0.0608][2026-03-23 13:38:02] Step: 399, Training Logs: loss_final: 4.578247, loss_mean: 0.977268, proj_loss: -0.059525, loss_mean_cls: 3.660504, grad_norm: 3.607288 +Steps: 0%| | 400/1000000 [01:42<67:40:22, 4.10it/s, grad_norm=3.61, loss_final=4.58, loss_mean=0.977, loss_mean_cls=3.66, proj_loss=-0.0595][2026-03-23 13:38:02] Step: 400, Training Logs: loss_final: 3.908290, loss_mean: 1.008396, proj_loss: -0.058050, loss_mean_cls: 2.957944, grad_norm: 4.005125 +Steps: 0%| | 401/1000000 [01:43<67:37:07, 4.11it/s, grad_norm=4.01, loss_final=3.91, loss_mean=1.01, loss_mean_cls=2.96, proj_loss=-0.0581][2026-03-23 13:38:03] Step: 401, Training Logs: loss_final: 3.692235, loss_mean: 1.003217, proj_loss: -0.057411, loss_mean_cls: 2.746429, grad_norm: 3.248769 +Steps: 0%| | 402/1000000 [01:43<67:36:30, 4.11it/s, grad_norm=3.25, loss_final=3.69, loss_mean=1, loss_mean_cls=2.75, proj_loss=-0.0574][2026-03-23 13:38:03] Step: 402, Training Logs: loss_final: 3.783054, loss_mean: 0.984101, proj_loss: -0.060964, loss_mean_cls: 2.859917, grad_norm: 3.431727 +Steps: 0%| | 403/1000000 [01:43<67:36:46, 4.11it/s, grad_norm=3.43, loss_final=3.78, loss_mean=0.984, loss_mean_cls=2.86, proj_loss=-0.061][2026-03-23 13:38:03] Step: 403, Training Logs: loss_final: 3.933758, loss_mean: 1.002243, proj_loss: -0.058339, loss_mean_cls: 2.989853, grad_norm: 3.104194 +Steps: 0%| | 404/1000000 [01:43<67:37:21, 4.11it/s, grad_norm=3.1, loss_final=3.93, loss_mean=1, loss_mean_cls=2.99, proj_loss=-0.0583][2026-03-23 13:38:03] Step: 404, Training Logs: loss_final: 4.004271, loss_mean: 0.998125, proj_loss: -0.057603, loss_mean_cls: 3.063748, grad_norm: 3.206752 +Steps: 0%| | 405/1000000 [01:44<67:36:48, 4.11it/s, grad_norm=3.21, loss_final=4, loss_mean=0.998, loss_mean_cls=3.06, proj_loss=-0.0576][2026-03-23 13:38:04] Step: 405, Training Logs: loss_final: 4.279324, loss_mean: 0.973958, proj_loss: -0.060364, loss_mean_cls: 3.365730, grad_norm: 4.749627 +Steps: 0%| | 406/1000000 [01:44<67:37:55, 4.11it/s, grad_norm=4.75, loss_final=4.28, loss_mean=0.974, loss_mean_cls=3.37, proj_loss=-0.0604][2026-03-23 13:38:04] Step: 406, Training Logs: loss_final: 4.557069, loss_mean: 0.970182, proj_loss: -0.058120, loss_mean_cls: 3.645008, grad_norm: 6.298654 +Steps: 0%| | 407/1000000 [01:44<67:40:15, 4.10it/s, grad_norm=6.3, loss_final=4.56, loss_mean=0.97, loss_mean_cls=3.65, proj_loss=-0.0581][2026-03-23 13:38:04] Step: 
407, Training Logs: loss_final: 4.299712, loss_mean: 0.989943, proj_loss: -0.061339, loss_mean_cls: 3.371109, grad_norm: 3.608554 +Steps: 0%| | 408/1000000 [01:44<67:39:56, 4.10it/s, grad_norm=3.61, loss_final=4.3, loss_mean=0.99, loss_mean_cls=3.37, proj_loss=-0.0613][2026-03-23 13:38:04] Step: 408, Training Logs: loss_final: 3.760141, loss_mean: 1.021230, proj_loss: -0.056846, loss_mean_cls: 2.795758, grad_norm: 4.819811 +Steps: 0%| | 409/1000000 [01:45<67:37:42, 4.11it/s, grad_norm=4.82, loss_final=3.76, loss_mean=1.02, loss_mean_cls=2.8, proj_loss=-0.0568][2026-03-23 13:38:05] Step: 409, Training Logs: loss_final: 3.948972, loss_mean: 0.997647, proj_loss: -0.058138, loss_mean_cls: 3.009463, grad_norm: 3.135281 +Steps: 0%| | 410/1000000 [01:45<67:36:48, 4.11it/s, grad_norm=3.14, loss_final=3.95, loss_mean=0.998, loss_mean_cls=3.01, proj_loss=-0.0581][2026-03-23 13:38:05] Step: 410, Training Logs: loss_final: 3.639134, loss_mean: 1.001842, proj_loss: -0.061212, loss_mean_cls: 2.698505, grad_norm: 3.721157 +Steps: 0%| | 411/1000000 [01:45<67:37:15, 4.11it/s, grad_norm=3.72, loss_final=3.64, loss_mean=1, loss_mean_cls=2.7, proj_loss=-0.0612][2026-03-23 13:38:05] Step: 411, Training Logs: loss_final: 3.753303, loss_mean: 0.997413, proj_loss: -0.060331, loss_mean_cls: 2.816221, grad_norm: 4.552992 +Steps: 0%| | 412/1000000 [01:45<67:37:23, 4.11it/s, grad_norm=4.55, loss_final=3.75, loss_mean=0.997, loss_mean_cls=2.82, proj_loss=-0.0603][2026-03-23 13:38:05] Step: 412, Training Logs: loss_final: 4.136001, loss_mean: 0.999732, proj_loss: -0.059898, loss_mean_cls: 3.196168, grad_norm: 3.089385 +Steps: 0%| | 413/1000000 [01:46<67:36:27, 4.11it/s, grad_norm=3.09, loss_final=4.14, loss_mean=1, loss_mean_cls=3.2, proj_loss=-0.0599][2026-03-23 13:38:05] Step: 413, Training Logs: loss_final: 3.910921, loss_mean: 0.989452, proj_loss: -0.058123, loss_mean_cls: 2.979593, grad_norm: 5.346345 +Steps: 0%| | 414/1000000 [01:46<67:37:23, 4.11it/s, grad_norm=5.35, loss_final=3.91, loss_mean=0.989, loss_mean_cls=2.98, proj_loss=-0.0581][2026-03-23 13:38:06] Step: 414, Training Logs: loss_final: 4.084109, loss_mean: 0.980169, proj_loss: -0.059401, loss_mean_cls: 3.163341, grad_norm: 3.036572 +Steps: 0%| | 415/1000000 [01:46<67:37:10, 4.11it/s, grad_norm=3.04, loss_final=4.08, loss_mean=0.98, loss_mean_cls=3.16, proj_loss=-0.0594][2026-03-23 13:38:06] Step: 415, Training Logs: loss_final: 3.976451, loss_mean: 1.025134, proj_loss: -0.059875, loss_mean_cls: 3.011192, grad_norm: 4.680345 +Steps: 0%| | 416/1000000 [01:46<67:37:24, 4.11it/s, grad_norm=4.68, loss_final=3.98, loss_mean=1.03, loss_mean_cls=3.01, proj_loss=-0.0599][2026-03-23 13:38:06] Step: 416, Training Logs: loss_final: 4.241142, loss_mean: 0.983263, proj_loss: -0.062144, loss_mean_cls: 3.320023, grad_norm: 3.763190 +Steps: 0%| | 417/1000000 [01:47<67:37:25, 4.11it/s, grad_norm=3.76, loss_final=4.24, loss_mean=0.983, loss_mean_cls=3.32, proj_loss=-0.0621][2026-03-23 13:38:06] Step: 417, Training Logs: loss_final: 3.659859, loss_mean: 1.008510, proj_loss: -0.057709, loss_mean_cls: 2.709058, grad_norm: 4.223990 +Steps: 0%| | 418/1000000 [01:47<67:36:49, 4.11it/s, grad_norm=4.22, loss_final=3.66, loss_mean=1.01, loss_mean_cls=2.71, proj_loss=-0.0577][2026-03-23 13:38:07] Step: 418, Training Logs: loss_final: 3.351686, loss_mean: 0.992085, proj_loss: -0.056795, loss_mean_cls: 2.416396, grad_norm: 4.152516 +Steps: 0%| | 419/1000000 [01:47<67:36:41, 4.11it/s, grad_norm=4.15, loss_final=3.35, loss_mean=0.992, loss_mean_cls=2.42, proj_loss=-0.0568][2026-03-23 
13:38:07] Step: 419, Training Logs: loss_final: 3.397219, loss_mean: 1.000797, proj_loss: -0.059153, loss_mean_cls: 2.455575, grad_norm: 4.016322 +Steps: 0%| | 420/1000000 [01:47<67:37:32, 4.11it/s, grad_norm=4.02, loss_final=3.4, loss_mean=1, loss_mean_cls=2.46, proj_loss=-0.0592][2026-03-23 13:38:07] Step: 420, Training Logs: loss_final: 3.987557, loss_mean: 0.975253, proj_loss: -0.059740, loss_mean_cls: 3.072045, grad_norm: 4.380704 +Steps: 0%| | 421/1000000 [01:47<67:38:16, 4.11it/s, grad_norm=4.38, loss_final=3.99, loss_mean=0.975, loss_mean_cls=3.07, proj_loss=-0.0597][2026-03-23 13:38:07] Step: 421, Training Logs: loss_final: 3.779744, loss_mean: 0.987544, proj_loss: -0.059183, loss_mean_cls: 2.851383, grad_norm: 3.360396 +Steps: 0%| | 422/1000000 [01:48<67:38:23, 4.10it/s, grad_norm=3.36, loss_final=3.78, loss_mean=0.988, loss_mean_cls=2.85, proj_loss=-0.0592][2026-03-23 13:38:08] Step: 422, Training Logs: loss_final: 4.060089, loss_mean: 1.009879, proj_loss: -0.059963, loss_mean_cls: 3.110173, grad_norm: 5.880581 +Steps: 0%| | 423/1000000 [01:48<67:37:34, 4.11it/s, grad_norm=5.88, loss_final=4.06, loss_mean=1.01, loss_mean_cls=3.11, proj_loss=-0.06][2026-03-23 13:38:08] Step: 423, Training Logs: loss_final: 3.957276, loss_mean: 1.018812, proj_loss: -0.057049, loss_mean_cls: 2.995514, grad_norm: 4.875398 +Steps: 0%| | 424/1000000 [01:48<67:39:33, 4.10it/s, grad_norm=4.88, loss_final=3.96, loss_mean=1.02, loss_mean_cls=3, proj_loss=-0.057][2026-03-23 13:38:08] Step: 424, Training Logs: loss_final: 4.084050, loss_mean: 0.992674, proj_loss: -0.059829, loss_mean_cls: 3.151205, grad_norm: 3.908008 +Steps: 0%| | 425/1000000 [01:48<67:39:06, 4.10it/s, grad_norm=3.91, loss_final=4.08, loss_mean=0.993, loss_mean_cls=3.15, proj_loss=-0.0598][2026-03-23 13:38:08] Step: 425, Training Logs: loss_final: 3.620151, loss_mean: 0.979499, proj_loss: -0.058779, loss_mean_cls: 2.699431, grad_norm: 4.299834 +Steps: 0%| | 426/1000000 [01:49<67:37:43, 4.11it/s, grad_norm=4.3, loss_final=3.62, loss_mean=0.979, loss_mean_cls=2.7, proj_loss=-0.0588][2026-03-23 13:38:09] Step: 426, Training Logs: loss_final: 4.003482, loss_mean: 0.999299, proj_loss: -0.060849, loss_mean_cls: 3.065033, grad_norm: 4.316888 +Steps: 0%| | 427/1000000 [01:49<67:39:49, 4.10it/s, grad_norm=4.32, loss_final=4, loss_mean=0.999, loss_mean_cls=3.07, proj_loss=-0.0608][2026-03-23 13:38:09] Step: 427, Training Logs: loss_final: 3.348166, loss_mean: 1.004211, proj_loss: -0.060160, loss_mean_cls: 2.404114, grad_norm: 3.284352 +Steps: 0%| | 428/1000000 [01:49<67:39:24, 4.10it/s, grad_norm=3.28, loss_final=3.35, loss_mean=1, loss_mean_cls=2.4, proj_loss=-0.0602][2026-03-23 13:38:09] Step: 428, Training Logs: loss_final: 3.942003, loss_mean: 0.984608, proj_loss: -0.061491, loss_mean_cls: 3.018887, grad_norm: 3.479348 +Steps: 0%| | 429/1000000 [01:49<67:41:11, 4.10it/s, grad_norm=3.48, loss_final=3.94, loss_mean=0.985, loss_mean_cls=3.02, proj_loss=-0.0615][2026-03-23 13:38:09] Step: 429, Training Logs: loss_final: 4.289022, loss_mean: 0.986825, proj_loss: -0.058249, loss_mean_cls: 3.360445, grad_norm: 5.906828 +Steps: 0%| | 430/1000000 [01:50<67:38:41, 4.10it/s, grad_norm=5.91, loss_final=4.29, loss_mean=0.987, loss_mean_cls=3.36, proj_loss=-0.0582][2026-03-23 13:38:10] Step: 430, Training Logs: loss_final: 3.767004, loss_mean: 0.995299, proj_loss: -0.058921, loss_mean_cls: 2.830626, grad_norm: 3.928360 +Steps: 0%| | 431/1000000 [01:50<67:42:24, 4.10it/s, grad_norm=3.93, loss_final=3.77, loss_mean=0.995, loss_mean_cls=2.83, 
proj_loss=-0.0589][2026-03-23 13:38:10] Step: 431, Training Logs: loss_final: 3.871011, loss_mean: 0.978871, proj_loss: -0.059511, loss_mean_cls: 2.951651, grad_norm: 3.367649 +Steps: 0%| | 432/1000000 [01:50<67:41:51, 4.10it/s, grad_norm=3.37, loss_final=3.87, loss_mean=0.979, loss_mean_cls=2.95, proj_loss=-0.0595][2026-03-23 13:38:10] Step: 432, Training Logs: loss_final: 4.120427, loss_mean: 1.004004, proj_loss: -0.059882, loss_mean_cls: 3.176304, grad_norm: 3.453291 +Steps: 0%| | 433/1000000 [01:50<67:41:16, 4.10it/s, grad_norm=3.45, loss_final=4.12, loss_mean=1, loss_mean_cls=3.18, proj_loss=-0.0599][2026-03-23 13:38:10] Step: 433, Training Logs: loss_final: 3.692594, loss_mean: 0.985637, proj_loss: -0.060044, loss_mean_cls: 2.767002, grad_norm: 3.105240 +Steps: 0%| | 434/1000000 [01:51<67:39:36, 4.10it/s, grad_norm=3.11, loss_final=3.69, loss_mean=0.986, loss_mean_cls=2.77, proj_loss=-0.06][2026-03-23 13:38:11] Step: 434, Training Logs: loss_final: 4.061102, loss_mean: 1.001565, proj_loss: -0.057277, loss_mean_cls: 3.116814, grad_norm: 5.923434 +Steps: 0%| | 435/1000000 [01:51<67:38:48, 4.10it/s, grad_norm=5.92, loss_final=4.06, loss_mean=1, loss_mean_cls=3.12, proj_loss=-0.0573][2026-03-23 13:38:11] Step: 435, Training Logs: loss_final: 3.796114, loss_mean: 0.976365, proj_loss: -0.056927, loss_mean_cls: 2.876676, grad_norm: 3.922393 +Steps: 0%| | 436/1000000 [01:51<67:38:02, 4.11it/s, grad_norm=3.92, loss_final=3.8, loss_mean=0.976, loss_mean_cls=2.88, proj_loss=-0.0569][2026-03-23 13:38:11] Step: 436, Training Logs: loss_final: 4.038236, loss_mean: 0.980158, proj_loss: -0.059093, loss_mean_cls: 3.117172, grad_norm: 4.482647 +Steps: 0%| | 437/1000000 [01:51<67:38:56, 4.10it/s, grad_norm=4.48, loss_final=4.04, loss_mean=0.98, loss_mean_cls=3.12, proj_loss=-0.0591][2026-03-23 13:38:11] Step: 437, Training Logs: loss_final: 3.235079, loss_mean: 1.012203, proj_loss: -0.058087, loss_mean_cls: 2.280963, grad_norm: 3.157582 +Steps: 0%| | 438/1000000 [01:52<67:38:03, 4.11it/s, grad_norm=3.16, loss_final=3.24, loss_mean=1.01, loss_mean_cls=2.28, proj_loss=-0.0581][2026-03-23 13:38:12] Step: 438, Training Logs: loss_final: 3.565121, loss_mean: 1.013994, proj_loss: -0.059328, loss_mean_cls: 2.610454, grad_norm: 4.222639 +Steps: 0%| | 439/1000000 [01:52<67:43:21, 4.10it/s, grad_norm=4.22, loss_final=3.57, loss_mean=1.01, loss_mean_cls=2.61, proj_loss=-0.0593][2026-03-23 13:38:12] Step: 439, Training Logs: loss_final: 3.663046, loss_mean: 0.985738, proj_loss: -0.062498, loss_mean_cls: 2.739807, grad_norm: 3.432475 +Steps: 0%| | 440/1000000 [01:52<67:42:02, 4.10it/s, grad_norm=3.43, loss_final=3.66, loss_mean=0.986, loss_mean_cls=2.74, proj_loss=-0.0625][2026-03-23 13:38:12] Step: 440, Training Logs: loss_final: 3.598989, loss_mean: 0.967839, proj_loss: -0.059984, loss_mean_cls: 2.691134, grad_norm: 3.229549 +Steps: 0%| | 441/1000000 [01:52<67:40:55, 4.10it/s, grad_norm=3.23, loss_final=3.6, loss_mean=0.968, loss_mean_cls=2.69, proj_loss=-0.06][2026-03-23 13:38:12] Step: 441, Training Logs: loss_final: 3.245439, loss_mean: 1.023835, proj_loss: -0.058113, loss_mean_cls: 2.279717, grad_norm: 2.773751 +Steps: 0%| | 442/1000000 [01:53<67:40:18, 4.10it/s, grad_norm=2.77, loss_final=3.25, loss_mean=1.02, loss_mean_cls=2.28, proj_loss=-0.0581][2026-03-23 13:38:13] Step: 442, Training Logs: loss_final: 3.363658, loss_mean: 0.989245, proj_loss: -0.057933, loss_mean_cls: 2.432346, grad_norm: 3.333236 +Steps: 0%| | 443/1000000 [01:53<67:38:53, 4.10it/s, grad_norm=3.33, loss_final=3.36, loss_mean=0.989, 
loss_mean_cls=2.43, proj_loss=-0.0579][2026-03-23 13:38:13] Step: 443, Training Logs: loss_final: 4.047393, loss_mean: 0.990661, proj_loss: -0.059674, loss_mean_cls: 3.116406, grad_norm: 4.465771 +Steps: 0%| | 444/1000000 [01:53<67:38:13, 4.11it/s, grad_norm=4.47, loss_final=4.05, loss_mean=0.991, loss_mean_cls=3.12, proj_loss=-0.0597][2026-03-23 13:38:13] Step: 444, Training Logs: loss_final: 3.732424, loss_mean: 0.978433, proj_loss: -0.057372, loss_mean_cls: 2.811364, grad_norm: 3.033458 +Steps: 0%| | 445/1000000 [01:53<67:42:11, 4.10it/s, grad_norm=3.03, loss_final=3.73, loss_mean=0.978, loss_mean_cls=2.81, proj_loss=-0.0574][2026-03-23 13:38:13] Step: 445, Training Logs: loss_final: 4.046782, loss_mean: 0.960415, proj_loss: -0.059124, loss_mean_cls: 3.145491, grad_norm: 3.785550 +Steps: 0%| | 446/1000000 [01:54<67:38:55, 4.10it/s, grad_norm=3.79, loss_final=4.05, loss_mean=0.96, loss_mean_cls=3.15, proj_loss=-0.0591][2026-03-23 13:38:14] Step: 446, Training Logs: loss_final: 3.254864, loss_mean: 1.004406, proj_loss: -0.058104, loss_mean_cls: 2.308562, grad_norm: 3.151400 +Steps: 0%| | 447/1000000 [01:54<67:38:29, 4.10it/s, grad_norm=3.15, loss_final=3.25, loss_mean=1, loss_mean_cls=2.31, proj_loss=-0.0581][2026-03-23 13:38:14] Step: 447, Training Logs: loss_final: 3.608467, loss_mean: 0.998360, proj_loss: -0.059259, loss_mean_cls: 2.669366, grad_norm: 3.355722 +Steps: 0%| | 448/1000000 [01:54<67:37:54, 4.11it/s, grad_norm=3.36, loss_final=3.61, loss_mean=0.998, loss_mean_cls=2.67, proj_loss=-0.0593][2026-03-23 13:38:14] Step: 448, Training Logs: loss_final: 3.954360, loss_mean: 0.996033, proj_loss: -0.059068, loss_mean_cls: 3.017395, grad_norm: 5.048338 +Steps: 0%| | 449/1000000 [01:54<67:44:29, 4.10it/s, grad_norm=5.05, loss_final=3.95, loss_mean=0.996, loss_mean_cls=3.02, proj_loss=-0.0591][2026-03-23 13:38:14] Step: 449, Training Logs: loss_final: 3.932964, loss_mean: 0.991402, proj_loss: -0.059570, loss_mean_cls: 3.001132, grad_norm: 4.596323 +Steps: 0%| | 450/1000000 [01:55<67:42:57, 4.10it/s, grad_norm=4.6, loss_final=3.93, loss_mean=0.991, loss_mean_cls=3, proj_loss=-0.0596][2026-03-23 13:38:15] Step: 450, Training Logs: loss_final: 4.152198, loss_mean: 0.963918, proj_loss: -0.058184, loss_mean_cls: 3.246464, grad_norm: 5.431376 +Steps: 0%| | 451/1000000 [01:55<67:42:19, 4.10it/s, grad_norm=5.43, loss_final=4.15, loss_mean=0.964, loss_mean_cls=3.25, proj_loss=-0.0582][2026-03-23 13:38:15] Step: 451, Training Logs: loss_final: 3.619426, loss_mean: 0.983749, proj_loss: -0.062607, loss_mean_cls: 2.698284, grad_norm: 5.557251 +Steps: 0%| | 452/1000000 [01:55<67:39:01, 4.10it/s, grad_norm=5.56, loss_final=3.62, loss_mean=0.984, loss_mean_cls=2.7, proj_loss=-0.0626][2026-03-23 13:38:15] Step: 452, Training Logs: loss_final: 3.768842, loss_mean: 0.978969, proj_loss: -0.059195, loss_mean_cls: 2.849068, grad_norm: 2.968557 +Steps: 0%| | 453/1000000 [01:55<67:44:58, 4.10it/s, grad_norm=2.97, loss_final=3.77, loss_mean=0.979, loss_mean_cls=2.85, proj_loss=-0.0592][2026-03-23 13:38:15] Step: 453, Training Logs: loss_final: 3.820686, loss_mean: 1.000989, proj_loss: -0.062077, loss_mean_cls: 2.881774, grad_norm: 5.237685 +Steps: 0%| | 454/1000000 [01:56<67:42:34, 4.10it/s, grad_norm=5.24, loss_final=3.82, loss_mean=1, loss_mean_cls=2.88, proj_loss=-0.0621][2026-03-23 13:38:15] Step: 454, Training Logs: loss_final: 3.655086, loss_mean: 0.986606, proj_loss: -0.057601, loss_mean_cls: 2.726081, grad_norm: 3.305473 +Steps: 0%| | 455/1000000 [01:56<67:41:08, 4.10it/s, grad_norm=3.31, 
loss_final=3.66, loss_mean=0.987, loss_mean_cls=2.73, proj_loss=-0.0576][2026-03-23 13:38:16] Step: 455, Training Logs: loss_final: 3.695682, loss_mean: 0.988198, proj_loss: -0.057030, loss_mean_cls: 2.764515, grad_norm: 5.572070 +Steps: 0%| | 456/1000000 [01:56<67:48:53, 4.09it/s, grad_norm=5.57, loss_final=3.7, loss_mean=0.988, loss_mean_cls=2.76, proj_loss=-0.057][2026-03-23 13:38:16] Step: 456, Training Logs: loss_final: 3.527836, loss_mean: 0.990351, proj_loss: -0.059549, loss_mean_cls: 2.597033, grad_norm: 3.535115 +Steps: 0%| | 457/1000000 [01:56<67:49:35, 4.09it/s, grad_norm=3.54, loss_final=3.53, loss_mean=0.99, loss_mean_cls=2.6, proj_loss=-0.0595][2026-03-23 13:38:16] Step: 457, Training Logs: loss_final: 3.849177, loss_mean: 1.009021, proj_loss: -0.060231, loss_mean_cls: 2.900388, grad_norm: 6.278506 +Steps: 0%| | 458/1000000 [01:56<67:47:02, 4.10it/s, grad_norm=6.28, loss_final=3.85, loss_mean=1.01, loss_mean_cls=2.9, proj_loss=-0.0602][2026-03-23 13:38:16] Step: 458, Training Logs: loss_final: 4.165937, loss_mean: 0.984625, proj_loss: -0.058690, loss_mean_cls: 3.240002, grad_norm: 5.756927 +Steps: 0%| | 459/1000000 [01:57<67:44:57, 4.10it/s, grad_norm=5.76, loss_final=4.17, loss_mean=0.985, loss_mean_cls=3.24, proj_loss=-0.0587][2026-03-23 13:38:17] Step: 459, Training Logs: loss_final: 3.484751, loss_mean: 1.013544, proj_loss: -0.061612, loss_mean_cls: 2.532819, grad_norm: 5.681323 +Steps: 0%| | 460/1000000 [01:57<67:43:28, 4.10it/s, grad_norm=5.68, loss_final=3.48, loss_mean=1.01, loss_mean_cls=2.53, proj_loss=-0.0616][2026-03-23 13:38:17] Step: 460, Training Logs: loss_final: 3.853418, loss_mean: 1.011980, proj_loss: -0.057327, loss_mean_cls: 2.898765, grad_norm: 4.533947 +Steps: 0%| | 461/1000000 [01:57<67:48:59, 4.09it/s, grad_norm=4.53, loss_final=3.85, loss_mean=1.01, loss_mean_cls=2.9, proj_loss=-0.0573][2026-03-23 13:38:17] Step: 461, Training Logs: loss_final: 4.010116, loss_mean: 1.011127, proj_loss: -0.059453, loss_mean_cls: 3.058442, grad_norm: 7.804974 +Steps: 0%| | 462/1000000 [01:57<67:45:22, 4.10it/s, grad_norm=7.8, loss_final=4.01, loss_mean=1.01, loss_mean_cls=3.06, proj_loss=-0.0595][2026-03-23 13:38:17] Step: 462, Training Logs: loss_final: 4.023099, loss_mean: 0.993703, proj_loss: -0.058313, loss_mean_cls: 3.087709, grad_norm: 7.041640 +Steps: 0%| | 463/1000000 [01:58<67:43:56, 4.10it/s, grad_norm=7.04, loss_final=4.02, loss_mean=0.994, loss_mean_cls=3.09, proj_loss=-0.0583][2026-03-23 13:38:18] Step: 463, Training Logs: loss_final: 3.498697, loss_mean: 1.031959, proj_loss: -0.059314, loss_mean_cls: 2.526052, grad_norm: 5.072158 +Steps: 0%| | 464/1000000 [01:58<67:41:35, 4.10it/s, grad_norm=5.07, loss_final=3.5, loss_mean=1.03, loss_mean_cls=2.53, proj_loss=-0.0593][2026-03-23 13:38:18] Step: 464, Training Logs: loss_final: 3.713818, loss_mean: 1.018613, proj_loss: -0.058001, loss_mean_cls: 2.753206, grad_norm: 5.744684 +Steps: 0%| | 465/1000000 [01:58<67:45:23, 4.10it/s, grad_norm=5.74, loss_final=3.71, loss_mean=1.02, loss_mean_cls=2.75, proj_loss=-0.058][2026-03-23 13:38:18] Step: 465, Training Logs: loss_final: 4.117116, loss_mean: 0.994501, proj_loss: -0.062001, loss_mean_cls: 3.184616, grad_norm: 5.745746 +Steps: 0%| | 466/1000000 [01:58<67:43:16, 4.10it/s, grad_norm=5.75, loss_final=4.12, loss_mean=0.995, loss_mean_cls=3.18, proj_loss=-0.062][2026-03-23 13:38:18] Step: 466, Training Logs: loss_final: 3.396567, loss_mean: 1.013430, proj_loss: -0.060217, loss_mean_cls: 2.443354, grad_norm: 5.811415 +Steps: 0%| | 467/1000000 [01:59<67:41:35, 
4.10it/s, grad_norm=5.81, loss_final=3.4, loss_mean=1.01, loss_mean_cls=2.44, proj_loss=-0.0602][2026-03-23 13:38:19] Step: 467, Training Logs: loss_final: 3.512218, loss_mean: 1.027738, proj_loss: -0.059120, loss_mean_cls: 2.543600, grad_norm: 6.152915 +Steps: 0%| | 468/1000000 [01:59<67:40:30, 4.10it/s, grad_norm=6.15, loss_final=3.51, loss_mean=1.03, loss_mean_cls=2.54, proj_loss=-0.0591][2026-03-23 13:38:19] Step: 468, Training Logs: loss_final: 3.837753, loss_mean: 0.998879, proj_loss: -0.057948, loss_mean_cls: 2.896822, grad_norm: 4.869075 +Steps: 0%| | 469/1000000 [01:59<67:46:08, 4.10it/s, grad_norm=4.87, loss_final=3.84, loss_mean=0.999, loss_mean_cls=2.9, proj_loss=-0.0579][2026-03-23 13:38:19] Step: 469, Training Logs: loss_final: 3.549381, loss_mean: 1.016195, proj_loss: -0.061754, loss_mean_cls: 2.594940, grad_norm: 4.621217 +Steps: 0%| | 470/1000000 [01:59<67:43:55, 4.10it/s, grad_norm=4.62, loss_final=3.55, loss_mean=1.02, loss_mean_cls=2.59, proj_loss=-0.0618][2026-03-23 13:38:19] Step: 470, Training Logs: loss_final: 3.661156, loss_mean: 0.997600, proj_loss: -0.058973, loss_mean_cls: 2.722529, grad_norm: 4.588041 +Steps: 0%| | 471/1000000 [02:00<67:42:36, 4.10it/s, grad_norm=4.59, loss_final=3.66, loss_mean=0.998, loss_mean_cls=2.72, proj_loss=-0.059][2026-03-23 13:38:20] Step: 471, Training Logs: loss_final: 3.959616, loss_mean: 0.996180, proj_loss: -0.059105, loss_mean_cls: 3.022541, grad_norm: 4.661160 +Steps: 0%| | 472/1000000 [02:00<67:41:48, 4.10it/s, grad_norm=4.66, loss_final=3.96, loss_mean=0.996, loss_mean_cls=3.02, proj_loss=-0.0591][2026-03-23 13:38:20] Step: 472, Training Logs: loss_final: 4.472188, loss_mean: 0.952498, proj_loss: -0.058649, loss_mean_cls: 3.578339, grad_norm: 4.556396 +Steps: 0%| | 473/1000000 [02:00<67:45:33, 4.10it/s, grad_norm=4.56, loss_final=4.47, loss_mean=0.952, loss_mean_cls=3.58, proj_loss=-0.0586][2026-03-23 13:38:20] Step: 473, Training Logs: loss_final: 3.708116, loss_mean: 1.022299, proj_loss: -0.059989, loss_mean_cls: 2.745806, grad_norm: 5.338718 +Steps: 0%| | 474/1000000 [02:00<67:42:34, 4.10it/s, grad_norm=5.34, loss_final=3.71, loss_mean=1.02, loss_mean_cls=2.75, proj_loss=-0.06][2026-03-23 13:38:20] Step: 474, Training Logs: loss_final: 3.976010, loss_mean: 0.994154, proj_loss: -0.060348, loss_mean_cls: 3.042203, grad_norm: 4.552530 +Steps: 0%| | 475/1000000 [02:01<67:39:50, 4.10it/s, grad_norm=4.55, loss_final=3.98, loss_mean=0.994, loss_mean_cls=3.04, proj_loss=-0.0603][2026-03-23 13:38:21] Step: 475, Training Logs: loss_final: 3.471146, loss_mean: 1.016472, proj_loss: -0.059878, loss_mean_cls: 2.514552, grad_norm: 4.380024 +Steps: 0%| | 476/1000000 [02:01<67:43:19, 4.10it/s, grad_norm=4.38, loss_final=3.47, loss_mean=1.02, loss_mean_cls=2.51, proj_loss=-0.0599][2026-03-23 13:38:21] Step: 476, Training Logs: loss_final: 3.957986, loss_mean: 0.985465, proj_loss: -0.060814, loss_mean_cls: 3.033335, grad_norm: 5.958980 +Steps: 0%| | 477/1000000 [02:01<67:50:25, 4.09it/s, grad_norm=5.96, loss_final=3.96, loss_mean=0.985, loss_mean_cls=3.03, proj_loss=-0.0608][2026-03-23 13:38:21] Step: 477, Training Logs: loss_final: 3.761233, loss_mean: 0.991739, proj_loss: -0.062001, loss_mean_cls: 2.831495, grad_norm: 3.781288 +Steps: 0%| | 478/1000000 [02:01<67:45:21, 4.10it/s, grad_norm=3.78, loss_final=3.76, loss_mean=0.992, loss_mean_cls=2.83, proj_loss=-0.062][2026-03-23 13:38:21] Step: 478, Training Logs: loss_final: 3.147285, loss_mean: 1.017535, proj_loss: -0.059881, loss_mean_cls: 2.189630, grad_norm: 6.673585 +Steps: 0%| | 
479/1000000 [02:02<67:43:38, 4.10it/s, grad_norm=6.67, loss_final=3.15, loss_mean=1.02, loss_mean_cls=2.19, proj_loss=-0.0599][2026-03-23 13:38:22] Step: 479, Training Logs: loss_final: 3.897202, loss_mean: 0.983054, proj_loss: -0.060054, loss_mean_cls: 2.974203, grad_norm: 4.541247 +Steps: 0%| | 480/1000000 [02:02<67:58:36, 4.08it/s, grad_norm=4.54, loss_final=3.9, loss_mean=0.983, loss_mean_cls=2.97, proj_loss=-0.0601][2026-03-23 13:38:22] Step: 480, Training Logs: loss_final: 3.803994, loss_mean: 1.020277, proj_loss: -0.061208, loss_mean_cls: 2.844924, grad_norm: 7.569435 +Steps: 0%| | 481/1000000 [02:02<68:59:40, 4.02it/s, grad_norm=7.57, loss_final=3.8, loss_mean=1.02, loss_mean_cls=2.84, proj_loss=-0.0612][2026-03-23 13:38:22] Step: 481, Training Logs: loss_final: 3.028141, loss_mean: 1.042134, proj_loss: -0.058715, loss_mean_cls: 2.044723, grad_norm: 4.544116 +Steps: 0%| | 482/1000000 [02:02<68:34:54, 4.05it/s, grad_norm=4.54, loss_final=3.03, loss_mean=1.04, loss_mean_cls=2.04, proj_loss=-0.0587][2026-03-23 13:38:22] Step: 482, Training Logs: loss_final: 3.880368, loss_mean: 1.011301, proj_loss: -0.059091, loss_mean_cls: 2.928157, grad_norm: 4.908041 +Steps: 0%| | 483/1000000 [02:03<68:18:53, 4.06it/s, grad_norm=4.91, loss_final=3.88, loss_mean=1.01, loss_mean_cls=2.93, proj_loss=-0.0591][2026-03-23 13:38:23] Step: 483, Training Logs: loss_final: 4.140957, loss_mean: 1.006997, proj_loss: -0.058522, loss_mean_cls: 3.192483, grad_norm: 5.635218 +Steps: 0%| | 484/1000000 [02:03<68:08:22, 4.07it/s, grad_norm=5.64, loss_final=4.14, loss_mean=1.01, loss_mean_cls=3.19, proj_loss=-0.0585][2026-03-23 13:38:23] Step: 484, Training Logs: loss_final: 4.059102, loss_mean: 1.006114, proj_loss: -0.061391, loss_mean_cls: 3.114377, grad_norm: 6.104209 +Steps: 0%| | 485/1000000 [02:03<68:07:27, 4.08it/s, grad_norm=6.1, loss_final=4.06, loss_mean=1.01, loss_mean_cls=3.11, proj_loss=-0.0614][2026-03-23 13:38:23] Step: 485, Training Logs: loss_final: 4.057761, loss_mean: 1.032733, proj_loss: -0.059740, loss_mean_cls: 3.084767, grad_norm: 6.394452 +Steps: 0%| | 486/1000000 [02:03<67:56:57, 4.09it/s, grad_norm=6.39, loss_final=4.06, loss_mean=1.03, loss_mean_cls=3.08, proj_loss=-0.0597][2026-03-23 13:38:23] Step: 486, Training Logs: loss_final: 3.172134, loss_mean: 1.008804, proj_loss: -0.062471, loss_mean_cls: 2.225801, grad_norm: 4.687089 +Steps: 0%| | 487/1000000 [02:04<67:53:06, 4.09it/s, grad_norm=4.69, loss_final=3.17, loss_mean=1.01, loss_mean_cls=2.23, proj_loss=-0.0625][2026-03-23 13:38:24] Step: 487, Training Logs: loss_final: 3.898747, loss_mean: 1.004198, proj_loss: -0.062245, loss_mean_cls: 2.956795, grad_norm: 3.743562 +Steps: 0%| | 488/1000000 [02:04<67:48:43, 4.09it/s, grad_norm=3.74, loss_final=3.9, loss_mean=1, loss_mean_cls=2.96, proj_loss=-0.0622][2026-03-23 13:38:24] Step: 488, Training Logs: loss_final: 3.520661, loss_mean: 1.035395, proj_loss: -0.057540, loss_mean_cls: 2.542805, grad_norm: 4.465253 +Steps: 0%| | 489/1000000 [02:04<67:45:34, 4.10it/s, grad_norm=4.47, loss_final=3.52, loss_mean=1.04, loss_mean_cls=2.54, proj_loss=-0.0575][2026-03-23 13:38:24] Step: 489, Training Logs: loss_final: 3.667616, loss_mean: 0.998908, proj_loss: -0.060104, loss_mean_cls: 2.728812, grad_norm: 3.315513 +Steps: 0%| | 490/1000000 [02:04<67:43:47, 4.10it/s, grad_norm=3.32, loss_final=3.67, loss_mean=0.999, loss_mean_cls=2.73, proj_loss=-0.0601][2026-03-23 13:38:24] Step: 490, Training Logs: loss_final: 3.793316, loss_mean: 1.002668, proj_loss: -0.059251, loss_mean_cls: 2.849899, grad_norm: 
3.777720 +Steps: 0%| | 491/1000000 [02:05<67:41:38, 4.10it/s, grad_norm=3.78, loss_final=3.79, loss_mean=1, loss_mean_cls=2.85, proj_loss=-0.0593][2026-03-23 13:38:25] Step: 491, Training Logs: loss_final: 4.013474, loss_mean: 1.014656, proj_loss: -0.058233, loss_mean_cls: 3.057050, grad_norm: 5.078125 +Steps: 0%| | 492/1000000 [02:05<67:41:13, 4.10it/s, grad_norm=5.08, loss_final=4.01, loss_mean=1.01, loss_mean_cls=3.06, proj_loss=-0.0582][2026-03-23 13:38:25] Step: 492, Training Logs: loss_final: 4.072273, loss_mean: 0.989192, proj_loss: -0.058564, loss_mean_cls: 3.141645, grad_norm: 4.269209 +Steps: 0%| | 493/1000000 [02:05<67:45:48, 4.10it/s, grad_norm=4.27, loss_final=4.07, loss_mean=0.989, loss_mean_cls=3.14, proj_loss=-0.0586][2026-03-23 13:38:25] Step: 493, Training Logs: loss_final: 3.690774, loss_mean: 0.992102, proj_loss: -0.059336, loss_mean_cls: 2.758008, grad_norm: 5.123598 +Steps: 0%| | 494/1000000 [02:05<67:42:37, 4.10it/s, grad_norm=5.12, loss_final=3.69, loss_mean=0.992, loss_mean_cls=2.76, proj_loss=-0.0593][2026-03-23 13:38:25] Step: 494, Training Logs: loss_final: 4.122980, loss_mean: 0.979838, proj_loss: -0.059962, loss_mean_cls: 3.203104, grad_norm: 4.371683 +Steps: 0%| | 495/1000000 [02:06<67:40:06, 4.10it/s, grad_norm=4.37, loss_final=4.12, loss_mean=0.98, loss_mean_cls=3.2, proj_loss=-0.06][2026-03-23 13:38:25] Step: 495, Training Logs: loss_final: 3.342616, loss_mean: 1.034947, proj_loss: -0.059670, loss_mean_cls: 2.367339, grad_norm: 6.697191 +Steps: 0%| | 496/1000000 [02:06<67:38:59, 4.10it/s, grad_norm=6.7, loss_final=3.34, loss_mean=1.03, loss_mean_cls=2.37, proj_loss=-0.0597][2026-03-23 13:38:26] Step: 496, Training Logs: loss_final: 3.557297, loss_mean: 1.028300, proj_loss: -0.059151, loss_mean_cls: 2.588148, grad_norm: 5.318191 +Steps: 0%| | 497/1000000 [02:06<67:46:09, 4.10it/s, grad_norm=5.32, loss_final=3.56, loss_mean=1.03, loss_mean_cls=2.59, proj_loss=-0.0592][2026-03-23 13:38:26] Step: 497, Training Logs: loss_final: 3.638560, loss_mean: 0.999883, proj_loss: -0.058537, loss_mean_cls: 2.697214, grad_norm: 4.994712 +Steps: 0%| | 498/1000000 [02:06<67:43:25, 4.10it/s, grad_norm=4.99, loss_final=3.64, loss_mean=1, loss_mean_cls=2.7, proj_loss=-0.0585][2026-03-23 13:38:26] Step: 498, Training Logs: loss_final: 3.382539, loss_mean: 1.012767, proj_loss: -0.059525, loss_mean_cls: 2.429296, grad_norm: 5.753767 +Steps: 0%| | 499/1000000 [02:07<67:45:52, 4.10it/s, grad_norm=5.75, loss_final=3.38, loss_mean=1.01, loss_mean_cls=2.43, proj_loss=-0.0595][2026-03-23 13:38:26] Step: 499, Training Logs: loss_final: 4.078226, loss_mean: 1.000360, proj_loss: -0.063423, loss_mean_cls: 3.141289, grad_norm: 5.357095 +Steps: 0%| | 500/1000000 [02:07<67:44:41, 4.10it/s, grad_norm=5.36, loss_final=4.08, loss_mean=1, loss_mean_cls=3.14, proj_loss=-0.0634][2026-03-23 13:38:27] Step: 500, Training Logs: loss_final: 3.804945, loss_mean: 0.992369, proj_loss: -0.059287, loss_mean_cls: 2.871864, grad_norm: 4.791928 +Steps: 0%| | 501/1000000 [02:07<67:47:56, 4.10it/s, grad_norm=4.79, loss_final=3.8, loss_mean=0.992, loss_mean_cls=2.87, proj_loss=-0.0593][2026-03-23 13:38:27] Step: 501, Training Logs: loss_final: 3.527294, loss_mean: 1.010652, proj_loss: -0.060098, loss_mean_cls: 2.576740, grad_norm: 5.218223 +Steps: 0%| | 502/1000000 [02:07<67:44:54, 4.10it/s, grad_norm=5.22, loss_final=3.53, loss_mean=1.01, loss_mean_cls=2.58, proj_loss=-0.0601][2026-03-23 13:38:27] Step: 502, Training Logs: loss_final: 3.743524, loss_mean: 0.974976, proj_loss: -0.061488, loss_mean_cls: 2.830035, 
*(Per-step training console output for steps 503–741 of 1,000,000 omitted. Each step prints a tqdm progress bar running at roughly 4.1 it/s, followed by a log line of the form `Step: 504, Training Logs: loss_final: 4.319933, loss_mean: 0.965484, proj_loss: -0.058231, loss_mean_cls: 3.412680, grad_norm: 3.884716`. Over this span, `loss_mean` hovers near 1.0 while `proj_loss` drifts from about -0.06 to -0.08 as the alignment improves.)*
[03:06<67:34:55, 4.11it/s, grad_norm=5.89, loss_final=3.7, loss_mean=0.961, loss_mean_cls=2.83, proj_loss=-0.0831][2026-03-23 13:39:26] Step: 741, Training Logs: loss_final: 4.180512, loss_mean: 0.963186, proj_loss: -0.082583, loss_mean_cls: 3.299908, grad_norm: 6.354979 +Steps: 0%| | 742/1000000 [03:06<67:33:58, 4.11it/s, grad_norm=6.35, loss_final=4.18, loss_mean=0.963, loss_mean_cls=3.3, proj_loss=-0.0826][2026-03-23 13:39:26] Step: 742, Training Logs: loss_final: 3.655820, loss_mean: 0.988223, proj_loss: -0.080668, loss_mean_cls: 2.748265, grad_norm: 5.832325 +Steps: 0%| | 743/1000000 [03:06<67:34:53, 4.11it/s, grad_norm=5.83, loss_final=3.66, loss_mean=0.988, loss_mean_cls=2.75, proj_loss=-0.0807][2026-03-23 13:39:26] Step: 743, Training Logs: loss_final: 3.517449, loss_mean: 1.002331, proj_loss: -0.083721, loss_mean_cls: 2.598839, grad_norm: 6.086798 +Steps: 0%| | 744/1000000 [03:06<67:34:44, 4.11it/s, grad_norm=6.09, loss_final=3.52, loss_mean=1, loss_mean_cls=2.6, proj_loss=-0.0837][2026-03-23 13:39:26] Step: 744, Training Logs: loss_final: 3.151043, loss_mean: 1.009961, proj_loss: -0.082826, loss_mean_cls: 2.223907, grad_norm: 4.216563 +Steps: 0%| | 745/1000000 [03:07<67:34:52, 4.11it/s, grad_norm=4.22, loss_final=3.15, loss_mean=1.01, loss_mean_cls=2.22, proj_loss=-0.0828][2026-03-23 13:39:27] Step: 745, Training Logs: loss_final: 4.213492, loss_mean: 0.960979, proj_loss: -0.080324, loss_mean_cls: 3.332837, grad_norm: 11.954671 +Steps: 0%| | 746/1000000 [03:07<67:35:53, 4.11it/s, grad_norm=12, loss_final=4.21, loss_mean=0.961, loss_mean_cls=3.33, proj_loss=-0.0803][2026-03-23 13:39:27] Step: 746, Training Logs: loss_final: 2.759285, loss_mean: 1.031931, proj_loss: -0.086416, loss_mean_cls: 1.813771, grad_norm: 7.462880 +Steps: 0%| | 747/1000000 [03:07<67:37:09, 4.10it/s, grad_norm=7.46, loss_final=2.76, loss_mean=1.03, loss_mean_cls=1.81, proj_loss=-0.0864][2026-03-23 13:39:27] Step: 747, Training Logs: loss_final: 3.426326, loss_mean: 0.998624, proj_loss: -0.084162, loss_mean_cls: 2.511864, grad_norm: 6.844501 +Steps: 0%| | 748/1000000 [03:07<67:39:51, 4.10it/s, grad_norm=6.84, loss_final=3.43, loss_mean=0.999, loss_mean_cls=2.51, proj_loss=-0.0842][2026-03-23 13:39:27] Step: 748, Training Logs: loss_final: 3.621568, loss_mean: 0.996336, proj_loss: -0.085571, loss_mean_cls: 2.710802, grad_norm: 5.416052 +Steps: 0%| | 749/1000000 [03:08<67:38:46, 4.10it/s, grad_norm=5.42, loss_final=3.62, loss_mean=0.996, loss_mean_cls=2.71, proj_loss=-0.0856][2026-03-23 13:39:28] Step: 749, Training Logs: loss_final: 3.625631, loss_mean: 0.999284, proj_loss: -0.081800, loss_mean_cls: 2.708147, grad_norm: 6.617371 +Steps: 0%| | 750/1000000 [03:08<67:36:51, 4.11it/s, grad_norm=6.62, loss_final=3.63, loss_mean=0.999, loss_mean_cls=2.71, proj_loss=-0.0818][2026-03-23 13:39:28] Step: 750, Training Logs: loss_final: 4.160687, loss_mean: 0.955072, proj_loss: -0.082554, loss_mean_cls: 3.288169, grad_norm: 4.380515 +Steps: 0%| | 751/1000000 [03:08<67:36:43, 4.11it/s, grad_norm=4.38, loss_final=4.16, loss_mean=0.955, loss_mean_cls=3.29, proj_loss=-0.0826][2026-03-23 13:39:28] Step: 751, Training Logs: loss_final: 3.970424, loss_mean: 0.965468, proj_loss: -0.084901, loss_mean_cls: 3.089857, grad_norm: 6.188289 +Steps: 0%| | 752/1000000 [03:08<67:37:02, 4.10it/s, grad_norm=6.19, loss_final=3.97, loss_mean=0.965, loss_mean_cls=3.09, proj_loss=-0.0849][2026-03-23 13:39:28] Step: 752, Training Logs: loss_final: 3.372648, loss_mean: 0.980615, proj_loss: -0.086300, loss_mean_cls: 2.478333, grad_norm: 3.560378 
+Steps: 0%| | 753/1000000 [03:09<67:35:48, 4.11it/s, grad_norm=3.56, loss_final=3.37, loss_mean=0.981, loss_mean_cls=2.48, proj_loss=-0.0863][2026-03-23 13:39:29] Step: 753, Training Logs: loss_final: 3.561321, loss_mean: 0.984065, proj_loss: -0.086020, loss_mean_cls: 2.663275, grad_norm: 8.691132 +Steps: 0%| | 754/1000000 [03:09<67:35:20, 4.11it/s, grad_norm=8.69, loss_final=3.56, loss_mean=0.984, loss_mean_cls=2.66, proj_loss=-0.086][2026-03-23 13:39:29] Step: 754, Training Logs: loss_final: 3.158317, loss_mean: 0.995941, proj_loss: -0.089343, loss_mean_cls: 2.251720, grad_norm: 4.775361 +Steps: 0%| | 755/1000000 [03:09<67:36:13, 4.11it/s, grad_norm=4.78, loss_final=3.16, loss_mean=0.996, loss_mean_cls=2.25, proj_loss=-0.0893][2026-03-23 13:39:29] Step: 755, Training Logs: loss_final: 3.313581, loss_mean: 0.983759, proj_loss: -0.086234, loss_mean_cls: 2.416056, grad_norm: 6.662338 +Steps: 0%| | 756/1000000 [03:09<67:36:12, 4.11it/s, grad_norm=6.66, loss_final=3.31, loss_mean=0.984, loss_mean_cls=2.42, proj_loss=-0.0862][2026-03-23 13:39:29] Step: 756, Training Logs: loss_final: 3.890031, loss_mean: 0.963662, proj_loss: -0.084847, loss_mean_cls: 3.011216, grad_norm: 7.599316 +Steps: 0%| | 757/1000000 [03:10<68:18:46, 4.06it/s, grad_norm=7.6, loss_final=3.89, loss_mean=0.964, loss_mean_cls=3.01, proj_loss=-0.0848][2026-03-23 13:39:29] Step: 757, Training Logs: loss_final: 3.477201, loss_mean: 0.999395, proj_loss: -0.085919, loss_mean_cls: 2.563725, grad_norm: 5.844714 +Steps: 0%| | 758/1000000 [03:10<68:05:40, 4.08it/s, grad_norm=5.84, loss_final=3.48, loss_mean=0.999, loss_mean_cls=2.56, proj_loss=-0.0859][2026-03-23 13:39:30] Step: 758, Training Logs: loss_final: 3.611258, loss_mean: 0.979735, proj_loss: -0.086474, loss_mean_cls: 2.717996, grad_norm: 5.975582 +Steps: 0%| | 759/1000000 [03:10<67:56:27, 4.09it/s, grad_norm=5.98, loss_final=3.61, loss_mean=0.98, loss_mean_cls=2.72, proj_loss=-0.0865][2026-03-23 13:39:30] Step: 759, Training Logs: loss_final: 3.776136, loss_mean: 0.992188, proj_loss: -0.088262, loss_mean_cls: 2.872211, grad_norm: 7.313224 +Steps: 0%| | 760/1000000 [03:10<67:50:47, 4.09it/s, grad_norm=7.31, loss_final=3.78, loss_mean=0.992, loss_mean_cls=2.87, proj_loss=-0.0883][2026-03-23 13:39:30] Step: 760, Training Logs: loss_final: 3.553355, loss_mean: 0.993866, proj_loss: -0.088702, loss_mean_cls: 2.648191, grad_norm: 6.727736 +Steps: 0%| | 761/1000000 [03:10<67:45:27, 4.10it/s, grad_norm=6.73, loss_final=3.55, loss_mean=0.994, loss_mean_cls=2.65, proj_loss=-0.0887][2026-03-23 13:39:30] Step: 761, Training Logs: loss_final: 3.893735, loss_mean: 0.976187, proj_loss: -0.088498, loss_mean_cls: 3.006047, grad_norm: 5.338701 +Steps: 0%| | 762/1000000 [03:11<67:44:04, 4.10it/s, grad_norm=5.34, loss_final=3.89, loss_mean=0.976, loss_mean_cls=3.01, proj_loss=-0.0885][2026-03-23 13:39:31] Step: 762, Training Logs: loss_final: 3.799626, loss_mean: 0.985434, proj_loss: -0.091293, loss_mean_cls: 2.905485, grad_norm: 5.438722 +Steps: 0%| | 763/1000000 [03:11<67:41:45, 4.10it/s, grad_norm=5.44, loss_final=3.8, loss_mean=0.985, loss_mean_cls=2.91, proj_loss=-0.0913][2026-03-23 13:39:31] Step: 763, Training Logs: loss_final: 3.616353, loss_mean: 0.977185, proj_loss: -0.093104, loss_mean_cls: 2.732272, grad_norm: 6.003807 +Steps: 0%| | 764/1000000 [03:11<67:43:47, 4.10it/s, grad_norm=6, loss_final=3.62, loss_mean=0.977, loss_mean_cls=2.73, proj_loss=-0.0931][2026-03-23 13:39:31] Step: 764, Training Logs: loss_final: 3.502560, loss_mean: 0.999795, proj_loss: -0.089630, loss_mean_cls: 
2.592395, grad_norm: 5.419434 +Steps: 0%| | 765/1000000 [03:11<67:41:13, 4.10it/s, grad_norm=5.42, loss_final=3.5, loss_mean=1, loss_mean_cls=2.59, proj_loss=-0.0896][2026-03-23 13:39:31] Step: 765, Training Logs: loss_final: 3.319090, loss_mean: 1.008871, proj_loss: -0.092483, loss_mean_cls: 2.402702, grad_norm: 6.505475 +Steps: 0%| | 766/1000000 [03:12<69:14:23, 4.01it/s, grad_norm=6.51, loss_final=3.32, loss_mean=1.01, loss_mean_cls=2.4, proj_loss=-0.0925][2026-03-23 13:39:32] Step: 766, Training Logs: loss_final: 3.495530, loss_mean: 1.001176, proj_loss: -0.092342, loss_mean_cls: 2.586696, grad_norm: 6.144180 +Steps: 0%| | 767/1000000 [03:12<68:45:18, 4.04it/s, grad_norm=6.14, loss_final=3.5, loss_mean=1, loss_mean_cls=2.59, proj_loss=-0.0923][2026-03-23 13:39:32] Step: 767, Training Logs: loss_final: 4.223293, loss_mean: 0.993302, proj_loss: -0.092033, loss_mean_cls: 3.322024, grad_norm: 5.731522 +Steps: 0%| | 768/1000000 [03:12<68:24:38, 4.06it/s, grad_norm=5.73, loss_final=4.22, loss_mean=0.993, loss_mean_cls=3.32, proj_loss=-0.092][2026-03-23 13:39:32] Step: 768, Training Logs: loss_final: 3.217403, loss_mean: 0.991042, proj_loss: -0.095034, loss_mean_cls: 2.321395, grad_norm: 5.203373 +Steps: 0%| | 769/1000000 [03:12<68:09:09, 4.07it/s, grad_norm=5.2, loss_final=3.22, loss_mean=0.991, loss_mean_cls=2.32, proj_loss=-0.095][2026-03-23 13:39:32] Step: 769, Training Logs: loss_final: 3.466010, loss_mean: 0.990224, proj_loss: -0.093318, loss_mean_cls: 2.569104, grad_norm: 5.856824 +Steps: 0%| | 770/1000000 [03:13<67:59:34, 4.08it/s, grad_norm=5.86, loss_final=3.47, loss_mean=0.99, loss_mean_cls=2.57, proj_loss=-0.0933][2026-03-23 13:39:33] Step: 770, Training Logs: loss_final: 3.800175, loss_mean: 0.974010, proj_loss: -0.091713, loss_mean_cls: 2.917878, grad_norm: 3.968943 +Steps: 0%| | 771/1000000 [03:13<67:53:03, 4.09it/s, grad_norm=3.97, loss_final=3.8, loss_mean=0.974, loss_mean_cls=2.92, proj_loss=-0.0917][2026-03-23 13:39:33] Step: 771, Training Logs: loss_final: 3.662961, loss_mean: 0.987652, proj_loss: -0.094367, loss_mean_cls: 2.769675, grad_norm: 4.285180 +Steps: 0%| | 772/1000000 [03:13<67:48:53, 4.09it/s, grad_norm=4.29, loss_final=3.66, loss_mean=0.988, loss_mean_cls=2.77, proj_loss=-0.0944][2026-03-23 13:39:33] Step: 772, Training Logs: loss_final: 3.540758, loss_mean: 0.985178, proj_loss: -0.095542, loss_mean_cls: 2.651122, grad_norm: 5.091965 +Steps: 0%| | 773/1000000 [03:13<67:45:17, 4.10it/s, grad_norm=5.09, loss_final=3.54, loss_mean=0.985, loss_mean_cls=2.65, proj_loss=-0.0955][2026-03-23 13:39:33] Step: 773, Training Logs: loss_final: 3.571492, loss_mean: 0.967050, proj_loss: -0.095441, loss_mean_cls: 2.699883, grad_norm: 4.777302 +Steps: 0%| | 774/1000000 [03:14<67:43:03, 4.10it/s, grad_norm=4.78, loss_final=3.57, loss_mean=0.967, loss_mean_cls=2.7, proj_loss=-0.0954][2026-03-23 13:39:34] Step: 774, Training Logs: loss_final: 3.436475, loss_mean: 0.969581, proj_loss: -0.100475, loss_mean_cls: 2.567369, grad_norm: 6.545211 +Steps: 0%| | 775/1000000 [03:14<67:41:40, 4.10it/s, grad_norm=6.55, loss_final=3.44, loss_mean=0.97, loss_mean_cls=2.57, proj_loss=-0.1][2026-03-23 13:39:34] Step: 775, Training Logs: loss_final: 3.729449, loss_mean: 0.962170, proj_loss: -0.093367, loss_mean_cls: 2.860646, grad_norm: 4.311515 +Steps: 0%| | 776/1000000 [03:14<67:40:48, 4.10it/s, grad_norm=4.31, loss_final=3.73, loss_mean=0.962, loss_mean_cls=2.86, proj_loss=-0.0934][2026-03-23 13:39:34] Step: 776, Training Logs: loss_final: 3.589051, loss_mean: 0.967600, proj_loss: -0.099295, 
loss_mean_cls: 2.720745, grad_norm: 9.527555 +Steps: 0%| | 777/1000000 [03:14<67:39:10, 4.10it/s, grad_norm=9.53, loss_final=3.59, loss_mean=0.968, loss_mean_cls=2.72, proj_loss=-0.0993][2026-03-23 13:39:34] Step: 777, Training Logs: loss_final: 3.431742, loss_mean: 0.992707, proj_loss: -0.094227, loss_mean_cls: 2.533262, grad_norm: 6.853449 +Steps: 0%| | 778/1000000 [03:15<67:38:29, 4.10it/s, grad_norm=6.85, loss_final=3.43, loss_mean=0.993, loss_mean_cls=2.53, proj_loss=-0.0942][2026-03-23 13:39:35] Step: 778, Training Logs: loss_final: 3.857946, loss_mean: 0.989901, proj_loss: -0.098856, loss_mean_cls: 2.966901, grad_norm: 4.674265 +Steps: 0%| | 779/1000000 [03:15<67:36:42, 4.11it/s, grad_norm=4.67, loss_final=3.86, loss_mean=0.99, loss_mean_cls=2.97, proj_loss=-0.0989][2026-03-23 13:39:35] Step: 779, Training Logs: loss_final: 3.296926, loss_mean: 0.988589, proj_loss: -0.097125, loss_mean_cls: 2.405461, grad_norm: 5.645866 +Steps: 0%| | 780/1000000 [03:15<67:36:42, 4.11it/s, grad_norm=5.65, loss_final=3.3, loss_mean=0.989, loss_mean_cls=2.41, proj_loss=-0.0971][2026-03-23 13:39:35] Step: 780, Training Logs: loss_final: 3.132453, loss_mean: 0.999193, proj_loss: -0.102272, loss_mean_cls: 2.235533, grad_norm: 4.672471 +Steps: 0%| | 781/1000000 [03:15<67:37:43, 4.10it/s, grad_norm=4.67, loss_final=3.13, loss_mean=0.999, loss_mean_cls=2.24, proj_loss=-0.102][2026-03-23 13:39:35] Step: 781, Training Logs: loss_final: 3.464539, loss_mean: 0.987525, proj_loss: -0.100555, loss_mean_cls: 2.577569, grad_norm: 7.745009 +Steps: 0%| | 782/1000000 [03:16<67:37:39, 4.10it/s, grad_norm=7.75, loss_final=3.46, loss_mean=0.988, loss_mean_cls=2.58, proj_loss=-0.101][2026-03-23 13:39:36] Step: 782, Training Logs: loss_final: 3.955363, loss_mean: 0.987371, proj_loss: -0.105254, loss_mean_cls: 3.073245, grad_norm: 8.130844 +Steps: 0%| | 783/1000000 [03:16<67:37:36, 4.10it/s, grad_norm=8.13, loss_final=3.96, loss_mean=0.987, loss_mean_cls=3.07, proj_loss=-0.105][2026-03-23 13:39:36] Step: 783, Training Logs: loss_final: 3.878459, loss_mean: 0.949905, proj_loss: -0.100298, loss_mean_cls: 3.028852, grad_norm: 7.081015 +Steps: 0%| | 784/1000000 [03:16<67:36:17, 4.11it/s, grad_norm=7.08, loss_final=3.88, loss_mean=0.95, loss_mean_cls=3.03, proj_loss=-0.1][2026-03-23 13:39:36] Step: 784, Training Logs: loss_final: 3.506197, loss_mean: 0.960778, proj_loss: -0.105070, loss_mean_cls: 2.650489, grad_norm: 6.435397 +Steps: 0%| | 785/1000000 [03:16<67:36:32, 4.11it/s, grad_norm=6.44, loss_final=3.51, loss_mean=0.961, loss_mean_cls=2.65, proj_loss=-0.105][2026-03-23 13:39:36] Step: 785, Training Logs: loss_final: 3.050841, loss_mean: 1.005005, proj_loss: -0.112185, loss_mean_cls: 2.158021, grad_norm: 5.454555 +Steps: 0%| | 786/1000000 [03:17<67:36:04, 4.11it/s, grad_norm=5.45, loss_final=3.05, loss_mean=1.01, loss_mean_cls=2.16, proj_loss=-0.112][2026-03-23 13:39:37] Step: 786, Training Logs: loss_final: 3.433201, loss_mean: 0.989603, proj_loss: -0.106082, loss_mean_cls: 2.549680, grad_norm: 7.216142 +Steps: 0%| | 787/1000000 [03:17<67:36:33, 4.11it/s, grad_norm=7.22, loss_final=3.43, loss_mean=0.99, loss_mean_cls=2.55, proj_loss=-0.106][2026-03-23 13:39:37] Step: 787, Training Logs: loss_final: 3.178681, loss_mean: 0.983927, proj_loss: -0.111900, loss_mean_cls: 2.306654, grad_norm: 8.394272 +Steps: 0%| | 788/1000000 [03:17<67:36:27, 4.11it/s, grad_norm=8.39, loss_final=3.18, loss_mean=0.984, loss_mean_cls=2.31, proj_loss=-0.112][2026-03-23 13:39:37] Step: 788, Training Logs: loss_final: 3.609515, loss_mean: 0.974195, 
proj_loss: -0.112934, loss_mean_cls: 2.748255, grad_norm: 5.760150 +Steps: 0%| | 789/1000000 [03:17<67:36:07, 4.11it/s, grad_norm=5.76, loss_final=3.61, loss_mean=0.974, loss_mean_cls=2.75, proj_loss=-0.113][2026-03-23 13:39:37] Step: 789, Training Logs: loss_final: 3.957981, loss_mean: 0.970279, proj_loss: -0.110405, loss_mean_cls: 3.098108, grad_norm: 6.969221 +Steps: 0%| | 790/1000000 [03:18<67:35:47, 4.11it/s, grad_norm=6.97, loss_final=3.96, loss_mean=0.97, loss_mean_cls=3.1, proj_loss=-0.11][2026-03-23 13:39:38] Step: 790, Training Logs: loss_final: 3.344214, loss_mean: 0.984185, proj_loss: -0.112748, loss_mean_cls: 2.472777, grad_norm: 5.210845 +Steps: 0%| | 791/1000000 [03:18<67:35:18, 4.11it/s, grad_norm=5.21, loss_final=3.34, loss_mean=0.984, loss_mean_cls=2.47, proj_loss=-0.113][2026-03-23 13:39:38] Step: 791, Training Logs: loss_final: 3.734034, loss_mean: 0.981696, proj_loss: -0.109999, loss_mean_cls: 2.862337, grad_norm: 7.008366 +Steps: 0%| | 792/1000000 [03:18<67:35:17, 4.11it/s, grad_norm=7.01, loss_final=3.73, loss_mean=0.982, loss_mean_cls=2.86, proj_loss=-0.11][2026-03-23 13:39:38] Step: 792, Training Logs: loss_final: 3.540596, loss_mean: 0.988846, proj_loss: -0.114754, loss_mean_cls: 2.666503, grad_norm: 5.542114 +Steps: 0%| | 793/1000000 [03:18<67:35:51, 4.11it/s, grad_norm=5.54, loss_final=3.54, loss_mean=0.989, loss_mean_cls=2.67, proj_loss=-0.115][2026-03-23 13:39:38] Step: 793, Training Logs: loss_final: 3.350283, loss_mean: 1.000211, proj_loss: -0.113103, loss_mean_cls: 2.463175, grad_norm: 6.968897 +Steps: 0%| | 794/1000000 [03:19<67:35:02, 4.11it/s, grad_norm=6.97, loss_final=3.35, loss_mean=1, loss_mean_cls=2.46, proj_loss=-0.113][2026-03-23 13:39:39] Step: 794, Training Logs: loss_final: 3.305461, loss_mean: 1.016082, proj_loss: -0.115779, loss_mean_cls: 2.405157, grad_norm: 5.964035 +Steps: 0%| | 795/1000000 [03:19<67:36:05, 4.11it/s, grad_norm=5.96, loss_final=3.31, loss_mean=1.02, loss_mean_cls=2.41, proj_loss=-0.116][2026-03-23 13:39:39] Step: 795, Training Logs: loss_final: 3.723802, loss_mean: 0.987075, proj_loss: -0.116410, loss_mean_cls: 2.853137, grad_norm: 5.130540 +Steps: 0%| | 796/1000000 [03:19<67:38:31, 4.10it/s, grad_norm=5.13, loss_final=3.72, loss_mean=0.987, loss_mean_cls=2.85, proj_loss=-0.116][2026-03-23 13:39:39] Step: 796, Training Logs: loss_final: 3.487829, loss_mean: 0.994069, proj_loss: -0.117651, loss_mean_cls: 2.611412, grad_norm: 6.853619 +Steps: 0%| | 797/1000000 [03:19<67:37:58, 4.10it/s, grad_norm=6.85, loss_final=3.49, loss_mean=0.994, loss_mean_cls=2.61, proj_loss=-0.118][2026-03-23 13:39:39] Step: 797, Training Logs: loss_final: 3.372700, loss_mean: 0.974201, proj_loss: -0.120528, loss_mean_cls: 2.519027, grad_norm: 2.813239 +Steps: 0%| | 798/1000000 [03:20<67:36:04, 4.11it/s, grad_norm=2.81, loss_final=3.37, loss_mean=0.974, loss_mean_cls=2.52, proj_loss=-0.121][2026-03-23 13:39:39] Step: 798, Training Logs: loss_final: 3.522202, loss_mean: 0.995845, proj_loss: -0.124294, loss_mean_cls: 2.650652, grad_norm: 9.110901 +Steps: 0%| | 799/1000000 [03:20<67:36:02, 4.11it/s, grad_norm=9.11, loss_final=3.52, loss_mean=0.996, loss_mean_cls=2.65, proj_loss=-0.124][2026-03-23 13:39:40] Step: 799, Training Logs: loss_final: 3.521028, loss_mean: 1.027289, proj_loss: -0.118107, loss_mean_cls: 2.611846, grad_norm: 7.997164 +Steps: 0%| | 800/1000000 [03:20<67:38:11, 4.10it/s, grad_norm=8, loss_final=3.52, loss_mean=1.03, loss_mean_cls=2.61, proj_loss=-0.118][2026-03-23 13:39:40] Step: 800, Training Logs: loss_final: 3.574762, loss_mean: 
0.988289, proj_loss: -0.121195, loss_mean_cls: 2.707668, grad_norm: 6.071355 +Steps: 0%| | 801/1000000 [03:20<67:34:55, 4.11it/s, grad_norm=6.07, loss_final=3.57, loss_mean=0.988, loss_mean_cls=2.71, proj_loss=-0.121][2026-03-23 13:39:40] Step: 801, Training Logs: loss_final: 3.915268, loss_mean: 0.989526, proj_loss: -0.123918, loss_mean_cls: 3.049661, grad_norm: 6.910089 +Steps: 0%| | 802/1000000 [03:21<67:35:44, 4.11it/s, grad_norm=6.91, loss_final=3.92, loss_mean=0.99, loss_mean_cls=3.05, proj_loss=-0.124][2026-03-23 13:39:40] Step: 802, Training Logs: loss_final: 3.423812, loss_mean: 1.016216, proj_loss: -0.122628, loss_mean_cls: 2.530224, grad_norm: 5.868536 +Steps: 0%| | 803/1000000 [03:21<67:36:50, 4.10it/s, grad_norm=5.87, loss_final=3.42, loss_mean=1.02, loss_mean_cls=2.53, proj_loss=-0.123][2026-03-23 13:39:41] Step: 803, Training Logs: loss_final: 3.357081, loss_mean: 0.991914, proj_loss: -0.127112, loss_mean_cls: 2.492279, grad_norm: 5.893596 +Steps: 0%| | 804/1000000 [03:21<67:36:44, 4.11it/s, grad_norm=5.89, loss_final=3.36, loss_mean=0.992, loss_mean_cls=2.49, proj_loss=-0.127][2026-03-23 13:39:41] Step: 804, Training Logs: loss_final: 3.446700, loss_mean: 0.989514, proj_loss: -0.127526, loss_mean_cls: 2.584713, grad_norm: 5.365756 +Steps: 0%| | 805/1000000 [03:21<67:36:07, 4.11it/s, grad_norm=5.37, loss_final=3.45, loss_mean=0.99, loss_mean_cls=2.58, proj_loss=-0.128][2026-03-23 13:39:41] Step: 805, Training Logs: loss_final: 3.699772, loss_mean: 0.972530, proj_loss: -0.125764, loss_mean_cls: 2.853007, grad_norm: 8.497247 +Steps: 0%| | 806/1000000 [03:21<67:35:41, 4.11it/s, grad_norm=8.5, loss_final=3.7, loss_mean=0.973, loss_mean_cls=2.85, proj_loss=-0.126][2026-03-23 13:39:41] Step: 806, Training Logs: loss_final: 2.938036, loss_mean: 0.976070, proj_loss: -0.130495, loss_mean_cls: 2.092461, grad_norm: 8.318488 +Steps: 0%| | 807/1000000 [03:22<67:36:08, 4.11it/s, grad_norm=8.32, loss_final=2.94, loss_mean=0.976, loss_mean_cls=2.09, proj_loss=-0.13][2026-03-23 13:39:42] Step: 807, Training Logs: loss_final: 3.812966, loss_mean: 0.971581, proj_loss: -0.128087, loss_mean_cls: 2.969472, grad_norm: 5.282433 +Steps: 0%| | 808/1000000 [03:22<67:35:33, 4.11it/s, grad_norm=5.28, loss_final=3.81, loss_mean=0.972, loss_mean_cls=2.97, proj_loss=-0.128][2026-03-23 13:39:42] Step: 808, Training Logs: loss_final: 2.889681, loss_mean: 1.001408, proj_loss: -0.136477, loss_mean_cls: 2.024751, grad_norm: 6.604538 +Steps: 0%| | 809/1000000 [03:22<67:35:17, 4.11it/s, grad_norm=6.6, loss_final=2.89, loss_mean=1, loss_mean_cls=2.02, proj_loss=-0.136][2026-03-23 13:39:42] Step: 809, Training Logs: loss_final: 3.157080, loss_mean: 0.997906, proj_loss: -0.133034, loss_mean_cls: 2.292207, grad_norm: 5.258725 +Steps: 0%| | 810/1000000 [03:22<67:35:14, 4.11it/s, grad_norm=5.26, loss_final=3.16, loss_mean=0.998, loss_mean_cls=2.29, proj_loss=-0.133][2026-03-23 13:39:42] Step: 810, Training Logs: loss_final: 3.449077, loss_mean: 0.982016, proj_loss: -0.136911, loss_mean_cls: 2.603971, grad_norm: 6.233846 +Steps: 0%| | 811/1000000 [03:23<67:51:59, 4.09it/s, grad_norm=6.23, loss_final=3.45, loss_mean=0.982, loss_mean_cls=2.6, proj_loss=-0.137][2026-03-23 13:39:43] Step: 811, Training Logs: loss_final: 3.737308, loss_mean: 0.985887, proj_loss: -0.133463, loss_mean_cls: 2.884884, grad_norm: 8.791126 +Steps: 0%| | 812/1000000 [03:23<67:51:43, 4.09it/s, grad_norm=8.79, loss_final=3.74, loss_mean=0.986, loss_mean_cls=2.88, proj_loss=-0.133][2026-03-23 13:39:43] Step: 812, Training Logs: loss_final: 3.495837, 
loss_mean: 0.969712, proj_loss: -0.135284, loss_mean_cls: 2.661410, grad_norm: 7.966519 +Steps: 0%| | 813/1000000 [03:23<67:45:35, 4.10it/s, grad_norm=7.97, loss_final=3.5, loss_mean=0.97, loss_mean_cls=2.66, proj_loss=-0.135][2026-03-23 13:39:43] Step: 813, Training Logs: loss_final: 3.722309, loss_mean: 0.992976, proj_loss: -0.138076, loss_mean_cls: 2.867409, grad_norm: 6.655502 +Steps: 0%| | 814/1000000 [03:23<67:42:47, 4.10it/s, grad_norm=6.66, loss_final=3.72, loss_mean=0.993, loss_mean_cls=2.87, proj_loss=-0.138][2026-03-23 13:39:43] Step: 814, Training Logs: loss_final: 3.144622, loss_mean: 1.006407, proj_loss: -0.141466, loss_mean_cls: 2.279681, grad_norm: 7.282391 +Steps: 0%| | 815/1000000 [03:24<67:41:34, 4.10it/s, grad_norm=7.28, loss_final=3.14, loss_mean=1.01, loss_mean_cls=2.28, proj_loss=-0.141][2026-03-23 13:39:44] Step: 815, Training Logs: loss_final: 3.859135, loss_mean: 0.959587, proj_loss: -0.135603, loss_mean_cls: 3.035151, grad_norm: 5.860390 +Steps: 0%| | 816/1000000 [03:24<67:39:23, 4.10it/s, grad_norm=5.86, loss_final=3.86, loss_mean=0.96, loss_mean_cls=3.04, proj_loss=-0.136][2026-03-23 13:39:44] Step: 816, Training Logs: loss_final: 2.955823, loss_mean: 1.009116, proj_loss: -0.143776, loss_mean_cls: 2.090483, grad_norm: 8.998664 +Steps: 0%| | 817/1000000 [03:24<67:42:30, 4.10it/s, grad_norm=9, loss_final=2.96, loss_mean=1.01, loss_mean_cls=2.09, proj_loss=-0.144][2026-03-23 13:39:44] Step: 817, Training Logs: loss_final: 3.157166, loss_mean: 0.991247, proj_loss: -0.141242, loss_mean_cls: 2.307162, grad_norm: 8.061997 +Steps: 0%| | 818/1000000 [03:24<67:42:56, 4.10it/s, grad_norm=8.06, loss_final=3.16, loss_mean=0.991, loss_mean_cls=2.31, proj_loss=-0.141][2026-03-23 13:39:44] Step: 818, Training Logs: loss_final: 3.357513, loss_mean: 1.004605, proj_loss: -0.143389, loss_mean_cls: 2.496298, grad_norm: 4.961581 +Steps: 0%| | 819/1000000 [03:25<67:40:39, 4.10it/s, grad_norm=4.96, loss_final=3.36, loss_mean=1, loss_mean_cls=2.5, proj_loss=-0.143][2026-03-23 13:39:45] Step: 819, Training Logs: loss_final: 3.818464, loss_mean: 0.968626, proj_loss: -0.138895, loss_mean_cls: 2.988732, grad_norm: 5.874891 +Steps: 0%| | 820/1000000 [03:25<67:40:01, 4.10it/s, grad_norm=5.87, loss_final=3.82, loss_mean=0.969, loss_mean_cls=2.99, proj_loss=-0.139][2026-03-23 13:39:45] Step: 820, Training Logs: loss_final: 4.012197, loss_mean: 0.947888, proj_loss: -0.142145, loss_mean_cls: 3.206455, grad_norm: 4.538189 +Steps: 0%| | 821/1000000 [03:25<67:38:36, 4.10it/s, grad_norm=4.54, loss_final=4.01, loss_mean=0.948, loss_mean_cls=3.21, proj_loss=-0.142][2026-03-23 13:39:45] Step: 821, Training Logs: loss_final: 3.348689, loss_mean: 0.996907, proj_loss: -0.142265, loss_mean_cls: 2.494047, grad_norm: 6.748330 +Steps: 0%| | 822/1000000 [03:25<67:37:14, 4.10it/s, grad_norm=6.75, loss_final=3.35, loss_mean=0.997, loss_mean_cls=2.49, proj_loss=-0.142][2026-03-23 13:39:45] Step: 822, Training Logs: loss_final: 3.290074, loss_mean: 0.972478, proj_loss: -0.145423, loss_mean_cls: 2.463019, grad_norm: 6.327968 +Steps: 0%| | 823/1000000 [03:26<67:37:17, 4.10it/s, grad_norm=6.33, loss_final=3.29, loss_mean=0.972, loss_mean_cls=2.46, proj_loss=-0.145][2026-03-23 13:39:46] Step: 823, Training Logs: loss_final: 2.987862, loss_mean: 0.991090, proj_loss: -0.150302, loss_mean_cls: 2.147074, grad_norm: 6.224729 +Steps: 0%| | 824/1000000 [03:26<67:36:54, 4.10it/s, grad_norm=6.22, loss_final=2.99, loss_mean=0.991, loss_mean_cls=2.15, proj_loss=-0.15][2026-03-23 13:39:46] Step: 824, Training Logs: loss_final: 
3.973331, loss_mean: 0.992767, proj_loss: -0.144044, loss_mean_cls: 3.124608, grad_norm: 4.839965 +Steps: 0%| | 825/1000000 [03:26<67:39:02, 4.10it/s, grad_norm=4.84, loss_final=3.97, loss_mean=0.993, loss_mean_cls=3.12, proj_loss=-0.144][2026-03-23 13:39:46] Step: 825, Training Logs: loss_final: 3.738248, loss_mean: 0.978639, proj_loss: -0.150342, loss_mean_cls: 2.909952, grad_norm: 6.831635 +Steps: 0%| | 826/1000000 [03:26<67:37:02, 4.10it/s, grad_norm=6.83, loss_final=3.74, loss_mean=0.979, loss_mean_cls=2.91, proj_loss=-0.15][2026-03-23 13:39:46] Step: 826, Training Logs: loss_final: 3.087155, loss_mean: 0.984653, proj_loss: -0.157644, loss_mean_cls: 2.260146, grad_norm: 4.233655 +Steps: 0%| | 827/1000000 [03:27<67:36:21, 4.11it/s, grad_norm=4.23, loss_final=3.09, loss_mean=0.985, loss_mean_cls=2.26, proj_loss=-0.158][2026-03-23 13:39:47] Step: 827, Training Logs: loss_final: 3.512220, loss_mean: 0.942331, proj_loss: -0.153842, loss_mean_cls: 2.723730, grad_norm: 6.435760 +Steps: 0%| | 828/1000000 [03:27<67:35:35, 4.11it/s, grad_norm=6.44, loss_final=3.51, loss_mean=0.942, loss_mean_cls=2.72, proj_loss=-0.154][2026-03-23 13:39:47] Step: 828, Training Logs: loss_final: 3.239361, loss_mean: 0.963564, proj_loss: -0.157151, loss_mean_cls: 2.432948, grad_norm: 4.418409 +Steps: 0%| | 829/1000000 [03:27<67:34:33, 4.11it/s, grad_norm=4.42, loss_final=3.24, loss_mean=0.964, loss_mean_cls=2.43, proj_loss=-0.157][2026-03-23 13:39:47] Step: 829, Training Logs: loss_final: 3.366974, loss_mean: 0.992091, proj_loss: -0.158435, loss_mean_cls: 2.533318, grad_norm: 5.938358 +Steps: 0%| | 830/1000000 [03:27<67:38:44, 4.10it/s, grad_norm=5.94, loss_final=3.37, loss_mean=0.992, loss_mean_cls=2.53, proj_loss=-0.158][2026-03-23 13:39:47] Step: 830, Training Logs: loss_final: 3.757596, loss_mean: 0.962832, proj_loss: -0.153981, loss_mean_cls: 2.948744, grad_norm: 5.803342 +Steps: 0%| | 831/1000000 [03:28<67:41:26, 4.10it/s, grad_norm=5.8, loss_final=3.76, loss_mean=0.963, loss_mean_cls=2.95, proj_loss=-0.154][2026-03-23 13:39:48] Step: 831, Training Logs: loss_final: 3.435548, loss_mean: 0.986939, proj_loss: -0.158247, loss_mean_cls: 2.606856, grad_norm: 4.646476 +Steps: 0%| | 832/1000000 [03:28<67:38:10, 4.10it/s, grad_norm=4.65, loss_final=3.44, loss_mean=0.987, loss_mean_cls=2.61, proj_loss=-0.158][2026-03-23 13:39:48] Step: 832, Training Logs: loss_final: 4.006648, loss_mean: 0.965343, proj_loss: -0.156404, loss_mean_cls: 3.197709, grad_norm: 12.459236 +Steps: 0%| | 833/1000000 [03:28<67:35:59, 4.11it/s, grad_norm=12.5, loss_final=4.01, loss_mean=0.965, loss_mean_cls=3.2, proj_loss=-0.156][2026-03-23 13:39:48] Step: 833, Training Logs: loss_final: 3.216105, loss_mean: 0.998174, proj_loss: -0.160500, loss_mean_cls: 2.378431, grad_norm: 7.179099 +Steps: 0%| | 834/1000000 [03:28<67:35:15, 4.11it/s, grad_norm=7.18, loss_final=3.22, loss_mean=0.998, loss_mean_cls=2.38, proj_loss=-0.16][2026-03-23 13:39:48] Step: 834, Training Logs: loss_final: 3.471359, loss_mean: 0.985072, proj_loss: -0.160287, loss_mean_cls: 2.646574, grad_norm: 6.186459 +Steps: 0%| | 835/1000000 [03:29<67:35:36, 4.11it/s, grad_norm=6.19, loss_final=3.47, loss_mean=0.985, loss_mean_cls=2.65, proj_loss=-0.16][2026-03-23 13:39:49] Step: 835, Training Logs: loss_final: 3.107620, loss_mean: 0.975063, proj_loss: -0.166256, loss_mean_cls: 2.298813, grad_norm: 6.128019 +Steps: 0%| | 836/1000000 [03:29<67:36:15, 4.11it/s, grad_norm=6.13, loss_final=3.11, loss_mean=0.975, loss_mean_cls=2.3, proj_loss=-0.166][2026-03-23 13:39:49] Step: 836, Training 
Logs: loss_final: 3.452482, loss_mean: 0.976100, proj_loss: -0.163476, loss_mean_cls: 2.639858, grad_norm: 5.052592 +Steps: 0%| | 837/1000000 [03:29<67:36:31, 4.11it/s, grad_norm=5.05, loss_final=3.45, loss_mean=0.976, loss_mean_cls=2.64, proj_loss=-0.163][2026-03-23 13:39:49] Step: 837, Training Logs: loss_final: 3.547332, loss_mean: 0.982467, proj_loss: -0.162147, loss_mean_cls: 2.727012, grad_norm: 9.105545 +Steps: 0%| | 838/1000000 [03:29<67:35:46, 4.11it/s, grad_norm=9.11, loss_final=3.55, loss_mean=0.982, loss_mean_cls=2.73, proj_loss=-0.162][2026-03-23 13:39:49] Step: 838, Training Logs: loss_final: 3.738686, loss_mean: 0.991434, proj_loss: -0.163425, loss_mean_cls: 2.910678, grad_norm: 7.827240 +Steps: 0%| | 839/1000000 [03:30<67:36:26, 4.11it/s, grad_norm=7.83, loss_final=3.74, loss_mean=0.991, loss_mean_cls=2.91, proj_loss=-0.163][2026-03-23 13:39:49] Step: 839, Training Logs: loss_final: 3.461011, loss_mean: 0.985309, proj_loss: -0.163840, loss_mean_cls: 2.639542, grad_norm: 4.101181 +Steps: 0%| | 840/1000000 [03:30<67:35:21, 4.11it/s, grad_norm=4.1, loss_final=3.46, loss_mean=0.985, loss_mean_cls=2.64, proj_loss=-0.164][2026-03-23 13:39:50] Step: 840, Training Logs: loss_final: 3.647676, loss_mean: 0.979098, proj_loss: -0.159458, loss_mean_cls: 2.828036, grad_norm: 7.801009 +Steps: 0%| | 841/1000000 [03:30<67:35:27, 4.11it/s, grad_norm=7.8, loss_final=3.65, loss_mean=0.979, loss_mean_cls=2.83, proj_loss=-0.159][2026-03-23 13:39:50] Step: 841, Training Logs: loss_final: 3.437075, loss_mean: 1.009387, proj_loss: -0.165280, loss_mean_cls: 2.592968, grad_norm: 6.783402 +Steps: 0%| | 842/1000000 [03:30<67:35:54, 4.11it/s, grad_norm=6.78, loss_final=3.44, loss_mean=1.01, loss_mean_cls=2.59, proj_loss=-0.165][2026-03-23 13:39:50] Step: 842, Training Logs: loss_final: 3.021873, loss_mean: 1.000837, proj_loss: -0.165839, loss_mean_cls: 2.186875, grad_norm: 4.112114 +Steps: 0%| | 843/1000000 [03:30<67:36:39, 4.11it/s, grad_norm=4.11, loss_final=3.02, loss_mean=1, loss_mean_cls=2.19, proj_loss=-0.166][2026-03-23 13:39:50] Step: 843, Training Logs: loss_final: 3.573853, loss_mean: 0.981347, proj_loss: -0.165782, loss_mean_cls: 2.758288, grad_norm: 6.614796 +Steps: 0%| | 844/1000000 [03:31<67:35:22, 4.11it/s, grad_norm=6.61, loss_final=3.57, loss_mean=0.981, loss_mean_cls=2.76, proj_loss=-0.166][2026-03-23 13:39:51] Step: 844, Training Logs: loss_final: 3.343616, loss_mean: 0.981221, proj_loss: -0.173393, loss_mean_cls: 2.535788, grad_norm: 7.976209 +Steps: 0%| | 845/1000000 [03:31<67:34:47, 4.11it/s, grad_norm=7.98, loss_final=3.34, loss_mean=0.981, loss_mean_cls=2.54, proj_loss=-0.173][2026-03-23 13:39:51] Step: 845, Training Logs: loss_final: 3.627429, loss_mean: 0.992958, proj_loss: -0.171805, loss_mean_cls: 2.806275, grad_norm: 6.550201 +Steps: 0%| | 846/1000000 [03:31<67:34:59, 4.11it/s, grad_norm=6.55, loss_final=3.63, loss_mean=0.993, loss_mean_cls=2.81, proj_loss=-0.172][2026-03-23 13:39:51] Step: 846, Training Logs: loss_final: 3.231954, loss_mean: 0.985083, proj_loss: -0.180046, loss_mean_cls: 2.426917, grad_norm: 5.858192 +Steps: 0%| | 847/1000000 [03:31<67:35:19, 4.11it/s, grad_norm=5.86, loss_final=3.23, loss_mean=0.985, loss_mean_cls=2.43, proj_loss=-0.18][2026-03-23 13:39:51] Step: 847, Training Logs: loss_final: 3.698299, loss_mean: 0.959622, proj_loss: -0.174285, loss_mean_cls: 2.912962, grad_norm: 5.758743 +Steps: 0%| | 848/1000000 [03:32<67:35:36, 4.11it/s, grad_norm=5.76, loss_final=3.7, loss_mean=0.96, loss_mean_cls=2.91, proj_loss=-0.174][2026-03-23 13:39:52] Step: 
848, Training Logs: loss_final: 3.290376, loss_mean: 0.983661, proj_loss: -0.180009, loss_mean_cls: 2.486725, grad_norm: 5.914536 +Steps: 0%| | 849/1000000 [03:32<67:35:31, 4.11it/s, grad_norm=5.91, loss_final=3.29, loss_mean=0.984, loss_mean_cls=2.49, proj_loss=-0.18][2026-03-23 13:39:52] Step: 849, Training Logs: loss_final: 3.514209, loss_mean: 0.978468, proj_loss: -0.176861, loss_mean_cls: 2.712602, grad_norm: 6.200832 +Steps: 0%| | 850/1000000 [03:32<67:33:46, 4.11it/s, grad_norm=6.2, loss_final=3.51, loss_mean=0.978, loss_mean_cls=2.71, proj_loss=-0.177][2026-03-23 13:39:52] Step: 850, Training Logs: loss_final: 3.438050, loss_mean: 0.994745, proj_loss: -0.176395, loss_mean_cls: 2.619700, grad_norm: 6.167976 +Steps: 0%| | 851/1000000 [03:32<67:34:26, 4.11it/s, grad_norm=6.17, loss_final=3.44, loss_mean=0.995, loss_mean_cls=2.62, proj_loss=-0.176][2026-03-23 13:39:52] Step: 851, Training Logs: loss_final: 3.531171, loss_mean: 0.981303, proj_loss: -0.175892, loss_mean_cls: 2.725760, grad_norm: 5.460580 +Steps: 0%| | 852/1000000 [03:33<67:35:12, 4.11it/s, grad_norm=5.46, loss_final=3.53, loss_mean=0.981, loss_mean_cls=2.73, proj_loss=-0.176][2026-03-23 13:39:53] Step: 852, Training Logs: loss_final: 3.357795, loss_mean: 0.989046, proj_loss: -0.183897, loss_mean_cls: 2.552646, grad_norm: 7.712152 +Steps: 0%| | 853/1000000 [03:33<67:34:11, 4.11it/s, grad_norm=7.71, loss_final=3.36, loss_mean=0.989, loss_mean_cls=2.55, proj_loss=-0.184][2026-03-23 13:39:53] Step: 853, Training Logs: loss_final: 3.415781, loss_mean: 1.003062, proj_loss: -0.182414, loss_mean_cls: 2.595134, grad_norm: 6.711967 +Steps: 0%| | 854/1000000 [03:33<67:33:41, 4.11it/s, grad_norm=6.71, loss_final=3.42, loss_mean=1, loss_mean_cls=2.6, proj_loss=-0.182][2026-03-23 13:39:53] Step: 854, Training Logs: loss_final: 3.414579, loss_mean: 0.976503, proj_loss: -0.184246, loss_mean_cls: 2.622322, grad_norm: 7.890229 +Steps: 0%| | 855/1000000 [03:33<68:09:46, 4.07it/s, grad_norm=7.89, loss_final=3.41, loss_mean=0.977, loss_mean_cls=2.62, proj_loss=-0.184][2026-03-23 13:39:53] Step: 855, Training Logs: loss_final: 3.428938, loss_mean: 0.962553, proj_loss: -0.186611, loss_mean_cls: 2.652996, grad_norm: 5.095406 +Steps: 0%| | 856/1000000 [03:34<67:58:51, 4.08it/s, grad_norm=5.1, loss_final=3.43, loss_mean=0.963, loss_mean_cls=2.65, proj_loss=-0.187][2026-03-23 13:39:54] Step: 856, Training Logs: loss_final: 2.772397, loss_mean: 0.996235, proj_loss: -0.193172, loss_mean_cls: 1.969333, grad_norm: 5.562906 +Steps: 0%| | 857/1000000 [03:34<67:51:31, 4.09it/s, grad_norm=5.56, loss_final=2.77, loss_mean=0.996, loss_mean_cls=1.97, proj_loss=-0.193][2026-03-23 13:39:54] Step: 857, Training Logs: loss_final: 3.056956, loss_mean: 0.996065, proj_loss: -0.191105, loss_mean_cls: 2.251997, grad_norm: 5.120454 +Steps: 0%| | 858/1000000 [03:34<67:48:32, 4.09it/s, grad_norm=5.12, loss_final=3.06, loss_mean=0.996, loss_mean_cls=2.25, proj_loss=-0.191][2026-03-23 13:39:54] Step: 858, Training Logs: loss_final: 3.539784, loss_mean: 0.953327, proj_loss: -0.185899, loss_mean_cls: 2.772356, grad_norm: 6.452792 +Steps: 0%| | 859/1000000 [03:34<67:45:07, 4.10it/s, grad_norm=6.45, loss_final=3.54, loss_mean=0.953, loss_mean_cls=2.77, proj_loss=-0.186][2026-03-23 13:39:54] Step: 859, Training Logs: loss_final: 3.357882, loss_mean: 0.960147, proj_loss: -0.190746, loss_mean_cls: 2.588481, grad_norm: 7.407739 +Steps: 0%| | 860/1000000 [03:35<67:40:21, 4.10it/s, grad_norm=7.41, loss_final=3.36, loss_mean=0.96, loss_mean_cls=2.59, proj_loss=-0.191][2026-03-23 
13:39:55] Step: 860, Training Logs: loss_final: 3.365143, loss_mean: 0.980286, proj_loss: -0.187723, loss_mean_cls: 2.572580, grad_norm: 7.294630 +Steps: 0%| | 861/1000000 [03:35<67:39:04, 4.10it/s, grad_norm=7.29, loss_final=3.37, loss_mean=0.98, loss_mean_cls=2.57, proj_loss=-0.188][2026-03-23 13:39:55] Step: 861, Training Logs: loss_final: 3.859162, loss_mean: 0.934767, proj_loss: -0.186352, loss_mean_cls: 3.110747, grad_norm: 8.049034 +Steps: 0%| | 862/1000000 [03:35<67:36:35, 4.10it/s, grad_norm=8.05, loss_final=3.86, loss_mean=0.935, loss_mean_cls=3.11, proj_loss=-0.186][2026-03-23 13:39:55] Step: 862, Training Logs: loss_final: 3.689306, loss_mean: 0.973234, proj_loss: -0.188648, loss_mean_cls: 2.904721, grad_norm: 6.528265 +Steps: 0%| | 863/1000000 [03:35<67:35:29, 4.11it/s, grad_norm=6.53, loss_final=3.69, loss_mean=0.973, loss_mean_cls=2.9, proj_loss=-0.189][2026-03-23 13:39:55] Step: 863, Training Logs: loss_final: 3.147394, loss_mean: 0.979872, proj_loss: -0.191808, loss_mean_cls: 2.359331, grad_norm: 5.654211 +Steps: 0%| | 864/1000000 [03:36<67:36:35, 4.10it/s, grad_norm=5.65, loss_final=3.15, loss_mean=0.98, loss_mean_cls=2.36, proj_loss=-0.192][2026-03-23 13:39:56] Step: 864, Training Logs: loss_final: 3.635007, loss_mean: 0.979382, proj_loss: -0.190558, loss_mean_cls: 2.846184, grad_norm: 3.355230 +Steps: 0%| | 865/1000000 [03:36<67:34:32, 4.11it/s, grad_norm=3.36, loss_final=3.64, loss_mean=0.979, loss_mean_cls=2.85, proj_loss=-0.191][2026-03-23 13:39:56] Step: 865, Training Logs: loss_final: 3.611118, loss_mean: 0.953018, proj_loss: -0.189866, loss_mean_cls: 2.847965, grad_norm: 5.315028 +Steps: 0%| | 866/1000000 [03:36<69:16:33, 4.01it/s, grad_norm=5.32, loss_final=3.61, loss_mean=0.953, loss_mean_cls=2.85, proj_loss=-0.19][2026-03-23 13:39:56] Step: 866, Training Logs: loss_final: 3.268916, loss_mean: 0.963431, proj_loss: -0.189366, loss_mean_cls: 2.494851, grad_norm: 7.088167 +Steps: 0%| | 867/1000000 [03:36<69:00:37, 4.02it/s, grad_norm=7.09, loss_final=3.27, loss_mean=0.963, loss_mean_cls=2.49, proj_loss=-0.189][2026-03-23 13:39:56] Step: 867, Training Logs: loss_final: 3.264347, loss_mean: 0.984184, proj_loss: -0.191846, loss_mean_cls: 2.472010, grad_norm: 6.332934 +Steps: 0%| | 868/1000000 [03:37<68:35:33, 4.05it/s, grad_norm=6.33, loss_final=3.26, loss_mean=0.984, loss_mean_cls=2.47, proj_loss=-0.192][2026-03-23 13:39:57] Step: 868, Training Logs: loss_final: 2.620491, loss_mean: 0.994569, proj_loss: -0.199029, loss_mean_cls: 1.824951, grad_norm: 4.657275 +Steps: 0%| | 869/1000000 [03:37<68:17:15, 4.06it/s, grad_norm=4.66, loss_final=2.62, loss_mean=0.995, loss_mean_cls=1.82, proj_loss=-0.199][2026-03-23 13:39:57] Step: 869, Training Logs: loss_final: 3.298075, loss_mean: 0.978754, proj_loss: -0.197489, loss_mean_cls: 2.516811, grad_norm: 4.917058 +Steps: 0%| | 870/1000000 [03:37<68:04:20, 4.08it/s, grad_norm=4.92, loss_final=3.3, loss_mean=0.979, loss_mean_cls=2.52, proj_loss=-0.197][2026-03-23 13:39:57] Step: 870, Training Logs: loss_final: 3.441407, loss_mean: 0.980170, proj_loss: -0.196413, loss_mean_cls: 2.657650, grad_norm: 5.830078 +Steps: 0%| | 871/1000000 [03:37<67:55:54, 4.09it/s, grad_norm=5.83, loss_final=3.44, loss_mean=0.98, loss_mean_cls=2.66, proj_loss=-0.196][2026-03-23 13:39:57] Step: 871, Training Logs: loss_final: 3.521652, loss_mean: 0.966677, proj_loss: -0.195985, loss_mean_cls: 2.750960, grad_norm: 5.239420 +Steps: 0%| | 872/1000000 [03:38<67:51:32, 4.09it/s, grad_norm=5.24, loss_final=3.52, loss_mean=0.967, loss_mean_cls=2.75, 
proj_loss=-0.196][2026-03-23 13:39:58] Step: 872, Training Logs: loss_final: 3.524415, loss_mean: 0.966188, proj_loss: -0.199908, loss_mean_cls: 2.758135, grad_norm: 4.551807 +Steps: 0%| | 873/1000000 [03:38<67:45:53, 4.10it/s, grad_norm=4.55, loss_final=3.52, loss_mean=0.966, loss_mean_cls=2.76, proj_loss=-0.2][2026-03-23 13:39:58] Step: 873, Training Logs: loss_final: 3.044600, loss_mean: 0.980438, proj_loss: -0.199722, loss_mean_cls: 2.263885, grad_norm: 4.883636 +Steps: 0%| | 874/1000000 [03:38<67:41:11, 4.10it/s, grad_norm=4.88, loss_final=3.04, loss_mean=0.98, loss_mean_cls=2.26, proj_loss=-0.2][2026-03-23 13:39:58] Step: 874, Training Logs: loss_final: 3.751343, loss_mean: 0.984624, proj_loss: -0.198430, loss_mean_cls: 2.965149, grad_norm: 10.452662 +Steps: 0%| | 875/1000000 [03:38<67:39:28, 4.10it/s, grad_norm=10.5, loss_final=3.75, loss_mean=0.985, loss_mean_cls=2.97, proj_loss=-0.198][2026-03-23 13:39:58] Step: 875, Training Logs: loss_final: 3.222786, loss_mean: 0.988611, proj_loss: -0.196319, loss_mean_cls: 2.430494, grad_norm: 6.769777 +Steps: 0%| | 876/1000000 [03:39<67:40:16, 4.10it/s, grad_norm=6.77, loss_final=3.22, loss_mean=0.989, loss_mean_cls=2.43, proj_loss=-0.196][2026-03-23 13:39:59] Step: 876, Training Logs: loss_final: 3.567767, loss_mean: 0.961620, proj_loss: -0.200631, loss_mean_cls: 2.806777, grad_norm: 6.473975 +Steps: 0%| | 877/1000000 [03:39<67:38:38, 4.10it/s, grad_norm=6.47, loss_final=3.57, loss_mean=0.962, loss_mean_cls=2.81, proj_loss=-0.201][2026-03-23 13:39:59] Step: 877, Training Logs: loss_final: 3.038472, loss_mean: 1.006655, proj_loss: -0.201998, loss_mean_cls: 2.233815, grad_norm: 5.841194 +Steps: 0%| | 878/1000000 [03:39<67:36:57, 4.10it/s, grad_norm=5.84, loss_final=3.04, loss_mean=1.01, loss_mean_cls=2.23, proj_loss=-0.202][2026-03-23 13:39:59] Step: 878, Training Logs: loss_final: 3.230123, loss_mean: 1.000312, proj_loss: -0.204853, loss_mean_cls: 2.434664, grad_norm: 6.379198 +Steps: 0%| | 879/1000000 [03:39<67:36:23, 4.11it/s, grad_norm=6.38, loss_final=3.23, loss_mean=1, loss_mean_cls=2.43, proj_loss=-0.205][2026-03-23 13:39:59] Step: 879, Training Logs: loss_final: 3.441126, loss_mean: 0.989009, proj_loss: -0.204416, loss_mean_cls: 2.656533, grad_norm: 6.120109 +Steps: 0%| | 880/1000000 [03:40<67:37:55, 4.10it/s, grad_norm=6.12, loss_final=3.44, loss_mean=0.989, loss_mean_cls=2.66, proj_loss=-0.204][2026-03-23 13:39:59] Step: 880, Training Logs: loss_final: 4.094015, loss_mean: 0.995770, proj_loss: -0.199463, loss_mean_cls: 3.297708, grad_norm: 8.366229 +Steps: 0%| | 881/1000000 [03:40<67:36:04, 4.11it/s, grad_norm=8.37, loss_final=4.09, loss_mean=0.996, loss_mean_cls=3.3, proj_loss=-0.199][2026-03-23 13:40:00] Step: 881, Training Logs: loss_final: 3.435583, loss_mean: 0.988463, proj_loss: -0.197890, loss_mean_cls: 2.645009, grad_norm: 5.379455 +Steps: 0%| | 882/1000000 [03:40<67:34:35, 4.11it/s, grad_norm=5.38, loss_final=3.44, loss_mean=0.988, loss_mean_cls=2.65, proj_loss=-0.198][2026-03-23 13:40:00] Step: 882, Training Logs: loss_final: 3.140794, loss_mean: 0.989536, proj_loss: -0.205060, loss_mean_cls: 2.356318, grad_norm: 7.064296 +Steps: 0%| | 883/1000000 [03:40<67:34:46, 4.11it/s, grad_norm=7.06, loss_final=3.14, loss_mean=0.99, loss_mean_cls=2.36, proj_loss=-0.205][2026-03-23 13:40:00] Step: 883, Training Logs: loss_final: 3.551335, loss_mean: 1.003485, proj_loss: -0.203458, loss_mean_cls: 2.751309, grad_norm: 9.818319 +Steps: 0%| | 884/1000000 [03:41<67:36:40, 4.10it/s, grad_norm=9.82, loss_final=3.55, loss_mean=1, 
loss_mean_cls=2.75, proj_loss=-0.203][2026-03-23 13:40:00] Step: 884, Training Logs: loss_final: 3.607657, loss_mean: 1.007776, proj_loss: -0.203537, loss_mean_cls: 2.803418, grad_norm: 6.843055 +Steps: 0%| | 885/1000000 [03:41<67:36:41, 4.10it/s, grad_norm=6.84, loss_final=3.61, loss_mean=1.01, loss_mean_cls=2.8, proj_loss=-0.204][2026-03-23 13:40:01] Step: 885, Training Logs: loss_final: 3.081289, loss_mean: 1.001638, proj_loss: -0.207887, loss_mean_cls: 2.287538, grad_norm: 3.873506 +Steps: 0%| | 886/1000000 [03:41<67:34:48, 4.11it/s, grad_norm=3.87, loss_final=3.08, loss_mean=1, loss_mean_cls=2.29, proj_loss=-0.208][2026-03-23 13:40:01] Step: 886, Training Logs: loss_final: 3.340070, loss_mean: 1.017452, proj_loss: -0.205867, loss_mean_cls: 2.528485, grad_norm: 4.511684 +Steps: 0%| | 887/1000000 [03:41<67:33:41, 4.11it/s, grad_norm=4.51, loss_final=3.34, loss_mean=1.02, loss_mean_cls=2.53, proj_loss=-0.206][2026-03-23 13:40:01] Step: 887, Training Logs: loss_final: 2.894582, loss_mean: 1.001234, proj_loss: -0.216179, loss_mean_cls: 2.109527, grad_norm: 4.296713 +Steps: 0%| | 888/1000000 [03:41<67:36:25, 4.11it/s, grad_norm=4.3, loss_final=2.89, loss_mean=1, loss_mean_cls=2.11, proj_loss=-0.216][2026-03-23 13:40:01] Step: 888, Training Logs: loss_final: 3.685542, loss_mean: 0.968721, proj_loss: -0.209224, loss_mean_cls: 2.926045, grad_norm: 4.128066 +Steps: 0%| | 889/1000000 [03:42<67:36:48, 4.10it/s, grad_norm=4.13, loss_final=3.69, loss_mean=0.969, loss_mean_cls=2.93, proj_loss=-0.209][2026-03-23 13:40:02] Step: 889, Training Logs: loss_final: 3.305892, loss_mean: 0.991842, proj_loss: -0.209635, loss_mean_cls: 2.523684, grad_norm: 3.315322 +Steps: 0%| | 890/1000000 [03:42<67:37:09, 4.10it/s, grad_norm=3.32, loss_final=3.31, loss_mean=0.992, loss_mean_cls=2.52, proj_loss=-0.21][2026-03-23 13:40:02] Step: 890, Training Logs: loss_final: 3.501468, loss_mean: 0.974478, proj_loss: -0.209670, loss_mean_cls: 2.736661, grad_norm: 3.978948 +Steps: 0%| | 891/1000000 [03:42<67:36:13, 4.11it/s, grad_norm=3.98, loss_final=3.5, loss_mean=0.974, loss_mean_cls=2.74, proj_loss=-0.21][2026-03-23 13:40:02] Step: 891, Training Logs: loss_final: 3.143916, loss_mean: 0.984226, proj_loss: -0.211816, loss_mean_cls: 2.371506, grad_norm: 4.319946 +Steps: 0%| | 892/1000000 [03:42<67:38:20, 4.10it/s, grad_norm=4.32, loss_final=3.14, loss_mean=0.984, loss_mean_cls=2.37, proj_loss=-0.212][2026-03-23 13:40:02] Step: 892, Training Logs: loss_final: 3.252292, loss_mean: 0.986972, proj_loss: -0.211556, loss_mean_cls: 2.476876, grad_norm: 7.067959 +Steps: 0%| | 893/1000000 [03:43<67:37:47, 4.10it/s, grad_norm=7.07, loss_final=3.25, loss_mean=0.987, loss_mean_cls=2.48, proj_loss=-0.212][2026-03-23 13:40:03] Step: 893, Training Logs: loss_final: 3.677285, loss_mean: 0.989933, proj_loss: -0.211880, loss_mean_cls: 2.899232, grad_norm: 6.677636 +Steps: 0%| | 894/1000000 [03:43<67:36:26, 4.11it/s, grad_norm=6.68, loss_final=3.68, loss_mean=0.99, loss_mean_cls=2.9, proj_loss=-0.212][2026-03-23 13:40:03] Step: 894, Training Logs: loss_final: 3.577920, loss_mean: 0.963703, proj_loss: -0.214765, loss_mean_cls: 2.828982, grad_norm: 7.697509 +Steps: 0%| | 895/1000000 [03:43<67:36:27, 4.10it/s, grad_norm=7.7, loss_final=3.58, loss_mean=0.964, loss_mean_cls=2.83, proj_loss=-0.215][2026-03-23 13:40:03] Step: 895, Training Logs: loss_final: 3.420432, loss_mean: 0.979769, proj_loss: -0.215066, loss_mean_cls: 2.655730, grad_norm: 6.250987 +Steps: 0%| | 896/1000000 [03:43<67:37:01, 4.10it/s, grad_norm=6.25, loss_final=3.42, 
loss_mean=0.98, loss_mean_cls=2.66, proj_loss=-0.215][2026-03-23 13:40:03] Step: 896, Training Logs: loss_final: 3.646176, loss_mean: 0.980433, proj_loss: -0.216663, loss_mean_cls: 2.882407, grad_norm: 7.153473 +Steps: 0%| | 897/1000000 [03:44<67:36:15, 4.11it/s, grad_norm=7.15, loss_final=3.65, loss_mean=0.98, loss_mean_cls=2.88, proj_loss=-0.217][2026-03-23 13:40:04] Step: 897, Training Logs: loss_final: 3.329418, loss_mean: 0.981956, proj_loss: -0.214473, loss_mean_cls: 2.561934, grad_norm: 3.536794 +Steps: 0%| | 898/1000000 [03:44<67:35:03, 4.11it/s, grad_norm=3.54, loss_final=3.33, loss_mean=0.982, loss_mean_cls=2.56, proj_loss=-0.214][2026-03-23 13:40:04] Step: 898, Training Logs: loss_final: 3.160993, loss_mean: 0.983668, proj_loss: -0.219419, loss_mean_cls: 2.396744, grad_norm: 4.180305 +Steps: 0%| | 899/1000000 [03:44<67:35:45, 4.11it/s, grad_norm=4.18, loss_final=3.16, loss_mean=0.984, loss_mean_cls=2.4, proj_loss=-0.219][2026-03-23 13:40:04] Step: 899, Training Logs: loss_final: 3.046375, loss_mean: 0.990698, proj_loss: -0.220563, loss_mean_cls: 2.276240, grad_norm: 3.839815 +Steps: 0%| | 900/1000000 [03:44<67:39:43, 4.10it/s, grad_norm=3.84, loss_final=3.05, loss_mean=0.991, loss_mean_cls=2.28, proj_loss=-0.221][2026-03-23 13:40:04] Step: 900, Training Logs: loss_final: 3.152389, loss_mean: 0.971785, proj_loss: -0.221472, loss_mean_cls: 2.402076, grad_norm: 4.973435 +Steps: 0%| | 901/1000000 [03:45<67:38:17, 4.10it/s, grad_norm=4.97, loss_final=3.15, loss_mean=0.972, loss_mean_cls=2.4, proj_loss=-0.221][2026-03-23 13:40:05] Step: 901, Training Logs: loss_final: 3.399503, loss_mean: 0.986146, proj_loss: -0.221496, loss_mean_cls: 2.634853, grad_norm: 6.106980 +Steps: 0%| | 902/1000000 [03:45<67:36:42, 4.10it/s, grad_norm=6.11, loss_final=3.4, loss_mean=0.986, loss_mean_cls=2.63, proj_loss=-0.221][2026-03-23 13:40:05] Step: 902, Training Logs: loss_final: 3.538857, loss_mean: 0.961910, proj_loss: -0.212881, loss_mean_cls: 2.789828, grad_norm: 5.608520 +Steps: 0%| | 903/1000000 [03:45<67:36:24, 4.11it/s, grad_norm=5.61, loss_final=3.54, loss_mean=0.962, loss_mean_cls=2.79, proj_loss=-0.213][2026-03-23 13:40:05] Step: 903, Training Logs: loss_final: 3.457092, loss_mean: 0.974264, proj_loss: -0.219082, loss_mean_cls: 2.701911, grad_norm: 5.271274 +Steps: 0%| | 904/1000000 [03:45<67:33:55, 4.11it/s, grad_norm=5.27, loss_final=3.46, loss_mean=0.974, loss_mean_cls=2.7, proj_loss=-0.219][2026-03-23 13:40:05] Step: 904, Training Logs: loss_final: 3.549177, loss_mean: 0.969089, proj_loss: -0.218387, loss_mean_cls: 2.798475, grad_norm: 9.104020 +Steps: 0%| | 905/1000000 [03:46<67:35:58, 4.11it/s, grad_norm=9.1, loss_final=3.55, loss_mean=0.969, loss_mean_cls=2.8, proj_loss=-0.218][2026-03-23 13:40:06] Step: 905, Training Logs: loss_final: 3.607774, loss_mean: 0.959325, proj_loss: -0.220977, loss_mean_cls: 2.869426, grad_norm: 4.537321 +Steps: 0%| | 906/1000000 [03:46<67:35:57, 4.11it/s, grad_norm=4.54, loss_final=3.61, loss_mean=0.959, loss_mean_cls=2.87, proj_loss=-0.221][2026-03-23 13:40:06] Step: 906, Training Logs: loss_final: 3.535569, loss_mean: 0.952295, proj_loss: -0.222043, loss_mean_cls: 2.805317, grad_norm: 4.444352 +Steps: 0%| | 907/1000000 [03:46<67:35:33, 4.11it/s, grad_norm=4.44, loss_final=3.54, loss_mean=0.952, loss_mean_cls=2.81, proj_loss=-0.222][2026-03-23 13:40:06] Step: 907, Training Logs: loss_final: 3.752468, loss_mean: 0.948446, proj_loss: -0.220944, loss_mean_cls: 3.024966, grad_norm: 4.727448 +Steps: 0%| | 908/1000000 [03:46<67:37:05, 4.10it/s, grad_norm=4.73, 
loss_final=3.75, loss_mean=0.948, loss_mean_cls=3.02, proj_loss=-0.221]
[2026-03-23 13:40:06] Step: 908, Training Logs: loss_final: 3.573634, loss_mean: 0.949597, proj_loss: -0.220110, loss_mean_cls: 2.844147, grad_norm: 6.167300
... [steps 909-999 elided: every step logs the same fields; throughput holds at ~4.11 it/s, so the tqdm ETA for the full 1,000,000 steps stays at ~67.5 h] ...
[2026-03-23 13:40:29] Step: 1000, Training Logs: loss_final: 3.301492, loss_mean: 0.992805, proj_loss: -0.261193, loss_mean_cls: 2.569880, grad_norm: 5.470813
... [steps 1001-1143 elided; across steps 908-1145, loss_mean stays flat at ~0.93-1.07, loss_mean_cls fluctuates between ~1.76 and ~3.30, proj_loss drifts steadily from ~-0.22 down to ~-0.31, grad_norm swings between ~2.3 and ~15.3, and loss_final ranges ~2.54-3.99] ...
[2026-03-23 13:41:04] Step: 1144, Training Logs: loss_final: 3.200696, loss_mean: 0.964885, proj_loss: -0.305475, loss_mean_cls: 2.541285, grad_norm: 6.571102
[2026-03-23 13:41:04] Step: 1145, Training Logs: loss_final: 2.710406, loss_mean: 0.968347, proj_loss: -0.306200, loss_mean_cls: 2.048259,
grad_norm: 3.441562 +Steps: 0%| | 1146/1000000 [04:44<67:33:21, 4.11it/s, grad_norm=3.44, loss_final=2.71, loss_mean=0.968, loss_mean_cls=2.05, proj_loss=-0.306][2026-03-23 13:41:04] Step: 1146, Training Logs: loss_final: 2.778702, loss_mean: 0.966317, proj_loss: -0.310879, loss_mean_cls: 2.123264, grad_norm: 3.920933 +Steps: 0%| | 1147/1000000 [04:45<67:35:49, 4.10it/s, grad_norm=3.92, loss_final=2.78, loss_mean=0.966, loss_mean_cls=2.12, proj_loss=-0.311][2026-03-23 13:41:05] Step: 1147, Training Logs: loss_final: 2.931630, loss_mean: 0.977378, proj_loss: -0.304096, loss_mean_cls: 2.258348, grad_norm: 3.002802 +Steps: 0%| | 1148/1000000 [04:45<67:36:31, 4.10it/s, grad_norm=3, loss_final=2.93, loss_mean=0.977, loss_mean_cls=2.26, proj_loss=-0.304][2026-03-23 13:41:05] Step: 1148, Training Logs: loss_final: 3.611728, loss_mean: 0.937038, proj_loss: -0.309305, loss_mean_cls: 2.983994, grad_norm: 5.017470 +Steps: 0%| | 1149/1000000 [04:45<67:36:21, 4.10it/s, grad_norm=5.02, loss_final=3.61, loss_mean=0.937, loss_mean_cls=2.98, proj_loss=-0.309][2026-03-23 13:41:05] Step: 1149, Training Logs: loss_final: 3.386103, loss_mean: 0.950982, proj_loss: -0.305675, loss_mean_cls: 2.740796, grad_norm: 4.367815 +Steps: 0%| | 1150/1000000 [04:45<67:37:10, 4.10it/s, grad_norm=4.37, loss_final=3.39, loss_mean=0.951, loss_mean_cls=2.74, proj_loss=-0.306][2026-03-23 13:41:05] Step: 1150, Training Logs: loss_final: 3.148691, loss_mean: 0.985736, proj_loss: -0.308656, loss_mean_cls: 2.471610, grad_norm: 8.273145 +Steps: 0%| | 1151/1000000 [04:46<67:36:52, 4.10it/s, grad_norm=8.27, loss_final=3.15, loss_mean=0.986, loss_mean_cls=2.47, proj_loss=-0.309][2026-03-23 13:41:06] Step: 1151, Training Logs: loss_final: 3.283611, loss_mean: 0.965401, proj_loss: -0.306023, loss_mean_cls: 2.624232, grad_norm: 6.286378 +Steps: 0%| | 1152/1000000 [04:46<67:38:22, 4.10it/s, grad_norm=6.29, loss_final=3.28, loss_mean=0.965, loss_mean_cls=2.62, proj_loss=-0.306][2026-03-23 13:41:06] Step: 1152, Training Logs: loss_final: 2.984543, loss_mean: 0.980214, proj_loss: -0.307263, loss_mean_cls: 2.311592, grad_norm: 9.795528 +Steps: 0%| | 1153/1000000 [04:46<67:36:41, 4.10it/s, grad_norm=9.8, loss_final=2.98, loss_mean=0.98, loss_mean_cls=2.31, proj_loss=-0.307][2026-03-23 13:41:06] Step: 1153, Training Logs: loss_final: 2.941401, loss_mean: 0.959978, proj_loss: -0.303601, loss_mean_cls: 2.285025, grad_norm: 4.893495 +Steps: 0%| | 1154/1000000 [04:46<67:36:07, 4.10it/s, grad_norm=4.89, loss_final=2.94, loss_mean=0.96, loss_mean_cls=2.29, proj_loss=-0.304][2026-03-23 13:41:06] Step: 1154, Training Logs: loss_final: 3.189877, loss_mean: 0.946574, proj_loss: -0.310278, loss_mean_cls: 2.553580, grad_norm: 8.879821 +Steps: 0%| | 1155/1000000 [04:47<67:36:07, 4.10it/s, grad_norm=8.88, loss_final=3.19, loss_mean=0.947, loss_mean_cls=2.55, proj_loss=-0.31][2026-03-23 13:41:07] Step: 1155, Training Logs: loss_final: 2.937415, loss_mean: 0.965004, proj_loss: -0.305738, loss_mean_cls: 2.278149, grad_norm: 3.634586 +Steps: 0%| | 1156/1000000 [04:47<67:35:37, 4.10it/s, grad_norm=3.63, loss_final=2.94, loss_mean=0.965, loss_mean_cls=2.28, proj_loss=-0.306][2026-03-23 13:41:07] Step: 1156, Training Logs: loss_final: 3.346327, loss_mean: 0.955134, proj_loss: -0.306425, loss_mean_cls: 2.697618, grad_norm: 7.436550 +Steps: 0%| | 1157/1000000 [04:47<67:34:02, 4.11it/s, grad_norm=7.44, loss_final=3.35, loss_mean=0.955, loss_mean_cls=2.7, proj_loss=-0.306][2026-03-23 13:41:07] Step: 1157, Training Logs: loss_final: 3.097135, loss_mean: 0.983232, 
proj_loss: -0.305245, loss_mean_cls: 2.419148, grad_norm: 3.397789 +Steps: 0%| | 1158/1000000 [04:47<67:34:33, 4.11it/s, grad_norm=3.4, loss_final=3.1, loss_mean=0.983, loss_mean_cls=2.42, proj_loss=-0.305][2026-03-23 13:41:07] Step: 1158, Training Logs: loss_final: 3.144958, loss_mean: 0.970536, proj_loss: -0.311581, loss_mean_cls: 2.486002, grad_norm: 5.786127 +Steps: 0%| | 1159/1000000 [04:48<67:36:46, 4.10it/s, grad_norm=5.79, loss_final=3.14, loss_mean=0.971, loss_mean_cls=2.49, proj_loss=-0.312][2026-03-23 13:41:08] Step: 1159, Training Logs: loss_final: 2.784011, loss_mean: 0.980198, proj_loss: -0.310452, loss_mean_cls: 2.114265, grad_norm: 3.328309 +Steps: 0%| | 1160/1000000 [04:48<67:39:58, 4.10it/s, grad_norm=3.33, loss_final=2.78, loss_mean=0.98, loss_mean_cls=2.11, proj_loss=-0.31][2026-03-23 13:41:08] Step: 1160, Training Logs: loss_final: 2.884273, loss_mean: 0.972301, proj_loss: -0.310336, loss_mean_cls: 2.222308, grad_norm: 9.640995 +Steps: 0%| | 1161/1000000 [04:48<67:37:06, 4.10it/s, grad_norm=9.64, loss_final=2.88, loss_mean=0.972, loss_mean_cls=2.22, proj_loss=-0.31][2026-03-23 13:41:08] Step: 1161, Training Logs: loss_final: 3.106716, loss_mean: 0.966600, proj_loss: -0.317846, loss_mean_cls: 2.457962, grad_norm: 7.205474 +Steps: 0%| | 1162/1000000 [04:48<67:34:20, 4.11it/s, grad_norm=7.21, loss_final=3.11, loss_mean=0.967, loss_mean_cls=2.46, proj_loss=-0.318][2026-03-23 13:41:08] Step: 1162, Training Logs: loss_final: 3.429278, loss_mean: 0.943274, proj_loss: -0.303549, loss_mean_cls: 2.789554, grad_norm: 16.442324 +Steps: 0%| | 1163/1000000 [04:49<67:35:20, 4.11it/s, grad_norm=16.4, loss_final=3.43, loss_mean=0.943, loss_mean_cls=2.79, proj_loss=-0.304][2026-03-23 13:41:09] Step: 1163, Training Logs: loss_final: 3.387297, loss_mean: 0.964314, proj_loss: -0.303861, loss_mean_cls: 2.726845, grad_norm: 6.851428 +Steps: 0%| | 1164/1000000 [04:49<67:33:45, 4.11it/s, grad_norm=6.85, loss_final=3.39, loss_mean=0.964, loss_mean_cls=2.73, proj_loss=-0.304][2026-03-23 13:41:09] Step: 1164, Training Logs: loss_final: 2.603923, loss_mean: 1.011319, proj_loss: -0.314792, loss_mean_cls: 1.907397, grad_norm: 7.183140 +Steps: 0%| | 1165/1000000 [04:49<67:31:47, 4.11it/s, grad_norm=7.18, loss_final=2.6, loss_mean=1.01, loss_mean_cls=1.91, proj_loss=-0.315][2026-03-23 13:41:09] Step: 1165, Training Logs: loss_final: 3.656082, loss_mean: 0.942614, proj_loss: -0.298416, loss_mean_cls: 3.011884, grad_norm: 3.033975 +Steps: 0%| | 1166/1000000 [04:49<67:31:22, 4.11it/s, grad_norm=3.03, loss_final=3.66, loss_mean=0.943, loss_mean_cls=3.01, proj_loss=-0.298][2026-03-23 13:41:09] Step: 1166, Training Logs: loss_final: 2.964319, loss_mean: 0.991931, proj_loss: -0.315095, loss_mean_cls: 2.287483, grad_norm: 2.842756 +Steps: 0%| | 1167/1000000 [04:50<67:31:59, 4.11it/s, grad_norm=2.84, loss_final=2.96, loss_mean=0.992, loss_mean_cls=2.29, proj_loss=-0.315][2026-03-23 13:41:09] Step: 1167, Training Logs: loss_final: 3.104850, loss_mean: 0.947732, proj_loss: -0.313431, loss_mean_cls: 2.470549, grad_norm: 5.440420 +Steps: 0%| | 1168/1000000 [04:50<67:30:14, 4.11it/s, grad_norm=5.44, loss_final=3.1, loss_mean=0.948, loss_mean_cls=2.47, proj_loss=-0.313][2026-03-23 13:41:10] Step: 1168, Training Logs: loss_final: 3.404697, loss_mean: 0.964380, proj_loss: -0.306463, loss_mean_cls: 2.746781, grad_norm: 10.485926 +Steps: 0%| | 1169/1000000 [04:50<68:54:48, 4.03it/s, grad_norm=10.5, loss_final=3.4, loss_mean=0.964, loss_mean_cls=2.75, proj_loss=-0.306][2026-03-23 13:41:10] Step: 1169, Training Logs: 
loss_final: 3.357345, loss_mean: 0.971420, proj_loss: -0.298149, loss_mean_cls: 2.684073, grad_norm: 8.187978 +Steps: 0%| | 1170/1000000 [04:50<68:42:36, 4.04it/s, grad_norm=8.19, loss_final=3.36, loss_mean=0.971, loss_mean_cls=2.68, proj_loss=-0.298][2026-03-23 13:41:10] Step: 1170, Training Logs: loss_final: 2.732413, loss_mean: 0.998781, proj_loss: -0.307721, loss_mean_cls: 2.041353, grad_norm: 5.660029 +Steps: 0%| | 1171/1000000 [04:51<68:20:56, 4.06it/s, grad_norm=5.66, loss_final=2.73, loss_mean=0.999, loss_mean_cls=2.04, proj_loss=-0.308][2026-03-23 13:41:10] Step: 1171, Training Logs: loss_final: 3.050670, loss_mean: 0.991759, proj_loss: -0.308305, loss_mean_cls: 2.367216, grad_norm: 10.553487 +Steps: 0%| | 1172/1000000 [04:51<68:05:20, 4.07it/s, grad_norm=10.6, loss_final=3.05, loss_mean=0.992, loss_mean_cls=2.37, proj_loss=-0.308][2026-03-23 13:41:11] Step: 1172, Training Logs: loss_final: 3.240204, loss_mean: 1.002240, proj_loss: -0.305301, loss_mean_cls: 2.543265, grad_norm: 4.509305 +Steps: 0%| | 1173/1000000 [04:51<67:54:23, 4.09it/s, grad_norm=4.51, loss_final=3.24, loss_mean=1, loss_mean_cls=2.54, proj_loss=-0.305][2026-03-23 13:41:11] Step: 1173, Training Logs: loss_final: 3.352740, loss_mean: 0.973309, proj_loss: -0.307150, loss_mean_cls: 2.686581, grad_norm: 8.499378 +Steps: 0%| | 1174/1000000 [04:51<67:46:42, 4.09it/s, grad_norm=8.5, loss_final=3.35, loss_mean=0.973, loss_mean_cls=2.69, proj_loss=-0.307][2026-03-23 13:41:11] Step: 1174, Training Logs: loss_final: 3.491694, loss_mean: 0.982257, proj_loss: -0.301494, loss_mean_cls: 2.810932, grad_norm: 5.739394 +Steps: 0%| | 1175/1000000 [04:51<67:44:41, 4.10it/s, grad_norm=5.74, loss_final=3.49, loss_mean=0.982, loss_mean_cls=2.81, proj_loss=-0.301][2026-03-23 13:41:11] Step: 1175, Training Logs: loss_final: 3.126083, loss_mean: 0.962616, proj_loss: -0.308281, loss_mean_cls: 2.471748, grad_norm: 7.607542 +Steps: 0%| | 1176/1000000 [04:52<67:41:53, 4.10it/s, grad_norm=7.61, loss_final=3.13, loss_mean=0.963, loss_mean_cls=2.47, proj_loss=-0.308][2026-03-23 13:41:12] Step: 1176, Training Logs: loss_final: 3.242850, loss_mean: 0.980312, proj_loss: -0.305879, loss_mean_cls: 2.568417, grad_norm: 7.072228 +Steps: 0%| | 1177/1000000 [04:52<67:44:55, 4.10it/s, grad_norm=7.07, loss_final=3.24, loss_mean=0.98, loss_mean_cls=2.57, proj_loss=-0.306][2026-03-23 13:41:12] Step: 1177, Training Logs: loss_final: 2.972026, loss_mean: 0.959630, proj_loss: -0.316187, loss_mean_cls: 2.328582, grad_norm: 9.089021 +Steps: 0%| | 1178/1000000 [04:52<67:42:40, 4.10it/s, grad_norm=9.09, loss_final=2.97, loss_mean=0.96, loss_mean_cls=2.33, proj_loss=-0.316][2026-03-23 13:41:12] Step: 1178, Training Logs: loss_final: 3.188125, loss_mean: 0.967784, proj_loss: -0.310969, loss_mean_cls: 2.531309, grad_norm: 8.947553 +Steps: 0%| | 1179/1000000 [04:52<67:41:42, 4.10it/s, grad_norm=8.95, loss_final=3.19, loss_mean=0.968, loss_mean_cls=2.53, proj_loss=-0.311][2026-03-23 13:41:12] Step: 1179, Training Logs: loss_final: 2.948331, loss_mean: 0.993731, proj_loss: -0.310780, loss_mean_cls: 2.265380, grad_norm: 2.613205 +Steps: 0%| | 1180/1000000 [04:53<67:38:27, 4.10it/s, grad_norm=2.61, loss_final=2.95, loss_mean=0.994, loss_mean_cls=2.27, proj_loss=-0.311][2026-03-23 13:41:13] Step: 1180, Training Logs: loss_final: 3.129437, loss_mean: 0.959849, proj_loss: -0.307101, loss_mean_cls: 2.476689, grad_norm: 8.674301 +Steps: 0%| | 1181/1000000 [04:53<67:42:52, 4.10it/s, grad_norm=8.67, loss_final=3.13, loss_mean=0.96, loss_mean_cls=2.48, 
proj_loss=-0.307][2026-03-23 13:41:13] Step: 1181, Training Logs: loss_final: 3.357108, loss_mean: 0.975805, proj_loss: -0.305347, loss_mean_cls: 2.686650, grad_norm: 11.458467 +Steps: 0%| | 1182/1000000 [04:53<67:38:51, 4.10it/s, grad_norm=11.5, loss_final=3.36, loss_mean=0.976, loss_mean_cls=2.69, proj_loss=-0.305][2026-03-23 13:41:13] Step: 1182, Training Logs: loss_final: 3.536065, loss_mean: 0.976960, proj_loss: -0.304159, loss_mean_cls: 2.863263, grad_norm: 6.162800 +Steps: 0%| | 1183/1000000 [04:53<67:37:36, 4.10it/s, grad_norm=6.16, loss_final=3.54, loss_mean=0.977, loss_mean_cls=2.86, proj_loss=-0.304][2026-03-23 13:41:13] Step: 1183, Training Logs: loss_final: 2.911883, loss_mean: 0.983940, proj_loss: -0.311630, loss_mean_cls: 2.239572, grad_norm: 10.415386 +Steps: 0%| | 1184/1000000 [04:54<67:34:13, 4.11it/s, grad_norm=10.4, loss_final=2.91, loss_mean=0.984, loss_mean_cls=2.24, proj_loss=-0.312][2026-03-23 13:41:14] Step: 1184, Training Logs: loss_final: 3.295963, loss_mean: 0.990401, proj_loss: -0.300673, loss_mean_cls: 2.606235, grad_norm: 7.965898 +Steps: 0%| | 1185/1000000 [04:54<67:40:40, 4.10it/s, grad_norm=7.97, loss_final=3.3, loss_mean=0.99, loss_mean_cls=2.61, proj_loss=-0.301][2026-03-23 13:41:14] Step: 1185, Training Logs: loss_final: 3.600784, loss_mean: 0.966195, proj_loss: -0.301124, loss_mean_cls: 2.935713, grad_norm: 8.708016 +Steps: 0%| | 1186/1000000 [04:54<67:38:10, 4.10it/s, grad_norm=8.71, loss_final=3.6, loss_mean=0.966, loss_mean_cls=2.94, proj_loss=-0.301][2026-03-23 13:41:14] Step: 1186, Training Logs: loss_final: 3.057520, loss_mean: 0.964587, proj_loss: -0.310702, loss_mean_cls: 2.403635, grad_norm: 4.525071 +Steps: 0%| | 1187/1000000 [04:54<67:37:15, 4.10it/s, grad_norm=4.53, loss_final=3.06, loss_mean=0.965, loss_mean_cls=2.4, proj_loss=-0.311][2026-03-23 13:41:14] Step: 1187, Training Logs: loss_final: 2.587389, loss_mean: 1.001922, proj_loss: -0.322733, loss_mean_cls: 1.908200, grad_norm: 7.506896 +Steps: 0%| | 1188/1000000 [04:55<67:35:34, 4.10it/s, grad_norm=7.51, loss_final=2.59, loss_mean=1, loss_mean_cls=1.91, proj_loss=-0.323][2026-03-23 13:41:15] Step: 1188, Training Logs: loss_final: 3.107550, loss_mean: 0.976803, proj_loss: -0.312378, loss_mean_cls: 2.443125, grad_norm: 7.608323 +Steps: 0%| | 1189/1000000 [04:55<67:39:36, 4.10it/s, grad_norm=7.61, loss_final=3.11, loss_mean=0.977, loss_mean_cls=2.44, proj_loss=-0.312][2026-03-23 13:41:15] Step: 1189, Training Logs: loss_final: 3.365309, loss_mean: 0.966595, proj_loss: -0.310828, loss_mean_cls: 2.709542, grad_norm: 6.869634 +Steps: 0%| | 1190/1000000 [04:55<67:37:16, 4.10it/s, grad_norm=6.87, loss_final=3.37, loss_mean=0.967, loss_mean_cls=2.71, proj_loss=-0.311][2026-03-23 13:41:15] Step: 1190, Training Logs: loss_final: 3.399628, loss_mean: 0.974844, proj_loss: -0.307776, loss_mean_cls: 2.732560, grad_norm: 5.826264 +Steps: 0%| | 1191/1000000 [04:55<67:35:04, 4.11it/s, grad_norm=5.83, loss_final=3.4, loss_mean=0.975, loss_mean_cls=2.73, proj_loss=-0.308][2026-03-23 13:41:15] Step: 1191, Training Logs: loss_final: 2.653247, loss_mean: 0.967131, proj_loss: -0.318697, loss_mean_cls: 2.004813, grad_norm: 9.999562 +Steps: 0%| | 1192/1000000 [04:56<67:35:17, 4.10it/s, grad_norm=10, loss_final=2.65, loss_mean=0.967, loss_mean_cls=2, proj_loss=-0.319][2026-03-23 13:41:16] Step: 1192, Training Logs: loss_final: 3.239798, loss_mean: 0.980385, proj_loss: -0.309173, loss_mean_cls: 2.568585, grad_norm: 7.697807 +Steps: 0%| | 1193/1000000 [04:56<67:39:59, 4.10it/s, grad_norm=7.7, loss_final=3.24, 
loss_mean=0.98, loss_mean_cls=2.57, proj_loss=-0.309][2026-03-23 13:41:16] Step: 1193, Training Logs: loss_final: 2.957490, loss_mean: 0.983078, proj_loss: -0.312185, loss_mean_cls: 2.286597, grad_norm: 8.626239 +Steps: 0%| | 1194/1000000 [04:56<67:36:31, 4.10it/s, grad_norm=8.63, loss_final=2.96, loss_mean=0.983, loss_mean_cls=2.29, proj_loss=-0.312][2026-03-23 13:41:16] Step: 1194, Training Logs: loss_final: 3.185610, loss_mean: 0.958733, proj_loss: -0.316006, loss_mean_cls: 2.542883, grad_norm: 5.799075 +Steps: 0%| | 1195/1000000 [04:56<67:35:23, 4.10it/s, grad_norm=5.8, loss_final=3.19, loss_mean=0.959, loss_mean_cls=2.54, proj_loss=-0.316][2026-03-23 13:41:16] Step: 1195, Training Logs: loss_final: 3.028419, loss_mean: 0.963849, proj_loss: -0.314975, loss_mean_cls: 2.379545, grad_norm: 2.975310 +Steps: 0%| | 1196/1000000 [04:57<67:32:50, 4.11it/s, grad_norm=2.98, loss_final=3.03, loss_mean=0.964, loss_mean_cls=2.38, proj_loss=-0.315][2026-03-23 13:41:17] Step: 1196, Training Logs: loss_final: 3.162929, loss_mean: 0.981684, proj_loss: -0.313263, loss_mean_cls: 2.494507, grad_norm: 5.477049 +Steps: 0%| | 1197/1000000 [04:57<67:38:16, 4.10it/s, grad_norm=5.48, loss_final=3.16, loss_mean=0.982, loss_mean_cls=2.49, proj_loss=-0.313][2026-03-23 13:41:17] Step: 1197, Training Logs: loss_final: 3.218119, loss_mean: 0.968290, proj_loss: -0.314606, loss_mean_cls: 2.564435, grad_norm: 8.893763 +Steps: 0%| | 1198/1000000 [04:57<67:37:20, 4.10it/s, grad_norm=8.89, loss_final=3.22, loss_mean=0.968, loss_mean_cls=2.56, proj_loss=-0.315][2026-03-23 13:41:17] Step: 1198, Training Logs: loss_final: 3.392535, loss_mean: 0.962419, proj_loss: -0.306402, loss_mean_cls: 2.736519, grad_norm: 3.588887 +Steps: 0%| | 1199/1000000 [04:57<67:35:49, 4.10it/s, grad_norm=3.59, loss_final=3.39, loss_mean=0.962, loss_mean_cls=2.74, proj_loss=-0.306][2026-03-23 13:41:17] Step: 1199, Training Logs: loss_final: 3.207316, loss_mean: 0.958969, proj_loss: -0.314719, loss_mean_cls: 2.563066, grad_norm: 2.965649 +Steps: 0%| | 1200/1000000 [04:58<68:34:03, 4.05it/s, grad_norm=2.97, loss_final=3.21, loss_mean=0.959, loss_mean_cls=2.56, proj_loss=-0.315][2026-03-23 13:41:18] Step: 1200, Training Logs: loss_final: 3.169437, loss_mean: 0.973104, proj_loss: -0.317610, loss_mean_cls: 2.513943, grad_norm: 11.355494 +Steps: 0%| | 1201/1000000 [04:58<68:20:57, 4.06it/s, grad_norm=11.4, loss_final=3.17, loss_mean=0.973, loss_mean_cls=2.51, proj_loss=-0.318][2026-03-23 13:41:18] Step: 1201, Training Logs: loss_final: 3.050124, loss_mean: 0.969534, proj_loss: -0.316087, loss_mean_cls: 2.396678, grad_norm: 10.222289 +Steps: 0%| | 1202/1000000 [04:58<68:10:22, 4.07it/s, grad_norm=10.2, loss_final=3.05, loss_mean=0.97, loss_mean_cls=2.4, proj_loss=-0.316][2026-03-23 13:41:18] Step: 1202, Training Logs: loss_final: 3.193627, loss_mean: 0.937733, proj_loss: -0.310037, loss_mean_cls: 2.565931, grad_norm: 8.652317 +Steps: 0%| | 1203/1000000 [04:58<68:02:12, 4.08it/s, grad_norm=8.65, loss_final=3.19, loss_mean=0.938, loss_mean_cls=2.57, proj_loss=-0.31][2026-03-23 13:41:18] Step: 1203, Training Logs: loss_final: 2.923999, loss_mean: 0.978131, proj_loss: -0.310015, loss_mean_cls: 2.255883, grad_norm: 4.330058 +Steps: 0%| | 1204/1000000 [04:59<67:56:18, 4.08it/s, grad_norm=4.33, loss_final=2.92, loss_mean=0.978, loss_mean_cls=2.26, proj_loss=-0.31][2026-03-23 13:41:19] Step: 1204, Training Logs: loss_final: 3.119836, loss_mean: 0.955927, proj_loss: -0.320572, loss_mean_cls: 2.484481, grad_norm: 8.920811 +Steps: 0%| | 1205/1000000 [04:59<67:53:57, 
4.09it/s, grad_norm=8.92, loss_final=3.12, loss_mean=0.956, loss_mean_cls=2.48, proj_loss=-0.321][2026-03-23 13:41:19] Step: 1205, Training Logs: loss_final: 2.952278, loss_mean: 0.968223, proj_loss: -0.319109, loss_mean_cls: 2.303164, grad_norm: 8.212152 +Steps: 0%| | 1206/1000000 [04:59<67:46:54, 4.09it/s, grad_norm=8.21, loss_final=2.95, loss_mean=0.968, loss_mean_cls=2.3, proj_loss=-0.319][2026-03-23 13:41:19] Step: 1206, Training Logs: loss_final: 2.609036, loss_mean: 0.996266, proj_loss: -0.318444, loss_mean_cls: 1.931214, grad_norm: 3.375917 +Steps: 0%| | 1207/1000000 [04:59<67:41:36, 4.10it/s, grad_norm=3.38, loss_final=2.61, loss_mean=0.996, loss_mean_cls=1.93, proj_loss=-0.318][2026-03-23 13:41:19] Step: 1207, Training Logs: loss_final: 3.103538, loss_mean: 0.963463, proj_loss: -0.317993, loss_mean_cls: 2.458068, grad_norm: 4.266016 +Steps: 0%| | 1208/1000000 [05:00<67:39:20, 4.10it/s, grad_norm=4.27, loss_final=3.1, loss_mean=0.963, loss_mean_cls=2.46, proj_loss=-0.318][2026-03-23 13:41:19] Step: 1208, Training Logs: loss_final: 3.379039, loss_mean: 0.960727, proj_loss: -0.307600, loss_mean_cls: 2.725912, grad_norm: 5.477923 +Steps: 0%| | 1209/1000000 [05:00<67:40:10, 4.10it/s, grad_norm=5.48, loss_final=3.38, loss_mean=0.961, loss_mean_cls=2.73, proj_loss=-0.308][2026-03-23 13:41:20] Step: 1209, Training Logs: loss_final: 3.300223, loss_mean: 0.957266, proj_loss: -0.310416, loss_mean_cls: 2.653374, grad_norm: 3.509273 +Steps: 0%| | 1210/1000000 [05:00<67:38:19, 4.10it/s, grad_norm=3.51, loss_final=3.3, loss_mean=0.957, loss_mean_cls=2.65, proj_loss=-0.31][2026-03-23 13:41:20] Step: 1210, Training Logs: loss_final: 3.549890, loss_mean: 0.954470, proj_loss: -0.311580, loss_mean_cls: 2.907001, grad_norm: 4.929626 +Steps: 0%| | 1211/1000000 [05:00<67:36:23, 4.10it/s, grad_norm=4.93, loss_final=3.55, loss_mean=0.954, loss_mean_cls=2.91, proj_loss=-0.312][2026-03-23 13:41:20] Step: 1211, Training Logs: loss_final: 2.887236, loss_mean: 0.966159, proj_loss: -0.321308, loss_mean_cls: 2.242384, grad_norm: 9.152077 +Steps: 0%| | 1212/1000000 [05:01<67:40:26, 4.10it/s, grad_norm=9.15, loss_final=2.89, loss_mean=0.966, loss_mean_cls=2.24, proj_loss=-0.321][2026-03-23 13:41:20] Step: 1212, Training Logs: loss_final: 3.436569, loss_mean: 0.942385, proj_loss: -0.306674, loss_mean_cls: 2.800858, grad_norm: 3.553532 +Steps: 0%| | 1213/1000000 [05:01<67:57:26, 4.08it/s, grad_norm=3.55, loss_final=3.44, loss_mean=0.942, loss_mean_cls=2.8, proj_loss=-0.307][2026-03-23 13:41:21] Step: 1213, Training Logs: loss_final: 3.483566, loss_mean: 0.932722, proj_loss: -0.315002, loss_mean_cls: 2.865846, grad_norm: 3.812054 +Steps: 0%| | 1214/1000000 [05:01<67:50:06, 4.09it/s, grad_norm=3.81, loss_final=3.48, loss_mean=0.933, loss_mean_cls=2.87, proj_loss=-0.315][2026-03-23 13:41:21] Step: 1214, Training Logs: loss_final: 3.288643, loss_mean: 0.970126, proj_loss: -0.310738, loss_mean_cls: 2.629254, grad_norm: 5.922238 +Steps: 0%| | 1215/1000000 [05:01<67:43:42, 4.10it/s, grad_norm=5.92, loss_final=3.29, loss_mean=0.97, loss_mean_cls=2.63, proj_loss=-0.311][2026-03-23 13:41:21] Step: 1215, Training Logs: loss_final: 2.773854, loss_mean: 0.961852, proj_loss: -0.319328, loss_mean_cls: 2.131330, grad_norm: 10.417454 +Steps: 0%| | 1216/1000000 [05:01<67:46:08, 4.09it/s, grad_norm=10.4, loss_final=2.77, loss_mean=0.962, loss_mean_cls=2.13, proj_loss=-0.319][2026-03-23 13:41:21] Step: 1216, Training Logs: loss_final: 3.716473, loss_mean: 0.939559, proj_loss: -0.305251, loss_mean_cls: 3.082164, grad_norm: 13.371938 
+Steps: 0%| | 1217/1000000 [05:02<67:47:48, 4.09it/s, grad_norm=13.4, loss_final=3.72, loss_mean=0.94, loss_mean_cls=3.08, proj_loss=-0.305][2026-03-23 13:41:22] Step: 1217, Training Logs: loss_final: 3.535335, loss_mean: 0.965693, proj_loss: -0.307895, loss_mean_cls: 2.877536, grad_norm: 12.535737 +Steps: 0%| | 1218/1000000 [05:02<67:43:37, 4.10it/s, grad_norm=12.5, loss_final=3.54, loss_mean=0.966, loss_mean_cls=2.88, proj_loss=-0.308][2026-03-23 13:41:22] Step: 1218, Training Logs: loss_final: 3.488383, loss_mean: 0.935452, proj_loss: -0.307219, loss_mean_cls: 2.860149, grad_norm: 7.833519 +Steps: 0%| | 1219/1000000 [05:02<67:43:29, 4.10it/s, grad_norm=7.83, loss_final=3.49, loss_mean=0.935, loss_mean_cls=2.86, proj_loss=-0.307][2026-03-23 13:41:22] Step: 1219, Training Logs: loss_final: 2.954226, loss_mean: 0.998661, proj_loss: -0.315121, loss_mean_cls: 2.270686, grad_norm: 8.632198 +Steps: 0%| | 1220/1000000 [05:02<67:42:04, 4.10it/s, grad_norm=8.63, loss_final=2.95, loss_mean=0.999, loss_mean_cls=2.27, proj_loss=-0.315][2026-03-23 13:41:22] Step: 1220, Training Logs: loss_final: 3.769584, loss_mean: 0.976530, proj_loss: -0.312663, loss_mean_cls: 3.105716, grad_norm: 17.522264 +Steps: 0%| | 1221/1000000 [05:03<67:47:52, 4.09it/s, grad_norm=17.5, loss_final=3.77, loss_mean=0.977, loss_mean_cls=3.11, proj_loss=-0.313][2026-03-23 13:41:23] Step: 1221, Training Logs: loss_final: 3.496059, loss_mean: 0.951227, proj_loss: -0.306849, loss_mean_cls: 2.851681, grad_norm: 14.927700 +Steps: 0%| | 1222/1000000 [05:03<67:43:48, 4.10it/s, grad_norm=14.9, loss_final=3.5, loss_mean=0.951, loss_mean_cls=2.85, proj_loss=-0.307][2026-03-23 13:41:23] Step: 1222, Training Logs: loss_final: 3.110201, loss_mean: 0.993438, proj_loss: -0.317695, loss_mean_cls: 2.434459, grad_norm: 6.206051 +Steps: 0%| | 1223/1000000 [05:03<67:39:56, 4.10it/s, grad_norm=6.21, loss_final=3.11, loss_mean=0.993, loss_mean_cls=2.43, proj_loss=-0.318][2026-03-23 13:41:23] Step: 1223, Training Logs: loss_final: 2.958466, loss_mean: 0.971727, proj_loss: -0.324850, loss_mean_cls: 2.311589, grad_norm: 9.867150 +Steps: 0%| | 1224/1000000 [05:03<67:39:39, 4.10it/s, grad_norm=9.87, loss_final=2.96, loss_mean=0.972, loss_mean_cls=2.31, proj_loss=-0.325][2026-03-23 13:41:23] Step: 1224, Training Logs: loss_final: 3.361951, loss_mean: 0.998182, proj_loss: -0.309930, loss_mean_cls: 2.673698, grad_norm: 8.477698 +Steps: 0%| | 1225/1000000 [05:04<67:43:17, 4.10it/s, grad_norm=8.48, loss_final=3.36, loss_mean=0.998, loss_mean_cls=2.67, proj_loss=-0.31][2026-03-23 13:41:24] Step: 1225, Training Logs: loss_final: 3.335689, loss_mean: 0.966671, proj_loss: -0.312652, loss_mean_cls: 2.681670, grad_norm: 9.983760 +Steps: 0%| | 1226/1000000 [05:04<67:40:18, 4.10it/s, grad_norm=9.98, loss_final=3.34, loss_mean=0.967, loss_mean_cls=2.68, proj_loss=-0.313][2026-03-23 13:41:24] Step: 1226, Training Logs: loss_final: 3.614371, loss_mean: 0.967124, proj_loss: -0.315269, loss_mean_cls: 2.962516, grad_norm: 10.837229 +Steps: 0%| | 1227/1000000 [05:04<67:38:35, 4.10it/s, grad_norm=10.8, loss_final=3.61, loss_mean=0.967, loss_mean_cls=2.96, proj_loss=-0.315][2026-03-23 13:41:24] Step: 1227, Training Logs: loss_final: 3.095296, loss_mean: 0.991290, proj_loss: -0.320298, loss_mean_cls: 2.424304, grad_norm: 7.915008 +Steps: 0%| | 1228/1000000 [05:04<67:35:53, 4.10it/s, grad_norm=7.92, loss_final=3.1, loss_mean=0.991, loss_mean_cls=2.42, proj_loss=-0.32][2026-03-23 13:41:24] Step: 1228, Training Logs: loss_final: 3.415985, loss_mean: 0.975878, proj_loss: -0.312854, 
loss_mean_cls: 2.752960, grad_norm: 10.000176 +Steps: 0%| | 1229/1000000 [05:05<67:41:05, 4.10it/s, grad_norm=10, loss_final=3.42, loss_mean=0.976, loss_mean_cls=2.75, proj_loss=-0.313][2026-03-23 13:41:25] Step: 1229, Training Logs: loss_final: 3.398078, loss_mean: 0.978444, proj_loss: -0.313946, loss_mean_cls: 2.733579, grad_norm: 12.646403 +Steps: 0%| | 1230/1000000 [05:05<67:39:50, 4.10it/s, grad_norm=12.6, loss_final=3.4, loss_mean=0.978, loss_mean_cls=2.73, proj_loss=-0.314][2026-03-23 13:41:25] Step: 1230, Training Logs: loss_final: 2.824821, loss_mean: 0.999947, proj_loss: -0.318330, loss_mean_cls: 2.143204, grad_norm: 8.731183 +Steps: 0%| | 1231/1000000 [05:05<67:39:24, 4.10it/s, grad_norm=8.73, loss_final=2.82, loss_mean=1, loss_mean_cls=2.14, proj_loss=-0.318][2026-03-23 13:41:25] Step: 1231, Training Logs: loss_final: 3.298292, loss_mean: 0.963162, proj_loss: -0.320752, loss_mean_cls: 2.655881, grad_norm: 14.299128 +Steps: 0%| | 1232/1000000 [05:05<67:36:53, 4.10it/s, grad_norm=14.3, loss_final=3.3, loss_mean=0.963, loss_mean_cls=2.66, proj_loss=-0.321][2026-03-23 13:41:25] Step: 1232, Training Logs: loss_final: 3.264194, loss_mean: 0.965227, proj_loss: -0.316374, loss_mean_cls: 2.615342, grad_norm: 10.729575 +Steps: 0%| | 1233/1000000 [05:06<67:35:12, 4.10it/s, grad_norm=10.7, loss_final=3.26, loss_mean=0.965, loss_mean_cls=2.62, proj_loss=-0.316][2026-03-23 13:41:26] Step: 1233, Training Logs: loss_final: 2.922657, loss_mean: 0.997924, proj_loss: -0.318018, loss_mean_cls: 2.242751, grad_norm: 6.039731 +Steps: 0%| | 1234/1000000 [05:06<67:36:22, 4.10it/s, grad_norm=6.04, loss_final=2.92, loss_mean=0.998, loss_mean_cls=2.24, proj_loss=-0.318][2026-03-23 13:41:26] Step: 1234, Training Logs: loss_final: 2.721747, loss_mean: 0.985115, proj_loss: -0.326557, loss_mean_cls: 2.063189, grad_norm: 4.525736 +Steps: 0%| | 1235/1000000 [05:06<67:37:09, 4.10it/s, grad_norm=4.53, loss_final=2.72, loss_mean=0.985, loss_mean_cls=2.06, proj_loss=-0.327][2026-03-23 13:41:26] Step: 1235, Training Logs: loss_final: 3.202938, loss_mean: 0.963591, proj_loss: -0.323648, loss_mean_cls: 2.562995, grad_norm: 8.383333 +Steps: 0%| | 1236/1000000 [05:06<67:34:13, 4.11it/s, grad_norm=8.38, loss_final=3.2, loss_mean=0.964, loss_mean_cls=2.56, proj_loss=-0.324][2026-03-23 13:41:26] Step: 1236, Training Logs: loss_final: 3.157144, loss_mean: 0.954203, proj_loss: -0.320458, loss_mean_cls: 2.523400, grad_norm: 5.532086 +Steps: 0%| | 1237/1000000 [05:07<67:32:53, 4.11it/s, grad_norm=5.53, loss_final=3.16, loss_mean=0.954, loss_mean_cls=2.52, proj_loss=-0.32][2026-03-23 13:41:27] Step: 1237, Training Logs: loss_final: 2.810682, loss_mean: 1.002008, proj_loss: -0.324758, loss_mean_cls: 2.133432, grad_norm: 7.493155 +Steps: 0%| | 1238/1000000 [05:07<67:35:19, 4.10it/s, grad_norm=7.49, loss_final=2.81, loss_mean=1, loss_mean_cls=2.13, proj_loss=-0.325][2026-03-23 13:41:27] Step: 1238, Training Logs: loss_final: 3.561103, loss_mean: 0.952096, proj_loss: -0.320966, loss_mean_cls: 2.929973, grad_norm: 15.022573 +Steps: 0%| | 1239/1000000 [05:07<67:35:30, 4.10it/s, grad_norm=15, loss_final=3.56, loss_mean=0.952, loss_mean_cls=2.93, proj_loss=-0.321][2026-03-23 13:41:27] Step: 1239, Training Logs: loss_final: 3.075515, loss_mean: 0.990634, proj_loss: -0.318637, loss_mean_cls: 2.403518, grad_norm: 7.056044 +Steps: 0%| | 1240/1000000 [05:07<67:35:30, 4.10it/s, grad_norm=7.06, loss_final=3.08, loss_mean=0.991, loss_mean_cls=2.4, proj_loss=-0.319][2026-03-23 13:41:27] Step: 1240, Training Logs: loss_final: 3.000690, 
loss_mean: 0.968824, proj_loss: -0.316953, loss_mean_cls: 2.348819, grad_norm: 6.039891 +Steps: 0%| | 1241/1000000 [05:08<67:35:20, 4.10it/s, grad_norm=6.04, loss_final=3, loss_mean=0.969, loss_mean_cls=2.35, proj_loss=-0.317][2026-03-23 13:41:28] Step: 1241, Training Logs: loss_final: 2.869320, loss_mean: 0.985788, proj_loss: -0.325467, loss_mean_cls: 2.209000, grad_norm: 7.162256 +Steps: 0%| | 1242/1000000 [05:08<67:33:49, 4.11it/s, grad_norm=7.16, loss_final=2.87, loss_mean=0.986, loss_mean_cls=2.21, proj_loss=-0.325][2026-03-23 13:41:28] Step: 1242, Training Logs: loss_final: 3.219374, loss_mean: 0.994066, proj_loss: -0.325962, loss_mean_cls: 2.551271, grad_norm: 13.057402 +Steps: 0%| | 1243/1000000 [05:08<67:33:50, 4.11it/s, grad_norm=13.1, loss_final=3.22, loss_mean=0.994, loss_mean_cls=2.55, proj_loss=-0.326][2026-03-23 13:41:28] Step: 1243, Training Logs: loss_final: 3.275629, loss_mean: 0.978258, proj_loss: -0.314775, loss_mean_cls: 2.612145, grad_norm: 11.299537 +Steps: 0%| | 1244/1000000 [05:08<67:33:34, 4.11it/s, grad_norm=11.3, loss_final=3.28, loss_mean=0.978, loss_mean_cls=2.61, proj_loss=-0.315][2026-03-23 13:41:28] Step: 1244, Training Logs: loss_final: 3.165665, loss_mean: 0.953977, proj_loss: -0.313338, loss_mean_cls: 2.525026, grad_norm: 5.843001 +Steps: 0%| | 1245/1000000 [05:09<67:37:39, 4.10it/s, grad_norm=5.84, loss_final=3.17, loss_mean=0.954, loss_mean_cls=2.53, proj_loss=-0.313][2026-03-23 13:41:29] Step: 1245, Training Logs: loss_final: 3.769130, loss_mean: 0.943607, proj_loss: -0.314829, loss_mean_cls: 3.140352, grad_norm: 10.531381 +Steps: 0%| | 1246/1000000 [05:09<68:19:09, 4.06it/s, grad_norm=10.5, loss_final=3.77, loss_mean=0.944, loss_mean_cls=3.14, proj_loss=-0.315][2026-03-23 13:41:29] Step: 1246, Training Logs: loss_final: 3.113125, loss_mean: 0.968174, proj_loss: -0.321435, loss_mean_cls: 2.466386, grad_norm: 9.419699 +Steps: 0%| | 1247/1000000 [05:09<68:06:02, 4.07it/s, grad_norm=9.42, loss_final=3.11, loss_mean=0.968, loss_mean_cls=2.47, proj_loss=-0.321][2026-03-23 13:41:29] Step: 1247, Training Logs: loss_final: 3.265938, loss_mean: 0.971566, proj_loss: -0.318315, loss_mean_cls: 2.612687, grad_norm: 9.971322 +Steps: 0%| | 1248/1000000 [05:09<67:56:33, 4.08it/s, grad_norm=9.97, loss_final=3.27, loss_mean=0.972, loss_mean_cls=2.61, proj_loss=-0.318][2026-03-23 13:41:29] Step: 1248, Training Logs: loss_final: 3.591566, loss_mean: 0.963156, proj_loss: -0.320536, loss_mean_cls: 2.948946, grad_norm: 6.432787 +Steps: 0%| | 1249/1000000 [05:10<67:55:28, 4.08it/s, grad_norm=6.43, loss_final=3.59, loss_mean=0.963, loss_mean_cls=2.95, proj_loss=-0.321][2026-03-23 13:41:30] Step: 1249, Training Logs: loss_final: 2.985950, loss_mean: 0.992026, proj_loss: -0.328409, loss_mean_cls: 2.322332, grad_norm: 8.938420 +Steps: 0%| | 1250/1000000 [05:10<67:49:41, 4.09it/s, grad_norm=8.94, loss_final=2.99, loss_mean=0.992, loss_mean_cls=2.32, proj_loss=-0.328][2026-03-23 13:41:30] Step: 1250, Training Logs: loss_final: 3.208561, loss_mean: 0.963196, proj_loss: -0.321915, loss_mean_cls: 2.567280, grad_norm: 6.588032 +Steps: 0%| | 1251/1000000 [05:10<67:47:04, 4.09it/s, grad_norm=6.59, loss_final=3.21, loss_mean=0.963, loss_mean_cls=2.57, proj_loss=-0.322][2026-03-23 13:41:30] Step: 1251, Training Logs: loss_final: 2.979179, loss_mean: 0.998371, proj_loss: -0.322910, loss_mean_cls: 2.303718, grad_norm: 5.490127 +Steps: 0%| | 1252/1000000 [05:10<67:44:20, 4.10it/s, grad_norm=5.49, loss_final=2.98, loss_mean=0.998, loss_mean_cls=2.3, proj_loss=-0.323][2026-03-23 13:41:30] 
Step: 1252, Training Logs: loss_final: 3.431067, loss_mean: 0.933233, proj_loss: -0.314196, loss_mean_cls: 2.812031, grad_norm: 6.766499 +Steps: 0%| | 1253/1000000 [05:11<67:46:48, 4.09it/s, grad_norm=6.77, loss_final=3.43, loss_mean=0.933, loss_mean_cls=2.81, proj_loss=-0.314][2026-03-23 13:41:30] Step: 1253, Training Logs: loss_final: 3.001680, loss_mean: 0.983688, proj_loss: -0.325726, loss_mean_cls: 2.343718, grad_norm: 11.637913 +Steps: 0%| | 1254/1000000 [05:11<67:42:14, 4.10it/s, grad_norm=11.6, loss_final=3, loss_mean=0.984, loss_mean_cls=2.34, proj_loss=-0.326][2026-03-23 13:41:31] Step: 1254, Training Logs: loss_final: 3.260542, loss_mean: 1.012674, proj_loss: -0.321196, loss_mean_cls: 2.569064, grad_norm: 10.815703 +Steps: 0%| | 1255/1000000 [05:11<67:39:41, 4.10it/s, grad_norm=10.8, loss_final=3.26, loss_mean=1.01, loss_mean_cls=2.57, proj_loss=-0.321][2026-03-23 13:41:31] Step: 1255, Training Logs: loss_final: 2.855156, loss_mean: 0.996212, proj_loss: -0.323268, loss_mean_cls: 2.182212, grad_norm: 4.953330 +Steps: 0%| | 1256/1000000 [05:11<67:38:22, 4.10it/s, grad_norm=4.95, loss_final=2.86, loss_mean=0.996, loss_mean_cls=2.18, proj_loss=-0.323][2026-03-23 13:41:31] Step: 1256, Training Logs: loss_final: 3.067744, loss_mean: 0.985371, proj_loss: -0.327258, loss_mean_cls: 2.409632, grad_norm: 8.462524 +Steps: 0%| | 1257/1000000 [05:11<67:43:22, 4.10it/s, grad_norm=8.46, loss_final=3.07, loss_mean=0.985, loss_mean_cls=2.41, proj_loss=-0.327][2026-03-23 13:41:31] Step: 1257, Training Logs: loss_final: 3.140543, loss_mean: 0.970148, proj_loss: -0.321846, loss_mean_cls: 2.492242, grad_norm: 7.186398 +Steps: 0%| | 1258/1000000 [05:12<67:38:45, 4.10it/s, grad_norm=7.19, loss_final=3.14, loss_mean=0.97, loss_mean_cls=2.49, proj_loss=-0.322][2026-03-23 13:41:32] Step: 1258, Training Logs: loss_final: 2.895171, loss_mean: 0.971132, proj_loss: -0.328171, loss_mean_cls: 2.252209, grad_norm: 5.689233 +Steps: 0%| | 1259/1000000 [05:12<67:39:48, 4.10it/s, grad_norm=5.69, loss_final=2.9, loss_mean=0.971, loss_mean_cls=2.25, proj_loss=-0.328][2026-03-23 13:41:32] Step: 1259, Training Logs: loss_final: 2.946368, loss_mean: 0.978660, proj_loss: -0.326535, loss_mean_cls: 2.294243, grad_norm: 5.249206 +Steps: 0%| | 1260/1000000 [05:12<68:50:17, 4.03it/s, grad_norm=5.25, loss_final=2.95, loss_mean=0.979, loss_mean_cls=2.29, proj_loss=-0.327][2026-03-23 13:41:32] Step: 1260, Training Logs: loss_final: 3.454425, loss_mean: 0.956923, proj_loss: -0.321661, loss_mean_cls: 2.819162, grad_norm: 4.669608 +Steps: 0%| | 1261/1000000 [05:12<68:34:36, 4.05it/s, grad_norm=4.67, loss_final=3.45, loss_mean=0.957, loss_mean_cls=2.82, proj_loss=-0.322][2026-03-23 13:41:32] Step: 1261, Training Logs: loss_final: 3.414052, loss_mean: 0.971510, proj_loss: -0.317925, loss_mean_cls: 2.760467, grad_norm: 9.172959 +Steps: 0%| | 1262/1000000 [05:13<68:31:04, 4.05it/s, grad_norm=9.17, loss_final=3.41, loss_mean=0.972, loss_mean_cls=2.76, proj_loss=-0.318][2026-03-23 13:41:33] Step: 1262, Training Logs: loss_final: 3.299335, loss_mean: 0.968438, proj_loss: -0.319406, loss_mean_cls: 2.650303, grad_norm: 6.754809 +Steps: 0%| | 1263/1000000 [05:13<68:16:14, 4.06it/s, grad_norm=6.75, loss_final=3.3, loss_mean=0.968, loss_mean_cls=2.65, proj_loss=-0.319][2026-03-23 13:41:33] Step: 1263, Training Logs: loss_final: 3.424195, loss_mean: 0.927006, proj_loss: -0.325422, loss_mean_cls: 2.822611, grad_norm: 5.613561 +Steps: 0%| | 1264/1000000 [05:13<68:05:04, 4.07it/s, grad_norm=5.61, loss_final=3.42, loss_mean=0.927, 
loss_mean_cls=2.82, proj_loss=-0.325][2026-03-23 13:41:33] Step: 1264, Training Logs: loss_final: 3.033328, loss_mean: 0.968342, proj_loss: -0.330060, loss_mean_cls: 2.395046, grad_norm: 9.461199 +Steps: 0%| | 1265/1000000 [05:13<67:56:41, 4.08it/s, grad_norm=9.46, loss_final=3.03, loss_mean=0.968, loss_mean_cls=2.4, proj_loss=-0.33][2026-03-23 13:41:33] Step: 1265, Training Logs: loss_final: 3.547823, loss_mean: 0.936038, proj_loss: -0.323608, loss_mean_cls: 2.935394, grad_norm: 12.822103 +Steps: 0%| | 1266/1000000 [05:14<67:50:48, 4.09it/s, grad_norm=12.8, loss_final=3.55, loss_mean=0.936, loss_mean_cls=2.94, proj_loss=-0.324][2026-03-23 13:41:34] Step: 1266, Training Logs: loss_final: 3.152737, loss_mean: 0.981325, proj_loss: -0.323366, loss_mean_cls: 2.494777, grad_norm: 8.536192 +Steps: 0%| | 1267/1000000 [05:14<67:46:05, 4.09it/s, grad_norm=8.54, loss_final=3.15, loss_mean=0.981, loss_mean_cls=2.49, proj_loss=-0.323][2026-03-23 13:41:34] Step: 1267, Training Logs: loss_final: 3.207214, loss_mean: 0.949401, proj_loss: -0.319677, loss_mean_cls: 2.577490, grad_norm: 7.031060 +Steps: 0%| | 1268/1000000 [05:14<67:44:00, 4.10it/s, grad_norm=7.03, loss_final=3.21, loss_mean=0.949, loss_mean_cls=2.58, proj_loss=-0.32][2026-03-23 13:41:34] Step: 1268, Training Logs: loss_final: 3.289139, loss_mean: 0.933531, proj_loss: -0.330590, loss_mean_cls: 2.686198, grad_norm: 11.846085 +Steps: 0%| | 1269/1000000 [05:14<67:40:19, 4.10it/s, grad_norm=11.8, loss_final=3.29, loss_mean=0.934, loss_mean_cls=2.69, proj_loss=-0.331][2026-03-23 13:41:34] Step: 1269, Training Logs: loss_final: 3.509050, loss_mean: 0.968744, proj_loss: -0.316770, loss_mean_cls: 2.857076, grad_norm: 7.141188 +Steps: 0%| | 1270/1000000 [05:15<67:40:58, 4.10it/s, grad_norm=7.14, loss_final=3.51, loss_mean=0.969, loss_mean_cls=2.86, proj_loss=-0.317][2026-03-23 13:41:35] Step: 1270, Training Logs: loss_final: 2.882645, loss_mean: 0.959723, proj_loss: -0.325195, loss_mean_cls: 2.248118, grad_norm: 6.467248 +Steps: 0%| | 1271/1000000 [05:15<67:40:32, 4.10it/s, grad_norm=6.47, loss_final=2.88, loss_mean=0.96, loss_mean_cls=2.25, proj_loss=-0.325][2026-03-23 13:41:35] Step: 1271, Training Logs: loss_final: 3.176489, loss_mean: 0.964680, proj_loss: -0.329631, loss_mean_cls: 2.541441, grad_norm: 7.576861 +Steps: 0%| | 1272/1000000 [05:15<67:38:58, 4.10it/s, grad_norm=7.58, loss_final=3.18, loss_mean=0.965, loss_mean_cls=2.54, proj_loss=-0.33][2026-03-23 13:41:35] Step: 1272, Training Logs: loss_final: 3.227985, loss_mean: 0.949027, proj_loss: -0.326701, loss_mean_cls: 2.605659, grad_norm: 6.844894 +Steps: 0%| | 1273/1000000 [05:15<70:28:20, 3.94it/s, grad_norm=6.84, loss_final=3.23, loss_mean=0.949, loss_mean_cls=2.61, proj_loss=-0.327][2026-03-23 13:41:35] Step: 1273, Training Logs: loss_final: 3.379270, loss_mean: 0.941634, proj_loss: -0.323964, loss_mean_cls: 2.761600, grad_norm: 8.853448 +Steps: 0%| | 1274/1000000 [05:16<69:35:03, 3.99it/s, grad_norm=8.85, loss_final=3.38, loss_mean=0.942, loss_mean_cls=2.76, proj_loss=-0.324][2026-03-23 13:41:36] Step: 1274, Training Logs: loss_final: 3.042534, loss_mean: 0.972149, proj_loss: -0.327001, loss_mean_cls: 2.397386, grad_norm: 9.611083 +Steps: 0%| | 1275/1000000 [05:16<69:01:37, 4.02it/s, grad_norm=9.61, loss_final=3.04, loss_mean=0.972, loss_mean_cls=2.4, proj_loss=-0.327][2026-03-23 13:41:36] Step: 1275, Training Logs: loss_final: 3.207569, loss_mean: 0.961909, proj_loss: -0.330513, loss_mean_cls: 2.576173, grad_norm: 6.531894 +Steps: 0%| | 1276/1000000 [05:16<68:35:16, 4.04it/s, 
grad_norm=6.53, loss_final=3.21, loss_mean=0.962, loss_mean_cls=2.58, proj_loss=-0.331][2026-03-23 13:41:36] Step: 1276, Training Logs: loss_final: 2.925310, loss_mean: 0.973570, proj_loss: -0.331155, loss_mean_cls: 2.282896, grad_norm: 7.852894 +Steps: 0%| | 1277/1000000 [05:16<68:17:19, 4.06it/s, grad_norm=7.85, loss_final=2.93, loss_mean=0.974, loss_mean_cls=2.28, proj_loss=-0.331][2026-03-23 13:41:36] Step: 1277, Training Logs: loss_final: 3.499620, loss_mean: 0.947281, proj_loss: -0.321103, loss_mean_cls: 2.873442, grad_norm: 11.416069 +Steps: 0%| | 1278/1000000 [05:17<68:03:02, 4.08it/s, grad_norm=11.4, loss_final=3.5, loss_mean=0.947, loss_mean_cls=2.87, proj_loss=-0.321][2026-03-23 13:41:37] Step: 1278, Training Logs: loss_final: 2.995964, loss_mean: 0.952883, proj_loss: -0.331835, loss_mean_cls: 2.374915, grad_norm: 7.733103 +Steps: 0%| | 1279/1000000 [05:17<67:54:01, 4.09it/s, grad_norm=7.73, loss_final=3, loss_mean=0.953, loss_mean_cls=2.37, proj_loss=-0.332][2026-03-23 13:41:37] Step: 1279, Training Logs: loss_final: 3.615254, loss_mean: 0.940673, proj_loss: -0.318403, loss_mean_cls: 2.992985, grad_norm: 8.695398 +Steps: 0%| | 1280/1000000 [05:17<67:48:12, 4.09it/s, grad_norm=8.7, loss_final=3.62, loss_mean=0.941, loss_mean_cls=2.99, proj_loss=-0.318][2026-03-23 13:41:37] Step: 1280, Training Logs: loss_final: 3.286794, loss_mean: 0.962435, proj_loss: -0.328735, loss_mean_cls: 2.653094, grad_norm: 9.391966 +Steps: 0%| | 1281/1000000 [05:17<67:43:36, 4.10it/s, grad_norm=9.39, loss_final=3.29, loss_mean=0.962, loss_mean_cls=2.65, proj_loss=-0.329][2026-03-23 13:41:37] Step: 1281, Training Logs: loss_final: 3.127982, loss_mean: 0.965923, proj_loss: -0.328135, loss_mean_cls: 2.490194, grad_norm: 8.712268 +Steps: 0%| | 1282/1000000 [05:18<67:41:14, 4.10it/s, grad_norm=8.71, loss_final=3.13, loss_mean=0.966, loss_mean_cls=2.49, proj_loss=-0.328][2026-03-23 13:41:38] Step: 1282, Training Logs: loss_final: 3.015710, loss_mean: 0.955479, proj_loss: -0.329259, loss_mean_cls: 2.389490, grad_norm: 4.049550 +Steps: 0%| | 1283/1000000 [05:18<67:42:43, 4.10it/s, grad_norm=4.05, loss_final=3.02, loss_mean=0.955, loss_mean_cls=2.39, proj_loss=-0.329][2026-03-23 13:41:38] Step: 1283, Training Logs: loss_final: 3.216019, loss_mean: 0.961394, proj_loss: -0.327984, loss_mean_cls: 2.582609, grad_norm: 9.612285 +Steps: 0%| | 1284/1000000 [05:18<67:41:03, 4.10it/s, grad_norm=9.61, loss_final=3.22, loss_mean=0.961, loss_mean_cls=2.58, proj_loss=-0.328][2026-03-23 13:41:38] Step: 1284, Training Logs: loss_final: 2.869157, loss_mean: 0.971833, proj_loss: -0.326305, loss_mean_cls: 2.223629, grad_norm: 11.666966 +Steps: 0%| | 1285/1000000 [05:18<67:39:56, 4.10it/s, grad_norm=11.7, loss_final=2.87, loss_mean=0.972, loss_mean_cls=2.22, proj_loss=-0.326][2026-03-23 13:41:38] Step: 1285, Training Logs: loss_final: 2.932728, loss_mean: 0.980727, proj_loss: -0.328391, loss_mean_cls: 2.280393, grad_norm: 5.735778 +Steps: 0%| | 1286/1000000 [05:19<67:37:50, 4.10it/s, grad_norm=5.74, loss_final=2.93, loss_mean=0.981, loss_mean_cls=2.28, proj_loss=-0.328][2026-03-23 13:41:39] Step: 1286, Training Logs: loss_final: 3.141746, loss_mean: 0.969065, proj_loss: -0.329975, loss_mean_cls: 2.502656, grad_norm: 11.349870 +Steps: 0%| | 1287/1000000 [05:19<67:36:57, 4.10it/s, grad_norm=11.3, loss_final=3.14, loss_mean=0.969, loss_mean_cls=2.5, proj_loss=-0.33][2026-03-23 13:41:39] Step: 1287, Training Logs: loss_final: 3.099847, loss_mean: 0.969269, proj_loss: -0.331071, loss_mean_cls: 2.461648, grad_norm: 11.524776 +Steps: 0%| 
| 1288/1000000 [05:19<67:36:38, 4.10it/s, grad_norm=11.5, loss_final=3.1, loss_mean=0.969, loss_mean_cls=2.46, proj_loss=-0.331][2026-03-23 13:41:39] Step: 1288, Training Logs: loss_final: 3.109513, loss_mean: 0.994909, proj_loss: -0.328051, loss_mean_cls: 2.442655, grad_norm: 8.299112 +Steps: 0%| | 1289/1000000 [05:19<67:34:20, 4.11it/s, grad_norm=8.3, loss_final=3.11, loss_mean=0.995, loss_mean_cls=2.44, proj_loss=-0.328][2026-03-23 13:41:39] Step: 1289, Training Logs: loss_final: 3.155475, loss_mean: 0.967797, proj_loss: -0.323873, loss_mean_cls: 2.511551, grad_norm: 9.992801 +Steps: 0%| | 1290/1000000 [05:20<67:33:39, 4.11it/s, grad_norm=9.99, loss_final=3.16, loss_mean=0.968, loss_mean_cls=2.51, proj_loss=-0.324][2026-03-23 13:41:40] Step: 1290, Training Logs: loss_final: 2.811830, loss_mean: 0.967054, proj_loss: -0.332603, loss_mean_cls: 2.177379, grad_norm: 5.501231 +Steps: 0%| | 1291/1000000 [05:20<67:34:04, 4.11it/s, grad_norm=5.5, loss_final=2.81, loss_mean=0.967, loss_mean_cls=2.18, proj_loss=-0.333][2026-03-23 13:41:40] Step: 1291, Training Logs: loss_final: 3.229259, loss_mean: 0.989247, proj_loss: -0.329154, loss_mean_cls: 2.569166, grad_norm: 6.766212 +Steps: 0%| | 1292/1000000 [05:20<67:32:24, 4.11it/s, grad_norm=6.77, loss_final=3.23, loss_mean=0.989, loss_mean_cls=2.57, proj_loss=-0.329][2026-03-23 13:41:40] Step: 1292, Training Logs: loss_final: 2.975939, loss_mean: 0.949347, proj_loss: -0.336222, loss_mean_cls: 2.362814, grad_norm: 7.510567 +Steps: 0%| | 1293/1000000 [05:20<67:31:58, 4.11it/s, grad_norm=7.51, loss_final=2.98, loss_mean=0.949, loss_mean_cls=2.36, proj_loss=-0.336][2026-03-23 13:41:40] Step: 1293, Training Logs: loss_final: 3.186740, loss_mean: 0.953921, proj_loss: -0.329684, loss_mean_cls: 2.562503, grad_norm: 3.222987 +Steps: 0%| | 1294/1000000 [05:21<67:40:24, 4.10it/s, grad_norm=3.22, loss_final=3.19, loss_mean=0.954, loss_mean_cls=2.56, proj_loss=-0.33][2026-03-23 13:41:41] Step: 1294, Training Logs: loss_final: 3.722414, loss_mean: 0.945621, proj_loss: -0.328629, loss_mean_cls: 3.105422, grad_norm: 13.245031 +Steps: 0%| | 1295/1000000 [05:21<67:46:56, 4.09it/s, grad_norm=13.2, loss_final=3.72, loss_mean=0.946, loss_mean_cls=3.11, proj_loss=-0.329][2026-03-23 13:41:41] Step: 1295, Training Logs: loss_final: 3.691514, loss_mean: 0.960996, proj_loss: -0.319683, loss_mean_cls: 3.050200, grad_norm: 13.897870 +Steps: 0%| | 1296/1000000 [05:21<67:42:41, 4.10it/s, grad_norm=13.9, loss_final=3.69, loss_mean=0.961, loss_mean_cls=3.05, proj_loss=-0.32][2026-03-23 13:41:41] Step: 1296, Training Logs: loss_final: 3.040784, loss_mean: 0.939025, proj_loss: -0.320406, loss_mean_cls: 2.422166, grad_norm: 4.228326 +Steps: 0%| | 1297/1000000 [05:21<67:49:29, 4.09it/s, grad_norm=4.23, loss_final=3.04, loss_mean=0.939, loss_mean_cls=2.42, proj_loss=-0.32][2026-03-23 13:41:41] Step: 1297, Training Logs: loss_final: 3.229775, loss_mean: 0.957315, proj_loss: -0.325790, loss_mean_cls: 2.598250, grad_norm: 6.702120 +Steps: 0%| | 1298/1000000 [05:22<67:52:32, 4.09it/s, grad_norm=6.7, loss_final=3.23, loss_mean=0.957, loss_mean_cls=2.6, proj_loss=-0.326][2026-03-23 13:41:42] Step: 1298, Training Logs: loss_final: 2.604372, loss_mean: 0.993460, proj_loss: -0.339365, loss_mean_cls: 1.950277, grad_norm: 8.215422 +Steps: 0%| | 1299/1000000 [05:22<67:56:39, 4.08it/s, grad_norm=8.22, loss_final=2.6, loss_mean=0.993, loss_mean_cls=1.95, proj_loss=-0.339][2026-03-23 13:41:42] Step: 1299, Training Logs: loss_final: 3.211652, loss_mean: 0.961493, proj_loss: -0.332313, loss_mean_cls: 
2.582472, grad_norm: 4.565433 +Steps: 0%| | 1300/1000000 [05:22<67:54:02, 4.09it/s, grad_norm=4.57, loss_final=3.21, loss_mean=0.961, loss_mean_cls=2.58, proj_loss=-0.332][2026-03-23 13:41:42] Step: 1300, Training Logs: loss_final: 3.144296, loss_mean: 0.936150, proj_loss: -0.323967, loss_mean_cls: 2.532113, grad_norm: 3.890749 +Steps: 0%| | 1301/1000000 [05:22<67:47:17, 4.09it/s, grad_norm=3.89, loss_final=3.14, loss_mean=0.936, loss_mean_cls=2.53, proj_loss=-0.324][2026-03-23 13:41:42] Step: 1301, Training Logs: loss_final: 2.937433, loss_mean: 0.984636, proj_loss: -0.329169, loss_mean_cls: 2.281965, grad_norm: 7.416243 +Steps: 0%| | 1302/1000000 [05:23<67:48:22, 4.09it/s, grad_norm=7.42, loss_final=2.94, loss_mean=0.985, loss_mean_cls=2.28, proj_loss=-0.329][2026-03-23 13:41:42] Step: 1302, Training Logs: loss_final: 3.064830, loss_mean: 0.958225, proj_loss: -0.324544, loss_mean_cls: 2.431149, grad_norm: 4.607447 +Steps: 0%| | 1303/1000000 [05:23<67:50:02, 4.09it/s, grad_norm=4.61, loss_final=3.06, loss_mean=0.958, loss_mean_cls=2.43, proj_loss=-0.325][2026-03-23 13:41:43] Step: 1303, Training Logs: loss_final: 2.992056, loss_mean: 0.971026, proj_loss: -0.330663, loss_mean_cls: 2.351693, grad_norm: 3.037043 +Steps: 0%| | 1304/1000000 [05:23<67:53:06, 4.09it/s, grad_norm=3.04, loss_final=2.99, loss_mean=0.971, loss_mean_cls=2.35, proj_loss=-0.331][2026-03-23 13:41:43] Step: 1304, Training Logs: loss_final: 3.068998, loss_mean: 0.977390, proj_loss: -0.325560, loss_mean_cls: 2.417168, grad_norm: 3.411866 +Steps: 0%| | 1305/1000000 [05:23<67:46:35, 4.09it/s, grad_norm=3.41, loss_final=3.07, loss_mean=0.977, loss_mean_cls=2.42, proj_loss=-0.326][2026-03-23 13:41:43] Step: 1305, Training Logs: loss_final: 2.869483, loss_mean: 0.967453, proj_loss: -0.330334, loss_mean_cls: 2.232365, grad_norm: 6.387418 +Steps: 0%| | 1306/1000000 [05:24<67:41:10, 4.10it/s, grad_norm=6.39, loss_final=2.87, loss_mean=0.967, loss_mean_cls=2.23, proj_loss=-0.33][2026-03-23 13:41:43] Step: 1306, Training Logs: loss_final: 3.446475, loss_mean: 0.944939, proj_loss: -0.320605, loss_mean_cls: 2.822141, grad_norm: 3.878382 +Steps: 0%| | 1307/1000000 [05:24<67:43:11, 4.10it/s, grad_norm=3.88, loss_final=3.45, loss_mean=0.945, loss_mean_cls=2.82, proj_loss=-0.321][2026-03-23 13:41:44] Step: 1307, Training Logs: loss_final: 3.145830, loss_mean: 0.959067, proj_loss: -0.331767, loss_mean_cls: 2.518530, grad_norm: 5.977529 +Steps: 0%| | 1308/1000000 [05:24<67:39:01, 4.10it/s, grad_norm=5.98, loss_final=3.15, loss_mean=0.959, loss_mean_cls=2.52, proj_loss=-0.332][2026-03-23 13:41:44] Step: 1308, Training Logs: loss_final: 3.026065, loss_mean: 0.957780, proj_loss: -0.330443, loss_mean_cls: 2.398727, grad_norm: 5.781617 +Steps: 0%| | 1309/1000000 [05:24<67:36:50, 4.10it/s, grad_norm=5.78, loss_final=3.03, loss_mean=0.958, loss_mean_cls=2.4, proj_loss=-0.33][2026-03-23 13:41:44] Step: 1309, Training Logs: loss_final: 3.294996, loss_mean: 0.957933, proj_loss: -0.326102, loss_mean_cls: 2.663165, grad_norm: 7.705647 +Steps: 0%| | 1310/1000000 [05:24<67:34:09, 4.11it/s, grad_norm=7.71, loss_final=3.29, loss_mean=0.958, loss_mean_cls=2.66, proj_loss=-0.326][2026-03-23 13:41:44] Step: 1310, Training Logs: loss_final: 2.901639, loss_mean: 0.959312, proj_loss: -0.333495, loss_mean_cls: 2.275823, grad_norm: 8.189142 +Steps: 0%| | 1311/1000000 [05:25<67:33:34, 4.11it/s, grad_norm=8.19, loss_final=2.9, loss_mean=0.959, loss_mean_cls=2.28, proj_loss=-0.333][2026-03-23 13:41:45] Step: 1311, Training Logs: loss_final: 3.043614, loss_mean: 
0.971185, proj_loss: -0.332887, loss_mean_cls: 2.405316, grad_norm: 3.979463
+Steps: 0%| | 1312/1000000 [05:25<67:32:11, 4.11it/s, grad_norm=3.98, loss_final=3.04, loss_mean=0.971, loss_mean_cls=2.41, proj_loss=-0.333][2026-03-23 13:41:45] Step: 1312, Training Logs: loss_final: 2.970924, loss_mean: 0.961247, proj_loss: -0.335686, loss_mean_cls: 2.345363, grad_norm: 5.715133
[... 234 near-identical per-step log entries elided (steps 1313-1546 of 1,000,000, ~4.1 it/s): over this span loss_final varies within 2.38-3.73, loss_mean within 0.93-1.01, loss_mean_cls within 1.77-3.12, proj_loss within -0.36 to -0.32, grad_norm within 2.6-19.0; in every entry the logged values satisfy loss_final = loss_mean + proj_loss + loss_mean_cls (e.g., 0.961247 - 0.335686 + 2.345363 = 2.970924 at step 1312) ...]
+Steps: 0%| | 1547/1000000 [06:22<67:35:23, 4.10it/s, grad_norm=11.3, loss_final=3.12, loss_mean=0.984, loss_mean_cls=2.49, proj_loss=-0.349][2026-03-23 13:42:42] Step: 1547, Training Logs: loss_final: 2.971317, loss_mean: 0.981920, proj_loss: -0.355727, loss_mean_cls: 2.345124, grad_norm: 6.227867
+Steps: 0%| | 1548/1000000 [06:23<67:34:50, 4.10it/s, grad_norm=6.23, loss_final=2.97, loss_mean=0.982, loss_mean_cls=2.35, proj_loss=-0.356][2026-03-23
13:42:43] Step: 1548, Training Logs: loss_final: 2.991045, loss_mean: 0.967703, proj_loss: -0.359885, loss_mean_cls: 2.383227, grad_norm: 11.141178 +Steps: 0%| | 1549/1000000 [06:23<67:33:45, 4.11it/s, grad_norm=11.1, loss_final=2.99, loss_mean=0.968, loss_mean_cls=2.38, proj_loss=-0.36][2026-03-23 13:42:43] Step: 1549, Training Logs: loss_final: 3.323132, loss_mean: 0.972873, proj_loss: -0.354623, loss_mean_cls: 2.704882, grad_norm: 10.928255 +Steps: 0%| | 1550/1000000 [06:23<67:32:16, 4.11it/s, grad_norm=10.9, loss_final=3.32, loss_mean=0.973, loss_mean_cls=2.7, proj_loss=-0.355][2026-03-23 13:42:43] Step: 1550, Training Logs: loss_final: 3.010439, loss_mean: 0.975308, proj_loss: -0.355175, loss_mean_cls: 2.390306, grad_norm: 5.907875 +Steps: 0%| | 1551/1000000 [06:23<67:30:19, 4.11it/s, grad_norm=5.91, loss_final=3.01, loss_mean=0.975, loss_mean_cls=2.39, proj_loss=-0.355][2026-03-23 13:42:43] Step: 1551, Training Logs: loss_final: 2.975951, loss_mean: 0.988679, proj_loss: -0.354263, loss_mean_cls: 2.341535, grad_norm: 4.100262 +Steps: 0%| | 1552/1000000 [06:24<67:29:06, 4.11it/s, grad_norm=4.1, loss_final=2.98, loss_mean=0.989, loss_mean_cls=2.34, proj_loss=-0.354][2026-03-23 13:42:44] Step: 1552, Training Logs: loss_final: 2.935752, loss_mean: 0.983710, proj_loss: -0.352950, loss_mean_cls: 2.304992, grad_norm: 6.664919 +Steps: 0%| | 1553/1000000 [06:24<67:28:20, 4.11it/s, grad_norm=6.66, loss_final=2.94, loss_mean=0.984, loss_mean_cls=2.3, proj_loss=-0.353][2026-03-23 13:42:44] Step: 1553, Training Logs: loss_final: 3.119485, loss_mean: 0.967208, proj_loss: -0.351922, loss_mean_cls: 2.504200, grad_norm: 7.373316 +Steps: 0%| | 1554/1000000 [06:24<67:30:56, 4.11it/s, grad_norm=7.37, loss_final=3.12, loss_mean=0.967, loss_mean_cls=2.5, proj_loss=-0.352][2026-03-23 13:42:44] Step: 1554, Training Logs: loss_final: 2.973594, loss_mean: 0.981188, proj_loss: -0.353105, loss_mean_cls: 2.345511, grad_norm: 5.768367 +Steps: 0%| | 1555/1000000 [06:24<67:31:07, 4.11it/s, grad_norm=5.77, loss_final=2.97, loss_mean=0.981, loss_mean_cls=2.35, proj_loss=-0.353][2026-03-23 13:42:44] Step: 1555, Training Logs: loss_final: 3.469422, loss_mean: 0.960529, proj_loss: -0.342934, loss_mean_cls: 2.851827, grad_norm: 5.307539 +Steps: 0%| | 1556/1000000 [06:25<67:31:00, 4.11it/s, grad_norm=5.31, loss_final=3.47, loss_mean=0.961, loss_mean_cls=2.85, proj_loss=-0.343][2026-03-23 13:42:44] Step: 1556, Training Logs: loss_final: 3.344542, loss_mean: 0.983963, proj_loss: -0.346553, loss_mean_cls: 2.707133, grad_norm: 12.899610 +Steps: 0%| | 1557/1000000 [06:25<67:30:04, 4.11it/s, grad_norm=12.9, loss_final=3.34, loss_mean=0.984, loss_mean_cls=2.71, proj_loss=-0.347][2026-03-23 13:42:45] Step: 1557, Training Logs: loss_final: 2.920314, loss_mean: 0.973013, proj_loss: -0.361996, loss_mean_cls: 2.309297, grad_norm: 13.115915 +Steps: 0%| | 1558/1000000 [06:25<67:30:36, 4.11it/s, grad_norm=13.1, loss_final=2.92, loss_mean=0.973, loss_mean_cls=2.31, proj_loss=-0.362][2026-03-23 13:42:45] Step: 1558, Training Logs: loss_final: 3.022401, loss_mean: 0.969085, proj_loss: -0.353039, loss_mean_cls: 2.406355, grad_norm: 6.632608 +Steps: 0%| | 1559/1000000 [06:25<67:32:05, 4.11it/s, grad_norm=6.63, loss_final=3.02, loss_mean=0.969, loss_mean_cls=2.41, proj_loss=-0.353][2026-03-23 13:42:45] Step: 1559, Training Logs: loss_final: 3.099252, loss_mean: 0.973533, proj_loss: -0.354474, loss_mean_cls: 2.480192, grad_norm: 8.670877 +Steps: 0%| | 1560/1000000 [06:26<67:31:46, 4.11it/s, grad_norm=8.67, loss_final=3.1, loss_mean=0.974, 
loss_mean_cls=2.48, proj_loss=-0.354][2026-03-23 13:42:45] Step: 1560, Training Logs: loss_final: 3.196277, loss_mean: 0.974599, proj_loss: -0.358584, loss_mean_cls: 2.580262, grad_norm: 6.348928 +Steps: 0%| | 1561/1000000 [06:26<67:31:37, 4.11it/s, grad_norm=6.35, loss_final=3.2, loss_mean=0.975, loss_mean_cls=2.58, proj_loss=-0.359][2026-03-23 13:42:46] Step: 1561, Training Logs: loss_final: 2.817995, loss_mean: 0.964877, proj_loss: -0.352589, loss_mean_cls: 2.205707, grad_norm: 3.839162 +Steps: 0%| | 1562/1000000 [06:26<67:34:38, 4.10it/s, grad_norm=3.84, loss_final=2.82, loss_mean=0.965, loss_mean_cls=2.21, proj_loss=-0.353][2026-03-23 13:42:46] Step: 1562, Training Logs: loss_final: 2.638297, loss_mean: 0.950600, proj_loss: -0.363115, loss_mean_cls: 2.050812, grad_norm: 11.088832 +Steps: 0%| | 1563/1000000 [06:26<67:33:51, 4.10it/s, grad_norm=11.1, loss_final=2.64, loss_mean=0.951, loss_mean_cls=2.05, proj_loss=-0.363][2026-03-23 13:42:46] Step: 1563, Training Logs: loss_final: 2.902452, loss_mean: 0.973331, proj_loss: -0.355376, loss_mean_cls: 2.284497, grad_norm: 12.368455 +Steps: 0%| | 1564/1000000 [06:26<67:32:47, 4.11it/s, grad_norm=12.4, loss_final=2.9, loss_mean=0.973, loss_mean_cls=2.28, proj_loss=-0.355][2026-03-23 13:42:46] Step: 1564, Training Logs: loss_final: 3.284330, loss_mean: 0.941456, proj_loss: -0.348904, loss_mean_cls: 2.691778, grad_norm: 3.467922 +Steps: 0%| | 1565/1000000 [06:27<67:32:03, 4.11it/s, grad_norm=3.47, loss_final=3.28, loss_mean=0.941, loss_mean_cls=2.69, proj_loss=-0.349][2026-03-23 13:42:47] Step: 1565, Training Logs: loss_final: 2.973991, loss_mean: 0.978701, proj_loss: -0.361425, loss_mean_cls: 2.356715, grad_norm: 10.032041 +Steps: 0%| | 1566/1000000 [06:27<67:32:03, 4.11it/s, grad_norm=10, loss_final=2.97, loss_mean=0.979, loss_mean_cls=2.36, proj_loss=-0.361][2026-03-23 13:42:47] Step: 1566, Training Logs: loss_final: 3.304800, loss_mean: 0.961625, proj_loss: -0.355055, loss_mean_cls: 2.698230, grad_norm: 8.045096 +Steps: 0%| | 1567/1000000 [06:27<67:31:23, 4.11it/s, grad_norm=8.05, loss_final=3.3, loss_mean=0.962, loss_mean_cls=2.7, proj_loss=-0.355][2026-03-23 13:42:47] Step: 1567, Training Logs: loss_final: 3.530255, loss_mean: 0.957790, proj_loss: -0.350601, loss_mean_cls: 2.923066, grad_norm: 3.981374 +Steps: 0%| | 1568/1000000 [06:27<67:30:50, 4.11it/s, grad_norm=3.98, loss_final=3.53, loss_mean=0.958, loss_mean_cls=2.92, proj_loss=-0.351][2026-03-23 13:42:47] Step: 1568, Training Logs: loss_final: 2.708413, loss_mean: 0.987751, proj_loss: -0.360648, loss_mean_cls: 2.081310, grad_norm: 7.431085 +Steps: 0%| | 1569/1000000 [06:28<67:27:55, 4.11it/s, grad_norm=7.43, loss_final=2.71, loss_mean=0.988, loss_mean_cls=2.08, proj_loss=-0.361][2026-03-23 13:42:48] Step: 1569, Training Logs: loss_final: 2.907888, loss_mean: 0.981083, proj_loss: -0.350445, loss_mean_cls: 2.277250, grad_norm: 5.740780 +Steps: 0%| | 1570/1000000 [06:28<67:27:28, 4.11it/s, grad_norm=5.74, loss_final=2.91, loss_mean=0.981, loss_mean_cls=2.28, proj_loss=-0.35][2026-03-23 13:42:48] Step: 1570, Training Logs: loss_final: 2.826122, loss_mean: 0.944145, proj_loss: -0.359900, loss_mean_cls: 2.241878, grad_norm: 6.523353 +Steps: 0%| | 1571/1000000 [06:28<67:29:05, 4.11it/s, grad_norm=6.52, loss_final=2.83, loss_mean=0.944, loss_mean_cls=2.24, proj_loss=-0.36][2026-03-23 13:42:48] Step: 1571, Training Logs: loss_final: 3.620495, loss_mean: 0.947343, proj_loss: -0.350362, loss_mean_cls: 3.023515, grad_norm: 5.188433 +Steps: 0%| | 1572/1000000 [06:28<67:28:52, 4.11it/s, 
grad_norm=5.19, loss_final=3.62, loss_mean=0.947, loss_mean_cls=3.02, proj_loss=-0.35][2026-03-23 13:42:48] Step: 1572, Training Logs: loss_final: 3.362876, loss_mean: 0.943149, proj_loss: -0.346515, loss_mean_cls: 2.766241, grad_norm: 4.492006 +Steps: 0%| | 1573/1000000 [06:29<67:27:56, 4.11it/s, grad_norm=4.49, loss_final=3.36, loss_mean=0.943, loss_mean_cls=2.77, proj_loss=-0.347][2026-03-23 13:42:49] Step: 1573, Training Logs: loss_final: 3.268051, loss_mean: 0.948404, proj_loss: -0.351820, loss_mean_cls: 2.671467, grad_norm: 3.662537 +Steps: 0%| | 1574/1000000 [06:29<67:30:12, 4.11it/s, grad_norm=3.66, loss_final=3.27, loss_mean=0.948, loss_mean_cls=2.67, proj_loss=-0.352][2026-03-23 13:42:49] Step: 1574, Training Logs: loss_final: 2.917910, loss_mean: 0.947799, proj_loss: -0.361041, loss_mean_cls: 2.331153, grad_norm: 4.670862 +Steps: 0%| | 1575/1000000 [06:29<67:31:46, 4.11it/s, grad_norm=4.67, loss_final=2.92, loss_mean=0.948, loss_mean_cls=2.33, proj_loss=-0.361][2026-03-23 13:42:49] Step: 1575, Training Logs: loss_final: 2.926780, loss_mean: 0.981605, proj_loss: -0.354704, loss_mean_cls: 2.299879, grad_norm: 4.611703 +Steps: 0%| | 1576/1000000 [06:29<67:31:22, 4.11it/s, grad_norm=4.61, loss_final=2.93, loss_mean=0.982, loss_mean_cls=2.3, proj_loss=-0.355][2026-03-23 13:42:49] Step: 1576, Training Logs: loss_final: 2.453179, loss_mean: 0.983220, proj_loss: -0.365888, loss_mean_cls: 1.835847, grad_norm: 9.241731 +Steps: 0%| | 1577/1000000 [06:30<67:33:40, 4.11it/s, grad_norm=9.24, loss_final=2.45, loss_mean=0.983, loss_mean_cls=1.84, proj_loss=-0.366][2026-03-23 13:42:50] Step: 1577, Training Logs: loss_final: 2.898562, loss_mean: 0.979410, proj_loss: -0.360818, loss_mean_cls: 2.279969, grad_norm: 7.636331 +Steps: 0%| | 1578/1000000 [06:30<67:34:58, 4.10it/s, grad_norm=7.64, loss_final=2.9, loss_mean=0.979, loss_mean_cls=2.28, proj_loss=-0.361][2026-03-23 13:42:50] Step: 1578, Training Logs: loss_final: 2.969978, loss_mean: 0.961714, proj_loss: -0.352011, loss_mean_cls: 2.360274, grad_norm: 10.819164 +Steps: 0%| | 1579/1000000 [06:30<67:32:25, 4.11it/s, grad_norm=10.8, loss_final=2.97, loss_mean=0.962, loss_mean_cls=2.36, proj_loss=-0.352][2026-03-23 13:42:50] Step: 1579, Training Logs: loss_final: 3.216893, loss_mean: 0.944162, proj_loss: -0.352144, loss_mean_cls: 2.624875, grad_norm: 7.043958 +Steps: 0%| | 1580/1000000 [06:30<67:30:55, 4.11it/s, grad_norm=7.04, loss_final=3.22, loss_mean=0.944, loss_mean_cls=2.62, proj_loss=-0.352][2026-03-23 13:42:50] Step: 1580, Training Logs: loss_final: 2.753266, loss_mean: 0.961506, proj_loss: -0.356313, loss_mean_cls: 2.148073, grad_norm: 5.789616 +Steps: 0%| | 1581/1000000 [06:31<67:30:34, 4.11it/s, grad_norm=5.79, loss_final=2.75, loss_mean=0.962, loss_mean_cls=2.15, proj_loss=-0.356][2026-03-23 13:42:51] Step: 1581, Training Logs: loss_final: 3.087173, loss_mean: 0.956124, proj_loss: -0.359324, loss_mean_cls: 2.490374, grad_norm: 12.502512 +Steps: 0%| | 1582/1000000 [06:31<67:28:08, 4.11it/s, grad_norm=12.5, loss_final=3.09, loss_mean=0.956, loss_mean_cls=2.49, proj_loss=-0.359][2026-03-23 13:42:51] Step: 1582, Training Logs: loss_final: 3.196606, loss_mean: 0.964734, proj_loss: -0.356424, loss_mean_cls: 2.588296, grad_norm: 12.151020 +Steps: 0%| | 1583/1000000 [06:31<67:31:44, 4.11it/s, grad_norm=12.2, loss_final=3.2, loss_mean=0.965, loss_mean_cls=2.59, proj_loss=-0.356][2026-03-23 13:42:51] Step: 1583, Training Logs: loss_final: 3.049083, loss_mean: 0.959921, proj_loss: -0.352097, loss_mean_cls: 2.441259, grad_norm: 14.419067 +Steps: 
0%| | 1584/1000000 [06:31<67:29:54, 4.11it/s, grad_norm=14.4, loss_final=3.05, loss_mean=0.96, loss_mean_cls=2.44, proj_loss=-0.352][2026-03-23 13:42:51] Step: 1584, Training Logs: loss_final: 3.192211, loss_mean: 0.972822, proj_loss: -0.347065, loss_mean_cls: 2.566454, grad_norm: 6.491845 +Steps: 0%| | 1585/1000000 [06:32<67:31:23, 4.11it/s, grad_norm=6.49, loss_final=3.19, loss_mean=0.973, loss_mean_cls=2.57, proj_loss=-0.347][2026-03-23 13:42:52] Step: 1585, Training Logs: loss_final: 3.388634, loss_mean: 0.930470, proj_loss: -0.352538, loss_mean_cls: 2.810702, grad_norm: 10.247929 +Steps: 0%| | 1586/1000000 [06:32<67:58:13, 4.08it/s, grad_norm=10.2, loss_final=3.39, loss_mean=0.93, loss_mean_cls=2.81, proj_loss=-0.353][2026-03-23 13:42:52] Step: 1586, Training Logs: loss_final: 2.792034, loss_mean: 0.949859, proj_loss: -0.356117, loss_mean_cls: 2.198292, grad_norm: 3.445123 +Steps: 0%| | 1587/1000000 [06:32<67:51:17, 4.09it/s, grad_norm=3.45, loss_final=2.79, loss_mean=0.95, loss_mean_cls=2.2, proj_loss=-0.356][2026-03-23 13:42:52] Step: 1587, Training Logs: loss_final: 3.416470, loss_mean: 0.939775, proj_loss: -0.352227, loss_mean_cls: 2.828922, grad_norm: 11.384736 +Steps: 0%| | 1588/1000000 [06:32<67:44:32, 4.09it/s, grad_norm=11.4, loss_final=3.42, loss_mean=0.94, loss_mean_cls=2.83, proj_loss=-0.352][2026-03-23 13:42:52] Step: 1588, Training Logs: loss_final: 3.606797, loss_mean: 0.947766, proj_loss: -0.344049, loss_mean_cls: 3.003080, grad_norm: 11.132773 +Steps: 0%| | 1589/1000000 [06:33<67:38:49, 4.10it/s, grad_norm=11.1, loss_final=3.61, loss_mean=0.948, loss_mean_cls=3, proj_loss=-0.344][2026-03-23 13:42:53] Step: 1589, Training Logs: loss_final: 3.296438, loss_mean: 0.957660, proj_loss: -0.345540, loss_mean_cls: 2.684317, grad_norm: 4.601461 +Steps: 0%| | 1590/1000000 [06:33<67:36:51, 4.10it/s, grad_norm=4.6, loss_final=3.3, loss_mean=0.958, loss_mean_cls=2.68, proj_loss=-0.346][2026-03-23 13:42:53] Step: 1590, Training Logs: loss_final: 2.800692, loss_mean: 0.963262, proj_loss: -0.355582, loss_mean_cls: 2.193013, grad_norm: 4.434661 +Steps: 0%| | 1591/1000000 [06:33<67:34:04, 4.10it/s, grad_norm=4.43, loss_final=2.8, loss_mean=0.963, loss_mean_cls=2.19, proj_loss=-0.356][2026-03-23 13:42:53] Step: 1591, Training Logs: loss_final: 3.183663, loss_mean: 0.921148, proj_loss: -0.356447, loss_mean_cls: 2.618962, grad_norm: 3.171255 +Steps: 0%| | 1592/1000000 [06:33<67:32:21, 4.11it/s, grad_norm=3.17, loss_final=3.18, loss_mean=0.921, loss_mean_cls=2.62, proj_loss=-0.356][2026-03-23 13:42:53] Step: 1592, Training Logs: loss_final: 3.038484, loss_mean: 0.956584, proj_loss: -0.352377, loss_mean_cls: 2.434277, grad_norm: 5.532920 +Steps: 0%| | 1593/1000000 [06:34<67:33:37, 4.11it/s, grad_norm=5.53, loss_final=3.04, loss_mean=0.957, loss_mean_cls=2.43, proj_loss=-0.352][2026-03-23 13:42:54] Step: 1593, Training Logs: loss_final: 3.065850, loss_mean: 0.964747, proj_loss: -0.355965, loss_mean_cls: 2.457068, grad_norm: 3.678937 +Steps: 0%| | 1594/1000000 [06:34<67:32:28, 4.11it/s, grad_norm=3.68, loss_final=3.07, loss_mean=0.965, loss_mean_cls=2.46, proj_loss=-0.356][2026-03-23 13:42:54] Step: 1594, Training Logs: loss_final: 3.351851, loss_mean: 0.944780, proj_loss: -0.353517, loss_mean_cls: 2.760587, grad_norm: 2.865821 +Steps: 0%| | 1595/1000000 [06:34<67:32:14, 4.11it/s, grad_norm=2.87, loss_final=3.35, loss_mean=0.945, loss_mean_cls=2.76, proj_loss=-0.354][2026-03-23 13:42:54] Step: 1595, Training Logs: loss_final: 2.994520, loss_mean: 0.942572, proj_loss: -0.357687, loss_mean_cls: 
2.409636, grad_norm: 5.240125 +Steps: 0%| | 1596/1000000 [06:34<67:30:34, 4.11it/s, grad_norm=5.24, loss_final=2.99, loss_mean=0.943, loss_mean_cls=2.41, proj_loss=-0.358][2026-03-23 13:42:54] Step: 1596, Training Logs: loss_final: 2.915140, loss_mean: 0.943758, proj_loss: -0.355594, loss_mean_cls: 2.326976, grad_norm: 7.385954 +Steps: 0%| | 1597/1000000 [06:35<67:30:37, 4.11it/s, grad_norm=7.39, loss_final=2.92, loss_mean=0.944, loss_mean_cls=2.33, proj_loss=-0.356][2026-03-23 13:42:54] Step: 1597, Training Logs: loss_final: 3.171813, loss_mean: 0.911346, proj_loss: -0.346867, loss_mean_cls: 2.607334, grad_norm: 6.948291 +Steps: 0%| | 1598/1000000 [06:35<67:29:30, 4.11it/s, grad_norm=6.95, loss_final=3.17, loss_mean=0.911, loss_mean_cls=2.61, proj_loss=-0.347][2026-03-23 13:42:55] Step: 1598, Training Logs: loss_final: 3.015526, loss_mean: 0.972548, proj_loss: -0.359644, loss_mean_cls: 2.402623, grad_norm: 14.571008 +Steps: 0%| | 1599/1000000 [06:35<67:31:39, 4.11it/s, grad_norm=14.6, loss_final=3.02, loss_mean=0.973, loss_mean_cls=2.4, proj_loss=-0.36][2026-03-23 13:42:55] Step: 1599, Training Logs: loss_final: 3.031062, loss_mean: 0.951235, proj_loss: -0.352513, loss_mean_cls: 2.432340, grad_norm: 10.691643 +Steps: 0%| | 1600/1000000 [06:35<67:32:25, 4.11it/s, grad_norm=10.7, loss_final=3.03, loss_mean=0.951, loss_mean_cls=2.43, proj_loss=-0.353][2026-03-23 13:42:55] Step: 1600, Training Logs: loss_final: 3.236454, loss_mean: 0.943147, proj_loss: -0.352769, loss_mean_cls: 2.646076, grad_norm: 6.308773 +Steps: 0%| | 1601/1000000 [06:35<67:30:07, 4.11it/s, grad_norm=6.31, loss_final=3.24, loss_mean=0.943, loss_mean_cls=2.65, proj_loss=-0.353][2026-03-23 13:42:55] Step: 1601, Training Logs: loss_final: 3.075903, loss_mean: 0.974166, proj_loss: -0.355769, loss_mean_cls: 2.457506, grad_norm: 6.942876 +Steps: 0%| | 1602/1000000 [06:36<67:29:37, 4.11it/s, grad_norm=6.94, loss_final=3.08, loss_mean=0.974, loss_mean_cls=2.46, proj_loss=-0.356][2026-03-23 13:42:56] Step: 1602, Training Logs: loss_final: 3.196312, loss_mean: 0.942299, proj_loss: -0.351883, loss_mean_cls: 2.605896, grad_norm: 10.143624 +Steps: 0%| | 1603/1000000 [06:36<67:29:49, 4.11it/s, grad_norm=10.1, loss_final=3.2, loss_mean=0.942, loss_mean_cls=2.61, proj_loss=-0.352][2026-03-23 13:42:56] Step: 1603, Training Logs: loss_final: 3.335392, loss_mean: 0.950085, proj_loss: -0.346109, loss_mean_cls: 2.731415, grad_norm: 13.093863 +Steps: 0%| | 1604/1000000 [06:36<67:30:04, 4.11it/s, grad_norm=13.1, loss_final=3.34, loss_mean=0.95, loss_mean_cls=2.73, proj_loss=-0.346][2026-03-23 13:42:56] Step: 1604, Training Logs: loss_final: 3.477484, loss_mean: 0.925399, proj_loss: -0.354292, loss_mean_cls: 2.906377, grad_norm: 26.566967 +Steps: 0%| | 1605/1000000 [06:36<67:30:03, 4.11it/s, grad_norm=26.6, loss_final=3.48, loss_mean=0.925, loss_mean_cls=2.91, proj_loss=-0.354][2026-03-23 13:42:56] Step: 1605, Training Logs: loss_final: 2.438789, loss_mean: 0.980320, proj_loss: -0.363017, loss_mean_cls: 1.821486, grad_norm: 13.085832 +Steps: 0%| | 1606/1000000 [06:37<67:29:22, 4.11it/s, grad_norm=13.1, loss_final=2.44, loss_mean=0.98, loss_mean_cls=1.82, proj_loss=-0.363][2026-03-23 13:42:57] Step: 1606, Training Logs: loss_final: 3.092612, loss_mean: 0.954362, proj_loss: -0.352486, loss_mean_cls: 2.490736, grad_norm: 8.245051 +Steps: 0%| | 1607/1000000 [06:37<67:30:46, 4.11it/s, grad_norm=8.25, loss_final=3.09, loss_mean=0.954, loss_mean_cls=2.49, proj_loss=-0.352][2026-03-23 13:42:57] Step: 1607, Training Logs: loss_final: 2.871623, loss_mean: 
0.957864, proj_loss: -0.358891, loss_mean_cls: 2.272650, grad_norm: 12.889237 +Steps: 0%| | 1608/1000000 [06:37<67:32:05, 4.11it/s, grad_norm=12.9, loss_final=2.87, loss_mean=0.958, loss_mean_cls=2.27, proj_loss=-0.359][2026-03-23 13:42:57] Step: 1608, Training Logs: loss_final: 3.191310, loss_mean: 0.945755, proj_loss: -0.345796, loss_mean_cls: 2.591351, grad_norm: 5.140188 +Steps: 0%| | 1609/1000000 [06:37<67:31:29, 4.11it/s, grad_norm=5.14, loss_final=3.19, loss_mean=0.946, loss_mean_cls=2.59, proj_loss=-0.346][2026-03-23 13:42:57] Step: 1609, Training Logs: loss_final: 3.394392, loss_mean: 0.945686, proj_loss: -0.352619, loss_mean_cls: 2.801325, grad_norm: 10.937043 +Steps: 0%| | 1610/1000000 [06:38<67:29:42, 4.11it/s, grad_norm=10.9, loss_final=3.39, loss_mean=0.946, loss_mean_cls=2.8, proj_loss=-0.353][2026-03-23 13:42:58] Step: 1610, Training Logs: loss_final: 2.602896, loss_mean: 0.962709, proj_loss: -0.360567, loss_mean_cls: 2.000754, grad_norm: 7.725523 +Steps: 0%| | 1611/1000000 [06:38<67:29:48, 4.11it/s, grad_norm=7.73, loss_final=2.6, loss_mean=0.963, loss_mean_cls=2, proj_loss=-0.361][2026-03-23 13:42:58] Step: 1611, Training Logs: loss_final: 2.569510, loss_mean: 0.954987, proj_loss: -0.361490, loss_mean_cls: 1.976014, grad_norm: 4.945390 +Steps: 0%| | 1612/1000000 [06:38<67:29:08, 4.11it/s, grad_norm=4.95, loss_final=2.57, loss_mean=0.955, loss_mean_cls=1.98, proj_loss=-0.361][2026-03-23 13:42:58] Step: 1612, Training Logs: loss_final: 3.263940, loss_mean: 0.943893, proj_loss: -0.356694, loss_mean_cls: 2.676740, grad_norm: 13.457012 +Steps: 0%| | 1613/1000000 [06:38<67:31:35, 4.11it/s, grad_norm=13.5, loss_final=3.26, loss_mean=0.944, loss_mean_cls=2.68, proj_loss=-0.357][2026-03-23 13:42:58] Step: 1613, Training Logs: loss_final: 2.923363, loss_mean: 0.950203, proj_loss: -0.361392, loss_mean_cls: 2.334552, grad_norm: 11.288425 +Steps: 0%| | 1614/1000000 [06:39<67:31:06, 4.11it/s, grad_norm=11.3, loss_final=2.92, loss_mean=0.95, loss_mean_cls=2.33, proj_loss=-0.361][2026-03-23 13:42:59] Step: 1614, Training Logs: loss_final: 2.769035, loss_mean: 0.976519, proj_loss: -0.360415, loss_mean_cls: 2.152931, grad_norm: 9.381811 +Steps: 0%| | 1615/1000000 [06:39<67:30:01, 4.11it/s, grad_norm=9.38, loss_final=2.77, loss_mean=0.977, loss_mean_cls=2.15, proj_loss=-0.36][2026-03-23 13:42:59] Step: 1615, Training Logs: loss_final: 3.094930, loss_mean: 0.933350, proj_loss: -0.353524, loss_mean_cls: 2.515105, grad_norm: 11.190261 +Steps: 0%| | 1616/1000000 [06:39<67:30:01, 4.11it/s, grad_norm=11.2, loss_final=3.09, loss_mean=0.933, loss_mean_cls=2.52, proj_loss=-0.354][2026-03-23 13:42:59] Step: 1616, Training Logs: loss_final: 3.153231, loss_mean: 0.951927, proj_loss: -0.355520, loss_mean_cls: 2.556825, grad_norm: 10.944530 +Steps: 0%| | 1617/1000000 [06:39<67:32:01, 4.11it/s, grad_norm=10.9, loss_final=3.15, loss_mean=0.952, loss_mean_cls=2.56, proj_loss=-0.356][2026-03-23 13:42:59] Step: 1617, Training Logs: loss_final: 3.045237, loss_mean: 0.955373, proj_loss: -0.353433, loss_mean_cls: 2.443297, grad_norm: 2.816311 +Steps: 0%| | 1618/1000000 [06:40<67:31:26, 4.11it/s, grad_norm=2.82, loss_final=3.05, loss_mean=0.955, loss_mean_cls=2.44, proj_loss=-0.353][2026-03-23 13:43:00] Step: 1618, Training Logs: loss_final: 3.241998, loss_mean: 0.956748, proj_loss: -0.356009, loss_mean_cls: 2.641259, grad_norm: 2.546957 +Steps: 0%| | 1619/1000000 [06:40<67:30:33, 4.11it/s, grad_norm=2.55, loss_final=3.24, loss_mean=0.957, loss_mean_cls=2.64, proj_loss=-0.356][2026-03-23 13:43:00] Step: 1619, 
Training Logs: loss_final: 2.834117, loss_mean: 0.964684, proj_loss: -0.361124, loss_mean_cls: 2.230557, grad_norm: 7.122824 +Steps: 0%| | 1620/1000000 [06:40<67:30:04, 4.11it/s, grad_norm=7.12, loss_final=2.83, loss_mean=0.965, loss_mean_cls=2.23, proj_loss=-0.361][2026-03-23 13:43:00] Step: 1620, Training Logs: loss_final: 2.751534, loss_mean: 0.963578, proj_loss: -0.356989, loss_mean_cls: 2.144944, grad_norm: 4.002816 +Steps: 0%| | 1621/1000000 [06:40<67:30:55, 4.11it/s, grad_norm=4, loss_final=2.75, loss_mean=0.964, loss_mean_cls=2.14, proj_loss=-0.357][2026-03-23 13:43:00] Step: 1621, Training Logs: loss_final: 2.988272, loss_mean: 0.954946, proj_loss: -0.356884, loss_mean_cls: 2.390210, grad_norm: 9.890831 +Steps: 0%| | 1622/1000000 [06:41<67:32:09, 4.11it/s, grad_norm=9.89, loss_final=2.99, loss_mean=0.955, loss_mean_cls=2.39, proj_loss=-0.357][2026-03-23 13:43:01] Step: 1622, Training Logs: loss_final: 3.118850, loss_mean: 0.981303, proj_loss: -0.360930, loss_mean_cls: 2.498477, grad_norm: 5.857049 +Steps: 0%| | 1623/1000000 [06:41<67:31:12, 4.11it/s, grad_norm=5.86, loss_final=3.12, loss_mean=0.981, loss_mean_cls=2.5, proj_loss=-0.361][2026-03-23 13:43:01] Step: 1623, Training Logs: loss_final: 2.998938, loss_mean: 0.945782, proj_loss: -0.356990, loss_mean_cls: 2.410147, grad_norm: 19.398952 +Steps: 0%| | 1624/1000000 [06:41<68:00:03, 4.08it/s, grad_norm=19.4, loss_final=3, loss_mean=0.946, loss_mean_cls=2.41, proj_loss=-0.357][2026-03-23 13:43:01] Step: 1624, Training Logs: loss_final: 3.204537, loss_mean: 0.947792, proj_loss: -0.357546, loss_mean_cls: 2.614291, grad_norm: 9.313207 +Steps: 0%| | 1625/1000000 [06:41<67:50:28, 4.09it/s, grad_norm=9.31, loss_final=3.2, loss_mean=0.948, loss_mean_cls=2.61, proj_loss=-0.358][2026-03-23 13:43:01] Step: 1625, Training Logs: loss_final: 2.887077, loss_mean: 0.952081, proj_loss: -0.361165, loss_mean_cls: 2.296161, grad_norm: 11.406656 +Steps: 0%| | 1626/1000000 [06:42<67:46:24, 4.09it/s, grad_norm=11.4, loss_final=2.89, loss_mean=0.952, loss_mean_cls=2.3, proj_loss=-0.361][2026-03-23 13:43:02] Step: 1626, Training Logs: loss_final: 2.937336, loss_mean: 0.986000, proj_loss: -0.354215, loss_mean_cls: 2.305551, grad_norm: 8.332114 +Steps: 0%| | 1627/1000000 [06:42<67:41:36, 4.10it/s, grad_norm=8.33, loss_final=2.94, loss_mean=0.986, loss_mean_cls=2.31, proj_loss=-0.354][2026-03-23 13:43:02] Step: 1627, Training Logs: loss_final: 3.422336, loss_mean: 0.955173, proj_loss: -0.352141, loss_mean_cls: 2.819304, grad_norm: 3.672195 +Steps: 0%| | 1628/1000000 [06:42<67:43:43, 4.09it/s, grad_norm=3.67, loss_final=3.42, loss_mean=0.955, loss_mean_cls=2.82, proj_loss=-0.352][2026-03-23 13:43:02] Step: 1628, Training Logs: loss_final: 2.488322, loss_mean: 0.962677, proj_loss: -0.362794, loss_mean_cls: 1.888440, grad_norm: 9.222030 +Steps: 0%| | 1629/1000000 [06:42<67:37:21, 4.10it/s, grad_norm=9.22, loss_final=2.49, loss_mean=0.963, loss_mean_cls=1.89, proj_loss=-0.363][2026-03-23 13:43:02] Step: 1629, Training Logs: loss_final: 3.275472, loss_mean: 0.980713, proj_loss: -0.353935, loss_mean_cls: 2.648695, grad_norm: 10.495204 +Steps: 0%| | 1630/1000000 [06:43<67:36:17, 4.10it/s, grad_norm=10.5, loss_final=3.28, loss_mean=0.981, loss_mean_cls=2.65, proj_loss=-0.354][2026-03-23 13:43:03] Step: 1630, Training Logs: loss_final: 2.962354, loss_mean: 0.960719, proj_loss: -0.357210, loss_mean_cls: 2.358845, grad_norm: 6.338550 +Steps: 0%| | 1631/1000000 [06:43<67:33:54, 4.10it/s, grad_norm=6.34, loss_final=2.96, loss_mean=0.961, loss_mean_cls=2.36, 
proj_loss=-0.357][2026-03-23 13:43:03] Step: 1631, Training Logs: loss_final: 3.291827, loss_mean: 0.954016, proj_loss: -0.355366, loss_mean_cls: 2.693177, grad_norm: 7.065762 +Steps: 0%| | 1632/1000000 [06:43<67:32:10, 4.11it/s, grad_norm=7.07, loss_final=3.29, loss_mean=0.954, loss_mean_cls=2.69, proj_loss=-0.355][2026-03-23 13:43:03] Step: 1632, Training Logs: loss_final: 3.283473, loss_mean: 0.956014, proj_loss: -0.359884, loss_mean_cls: 2.687343, grad_norm: 15.065584 +Steps: 0%| | 1633/1000000 [06:43<67:31:45, 4.11it/s, grad_norm=15.1, loss_final=3.28, loss_mean=0.956, loss_mean_cls=2.69, proj_loss=-0.36][2026-03-23 13:43:03] Step: 1633, Training Logs: loss_final: 3.382303, loss_mean: 0.964877, proj_loss: -0.348884, loss_mean_cls: 2.766310, grad_norm: 5.916055 +Steps: 0%| | 1634/1000000 [06:44<67:33:56, 4.10it/s, grad_norm=5.92, loss_final=3.38, loss_mean=0.965, loss_mean_cls=2.77, proj_loss=-0.349][2026-03-23 13:43:03] Step: 1634, Training Logs: loss_final: 2.864572, loss_mean: 0.982091, proj_loss: -0.363419, loss_mean_cls: 2.245900, grad_norm: 11.834321 +Steps: 0%| | 1635/1000000 [06:44<67:31:34, 4.11it/s, grad_norm=11.8, loss_final=2.86, loss_mean=0.982, loss_mean_cls=2.25, proj_loss=-0.363][2026-03-23 13:43:04] Step: 1635, Training Logs: loss_final: 3.037431, loss_mean: 0.974691, proj_loss: -0.348904, loss_mean_cls: 2.411644, grad_norm: 7.079793 +Steps: 0%| | 1636/1000000 [06:44<67:32:00, 4.11it/s, grad_norm=7.08, loss_final=3.04, loss_mean=0.975, loss_mean_cls=2.41, proj_loss=-0.349][2026-03-23 13:43:04] Step: 1636, Training Logs: loss_final: 2.862161, loss_mean: 0.961504, proj_loss: -0.359801, loss_mean_cls: 2.260459, grad_norm: 3.980849 +Steps: 0%| | 1637/1000000 [06:44<67:31:00, 4.11it/s, grad_norm=3.98, loss_final=2.86, loss_mean=0.962, loss_mean_cls=2.26, proj_loss=-0.36][2026-03-23 13:43:04] Step: 1637, Training Logs: loss_final: 2.972823, loss_mean: 0.973621, proj_loss: -0.362509, loss_mean_cls: 2.361711, grad_norm: 15.377238 +Steps: 0%| | 1638/1000000 [06:45<67:32:57, 4.11it/s, grad_norm=15.4, loss_final=2.97, loss_mean=0.974, loss_mean_cls=2.36, proj_loss=-0.363][2026-03-23 13:43:04] Step: 1638, Training Logs: loss_final: 2.318976, loss_mean: 1.015951, proj_loss: -0.361837, loss_mean_cls: 1.664863, grad_norm: 10.464694 +Steps: 0%| | 1639/1000000 [06:45<68:03:53, 4.07it/s, grad_norm=10.5, loss_final=2.32, loss_mean=1.02, loss_mean_cls=1.66, proj_loss=-0.362][2026-03-23 13:43:05] Step: 1639, Training Logs: loss_final: 2.912868, loss_mean: 0.977324, proj_loss: -0.357427, loss_mean_cls: 2.292970, grad_norm: 6.909609 +Steps: 0%| | 1640/1000000 [06:45<67:23:26, 4.12it/s, grad_norm=6.91, loss_final=2.91, loss_mean=0.977, loss_mean_cls=2.29, proj_loss=-0.357][2026-03-23 13:43:05] Step: 1640, Training Logs: loss_final: 2.993854, loss_mean: 0.944675, proj_loss: -0.360378, loss_mean_cls: 2.409557, grad_norm: 12.825631 +Steps: 0%| | 1641/1000000 [06:45<67:25:47, 4.11it/s, grad_norm=12.8, loss_final=2.99, loss_mean=0.945, loss_mean_cls=2.41, proj_loss=-0.36][2026-03-23 13:43:05] Step: 1641, Training Logs: loss_final: 2.866516, loss_mean: 0.986553, proj_loss: -0.361388, loss_mean_cls: 2.241352, grad_norm: 4.952305 +Steps: 0%| | 1642/1000000 [06:45<67:26:19, 4.11it/s, grad_norm=4.95, loss_final=2.87, loss_mean=0.987, loss_mean_cls=2.24, proj_loss=-0.361][2026-03-23 13:43:05] Step: 1642, Training Logs: loss_final: 3.068738, loss_mean: 0.964118, proj_loss: -0.357109, loss_mean_cls: 2.461730, grad_norm: 12.155602 +Steps: 0%| | 1643/1000000 [06:46<67:28:53, 4.11it/s, grad_norm=12.2, 
loss_final=3.07, loss_mean=0.964, loss_mean_cls=2.46, proj_loss=-0.357][2026-03-23 13:43:06] Step: 1643, Training Logs: loss_final: 3.319471, loss_mean: 0.964584, proj_loss: -0.353558, loss_mean_cls: 2.708445, grad_norm: 13.889063 +Steps: 0%| | 1644/1000000 [06:46<67:28:01, 4.11it/s, grad_norm=13.9, loss_final=3.32, loss_mean=0.965, loss_mean_cls=2.71, proj_loss=-0.354][2026-03-23 13:43:06] Step: 1644, Training Logs: loss_final: 2.728708, loss_mean: 0.963589, proj_loss: -0.363428, loss_mean_cls: 2.128547, grad_norm: 4.631152 +Steps: 0%| | 1645/1000000 [06:46<67:28:36, 4.11it/s, grad_norm=4.63, loss_final=2.73, loss_mean=0.964, loss_mean_cls=2.13, proj_loss=-0.363][2026-03-23 13:43:06] Step: 1645, Training Logs: loss_final: 2.844779, loss_mean: 0.970336, proj_loss: -0.360035, loss_mean_cls: 2.234478, grad_norm: 12.683215 +Steps: 0%| | 1646/1000000 [06:46<67:33:09, 4.11it/s, grad_norm=12.7, loss_final=2.84, loss_mean=0.97, loss_mean_cls=2.23, proj_loss=-0.36][2026-03-23 13:43:06] Step: 1646, Training Logs: loss_final: 2.987923, loss_mean: 0.948181, proj_loss: -0.356711, loss_mean_cls: 2.396453, grad_norm: 5.881278 +Steps: 0%| | 1647/1000000 [06:47<67:30:55, 4.11it/s, grad_norm=5.88, loss_final=2.99, loss_mean=0.948, loss_mean_cls=2.4, proj_loss=-0.357][2026-03-23 13:43:07] Step: 1647, Training Logs: loss_final: 3.906199, loss_mean: 0.911606, proj_loss: -0.348328, loss_mean_cls: 3.342921, grad_norm: 16.821110 +Steps: 0%| | 1648/1000000 [06:47<67:30:47, 4.11it/s, grad_norm=16.8, loss_final=3.91, loss_mean=0.912, loss_mean_cls=3.34, proj_loss=-0.348][2026-03-23 13:43:07] Step: 1648, Training Logs: loss_final: 3.122712, loss_mean: 0.963591, proj_loss: -0.357674, loss_mean_cls: 2.516795, grad_norm: 11.006652 +Steps: 0%| | 1649/1000000 [06:47<67:29:40, 4.11it/s, grad_norm=11, loss_final=3.12, loss_mean=0.964, loss_mean_cls=2.52, proj_loss=-0.358][2026-03-23 13:43:07] Step: 1649, Training Logs: loss_final: 2.805829, loss_mean: 0.956058, proj_loss: -0.365029, loss_mean_cls: 2.214799, grad_norm: 6.645406 +Steps: 0%| | 1650/1000000 [06:47<67:29:06, 4.11it/s, grad_norm=6.65, loss_final=2.81, loss_mean=0.956, loss_mean_cls=2.21, proj_loss=-0.365][2026-03-23 13:43:07] Step: 1650, Training Logs: loss_final: 3.237654, loss_mean: 0.954245, proj_loss: -0.354750, loss_mean_cls: 2.638160, grad_norm: 6.821177 +Steps: 0%| | 1651/1000000 [06:48<67:28:13, 4.11it/s, grad_norm=6.82, loss_final=3.24, loss_mean=0.954, loss_mean_cls=2.64, proj_loss=-0.355][2026-03-23 13:43:08] Step: 1651, Training Logs: loss_final: 2.712798, loss_mean: 0.983896, proj_loss: -0.362252, loss_mean_cls: 2.091154, grad_norm: 10.160157 +Steps: 0%| | 1652/1000000 [06:48<67:28:16, 4.11it/s, grad_norm=10.2, loss_final=2.71, loss_mean=0.984, loss_mean_cls=2.09, proj_loss=-0.362][2026-03-23 13:43:08] Step: 1652, Training Logs: loss_final: 3.126476, loss_mean: 0.964352, proj_loss: -0.362614, loss_mean_cls: 2.524739, grad_norm: 14.431285 +Steps: 0%| | 1653/1000000 [06:48<67:28:08, 4.11it/s, grad_norm=14.4, loss_final=3.13, loss_mean=0.964, loss_mean_cls=2.52, proj_loss=-0.363][2026-03-23 13:43:08] Step: 1653, Training Logs: loss_final: 3.359515, loss_mean: 0.952401, proj_loss: -0.349074, loss_mean_cls: 2.756189, grad_norm: 6.159184 +Steps: 0%| | 1654/1000000 [06:48<67:27:05, 4.11it/s, grad_norm=6.16, loss_final=3.36, loss_mean=0.952, loss_mean_cls=2.76, proj_loss=-0.349][2026-03-23 13:43:08] Step: 1654, Training Logs: loss_final: 3.285914, loss_mean: 0.966189, proj_loss: -0.356654, loss_mean_cls: 2.676379, grad_norm: 11.706588 +Steps: 0%| | 
1655/1000000 [06:49<67:29:00, 4.11it/s, grad_norm=11.7, loss_final=3.29, loss_mean=0.966, loss_mean_cls=2.68, proj_loss=-0.357][2026-03-23 13:43:09] Step: 1655, Training Logs: loss_final: 2.960196, loss_mean: 0.946333, proj_loss: -0.362080, loss_mean_cls: 2.375942, grad_norm: 6.455517 +Steps: 0%| | 1656/1000000 [06:49<67:29:45, 4.11it/s, grad_norm=6.46, loss_final=2.96, loss_mean=0.946, loss_mean_cls=2.38, proj_loss=-0.362][2026-03-23 13:43:09] Step: 1656, Training Logs: loss_final: 2.719909, loss_mean: 0.962570, proj_loss: -0.362746, loss_mean_cls: 2.120086, grad_norm: 5.786065 +Steps: 0%| | 1657/1000000 [06:49<67:29:01, 4.11it/s, grad_norm=5.79, loss_final=2.72, loss_mean=0.963, loss_mean_cls=2.12, proj_loss=-0.363][2026-03-23 13:43:09] Step: 1657, Training Logs: loss_final: 3.406665, loss_mean: 0.937542, proj_loss: -0.355713, loss_mean_cls: 2.824836, grad_norm: 3.442876 +Steps: 0%| | 1658/1000000 [06:49<67:27:39, 4.11it/s, grad_norm=3.44, loss_final=3.41, loss_mean=0.938, loss_mean_cls=2.82, proj_loss=-0.356][2026-03-23 13:43:09] Step: 1658, Training Logs: loss_final: 3.321853, loss_mean: 0.964637, proj_loss: -0.356378, loss_mean_cls: 2.713593, grad_norm: 3.076313 +Steps: 0%| | 1659/1000000 [06:50<67:26:46, 4.11it/s, grad_norm=3.08, loss_final=3.32, loss_mean=0.965, loss_mean_cls=2.71, proj_loss=-0.356][2026-03-23 13:43:10] Step: 1659, Training Logs: loss_final: 3.221113, loss_mean: 0.933300, proj_loss: -0.357202, loss_mean_cls: 2.645015, grad_norm: 7.565939 +Steps: 0%| | 1660/1000000 [06:50<67:26:24, 4.11it/s, grad_norm=7.57, loss_final=3.22, loss_mean=0.933, loss_mean_cls=2.65, proj_loss=-0.357][2026-03-23 13:43:10] Step: 1660, Training Logs: loss_final: 2.875326, loss_mean: 0.959796, proj_loss: -0.359735, loss_mean_cls: 2.275265, grad_norm: 2.439922 +Steps: 0%| | 1661/1000000 [06:50<67:26:26, 4.11it/s, grad_norm=2.44, loss_final=2.88, loss_mean=0.96, loss_mean_cls=2.28, proj_loss=-0.36][2026-03-23 13:43:10] Step: 1661, Training Logs: loss_final: 3.028481, loss_mean: 0.952766, proj_loss: -0.360399, loss_mean_cls: 2.436114, grad_norm: 5.935452 +Steps: 0%| | 1662/1000000 [06:50<67:25:57, 4.11it/s, grad_norm=5.94, loss_final=3.03, loss_mean=0.953, loss_mean_cls=2.44, proj_loss=-0.36][2026-03-23 13:43:10] Step: 1662, Training Logs: loss_final: 3.324552, loss_mean: 0.931466, proj_loss: -0.365178, loss_mean_cls: 2.758264, grad_norm: 3.111835 +Steps: 0%| | 1663/1000000 [06:51<67:25:43, 4.11it/s, grad_norm=3.11, loss_final=3.32, loss_mean=0.931, loss_mean_cls=2.76, proj_loss=-0.365][2026-03-23 13:43:11] Step: 1663, Training Logs: loss_final: 2.819025, loss_mean: 0.950173, proj_loss: -0.366044, loss_mean_cls: 2.234897, grad_norm: 7.933424 +Steps: 0%| | 1664/1000000 [06:51<67:27:09, 4.11it/s, grad_norm=7.93, loss_final=2.82, loss_mean=0.95, loss_mean_cls=2.23, proj_loss=-0.366][2026-03-23 13:43:11] Step: 1664, Training Logs: loss_final: 2.751657, loss_mean: 0.976253, proj_loss: -0.356495, loss_mean_cls: 2.131899, grad_norm: 5.323426 +Steps: 0%| | 1665/1000000 [06:51<67:27:55, 4.11it/s, grad_norm=5.32, loss_final=2.75, loss_mean=0.976, loss_mean_cls=2.13, proj_loss=-0.356][2026-03-23 13:43:11] Step: 1665, Training Logs: loss_final: 2.611905, loss_mean: 0.967293, proj_loss: -0.365927, loss_mean_cls: 2.010539, grad_norm: 10.417037 +Steps: 0%| | 1666/1000000 [06:51<67:27:19, 4.11it/s, grad_norm=10.4, loss_final=2.61, loss_mean=0.967, loss_mean_cls=2.01, proj_loss=-0.366][2026-03-23 13:43:11] Step: 1666, Training Logs: loss_final: 3.209638, loss_mean: 0.961078, proj_loss: -0.359953, loss_mean_cls: 
2.608512, grad_norm: 8.244787 +Steps: 0%| | 1667/1000000 [06:52<67:27:18, 4.11it/s, grad_norm=8.24, loss_final=3.21, loss_mean=0.961, loss_mean_cls=2.61, proj_loss=-0.36][2026-03-23 13:43:12] Step: 1667, Training Logs: loss_final: 2.979340, loss_mean: 0.950770, proj_loss: -0.362941, loss_mean_cls: 2.391511, grad_norm: 13.983011 +Steps: 0%| | 1668/1000000 [06:52<67:28:24, 4.11it/s, grad_norm=14, loss_final=2.98, loss_mean=0.951, loss_mean_cls=2.39, proj_loss=-0.363][2026-03-23 13:43:12] Step: 1668, Training Logs: loss_final: 2.981056, loss_mean: 0.970780, proj_loss: -0.359578, loss_mean_cls: 2.369855, grad_norm: 12.739525 +Steps: 0%| | 1669/1000000 [06:52<67:28:04, 4.11it/s, grad_norm=12.7, loss_final=2.98, loss_mean=0.971, loss_mean_cls=2.37, proj_loss=-0.36][2026-03-23 13:43:12] Step: 1669, Training Logs: loss_final: 2.716476, loss_mean: 0.955680, proj_loss: -0.361627, loss_mean_cls: 2.122423, grad_norm: 5.043996 +Steps: 0%| | 1670/1000000 [06:52<67:27:38, 4.11it/s, grad_norm=5.04, loss_final=2.72, loss_mean=0.956, loss_mean_cls=2.12, proj_loss=-0.362][2026-03-23 13:43:12] Step: 1670, Training Logs: loss_final: 2.780652, loss_mean: 0.963210, proj_loss: -0.361605, loss_mean_cls: 2.179046, grad_norm: 8.801538 +Steps: 0%| | 1671/1000000 [06:53<67:26:50, 4.11it/s, grad_norm=8.8, loss_final=2.78, loss_mean=0.963, loss_mean_cls=2.18, proj_loss=-0.362][2026-03-23 13:43:12] Step: 1671, Training Logs: loss_final: 3.045213, loss_mean: 0.973161, proj_loss: -0.364941, loss_mean_cls: 2.436992, grad_norm: 13.397303 +Steps: 0%| | 1672/1000000 [06:53<67:27:19, 4.11it/s, grad_norm=13.4, loss_final=3.05, loss_mean=0.973, loss_mean_cls=2.44, proj_loss=-0.365][2026-03-23 13:43:13] Step: 1672, Training Logs: loss_final: 3.061067, loss_mean: 0.973281, proj_loss: -0.360185, loss_mean_cls: 2.447970, grad_norm: 17.563612 +Steps: 0%| | 1673/1000000 [06:53<67:27:51, 4.11it/s, grad_norm=17.6, loss_final=3.06, loss_mean=0.973, loss_mean_cls=2.45, proj_loss=-0.36][2026-03-23 13:43:13] Step: 1673, Training Logs: loss_final: 2.808118, loss_mean: 0.970840, proj_loss: -0.358397, loss_mean_cls: 2.195675, grad_norm: 6.439587 +Steps: 0%| | 1674/1000000 [06:53<67:27:59, 4.11it/s, grad_norm=6.44, loss_final=2.81, loss_mean=0.971, loss_mean_cls=2.2, proj_loss=-0.358][2026-03-23 13:43:13] Step: 1674, Training Logs: loss_final: 3.088176, loss_mean: 0.959057, proj_loss: -0.357307, loss_mean_cls: 2.486427, grad_norm: 8.298572 +Steps: 0%| | 1675/1000000 [06:54<67:26:36, 4.11it/s, grad_norm=8.3, loss_final=3.09, loss_mean=0.959, loss_mean_cls=2.49, proj_loss=-0.357][2026-03-23 13:43:13] Step: 1675, Training Logs: loss_final: 3.556121, loss_mean: 0.958311, proj_loss: -0.354446, loss_mean_cls: 2.952256, grad_norm: 10.209874 +Steps: 0%| | 1676/1000000 [06:54<67:28:22, 4.11it/s, grad_norm=10.2, loss_final=3.56, loss_mean=0.958, loss_mean_cls=2.95, proj_loss=-0.354][2026-03-23 13:43:14] Step: 1676, Training Logs: loss_final: 2.945159, loss_mean: 0.949226, proj_loss: -0.361359, loss_mean_cls: 2.357293, grad_norm: 5.569319 +Steps: 0%| | 1677/1000000 [06:54<67:30:46, 4.11it/s, grad_norm=5.57, loss_final=2.95, loss_mean=0.949, loss_mean_cls=2.36, proj_loss=-0.361][2026-03-23 13:43:14] Step: 1677, Training Logs: loss_final: 3.205383, loss_mean: 0.962998, proj_loss: -0.365981, loss_mean_cls: 2.608366, grad_norm: 16.204477 +Steps: 0%| | 1678/1000000 [06:54<67:30:28, 4.11it/s, grad_norm=16.2, loss_final=3.21, loss_mean=0.963, loss_mean_cls=2.61, proj_loss=-0.366][2026-03-23 13:43:14] Step: 1678, Training Logs: loss_final: 3.255130, loss_mean: 
0.943636, proj_loss: -0.358450, loss_mean_cls: 2.669945, grad_norm: 9.231002 +Steps: 0%| | 1679/1000000 [06:54<67:30:22, 4.11it/s, grad_norm=9.23, loss_final=3.26, loss_mean=0.944, loss_mean_cls=2.67, proj_loss=-0.358][2026-03-23 13:43:14] Step: 1679, Training Logs: loss_final: 2.851515, loss_mean: 1.001187, proj_loss: -0.364725, loss_mean_cls: 2.215052, grad_norm: 9.597744 +Steps: 0%| | 1680/1000000 [06:55<67:31:20, 4.11it/s, grad_norm=9.6, loss_final=2.85, loss_mean=1, loss_mean_cls=2.22, proj_loss=-0.365][2026-03-23 13:43:15] Step: 1680, Training Logs: loss_final: 2.847615, loss_mean: 0.969202, proj_loss: -0.362517, loss_mean_cls: 2.240930, grad_norm: 12.948436 +Steps: 0%| | 1681/1000000 [06:55<67:32:21, 4.11it/s, grad_norm=12.9, loss_final=2.85, loss_mean=0.969, loss_mean_cls=2.24, proj_loss=-0.363][2026-03-23 13:43:15] Step: 1681, Training Logs: loss_final: 2.967904, loss_mean: 0.966383, proj_loss: -0.358251, loss_mean_cls: 2.359771, grad_norm: 2.785384 +Steps: 0%| | 1682/1000000 [06:55<67:32:05, 4.11it/s, grad_norm=2.79, loss_final=2.97, loss_mean=0.966, loss_mean_cls=2.36, proj_loss=-0.358][2026-03-23 13:43:15] Step: 1682, Training Logs: loss_final: 3.254808, loss_mean: 0.926776, proj_loss: -0.363349, loss_mean_cls: 2.691382, grad_norm: 12.858184 +Steps: 0%| | 1683/1000000 [06:55<67:30:45, 4.11it/s, grad_norm=12.9, loss_final=3.25, loss_mean=0.927, loss_mean_cls=2.69, proj_loss=-0.363][2026-03-23 13:43:15] Step: 1683, Training Logs: loss_final: 3.492239, loss_mean: 0.948739, proj_loss: -0.354103, loss_mean_cls: 2.897603, grad_norm: 8.200516 +Steps: 0%| | 1684/1000000 [06:56<67:28:22, 4.11it/s, grad_norm=8.2, loss_final=3.49, loss_mean=0.949, loss_mean_cls=2.9, proj_loss=-0.354][2026-03-23 13:43:16] Step: 1684, Training Logs: loss_final: 2.761560, loss_mean: 0.974523, proj_loss: -0.361199, loss_mean_cls: 2.148236, grad_norm: 5.452947 +Steps: 0%| | 1685/1000000 [06:56<67:27:34, 4.11it/s, grad_norm=5.45, loss_final=2.76, loss_mean=0.975, loss_mean_cls=2.15, proj_loss=-0.361][2026-03-23 13:43:16] Step: 1685, Training Logs: loss_final: 3.034948, loss_mean: 0.954867, proj_loss: -0.361077, loss_mean_cls: 2.441159, grad_norm: 7.241168 +Steps: 0%| | 1686/1000000 [06:56<67:27:20, 4.11it/s, grad_norm=7.24, loss_final=3.03, loss_mean=0.955, loss_mean_cls=2.44, proj_loss=-0.361][2026-03-23 13:43:16] Step: 1686, Training Logs: loss_final: 3.278301, loss_mean: 0.951395, proj_loss: -0.355571, loss_mean_cls: 2.682478, grad_norm: 8.015994 +Steps: 0%| | 1687/1000000 [06:56<67:29:29, 4.11it/s, grad_norm=8.02, loss_final=3.28, loss_mean=0.951, loss_mean_cls=2.68, proj_loss=-0.356][2026-03-23 13:43:16] Step: 1687, Training Logs: loss_final: 2.946612, loss_mean: 0.949756, proj_loss: -0.360481, loss_mean_cls: 2.357337, grad_norm: 5.572026 +Steps: 0%| | 1688/1000000 [06:57<67:28:57, 4.11it/s, grad_norm=5.57, loss_final=2.95, loss_mean=0.95, loss_mean_cls=2.36, proj_loss=-0.36][2026-03-23 13:43:17] Step: 1688, Training Logs: loss_final: 2.918301, loss_mean: 0.957365, proj_loss: -0.361502, loss_mean_cls: 2.322438, grad_norm: 8.012796 +Steps: 0%| | 1689/1000000 [06:57<67:29:20, 4.11it/s, grad_norm=8.01, loss_final=2.92, loss_mean=0.957, loss_mean_cls=2.32, proj_loss=-0.362][2026-03-23 13:43:17] Step: 1689, Training Logs: loss_final: 2.716850, loss_mean: 0.948071, proj_loss: -0.367769, loss_mean_cls: 2.136549, grad_norm: 5.073623 +Steps: 0%| | 1690/1000000 [06:57<67:27:33, 4.11it/s, grad_norm=5.07, loss_final=2.72, loss_mean=0.948, loss_mean_cls=2.14, proj_loss=-0.368][2026-03-23 13:43:17] Step: 1690, Training 
Logs: loss_final: 2.809314, loss_mean: 0.955184, proj_loss: -0.369614, loss_mean_cls: 2.223744, grad_norm: 14.499409 +Steps: 0%| | 1691/1000000 [06:57<67:27:33, 4.11it/s, grad_norm=14.5, loss_final=2.81, loss_mean=0.955, loss_mean_cls=2.22, proj_loss=-0.37][2026-03-23 13:43:17] Step: 1691, Training Logs: loss_final: 3.017796, loss_mean: 0.975655, proj_loss: -0.360661, loss_mean_cls: 2.402802, grad_norm: 13.461200 +Steps: 0%| | 1692/1000000 [06:58<67:28:04, 4.11it/s, grad_norm=13.5, loss_final=3.02, loss_mean=0.976, loss_mean_cls=2.4, proj_loss=-0.361][2026-03-23 13:43:18] Step: 1692, Training Logs: loss_final: 2.833251, loss_mean: 0.983673, proj_loss: -0.361548, loss_mean_cls: 2.211126, grad_norm: 7.818724 +Steps: 0%| | 1693/1000000 [06:58<67:28:01, 4.11it/s, grad_norm=7.82, loss_final=2.83, loss_mean=0.984, loss_mean_cls=2.21, proj_loss=-0.362][2026-03-23 13:43:18] Step: 1693, Training Logs: loss_final: 2.944885, loss_mean: 0.949438, proj_loss: -0.363607, loss_mean_cls: 2.359054, grad_norm: 16.054787 +Steps: 0%| | 1694/1000000 [06:58<67:28:03, 4.11it/s, grad_norm=16.1, loss_final=2.94, loss_mean=0.949, loss_mean_cls=2.36, proj_loss=-0.364][2026-03-23 13:43:18] Step: 1694, Training Logs: loss_final: 2.683748, loss_mean: 0.950614, proj_loss: -0.367426, loss_mean_cls: 2.100559, grad_norm: 8.430010 +Steps: 0%| | 1695/1000000 [06:58<67:27:21, 4.11it/s, grad_norm=8.43, loss_final=2.68, loss_mean=0.951, loss_mean_cls=2.1, proj_loss=-0.367][2026-03-23 13:43:18] Step: 1695, Training Logs: loss_final: 3.111175, loss_mean: 0.972769, proj_loss: -0.360149, loss_mean_cls: 2.498556, grad_norm: 9.268894 +Steps: 0%| | 1696/1000000 [06:59<67:28:35, 4.11it/s, grad_norm=9.27, loss_final=3.11, loss_mean=0.973, loss_mean_cls=2.5, proj_loss=-0.36][2026-03-23 13:43:19] Step: 1696, Training Logs: loss_final: 2.717452, loss_mean: 0.965674, proj_loss: -0.362322, loss_mean_cls: 2.114100, grad_norm: 7.556376 +Steps: 0%| | 1697/1000000 [06:59<67:29:15, 4.11it/s, grad_norm=7.56, loss_final=2.72, loss_mean=0.966, loss_mean_cls=2.11, proj_loss=-0.362][2026-03-23 13:43:19] Step: 1697, Training Logs: loss_final: 2.737548, loss_mean: 0.955141, proj_loss: -0.364439, loss_mean_cls: 2.146846, grad_norm: 7.063430 +Steps: 0%| | 1698/1000000 [06:59<67:28:24, 4.11it/s, grad_norm=7.06, loss_final=2.74, loss_mean=0.955, loss_mean_cls=2.15, proj_loss=-0.364][2026-03-23 13:43:19] Step: 1698, Training Logs: loss_final: 3.007811, loss_mean: 0.937317, proj_loss: -0.357422, loss_mean_cls: 2.427915, grad_norm: 7.748862 +Steps: 0%| | 1699/1000000 [06:59<67:30:44, 4.11it/s, grad_norm=7.75, loss_final=3.01, loss_mean=0.937, loss_mean_cls=2.43, proj_loss=-0.357][2026-03-23 13:43:19] Step: 1699, Training Logs: loss_final: 3.268993, loss_mean: 0.943307, proj_loss: -0.355855, loss_mean_cls: 2.681541, grad_norm: 5.718846 +Steps: 0%| | 1700/1000000 [07:00<67:32:12, 4.11it/s, grad_norm=5.72, loss_final=3.27, loss_mean=0.943, loss_mean_cls=2.68, proj_loss=-0.356][2026-03-23 13:43:20] Step: 1700, Training Logs: loss_final: 2.709513, loss_mean: 0.947674, proj_loss: -0.368511, loss_mean_cls: 2.130351, grad_norm: 9.408784 +Steps: 0%| | 1701/1000000 [07:00<67:30:08, 4.11it/s, grad_norm=9.41, loss_final=2.71, loss_mean=0.948, loss_mean_cls=2.13, proj_loss=-0.369][2026-03-23 13:43:20] Step: 1701, Training Logs: loss_final: 2.689393, loss_mean: 0.953789, proj_loss: -0.368827, loss_mean_cls: 2.104431, grad_norm: 3.418407 +Steps: 0%| | 1702/1000000 [07:00<67:28:54, 4.11it/s, grad_norm=3.42, loss_final=2.69, loss_mean=0.954, loss_mean_cls=2.1, 
proj_loss=-0.369][2026-03-23 13:43:20] Step: 1702, Training Logs: loss_final: 3.142791, loss_mean: 0.962432, proj_loss: -0.364784, loss_mean_cls: 2.545143, grad_norm: 11.656523 +Steps: 0%| | 1703/1000000 [07:00<67:29:20, 4.11it/s, grad_norm=11.7, loss_final=3.14, loss_mean=0.962, loss_mean_cls=2.55, proj_loss=-0.365][2026-03-23 13:43:20] Step: 1703, Training Logs: loss_final: 3.387047, loss_mean: 0.944052, proj_loss: -0.359080, loss_mean_cls: 2.802074, grad_norm: 16.415461 +Steps: 0%| | 1704/1000000 [07:01<67:28:25, 4.11it/s, grad_norm=16.4, loss_final=3.39, loss_mean=0.944, loss_mean_cls=2.8, proj_loss=-0.359][2026-03-23 13:43:21] Step: 1704, Training Logs: loss_final: 3.377972, loss_mean: 0.956666, proj_loss: -0.363795, loss_mean_cls: 2.785100, grad_norm: 7.928507 +Steps: 0%| | 1705/1000000 [07:01<67:28:26, 4.11it/s, grad_norm=7.93, loss_final=3.38, loss_mean=0.957, loss_mean_cls=2.79, proj_loss=-0.364][2026-03-23 13:43:21] Step: 1705, Training Logs: loss_final: 3.053576, loss_mean: 0.949594, proj_loss: -0.365912, loss_mean_cls: 2.469894, grad_norm: 8.823053 +Steps: 0%| | 1706/1000000 [07:01<67:27:52, 4.11it/s, grad_norm=8.82, loss_final=3.05, loss_mean=0.95, loss_mean_cls=2.47, proj_loss=-0.366][2026-03-23 13:43:21] Step: 1706, Training Logs: loss_final: 3.171936, loss_mean: 0.951909, proj_loss: -0.360778, loss_mean_cls: 2.580805, grad_norm: 16.923975 +Steps: 0%| | 1707/1000000 [07:01<67:30:14, 4.11it/s, grad_norm=16.9, loss_final=3.17, loss_mean=0.952, loss_mean_cls=2.58, proj_loss=-0.361][2026-03-23 13:43:21] Step: 1707, Training Logs: loss_final: 3.078893, loss_mean: 0.972743, proj_loss: -0.366427, loss_mean_cls: 2.472577, grad_norm: 10.382402 +Steps: 0%| | 1708/1000000 [07:02<67:30:30, 4.11it/s, grad_norm=10.4, loss_final=3.08, loss_mean=0.973, loss_mean_cls=2.47, proj_loss=-0.366][2026-03-23 13:43:22] Step: 1708, Training Logs: loss_final: 2.932865, loss_mean: 0.966162, proj_loss: -0.364784, loss_mean_cls: 2.331486, grad_norm: 4.417000 +Steps: 0%| | 1709/1000000 [07:02<67:41:15, 4.10it/s, grad_norm=4.42, loss_final=2.93, loss_mean=0.966, loss_mean_cls=2.33, proj_loss=-0.365][2026-03-23 13:43:22] Step: 1709, Training Logs: loss_final: 3.012322, loss_mean: 0.964326, proj_loss: -0.367173, loss_mean_cls: 2.415168, grad_norm: 8.749293 +Steps: 0%| | 1710/1000000 [07:02<67:38:33, 4.10it/s, grad_norm=8.75, loss_final=3.01, loss_mean=0.964, loss_mean_cls=2.42, proj_loss=-0.367][2026-03-23 13:43:22] Step: 1710, Training Logs: loss_final: 3.004860, loss_mean: 0.976483, proj_loss: -0.369578, loss_mean_cls: 2.397954, grad_norm: 17.247904 +Steps: 0%| | 1711/1000000 [07:02<67:36:26, 4.10it/s, grad_norm=17.2, loss_final=3, loss_mean=0.976, loss_mean_cls=2.4, proj_loss=-0.37][2026-03-23 13:43:22] Step: 1711, Training Logs: loss_final: 3.036135, loss_mean: 0.975792, proj_loss: -0.367631, loss_mean_cls: 2.427974, grad_norm: 14.285152 +Steps: 0%| | 1712/1000000 [07:03<67:37:29, 4.10it/s, grad_norm=14.3, loss_final=3.04, loss_mean=0.976, loss_mean_cls=2.43, proj_loss=-0.368][2026-03-23 13:43:22] Step: 1712, Training Logs: loss_final: 2.982620, loss_mean: 0.974353, proj_loss: -0.358214, loss_mean_cls: 2.366481, grad_norm: 8.110920 +Steps: 0%| | 1713/1000000 [07:03<67:32:12, 4.11it/s, grad_norm=8.11, loss_final=2.98, loss_mean=0.974, loss_mean_cls=2.37, proj_loss=-0.358][2026-03-23 13:43:23] Step: 1713, Training Logs: loss_final: 2.841171, loss_mean: 0.927926, proj_loss: -0.367383, loss_mean_cls: 2.280629, grad_norm: 14.047326 +Steps: 0%| | 1714/1000000 [07:03<67:31:32, 4.11it/s, grad_norm=14, 
[... raw tqdm progress output elided ...]
[2026-03-23 13:43:23] Step: 1714, Training Logs: loss_final: 3.022913, loss_mean: 0.994910, proj_loss: -0.365964, loss_mean_cls: 2.393966, grad_norm: 8.424021
[... console output for steps 1715-1949 elided; each step repeats the same tqdm progress line plus one "Training Logs" entry. Over this window throughput held at ~4.1 it/s (one dip to 2.8 it/s at step 1933), with loss_final 2.35-3.98, loss_mean 0.91-1.01, loss_mean_cls 1.75-3.41, proj_loss -0.384 to -0.352, and grad_norm 2.2-24.6 ...]
[2026-03-23 13:44:21] Step: 1950, Training Logs: loss_final: 3.091634, loss_mean: 0.940419, proj_loss: -0.380275, loss_mean_cls: 2.531490, grad_norm: 5.977802
[08:01<67:29:19, 4.11it/s, grad_norm=5.98, loss_final=3.09, loss_mean=0.94, loss_mean_cls=2.53, proj_loss=-0.38][2026-03-23 13:44:21] Step: 1951, Training Logs: loss_final: 3.281027, loss_mean: 0.931753, proj_loss: -0.374461, loss_mean_cls: 2.723735, grad_norm: 15.153315 +Steps: 0%| | 1952/1000000 [08:01<67:30:58, 4.11it/s, grad_norm=15.2, loss_final=3.28, loss_mean=0.932, loss_mean_cls=2.72, proj_loss=-0.374][2026-03-23 13:44:21] Step: 1952, Training Logs: loss_final: 3.255087, loss_mean: 0.949698, proj_loss: -0.381996, loss_mean_cls: 2.687386, grad_norm: 21.051344 +Steps: 0%| | 1953/1000000 [08:02<67:30:05, 4.11it/s, grad_norm=21.1, loss_final=3.26, loss_mean=0.95, loss_mean_cls=2.69, proj_loss=-0.382][2026-03-23 13:44:22] Step: 1953, Training Logs: loss_final: 3.076647, loss_mean: 0.972709, proj_loss: -0.372384, loss_mean_cls: 2.476322, grad_norm: 13.338116 +Steps: 0%| | 1954/1000000 [08:02<67:28:48, 4.11it/s, grad_norm=13.3, loss_final=3.08, loss_mean=0.973, loss_mean_cls=2.48, proj_loss=-0.372][2026-03-23 13:44:22] Step: 1954, Training Logs: loss_final: 3.106065, loss_mean: 0.956952, proj_loss: -0.373667, loss_mean_cls: 2.522781, grad_norm: 18.507114 +Steps: 0%| | 1955/1000000 [08:02<67:29:57, 4.11it/s, grad_norm=18.5, loss_final=3.11, loss_mean=0.957, loss_mean_cls=2.52, proj_loss=-0.374][2026-03-23 13:44:22] Step: 1955, Training Logs: loss_final: 2.747687, loss_mean: 0.950026, proj_loss: -0.379415, loss_mean_cls: 2.177076, grad_norm: 6.623316 +Steps: 0%| | 1956/1000000 [08:02<67:40:55, 4.10it/s, grad_norm=6.62, loss_final=2.75, loss_mean=0.95, loss_mean_cls=2.18, proj_loss=-0.379][2026-03-23 13:44:22] Step: 1956, Training Logs: loss_final: 2.741893, loss_mean: 0.943013, proj_loss: -0.382234, loss_mean_cls: 2.181115, grad_norm: 17.882029 +Steps: 0%| | 1957/1000000 [08:03<67:37:04, 4.10it/s, grad_norm=17.9, loss_final=2.74, loss_mean=0.943, loss_mean_cls=2.18, proj_loss=-0.382][2026-03-23 13:44:23] Step: 1957, Training Logs: loss_final: 2.810370, loss_mean: 0.930521, proj_loss: -0.383254, loss_mean_cls: 2.263103, grad_norm: 11.446910 +Steps: 0%| | 1958/1000000 [08:03<67:35:02, 4.10it/s, grad_norm=11.4, loss_final=2.81, loss_mean=0.931, loss_mean_cls=2.26, proj_loss=-0.383][2026-03-23 13:44:23] Step: 1958, Training Logs: loss_final: 3.149274, loss_mean: 0.942048, proj_loss: -0.371278, loss_mean_cls: 2.578504, grad_norm: 3.969868 +Steps: 0%| | 1959/1000000 [08:03<67:33:05, 4.10it/s, grad_norm=3.97, loss_final=3.15, loss_mean=0.942, loss_mean_cls=2.58, proj_loss=-0.371][2026-03-23 13:44:23] Step: 1959, Training Logs: loss_final: 2.777044, loss_mean: 0.953383, proj_loss: -0.381980, loss_mean_cls: 2.205641, grad_norm: 4.470694 +Steps: 0%| | 1960/1000000 [08:03<67:33:24, 4.10it/s, grad_norm=4.47, loss_final=2.78, loss_mean=0.953, loss_mean_cls=2.21, proj_loss=-0.382][2026-03-23 13:44:23] Step: 1960, Training Logs: loss_final: 2.757701, loss_mean: 0.956774, proj_loss: -0.379391, loss_mean_cls: 2.180318, grad_norm: 9.462005 +Steps: 0%| | 1961/1000000 [08:04<67:31:21, 4.11it/s, grad_norm=9.46, loss_final=2.76, loss_mean=0.957, loss_mean_cls=2.18, proj_loss=-0.379][2026-03-23 13:44:24] Step: 1961, Training Logs: loss_final: 3.111762, loss_mean: 0.947891, proj_loss: -0.373579, loss_mean_cls: 2.537450, grad_norm: 14.643014 +Steps: 0%| | 1962/1000000 [08:04<67:28:34, 4.11it/s, grad_norm=14.6, loss_final=3.11, loss_mean=0.948, loss_mean_cls=2.54, proj_loss=-0.374][2026-03-23 13:44:24] Step: 1962, Training Logs: loss_final: 3.063232, loss_mean: 0.970375, proj_loss: -0.365830, loss_mean_cls: 
2.458687, grad_norm: 3.251167 +Steps: 0%| | 1963/1000000 [08:04<67:28:50, 4.11it/s, grad_norm=3.25, loss_final=3.06, loss_mean=0.97, loss_mean_cls=2.46, proj_loss=-0.366][2026-03-23 13:44:24] Step: 1963, Training Logs: loss_final: 3.137165, loss_mean: 0.941708, proj_loss: -0.377047, loss_mean_cls: 2.572504, grad_norm: 9.672579 +Steps: 0%| | 1964/1000000 [08:04<67:28:46, 4.11it/s, grad_norm=9.67, loss_final=3.14, loss_mean=0.942, loss_mean_cls=2.57, proj_loss=-0.377][2026-03-23 13:44:24] Step: 1964, Training Logs: loss_final: 2.514076, loss_mean: 0.984458, proj_loss: -0.384083, loss_mean_cls: 1.913700, grad_norm: 11.440967 +Steps: 0%| | 1965/1000000 [08:05<67:29:22, 4.11it/s, grad_norm=11.4, loss_final=2.51, loss_mean=0.984, loss_mean_cls=1.91, proj_loss=-0.384][2026-03-23 13:44:25] Step: 1965, Training Logs: loss_final: 3.309635, loss_mean: 0.944141, proj_loss: -0.373624, loss_mean_cls: 2.739118, grad_norm: 10.157703 +Steps: 0%| | 1966/1000000 [08:05<67:29:55, 4.11it/s, grad_norm=10.2, loss_final=3.31, loss_mean=0.944, loss_mean_cls=2.74, proj_loss=-0.374][2026-03-23 13:44:25] Step: 1966, Training Logs: loss_final: 3.274650, loss_mean: 0.968959, proj_loss: -0.363262, loss_mean_cls: 2.668953, grad_norm: 13.588305 +Steps: 0%| | 1967/1000000 [08:05<67:28:33, 4.11it/s, grad_norm=13.6, loss_final=3.27, loss_mean=0.969, loss_mean_cls=2.67, proj_loss=-0.363][2026-03-23 13:44:25] Step: 1967, Training Logs: loss_final: 2.833830, loss_mean: 0.960724, proj_loss: -0.373155, loss_mean_cls: 2.246261, grad_norm: 10.148376 +Steps: 0%| | 1968/1000000 [08:05<67:28:03, 4.11it/s, grad_norm=10.1, loss_final=2.83, loss_mean=0.961, loss_mean_cls=2.25, proj_loss=-0.373][2026-03-23 13:44:25] Step: 1968, Training Logs: loss_final: 3.101931, loss_mean: 0.964882, proj_loss: -0.375939, loss_mean_cls: 2.512988, grad_norm: 13.800985 +Steps: 0%| | 1969/1000000 [08:06<67:27:22, 4.11it/s, grad_norm=13.8, loss_final=3.1, loss_mean=0.965, loss_mean_cls=2.51, proj_loss=-0.376][2026-03-23 13:44:26] Step: 1969, Training Logs: loss_final: 3.330801, loss_mean: 0.929770, proj_loss: -0.376870, loss_mean_cls: 2.777901, grad_norm: 15.732014 +Steps: 0%| | 1970/1000000 [08:06<67:26:01, 4.11it/s, grad_norm=15.7, loss_final=3.33, loss_mean=0.93, loss_mean_cls=2.78, proj_loss=-0.377][2026-03-23 13:44:26] Step: 1970, Training Logs: loss_final: 2.599960, loss_mean: 0.968857, proj_loss: -0.376803, loss_mean_cls: 2.007906, grad_norm: 12.996893 +Steps: 0%| | 1971/1000000 [08:06<67:26:10, 4.11it/s, grad_norm=13, loss_final=2.6, loss_mean=0.969, loss_mean_cls=2.01, proj_loss=-0.377][2026-03-23 13:44:26] Step: 1971, Training Logs: loss_final: 3.451234, loss_mean: 0.963872, proj_loss: -0.366345, loss_mean_cls: 2.853708, grad_norm: 20.050049 +Steps: 0%| | 1972/1000000 [08:06<67:26:49, 4.11it/s, grad_norm=20.1, loss_final=3.45, loss_mean=0.964, loss_mean_cls=2.85, proj_loss=-0.366][2026-03-23 13:44:26] Step: 1972, Training Logs: loss_final: 2.737495, loss_mean: 0.971467, proj_loss: -0.379511, loss_mean_cls: 2.145539, grad_norm: 11.309019 +Steps: 0%| | 1973/1000000 [08:07<67:25:20, 4.11it/s, grad_norm=11.3, loss_final=2.74, loss_mean=0.971, loss_mean_cls=2.15, proj_loss=-0.38][2026-03-23 13:44:26] Step: 1973, Training Logs: loss_final: 3.074771, loss_mean: 0.977278, proj_loss: -0.370747, loss_mean_cls: 2.468240, grad_norm: 10.257510 +Steps: 0%| | 1974/1000000 [08:07<67:25:46, 4.11it/s, grad_norm=10.3, loss_final=3.07, loss_mean=0.977, loss_mean_cls=2.47, proj_loss=-0.371][2026-03-23 13:44:27] Step: 1974, Training Logs: loss_final: 2.698503, 
loss_mean: 0.966370, proj_loss: -0.382145, loss_mean_cls: 2.114277, grad_norm: 4.016724 +Steps: 0%| | 1975/1000000 [08:07<67:26:23, 4.11it/s, grad_norm=4.02, loss_final=2.7, loss_mean=0.966, loss_mean_cls=2.11, proj_loss=-0.382][2026-03-23 13:44:27] Step: 1975, Training Logs: loss_final: 2.862114, loss_mean: 0.949540, proj_loss: -0.383226, loss_mean_cls: 2.295801, grad_norm: 12.985388 +Steps: 0%| | 1976/1000000 [08:07<67:27:39, 4.11it/s, grad_norm=13, loss_final=2.86, loss_mean=0.95, loss_mean_cls=2.3, proj_loss=-0.383][2026-03-23 13:44:27] Step: 1976, Training Logs: loss_final: 3.056423, loss_mean: 0.952127, proj_loss: -0.373201, loss_mean_cls: 2.477497, grad_norm: 5.648325 +Steps: 0%| | 1977/1000000 [08:08<67:27:36, 4.11it/s, grad_norm=5.65, loss_final=3.06, loss_mean=0.952, loss_mean_cls=2.48, proj_loss=-0.373][2026-03-23 13:44:27] Step: 1977, Training Logs: loss_final: 2.702626, loss_mean: 0.959446, proj_loss: -0.375090, loss_mean_cls: 2.118270, grad_norm: 4.255664 +Steps: 0%| | 1978/1000000 [08:08<67:28:03, 4.11it/s, grad_norm=4.26, loss_final=2.7, loss_mean=0.959, loss_mean_cls=2.12, proj_loss=-0.375][2026-03-23 13:44:28] Step: 1978, Training Logs: loss_final: 3.217118, loss_mean: 0.922517, proj_loss: -0.377656, loss_mean_cls: 2.672256, grad_norm: 5.657108 +Steps: 0%| | 1979/1000000 [08:08<67:28:17, 4.11it/s, grad_norm=5.66, loss_final=3.22, loss_mean=0.923, loss_mean_cls=2.67, proj_loss=-0.378][2026-03-23 13:44:28] Step: 1979, Training Logs: loss_final: 3.044533, loss_mean: 0.927266, proj_loss: -0.374988, loss_mean_cls: 2.492255, grad_norm: 3.573210 +Steps: 0%| | 1980/1000000 [08:08<67:32:10, 4.10it/s, grad_norm=3.57, loss_final=3.04, loss_mean=0.927, loss_mean_cls=2.49, proj_loss=-0.375][2026-03-23 13:44:28] Step: 1980, Training Logs: loss_final: 2.997291, loss_mean: 0.959268, proj_loss: -0.372906, loss_mean_cls: 2.410928, grad_norm: 5.910650 +Steps: 0%| | 1981/1000000 [08:08<68:37:33, 4.04it/s, grad_norm=5.91, loss_final=3, loss_mean=0.959, loss_mean_cls=2.41, proj_loss=-0.373][2026-03-23 13:44:28] Step: 1981, Training Logs: loss_final: 3.171605, loss_mean: 0.954134, proj_loss: -0.372623, loss_mean_cls: 2.590094, grad_norm: 2.917690 +Steps: 0%| | 1982/1000000 [08:09<68:17:30, 4.06it/s, grad_norm=2.92, loss_final=3.17, loss_mean=0.954, loss_mean_cls=2.59, proj_loss=-0.373][2026-03-23 13:44:29] Step: 1982, Training Logs: loss_final: 2.547610, loss_mean: 0.951893, proj_loss: -0.383096, loss_mean_cls: 1.978814, grad_norm: 7.741107 +Steps: 0%| | 1983/1000000 [08:09<68:01:22, 4.08it/s, grad_norm=7.74, loss_final=2.55, loss_mean=0.952, loss_mean_cls=1.98, proj_loss=-0.383][2026-03-23 13:44:29] Step: 1983, Training Logs: loss_final: 2.997611, loss_mean: 0.984999, proj_loss: -0.376446, loss_mean_cls: 2.389057, grad_norm: 4.772802 +Steps: 0%| | 1984/1000000 [08:09<67:56:48, 4.08it/s, grad_norm=4.77, loss_final=3, loss_mean=0.985, loss_mean_cls=2.39, proj_loss=-0.376][2026-03-23 13:44:29] Step: 1984, Training Logs: loss_final: 3.380662, loss_mean: 0.928479, proj_loss: -0.375063, loss_mean_cls: 2.827247, grad_norm: 4.841754 +Steps: 0%| | 1985/1000000 [08:09<67:46:27, 4.09it/s, grad_norm=4.84, loss_final=3.38, loss_mean=0.928, loss_mean_cls=2.83, proj_loss=-0.375][2026-03-23 13:44:29] Step: 1985, Training Logs: loss_final: 2.558168, loss_mean: 0.954653, proj_loss: -0.378030, loss_mean_cls: 1.981546, grad_norm: 8.151909 +Steps: 0%| | 1986/1000000 [08:10<67:43:44, 4.09it/s, grad_norm=8.15, loss_final=2.56, loss_mean=0.955, loss_mean_cls=1.98, proj_loss=-0.378][2026-03-23 13:44:30] Step: 1986, 
Training Logs: loss_final: 2.979116, loss_mean: 0.949691, proj_loss: -0.372847, loss_mean_cls: 2.402272, grad_norm: 6.440948 +Steps: 0%| | 1987/1000000 [08:10<67:38:32, 4.10it/s, grad_norm=6.44, loss_final=2.98, loss_mean=0.95, loss_mean_cls=2.4, proj_loss=-0.373][2026-03-23 13:44:30] Step: 1987, Training Logs: loss_final: 2.323005, loss_mean: 0.959570, proj_loss: -0.387972, loss_mean_cls: 1.751406, grad_norm: 9.064518 +Steps: 0%| | 1988/1000000 [08:10<67:44:41, 4.09it/s, grad_norm=9.06, loss_final=2.32, loss_mean=0.96, loss_mean_cls=1.75, proj_loss=-0.388][2026-03-23 13:44:30] Step: 1988, Training Logs: loss_final: 2.601706, loss_mean: 0.972261, proj_loss: -0.381841, loss_mean_cls: 2.011286, grad_norm: 5.407274 +Steps: 0%| | 1989/1000000 [08:10<67:38:53, 4.10it/s, grad_norm=5.41, loss_final=2.6, loss_mean=0.972, loss_mean_cls=2.01, proj_loss=-0.382][2026-03-23 13:44:30] Step: 1989, Training Logs: loss_final: 2.821360, loss_mean: 0.965745, proj_loss: -0.376236, loss_mean_cls: 2.231851, grad_norm: 4.272678 +Steps: 0%| | 1990/1000000 [08:11<67:35:33, 4.10it/s, grad_norm=4.27, loss_final=2.82, loss_mean=0.966, loss_mean_cls=2.23, proj_loss=-0.376][2026-03-23 13:44:31] Step: 1990, Training Logs: loss_final: 2.974957, loss_mean: 0.956166, proj_loss: -0.377790, loss_mean_cls: 2.396582, grad_norm: 4.031069 +Steps: 0%| | 1991/1000000 [08:11<67:33:23, 4.10it/s, grad_norm=4.03, loss_final=2.97, loss_mean=0.956, loss_mean_cls=2.4, proj_loss=-0.378][2026-03-23 13:44:31] Step: 1991, Training Logs: loss_final: 2.944710, loss_mean: 0.956934, proj_loss: -0.377133, loss_mean_cls: 2.364910, grad_norm: 3.414618 +Steps: 0%| | 1992/1000000 [08:11<67:31:42, 4.11it/s, grad_norm=3.41, loss_final=2.94, loss_mean=0.957, loss_mean_cls=2.36, proj_loss=-0.377][2026-03-23 13:44:31] Step: 1992, Training Logs: loss_final: 2.683626, loss_mean: 0.958310, proj_loss: -0.381761, loss_mean_cls: 2.107077, grad_norm: 7.819563 +Steps: 0%| | 1993/1000000 [08:11<67:31:39, 4.11it/s, grad_norm=7.82, loss_final=2.68, loss_mean=0.958, loss_mean_cls=2.11, proj_loss=-0.382][2026-03-23 13:44:31] Step: 1993, Training Logs: loss_final: 3.259008, loss_mean: 0.919233, proj_loss: -0.377565, loss_mean_cls: 2.717340, grad_norm: 5.633096 +Steps: 0%| | 1994/1000000 [08:12<67:31:44, 4.11it/s, grad_norm=5.63, loss_final=3.26, loss_mean=0.919, loss_mean_cls=2.72, proj_loss=-0.378][2026-03-23 13:44:32] Step: 1994, Training Logs: loss_final: 3.214892, loss_mean: 0.939386, proj_loss: -0.375028, loss_mean_cls: 2.650535, grad_norm: 10.582218 +Steps: 0%| | 1995/1000000 [08:12<67:32:51, 4.10it/s, grad_norm=10.6, loss_final=3.21, loss_mean=0.939, loss_mean_cls=2.65, proj_loss=-0.375][2026-03-23 13:44:32] Step: 1995, Training Logs: loss_final: 3.013087, loss_mean: 0.957076, proj_loss: -0.378244, loss_mean_cls: 2.434255, grad_norm: 3.191751 +Steps: 0%| | 1996/1000000 [08:12<67:29:56, 4.11it/s, grad_norm=3.19, loss_final=3.01, loss_mean=0.957, loss_mean_cls=2.43, proj_loss=-0.378][2026-03-23 13:44:32] Step: 1996, Training Logs: loss_final: 2.595000, loss_mean: 0.958102, proj_loss: -0.378852, loss_mean_cls: 2.015751, grad_norm: 8.284100 +Steps: 0%| | 1997/1000000 [08:12<67:28:40, 4.11it/s, grad_norm=8.28, loss_final=2.6, loss_mean=0.958, loss_mean_cls=2.02, proj_loss=-0.379][2026-03-23 13:44:32] Step: 1997, Training Logs: loss_final: 2.831496, loss_mean: 0.947069, proj_loss: -0.380360, loss_mean_cls: 2.264787, grad_norm: 16.831379 +Steps: 0%| | 1998/1000000 [08:13<67:28:22, 4.11it/s, grad_norm=16.8, loss_final=2.83, loss_mean=0.947, loss_mean_cls=2.26, 
proj_loss=-0.38][2026-03-23 13:44:33] Step: 1998, Training Logs: loss_final: 3.084805, loss_mean: 0.935949, proj_loss: -0.377393, loss_mean_cls: 2.526249, grad_norm: 6.329860 +Steps: 0%| | 1999/1000000 [08:13<67:28:36, 4.11it/s, grad_norm=6.33, loss_final=3.08, loss_mean=0.936, loss_mean_cls=2.53, proj_loss=-0.377][2026-03-23 13:44:33] Step: 1999, Training Logs: loss_final: 2.960995, loss_mean: 0.971419, proj_loss: -0.378995, loss_mean_cls: 2.368571, grad_norm: 7.905913 +Steps: 0%| | 2000/1000000 [08:13<67:31:25, 4.11it/s, grad_norm=7.91, loss_final=2.96, loss_mean=0.971, loss_mean_cls=2.37, proj_loss=-0.379][2026-03-23 13:44:33] Generating EMA samples (ODE Euler, no diffusion noise; t≈0.5 → t=0)... +[2026-03-23 13:44:35] Step: 2000, Training Logs: loss_final: 3.369677, loss_mean: 0.930906, proj_loss: -0.371941, loss_mean_cls: 2.810712, grad_norm: 10.778982 +Steps: 0%| | 2001/1000000 [08:16<275:21:18, 1.01it/s, grad_norm=10.8, loss_final=3.37, loss_mean=0.931, loss_mean_cls=2.81, proj_loss=-0.372][2026-03-23 13:44:36] Step: 2001, Training Logs: loss_final: 3.177741, loss_mean: 0.934056, proj_loss: -0.377223, loss_mean_cls: 2.620908, grad_norm: 9.323277 +Steps: 0%| | 2002/1000000 [08:16<213:00:44, 1.30it/s, grad_norm=9.32, loss_final=3.18, loss_mean=0.934, loss_mean_cls=2.62, proj_loss=-0.377][2026-03-23 13:44:36] Step: 2002, Training Logs: loss_final: 3.035652, loss_mean: 0.939466, proj_loss: -0.371235, loss_mean_cls: 2.467420, grad_norm: 10.581064 +Steps: 0%| | 2003/1000000 [08:16<169:19:56, 1.64it/s, grad_norm=10.6, loss_final=3.04, loss_mean=0.939, loss_mean_cls=2.47, proj_loss=-0.371][2026-03-23 13:44:36] Step: 2003, Training Logs: loss_final: 3.135123, loss_mean: 0.952626, proj_loss: -0.370646, loss_mean_cls: 2.553143, grad_norm: 8.549973 +Steps: 0%| | 2004/1000000 [08:17<138:46:38, 2.00it/s, grad_norm=8.55, loss_final=3.14, loss_mean=0.953, loss_mean_cls=2.55, proj_loss=-0.371][2026-03-23 13:44:37] Step: 2004, Training Logs: loss_final: 2.910171, loss_mean: 0.931921, proj_loss: -0.373570, loss_mean_cls: 2.351820, grad_norm: 10.444967 +Steps: 0%| | 2005/1000000 [08:17<117:22:26, 2.36it/s, grad_norm=10.4, loss_final=2.91, loss_mean=0.932, loss_mean_cls=2.35, proj_loss=-0.374][2026-03-23 13:44:37] Step: 2005, Training Logs: loss_final: 2.970087, loss_mean: 0.951065, proj_loss: -0.377039, loss_mean_cls: 2.396061, grad_norm: 14.688217 +Steps: 0%| | 2006/1000000 [08:17<102:23:57, 2.71it/s, grad_norm=14.7, loss_final=2.97, loss_mean=0.951, loss_mean_cls=2.4, proj_loss=-0.377][2026-03-23 13:44:37] Step: 2006, Training Logs: loss_final: 2.936601, loss_mean: 0.956078, proj_loss: -0.381205, loss_mean_cls: 2.361728, grad_norm: 10.513307 +Steps: 0%| | 2007/1000000 [08:17<91:59:03, 3.01it/s, grad_norm=10.5, loss_final=2.94, loss_mean=0.956, loss_mean_cls=2.36, proj_loss=-0.381] [2026-03-23 13:44:37] Step: 2007, Training Logs: loss_final: 2.783104, loss_mean: 0.939333, proj_loss: -0.378260, loss_mean_cls: 2.222032, grad_norm: 6.426382 +Steps: 0%| | 2008/1000000 [08:18<85:55:36, 3.23it/s, grad_norm=6.43, loss_final=2.78, loss_mean=0.939, loss_mean_cls=2.22, proj_loss=-0.378][2026-03-23 13:44:38] Step: 2008, Training Logs: loss_final: 2.820397, loss_mean: 0.946173, proj_loss: -0.383240, loss_mean_cls: 2.257463, grad_norm: 5.063628 +Steps: 0%| | 2009/1000000 [08:18<80:24:34, 3.45it/s, grad_norm=5.06, loss_final=2.82, loss_mean=0.946, loss_mean_cls=2.26, proj_loss=-0.383][2026-03-23 13:44:38] Step: 2009, Training Logs: loss_final: 2.948508, loss_mean: 0.968888, proj_loss: -0.376912, loss_mean_cls: 
2.356532, grad_norm: 10.319561 +Steps: 0%| | 2010/1000000 [08:18<76:39:06, 3.62it/s, grad_norm=10.3, loss_final=2.95, loss_mean=0.969, loss_mean_cls=2.36, proj_loss=-0.377][2026-03-23 13:44:38] Step: 2010, Training Logs: loss_final: 2.942197, loss_mean: 0.952951, proj_loss: -0.380021, loss_mean_cls: 2.369267, grad_norm: 14.655229 +Steps: 0%| | 2011/1000000 [08:18<73:53:17, 3.75it/s, grad_norm=14.7, loss_final=2.94, loss_mean=0.953, loss_mean_cls=2.37, proj_loss=-0.38][2026-03-23 13:44:38] Step: 2011, Training Logs: loss_final: 2.777302, loss_mean: 0.945682, proj_loss: -0.379134, loss_mean_cls: 2.210754, grad_norm: 10.097304 +Steps: 0%| | 2012/1000000 [08:19<71:57:26, 3.85it/s, grad_norm=10.1, loss_final=2.78, loss_mean=0.946, loss_mean_cls=2.21, proj_loss=-0.379][2026-03-23 13:44:39] Step: 2012, Training Logs: loss_final: 3.673387, loss_mean: 0.933265, proj_loss: -0.363161, loss_mean_cls: 3.103283, grad_norm: 14.002968 +Steps: 0%| | 2013/1000000 [08:19<70:39:00, 3.92it/s, grad_norm=14, loss_final=3.67, loss_mean=0.933, loss_mean_cls=3.1, proj_loss=-0.363][2026-03-23 13:44:39] Step: 2013, Training Logs: loss_final: 3.083276, loss_mean: 0.930126, proj_loss: -0.373496, loss_mean_cls: 2.526646, grad_norm: 6.449616 +Steps: 0%| | 2014/1000000 [08:19<69:43:25, 3.98it/s, grad_norm=6.45, loss_final=3.08, loss_mean=0.93, loss_mean_cls=2.53, proj_loss=-0.373][2026-03-23 13:44:39] Step: 2014, Training Logs: loss_final: 3.043405, loss_mean: 0.932107, proj_loss: -0.376182, loss_mean_cls: 2.487479, grad_norm: 2.445398 +Steps: 0%| | 2015/1000000 [08:19<69:03:16, 4.01it/s, grad_norm=2.45, loss_final=3.04, loss_mean=0.932, loss_mean_cls=2.49, proj_loss=-0.376][2026-03-23 13:44:39] Step: 2015, Training Logs: loss_final: 2.665535, loss_mean: 0.959329, proj_loss: -0.385200, loss_mean_cls: 2.091406, grad_norm: 8.311483 +Steps: 0%| | 2016/1000000 [08:20<68:35:32, 4.04it/s, grad_norm=8.31, loss_final=2.67, loss_mean=0.959, loss_mean_cls=2.09, proj_loss=-0.385][2026-03-23 13:44:39] Step: 2016, Training Logs: loss_final: 2.911154, loss_mean: 0.956232, proj_loss: -0.378775, loss_mean_cls: 2.333697, grad_norm: 9.198733 +Steps: 0%| | 2017/1000000 [08:20<68:14:40, 4.06it/s, grad_norm=9.2, loss_final=2.91, loss_mean=0.956, loss_mean_cls=2.33, proj_loss=-0.379][2026-03-23 13:44:40] Step: 2017, Training Logs: loss_final: 2.748199, loss_mean: 0.970612, proj_loss: -0.375501, loss_mean_cls: 2.153088, grad_norm: 10.325195 +Steps: 0%| | 2018/1000000 [08:20<68:01:44, 4.07it/s, grad_norm=10.3, loss_final=2.75, loss_mean=0.971, loss_mean_cls=2.15, proj_loss=-0.376][2026-03-23 13:44:40] Step: 2018, Training Logs: loss_final: 2.715458, loss_mean: 0.968708, proj_loss: -0.374899, loss_mean_cls: 2.121650, grad_norm: 7.342640 +Steps: 0%| | 2019/1000000 [08:20<67:50:43, 4.09it/s, grad_norm=7.34, loss_final=2.72, loss_mean=0.969, loss_mean_cls=2.12, proj_loss=-0.375][2026-03-23 13:44:40] Step: 2019, Training Logs: loss_final: 3.044788, loss_mean: 0.936045, proj_loss: -0.376646, loss_mean_cls: 2.485389, grad_norm: 4.489057 +Steps: 0%| | 2020/1000000 [08:21<67:43:34, 4.09it/s, grad_norm=4.49, loss_final=3.04, loss_mean=0.936, loss_mean_cls=2.49, proj_loss=-0.377][2026-03-23 13:44:40] Step: 2020, Training Logs: loss_final: 3.242301, loss_mean: 0.972047, proj_loss: -0.374627, loss_mean_cls: 2.644881, grad_norm: 20.310247 +Steps: 0%| | 2021/1000000 [08:21<67:39:03, 4.10it/s, grad_norm=20.3, loss_final=3.24, loss_mean=0.972, loss_mean_cls=2.64, proj_loss=-0.375][2026-03-23 13:44:41] Step: 2021, Training Logs: loss_final: 2.873556, loss_mean: 
0.962236, proj_loss: -0.383665, loss_mean_cls: 2.294984, grad_norm: 13.472033 +Steps: 0%| | 2022/1000000 [08:21<67:37:47, 4.10it/s, grad_norm=13.5, loss_final=2.87, loss_mean=0.962, loss_mean_cls=2.29, proj_loss=-0.384][2026-03-23 13:44:41] Step: 2022, Training Logs: loss_final: 3.154023, loss_mean: 0.958545, proj_loss: -0.373585, loss_mean_cls: 2.569062, grad_norm: 10.361230 +Steps: 0%| | 2023/1000000 [08:21<67:34:23, 4.10it/s, grad_norm=10.4, loss_final=3.15, loss_mean=0.959, loss_mean_cls=2.57, proj_loss=-0.374][2026-03-23 13:44:41] Step: 2023, Training Logs: loss_final: 3.035203, loss_mean: 0.949434, proj_loss: -0.370116, loss_mean_cls: 2.455884, grad_norm: 3.837206 +Steps: 0%| | 2024/1000000 [08:21<67:36:45, 4.10it/s, grad_norm=3.84, loss_final=3.04, loss_mean=0.949, loss_mean_cls=2.46, proj_loss=-0.37][2026-03-23 13:44:41] Step: 2024, Training Logs: loss_final: 3.040429, loss_mean: 0.935607, proj_loss: -0.379633, loss_mean_cls: 2.484455, grad_norm: 15.680818 +Steps: 0%| | 2025/1000000 [08:22<67:33:45, 4.10it/s, grad_norm=15.7, loss_final=3.04, loss_mean=0.936, loss_mean_cls=2.48, proj_loss=-0.38][2026-03-23 13:44:42] Step: 2025, Training Logs: loss_final: 3.356676, loss_mean: 0.941536, proj_loss: -0.371246, loss_mean_cls: 2.786386, grad_norm: 9.867002 +Steps: 0%| | 2026/1000000 [08:22<67:33:19, 4.10it/s, grad_norm=9.87, loss_final=3.36, loss_mean=0.942, loss_mean_cls=2.79, proj_loss=-0.371][2026-03-23 13:44:42] Step: 2026, Training Logs: loss_final: 2.551878, loss_mean: 0.982015, proj_loss: -0.379840, loss_mean_cls: 1.949704, grad_norm: 5.252882 +Steps: 0%| | 2027/1000000 [08:22<67:32:47, 4.10it/s, grad_norm=5.25, loss_final=2.55, loss_mean=0.982, loss_mean_cls=1.95, proj_loss=-0.38][2026-03-23 13:44:42] Step: 2027, Training Logs: loss_final: 2.887745, loss_mean: 0.951804, proj_loss: -0.381550, loss_mean_cls: 2.317492, grad_norm: 14.264357 +Steps: 0%| | 2028/1000000 [08:22<67:31:27, 4.11it/s, grad_norm=14.3, loss_final=2.89, loss_mean=0.952, loss_mean_cls=2.32, proj_loss=-0.382][2026-03-23 13:44:42] Step: 2028, Training Logs: loss_final: 2.924193, loss_mean: 0.950779, proj_loss: -0.379823, loss_mean_cls: 2.353238, grad_norm: 10.101985 +Steps: 0%| | 2029/1000000 [08:23<67:30:40, 4.11it/s, grad_norm=10.1, loss_final=2.92, loss_mean=0.951, loss_mean_cls=2.35, proj_loss=-0.38][2026-03-23 13:44:43] Step: 2029, Training Logs: loss_final: 3.292869, loss_mean: 0.951841, proj_loss: -0.369146, loss_mean_cls: 2.710174, grad_norm: 6.895480 +Steps: 0%| | 2030/1000000 [08:23<67:30:07, 4.11it/s, grad_norm=6.9, loss_final=3.29, loss_mean=0.952, loss_mean_cls=2.71, proj_loss=-0.369][2026-03-23 13:44:43] Step: 2030, Training Logs: loss_final: 3.175050, loss_mean: 0.936683, proj_loss: -0.377830, loss_mean_cls: 2.616198, grad_norm: 9.845366 +Steps: 0%| | 2031/1000000 [08:23<69:24:33, 3.99it/s, grad_norm=9.85, loss_final=3.18, loss_mean=0.937, loss_mean_cls=2.62, proj_loss=-0.378][2026-03-23 13:44:43] Step: 2031, Training Logs: loss_final: 3.213897, loss_mean: 0.949095, proj_loss: -0.372172, loss_mean_cls: 2.636973, grad_norm: 11.418241 +Steps: 0%| | 2032/1000000 [08:23<68:53:32, 4.02it/s, grad_norm=11.4, loss_final=3.21, loss_mean=0.949, loss_mean_cls=2.64, proj_loss=-0.372][2026-03-23 13:44:43] Step: 2032, Training Logs: loss_final: 3.004393, loss_mean: 0.955842, proj_loss: -0.375488, loss_mean_cls: 2.424039, grad_norm: 4.283676 +Steps: 0%| | 2033/1000000 [08:24<68:29:30, 4.05it/s, grad_norm=4.28, loss_final=3, loss_mean=0.956, loss_mean_cls=2.42, proj_loss=-0.375][2026-03-23 13:44:44] Step: 2033, 
Training Logs: loss_final: 2.906055, loss_mean: 0.948495, proj_loss: -0.386855, loss_mean_cls: 2.344415, grad_norm: 15.904605 +Steps: 0%| | 2034/1000000 [08:24<68:14:04, 4.06it/s, grad_norm=15.9, loss_final=2.91, loss_mean=0.948, loss_mean_cls=2.34, proj_loss=-0.387][2026-03-23 13:44:44] Step: 2034, Training Logs: loss_final: 3.155613, loss_mean: 0.939734, proj_loss: -0.373688, loss_mean_cls: 2.589567, grad_norm: 12.828749 +Steps: 0%| | 2035/1000000 [08:24<67:58:53, 4.08it/s, grad_norm=12.8, loss_final=3.16, loss_mean=0.94, loss_mean_cls=2.59, proj_loss=-0.374][2026-03-23 13:44:44] Step: 2035, Training Logs: loss_final: 2.628482, loss_mean: 0.956227, proj_loss: -0.384679, loss_mean_cls: 2.056934, grad_norm: 4.921745 +Steps: 0%| | 2036/1000000 [08:24<67:49:33, 4.09it/s, grad_norm=4.92, loss_final=2.63, loss_mean=0.956, loss_mean_cls=2.06, proj_loss=-0.385][2026-03-23 13:44:44] Step: 2036, Training Logs: loss_final: 3.207503, loss_mean: 0.926281, proj_loss: -0.379338, loss_mean_cls: 2.660560, grad_norm: 8.468684 +Steps: 0%| | 2037/1000000 [08:25<67:43:20, 4.09it/s, grad_norm=8.47, loss_final=3.21, loss_mean=0.926, loss_mean_cls=2.66, proj_loss=-0.379][2026-03-23 13:44:45] Step: 2037, Training Logs: loss_final: 2.682198, loss_mean: 0.983088, proj_loss: -0.386807, loss_mean_cls: 2.085917, grad_norm: 7.741522 +Steps: 0%| | 2038/1000000 [08:25<67:38:42, 4.10it/s, grad_norm=7.74, loss_final=2.68, loss_mean=0.983, loss_mean_cls=2.09, proj_loss=-0.387][2026-03-23 13:44:45] Step: 2038, Training Logs: loss_final: 2.874913, loss_mean: 0.970949, proj_loss: -0.382624, loss_mean_cls: 2.286588, grad_norm: 7.043294 +Steps: 0%| | 2039/1000000 [08:25<67:36:23, 4.10it/s, grad_norm=7.04, loss_final=2.87, loss_mean=0.971, loss_mean_cls=2.29, proj_loss=-0.383][2026-03-23 13:44:45] Step: 2039, Training Logs: loss_final: 2.554969, loss_mean: 0.950311, proj_loss: -0.384638, loss_mean_cls: 1.989295, grad_norm: 5.549814 +Steps: 0%| | 2040/1000000 [08:25<67:37:28, 4.10it/s, grad_norm=5.55, loss_final=2.55, loss_mean=0.95, loss_mean_cls=1.99, proj_loss=-0.385][2026-03-23 13:44:45] Step: 2040, Training Logs: loss_final: 3.026310, loss_mean: 0.967284, proj_loss: -0.378916, loss_mean_cls: 2.437943, grad_norm: 8.890443 +Steps: 0%| | 2041/1000000 [08:26<67:35:14, 4.10it/s, grad_norm=8.89, loss_final=3.03, loss_mean=0.967, loss_mean_cls=2.44, proj_loss=-0.379][2026-03-23 13:44:46] Step: 2041, Training Logs: loss_final: 2.754020, loss_mean: 0.963595, proj_loss: -0.381636, loss_mean_cls: 2.172061, grad_norm: 4.479521 +Steps: 0%| | 2042/1000000 [08:26<67:33:10, 4.10it/s, grad_norm=4.48, loss_final=2.75, loss_mean=0.964, loss_mean_cls=2.17, proj_loss=-0.382][2026-03-23 13:44:46] Step: 2042, Training Logs: loss_final: 2.921381, loss_mean: 0.933533, proj_loss: -0.383725, loss_mean_cls: 2.371572, grad_norm: 6.277138 +Steps: 0%| | 2043/1000000 [08:26<67:32:16, 4.10it/s, grad_norm=6.28, loss_final=2.92, loss_mean=0.934, loss_mean_cls=2.37, proj_loss=-0.384][2026-03-23 13:44:46] Step: 2043, Training Logs: loss_final: 2.700030, loss_mean: 0.977886, proj_loss: -0.387735, loss_mean_cls: 2.109879, grad_norm: 10.055480 +Steps: 0%| | 2044/1000000 [08:26<67:31:24, 4.11it/s, grad_norm=10.1, loss_final=2.7, loss_mean=0.978, loss_mean_cls=2.11, proj_loss=-0.388][2026-03-23 13:44:46] Step: 2044, Training Logs: loss_final: 3.529845, loss_mean: 0.950746, proj_loss: -0.377356, loss_mean_cls: 2.956456, grad_norm: 16.729757 +Steps: 0%| | 2045/1000000 [08:27<67:31:28, 4.11it/s, grad_norm=16.7, loss_final=3.53, loss_mean=0.951, loss_mean_cls=2.96, 
proj_loss=-0.377][2026-03-23 13:44:47] Step: 2045, Training Logs: loss_final: 2.847298, loss_mean: 0.988111, proj_loss: -0.384321, loss_mean_cls: 2.243508, grad_norm: 6.713747 +Steps: 0%| | 2046/1000000 [08:27<67:34:13, 4.10it/s, grad_norm=6.71, loss_final=2.85, loss_mean=0.988, loss_mean_cls=2.24, proj_loss=-0.384][2026-03-23 13:44:47] Step: 2046, Training Logs: loss_final: 3.104334, loss_mean: 0.943858, proj_loss: -0.380997, loss_mean_cls: 2.541473, grad_norm: 6.485529 +Steps: 0%| | 2047/1000000 [08:27<67:32:38, 4.10it/s, grad_norm=6.49, loss_final=3.1, loss_mean=0.944, loss_mean_cls=2.54, proj_loss=-0.381][2026-03-23 13:44:47] Step: 2047, Training Logs: loss_final: 3.067552, loss_mean: 0.950416, proj_loss: -0.377043, loss_mean_cls: 2.494179, grad_norm: 6.456046 +Steps: 0%| | 2048/1000000 [08:27<67:32:14, 4.10it/s, grad_norm=6.46, loss_final=3.07, loss_mean=0.95, loss_mean_cls=2.49, proj_loss=-0.377][2026-03-23 13:44:47] Step: 2048, Training Logs: loss_final: 2.913444, loss_mean: 0.952875, proj_loss: -0.377141, loss_mean_cls: 2.337710, grad_norm: 9.445815 +Steps: 0%| | 2049/1000000 [08:28<67:32:55, 4.10it/s, grad_norm=9.45, loss_final=2.91, loss_mean=0.953, loss_mean_cls=2.34, proj_loss=-0.377][2026-03-23 13:44:48] Step: 2049, Training Logs: loss_final: 3.077498, loss_mean: 0.961141, proj_loss: -0.379550, loss_mean_cls: 2.495907, grad_norm: 11.477693 +Steps: 0%| | 2050/1000000 [08:28<67:31:15, 4.11it/s, grad_norm=11.5, loss_final=3.08, loss_mean=0.961, loss_mean_cls=2.5, proj_loss=-0.38][2026-03-23 13:44:48] Step: 2050, Training Logs: loss_final: 3.259970, loss_mean: 0.951972, proj_loss: -0.370535, loss_mean_cls: 2.678532, grad_norm: 9.642062 +Steps: 0%| | 2051/1000000 [08:28<67:29:09, 4.11it/s, grad_norm=9.64, loss_final=3.26, loss_mean=0.952, loss_mean_cls=2.68, proj_loss=-0.371][2026-03-23 13:44:48] Step: 2051, Training Logs: loss_final: 3.693113, loss_mean: 0.941726, proj_loss: -0.367750, loss_mean_cls: 3.119137, grad_norm: 4.648488 +Steps: 0%| | 2052/1000000 [08:28<67:28:49, 4.11it/s, grad_norm=4.65, loss_final=3.69, loss_mean=0.942, loss_mean_cls=3.12, proj_loss=-0.368][2026-03-23 13:44:48] Step: 2052, Training Logs: loss_final: 2.732100, loss_mean: 0.958533, proj_loss: -0.384645, loss_mean_cls: 2.158211, grad_norm: 8.342499 +Steps: 0%| | 2053/1000000 [08:29<67:27:40, 4.11it/s, grad_norm=8.34, loss_final=2.73, loss_mean=0.959, loss_mean_cls=2.16, proj_loss=-0.385][2026-03-23 13:44:49] Step: 2053, Training Logs: loss_final: 3.496349, loss_mean: 0.952858, proj_loss: -0.369327, loss_mean_cls: 2.912818, grad_norm: 8.213030 +Steps: 0%| | 2054/1000000 [08:29<67:27:59, 4.11it/s, grad_norm=8.21, loss_final=3.5, loss_mean=0.953, loss_mean_cls=2.91, proj_loss=-0.369][2026-03-23 13:44:49] Step: 2054, Training Logs: loss_final: 2.851483, loss_mean: 0.951470, proj_loss: -0.379897, loss_mean_cls: 2.279910, grad_norm: 5.367812 +Steps: 0%| | 2055/1000000 [08:29<67:27:59, 4.11it/s, grad_norm=5.37, loss_final=2.85, loss_mean=0.951, loss_mean_cls=2.28, proj_loss=-0.38][2026-03-23 13:44:49] Step: 2055, Training Logs: loss_final: 3.072328, loss_mean: 0.943106, proj_loss: -0.381066, loss_mean_cls: 2.510287, grad_norm: 17.673328 +Steps: 0%| | 2056/1000000 [08:29<67:27:39, 4.11it/s, grad_norm=17.7, loss_final=3.07, loss_mean=0.943, loss_mean_cls=2.51, proj_loss=-0.381][2026-03-23 13:44:49] Step: 2056, Training Logs: loss_final: 3.170234, loss_mean: 0.948950, proj_loss: -0.378832, loss_mean_cls: 2.600115, grad_norm: 9.262544 +Steps: 0%| | 2057/1000000 [08:30<67:28:36, 4.11it/s, grad_norm=9.26, 
loss_final=3.17, loss_mean=0.949, loss_mean_cls=2.6, proj_loss=-0.379][2026-03-23 13:44:49] Step: 2057, Training Logs: loss_final: 3.293099, loss_mean: 0.958866, proj_loss: -0.377044, loss_mean_cls: 2.711277, grad_norm: 6.544756 +Steps: 0%| | 2058/1000000 [08:30<67:31:55, 4.10it/s, grad_norm=6.54, loss_final=3.29, loss_mean=0.959, loss_mean_cls=2.71, proj_loss=-0.377][2026-03-23 13:44:50] Step: 2058, Training Logs: loss_final: 3.217927, loss_mean: 0.941668, proj_loss: -0.376775, loss_mean_cls: 2.653033, grad_norm: 6.002486 +Steps: 0%| | 2059/1000000 [08:30<67:34:04, 4.10it/s, grad_norm=6, loss_final=3.22, loss_mean=0.942, loss_mean_cls=2.65, proj_loss=-0.377][2026-03-23 13:44:50] Step: 2059, Training Logs: loss_final: 2.932772, loss_mean: 0.950483, proj_loss: -0.384951, loss_mean_cls: 2.367240, grad_norm: 11.946103 +Steps: 0%| | 2060/1000000 [08:30<67:31:39, 4.11it/s, grad_norm=11.9, loss_final=2.93, loss_mean=0.95, loss_mean_cls=2.37, proj_loss=-0.385][2026-03-23 13:44:50] Step: 2060, Training Logs: loss_final: 2.553814, loss_mean: 0.958692, proj_loss: -0.385486, loss_mean_cls: 1.980607, grad_norm: 10.474882 +Steps: 0%| | 2061/1000000 [08:31<67:29:58, 4.11it/s, grad_norm=10.5, loss_final=2.55, loss_mean=0.959, loss_mean_cls=1.98, proj_loss=-0.385][2026-03-23 13:44:50] Step: 2061, Training Logs: loss_final: 2.856270, loss_mean: 0.947549, proj_loss: -0.388692, loss_mean_cls: 2.297413, grad_norm: 6.705814 +Steps: 0%| | 2062/1000000 [08:31<67:31:16, 4.11it/s, grad_norm=6.71, loss_final=2.86, loss_mean=0.948, loss_mean_cls=2.3, proj_loss=-0.389][2026-03-23 13:44:51] Step: 2062, Training Logs: loss_final: 3.530944, loss_mean: 0.917199, proj_loss: -0.375398, loss_mean_cls: 2.989143, grad_norm: 8.797955 +Steps: 0%| | 2063/1000000 [08:31<67:29:37, 4.11it/s, grad_norm=8.8, loss_final=3.53, loss_mean=0.917, loss_mean_cls=2.99, proj_loss=-0.375][2026-03-23 13:44:51] Step: 2063, Training Logs: loss_final: 3.018039, loss_mean: 0.918726, proj_loss: -0.385943, loss_mean_cls: 2.485256, grad_norm: 2.665905 +Steps: 0%| | 2064/1000000 [08:31<67:29:22, 4.11it/s, grad_norm=2.67, loss_final=3.02, loss_mean=0.919, loss_mean_cls=2.49, proj_loss=-0.386][2026-03-23 13:44:51] Step: 2064, Training Logs: loss_final: 2.505616, loss_mean: 0.948008, proj_loss: -0.389197, loss_mean_cls: 1.946805, grad_norm: 3.345519 +Steps: 0%| | 2065/1000000 [08:31<67:35:43, 4.10it/s, grad_norm=3.35, loss_final=2.51, loss_mean=0.948, loss_mean_cls=1.95, proj_loss=-0.389][2026-03-23 13:44:51] Step: 2065, Training Logs: loss_final: 3.456306, loss_mean: 0.914750, proj_loss: -0.374887, loss_mean_cls: 2.916443, grad_norm: 8.048999 +Steps: 0%| | 2066/1000000 [08:32<67:36:15, 4.10it/s, grad_norm=8.05, loss_final=3.46, loss_mean=0.915, loss_mean_cls=2.92, proj_loss=-0.375][2026-03-23 13:44:52] Step: 2066, Training Logs: loss_final: 3.048214, loss_mean: 0.927946, proj_loss: -0.379814, loss_mean_cls: 2.500082, grad_norm: 11.327693 +Steps: 0%| | 2067/1000000 [08:32<67:33:27, 4.10it/s, grad_norm=11.3, loss_final=3.05, loss_mean=0.928, loss_mean_cls=2.5, proj_loss=-0.38][2026-03-23 13:44:52] Step: 2067, Training Logs: loss_final: 2.671920, loss_mean: 0.960970, proj_loss: -0.384367, loss_mean_cls: 2.095317, grad_norm: 5.013912 +Steps: 0%| | 2068/1000000 [08:32<67:31:06, 4.11it/s, grad_norm=5.01, loss_final=2.67, loss_mean=0.961, loss_mean_cls=2.1, proj_loss=-0.384][2026-03-23 13:44:52] Step: 2068, Training Logs: loss_final: 2.690361, loss_mean: 0.950981, proj_loss: -0.387109, loss_mean_cls: 2.126489, grad_norm: 9.383462 +Steps: 0%| | 2069/1000000 
[08:32<67:30:25, 4.11it/s, grad_norm=9.38, loss_final=2.69, loss_mean=0.951, loss_mean_cls=2.13, proj_loss=-0.387][2026-03-23 13:44:52] Step: 2069, Training Logs: loss_final: 3.144457, loss_mean: 0.929752, proj_loss: -0.379985, loss_mean_cls: 2.594690, grad_norm: 5.125848 +Steps: 0%| | 2070/1000000 [08:33<67:32:09, 4.10it/s, grad_norm=5.13, loss_final=3.14, loss_mean=0.93, loss_mean_cls=2.59, proj_loss=-0.38][2026-03-23 13:44:53] Step: 2070, Training Logs: loss_final: 3.128547, loss_mean: 0.927974, proj_loss: -0.381227, loss_mean_cls: 2.581799, grad_norm: 3.654951 +Steps: 0%| | 2071/1000000 [08:33<67:31:13, 4.11it/s, grad_norm=3.65, loss_final=3.13, loss_mean=0.928, loss_mean_cls=2.58, proj_loss=-0.381][2026-03-23 13:44:53] Step: 2071, Training Logs: loss_final: 2.458509, loss_mean: 0.930066, proj_loss: -0.390080, loss_mean_cls: 1.918523, grad_norm: 3.692830 +Steps: 0%| | 2072/1000000 [08:33<85:52:46, 3.23it/s, grad_norm=3.69, loss_final=2.46, loss_mean=0.93, loss_mean_cls=1.92, proj_loss=-0.39][2026-03-23 13:44:53] Step: 2072, Training Logs: loss_final: 2.934242, loss_mean: 0.932566, proj_loss: -0.386964, loss_mean_cls: 2.388640, grad_norm: 7.290570 +Steps: 0%| | 2073/1000000 [08:34<80:22:39, 3.45it/s, grad_norm=7.29, loss_final=2.93, loss_mean=0.933, loss_mean_cls=2.39, proj_loss=-0.387][2026-03-23 13:44:54] Step: 2073, Training Logs: loss_final: 3.252384, loss_mean: 0.915678, proj_loss: -0.378548, loss_mean_cls: 2.715254, grad_norm: 5.717304 +Steps: 0%| | 2074/1000000 [08:34<76:31:38, 3.62it/s, grad_norm=5.72, loss_final=3.25, loss_mean=0.916, loss_mean_cls=2.72, proj_loss=-0.379][2026-03-23 13:44:54] Step: 2074, Training Logs: loss_final: 2.827004, loss_mean: 0.955788, proj_loss: -0.380811, loss_mean_cls: 2.252027, grad_norm: 10.342894 +Steps: 0%| | 2075/1000000 [08:34<73:48:29, 3.76it/s, grad_norm=10.3, loss_final=2.83, loss_mean=0.956, loss_mean_cls=2.25, proj_loss=-0.381][2026-03-23 13:44:54] Step: 2075, Training Logs: loss_final: 2.834375, loss_mean: 0.938078, proj_loss: -0.386052, loss_mean_cls: 2.282349, grad_norm: 7.509810 +Steps: 0%| | 2076/1000000 [08:34<71:54:05, 3.86it/s, grad_norm=7.51, loss_final=2.83, loss_mean=0.938, loss_mean_cls=2.28, proj_loss=-0.386][2026-03-23 13:44:54] Step: 2076, Training Logs: loss_final: 2.991204, loss_mean: 0.931701, proj_loss: -0.378528, loss_mean_cls: 2.438030, grad_norm: 3.027841 +Steps: 0%| | 2077/1000000 [08:35<70:34:08, 3.93it/s, grad_norm=3.03, loss_final=2.99, loss_mean=0.932, loss_mean_cls=2.44, proj_loss=-0.379][2026-03-23 13:44:55] Step: 2077, Training Logs: loss_final: 3.245630, loss_mean: 0.935946, proj_loss: -0.380282, loss_mean_cls: 2.689966, grad_norm: 1.865279 +Steps: 0%| | 2078/1000000 [08:35<69:37:07, 3.98it/s, grad_norm=1.87, loss_final=3.25, loss_mean=0.936, loss_mean_cls=2.69, proj_loss=-0.38][2026-03-23 13:44:55] Step: 2078, Training Logs: loss_final: 2.767993, loss_mean: 0.941769, proj_loss: -0.382691, loss_mean_cls: 2.208916, grad_norm: 2.525072 +Steps: 0%| | 2079/1000000 [08:35<68:57:38, 4.02it/s, grad_norm=2.53, loss_final=2.77, loss_mean=0.942, loss_mean_cls=2.21, proj_loss=-0.383][2026-03-23 13:44:55] Step: 2079, Training Logs: loss_final: 2.924766, loss_mean: 0.912406, proj_loss: -0.386157, loss_mean_cls: 2.398517, grad_norm: 7.694647 +Steps: 0%| | 2080/1000000 [08:35<68:29:39, 4.05it/s, grad_norm=7.69, loss_final=2.92, loss_mean=0.912, loss_mean_cls=2.4, proj_loss=-0.386][2026-03-23 13:44:55] Step: 2080, Training Logs: loss_final: 3.200166, loss_mean: 0.952522, proj_loss: -0.376511, loss_mean_cls: 2.624156, 
grad_norm: 7.379915 +Steps: 0%| | 2081/1000000 [08:36<68:12:23, 4.06it/s, grad_norm=7.38, loss_final=3.2, loss_mean=0.953, loss_mean_cls=2.62, proj_loss=-0.377][2026-03-23 13:44:56] Step: 2081, Training Logs: loss_final: 2.911696, loss_mean: 0.913466, proj_loss: -0.384745, loss_mean_cls: 2.382975, grad_norm: 2.473604 +Steps: 0%| | 2082/1000000 [08:36<67:58:50, 4.08it/s, grad_norm=2.47, loss_final=2.91, loss_mean=0.913, loss_mean_cls=2.38, proj_loss=-0.385][2026-03-23 13:44:56] Step: 2082, Training Logs: loss_final: 2.738533, loss_mean: 0.905926, proj_loss: -0.384473, loss_mean_cls: 2.217080, grad_norm: 7.667495 +Steps: 0%| | 2083/1000000 [08:36<67:48:21, 4.09it/s, grad_norm=7.67, loss_final=2.74, loss_mean=0.906, loss_mean_cls=2.22, proj_loss=-0.384][2026-03-23 13:44:56] Step: 2083, Training Logs: loss_final: 3.133307, loss_mean: 0.943822, proj_loss: -0.379423, loss_mean_cls: 2.568908, grad_norm: 9.605786 +Steps: 0%| | 2084/1000000 [08:36<67:43:04, 4.09it/s, grad_norm=9.61, loss_final=3.13, loss_mean=0.944, loss_mean_cls=2.57, proj_loss=-0.379][2026-03-23 13:44:56] Step: 2084, Training Logs: loss_final: 3.128789, loss_mean: 0.922446, proj_loss: -0.368623, loss_mean_cls: 2.574966, grad_norm: 6.848668 +Steps: 0%| | 2085/1000000 [08:37<67:37:28, 4.10it/s, grad_norm=6.85, loss_final=3.13, loss_mean=0.922, loss_mean_cls=2.57, proj_loss=-0.369][2026-03-23 13:44:57] Step: 2085, Training Logs: loss_final: 3.419853, loss_mean: 0.944426, proj_loss: -0.371134, loss_mean_cls: 2.846561, grad_norm: 23.423965 +Steps: 0%| | 2086/1000000 [08:37<67:33:29, 4.10it/s, grad_norm=23.4, loss_final=3.42, loss_mean=0.944, loss_mean_cls=2.85, proj_loss=-0.371][2026-03-23 13:44:57] Step: 2086, Training Logs: loss_final: 2.645651, loss_mean: 0.963151, proj_loss: -0.380154, loss_mean_cls: 2.062655, grad_norm: 5.931748 +Steps: 0%| | 2087/1000000 [08:37<67:31:23, 4.11it/s, grad_norm=5.93, loss_final=2.65, loss_mean=0.963, loss_mean_cls=2.06, proj_loss=-0.38][2026-03-23 13:44:57] Step: 2087, Training Logs: loss_final: 3.178830, loss_mean: 0.952066, proj_loss: -0.371225, loss_mean_cls: 2.597990, grad_norm: 6.535145 +Steps: 0%| | 2088/1000000 [08:37<67:30:49, 4.11it/s, grad_norm=6.54, loss_final=3.18, loss_mean=0.952, loss_mean_cls=2.6, proj_loss=-0.371][2026-03-23 13:44:57] Step: 2088, Training Logs: loss_final: 3.104470, loss_mean: 0.934680, proj_loss: -0.377777, loss_mean_cls: 2.547567, grad_norm: 11.729357 +Steps: 0%| | 2089/1000000 [08:38<67:30:27, 4.11it/s, grad_norm=11.7, loss_final=3.1, loss_mean=0.935, loss_mean_cls=2.55, proj_loss=-0.378][2026-03-23 13:44:58] Step: 2089, Training Logs: loss_final: 3.064872, loss_mean: 0.933113, proj_loss: -0.379355, loss_mean_cls: 2.511114, grad_norm: 6.483505 +Steps: 0%| | 2090/1000000 [08:38<67:29:02, 4.11it/s, grad_norm=6.48, loss_final=3.06, loss_mean=0.933, loss_mean_cls=2.51, proj_loss=-0.379][2026-03-23 13:44:58] Step: 2090, Training Logs: loss_final: 3.326003, loss_mean: 0.942868, proj_loss: -0.379295, loss_mean_cls: 2.762430, grad_norm: 14.434322 +Steps: 0%| | 2091/1000000 [08:38<67:28:56, 4.11it/s, grad_norm=14.4, loss_final=3.33, loss_mean=0.943, loss_mean_cls=2.76, proj_loss=-0.379][2026-03-23 13:44:58] Step: 2091, Training Logs: loss_final: 2.895799, loss_mean: 0.946924, proj_loss: -0.378068, loss_mean_cls: 2.326943, grad_norm: 3.535871 +Steps: 0%| | 2092/1000000 [08:38<67:29:03, 4.11it/s, grad_norm=3.54, loss_final=2.9, loss_mean=0.947, loss_mean_cls=2.33, proj_loss=-0.378][2026-03-23 13:44:58] Step: 2092, Training Logs: loss_final: 3.293269, loss_mean: 0.943738, 
proj_loss: -0.378569, loss_mean_cls: 2.728099, grad_norm: 24.461351 +Steps: 0%| | 2093/1000000 [08:39<67:29:16, 4.11it/s, grad_norm=24.5, loss_final=3.29, loss_mean=0.944, loss_mean_cls=2.73, proj_loss=-0.379][2026-03-23 13:44:58] Step: 2093, Training Logs: loss_final: 2.919183, loss_mean: 0.970269, proj_loss: -0.384525, loss_mean_cls: 2.333440, grad_norm: 16.664049 +Steps: 0%| | 2094/1000000 [08:39<67:26:59, 4.11it/s, grad_norm=16.7, loss_final=2.92, loss_mean=0.97, loss_mean_cls=2.33, proj_loss=-0.385][2026-03-23 13:44:59] Step: 2094, Training Logs: loss_final: 2.724635, loss_mean: 0.954557, proj_loss: -0.383969, loss_mean_cls: 2.154047, grad_norm: 9.773705 +Steps: 0%| | 2095/1000000 [08:39<67:26:44, 4.11it/s, grad_norm=9.77, loss_final=2.72, loss_mean=0.955, loss_mean_cls=2.15, proj_loss=-0.384][2026-03-23 13:44:59] Step: 2095, Training Logs: loss_final: 3.028411, loss_mean: 0.950166, proj_loss: -0.379393, loss_mean_cls: 2.457637, grad_norm: 10.346543 +Steps: 0%| | 2096/1000000 [08:39<67:27:14, 4.11it/s, grad_norm=10.3, loss_final=3.03, loss_mean=0.95, loss_mean_cls=2.46, proj_loss=-0.379][2026-03-23 13:44:59] Step: 2096, Training Logs: loss_final: 2.675689, loss_mean: 0.961609, proj_loss: -0.389035, loss_mean_cls: 2.103116, grad_norm: 10.151735 +Steps: 0%| | 2097/1000000 [08:39<67:27:14, 4.11it/s, grad_norm=10.2, loss_final=2.68, loss_mean=0.962, loss_mean_cls=2.1, proj_loss=-0.389][2026-03-23 13:44:59] Step: 2097, Training Logs: loss_final: 3.011367, loss_mean: 0.950929, proj_loss: -0.380024, loss_mean_cls: 2.440462, grad_norm: 8.653729 +Steps: 0%| | 2098/1000000 [08:40<67:27:21, 4.11it/s, grad_norm=8.65, loss_final=3.01, loss_mean=0.951, loss_mean_cls=2.44, proj_loss=-0.38][2026-03-23 13:45:00] Step: 2098, Training Logs: loss_final: 3.498262, loss_mean: 0.917502, proj_loss: -0.370163, loss_mean_cls: 2.950923, grad_norm: 13.035095 +Steps: 0%| | 2099/1000000 [08:40<67:27:06, 4.11it/s, grad_norm=13, loss_final=3.5, loss_mean=0.918, loss_mean_cls=2.95, proj_loss=-0.37][2026-03-23 13:45:00] Step: 2099, Training Logs: loss_final: 3.083950, loss_mean: 0.948328, proj_loss: -0.373497, loss_mean_cls: 2.509119, grad_norm: 14.650706 +Steps: 0%| | 2100/1000000 [08:40<67:28:43, 4.11it/s, grad_norm=14.7, loss_final=3.08, loss_mean=0.948, loss_mean_cls=2.51, proj_loss=-0.373][2026-03-23 13:45:00] Step: 2100, Training Logs: loss_final: 2.796350, loss_mean: 0.963821, proj_loss: -0.379343, loss_mean_cls: 2.211872, grad_norm: 5.363592 +Steps: 0%| | 2101/1000000 [08:40<67:26:53, 4.11it/s, grad_norm=5.36, loss_final=2.8, loss_mean=0.964, loss_mean_cls=2.21, proj_loss=-0.379][2026-03-23 13:45:00] Step: 2101, Training Logs: loss_final: 3.177936, loss_mean: 0.924607, proj_loss: -0.377739, loss_mean_cls: 2.631067, grad_norm: 11.544356 +Steps: 0%| | 2102/1000000 [08:41<67:27:16, 4.11it/s, grad_norm=11.5, loss_final=3.18, loss_mean=0.925, loss_mean_cls=2.63, proj_loss=-0.378][2026-03-23 13:45:01] Step: 2102, Training Logs: loss_final: 3.054621, loss_mean: 0.954908, proj_loss: -0.379369, loss_mean_cls: 2.479081, grad_norm: 10.056393 +Steps: 0%| | 2103/1000000 [08:41<67:26:10, 4.11it/s, grad_norm=10.1, loss_final=3.05, loss_mean=0.955, loss_mean_cls=2.48, proj_loss=-0.379][2026-03-23 13:45:01] Step: 2103, Training Logs: loss_final: 3.361100, loss_mean: 0.946542, proj_loss: -0.379058, loss_mean_cls: 2.793617, grad_norm: 7.993638 +Steps: 0%| | 2104/1000000 [08:41<67:25:20, 4.11it/s, grad_norm=7.99, loss_final=3.36, loss_mean=0.947, loss_mean_cls=2.79, proj_loss=-0.379][2026-03-23 13:45:01] Step: 2104, Training 
(Training-log excerpt: per-step entries for steps 2105–2352 of 1,000,000 (~4.1 it/s, estimated ~67 h total on this run); the intervening steps follow the same format and are elided here. Each step reports loss_final, loss_mean, proj_loss, loss_mean_cls, and grad_norm; in these logs, loss_final equals loss_mean + loss_mean_cls + proj_loss to logging precision.)
+[2026-03-23 13:45:01] Step: 2105, Training Logs: loss_final: 2.582946, loss_mean: 0.951890, proj_loss: -0.388419, loss_mean_cls: 2.019475, grad_norm: 8.094941
+Steps:   0%| | 2106/1000000 [08:42<67:28:18, 4.11it/s, grad_norm=8.09, loss_final=2.58, loss_mean=0.952, loss_mean_cls=2.02, proj_loss=-0.388]
+...
+[2026-03-23 13:46:02] Step: 2352, Training Logs: loss_final: 2.486826, loss_mean: 0.960721, proj_loss: -0.389720, loss_mean_cls: 1.915825, grad_norm: 8.427671
[09:42<67:26:27, 4.11it/s, grad_norm=8.43, loss_final=2.49, loss_mean=0.961, loss_mean_cls=1.92, proj_loss=-0.39][2026-03-23 13:46:02] Step: 2353, Training Logs: loss_final: 3.028020, loss_mean: 0.957120, proj_loss: -0.392454, loss_mean_cls: 2.463354, grad_norm: 8.087681 +Steps: 0%| | 2354/1000000 [09:42<67:25:48, 4.11it/s, grad_norm=8.09, loss_final=3.03, loss_mean=0.957, loss_mean_cls=2.46, proj_loss=-0.392][2026-03-23 13:46:02] Step: 2354, Training Logs: loss_final: 2.694744, loss_mean: 0.962750, proj_loss: -0.391123, loss_mean_cls: 2.123117, grad_norm: 3.366001 +Steps: 0%| | 2355/1000000 [09:43<67:26:37, 4.11it/s, grad_norm=3.37, loss_final=2.69, loss_mean=0.963, loss_mean_cls=2.12, proj_loss=-0.391][2026-03-23 13:46:03] Step: 2355, Training Logs: loss_final: 3.053314, loss_mean: 0.944700, proj_loss: -0.390424, loss_mean_cls: 2.499038, grad_norm: 3.909158 +Steps: 0%| | 2356/1000000 [09:43<67:26:45, 4.11it/s, grad_norm=3.91, loss_final=3.05, loss_mean=0.945, loss_mean_cls=2.5, proj_loss=-0.39][2026-03-23 13:46:03] Step: 2356, Training Logs: loss_final: 2.883054, loss_mean: 0.945529, proj_loss: -0.387332, loss_mean_cls: 2.324857, grad_norm: 12.932006 +Steps: 0%| | 2357/1000000 [09:43<67:24:29, 4.11it/s, grad_norm=12.9, loss_final=2.88, loss_mean=0.946, loss_mean_cls=2.32, proj_loss=-0.387][2026-03-23 13:46:03] Step: 2357, Training Logs: loss_final: 3.226906, loss_mean: 0.923617, proj_loss: -0.384032, loss_mean_cls: 2.687321, grad_norm: 3.100008 +Steps: 0%| | 2358/1000000 [09:43<67:28:32, 4.11it/s, grad_norm=3.1, loss_final=3.23, loss_mean=0.924, loss_mean_cls=2.69, proj_loss=-0.384][2026-03-23 13:46:03] Step: 2358, Training Logs: loss_final: 2.627547, loss_mean: 0.938343, proj_loss: -0.397500, loss_mean_cls: 2.086704, grad_norm: 5.994499 +Steps: 0%| | 2359/1000000 [09:44<67:26:25, 4.11it/s, grad_norm=5.99, loss_final=2.63, loss_mean=0.938, loss_mean_cls=2.09, proj_loss=-0.397][2026-03-23 13:46:04] Step: 2359, Training Logs: loss_final: 3.254201, loss_mean: 0.936155, proj_loss: -0.383850, loss_mean_cls: 2.701895, grad_norm: 5.494087 +Steps: 0%| | 2360/1000000 [09:44<67:28:08, 4.11it/s, grad_norm=5.49, loss_final=3.25, loss_mean=0.936, loss_mean_cls=2.7, proj_loss=-0.384][2026-03-23 13:46:04] Step: 2360, Training Logs: loss_final: 2.739505, loss_mean: 0.931943, proj_loss: -0.391513, loss_mean_cls: 2.199075, grad_norm: 5.082531 +Steps: 0%| | 2361/1000000 [09:44<67:28:47, 4.11it/s, grad_norm=5.08, loss_final=2.74, loss_mean=0.932, loss_mean_cls=2.2, proj_loss=-0.392][2026-03-23 13:46:04] Step: 2361, Training Logs: loss_final: 3.283565, loss_mean: 0.932467, proj_loss: -0.388681, loss_mean_cls: 2.739779, grad_norm: 3.115013 +Steps: 0%| | 2362/1000000 [09:44<67:32:28, 4.10it/s, grad_norm=3.12, loss_final=3.28, loss_mean=0.932, loss_mean_cls=2.74, proj_loss=-0.389][2026-03-23 13:46:04] Step: 2362, Training Logs: loss_final: 2.698413, loss_mean: 0.952912, proj_loss: -0.393948, loss_mean_cls: 2.139449, grad_norm: 12.858301 +Steps: 0%| | 2363/1000000 [09:45<67:34:21, 4.10it/s, grad_norm=12.9, loss_final=2.7, loss_mean=0.953, loss_mean_cls=2.14, proj_loss=-0.394][2026-03-23 13:46:05] Step: 2363, Training Logs: loss_final: 2.625941, loss_mean: 0.994654, proj_loss: -0.392600, loss_mean_cls: 2.023887, grad_norm: 5.264392 +Steps: 0%| | 2364/1000000 [09:45<67:33:47, 4.10it/s, grad_norm=5.26, loss_final=2.63, loss_mean=0.995, loss_mean_cls=2.02, proj_loss=-0.393][2026-03-23 13:46:05] Step: 2364, Training Logs: loss_final: 2.689337, loss_mean: 0.958115, proj_loss: -0.391553, loss_mean_cls: 2.122775, 
grad_norm: 10.165495 +Steps: 0%| | 2365/1000000 [09:45<67:31:13, 4.10it/s, grad_norm=10.2, loss_final=2.69, loss_mean=0.958, loss_mean_cls=2.12, proj_loss=-0.392][2026-03-23 13:46:05] Step: 2365, Training Logs: loss_final: 2.913572, loss_mean: 0.942614, proj_loss: -0.391441, loss_mean_cls: 2.362399, grad_norm: 12.994112 +Steps: 0%| | 2366/1000000 [09:45<67:29:26, 4.11it/s, grad_norm=13, loss_final=2.91, loss_mean=0.943, loss_mean_cls=2.36, proj_loss=-0.391][2026-03-23 13:46:05] Step: 2366, Training Logs: loss_final: 3.395002, loss_mean: 0.933028, proj_loss: -0.387890, loss_mean_cls: 2.849863, grad_norm: 4.686817 +Steps: 0%| | 2367/1000000 [09:46<67:29:34, 4.11it/s, grad_norm=4.69, loss_final=3.4, loss_mean=0.933, loss_mean_cls=2.85, proj_loss=-0.388][2026-03-23 13:46:06] Step: 2367, Training Logs: loss_final: 2.743847, loss_mean: 0.954913, proj_loss: -0.394225, loss_mean_cls: 2.183159, grad_norm: 11.442642 +Steps: 0%| | 2368/1000000 [09:46<67:28:04, 4.11it/s, grad_norm=11.4, loss_final=2.74, loss_mean=0.955, loss_mean_cls=2.18, proj_loss=-0.394][2026-03-23 13:46:06] Step: 2368, Training Logs: loss_final: 2.993181, loss_mean: 0.941154, proj_loss: -0.388686, loss_mean_cls: 2.440712, grad_norm: 4.373786 +Steps: 0%| | 2369/1000000 [09:46<67:26:51, 4.11it/s, grad_norm=4.37, loss_final=2.99, loss_mean=0.941, loss_mean_cls=2.44, proj_loss=-0.389][2026-03-23 13:46:06] Step: 2369, Training Logs: loss_final: 3.272585, loss_mean: 0.926883, proj_loss: -0.381807, loss_mean_cls: 2.727509, grad_norm: 19.508472 +Steps: 0%| | 2370/1000000 [09:46<67:26:08, 4.11it/s, grad_norm=19.5, loss_final=3.27, loss_mean=0.927, loss_mean_cls=2.73, proj_loss=-0.382][2026-03-23 13:46:06] Step: 2370, Training Logs: loss_final: 2.995389, loss_mean: 0.934712, proj_loss: -0.389963, loss_mean_cls: 2.450640, grad_norm: 12.311272 +Steps: 0%| | 2371/1000000 [09:47<67:25:43, 4.11it/s, grad_norm=12.3, loss_final=3, loss_mean=0.935, loss_mean_cls=2.45, proj_loss=-0.39][2026-03-23 13:46:07] Step: 2371, Training Logs: loss_final: 3.006476, loss_mean: 0.933759, proj_loss: -0.386128, loss_mean_cls: 2.458845, grad_norm: 7.496337 +Steps: 0%| | 2372/1000000 [09:47<67:26:17, 4.11it/s, grad_norm=7.5, loss_final=3.01, loss_mean=0.934, loss_mean_cls=2.46, proj_loss=-0.386][2026-03-23 13:46:07] Step: 2372, Training Logs: loss_final: 3.280590, loss_mean: 0.918588, proj_loss: -0.387620, loss_mean_cls: 2.749622, grad_norm: 14.825168 +Steps: 0%| | 2373/1000000 [09:47<67:29:57, 4.11it/s, grad_norm=14.8, loss_final=3.28, loss_mean=0.919, loss_mean_cls=2.75, proj_loss=-0.388][2026-03-23 13:46:07] Step: 2373, Training Logs: loss_final: 2.763682, loss_mean: 0.947667, proj_loss: -0.392913, loss_mean_cls: 2.208928, grad_norm: 13.978548 +Steps: 0%| | 2374/1000000 [09:47<67:30:00, 4.11it/s, grad_norm=14, loss_final=2.76, loss_mean=0.948, loss_mean_cls=2.21, proj_loss=-0.393][2026-03-23 13:46:07] Step: 2374, Training Logs: loss_final: 2.609964, loss_mean: 0.957133, proj_loss: -0.392862, loss_mean_cls: 2.045693, grad_norm: 8.783580 +Steps: 0%| | 2375/1000000 [09:48<67:31:43, 4.10it/s, grad_norm=8.78, loss_final=2.61, loss_mean=0.957, loss_mean_cls=2.05, proj_loss=-0.393][2026-03-23 13:46:08] Step: 2375, Training Logs: loss_final: 3.015188, loss_mean: 0.930977, proj_loss: -0.391069, loss_mean_cls: 2.475280, grad_norm: 7.902009 +Steps: 0%| | 2376/1000000 [09:48<67:31:06, 4.10it/s, grad_norm=7.9, loss_final=3.02, loss_mean=0.931, loss_mean_cls=2.48, proj_loss=-0.391][2026-03-23 13:46:08] Step: 2376, Training Logs: loss_final: 2.733045, loss_mean: 0.938049, 
proj_loss: -0.388922, loss_mean_cls: 2.183917, grad_norm: 2.090959 +Steps: 0%| | 2377/1000000 [09:48<67:32:35, 4.10it/s, grad_norm=2.09, loss_final=2.73, loss_mean=0.938, loss_mean_cls=2.18, proj_loss=-0.389][2026-03-23 13:46:08] Step: 2377, Training Logs: loss_final: 2.954828, loss_mean: 0.943561, proj_loss: -0.394421, loss_mean_cls: 2.405688, grad_norm: 11.989889 +Steps: 0%| | 2378/1000000 [09:48<67:34:32, 4.10it/s, grad_norm=12, loss_final=2.95, loss_mean=0.944, loss_mean_cls=2.41, proj_loss=-0.394][2026-03-23 13:46:08] Step: 2378, Training Logs: loss_final: 2.693528, loss_mean: 0.958274, proj_loss: -0.391467, loss_mean_cls: 2.126720, grad_norm: 3.552269 +Steps: 0%| | 2379/1000000 [09:49<67:33:45, 4.10it/s, grad_norm=3.55, loss_final=2.69, loss_mean=0.958, loss_mean_cls=2.13, proj_loss=-0.391][2026-03-23 13:46:08] Step: 2379, Training Logs: loss_final: 3.086662, loss_mean: 0.929437, proj_loss: -0.384750, loss_mean_cls: 2.541975, grad_norm: 13.551343 +Steps: 0%| | 2380/1000000 [09:49<67:42:04, 4.09it/s, grad_norm=13.6, loss_final=3.09, loss_mean=0.929, loss_mean_cls=2.54, proj_loss=-0.385][2026-03-23 13:46:09] Step: 2380, Training Logs: loss_final: 2.464348, loss_mean: 0.972136, proj_loss: -0.395222, loss_mean_cls: 1.887435, grad_norm: 6.398902 +Steps: 0%| | 2381/1000000 [09:49<67:38:04, 4.10it/s, grad_norm=6.4, loss_final=2.46, loss_mean=0.972, loss_mean_cls=1.89, proj_loss=-0.395][2026-03-23 13:46:09] Step: 2381, Training Logs: loss_final: 3.101549, loss_mean: 0.932770, proj_loss: -0.389301, loss_mean_cls: 2.558080, grad_norm: 7.074415 +Steps: 0%| | 2382/1000000 [09:49<67:36:47, 4.10it/s, grad_norm=7.07, loss_final=3.1, loss_mean=0.933, loss_mean_cls=2.56, proj_loss=-0.389][2026-03-23 13:46:09] Step: 2382, Training Logs: loss_final: 3.066158, loss_mean: 0.949685, proj_loss: -0.390513, loss_mean_cls: 2.506986, grad_norm: 5.226468 +Steps: 0%| | 2383/1000000 [09:50<67:34:50, 4.10it/s, grad_norm=5.23, loss_final=3.07, loss_mean=0.95, loss_mean_cls=2.51, proj_loss=-0.391][2026-03-23 13:46:09] Step: 2383, Training Logs: loss_final: 3.012690, loss_mean: 0.916112, proj_loss: -0.390846, loss_mean_cls: 2.487424, grad_norm: 9.104554 +Steps: 0%| | 2384/1000000 [09:50<67:30:52, 4.10it/s, grad_norm=9.1, loss_final=3.01, loss_mean=0.916, loss_mean_cls=2.49, proj_loss=-0.391][2026-03-23 13:46:10] Step: 2384, Training Logs: loss_final: 2.819761, loss_mean: 0.954756, proj_loss: -0.392827, loss_mean_cls: 2.257832, grad_norm: 9.545852 +Steps: 0%| | 2385/1000000 [09:50<67:29:05, 4.11it/s, grad_norm=9.55, loss_final=2.82, loss_mean=0.955, loss_mean_cls=2.26, proj_loss=-0.393][2026-03-23 13:46:10] Step: 2385, Training Logs: loss_final: 3.153715, loss_mean: 0.952837, proj_loss: -0.387380, loss_mean_cls: 2.588258, grad_norm: 4.274674 +Steps: 0%| | 2386/1000000 [09:50<67:27:06, 4.11it/s, grad_norm=4.27, loss_final=3.15, loss_mean=0.953, loss_mean_cls=2.59, proj_loss=-0.387][2026-03-23 13:46:10] Step: 2386, Training Logs: loss_final: 2.627130, loss_mean: 0.945230, proj_loss: -0.396139, loss_mean_cls: 2.078038, grad_norm: 3.264199 +Steps: 0%| | 2387/1000000 [09:50<67:25:10, 4.11it/s, grad_norm=3.26, loss_final=2.63, loss_mean=0.945, loss_mean_cls=2.08, proj_loss=-0.396][2026-03-23 13:46:10] Step: 2387, Training Logs: loss_final: 3.271678, loss_mean: 0.919444, proj_loss: -0.386635, loss_mean_cls: 2.738869, grad_norm: 2.302098 +Steps: 0%| | 2388/1000000 [09:51<67:25:18, 4.11it/s, grad_norm=2.3, loss_final=3.27, loss_mean=0.919, loss_mean_cls=2.74, proj_loss=-0.387][2026-03-23 13:46:11] Step: 2388, Training Logs: 
loss_final: 3.223823, loss_mean: 0.928451, proj_loss: -0.389133, loss_mean_cls: 2.684505, grad_norm: 3.075900 +Steps: 0%| | 2389/1000000 [09:51<67:25:38, 4.11it/s, grad_norm=3.08, loss_final=3.22, loss_mean=0.928, loss_mean_cls=2.68, proj_loss=-0.389][2026-03-23 13:46:11] Step: 2389, Training Logs: loss_final: 2.783573, loss_mean: 0.943013, proj_loss: -0.398527, loss_mean_cls: 2.239086, grad_norm: 11.935467 +Steps: 0%| | 2390/1000000 [09:51<67:25:15, 4.11it/s, grad_norm=11.9, loss_final=2.78, loss_mean=0.943, loss_mean_cls=2.24, proj_loss=-0.399][2026-03-23 13:46:11] Step: 2390, Training Logs: loss_final: 2.640124, loss_mean: 0.944798, proj_loss: -0.393923, loss_mean_cls: 2.089249, grad_norm: 12.495429 +Steps: 0%| | 2391/1000000 [09:51<67:36:10, 4.10it/s, grad_norm=12.5, loss_final=2.64, loss_mean=0.945, loss_mean_cls=2.09, proj_loss=-0.394][2026-03-23 13:46:11] Step: 2391, Training Logs: loss_final: 2.899413, loss_mean: 0.957748, proj_loss: -0.383678, loss_mean_cls: 2.325343, grad_norm: 12.767413 +Steps: 0%| | 2392/1000000 [09:52<67:37:07, 4.10it/s, grad_norm=12.8, loss_final=2.9, loss_mean=0.958, loss_mean_cls=2.33, proj_loss=-0.384][2026-03-23 13:46:12] Step: 2392, Training Logs: loss_final: 2.873934, loss_mean: 0.954819, proj_loss: -0.384760, loss_mean_cls: 2.303874, grad_norm: 2.953129 +Steps: 0%| | 2393/1000000 [09:52<67:34:34, 4.10it/s, grad_norm=2.95, loss_final=2.87, loss_mean=0.955, loss_mean_cls=2.3, proj_loss=-0.385][2026-03-23 13:46:12] Step: 2393, Training Logs: loss_final: 2.731310, loss_mean: 0.946166, proj_loss: -0.392869, loss_mean_cls: 2.178013, grad_norm: 2.097651 +Steps: 0%| | 2394/1000000 [09:52<67:34:50, 4.10it/s, grad_norm=2.1, loss_final=2.73, loss_mean=0.946, loss_mean_cls=2.18, proj_loss=-0.393][2026-03-23 13:46:12] Step: 2394, Training Logs: loss_final: 3.280188, loss_mean: 0.929070, proj_loss: -0.383470, loss_mean_cls: 2.734588, grad_norm: 9.682512 +Steps: 0%| | 2395/1000000 [09:52<67:31:48, 4.10it/s, grad_norm=9.68, loss_final=3.28, loss_mean=0.929, loss_mean_cls=2.73, proj_loss=-0.383][2026-03-23 13:46:12] Step: 2395, Training Logs: loss_final: 2.934466, loss_mean: 0.934193, proj_loss: -0.390681, loss_mean_cls: 2.390954, grad_norm: 4.577152 +Steps: 0%| | 2396/1000000 [09:53<67:36:30, 4.10it/s, grad_norm=4.58, loss_final=2.93, loss_mean=0.934, loss_mean_cls=2.39, proj_loss=-0.391][2026-03-23 13:46:13] Step: 2396, Training Logs: loss_final: 2.957288, loss_mean: 0.922632, proj_loss: -0.387165, loss_mean_cls: 2.421822, grad_norm: 9.739257 +Steps: 0%| | 2397/1000000 [09:53<67:46:13, 4.09it/s, grad_norm=9.74, loss_final=2.96, loss_mean=0.923, loss_mean_cls=2.42, proj_loss=-0.387][2026-03-23 13:46:13] Step: 2397, Training Logs: loss_final: 2.786147, loss_mean: 0.972523, proj_loss: -0.390302, loss_mean_cls: 2.203925, grad_norm: 9.519972 +Steps: 0%| | 2398/1000000 [09:53<67:40:49, 4.09it/s, grad_norm=9.52, loss_final=2.79, loss_mean=0.973, loss_mean_cls=2.2, proj_loss=-0.39][2026-03-23 13:46:13] Step: 2398, Training Logs: loss_final: 2.902637, loss_mean: 0.946018, proj_loss: -0.394123, loss_mean_cls: 2.350743, grad_norm: 13.591548 +Steps: 0%| | 2399/1000000 [09:53<67:36:38, 4.10it/s, grad_norm=13.6, loss_final=2.9, loss_mean=0.946, loss_mean_cls=2.35, proj_loss=-0.394][2026-03-23 13:46:13] Step: 2399, Training Logs: loss_final: 2.932720, loss_mean: 0.956486, proj_loss: -0.390130, loss_mean_cls: 2.366364, grad_norm: 5.673531 +Steps: 0%| | 2400/1000000 [09:54<67:34:36, 4.10it/s, grad_norm=5.67, loss_final=2.93, loss_mean=0.956, loss_mean_cls=2.37, 
proj_loss=-0.39][2026-03-23 13:46:14] Step: 2400, Training Logs: loss_final: 2.836558, loss_mean: 0.938267, proj_loss: -0.392539, loss_mean_cls: 2.290830, grad_norm: 10.181646 +Steps: 0%| | 2401/1000000 [09:54<67:31:24, 4.10it/s, grad_norm=10.2, loss_final=2.84, loss_mean=0.938, loss_mean_cls=2.29, proj_loss=-0.393][2026-03-23 13:46:14] Step: 2401, Training Logs: loss_final: 3.241819, loss_mean: 0.937661, proj_loss: -0.387026, loss_mean_cls: 2.691185, grad_norm: 2.511678 +Steps: 0%| | 2402/1000000 [09:54<67:29:09, 4.11it/s, grad_norm=2.51, loss_final=3.24, loss_mean=0.938, loss_mean_cls=2.69, proj_loss=-0.387][2026-03-23 13:46:14] Step: 2402, Training Logs: loss_final: 3.037200, loss_mean: 0.944042, proj_loss: -0.383234, loss_mean_cls: 2.476392, grad_norm: 10.283236 +Steps: 0%| | 2403/1000000 [09:54<67:28:16, 4.11it/s, grad_norm=10.3, loss_final=3.04, loss_mean=0.944, loss_mean_cls=2.48, proj_loss=-0.383][2026-03-23 13:46:14] Step: 2403, Training Logs: loss_final: 3.170341, loss_mean: 0.914357, proj_loss: -0.385289, loss_mean_cls: 2.641273, grad_norm: 4.174337 +Steps: 0%| | 2404/1000000 [09:55<67:27:16, 4.11it/s, grad_norm=4.17, loss_final=3.17, loss_mean=0.914, loss_mean_cls=2.64, proj_loss=-0.385][2026-03-23 13:46:15] Step: 2404, Training Logs: loss_final: 2.676465, loss_mean: 0.965725, proj_loss: -0.397404, loss_mean_cls: 2.108143, grad_norm: 3.888089 +Steps: 0%| | 2405/1000000 [09:55<67:26:02, 4.11it/s, grad_norm=3.89, loss_final=2.68, loss_mean=0.966, loss_mean_cls=2.11, proj_loss=-0.397][2026-03-23 13:46:15] Step: 2405, Training Logs: loss_final: 2.707456, loss_mean: 0.952685, proj_loss: -0.394272, loss_mean_cls: 2.149043, grad_norm: 20.400623 +Steps: 0%| | 2406/1000000 [09:55<67:25:02, 4.11it/s, grad_norm=20.4, loss_final=2.71, loss_mean=0.953, loss_mean_cls=2.15, proj_loss=-0.394][2026-03-23 13:46:15] Step: 2406, Training Logs: loss_final: 3.088508, loss_mean: 0.964417, proj_loss: -0.392436, loss_mean_cls: 2.516528, grad_norm: 17.711954 +Steps: 0%| | 2407/1000000 [09:55<67:25:12, 4.11it/s, grad_norm=17.7, loss_final=3.09, loss_mean=0.964, loss_mean_cls=2.52, proj_loss=-0.392][2026-03-23 13:46:15] Step: 2407, Training Logs: loss_final: 3.027281, loss_mean: 0.934788, proj_loss: -0.381813, loss_mean_cls: 2.474306, grad_norm: 9.814915 +Steps: 0%| | 2408/1000000 [09:56<67:29:44, 4.11it/s, grad_norm=9.81, loss_final=3.03, loss_mean=0.935, loss_mean_cls=2.47, proj_loss=-0.382][2026-03-23 13:46:16] Step: 2408, Training Logs: loss_final: 2.628174, loss_mean: 0.928551, proj_loss: -0.391564, loss_mean_cls: 2.091187, grad_norm: 24.017393 +Steps: 0%| | 2409/1000000 [09:56<67:27:57, 4.11it/s, grad_norm=24, loss_final=2.63, loss_mean=0.929, loss_mean_cls=2.09, proj_loss=-0.392][2026-03-23 13:46:16] Step: 2409, Training Logs: loss_final: 2.664481, loss_mean: 0.964669, proj_loss: -0.389719, loss_mean_cls: 2.089531, grad_norm: 11.807569 +Steps: 0%| | 2410/1000000 [09:56<67:27:44, 4.11it/s, grad_norm=11.8, loss_final=2.66, loss_mean=0.965, loss_mean_cls=2.09, proj_loss=-0.39][2026-03-23 13:46:16] Step: 2410, Training Logs: loss_final: 3.126819, loss_mean: 0.931760, proj_loss: -0.385451, loss_mean_cls: 2.580509, grad_norm: 14.057022 +Steps: 0%| | 2411/1000000 [09:56<67:26:53, 4.11it/s, grad_norm=14.1, loss_final=3.13, loss_mean=0.932, loss_mean_cls=2.58, proj_loss=-0.385][2026-03-23 13:46:16] Step: 2411, Training Logs: loss_final: 2.744798, loss_mean: 0.940185, proj_loss: -0.393849, loss_mean_cls: 2.198461, grad_norm: 3.556506 +Steps: 0%| | 2412/1000000 [09:57<67:26:24, 4.11it/s, grad_norm=3.56, 
loss_final=2.74, loss_mean=0.94, loss_mean_cls=2.2, proj_loss=-0.394][2026-03-23 13:46:17] Step: 2412, Training Logs: loss_final: 2.909680, loss_mean: 0.962555, proj_loss: -0.390756, loss_mean_cls: 2.337881, grad_norm: 5.717029 +Steps: 0%| | 2413/1000000 [09:57<67:26:18, 4.11it/s, grad_norm=5.72, loss_final=2.91, loss_mean=0.963, loss_mean_cls=2.34, proj_loss=-0.391][2026-03-23 13:46:17] Step: 2413, Training Logs: loss_final: 2.953197, loss_mean: 0.916323, proj_loss: -0.384880, loss_mean_cls: 2.421755, grad_norm: 3.026975 +Steps: 0%| | 2414/1000000 [09:57<67:26:42, 4.11it/s, grad_norm=3.03, loss_final=2.95, loss_mean=0.916, loss_mean_cls=2.42, proj_loss=-0.385][2026-03-23 13:46:17] Step: 2414, Training Logs: loss_final: 2.860687, loss_mean: 0.930782, proj_loss: -0.394946, loss_mean_cls: 2.324851, grad_norm: 7.329420 +Steps: 0%| | 2415/1000000 [09:57<67:28:49, 4.11it/s, grad_norm=7.33, loss_final=2.86, loss_mean=0.931, loss_mean_cls=2.32, proj_loss=-0.395][2026-03-23 13:46:17] Step: 2415, Training Logs: loss_final: 2.886220, loss_mean: 0.948661, proj_loss: -0.389137, loss_mean_cls: 2.326696, grad_norm: 4.231555 +Steps: 0%| | 2416/1000000 [09:58<67:29:36, 4.11it/s, grad_norm=4.23, loss_final=2.89, loss_mean=0.949, loss_mean_cls=2.33, proj_loss=-0.389][2026-03-23 13:46:18] Step: 2416, Training Logs: loss_final: 3.343974, loss_mean: 0.940059, proj_loss: -0.385442, loss_mean_cls: 2.789357, grad_norm: 3.953401 +Steps: 0%| | 2417/1000000 [09:58<67:28:55, 4.11it/s, grad_norm=3.95, loss_final=3.34, loss_mean=0.94, loss_mean_cls=2.79, proj_loss=-0.385][2026-03-23 13:46:18] Step: 2417, Training Logs: loss_final: 2.786407, loss_mean: 0.931286, proj_loss: -0.393887, loss_mean_cls: 2.249008, grad_norm: 2.675928 +Steps: 0%| | 2418/1000000 [09:58<67:29:34, 4.11it/s, grad_norm=2.68, loss_final=2.79, loss_mean=0.931, loss_mean_cls=2.25, proj_loss=-0.394][2026-03-23 13:46:18] Step: 2418, Training Logs: loss_final: 2.800606, loss_mean: 0.925267, proj_loss: -0.390960, loss_mean_cls: 2.266299, grad_norm: 17.632090 +Steps: 0%| | 2419/1000000 [09:58<67:28:45, 4.11it/s, grad_norm=17.6, loss_final=2.8, loss_mean=0.925, loss_mean_cls=2.27, proj_loss=-0.391][2026-03-23 13:46:18] Step: 2419, Training Logs: loss_final: 3.115144, loss_mean: 0.970093, proj_loss: -0.379568, loss_mean_cls: 2.524619, grad_norm: 12.025116 +Steps: 0%| | 2420/1000000 [09:59<67:29:52, 4.11it/s, grad_norm=12, loss_final=3.12, loss_mean=0.97, loss_mean_cls=2.52, proj_loss=-0.38][2026-03-23 13:46:18] Step: 2420, Training Logs: loss_final: 2.889092, loss_mean: 0.975752, proj_loss: -0.384730, loss_mean_cls: 2.298070, grad_norm: 3.930571 +Steps: 0%| | 2421/1000000 [09:59<67:27:18, 4.11it/s, grad_norm=3.93, loss_final=2.89, loss_mean=0.976, loss_mean_cls=2.3, proj_loss=-0.385][2026-03-23 13:46:19] Step: 2421, Training Logs: loss_final: 3.496935, loss_mean: 0.944577, proj_loss: -0.381605, loss_mean_cls: 2.933963, grad_norm: 5.553320 +Steps: 0%| | 2422/1000000 [09:59<67:26:11, 4.11it/s, grad_norm=5.55, loss_final=3.5, loss_mean=0.945, loss_mean_cls=2.93, proj_loss=-0.382][2026-03-23 13:46:19] Step: 2422, Training Logs: loss_final: 2.884058, loss_mean: 0.975108, proj_loss: -0.391574, loss_mean_cls: 2.300524, grad_norm: 7.527597 +Steps: 0%| | 2423/1000000 [09:59<67:25:29, 4.11it/s, grad_norm=7.53, loss_final=2.88, loss_mean=0.975, loss_mean_cls=2.3, proj_loss=-0.392][2026-03-23 13:46:19] Step: 2423, Training Logs: loss_final: 3.464158, loss_mean: 0.931971, proj_loss: -0.378949, loss_mean_cls: 2.911137, grad_norm: 5.033210 +Steps: 0%| | 2424/1000000 
[09:59<67:25:10, 4.11it/s, grad_norm=5.03, loss_final=3.46, loss_mean=0.932, loss_mean_cls=2.91, proj_loss=-0.379][2026-03-23 13:46:19] Step: 2424, Training Logs: loss_final: 2.529990, loss_mean: 0.966219, proj_loss: -0.398305, loss_mean_cls: 1.962075, grad_norm: 14.135222 +Steps: 0%| | 2425/1000000 [10:00<67:25:24, 4.11it/s, grad_norm=14.1, loss_final=2.53, loss_mean=0.966, loss_mean_cls=1.96, proj_loss=-0.398][2026-03-23 13:46:20] Step: 2425, Training Logs: loss_final: 2.947088, loss_mean: 0.961125, proj_loss: -0.391067, loss_mean_cls: 2.377030, grad_norm: 9.370640 +Steps: 0%| | 2426/1000000 [10:00<67:25:00, 4.11it/s, grad_norm=9.37, loss_final=2.95, loss_mean=0.961, loss_mean_cls=2.38, proj_loss=-0.391][2026-03-23 13:46:20] Step: 2426, Training Logs: loss_final: 2.794395, loss_mean: 0.960028, proj_loss: -0.391375, loss_mean_cls: 2.225742, grad_norm: 4.407180 +Steps: 0%| | 2427/1000000 [10:00<67:24:54, 4.11it/s, grad_norm=4.41, loss_final=2.79, loss_mean=0.96, loss_mean_cls=2.23, proj_loss=-0.391][2026-03-23 13:46:20] Step: 2427, Training Logs: loss_final: 3.509578, loss_mean: 0.918769, proj_loss: -0.379948, loss_mean_cls: 2.970757, grad_norm: 8.911112 +Steps: 0%| | 2428/1000000 [10:00<67:27:05, 4.11it/s, grad_norm=8.91, loss_final=3.51, loss_mean=0.919, loss_mean_cls=2.97, proj_loss=-0.38][2026-03-23 13:46:20] Step: 2428, Training Logs: loss_final: 2.722192, loss_mean: 0.953433, proj_loss: -0.396718, loss_mean_cls: 2.165477, grad_norm: 5.254706 +Steps: 0%| | 2429/1000000 [10:01<67:26:41, 4.11it/s, grad_norm=5.25, loss_final=2.72, loss_mean=0.953, loss_mean_cls=2.17, proj_loss=-0.397][2026-03-23 13:46:21] Step: 2429, Training Logs: loss_final: 2.327291, loss_mean: 0.959018, proj_loss: -0.398296, loss_mean_cls: 1.766569, grad_norm: 4.205764 +Steps: 0%| | 2430/1000000 [10:01<67:26:34, 4.11it/s, grad_norm=4.21, loss_final=2.33, loss_mean=0.959, loss_mean_cls=1.77, proj_loss=-0.398][2026-03-23 13:46:21] Step: 2430, Training Logs: loss_final: 2.264001, loss_mean: 0.980725, proj_loss: -0.404366, loss_mean_cls: 1.687643, grad_norm: 2.299567 +Steps: 0%| | 2431/1000000 [10:01<67:24:01, 4.11it/s, grad_norm=2.3, loss_final=2.26, loss_mean=0.981, loss_mean_cls=1.69, proj_loss=-0.404][2026-03-23 13:46:21] Step: 2431, Training Logs: loss_final: 2.859177, loss_mean: 0.945586, proj_loss: -0.397499, loss_mean_cls: 2.311090, grad_norm: 6.913231 +Steps: 0%| | 2432/1000000 [10:01<67:23:26, 4.11it/s, grad_norm=6.91, loss_final=2.86, loss_mean=0.946, loss_mean_cls=2.31, proj_loss=-0.397][2026-03-23 13:46:21] Step: 2432, Training Logs: loss_final: 2.910579, loss_mean: 0.949156, proj_loss: -0.387678, loss_mean_cls: 2.349101, grad_norm: 5.373342 +Steps: 0%| | 2433/1000000 [10:02<67:26:55, 4.11it/s, grad_norm=5.37, loss_final=2.91, loss_mean=0.949, loss_mean_cls=2.35, proj_loss=-0.388][2026-03-23 13:46:22] Step: 2433, Training Logs: loss_final: 2.931169, loss_mean: 0.946226, proj_loss: -0.390072, loss_mean_cls: 2.375015, grad_norm: 4.783076 +Steps: 0%| | 2434/1000000 [10:02<67:26:40, 4.11it/s, grad_norm=4.78, loss_final=2.93, loss_mean=0.946, loss_mean_cls=2.38, proj_loss=-0.39][2026-03-23 13:46:22] Step: 2434, Training Logs: loss_final: 2.258577, loss_mean: 0.972419, proj_loss: -0.401942, loss_mean_cls: 1.688099, grad_norm: 7.221009 +Steps: 0%| | 2435/1000000 [10:02<67:26:01, 4.11it/s, grad_norm=7.22, loss_final=2.26, loss_mean=0.972, loss_mean_cls=1.69, proj_loss=-0.402][2026-03-23 13:46:22] Step: 2435, Training Logs: loss_final: 3.359217, loss_mean: 0.929519, proj_loss: -0.386313, loss_mean_cls: 2.816010, 
grad_norm: 6.113584 +Steps: 0%| | 2436/1000000 [10:02<67:26:07, 4.11it/s, grad_norm=6.11, loss_final=3.36, loss_mean=0.93, loss_mean_cls=2.82, proj_loss=-0.386][2026-03-23 13:46:22] Step: 2436, Training Logs: loss_final: 2.758827, loss_mean: 0.949622, proj_loss: -0.389090, loss_mean_cls: 2.198295, grad_norm: 7.743430 +Steps: 0%| | 2437/1000000 [10:03<67:25:02, 4.11it/s, grad_norm=7.74, loss_final=2.76, loss_mean=0.95, loss_mean_cls=2.2, proj_loss=-0.389][2026-03-23 13:46:23] Step: 2437, Training Logs: loss_final: 2.399020, loss_mean: 0.948049, proj_loss: -0.404718, loss_mean_cls: 1.855689, grad_norm: 7.370488 +Steps: 0%| | 2438/1000000 [10:03<67:28:36, 4.11it/s, grad_norm=7.37, loss_final=2.4, loss_mean=0.948, loss_mean_cls=1.86, proj_loss=-0.405][2026-03-23 13:46:23] Step: 2438, Training Logs: loss_final: 3.125315, loss_mean: 0.955396, proj_loss: -0.394611, loss_mean_cls: 2.564531, grad_norm: 26.438850 +Steps: 0%| | 2439/1000000 [10:03<67:26:51, 4.11it/s, grad_norm=26.4, loss_final=3.13, loss_mean=0.955, loss_mean_cls=2.56, proj_loss=-0.395][2026-03-23 13:46:23] Step: 2439, Training Logs: loss_final: 2.389089, loss_mean: 0.952278, proj_loss: -0.400789, loss_mean_cls: 1.837599, grad_norm: 6.603148 +Steps: 0%| | 2440/1000000 [10:03<67:26:49, 4.11it/s, grad_norm=6.6, loss_final=2.39, loss_mean=0.952, loss_mean_cls=1.84, proj_loss=-0.401][2026-03-23 13:46:23] Step: 2440, Training Logs: loss_final: 3.436980, loss_mean: 0.939178, proj_loss: -0.378796, loss_mean_cls: 2.876598, grad_norm: 21.254549 +Steps: 0%| | 2441/1000000 [10:04<67:27:31, 4.11it/s, grad_norm=21.3, loss_final=3.44, loss_mean=0.939, loss_mean_cls=2.88, proj_loss=-0.379][2026-03-23 13:46:24] Step: 2441, Training Logs: loss_final: 2.669592, loss_mean: 0.954402, proj_loss: -0.395774, loss_mean_cls: 2.110964, grad_norm: 11.274336 +Steps: 0%| | 2442/1000000 [10:04<67:26:52, 4.11it/s, grad_norm=11.3, loss_final=2.67, loss_mean=0.954, loss_mean_cls=2.11, proj_loss=-0.396][2026-03-23 13:46:24] Step: 2442, Training Logs: loss_final: 2.816905, loss_mean: 0.934405, proj_loss: -0.393250, loss_mean_cls: 2.275750, grad_norm: 6.895835 +Steps: 0%| | 2443/1000000 [10:04<67:27:42, 4.11it/s, grad_norm=6.9, loss_final=2.82, loss_mean=0.934, loss_mean_cls=2.28, proj_loss=-0.393][2026-03-23 13:46:24] Step: 2443, Training Logs: loss_final: 2.744403, loss_mean: 0.951143, proj_loss: -0.397204, loss_mean_cls: 2.190463, grad_norm: 10.362988 +Steps: 0%| | 2444/1000000 [10:04<67:25:22, 4.11it/s, grad_norm=10.4, loss_final=2.74, loss_mean=0.951, loss_mean_cls=2.19, proj_loss=-0.397][2026-03-23 13:46:24] Step: 2444, Training Logs: loss_final: 3.309115, loss_mean: 0.923605, proj_loss: -0.389159, loss_mean_cls: 2.774670, grad_norm: 17.128012 +Steps: 0%| | 2445/1000000 [10:05<67:25:02, 4.11it/s, grad_norm=17.1, loss_final=3.31, loss_mean=0.924, loss_mean_cls=2.77, proj_loss=-0.389][2026-03-23 13:46:25] Step: 2445, Training Logs: loss_final: 2.759860, loss_mean: 0.957911, proj_loss: -0.385668, loss_mean_cls: 2.187618, grad_norm: 11.345791 +Steps: 0%| | 2446/1000000 [10:05<67:25:46, 4.11it/s, grad_norm=11.3, loss_final=2.76, loss_mean=0.958, loss_mean_cls=2.19, proj_loss=-0.386][2026-03-23 13:46:25] Step: 2446, Training Logs: loss_final: 2.603497, loss_mean: 0.964388, proj_loss: -0.400228, loss_mean_cls: 2.039336, grad_norm: 6.938545 +Steps: 0%| | 2447/1000000 [10:05<67:24:54, 4.11it/s, grad_norm=6.94, loss_final=2.6, loss_mean=0.964, loss_mean_cls=2.04, proj_loss=-0.4][2026-03-23 13:46:25] Step: 2447, Training Logs: loss_final: 3.058109, loss_mean: 0.930631, 
proj_loss: -0.390921, loss_mean_cls: 2.518399, grad_norm: 2.073139 +Steps: 0%| | 2448/1000000 [10:05<67:26:33, 4.11it/s, grad_norm=2.07, loss_final=3.06, loss_mean=0.931, loss_mean_cls=2.52, proj_loss=-0.391][2026-03-23 13:46:25] Step: 2448, Training Logs: loss_final: 3.295578, loss_mean: 0.923073, proj_loss: -0.383253, loss_mean_cls: 2.755758, grad_norm: 3.448461 +Steps: 0%| | 2449/1000000 [10:06<67:25:53, 4.11it/s, grad_norm=3.45, loss_final=3.3, loss_mean=0.923, loss_mean_cls=2.76, proj_loss=-0.383][2026-03-23 13:46:26] Step: 2449, Training Logs: loss_final: 2.990002, loss_mean: 0.923316, proj_loss: -0.398450, loss_mean_cls: 2.465135, grad_norm: 16.724905 +Steps: 0%| | 2450/1000000 [10:06<67:26:39, 4.11it/s, grad_norm=16.7, loss_final=2.99, loss_mean=0.923, loss_mean_cls=2.47, proj_loss=-0.398][2026-03-23 13:46:26] Step: 2450, Training Logs: loss_final: 3.177397, loss_mean: 0.946359, proj_loss: -0.385869, loss_mean_cls: 2.616907, grad_norm: 5.380743 +Steps: 0%| | 2451/1000000 [10:06<67:24:30, 4.11it/s, grad_norm=5.38, loss_final=3.18, loss_mean=0.946, loss_mean_cls=2.62, proj_loss=-0.386][2026-03-23 13:46:26] Step: 2451, Training Logs: loss_final: 2.866597, loss_mean: 0.943042, proj_loss: -0.392968, loss_mean_cls: 2.316524, grad_norm: 12.075882 +Steps: 0%| | 2452/1000000 [10:06<67:24:43, 4.11it/s, grad_norm=12.1, loss_final=2.87, loss_mean=0.943, loss_mean_cls=2.32, proj_loss=-0.393][2026-03-23 13:46:26] Step: 2452, Training Logs: loss_final: 2.957762, loss_mean: 0.942876, proj_loss: -0.385086, loss_mean_cls: 2.399972, grad_norm: 5.557896 +Steps: 0%| | 2453/1000000 [10:07<67:24:15, 4.11it/s, grad_norm=5.56, loss_final=2.96, loss_mean=0.943, loss_mean_cls=2.4, proj_loss=-0.385][2026-03-23 13:46:27] Step: 2453, Training Logs: loss_final: 3.166430, loss_mean: 0.932999, proj_loss: -0.391779, loss_mean_cls: 2.625210, grad_norm: 19.673645 +Steps: 0%| | 2454/1000000 [10:07<67:24:56, 4.11it/s, grad_norm=19.7, loss_final=3.17, loss_mean=0.933, loss_mean_cls=2.63, proj_loss=-0.392][2026-03-23 13:46:27] Step: 2454, Training Logs: loss_final: 3.228982, loss_mean: 0.922975, proj_loss: -0.387827, loss_mean_cls: 2.693834, grad_norm: 13.477143 +Steps: 0%| | 2455/1000000 [10:07<67:25:23, 4.11it/s, grad_norm=13.5, loss_final=3.23, loss_mean=0.923, loss_mean_cls=2.69, proj_loss=-0.388][2026-03-23 13:46:27] Step: 2455, Training Logs: loss_final: 2.659129, loss_mean: 0.942712, proj_loss: -0.397498, loss_mean_cls: 2.113915, grad_norm: 5.782819 +Steps: 0%| | 2456/1000000 [10:07<67:24:09, 4.11it/s, grad_norm=5.78, loss_final=2.66, loss_mean=0.943, loss_mean_cls=2.11, proj_loss=-0.397][2026-03-23 13:46:27] Step: 2456, Training Logs: loss_final: 3.449233, loss_mean: 0.930123, proj_loss: -0.383568, loss_mean_cls: 2.902677, grad_norm: 14.888750 +Steps: 0%| | 2457/1000000 [10:08<67:27:33, 4.11it/s, grad_norm=14.9, loss_final=3.45, loss_mean=0.93, loss_mean_cls=2.9, proj_loss=-0.384][2026-03-23 13:46:27] Step: 2457, Training Logs: loss_final: 3.077062, loss_mean: 0.950161, proj_loss: -0.393959, loss_mean_cls: 2.520860, grad_norm: 13.566856 +Steps: 0%| | 2458/1000000 [10:08<67:26:21, 4.11it/s, grad_norm=13.6, loss_final=3.08, loss_mean=0.95, loss_mean_cls=2.52, proj_loss=-0.394][2026-03-23 13:46:28] Step: 2458, Training Logs: loss_final: 3.208894, loss_mean: 0.958299, proj_loss: -0.387552, loss_mean_cls: 2.638146, grad_norm: 6.013097 +Steps: 0%| | 2459/1000000 [10:08<67:26:44, 4.11it/s, grad_norm=6.01, loss_final=3.21, loss_mean=0.958, loss_mean_cls=2.64, proj_loss=-0.388][2026-03-23 13:46:28] Step: 2459, Training 
Logs: loss_final: 2.598863, loss_mean: 0.952967, proj_loss: -0.402292, loss_mean_cls: 2.048188, grad_norm: 14.680429 +Steps: 0%| | 2460/1000000 [10:08<67:25:53, 4.11it/s, grad_norm=14.7, loss_final=2.6, loss_mean=0.953, loss_mean_cls=2.05, proj_loss=-0.402][2026-03-23 13:46:28] Step: 2460, Training Logs: loss_final: 2.809784, loss_mean: 0.929952, proj_loss: -0.397389, loss_mean_cls: 2.277220, grad_norm: 24.259079 +Steps: 0%| | 2461/1000000 [10:09<67:25:45, 4.11it/s, grad_norm=24.3, loss_final=2.81, loss_mean=0.93, loss_mean_cls=2.28, proj_loss=-0.397][2026-03-23 13:46:28] Step: 2461, Training Logs: loss_final: 3.149467, loss_mean: 0.943536, proj_loss: -0.397787, loss_mean_cls: 2.603718, grad_norm: 22.451622 +Steps: 0%| | 2462/1000000 [10:09<67:25:05, 4.11it/s, grad_norm=22.5, loss_final=3.15, loss_mean=0.944, loss_mean_cls=2.6, proj_loss=-0.398][2026-03-23 13:46:29] Step: 2462, Training Logs: loss_final: 3.166907, loss_mean: 0.960225, proj_loss: -0.391486, loss_mean_cls: 2.598168, grad_norm: 26.487373 +Steps: 0%| | 2463/1000000 [10:09<67:24:23, 4.11it/s, grad_norm=26.5, loss_final=3.17, loss_mean=0.96, loss_mean_cls=2.6, proj_loss=-0.391][2026-03-23 13:46:29] Step: 2463, Training Logs: loss_final: 3.084430, loss_mean: 0.955490, proj_loss: -0.396557, loss_mean_cls: 2.525497, grad_norm: 21.707777 +Steps: 0%| | 2464/1000000 [10:09<67:24:10, 4.11it/s, grad_norm=21.7, loss_final=3.08, loss_mean=0.955, loss_mean_cls=2.53, proj_loss=-0.397][2026-03-23 13:46:29] Step: 2464, Training Logs: loss_final: 3.060201, loss_mean: 0.946131, proj_loss: -0.390143, loss_mean_cls: 2.504214, grad_norm: 9.582196 +Steps: 0%| | 2465/1000000 [10:09<67:24:05, 4.11it/s, grad_norm=9.58, loss_final=3.06, loss_mean=0.946, loss_mean_cls=2.5, proj_loss=-0.39][2026-03-23 13:46:29] Step: 2465, Training Logs: loss_final: 2.906149, loss_mean: 0.963006, proj_loss: -0.392677, loss_mean_cls: 2.335820, grad_norm: 3.431876 +Steps: 0%| | 2466/1000000 [10:10<67:24:30, 4.11it/s, grad_norm=3.43, loss_final=2.91, loss_mean=0.963, loss_mean_cls=2.34, proj_loss=-0.393][2026-03-23 13:46:30] Step: 2466, Training Logs: loss_final: 2.762997, loss_mean: 0.946137, proj_loss: -0.394567, loss_mean_cls: 2.211427, grad_norm: 15.253424 +Steps: 0%| | 2467/1000000 [10:10<67:27:00, 4.11it/s, grad_norm=15.3, loss_final=2.76, loss_mean=0.946, loss_mean_cls=2.21, proj_loss=-0.395][2026-03-23 13:46:30] Step: 2467, Training Logs: loss_final: 3.055204, loss_mean: 0.982786, proj_loss: -0.388040, loss_mean_cls: 2.460457, grad_norm: 7.581713 +Steps: 0%| | 2468/1000000 [10:10<67:26:24, 4.11it/s, grad_norm=7.58, loss_final=3.06, loss_mean=0.983, loss_mean_cls=2.46, proj_loss=-0.388][2026-03-23 13:46:30] Step: 2468, Training Logs: loss_final: 2.555756, loss_mean: 0.960110, proj_loss: -0.399049, loss_mean_cls: 1.994695, grad_norm: 8.530686 +Steps: 0%| | 2469/1000000 [10:10<67:26:35, 4.11it/s, grad_norm=8.53, loss_final=2.56, loss_mean=0.96, loss_mean_cls=1.99, proj_loss=-0.399][2026-03-23 13:46:30] Step: 2469, Training Logs: loss_final: 2.532526, loss_mean: 0.951458, proj_loss: -0.397532, loss_mean_cls: 1.978599, grad_norm: 22.039566 +Steps: 0%| | 2470/1000000 [10:11<67:26:30, 4.11it/s, grad_norm=22, loss_final=2.53, loss_mean=0.951, loss_mean_cls=1.98, proj_loss=-0.398][2026-03-23 13:46:31] Step: 2470, Training Logs: loss_final: 2.760738, loss_mean: 0.988251, proj_loss: -0.398872, loss_mean_cls: 2.171359, grad_norm: 5.773829 +Steps: 0%| | 2471/1000000 [10:11<67:26:25, 4.11it/s, grad_norm=5.77, loss_final=2.76, loss_mean=0.988, loss_mean_cls=2.17, 
proj_loss=-0.399][2026-03-23 13:46:31] Step: 2471, Training Logs: loss_final: 2.600564, loss_mean: 0.973279, proj_loss: -0.400969, loss_mean_cls: 2.028253, grad_norm: 7.025678 +Steps: 0%| | 2472/1000000 [10:11<67:26:21, 4.11it/s, grad_norm=7.03, loss_final=2.6, loss_mean=0.973, loss_mean_cls=2.03, proj_loss=-0.401][2026-03-23 13:46:31] Step: 2472, Training Logs: loss_final: 2.717258, loss_mean: 0.950603, proj_loss: -0.402701, loss_mean_cls: 2.169356, grad_norm: 7.217523 +Steps: 0%| | 2473/1000000 [10:11<68:35:41, 4.04it/s, grad_norm=7.22, loss_final=2.72, loss_mean=0.951, loss_mean_cls=2.17, proj_loss=-0.403][2026-03-23 13:46:31] Step: 2473, Training Logs: loss_final: 2.778785, loss_mean: 0.969458, proj_loss: -0.394870, loss_mean_cls: 2.204197, grad_norm: 3.900687 +Steps: 0%| | 2474/1000000 [10:12<68:15:01, 4.06it/s, grad_norm=3.9, loss_final=2.78, loss_mean=0.969, loss_mean_cls=2.2, proj_loss=-0.395][2026-03-23 13:46:32] Step: 2474, Training Logs: loss_final: 3.299288, loss_mean: 0.941991, proj_loss: -0.389933, loss_mean_cls: 2.747230, grad_norm: 9.116871 +Steps: 0%| | 2475/1000000 [10:12<67:59:04, 4.08it/s, grad_norm=9.12, loss_final=3.3, loss_mean=0.942, loss_mean_cls=2.75, proj_loss=-0.39][2026-03-23 13:46:32] Step: 2475, Training Logs: loss_final: 3.100463, loss_mean: 0.939617, proj_loss: -0.393898, loss_mean_cls: 2.554744, grad_norm: 12.173196 +Steps: 0%| | 2476/1000000 [10:12<67:49:14, 4.09it/s, grad_norm=12.2, loss_final=3.1, loss_mean=0.94, loss_mean_cls=2.55, proj_loss=-0.394][2026-03-23 13:46:32] Step: 2476, Training Logs: loss_final: 3.227695, loss_mean: 0.947833, proj_loss: -0.393758, loss_mean_cls: 2.673619, grad_norm: 12.745120 +Steps: 0%| | 2477/1000000 [10:12<67:43:24, 4.09it/s, grad_norm=12.7, loss_final=3.23, loss_mean=0.948, loss_mean_cls=2.67, proj_loss=-0.394][2026-03-23 13:46:32] Step: 2477, Training Logs: loss_final: 2.584507, loss_mean: 0.978267, proj_loss: -0.400302, loss_mean_cls: 2.006542, grad_norm: 4.016188 +Steps: 0%| | 2478/1000000 [10:13<67:38:23, 4.10it/s, grad_norm=4.02, loss_final=2.58, loss_mean=0.978, loss_mean_cls=2.01, proj_loss=-0.4][2026-03-23 13:46:33] Step: 2478, Training Logs: loss_final: 2.349428, loss_mean: 0.956197, proj_loss: -0.404617, loss_mean_cls: 1.797849, grad_norm: 4.318551 +Steps: 0%| | 2479/1000000 [10:13<67:36:29, 4.10it/s, grad_norm=4.32, loss_final=2.35, loss_mean=0.956, loss_mean_cls=1.8, proj_loss=-0.405][2026-03-23 13:46:33] Step: 2479, Training Logs: loss_final: 3.009212, loss_mean: 0.942218, proj_loss: -0.394067, loss_mean_cls: 2.461061, grad_norm: 6.324945 +Steps: 0%| | 2480/1000000 [10:13<67:33:41, 4.10it/s, grad_norm=6.32, loss_final=3.01, loss_mean=0.942, loss_mean_cls=2.46, proj_loss=-0.394][2026-03-23 13:46:33] Step: 2480, Training Logs: loss_final: 3.454341, loss_mean: 0.930515, proj_loss: -0.388059, loss_mean_cls: 2.911885, grad_norm: 4.728663 +Steps: 0%| | 2481/1000000 [10:13<67:33:34, 4.10it/s, grad_norm=4.73, loss_final=3.45, loss_mean=0.931, loss_mean_cls=2.91, proj_loss=-0.388][2026-03-23 13:46:33] Step: 2481, Training Logs: loss_final: 3.289883, loss_mean: 0.945672, proj_loss: -0.388150, loss_mean_cls: 2.732361, grad_norm: 3.275453 +Steps: 0%| | 2482/1000000 [10:14<67:32:05, 4.10it/s, grad_norm=3.28, loss_final=3.29, loss_mean=0.946, loss_mean_cls=2.73, proj_loss=-0.388][2026-03-23 13:46:34] Step: 2482, Training Logs: loss_final: 3.127621, loss_mean: 0.927831, proj_loss: -0.385844, loss_mean_cls: 2.585633, grad_norm: 2.942777 +Steps: 0%| | 2483/1000000 [10:14<67:29:58, 4.11it/s, grad_norm=2.94, loss_final=3.13, 
loss_mean=0.928, loss_mean_cls=2.59, proj_loss=-0.386][2026-03-23 13:46:34] Step: 2483, Training Logs: loss_final: 3.319135, loss_mean: 0.957055, proj_loss: -0.387465, loss_mean_cls: 2.749545, grad_norm: 18.172445 +Steps: 0%| | 2484/1000000 [10:14<67:30:36, 4.10it/s, grad_norm=18.2, loss_final=3.32, loss_mean=0.957, loss_mean_cls=2.75, proj_loss=-0.387][2026-03-23 13:46:34] Step: 2484, Training Logs: loss_final: 2.972980, loss_mean: 0.954161, proj_loss: -0.396022, loss_mean_cls: 2.414841, grad_norm: 4.957958 +Steps: 0%| | 2485/1000000 [10:14<67:31:43, 4.10it/s, grad_norm=4.96, loss_final=2.97, loss_mean=0.954, loss_mean_cls=2.41, proj_loss=-0.396][2026-03-23 13:46:34] Step: 2485, Training Logs: loss_final: 3.048467, loss_mean: 0.950323, proj_loss: -0.393494, loss_mean_cls: 2.491637, grad_norm: 4.237015 +Steps: 0%| | 2486/1000000 [10:15<67:30:58, 4.10it/s, grad_norm=4.24, loss_final=3.05, loss_mean=0.95, loss_mean_cls=2.49, proj_loss=-0.393][2026-03-23 13:46:35] Step: 2486, Training Logs: loss_final: 2.754138, loss_mean: 0.942870, proj_loss: -0.398666, loss_mean_cls: 2.209934, grad_norm: 15.894574 +Steps: 0%| | 2487/1000000 [10:15<68:06:16, 4.07it/s, grad_norm=15.9, loss_final=2.75, loss_mean=0.943, loss_mean_cls=2.21, proj_loss=-0.399][2026-03-23 13:46:35] Step: 2487, Training Logs: loss_final: 3.000856, loss_mean: 0.976388, proj_loss: -0.392272, loss_mean_cls: 2.416740, grad_norm: 7.936259 +Steps: 0%| | 2488/1000000 [10:15<67:54:35, 4.08it/s, grad_norm=7.94, loss_final=3, loss_mean=0.976, loss_mean_cls=2.42, proj_loss=-0.392][2026-03-23 13:46:35] Step: 2488, Training Logs: loss_final: 3.339794, loss_mean: 0.953811, proj_loss: -0.391270, loss_mean_cls: 2.777254, grad_norm: 7.287972 +Steps: 0%| | 2489/1000000 [10:15<67:45:39, 4.09it/s, grad_norm=7.29, loss_final=3.34, loss_mean=0.954, loss_mean_cls=2.78, proj_loss=-0.391][2026-03-23 13:46:35] Step: 2489, Training Logs: loss_final: 2.717393, loss_mean: 0.958472, proj_loss: -0.399196, loss_mean_cls: 2.158117, grad_norm: 8.543011 +Steps: 0%| | 2490/1000000 [10:16<67:39:30, 4.10it/s, grad_norm=8.54, loss_final=2.72, loss_mean=0.958, loss_mean_cls=2.16, proj_loss=-0.399][2026-03-23 13:46:36] Step: 2490, Training Logs: loss_final: 3.113756, loss_mean: 0.938616, proj_loss: -0.391419, loss_mean_cls: 2.566559, grad_norm: 3.510058 +Steps: 0%| | 2491/1000000 [10:16<67:36:21, 4.10it/s, grad_norm=3.51, loss_final=3.11, loss_mean=0.939, loss_mean_cls=2.57, proj_loss=-0.391][2026-03-23 13:46:36] Step: 2491, Training Logs: loss_final: 2.783130, loss_mean: 0.951701, proj_loss: -0.397438, loss_mean_cls: 2.228867, grad_norm: 4.055578 +Steps: 0%| | 2492/1000000 [10:16<67:33:45, 4.10it/s, grad_norm=4.06, loss_final=2.78, loss_mean=0.952, loss_mean_cls=2.23, proj_loss=-0.397][2026-03-23 13:46:36] Step: 2492, Training Logs: loss_final: 2.922144, loss_mean: 0.938869, proj_loss: -0.396453, loss_mean_cls: 2.379729, grad_norm: 9.457985 +Steps: 0%| | 2493/1000000 [10:16<67:31:47, 4.10it/s, grad_norm=9.46, loss_final=2.92, loss_mean=0.939, loss_mean_cls=2.38, proj_loss=-0.396][2026-03-23 13:46:36] Step: 2493, Training Logs: loss_final: 3.063731, loss_mean: 0.959941, proj_loss: -0.385831, loss_mean_cls: 2.489621, grad_norm: 4.145828 +Steps: 0%| | 2494/1000000 [10:17<67:30:48, 4.10it/s, grad_norm=4.15, loss_final=3.06, loss_mean=0.96, loss_mean_cls=2.49, proj_loss=-0.386][2026-03-23 13:46:37] Step: 2494, Training Logs: loss_final: 2.865777, loss_mean: 0.941649, proj_loss: -0.393889, loss_mean_cls: 2.318017, grad_norm: 13.548530 +Steps: 0%| | 2495/1000000 [10:17<67:29:56, 
4.11it/s, grad_norm=13.5, loss_final=2.87, loss_mean=0.942, loss_mean_cls=2.32, proj_loss=-0.394][2026-03-23 13:46:37] Step: 2495, Training Logs: loss_final: 2.968623, loss_mean: 0.939437, proj_loss: -0.399957, loss_mean_cls: 2.429143, grad_norm: 2.922508 +Steps: 0%| | 2496/1000000 [10:17<67:28:14, 4.11it/s, grad_norm=2.92, loss_final=2.97, loss_mean=0.939, loss_mean_cls=2.43, proj_loss=-0.4][2026-03-23 13:46:37] Step: 2496, Training Logs: loss_final: 2.459368, loss_mean: 0.951993, proj_loss: -0.400116, loss_mean_cls: 1.907491, grad_norm: 4.849279 +Steps: 0%| | 2497/1000000 [10:17<67:28:34, 4.11it/s, grad_norm=4.85, loss_final=2.46, loss_mean=0.952, loss_mean_cls=1.91, proj_loss=-0.4][2026-03-23 13:46:37] Step: 2497, Training Logs: loss_final: 2.971315, loss_mean: 0.960785, proj_loss: -0.389015, loss_mean_cls: 2.399545, grad_norm: 14.646505 +Steps: 0%| | 2498/1000000 [10:18<67:29:52, 4.11it/s, grad_norm=14.6, loss_final=2.97, loss_mean=0.961, loss_mean_cls=2.4, proj_loss=-0.389][2026-03-23 13:46:37] Step: 2498, Training Logs: loss_final: 2.995130, loss_mean: 0.947020, proj_loss: -0.390604, loss_mean_cls: 2.438714, grad_norm: 11.317253 +Steps: 0%| | 2499/1000000 [10:18<67:28:50, 4.11it/s, grad_norm=11.3, loss_final=3, loss_mean=0.947, loss_mean_cls=2.44, proj_loss=-0.391][2026-03-23 13:46:38] Step: 2499, Training Logs: loss_final: 2.854330, loss_mean: 0.959607, proj_loss: -0.386744, loss_mean_cls: 2.281467, grad_norm: 10.031837 +Steps: 0%| | 2500/1000000 [10:18<67:31:21, 4.10it/s, grad_norm=10, loss_final=2.85, loss_mean=0.96, loss_mean_cls=2.28, proj_loss=-0.387][2026-03-23 13:46:38] Step: 2500, Training Logs: loss_final: 2.826531, loss_mean: 0.948780, proj_loss: -0.396644, loss_mean_cls: 2.274395, grad_norm: 24.086061 +Steps: 0%| | 2501/1000000 [10:18<67:28:17, 4.11it/s, grad_norm=24.1, loss_final=2.83, loss_mean=0.949, loss_mean_cls=2.27, proj_loss=-0.397][2026-03-23 13:46:38] Step: 2501, Training Logs: loss_final: 2.687584, loss_mean: 0.954876, proj_loss: -0.392216, loss_mean_cls: 2.124924, grad_norm: 14.887847 +Steps: 0%| | 2502/1000000 [10:19<67:27:04, 4.11it/s, grad_norm=14.9, loss_final=2.69, loss_mean=0.955, loss_mean_cls=2.12, proj_loss=-0.392][2026-03-23 13:46:38] Step: 2502, Training Logs: loss_final: 3.070854, loss_mean: 0.945624, proj_loss: -0.386972, loss_mean_cls: 2.512202, grad_norm: 3.246238 +Steps: 0%| | 2503/1000000 [10:19<67:26:27, 4.11it/s, grad_norm=3.25, loss_final=3.07, loss_mean=0.946, loss_mean_cls=2.51, proj_loss=-0.387][2026-03-23 13:46:39] Step: 2503, Training Logs: loss_final: 2.842598, loss_mean: 0.944765, proj_loss: -0.395435, loss_mean_cls: 2.293269, grad_norm: 5.997876 +Steps: 0%| | 2504/1000000 [10:19<67:28:23, 4.11it/s, grad_norm=6, loss_final=2.84, loss_mean=0.945, loss_mean_cls=2.29, proj_loss=-0.395][2026-03-23 13:46:39] Step: 2504, Training Logs: loss_final: 3.042776, loss_mean: 0.944002, proj_loss: -0.392028, loss_mean_cls: 2.490803, grad_norm: 18.325062 +Steps: 0%| | 2505/1000000 [10:19<67:29:39, 4.11it/s, grad_norm=18.3, loss_final=3.04, loss_mean=0.944, loss_mean_cls=2.49, proj_loss=-0.392][2026-03-23 13:46:39] Step: 2505, Training Logs: loss_final: 3.022224, loss_mean: 0.954984, proj_loss: -0.392449, loss_mean_cls: 2.459689, grad_norm: 12.191711 +Steps: 0%| | 2506/1000000 [10:19<67:29:09, 4.11it/s, grad_norm=12.2, loss_final=3.02, loss_mean=0.955, loss_mean_cls=2.46, proj_loss=-0.392][2026-03-23 13:46:39] Step: 2506, Training Logs: loss_final: 2.609851, loss_mean: 0.965613, proj_loss: -0.394169, loss_mean_cls: 2.038406, grad_norm: 9.556293 
```
Steps: 0%| | 2507/1000000 [10:20<68:57:25, 4.02it/s, grad_norm=9.56, loss_final=2.61, loss_mean=0.966, loss_mean_cls=2.04, proj_loss=-0.394]
[2026-03-23 13:46:40] Step: 2507, Training Logs: loss_final: 3.084318, loss_mean: 0.936007, proj_loss: -0.394261, loss_mean_cls: 2.542572, grad_norm: 13.520138
[... per-step progress-bar and "Training Logs" lines for steps 2508-2753 omitted; every step repeats the same five metrics (loss_final, loss_mean, proj_loss, loss_mean_cls, grad_norm) at a steady throughput of ~4.1 it/s ...]
[2026-03-23 13:47:40] Step: 2754, Training Logs: loss_final: 2.348008, loss_mean: 0.980072, proj_loss: -0.399259, loss_mean_cls: 1.767195, grad_norm: 6.510905
Steps: 0%| | 2755/1000000 [11:20<68:15:32, 4.06it/s, grad_norm=6.51, loss_final=2.35, loss_mean=0.98, loss_mean_cls=1.77, proj_loss=-0.399][2026-03-23 13:47:40] Step: 2755,
```
Training Logs: loss_final: 2.957422, loss_mean: 0.944358, proj_loss: -0.393774, loss_mean_cls: 2.406838, grad_norm: 18.283525 +Steps: 0%| | 2756/1000000 [11:20<68:03:28, 4.07it/s, grad_norm=18.3, loss_final=2.96, loss_mean=0.944, loss_mean_cls=2.41, proj_loss=-0.394][2026-03-23 13:47:40] Step: 2756, Training Logs: loss_final: 3.192309, loss_mean: 0.921404, proj_loss: -0.393373, loss_mean_cls: 2.664279, grad_norm: 25.147501 +Steps: 0%| | 2757/1000000 [11:21<67:51:59, 4.08it/s, grad_norm=25.1, loss_final=3.19, loss_mean=0.921, loss_mean_cls=2.66, proj_loss=-0.393][2026-03-23 13:47:41] Step: 2757, Training Logs: loss_final: 2.800575, loss_mean: 0.961061, proj_loss: -0.395416, loss_mean_cls: 2.234929, grad_norm: 4.110896 +Steps: 0%| | 2758/1000000 [11:21<67:43:55, 4.09it/s, grad_norm=4.11, loss_final=2.8, loss_mean=0.961, loss_mean_cls=2.23, proj_loss=-0.395][2026-03-23 13:47:41] Step: 2758, Training Logs: loss_final: 2.681858, loss_mean: 0.940140, proj_loss: -0.405333, loss_mean_cls: 2.147051, grad_norm: 4.502959 +Steps: 0%| | 2759/1000000 [11:21<67:38:11, 4.10it/s, grad_norm=4.5, loss_final=2.68, loss_mean=0.94, loss_mean_cls=2.15, proj_loss=-0.405][2026-03-23 13:47:41] Step: 2759, Training Logs: loss_final: 3.108878, loss_mean: 0.945158, proj_loss: -0.389646, loss_mean_cls: 2.553366, grad_norm: 6.549894 +Steps: 0%| | 2760/1000000 [11:21<67:33:54, 4.10it/s, grad_norm=6.55, loss_final=3.11, loss_mean=0.945, loss_mean_cls=2.55, proj_loss=-0.39][2026-03-23 13:47:41] Step: 2760, Training Logs: loss_final: 3.198083, loss_mean: 0.932451, proj_loss: -0.393340, loss_mean_cls: 2.658972, grad_norm: 4.165552 +Steps: 0%| | 2761/1000000 [11:22<67:31:08, 4.10it/s, grad_norm=4.17, loss_final=3.2, loss_mean=0.932, loss_mean_cls=2.66, proj_loss=-0.393][2026-03-23 13:47:42] Step: 2761, Training Logs: loss_final: 2.678616, loss_mean: 0.941354, proj_loss: -0.404557, loss_mean_cls: 2.141819, grad_norm: 16.383516 +Steps: 0%| | 2762/1000000 [11:22<67:28:34, 4.11it/s, grad_norm=16.4, loss_final=2.68, loss_mean=0.941, loss_mean_cls=2.14, proj_loss=-0.405][2026-03-23 13:47:42] Step: 2762, Training Logs: loss_final: 3.064936, loss_mean: 0.925156, proj_loss: -0.391723, loss_mean_cls: 2.531504, grad_norm: 3.162985 +Steps: 0%| | 2763/1000000 [11:22<67:33:06, 4.10it/s, grad_norm=3.16, loss_final=3.06, loss_mean=0.925, loss_mean_cls=2.53, proj_loss=-0.392][2026-03-23 13:47:42] Step: 2763, Training Logs: loss_final: 2.768882, loss_mean: 0.934857, proj_loss: -0.396209, loss_mean_cls: 2.230234, grad_norm: 5.062262 +Steps: 0%| | 2764/1000000 [11:22<67:31:27, 4.10it/s, grad_norm=5.06, loss_final=2.77, loss_mean=0.935, loss_mean_cls=2.23, proj_loss=-0.396][2026-03-23 13:47:42] Step: 2764, Training Logs: loss_final: 2.849649, loss_mean: 0.950748, proj_loss: -0.397039, loss_mean_cls: 2.295940, grad_norm: 3.540677 +Steps: 0%| | 2765/1000000 [11:23<67:29:45, 4.10it/s, grad_norm=3.54, loss_final=2.85, loss_mean=0.951, loss_mean_cls=2.3, proj_loss=-0.397][2026-03-23 13:47:43] Step: 2765, Training Logs: loss_final: 3.097678, loss_mean: 0.940797, proj_loss: -0.394244, loss_mean_cls: 2.551125, grad_norm: 11.910502 +Steps: 0%| | 2766/1000000 [11:23<67:28:35, 4.11it/s, grad_norm=11.9, loss_final=3.1, loss_mean=0.941, loss_mean_cls=2.55, proj_loss=-0.394][2026-03-23 13:47:43] Step: 2766, Training Logs: loss_final: 3.069612, loss_mean: 0.926005, proj_loss: -0.393494, loss_mean_cls: 2.537100, grad_norm: 14.399607 +Steps: 0%| | 2767/1000000 [11:23<67:25:40, 4.11it/s, grad_norm=14.4, loss_final=3.07, loss_mean=0.926, loss_mean_cls=2.54, 
proj_loss=-0.393][2026-03-23 13:47:43] Step: 2767, Training Logs: loss_final: 2.592975, loss_mean: 0.964873, proj_loss: -0.389061, loss_mean_cls: 2.017164, grad_norm: 16.336538 +Steps: 0%| | 2768/1000000 [11:23<67:24:01, 4.11it/s, grad_norm=16.3, loss_final=2.59, loss_mean=0.965, loss_mean_cls=2.02, proj_loss=-0.389][2026-03-23 13:47:43] Step: 2768, Training Logs: loss_final: 2.791446, loss_mean: 0.953153, proj_loss: -0.393716, loss_mean_cls: 2.232010, grad_norm: 3.527158 +Steps: 0%| | 2769/1000000 [11:24<67:25:25, 4.11it/s, grad_norm=3.53, loss_final=2.79, loss_mean=0.953, loss_mean_cls=2.23, proj_loss=-0.394][2026-03-23 13:47:44] Step: 2769, Training Logs: loss_final: 3.185719, loss_mean: 0.945707, proj_loss: -0.394437, loss_mean_cls: 2.634449, grad_norm: 5.305788 +Steps: 0%| | 2770/1000000 [11:24<67:29:22, 4.10it/s, grad_norm=5.31, loss_final=3.19, loss_mean=0.946, loss_mean_cls=2.63, proj_loss=-0.394][2026-03-23 13:47:44] Step: 2770, Training Logs: loss_final: 2.783805, loss_mean: 0.969764, proj_loss: -0.390601, loss_mean_cls: 2.204643, grad_norm: 12.037407 +Steps: 0%| | 2771/1000000 [11:24<67:28:28, 4.11it/s, grad_norm=12, loss_final=2.78, loss_mean=0.97, loss_mean_cls=2.2, proj_loss=-0.391][2026-03-23 13:47:44] Step: 2771, Training Logs: loss_final: 2.908765, loss_mean: 0.936037, proj_loss: -0.391041, loss_mean_cls: 2.363768, grad_norm: 4.202331 +Steps: 0%| | 2772/1000000 [11:24<67:26:13, 4.11it/s, grad_norm=4.2, loss_final=2.91, loss_mean=0.936, loss_mean_cls=2.36, proj_loss=-0.391][2026-03-23 13:47:44] Step: 2772, Training Logs: loss_final: 2.476318, loss_mean: 0.963668, proj_loss: -0.406725, loss_mean_cls: 1.919376, grad_norm: 7.354016 +Steps: 0%| | 2773/1000000 [11:25<67:25:43, 4.11it/s, grad_norm=7.35, loss_final=2.48, loss_mean=0.964, loss_mean_cls=1.92, proj_loss=-0.407][2026-03-23 13:47:45] Step: 2773, Training Logs: loss_final: 2.930651, loss_mean: 0.920832, proj_loss: -0.398059, loss_mean_cls: 2.407879, grad_norm: 5.297656 +Steps: 0%| | 2774/1000000 [11:25<67:24:14, 4.11it/s, grad_norm=5.3, loss_final=2.93, loss_mean=0.921, loss_mean_cls=2.41, proj_loss=-0.398][2026-03-23 13:47:45] Step: 2774, Training Logs: loss_final: 2.681591, loss_mean: 0.935612, proj_loss: -0.400465, loss_mean_cls: 2.146443, grad_norm: 13.442483 +Steps: 0%| | 2775/1000000 [11:25<67:26:28, 4.11it/s, grad_norm=13.4, loss_final=2.68, loss_mean=0.936, loss_mean_cls=2.15, proj_loss=-0.4][2026-03-23 13:47:45] Step: 2775, Training Logs: loss_final: 2.638291, loss_mean: 0.958814, proj_loss: -0.402171, loss_mean_cls: 2.081647, grad_norm: 4.163233 +Steps: 0%| | 2776/1000000 [11:25<67:25:46, 4.11it/s, grad_norm=4.16, loss_final=2.64, loss_mean=0.959, loss_mean_cls=2.08, proj_loss=-0.402][2026-03-23 13:47:45] Step: 2776, Training Logs: loss_final: 2.623572, loss_mean: 0.932296, proj_loss: -0.403156, loss_mean_cls: 2.094432, grad_norm: 2.749206 +Steps: 0%| | 2777/1000000 [11:26<67:30:09, 4.10it/s, grad_norm=2.75, loss_final=2.62, loss_mean=0.932, loss_mean_cls=2.09, proj_loss=-0.403][2026-03-23 13:47:46] Step: 2777, Training Logs: loss_final: 2.189993, loss_mean: 0.962393, proj_loss: -0.409973, loss_mean_cls: 1.637573, grad_norm: 6.720041 +Steps: 0%| | 2778/1000000 [11:26<67:28:49, 4.10it/s, grad_norm=6.72, loss_final=2.19, loss_mean=0.962, loss_mean_cls=1.64, proj_loss=-0.41][2026-03-23 13:47:46] Step: 2778, Training Logs: loss_final: 3.210929, loss_mean: 0.943995, proj_loss: -0.391401, loss_mean_cls: 2.658335, grad_norm: 6.754708 +Steps: 0%| | 2779/1000000 [11:26<67:25:21, 4.11it/s, grad_norm=6.75, 
loss_final=3.21, loss_mean=0.944, loss_mean_cls=2.66, proj_loss=-0.391][2026-03-23 13:47:46] Step: 2779, Training Logs: loss_final: 2.747076, loss_mean: 0.967005, proj_loss: -0.397675, loss_mean_cls: 2.177746, grad_norm: 4.593230 +Steps: 0%| | 2780/1000000 [11:26<67:24:30, 4.11it/s, grad_norm=4.59, loss_final=2.75, loss_mean=0.967, loss_mean_cls=2.18, proj_loss=-0.398][2026-03-23 13:47:46] Step: 2780, Training Logs: loss_final: 3.137655, loss_mean: 0.916836, proj_loss: -0.398098, loss_mean_cls: 2.618917, grad_norm: 2.053438 +Steps: 0%| | 2781/1000000 [11:27<67:23:52, 4.11it/s, grad_norm=2.05, loss_final=3.14, loss_mean=0.917, loss_mean_cls=2.62, proj_loss=-0.398][2026-03-23 13:47:47] Step: 2781, Training Logs: loss_final: 3.348737, loss_mean: 0.920837, proj_loss: -0.387878, loss_mean_cls: 2.815778, grad_norm: 7.425600 +Steps: 0%| | 2782/1000000 [11:27<67:27:17, 4.11it/s, grad_norm=7.43, loss_final=3.35, loss_mean=0.921, loss_mean_cls=2.82, proj_loss=-0.388][2026-03-23 13:47:47] Step: 2782, Training Logs: loss_final: 2.641193, loss_mean: 0.977954, proj_loss: -0.399889, loss_mean_cls: 2.063128, grad_norm: 8.236914 +Steps: 0%| | 2783/1000000 [11:27<67:26:25, 4.11it/s, grad_norm=8.24, loss_final=2.64, loss_mean=0.978, loss_mean_cls=2.06, proj_loss=-0.4][2026-03-23 13:47:47] Step: 2783, Training Logs: loss_final: 2.926221, loss_mean: 0.938478, proj_loss: -0.395858, loss_mean_cls: 2.383602, grad_norm: 3.782197 +Steps: 0%| | 2784/1000000 [11:27<67:30:43, 4.10it/s, grad_norm=3.78, loss_final=2.93, loss_mean=0.938, loss_mean_cls=2.38, proj_loss=-0.396][2026-03-23 13:47:47] Step: 2784, Training Logs: loss_final: 2.562377, loss_mean: 0.933995, proj_loss: -0.401054, loss_mean_cls: 2.029436, grad_norm: 7.789555 +Steps: 0%| | 2785/1000000 [11:28<67:30:21, 4.10it/s, grad_norm=7.79, loss_final=2.56, loss_mean=0.934, loss_mean_cls=2.03, proj_loss=-0.401][2026-03-23 13:47:47] Step: 2785, Training Logs: loss_final: 3.469964, loss_mean: 0.946802, proj_loss: -0.390819, loss_mean_cls: 2.913981, grad_norm: 11.304055 +Steps: 0%| | 2786/1000000 [11:28<67:30:12, 4.10it/s, grad_norm=11.3, loss_final=3.47, loss_mean=0.947, loss_mean_cls=2.91, proj_loss=-0.391][2026-03-23 13:47:48] Step: 2786, Training Logs: loss_final: 3.135548, loss_mean: 0.940693, proj_loss: -0.383247, loss_mean_cls: 2.578103, grad_norm: 8.910704 +Steps: 0%| | 2787/1000000 [11:28<67:29:43, 4.10it/s, grad_norm=8.91, loss_final=3.14, loss_mean=0.941, loss_mean_cls=2.58, proj_loss=-0.383][2026-03-23 13:47:48] Step: 2787, Training Logs: loss_final: 2.979590, loss_mean: 0.959722, proj_loss: -0.395278, loss_mean_cls: 2.415146, grad_norm: 3.218914 +Steps: 0%| | 2788/1000000 [11:28<67:28:44, 4.11it/s, grad_norm=3.22, loss_final=2.98, loss_mean=0.96, loss_mean_cls=2.42, proj_loss=-0.395][2026-03-23 13:47:48] Step: 2788, Training Logs: loss_final: 3.269982, loss_mean: 0.917241, proj_loss: -0.396302, loss_mean_cls: 2.749043, grad_norm: 15.140548 +Steps: 0%| | 2789/1000000 [11:28<67:27:27, 4.11it/s, grad_norm=15.1, loss_final=3.27, loss_mean=0.917, loss_mean_cls=2.75, proj_loss=-0.396][2026-03-23 13:47:48] Step: 2789, Training Logs: loss_final: 2.809405, loss_mean: 0.959398, proj_loss: -0.405496, loss_mean_cls: 2.255503, grad_norm: 12.603086 +Steps: 0%| | 2790/1000000 [11:29<67:29:46, 4.10it/s, grad_norm=12.6, loss_final=2.81, loss_mean=0.959, loss_mean_cls=2.26, proj_loss=-0.405][2026-03-23 13:47:49] Step: 2790, Training Logs: loss_final: 2.851832, loss_mean: 0.957436, proj_loss: -0.398733, loss_mean_cls: 2.293129, grad_norm: 14.194839 +Steps: 0%| | 
2791/1000000 [11:29<67:33:09, 4.10it/s, grad_norm=14.2, loss_final=2.85, loss_mean=0.957, loss_mean_cls=2.29, proj_loss=-0.399][2026-03-23 13:47:49] Step: 2791, Training Logs: loss_final: 2.939415, loss_mean: 0.964650, proj_loss: -0.385383, loss_mean_cls: 2.360147, grad_norm: 8.521876 +Steps: 0%| | 2792/1000000 [11:29<67:31:02, 4.10it/s, grad_norm=8.52, loss_final=2.94, loss_mean=0.965, loss_mean_cls=2.36, proj_loss=-0.385][2026-03-23 13:47:49] Step: 2792, Training Logs: loss_final: 2.970474, loss_mean: 0.957780, proj_loss: -0.392453, loss_mean_cls: 2.405147, grad_norm: 16.188547 +Steps: 0%| | 2793/1000000 [11:29<67:34:16, 4.10it/s, grad_norm=16.2, loss_final=2.97, loss_mean=0.958, loss_mean_cls=2.41, proj_loss=-0.392][2026-03-23 13:47:49] Step: 2793, Training Logs: loss_final: 3.171649, loss_mean: 0.938828, proj_loss: -0.389623, loss_mean_cls: 2.622445, grad_norm: 19.169994 +Steps: 0%| | 2794/1000000 [11:30<67:31:05, 4.10it/s, grad_norm=19.2, loss_final=3.17, loss_mean=0.939, loss_mean_cls=2.62, proj_loss=-0.39][2026-03-23 13:47:50] Step: 2794, Training Logs: loss_final: 3.224722, loss_mean: 0.954991, proj_loss: -0.389218, loss_mean_cls: 2.658949, grad_norm: 3.865413 +Steps: 0%| | 2795/1000000 [11:30<67:30:23, 4.10it/s, grad_norm=3.87, loss_final=3.22, loss_mean=0.955, loss_mean_cls=2.66, proj_loss=-0.389][2026-03-23 13:47:50] Step: 2795, Training Logs: loss_final: 3.411674, loss_mean: 0.917561, proj_loss: -0.388248, loss_mean_cls: 2.882362, grad_norm: 23.216776 +Steps: 0%| | 2796/1000000 [11:30<67:28:15, 4.11it/s, grad_norm=23.2, loss_final=3.41, loss_mean=0.918, loss_mean_cls=2.88, proj_loss=-0.388][2026-03-23 13:47:50] Step: 2796, Training Logs: loss_final: 2.761412, loss_mean: 0.978914, proj_loss: -0.390978, loss_mean_cls: 2.173475, grad_norm: 16.096058 +Steps: 0%| | 2797/1000000 [11:30<67:26:54, 4.11it/s, grad_norm=16.1, loss_final=2.76, loss_mean=0.979, loss_mean_cls=2.17, proj_loss=-0.391][2026-03-23 13:47:50] Step: 2797, Training Logs: loss_final: 2.944627, loss_mean: 1.007758, proj_loss: -0.393055, loss_mean_cls: 2.329924, grad_norm: 9.748968 +Steps: 0%| | 2798/1000000 [11:31<67:30:59, 4.10it/s, grad_norm=9.75, loss_final=2.94, loss_mean=1.01, loss_mean_cls=2.33, proj_loss=-0.393][2026-03-23 13:47:51] Step: 2798, Training Logs: loss_final: 2.667235, loss_mean: 0.970090, proj_loss: -0.391586, loss_mean_cls: 2.088731, grad_norm: 8.956702 +Steps: 0%| | 2799/1000000 [11:31<67:29:26, 4.10it/s, grad_norm=8.96, loss_final=2.67, loss_mean=0.97, loss_mean_cls=2.09, proj_loss=-0.392][2026-03-23 13:47:51] Step: 2799, Training Logs: loss_final: 3.033401, loss_mean: 0.955108, proj_loss: -0.389642, loss_mean_cls: 2.467935, grad_norm: 8.054405 +Steps: 0%| | 2800/1000000 [11:31<67:27:48, 4.11it/s, grad_norm=8.05, loss_final=3.03, loss_mean=0.955, loss_mean_cls=2.47, proj_loss=-0.39][2026-03-23 13:47:51] Step: 2800, Training Logs: loss_final: 2.853706, loss_mean: 0.945939, proj_loss: -0.401659, loss_mean_cls: 2.309426, grad_norm: 3.764273 +Steps: 0%| | 2801/1000000 [11:31<67:25:46, 4.11it/s, grad_norm=3.76, loss_final=2.85, loss_mean=0.946, loss_mean_cls=2.31, proj_loss=-0.402][2026-03-23 13:47:51] Step: 2801, Training Logs: loss_final: 2.808757, loss_mean: 0.960858, proj_loss: -0.392366, loss_mean_cls: 2.240265, grad_norm: 17.228695 +Steps: 0%| | 2802/1000000 [11:32<67:25:53, 4.11it/s, grad_norm=17.2, loss_final=2.81, loss_mean=0.961, loss_mean_cls=2.24, proj_loss=-0.392][2026-03-23 13:47:52] Step: 2802, Training Logs: loss_final: 2.912164, loss_mean: 0.962320, proj_loss: -0.382848, 
loss_mean_cls: 2.332692, grad_norm: 5.235146 +Steps: 0%| | 2803/1000000 [11:32<67:28:36, 4.11it/s, grad_norm=5.24, loss_final=2.91, loss_mean=0.962, loss_mean_cls=2.33, proj_loss=-0.383][2026-03-23 13:47:52] Step: 2803, Training Logs: loss_final: 2.256941, loss_mean: 0.968787, proj_loss: -0.406896, loss_mean_cls: 1.695050, grad_norm: 8.394691 +Steps: 0%| | 2804/1000000 [11:32<67:27:14, 4.11it/s, grad_norm=8.39, loss_final=2.26, loss_mean=0.969, loss_mean_cls=1.7, proj_loss=-0.407][2026-03-23 13:47:52] Step: 2804, Training Logs: loss_final: 2.921212, loss_mean: 0.954223, proj_loss: -0.388715, loss_mean_cls: 2.355704, grad_norm: 15.030197 +Steps: 0%| | 2805/1000000 [11:32<67:25:49, 4.11it/s, grad_norm=15, loss_final=2.92, loss_mean=0.954, loss_mean_cls=2.36, proj_loss=-0.389][2026-03-23 13:47:52] Step: 2805, Training Logs: loss_final: 3.411204, loss_mean: 0.931194, proj_loss: -0.364366, loss_mean_cls: 2.844376, grad_norm: 15.143748 +Steps: 0%| | 2806/1000000 [11:33<67:24:00, 4.11it/s, grad_norm=15.1, loss_final=3.41, loss_mean=0.931, loss_mean_cls=2.84, proj_loss=-0.364][2026-03-23 13:47:53] Step: 2806, Training Logs: loss_final: 2.959542, loss_mean: 0.976108, proj_loss: -0.376150, loss_mean_cls: 2.359584, grad_norm: 10.222049 +Steps: 0%| | 2807/1000000 [11:33<67:23:26, 4.11it/s, grad_norm=10.2, loss_final=2.96, loss_mean=0.976, loss_mean_cls=2.36, proj_loss=-0.376][2026-03-23 13:47:53] Step: 2807, Training Logs: loss_final: 2.914014, loss_mean: 0.917907, proj_loss: -0.394901, loss_mean_cls: 2.391008, grad_norm: 8.521851 +Steps: 0%| | 2808/1000000 [11:33<67:22:49, 4.11it/s, grad_norm=8.52, loss_final=2.91, loss_mean=0.918, loss_mean_cls=2.39, proj_loss=-0.395][2026-03-23 13:47:53] Step: 2808, Training Logs: loss_final: 3.086570, loss_mean: 0.927176, proj_loss: -0.394466, loss_mean_cls: 2.553859, grad_norm: 12.269035 +Steps: 0%| | 2809/1000000 [11:33<67:21:58, 4.11it/s, grad_norm=12.3, loss_final=3.09, loss_mean=0.927, loss_mean_cls=2.55, proj_loss=-0.394][2026-03-23 13:47:53] Step: 2809, Training Logs: loss_final: 2.674381, loss_mean: 0.947752, proj_loss: -0.397724, loss_mean_cls: 2.124353, grad_norm: 6.271772 +Steps: 0%| | 2810/1000000 [11:34<67:23:48, 4.11it/s, grad_norm=6.27, loss_final=2.67, loss_mean=0.948, loss_mean_cls=2.12, proj_loss=-0.398][2026-03-23 13:47:54] Step: 2810, Training Logs: loss_final: 3.231098, loss_mean: 0.917032, proj_loss: -0.393645, loss_mean_cls: 2.707711, grad_norm: 10.710782 +Steps: 0%| | 2811/1000000 [11:34<67:23:45, 4.11it/s, grad_norm=10.7, loss_final=3.23, loss_mean=0.917, loss_mean_cls=2.71, proj_loss=-0.394][2026-03-23 13:47:54] Step: 2811, Training Logs: loss_final: 3.393971, loss_mean: 0.913930, proj_loss: -0.389211, loss_mean_cls: 2.869252, grad_norm: 5.456219 +Steps: 0%| | 2812/1000000 [11:34<67:25:37, 4.11it/s, grad_norm=5.46, loss_final=3.39, loss_mean=0.914, loss_mean_cls=2.87, proj_loss=-0.389][2026-03-23 13:47:54] Step: 2812, Training Logs: loss_final: 2.822846, loss_mean: 0.925121, proj_loss: -0.401366, loss_mean_cls: 2.299091, grad_norm: 5.288928 +Steps: 0%| | 2813/1000000 [11:34<67:24:41, 4.11it/s, grad_norm=5.29, loss_final=2.82, loss_mean=0.925, loss_mean_cls=2.3, proj_loss=-0.401][2026-03-23 13:47:54] Step: 2813, Training Logs: loss_final: 2.954580, loss_mean: 0.932224, proj_loss: -0.396665, loss_mean_cls: 2.419021, grad_norm: 11.833790 +Steps: 0%| | 2814/1000000 [11:35<67:24:54, 4.11it/s, grad_norm=11.8, loss_final=2.95, loss_mean=0.932, loss_mean_cls=2.42, proj_loss=-0.397][2026-03-23 13:47:55] Step: 2814, Training Logs: loss_final: 
2.656783, loss_mean: 0.946799, proj_loss: -0.400121, loss_mean_cls: 2.110106, grad_norm: 7.298584 +Steps: 0%| | 2815/1000000 [11:35<67:26:54, 4.11it/s, grad_norm=7.3, loss_final=2.66, loss_mean=0.947, loss_mean_cls=2.11, proj_loss=-0.4][2026-03-23 13:47:55] Step: 2815, Training Logs: loss_final: 2.673063, loss_mean: 0.937670, proj_loss: -0.399516, loss_mean_cls: 2.134909, grad_norm: 2.680294 +Steps: 0%| | 2816/1000000 [11:35<67:25:31, 4.11it/s, grad_norm=2.68, loss_final=2.67, loss_mean=0.938, loss_mean_cls=2.13, proj_loss=-0.4][2026-03-23 13:47:55] Step: 2816, Training Logs: loss_final: 2.893099, loss_mean: 0.922231, proj_loss: -0.404300, loss_mean_cls: 2.375168, grad_norm: 6.037982 +Steps: 0%| | 2817/1000000 [11:35<67:25:16, 4.11it/s, grad_norm=6.04, loss_final=2.89, loss_mean=0.922, loss_mean_cls=2.38, proj_loss=-0.404][2026-03-23 13:47:55] Step: 2817, Training Logs: loss_final: 2.878713, loss_mean: 0.951420, proj_loss: -0.399550, loss_mean_cls: 2.326843, grad_norm: 5.493019 +Steps: 0%| | 2818/1000000 [11:36<67:29:56, 4.10it/s, grad_norm=5.49, loss_final=2.88, loss_mean=0.951, loss_mean_cls=2.33, proj_loss=-0.4][2026-03-23 13:47:56] Step: 2818, Training Logs: loss_final: 2.796540, loss_mean: 0.931370, proj_loss: -0.393721, loss_mean_cls: 2.258891, grad_norm: 7.588803 +Steps: 0%| | 2819/1000000 [11:36<67:28:35, 4.11it/s, grad_norm=7.59, loss_final=2.8, loss_mean=0.931, loss_mean_cls=2.26, proj_loss=-0.394][2026-03-23 13:47:56] Step: 2819, Training Logs: loss_final: 2.690726, loss_mean: 0.968719, proj_loss: -0.401964, loss_mean_cls: 2.123971, grad_norm: 11.966952 +Steps: 0%| | 2820/1000000 [11:36<67:27:11, 4.11it/s, grad_norm=12, loss_final=2.69, loss_mean=0.969, loss_mean_cls=2.12, proj_loss=-0.402][2026-03-23 13:47:56] Step: 2820, Training Logs: loss_final: 3.133517, loss_mean: 0.908766, proj_loss: -0.394447, loss_mean_cls: 2.619198, grad_norm: 9.376453 +Steps: 0%| | 2821/1000000 [11:36<67:26:30, 4.11it/s, grad_norm=9.38, loss_final=3.13, loss_mean=0.909, loss_mean_cls=2.62, proj_loss=-0.394][2026-03-23 13:47:56] Step: 2821, Training Logs: loss_final: 2.852016, loss_mean: 0.929866, proj_loss: -0.405288, loss_mean_cls: 2.327439, grad_norm: 9.966318 +Steps: 0%| | 2822/1000000 [11:37<67:32:25, 4.10it/s, grad_norm=9.97, loss_final=2.85, loss_mean=0.93, loss_mean_cls=2.33, proj_loss=-0.405][2026-03-23 13:47:56] Step: 2822, Training Logs: loss_final: 3.008884, loss_mean: 0.936353, proj_loss: -0.399216, loss_mean_cls: 2.471746, grad_norm: 3.031483 +Steps: 0%| | 2823/1000000 [11:37<67:30:18, 4.10it/s, grad_norm=3.03, loss_final=3.01, loss_mean=0.936, loss_mean_cls=2.47, proj_loss=-0.399][2026-03-23 13:47:57] Step: 2823, Training Logs: loss_final: 2.915038, loss_mean: 0.912852, proj_loss: -0.403173, loss_mean_cls: 2.405358, grad_norm: 2.367294 +Steps: 0%| | 2824/1000000 [11:37<67:27:09, 4.11it/s, grad_norm=2.37, loss_final=2.92, loss_mean=0.913, loss_mean_cls=2.41, proj_loss=-0.403][2026-03-23 13:47:57] Step: 2824, Training Logs: loss_final: 2.727036, loss_mean: 0.935403, proj_loss: -0.398985, loss_mean_cls: 2.190617, grad_norm: 8.255743 +Steps: 0%| | 2825/1000000 [11:37<67:24:50, 4.11it/s, grad_norm=8.26, loss_final=2.73, loss_mean=0.935, loss_mean_cls=2.19, proj_loss=-0.399][2026-03-23 13:47:57] Step: 2825, Training Logs: loss_final: 3.009763, loss_mean: 0.933407, proj_loss: -0.394539, loss_mean_cls: 2.470895, grad_norm: 9.229704 +Steps: 0%| | 2826/1000000 [11:38<67:25:21, 4.11it/s, grad_norm=9.23, loss_final=3.01, loss_mean=0.933, loss_mean_cls=2.47, proj_loss=-0.395][2026-03-23 13:47:57] 
Step: 2826, Training Logs: loss_final: 2.958583, loss_mean: 0.947368, proj_loss: -0.401396, loss_mean_cls: 2.412610, grad_norm: 24.553022 +Steps: 0%| | 2827/1000000 [11:38<67:24:52, 4.11it/s, grad_norm=24.6, loss_final=2.96, loss_mean=0.947, loss_mean_cls=2.41, proj_loss=-0.401][2026-03-23 13:47:58] Step: 2827, Training Logs: loss_final: 2.622139, loss_mean: 0.947599, proj_loss: -0.403516, loss_mean_cls: 2.078055, grad_norm: 3.223913 +Steps: 0%| | 2828/1000000 [11:38<67:24:58, 4.11it/s, grad_norm=3.22, loss_final=2.62, loss_mean=0.948, loss_mean_cls=2.08, proj_loss=-0.404][2026-03-23 13:47:58] Step: 2828, Training Logs: loss_final: 2.950932, loss_mean: 0.937581, proj_loss: -0.396230, loss_mean_cls: 2.409581, grad_norm: 3.885195 +Steps: 0%| | 2829/1000000 [11:38<67:25:49, 4.11it/s, grad_norm=3.89, loss_final=2.95, loss_mean=0.938, loss_mean_cls=2.41, proj_loss=-0.396][2026-03-23 13:47:58] Step: 2829, Training Logs: loss_final: 2.847112, loss_mean: 0.937927, proj_loss: -0.401234, loss_mean_cls: 2.310419, grad_norm: 3.360793 +Steps: 0%| | 2830/1000000 [11:38<67:24:14, 4.11it/s, grad_norm=3.36, loss_final=2.85, loss_mean=0.938, loss_mean_cls=2.31, proj_loss=-0.401][2026-03-23 13:47:58] Step: 2830, Training Logs: loss_final: 3.037692, loss_mean: 0.929816, proj_loss: -0.400439, loss_mean_cls: 2.508316, grad_norm: 2.144750 +Steps: 0%| | 2831/1000000 [11:39<67:26:00, 4.11it/s, grad_norm=2.14, loss_final=3.04, loss_mean=0.93, loss_mean_cls=2.51, proj_loss=-0.4][2026-03-23 13:47:59] Step: 2831, Training Logs: loss_final: 2.674981, loss_mean: 0.951099, proj_loss: -0.407959, loss_mean_cls: 2.131840, grad_norm: 2.473487 +Steps: 0%| | 2832/1000000 [11:39<67:25:17, 4.11it/s, grad_norm=2.47, loss_final=2.67, loss_mean=0.951, loss_mean_cls=2.13, proj_loss=-0.408][2026-03-23 13:47:59] Step: 2832, Training Logs: loss_final: 2.228913, loss_mean: 0.959221, proj_loss: -0.409183, loss_mean_cls: 1.678875, grad_norm: 3.590356 +Steps: 0%| | 2833/1000000 [11:39<67:23:50, 4.11it/s, grad_norm=3.59, loss_final=2.23, loss_mean=0.959, loss_mean_cls=1.68, proj_loss=-0.409][2026-03-23 13:47:59] Step: 2833, Training Logs: loss_final: 3.096763, loss_mean: 0.952925, proj_loss: -0.394042, loss_mean_cls: 2.537880, grad_norm: 4.065245 +Steps: 0%| | 2834/1000000 [11:39<67:23:52, 4.11it/s, grad_norm=4.07, loss_final=3.1, loss_mean=0.953, loss_mean_cls=2.54, proj_loss=-0.394][2026-03-23 13:47:59] Step: 2834, Training Logs: loss_final: 2.424440, loss_mean: 0.941234, proj_loss: -0.411290, loss_mean_cls: 1.894496, grad_norm: 23.709822 +Steps: 0%| | 2835/1000000 [11:40<67:23:28, 4.11it/s, grad_norm=23.7, loss_final=2.42, loss_mean=0.941, loss_mean_cls=1.89, proj_loss=-0.411][2026-03-23 13:48:00] Step: 2835, Training Logs: loss_final: 3.042861, loss_mean: 0.922332, proj_loss: -0.397153, loss_mean_cls: 2.517682, grad_norm: 17.092159 +Steps: 0%| | 2836/1000000 [11:40<67:23:37, 4.11it/s, grad_norm=17.1, loss_final=3.04, loss_mean=0.922, loss_mean_cls=2.52, proj_loss=-0.397][2026-03-23 13:48:00] Step: 2836, Training Logs: loss_final: 2.898515, loss_mean: 0.971398, proj_loss: -0.393660, loss_mean_cls: 2.320777, grad_norm: 5.712744 +Steps: 0%| | 2837/1000000 [11:40<67:25:42, 4.11it/s, grad_norm=5.71, loss_final=2.9, loss_mean=0.971, loss_mean_cls=2.32, proj_loss=-0.394][2026-03-23 13:48:00] Step: 2837, Training Logs: loss_final: 2.975528, loss_mean: 0.917009, proj_loss: -0.401260, loss_mean_cls: 2.459779, grad_norm: 3.320653 +Steps: 0%| | 2838/1000000 [11:40<67:25:21, 4.11it/s, grad_norm=3.32, loss_final=2.98, loss_mean=0.917, 
loss_mean_cls=2.46, proj_loss=-0.401][2026-03-23 13:48:00] Step: 2838, Training Logs: loss_final: 3.049929, loss_mean: 0.919944, proj_loss: -0.402457, loss_mean_cls: 2.532442, grad_norm: 13.238050 +Steps: 0%| | 2839/1000000 [11:41<67:24:03, 4.11it/s, grad_norm=13.2, loss_final=3.05, loss_mean=0.92, loss_mean_cls=2.53, proj_loss=-0.402][2026-03-23 13:48:01] Step: 2839, Training Logs: loss_final: 2.100327, loss_mean: 0.971763, proj_loss: -0.410208, loss_mean_cls: 1.538773, grad_norm: 10.113618 +Steps: 0%| | 2840/1000000 [11:41<67:26:21, 4.11it/s, grad_norm=10.1, loss_final=2.1, loss_mean=0.972, loss_mean_cls=1.54, proj_loss=-0.41][2026-03-23 13:48:01] Step: 2840, Training Logs: loss_final: 2.967332, loss_mean: 0.957027, proj_loss: -0.402907, loss_mean_cls: 2.413212, grad_norm: 19.426722 +Steps: 0%| | 2841/1000000 [11:41<67:26:02, 4.11it/s, grad_norm=19.4, loss_final=2.97, loss_mean=0.957, loss_mean_cls=2.41, proj_loss=-0.403][2026-03-23 13:48:01] Step: 2841, Training Logs: loss_final: 3.159185, loss_mean: 0.952142, proj_loss: -0.393044, loss_mean_cls: 2.600087, grad_norm: 10.776107 +Steps: 0%| | 2842/1000000 [11:41<67:49:36, 4.08it/s, grad_norm=10.8, loss_final=3.16, loss_mean=0.952, loss_mean_cls=2.6, proj_loss=-0.393][2026-03-23 13:48:01] Step: 2842, Training Logs: loss_final: 3.004607, loss_mean: 0.950234, proj_loss: -0.392936, loss_mean_cls: 2.447309, grad_norm: 6.471710 +Steps: 0%| | 2843/1000000 [11:42<67:41:48, 4.09it/s, grad_norm=6.47, loss_final=3, loss_mean=0.95, loss_mean_cls=2.45, proj_loss=-0.393][2026-03-23 13:48:02] Step: 2843, Training Logs: loss_final: 2.549987, loss_mean: 0.946254, proj_loss: -0.402940, loss_mean_cls: 2.006673, grad_norm: 13.285142 +Steps: 0%| | 2844/1000000 [11:42<67:41:20, 4.09it/s, grad_norm=13.3, loss_final=2.55, loss_mean=0.946, loss_mean_cls=2.01, proj_loss=-0.403][2026-03-23 13:48:02] Step: 2844, Training Logs: loss_final: 2.882575, loss_mean: 0.941618, proj_loss: -0.403226, loss_mean_cls: 2.344183, grad_norm: 24.092829 +Steps: 0%| | 2845/1000000 [11:42<67:38:40, 4.09it/s, grad_norm=24.1, loss_final=2.88, loss_mean=0.942, loss_mean_cls=2.34, proj_loss=-0.403][2026-03-23 13:48:02] Step: 2845, Training Logs: loss_final: 2.775063, loss_mean: 0.958005, proj_loss: -0.400948, loss_mean_cls: 2.218007, grad_norm: 14.962865 +Steps: 0%| | 2846/1000000 [11:42<67:33:56, 4.10it/s, grad_norm=15, loss_final=2.78, loss_mean=0.958, loss_mean_cls=2.22, proj_loss=-0.401][2026-03-23 13:48:02] Step: 2846, Training Logs: loss_final: 2.738211, loss_mean: 0.967437, proj_loss: -0.397989, loss_mean_cls: 2.168763, grad_norm: 13.477940 +Steps: 0%| | 2847/1000000 [11:43<67:31:58, 4.10it/s, grad_norm=13.5, loss_final=2.74, loss_mean=0.967, loss_mean_cls=2.17, proj_loss=-0.398][2026-03-23 13:48:03] Step: 2847, Training Logs: loss_final: 2.985536, loss_mean: 0.943527, proj_loss: -0.400631, loss_mean_cls: 2.442639, grad_norm: 18.658588 +Steps: 0%| | 2848/1000000 [11:43<67:28:52, 4.10it/s, grad_norm=18.7, loss_final=2.99, loss_mean=0.944, loss_mean_cls=2.44, proj_loss=-0.401][2026-03-23 13:48:03] Step: 2848, Training Logs: loss_final: 3.012689, loss_mean: 0.928714, proj_loss: -0.397606, loss_mean_cls: 2.481581, grad_norm: 8.369230 +Steps: 0%| | 2849/1000000 [11:43<67:26:44, 4.11it/s, grad_norm=8.37, loss_final=3.01, loss_mean=0.929, loss_mean_cls=2.48, proj_loss=-0.398][2026-03-23 13:48:03] Step: 2849, Training Logs: loss_final: 2.536242, loss_mean: 0.954414, proj_loss: -0.410239, loss_mean_cls: 1.992067, grad_norm: 14.515893 +Steps: 0%| | 2850/1000000 [11:43<67:26:35, 4.11it/s, 
grad_norm=14.5, loss_final=2.54, loss_mean=0.954, loss_mean_cls=1.99, proj_loss=-0.41][2026-03-23 13:48:03] Step: 2850, Training Logs: loss_final: 3.287814, loss_mean: 0.936973, proj_loss: -0.394559, loss_mean_cls: 2.745399, grad_norm: 21.763647 +Steps: 0%| | 2851/1000000 [11:44<67:24:01, 4.11it/s, grad_norm=21.8, loss_final=3.29, loss_mean=0.937, loss_mean_cls=2.75, proj_loss=-0.395][2026-03-23 13:48:04] Step: 2851, Training Logs: loss_final: 3.161381, loss_mean: 0.946478, proj_loss: -0.393532, loss_mean_cls: 2.608436, grad_norm: 8.751126 +Steps: 0%| | 2852/1000000 [11:44<67:23:50, 4.11it/s, grad_norm=8.75, loss_final=3.16, loss_mean=0.946, loss_mean_cls=2.61, proj_loss=-0.394][2026-03-23 13:48:04] Step: 2852, Training Logs: loss_final: 2.824737, loss_mean: 0.961166, proj_loss: -0.401743, loss_mean_cls: 2.265313, grad_norm: 2.457571 +Steps: 0%| | 2853/1000000 [11:44<67:23:36, 4.11it/s, grad_norm=2.46, loss_final=2.82, loss_mean=0.961, loss_mean_cls=2.27, proj_loss=-0.402][2026-03-23 13:48:04] Step: 2853, Training Logs: loss_final: 3.089575, loss_mean: 0.931211, proj_loss: -0.404445, loss_mean_cls: 2.562809, grad_norm: 13.666122 +Steps: 0%| | 2854/1000000 [11:44<67:23:25, 4.11it/s, grad_norm=13.7, loss_final=3.09, loss_mean=0.931, loss_mean_cls=2.56, proj_loss=-0.404][2026-03-23 13:48:04] Step: 2854, Training Logs: loss_final: 2.871208, loss_mean: 0.960392, proj_loss: -0.393178, loss_mean_cls: 2.303994, grad_norm: 15.875444 +Steps: 0%| | 2855/1000000 [11:45<67:23:42, 4.11it/s, grad_norm=15.9, loss_final=2.87, loss_mean=0.96, loss_mean_cls=2.3, proj_loss=-0.393][2026-03-23 13:48:05] Step: 2855, Training Logs: loss_final: 3.215725, loss_mean: 0.963822, proj_loss: -0.383715, loss_mean_cls: 2.635617, grad_norm: 10.412195 +Steps: 0%| | 2856/1000000 [11:45<67:23:30, 4.11it/s, grad_norm=10.4, loss_final=3.22, loss_mean=0.964, loss_mean_cls=2.64, proj_loss=-0.384][2026-03-23 13:48:05] Step: 2856, Training Logs: loss_final: 2.651473, loss_mean: 0.964413, proj_loss: -0.391100, loss_mean_cls: 2.078160, grad_norm: 12.394427 +Steps: 0%| | 2857/1000000 [11:45<67:23:10, 4.11it/s, grad_norm=12.4, loss_final=2.65, loss_mean=0.964, loss_mean_cls=2.08, proj_loss=-0.391][2026-03-23 13:48:05] Step: 2857, Training Logs: loss_final: 3.011190, loss_mean: 0.937679, proj_loss: -0.394067, loss_mean_cls: 2.467578, grad_norm: 11.706195 +Steps: 0%| | 2858/1000000 [11:45<67:23:19, 4.11it/s, grad_norm=11.7, loss_final=3.01, loss_mean=0.938, loss_mean_cls=2.47, proj_loss=-0.394][2026-03-23 13:48:05] Step: 2858, Training Logs: loss_final: 2.718348, loss_mean: 0.947643, proj_loss: -0.405235, loss_mean_cls: 2.175940, grad_norm: 12.284317 +Steps: 0%| | 2859/1000000 [11:46<67:23:52, 4.11it/s, grad_norm=12.3, loss_final=2.72, loss_mean=0.948, loss_mean_cls=2.18, proj_loss=-0.405][2026-03-23 13:48:06] Step: 2859, Training Logs: loss_final: 2.765764, loss_mean: 0.956567, proj_loss: -0.401081, loss_mean_cls: 2.210279, grad_norm: 5.939968 +Steps: 0%| | 2860/1000000 [11:46<67:23:13, 4.11it/s, grad_norm=5.94, loss_final=2.77, loss_mean=0.957, loss_mean_cls=2.21, proj_loss=-0.401][2026-03-23 13:48:06] Step: 2860, Training Logs: loss_final: 3.337055, loss_mean: 0.950488, proj_loss: -0.392040, loss_mean_cls: 2.778607, grad_norm: 4.855196 +Steps: 0%| | 2861/1000000 [11:46<67:23:18, 4.11it/s, grad_norm=4.86, loss_final=3.34, loss_mean=0.95, loss_mean_cls=2.78, proj_loss=-0.392][2026-03-23 13:48:06] Step: 2861, Training Logs: loss_final: 3.156311, loss_mean: 0.912875, proj_loss: -0.397143, loss_mean_cls: 2.640579, grad_norm: 7.259372 
+Steps: 0%| | 2862/1000000 [11:46<67:22:10, 4.11it/s, grad_norm=7.26, loss_final=3.16, loss_mean=0.913, loss_mean_cls=2.64, proj_loss=-0.397][2026-03-23 13:48:06] Step: 2862, Training Logs: loss_final: 3.062394, loss_mean: 0.929805, proj_loss: -0.401061, loss_mean_cls: 2.533650, grad_norm: 5.709474 +Steps: 0%| | 2863/1000000 [11:47<67:24:14, 4.11it/s, grad_norm=5.71, loss_final=3.06, loss_mean=0.93, loss_mean_cls=2.53, proj_loss=-0.401][2026-03-23 13:48:06] Step: 2863, Training Logs: loss_final: 3.059139, loss_mean: 0.911897, proj_loss: -0.402312, loss_mean_cls: 2.549555, grad_norm: 17.087597 +Steps: 0%| | 2864/1000000 [11:47<67:24:43, 4.11it/s, grad_norm=17.1, loss_final=3.06, loss_mean=0.912, loss_mean_cls=2.55, proj_loss=-0.402][2026-03-23 13:48:07] Step: 2864, Training Logs: loss_final: 2.900623, loss_mean: 0.932215, proj_loss: -0.402171, loss_mean_cls: 2.370578, grad_norm: 20.529150 +Steps: 0%| | 2865/1000000 [11:47<67:23:55, 4.11it/s, grad_norm=20.5, loss_final=2.9, loss_mean=0.932, loss_mean_cls=2.37, proj_loss=-0.402][2026-03-23 13:48:07] Step: 2865, Training Logs: loss_final: 2.935572, loss_mean: 0.935763, proj_loss: -0.397710, loss_mean_cls: 2.397518, grad_norm: 2.192047 +Steps: 0%| | 2866/1000000 [11:47<67:22:17, 4.11it/s, grad_norm=2.19, loss_final=2.94, loss_mean=0.936, loss_mean_cls=2.4, proj_loss=-0.398][2026-03-23 13:48:07] Step: 2866, Training Logs: loss_final: 2.599054, loss_mean: 0.932942, proj_loss: -0.407186, loss_mean_cls: 2.073298, grad_norm: 4.202205 +Steps: 0%| | 2867/1000000 [11:47<67:22:36, 4.11it/s, grad_norm=4.2, loss_final=2.6, loss_mean=0.933, loss_mean_cls=2.07, proj_loss=-0.407][2026-03-23 13:48:07] Step: 2867, Training Logs: loss_final: 2.579399, loss_mean: 0.968853, proj_loss: -0.407557, loss_mean_cls: 2.018103, grad_norm: 2.763850 +Steps: 0%| | 2868/1000000 [11:48<67:23:25, 4.11it/s, grad_norm=2.76, loss_final=2.58, loss_mean=0.969, loss_mean_cls=2.02, proj_loss=-0.408][2026-03-23 13:48:08] Step: 2868, Training Logs: loss_final: 3.154347, loss_mean: 0.920109, proj_loss: -0.399185, loss_mean_cls: 2.633423, grad_norm: 2.561803 +Steps: 0%| | 2869/1000000 [11:48<67:54:47, 4.08it/s, grad_norm=2.56, loss_final=3.15, loss_mean=0.92, loss_mean_cls=2.63, proj_loss=-0.399][2026-03-23 13:48:08] Step: 2869, Training Logs: loss_final: 2.890893, loss_mean: 0.922677, proj_loss: -0.404856, loss_mean_cls: 2.373072, grad_norm: 27.335188 +Steps: 0%| | 2870/1000000 [11:48<67:48:04, 4.09it/s, grad_norm=27.3, loss_final=2.89, loss_mean=0.923, loss_mean_cls=2.37, proj_loss=-0.405][2026-03-23 13:48:08] Step: 2870, Training Logs: loss_final: 3.172019, loss_mean: 0.924592, proj_loss: -0.397180, loss_mean_cls: 2.644608, grad_norm: 16.576706 +Steps: 0%| | 2871/1000000 [11:48<67:42:24, 4.09it/s, grad_norm=16.6, loss_final=3.17, loss_mean=0.925, loss_mean_cls=2.64, proj_loss=-0.397][2026-03-23 13:48:08] Step: 2871, Training Logs: loss_final: 2.877986, loss_mean: 0.975233, proj_loss: -0.390997, loss_mean_cls: 2.293750, grad_norm: 5.659210 +Steps: 0%| | 2872/1000000 [11:49<67:38:17, 4.10it/s, grad_norm=5.66, loss_final=2.88, loss_mean=0.975, loss_mean_cls=2.29, proj_loss=-0.391][2026-03-23 13:48:09] Step: 2872, Training Logs: loss_final: 2.626932, loss_mean: 0.958860, proj_loss: -0.403542, loss_mean_cls: 2.071615, grad_norm: 8.974197 +Steps: 0%| | 2873/1000000 [11:49<67:35:30, 4.10it/s, grad_norm=8.97, loss_final=2.63, loss_mean=0.959, loss_mean_cls=2.07, proj_loss=-0.404][2026-03-23 13:48:09] Step: 2873, Training Logs: loss_final: 3.239869, loss_mean: 0.938447, proj_loss: -0.399284, 
loss_mean_cls: 2.700706, grad_norm: 11.175241 +Steps: 0%| | 2874/1000000 [11:49<67:33:37, 4.10it/s, grad_norm=11.2, loss_final=3.24, loss_mean=0.938, loss_mean_cls=2.7, proj_loss=-0.399][2026-03-23 13:48:09] Step: 2874, Training Logs: loss_final: 3.042322, loss_mean: 0.944517, proj_loss: -0.400355, loss_mean_cls: 2.498159, grad_norm: 9.255926 +Steps: 0%| | 2875/1000000 [11:49<67:32:57, 4.10it/s, grad_norm=9.26, loss_final=3.04, loss_mean=0.945, loss_mean_cls=2.5, proj_loss=-0.4][2026-03-23 13:48:09] Step: 2875, Training Logs: loss_final: 2.533809, loss_mean: 0.957763, proj_loss: -0.400122, loss_mean_cls: 1.976167, grad_norm: 2.431721 +Steps: 0%| | 2876/1000000 [11:50<67:32:15, 4.10it/s, grad_norm=2.43, loss_final=2.53, loss_mean=0.958, loss_mean_cls=1.98, proj_loss=-0.4][2026-03-23 13:48:10] Step: 2876, Training Logs: loss_final: 3.116791, loss_mean: 0.927619, proj_loss: -0.397652, loss_mean_cls: 2.586824, grad_norm: 14.780457 +Steps: 0%| | 2877/1000000 [11:50<67:31:08, 4.10it/s, grad_norm=14.8, loss_final=3.12, loss_mean=0.928, loss_mean_cls=2.59, proj_loss=-0.398][2026-03-23 13:48:10] Step: 2877, Training Logs: loss_final: 2.586755, loss_mean: 0.994189, proj_loss: -0.406016, loss_mean_cls: 1.998582, grad_norm: 22.625711 +Steps: 0%| | 2878/1000000 [11:50<67:29:49, 4.10it/s, grad_norm=22.6, loss_final=2.59, loss_mean=0.994, loss_mean_cls=2, proj_loss=-0.406][2026-03-23 13:48:10] Step: 2878, Training Logs: loss_final: 3.065661, loss_mean: 0.994976, proj_loss: -0.398921, loss_mean_cls: 2.469606, grad_norm: 23.617222 +Steps: 0%| | 2879/1000000 [11:50<67:29:23, 4.10it/s, grad_norm=23.6, loss_final=3.07, loss_mean=0.995, loss_mean_cls=2.47, proj_loss=-0.399][2026-03-23 13:48:10] Step: 2879, Training Logs: loss_final: 3.142186, loss_mean: 1.008019, proj_loss: -0.388838, loss_mean_cls: 2.523005, grad_norm: 9.616378 +Steps: 0%| | 2880/1000000 [11:51<67:28:15, 4.11it/s, grad_norm=9.62, loss_final=3.14, loss_mean=1.01, loss_mean_cls=2.52, proj_loss=-0.389][2026-03-23 13:48:11] Step: 2880, Training Logs: loss_final: 2.800424, loss_mean: 0.966876, proj_loss: -0.390765, loss_mean_cls: 2.224312, grad_norm: 7.111203 +Steps: 0%| | 2881/1000000 [11:51<67:28:13, 4.11it/s, grad_norm=7.11, loss_final=2.8, loss_mean=0.967, loss_mean_cls=2.22, proj_loss=-0.391][2026-03-23 13:48:11] Step: 2881, Training Logs: loss_final: 2.957874, loss_mean: 0.942018, proj_loss: -0.397268, loss_mean_cls: 2.413124, grad_norm: 6.866630 +Steps: 0%| | 2882/1000000 [11:51<67:28:39, 4.10it/s, grad_norm=6.87, loss_final=2.96, loss_mean=0.942, loss_mean_cls=2.41, proj_loss=-0.397][2026-03-23 13:48:11] Step: 2882, Training Logs: loss_final: 2.769709, loss_mean: 0.958075, proj_loss: -0.400340, loss_mean_cls: 2.211974, grad_norm: 3.424031 +Steps: 0%| | 2883/1000000 [11:51<67:27:14, 4.11it/s, grad_norm=3.42, loss_final=2.77, loss_mean=0.958, loss_mean_cls=2.21, proj_loss=-0.4][2026-03-23 13:48:11] Step: 2883, Training Logs: loss_final: 2.741625, loss_mean: 0.960298, proj_loss: -0.402791, loss_mean_cls: 2.184119, grad_norm: 8.335391 +Steps: 0%| | 2884/1000000 [11:52<67:27:24, 4.11it/s, grad_norm=8.34, loss_final=2.74, loss_mean=0.96, loss_mean_cls=2.18, proj_loss=-0.403][2026-03-23 13:48:12] Step: 2884, Training Logs: loss_final: 2.451419, loss_mean: 0.947519, proj_loss: -0.407143, loss_mean_cls: 1.911043, grad_norm: 3.992890 +Steps: 0%| | 2885/1000000 [11:52<67:37:11, 4.10it/s, grad_norm=3.99, loss_final=2.45, loss_mean=0.948, loss_mean_cls=1.91, proj_loss=-0.407][2026-03-23 13:48:12] Step: 2885, Training Logs: loss_final: 2.853252, 
loss_mean: 0.945608, proj_loss: -0.395841, loss_mean_cls: 2.303484, grad_norm: 10.045535 +Steps: 0%| | 2886/1000000 [11:52<67:33:17, 4.10it/s, grad_norm=10, loss_final=2.85, loss_mean=0.946, loss_mean_cls=2.3, proj_loss=-0.396][2026-03-23 13:48:12] Step: 2886, Training Logs: loss_final: 2.801543, loss_mean: 0.946425, proj_loss: -0.404753, loss_mean_cls: 2.259871, grad_norm: 11.291478 +Steps: 0%| | 2887/1000000 [11:52<67:29:32, 4.10it/s, grad_norm=11.3, loss_final=2.8, loss_mean=0.946, loss_mean_cls=2.26, proj_loss=-0.405][2026-03-23 13:48:12] Step: 2887, Training Logs: loss_final: 2.709753, loss_mean: 0.978033, proj_loss: -0.407253, loss_mean_cls: 2.138974, grad_norm: 5.258164 +Steps: 0%| | 2888/1000000 [11:53<67:27:55, 4.11it/s, grad_norm=5.26, loss_final=2.71, loss_mean=0.978, loss_mean_cls=2.14, proj_loss=-0.407][2026-03-23 13:48:13] Step: 2888, Training Logs: loss_final: 3.562806, loss_mean: 0.923183, proj_loss: -0.387589, loss_mean_cls: 3.027212, grad_norm: 18.760986 +Steps: 0%| | 2889/1000000 [11:53<67:26:23, 4.11it/s, grad_norm=18.8, loss_final=3.56, loss_mean=0.923, loss_mean_cls=3.03, proj_loss=-0.388][2026-03-23 13:48:13] Step: 2889, Training Logs: loss_final: 2.437807, loss_mean: 0.974058, proj_loss: -0.407410, loss_mean_cls: 1.871159, grad_norm: 7.158608 +Steps: 0%| | 2890/1000000 [11:53<67:26:00, 4.11it/s, grad_norm=7.16, loss_final=2.44, loss_mean=0.974, loss_mean_cls=1.87, proj_loss=-0.407][2026-03-23 13:48:13] Step: 2890, Training Logs: loss_final: 2.913568, loss_mean: 0.941034, proj_loss: -0.405849, loss_mean_cls: 2.378383, grad_norm: 16.296225 +Steps: 0%| | 2891/1000000 [11:53<67:26:02, 4.11it/s, grad_norm=16.3, loss_final=2.91, loss_mean=0.941, loss_mean_cls=2.38, proj_loss=-0.406][2026-03-23 13:48:13] Step: 2891, Training Logs: loss_final: 2.738425, loss_mean: 0.965973, proj_loss: -0.406356, loss_mean_cls: 2.178808, grad_norm: 18.998486 +Steps: 0%| | 2892/1000000 [11:54<67:25:27, 4.11it/s, grad_norm=19, loss_final=2.74, loss_mean=0.966, loss_mean_cls=2.18, proj_loss=-0.406][2026-03-23 13:48:14] Step: 2892, Training Logs: loss_final: 2.821167, loss_mean: 0.924589, proj_loss: -0.406450, loss_mean_cls: 2.303029, grad_norm: 30.468437 +Steps: 0%| | 2893/1000000 [11:54<69:38:30, 3.98it/s, grad_norm=30.5, loss_final=2.82, loss_mean=0.925, loss_mean_cls=2.3, proj_loss=-0.406][2026-03-23 13:48:14] Step: 2893, Training Logs: loss_final: 3.108930, loss_mean: 0.938185, proj_loss: -0.402396, loss_mean_cls: 2.573141, grad_norm: 24.778276 +Steps: 0%| | 2894/1000000 [11:54<69:06:22, 4.01it/s, grad_norm=24.8, loss_final=3.11, loss_mean=0.938, loss_mean_cls=2.57, proj_loss=-0.402][2026-03-23 13:48:14] Step: 2894, Training Logs: loss_final: 2.674089, loss_mean: 0.941402, proj_loss: -0.408283, loss_mean_cls: 2.140970, grad_norm: 7.711839 +Steps: 0%| | 2895/1000000 [11:54<68:36:05, 4.04it/s, grad_norm=7.71, loss_final=2.67, loss_mean=0.941, loss_mean_cls=2.14, proj_loss=-0.408][2026-03-23 13:48:14] Step: 2895, Training Logs: loss_final: 3.176968, loss_mean: 0.911161, proj_loss: -0.397229, loss_mean_cls: 2.663036, grad_norm: 8.791143 +Steps: 0%| | 2896/1000000 [11:55<68:13:15, 4.06it/s, grad_norm=8.79, loss_final=3.18, loss_mean=0.911, loss_mean_cls=2.66, proj_loss=-0.397][2026-03-23 13:48:15] Step: 2896, Training Logs: loss_final: 2.634597, loss_mean: 0.937345, proj_loss: -0.407858, loss_mean_cls: 2.105111, grad_norm: 5.508456 +Steps: 0%| | 2897/1000000 [11:55<67:58:30, 4.07it/s, grad_norm=5.51, loss_final=2.63, loss_mean=0.937, loss_mean_cls=2.11, proj_loss=-0.408][2026-03-23 13:48:15] 
Step: 2897, Training Logs: loss_final: 2.900975, loss_mean: 0.953233, proj_loss: -0.400074, loss_mean_cls: 2.347817, grad_norm: 8.377467 +Steps: 0%| | 2898/1000000 [11:55<67:47:48, 4.09it/s, grad_norm=8.38, loss_final=2.9, loss_mean=0.953, loss_mean_cls=2.35, proj_loss=-0.4][2026-03-23 13:48:15] Step: 2898, Training Logs: loss_final: 2.634231, loss_mean: 0.963597, proj_loss: -0.412308, loss_mean_cls: 2.082942, grad_norm: 1.902364 +Steps: 0%| | 2899/1000000 [11:55<67:39:58, 4.09it/s, grad_norm=1.9, loss_final=2.63, loss_mean=0.964, loss_mean_cls=2.08, proj_loss=-0.412][2026-03-23 13:48:15] Step: 2899, Training Logs: loss_final: 2.745804, loss_mean: 0.914187, proj_loss: -0.408596, loss_mean_cls: 2.240213, grad_norm: 6.980783 +Steps: 0%| | 2900/1000000 [11:56<67:35:37, 4.10it/s, grad_norm=6.98, loss_final=2.75, loss_mean=0.914, loss_mean_cls=2.24, proj_loss=-0.409][2026-03-23 13:48:16] Step: 2900, Training Logs: loss_final: 2.864161, loss_mean: 0.936678, proj_loss: -0.404282, loss_mean_cls: 2.331766, grad_norm: 8.083488 +Steps: 0%| | 2901/1000000 [11:56<67:33:16, 4.10it/s, grad_norm=8.08, loss_final=2.86, loss_mean=0.937, loss_mean_cls=2.33, proj_loss=-0.404][2026-03-23 13:48:16] Step: 2901, Training Logs: loss_final: 3.190328, loss_mean: 0.911117, proj_loss: -0.393597, loss_mean_cls: 2.672808, grad_norm: 4.097359 +Steps: 0%| | 2902/1000000 [11:56<67:30:30, 4.10it/s, grad_norm=4.1, loss_final=3.19, loss_mean=0.911, loss_mean_cls=2.67, proj_loss=-0.394][2026-03-23 13:48:16] Step: 2902, Training Logs: loss_final: 2.810247, loss_mean: 0.953763, proj_loss: -0.409113, loss_mean_cls: 2.265596, grad_norm: 8.752423 +Steps: 0%| | 2903/1000000 [11:56<67:28:25, 4.10it/s, grad_norm=8.75, loss_final=2.81, loss_mean=0.954, loss_mean_cls=2.27, proj_loss=-0.409][2026-03-23 13:48:16] Step: 2903, Training Logs: loss_final: 2.614196, loss_mean: 0.939784, proj_loss: -0.409506, loss_mean_cls: 2.083919, grad_norm: 4.889745 +Steps: 0%| | 2904/1000000 [11:57<67:28:09, 4.11it/s, grad_norm=4.89, loss_final=2.61, loss_mean=0.94, loss_mean_cls=2.08, proj_loss=-0.41][2026-03-23 13:48:16] Step: 2904, Training Logs: loss_final: 2.648760, loss_mean: 0.949186, proj_loss: -0.409053, loss_mean_cls: 2.108626, grad_norm: 7.539793 +Steps: 0%| | 2905/1000000 [11:57<67:25:47, 4.11it/s, grad_norm=7.54, loss_final=2.65, loss_mean=0.949, loss_mean_cls=2.11, proj_loss=-0.409][2026-03-23 13:48:17] Step: 2905, Training Logs: loss_final: 2.931670, loss_mean: 0.918423, proj_loss: -0.402441, loss_mean_cls: 2.415687, grad_norm: 4.887928 +Steps: 0%| | 2906/1000000 [11:57<67:24:36, 4.11it/s, grad_norm=4.89, loss_final=2.93, loss_mean=0.918, loss_mean_cls=2.42, proj_loss=-0.402][2026-03-23 13:48:17] Step: 2906, Training Logs: loss_final: 2.804578, loss_mean: 0.924585, proj_loss: -0.410275, loss_mean_cls: 2.290268, grad_norm: 12.714099 +Steps: 0%| | 2907/1000000 [11:57<67:24:42, 4.11it/s, grad_norm=12.7, loss_final=2.8, loss_mean=0.925, loss_mean_cls=2.29, proj_loss=-0.41][2026-03-23 13:48:17] Step: 2907, Training Logs: loss_final: 2.748909, loss_mean: 0.946246, proj_loss: -0.403069, loss_mean_cls: 2.205732, grad_norm: 9.721884 +Steps: 0%| | 2908/1000000 [11:58<67:24:53, 4.11it/s, grad_norm=9.72, loss_final=2.75, loss_mean=0.946, loss_mean_cls=2.21, proj_loss=-0.403][2026-03-23 13:48:17] Step: 2908, Training Logs: loss_final: 3.081916, loss_mean: 0.937283, proj_loss: -0.402300, loss_mean_cls: 2.546933, grad_norm: 15.333923 +Steps: 0%| | 2909/1000000 [11:58<67:23:46, 4.11it/s, grad_norm=15.3, loss_final=3.08, loss_mean=0.937, loss_mean_cls=2.55, 
proj_loss=-0.402][2026-03-23 13:48:18] Step: 2909, Training Logs: loss_final: 2.877581, loss_mean: 0.947221, proj_loss: -0.404787, loss_mean_cls: 2.335146, grad_norm: 11.467946 +Steps: 0%| | 2910/1000000 [11:58<67:23:53, 4.11it/s, grad_norm=11.5, loss_final=2.88, loss_mean=0.947, loss_mean_cls=2.34, proj_loss=-0.405][2026-03-23 13:48:18] Step: 2910, Training Logs: loss_final: 2.409312, loss_mean: 0.934008, proj_loss: -0.417006, loss_mean_cls: 1.892310, grad_norm: 11.776847 +Steps: 0%| | 2911/1000000 [11:58<67:24:43, 4.11it/s, grad_norm=11.8, loss_final=2.41, loss_mean=0.934, loss_mean_cls=1.89, proj_loss=-0.417][2026-03-23 13:48:18] Step: 2911, Training Logs: loss_final: 2.874528, loss_mean: 0.915454, proj_loss: -0.404634, loss_mean_cls: 2.363708, grad_norm: 2.096404 +Steps: 0%| | 2912/1000000 [11:58<67:23:45, 4.11it/s, grad_norm=2.1, loss_final=2.87, loss_mean=0.915, loss_mean_cls=2.36, proj_loss=-0.405][2026-03-23 13:48:18] Step: 2912, Training Logs: loss_final: 2.756033, loss_mean: 0.915295, proj_loss: -0.410897, loss_mean_cls: 2.251635, grad_norm: 1.915729 +Steps: 0%| | 2913/1000000 [11:59<67:25:50, 4.11it/s, grad_norm=1.92, loss_final=2.76, loss_mean=0.915, loss_mean_cls=2.25, proj_loss=-0.411][2026-03-23 13:48:19] Step: 2913, Training Logs: loss_final: 2.740556, loss_mean: 0.930673, proj_loss: -0.402570, loss_mean_cls: 2.212454, grad_norm: 24.182133 +Steps: 0%| | 2914/1000000 [11:59<67:26:06, 4.11it/s, grad_norm=24.2, loss_final=2.74, loss_mean=0.931, loss_mean_cls=2.21, proj_loss=-0.403][2026-03-23 13:48:19] Step: 2914, Training Logs: loss_final: 2.769631, loss_mean: 0.940497, proj_loss: -0.403795, loss_mean_cls: 2.232929, grad_norm: 12.863930 +Steps: 0%| | 2915/1000000 [11:59<67:24:21, 4.11it/s, grad_norm=12.9, loss_final=2.77, loss_mean=0.94, loss_mean_cls=2.23, proj_loss=-0.404][2026-03-23 13:48:19] Step: 2915, Training Logs: loss_final: 2.865537, loss_mean: 0.945102, proj_loss: -0.402791, loss_mean_cls: 2.323225, grad_norm: 15.515693 +Steps: 0%| | 2916/1000000 [11:59<67:23:58, 4.11it/s, grad_norm=15.5, loss_final=2.87, loss_mean=0.945, loss_mean_cls=2.32, proj_loss=-0.403][2026-03-23 13:48:19] Step: 2916, Training Logs: loss_final: 3.062979, loss_mean: 0.941744, proj_loss: -0.402216, loss_mean_cls: 2.523451, grad_norm: 22.133801 +Steps: 0%| | 2917/1000000 [12:00<67:24:03, 4.11it/s, grad_norm=22.1, loss_final=3.06, loss_mean=0.942, loss_mean_cls=2.52, proj_loss=-0.402][2026-03-23 13:48:20] Step: 2917, Training Logs: loss_final: 3.197263, loss_mean: 0.935027, proj_loss: -0.393538, loss_mean_cls: 2.655775, grad_norm: 17.492496 +Steps: 0%| | 2918/1000000 [12:00<67:24:04, 4.11it/s, grad_norm=17.5, loss_final=3.2, loss_mean=0.935, loss_mean_cls=2.66, proj_loss=-0.394][2026-03-23 13:48:20] Step: 2918, Training Logs: loss_final: 2.640828, loss_mean: 0.947033, proj_loss: -0.406874, loss_mean_cls: 2.100670, grad_norm: 7.195465 +Steps: 0%| | 2919/1000000 [12:00<67:24:38, 4.11it/s, grad_norm=7.2, loss_final=2.64, loss_mean=0.947, loss_mean_cls=2.1, proj_loss=-0.407][2026-03-23 13:48:20] Step: 2919, Training Logs: loss_final: 2.738910, loss_mean: 0.935692, proj_loss: -0.407860, loss_mean_cls: 2.211078, grad_norm: 19.211374 +Steps: 0%| | 2920/1000000 [12:00<67:23:09, 4.11it/s, grad_norm=19.2, loss_final=2.74, loss_mean=0.936, loss_mean_cls=2.21, proj_loss=-0.408][2026-03-23 13:48:20] Step: 2920, Training Logs: loss_final: 3.024123, loss_mean: 0.953524, proj_loss: -0.399435, loss_mean_cls: 2.470034, grad_norm: 5.246547 +Steps: 0%| | 2921/1000000 [12:01<67:23:19, 4.11it/s, grad_norm=5.25, 
Training log excerpt (steps 2921–3145 of 1,000,000; throughput ≈ 4.1 it/s, estimated total wall time ≈ 67.5 h). The interleaved tqdm progress-bar redraws have been stripped; each remaining line reports five metrics per step: `loss_final` (the combined objective), `loss_mean` (the denoising term), `proj_loss` (the representation-alignment term, cf. `--proj-coeff`), `loss_mean_cls` (the class-token term, cf. `--cls`), and `grad_norm`. Over this window, `loss_final` fluctuates between roughly 2.14 and 3.47, `loss_mean` between 0.89 and 1.01, `proj_loss` between -0.42 and -0.39, `loss_mean_cls` between 1.58 and 2.92, and `grad_norm` spikes intermittently (≈ 1.9 to 36.8). Representative lines:

[2026-03-23 13:48:21] Step: 2921, Training Logs: loss_final: 2.622116, loss_mean: 0.949135, proj_loss: -0.406232, loss_mean_cls: 2.079213, grad_norm: 4.542670
[2026-03-23 13:48:40] Step: 3000, Training Logs: loss_final: 2.943569, loss_mean: 0.928839, proj_loss: -0.406012, loss_mean_cls: 2.420742, grad_norm: 5.492412
[2026-03-23 13:49:15] Step: 3145, Training Logs: loss_final: 2.679271, loss_mean: 0.930528, proj_loss: -0.411747, loss_mean_cls: 2.160489, grad_norm: 23.659597
4.07it/s, grad_norm=23.7, loss_final=2.68, loss_mean=0.931, loss_mean_cls=2.16, proj_loss=-0.412][2026-03-23 13:49:16] Step: 3146, Training Logs: loss_final: 2.515104, loss_mean: 0.954591, proj_loss: -0.412866, loss_mean_cls: 1.973379, grad_norm: 7.812395 +Steps: 0%| | 3147/1000000 [12:56<67:50:48, 4.08it/s, grad_norm=7.81, loss_final=2.52, loss_mean=0.955, loss_mean_cls=1.97, proj_loss=-0.413][2026-03-23 13:49:16] Step: 3147, Training Logs: loss_final: 2.628103, loss_mean: 0.929425, proj_loss: -0.410555, loss_mean_cls: 2.109233, grad_norm: 2.327777 +Steps: 0%| | 3148/1000000 [12:56<67:44:10, 4.09it/s, grad_norm=2.33, loss_final=2.63, loss_mean=0.929, loss_mean_cls=2.11, proj_loss=-0.411][2026-03-23 13:49:16] Step: 3148, Training Logs: loss_final: 2.664794, loss_mean: 0.926539, proj_loss: -0.416192, loss_mean_cls: 2.154448, grad_norm: 6.292645 +Steps: 0%| | 3149/1000000 [12:56<67:43:07, 4.09it/s, grad_norm=6.29, loss_final=2.66, loss_mean=0.927, loss_mean_cls=2.15, proj_loss=-0.416][2026-03-23 13:49:16] Step: 3149, Training Logs: loss_final: 2.722737, loss_mean: 0.951353, proj_loss: -0.410581, loss_mean_cls: 2.181965, grad_norm: 3.357270 +Steps: 0%| | 3150/1000000 [12:57<67:46:46, 4.09it/s, grad_norm=3.36, loss_final=2.72, loss_mean=0.951, loss_mean_cls=2.18, proj_loss=-0.411][2026-03-23 13:49:16] Step: 3150, Training Logs: loss_final: 3.348938, loss_mean: 0.914083, proj_loss: -0.397200, loss_mean_cls: 2.832055, grad_norm: 5.793581 +Steps: 0%| | 3151/1000000 [12:57<67:46:21, 4.09it/s, grad_norm=5.79, loss_final=3.35, loss_mean=0.914, loss_mean_cls=2.83, proj_loss=-0.397][2026-03-23 13:49:17] Step: 3151, Training Logs: loss_final: 2.573726, loss_mean: 0.944117, proj_loss: -0.410500, loss_mean_cls: 2.040109, grad_norm: 2.201256 +Steps: 0%| | 3152/1000000 [12:57<67:47:55, 4.08it/s, grad_norm=2.2, loss_final=2.57, loss_mean=0.944, loss_mean_cls=2.04, proj_loss=-0.41][2026-03-23 13:49:17] Step: 3152, Training Logs: loss_final: 2.671417, loss_mean: 0.915583, proj_loss: -0.415669, loss_mean_cls: 2.171503, grad_norm: 7.433698 +Steps: 0%| | 3153/1000000 [12:57<67:47:09, 4.08it/s, grad_norm=7.43, loss_final=2.67, loss_mean=0.916, loss_mean_cls=2.17, proj_loss=-0.416][2026-03-23 13:49:17] Step: 3153, Training Logs: loss_final: 2.679789, loss_mean: 0.936891, proj_loss: -0.410398, loss_mean_cls: 2.153296, grad_norm: 13.987606 +Steps: 0%| | 3154/1000000 [12:58<67:47:59, 4.08it/s, grad_norm=14, loss_final=2.68, loss_mean=0.937, loss_mean_cls=2.15, proj_loss=-0.41][2026-03-23 13:49:17] Step: 3154, Training Logs: loss_final: 2.621458, loss_mean: 0.951120, proj_loss: -0.410947, loss_mean_cls: 2.081285, grad_norm: 10.723114 +Steps: 0%| | 3155/1000000 [12:58<67:50:03, 4.08it/s, grad_norm=10.7, loss_final=2.62, loss_mean=0.951, loss_mean_cls=2.08, proj_loss=-0.411][2026-03-23 13:49:18] Step: 3155, Training Logs: loss_final: 2.786443, loss_mean: 0.970263, proj_loss: -0.404435, loss_mean_cls: 2.220615, grad_norm: 22.116369 +Steps: 0%| | 3156/1000000 [12:58<67:50:35, 4.08it/s, grad_norm=22.1, loss_final=2.79, loss_mean=0.97, loss_mean_cls=2.22, proj_loss=-0.404][2026-03-23 13:49:18] Step: 3156, Training Logs: loss_final: 3.406763, loss_mean: 0.937535, proj_loss: -0.391890, loss_mean_cls: 2.861117, grad_norm: 17.634083 +Steps: 0%| | 3157/1000000 [12:58<67:48:40, 4.08it/s, grad_norm=17.6, loss_final=3.41, loss_mean=0.938, loss_mean_cls=2.86, proj_loss=-0.392][2026-03-23 13:49:18] Step: 3157, Training Logs: loss_final: 2.966547, loss_mean: 0.956950, proj_loss: -0.402074, loss_mean_cls: 2.411671, grad_norm: 5.984241 
+Steps: 0%| | 3158/1000000 [12:58<67:45:18, 4.09it/s, grad_norm=5.98, loss_final=2.97, loss_mean=0.957, loss_mean_cls=2.41, proj_loss=-0.402][2026-03-23 13:49:18] Step: 3158, Training Logs: loss_final: 2.523179, loss_mean: 0.955500, proj_loss: -0.409050, loss_mean_cls: 1.976729, grad_norm: 18.401834 +Steps: 0%| | 3159/1000000 [12:59<67:42:38, 4.09it/s, grad_norm=18.4, loss_final=2.52, loss_mean=0.955, loss_mean_cls=1.98, proj_loss=-0.409][2026-03-23 13:49:19] Step: 3159, Training Logs: loss_final: 3.335432, loss_mean: 0.950361, proj_loss: -0.394511, loss_mean_cls: 2.779581, grad_norm: 14.018165 +Steps: 0%| | 3160/1000000 [12:59<67:46:08, 4.09it/s, grad_norm=14, loss_final=3.34, loss_mean=0.95, loss_mean_cls=2.78, proj_loss=-0.395][2026-03-23 13:49:19] Step: 3160, Training Logs: loss_final: 3.239529, loss_mean: 0.951218, proj_loss: -0.404683, loss_mean_cls: 2.692994, grad_norm: 36.993088 +Steps: 0%| | 3161/1000000 [12:59<67:40:09, 4.09it/s, grad_norm=37, loss_final=3.24, loss_mean=0.951, loss_mean_cls=2.69, proj_loss=-0.405][2026-03-23 13:49:19] Step: 3161, Training Logs: loss_final: 3.109916, loss_mean: 0.963310, proj_loss: -0.404125, loss_mean_cls: 2.550730, grad_norm: 20.504263 +Steps: 0%| | 3162/1000000 [12:59<67:52:11, 4.08it/s, grad_norm=20.5, loss_final=3.11, loss_mean=0.963, loss_mean_cls=2.55, proj_loss=-0.404][2026-03-23 13:49:19] Step: 3162, Training Logs: loss_final: 2.740866, loss_mean: 0.944301, proj_loss: -0.412602, loss_mean_cls: 2.209166, grad_norm: 18.384981 +Steps: 0%| | 3163/1000000 [13:00<67:49:58, 4.08it/s, grad_norm=18.4, loss_final=2.74, loss_mean=0.944, loss_mean_cls=2.21, proj_loss=-0.413][2026-03-23 13:49:20] Step: 3163, Training Logs: loss_final: 2.995625, loss_mean: 0.937562, proj_loss: -0.399904, loss_mean_cls: 2.457967, grad_norm: 4.636288 +Steps: 0%| | 3164/1000000 [13:00<67:48:52, 4.08it/s, grad_norm=4.64, loss_final=3, loss_mean=0.938, loss_mean_cls=2.46, proj_loss=-0.4][2026-03-23 13:49:20] Step: 3164, Training Logs: loss_final: 3.015040, loss_mean: 0.939589, proj_loss: -0.410043, loss_mean_cls: 2.485493, grad_norm: 16.138939 +Steps: 0%| | 3165/1000000 [13:00<67:41:33, 4.09it/s, grad_norm=16.1, loss_final=3.02, loss_mean=0.94, loss_mean_cls=2.49, proj_loss=-0.41][2026-03-23 13:49:20] Step: 3165, Training Logs: loss_final: 2.908212, loss_mean: 0.936619, proj_loss: -0.405684, loss_mean_cls: 2.377277, grad_norm: 21.103474 +Steps: 0%| | 3166/1000000 [13:00<67:41:16, 4.09it/s, grad_norm=21.1, loss_final=2.91, loss_mean=0.937, loss_mean_cls=2.38, proj_loss=-0.406][2026-03-23 13:49:20] Step: 3166, Training Logs: loss_final: 2.727509, loss_mean: 0.925468, proj_loss: -0.412723, loss_mean_cls: 2.214764, grad_norm: 3.248627 +Steps: 0%| | 3167/1000000 [13:01<67:35:50, 4.10it/s, grad_norm=3.25, loss_final=2.73, loss_mean=0.925, loss_mean_cls=2.21, proj_loss=-0.413][2026-03-23 13:49:21] Step: 3167, Training Logs: loss_final: 2.863938, loss_mean: 0.933953, proj_loss: -0.410708, loss_mean_cls: 2.340693, grad_norm: 3.586272 +Steps: 0%| | 3168/1000000 [13:01<67:32:03, 4.10it/s, grad_norm=3.59, loss_final=2.86, loss_mean=0.934, loss_mean_cls=2.34, proj_loss=-0.411][2026-03-23 13:49:21] Step: 3168, Training Logs: loss_final: 2.485188, loss_mean: 0.966473, proj_loss: -0.414838, loss_mean_cls: 1.933552, grad_norm: 5.298324 +Steps: 0%| | 3169/1000000 [13:01<67:29:15, 4.10it/s, grad_norm=5.3, loss_final=2.49, loss_mean=0.966, loss_mean_cls=1.93, proj_loss=-0.415][2026-03-23 13:49:21] Step: 3169, Training Logs: loss_final: 2.994860, loss_mean: 0.943198, proj_loss: -0.411052, 
loss_mean_cls: 2.462713, grad_norm: 14.136535 +Steps: 0%| | 3170/1000000 [13:01<67:31:50, 4.10it/s, grad_norm=14.1, loss_final=2.99, loss_mean=0.943, loss_mean_cls=2.46, proj_loss=-0.411][2026-03-23 13:49:21] Step: 3170, Training Logs: loss_final: 2.532659, loss_mean: 0.928333, proj_loss: -0.416417, loss_mean_cls: 2.020742, grad_norm: 11.131648 +Steps: 0%| | 3171/1000000 [13:02<67:30:10, 4.10it/s, grad_norm=11.1, loss_final=2.53, loss_mean=0.928, loss_mean_cls=2.02, proj_loss=-0.416][2026-03-23 13:49:22] Step: 3171, Training Logs: loss_final: 2.243186, loss_mean: 0.933566, proj_loss: -0.421306, loss_mean_cls: 1.730926, grad_norm: 24.891233 +Steps: 0%| | 3172/1000000 [13:02<67:28:37, 4.10it/s, grad_norm=24.9, loss_final=2.24, loss_mean=0.934, loss_mean_cls=1.73, proj_loss=-0.421][2026-03-23 13:49:22] Step: 3172, Training Logs: loss_final: 2.896239, loss_mean: 0.936092, proj_loss: -0.413275, loss_mean_cls: 2.373421, grad_norm: 18.097506 +Steps: 0%| | 3173/1000000 [13:02<67:37:48, 4.09it/s, grad_norm=18.1, loss_final=2.9, loss_mean=0.936, loss_mean_cls=2.37, proj_loss=-0.413][2026-03-23 13:49:22] Step: 3173, Training Logs: loss_final: 2.868249, loss_mean: 0.949188, proj_loss: -0.407901, loss_mean_cls: 2.326962, grad_norm: 12.157679 +Steps: 0%| | 3174/1000000 [13:02<67:37:51, 4.09it/s, grad_norm=12.2, loss_final=2.87, loss_mean=0.949, loss_mean_cls=2.33, proj_loss=-0.408][2026-03-23 13:49:22] Step: 3174, Training Logs: loss_final: 2.828686, loss_mean: 0.938388, proj_loss: -0.413817, loss_mean_cls: 2.304115, grad_norm: 11.186629 +Steps: 0%| | 3175/1000000 [13:03<67:32:15, 4.10it/s, grad_norm=11.2, loss_final=2.83, loss_mean=0.938, loss_mean_cls=2.3, proj_loss=-0.414][2026-03-23 13:49:23] Step: 3175, Training Logs: loss_final: 2.765163, loss_mean: 0.942953, proj_loss: -0.407823, loss_mean_cls: 2.230033, grad_norm: 7.246838 +Steps: 0%| | 3176/1000000 [13:03<67:30:44, 4.10it/s, grad_norm=7.25, loss_final=2.77, loss_mean=0.943, loss_mean_cls=2.23, proj_loss=-0.408][2026-03-23 13:49:23] Step: 3176, Training Logs: loss_final: 2.344357, loss_mean: 0.952571, proj_loss: -0.417769, loss_mean_cls: 1.809555, grad_norm: 15.955461 +Steps: 0%| | 3177/1000000 [13:03<67:30:16, 4.10it/s, grad_norm=16, loss_final=2.34, loss_mean=0.953, loss_mean_cls=1.81, proj_loss=-0.418][2026-03-23 13:49:23] Step: 3177, Training Logs: loss_final: 2.737286, loss_mean: 0.927743, proj_loss: -0.411602, loss_mean_cls: 2.221146, grad_norm: 3.830010 +Steps: 0%| | 3178/1000000 [13:03<67:31:28, 4.10it/s, grad_norm=3.83, loss_final=2.74, loss_mean=0.928, loss_mean_cls=2.22, proj_loss=-0.412][2026-03-23 13:49:23] Step: 3178, Training Logs: loss_final: 3.061270, loss_mean: 0.935086, proj_loss: -0.409512, loss_mean_cls: 2.535697, grad_norm: 4.000271 +Steps: 0%| | 3179/1000000 [13:04<67:28:48, 4.10it/s, grad_norm=4, loss_final=3.06, loss_mean=0.935, loss_mean_cls=2.54, proj_loss=-0.41][2026-03-23 13:49:24] Step: 3179, Training Logs: loss_final: 2.396380, loss_mean: 0.948261, proj_loss: -0.418237, loss_mean_cls: 1.866356, grad_norm: 8.147470 +Steps: 0%| | 3180/1000000 [13:04<67:30:34, 4.10it/s, grad_norm=8.15, loss_final=2.4, loss_mean=0.948, loss_mean_cls=1.87, proj_loss=-0.418][2026-03-23 13:49:24] Step: 3180, Training Logs: loss_final: 2.581819, loss_mean: 0.910704, proj_loss: -0.414963, loss_mean_cls: 2.086077, grad_norm: 3.139809 +Steps: 0%| | 3181/1000000 [13:04<67:29:17, 4.10it/s, grad_norm=3.14, loss_final=2.58, loss_mean=0.911, loss_mean_cls=2.09, proj_loss=-0.415][2026-03-23 13:49:24] Step: 3181, Training Logs: loss_final: 2.894603, 
loss_mean: 0.912936, proj_loss: -0.413619, loss_mean_cls: 2.395286, grad_norm: 17.412815 +Steps: 0%| | 3182/1000000 [13:04<67:32:10, 4.10it/s, grad_norm=17.4, loss_final=2.89, loss_mean=0.913, loss_mean_cls=2.4, proj_loss=-0.414][2026-03-23 13:49:24] Step: 3182, Training Logs: loss_final: 3.308032, loss_mean: 0.922313, proj_loss: -0.411884, loss_mean_cls: 2.797603, grad_norm: 31.555136 +Steps: 0%| | 3183/1000000 [13:05<67:32:13, 4.10it/s, grad_norm=31.6, loss_final=3.31, loss_mean=0.922, loss_mean_cls=2.8, proj_loss=-0.412][2026-03-23 13:49:25] Step: 3183, Training Logs: loss_final: 3.038606, loss_mean: 0.940966, proj_loss: -0.407966, loss_mean_cls: 2.505606, grad_norm: 17.251984 +Steps: 0%| | 3184/1000000 [13:05<67:31:36, 4.10it/s, grad_norm=17.3, loss_final=3.04, loss_mean=0.941, loss_mean_cls=2.51, proj_loss=-0.408][2026-03-23 13:49:25] Step: 3184, Training Logs: loss_final: 2.614262, loss_mean: 0.958461, proj_loss: -0.415466, loss_mean_cls: 2.071267, grad_norm: 8.131625 +Steps: 0%| | 3185/1000000 [13:05<67:28:04, 4.10it/s, grad_norm=8.13, loss_final=2.61, loss_mean=0.958, loss_mean_cls=2.07, proj_loss=-0.415][2026-03-23 13:49:25] Step: 3185, Training Logs: loss_final: 3.272313, loss_mean: 0.915307, proj_loss: -0.405308, loss_mean_cls: 2.762313, grad_norm: 16.325932 +Steps: 0%| | 3186/1000000 [13:05<67:28:44, 4.10it/s, grad_norm=16.3, loss_final=3.27, loss_mean=0.915, loss_mean_cls=2.76, proj_loss=-0.405][2026-03-23 13:49:25] Step: 3186, Training Logs: loss_final: 3.025456, loss_mean: 0.926854, proj_loss: -0.407544, loss_mean_cls: 2.506146, grad_norm: 27.670752 +Steps: 0%| | 3187/1000000 [13:06<67:26:52, 4.11it/s, grad_norm=27.7, loss_final=3.03, loss_mean=0.927, loss_mean_cls=2.51, proj_loss=-0.408][2026-03-23 13:49:26] Step: 3187, Training Logs: loss_final: 2.940784, loss_mean: 0.947295, proj_loss: -0.412332, loss_mean_cls: 2.405821, grad_norm: 8.939172 +Steps: 0%| | 3188/1000000 [13:06<67:25:25, 4.11it/s, grad_norm=8.94, loss_final=2.94, loss_mean=0.947, loss_mean_cls=2.41, proj_loss=-0.412][2026-03-23 13:49:26] Step: 3188, Training Logs: loss_final: 3.082411, loss_mean: 0.937207, proj_loss: -0.409121, loss_mean_cls: 2.554325, grad_norm: 13.356016 +Steps: 0%| | 3189/1000000 [13:06<67:24:43, 4.11it/s, grad_norm=13.4, loss_final=3.08, loss_mean=0.937, loss_mean_cls=2.55, proj_loss=-0.409][2026-03-23 13:49:26] Step: 3189, Training Logs: loss_final: 2.675315, loss_mean: 0.938898, proj_loss: -0.419116, loss_mean_cls: 2.155533, grad_norm: 12.346691 +Steps: 0%| | 3190/1000000 [13:06<67:26:30, 4.11it/s, grad_norm=12.3, loss_final=2.68, loss_mean=0.939, loss_mean_cls=2.16, proj_loss=-0.419][2026-03-23 13:49:26] Step: 3190, Training Logs: loss_final: 2.354996, loss_mean: 0.944534, proj_loss: -0.420583, loss_mean_cls: 1.831045, grad_norm: 15.998188 +Steps: 0%| | 3191/1000000 [13:07<67:25:53, 4.11it/s, grad_norm=16, loss_final=2.35, loss_mean=0.945, loss_mean_cls=1.83, proj_loss=-0.421][2026-03-23 13:49:26] Step: 3191, Training Logs: loss_final: 3.010122, loss_mean: 0.940386, proj_loss: -0.410865, loss_mean_cls: 2.480601, grad_norm: 17.800207 +Steps: 0%| | 3192/1000000 [13:07<67:24:43, 4.11it/s, grad_norm=17.8, loss_final=3.01, loss_mean=0.94, loss_mean_cls=2.48, proj_loss=-0.411][2026-03-23 13:49:27] Step: 3192, Training Logs: loss_final: 3.443819, loss_mean: 0.911585, proj_loss: -0.403011, loss_mean_cls: 2.935246, grad_norm: 18.823380 +Steps: 0%| | 3193/1000000 [13:07<67:23:55, 4.11it/s, grad_norm=18.8, loss_final=3.44, loss_mean=0.912, loss_mean_cls=2.94, proj_loss=-0.403][2026-03-23 
13:49:27] Step: 3193, Training Logs: loss_final: 2.809854, loss_mean: 0.933600, proj_loss: -0.416639, loss_mean_cls: 2.292892, grad_norm: 27.752890 +Steps: 0%| | 3194/1000000 [13:07<67:26:04, 4.11it/s, grad_norm=27.8, loss_final=2.81, loss_mean=0.934, loss_mean_cls=2.29, proj_loss=-0.417][2026-03-23 13:49:27] Step: 3194, Training Logs: loss_final: 3.197051, loss_mean: 0.926460, proj_loss: -0.401397, loss_mean_cls: 2.671988, grad_norm: 15.737246 +Steps: 0%| | 3195/1000000 [13:08<67:30:12, 4.10it/s, grad_norm=15.7, loss_final=3.2, loss_mean=0.926, loss_mean_cls=2.67, proj_loss=-0.401][2026-03-23 13:49:27] Step: 3195, Training Logs: loss_final: 3.188416, loss_mean: 0.911561, proj_loss: -0.406004, loss_mean_cls: 2.682859, grad_norm: 10.173464 +Steps: 0%| | 3196/1000000 [13:08<67:30:16, 4.10it/s, grad_norm=10.2, loss_final=3.19, loss_mean=0.912, loss_mean_cls=2.68, proj_loss=-0.406][2026-03-23 13:49:28] Step: 3196, Training Logs: loss_final: 2.794813, loss_mean: 0.936085, proj_loss: -0.415791, loss_mean_cls: 2.274520, grad_norm: 14.718662 +Steps: 0%| | 3197/1000000 [13:08<67:28:52, 4.10it/s, grad_norm=14.7, loss_final=2.79, loss_mean=0.936, loss_mean_cls=2.27, proj_loss=-0.416][2026-03-23 13:49:28] Step: 3197, Training Logs: loss_final: 3.376265, loss_mean: 0.926659, proj_loss: -0.403714, loss_mean_cls: 2.853320, grad_norm: 14.280690 +Steps: 0%| | 3198/1000000 [13:08<67:26:49, 4.11it/s, grad_norm=14.3, loss_final=3.38, loss_mean=0.927, loss_mean_cls=2.85, proj_loss=-0.404][2026-03-23 13:49:28] Step: 3198, Training Logs: loss_final: 3.051443, loss_mean: 0.930272, proj_loss: -0.412736, loss_mean_cls: 2.533907, grad_norm: 19.979593 +Steps: 0%| | 3199/1000000 [13:08<67:26:14, 4.11it/s, grad_norm=20, loss_final=3.05, loss_mean=0.93, loss_mean_cls=2.53, proj_loss=-0.413][2026-03-23 13:49:28] Step: 3199, Training Logs: loss_final: 2.826590, loss_mean: 0.943945, proj_loss: -0.410684, loss_mean_cls: 2.293330, grad_norm: 2.872044 +Steps: 0%| | 3200/1000000 [13:09<67:29:58, 4.10it/s, grad_norm=2.87, loss_final=2.83, loss_mean=0.944, loss_mean_cls=2.29, proj_loss=-0.411][2026-03-23 13:49:29] Step: 3200, Training Logs: loss_final: 2.708726, loss_mean: 0.942859, proj_loss: -0.412478, loss_mean_cls: 2.178345, grad_norm: 6.498767 +Steps: 0%| | 3201/1000000 [13:09<67:28:02, 4.10it/s, grad_norm=6.5, loss_final=2.71, loss_mean=0.943, loss_mean_cls=2.18, proj_loss=-0.412][2026-03-23 13:49:29] Step: 3201, Training Logs: loss_final: 2.952030, loss_mean: 0.952863, proj_loss: -0.416219, loss_mean_cls: 2.415386, grad_norm: 23.057631 +Steps: 0%| | 3202/1000000 [13:09<67:27:08, 4.10it/s, grad_norm=23.1, loss_final=2.95, loss_mean=0.953, loss_mean_cls=2.42, proj_loss=-0.416][2026-03-23 13:49:29] Step: 3202, Training Logs: loss_final: 2.942489, loss_mean: 0.971539, proj_loss: -0.413600, loss_mean_cls: 2.384550, grad_norm: 13.283759 +Steps: 0%| | 3203/1000000 [13:09<67:25:56, 4.11it/s, grad_norm=13.3, loss_final=2.94, loss_mean=0.972, loss_mean_cls=2.38, proj_loss=-0.414][2026-03-23 13:49:29] Step: 3203, Training Logs: loss_final: 2.879375, loss_mean: 0.963511, proj_loss: -0.406640, loss_mean_cls: 2.322504, grad_norm: 4.992756 +Steps: 0%| | 3204/1000000 [13:10<67:27:28, 4.10it/s, grad_norm=4.99, loss_final=2.88, loss_mean=0.964, loss_mean_cls=2.32, proj_loss=-0.407][2026-03-23 13:49:30] Step: 3204, Training Logs: loss_final: 2.860398, loss_mean: 0.920400, proj_loss: -0.415370, loss_mean_cls: 2.355368, grad_norm: 22.323778 +Steps: 0%| | 3205/1000000 [13:10<67:26:01, 4.11it/s, grad_norm=22.3, loss_final=2.86, loss_mean=0.92, 
loss_mean_cls=2.36, proj_loss=-0.415][2026-03-23 13:49:30] Step: 3205, Training Logs: loss_final: 2.491024, loss_mean: 0.922559, proj_loss: -0.418866, loss_mean_cls: 1.987331, grad_norm: 5.208500 +Steps: 0%| | 3206/1000000 [13:10<67:25:19, 4.11it/s, grad_norm=5.21, loss_final=2.49, loss_mean=0.923, loss_mean_cls=1.99, proj_loss=-0.419][2026-03-23 13:49:30] Step: 3206, Training Logs: loss_final: 2.181799, loss_mean: 0.949120, proj_loss: -0.416472, loss_mean_cls: 1.649151, grad_norm: 9.734663 +Steps: 0%| | 3207/1000000 [13:10<67:25:42, 4.11it/s, grad_norm=9.73, loss_final=2.18, loss_mean=0.949, loss_mean_cls=1.65, proj_loss=-0.416][2026-03-23 13:49:30] Step: 3207, Training Logs: loss_final: 3.156315, loss_mean: 0.937870, proj_loss: -0.408855, loss_mean_cls: 2.627300, grad_norm: 11.468073 +Steps: 0%| | 3208/1000000 [13:11<68:12:42, 4.06it/s, grad_norm=11.5, loss_final=3.16, loss_mean=0.938, loss_mean_cls=2.63, proj_loss=-0.409][2026-03-23 13:49:31] Step: 3208, Training Logs: loss_final: 3.172370, loss_mean: 0.941108, proj_loss: -0.410023, loss_mean_cls: 2.641285, grad_norm: 20.592451 +Steps: 0%| | 3209/1000000 [13:11<68:45:46, 4.03it/s, grad_norm=20.6, loss_final=3.17, loss_mean=0.941, loss_mean_cls=2.64, proj_loss=-0.41][2026-03-23 13:49:31] Step: 3209, Training Logs: loss_final: 2.936491, loss_mean: 0.931951, proj_loss: -0.406571, loss_mean_cls: 2.411111, grad_norm: 10.388968 +Steps: 0%| | 3210/1000000 [13:11<68:20:31, 4.05it/s, grad_norm=10.4, loss_final=2.94, loss_mean=0.932, loss_mean_cls=2.41, proj_loss=-0.407][2026-03-23 13:49:31] Step: 3210, Training Logs: loss_final: 2.887570, loss_mean: 0.935521, proj_loss: -0.414825, loss_mean_cls: 2.366874, grad_norm: 12.403163 +Steps: 0%| | 3211/1000000 [13:11<68:04:09, 4.07it/s, grad_norm=12.4, loss_final=2.89, loss_mean=0.936, loss_mean_cls=2.37, proj_loss=-0.415][2026-03-23 13:49:31] Step: 3211, Training Logs: loss_final: 2.796021, loss_mean: 0.935090, proj_loss: -0.408377, loss_mean_cls: 2.269308, grad_norm: 15.481987 +Steps: 0%| | 3212/1000000 [13:12<67:51:39, 4.08it/s, grad_norm=15.5, loss_final=2.8, loss_mean=0.935, loss_mean_cls=2.27, proj_loss=-0.408][2026-03-23 13:49:32] Step: 3212, Training Logs: loss_final: 2.735657, loss_mean: 0.957567, proj_loss: -0.411777, loss_mean_cls: 2.189867, grad_norm: 2.394816 +Steps: 0%| | 3213/1000000 [13:12<67:43:57, 4.09it/s, grad_norm=2.39, loss_final=2.74, loss_mean=0.958, loss_mean_cls=2.19, proj_loss=-0.412][2026-03-23 13:49:32] Step: 3213, Training Logs: loss_final: 2.887163, loss_mean: 0.918878, proj_loss: -0.416497, loss_mean_cls: 2.384781, grad_norm: 10.529479 +Steps: 0%| | 3214/1000000 [13:12<67:39:06, 4.09it/s, grad_norm=10.5, loss_final=2.89, loss_mean=0.919, loss_mean_cls=2.38, proj_loss=-0.416][2026-03-23 13:49:32] Step: 3214, Training Logs: loss_final: 3.021195, loss_mean: 0.922635, proj_loss: -0.403100, loss_mean_cls: 2.501660, grad_norm: 3.723220 +Steps: 0%| | 3215/1000000 [13:12<67:39:02, 4.09it/s, grad_norm=3.72, loss_final=3.02, loss_mean=0.923, loss_mean_cls=2.5, proj_loss=-0.403][2026-03-23 13:49:32] Step: 3215, Training Logs: loss_final: 2.869597, loss_mean: 0.937953, proj_loss: -0.412160, loss_mean_cls: 2.343804, grad_norm: 10.938539 +Steps: 0%| | 3216/1000000 [13:13<67:34:10, 4.10it/s, grad_norm=10.9, loss_final=2.87, loss_mean=0.938, loss_mean_cls=2.34, proj_loss=-0.412][2026-03-23 13:49:33] Step: 3216, Training Logs: loss_final: 2.776917, loss_mean: 0.908015, proj_loss: -0.410380, loss_mean_cls: 2.279282, grad_norm: 6.542996 +Steps: 0%| | 3217/1000000 [13:13<67:37:35, 4.09it/s, 
grad_norm=6.54, loss_final=2.78, loss_mean=0.908, loss_mean_cls=2.28, proj_loss=-0.41][2026-03-23 13:49:33] Step: 3217, Training Logs: loss_final: 2.783073, loss_mean: 0.916030, proj_loss: -0.413878, loss_mean_cls: 2.280921, grad_norm: 7.477996 +Steps: 0%| | 3218/1000000 [13:13<67:33:28, 4.10it/s, grad_norm=7.48, loss_final=2.78, loss_mean=0.916, loss_mean_cls=2.28, proj_loss=-0.414][2026-03-23 13:49:33] Step: 3218, Training Logs: loss_final: 3.239592, loss_mean: 0.913686, proj_loss: -0.406188, loss_mean_cls: 2.732094, grad_norm: 5.366584 +Steps: 0%| | 3219/1000000 [13:13<67:30:41, 4.10it/s, grad_norm=5.37, loss_final=3.24, loss_mean=0.914, loss_mean_cls=2.73, proj_loss=-0.406][2026-03-23 13:49:33] Step: 3219, Training Logs: loss_final: 2.521981, loss_mean: 0.945411, proj_loss: -0.419782, loss_mean_cls: 1.996352, grad_norm: 11.781877 +Steps: 0%| | 3220/1000000 [13:14<67:28:24, 4.10it/s, grad_norm=11.8, loss_final=2.52, loss_mean=0.945, loss_mean_cls=2, proj_loss=-0.42][2026-03-23 13:49:34] Step: 3220, Training Logs: loss_final: 2.717752, loss_mean: 0.926918, proj_loss: -0.416810, loss_mean_cls: 2.207644, grad_norm: 15.500848 +Steps: 0%| | 3221/1000000 [13:14<67:30:03, 4.10it/s, grad_norm=15.5, loss_final=2.72, loss_mean=0.927, loss_mean_cls=2.21, proj_loss=-0.417][2026-03-23 13:49:34] Step: 3221, Training Logs: loss_final: 2.657594, loss_mean: 0.935155, proj_loss: -0.417742, loss_mean_cls: 2.140180, grad_norm: 4.334132 +Steps: 0%| | 3222/1000000 [13:14<67:28:36, 4.10it/s, grad_norm=4.33, loss_final=2.66, loss_mean=0.935, loss_mean_cls=2.14, proj_loss=-0.418][2026-03-23 13:49:34] Step: 3222, Training Logs: loss_final: 2.519486, loss_mean: 0.930717, proj_loss: -0.420009, loss_mean_cls: 2.008778, grad_norm: 15.707201 +Steps: 0%| | 3223/1000000 [13:14<67:30:14, 4.10it/s, grad_norm=15.7, loss_final=2.52, loss_mean=0.931, loss_mean_cls=2.01, proj_loss=-0.42][2026-03-23 13:49:34] Step: 3223, Training Logs: loss_final: 2.913377, loss_mean: 0.907273, proj_loss: -0.413216, loss_mean_cls: 2.419320, grad_norm: 1.732110 +Steps: 0%| | 3224/1000000 [13:15<67:33:24, 4.10it/s, grad_norm=1.73, loss_final=2.91, loss_mean=0.907, loss_mean_cls=2.42, proj_loss=-0.413][2026-03-23 13:49:35] Step: 3224, Training Logs: loss_final: 3.265056, loss_mean: 0.919631, proj_loss: -0.406200, loss_mean_cls: 2.751624, grad_norm: 9.874155 +Steps: 0%| | 3225/1000000 [13:15<67:31:32, 4.10it/s, grad_norm=9.87, loss_final=3.27, loss_mean=0.92, loss_mean_cls=2.75, proj_loss=-0.406][2026-03-23 13:49:35] Step: 3225, Training Logs: loss_final: 2.682880, loss_mean: 0.929662, proj_loss: -0.413408, loss_mean_cls: 2.166626, grad_norm: 16.602270 +Steps: 0%| | 3226/1000000 [13:15<67:30:54, 4.10it/s, grad_norm=16.6, loss_final=2.68, loss_mean=0.93, loss_mean_cls=2.17, proj_loss=-0.413][2026-03-23 13:49:35] Step: 3226, Training Logs: loss_final: 2.689837, loss_mean: 0.924367, proj_loss: -0.412543, loss_mean_cls: 2.178013, grad_norm: 18.633142 +Steps: 0%| | 3227/1000000 [13:15<67:30:39, 4.10it/s, grad_norm=18.6, loss_final=2.69, loss_mean=0.924, loss_mean_cls=2.18, proj_loss=-0.413][2026-03-23 13:49:35] Step: 3227, Training Logs: loss_final: 2.910276, loss_mean: 0.925024, proj_loss: -0.406407, loss_mean_cls: 2.391660, grad_norm: 27.622450 +Steps: 0%| | 3228/1000000 [13:16<67:30:37, 4.10it/s, grad_norm=27.6, loss_final=2.91, loss_mean=0.925, loss_mean_cls=2.39, proj_loss=-0.406][2026-03-23 13:49:36] Step: 3228, Training Logs: loss_final: 3.078615, loss_mean: 0.936781, proj_loss: -0.401236, loss_mean_cls: 2.543070, grad_norm: 26.620234 +Steps: 
0%| | 3229/1000000 [13:16<67:29:12, 4.10it/s, grad_norm=26.6, loss_final=3.08, loss_mean=0.937, loss_mean_cls=2.54, proj_loss=-0.401][2026-03-23 13:49:36] Step: 3229, Training Logs: loss_final: 2.762747, loss_mean: 0.944429, proj_loss: -0.407439, loss_mean_cls: 2.225757, grad_norm: 23.515493 +Steps: 0%| | 3230/1000000 [13:16<67:27:55, 4.10it/s, grad_norm=23.5, loss_final=2.76, loss_mean=0.944, loss_mean_cls=2.23, proj_loss=-0.407][2026-03-23 13:49:36] Step: 3230, Training Logs: loss_final: 3.004576, loss_mean: 0.950922, proj_loss: -0.402514, loss_mean_cls: 2.456168, grad_norm: 3.071341 +Steps: 0%| | 3231/1000000 [13:16<67:26:31, 4.11it/s, grad_norm=3.07, loss_final=3, loss_mean=0.951, loss_mean_cls=2.46, proj_loss=-0.403][2026-03-23 13:49:36] Step: 3231, Training Logs: loss_final: 2.884779, loss_mean: 0.912279, proj_loss: -0.411584, loss_mean_cls: 2.384085, grad_norm: 21.426600 +Steps: 0%| | 3232/1000000 [13:17<67:27:02, 4.10it/s, grad_norm=21.4, loss_final=2.88, loss_mean=0.912, loss_mean_cls=2.38, proj_loss=-0.412][2026-03-23 13:49:36] Step: 3232, Training Logs: loss_final: 2.916805, loss_mean: 0.920244, proj_loss: -0.411098, loss_mean_cls: 2.407659, grad_norm: 15.196678 +Steps: 0%| | 3233/1000000 [13:17<67:28:09, 4.10it/s, grad_norm=15.2, loss_final=2.92, loss_mean=0.92, loss_mean_cls=2.41, proj_loss=-0.411][2026-03-23 13:49:37] Step: 3233, Training Logs: loss_final: 3.312766, loss_mean: 0.917060, proj_loss: -0.406496, loss_mean_cls: 2.802202, grad_norm: 12.811726 +Steps: 0%| | 3234/1000000 [13:17<67:29:23, 4.10it/s, grad_norm=12.8, loss_final=3.31, loss_mean=0.917, loss_mean_cls=2.8, proj_loss=-0.406][2026-03-23 13:49:37] Step: 3234, Training Logs: loss_final: 2.161313, loss_mean: 0.958018, proj_loss: -0.416930, loss_mean_cls: 1.620225, grad_norm: 2.533644 +Steps: 0%| | 3235/1000000 [13:17<67:32:18, 4.10it/s, grad_norm=2.53, loss_final=2.16, loss_mean=0.958, loss_mean_cls=1.62, proj_loss=-0.417][2026-03-23 13:49:37] Step: 3235, Training Logs: loss_final: 2.781663, loss_mean: 0.932460, proj_loss: -0.415444, loss_mean_cls: 2.264647, grad_norm: 9.332526 +Steps: 0%| | 3236/1000000 [13:18<67:32:53, 4.10it/s, grad_norm=9.33, loss_final=2.78, loss_mean=0.932, loss_mean_cls=2.26, proj_loss=-0.415][2026-03-23 13:49:37] Step: 3236, Training Logs: loss_final: 2.740784, loss_mean: 0.950876, proj_loss: -0.413728, loss_mean_cls: 2.203635, grad_norm: 13.411963 +Steps: 0%| | 3237/1000000 [13:18<67:29:27, 4.10it/s, grad_norm=13.4, loss_final=2.74, loss_mean=0.951, loss_mean_cls=2.2, proj_loss=-0.414][2026-03-23 13:49:38] Step: 3237, Training Logs: loss_final: 2.796304, loss_mean: 0.950880, proj_loss: -0.397815, loss_mean_cls: 2.243240, grad_norm: 18.062300 +Steps: 0%| | 3238/1000000 [13:18<67:31:07, 4.10it/s, grad_norm=18.1, loss_final=2.8, loss_mean=0.951, loss_mean_cls=2.24, proj_loss=-0.398][2026-03-23 13:49:38] Step: 3238, Training Logs: loss_final: 3.020159, loss_mean: 0.969532, proj_loss: -0.397292, loss_mean_cls: 2.447919, grad_norm: 6.445590 +Steps: 0%| | 3239/1000000 [13:18<67:29:59, 4.10it/s, grad_norm=6.45, loss_final=3.02, loss_mean=0.97, loss_mean_cls=2.45, proj_loss=-0.397][2026-03-23 13:49:38] Step: 3239, Training Logs: loss_final: 3.302465, loss_mean: 0.949778, proj_loss: -0.396414, loss_mean_cls: 2.749101, grad_norm: 12.971319 +Steps: 0%| | 3240/1000000 [13:18<67:27:30, 4.10it/s, grad_norm=13, loss_final=3.3, loss_mean=0.95, loss_mean_cls=2.75, proj_loss=-0.396][2026-03-23 13:49:38] Step: 3240, Training Logs: loss_final: 2.814362, loss_mean: 0.928421, proj_loss: -0.409293, 
loss_mean_cls: 2.295235, grad_norm: 5.154029 +Steps: 0%| | 3241/1000000 [13:19<67:27:11, 4.10it/s, grad_norm=5.15, loss_final=2.81, loss_mean=0.928, loss_mean_cls=2.3, proj_loss=-0.409][2026-03-23 13:49:39] Step: 3241, Training Logs: loss_final: 2.813915, loss_mean: 0.934119, proj_loss: -0.412996, loss_mean_cls: 2.292792, grad_norm: 15.499140 +Steps: 0%| | 3242/1000000 [13:19<67:29:44, 4.10it/s, grad_norm=15.5, loss_final=2.81, loss_mean=0.934, loss_mean_cls=2.29, proj_loss=-0.413][2026-03-23 13:49:39] Step: 3242, Training Logs: loss_final: 2.154016, loss_mean: 0.954709, proj_loss: -0.420351, loss_mean_cls: 1.619657, grad_norm: 6.894516 +Steps: 0%| | 3243/1000000 [13:19<67:30:40, 4.10it/s, grad_norm=6.89, loss_final=2.15, loss_mean=0.955, loss_mean_cls=1.62, proj_loss=-0.42][2026-03-23 13:49:39] Step: 3243, Training Logs: loss_final: 2.580806, loss_mean: 0.930664, proj_loss: -0.411422, loss_mean_cls: 2.061564, grad_norm: 6.192189 +Steps: 0%| | 3244/1000000 [13:19<67:29:23, 4.10it/s, grad_norm=6.19, loss_final=2.58, loss_mean=0.931, loss_mean_cls=2.06, proj_loss=-0.411][2026-03-23 13:49:39] Step: 3244, Training Logs: loss_final: 3.242816, loss_mean: 0.924165, proj_loss: -0.407109, loss_mean_cls: 2.725760, grad_norm: 22.059071 +Steps: 0%| | 3245/1000000 [13:20<67:32:25, 4.10it/s, grad_norm=22.1, loss_final=3.24, loss_mean=0.924, loss_mean_cls=2.73, proj_loss=-0.407][2026-03-23 13:49:40] Step: 3245, Training Logs: loss_final: 3.753937, loss_mean: 0.916974, proj_loss: -0.398098, loss_mean_cls: 3.235060, grad_norm: 28.914173 +Steps: 0%| | 3246/1000000 [13:20<67:35:33, 4.10it/s, grad_norm=28.9, loss_final=3.75, loss_mean=0.917, loss_mean_cls=3.24, proj_loss=-0.398][2026-03-23 13:49:40] Step: 3246, Training Logs: loss_final: 2.637882, loss_mean: 0.937730, proj_loss: -0.413949, loss_mean_cls: 2.114101, grad_norm: 28.355776 +Steps: 0%| | 3247/1000000 [13:20<67:38:58, 4.09it/s, grad_norm=28.4, loss_final=2.64, loss_mean=0.938, loss_mean_cls=2.11, proj_loss=-0.414][2026-03-23 13:49:40] Step: 3247, Training Logs: loss_final: 2.696246, loss_mean: 0.954938, proj_loss: -0.412408, loss_mean_cls: 2.153716, grad_norm: 21.348938 +Steps: 0%| | 3248/1000000 [13:20<67:38:13, 4.09it/s, grad_norm=21.3, loss_final=2.7, loss_mean=0.955, loss_mean_cls=2.15, proj_loss=-0.412][2026-03-23 13:49:40] Step: 3248, Training Logs: loss_final: 3.086372, loss_mean: 0.959537, proj_loss: -0.399173, loss_mean_cls: 2.526009, grad_norm: 28.283159 +Steps: 0%| | 3249/1000000 [13:21<67:36:38, 4.10it/s, grad_norm=28.3, loss_final=3.09, loss_mean=0.96, loss_mean_cls=2.53, proj_loss=-0.399][2026-03-23 13:49:41] Step: 3249, Training Logs: loss_final: 2.930784, loss_mean: 0.936969, proj_loss: -0.406974, loss_mean_cls: 2.400789, grad_norm: 27.483114 +Steps: 0%| | 3250/1000000 [13:21<67:40:12, 4.09it/s, grad_norm=27.5, loss_final=2.93, loss_mean=0.937, loss_mean_cls=2.4, proj_loss=-0.407][2026-03-23 13:49:41] Step: 3250, Training Logs: loss_final: 2.996897, loss_mean: 0.947421, proj_loss: -0.402139, loss_mean_cls: 2.451616, grad_norm: 18.661232 +Steps: 0%| | 3251/1000000 [13:21<67:38:24, 4.09it/s, grad_norm=18.7, loss_final=3, loss_mean=0.947, loss_mean_cls=2.45, proj_loss=-0.402][2026-03-23 13:49:41] Step: 3251, Training Logs: loss_final: 2.946249, loss_mean: 0.925627, proj_loss: -0.412990, loss_mean_cls: 2.433613, grad_norm: 11.170186 +Steps: 0%| | 3252/1000000 [13:21<67:35:08, 4.10it/s, grad_norm=11.2, loss_final=2.95, loss_mean=0.926, loss_mean_cls=2.43, proj_loss=-0.413][2026-03-23 13:49:41] Step: 3252, Training Logs: loss_final: 
3.241209, loss_mean: 0.921609, proj_loss: -0.396867, loss_mean_cls: 2.716467, grad_norm: 3.140061 +Steps: 0%| | 3253/1000000 [13:22<67:32:16, 4.10it/s, grad_norm=3.14, loss_final=3.24, loss_mean=0.922, loss_mean_cls=2.72, proj_loss=-0.397][2026-03-23 13:49:42] Step: 3253, Training Logs: loss_final: 2.880625, loss_mean: 0.917135, proj_loss: -0.412402, loss_mean_cls: 2.375892, grad_norm: 36.953529 +Steps: 0%| | 3254/1000000 [13:22<68:54:41, 4.02it/s, grad_norm=37, loss_final=2.88, loss_mean=0.917, loss_mean_cls=2.38, proj_loss=-0.412][2026-03-23 13:49:42] Step: 3254, Training Logs: loss_final: 2.774087, loss_mean: 0.955092, proj_loss: -0.409243, loss_mean_cls: 2.228239, grad_norm: 16.899591 +Steps: 0%| | 3255/1000000 [13:22<68:27:58, 4.04it/s, grad_norm=16.9, loss_final=2.77, loss_mean=0.955, loss_mean_cls=2.23, proj_loss=-0.409][2026-03-23 13:49:42] Step: 3255, Training Logs: loss_final: 2.744511, loss_mean: 0.932750, proj_loss: -0.405093, loss_mean_cls: 2.216854, grad_norm: 2.908378 +Steps: 0%| | 3256/1000000 [13:22<68:09:40, 4.06it/s, grad_norm=2.91, loss_final=2.74, loss_mean=0.933, loss_mean_cls=2.22, proj_loss=-0.405][2026-03-23 13:49:42] Step: 3256, Training Logs: loss_final: 2.724534, loss_mean: 0.927273, proj_loss: -0.411576, loss_mean_cls: 2.208838, grad_norm: 2.582633 +Steps: 0%| | 3257/1000000 [13:23<67:56:01, 4.08it/s, grad_norm=2.58, loss_final=2.72, loss_mean=0.927, loss_mean_cls=2.21, proj_loss=-0.412][2026-03-23 13:49:43] Step: 3257, Training Logs: loss_final: 3.088274, loss_mean: 0.925162, proj_loss: -0.411798, loss_mean_cls: 2.574910, grad_norm: 4.669608 +Steps: 0%| | 3258/1000000 [13:23<67:49:10, 4.08it/s, grad_norm=4.67, loss_final=3.09, loss_mean=0.925, loss_mean_cls=2.57, proj_loss=-0.412][2026-03-23 13:49:43] Step: 3258, Training Logs: loss_final: 3.496008, loss_mean: 0.918483, proj_loss: -0.397016, loss_mean_cls: 2.974542, grad_norm: 8.956629 +Steps: 0%| | 3259/1000000 [13:23<67:42:08, 4.09it/s, grad_norm=8.96, loss_final=3.5, loss_mean=0.918, loss_mean_cls=2.97, proj_loss=-0.397][2026-03-23 13:49:43] Step: 3259, Training Logs: loss_final: 2.585068, loss_mean: 0.954529, proj_loss: -0.407009, loss_mean_cls: 2.037549, grad_norm: 8.381247 +Steps: 0%| | 3260/1000000 [13:23<67:36:13, 4.10it/s, grad_norm=8.38, loss_final=2.59, loss_mean=0.955, loss_mean_cls=2.04, proj_loss=-0.407][2026-03-23 13:49:43] Step: 3260, Training Logs: loss_final: 2.896452, loss_mean: 0.925374, proj_loss: -0.404374, loss_mean_cls: 2.375452, grad_norm: 5.494833 +Steps: 0%| | 3261/1000000 [13:24<67:31:13, 4.10it/s, grad_norm=5.49, loss_final=2.9, loss_mean=0.925, loss_mean_cls=2.38, proj_loss=-0.404][2026-03-23 13:49:44] Step: 3261, Training Logs: loss_final: 3.088091, loss_mean: 0.925248, proj_loss: -0.403693, loss_mean_cls: 2.566536, grad_norm: 19.632431 +Steps: 0%| | 3262/1000000 [13:24<67:31:48, 4.10it/s, grad_norm=19.6, loss_final=3.09, loss_mean=0.925, loss_mean_cls=2.57, proj_loss=-0.404][2026-03-23 13:49:44] Step: 3262, Training Logs: loss_final: 3.102235, loss_mean: 0.933932, proj_loss: -0.405822, loss_mean_cls: 2.574124, grad_norm: 24.436823 +Steps: 0%| | 3263/1000000 [13:24<67:35:17, 4.10it/s, grad_norm=24.4, loss_final=3.1, loss_mean=0.934, loss_mean_cls=2.57, proj_loss=-0.406][2026-03-23 13:49:44] Step: 3263, Training Logs: loss_final: 2.675610, loss_mean: 0.950606, proj_loss: -0.407536, loss_mean_cls: 2.132540, grad_norm: 7.551528 +Steps: 0%| | 3264/1000000 [13:24<67:35:50, 4.10it/s, grad_norm=7.55, loss_final=2.68, loss_mean=0.951, loss_mean_cls=2.13, proj_loss=-0.408][2026-03-23 
13:49:44] Step: 3264, Training Logs: loss_final: 3.092462, loss_mean: 0.931964, proj_loss: -0.403460, loss_mean_cls: 2.563958, grad_norm: 3.796345 +Steps: 0%| | 3265/1000000 [13:25<67:32:11, 4.10it/s, grad_norm=3.8, loss_final=3.09, loss_mean=0.932, loss_mean_cls=2.56, proj_loss=-0.403][2026-03-23 13:49:45] Step: 3265, Training Logs: loss_final: 2.929507, loss_mean: 0.938165, proj_loss: -0.413998, loss_mean_cls: 2.405340, grad_norm: 20.921526 +Steps: 0%| | 3266/1000000 [13:25<67:31:16, 4.10it/s, grad_norm=20.9, loss_final=2.93, loss_mean=0.938, loss_mean_cls=2.41, proj_loss=-0.414][2026-03-23 13:49:45] Step: 3266, Training Logs: loss_final: 2.630611, loss_mean: 0.948409, proj_loss: -0.414720, loss_mean_cls: 2.096922, grad_norm: 16.663071 +Steps: 0%| | 3267/1000000 [13:25<67:29:54, 4.10it/s, grad_norm=16.7, loss_final=2.63, loss_mean=0.948, loss_mean_cls=2.1, proj_loss=-0.415][2026-03-23 13:49:45] Step: 3267, Training Logs: loss_final: 3.367684, loss_mean: 0.938459, proj_loss: -0.401024, loss_mean_cls: 2.830249, grad_norm: 25.254959 +Steps: 0%| | 3268/1000000 [13:25<67:29:25, 4.10it/s, grad_norm=25.3, loss_final=3.37, loss_mean=0.938, loss_mean_cls=2.83, proj_loss=-0.401][2026-03-23 13:49:45] Step: 3268, Training Logs: loss_final: 2.776231, loss_mean: 0.949034, proj_loss: -0.407118, loss_mean_cls: 2.234315, grad_norm: 12.767673 +Steps: 0%| | 3269/1000000 [13:26<67:28:50, 4.10it/s, grad_norm=12.8, loss_final=2.78, loss_mean=0.949, loss_mean_cls=2.23, proj_loss=-0.407][2026-03-23 13:49:46] Step: 3269, Training Logs: loss_final: 3.070562, loss_mean: 0.938379, proj_loss: -0.406103, loss_mean_cls: 2.538286, grad_norm: 14.622743 +Steps: 0%| | 3270/1000000 [13:26<67:27:11, 4.10it/s, grad_norm=14.6, loss_final=3.07, loss_mean=0.938, loss_mean_cls=2.54, proj_loss=-0.406][2026-03-23 13:49:46] Step: 3270, Training Logs: loss_final: 2.716153, loss_mean: 0.952327, proj_loss: -0.411110, loss_mean_cls: 2.174936, grad_norm: 8.839236 +Steps: 0%| | 3271/1000000 [13:26<67:26:42, 4.11it/s, grad_norm=8.84, loss_final=2.72, loss_mean=0.952, loss_mean_cls=2.17, proj_loss=-0.411][2026-03-23 13:49:46] Step: 3271, Training Logs: loss_final: 2.861253, loss_mean: 0.919689, proj_loss: -0.408406, loss_mean_cls: 2.349971, grad_norm: 11.764990 +Steps: 0%| | 3272/1000000 [13:26<67:28:27, 4.10it/s, grad_norm=11.8, loss_final=2.86, loss_mean=0.92, loss_mean_cls=2.35, proj_loss=-0.408][2026-03-23 13:49:46] Step: 3272, Training Logs: loss_final: 2.915786, loss_mean: 0.938015, proj_loss: -0.407958, loss_mean_cls: 2.385728, grad_norm: 7.990373 +Steps: 0%| | 3273/1000000 [13:27<67:28:11, 4.10it/s, grad_norm=7.99, loss_final=2.92, loss_mean=0.938, loss_mean_cls=2.39, proj_loss=-0.408][2026-03-23 13:49:47] Step: 3273, Training Logs: loss_final: 2.602350, loss_mean: 0.946558, proj_loss: -0.415069, loss_mean_cls: 2.070862, grad_norm: 21.874172 +Steps: 0%| | 3274/1000000 [13:27<67:28:14, 4.10it/s, grad_norm=21.9, loss_final=2.6, loss_mean=0.947, loss_mean_cls=2.07, proj_loss=-0.415][2026-03-23 13:49:47] Step: 3274, Training Logs: loss_final: 3.043710, loss_mean: 0.916202, proj_loss: -0.408437, loss_mean_cls: 2.535945, grad_norm: 17.556608 +Steps: 0%| | 3275/1000000 [13:27<67:29:45, 4.10it/s, grad_norm=17.6, loss_final=3.04, loss_mean=0.916, loss_mean_cls=2.54, proj_loss=-0.408][2026-03-23 13:49:47] Step: 3275, Training Logs: loss_final: 2.706749, loss_mean: 0.913513, proj_loss: -0.408549, loss_mean_cls: 2.201786, grad_norm: 14.501580 +Steps: 0%| | 3276/1000000 [13:27<67:27:57, 4.10it/s, grad_norm=14.5, loss_final=2.71, 
loss_mean=0.914, loss_mean_cls=2.2, proj_loss=-0.409][2026-03-23 13:49:47] Step: 3276, Training Logs: loss_final: 3.244864, loss_mean: 0.941101, proj_loss: -0.399339, loss_mean_cls: 2.703102, grad_norm: 13.019986 +Steps: 0%| | 3277/1000000 [13:28<67:31:08, 4.10it/s, grad_norm=13, loss_final=3.24, loss_mean=0.941, loss_mean_cls=2.7, proj_loss=-0.399][2026-03-23 13:49:47] Step: 3277, Training Logs: loss_final: 2.714839, loss_mean: 0.909641, proj_loss: -0.412603, loss_mean_cls: 2.217801, grad_norm: 5.483166 +Steps: 0%| | 3278/1000000 [13:28<67:31:20, 4.10it/s, grad_norm=5.48, loss_final=2.71, loss_mean=0.91, loss_mean_cls=2.22, proj_loss=-0.413][2026-03-23 13:49:48] Step: 3278, Training Logs: loss_final: 2.985174, loss_mean: 0.930373, proj_loss: -0.410820, loss_mean_cls: 2.465621, grad_norm: 8.469340 +Steps: 0%| | 3279/1000000 [13:28<67:30:11, 4.10it/s, grad_norm=8.47, loss_final=2.99, loss_mean=0.93, loss_mean_cls=2.47, proj_loss=-0.411][2026-03-23 13:49:48] Step: 3279, Training Logs: loss_final: 2.764303, loss_mean: 0.925823, proj_loss: -0.416301, loss_mean_cls: 2.254782, grad_norm: 16.093065 +Steps: 0%| | 3280/1000000 [13:28<67:28:36, 4.10it/s, grad_norm=16.1, loss_final=2.76, loss_mean=0.926, loss_mean_cls=2.25, proj_loss=-0.416][2026-03-23 13:49:48] Step: 3280, Training Logs: loss_final: 2.830960, loss_mean: 0.930140, proj_loss: -0.414428, loss_mean_cls: 2.315247, grad_norm: 3.582263 +Steps: 0%| | 3281/1000000 [13:29<67:27:07, 4.10it/s, grad_norm=3.58, loss_final=2.83, loss_mean=0.93, loss_mean_cls=2.32, proj_loss=-0.414][2026-03-23 13:49:48] Step: 3281, Training Logs: loss_final: 2.449069, loss_mean: 0.926388, proj_loss: -0.417769, loss_mean_cls: 1.940450, grad_norm: 1.698092 +Steps: 0%| | 3282/1000000 [13:29<67:26:08, 4.11it/s, grad_norm=1.7, loss_final=2.45, loss_mean=0.926, loss_mean_cls=1.94, proj_loss=-0.418][2026-03-23 13:49:49] Step: 3282, Training Logs: loss_final: 2.620974, loss_mean: 0.927233, proj_loss: -0.412583, loss_mean_cls: 2.106323, grad_norm: 6.502893 +Steps: 0%| | 3283/1000000 [13:29<67:26:44, 4.11it/s, grad_norm=6.5, loss_final=2.62, loss_mean=0.927, loss_mean_cls=2.11, proj_loss=-0.413][2026-03-23 13:49:49] Step: 3283, Training Logs: loss_final: 2.745610, loss_mean: 0.933005, proj_loss: -0.411118, loss_mean_cls: 2.223723, grad_norm: 2.556279 +Steps: 0%| | 3284/1000000 [13:29<67:24:44, 4.11it/s, grad_norm=2.56, loss_final=2.75, loss_mean=0.933, loss_mean_cls=2.22, proj_loss=-0.411][2026-03-23 13:49:49] Step: 3284, Training Logs: loss_final: 3.197166, loss_mean: 0.912554, proj_loss: -0.405601, loss_mean_cls: 2.690213, grad_norm: 7.023041 +Steps: 0%| | 3285/1000000 [13:29<67:23:51, 4.11it/s, grad_norm=7.02, loss_final=3.2, loss_mean=0.913, loss_mean_cls=2.69, proj_loss=-0.406][2026-03-23 13:49:49] Step: 3285, Training Logs: loss_final: 3.240285, loss_mean: 0.925493, proj_loss: -0.402680, loss_mean_cls: 2.717472, grad_norm: 3.795268 +Steps: 0%| | 3286/1000000 [13:30<67:22:59, 4.11it/s, grad_norm=3.8, loss_final=3.24, loss_mean=0.925, loss_mean_cls=2.72, proj_loss=-0.403][2026-03-23 13:49:50] Step: 3286, Training Logs: loss_final: 2.740087, loss_mean: 0.940836, proj_loss: -0.412700, loss_mean_cls: 2.211951, grad_norm: 12.829675 +Steps: 0%| | 3287/1000000 [13:30<67:23:27, 4.11it/s, grad_norm=12.8, loss_final=2.74, loss_mean=0.941, loss_mean_cls=2.21, proj_loss=-0.413][2026-03-23 13:49:50] Step: 3287, Training Logs: loss_final: 2.759655, loss_mean: 0.949060, proj_loss: -0.411103, loss_mean_cls: 2.221697, grad_norm: 16.878519 +Steps: 0%| | 3288/1000000 [13:30<67:25:39, 
4.11it/s, grad_norm=16.9, loss_final=2.76, loss_mean=0.949, loss_mean_cls=2.22, proj_loss=-0.411][2026-03-23 13:49:50] Step: 3288, Training Logs: loss_final: 3.054178, loss_mean: 0.938621, proj_loss: -0.402910, loss_mean_cls: 2.518467, grad_norm: 9.234226 +Steps: 0%| | 3289/1000000 [13:30<67:33:06, 4.10it/s, grad_norm=9.23, loss_final=3.05, loss_mean=0.939, loss_mean_cls=2.52, proj_loss=-0.403][2026-03-23 13:49:50] Step: 3289, Training Logs: loss_final: 3.230825, loss_mean: 0.921256, proj_loss: -0.393179, loss_mean_cls: 2.702748, grad_norm: 4.020617 +Steps: 0%| | 3290/1000000 [13:31<67:30:15, 4.10it/s, grad_norm=4.02, loss_final=3.23, loss_mean=0.921, loss_mean_cls=2.7, proj_loss=-0.393][2026-03-23 13:49:51] Step: 3290, Training Logs: loss_final: 2.841259, loss_mean: 0.927317, proj_loss: -0.412945, loss_mean_cls: 2.326887, grad_norm: 11.512490 +Steps: 0%| | 3291/1000000 [13:31<67:29:54, 4.10it/s, grad_norm=11.5, loss_final=2.84, loss_mean=0.927, loss_mean_cls=2.33, proj_loss=-0.413][2026-03-23 13:49:51] Step: 3291, Training Logs: loss_final: 2.420125, loss_mean: 0.943062, proj_loss: -0.421066, loss_mean_cls: 1.898129, grad_norm: 3.927660 +Steps: 0%| | 3292/1000000 [13:31<67:28:20, 4.10it/s, grad_norm=3.93, loss_final=2.42, loss_mean=0.943, loss_mean_cls=1.9, proj_loss=-0.421][2026-03-23 13:49:51] Step: 3292, Training Logs: loss_final: 2.795977, loss_mean: 0.925924, proj_loss: -0.412785, loss_mean_cls: 2.282838, grad_norm: 1.822837 +Steps: 0%| | 3293/1000000 [13:31<67:26:52, 4.10it/s, grad_norm=1.82, loss_final=2.8, loss_mean=0.926, loss_mean_cls=2.28, proj_loss=-0.413][2026-03-23 13:49:51] Step: 3293, Training Logs: loss_final: 2.775211, loss_mean: 0.929900, proj_loss: -0.415080, loss_mean_cls: 2.260391, grad_norm: 9.576942 +Steps: 0%| | 3294/1000000 [13:32<67:25:34, 4.11it/s, grad_norm=9.58, loss_final=2.78, loss_mean=0.93, loss_mean_cls=2.26, proj_loss=-0.415][2026-03-23 13:49:52] Step: 3294, Training Logs: loss_final: 2.859682, loss_mean: 0.942661, proj_loss: -0.414843, loss_mean_cls: 2.331864, grad_norm: 3.616511 +Steps: 0%| | 3295/1000000 [13:32<67:26:54, 4.10it/s, grad_norm=3.62, loss_final=2.86, loss_mean=0.943, loss_mean_cls=2.33, proj_loss=-0.415][2026-03-23 13:49:52] Step: 3295, Training Logs: loss_final: 3.226943, loss_mean: 0.915888, proj_loss: -0.403032, loss_mean_cls: 2.714087, grad_norm: 10.493486 +Steps: 0%| | 3296/1000000 [13:32<67:27:33, 4.10it/s, grad_norm=10.5, loss_final=3.23, loss_mean=0.916, loss_mean_cls=2.71, proj_loss=-0.403][2026-03-23 13:49:52] Step: 3296, Training Logs: loss_final: 2.487066, loss_mean: 0.932538, proj_loss: -0.413766, loss_mean_cls: 1.968294, grad_norm: 8.052335 +Steps: 0%| | 3297/1000000 [13:32<67:26:33, 4.11it/s, grad_norm=8.05, loss_final=2.49, loss_mean=0.933, loss_mean_cls=1.97, proj_loss=-0.414][2026-03-23 13:49:52] Step: 3297, Training Logs: loss_final: 2.784416, loss_mean: 0.926701, proj_loss: -0.409311, loss_mean_cls: 2.267026, grad_norm: 15.238099 +Steps: 0%| | 3298/1000000 [13:33<67:27:29, 4.10it/s, grad_norm=15.2, loss_final=2.78, loss_mean=0.927, loss_mean_cls=2.27, proj_loss=-0.409][2026-03-23 13:49:53] Step: 3298, Training Logs: loss_final: 2.365173, loss_mean: 0.936609, proj_loss: -0.417941, loss_mean_cls: 1.846505, grad_norm: 28.683994 +Steps: 0%| | 3299/1000000 [13:33<67:26:32, 4.11it/s, grad_norm=28.7, loss_final=2.37, loss_mean=0.937, loss_mean_cls=1.85, proj_loss=-0.418][2026-03-23 13:49:53] Step: 3299, Training Logs: loss_final: 3.103403, loss_mean: 0.951767, proj_loss: -0.407764, loss_mean_cls: 2.559400, grad_norm: 
21.937548 +Steps: 0%| | 3300/1000000 [13:33<67:28:01, 4.10it/s, grad_norm=21.9, loss_final=3.1, loss_mean=0.952, loss_mean_cls=2.56, proj_loss=-0.408][2026-03-23 13:49:53] Step: 3300, Training Logs: loss_final: 2.625476, loss_mean: 0.930297, proj_loss: -0.408758, loss_mean_cls: 2.103937, grad_norm: 15.633218 +Steps: 0%| | 3301/1000000 [13:33<67:24:58, 4.11it/s, grad_norm=15.6, loss_final=2.63, loss_mean=0.93, loss_mean_cls=2.1, proj_loss=-0.409][2026-03-23 13:49:53] Step: 3301, Training Logs: loss_final: 2.607422, loss_mean: 0.923420, proj_loss: -0.415966, loss_mean_cls: 2.099968, grad_norm: 2.508821 +Steps: 0%| | 3302/1000000 [13:34<69:54:07, 3.96it/s, grad_norm=2.51, loss_final=2.61, loss_mean=0.923, loss_mean_cls=2.1, proj_loss=-0.416][2026-03-23 13:49:54] Step: 3302, Training Logs: loss_final: 2.775134, loss_mean: 0.929641, proj_loss: -0.409776, loss_mean_cls: 2.255269, grad_norm: 11.645665 +Steps: 0%| | 3303/1000000 [13:34<69:18:48, 3.99it/s, grad_norm=11.6, loss_final=2.78, loss_mean=0.93, loss_mean_cls=2.26, proj_loss=-0.41][2026-03-23 13:49:54] Step: 3303, Training Logs: loss_final: 2.595722, loss_mean: 0.954614, proj_loss: -0.410759, loss_mean_cls: 2.051866, grad_norm: 4.639548 +Steps: 0%| | 3304/1000000 [13:34<68:46:08, 4.03it/s, grad_norm=4.64, loss_final=2.6, loss_mean=0.955, loss_mean_cls=2.05, proj_loss=-0.411][2026-03-23 13:49:54] Step: 3304, Training Logs: loss_final: 2.738998, loss_mean: 0.953443, proj_loss: -0.411747, loss_mean_cls: 2.197302, grad_norm: 12.513372 +Steps: 0%| | 3305/1000000 [13:34<68:21:13, 4.05it/s, grad_norm=12.5, loss_final=2.74, loss_mean=0.953, loss_mean_cls=2.2, proj_loss=-0.412][2026-03-23 13:49:54] Step: 3305, Training Logs: loss_final: 3.068235, loss_mean: 0.952616, proj_loss: -0.397818, loss_mean_cls: 2.513437, grad_norm: 7.798642 +Steps: 0%| | 3306/1000000 [13:35<68:06:19, 4.07it/s, grad_norm=7.8, loss_final=3.07, loss_mean=0.953, loss_mean_cls=2.51, proj_loss=-0.398][2026-03-23 13:49:55] Step: 3306, Training Logs: loss_final: 2.587224, loss_mean: 0.970256, proj_loss: -0.409042, loss_mean_cls: 2.026010, grad_norm: 21.523457 +Steps: 0%| | 3307/1000000 [13:35<67:54:24, 4.08it/s, grad_norm=21.5, loss_final=2.59, loss_mean=0.97, loss_mean_cls=2.03, proj_loss=-0.409][2026-03-23 13:49:55] Step: 3307, Training Logs: loss_final: 3.077407, loss_mean: 0.929592, proj_loss: -0.412076, loss_mean_cls: 2.559891, grad_norm: 15.642076 +Steps: 0%| | 3308/1000000 [13:35<67:44:39, 4.09it/s, grad_norm=15.6, loss_final=3.08, loss_mean=0.93, loss_mean_cls=2.56, proj_loss=-0.412][2026-03-23 13:49:55] Step: 3308, Training Logs: loss_final: 2.966524, loss_mean: 0.926834, proj_loss: -0.411257, loss_mean_cls: 2.450948, grad_norm: 14.146492 +Steps: 0%| | 3309/1000000 [13:35<67:37:36, 4.09it/s, grad_norm=14.1, loss_final=2.97, loss_mean=0.927, loss_mean_cls=2.45, proj_loss=-0.411][2026-03-23 13:49:55] Step: 3309, Training Logs: loss_final: 2.781656, loss_mean: 0.934849, proj_loss: -0.410285, loss_mean_cls: 2.257093, grad_norm: 15.160158 +Steps: 0%| | 3310/1000000 [13:36<67:34:49, 4.10it/s, grad_norm=15.2, loss_final=2.78, loss_mean=0.935, loss_mean_cls=2.26, proj_loss=-0.41][2026-03-23 13:49:56] Step: 3310, Training Logs: loss_final: 2.772974, loss_mean: 0.935989, proj_loss: -0.410413, loss_mean_cls: 2.247398, grad_norm: 10.421140 +Steps: 0%| | 3311/1000000 [13:36<67:32:19, 4.10it/s, grad_norm=10.4, loss_final=2.77, loss_mean=0.936, loss_mean_cls=2.25, proj_loss=-0.41][2026-03-23 13:49:56] Step: 3311, Training Logs: loss_final: 3.075415, loss_mean: 0.920853, proj_loss: 
Example console output during training (an excerpt around steps 3312–3560 of 1,000,000; throughput holds at roughly 3.9–4.1 it/s, i.e. about 67–68 hours remaining for the full run):

```
[2026-03-23 13:49:56] Step: 3312, Training Logs: loss_final: 3.112601, loss_mean: 0.919247, proj_loss: -0.408571, loss_mean_cls: 2.601925, grad_norm: 21.111031
Steps: 0%| | 3313/1000000 [13:36<67:28:31, 4.10it/s, grad_norm=21.1, loss_final=3.11, loss_mean=0.919, loss_mean_cls=2.6, proj_loss=-0.409]
[2026-03-23 13:50:54] Step: 3549, Training Logs: loss_final: 3.505221, loss_mean: 0.908998, proj_loss: -0.411066, loss_mean_cls: 3.007289, grad_norm: 22.627113
Steps: 0%| | 3550/1000000 [14:34<67:24:02, 4.11it/s, grad_norm=22.6, loss_final=3.51, loss_mean=0.909, loss_mean_cls=3.01, proj_loss=-0.411]
```

Across this window the logged values fluctuate without a clear trend, as expected this early in training: loss_final ≈ 2.16–3.60, loss_mean ≈ 0.89–1.00, loss_mean_cls ≈ 1.61–3.10, proj_loss ≈ −0.43 to −0.39, and grad_norm ≈ 1.6–33.1. In every record, loss_final equals loss_mean + loss_mean_cls + proj_loss exactly; proj_loss stays negative throughout, consistent with an alignment term that rewards similarity, and the class-token term loss_mean_cls dominates the total at this stage.
grad_norm=12.1, loss_final=2.45, loss_mean=0.948, loss_mean_cls=1.92, proj_loss=-0.418][2026-03-23 13:50:57] Step: 3560, Training Logs: loss_final: 2.556751, loss_mean: 0.929700, proj_loss: -0.413421, loss_mean_cls: 2.040473, grad_norm: 4.964212 +Steps: 0%| | 3561/1000000 [14:37<67:22:17, 4.11it/s, grad_norm=4.96, loss_final=2.56, loss_mean=0.93, loss_mean_cls=2.04, proj_loss=-0.413][2026-03-23 13:50:57] Step: 3561, Training Logs: loss_final: 3.161861, loss_mean: 0.923584, proj_loss: -0.407249, loss_mean_cls: 2.645526, grad_norm: 17.462347 +Steps: 0%| | 3562/1000000 [14:37<67:22:55, 4.11it/s, grad_norm=17.5, loss_final=3.16, loss_mean=0.924, loss_mean_cls=2.65, proj_loss=-0.407][2026-03-23 13:50:57] Step: 3562, Training Logs: loss_final: 3.378855, loss_mean: 0.899211, proj_loss: -0.406053, loss_mean_cls: 2.885697, grad_norm: 13.922083 +Steps: 0%| | 3563/1000000 [14:37<67:22:23, 4.11it/s, grad_norm=13.9, loss_final=3.38, loss_mean=0.899, loss_mean_cls=2.89, proj_loss=-0.406][2026-03-23 13:50:57] Step: 3563, Training Logs: loss_final: 2.935030, loss_mean: 0.939550, proj_loss: -0.418176, loss_mean_cls: 2.413656, grad_norm: 17.403635 +Steps: 0%| | 3564/1000000 [14:38<67:22:41, 4.11it/s, grad_norm=17.4, loss_final=2.94, loss_mean=0.94, loss_mean_cls=2.41, proj_loss=-0.418][2026-03-23 13:50:58] Step: 3564, Training Logs: loss_final: 2.550471, loss_mean: 0.948388, proj_loss: -0.417018, loss_mean_cls: 2.019100, grad_norm: 18.366987 +Steps: 0%| | 3565/1000000 [14:38<67:22:25, 4.11it/s, grad_norm=18.4, loss_final=2.55, loss_mean=0.948, loss_mean_cls=2.02, proj_loss=-0.417][2026-03-23 13:50:58] Step: 3565, Training Logs: loss_final: 2.791165, loss_mean: 0.942991, proj_loss: -0.416923, loss_mean_cls: 2.265096, grad_norm: 3.212159 +Steps: 0%| | 3566/1000000 [14:38<67:23:26, 4.11it/s, grad_norm=3.21, loss_final=2.79, loss_mean=0.943, loss_mean_cls=2.27, proj_loss=-0.417][2026-03-23 13:50:58] Step: 3566, Training Logs: loss_final: 2.752888, loss_mean: 0.933392, proj_loss: -0.415047, loss_mean_cls: 2.234543, grad_norm: 14.182187 +Steps: 0%| | 3567/1000000 [14:38<67:23:38, 4.11it/s, grad_norm=14.2, loss_final=2.75, loss_mean=0.933, loss_mean_cls=2.23, proj_loss=-0.415][2026-03-23 13:50:58] Step: 3567, Training Logs: loss_final: 3.143980, loss_mean: 0.925464, proj_loss: -0.407261, loss_mean_cls: 2.625778, grad_norm: 14.162212 +Steps: 0%| | 3568/1000000 [14:39<67:24:34, 4.11it/s, grad_norm=14.2, loss_final=3.14, loss_mean=0.925, loss_mean_cls=2.63, proj_loss=-0.407][2026-03-23 13:50:59] Step: 3568, Training Logs: loss_final: 2.634227, loss_mean: 0.931651, proj_loss: -0.416798, loss_mean_cls: 2.119374, grad_norm: 8.245468 +Steps: 0%| | 3569/1000000 [14:39<67:24:09, 4.11it/s, grad_norm=8.25, loss_final=2.63, loss_mean=0.932, loss_mean_cls=2.12, proj_loss=-0.417][2026-03-23 13:50:59] Step: 3569, Training Logs: loss_final: 2.735701, loss_mean: 0.953526, proj_loss: -0.416571, loss_mean_cls: 2.198745, grad_norm: 12.934089 +Steps: 0%| | 3570/1000000 [14:39<67:23:56, 4.11it/s, grad_norm=12.9, loss_final=2.74, loss_mean=0.954, loss_mean_cls=2.2, proj_loss=-0.417][2026-03-23 13:50:59] Step: 3570, Training Logs: loss_final: 2.705929, loss_mean: 0.950252, proj_loss: -0.415915, loss_mean_cls: 2.171592, grad_norm: 13.088720 +Steps: 0%| | 3571/1000000 [14:39<67:24:09, 4.11it/s, grad_norm=13.1, loss_final=2.71, loss_mean=0.95, loss_mean_cls=2.17, proj_loss=-0.416][2026-03-23 13:50:59] Step: 3571, Training Logs: loss_final: 2.820789, loss_mean: 0.937783, proj_loss: -0.418263, loss_mean_cls: 2.301269, grad_norm: 9.012239 
+Steps: 0%| | 3572/1000000 [14:40<67:24:55, 4.11it/s, grad_norm=9.01, loss_final=2.82, loss_mean=0.938, loss_mean_cls=2.3, proj_loss=-0.418][2026-03-23 13:50:59] Step: 3572, Training Logs: loss_final: 2.700530, loss_mean: 0.930463, proj_loss: -0.415761, loss_mean_cls: 2.185827, grad_norm: 5.316518 +Steps: 0%| | 3573/1000000 [14:40<67:23:31, 4.11it/s, grad_norm=5.32, loss_final=2.7, loss_mean=0.93, loss_mean_cls=2.19, proj_loss=-0.416][2026-03-23 13:51:00] Step: 3573, Training Logs: loss_final: 2.789985, loss_mean: 0.924575, proj_loss: -0.416514, loss_mean_cls: 2.281923, grad_norm: 28.907528 +Steps: 0%| | 3574/1000000 [14:40<67:23:15, 4.11it/s, grad_norm=28.9, loss_final=2.79, loss_mean=0.925, loss_mean_cls=2.28, proj_loss=-0.417][2026-03-23 13:51:00] Step: 3574, Training Logs: loss_final: 2.867531, loss_mean: 0.934992, proj_loss: -0.417865, loss_mean_cls: 2.350405, grad_norm: 28.563562 +Steps: 0%| | 3575/1000000 [14:40<67:22:45, 4.11it/s, grad_norm=28.6, loss_final=2.87, loss_mean=0.935, loss_mean_cls=2.35, proj_loss=-0.418][2026-03-23 13:51:00] Step: 3575, Training Logs: loss_final: 2.836866, loss_mean: 0.956304, proj_loss: -0.415028, loss_mean_cls: 2.295590, grad_norm: 15.092492 +Steps: 0%| | 3576/1000000 [14:41<67:22:01, 4.11it/s, grad_norm=15.1, loss_final=2.84, loss_mean=0.956, loss_mean_cls=2.3, proj_loss=-0.415][2026-03-23 13:51:00] Step: 3576, Training Logs: loss_final: 2.975250, loss_mean: 0.957603, proj_loss: -0.414917, loss_mean_cls: 2.432564, grad_norm: 10.821575 +Steps: 0%| | 3577/1000000 [14:41<67:23:56, 4.11it/s, grad_norm=10.8, loss_final=2.98, loss_mean=0.958, loss_mean_cls=2.43, proj_loss=-0.415][2026-03-23 13:51:01] Step: 3577, Training Logs: loss_final: 2.525812, loss_mean: 0.944001, proj_loss: -0.420210, loss_mean_cls: 2.002021, grad_norm: 5.062121 +Steps: 0%| | 3578/1000000 [14:41<67:25:03, 4.11it/s, grad_norm=5.06, loss_final=2.53, loss_mean=0.944, loss_mean_cls=2, proj_loss=-0.42][2026-03-23 13:51:01] Step: 3578, Training Logs: loss_final: 2.950488, loss_mean: 0.915640, proj_loss: -0.415931, loss_mean_cls: 2.450779, grad_norm: 8.029245 +Steps: 0%| | 3579/1000000 [14:41<67:23:23, 4.11it/s, grad_norm=8.03, loss_final=2.95, loss_mean=0.916, loss_mean_cls=2.45, proj_loss=-0.416][2026-03-23 13:51:01] Step: 3579, Training Logs: loss_final: 2.880563, loss_mean: 0.938455, proj_loss: -0.410587, loss_mean_cls: 2.352696, grad_norm: 26.695129 +Steps: 0%| | 3580/1000000 [14:41<67:25:40, 4.10it/s, grad_norm=26.7, loss_final=2.88, loss_mean=0.938, loss_mean_cls=2.35, proj_loss=-0.411][2026-03-23 13:51:01] Step: 3580, Training Logs: loss_final: 2.588458, loss_mean: 0.932581, proj_loss: -0.417274, loss_mean_cls: 2.073150, grad_norm: 4.376892 +Steps: 0%| | 3581/1000000 [14:42<67:24:00, 4.11it/s, grad_norm=4.38, loss_final=2.59, loss_mean=0.933, loss_mean_cls=2.07, proj_loss=-0.417][2026-03-23 13:51:02] Step: 3581, Training Logs: loss_final: 2.649533, loss_mean: 0.935368, proj_loss: -0.423505, loss_mean_cls: 2.137671, grad_norm: 5.749558 +Steps: 0%| | 3582/1000000 [14:42<67:23:19, 4.11it/s, grad_norm=5.75, loss_final=2.65, loss_mean=0.935, loss_mean_cls=2.14, proj_loss=-0.424][2026-03-23 13:51:02] Step: 3582, Training Logs: loss_final: 2.937214, loss_mean: 0.920820, proj_loss: -0.408762, loss_mean_cls: 2.425156, grad_norm: 11.494907 +Steps: 0%| | 3583/1000000 [14:42<67:24:48, 4.11it/s, grad_norm=11.5, loss_final=2.94, loss_mean=0.921, loss_mean_cls=2.43, proj_loss=-0.409][2026-03-23 13:51:02] Step: 3583, Training Logs: loss_final: 2.606420, loss_mean: 0.928064, proj_loss: -0.420600, 
loss_mean_cls: 2.098955, grad_norm: 5.583880 +Steps: 0%| | 3584/1000000 [14:42<67:25:08, 4.11it/s, grad_norm=5.58, loss_final=2.61, loss_mean=0.928, loss_mean_cls=2.1, proj_loss=-0.421][2026-03-23 13:51:02] Step: 3584, Training Logs: loss_final: 2.685480, loss_mean: 0.938690, proj_loss: -0.415970, loss_mean_cls: 2.162760, grad_norm: 7.748720 +Steps: 0%| | 3585/1000000 [14:43<67:25:06, 4.11it/s, grad_norm=7.75, loss_final=2.69, loss_mean=0.939, loss_mean_cls=2.16, proj_loss=-0.416][2026-03-23 13:51:03] Step: 3585, Training Logs: loss_final: 3.492564, loss_mean: 0.921491, proj_loss: -0.402711, loss_mean_cls: 2.973785, grad_norm: 24.725101 +Steps: 0%| | 3586/1000000 [14:43<67:26:21, 4.10it/s, grad_norm=24.7, loss_final=3.49, loss_mean=0.921, loss_mean_cls=2.97, proj_loss=-0.403][2026-03-23 13:51:03] Step: 3586, Training Logs: loss_final: 2.688034, loss_mean: 0.959997, proj_loss: -0.412850, loss_mean_cls: 2.140888, grad_norm: 4.605812 +Steps: 0%| | 3587/1000000 [14:43<67:23:41, 4.11it/s, grad_norm=4.61, loss_final=2.69, loss_mean=0.96, loss_mean_cls=2.14, proj_loss=-0.413][2026-03-23 13:51:03] Step: 3587, Training Logs: loss_final: 2.599898, loss_mean: 0.945781, proj_loss: -0.422651, loss_mean_cls: 2.076769, grad_norm: 14.201969 +Steps: 0%| | 3588/1000000 [14:43<67:24:51, 4.11it/s, grad_norm=14.2, loss_final=2.6, loss_mean=0.946, loss_mean_cls=2.08, proj_loss=-0.423][2026-03-23 13:51:03] Step: 3588, Training Logs: loss_final: 2.827501, loss_mean: 0.921292, proj_loss: -0.420573, loss_mean_cls: 2.326783, grad_norm: 14.538508 +Steps: 0%| | 3589/1000000 [14:44<67:28:10, 4.10it/s, grad_norm=14.5, loss_final=2.83, loss_mean=0.921, loss_mean_cls=2.33, proj_loss=-0.421][2026-03-23 13:51:04] Step: 3589, Training Logs: loss_final: 2.720268, loss_mean: 0.924750, proj_loss: -0.422924, loss_mean_cls: 2.218442, grad_norm: 10.752731 +Steps: 0%| | 3590/1000000 [14:44<67:32:38, 4.10it/s, grad_norm=10.8, loss_final=2.72, loss_mean=0.925, loss_mean_cls=2.22, proj_loss=-0.423][2026-03-23 13:51:04] Step: 3590, Training Logs: loss_final: 2.519767, loss_mean: 0.919276, proj_loss: -0.427680, loss_mean_cls: 2.028172, grad_norm: 3.117637 +Steps: 0%| | 3591/1000000 [14:44<67:28:52, 4.10it/s, grad_norm=3.12, loss_final=2.52, loss_mean=0.919, loss_mean_cls=2.03, proj_loss=-0.428][2026-03-23 13:51:04] Step: 3591, Training Logs: loss_final: 2.888832, loss_mean: 0.901248, proj_loss: -0.418627, loss_mean_cls: 2.406211, grad_norm: 17.439472 +Steps: 0%| | 3592/1000000 [14:44<67:27:47, 4.10it/s, grad_norm=17.4, loss_final=2.89, loss_mean=0.901, loss_mean_cls=2.41, proj_loss=-0.419][2026-03-23 13:51:04] Step: 3592, Training Logs: loss_final: 2.712989, loss_mean: 0.944588, proj_loss: -0.421057, loss_mean_cls: 2.189459, grad_norm: 12.131833 +Steps: 0%| | 3593/1000000 [14:45<67:25:46, 4.10it/s, grad_norm=12.1, loss_final=2.71, loss_mean=0.945, loss_mean_cls=2.19, proj_loss=-0.421][2026-03-23 13:51:05] Step: 3593, Training Logs: loss_final: 2.896249, loss_mean: 0.925864, proj_loss: -0.417539, loss_mean_cls: 2.387924, grad_norm: 18.988712 +Steps: 0%| | 3594/1000000 [14:45<67:25:53, 4.10it/s, grad_norm=19, loss_final=2.9, loss_mean=0.926, loss_mean_cls=2.39, proj_loss=-0.418][2026-03-23 13:51:05] Step: 3594, Training Logs: loss_final: 2.847698, loss_mean: 0.917102, proj_loss: -0.413318, loss_mean_cls: 2.343914, grad_norm: 17.163853 +Steps: 0%| | 3595/1000000 [14:45<67:25:39, 4.10it/s, grad_norm=17.2, loss_final=2.85, loss_mean=0.917, loss_mean_cls=2.34, proj_loss=-0.413][2026-03-23 13:51:05] Step: 3595, Training Logs: loss_final: 
3.004645, loss_mean: 0.930100, proj_loss: -0.411236, loss_mean_cls: 2.485781, grad_norm: 14.122970 +Steps: 0%| | 3596/1000000 [14:45<67:24:33, 4.11it/s, grad_norm=14.1, loss_final=3, loss_mean=0.93, loss_mean_cls=2.49, proj_loss=-0.411][2026-03-23 13:51:05] Step: 3596, Training Logs: loss_final: 3.088334, loss_mean: 0.917526, proj_loss: -0.413350, loss_mean_cls: 2.584158, grad_norm: 33.058552 +Steps: 0%| | 3597/1000000 [14:46<67:24:22, 4.11it/s, grad_norm=33.1, loss_final=3.09, loss_mean=0.918, loss_mean_cls=2.58, proj_loss=-0.413][2026-03-23 13:51:06] Step: 3597, Training Logs: loss_final: 2.482220, loss_mean: 0.949953, proj_loss: -0.420880, loss_mean_cls: 1.953147, grad_norm: 14.105314 +Steps: 0%| | 3598/1000000 [14:46<67:23:40, 4.11it/s, grad_norm=14.1, loss_final=2.48, loss_mean=0.95, loss_mean_cls=1.95, proj_loss=-0.421][2026-03-23 13:51:06] Step: 3598, Training Logs: loss_final: 2.459580, loss_mean: 0.926596, proj_loss: -0.418443, loss_mean_cls: 1.951426, grad_norm: 24.762775 +Steps: 0%| | 3599/1000000 [14:46<67:24:27, 4.11it/s, grad_norm=24.8, loss_final=2.46, loss_mean=0.927, loss_mean_cls=1.95, proj_loss=-0.418][2026-03-23 13:51:06] Step: 3599, Training Logs: loss_final: 2.850249, loss_mean: 0.908973, proj_loss: -0.416481, loss_mean_cls: 2.357758, grad_norm: 2.454834 +Steps: 0%| | 3600/1000000 [14:46<67:38:08, 4.09it/s, grad_norm=2.45, loss_final=2.85, loss_mean=0.909, loss_mean_cls=2.36, proj_loss=-0.416][2026-03-23 13:51:06] Step: 3600, Training Logs: loss_final: 2.895835, loss_mean: 0.926255, proj_loss: -0.418470, loss_mean_cls: 2.388051, grad_norm: 12.171497 +Steps: 0%| | 3601/1000000 [14:47<67:35:13, 4.10it/s, grad_norm=12.2, loss_final=2.9, loss_mean=0.926, loss_mean_cls=2.39, proj_loss=-0.418][2026-03-23 13:51:07] Step: 3601, Training Logs: loss_final: 2.675901, loss_mean: 0.925105, proj_loss: -0.422480, loss_mean_cls: 2.173277, grad_norm: 34.680344 +Steps: 0%| | 3602/1000000 [14:47<67:32:50, 4.10it/s, grad_norm=34.7, loss_final=2.68, loss_mean=0.925, loss_mean_cls=2.17, proj_loss=-0.422][2026-03-23 13:51:07] Step: 3602, Training Logs: loss_final: 2.948853, loss_mean: 0.924899, proj_loss: -0.417632, loss_mean_cls: 2.441586, grad_norm: 3.283672 +Steps: 0%| | 3603/1000000 [14:47<67:31:15, 4.10it/s, grad_norm=3.28, loss_final=2.95, loss_mean=0.925, loss_mean_cls=2.44, proj_loss=-0.418][2026-03-23 13:51:07] Step: 3603, Training Logs: loss_final: 3.543044, loss_mean: 0.892047, proj_loss: -0.405389, loss_mean_cls: 3.056386, grad_norm: 16.481335 +Steps: 0%| | 3604/1000000 [14:47<67:30:25, 4.10it/s, grad_norm=16.5, loss_final=3.54, loss_mean=0.892, loss_mean_cls=3.06, proj_loss=-0.405][2026-03-23 13:51:07] Step: 3604, Training Logs: loss_final: 2.664320, loss_mean: 0.937428, proj_loss: -0.419596, loss_mean_cls: 2.146487, grad_norm: 17.036306 +Steps: 0%| | 3605/1000000 [14:48<67:29:02, 4.10it/s, grad_norm=17, loss_final=2.66, loss_mean=0.937, loss_mean_cls=2.15, proj_loss=-0.42][2026-03-23 13:51:08] Step: 3605, Training Logs: loss_final: 3.011159, loss_mean: 0.940302, proj_loss: -0.408293, loss_mean_cls: 2.479150, grad_norm: 6.080274 +Steps: 0%| | 3606/1000000 [14:48<67:28:40, 4.10it/s, grad_norm=6.08, loss_final=3.01, loss_mean=0.94, loss_mean_cls=2.48, proj_loss=-0.408][2026-03-23 13:51:08] Step: 3606, Training Logs: loss_final: 3.145655, loss_mean: 0.917466, proj_loss: -0.408813, loss_mean_cls: 2.637002, grad_norm: 8.343206 +Steps: 0%| | 3607/1000000 [14:48<67:28:56, 4.10it/s, grad_norm=8.34, loss_final=3.15, loss_mean=0.917, loss_mean_cls=2.64, proj_loss=-0.409][2026-03-23 
13:51:08] Step: 3607, Training Logs: loss_final: 2.354536, loss_mean: 0.950560, proj_loss: -0.425817, loss_mean_cls: 1.829792, grad_norm: 11.437060 +Steps: 0%| | 3608/1000000 [14:48<67:27:16, 4.10it/s, grad_norm=11.4, loss_final=2.35, loss_mean=0.951, loss_mean_cls=1.83, proj_loss=-0.426][2026-03-23 13:51:08] Step: 3608, Training Logs: loss_final: 2.729408, loss_mean: 0.910265, proj_loss: -0.418982, loss_mean_cls: 2.238125, grad_norm: 3.489802 +Steps: 0%| | 3609/1000000 [14:49<67:27:43, 4.10it/s, grad_norm=3.49, loss_final=2.73, loss_mean=0.91, loss_mean_cls=2.24, proj_loss=-0.419][2026-03-23 13:51:09] Step: 3609, Training Logs: loss_final: 2.753879, loss_mean: 0.928107, proj_loss: -0.416872, loss_mean_cls: 2.242645, grad_norm: 9.000393 +Steps: 0%| | 3610/1000000 [14:49<67:28:17, 4.10it/s, grad_norm=9, loss_final=2.75, loss_mean=0.928, loss_mean_cls=2.24, proj_loss=-0.417][2026-03-23 13:51:09] Step: 3610, Training Logs: loss_final: 2.984417, loss_mean: 0.922249, proj_loss: -0.419101, loss_mean_cls: 2.481269, grad_norm: 8.070186 +Steps: 0%| | 3611/1000000 [14:49<67:29:17, 4.10it/s, grad_norm=8.07, loss_final=2.98, loss_mean=0.922, loss_mean_cls=2.48, proj_loss=-0.419][2026-03-23 13:51:09] Step: 3611, Training Logs: loss_final: 2.430685, loss_mean: 0.943916, proj_loss: -0.423099, loss_mean_cls: 1.909868, grad_norm: 5.225240 +Steps: 0%| | 3612/1000000 [14:49<67:28:55, 4.10it/s, grad_norm=5.23, loss_final=2.43, loss_mean=0.944, loss_mean_cls=1.91, proj_loss=-0.423][2026-03-23 13:51:09] Step: 3612, Training Logs: loss_final: 3.062722, loss_mean: 0.895049, proj_loss: -0.412324, loss_mean_cls: 2.579998, grad_norm: 6.251221 +Steps: 0%| | 3613/1000000 [14:50<67:29:03, 4.10it/s, grad_norm=6.25, loss_final=3.06, loss_mean=0.895, loss_mean_cls=2.58, proj_loss=-0.412][2026-03-23 13:51:09] Step: 3613, Training Logs: loss_final: 2.954019, loss_mean: 0.916950, proj_loss: -0.416236, loss_mean_cls: 2.453305, grad_norm: 15.155474 +Steps: 0%| | 3614/1000000 [14:50<67:28:19, 4.10it/s, grad_norm=15.2, loss_final=2.95, loss_mean=0.917, loss_mean_cls=2.45, proj_loss=-0.416][2026-03-23 13:51:10] Step: 3614, Training Logs: loss_final: 2.590234, loss_mean: 0.934143, proj_loss: -0.424391, loss_mean_cls: 2.080482, grad_norm: 4.156639 +Steps: 0%| | 3615/1000000 [14:50<67:27:15, 4.10it/s, grad_norm=4.16, loss_final=2.59, loss_mean=0.934, loss_mean_cls=2.08, proj_loss=-0.424][2026-03-23 13:51:10] Step: 3615, Training Logs: loss_final: 2.652918, loss_mean: 0.949490, proj_loss: -0.420448, loss_mean_cls: 2.123875, grad_norm: 17.418392 +Steps: 0%| | 3616/1000000 [14:50<67:28:32, 4.10it/s, grad_norm=17.4, loss_final=2.65, loss_mean=0.949, loss_mean_cls=2.12, proj_loss=-0.42][2026-03-23 13:51:10] Step: 3616, Training Logs: loss_final: 2.827809, loss_mean: 0.918881, proj_loss: -0.417946, loss_mean_cls: 2.326874, grad_norm: 1.847610 +Steps: 0%| | 3617/1000000 [14:50<67:31:50, 4.10it/s, grad_norm=1.85, loss_final=2.83, loss_mean=0.919, loss_mean_cls=2.33, proj_loss=-0.418][2026-03-23 13:51:10] Step: 3617, Training Logs: loss_final: 2.765323, loss_mean: 0.923486, proj_loss: -0.421926, loss_mean_cls: 2.263763, grad_norm: 7.843616 +Steps: 0%| | 3618/1000000 [14:51<67:30:27, 4.10it/s, grad_norm=7.84, loss_final=2.77, loss_mean=0.923, loss_mean_cls=2.26, proj_loss=-0.422][2026-03-23 13:51:11] Step: 3618, Training Logs: loss_final: 2.999261, loss_mean: 0.914942, proj_loss: -0.412915, loss_mean_cls: 2.497235, grad_norm: 23.551123 +Steps: 0%| | 3619/1000000 [14:51<67:29:03, 4.10it/s, grad_norm=23.6, loss_final=3, loss_mean=0.915, 
loss_mean_cls=2.5, proj_loss=-0.413][2026-03-23 13:51:11] Step: 3619, Training Logs: loss_final: 2.997399, loss_mean: 0.932994, proj_loss: -0.410607, loss_mean_cls: 2.475013, grad_norm: 5.471592 +Steps: 0%| | 3620/1000000 [14:51<67:27:48, 4.10it/s, grad_norm=5.47, loss_final=3, loss_mean=0.933, loss_mean_cls=2.48, proj_loss=-0.411][2026-03-23 13:51:11] Step: 3620, Training Logs: loss_final: 3.066098, loss_mean: 0.921595, proj_loss: -0.412318, loss_mean_cls: 2.556822, grad_norm: 14.109632 +Steps: 0%| | 3621/1000000 [14:51<67:26:23, 4.10it/s, grad_norm=14.1, loss_final=3.07, loss_mean=0.922, loss_mean_cls=2.56, proj_loss=-0.412][2026-03-23 13:51:11] Step: 3621, Training Logs: loss_final: 2.866946, loss_mean: 0.919159, proj_loss: -0.414780, loss_mean_cls: 2.362567, grad_norm: 23.313320 +Steps: 0%| | 3622/1000000 [14:52<67:26:49, 4.10it/s, grad_norm=23.3, loss_final=2.87, loss_mean=0.919, loss_mean_cls=2.36, proj_loss=-0.415][2026-03-23 13:51:12] Step: 3622, Training Logs: loss_final: 1.982582, loss_mean: 0.957932, proj_loss: -0.430416, loss_mean_cls: 1.455066, grad_norm: 4.494435 +Steps: 0%| | 3623/1000000 [14:52<67:26:37, 4.10it/s, grad_norm=4.49, loss_final=1.98, loss_mean=0.958, loss_mean_cls=1.46, proj_loss=-0.43][2026-03-23 13:51:12] Step: 3623, Training Logs: loss_final: 2.895299, loss_mean: 0.931613, proj_loss: -0.414765, loss_mean_cls: 2.378451, grad_norm: 1.686479 +Steps: 0%| | 3624/1000000 [14:52<67:24:34, 4.11it/s, grad_norm=1.69, loss_final=2.9, loss_mean=0.932, loss_mean_cls=2.38, proj_loss=-0.415][2026-03-23 13:51:12] Step: 3624, Training Logs: loss_final: 2.802351, loss_mean: 0.904263, proj_loss: -0.417909, loss_mean_cls: 2.315996, grad_norm: 2.212652 +Steps: 0%| | 3625/1000000 [14:52<67:23:25, 4.11it/s, grad_norm=2.21, loss_final=2.8, loss_mean=0.904, loss_mean_cls=2.32, proj_loss=-0.418][2026-03-23 13:51:12] Step: 3625, Training Logs: loss_final: 2.847196, loss_mean: 0.932922, proj_loss: -0.416892, loss_mean_cls: 2.331166, grad_norm: 15.646891 +Steps: 0%| | 3626/1000000 [14:53<67:24:16, 4.11it/s, grad_norm=15.6, loss_final=2.85, loss_mean=0.933, loss_mean_cls=2.33, proj_loss=-0.417][2026-03-23 13:51:13] Step: 3626, Training Logs: loss_final: 2.777875, loss_mean: 0.937012, proj_loss: -0.414883, loss_mean_cls: 2.255746, grad_norm: 26.021006 +Steps: 0%| | 3627/1000000 [14:53<67:25:05, 4.11it/s, grad_norm=26, loss_final=2.78, loss_mean=0.937, loss_mean_cls=2.26, proj_loss=-0.415][2026-03-23 13:51:13] Step: 3627, Training Logs: loss_final: 2.407496, loss_mean: 0.962679, proj_loss: -0.422207, loss_mean_cls: 1.867025, grad_norm: 11.491070 +Steps: 0%| | 3628/1000000 [14:53<67:25:50, 4.10it/s, grad_norm=11.5, loss_final=2.41, loss_mean=0.963, loss_mean_cls=1.87, proj_loss=-0.422][2026-03-23 13:51:13] Step: 3628, Training Logs: loss_final: 2.716016, loss_mean: 0.941314, proj_loss: -0.410216, loss_mean_cls: 2.184919, grad_norm: 14.996081 +Steps: 0%| | 3629/1000000 [14:53<67:24:33, 4.11it/s, grad_norm=15, loss_final=2.72, loss_mean=0.941, loss_mean_cls=2.18, proj_loss=-0.41][2026-03-23 13:51:13] Step: 3629, Training Logs: loss_final: 2.356910, loss_mean: 0.970091, proj_loss: -0.416771, loss_mean_cls: 1.803590, grad_norm: 4.202273 +Steps: 0%| | 3630/1000000 [14:54<67:23:54, 4.11it/s, grad_norm=4.2, loss_final=2.36, loss_mean=0.97, loss_mean_cls=1.8, proj_loss=-0.417][2026-03-23 13:51:14] Step: 3630, Training Logs: loss_final: 2.760252, loss_mean: 0.954470, proj_loss: -0.416827, loss_mean_cls: 2.222610, grad_norm: 19.466654 +Steps: 0%| | 3631/1000000 [14:54<67:24:21, 4.11it/s, 
grad_norm=19.5, loss_final=2.76, loss_mean=0.954, loss_mean_cls=2.22, proj_loss=-0.417][2026-03-23 13:51:14] Step: 3631, Training Logs: loss_final: 2.917955, loss_mean: 0.949777, proj_loss: -0.413886, loss_mean_cls: 2.382064, grad_norm: 40.449081 +Steps: 0%| | 3632/1000000 [14:54<67:26:41, 4.10it/s, grad_norm=40.4, loss_final=2.92, loss_mean=0.95, loss_mean_cls=2.38, proj_loss=-0.414][2026-03-23 13:51:14] Step: 3632, Training Logs: loss_final: 2.981015, loss_mean: 0.936792, proj_loss: -0.413146, loss_mean_cls: 2.457369, grad_norm: 23.991920 +Steps: 0%| | 3633/1000000 [14:54<67:26:41, 4.10it/s, grad_norm=24, loss_final=2.98, loss_mean=0.937, loss_mean_cls=2.46, proj_loss=-0.413][2026-03-23 13:51:14] Step: 3633, Training Logs: loss_final: 2.845486, loss_mean: 0.943437, proj_loss: -0.406526, loss_mean_cls: 2.308575, grad_norm: 3.260093 +Steps: 0%| | 3634/1000000 [14:55<67:27:01, 4.10it/s, grad_norm=3.26, loss_final=2.85, loss_mean=0.943, loss_mean_cls=2.31, proj_loss=-0.407][2026-03-23 13:51:15] Step: 3634, Training Logs: loss_final: 2.579014, loss_mean: 0.933645, proj_loss: -0.419983, loss_mean_cls: 2.065351, grad_norm: 33.180950 +Steps: 0%| | 3635/1000000 [14:55<67:27:03, 4.10it/s, grad_norm=33.2, loss_final=2.58, loss_mean=0.934, loss_mean_cls=2.07, proj_loss=-0.42][2026-03-23 13:51:15] Step: 3635, Training Logs: loss_final: 2.885389, loss_mean: 0.952927, proj_loss: -0.409219, loss_mean_cls: 2.341681, grad_norm: 6.810179 +Steps: 0%| | 3636/1000000 [14:55<67:29:27, 4.10it/s, grad_norm=6.81, loss_final=2.89, loss_mean=0.953, loss_mean_cls=2.34, proj_loss=-0.409][2026-03-23 13:51:15] Step: 3636, Training Logs: loss_final: 2.790379, loss_mean: 0.929908, proj_loss: -0.414698, loss_mean_cls: 2.275168, grad_norm: 13.608640 +Steps: 0%| | 3637/1000000 [14:55<67:27:54, 4.10it/s, grad_norm=13.6, loss_final=2.79, loss_mean=0.93, loss_mean_cls=2.28, proj_loss=-0.415][2026-03-23 13:51:15] Step: 3637, Training Logs: loss_final: 2.645614, loss_mean: 0.946963, proj_loss: -0.411324, loss_mean_cls: 2.109975, grad_norm: 20.674137 +Steps: 0%| | 3638/1000000 [14:56<67:27:29, 4.10it/s, grad_norm=20.7, loss_final=2.65, loss_mean=0.947, loss_mean_cls=2.11, proj_loss=-0.411][2026-03-23 13:51:16] Step: 3638, Training Logs: loss_final: 3.064563, loss_mean: 0.939390, proj_loss: -0.411596, loss_mean_cls: 2.536769, grad_norm: 12.679941 +Steps: 0%| | 3639/1000000 [14:56<67:29:02, 4.10it/s, grad_norm=12.7, loss_final=3.06, loss_mean=0.939, loss_mean_cls=2.54, proj_loss=-0.412][2026-03-23 13:51:16] Step: 3639, Training Logs: loss_final: 2.759319, loss_mean: 0.927798, proj_loss: -0.412366, loss_mean_cls: 2.243887, grad_norm: 28.271706 +Steps: 0%| | 3640/1000000 [14:56<67:28:33, 4.10it/s, grad_norm=28.3, loss_final=2.76, loss_mean=0.928, loss_mean_cls=2.24, proj_loss=-0.412][2026-03-23 13:51:16] Step: 3640, Training Logs: loss_final: 3.092534, loss_mean: 0.920674, proj_loss: -0.414303, loss_mean_cls: 2.586163, grad_norm: 21.617527 +Steps: 0%| | 3641/1000000 [14:56<67:28:30, 4.10it/s, grad_norm=21.6, loss_final=3.09, loss_mean=0.921, loss_mean_cls=2.59, proj_loss=-0.414][2026-03-23 13:51:16] Step: 3641, Training Logs: loss_final: 2.538663, loss_mean: 0.919979, proj_loss: -0.417545, loss_mean_cls: 2.036229, grad_norm: 7.774411 +Steps: 0%| | 3642/1000000 [14:57<67:32:16, 4.10it/s, grad_norm=7.77, loss_final=2.54, loss_mean=0.92, loss_mean_cls=2.04, proj_loss=-0.418][2026-03-23 13:51:17] Step: 3642, Training Logs: loss_final: 2.815639, loss_mean: 0.914732, proj_loss: -0.416819, loss_mean_cls: 2.317727, grad_norm: 8.551237 
+Steps: 0%| | 3643/1000000 [14:57<67:30:33, 4.10it/s, grad_norm=8.55, loss_final=2.82, loss_mean=0.915, loss_mean_cls=2.32, proj_loss=-0.417][2026-03-23 13:51:17] Step: 3643, Training Logs: loss_final: 2.635772, loss_mean: 0.948143, proj_loss: -0.415598, loss_mean_cls: 2.103227, grad_norm: 4.800022 +Steps: 0%| | 3644/1000000 [14:57<67:29:06, 4.10it/s, grad_norm=4.8, loss_final=2.64, loss_mean=0.948, loss_mean_cls=2.1, proj_loss=-0.416][2026-03-23 13:51:17] Step: 3644, Training Logs: loss_final: 3.137758, loss_mean: 0.919052, proj_loss: -0.412532, loss_mean_cls: 2.631238, grad_norm: 8.950124 +Steps: 0%| | 3645/1000000 [14:57<67:27:12, 4.10it/s, grad_norm=8.95, loss_final=3.14, loss_mean=0.919, loss_mean_cls=2.63, proj_loss=-0.413][2026-03-23 13:51:17] Step: 3645, Training Logs: loss_final: 2.610660, loss_mean: 0.919426, proj_loss: -0.421490, loss_mean_cls: 2.112724, grad_norm: 16.257259 +Steps: 0%| | 3646/1000000 [14:58<67:27:21, 4.10it/s, grad_norm=16.3, loss_final=2.61, loss_mean=0.919, loss_mean_cls=2.11, proj_loss=-0.421][2026-03-23 13:51:18] Step: 3646, Training Logs: loss_final: 2.646382, loss_mean: 0.912994, proj_loss: -0.420239, loss_mean_cls: 2.153627, grad_norm: 4.085724 +Steps: 0%| | 3647/1000000 [14:58<67:27:49, 4.10it/s, grad_norm=4.09, loss_final=2.65, loss_mean=0.913, loss_mean_cls=2.15, proj_loss=-0.42][2026-03-23 13:51:18] Step: 3647, Training Logs: loss_final: 3.109558, loss_mean: 0.911670, proj_loss: -0.417713, loss_mean_cls: 2.615601, grad_norm: 15.127779 +Steps: 0%| | 3648/1000000 [14:58<67:28:50, 4.10it/s, grad_norm=15.1, loss_final=3.11, loss_mean=0.912, loss_mean_cls=2.62, proj_loss=-0.418][2026-03-23 13:51:18] Step: 3648, Training Logs: loss_final: 2.663068, loss_mean: 0.930216, proj_loss: -0.420278, loss_mean_cls: 2.153131, grad_norm: 22.282192 +Steps: 0%| | 3649/1000000 [14:58<67:28:41, 4.10it/s, grad_norm=22.3, loss_final=2.66, loss_mean=0.93, loss_mean_cls=2.15, proj_loss=-0.42][2026-03-23 13:51:18] Step: 3649, Training Logs: loss_final: 3.128309, loss_mean: 0.909666, proj_loss: -0.415783, loss_mean_cls: 2.634426, grad_norm: 34.143776 +Steps: 0%| | 3650/1000000 [14:59<67:29:30, 4.10it/s, grad_norm=34.1, loss_final=3.13, loss_mean=0.91, loss_mean_cls=2.63, proj_loss=-0.416][2026-03-23 13:51:19] Step: 3650, Training Logs: loss_final: 3.076954, loss_mean: 0.929348, proj_loss: -0.403390, loss_mean_cls: 2.550997, grad_norm: 12.743709 +Steps: 0%| | 3651/1000000 [14:59<67:27:01, 4.10it/s, grad_norm=12.7, loss_final=3.08, loss_mean=0.929, loss_mean_cls=2.55, proj_loss=-0.403][2026-03-23 13:51:19] Step: 3651, Training Logs: loss_final: 2.931939, loss_mean: 0.936531, proj_loss: -0.415350, loss_mean_cls: 2.410758, grad_norm: 9.847341 +Steps: 0%| | 3652/1000000 [14:59<67:41:37, 4.09it/s, grad_norm=9.85, loss_final=2.93, loss_mean=0.937, loss_mean_cls=2.41, proj_loss=-0.415][2026-03-23 13:51:19] Step: 3652, Training Logs: loss_final: 2.253615, loss_mean: 0.965146, proj_loss: -0.426002, loss_mean_cls: 1.714470, grad_norm: 6.995166 +Steps: 0%| | 3653/1000000 [14:59<67:36:47, 4.09it/s, grad_norm=7, loss_final=2.25, loss_mean=0.965, loss_mean_cls=1.71, proj_loss=-0.426][2026-03-23 13:51:19] Step: 3653, Training Logs: loss_final: 3.049046, loss_mean: 0.902507, proj_loss: -0.415973, loss_mean_cls: 2.562511, grad_norm: 21.595745 +Steps: 0%| | 3654/1000000 [15:00<67:33:03, 4.10it/s, grad_norm=21.6, loss_final=3.05, loss_mean=0.903, loss_mean_cls=2.56, proj_loss=-0.416][2026-03-23 13:51:19] Step: 3654, Training Logs: loss_final: 3.192051, loss_mean: 0.936165, proj_loss: -0.410498, 
loss_mean_cls: 2.666385, grad_norm: 24.136759 +Steps: 0%| | 3655/1000000 [15:00<67:31:28, 4.10it/s, grad_norm=24.1, loss_final=3.19, loss_mean=0.936, loss_mean_cls=2.67, proj_loss=-0.41][2026-03-23 13:51:20] Step: 3655, Training Logs: loss_final: 2.866669, loss_mean: 0.963323, proj_loss: -0.418358, loss_mean_cls: 2.321704, grad_norm: 30.938486 +Steps: 0%| | 3656/1000000 [15:00<67:52:21, 4.08it/s, grad_norm=30.9, loss_final=2.87, loss_mean=0.963, loss_mean_cls=2.32, proj_loss=-0.418][2026-03-23 13:51:20] Step: 3656, Training Logs: loss_final: 3.387568, loss_mean: 0.938272, proj_loss: -0.410112, loss_mean_cls: 2.859407, grad_norm: 24.189108 +Steps: 0%| | 3657/1000000 [15:00<67:52:25, 4.08it/s, grad_norm=24.2, loss_final=3.39, loss_mean=0.938, loss_mean_cls=2.86, proj_loss=-0.41][2026-03-23 13:51:20] Step: 3657, Training Logs: loss_final: 3.007860, loss_mean: 0.920152, proj_loss: -0.410623, loss_mean_cls: 2.498331, grad_norm: 25.232248 +Steps: 0%| | 3658/1000000 [15:00<67:44:09, 4.09it/s, grad_norm=25.2, loss_final=3.01, loss_mean=0.92, loss_mean_cls=2.5, proj_loss=-0.411][2026-03-23 13:51:20] Step: 3658, Training Logs: loss_final: 3.236566, loss_mean: 0.949532, proj_loss: -0.407783, loss_mean_cls: 2.694817, grad_norm: 17.411884 +Steps: 0%| | 3659/1000000 [15:01<67:40:23, 4.09it/s, grad_norm=17.4, loss_final=3.24, loss_mean=0.95, loss_mean_cls=2.69, proj_loss=-0.408][2026-03-23 13:51:21] Step: 3659, Training Logs: loss_final: 2.508578, loss_mean: 0.938533, proj_loss: -0.422127, loss_mean_cls: 1.992172, grad_norm: 14.419516 +Steps: 0%| | 3660/1000000 [15:01<67:34:44, 4.10it/s, grad_norm=14.4, loss_final=2.51, loss_mean=0.939, loss_mean_cls=1.99, proj_loss=-0.422][2026-03-23 13:51:21] Step: 3660, Training Logs: loss_final: 3.124286, loss_mean: 0.914895, proj_loss: -0.415344, loss_mean_cls: 2.624734, grad_norm: 24.585611 +Steps: 0%| | 3661/1000000 [15:01<67:38:13, 4.09it/s, grad_norm=24.6, loss_final=3.12, loss_mean=0.915, loss_mean_cls=2.62, proj_loss=-0.415][2026-03-23 13:51:21] Step: 3661, Training Logs: loss_final: 3.238901, loss_mean: 0.907300, proj_loss: -0.409312, loss_mean_cls: 2.740912, grad_norm: 22.254269 +Steps: 0%| | 3662/1000000 [15:01<67:43:02, 4.09it/s, grad_norm=22.3, loss_final=3.24, loss_mean=0.907, loss_mean_cls=2.74, proj_loss=-0.409][2026-03-23 13:51:21] Step: 3662, Training Logs: loss_final: 3.191802, loss_mean: 0.903812, proj_loss: -0.409144, loss_mean_cls: 2.697133, grad_norm: 25.423529 +Steps: 0%| | 3663/1000000 [15:02<67:37:13, 4.09it/s, grad_norm=25.4, loss_final=3.19, loss_mean=0.904, loss_mean_cls=2.7, proj_loss=-0.409][2026-03-23 13:51:22] Step: 3663, Training Logs: loss_final: 2.678536, loss_mean: 0.932204, proj_loss: -0.420381, loss_mean_cls: 2.166713, grad_norm: 10.797632 +Steps: 0%| | 3664/1000000 [15:02<67:33:52, 4.10it/s, grad_norm=10.8, loss_final=2.68, loss_mean=0.932, loss_mean_cls=2.17, proj_loss=-0.42][2026-03-23 13:51:22] Step: 3664, Training Logs: loss_final: 2.816554, loss_mean: 0.929698, proj_loss: -0.419929, loss_mean_cls: 2.306785, grad_norm: 9.149309 +Steps: 0%| | 3665/1000000 [15:02<67:31:28, 4.10it/s, grad_norm=9.15, loss_final=2.82, loss_mean=0.93, loss_mean_cls=2.31, proj_loss=-0.42][2026-03-23 13:51:22] Step: 3665, Training Logs: loss_final: 2.566305, loss_mean: 0.950387, proj_loss: -0.421375, loss_mean_cls: 2.037292, grad_norm: 16.725706 +Steps: 0%| | 3666/1000000 [15:02<67:35:28, 4.09it/s, grad_norm=16.7, loss_final=2.57, loss_mean=0.95, loss_mean_cls=2.04, proj_loss=-0.421][2026-03-23 13:51:22] Step: 3666, Training Logs: loss_final: 
2.925537, loss_mean: 0.916519, proj_loss: -0.417148, loss_mean_cls: 2.426167, grad_norm: 16.175573 +Steps: 0%| | 3667/1000000 [15:03<67:33:28, 4.10it/s, grad_norm=16.2, loss_final=2.93, loss_mean=0.917, loss_mean_cls=2.43, proj_loss=-0.417][2026-03-23 13:51:23] Step: 3667, Training Logs: loss_final: 3.079849, loss_mean: 0.935818, proj_loss: -0.412487, loss_mean_cls: 2.556517, grad_norm: 11.622046 +Steps: 0%| | 3668/1000000 [15:03<67:31:14, 4.10it/s, grad_norm=11.6, loss_final=3.08, loss_mean=0.936, loss_mean_cls=2.56, proj_loss=-0.412][2026-03-23 13:51:23] Step: 3668, Training Logs: loss_final: 2.908453, loss_mean: 0.925257, proj_loss: -0.411874, loss_mean_cls: 2.395071, grad_norm: 15.687626 +Steps: 0%| | 3669/1000000 [15:03<67:29:03, 4.10it/s, grad_norm=15.7, loss_final=2.91, loss_mean=0.925, loss_mean_cls=2.4, proj_loss=-0.412][2026-03-23 13:51:23] Step: 3669, Training Logs: loss_final: 2.744059, loss_mean: 0.928373, proj_loss: -0.415696, loss_mean_cls: 2.231382, grad_norm: 16.214361 +Steps: 0%| | 3670/1000000 [15:03<67:35:10, 4.09it/s, grad_norm=16.2, loss_final=2.74, loss_mean=0.928, loss_mean_cls=2.23, proj_loss=-0.416][2026-03-23 13:51:23] Step: 3670, Training Logs: loss_final: 2.794855, loss_mean: 0.926233, proj_loss: -0.418841, loss_mean_cls: 2.287463, grad_norm: 11.822037 +Steps: 0%| | 3671/1000000 [15:04<67:32:22, 4.10it/s, grad_norm=11.8, loss_final=2.79, loss_mean=0.926, loss_mean_cls=2.29, proj_loss=-0.419][2026-03-23 13:51:24] Step: 3671, Training Logs: loss_final: 2.744147, loss_mean: 0.912229, proj_loss: -0.422802, loss_mean_cls: 2.254720, grad_norm: 4.114040 +Steps: 0%| | 3672/1000000 [15:04<67:31:13, 4.10it/s, grad_norm=4.11, loss_final=2.74, loss_mean=0.912, loss_mean_cls=2.25, proj_loss=-0.423][2026-03-23 13:51:24] Step: 3672, Training Logs: loss_final: 2.266351, loss_mean: 0.938659, proj_loss: -0.431217, loss_mean_cls: 1.758909, grad_norm: 2.823017 +Steps: 0%| | 3673/1000000 [15:04<67:28:48, 4.10it/s, grad_norm=2.82, loss_final=2.27, loss_mean=0.939, loss_mean_cls=1.76, proj_loss=-0.431][2026-03-23 13:51:24] Step: 3673, Training Logs: loss_final: 2.751285, loss_mean: 0.933705, proj_loss: -0.424340, loss_mean_cls: 2.241920, grad_norm: 6.002905 +Steps: 0%| | 3674/1000000 [15:04<67:36:08, 4.09it/s, grad_norm=6, loss_final=2.75, loss_mean=0.934, loss_mean_cls=2.24, proj_loss=-0.424][2026-03-23 13:51:24] Step: 3674, Training Logs: loss_final: 2.194987, loss_mean: 0.925781, proj_loss: -0.429512, loss_mean_cls: 1.698718, grad_norm: 6.047395 +Steps: 0%| | 3675/1000000 [15:05<67:33:09, 4.10it/s, grad_norm=6.05, loss_final=2.19, loss_mean=0.926, loss_mean_cls=1.7, proj_loss=-0.43][2026-03-23 13:51:25] Step: 3675, Training Logs: loss_final: 2.856730, loss_mean: 0.934094, proj_loss: -0.417096, loss_mean_cls: 2.339733, grad_norm: 13.885297 +Steps: 0%| | 3676/1000000 [15:05<67:30:36, 4.10it/s, grad_norm=13.9, loss_final=2.86, loss_mean=0.934, loss_mean_cls=2.34, proj_loss=-0.417][2026-03-23 13:51:25] Step: 3676, Training Logs: loss_final: 2.633698, loss_mean: 0.921414, proj_loss: -0.418692, loss_mean_cls: 2.130977, grad_norm: 2.955143 +Steps: 0%| | 3677/1000000 [15:05<67:29:10, 4.10it/s, grad_norm=2.96, loss_final=2.63, loss_mean=0.921, loss_mean_cls=2.13, proj_loss=-0.419][2026-03-23 13:51:25] Step: 3677, Training Logs: loss_final: 2.662342, loss_mean: 0.910428, proj_loss: -0.423299, loss_mean_cls: 2.175213, grad_norm: 1.637583 +Steps: 0%| | 3678/1000000 [15:05<67:34:28, 4.10it/s, grad_norm=1.64, loss_final=2.66, loss_mean=0.91, loss_mean_cls=2.18, proj_loss=-0.423][2026-03-23 
13:51:25] Step: 3678, Training Logs: loss_final: 2.649413, loss_mean: 0.946889, proj_loss: -0.425009, loss_mean_cls: 2.127533, grad_norm: 10.396049 +Steps: 0%| | 3679/1000000 [15:06<67:33:12, 4.10it/s, grad_norm=10.4, loss_final=2.65, loss_mean=0.947, loss_mean_cls=2.13, proj_loss=-0.425][2026-03-23 13:51:26] Step: 3679, Training Logs: loss_final: 2.859127, loss_mean: 0.917334, proj_loss: -0.414089, loss_mean_cls: 2.355882, grad_norm: 22.879131 +Steps: 0%| | 3680/1000000 [15:06<67:31:52, 4.10it/s, grad_norm=22.9, loss_final=2.86, loss_mean=0.917, loss_mean_cls=2.36, proj_loss=-0.414][2026-03-23 13:51:26] Step: 3680, Training Logs: loss_final: 2.662392, loss_mean: 0.937992, proj_loss: -0.416095, loss_mean_cls: 2.140495, grad_norm: 7.096234 +Steps: 0%| | 3681/1000000 [15:06<67:29:22, 4.10it/s, grad_norm=7.1, loss_final=2.66, loss_mean=0.938, loss_mean_cls=2.14, proj_loss=-0.416][2026-03-23 13:51:26] Step: 3681, Training Logs: loss_final: 2.803432, loss_mean: 0.923362, proj_loss: -0.411485, loss_mean_cls: 2.291556, grad_norm: 7.453078 +Steps: 0%| | 3682/1000000 [15:06<67:32:44, 4.10it/s, grad_norm=7.45, loss_final=2.8, loss_mean=0.923, loss_mean_cls=2.29, proj_loss=-0.411][2026-03-23 13:51:26] Step: 3682, Training Logs: loss_final: 3.066862, loss_mean: 0.909722, proj_loss: -0.412196, loss_mean_cls: 2.569336, grad_norm: 21.424879 +Steps: 0%| | 3683/1000000 [15:07<67:31:35, 4.10it/s, grad_norm=21.4, loss_final=3.07, loss_mean=0.91, loss_mean_cls=2.57, proj_loss=-0.412][2026-03-23 13:51:27] Step: 3683, Training Logs: loss_final: 3.303013, loss_mean: 0.908599, proj_loss: -0.413057, loss_mean_cls: 2.807471, grad_norm: 16.776215 +Steps: 0%| | 3684/1000000 [15:07<68:47:59, 4.02it/s, grad_norm=16.8, loss_final=3.3, loss_mean=0.909, loss_mean_cls=2.81, proj_loss=-0.413][2026-03-23 13:51:27] Step: 3684, Training Logs: loss_final: 2.879683, loss_mean: 0.924367, proj_loss: -0.418682, loss_mean_cls: 2.373998, grad_norm: 34.837780 +Steps: 0%| | 3685/1000000 [15:07<68:29:24, 4.04it/s, grad_norm=34.8, loss_final=2.88, loss_mean=0.924, loss_mean_cls=2.37, proj_loss=-0.419][2026-03-23 13:51:27] Step: 3685, Training Logs: loss_final: 2.497676, loss_mean: 0.954440, proj_loss: -0.421404, loss_mean_cls: 1.964640, grad_norm: 24.279470 +Steps: 0%| | 3686/1000000 [15:07<68:09:52, 4.06it/s, grad_norm=24.3, loss_final=2.5, loss_mean=0.954, loss_mean_cls=1.96, proj_loss=-0.421][2026-03-23 13:51:27] Step: 3686, Training Logs: loss_final: 2.527754, loss_mean: 0.931452, proj_loss: -0.421445, loss_mean_cls: 2.017748, grad_norm: 3.483486 +Steps: 0%| | 3687/1000000 [15:08<68:00:49, 4.07it/s, grad_norm=3.48, loss_final=2.53, loss_mean=0.931, loss_mean_cls=2.02, proj_loss=-0.421][2026-03-23 13:51:28] Step: 3687, Training Logs: loss_final: 2.900536, loss_mean: 0.919985, proj_loss: -0.415944, loss_mean_cls: 2.396495, grad_norm: 3.203661 +Steps: 0%| | 3688/1000000 [15:08<67:49:10, 4.08it/s, grad_norm=3.2, loss_final=2.9, loss_mean=0.92, loss_mean_cls=2.4, proj_loss=-0.416][2026-03-23 13:51:28] Step: 3688, Training Logs: loss_final: 2.863959, loss_mean: 0.919611, proj_loss: -0.416575, loss_mean_cls: 2.360922, grad_norm: 8.969199 +Steps: 0%| | 3689/1000000 [15:08<67:41:08, 4.09it/s, grad_norm=8.97, loss_final=2.86, loss_mean=0.92, loss_mean_cls=2.36, proj_loss=-0.417][2026-03-23 13:51:28] Step: 3689, Training Logs: loss_final: 2.593096, loss_mean: 0.933128, proj_loss: -0.419441, loss_mean_cls: 2.079410, grad_norm: 8.209428 +Steps: 0%| | 3690/1000000 [15:08<67:37:40, 4.09it/s, grad_norm=8.21, loss_final=2.59, loss_mean=0.933, 
loss_mean_cls=2.08, proj_loss=-0.419][2026-03-23 13:51:28] Step: 3690, Training Logs: loss_final: 2.707253, loss_mean: 0.946261, proj_loss: -0.414809, loss_mean_cls: 2.175801, grad_norm: 12.105803 +Steps: 0%| | 3691/1000000 [15:09<67:33:34, 4.10it/s, grad_norm=12.1, loss_final=2.71, loss_mean=0.946, loss_mean_cls=2.18, proj_loss=-0.415][2026-03-23 13:51:29] Step: 3691, Training Logs: loss_final: 2.923638, loss_mean: 0.911591, proj_loss: -0.415015, loss_mean_cls: 2.427062, grad_norm: 13.097160 +Steps: 0%| | 3692/1000000 [15:09<67:31:08, 4.10it/s, grad_norm=13.1, loss_final=2.92, loss_mean=0.912, loss_mean_cls=2.43, proj_loss=-0.415][2026-03-23 13:51:29] Step: 3692, Training Logs: loss_final: 2.510288, loss_mean: 0.944707, proj_loss: -0.420762, loss_mean_cls: 1.986343, grad_norm: 2.272186 +Steps: 0%| | 3693/1000000 [15:09<67:28:50, 4.10it/s, grad_norm=2.27, loss_final=2.51, loss_mean=0.945, loss_mean_cls=1.99, proj_loss=-0.421][2026-03-23 13:51:29] Step: 3693, Training Logs: loss_final: 2.728312, loss_mean: 0.909684, proj_loss: -0.421295, loss_mean_cls: 2.239923, grad_norm: 2.888129 +Steps: 0%| | 3694/1000000 [15:09<67:28:35, 4.10it/s, grad_norm=2.89, loss_final=2.73, loss_mean=0.91, loss_mean_cls=2.24, proj_loss=-0.421][2026-03-23 13:51:29] Step: 3694, Training Logs: loss_final: 3.290314, loss_mean: 0.918902, proj_loss: -0.403144, loss_mean_cls: 2.774556, grad_norm: 14.683088 +Steps: 0%| | 3695/1000000 [15:10<67:26:53, 4.10it/s, grad_norm=14.7, loss_final=3.29, loss_mean=0.919, loss_mean_cls=2.77, proj_loss=-0.403][2026-03-23 13:51:30] Step: 3695, Training Logs: loss_final: 2.236389, loss_mean: 0.961391, proj_loss: -0.423070, loss_mean_cls: 1.698069, grad_norm: 16.063021 +Steps: 0%| | 3696/1000000 [15:10<67:25:51, 4.10it/s, grad_norm=16.1, loss_final=2.24, loss_mean=0.961, loss_mean_cls=1.7, proj_loss=-0.423][2026-03-23 13:51:30] Step: 3696, Training Logs: loss_final: 2.970160, loss_mean: 0.924865, proj_loss: -0.410668, loss_mean_cls: 2.455962, grad_norm: 13.282218 +Steps: 0%| | 3697/1000000 [15:10<67:27:24, 4.10it/s, grad_norm=13.3, loss_final=2.97, loss_mean=0.925, loss_mean_cls=2.46, proj_loss=-0.411][2026-03-23 13:51:30] Step: 3697, Training Logs: loss_final: 2.645521, loss_mean: 0.933162, proj_loss: -0.410672, loss_mean_cls: 2.123031, grad_norm: 6.491468 +Steps: 0%| | 3698/1000000 [15:10<67:27:17, 4.10it/s, grad_norm=6.49, loss_final=2.65, loss_mean=0.933, loss_mean_cls=2.12, proj_loss=-0.411][2026-03-23 13:51:30] Step: 3698, Training Logs: loss_final: 2.507060, loss_mean: 0.931345, proj_loss: -0.417787, loss_mean_cls: 1.993502, grad_norm: 16.809566 +Steps: 0%| | 3699/1000000 [15:11<67:27:11, 4.10it/s, grad_norm=16.8, loss_final=2.51, loss_mean=0.931, loss_mean_cls=1.99, proj_loss=-0.418][2026-03-23 13:51:30] Step: 3699, Training Logs: loss_final: 3.172357, loss_mean: 0.935923, proj_loss: -0.404307, loss_mean_cls: 2.640742, grad_norm: 22.067219 +Steps: 0%| | 3700/1000000 [15:11<67:28:40, 4.10it/s, grad_norm=22.1, loss_final=3.17, loss_mean=0.936, loss_mean_cls=2.64, proj_loss=-0.404][2026-03-23 13:51:31] Step: 3700, Training Logs: loss_final: 2.450554, loss_mean: 0.938337, proj_loss: -0.418095, loss_mean_cls: 1.930312, grad_norm: 6.008388 +Steps: 0%| | 3701/1000000 [15:11<67:27:28, 4.10it/s, grad_norm=6.01, loss_final=2.45, loss_mean=0.938, loss_mean_cls=1.93, proj_loss=-0.418][2026-03-23 13:51:31] Step: 3701, Training Logs: loss_final: 2.479973, loss_mean: 0.942952, proj_loss: -0.420333, loss_mean_cls: 1.957354, grad_norm: 7.688042 +Steps: 0%| | 3702/1000000 [15:11<68:18:57, 4.05it/s, 
grad_norm=7.69, loss_final=2.48, loss_mean=0.943, loss_mean_cls=1.96, proj_loss=-0.42][2026-03-23 13:51:31] Step: 3702, Training Logs: loss_final: 2.582037, loss_mean: 0.932849, proj_loss: -0.425601, loss_mean_cls: 2.074789, grad_norm: 2.911562 +Steps: 0%| | 3703/1000000 [15:12<68:06:05, 4.06it/s, grad_norm=2.91, loss_final=2.58, loss_mean=0.933, loss_mean_cls=2.07, proj_loss=-0.426][2026-03-23 13:51:31] Step: 3703, Training Logs: loss_final: 3.042656, loss_mean: 0.917837, proj_loss: -0.409757, loss_mean_cls: 2.534577, grad_norm: 16.470819 +Steps: 0%| | 3704/1000000 [15:12<67:53:16, 4.08it/s, grad_norm=16.5, loss_final=3.04, loss_mean=0.918, loss_mean_cls=2.53, proj_loss=-0.41][2026-03-23 13:51:32] Step: 3704, Training Logs: loss_final: 2.366143, loss_mean: 0.931431, proj_loss: -0.423415, loss_mean_cls: 1.858127, grad_norm: 3.981920 +Steps: 0%| | 3705/1000000 [15:12<67:58:33, 4.07it/s, grad_norm=3.98, loss_final=2.37, loss_mean=0.931, loss_mean_cls=1.86, proj_loss=-0.423][2026-03-23 13:51:32] Step: 3705, Training Logs: loss_final: 3.049002, loss_mean: 0.912558, proj_loss: -0.421611, loss_mean_cls: 2.558055, grad_norm: 4.807099 +Steps: 0%| | 3706/1000000 [15:12<67:47:50, 4.08it/s, grad_norm=4.81, loss_final=3.05, loss_mean=0.913, loss_mean_cls=2.56, proj_loss=-0.422][2026-03-23 13:51:32] Step: 3706, Training Logs: loss_final: 2.156465, loss_mean: 0.924347, proj_loss: -0.432519, loss_mean_cls: 1.664638, grad_norm: 10.028115 +Steps: 0%| | 3707/1000000 [15:12<67:41:35, 4.09it/s, grad_norm=10, loss_final=2.16, loss_mean=0.924, loss_mean_cls=1.66, proj_loss=-0.433][2026-03-23 13:51:32] Step: 3707, Training Logs: loss_final: 2.290027, loss_mean: 0.940268, proj_loss: -0.424623, loss_mean_cls: 1.774383, grad_norm: 7.717356 +Steps: 0%| | 3708/1000000 [15:13<97:47:12, 2.83it/s, grad_norm=7.72, loss_final=2.29, loss_mean=0.94, loss_mean_cls=1.77, proj_loss=-0.425][2026-03-23 13:51:33] Step: 3708, Training Logs: loss_final: 2.826492, loss_mean: 0.920791, proj_loss: -0.416782, loss_mean_cls: 2.322483, grad_norm: 10.527734 +Steps: 0%| | 3709/1000000 [15:13<88:38:51, 3.12it/s, grad_norm=10.5, loss_final=2.83, loss_mean=0.921, loss_mean_cls=2.32, proj_loss=-0.417][2026-03-23 13:51:33] Step: 3709, Training Logs: loss_final: 2.649561, loss_mean: 0.918898, proj_loss: -0.419737, loss_mean_cls: 2.150400, grad_norm: 11.899863 +Steps: 0%| | 3710/1000000 [15:14<82:16:00, 3.36it/s, grad_norm=11.9, loss_final=2.65, loss_mean=0.919, loss_mean_cls=2.15, proj_loss=-0.42][2026-03-23 13:51:34] Step: 3710, Training Logs: loss_final: 3.340618, loss_mean: 0.914492, proj_loss: -0.405677, loss_mean_cls: 2.831803, grad_norm: 7.235033 +Steps: 0%| | 3711/1000000 [15:14<77:49:36, 3.56it/s, grad_norm=7.24, loss_final=3.34, loss_mean=0.914, loss_mean_cls=2.83, proj_loss=-0.406][2026-03-23 13:51:34] Step: 3711, Training Logs: loss_final: 2.702239, loss_mean: 0.914260, proj_loss: -0.419935, loss_mean_cls: 2.207914, grad_norm: 16.223255 +Steps: 0%| | 3712/1000000 [15:14<74:51:26, 3.70it/s, grad_norm=16.2, loss_final=2.7, loss_mean=0.914, loss_mean_cls=2.21, proj_loss=-0.42][2026-03-23 13:51:34] Step: 3712, Training Logs: loss_final: 2.339701, loss_mean: 0.927124, proj_loss: -0.423317, loss_mean_cls: 1.835894, grad_norm: 2.066481 +Steps: 0%| | 3713/1000000 [15:14<72:36:38, 3.81it/s, grad_norm=2.07, loss_final=2.34, loss_mean=0.927, loss_mean_cls=1.84, proj_loss=-0.423][2026-03-23 13:51:34] Step: 3713, Training Logs: loss_final: 3.153356, loss_mean: 0.906561, proj_loss: -0.411982, loss_mean_cls: 2.658778, grad_norm: 25.831263 +Steps: 
[Training log excerpt: steps 3714–3962 of 1,000,000, running at roughly 4.1 it/s (ETA ≈ 67 h, with occasional brief dips to ~3 it/s). Each step records loss_final, loss_mean, proj_loss, loss_mean_cls, and grad_norm; two representative records are kept below and the intervening per-step lines are elided.]

[2026-03-23 13:51:35] Step: 3714, Training Logs: loss_final: 2.601416, loss_mean: 0.960974, proj_loss: -0.422857, loss_mean_cls: 2.063300, grad_norm: 27.838091
...
[2026-03-23 13:52:35] Step: 3961, Training Logs: loss_final: 3.901834, loss_mean: 0.922750, proj_loss: -0.402514, loss_mean_cls: 3.381597, grad_norm: 34.788605
proj_loss=-0.403][2026-03-23 13:52:36] Step: 3962, Training Logs: loss_final: 2.871997, loss_mean: 0.944632, proj_loss: -0.418618, loss_mean_cls: 2.345984, grad_norm: 34.621826 +Steps: 0%| | 3963/1000000 [16:16<67:36:44, 4.09it/s, grad_norm=34.6, loss_final=2.87, loss_mean=0.945, loss_mean_cls=2.35, proj_loss=-0.419][2026-03-23 13:52:36] Step: 3963, Training Logs: loss_final: 3.297763, loss_mean: 0.929827, proj_loss: -0.408282, loss_mean_cls: 2.776218, grad_norm: 16.173668 +Steps: 0%| | 3964/1000000 [16:16<67:32:30, 4.10it/s, grad_norm=16.2, loss_final=3.3, loss_mean=0.93, loss_mean_cls=2.78, proj_loss=-0.408][2026-03-23 13:52:36] Step: 3964, Training Logs: loss_final: 2.593849, loss_mean: 0.937567, proj_loss: -0.420848, loss_mean_cls: 2.077130, grad_norm: 3.221577 +Steps: 0%| | 3965/1000000 [16:16<67:29:44, 4.10it/s, grad_norm=3.22, loss_final=2.59, loss_mean=0.938, loss_mean_cls=2.08, proj_loss=-0.421][2026-03-23 13:52:36] Step: 3965, Training Logs: loss_final: 2.671575, loss_mean: 0.913230, proj_loss: -0.423198, loss_mean_cls: 2.181542, grad_norm: 9.524197 +Steps: 0%| | 3966/1000000 [16:17<67:27:23, 4.10it/s, grad_norm=9.52, loss_final=2.67, loss_mean=0.913, loss_mean_cls=2.18, proj_loss=-0.423][2026-03-23 13:52:37] Step: 3966, Training Logs: loss_final: 2.594798, loss_mean: 0.933735, proj_loss: -0.425580, loss_mean_cls: 2.086643, grad_norm: 16.694525 +Steps: 0%| | 3967/1000000 [16:17<67:25:39, 4.10it/s, grad_norm=16.7, loss_final=2.59, loss_mean=0.934, loss_mean_cls=2.09, proj_loss=-0.426][2026-03-23 13:52:37] Step: 3967, Training Logs: loss_final: 2.681866, loss_mean: 0.928241, proj_loss: -0.427426, loss_mean_cls: 2.181050, grad_norm: 15.993710 +Steps: 0%| | 3968/1000000 [16:17<67:24:37, 4.10it/s, grad_norm=16, loss_final=2.68, loss_mean=0.928, loss_mean_cls=2.18, proj_loss=-0.427][2026-03-23 13:52:37] Step: 3968, Training Logs: loss_final: 3.161508, loss_mean: 0.912193, proj_loss: -0.418367, loss_mean_cls: 2.667683, grad_norm: 12.975194 +Steps: 0%| | 3969/1000000 [16:17<67:22:57, 4.11it/s, grad_norm=13, loss_final=3.16, loss_mean=0.912, loss_mean_cls=2.67, proj_loss=-0.418][2026-03-23 13:52:37] Step: 3969, Training Logs: loss_final: 3.445734, loss_mean: 0.919457, proj_loss: -0.407930, loss_mean_cls: 2.934207, grad_norm: 21.009083 +Steps: 0%| | 3970/1000000 [16:18<67:22:22, 4.11it/s, grad_norm=21, loss_final=3.45, loss_mean=0.919, loss_mean_cls=2.93, proj_loss=-0.408][2026-03-23 13:52:38] Step: 3970, Training Logs: loss_final: 2.581692, loss_mean: 0.952671, proj_loss: -0.418877, loss_mean_cls: 2.047899, grad_norm: 21.092089 +Steps: 0%| | 3971/1000000 [16:18<67:21:54, 4.11it/s, grad_norm=21.1, loss_final=2.58, loss_mean=0.953, loss_mean_cls=2.05, proj_loss=-0.419][2026-03-23 13:52:38] Step: 3971, Training Logs: loss_final: 2.771118, loss_mean: 0.923141, proj_loss: -0.418542, loss_mean_cls: 2.266519, grad_norm: 9.657596 +Steps: 0%| | 3972/1000000 [16:18<67:27:52, 4.10it/s, grad_norm=9.66, loss_final=2.77, loss_mean=0.923, loss_mean_cls=2.27, proj_loss=-0.419][2026-03-23 13:52:38] Step: 3972, Training Logs: loss_final: 3.472034, loss_mean: 0.898862, proj_loss: -0.409521, loss_mean_cls: 2.982692, grad_norm: 18.915890 +Steps: 0%| | 3973/1000000 [16:18<67:26:18, 4.10it/s, grad_norm=18.9, loss_final=3.47, loss_mean=0.899, loss_mean_cls=2.98, proj_loss=-0.41][2026-03-23 13:52:38] Step: 3973, Training Logs: loss_final: 2.688317, loss_mean: 0.958189, proj_loss: -0.422849, loss_mean_cls: 2.152977, grad_norm: 14.850263 +Steps: 0%| | 3974/1000000 [16:19<67:25:28, 4.10it/s, grad_norm=14.9, 
loss_final=2.69, loss_mean=0.958, loss_mean_cls=2.15, proj_loss=-0.423][2026-03-23 13:52:39] Step: 3974, Training Logs: loss_final: 2.397179, loss_mean: 0.931173, proj_loss: -0.429316, loss_mean_cls: 1.895323, grad_norm: 22.714222 +Steps: 0%| | 3975/1000000 [16:19<67:23:15, 4.11it/s, grad_norm=22.7, loss_final=2.4, loss_mean=0.931, loss_mean_cls=1.9, proj_loss=-0.429][2026-03-23 13:52:39] Step: 3975, Training Logs: loss_final: 2.446552, loss_mean: 0.933422, proj_loss: -0.427031, loss_mean_cls: 1.940160, grad_norm: 32.130798 +Steps: 0%| | 3976/1000000 [16:19<67:21:20, 4.11it/s, grad_norm=32.1, loss_final=2.45, loss_mean=0.933, loss_mean_cls=1.94, proj_loss=-0.427][2026-03-23 13:52:39] Step: 3976, Training Logs: loss_final: 2.745723, loss_mean: 0.925068, proj_loss: -0.421766, loss_mean_cls: 2.242421, grad_norm: 8.675097 +Steps: 0%| | 3977/1000000 [16:19<67:21:34, 4.11it/s, grad_norm=8.68, loss_final=2.75, loss_mean=0.925, loss_mean_cls=2.24, proj_loss=-0.422][2026-03-23 13:52:39] Step: 3977, Training Logs: loss_final: 2.765616, loss_mean: 0.923236, proj_loss: -0.419392, loss_mean_cls: 2.261772, grad_norm: 5.570866 +Steps: 0%| | 3978/1000000 [16:20<67:24:20, 4.10it/s, grad_norm=5.57, loss_final=2.77, loss_mean=0.923, loss_mean_cls=2.26, proj_loss=-0.419][2026-03-23 13:52:39] Step: 3978, Training Logs: loss_final: 2.558681, loss_mean: 0.913688, proj_loss: -0.424529, loss_mean_cls: 2.069522, grad_norm: 10.827423 +Steps: 0%| | 3979/1000000 [16:20<67:22:23, 4.11it/s, grad_norm=10.8, loss_final=2.56, loss_mean=0.914, loss_mean_cls=2.07, proj_loss=-0.425][2026-03-23 13:52:40] Step: 3979, Training Logs: loss_final: 2.847752, loss_mean: 0.905337, proj_loss: -0.418195, loss_mean_cls: 2.360610, grad_norm: 1.647153 +Steps: 0%| | 3980/1000000 [16:20<67:22:47, 4.11it/s, grad_norm=1.65, loss_final=2.85, loss_mean=0.905, loss_mean_cls=2.36, proj_loss=-0.418][2026-03-23 13:52:40] Step: 3980, Training Logs: loss_final: 2.679647, loss_mean: 0.907755, proj_loss: -0.415457, loss_mean_cls: 2.187349, grad_norm: 6.950619 +Steps: 0%| | 3981/1000000 [16:20<67:22:20, 4.11it/s, grad_norm=6.95, loss_final=2.68, loss_mean=0.908, loss_mean_cls=2.19, proj_loss=-0.415][2026-03-23 13:52:40] Step: 3981, Training Logs: loss_final: 3.279071, loss_mean: 0.908754, proj_loss: -0.413841, loss_mean_cls: 2.784157, grad_norm: 7.863813 +Steps: 0%| | 3982/1000000 [16:20<67:21:56, 4.11it/s, grad_norm=7.86, loss_final=3.28, loss_mean=0.909, loss_mean_cls=2.78, proj_loss=-0.414][2026-03-23 13:52:40] Step: 3982, Training Logs: loss_final: 2.930985, loss_mean: 0.911216, proj_loss: -0.415755, loss_mean_cls: 2.435524, grad_norm: 4.258604 +Steps: 0%| | 3983/1000000 [16:21<67:21:04, 4.11it/s, grad_norm=4.26, loss_final=2.93, loss_mean=0.911, loss_mean_cls=2.44, proj_loss=-0.416][2026-03-23 13:52:41] Step: 3983, Training Logs: loss_final: 2.822962, loss_mean: 0.926968, proj_loss: -0.421918, loss_mean_cls: 2.317912, grad_norm: 4.731431 +Steps: 0%| | 3984/1000000 [16:21<67:19:56, 4.11it/s, grad_norm=4.73, loss_final=2.82, loss_mean=0.927, loss_mean_cls=2.32, proj_loss=-0.422][2026-03-23 13:52:41] Step: 3984, Training Logs: loss_final: 2.571073, loss_mean: 0.927850, proj_loss: -0.423585, loss_mean_cls: 2.066809, grad_norm: 3.580075 +Steps: 0%| | 3985/1000000 [16:21<67:18:58, 4.11it/s, grad_norm=3.58, loss_final=2.57, loss_mean=0.928, loss_mean_cls=2.07, proj_loss=-0.424][2026-03-23 13:52:41] Step: 3985, Training Logs: loss_final: 2.635784, loss_mean: 0.922897, proj_loss: -0.433077, loss_mean_cls: 2.145964, grad_norm: 10.945322 +Steps: 0%| | 
3986/1000000 [16:21<67:20:28, 4.11it/s, grad_norm=10.9, loss_final=2.64, loss_mean=0.923, loss_mean_cls=2.15, proj_loss=-0.433][2026-03-23 13:52:41] Step: 3986, Training Logs: loss_final: 2.773138, loss_mean: 0.926721, proj_loss: -0.419193, loss_mean_cls: 2.265610, grad_norm: 12.607018 +Steps: 0%| | 3987/1000000 [16:22<67:19:52, 4.11it/s, grad_norm=12.6, loss_final=2.77, loss_mean=0.927, loss_mean_cls=2.27, proj_loss=-0.419][2026-03-23 13:52:42] Step: 3987, Training Logs: loss_final: 2.379287, loss_mean: 0.939984, proj_loss: -0.423957, loss_mean_cls: 1.863259, grad_norm: 10.079547 +Steps: 0%| | 3988/1000000 [16:22<67:19:08, 4.11it/s, grad_norm=10.1, loss_final=2.38, loss_mean=0.94, loss_mean_cls=1.86, proj_loss=-0.424][2026-03-23 13:52:42] Step: 3988, Training Logs: loss_final: 2.455111, loss_mean: 0.915872, proj_loss: -0.428231, loss_mean_cls: 1.967469, grad_norm: 5.487077 +Steps: 0%| | 3989/1000000 [16:22<67:18:57, 4.11it/s, grad_norm=5.49, loss_final=2.46, loss_mean=0.916, loss_mean_cls=1.97, proj_loss=-0.428][2026-03-23 13:52:42] Step: 3989, Training Logs: loss_final: 2.495810, loss_mean: 0.951858, proj_loss: -0.422589, loss_mean_cls: 1.966541, grad_norm: 7.356920 +Steps: 0%| | 3990/1000000 [16:22<67:19:40, 4.11it/s, grad_norm=7.36, loss_final=2.5, loss_mean=0.952, loss_mean_cls=1.97, proj_loss=-0.423][2026-03-23 13:52:42] Step: 3990, Training Logs: loss_final: 2.593560, loss_mean: 0.943522, proj_loss: -0.418853, loss_mean_cls: 2.068891, grad_norm: 14.186711 +Steps: 0%| | 3991/1000000 [16:23<67:20:46, 4.11it/s, grad_norm=14.2, loss_final=2.59, loss_mean=0.944, loss_mean_cls=2.07, proj_loss=-0.419][2026-03-23 13:52:43] Step: 3991, Training Logs: loss_final: 2.826968, loss_mean: 0.924337, proj_loss: -0.420278, loss_mean_cls: 2.322910, grad_norm: 14.379563 +Steps: 0%| | 3992/1000000 [16:23<67:20:46, 4.11it/s, grad_norm=14.4, loss_final=2.83, loss_mean=0.924, loss_mean_cls=2.32, proj_loss=-0.42][2026-03-23 13:52:43] Step: 3992, Training Logs: loss_final: 2.598668, loss_mean: 0.934637, proj_loss: -0.425385, loss_mean_cls: 2.089415, grad_norm: 6.314969 +Steps: 0%| | 3993/1000000 [16:23<67:21:20, 4.11it/s, grad_norm=6.31, loss_final=2.6, loss_mean=0.935, loss_mean_cls=2.09, proj_loss=-0.425][2026-03-23 13:52:43] Step: 3993, Training Logs: loss_final: 2.879490, loss_mean: 0.915586, proj_loss: -0.418223, loss_mean_cls: 2.382126, grad_norm: 10.187022 +Steps: 0%| | 3994/1000000 [16:23<67:21:48, 4.11it/s, grad_norm=10.2, loss_final=2.88, loss_mean=0.916, loss_mean_cls=2.38, proj_loss=-0.418][2026-03-23 13:52:43] Step: 3994, Training Logs: loss_final: 2.417572, loss_mean: 0.926265, proj_loss: -0.421572, loss_mean_cls: 1.912878, grad_norm: 6.834720 +Steps: 0%| | 3995/1000000 [16:24<67:20:58, 4.11it/s, grad_norm=6.83, loss_final=2.42, loss_mean=0.926, loss_mean_cls=1.91, proj_loss=-0.422][2026-03-23 13:52:44] Step: 3995, Training Logs: loss_final: 2.928032, loss_mean: 0.905793, proj_loss: -0.419607, loss_mean_cls: 2.441847, grad_norm: 4.557928 +Steps: 0%| | 3996/1000000 [16:24<67:21:13, 4.11it/s, grad_norm=4.56, loss_final=2.93, loss_mean=0.906, loss_mean_cls=2.44, proj_loss=-0.42][2026-03-23 13:52:44] Step: 3996, Training Logs: loss_final: 2.837727, loss_mean: 0.898500, proj_loss: -0.421994, loss_mean_cls: 2.361222, grad_norm: 7.550261 +Steps: 0%| | 3997/1000000 [16:24<67:20:50, 4.11it/s, grad_norm=7.55, loss_final=2.84, loss_mean=0.898, loss_mean_cls=2.36, proj_loss=-0.422][2026-03-23 13:52:44] Step: 3997, Training Logs: loss_final: 2.803603, loss_mean: 0.918741, proj_loss: -0.422666, 
loss_mean_cls: 2.307528, grad_norm: 9.345160 +Steps: 0%| | 3998/1000000 [16:24<67:19:56, 4.11it/s, grad_norm=9.35, loss_final=2.8, loss_mean=0.919, loss_mean_cls=2.31, proj_loss=-0.423][2026-03-23 13:52:44] Step: 3998, Training Logs: loss_final: 2.668813, loss_mean: 0.927607, proj_loss: -0.423074, loss_mean_cls: 2.164281, grad_norm: 21.641579 +Steps: 0%| | 3999/1000000 [16:25<67:21:02, 4.11it/s, grad_norm=21.6, loss_final=2.67, loss_mean=0.928, loss_mean_cls=2.16, proj_loss=-0.423][2026-03-23 13:52:45] Step: 3999, Training Logs: loss_final: 2.872526, loss_mean: 0.927687, proj_loss: -0.422946, loss_mean_cls: 2.367785, grad_norm: 4.299336 +Steps: 0%| | 4000/1000000 [16:25<67:22:39, 4.11it/s, grad_norm=4.3, loss_final=2.87, loss_mean=0.928, loss_mean_cls=2.37, proj_loss=-0.423][2026-03-23 13:52:45] Generating EMA samples (ODE Euler, no diffusion noise; t≈0.5 → t=0)... +[2026-03-23 13:52:47] Step: 4000, Training Logs: loss_final: 2.879370, loss_mean: 0.909940, proj_loss: -0.421831, loss_mean_cls: 2.391261, grad_norm: 24.366697 +Steps: 0%| | 4001/1000000 [16:28<283:51:49, 1.03s/it, grad_norm=24.4, loss_final=2.88, loss_mean=0.91, loss_mean_cls=2.39, proj_loss=-0.422][2026-03-23 13:52:48] Step: 4001, Training Logs: loss_final: 3.109880, loss_mean: 0.908689, proj_loss: -0.417292, loss_mean_cls: 2.618483, grad_norm: 9.580506 +Steps: 0%| | 4002/1000000 [16:28<218:54:32, 1.26it/s, grad_norm=9.58, loss_final=3.11, loss_mean=0.909, loss_mean_cls=2.62, proj_loss=-0.417][2026-03-23 13:52:48] Step: 4002, Training Logs: loss_final: 2.787983, loss_mean: 0.906919, proj_loss: -0.413886, loss_mean_cls: 2.294951, grad_norm: 20.858616 +Steps: 0%| | 4003/1000000 [16:28<173:26:07, 1.60it/s, grad_norm=20.9, loss_final=2.79, loss_mean=0.907, loss_mean_cls=2.29, proj_loss=-0.414][2026-03-23 13:52:48] Step: 4003, Training Logs: loss_final: 2.715690, loss_mean: 0.931997, proj_loss: -0.417513, loss_mean_cls: 2.201206, grad_norm: 13.516343 +Steps: 0%| | 4004/1000000 [16:28<141:35:50, 1.95it/s, grad_norm=13.5, loss_final=2.72, loss_mean=0.932, loss_mean_cls=2.2, proj_loss=-0.418][2026-03-23 13:52:48] Step: 4004, Training Logs: loss_final: 2.513039, loss_mean: 0.940758, proj_loss: -0.427721, loss_mean_cls: 2.000002, grad_norm: 21.143156 +Steps: 0%| | 4005/1000000 [16:29<119:20:40, 2.32it/s, grad_norm=21.1, loss_final=2.51, loss_mean=0.941, loss_mean_cls=2, proj_loss=-0.428][2026-03-23 13:52:49] Step: 4005, Training Logs: loss_final: 3.119569, loss_mean: 0.893894, proj_loss: -0.414899, loss_mean_cls: 2.640574, grad_norm: 33.736485 +Steps: 0%| | 4006/1000000 [16:29<103:45:40, 2.67it/s, grad_norm=33.7, loss_final=3.12, loss_mean=0.894, loss_mean_cls=2.64, proj_loss=-0.415][2026-03-23 13:52:49] Step: 4006, Training Logs: loss_final: 2.965207, loss_mean: 0.904575, proj_loss: -0.418077, loss_mean_cls: 2.478710, grad_norm: 3.330840 +Steps: 0%| | 4007/1000000 [16:29<92:49:59, 2.98it/s, grad_norm=3.33, loss_final=2.97, loss_mean=0.905, loss_mean_cls=2.48, proj_loss=-0.418] [2026-03-23 13:52:49] Step: 4007, Training Logs: loss_final: 2.932622, loss_mean: 0.905437, proj_loss: -0.421750, loss_mean_cls: 2.448934, grad_norm: 13.917981 +Steps: 0%| | 4008/1000000 [16:29<85:09:55, 3.25it/s, grad_norm=13.9, loss_final=2.93, loss_mean=0.905, loss_mean_cls=2.45, proj_loss=-0.422][2026-03-23 13:52:49] Step: 4008, Training Logs: loss_final: 2.470766, loss_mean: 0.945870, proj_loss: -0.423644, loss_mean_cls: 1.948539, grad_norm: 8.737852 +Steps: 0%| | 4009/1000000 [16:30<79:49:25, 3.47it/s, grad_norm=8.74, loss_final=2.47, loss_mean=0.946, 
loss_mean_cls=1.95, proj_loss=-0.424][2026-03-23 13:52:50] Step: 4009, Training Logs: loss_final: 3.075572, loss_mean: 0.913618, proj_loss: -0.422639, loss_mean_cls: 2.584594, grad_norm: 32.716427 +Steps: 0%| | 4010/1000000 [16:30<76:04:18, 3.64it/s, grad_norm=32.7, loss_final=3.08, loss_mean=0.914, loss_mean_cls=2.58, proj_loss=-0.423][2026-03-23 13:52:50] Step: 4010, Training Logs: loss_final: 2.597470, loss_mean: 0.912736, proj_loss: -0.426889, loss_mean_cls: 2.111622, grad_norm: 44.629555 +Steps: 0%| | 4011/1000000 [16:30<73:27:11, 3.77it/s, grad_norm=44.6, loss_final=2.6, loss_mean=0.913, loss_mean_cls=2.11, proj_loss=-0.427][2026-03-23 13:52:50] Step: 4011, Training Logs: loss_final: 3.106273, loss_mean: 0.904697, proj_loss: -0.415535, loss_mean_cls: 2.617111, grad_norm: 4.718365 +Steps: 0%| | 4012/1000000 [16:30<71:38:17, 3.86it/s, grad_norm=4.72, loss_final=3.11, loss_mean=0.905, loss_mean_cls=2.62, proj_loss=-0.416][2026-03-23 13:52:50] Step: 4012, Training Logs: loss_final: 3.022502, loss_mean: 0.880903, proj_loss: -0.423992, loss_mean_cls: 2.565592, grad_norm: 25.241264 +Steps: 0%| | 4013/1000000 [16:31<70:20:12, 3.93it/s, grad_norm=25.2, loss_final=3.02, loss_mean=0.881, loss_mean_cls=2.57, proj_loss=-0.424][2026-03-23 13:52:51] Step: 4013, Training Logs: loss_final: 2.700427, loss_mean: 0.902753, proj_loss: -0.426070, loss_mean_cls: 2.223744, grad_norm: 10.205569 +Steps: 0%| | 4014/1000000 [16:31<69:28:16, 3.98it/s, grad_norm=10.2, loss_final=2.7, loss_mean=0.903, loss_mean_cls=2.22, proj_loss=-0.426][2026-03-23 13:52:51] Step: 4014, Training Logs: loss_final: 2.540118, loss_mean: 0.905499, proj_loss: -0.424859, loss_mean_cls: 2.059478, grad_norm: 7.016303 +Steps: 0%| | 4015/1000000 [16:31<68:50:15, 4.02it/s, grad_norm=7.02, loss_final=2.54, loss_mean=0.905, loss_mean_cls=2.06, proj_loss=-0.425][2026-03-23 13:52:51] Step: 4015, Training Logs: loss_final: 2.789302, loss_mean: 0.941495, proj_loss: -0.421274, loss_mean_cls: 2.269080, grad_norm: 5.026924 +Steps: 0%| | 4016/1000000 [16:31<68:22:44, 4.05it/s, grad_norm=5.03, loss_final=2.79, loss_mean=0.941, loss_mean_cls=2.27, proj_loss=-0.421][2026-03-23 13:52:51] Step: 4016, Training Logs: loss_final: 2.910041, loss_mean: 0.927593, proj_loss: -0.421159, loss_mean_cls: 2.403607, grad_norm: 33.577099 +Steps: 0%| | 4017/1000000 [16:32<68:05:06, 4.06it/s, grad_norm=33.6, loss_final=2.91, loss_mean=0.928, loss_mean_cls=2.4, proj_loss=-0.421][2026-03-23 13:52:52] Step: 4017, Training Logs: loss_final: 2.979797, loss_mean: 0.902095, proj_loss: -0.421398, loss_mean_cls: 2.499099, grad_norm: 25.154091 +Steps: 0%| | 4018/1000000 [16:32<67:50:59, 4.08it/s, grad_norm=25.2, loss_final=2.98, loss_mean=0.902, loss_mean_cls=2.5, proj_loss=-0.421][2026-03-23 13:52:52] Step: 4018, Training Logs: loss_final: 2.836963, loss_mean: 0.933843, proj_loss: -0.418207, loss_mean_cls: 2.321327, grad_norm: 14.264254 +Steps: 0%| | 4019/1000000 [16:32<67:41:16, 4.09it/s, grad_norm=14.3, loss_final=2.84, loss_mean=0.934, loss_mean_cls=2.32, proj_loss=-0.418][2026-03-23 13:52:52] Step: 4019, Training Logs: loss_final: 2.553509, loss_mean: 0.939278, proj_loss: -0.424535, loss_mean_cls: 2.038766, grad_norm: 3.539953 +Steps: 0%| | 4020/1000000 [16:32<67:34:34, 4.09it/s, grad_norm=3.54, loss_final=2.55, loss_mean=0.939, loss_mean_cls=2.04, proj_loss=-0.425][2026-03-23 13:52:52] Step: 4020, Training Logs: loss_final: 2.872525, loss_mean: 0.899956, proj_loss: -0.427260, loss_mean_cls: 2.399829, grad_norm: 20.980556 +Steps: 0%| | 4021/1000000 [16:33<67:31:01, 4.10it/s, 
grad_norm=21, loss_final=2.87, loss_mean=0.9, loss_mean_cls=2.4, proj_loss=-0.427][2026-03-23 13:52:53] Step: 4021, Training Logs: loss_final: 2.899143, loss_mean: 0.917305, proj_loss: -0.420651, loss_mean_cls: 2.402488, grad_norm: 35.605938 +Steps: 0%| | 4022/1000000 [16:33<67:29:40, 4.10it/s, grad_norm=35.6, loss_final=2.9, loss_mean=0.917, loss_mean_cls=2.4, proj_loss=-0.421][2026-03-23 13:52:53] Step: 4022, Training Logs: loss_final: 3.161861, loss_mean: 0.941778, proj_loss: -0.415908, loss_mean_cls: 2.635992, grad_norm: 15.787221 +Steps: 0%| | 4023/1000000 [16:33<67:28:08, 4.10it/s, grad_norm=15.8, loss_final=3.16, loss_mean=0.942, loss_mean_cls=2.64, proj_loss=-0.416][2026-03-23 13:52:53] Step: 4023, Training Logs: loss_final: 3.004438, loss_mean: 0.948020, proj_loss: -0.416951, loss_mean_cls: 2.473369, grad_norm: 4.931346 +Steps: 0%| | 4024/1000000 [16:33<67:25:38, 4.10it/s, grad_norm=4.93, loss_final=3, loss_mean=0.948, loss_mean_cls=2.47, proj_loss=-0.417][2026-03-23 13:52:53] Step: 4024, Training Logs: loss_final: 2.590528, loss_mean: 0.956133, proj_loss: -0.423632, loss_mean_cls: 2.058027, grad_norm: 13.187500 +Steps: 0%| | 4025/1000000 [16:34<67:24:10, 4.10it/s, grad_norm=13.2, loss_final=2.59, loss_mean=0.956, loss_mean_cls=2.06, proj_loss=-0.424][2026-03-23 13:52:54] Step: 4025, Training Logs: loss_final: 2.527795, loss_mean: 0.920682, proj_loss: -0.430051, loss_mean_cls: 2.037163, grad_norm: 32.780483 +Steps: 0%| | 4026/1000000 [16:34<67:23:56, 4.10it/s, grad_norm=32.8, loss_final=2.53, loss_mean=0.921, loss_mean_cls=2.04, proj_loss=-0.43][2026-03-23 13:52:54] Step: 4026, Training Logs: loss_final: 2.811991, loss_mean: 0.925113, proj_loss: -0.424852, loss_mean_cls: 2.311730, grad_norm: 12.219789 +Steps: 0%| | 4027/1000000 [16:34<67:22:48, 4.11it/s, grad_norm=12.2, loss_final=2.81, loss_mean=0.925, loss_mean_cls=2.31, proj_loss=-0.425][2026-03-23 13:52:54] Step: 4027, Training Logs: loss_final: 2.855433, loss_mean: 0.949762, proj_loss: -0.416910, loss_mean_cls: 2.322581, grad_norm: 3.749854 +Steps: 0%| | 4028/1000000 [16:34<67:22:42, 4.11it/s, grad_norm=3.75, loss_final=2.86, loss_mean=0.95, loss_mean_cls=2.32, proj_loss=-0.417][2026-03-23 13:52:54] Step: 4028, Training Logs: loss_final: 2.958272, loss_mean: 0.928824, proj_loss: -0.417010, loss_mean_cls: 2.446458, grad_norm: 6.052210 +Steps: 0%| | 4029/1000000 [16:35<67:22:35, 4.11it/s, grad_norm=6.05, loss_final=2.96, loss_mean=0.929, loss_mean_cls=2.45, proj_loss=-0.417][2026-03-23 13:52:55] Step: 4029, Training Logs: loss_final: 3.261477, loss_mean: 0.915999, proj_loss: -0.414288, loss_mean_cls: 2.759766, grad_norm: 17.046322 +Steps: 0%| | 4030/1000000 [16:35<67:21:37, 4.11it/s, grad_norm=17, loss_final=3.26, loss_mean=0.916, loss_mean_cls=2.76, proj_loss=-0.414][2026-03-23 13:52:55] Step: 4030, Training Logs: loss_final: 2.850928, loss_mean: 0.930461, proj_loss: -0.418533, loss_mean_cls: 2.339000, grad_norm: 19.255045 +Steps: 0%| | 4031/1000000 [16:35<67:26:45, 4.10it/s, grad_norm=19.3, loss_final=2.85, loss_mean=0.93, loss_mean_cls=2.34, proj_loss=-0.419][2026-03-23 13:52:55] Step: 4031, Training Logs: loss_final: 3.313066, loss_mean: 0.923637, proj_loss: -0.413918, loss_mean_cls: 2.803347, grad_norm: 18.957989 +Steps: 0%| | 4032/1000000 [16:35<67:25:17, 4.10it/s, grad_norm=19, loss_final=3.31, loss_mean=0.924, loss_mean_cls=2.8, proj_loss=-0.414][2026-03-23 13:52:55] Step: 4032, Training Logs: loss_final: 2.508507, loss_mean: 0.948053, proj_loss: -0.419978, loss_mean_cls: 1.980432, grad_norm: 2.828460 +Steps: 0%| | 
4033/1000000 [16:36<67:24:33, 4.10it/s, grad_norm=2.83, loss_final=2.51, loss_mean=0.948, loss_mean_cls=1.98, proj_loss=-0.42][2026-03-23 13:52:55] Step: 4033, Training Logs: loss_final: 2.695439, loss_mean: 0.921101, proj_loss: -0.424322, loss_mean_cls: 2.198660, grad_norm: 7.090634 +Steps: 0%| | 4034/1000000 [16:36<67:26:53, 4.10it/s, grad_norm=7.09, loss_final=2.7, loss_mean=0.921, loss_mean_cls=2.2, proj_loss=-0.424][2026-03-23 13:52:56] Step: 4034, Training Logs: loss_final: 3.492516, loss_mean: 0.922959, proj_loss: -0.415711, loss_mean_cls: 2.985267, grad_norm: 3.441670 +Steps: 0%| | 4035/1000000 [16:36<67:32:32, 4.10it/s, grad_norm=3.44, loss_final=3.49, loss_mean=0.923, loss_mean_cls=2.99, proj_loss=-0.416][2026-03-23 13:52:56] Step: 4035, Training Logs: loss_final: 2.789268, loss_mean: 0.915071, proj_loss: -0.426349, loss_mean_cls: 2.300546, grad_norm: 2.645189 +Steps: 0%| | 4036/1000000 [16:36<67:30:00, 4.10it/s, grad_norm=2.65, loss_final=2.79, loss_mean=0.915, loss_mean_cls=2.3, proj_loss=-0.426][2026-03-23 13:52:56] Step: 4036, Training Logs: loss_final: 2.701402, loss_mean: 0.938128, proj_loss: -0.419219, loss_mean_cls: 2.182492, grad_norm: 17.338608 +Steps: 0%| | 4037/1000000 [16:36<67:30:54, 4.10it/s, grad_norm=17.3, loss_final=2.7, loss_mean=0.938, loss_mean_cls=2.18, proj_loss=-0.419][2026-03-23 13:52:56] Step: 4037, Training Logs: loss_final: 2.491665, loss_mean: 0.964757, proj_loss: -0.428526, loss_mean_cls: 1.955435, grad_norm: 6.310493 +Steps: 0%| | 4038/1000000 [16:37<67:44:54, 4.08it/s, grad_norm=6.31, loss_final=2.49, loss_mean=0.965, loss_mean_cls=1.96, proj_loss=-0.429][2026-03-23 13:52:57] Step: 4038, Training Logs: loss_final: 2.681567, loss_mean: 0.939239, proj_loss: -0.423184, loss_mean_cls: 2.165512, grad_norm: 13.282598 +Steps: 0%| | 4039/1000000 [16:37<67:43:26, 4.09it/s, grad_norm=13.3, loss_final=2.68, loss_mean=0.939, loss_mean_cls=2.17, proj_loss=-0.423][2026-03-23 13:52:57] Step: 4039, Training Logs: loss_final: 2.548428, loss_mean: 0.945540, proj_loss: -0.422313, loss_mean_cls: 2.025200, grad_norm: 3.955564 +Steps: 0%| | 4040/1000000 [16:37<67:37:04, 4.09it/s, grad_norm=3.96, loss_final=2.55, loss_mean=0.946, loss_mean_cls=2.03, proj_loss=-0.422][2026-03-23 13:52:57] Step: 4040, Training Logs: loss_final: 2.284401, loss_mean: 0.940419, proj_loss: -0.425010, loss_mean_cls: 1.768992, grad_norm: 11.412564 +Steps: 0%| | 4041/1000000 [16:37<67:33:26, 4.10it/s, grad_norm=11.4, loss_final=2.28, loss_mean=0.94, loss_mean_cls=1.77, proj_loss=-0.425][2026-03-23 13:52:57] Step: 4041, Training Logs: loss_final: 2.981925, loss_mean: 0.913107, proj_loss: -0.419923, loss_mean_cls: 2.488741, grad_norm: 2.474134 +Steps: 0%| | 4042/1000000 [16:38<67:31:22, 4.10it/s, grad_norm=2.47, loss_final=2.98, loss_mean=0.913, loss_mean_cls=2.49, proj_loss=-0.42][2026-03-23 13:52:58] Step: 4042, Training Logs: loss_final: 2.527308, loss_mean: 0.926918, proj_loss: -0.427096, loss_mean_cls: 2.027486, grad_norm: 12.103065 +Steps: 0%| | 4043/1000000 [16:38<67:35:49, 4.09it/s, grad_norm=12.1, loss_final=2.53, loss_mean=0.927, loss_mean_cls=2.03, proj_loss=-0.427][2026-03-23 13:52:58] Step: 4043, Training Logs: loss_final: 2.709770, loss_mean: 0.924137, proj_loss: -0.424857, loss_mean_cls: 2.210490, grad_norm: 12.030312 +Steps: 0%| | 4044/1000000 [16:38<67:34:09, 4.09it/s, grad_norm=12, loss_final=2.71, loss_mean=0.924, loss_mean_cls=2.21, proj_loss=-0.425][2026-03-23 13:52:58] Step: 4044, Training Logs: loss_final: 2.740831, loss_mean: 0.945359, proj_loss: -0.419235, loss_mean_cls: 
2.214708, grad_norm: 13.444010 +Steps: 0%| | 4045/1000000 [16:38<67:30:55, 4.10it/s, grad_norm=13.4, loss_final=2.74, loss_mean=0.945, loss_mean_cls=2.21, proj_loss=-0.419][2026-03-23 13:52:58] Step: 4045, Training Logs: loss_final: 2.511228, loss_mean: 0.956731, proj_loss: -0.423019, loss_mean_cls: 1.977517, grad_norm: 19.845539 +Steps: 0%| | 4046/1000000 [16:39<67:28:58, 4.10it/s, grad_norm=19.8, loss_final=2.51, loss_mean=0.957, loss_mean_cls=1.98, proj_loss=-0.423][2026-03-23 13:52:59] Step: 4046, Training Logs: loss_final: 3.097543, loss_mean: 0.917270, proj_loss: -0.415133, loss_mean_cls: 2.595405, grad_norm: 25.903767 +Steps: 0%| | 4047/1000000 [16:39<67:46:24, 4.08it/s, grad_norm=25.9, loss_final=3.1, loss_mean=0.917, loss_mean_cls=2.6, proj_loss=-0.415][2026-03-23 13:52:59] Step: 4047, Training Logs: loss_final: 2.453354, loss_mean: 0.968925, proj_loss: -0.425776, loss_mean_cls: 1.910205, grad_norm: 17.589775 +Steps: 0%| | 4048/1000000 [16:39<67:39:31, 4.09it/s, grad_norm=17.6, loss_final=2.45, loss_mean=0.969, loss_mean_cls=1.91, proj_loss=-0.426][2026-03-23 13:52:59] Step: 4048, Training Logs: loss_final: 3.066167, loss_mean: 0.922721, proj_loss: -0.407248, loss_mean_cls: 2.550694, grad_norm: 8.393085 +Steps: 0%| | 4049/1000000 [16:39<67:32:52, 4.10it/s, grad_norm=8.39, loss_final=3.07, loss_mean=0.923, loss_mean_cls=2.55, proj_loss=-0.407][2026-03-23 13:52:59] Step: 4049, Training Logs: loss_final: 2.468053, loss_mean: 0.942545, proj_loss: -0.425055, loss_mean_cls: 1.950563, grad_norm: 8.323102 +Steps: 0%| | 4050/1000000 [16:40<67:29:08, 4.10it/s, grad_norm=8.32, loss_final=2.47, loss_mean=0.943, loss_mean_cls=1.95, proj_loss=-0.425][2026-03-23 13:53:00] Step: 4050, Training Logs: loss_final: 2.738451, loss_mean: 0.924908, proj_loss: -0.422034, loss_mean_cls: 2.235577, grad_norm: 30.484367 +Steps: 0%| | 4051/1000000 [16:40<67:26:19, 4.10it/s, grad_norm=30.5, loss_final=2.74, loss_mean=0.925, loss_mean_cls=2.24, proj_loss=-0.422][2026-03-23 13:53:00] Step: 4051, Training Logs: loss_final: 2.714587, loss_mean: 0.926961, proj_loss: -0.419948, loss_mean_cls: 2.207574, grad_norm: 16.542208 +Steps: 0%| | 4052/1000000 [16:40<67:24:30, 4.10it/s, grad_norm=16.5, loss_final=2.71, loss_mean=0.927, loss_mean_cls=2.21, proj_loss=-0.42][2026-03-23 13:53:00] Step: 4052, Training Logs: loss_final: 3.233740, loss_mean: 0.919656, proj_loss: -0.412191, loss_mean_cls: 2.726275, grad_norm: 5.597503 +Steps: 0%| | 4053/1000000 [16:40<67:23:58, 4.10it/s, grad_norm=5.6, loss_final=3.23, loss_mean=0.92, loss_mean_cls=2.73, proj_loss=-0.412][2026-03-23 13:53:00] Step: 4053, Training Logs: loss_final: 3.193234, loss_mean: 0.905633, proj_loss: -0.415107, loss_mean_cls: 2.702709, grad_norm: 31.226171 +Steps: 0%| | 4054/1000000 [16:41<67:24:30, 4.10it/s, grad_norm=31.2, loss_final=3.19, loss_mean=0.906, loss_mean_cls=2.7, proj_loss=-0.415][2026-03-23 13:53:01] Step: 4054, Training Logs: loss_final: 2.397547, loss_mean: 0.935674, proj_loss: -0.431617, loss_mean_cls: 1.893489, grad_norm: 26.701458 +Steps: 0%| | 4055/1000000 [16:41<67:24:32, 4.10it/s, grad_norm=26.7, loss_final=2.4, loss_mean=0.936, loss_mean_cls=1.89, proj_loss=-0.432][2026-03-23 13:53:01] Step: 4055, Training Logs: loss_final: 3.420787, loss_mean: 0.906648, proj_loss: -0.404268, loss_mean_cls: 2.918407, grad_norm: 13.826122 +Steps: 0%| | 4056/1000000 [16:41<67:24:30, 4.10it/s, grad_norm=13.8, loss_final=3.42, loss_mean=0.907, loss_mean_cls=2.92, proj_loss=-0.404][2026-03-23 13:53:01] Step: 4056, Training Logs: loss_final: 2.654671, loss_mean: 
0.918874, proj_loss: -0.419209, loss_mean_cls: 2.155005, grad_norm: 2.238829 +Steps: 0%| | 4057/1000000 [16:41<67:24:45, 4.10it/s, grad_norm=2.24, loss_final=2.65, loss_mean=0.919, loss_mean_cls=2.16, proj_loss=-0.419][2026-03-23 13:53:01] Step: 4057, Training Logs: loss_final: 2.504576, loss_mean: 0.913547, proj_loss: -0.419477, loss_mean_cls: 2.010505, grad_norm: 8.509248 +Steps: 0%| | 4058/1000000 [16:42<67:23:59, 4.10it/s, grad_norm=8.51, loss_final=2.5, loss_mean=0.914, loss_mean_cls=2.01, proj_loss=-0.419][2026-03-23 13:53:02] Step: 4058, Training Logs: loss_final: 2.473988, loss_mean: 0.941487, proj_loss: -0.428492, loss_mean_cls: 1.960993, grad_norm: 3.326202 +Steps: 0%| | 4059/1000000 [16:42<67:31:33, 4.10it/s, grad_norm=3.33, loss_final=2.47, loss_mean=0.941, loss_mean_cls=1.96, proj_loss=-0.428][2026-03-23 13:53:02] Step: 4059, Training Logs: loss_final: 2.613485, loss_mean: 0.917676, proj_loss: -0.426618, loss_mean_cls: 2.122427, grad_norm: 11.749078 +Steps: 0%| | 4060/1000000 [16:42<69:33:09, 3.98it/s, grad_norm=11.7, loss_final=2.61, loss_mean=0.918, loss_mean_cls=2.12, proj_loss=-0.427][2026-03-23 13:53:02] Step: 4060, Training Logs: loss_final: 2.672493, loss_mean: 0.930642, proj_loss: -0.426602, loss_mean_cls: 2.168453, grad_norm: 19.363079 +Steps: 0%| | 4061/1000000 [16:42<68:53:58, 4.02it/s, grad_norm=19.4, loss_final=2.67, loss_mean=0.931, loss_mean_cls=2.17, proj_loss=-0.427][2026-03-23 13:53:02] Step: 4061, Training Logs: loss_final: 2.928612, loss_mean: 0.921677, proj_loss: -0.415773, loss_mean_cls: 2.422708, grad_norm: 19.200951 +Steps: 0%| | 4062/1000000 [16:43<68:29:56, 4.04it/s, grad_norm=19.2, loss_final=2.93, loss_mean=0.922, loss_mean_cls=2.42, proj_loss=-0.416][2026-03-23 13:53:03] Step: 4062, Training Logs: loss_final: 2.288478, loss_mean: 0.950092, proj_loss: -0.429628, loss_mean_cls: 1.768013, grad_norm: 5.353250 +Steps: 0%| | 4063/1000000 [16:43<68:07:23, 4.06it/s, grad_norm=5.35, loss_final=2.29, loss_mean=0.95, loss_mean_cls=1.77, proj_loss=-0.43][2026-03-23 13:53:03] Step: 4063, Training Logs: loss_final: 2.406674, loss_mean: 0.937993, proj_loss: -0.432065, loss_mean_cls: 1.900747, grad_norm: 34.728138 +Steps: 0%| | 4064/1000000 [16:43<67:52:48, 4.08it/s, grad_norm=34.7, loss_final=2.41, loss_mean=0.938, loss_mean_cls=1.9, proj_loss=-0.432][2026-03-23 13:53:03] Step: 4064, Training Logs: loss_final: 3.051408, loss_mean: 0.918222, proj_loss: -0.416854, loss_mean_cls: 2.550040, grad_norm: 15.718217 +Steps: 0%| | 4065/1000000 [16:43<67:43:46, 4.08it/s, grad_norm=15.7, loss_final=3.05, loss_mean=0.918, loss_mean_cls=2.55, proj_loss=-0.417][2026-03-23 13:53:03] Step: 4065, Training Logs: loss_final: 2.658196, loss_mean: 0.927173, proj_loss: -0.416638, loss_mean_cls: 2.147661, grad_norm: 10.308114 +Steps: 0%| | 4066/1000000 [16:44<67:39:53, 4.09it/s, grad_norm=10.3, loss_final=2.66, loss_mean=0.927, loss_mean_cls=2.15, proj_loss=-0.417][2026-03-23 13:53:04] Step: 4066, Training Logs: loss_final: 2.715492, loss_mean: 0.927336, proj_loss: -0.420421, loss_mean_cls: 2.208577, grad_norm: 11.350438 +Steps: 0%| | 4067/1000000 [16:44<67:33:32, 4.09it/s, grad_norm=11.4, loss_final=2.72, loss_mean=0.927, loss_mean_cls=2.21, proj_loss=-0.42][2026-03-23 13:53:04] Step: 4067, Training Logs: loss_final: 2.852710, loss_mean: 0.928190, proj_loss: -0.417152, loss_mean_cls: 2.341671, grad_norm: 14.029865 +Steps: 0%| | 4068/1000000 [16:44<67:55:42, 4.07it/s, grad_norm=14, loss_final=2.85, loss_mean=0.928, loss_mean_cls=2.34, proj_loss=-0.417][2026-03-23 13:53:04] Step: 4068, 
Training Logs: loss_final: 2.777680, loss_mean: 0.926467, proj_loss: -0.422487, loss_mean_cls: 2.273701, grad_norm: 2.469018 +Steps: 0%| | 4069/1000000 [16:44<67:45:39, 4.08it/s, grad_norm=2.47, loss_final=2.78, loss_mean=0.926, loss_mean_cls=2.27, proj_loss=-0.422][2026-03-23 13:53:04] Step: 4069, Training Logs: loss_final: 2.230584, loss_mean: 0.928464, proj_loss: -0.434726, loss_mean_cls: 1.736845, grad_norm: 14.337918 +Steps: 0%| | 4070/1000000 [16:45<67:41:33, 4.09it/s, grad_norm=14.3, loss_final=2.23, loss_mean=0.928, loss_mean_cls=1.74, proj_loss=-0.435][2026-03-23 13:53:05] Step: 4070, Training Logs: loss_final: 2.725394, loss_mean: 0.957267, proj_loss: -0.417929, loss_mean_cls: 2.186056, grad_norm: 12.109859 +Steps: 0%| | 4071/1000000 [16:45<67:42:59, 4.09it/s, grad_norm=12.1, loss_final=2.73, loss_mean=0.957, loss_mean_cls=2.19, proj_loss=-0.418][2026-03-23 13:53:05] Step: 4071, Training Logs: loss_final: 2.641996, loss_mean: 0.966358, proj_loss: -0.410234, loss_mean_cls: 2.085873, grad_norm: 11.312420 +Steps: 0%| | 4072/1000000 [16:45<67:34:31, 4.09it/s, grad_norm=11.3, loss_final=2.64, loss_mean=0.966, loss_mean_cls=2.09, proj_loss=-0.41][2026-03-23 13:53:05] Step: 4072, Training Logs: loss_final: 2.387077, loss_mean: 0.971272, proj_loss: -0.414916, loss_mean_cls: 1.830721, grad_norm: 13.016992 +Steps: 0%| | 4073/1000000 [16:45<67:30:16, 4.10it/s, grad_norm=13, loss_final=2.39, loss_mean=0.971, loss_mean_cls=1.83, proj_loss=-0.415][2026-03-23 13:53:05] Step: 4073, Training Logs: loss_final: 2.726682, loss_mean: 0.930873, proj_loss: -0.418137, loss_mean_cls: 2.213946, grad_norm: 4.341223 +Steps: 0%| | 4074/1000000 [16:46<67:26:57, 4.10it/s, grad_norm=4.34, loss_final=2.73, loss_mean=0.931, loss_mean_cls=2.21, proj_loss=-0.418][2026-03-23 13:53:06] Step: 4074, Training Logs: loss_final: 2.907836, loss_mean: 0.935151, proj_loss: -0.423230, loss_mean_cls: 2.395916, grad_norm: 11.944712 +Steps: 0%| | 4075/1000000 [16:46<67:31:52, 4.10it/s, grad_norm=11.9, loss_final=2.91, loss_mean=0.935, loss_mean_cls=2.4, proj_loss=-0.423][2026-03-23 13:53:06] Step: 4075, Training Logs: loss_final: 2.964316, loss_mean: 0.918921, proj_loss: -0.419170, loss_mean_cls: 2.464565, grad_norm: 10.456450 +Steps: 0%| | 4076/1000000 [16:46<67:28:26, 4.10it/s, grad_norm=10.5, loss_final=2.96, loss_mean=0.919, loss_mean_cls=2.46, proj_loss=-0.419][2026-03-23 13:53:06] Step: 4076, Training Logs: loss_final: 2.823286, loss_mean: 0.943395, proj_loss: -0.423907, loss_mean_cls: 2.303798, grad_norm: 14.652585 +Steps: 0%| | 4077/1000000 [16:46<67:24:55, 4.10it/s, grad_norm=14.7, loss_final=2.82, loss_mean=0.943, loss_mean_cls=2.3, proj_loss=-0.424][2026-03-23 13:53:06] Step: 4077, Training Logs: loss_final: 2.755697, loss_mean: 0.939074, proj_loss: -0.418002, loss_mean_cls: 2.234625, grad_norm: 3.695538 +Steps: 0%| | 4078/1000000 [16:47<67:27:45, 4.10it/s, grad_norm=3.7, loss_final=2.76, loss_mean=0.939, loss_mean_cls=2.23, proj_loss=-0.418][2026-03-23 13:53:06] Step: 4078, Training Logs: loss_final: 2.989605, loss_mean: 0.909610, proj_loss: -0.417207, loss_mean_cls: 2.497202, grad_norm: 24.240194 +Steps: 0%| | 4079/1000000 [16:47<67:30:23, 4.10it/s, grad_norm=24.2, loss_final=2.99, loss_mean=0.91, loss_mean_cls=2.5, proj_loss=-0.417][2026-03-23 13:53:07] Step: 4079, Training Logs: loss_final: 2.593936, loss_mean: 0.943559, proj_loss: -0.419653, loss_mean_cls: 2.070030, grad_norm: 14.440164 +Steps: 0%| | 4080/1000000 [16:47<67:27:37, 4.10it/s, grad_norm=14.4, loss_final=2.59, loss_mean=0.944, loss_mean_cls=2.07, 
proj_loss=-0.42][2026-03-23 13:53:07] Step: 4080, Training Logs: loss_final: 2.599518, loss_mean: 0.923684, proj_loss: -0.426314, loss_mean_cls: 2.102148, grad_norm: 10.680936 +Steps: 0%| | 4081/1000000 [16:47<67:26:18, 4.10it/s, grad_norm=10.7, loss_final=2.6, loss_mean=0.924, loss_mean_cls=2.1, proj_loss=-0.426][2026-03-23 13:53:07] Step: 4081, Training Logs: loss_final: 2.875348, loss_mean: 0.939935, proj_loss: -0.415533, loss_mean_cls: 2.350946, grad_norm: 19.272961 +Steps: 0%| | 4082/1000000 [16:47<67:27:44, 4.10it/s, grad_norm=19.3, loss_final=2.88, loss_mean=0.94, loss_mean_cls=2.35, proj_loss=-0.416][2026-03-23 13:53:07] Step: 4082, Training Logs: loss_final: 2.160018, loss_mean: 0.949315, proj_loss: -0.428771, loss_mean_cls: 1.639474, grad_norm: 13.121377 +Steps: 0%| | 4083/1000000 [16:48<67:31:38, 4.10it/s, grad_norm=13.1, loss_final=2.16, loss_mean=0.949, loss_mean_cls=1.64, proj_loss=-0.429][2026-03-23 13:53:08] Step: 4083, Training Logs: loss_final: 2.893743, loss_mean: 0.930072, proj_loss: -0.414797, loss_mean_cls: 2.378467, grad_norm: 9.563332 +Steps: 0%| | 4084/1000000 [16:48<67:27:34, 4.10it/s, grad_norm=9.56, loss_final=2.89, loss_mean=0.93, loss_mean_cls=2.38, proj_loss=-0.415][2026-03-23 13:53:08] Step: 4084, Training Logs: loss_final: 2.767834, loss_mean: 0.923531, proj_loss: -0.427411, loss_mean_cls: 2.271714, grad_norm: 22.818947 +Steps: 0%| | 4085/1000000 [16:48<67:25:17, 4.10it/s, grad_norm=22.8, loss_final=2.77, loss_mean=0.924, loss_mean_cls=2.27, proj_loss=-0.427][2026-03-23 13:53:08] Step: 4085, Training Logs: loss_final: 2.822771, loss_mean: 0.913558, proj_loss: -0.421766, loss_mean_cls: 2.330979, grad_norm: 17.760265 +Steps: 0%| | 4086/1000000 [16:48<67:25:17, 4.10it/s, grad_norm=17.8, loss_final=2.82, loss_mean=0.914, loss_mean_cls=2.33, proj_loss=-0.422][2026-03-23 13:53:08] Step: 4086, Training Logs: loss_final: 2.873141, loss_mean: 0.912137, proj_loss: -0.421662, loss_mean_cls: 2.382666, grad_norm: 11.294894 +Steps: 0%| | 4087/1000000 [16:49<67:32:30, 4.10it/s, grad_norm=11.3, loss_final=2.87, loss_mean=0.912, loss_mean_cls=2.38, proj_loss=-0.422][2026-03-23 13:53:09] Step: 4087, Training Logs: loss_final: 2.692967, loss_mean: 0.918297, proj_loss: -0.415760, loss_mean_cls: 2.190430, grad_norm: 31.104496 +Steps: 0%| | 4088/1000000 [16:49<67:28:21, 4.10it/s, grad_norm=31.1, loss_final=2.69, loss_mean=0.918, loss_mean_cls=2.19, proj_loss=-0.416][2026-03-23 13:53:09] Step: 4088, Training Logs: loss_final: 2.299913, loss_mean: 0.911169, proj_loss: -0.429985, loss_mean_cls: 1.818729, grad_norm: 20.415237 +Steps: 0%| | 4089/1000000 [16:49<67:26:23, 4.10it/s, grad_norm=20.4, loss_final=2.3, loss_mean=0.911, loss_mean_cls=1.82, proj_loss=-0.43][2026-03-23 13:53:09] Step: 4089, Training Logs: loss_final: 2.880081, loss_mean: 0.946060, proj_loss: -0.427784, loss_mean_cls: 2.361805, grad_norm: 8.099217 +Steps: 0%| | 4090/1000000 [16:49<67:28:46, 4.10it/s, grad_norm=8.1, loss_final=2.88, loss_mean=0.946, loss_mean_cls=2.36, proj_loss=-0.428][2026-03-23 13:53:09] Step: 4090, Training Logs: loss_final: 2.307497, loss_mean: 0.914367, proj_loss: -0.432312, loss_mean_cls: 1.825441, grad_norm: 8.572067 +Steps: 0%| | 4091/1000000 [16:50<67:32:46, 4.10it/s, grad_norm=8.57, loss_final=2.31, loss_mean=0.914, loss_mean_cls=1.83, proj_loss=-0.432][2026-03-23 13:53:10] Step: 4091, Training Logs: loss_final: 2.389761, loss_mean: 0.922051, proj_loss: -0.426848, loss_mean_cls: 1.894559, grad_norm: 4.844142 +Steps: 0%| | 4092/1000000 [16:50<67:28:29, 4.10it/s, grad_norm=4.84, 
loss_final=2.39, loss_mean=0.922, loss_mean_cls=1.89, proj_loss=-0.427][2026-03-23 13:53:10] Step: 4092, Training Logs: loss_final: 2.491808, loss_mean: 0.923053, proj_loss: -0.427691, loss_mean_cls: 1.996447, grad_norm: 3.754297 +Steps: 0%| | 4093/1000000 [16:50<67:26:11, 4.10it/s, grad_norm=3.75, loss_final=2.49, loss_mean=0.923, loss_mean_cls=2, proj_loss=-0.428][2026-03-23 13:53:10] Step: 4093, Training Logs: loss_final: 2.808933, loss_mean: 0.913566, proj_loss: -0.418029, loss_mean_cls: 2.313396, grad_norm: 17.716557 +Steps: 0%| | 4094/1000000 [16:50<67:27:56, 4.10it/s, grad_norm=17.7, loss_final=2.81, loss_mean=0.914, loss_mean_cls=2.31, proj_loss=-0.418][2026-03-23 13:53:10] Step: 4094, Training Logs: loss_final: 2.775151, loss_mean: 0.943774, proj_loss: -0.420430, loss_mean_cls: 2.251807, grad_norm: 13.575731 +Steps: 0%| | 4095/1000000 [16:51<67:32:32, 4.10it/s, grad_norm=13.6, loss_final=2.78, loss_mean=0.944, loss_mean_cls=2.25, proj_loss=-0.42][2026-03-23 13:53:11] Step: 4095, Training Logs: loss_final: 2.435809, loss_mean: 0.950915, proj_loss: -0.421537, loss_mean_cls: 1.906430, grad_norm: 3.096792 +Steps: 0%| | 4096/1000000 [16:51<67:28:42, 4.10it/s, grad_norm=3.1, loss_final=2.44, loss_mean=0.951, loss_mean_cls=1.91, proj_loss=-0.422][2026-03-23 13:53:11] Step: 4096, Training Logs: loss_final: 2.151361, loss_mean: 0.935417, proj_loss: -0.425334, loss_mean_cls: 1.641277, grad_norm: 4.372847 +Steps: 0%| | 4097/1000000 [16:51<67:26:06, 4.10it/s, grad_norm=4.37, loss_final=2.15, loss_mean=0.935, loss_mean_cls=1.64, proj_loss=-0.425][2026-03-23 13:53:11] Step: 4097, Training Logs: loss_final: 2.444160, loss_mean: 0.922578, proj_loss: -0.434425, loss_mean_cls: 1.956006, grad_norm: 10.164580 +Steps: 0%| | 4098/1000000 [16:51<67:23:14, 4.11it/s, grad_norm=10.2, loss_final=2.44, loss_mean=0.923, loss_mean_cls=1.96, proj_loss=-0.434][2026-03-23 13:53:11] Step: 4098, Training Logs: loss_final: 2.499529, loss_mean: 0.912102, proj_loss: -0.422279, loss_mean_cls: 2.009706, grad_norm: 12.386970 +Steps: 0%| | 4099/1000000 [16:52<67:27:07, 4.10it/s, grad_norm=12.4, loss_final=2.5, loss_mean=0.912, loss_mean_cls=2.01, proj_loss=-0.422][2026-03-23 13:53:12] Step: 4099, Training Logs: loss_final: 2.708823, loss_mean: 0.931077, proj_loss: -0.418606, loss_mean_cls: 2.196352, grad_norm: 8.600847 +Steps: 0%| | 4100/1000000 [16:52<67:26:17, 4.10it/s, grad_norm=8.6, loss_final=2.71, loss_mean=0.931, loss_mean_cls=2.2, proj_loss=-0.419][2026-03-23 13:53:12] Step: 4100, Training Logs: loss_final: 2.826903, loss_mean: 0.930605, proj_loss: -0.420340, loss_mean_cls: 2.316638, grad_norm: 16.953775 +Steps: 0%| | 4101/1000000 [16:52<67:24:35, 4.10it/s, grad_norm=17, loss_final=2.83, loss_mean=0.931, loss_mean_cls=2.32, proj_loss=-0.42][2026-03-23 13:53:12] Step: 4101, Training Logs: loss_final: 2.534966, loss_mean: 0.927596, proj_loss: -0.417588, loss_mean_cls: 2.024958, grad_norm: 3.045487 +Steps: 0%| | 4102/1000000 [16:52<67:22:04, 4.11it/s, grad_norm=3.05, loss_final=2.53, loss_mean=0.928, loss_mean_cls=2.02, proj_loss=-0.418][2026-03-23 13:53:12] Step: 4102, Training Logs: loss_final: 2.331254, loss_mean: 0.918108, proj_loss: -0.431812, loss_mean_cls: 1.844957, grad_norm: 3.989932 +Steps: 0%| | 4103/1000000 [16:53<67:22:42, 4.11it/s, grad_norm=3.99, loss_final=2.33, loss_mean=0.918, loss_mean_cls=1.84, proj_loss=-0.432][2026-03-23 13:53:13] Step: 4103, Training Logs: loss_final: 2.861473, loss_mean: 0.902448, proj_loss: -0.421022, loss_mean_cls: 2.380047, grad_norm: 15.173331 +Steps: 0%| | 4104/1000000 
[16:53<67:21:40, 4.11it/s, grad_norm=15.2, loss_final=2.86, loss_mean=0.902, loss_mean_cls=2.38, proj_loss=-0.421][2026-03-23 13:53:13] Step: 4104, Training Logs: loss_final: 2.682130, loss_mean: 0.923588, proj_loss: -0.418952, loss_mean_cls: 2.177494, grad_norm: 11.117822 +Steps: 0%| | 4105/1000000 [16:53<67:22:16, 4.11it/s, grad_norm=11.1, loss_final=2.68, loss_mean=0.924, loss_mean_cls=2.18, proj_loss=-0.419][2026-03-23 13:53:13] Step: 4105, Training Logs: loss_final: 2.711552, loss_mean: 0.933533, proj_loss: -0.424201, loss_mean_cls: 2.202221, grad_norm: 32.216995 +Steps: 0%| | 4106/1000000 [16:53<67:21:46, 4.11it/s, grad_norm=32.2, loss_final=2.71, loss_mean=0.934, loss_mean_cls=2.2, proj_loss=-0.424][2026-03-23 13:53:13] Step: 4106, Training Logs: loss_final: 3.021182, loss_mean: 0.913824, proj_loss: -0.412823, loss_mean_cls: 2.520180, grad_norm: 18.153786 +Steps: 0%| | 4107/1000000 [16:54<67:21:46, 4.11it/s, grad_norm=18.2, loss_final=3.02, loss_mean=0.914, loss_mean_cls=2.52, proj_loss=-0.413][2026-03-23 13:53:14] Step: 4107, Training Logs: loss_final: 2.333815, loss_mean: 0.950075, proj_loss: -0.423308, loss_mean_cls: 1.807048, grad_norm: 11.435022 +Steps: 0%| | 4108/1000000 [16:54<67:20:38, 4.11it/s, grad_norm=11.4, loss_final=2.33, loss_mean=0.95, loss_mean_cls=1.81, proj_loss=-0.423][2026-03-23 13:53:14] Step: 4108, Training Logs: loss_final: 2.630759, loss_mean: 0.933838, proj_loss: -0.418551, loss_mean_cls: 2.115471, grad_norm: 7.395375 +Steps: 0%| | 4109/1000000 [16:54<67:20:14, 4.11it/s, grad_norm=7.4, loss_final=2.63, loss_mean=0.934, loss_mean_cls=2.12, proj_loss=-0.419][2026-03-23 13:53:14] Step: 4109, Training Logs: loss_final: 2.971207, loss_mean: 0.913808, proj_loss: -0.420841, loss_mean_cls: 2.478240, grad_norm: 8.553583 +Steps: 0%| | 4110/1000000 [16:54<67:20:05, 4.11it/s, grad_norm=8.55, loss_final=2.97, loss_mean=0.914, loss_mean_cls=2.48, proj_loss=-0.421][2026-03-23 13:53:14] Step: 4110, Training Logs: loss_final: 2.975392, loss_mean: 0.917355, proj_loss: -0.417270, loss_mean_cls: 2.475307, grad_norm: 1.959249 +Steps: 0%| | 4111/1000000 [16:55<67:18:17, 4.11it/s, grad_norm=1.96, loss_final=2.98, loss_mean=0.917, loss_mean_cls=2.48, proj_loss=-0.417][2026-03-23 13:53:15] Step: 4111, Training Logs: loss_final: 2.882960, loss_mean: 0.919941, proj_loss: -0.417962, loss_mean_cls: 2.380982, grad_norm: 4.437959 +Steps: 0%| | 4112/1000000 [16:55<67:19:12, 4.11it/s, grad_norm=4.44, loss_final=2.88, loss_mean=0.92, loss_mean_cls=2.38, proj_loss=-0.418][2026-03-23 13:53:15] Step: 4112, Training Logs: loss_final: 2.328649, loss_mean: 0.911757, proj_loss: -0.428606, loss_mean_cls: 1.845498, grad_norm: 19.702730 +Steps: 0%| | 4113/1000000 [16:55<67:23:25, 4.10it/s, grad_norm=19.7, loss_final=2.33, loss_mean=0.912, loss_mean_cls=1.85, proj_loss=-0.429][2026-03-23 13:53:15] Step: 4113, Training Logs: loss_final: 2.549099, loss_mean: 0.929147, proj_loss: -0.421733, loss_mean_cls: 2.041685, grad_norm: 9.759535 +Steps: 0%| | 4114/1000000 [16:55<67:22:52, 4.11it/s, grad_norm=9.76, loss_final=2.55, loss_mean=0.929, loss_mean_cls=2.04, proj_loss=-0.422][2026-03-23 13:53:15] Step: 4114, Training Logs: loss_final: 3.190872, loss_mean: 0.929916, proj_loss: -0.404731, loss_mean_cls: 2.665686, grad_norm: 5.336949 +Steps: 0%| | 4115/1000000 [16:56<67:22:26, 4.11it/s, grad_norm=5.34, loss_final=3.19, loss_mean=0.93, loss_mean_cls=2.67, proj_loss=-0.405][2026-03-23 13:53:16] Step: 4115, Training Logs: loss_final: 2.426098, loss_mean: 0.961117, proj_loss: -0.423934, loss_mean_cls: 1.888914, 
grad_norm: 13.509417
[training-log excerpt condensed — steps 4116–4364 of 1,000,000 all repeat the same per-step pattern at ~4.1 it/s (≈67 h remaining); representative lines:]
+Steps: 0%| | 4116/1000000 [16:56<67:21:33, 4.11it/s, grad_norm=13.5, loss_final=2.43, loss_mean=0.961, loss_mean_cls=1.89, proj_loss=-0.424][2026-03-23 13:53:16] Step: 4116, Training Logs: loss_final: 2.842088, loss_mean: 0.909368, proj_loss: -0.420146, loss_mean_cls: 2.352867, grad_norm: 14.855431
+Steps: 0%| | 4200/1000000 [17:16<67:28:34, 4.10it/s, grad_norm=5.68, loss_final=2.59, loss_mean=0.91, loss_mean_cls=2.1, proj_loss=-0.424][2026-03-23 13:53:36] Step: 4200, Training Logs: loss_final: 2.440839, loss_mean: 0.938658, proj_loss: -0.423398, loss_mean_cls: 1.925579, grad_norm: 3.809152
...
+Steps: 0%| | 4364/1000000 [17:56<67:26:51, 4.10it/s, grad_norm=9.97,
loss_final=2.6, loss_mean=0.92, loss_mean_cls=2.1, proj_loss=-0.422][2026-03-23 13:54:16] Step: 4364, Training Logs: loss_final: 2.470693, loss_mean: 0.920469, proj_loss: -0.425341, loss_mean_cls: 1.975565, grad_norm: 19.717392 +Steps: 0%| | 4365/1000000 [17:57<67:28:04, 4.10it/s, grad_norm=19.7, loss_final=2.47, loss_mean=0.92, loss_mean_cls=1.98, proj_loss=-0.425][2026-03-23 13:54:16] Step: 4365, Training Logs: loss_final: 3.117460, loss_mean: 0.896226, proj_loss: -0.420891, loss_mean_cls: 2.642125, grad_norm: 30.924534 +Steps: 0%| | 4366/1000000 [17:57<67:26:50, 4.10it/s, grad_norm=30.9, loss_final=3.12, loss_mean=0.896, loss_mean_cls=2.64, proj_loss=-0.421][2026-03-23 13:54:17] Step: 4366, Training Logs: loss_final: 2.992748, loss_mean: 0.914495, proj_loss: -0.414286, loss_mean_cls: 2.492539, grad_norm: 18.149017 +Steps: 0%| | 4367/1000000 [17:57<67:26:44, 4.10it/s, grad_norm=18.1, loss_final=2.99, loss_mean=0.914, loss_mean_cls=2.49, proj_loss=-0.414][2026-03-23 13:54:17] Step: 4367, Training Logs: loss_final: 2.518065, loss_mean: 0.913216, proj_loss: -0.429724, loss_mean_cls: 2.034573, grad_norm: 10.826698 +Steps: 0%| | 4368/1000000 [17:57<67:26:45, 4.10it/s, grad_norm=10.8, loss_final=2.52, loss_mean=0.913, loss_mean_cls=2.03, proj_loss=-0.43][2026-03-23 13:54:17] Step: 4368, Training Logs: loss_final: 2.160945, loss_mean: 0.940721, proj_loss: -0.430355, loss_mean_cls: 1.650578, grad_norm: 1.746491 +Steps: 0%| | 4369/1000000 [17:58<67:28:02, 4.10it/s, grad_norm=1.75, loss_final=2.16, loss_mean=0.941, loss_mean_cls=1.65, proj_loss=-0.43][2026-03-23 13:54:17] Step: 4369, Training Logs: loss_final: 2.488687, loss_mean: 0.934984, proj_loss: -0.426211, loss_mean_cls: 1.979914, grad_norm: 14.824599 +Steps: 0%| | 4370/1000000 [17:58<67:26:28, 4.10it/s, grad_norm=14.8, loss_final=2.49, loss_mean=0.935, loss_mean_cls=1.98, proj_loss=-0.426][2026-03-23 13:54:18] Step: 4370, Training Logs: loss_final: 2.748622, loss_mean: 0.923983, proj_loss: -0.426511, loss_mean_cls: 2.251150, grad_norm: 29.929401 +Steps: 0%| | 4371/1000000 [17:58<67:27:58, 4.10it/s, grad_norm=29.9, loss_final=2.75, loss_mean=0.924, loss_mean_cls=2.25, proj_loss=-0.427][2026-03-23 13:54:18] Step: 4371, Training Logs: loss_final: 3.012272, loss_mean: 0.940920, proj_loss: -0.414883, loss_mean_cls: 2.486235, grad_norm: 20.043829 +Steps: 0%| | 4372/1000000 [17:58<67:27:32, 4.10it/s, grad_norm=20, loss_final=3.01, loss_mean=0.941, loss_mean_cls=2.49, proj_loss=-0.415][2026-03-23 13:54:18] Step: 4372, Training Logs: loss_final: 3.325228, loss_mean: 0.927822, proj_loss: -0.403235, loss_mean_cls: 2.800641, grad_norm: 7.997041 +Steps: 0%| | 4373/1000000 [17:58<67:26:40, 4.10it/s, grad_norm=8, loss_final=3.33, loss_mean=0.928, loss_mean_cls=2.8, proj_loss=-0.403][2026-03-23 13:54:18] Step: 4373, Training Logs: loss_final: 2.998803, loss_mean: 0.932038, proj_loss: -0.404667, loss_mean_cls: 2.471432, grad_norm: 23.727030 +Steps: 0%| | 4374/1000000 [17:59<67:28:11, 4.10it/s, grad_norm=23.7, loss_final=3, loss_mean=0.932, loss_mean_cls=2.47, proj_loss=-0.405][2026-03-23 13:54:19] Step: 4374, Training Logs: loss_final: 2.607168, loss_mean: 0.943426, proj_loss: -0.415889, loss_mean_cls: 2.079632, grad_norm: 15.085101 +Steps: 0%| | 4375/1000000 [17:59<67:26:30, 4.10it/s, grad_norm=15.1, loss_final=2.61, loss_mean=0.943, loss_mean_cls=2.08, proj_loss=-0.416][2026-03-23 13:54:19] Step: 4375, Training Logs: loss_final: 3.224424, loss_mean: 0.906820, proj_loss: -0.410114, loss_mean_cls: 2.727719, grad_norm: 14.800237 +Steps: 0%| | 4376/1000000 
[17:59<67:27:57, 4.10it/s, grad_norm=14.8, loss_final=3.22, loss_mean=0.907, loss_mean_cls=2.73, proj_loss=-0.41][2026-03-23 13:54:19] Step: 4376, Training Logs: loss_final: 3.015055, loss_mean: 0.915298, proj_loss: -0.419227, loss_mean_cls: 2.518984, grad_norm: 15.498006 +Steps: 0%| | 4377/1000000 [17:59<67:28:42, 4.10it/s, grad_norm=15.5, loss_final=3.02, loss_mean=0.915, loss_mean_cls=2.52, proj_loss=-0.419][2026-03-23 13:54:19] Step: 4377, Training Logs: loss_final: 3.084395, loss_mean: 0.927916, proj_loss: -0.412091, loss_mean_cls: 2.568570, grad_norm: 13.136971 +Steps: 0%| | 4378/1000000 [18:00<67:28:02, 4.10it/s, grad_norm=13.1, loss_final=3.08, loss_mean=0.928, loss_mean_cls=2.57, proj_loss=-0.412][2026-03-23 13:54:20] Step: 4378, Training Logs: loss_final: 2.664351, loss_mean: 0.946533, proj_loss: -0.423022, loss_mean_cls: 2.140840, grad_norm: 4.728772 +Steps: 0%| | 4379/1000000 [18:00<67:29:26, 4.10it/s, grad_norm=4.73, loss_final=2.66, loss_mean=0.947, loss_mean_cls=2.14, proj_loss=-0.423][2026-03-23 13:54:20] Step: 4379, Training Logs: loss_final: 2.629270, loss_mean: 0.937366, proj_loss: -0.424422, loss_mean_cls: 2.116326, grad_norm: 6.527883 +Steps: 0%| | 4380/1000000 [18:00<67:28:01, 4.10it/s, grad_norm=6.53, loss_final=2.63, loss_mean=0.937, loss_mean_cls=2.12, proj_loss=-0.424][2026-03-23 13:54:20] Step: 4380, Training Logs: loss_final: 2.620931, loss_mean: 0.943587, proj_loss: -0.427583, loss_mean_cls: 2.104927, grad_norm: 23.446970 +Steps: 0%| | 4381/1000000 [18:00<67:27:57, 4.10it/s, grad_norm=23.4, loss_final=2.62, loss_mean=0.944, loss_mean_cls=2.1, proj_loss=-0.428][2026-03-23 13:54:20] Step: 4381, Training Logs: loss_final: 2.673967, loss_mean: 0.938714, proj_loss: -0.428028, loss_mean_cls: 2.163281, grad_norm: 17.214701 +Steps: 0%| | 4382/1000000 [18:01<67:27:59, 4.10it/s, grad_norm=17.2, loss_final=2.67, loss_mean=0.939, loss_mean_cls=2.16, proj_loss=-0.428][2026-03-23 13:54:21] Step: 4382, Training Logs: loss_final: 2.774192, loss_mean: 0.916272, proj_loss: -0.421312, loss_mean_cls: 2.279232, grad_norm: 22.698692 +Steps: 0%| | 4383/1000000 [18:01<67:27:19, 4.10it/s, grad_norm=22.7, loss_final=2.77, loss_mean=0.916, loss_mean_cls=2.28, proj_loss=-0.421][2026-03-23 13:54:21] Step: 4383, Training Logs: loss_final: 2.333378, loss_mean: 0.924708, proj_loss: -0.429614, loss_mean_cls: 1.838284, grad_norm: 10.435004 +Steps: 0%| | 4384/1000000 [18:01<67:26:13, 4.10it/s, grad_norm=10.4, loss_final=2.33, loss_mean=0.925, loss_mean_cls=1.84, proj_loss=-0.43][2026-03-23 13:54:21] Step: 4384, Training Logs: loss_final: 2.757393, loss_mean: 0.913493, proj_loss: -0.423919, loss_mean_cls: 2.267820, grad_norm: 3.467038 +Steps: 0%| | 4385/1000000 [18:01<67:26:22, 4.10it/s, grad_norm=3.47, loss_final=2.76, loss_mean=0.913, loss_mean_cls=2.27, proj_loss=-0.424][2026-03-23 13:54:21] Step: 4385, Training Logs: loss_final: 2.773648, loss_mean: 0.919263, proj_loss: -0.425218, loss_mean_cls: 2.279602, grad_norm: 3.288742 +Steps: 0%| | 4386/1000000 [18:02<67:26:45, 4.10it/s, grad_norm=3.29, loss_final=2.77, loss_mean=0.919, loss_mean_cls=2.28, proj_loss=-0.425][2026-03-23 13:54:22] Step: 4386, Training Logs: loss_final: 3.093491, loss_mean: 0.899533, proj_loss: -0.418268, loss_mean_cls: 2.612226, grad_norm: 18.047741 +Steps: 0%| | 4387/1000000 [18:02<67:26:44, 4.10it/s, grad_norm=18, loss_final=3.09, loss_mean=0.9, loss_mean_cls=2.61, proj_loss=-0.418][2026-03-23 13:54:22] Step: 4387, Training Logs: loss_final: 2.981509, loss_mean: 0.917817, proj_loss: -0.420228, loss_mean_cls: 2.483920, 
grad_norm: 4.713761 +Steps: 0%| | 4388/1000000 [18:02<67:28:13, 4.10it/s, grad_norm=4.71, loss_final=2.98, loss_mean=0.918, loss_mean_cls=2.48, proj_loss=-0.42][2026-03-23 13:54:22] Step: 4388, Training Logs: loss_final: 2.733165, loss_mean: 0.939923, proj_loss: -0.422370, loss_mean_cls: 2.215612, grad_norm: 14.333666 +Steps: 0%| | 4389/1000000 [18:02<67:27:30, 4.10it/s, grad_norm=14.3, loss_final=2.73, loss_mean=0.94, loss_mean_cls=2.22, proj_loss=-0.422][2026-03-23 13:54:22] Step: 4389, Training Logs: loss_final: 3.082772, loss_mean: 0.914722, proj_loss: -0.416061, loss_mean_cls: 2.584111, grad_norm: 2.679861 +Steps: 0%| | 4390/1000000 [18:03<67:28:08, 4.10it/s, grad_norm=2.68, loss_final=3.08, loss_mean=0.915, loss_mean_cls=2.58, proj_loss=-0.416][2026-03-23 13:54:23] Step: 4390, Training Logs: loss_final: 3.224814, loss_mean: 0.890774, proj_loss: -0.416671, loss_mean_cls: 2.750711, grad_norm: 20.987246 +Steps: 0%| | 4391/1000000 [18:03<67:27:39, 4.10it/s, grad_norm=21, loss_final=3.22, loss_mean=0.891, loss_mean_cls=2.75, proj_loss=-0.417][2026-03-23 13:54:23] Step: 4391, Training Logs: loss_final: 2.527416, loss_mean: 0.937055, proj_loss: -0.426741, loss_mean_cls: 2.017103, grad_norm: 2.463927 +Steps: 0%| | 4392/1000000 [18:03<67:26:56, 4.10it/s, grad_norm=2.46, loss_final=2.53, loss_mean=0.937, loss_mean_cls=2.02, proj_loss=-0.427][2026-03-23 13:54:23] Step: 4392, Training Logs: loss_final: 2.658393, loss_mean: 0.913267, proj_loss: -0.424415, loss_mean_cls: 2.169540, grad_norm: 16.264450 +Steps: 0%| | 4393/1000000 [18:03<67:27:15, 4.10it/s, grad_norm=16.3, loss_final=2.66, loss_mean=0.913, loss_mean_cls=2.17, proj_loss=-0.424][2026-03-23 13:54:23] Step: 4393, Training Logs: loss_final: 2.699623, loss_mean: 0.920350, proj_loss: -0.420915, loss_mean_cls: 2.200188, grad_norm: 2.356712 +Steps: 0%| | 4394/1000000 [18:04<67:27:01, 4.10it/s, grad_norm=2.36, loss_final=2.7, loss_mean=0.92, loss_mean_cls=2.2, proj_loss=-0.421][2026-03-23 13:54:24] Step: 4394, Training Logs: loss_final: 2.402805, loss_mean: 0.921889, proj_loss: -0.427333, loss_mean_cls: 1.908248, grad_norm: 9.077902 +Steps: 0%| | 4395/1000000 [18:04<67:25:56, 4.10it/s, grad_norm=9.08, loss_final=2.4, loss_mean=0.922, loss_mean_cls=1.91, proj_loss=-0.427][2026-03-23 13:54:24] Step: 4395, Training Logs: loss_final: 2.244829, loss_mean: 0.937434, proj_loss: -0.429057, loss_mean_cls: 1.736452, grad_norm: 8.480758 +Steps: 0%| | 4396/1000000 [18:04<67:24:49, 4.10it/s, grad_norm=8.48, loss_final=2.24, loss_mean=0.937, loss_mean_cls=1.74, proj_loss=-0.429][2026-03-23 13:54:24] Step: 4396, Training Logs: loss_final: 2.607318, loss_mean: 0.936526, proj_loss: -0.423781, loss_mean_cls: 2.094573, grad_norm: 6.774193 +Steps: 0%| | 4397/1000000 [18:04<67:26:17, 4.10it/s, grad_norm=6.77, loss_final=2.61, loss_mean=0.937, loss_mean_cls=2.09, proj_loss=-0.424][2026-03-23 13:54:24] Step: 4397, Training Logs: loss_final: 2.724805, loss_mean: 0.921101, proj_loss: -0.418199, loss_mean_cls: 2.221903, grad_norm: 2.637213 +Steps: 0%| | 4398/1000000 [18:05<67:24:41, 4.10it/s, grad_norm=2.64, loss_final=2.72, loss_mean=0.921, loss_mean_cls=2.22, proj_loss=-0.418][2026-03-23 13:54:25] Step: 4398, Training Logs: loss_final: 2.487561, loss_mean: 0.921111, proj_loss: -0.426145, loss_mean_cls: 1.992595, grad_norm: 7.737377 +Steps: 0%| | 4399/1000000 [18:05<67:32:27, 4.09it/s, grad_norm=7.74, loss_final=2.49, loss_mean=0.921, loss_mean_cls=1.99, proj_loss=-0.426][2026-03-23 13:54:25] Step: 4399, Training Logs: loss_final: 2.980454, loss_mean: 0.922472, 
proj_loss: -0.410705, loss_mean_cls: 2.468687, grad_norm: 2.574944 +Steps: 0%| | 4400/1000000 [18:05<67:31:18, 4.10it/s, grad_norm=2.57, loss_final=2.98, loss_mean=0.922, loss_mean_cls=2.47, proj_loss=-0.411][2026-03-23 13:54:25] Step: 4400, Training Logs: loss_final: 2.499032, loss_mean: 0.939514, proj_loss: -0.422414, loss_mean_cls: 1.981933, grad_norm: 5.339013 +Steps: 0%| | 4401/1000000 [18:05<67:30:18, 4.10it/s, grad_norm=5.34, loss_final=2.5, loss_mean=0.94, loss_mean_cls=1.98, proj_loss=-0.422][2026-03-23 13:54:25] Step: 4401, Training Logs: loss_final: 2.574208, loss_mean: 0.906395, proj_loss: -0.429028, loss_mean_cls: 2.096841, grad_norm: 25.805853 +Steps: 0%| | 4402/1000000 [18:06<67:29:20, 4.10it/s, grad_norm=25.8, loss_final=2.57, loss_mean=0.906, loss_mean_cls=2.1, proj_loss=-0.429][2026-03-23 13:54:26] Step: 4402, Training Logs: loss_final: 2.228759, loss_mean: 0.942289, proj_loss: -0.426686, loss_mean_cls: 1.713156, grad_norm: 6.475710 +Steps: 0%| | 4403/1000000 [18:06<67:27:41, 4.10it/s, grad_norm=6.48, loss_final=2.23, loss_mean=0.942, loss_mean_cls=1.71, proj_loss=-0.427][2026-03-23 13:54:26] Step: 4403, Training Logs: loss_final: 2.526513, loss_mean: 0.944500, proj_loss: -0.420701, loss_mean_cls: 2.002714, grad_norm: 12.986832 +Steps: 0%| | 4404/1000000 [18:06<67:26:11, 4.10it/s, grad_norm=13, loss_final=2.53, loss_mean=0.944, loss_mean_cls=2, proj_loss=-0.421][2026-03-23 13:54:26] Step: 4404, Training Logs: loss_final: 2.555600, loss_mean: 0.926502, proj_loss: -0.422782, loss_mean_cls: 2.051880, grad_norm: 13.691259 +Steps: 0%| | 4405/1000000 [18:06<67:25:57, 4.10it/s, grad_norm=13.7, loss_final=2.56, loss_mean=0.927, loss_mean_cls=2.05, proj_loss=-0.423][2026-03-23 13:54:26] Step: 4405, Training Logs: loss_final: 2.904646, loss_mean: 0.921305, proj_loss: -0.418932, loss_mean_cls: 2.402273, grad_norm: 40.480122 +Steps: 0%| | 4406/1000000 [18:07<67:24:43, 4.10it/s, grad_norm=40.5, loss_final=2.9, loss_mean=0.921, loss_mean_cls=2.4, proj_loss=-0.419][2026-03-23 13:54:26] Step: 4406, Training Logs: loss_final: 3.251748, loss_mean: 0.905880, proj_loss: -0.406894, loss_mean_cls: 2.752762, grad_norm: 11.699661 +Steps: 0%| | 4407/1000000 [18:07<67:25:05, 4.10it/s, grad_norm=11.7, loss_final=3.25, loss_mean=0.906, loss_mean_cls=2.75, proj_loss=-0.407][2026-03-23 13:54:27] Step: 4407, Training Logs: loss_final: 2.842746, loss_mean: 0.913870, proj_loss: -0.411634, loss_mean_cls: 2.340510, grad_norm: 27.914909 +Steps: 0%| | 4408/1000000 [18:07<67:24:32, 4.10it/s, grad_norm=27.9, loss_final=2.84, loss_mean=0.914, loss_mean_cls=2.34, proj_loss=-0.412][2026-03-23 13:54:27] Step: 4408, Training Logs: loss_final: 2.896774, loss_mean: 0.894822, proj_loss: -0.416781, loss_mean_cls: 2.418732, grad_norm: 25.468374 +Steps: 0%| | 4409/1000000 [18:07<67:25:54, 4.10it/s, grad_norm=25.5, loss_final=2.9, loss_mean=0.895, loss_mean_cls=2.42, proj_loss=-0.417][2026-03-23 13:54:27] Step: 4409, Training Logs: loss_final: 2.842100, loss_mean: 0.928432, proj_loss: -0.418272, loss_mean_cls: 2.331940, grad_norm: 18.677137 +Steps: 0%| | 4410/1000000 [18:08<67:25:00, 4.10it/s, grad_norm=18.7, loss_final=2.84, loss_mean=0.928, loss_mean_cls=2.33, proj_loss=-0.418][2026-03-23 13:54:27] Step: 4410, Training Logs: loss_final: 2.808683, loss_mean: 0.924175, proj_loss: -0.423154, loss_mean_cls: 2.307661, grad_norm: 24.169468 +Steps: 0%| | 4411/1000000 [18:08<67:26:03, 4.10it/s, grad_norm=24.2, loss_final=2.81, loss_mean=0.924, loss_mean_cls=2.31, proj_loss=-0.423][2026-03-23 13:54:28] Step: 4411, Training Logs: 
loss_final: 2.850628, loss_mean: 0.894124, proj_loss: -0.419151, loss_mean_cls: 2.375656, grad_norm: 28.361437 +Steps: 0%| | 4412/1000000 [18:08<67:29:36, 4.10it/s, grad_norm=28.4, loss_final=2.85, loss_mean=0.894, loss_mean_cls=2.38, proj_loss=-0.419][2026-03-23 13:54:28] Step: 4412, Training Logs: loss_final: 2.807536, loss_mean: 0.922884, proj_loss: -0.422819, loss_mean_cls: 2.307470, grad_norm: 21.033211 +Steps: 0%| | 4413/1000000 [18:08<67:30:18, 4.10it/s, grad_norm=21, loss_final=2.81, loss_mean=0.923, loss_mean_cls=2.31, proj_loss=-0.423][2026-03-23 13:54:28] Step: 4413, Training Logs: loss_final: 2.639600, loss_mean: 0.923589, proj_loss: -0.424099, loss_mean_cls: 2.140110, grad_norm: 23.323532 +Steps: 0%| | 4414/1000000 [18:08<67:28:34, 4.10it/s, grad_norm=23.3, loss_final=2.64, loss_mean=0.924, loss_mean_cls=2.14, proj_loss=-0.424][2026-03-23 13:54:28] Step: 4414, Training Logs: loss_final: 2.197328, loss_mean: 0.932783, proj_loss: -0.427831, loss_mean_cls: 1.692376, grad_norm: 6.279866 +Steps: 0%| | 4415/1000000 [18:09<67:27:52, 4.10it/s, grad_norm=6.28, loss_final=2.2, loss_mean=0.933, loss_mean_cls=1.69, proj_loss=-0.428][2026-03-23 13:54:29] Step: 4415, Training Logs: loss_final: 2.887943, loss_mean: 0.915255, proj_loss: -0.421476, loss_mean_cls: 2.394165, grad_norm: 10.806762 +Steps: 0%| | 4416/1000000 [18:09<67:26:45, 4.10it/s, grad_norm=10.8, loss_final=2.89, loss_mean=0.915, loss_mean_cls=2.39, proj_loss=-0.421][2026-03-23 13:54:29] Step: 4416, Training Logs: loss_final: 2.820047, loss_mean: 0.911199, proj_loss: -0.424552, loss_mean_cls: 2.333400, grad_norm: 17.256643 +Steps: 0%| | 4417/1000000 [18:09<67:27:06, 4.10it/s, grad_norm=17.3, loss_final=2.82, loss_mean=0.911, loss_mean_cls=2.33, proj_loss=-0.425][2026-03-23 13:54:29] Step: 4417, Training Logs: loss_final: 2.521587, loss_mean: 0.937453, proj_loss: -0.427642, loss_mean_cls: 2.011777, grad_norm: 22.972956 +Steps: 0%| | 4418/1000000 [18:09<67:25:57, 4.10it/s, grad_norm=23, loss_final=2.52, loss_mean=0.937, loss_mean_cls=2.01, proj_loss=-0.428][2026-03-23 13:54:29] Step: 4418, Training Logs: loss_final: 2.471411, loss_mean: 0.928039, proj_loss: -0.428544, loss_mean_cls: 1.971915, grad_norm: 3.936677 +Steps: 0%| | 4419/1000000 [18:10<67:24:33, 4.10it/s, grad_norm=3.94, loss_final=2.47, loss_mean=0.928, loss_mean_cls=1.97, proj_loss=-0.429][2026-03-23 13:54:30] Step: 4419, Training Logs: loss_final: 2.713113, loss_mean: 0.912443, proj_loss: -0.424636, loss_mean_cls: 2.225307, grad_norm: 1.871353 +Steps: 0%| | 4420/1000000 [18:10<67:25:19, 4.10it/s, grad_norm=1.87, loss_final=2.71, loss_mean=0.912, loss_mean_cls=2.23, proj_loss=-0.425][2026-03-23 13:54:30] Step: 4420, Training Logs: loss_final: 2.746572, loss_mean: 0.932701, proj_loss: -0.425440, loss_mean_cls: 2.239311, grad_norm: 9.326387 +Steps: 0%| | 4421/1000000 [18:10<67:25:50, 4.10it/s, grad_norm=9.33, loss_final=2.75, loss_mean=0.933, loss_mean_cls=2.24, proj_loss=-0.425][2026-03-23 13:54:30] Step: 4421, Training Logs: loss_final: 2.876036, loss_mean: 0.922858, proj_loss: -0.416534, loss_mean_cls: 2.369712, grad_norm: 8.173133 +Steps: 0%| | 4422/1000000 [18:10<67:24:50, 4.10it/s, grad_norm=8.17, loss_final=2.88, loss_mean=0.923, loss_mean_cls=2.37, proj_loss=-0.417][2026-03-23 13:54:30] Step: 4422, Training Logs: loss_final: 2.662685, loss_mean: 0.915385, proj_loss: -0.424624, loss_mean_cls: 2.171924, grad_norm: 9.983831 +Steps: 0%| | 4423/1000000 [18:11<67:24:19, 4.10it/s, grad_norm=9.98, loss_final=2.66, loss_mean=0.915, loss_mean_cls=2.17, 
proj_loss=-0.425][2026-03-23 13:54:31] Step: 4423, Training Logs: loss_final: 2.394364, loss_mean: 0.907239, proj_loss: -0.425325, loss_mean_cls: 1.912449, grad_norm: 12.902708 +Steps: 0%| | 4424/1000000 [18:11<67:24:48, 4.10it/s, grad_norm=12.9, loss_final=2.39, loss_mean=0.907, loss_mean_cls=1.91, proj_loss=-0.425][2026-03-23 13:54:31] Step: 4424, Training Logs: loss_final: 2.733374, loss_mean: 0.917424, proj_loss: -0.418121, loss_mean_cls: 2.234071, grad_norm: 7.080873 +Steps: 0%| | 4425/1000000 [18:11<67:25:45, 4.10it/s, grad_norm=7.08, loss_final=2.73, loss_mean=0.917, loss_mean_cls=2.23, proj_loss=-0.418][2026-03-23 13:54:31] Step: 4425, Training Logs: loss_final: 3.072667, loss_mean: 0.921765, proj_loss: -0.415507, loss_mean_cls: 2.566409, grad_norm: 28.902811 +Steps: 0%| | 4426/1000000 [18:11<67:24:44, 4.10it/s, grad_norm=28.9, loss_final=3.07, loss_mean=0.922, loss_mean_cls=2.57, proj_loss=-0.416][2026-03-23 13:54:31] Step: 4426, Training Logs: loss_final: 2.385979, loss_mean: 0.903557, proj_loss: -0.437030, loss_mean_cls: 1.919451, grad_norm: 38.220493 +Steps: 0%| | 4427/1000000 [18:12<67:25:12, 4.10it/s, grad_norm=38.2, loss_final=2.39, loss_mean=0.904, loss_mean_cls=1.92, proj_loss=-0.437][2026-03-23 13:54:32] Step: 4427, Training Logs: loss_final: 2.783466, loss_mean: 0.910219, proj_loss: -0.424190, loss_mean_cls: 2.297437, grad_norm: 2.032533 +Steps: 0%| | 4428/1000000 [18:12<67:25:52, 4.10it/s, grad_norm=2.03, loss_final=2.78, loss_mean=0.91, loss_mean_cls=2.3, proj_loss=-0.424][2026-03-23 13:54:32] Step: 4428, Training Logs: loss_final: 2.676522, loss_mean: 0.924383, proj_loss: -0.424699, loss_mean_cls: 2.176838, grad_norm: 12.946785 +Steps: 0%| | 4429/1000000 [18:12<67:26:59, 4.10it/s, grad_norm=12.9, loss_final=2.68, loss_mean=0.924, loss_mean_cls=2.18, proj_loss=-0.425][2026-03-23 13:54:32] Step: 4429, Training Logs: loss_final: 2.450819, loss_mean: 0.954556, proj_loss: -0.421255, loss_mean_cls: 1.917517, grad_norm: 24.056728 +Steps: 0%| | 4430/1000000 [18:12<67:27:38, 4.10it/s, grad_norm=24.1, loss_final=2.45, loss_mean=0.955, loss_mean_cls=1.92, proj_loss=-0.421][2026-03-23 13:54:32] Step: 4430, Training Logs: loss_final: 2.666165, loss_mean: 0.942777, proj_loss: -0.413551, loss_mean_cls: 2.136939, grad_norm: 10.693551 +Steps: 0%| | 4431/1000000 [18:13<67:26:44, 4.10it/s, grad_norm=10.7, loss_final=2.67, loss_mean=0.943, loss_mean_cls=2.14, proj_loss=-0.414][2026-03-23 13:54:33] Step: 4431, Training Logs: loss_final: 2.272504, loss_mean: 0.976584, proj_loss: -0.424987, loss_mean_cls: 1.720907, grad_norm: 15.755383 +Steps: 0%| | 4432/1000000 [18:13<67:26:35, 4.10it/s, grad_norm=15.8, loss_final=2.27, loss_mean=0.977, loss_mean_cls=1.72, proj_loss=-0.425][2026-03-23 13:54:33] Step: 4432, Training Logs: loss_final: 2.647669, loss_mean: 0.945768, proj_loss: -0.421967, loss_mean_cls: 2.123868, grad_norm: 13.906477 +Steps: 0%| | 4433/1000000 [18:13<67:27:29, 4.10it/s, grad_norm=13.9, loss_final=2.65, loss_mean=0.946, loss_mean_cls=2.12, proj_loss=-0.422][2026-03-23 13:54:33] Step: 4433, Training Logs: loss_final: 2.512916, loss_mean: 0.939878, proj_loss: -0.430264, loss_mean_cls: 2.003302, grad_norm: 20.414171 +Steps: 0%| | 4434/1000000 [18:13<67:26:41, 4.10it/s, grad_norm=20.4, loss_final=2.51, loss_mean=0.94, loss_mean_cls=2, proj_loss=-0.43][2026-03-23 13:54:33] Step: 4434, Training Logs: loss_final: 2.369553, loss_mean: 0.953180, proj_loss: -0.422376, loss_mean_cls: 1.838750, grad_norm: 6.889218 +Steps: 0%| | 4435/1000000 [18:14<67:25:09, 4.10it/s, grad_norm=6.89, 
loss_final=2.37, loss_mean=0.953, loss_mean_cls=1.84, proj_loss=-0.422][2026-03-23 13:54:34] Step: 4435, Training Logs: loss_final: 2.860763, loss_mean: 0.917574, proj_loss: -0.419676, loss_mean_cls: 2.362864, grad_norm: 7.268692 +Steps: 0%| | 4436/1000000 [18:14<67:24:42, 4.10it/s, grad_norm=7.27, loss_final=2.86, loss_mean=0.918, loss_mean_cls=2.36, proj_loss=-0.42][2026-03-23 13:54:34] Step: 4436, Training Logs: loss_final: 2.710155, loss_mean: 0.902570, proj_loss: -0.427572, loss_mean_cls: 2.235156, grad_norm: 22.067574 +Steps: 0%| | 4437/1000000 [18:14<67:26:06, 4.10it/s, grad_norm=22.1, loss_final=2.71, loss_mean=0.903, loss_mean_cls=2.24, proj_loss=-0.428][2026-03-23 13:54:34] Step: 4437, Training Logs: loss_final: 2.557338, loss_mean: 0.915063, proj_loss: -0.422577, loss_mean_cls: 2.064852, grad_norm: 16.774895 +Steps: 0%| | 4438/1000000 [18:14<67:26:26, 4.10it/s, grad_norm=16.8, loss_final=2.56, loss_mean=0.915, loss_mean_cls=2.06, proj_loss=-0.423][2026-03-23 13:54:34] Step: 4438, Training Logs: loss_final: 3.195927, loss_mean: 0.902733, proj_loss: -0.409647, loss_mean_cls: 2.702841, grad_norm: 7.056559 +Steps: 0%| | 4439/1000000 [18:15<67:59:53, 4.07it/s, grad_norm=7.06, loss_final=3.2, loss_mean=0.903, loss_mean_cls=2.7, proj_loss=-0.41][2026-03-23 13:54:35] Step: 4439, Training Logs: loss_final: 2.642396, loss_mean: 0.939771, proj_loss: -0.426180, loss_mean_cls: 2.128805, grad_norm: 14.749456 +Steps: 0%| | 4440/1000000 [18:15<67:50:50, 4.08it/s, grad_norm=14.7, loss_final=2.64, loss_mean=0.94, loss_mean_cls=2.13, proj_loss=-0.426][2026-03-23 13:54:35] Step: 4440, Training Logs: loss_final: 2.694830, loss_mean: 0.898655, proj_loss: -0.425994, loss_mean_cls: 2.222169, grad_norm: 14.107543 +Steps: 0%| | 4441/1000000 [18:15<67:43:05, 4.08it/s, grad_norm=14.1, loss_final=2.69, loss_mean=0.899, loss_mean_cls=2.22, proj_loss=-0.426][2026-03-23 13:54:35] Step: 4441, Training Logs: loss_final: 2.312423, loss_mean: 0.911159, proj_loss: -0.430237, loss_mean_cls: 1.831501, grad_norm: 31.650753 +Steps: 0%| | 4442/1000000 [18:15<67:38:22, 4.09it/s, grad_norm=31.7, loss_final=2.31, loss_mean=0.911, loss_mean_cls=1.83, proj_loss=-0.43][2026-03-23 13:54:35] Step: 4442, Training Logs: loss_final: 2.877743, loss_mean: 0.917379, proj_loss: -0.427446, loss_mean_cls: 2.387810, grad_norm: 5.192500 +Steps: 0%| | 4443/1000000 [18:16<67:34:29, 4.09it/s, grad_norm=5.19, loss_final=2.88, loss_mean=0.917, loss_mean_cls=2.39, proj_loss=-0.427][2026-03-23 13:54:36] Step: 4443, Training Logs: loss_final: 2.362670, loss_mean: 0.912161, proj_loss: -0.423476, loss_mean_cls: 1.873984, grad_norm: 3.415790 +Steps: 0%| | 4444/1000000 [18:16<67:33:31, 4.09it/s, grad_norm=3.42, loss_final=2.36, loss_mean=0.912, loss_mean_cls=1.87, proj_loss=-0.423][2026-03-23 13:54:36] Step: 4444, Training Logs: loss_final: 2.950425, loss_mean: 0.900669, proj_loss: -0.422515, loss_mean_cls: 2.472271, grad_norm: 29.538227 +Steps: 0%| | 4445/1000000 [18:16<67:31:28, 4.10it/s, grad_norm=29.5, loss_final=2.95, loss_mean=0.901, loss_mean_cls=2.47, proj_loss=-0.423][2026-03-23 13:54:36] Step: 4445, Training Logs: loss_final: 2.715272, loss_mean: 0.921783, proj_loss: -0.421758, loss_mean_cls: 2.215246, grad_norm: 10.028102 +Steps: 0%| | 4446/1000000 [18:16<67:31:26, 4.10it/s, grad_norm=10, loss_final=2.72, loss_mean=0.922, loss_mean_cls=2.22, proj_loss=-0.422][2026-03-23 13:54:36] Step: 4446, Training Logs: loss_final: 3.373279, loss_mean: 0.885739, proj_loss: -0.421710, loss_mean_cls: 2.909251, grad_norm: 31.497339 +Steps: 0%| | 
4447/1000000 [18:17<67:30:26, 4.10it/s, grad_norm=31.5, loss_final=3.37, loss_mean=0.886, loss_mean_cls=2.91, proj_loss=-0.422][2026-03-23 13:54:36] Step: 4447, Training Logs: loss_final: 2.430104, loss_mean: 0.923569, proj_loss: -0.425241, loss_mean_cls: 1.931776, grad_norm: 3.597104 +Steps: 0%| | 4448/1000000 [18:17<67:29:58, 4.10it/s, grad_norm=3.6, loss_final=2.43, loss_mean=0.924, loss_mean_cls=1.93, proj_loss=-0.425][2026-03-23 13:54:37] Step: 4448, Training Logs: loss_final: 2.574909, loss_mean: 0.915692, proj_loss: -0.426471, loss_mean_cls: 2.085688, grad_norm: 29.435383 +Steps: 0%| | 4449/1000000 [18:17<67:27:29, 4.10it/s, grad_norm=29.4, loss_final=2.57, loss_mean=0.916, loss_mean_cls=2.09, proj_loss=-0.426][2026-03-23 13:54:37] Step: 4449, Training Logs: loss_final: 2.596076, loss_mean: 0.945140, proj_loss: -0.431195, loss_mean_cls: 2.082130, grad_norm: 17.850651 +Steps: 0%| | 4450/1000000 [18:17<67:26:59, 4.10it/s, grad_norm=17.9, loss_final=2.6, loss_mean=0.945, loss_mean_cls=2.08, proj_loss=-0.431][2026-03-23 13:54:37] Step: 4450, Training Logs: loss_final: 2.945484, loss_mean: 0.905158, proj_loss: -0.424114, loss_mean_cls: 2.464441, grad_norm: 30.294668 +Steps: 0%| | 4451/1000000 [18:18<67:26:54, 4.10it/s, grad_norm=30.3, loss_final=2.95, loss_mean=0.905, loss_mean_cls=2.46, proj_loss=-0.424][2026-03-23 13:54:37] Step: 4451, Training Logs: loss_final: 2.615098, loss_mean: 0.938395, proj_loss: -0.428003, loss_mean_cls: 2.104707, grad_norm: 15.737089 +Steps: 0%| | 4452/1000000 [18:18<67:26:11, 4.10it/s, grad_norm=15.7, loss_final=2.62, loss_mean=0.938, loss_mean_cls=2.1, proj_loss=-0.428][2026-03-23 13:54:38] Step: 4452, Training Logs: loss_final: 3.137668, loss_mean: 0.900497, proj_loss: -0.409923, loss_mean_cls: 2.647094, grad_norm: 15.407358 +Steps: 0%| | 4453/1000000 [18:18<67:26:41, 4.10it/s, grad_norm=15.4, loss_final=3.14, loss_mean=0.9, loss_mean_cls=2.65, proj_loss=-0.41][2026-03-23 13:54:38] Step: 4453, Training Logs: loss_final: 2.241387, loss_mean: 0.945677, proj_loss: -0.431533, loss_mean_cls: 1.727243, grad_norm: 7.591509 +Steps: 0%| | 4454/1000000 [18:18<67:27:10, 4.10it/s, grad_norm=7.59, loss_final=2.24, loss_mean=0.946, loss_mean_cls=1.73, proj_loss=-0.432][2026-03-23 13:54:38] Step: 4454, Training Logs: loss_final: 2.480471, loss_mean: 0.919358, proj_loss: -0.429650, loss_mean_cls: 1.990763, grad_norm: 31.577562 +Steps: 0%| | 4455/1000000 [18:18<67:26:06, 4.10it/s, grad_norm=31.6, loss_final=2.48, loss_mean=0.919, loss_mean_cls=1.99, proj_loss=-0.43][2026-03-23 13:54:38] Step: 4455, Training Logs: loss_final: 2.555628, loss_mean: 0.926425, proj_loss: -0.429260, loss_mean_cls: 2.058463, grad_norm: 8.957483 +Steps: 0%| | 4456/1000000 [18:19<67:25:11, 4.10it/s, grad_norm=8.96, loss_final=2.56, loss_mean=0.926, loss_mean_cls=2.06, proj_loss=-0.429][2026-03-23 13:54:39] Step: 4456, Training Logs: loss_final: 2.830612, loss_mean: 0.928484, proj_loss: -0.426945, loss_mean_cls: 2.329073, grad_norm: 36.663864 +Steps: 0%| | 4457/1000000 [18:19<67:24:50, 4.10it/s, grad_norm=36.7, loss_final=2.83, loss_mean=0.928, loss_mean_cls=2.33, proj_loss=-0.427][2026-03-23 13:54:39] Step: 4457, Training Logs: loss_final: 2.425881, loss_mean: 0.928078, proj_loss: -0.425998, loss_mean_cls: 1.923802, grad_norm: 2.037036 +Steps: 0%| | 4458/1000000 [18:19<67:24:23, 4.10it/s, grad_norm=2.04, loss_final=2.43, loss_mean=0.928, loss_mean_cls=1.92, proj_loss=-0.426][2026-03-23 13:54:39] Step: 4458, Training Logs: loss_final: 2.876588, loss_mean: 0.915375, proj_loss: -0.425459, 
loss_mean_cls: 2.386671, grad_norm: 12.576836 +Steps: 0%| | 4459/1000000 [18:19<67:25:19, 4.10it/s, grad_norm=12.6, loss_final=2.88, loss_mean=0.915, loss_mean_cls=2.39, proj_loss=-0.425][2026-03-23 13:54:39] Step: 4459, Training Logs: loss_final: 2.800461, loss_mean: 0.923515, proj_loss: -0.424359, loss_mean_cls: 2.301306, grad_norm: 43.497616 +Steps: 0%| | 4460/1000000 [18:20<67:24:53, 4.10it/s, grad_norm=43.5, loss_final=2.8, loss_mean=0.924, loss_mean_cls=2.3, proj_loss=-0.424][2026-03-23 13:54:40] Step: 4460, Training Logs: loss_final: 2.644194, loss_mean: 0.943284, proj_loss: -0.423189, loss_mean_cls: 2.124099, grad_norm: 13.797549 +Steps: 0%| | 4461/1000000 [18:20<67:24:50, 4.10it/s, grad_norm=13.8, loss_final=2.64, loss_mean=0.943, loss_mean_cls=2.12, proj_loss=-0.423][2026-03-23 13:54:40] Step: 4461, Training Logs: loss_final: 2.897314, loss_mean: 0.933627, proj_loss: -0.416790, loss_mean_cls: 2.380477, grad_norm: 9.712308 +Steps: 0%| | 4462/1000000 [18:20<67:24:58, 4.10it/s, grad_norm=9.71, loss_final=2.9, loss_mean=0.934, loss_mean_cls=2.38, proj_loss=-0.417][2026-03-23 13:54:40] Step: 4462, Training Logs: loss_final: 2.700448, loss_mean: 0.931137, proj_loss: -0.418600, loss_mean_cls: 2.187911, grad_norm: 21.029554 +Steps: 0%| | 4463/1000000 [18:20<67:25:17, 4.10it/s, grad_norm=21, loss_final=2.7, loss_mean=0.931, loss_mean_cls=2.19, proj_loss=-0.419][2026-03-23 13:54:40] Step: 4463, Training Logs: loss_final: 2.801696, loss_mean: 0.930009, proj_loss: -0.420914, loss_mean_cls: 2.292601, grad_norm: 12.728193 +Steps: 0%| | 4464/1000000 [18:21<67:25:36, 4.10it/s, grad_norm=12.7, loss_final=2.8, loss_mean=0.93, loss_mean_cls=2.29, proj_loss=-0.421][2026-03-23 13:54:41] Step: 4464, Training Logs: loss_final: 2.324687, loss_mean: 0.947041, proj_loss: -0.430710, loss_mean_cls: 1.808357, grad_norm: 12.022120 +Steps: 0%| | 4465/1000000 [18:21<67:26:08, 4.10it/s, grad_norm=12, loss_final=2.32, loss_mean=0.947, loss_mean_cls=1.81, proj_loss=-0.431][2026-03-23 13:54:41] Step: 4465, Training Logs: loss_final: 2.348838, loss_mean: 0.937073, proj_loss: -0.429243, loss_mean_cls: 1.841008, grad_norm: 10.032985 +Steps: 0%| | 4466/1000000 [18:21<67:24:46, 4.10it/s, grad_norm=10, loss_final=2.35, loss_mean=0.937, loss_mean_cls=1.84, proj_loss=-0.429][2026-03-23 13:54:41] Step: 4466, Training Logs: loss_final: 3.179146, loss_mean: 0.912682, proj_loss: -0.415869, loss_mean_cls: 2.682333, grad_norm: 15.352624 +Steps: 0%| | 4467/1000000 [18:21<67:24:30, 4.10it/s, grad_norm=15.4, loss_final=3.18, loss_mean=0.913, loss_mean_cls=2.68, proj_loss=-0.416][2026-03-23 13:54:41] Step: 4467, Training Logs: loss_final: 2.473934, loss_mean: 0.948772, proj_loss: -0.426403, loss_mean_cls: 1.951565, grad_norm: 15.419750 +Steps: 0%| | 4468/1000000 [18:22<67:24:47, 4.10it/s, grad_norm=15.4, loss_final=2.47, loss_mean=0.949, loss_mean_cls=1.95, proj_loss=-0.426][2026-03-23 13:54:42] Step: 4468, Training Logs: loss_final: 2.856580, loss_mean: 0.935513, proj_loss: -0.420569, loss_mean_cls: 2.341635, grad_norm: 23.506872 +Steps: 0%| | 4469/1000000 [18:22<67:33:08, 4.09it/s, grad_norm=23.5, loss_final=2.86, loss_mean=0.936, loss_mean_cls=2.34, proj_loss=-0.421][2026-03-23 13:54:42] Step: 4469, Training Logs: loss_final: 3.162249, loss_mean: 0.922284, proj_loss: -0.418489, loss_mean_cls: 2.658453, grad_norm: 32.141632 +Steps: 0%| | 4470/1000000 [18:22<67:30:46, 4.10it/s, grad_norm=32.1, loss_final=3.16, loss_mean=0.922, loss_mean_cls=2.66, proj_loss=-0.418][2026-03-23 13:54:42] Step: 4470, Training Logs: loss_final: 
2.582402, loss_mean: 0.936681, proj_loss: -0.430580, loss_mean_cls: 2.076301, grad_norm: 21.480244 +Steps: 0%| | 4471/1000000 [18:22<67:29:28, 4.10it/s, grad_norm=21.5, loss_final=2.58, loss_mean=0.937, loss_mean_cls=2.08, proj_loss=-0.431][2026-03-23 13:54:42] Step: 4471, Training Logs: loss_final: 2.498989, loss_mean: 0.924106, proj_loss: -0.422696, loss_mean_cls: 1.997579, grad_norm: 14.356331 +Steps: 0%| | 4472/1000000 [18:23<67:29:03, 4.10it/s, grad_norm=14.4, loss_final=2.5, loss_mean=0.924, loss_mean_cls=2, proj_loss=-0.423][2026-03-23 13:54:43] Step: 4472, Training Logs: loss_final: 2.588139, loss_mean: 0.918615, proj_loss: -0.420921, loss_mean_cls: 2.090445, grad_norm: 7.922727 +Steps: 0%| | 4473/1000000 [18:23<67:28:10, 4.10it/s, grad_norm=7.92, loss_final=2.59, loss_mean=0.919, loss_mean_cls=2.09, proj_loss=-0.421][2026-03-23 13:54:43] Step: 4473, Training Logs: loss_final: 2.877162, loss_mean: 0.923748, proj_loss: -0.422888, loss_mean_cls: 2.376302, grad_norm: 16.566607 +Steps: 0%| | 4474/1000000 [18:23<67:27:51, 4.10it/s, grad_norm=16.6, loss_final=2.88, loss_mean=0.924, loss_mean_cls=2.38, proj_loss=-0.423][2026-03-23 13:54:43] Step: 4474, Training Logs: loss_final: 2.852286, loss_mean: 0.934329, proj_loss: -0.420758, loss_mean_cls: 2.338715, grad_norm: 18.558809 +Steps: 0%| | 4475/1000000 [18:23<67:27:01, 4.10it/s, grad_norm=18.6, loss_final=2.85, loss_mean=0.934, loss_mean_cls=2.34, proj_loss=-0.421][2026-03-23 13:54:43] Step: 4475, Training Logs: loss_final: 2.318929, loss_mean: 0.934390, proj_loss: -0.429512, loss_mean_cls: 1.814051, grad_norm: 7.549119 +Steps: 0%| | 4476/1000000 [18:24<67:26:44, 4.10it/s, grad_norm=7.55, loss_final=2.32, loss_mean=0.934, loss_mean_cls=1.81, proj_loss=-0.43][2026-03-23 13:54:44] Step: 4476, Training Logs: loss_final: 2.851755, loss_mean: 0.925106, proj_loss: -0.418182, loss_mean_cls: 2.344831, grad_norm: 2.043188 +Steps: 0%| | 4477/1000000 [18:24<67:25:21, 4.10it/s, grad_norm=2.04, loss_final=2.85, loss_mean=0.925, loss_mean_cls=2.34, proj_loss=-0.418][2026-03-23 13:54:44] Step: 4477, Training Logs: loss_final: 2.779237, loss_mean: 0.924543, proj_loss: -0.423249, loss_mean_cls: 2.277942, grad_norm: 9.986922 +Steps: 0%| | 4478/1000000 [18:24<67:24:48, 4.10it/s, grad_norm=9.99, loss_final=2.78, loss_mean=0.925, loss_mean_cls=2.28, proj_loss=-0.423][2026-03-23 13:54:44] Step: 4478, Training Logs: loss_final: 2.729779, loss_mean: 0.927158, proj_loss: -0.420216, loss_mean_cls: 2.222837, grad_norm: 12.975925 +Steps: 0%| | 4479/1000000 [18:24<67:25:24, 4.10it/s, grad_norm=13, loss_final=2.73, loss_mean=0.927, loss_mean_cls=2.22, proj_loss=-0.42][2026-03-23 13:54:44] Step: 4479, Training Logs: loss_final: 2.677776, loss_mean: 0.927145, proj_loss: -0.420160, loss_mean_cls: 2.170791, grad_norm: 18.711325 +Steps: 0%| | 4480/1000000 [18:25<67:24:32, 4.10it/s, grad_norm=18.7, loss_final=2.68, loss_mean=0.927, loss_mean_cls=2.17, proj_loss=-0.42][2026-03-23 13:54:45] Step: 4480, Training Logs: loss_final: 2.728987, loss_mean: 0.932702, proj_loss: -0.422234, loss_mean_cls: 2.218520, grad_norm: 17.633961 +Steps: 0%| | 4481/1000000 [18:25<67:26:32, 4.10it/s, grad_norm=17.6, loss_final=2.73, loss_mean=0.933, loss_mean_cls=2.22, proj_loss=-0.422][2026-03-23 13:54:45] Step: 4481, Training Logs: loss_final: 3.098475, loss_mean: 0.919734, proj_loss: -0.407982, loss_mean_cls: 2.586723, grad_norm: 6.265988 +Steps: 0%| | 4482/1000000 [18:25<67:25:54, 4.10it/s, grad_norm=6.27, loss_final=3.1, loss_mean=0.92, loss_mean_cls=2.59, proj_loss=-0.408][2026-03-23 
13:54:45] Step: 4482, Training Logs: loss_final: 2.988461, loss_mean: 0.897632, proj_loss: -0.424010, loss_mean_cls: 2.514839, grad_norm: 11.929556 +Steps: 0%| | 4483/1000000 [18:25<67:25:16, 4.10it/s, grad_norm=11.9, loss_final=2.99, loss_mean=0.898, loss_mean_cls=2.51, proj_loss=-0.424][2026-03-23 13:54:45] Step: 4483, Training Logs: loss_final: 2.687279, loss_mean: 0.931820, proj_loss: -0.422382, loss_mean_cls: 2.177841, grad_norm: 7.111683 +Steps: 0%| | 4484/1000000 [18:26<67:26:24, 4.10it/s, grad_norm=7.11, loss_final=2.69, loss_mean=0.932, loss_mean_cls=2.18, proj_loss=-0.422][2026-03-23 13:54:46] Step: 4484, Training Logs: loss_final: 2.493701, loss_mean: 0.935487, proj_loss: -0.425183, loss_mean_cls: 1.983397, grad_norm: 2.116510 +Steps: 0%| | 4485/1000000 [18:26<67:26:21, 4.10it/s, grad_norm=2.12, loss_final=2.49, loss_mean=0.935, loss_mean_cls=1.98, proj_loss=-0.425][2026-03-23 13:54:46] Step: 4485, Training Logs: loss_final: 2.696394, loss_mean: 0.916416, proj_loss: -0.431154, loss_mean_cls: 2.211132, grad_norm: 11.623766 +Steps: 0%| | 4486/1000000 [18:26<67:27:37, 4.10it/s, grad_norm=11.6, loss_final=2.7, loss_mean=0.916, loss_mean_cls=2.21, proj_loss=-0.431][2026-03-23 13:54:46] Step: 4486, Training Logs: loss_final: 2.709124, loss_mean: 0.917602, proj_loss: -0.417246, loss_mean_cls: 2.208769, grad_norm: 9.559751 +Steps: 0%| | 4487/1000000 [18:26<67:28:12, 4.10it/s, grad_norm=9.56, loss_final=2.71, loss_mean=0.918, loss_mean_cls=2.21, proj_loss=-0.417][2026-03-23 13:54:46] Step: 4487, Training Logs: loss_final: 2.775208, loss_mean: 0.922126, proj_loss: -0.417323, loss_mean_cls: 2.270405, grad_norm: 13.870966 +Steps: 0%| | 4488/1000000 [18:27<67:49:01, 4.08it/s, grad_norm=13.9, loss_final=2.78, loss_mean=0.922, loss_mean_cls=2.27, proj_loss=-0.417][2026-03-23 13:54:46] Step: 4488, Training Logs: loss_final: 3.048914, loss_mean: 0.885498, proj_loss: -0.411659, loss_mean_cls: 2.575075, grad_norm: 4.422943 +Steps: 0%| | 4489/1000000 [18:27<67:41:14, 4.09it/s, grad_norm=4.42, loss_final=3.05, loss_mean=0.885, loss_mean_cls=2.58, proj_loss=-0.412][2026-03-23 13:54:47] Step: 4489, Training Logs: loss_final: 2.918586, loss_mean: 0.887285, proj_loss: -0.423040, loss_mean_cls: 2.454340, grad_norm: 19.294056 +Steps: 0%| | 4490/1000000 [18:27<67:36:31, 4.09it/s, grad_norm=19.3, loss_final=2.92, loss_mean=0.887, loss_mean_cls=2.45, proj_loss=-0.423][2026-03-23 13:54:47] Step: 4490, Training Logs: loss_final: 2.987182, loss_mean: 0.939317, proj_loss: -0.421035, loss_mean_cls: 2.468900, grad_norm: 16.177805 +Steps: 0%| | 4491/1000000 [18:27<67:32:29, 4.09it/s, grad_norm=16.2, loss_final=2.99, loss_mean=0.939, loss_mean_cls=2.47, proj_loss=-0.421][2026-03-23 13:54:47] Step: 4491, Training Logs: loss_final: 2.324276, loss_mean: 0.917634, proj_loss: -0.427455, loss_mean_cls: 1.834097, grad_norm: 10.354565 +Steps: 0%| | 4492/1000000 [18:28<67:31:52, 4.09it/s, grad_norm=10.4, loss_final=2.32, loss_mean=0.918, loss_mean_cls=1.83, proj_loss=-0.427][2026-03-23 13:54:47] Step: 4492, Training Logs: loss_final: 2.709190, loss_mean: 0.934987, proj_loss: -0.414210, loss_mean_cls: 2.188413, grad_norm: 6.484708 +Steps: 0%| | 4493/1000000 [18:28<67:31:20, 4.10it/s, grad_norm=6.48, loss_final=2.71, loss_mean=0.935, loss_mean_cls=2.19, proj_loss=-0.414][2026-03-23 13:54:48] Step: 4493, Training Logs: loss_final: 2.661227, loss_mean: 0.929312, proj_loss: -0.419036, loss_mean_cls: 2.150952, grad_norm: 1.467622 +Steps: 0%| | 4494/1000000 [18:28<67:30:56, 4.10it/s, grad_norm=1.47, loss_final=2.66, 
loss_mean=0.929, loss_mean_cls=2.15, proj_loss=-0.419][2026-03-23 13:54:48] Step: 4494, Training Logs: loss_final: 3.005701, loss_mean: 0.903606, proj_loss: -0.417442, loss_mean_cls: 2.519537, grad_norm: 11.717750 +Steps: 0%| | 4495/1000000 [18:28<67:32:07, 4.09it/s, grad_norm=11.7, loss_final=3.01, loss_mean=0.904, loss_mean_cls=2.52, proj_loss=-0.417][2026-03-23 13:54:48] Step: 4495, Training Logs: loss_final: 2.947240, loss_mean: 0.921294, proj_loss: -0.408040, loss_mean_cls: 2.433986, grad_norm: 15.056485 +Steps: 0%| | 4496/1000000 [18:28<67:33:08, 4.09it/s, grad_norm=15.1, loss_final=2.95, loss_mean=0.921, loss_mean_cls=2.43, proj_loss=-0.408][2026-03-23 13:54:48] Step: 4496, Training Logs: loss_final: 2.848022, loss_mean: 0.918119, proj_loss: -0.406213, loss_mean_cls: 2.336115, grad_norm: 27.683430 +Steps: 0%| | 4497/1000000 [18:29<67:33:45, 4.09it/s, grad_norm=27.7, loss_final=2.85, loss_mean=0.918, loss_mean_cls=2.34, proj_loss=-0.406][2026-03-23 13:54:49] Step: 4497, Training Logs: loss_final: 2.950055, loss_mean: 0.912744, proj_loss: -0.400074, loss_mean_cls: 2.437386, grad_norm: 17.370768 +Steps: 0%| | 4498/1000000 [18:29<67:31:48, 4.09it/s, grad_norm=17.4, loss_final=2.95, loss_mean=0.913, loss_mean_cls=2.44, proj_loss=-0.4][2026-03-23 13:54:49] Step: 4498, Training Logs: loss_final: 2.937601, loss_mean: 0.920526, proj_loss: -0.403226, loss_mean_cls: 2.420301, grad_norm: 10.988057 +Steps: 0%| | 4499/1000000 [18:29<67:32:13, 4.09it/s, grad_norm=11, loss_final=2.94, loss_mean=0.921, loss_mean_cls=2.42, proj_loss=-0.403][2026-03-23 13:54:49] Step: 4499, Training Logs: loss_final: 3.071489, loss_mean: 0.940807, proj_loss: -0.408359, loss_mean_cls: 2.539040, grad_norm: 45.638596 +Steps: 0%| | 4500/1000000 [18:29<67:32:21, 4.09it/s, grad_norm=45.6, loss_final=3.07, loss_mean=0.941, loss_mean_cls=2.54, proj_loss=-0.408][2026-03-23 13:54:49] Step: 4500, Training Logs: loss_final: 2.857743, loss_mean: 0.918946, proj_loss: -0.414533, loss_mean_cls: 2.353330, grad_norm: 15.052650 +Steps: 0%| | 4501/1000000 [18:30<67:32:02, 4.09it/s, grad_norm=15.1, loss_final=2.86, loss_mean=0.919, loss_mean_cls=2.35, proj_loss=-0.415][2026-03-23 13:54:50] Step: 4501, Training Logs: loss_final: 3.020165, loss_mean: 0.928067, proj_loss: -0.407778, loss_mean_cls: 2.499876, grad_norm: 4.276296 +Steps: 0%| | 4502/1000000 [18:30<68:52:09, 4.02it/s, grad_norm=4.28, loss_final=3.02, loss_mean=0.928, loss_mean_cls=2.5, proj_loss=-0.408][2026-03-23 13:54:50] Step: 4502, Training Logs: loss_final: 2.712004, loss_mean: 0.921369, proj_loss: -0.420737, loss_mean_cls: 2.211372, grad_norm: 18.868874 +Steps: 0%| | 4503/1000000 [18:30<68:25:41, 4.04it/s, grad_norm=18.9, loss_final=2.71, loss_mean=0.921, loss_mean_cls=2.21, proj_loss=-0.421][2026-03-23 13:54:50] Step: 4503, Training Logs: loss_final: 2.665630, loss_mean: 0.923419, proj_loss: -0.422731, loss_mean_cls: 2.164942, grad_norm: 22.257971 +Steps: 0%| | 4504/1000000 [18:30<68:07:22, 4.06it/s, grad_norm=22.3, loss_final=2.67, loss_mean=0.923, loss_mean_cls=2.16, proj_loss=-0.423][2026-03-23 13:54:50] Step: 4504, Training Logs: loss_final: 2.560226, loss_mean: 0.941021, proj_loss: -0.416627, loss_mean_cls: 2.035833, grad_norm: 25.582115 +Steps: 0%| | 4505/1000000 [18:31<67:56:35, 4.07it/s, grad_norm=25.6, loss_final=2.56, loss_mean=0.941, loss_mean_cls=2.04, proj_loss=-0.417][2026-03-23 13:54:51] Step: 4505, Training Logs: loss_final: 2.292143, loss_mean: 0.912116, proj_loss: -0.420591, loss_mean_cls: 1.800618, grad_norm: 11.481490 +Steps: 0%| | 4506/1000000 
[18:31<67:47:53, 4.08it/s, grad_norm=11.5, loss_final=2.29, loss_mean=0.912, loss_mean_cls=1.8, proj_loss=-0.421][2026-03-23 13:54:51] Step: 4506, Training Logs: loss_final: 2.619569, loss_mean: 0.939152, proj_loss: -0.411970, loss_mean_cls: 2.092387, grad_norm: 18.192387 +Steps: 0%| | 4507/1000000 [18:31<67:42:37, 4.08it/s, grad_norm=18.2, loss_final=2.62, loss_mean=0.939, loss_mean_cls=2.09, proj_loss=-0.412][2026-03-23 13:54:51] Step: 4507, Training Logs: loss_final: 3.334934, loss_mean: 0.889115, proj_loss: -0.408822, loss_mean_cls: 2.854641, grad_norm: 33.440655 +Steps: 0%| | 4508/1000000 [18:31<67:39:33, 4.09it/s, grad_norm=33.4, loss_final=3.33, loss_mean=0.889, loss_mean_cls=2.85, proj_loss=-0.409][2026-03-23 13:54:51] Step: 4508, Training Logs: loss_final: 2.557033, loss_mean: 0.918963, proj_loss: -0.422957, loss_mean_cls: 2.061027, grad_norm: 14.984410 +Steps: 0%| | 4509/1000000 [18:32<67:36:11, 4.09it/s, grad_norm=15, loss_final=2.56, loss_mean=0.919, loss_mean_cls=2.06, proj_loss=-0.423][2026-03-23 13:54:52] Step: 4509, Training Logs: loss_final: 2.963677, loss_mean: 0.903489, proj_loss: -0.412224, loss_mean_cls: 2.472412, grad_norm: 2.642625 +Steps: 0%| | 4510/1000000 [18:32<67:33:07, 4.09it/s, grad_norm=2.64, loss_final=2.96, loss_mean=0.903, loss_mean_cls=2.47, proj_loss=-0.412][2026-03-23 13:54:52] Step: 4510, Training Logs: loss_final: 2.815479, loss_mean: 0.901914, proj_loss: -0.414766, loss_mean_cls: 2.328330, grad_norm: 9.994140 +Steps: 0%| | 4511/1000000 [18:32<67:31:19, 4.10it/s, grad_norm=9.99, loss_final=2.82, loss_mean=0.902, loss_mean_cls=2.33, proj_loss=-0.415][2026-03-23 13:54:52] Step: 4511, Training Logs: loss_final: 2.963181, loss_mean: 0.916786, proj_loss: -0.407318, loss_mean_cls: 2.453714, grad_norm: 10.210347 +Steps: 0%| | 4512/1000000 [18:32<67:31:29, 4.10it/s, grad_norm=10.2, loss_final=2.96, loss_mean=0.917, loss_mean_cls=2.45, proj_loss=-0.407][2026-03-23 13:54:52] Step: 4512, Training Logs: loss_final: 2.587175, loss_mean: 0.948497, proj_loss: -0.417534, loss_mean_cls: 2.056212, grad_norm: 4.911975 +Steps: 0%| | 4513/1000000 [18:33<67:30:18, 4.10it/s, grad_norm=4.91, loss_final=2.59, loss_mean=0.948, loss_mean_cls=2.06, proj_loss=-0.418][2026-03-23 13:54:53] Step: 4513, Training Logs: loss_final: 2.947640, loss_mean: 0.902580, proj_loss: -0.414820, loss_mean_cls: 2.459880, grad_norm: 29.276470 +Steps: 0%| | 4514/1000000 [18:33<67:28:03, 4.10it/s, grad_norm=29.3, loss_final=2.95, loss_mean=0.903, loss_mean_cls=2.46, proj_loss=-0.415][2026-03-23 13:54:53] Step: 4514, Training Logs: loss_final: 2.411468, loss_mean: 0.912481, proj_loss: -0.399295, loss_mean_cls: 1.898283, grad_norm: 12.765885 +Steps: 0%| | 4515/1000000 [18:33<67:29:06, 4.10it/s, grad_norm=12.8, loss_final=2.41, loss_mean=0.912, loss_mean_cls=1.9, proj_loss=-0.399][2026-03-23 13:54:53] Step: 4515, Training Logs: loss_final: 2.436088, loss_mean: 0.941725, proj_loss: -0.395720, loss_mean_cls: 1.890083, grad_norm: 10.783795 +Steps: 0%| | 4516/1000000 [18:33<70:11:41, 3.94it/s, grad_norm=10.8, loss_final=2.44, loss_mean=0.942, loss_mean_cls=1.89, proj_loss=-0.396][2026-03-23 13:54:53] Step: 4516, Training Logs: loss_final: 2.414021, loss_mean: 0.951383, proj_loss: -0.414231, loss_mean_cls: 1.876869, grad_norm: 2.950766 +Steps: 0%| | 4517/1000000 [18:34<69:21:25, 3.99it/s, grad_norm=2.95, loss_final=2.41, loss_mean=0.951, loss_mean_cls=1.88, proj_loss=-0.414][2026-03-23 13:54:54] Step: 4517, Training Logs: loss_final: 2.591057, loss_mean: 0.921543, proj_loss: -0.427630, loss_mean_cls: 
2.097143, grad_norm: 18.902555 +Steps: 0%| | 4518/1000000 [18:34<68:47:28, 4.02it/s, grad_norm=18.9, loss_final=2.59, loss_mean=0.922, loss_mean_cls=2.1, proj_loss=-0.428][2026-03-23 13:54:54] Step: 4518, Training Logs: loss_final: 2.748575, loss_mean: 0.915992, proj_loss: -0.399860, loss_mean_cls: 2.232443, grad_norm: 19.242905
+[... steps 4519-4622 continue in the same format (loss_final ~2.0-3.2, grad_norm ~2-53, ~4.1 it/s) and are elided ...]
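One consistency check these records admit: in each record here, the logged `loss_final` equals `loss_mean + loss_mean_cls + proj_loss` to the printed precision (e.g. 0.915992 + 2.232443 − 0.399860 = 2.748575 at step 4518 above). Below is a minimal sketch that parses one record and verifies that identity; the regex is derived from the log format shown here, not taken from the repo's code:

```python
import re

# Field order as it appears in the output.log records above.
RECORD = re.compile(
    r"Step: (?P<step>\d+), Training Logs: "
    r"loss_final: (?P<loss_final>-?\d+\.\d+), "
    r"loss_mean: (?P<loss_mean>-?\d+\.\d+), "
    r"proj_loss: (?P<proj_loss>-?\d+\.\d+), "
    r"loss_mean_cls: (?P<loss_mean_cls>-?\d+\.\d+), "
    r"grad_norm: (?P<grad_norm>-?\d+\.\d+)"
)

def parse_record(line: str) -> dict:
    """Extract the numeric fields from one training-log record."""
    m = RECORD.search(line)
    if m is None:
        raise ValueError("not a training-log record")
    return {k: float(v) for k, v in m.groupdict().items()}

# Record copied verbatim from the log above (step 4518):
rec = parse_record(
    "Step: 4518, Training Logs: loss_final: 2.748575, loss_mean: 0.915992, "
    "proj_loss: -0.399860, loss_mean_cls: 2.232443, grad_norm: 19.242905"
)
total = rec["loss_mean"] + rec["loss_mean_cls"] + rec["proj_loss"]
assert abs(total - rec["loss_final"]) < 1e-5  # holds to the printed precision
```

Whether the individual terms already include the `--cls` and `--proj-coeff` weights is not visible from the logs alone; the identity only shows how the logged total is assembled.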
diff --git a/back/wandb/run-20260323_133616-b1ci8tv6/files/requirements.txt b/back/wandb/run-20260323_133616-b1ci8tv6/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0235910d0d99b7dee69b9a7f2f90012c8b711cc --- /dev/null +++ 
b/back/wandb/run-20260323_133616-b1ci8tv6/files/requirements.txt @@ -0,0 +1,168 @@ +dill==0.3.8 +mkl-service==2.4.0 +mpmath==1.3.0 +typing_extensions==4.12.2 +urllib3==2.3.0 +torch==2.5.1 +ptyprocess==0.7.0 +traitlets==5.14.3 +pyasn1==0.6.1 +opencv-python-headless==4.12.0.88 +nest-asyncio==1.6.0 +kiwisolver==1.4.8 +click==8.2.1 +fire==0.7.1 +diffusers==0.35.1 +accelerate==1.7.0 +ipykernel==6.29.5 +peft==0.17.1 +attrs==24.3.0 +six==1.17.0 +numpy==2.0.1 +yarl==1.18.0 +huggingface_hub==0.34.4 +Bottleneck==1.4.2 +numexpr==2.11.0 +dataclasses==0.6 +typing-inspection==0.4.1 +safetensors==0.5.3 +pyparsing==3.2.3 +psutil==7.0.0 +imageio==2.37.0 +debugpy==1.8.14 +cycler==0.12.1 +pyasn1_modules==0.4.2 +matplotlib-inline==0.1.7 +matplotlib==3.10.3 +jedi==0.19.2 +tokenizers==0.21.2 +seaborn==0.13.2 +timm==1.0.15 +aiohappyeyeballs==2.6.1 +hf-xet==1.1.8 +multidict==6.1.0 +tqdm==4.67.1 +wheel==0.45.1 +simsimd==6.5.1 +sentencepiece==0.2.1 +grpcio==1.74.0 +asttokens==3.0.0 +absl-py==2.3.1 +stack-data==0.6.3 +pandas==2.3.0 +importlib_metadata==8.7.0 +pytorch-image-generation-metrics==0.6.1 +frozenlist==1.5.0 +MarkupSafe==3.0.2 +setuptools==78.1.1 +multiprocess==0.70.15 +pip==25.1 +requests==2.32.3 +mkl_random==1.2.8 +tensorboard-plugin-wit==1.8.1 +ExifRead-nocycle==3.0.1 +webdataset==0.2.111 +threadpoolctl==3.6.0 +pyarrow==21.0.0 +executing==2.2.0 +decorator==5.2.1 +contourpy==1.3.2 +annotated-types==0.7.0 +scikit-learn==1.7.1 +jupyter_client==8.6.3 +albumentations==1.4.24 +wandb==0.25.0 +certifi==2025.8.3 +idna==3.7 +xxhash==3.5.0 +Jinja2==3.1.6 +python-dateutil==2.9.0.post0 +aiosignal==1.4.0 +triton==3.1.0 +torchvision==0.20.1 +stringzilla==3.12.6 +pure_eval==0.2.3 +braceexpand==0.1.7 +zipp==3.22.0 +oauthlib==3.3.1 +Markdown==3.8.2 +fsspec==2025.3.0 +fonttools==4.58.2 +comm==0.2.2 +ipython==9.3.0 +img2dataset==1.47.0 +networkx==3.4.2 +PySocks==1.7.1 +tzdata==2025.2 +smmap==5.0.2 +mkl_fft==1.3.11 +sentry-sdk==2.29.1 +Pygments==2.19.1 +pexpect==4.9.0 +ftfy==6.3.1 +einops==0.8.1 +requests-oauthlib==2.0.0 +gitdb==4.0.12 +albucore==0.0.23 +torchdiffeq==0.2.5 +GitPython==3.1.44 +bitsandbytes==0.47.0 +pytorch-fid==0.3.0 +clean-fid==0.1.35 +pytorch-gan-metrics==0.5.4 +Brotli==1.0.9 +charset-normalizer==3.3.2 +gmpy2==2.2.1 +pillow==11.1.0 +PyYAML==6.0.2 +tornado==6.5.1 +termcolor==3.1.0 +setproctitle==1.3.6 +scipy==1.15.3 +regex==2024.11.6 +protobuf==6.31.1 +platformdirs==4.3.8 +joblib==1.5.1 +cachetools==4.2.4 +ipython_pygments_lexers==1.1.1 +google-auth==1.35.0 +transformers==4.53.2 +torch-fidelity==0.3.0 +tensorboard==2.4.0 +filelock==3.17.0 +packaging==25.0 +propcache==0.3.1 +pytz==2025.2 +aiohttp==3.11.10 +wcwidth==0.2.13 +clip==0.2.0 +Werkzeug==3.1.3 +tensorboard-data-server==0.6.1 +sympy==1.13.1 +pyzmq==26.4.0 +pydantic_core==2.33.2 +prompt_toolkit==3.0.51 +parso==0.8.4 +docker-pycreds==0.4.0 +rsa==4.9.1 +pydantic==2.11.5 +jupyter_core==5.8.1 +google-auth-oauthlib==0.4.6 +datasets==4.0.0 +torch-tb-profiler==0.4.3 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +more-itertools==10.3.0 +packaging==24.2 +platformdirs==4.2.2 +typeguard==4.3.0 +inflect==7.3.1 +jaraco.text==3.12.1 +tomli==2.0.1 +typing_extensions==4.12.2 +wheel==0.45.1 +zipp==3.19.2 diff --git a/back/wandb/run-20260323_133616-b1ci8tv6/files/wandb-metadata.json b/back/wandb/run-20260323_133616-b1ci8tv6/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ac8b6627bcc96712a9a786d734cc988efea045 --- /dev/null 
+++ b/back/wandb/run-20260323_133616-b1ci8tv6/files/wandb-metadata.json @@ -0,0 +1,101 @@ +{ + "os": "Linux-5.15.0-94-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.9", + "startedAt": "2026-03-23T05:36:16.612764Z", + "args": [ + "--report-to", + "wandb", + "--allow-tf32", + "--mixed-precision", + "bf16", + "--seed", + "0", + "--path-type", + "linear", + "--prediction", + "v", + "--weighting", + "uniform", + "--model", + "SiT-XL/2", + "--enc-type", + "dinov2-vit-b", + "--encoder-depth", + "8", + "--proj-coeff", + "0.5", + "--output-dir", + "exps", + "--exp-name", + "jsflow-experiment-0.75", + "--batch-size", + "256", + "--data-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256", + "--semantic-features-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0", + "--learning-rate", + "0.00005", + "--t-c", + "0.75", + "--cls", + "0.2", + "--ot-cls" + ], + "program": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "git": { + "remote": "https://github.com/Martinser/REG.git", + "commit": "021ea2e50c38c5803bd9afff16316958a01fbd1d" + }, + "email": "2365972933@qq.com", + "root": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG", + "host": "24c964746905d416ce09d045f9a06f23-taskrole1-0", + "executable": "/gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 4, + "disk": { + "/": { + "total": "3838880616448", + "used": "357567815680" + } + }, + "memory": { + "total": "2164115296256" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-a09f2421-99e6-a72e-63bd-fd7452510758" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e" + } + ], + "cudaVersion": "13.0", + "writerId": "sv5cjreim34cqttqhabmnhfdttwvk4c4" +} \ No newline at end of file diff --git a/back/wandb/run-20260323_133616-b1ci8tv6/logs/debug-internal.log b/back/wandb/run-20260323_133616-b1ci8tv6/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..3819cea5f310a14ba2e742e102c864ce4782a3f9 --- /dev/null +++ b/back/wandb/run-20260323_133616-b1ci8tv6/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-03-23T13:36:16.970320415+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"} +{"time":"2026-03-23T13:36:18.27874562+08:00","level":"INFO","msg":"stream: created new stream","id":"b1ci8tv6"} +{"time":"2026-03-23T13:36:18.278923889+08:00","level":"INFO","msg":"handler: started","stream_id":"b1ci8tv6"} +{"time":"2026-03-23T13:36:18.279752929+08:00","level":"INFO","msg":"stream: started","id":"b1ci8tv6"} +{"time":"2026-03-23T13:36:18.279811391+08:00","level":"INFO","msg":"writer: started","stream_id":"b1ci8tv6"} +{"time":"2026-03-23T13:36:18.279861642+08:00","level":"INFO","msg":"sender: 
started","stream_id":"b1ci8tv6"} diff --git a/back/wandb/run-20260323_133616-b1ci8tv6/logs/debug.log b/back/wandb/run-20260323_133616-b1ci8tv6/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..57149d7951f9226107d9a65c7d2ed30d835de008 --- /dev/null +++ b/back/wandb/run-20260323_133616-b1ci8tv6/logs/debug.log @@ -0,0 +1,20 @@ +2026-03-23 13:36:16,646 INFO MainThread:394082 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0 +2026-03-23 13:36:16,646 INFO MainThread:394082 [wandb_setup.py:_flush():81] Configure stats pid to 394082 +2026-03-23 13:36:16,646 INFO MainThread:394082 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-23 13:36:16,646 INFO MainThread:394082 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260323_133616-b1ci8tv6/logs/debug.log +2026-03-23 13:36:16,646 INFO MainThread:394082 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260323_133616-b1ci8tv6/logs/debug-internal.log +2026-03-23 13:36:16,646 INFO MainThread:394082 [wandb_init.py:init():844] calling init triggers +2026-03-23 13:36:16,647 INFO MainThread:394082 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-03-23 13:36:16,647 INFO MainThread:394082 [wandb_init.py:init():892] starting backend +2026-03-23 13:36:16,953 INFO MainThread:394082 [wandb_init.py:init():895] sending inform_init request +2026-03-23 13:36:16,967 INFO MainThread:394082 [wandb_init.py:init():903] backend started and connected +2026-03-23 13:36:16,968 INFO MainThread:394082 [wandb_init.py:init():973] updated telemetry +2026-03-23 13:36:16,988 INFO MainThread:394082 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-03-23 13:36:19,831 INFO MainThread:394082 [wandb_init.py:init():1042] starting run threads in backend +2026-03-23 13:36:19,922 INFO MainThread:394082 [wandb_run.py:_console_start():2524] atexit reg +2026-03-23 13:36:19,922 INFO MainThread:394082 [wandb_run.py:_redirect():2373] redirect: wrap_raw +2026-03-23 13:36:19,922 INFO MainThread:394082 [wandb_run.py:_redirect():2442] Wrapping output streams. +2026-03-23 13:36:19,922 INFO MainThread:394082 [wandb_run.py:_redirect():2465] Redirects installed. 
+2026-03-23 13:36:19,927 INFO MainThread:394082 [wandb_init.py:init():1082] run started, returning control to user process +2026-03-23 13:36:19,927 INFO MainThread:394082 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment-0.75', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 2000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.2, 't_c': 0.75, 'ot_cls': True} diff --git a/back/wandb/run-20260323_135607-zue1y2ba/files/output.log b/back/wandb/run-20260323_135607-zue1y2ba/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7d6fb275e37d00c05a73ce8850606f6986768669 --- /dev/null +++ b/back/wandb/run-20260323_135607-zue1y2ba/files/output.log @@ -0,0 +1,289 @@ +Steps: 0%| | 1/1000000 [00:02<603:07:37, 2.17s/it][2026-03-23 13:56:13] Generating EMA samples (Euler-Maruyama; t≈0.75 → t=0)... +[2026-03-23 13:56:16] Step: 1, Training Logs: loss_final: 4.886707, loss_mean: 1.706308, proj_loss: 0.001541, loss_mean_cls: 3.178859, grad_norm: 1.484174 +Steps: 0%| | 2/1000000 [00:04<669:29:28, 2.41s/it, grad_norm=1.48, loss_final=4.89, loss_mean=1.71, loss_mean_cls=3.18, proj_loss=0.00154][2026-03-23 13:56:16] Step: 2, Training Logs: loss_final: 4.294325, loss_mean: 1.689698, proj_loss: -0.010266, loss_mean_cls: 2.614893, grad_norm: 1.062257 +Steps: 0%| | 3/1000000 [00:04<394:40:41, 1.42s/it, grad_norm=1.06, loss_final=4.29, loss_mean=1.69, loss_mean_cls=2.61, proj_loss=-0.0103][2026-03-23 13:56:16] Step: 3, Training Logs: loss_final: 4.829489, loss_mean: 1.666703, proj_loss: -0.019215, loss_mean_cls: 3.182001, grad_norm: 1.115940 +Steps: 0%| | 4/1000000 [00:05<265:36:26, 1.05it/s, grad_norm=1.12, loss_final=4.83, loss_mean=1.67, loss_mean_cls=3.18, proj_loss=-0.0192][2026-03-23 13:56:16] Step: 4, Training Logs: loss_final: 4.838729, loss_mean: 1.683388, proj_loss: -0.026348, loss_mean_cls: 3.181689, grad_norm: 0.753645 +Steps: 0%| | 5/1000000 [00:05<194:13:56, 1.43it/s, grad_norm=0.754, loss_final=4.84, loss_mean=1.68, loss_mean_cls=3.18, proj_loss=-0.0263][2026-03-23 13:56:17] Step: 5, Training Logs: loss_final: 4.434019, loss_mean: 1.678989, proj_loss: -0.034618, loss_mean_cls: 2.789647, grad_norm: 0.828005 +Steps: 0%| | 6/1000000 [00:05<151:13:33, 1.84it/s, grad_norm=0.828, loss_final=4.43, loss_mean=1.68, loss_mean_cls=2.79, proj_loss=-0.0346][2026-03-23 13:56:17] Step: 6, Training Logs: loss_final: 4.717834, loss_mean: 1.685299, proj_loss: -0.039478, loss_mean_cls: 3.072013, grad_norm: 0.944329 +Steps: 0%| | 7/1000000 [00:05<123:56:37, 2.24it/s, grad_norm=0.944, loss_final=4.72, loss_mean=1.69, loss_mean_cls=3.07, proj_loss=-0.0395][2026-03-23 13:56:17] Step: 7, Training Logs: loss_final: 4.768410, loss_mean: 1.687585, 
proj_loss: -0.042706, loss_mean_cls: 3.123530, grad_norm: 0.827338 +Steps: 0%| | 8/1000000 [00:06<106:03:40, 2.62it/s, grad_norm=0.827, loss_final=4.77, loss_mean=1.69, loss_mean_cls=3.12, proj_loss=-0.0427][2026-03-23 13:56:17] Step: 8, Training Logs: loss_final: 4.815378, loss_mean: 1.655840, proj_loss: -0.044987, loss_mean_cls: 3.204525, grad_norm: 0.854108
+[... steps 9-67 continue in the same format as throughput stabilizes near 4.1 it/s and are elided ...]
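The tqdm prefix on each line ("done/total [elapsed<remaining, rate]") can be sanity-checked against the printed rate. A small sketch, assuming tqdm's usual remaining-time estimate `(total - done) / rate`; the few-minute gap versus the logged value comes from tqdm's rate smoothing and the two-digit rounding of the displayed rate:

```python
def eta_hms(done: int, total: int, it_per_s: float) -> str:
    """Remaining wall-clock time, formatted like tqdm's h:mm:ss field."""
    remaining = (total - done) / it_per_s
    h, rem = divmod(int(remaining), 3600)
    m, s = divmod(rem, 60)
    return f"{h}:{m:02d}:{s:02d}"

# "8/1000000 [00:06<106:03:40, 2.62it/s]" from the record above:
print(eta_hms(8, 1_000_000, 2.62))  # -> 106:01:16, vs. the logged 106:03:40
```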
+[... steps 68-78 elided ...] +Steps: 0%| | 79/1000000 [00:23<67:48:01, 4.10it/s, grad_norm=1.41, loss_final=4.45, loss_mean=1.31, loss_mean_cls=3.2, proj_loss=-0.0578][2026-03-23 13:56:35] Step: 79, Training Logs: loss_final: 4.300065, loss_mean: 1.256680, 
proj_loss: -0.057729, loss_mean_cls: 3.101114, grad_norm: 1.063736 +Steps: 0%| | 80/1000000 [00:23<67:46:24, 4.10it/s, grad_norm=1.06, loss_final=4.3, loss_mean=1.26, loss_mean_cls=3.1, proj_loss=-0.0577][2026-03-23 13:56:35] Step: 80, Training Logs: loss_final: 5.255699, loss_mean: 1.230699, proj_loss: -0.055231, loss_mean_cls: 4.080230, grad_norm: 1.159766 +Steps: 0%| | 81/1000000 [00:24<67:45:22, 4.10it/s, grad_norm=1.16, loss_final=5.26, loss_mean=1.23, loss_mean_cls=4.08, proj_loss=-0.0552][2026-03-23 13:56:35] Step: 81, Training Logs: loss_final: 4.184104, loss_mean: 1.256977, proj_loss: -0.057558, loss_mean_cls: 2.984685, grad_norm: 1.074197 +Steps: 0%| | 82/1000000 [00:24<67:46:48, 4.10it/s, grad_norm=1.07, loss_final=4.18, loss_mean=1.26, loss_mean_cls=2.98, proj_loss=-0.0576][2026-03-23 13:56:36] Step: 82, Training Logs: loss_final: 4.368578, loss_mean: 1.246449, proj_loss: -0.055964, loss_mean_cls: 3.178093, grad_norm: 1.212082 +Steps: 0%| | 83/1000000 [00:24<67:46:42, 4.10it/s, grad_norm=1.21, loss_final=4.37, loss_mean=1.25, loss_mean_cls=3.18, proj_loss=-0.056][2026-03-23 13:56:36] Step: 83, Training Logs: loss_final: 4.066692, loss_mean: 1.279943, proj_loss: -0.057100, loss_mean_cls: 2.843850, grad_norm: 1.081535 +Steps: 0%| | 84/1000000 [00:24<67:45:25, 4.10it/s, grad_norm=1.08, loss_final=4.07, loss_mean=1.28, loss_mean_cls=2.84, proj_loss=-0.0571][2026-03-23 13:56:36] Step: 84, Training Logs: loss_final: 4.419612, loss_mean: 1.222443, proj_loss: -0.056123, loss_mean_cls: 3.253291, grad_norm: 1.017672 +Steps: 0%| | 85/1000000 [00:25<67:46:48, 4.10it/s, grad_norm=1.02, loss_final=4.42, loss_mean=1.22, loss_mean_cls=3.25, proj_loss=-0.0561][2026-03-23 13:56:36] Step: 85, Training Logs: loss_final: 4.330603, loss_mean: 1.217940, proj_loss: -0.055811, loss_mean_cls: 3.168474, grad_norm: 1.048292 +Steps: 0%| | 86/1000000 [00:25<67:46:18, 4.10it/s, grad_norm=1.05, loss_final=4.33, loss_mean=1.22, loss_mean_cls=3.17, proj_loss=-0.0558][2026-03-23 13:56:37] Step: 86, Training Logs: loss_final: 4.073160, loss_mean: 1.261520, proj_loss: -0.057447, loss_mean_cls: 2.869087, grad_norm: 2.760909 +Steps: 0%| | 87/1000000 [00:25<67:46:56, 4.10it/s, grad_norm=2.76, loss_final=4.07, loss_mean=1.26, loss_mean_cls=2.87, proj_loss=-0.0574][2026-03-23 13:56:37] Step: 87, Training Logs: loss_final: 4.349278, loss_mean: 1.225316, proj_loss: -0.055978, loss_mean_cls: 3.179941, grad_norm: 1.415633 +Steps: 0%| | 88/1000000 [00:25<67:48:19, 4.10it/s, grad_norm=1.42, loss_final=4.35, loss_mean=1.23, loss_mean_cls=3.18, proj_loss=-0.056][2026-03-23 13:56:37] Step: 88, Training Logs: loss_final: 4.191838, loss_mean: 1.244159, proj_loss: -0.053158, loss_mean_cls: 3.000837, grad_norm: 2.142123 +Steps: 0%| | 89/1000000 [00:26<67:48:47, 4.10it/s, grad_norm=2.14, loss_final=4.19, loss_mean=1.24, loss_mean_cls=3, proj_loss=-0.0532][2026-03-23 13:56:37] Step: 89, Training Logs: loss_final: 3.810838, loss_mean: 1.241034, proj_loss: -0.056567, loss_mean_cls: 2.626369, grad_norm: 1.977536 +Steps: 0%| | 90/1000000 [00:26<67:49:26, 4.10it/s, grad_norm=1.98, loss_final=3.81, loss_mean=1.24, loss_mean_cls=2.63, proj_loss=-0.0566][2026-03-23 13:56:37] Step: 90, Training Logs: loss_final: 4.302361, loss_mean: 1.235736, proj_loss: -0.055757, loss_mean_cls: 3.122383, grad_norm: 1.465364 +Steps: 0%| | 91/1000000 [00:26<67:49:17, 4.10it/s, grad_norm=1.47, loss_final=4.3, loss_mean=1.24, loss_mean_cls=3.12, proj_loss=-0.0558][2026-03-23 13:56:38] Step: 91, Training Logs: loss_final: 4.793566, loss_mean: 1.295077, proj_loss: 
-0.054678, loss_mean_cls: 3.553167, grad_norm: 4.777657 +Steps: 0%| | 92/1000000 [00:26<67:50:23, 4.09it/s, grad_norm=4.78, loss_final=4.79, loss_mean=1.3, loss_mean_cls=3.55, proj_loss=-0.0547][2026-03-23 13:56:38] Step: 92, Training Logs: loss_final: 4.318158, loss_mean: 1.268673, proj_loss: -0.056761, loss_mean_cls: 3.106246, grad_norm: 3.593853 +Steps: 0%| | 93/1000000 [00:26<67:49:16, 4.10it/s, grad_norm=3.59, loss_final=4.32, loss_mean=1.27, loss_mean_cls=3.11, proj_loss=-0.0568][2026-03-23 13:56:38] Step: 93, Training Logs: loss_final: 4.038731, loss_mean: 1.237636, proj_loss: -0.058346, loss_mean_cls: 2.859441, grad_norm: 2.444315 +Steps: 0%| | 94/1000000 [00:27<67:49:30, 4.10it/s, grad_norm=2.44, loss_final=4.04, loss_mean=1.24, loss_mean_cls=2.86, proj_loss=-0.0583][2026-03-23 13:56:38] Step: 94, Training Logs: loss_final: 5.108729, loss_mean: 1.212993, proj_loss: -0.058394, loss_mean_cls: 3.954130, grad_norm: 1.895388 +Steps: 0%| | 95/1000000 [00:27<67:48:34, 4.10it/s, grad_norm=1.9, loss_final=5.11, loss_mean=1.21, loss_mean_cls=3.95, proj_loss=-0.0584][2026-03-23 13:56:39] Step: 95, Training Logs: loss_final: 4.209163, loss_mean: 1.207963, proj_loss: -0.055074, loss_mean_cls: 3.056275, grad_norm: 1.442985 +Steps: 0%| | 96/1000000 [00:27<67:50:56, 4.09it/s, grad_norm=1.44, loss_final=4.21, loss_mean=1.21, loss_mean_cls=3.06, proj_loss=-0.0551][2026-03-23 13:56:39] Step: 96, Training Logs: loss_final: 4.601864, loss_mean: 1.191701, proj_loss: -0.057576, loss_mean_cls: 3.467739, grad_norm: 1.299760 +Steps: 0%| | 97/1000000 [00:27<67:51:29, 4.09it/s, grad_norm=1.3, loss_final=4.6, loss_mean=1.19, loss_mean_cls=3.47, proj_loss=-0.0576][2026-03-23 13:56:39] Step: 97, Training Logs: loss_final: 4.614489, loss_mean: 1.185281, proj_loss: -0.056098, loss_mean_cls: 3.485306, grad_norm: 1.128512 +Steps: 0%| | 98/1000000 [00:28<67:50:38, 4.09it/s, grad_norm=1.13, loss_final=4.61, loss_mean=1.19, loss_mean_cls=3.49, proj_loss=-0.0561][2026-03-23 13:56:39] Step: 98, Training Logs: loss_final: 4.294365, loss_mean: 1.196555, proj_loss: -0.058301, loss_mean_cls: 3.156111, grad_norm: 1.399289 +Steps: 0%| | 99/1000000 [00:28<67:49:41, 4.09it/s, grad_norm=1.4, loss_final=4.29, loss_mean=1.2, loss_mean_cls=3.16, proj_loss=-0.0583][2026-03-23 13:56:40] Step: 99, Training Logs: loss_final: 4.215539, loss_mean: 1.199367, proj_loss: -0.057493, loss_mean_cls: 3.073665, grad_norm: 1.343760 +Steps: 0%| | 100/1000000 [00:28<67:51:40, 4.09it/s, grad_norm=1.34, loss_final=4.22, loss_mean=1.2, loss_mean_cls=3.07, proj_loss=-0.0575][2026-03-23 13:56:40] Step: 100, Training Logs: loss_final: 4.488984, loss_mean: 1.149765, proj_loss: -0.056815, loss_mean_cls: 3.396034, grad_norm: 1.045582 +Steps: 0%| | 101/1000000 [00:28<67:50:50, 4.09it/s, grad_norm=1.05, loss_final=4.49, loss_mean=1.15, loss_mean_cls=3.4, proj_loss=-0.0568][2026-03-23 13:56:40] Step: 101, Training Logs: loss_final: 4.368436, loss_mean: 1.178788, proj_loss: -0.058116, loss_mean_cls: 3.247765, grad_norm: 1.342256 +Steps: 0%| | 102/1000000 [00:29<67:50:14, 4.09it/s, grad_norm=1.34, loss_final=4.37, loss_mean=1.18, loss_mean_cls=3.25, proj_loss=-0.0581][2026-03-23 13:56:40] Step: 102, Training Logs: loss_final: 4.096940, loss_mean: 1.195665, proj_loss: -0.053437, loss_mean_cls: 2.954712, grad_norm: 1.167338 +Steps: 0%| | 103/1000000 [00:29<67:52:23, 4.09it/s, grad_norm=1.17, loss_final=4.1, loss_mean=1.2, loss_mean_cls=2.95, proj_loss=-0.0534][2026-03-23 13:56:41] Step: 103, Training Logs: loss_final: 4.406911, loss_mean: 1.123473, proj_loss: 
-0.055880, loss_mean_cls: 3.339318, grad_norm: 0.934016 +Steps: 0%| | 104/1000000 [00:29<67:52:00, 4.09it/s, grad_norm=0.934, loss_final=4.41, loss_mean=1.12, loss_mean_cls=3.34, proj_loss=-0.0559][2026-03-23 13:56:41] Step: 104, Training Logs: loss_final: 3.831389, loss_mean: 1.127624, proj_loss: -0.056442, loss_mean_cls: 2.760207, grad_norm: 1.079511 +Steps: 0%| | 105/1000000 [00:29<67:50:29, 4.09it/s, grad_norm=1.08, loss_final=3.83, loss_mean=1.13, loss_mean_cls=2.76, proj_loss=-0.0564][2026-03-23 13:56:41] Step: 105, Training Logs: loss_final: 3.949926, loss_mean: 1.169529, proj_loss: -0.056276, loss_mean_cls: 2.836674, grad_norm: 0.823243 +Steps: 0%| | 106/1000000 [00:30<67:49:55, 4.09it/s, grad_norm=0.823, loss_final=3.95, loss_mean=1.17, loss_mean_cls=2.84, proj_loss=-0.0563][2026-03-23 13:56:41] Step: 106, Training Logs: loss_final: 4.447761, loss_mean: 1.131930, proj_loss: -0.057379, loss_mean_cls: 3.373209, grad_norm: 0.780265 +Steps: 0%| | 107/1000000 [00:30<67:50:26, 4.09it/s, grad_norm=0.78, loss_final=4.45, loss_mean=1.13, loss_mean_cls=3.37, proj_loss=-0.0574][2026-03-23 13:56:42] Step: 107, Training Logs: loss_final: 4.185682, loss_mean: 1.137062, proj_loss: -0.056792, loss_mean_cls: 3.105412, grad_norm: 0.896073 +Steps: 0%| | 108/1000000 [00:30<67:51:47, 4.09it/s, grad_norm=0.896, loss_final=4.19, loss_mean=1.14, loss_mean_cls=3.11, proj_loss=-0.0568][2026-03-23 13:56:42] Step: 108, Training Logs: loss_final: 4.381330, loss_mean: 1.127375, proj_loss: -0.057231, loss_mean_cls: 3.311185, grad_norm: 0.791624 +Steps: 0%| | 109/1000000 [00:30<67:50:27, 4.09it/s, grad_norm=0.792, loss_final=4.38, loss_mean=1.13, loss_mean_cls=3.31, proj_loss=-0.0572][2026-03-23 13:56:42] Step: 109, Training Logs: loss_final: 4.101658, loss_mean: 1.133106, proj_loss: -0.058366, loss_mean_cls: 3.026918, grad_norm: 1.389539 +Steps: 0%| | 110/1000000 [00:31<67:50:56, 4.09it/s, grad_norm=1.39, loss_final=4.1, loss_mean=1.13, loss_mean_cls=3.03, proj_loss=-0.0584][2026-03-23 13:56:42] Step: 110, Training Logs: loss_final: 4.222694, loss_mean: 1.125139, proj_loss: -0.056927, loss_mean_cls: 3.154482, grad_norm: 1.616820 +Steps: 0%| | 111/1000000 [00:31<67:52:18, 4.09it/s, grad_norm=1.62, loss_final=4.22, loss_mean=1.13, loss_mean_cls=3.15, proj_loss=-0.0569][2026-03-23 13:56:43] Step: 111, Training Logs: loss_final: 3.836849, loss_mean: 1.123032, proj_loss: -0.057576, loss_mean_cls: 2.771394, grad_norm: 1.516640 +Steps: 0%| | 112/1000000 [00:31<67:50:27, 4.09it/s, grad_norm=1.52, loss_final=3.84, loss_mean=1.12, loss_mean_cls=2.77, proj_loss=-0.0576][2026-03-23 13:56:43] Step: 112, Training Logs: loss_final: 4.385750, loss_mean: 1.170239, proj_loss: -0.057827, loss_mean_cls: 3.273338, grad_norm: 3.234420 +Steps: 0%| | 113/1000000 [00:31<67:50:21, 4.09it/s, grad_norm=3.23, loss_final=4.39, loss_mean=1.17, loss_mean_cls=3.27, proj_loss=-0.0578][2026-03-23 13:56:43] Step: 113, Training Logs: loss_final: 4.791656, loss_mean: 1.139730, proj_loss: -0.057919, loss_mean_cls: 3.709845, grad_norm: 2.920728 +Steps: 0%| | 114/1000000 [00:32<67:52:04, 4.09it/s, grad_norm=2.92, loss_final=4.79, loss_mean=1.14, loss_mean_cls=3.71, proj_loss=-0.0579][2026-03-23 13:56:43] Step: 114, Training Logs: loss_final: 4.198164, loss_mean: 1.112274, proj_loss: -0.054186, loss_mean_cls: 3.140076, grad_norm: 1.197350 +Steps: 0%| | 115/1000000 [00:32<67:52:07, 4.09it/s, grad_norm=1.2, loss_final=4.2, loss_mean=1.11, loss_mean_cls=3.14, proj_loss=-0.0542][2026-03-23 13:56:44] Step: 115, Training Logs: loss_final: 4.495673, 
loss_mean: 1.149152, proj_loss: -0.056316, loss_mean_cls: 3.402837, grad_norm: 1.762717 +Steps: 0%| | 116/1000000 [00:32<67:52:00, 4.09it/s, grad_norm=1.76, loss_final=4.5, loss_mean=1.15, loss_mean_cls=3.4, proj_loss=-0.0563][2026-03-23 13:56:44] Step: 116, Training Logs: loss_final: 3.863493, loss_mean: 1.173006, proj_loss: -0.057397, loss_mean_cls: 2.747884, grad_norm: 2.976036 +Steps: 0%| | 117/1000000 [00:32<67:52:53, 4.09it/s, grad_norm=2.98, loss_final=3.86, loss_mean=1.17, loss_mean_cls=2.75, proj_loss=-0.0574][2026-03-23 13:56:44] Step: 117, Training Logs: loss_final: 4.637027, loss_mean: 1.119735, proj_loss: -0.056042, loss_mean_cls: 3.573334, grad_norm: 2.094532 +Steps: 0%| | 118/1000000 [00:33<67:52:15, 4.09it/s, grad_norm=2.09, loss_final=4.64, loss_mean=1.12, loss_mean_cls=3.57, proj_loss=-0.056][2026-03-23 13:56:44] Step: 118, Training Logs: loss_final: 3.896062, loss_mean: 1.133374, proj_loss: -0.057744, loss_mean_cls: 2.820432, grad_norm: 2.091462 +Steps: 0%| | 119/1000000 [00:33<67:52:19, 4.09it/s, grad_norm=2.09, loss_final=3.9, loss_mean=1.13, loss_mean_cls=2.82, proj_loss=-0.0577][2026-03-23 13:56:45] Step: 119, Training Logs: loss_final: 4.494811, loss_mean: 1.127229, proj_loss: -0.056165, loss_mean_cls: 3.423747, grad_norm: 2.573430 +Steps: 0%| | 120/1000000 [00:33<67:52:07, 4.09it/s, grad_norm=2.57, loss_final=4.49, loss_mean=1.13, loss_mean_cls=3.42, proj_loss=-0.0562][2026-03-23 13:56:45] Step: 120, Training Logs: loss_final: 4.395417, loss_mean: 1.094057, proj_loss: -0.057758, loss_mean_cls: 3.359118, grad_norm: 1.652974 +Steps: 0%| | 121/1000000 [00:33<67:49:02, 4.10it/s, grad_norm=1.65, loss_final=4.4, loss_mean=1.09, loss_mean_cls=3.36, proj_loss=-0.0578][2026-03-23 13:56:45] Step: 121, Training Logs: loss_final: 3.780295, loss_mean: 1.143866, proj_loss: -0.057413, loss_mean_cls: 2.693842, grad_norm: 2.330424 +Steps: 0%| | 122/1000000 [00:34<67:51:29, 4.09it/s, grad_norm=2.33, loss_final=3.78, loss_mean=1.14, loss_mean_cls=2.69, proj_loss=-0.0574][2026-03-23 13:56:45] Step: 122, Training Logs: loss_final: 5.011144, loss_mean: 1.122681, proj_loss: -0.058837, loss_mean_cls: 3.947300, grad_norm: 1.509129 +Steps: 0%| | 123/1000000 [00:34<67:51:03, 4.09it/s, grad_norm=1.51, loss_final=5.01, loss_mean=1.12, loss_mean_cls=3.95, proj_loss=-0.0588][2026-03-23 13:56:46] Step: 123, Training Logs: loss_final: 4.172338, loss_mean: 1.115492, proj_loss: -0.056144, loss_mean_cls: 3.112991, grad_norm: 1.528705 +Steps: 0%| | 124/1000000 [00:34<67:50:03, 4.09it/s, grad_norm=1.53, loss_final=4.17, loss_mean=1.12, loss_mean_cls=3.11, proj_loss=-0.0561][2026-03-23 13:56:46] Step: 124, Training Logs: loss_final: 4.010628, loss_mean: 1.117229, proj_loss: -0.054621, loss_mean_cls: 2.948020, grad_norm: 1.329769 +Steps: 0%| | 125/1000000 [00:34<67:48:41, 4.10it/s, grad_norm=1.33, loss_final=4.01, loss_mean=1.12, loss_mean_cls=2.95, proj_loss=-0.0546][2026-03-23 13:56:46] Step: 125, Training Logs: loss_final: 3.764182, loss_mean: 1.127925, proj_loss: -0.057976, loss_mean_cls: 2.694233, grad_norm: 1.674507 +Steps: 0%| | 126/1000000 [00:35<67:47:01, 4.10it/s, grad_norm=1.67, loss_final=3.76, loss_mean=1.13, loss_mean_cls=2.69, proj_loss=-0.058][2026-03-23 13:56:46] Step: 126, Training Logs: loss_final: 4.371668, loss_mean: 1.097472, proj_loss: -0.055551, loss_mean_cls: 3.329747, grad_norm: 1.962917 +Steps: 0%| | 127/1000000 [00:35<67:47:01, 4.10it/s, grad_norm=1.96, loss_final=4.37, loss_mean=1.1, loss_mean_cls=3.33, proj_loss=-0.0556][2026-03-23 13:56:47] Step: 127, Training Logs: 
loss_final: 4.417885, loss_mean: 1.097842, proj_loss: -0.057305, loss_mean_cls: 3.377348, grad_norm: 1.860904 +Steps: 0%| | 128/1000000 [00:35<67:46:39, 4.10it/s, grad_norm=1.86, loss_final=4.42, loss_mean=1.1, loss_mean_cls=3.38, proj_loss=-0.0573][2026-03-23 13:56:47] Step: 128, Training Logs: loss_final: 4.017467, loss_mean: 1.103394, proj_loss: -0.057005, loss_mean_cls: 2.971077, grad_norm: 1.122871 +Steps: 0%| | 129/1000000 [00:35<67:55:50, 4.09it/s, grad_norm=1.12, loss_final=4.02, loss_mean=1.1, loss_mean_cls=2.97, proj_loss=-0.057][2026-03-23 13:56:47] Step: 129, Training Logs: loss_final: 4.208836, loss_mean: 1.102054, proj_loss: -0.056320, loss_mean_cls: 3.163102, grad_norm: 1.851275 +Steps: 0%| | 130/1000000 [00:36<67:52:57, 4.09it/s, grad_norm=1.85, loss_final=4.21, loss_mean=1.1, loss_mean_cls=3.16, proj_loss=-0.0563][2026-03-23 13:56:47] Step: 130, Training Logs: loss_final: 4.207572, loss_mean: 1.127353, proj_loss: -0.057998, loss_mean_cls: 3.138218, grad_norm: 1.432804 +Steps: 0%| | 131/1000000 [00:36<67:51:54, 4.09it/s, grad_norm=1.43, loss_final=4.21, loss_mean=1.13, loss_mean_cls=3.14, proj_loss=-0.058][2026-03-23 13:56:48] Step: 131, Training Logs: loss_final: 4.999227, loss_mean: 1.065102, proj_loss: -0.057811, loss_mean_cls: 3.991935, grad_norm: 1.260865 +Steps: 0%| | 132/1000000 [00:36<67:50:15, 4.09it/s, grad_norm=1.26, loss_final=5, loss_mean=1.07, loss_mean_cls=3.99, proj_loss=-0.0578][2026-03-23 13:56:48] Step: 132, Training Logs: loss_final: 4.958624, loss_mean: 1.080354, proj_loss: -0.055375, loss_mean_cls: 3.933645, grad_norm: 1.087305 +Steps: 0%| | 133/1000000 [00:36<67:49:24, 4.10it/s, grad_norm=1.09, loss_final=4.96, loss_mean=1.08, loss_mean_cls=3.93, proj_loss=-0.0554][2026-03-23 13:56:48] Step: 133, Training Logs: loss_final: 4.336567, loss_mean: 1.074137, proj_loss: -0.057906, loss_mean_cls: 3.320336, grad_norm: 0.845532 +Steps: 0%| | 134/1000000 [00:36<67:49:27, 4.09it/s, grad_norm=0.846, loss_final=4.34, loss_mean=1.07, loss_mean_cls=3.32, proj_loss=-0.0579][2026-03-23 13:56:48] Step: 134, Training Logs: loss_final: 4.113417, loss_mean: 1.101781, proj_loss: -0.055828, loss_mean_cls: 3.067464, grad_norm: 0.853130 +Steps: 0%| | 135/1000000 [00:37<67:48:26, 4.10it/s, grad_norm=0.853, loss_final=4.11, loss_mean=1.1, loss_mean_cls=3.07, proj_loss=-0.0558][2026-03-23 13:56:48] Step: 135, Training Logs: loss_final: 4.451187, loss_mean: 1.071875, proj_loss: -0.057522, loss_mean_cls: 3.436834, grad_norm: 1.211643 +Steps: 0%| | 136/1000000 [00:37<67:48:00, 4.10it/s, grad_norm=1.21, loss_final=4.45, loss_mean=1.07, loss_mean_cls=3.44, proj_loss=-0.0575][2026-03-23 13:56:49] Step: 136, Training Logs: loss_final: 3.470972, loss_mean: 1.105708, proj_loss: -0.057636, loss_mean_cls: 2.422900, grad_norm: 1.028398 +Steps: 0%| | 137/1000000 [00:37<67:48:46, 4.10it/s, grad_norm=1.03, loss_final=3.47, loss_mean=1.11, loss_mean_cls=2.42, proj_loss=-0.0576][2026-03-23 13:56:49] Step: 137, Training Logs: loss_final: 4.167720, loss_mean: 1.116012, proj_loss: -0.056297, loss_mean_cls: 3.108005, grad_norm: 1.331365 +Steps: 0%| | 138/1000000 [00:37<67:49:54, 4.09it/s, grad_norm=1.33, loss_final=4.17, loss_mean=1.12, loss_mean_cls=3.11, proj_loss=-0.0563][2026-03-23 13:56:49] Step: 138, Training Logs: loss_final: 3.969088, loss_mean: 1.080545, proj_loss: -0.058032, loss_mean_cls: 2.946575, grad_norm: 1.159432 +Steps: 0%| | 139/1000000 [00:38<67:49:15, 4.10it/s, grad_norm=1.16, loss_final=3.97, loss_mean=1.08, loss_mean_cls=2.95, proj_loss=-0.058][2026-03-23 13:56:49] Step: 139, 
Training Logs: loss_final: 4.553603, loss_mean: 1.100756, proj_loss: -0.057699, loss_mean_cls: 3.510547, grad_norm: 1.379237 +Steps: 0%| | 140/1000000 [00:38<67:48:29, 4.10it/s, grad_norm=1.38, loss_final=4.55, loss_mean=1.1, loss_mean_cls=3.51, proj_loss=-0.0577][2026-03-23 13:56:50] Step: 140, Training Logs: loss_final: 3.823996, loss_mean: 1.091382, proj_loss: -0.055826, loss_mean_cls: 2.788440, grad_norm: 1.269147 +Steps: 0%| | 141/1000000 [00:38<67:49:20, 4.10it/s, grad_norm=1.27, loss_final=3.82, loss_mean=1.09, loss_mean_cls=2.79, proj_loss=-0.0558][2026-03-23 13:56:50] Step: 141, Training Logs: loss_final: 4.244388, loss_mean: 1.086618, proj_loss: -0.060178, loss_mean_cls: 3.217947, grad_norm: 1.224737 +Steps: 0%| | 142/1000000 [00:38<67:48:47, 4.10it/s, grad_norm=1.22, loss_final=4.24, loss_mean=1.09, loss_mean_cls=3.22, proj_loss=-0.0602][2026-03-23 13:56:50] Step: 142, Training Logs: loss_final: 4.555122, loss_mean: 1.074355, proj_loss: -0.056447, loss_mean_cls: 3.537215, grad_norm: 1.288044 +Steps: 0%| | 143/1000000 [00:39<67:49:15, 4.10it/s, grad_norm=1.29, loss_final=4.56, loss_mean=1.07, loss_mean_cls=3.54, proj_loss=-0.0564][2026-03-23 13:56:50] Step: 143, Training Logs: loss_final: 4.631051, loss_mean: 1.031162, proj_loss: -0.056307, loss_mean_cls: 3.656196, grad_norm: 0.875154 +Steps: 0%| | 144/1000000 [00:39<67:47:56, 4.10it/s, grad_norm=0.875, loss_final=4.63, loss_mean=1.03, loss_mean_cls=3.66, proj_loss=-0.0563][2026-03-23 13:56:51] Step: 144, Training Logs: loss_final: 4.039268, loss_mean: 1.089356, proj_loss: -0.056316, loss_mean_cls: 3.006228, grad_norm: 1.155283 +Steps: 0%| | 145/1000000 [00:39<67:55:59, 4.09it/s, grad_norm=1.16, loss_final=4.04, loss_mean=1.09, loss_mean_cls=3.01, proj_loss=-0.0563][2026-03-23 13:56:51] Step: 145, Training Logs: loss_final: 3.760608, loss_mean: 1.070431, proj_loss: -0.059299, loss_mean_cls: 2.749476, grad_norm: 1.032837 +Steps: 0%| | 146/1000000 [00:39<67:53:40, 4.09it/s, grad_norm=1.03, loss_final=3.76, loss_mean=1.07, loss_mean_cls=2.75, proj_loss=-0.0593][2026-03-23 13:56:51] Step: 146, Training Logs: loss_final: 4.181468, loss_mean: 1.069318, proj_loss: -0.057231, loss_mean_cls: 3.169381, grad_norm: 1.739599 +Steps: 0%| | 147/1000000 [00:40<87:04:35, 3.19it/s, grad_norm=1.74, loss_final=4.18, loss_mean=1.07, loss_mean_cls=3.17, proj_loss=-0.0572][2026-03-23 13:56:52] Step: 147, Training Logs: loss_final: 4.397898, loss_mean: 1.077711, proj_loss: -0.057317, loss_mean_cls: 3.377505, grad_norm: 1.324051 +Steps: 0%| | 148/1000000 [00:40<81:18:29, 3.42it/s, grad_norm=1.32, loss_final=4.4, loss_mean=1.08, loss_mean_cls=3.38, proj_loss=-0.0573][2026-03-23 13:56:52] Step: 148, Training Logs: loss_final: 3.678877, loss_mean: 1.095331, proj_loss: -0.056643, loss_mean_cls: 2.640189, grad_norm: 0.957576 +Steps: 0%| | 149/1000000 [00:40<77:21:07, 3.59it/s, grad_norm=0.958, loss_final=3.68, loss_mean=1.1, loss_mean_cls=2.64, proj_loss=-0.0566][2026-03-23 13:56:52] Step: 149, Training Logs: loss_final: 3.970488, loss_mean: 1.096237, proj_loss: -0.057982, loss_mean_cls: 2.932232, grad_norm: 0.751983 +Steps: 0%| | 150/1000000 [00:41<74:28:13, 3.73it/s, grad_norm=0.752, loss_final=3.97, loss_mean=1.1, loss_mean_cls=2.93, proj_loss=-0.058][2026-03-23 13:56:52] Step: 150, Training Logs: loss_final: 3.589296, loss_mean: 1.085876, proj_loss: -0.059221, loss_mean_cls: 2.562641, grad_norm: 1.001571 +Steps: 0%| | 151/1000000 [00:41<72:29:17, 3.83it/s, grad_norm=1, loss_final=3.59, loss_mean=1.09, loss_mean_cls=2.56, proj_loss=-0.0592][2026-03-23 
13:56:53] Step: 151, Training Logs: loss_final: 3.809782, loss_mean: 1.041376, proj_loss: -0.057593, loss_mean_cls: 2.825998, grad_norm: 0.772958 +Steps: 0%| | 152/1000000 [00:41<71:06:45, 3.91it/s, grad_norm=0.773, loss_final=3.81, loss_mean=1.04, loss_mean_cls=2.83, proj_loss=-0.0576][2026-03-23 13:56:53] Step: 152, Training Logs: loss_final: 3.756772, loss_mean: 1.077754, proj_loss: -0.055612, loss_mean_cls: 2.734629, grad_norm: 0.942414 +Steps: 0%| | 153/1000000 [00:41<70:15:13, 3.95it/s, grad_norm=0.942, loss_final=3.76, loss_mean=1.08, loss_mean_cls=2.73, proj_loss=-0.0556][2026-03-23 13:56:53] Step: 153, Training Logs: loss_final: 3.815321, loss_mean: 1.091242, proj_loss: -0.055186, loss_mean_cls: 2.779264, grad_norm: 1.427716 +Steps: 0%| | 154/1000000 [00:42<69:30:56, 4.00it/s, grad_norm=1.43, loss_final=3.82, loss_mean=1.09, loss_mean_cls=2.78, proj_loss=-0.0552][2026-03-23 13:56:53] Step: 154, Training Logs: loss_final: 4.484177, loss_mean: 1.070384, proj_loss: -0.056569, loss_mean_cls: 3.470362, grad_norm: 0.890748 +Steps: 0%| | 155/1000000 [00:42<68:58:59, 4.03it/s, grad_norm=0.891, loss_final=4.48, loss_mean=1.07, loss_mean_cls=3.47, proj_loss=-0.0566][2026-03-23 13:56:54] Step: 155, Training Logs: loss_final: 3.992099, loss_mean: 1.094668, proj_loss: -0.056406, loss_mean_cls: 2.953837, grad_norm: 1.103738 +Steps: 0%| | 156/1000000 [00:42<68:38:04, 4.05it/s, grad_norm=1.1, loss_final=3.99, loss_mean=1.09, loss_mean_cls=2.95, proj_loss=-0.0564][2026-03-23 13:56:54] Step: 156, Training Logs: loss_final: 4.835541, loss_mean: 1.066255, proj_loss: -0.057052, loss_mean_cls: 3.826337, grad_norm: 1.572935 +Steps: 0%| | 157/1000000 [00:42<68:32:39, 4.05it/s, grad_norm=1.57, loss_final=4.84, loss_mean=1.07, loss_mean_cls=3.83, proj_loss=-0.0571][2026-03-23 13:56:54] Step: 157, Training Logs: loss_final: 4.422526, loss_mean: 1.037669, proj_loss: -0.057450, loss_mean_cls: 3.442307, grad_norm: 0.881763 +Steps: 0%| | 158/1000000 [00:43<68:18:41, 4.07it/s, grad_norm=0.882, loss_final=4.42, loss_mean=1.04, loss_mean_cls=3.44, proj_loss=-0.0574][2026-03-23 13:56:54] Step: 158, Training Logs: loss_final: 3.933815, loss_mean: 1.094053, proj_loss: -0.058011, loss_mean_cls: 2.897773, grad_norm: 1.524001 +Steps: 0%| | 159/1000000 [00:43<68:08:39, 4.08it/s, grad_norm=1.52, loss_final=3.93, loss_mean=1.09, loss_mean_cls=2.9, proj_loss=-0.058][2026-03-23 13:56:55] Step: 159, Training Logs: loss_final: 3.294493, loss_mean: 1.101539, proj_loss: -0.057312, loss_mean_cls: 2.250265, grad_norm: 0.862417 +Steps: 0%| | 160/1000000 [00:43<68:00:56, 4.08it/s, grad_norm=0.862, loss_final=3.29, loss_mean=1.1, loss_mean_cls=2.25, proj_loss=-0.0573][2026-03-23 13:56:55] Step: 160, Training Logs: loss_final: 4.003653, loss_mean: 1.049939, proj_loss: -0.056768, loss_mean_cls: 3.010482, grad_norm: 1.406133 +Steps: 0%| | 161/1000000 [00:43<68:05:23, 4.08it/s, grad_norm=1.41, loss_final=4, loss_mean=1.05, loss_mean_cls=3.01, proj_loss=-0.0568][2026-03-23 13:56:55] Step: 161, Training Logs: loss_final: 3.735500, loss_mean: 1.059312, proj_loss: -0.055975, loss_mean_cls: 2.732163, grad_norm: 1.257030 +Steps: 0%| | 162/1000000 [00:44<67:59:39, 4.08it/s, grad_norm=1.26, loss_final=3.74, loss_mean=1.06, loss_mean_cls=2.73, proj_loss=-0.056][2026-03-23 13:56:55] Step: 162, Training Logs: loss_final: 4.134730, loss_mean: 1.051557, proj_loss: -0.055925, loss_mean_cls: 3.139099, grad_norm: 1.383831 +Steps: 0%| | 163/1000000 [00:44<67:54:57, 4.09it/s, grad_norm=1.38, loss_final=4.13, loss_mean=1.05, loss_mean_cls=3.14, 
proj_loss=-0.0559][2026-03-23 13:56:56] Step: 163, Training Logs: loss_final: 3.976990, loss_mean: 1.080228, proj_loss: -0.055570, loss_mean_cls: 2.952332, grad_norm: 2.321836 +Steps: 0%| | 164/1000000 [00:44<67:52:28, 4.09it/s, grad_norm=2.32, loss_final=3.98, loss_mean=1.08, loss_mean_cls=2.95, proj_loss=-0.0556][2026-03-23 13:56:56] Step: 164, Training Logs: loss_final: 4.920438, loss_mean: 1.053734, proj_loss: -0.057831, loss_mean_cls: 3.924535, grad_norm: 1.427572 +Steps: 0%| | 165/1000000 [00:44<67:51:20, 4.09it/s, grad_norm=1.43, loss_final=4.92, loss_mean=1.05, loss_mean_cls=3.92, proj_loss=-0.0578][2026-03-23 13:56:56] Step: 165, Training Logs: loss_final: 4.075354, loss_mean: 1.102409, proj_loss: -0.055738, loss_mean_cls: 3.028682, grad_norm: 1.554521 +Steps: 0%| | 166/1000000 [00:45<67:50:22, 4.09it/s, grad_norm=1.55, loss_final=4.08, loss_mean=1.1, loss_mean_cls=3.03, proj_loss=-0.0557][2026-03-23 13:56:56] Step: 166, Training Logs: loss_final: 3.970260, loss_mean: 1.052696, proj_loss: -0.054281, loss_mean_cls: 2.971844, grad_norm: 1.398542 +Steps: 0%| | 167/1000000 [00:45<67:49:03, 4.10it/s, grad_norm=1.4, loss_final=3.97, loss_mean=1.05, loss_mean_cls=2.97, proj_loss=-0.0543][2026-03-23 13:56:57] Step: 167, Training Logs: loss_final: 4.506516, loss_mean: 1.058694, proj_loss: -0.055691, loss_mean_cls: 3.503513, grad_norm: 1.465548 +Steps: 0%| | 168/1000000 [00:45<67:48:15, 4.10it/s, grad_norm=1.47, loss_final=4.51, loss_mean=1.06, loss_mean_cls=3.5, proj_loss=-0.0557][2026-03-23 13:56:57] Step: 168, Training Logs: loss_final: 3.774779, loss_mean: 1.070169, proj_loss: -0.056798, loss_mean_cls: 2.761408, grad_norm: 1.063631 +Steps: 0%| | 169/1000000 [00:45<67:55:41, 4.09it/s, grad_norm=1.06, loss_final=3.77, loss_mean=1.07, loss_mean_cls=2.76, proj_loss=-0.0568][2026-03-23 13:56:57] Step: 169, Training Logs: loss_final: 3.799341, loss_mean: 1.072764, proj_loss: -0.057054, loss_mean_cls: 2.783631, grad_norm: 1.432508 +Steps: 0%| | 170/1000000 [00:46<67:53:00, 4.09it/s, grad_norm=1.43, loss_final=3.8, loss_mean=1.07, loss_mean_cls=2.78, proj_loss=-0.0571][2026-03-23 13:56:57] Step: 170, Training Logs: loss_final: 3.950215, loss_mean: 1.050979, proj_loss: -0.058352, loss_mean_cls: 2.957588, grad_norm: 1.403183 +Steps: 0%| | 171/1000000 [00:46<67:51:12, 4.09it/s, grad_norm=1.4, loss_final=3.95, loss_mean=1.05, loss_mean_cls=2.96, proj_loss=-0.0584][2026-03-23 13:56:58] Step: 171, Training Logs: loss_final: 4.198855, loss_mean: 1.049812, proj_loss: -0.056677, loss_mean_cls: 3.205721, grad_norm: 0.921758 +Steps: 0%| | 172/1000000 [00:46<67:49:22, 4.09it/s, grad_norm=0.922, loss_final=4.2, loss_mean=1.05, loss_mean_cls=3.21, proj_loss=-0.0567][2026-03-23 13:56:58] Step: 172, Training Logs: loss_final: 3.441348, loss_mean: 1.076280, proj_loss: -0.057867, loss_mean_cls: 2.422936, grad_norm: 0.854744 +Steps: 0%| | 173/1000000 [00:46<67:50:30, 4.09it/s, grad_norm=0.855, loss_final=3.44, loss_mean=1.08, loss_mean_cls=2.42, proj_loss=-0.0579][2026-03-23 13:56:58] Step: 173, Training Logs: loss_final: 3.798677, loss_mean: 1.085783, proj_loss: -0.057882, loss_mean_cls: 2.770777, grad_norm: 1.453923 +Steps: 0%| | 174/1000000 [00:46<67:50:14, 4.09it/s, grad_norm=1.45, loss_final=3.8, loss_mean=1.09, loss_mean_cls=2.77, proj_loss=-0.0579][2026-03-23 13:56:58] Step: 174, Training Logs: loss_final: 3.112827, loss_mean: 1.064432, proj_loss: -0.058472, loss_mean_cls: 2.106868, grad_norm: 1.250231 +Steps: 0%| | 175/1000000 [00:47<67:49:59, 4.09it/s, grad_norm=1.25, loss_final=3.11, loss_mean=1.06, 
loss_mean_cls=2.11, proj_loss=-0.0585][2026-03-23 13:56:58] Step: 175, Training Logs: loss_final: 3.708531, loss_mean: 1.080565, proj_loss: -0.055104, loss_mean_cls: 2.683070, grad_norm: 1.818153 +Steps: 0%| | 176/1000000 [00:47<67:49:26, 4.09it/s, grad_norm=1.82, loss_final=3.71, loss_mean=1.08, loss_mean_cls=2.68, proj_loss=-0.0551][2026-03-23 13:56:59] Step: 176, Training Logs: loss_final: 4.057882, loss_mean: 1.041325, proj_loss: -0.057044, loss_mean_cls: 3.073601, grad_norm: 1.056386 +Steps: 0%| | 177/1000000 [00:47<67:50:14, 4.09it/s, grad_norm=1.06, loss_final=4.06, loss_mean=1.04, loss_mean_cls=3.07, proj_loss=-0.057][2026-03-23 13:56:59] Step: 177, Training Logs: loss_final: 3.843677, loss_mean: 1.082743, proj_loss: -0.056343, loss_mean_cls: 2.817278, grad_norm: 1.454900 +Steps: 0%| | 178/1000000 [00:47<67:50:45, 4.09it/s, grad_norm=1.45, loss_final=3.84, loss_mean=1.08, loss_mean_cls=2.82, proj_loss=-0.0563][2026-03-23 13:56:59] Step: 178, Training Logs: loss_final: 4.592722, loss_mean: 1.051988, proj_loss: -0.056379, loss_mean_cls: 3.597114, grad_norm: 1.553295 +Steps: 0%| | 179/1000000 [00:48<67:51:26, 4.09it/s, grad_norm=1.55, loss_final=4.59, loss_mean=1.05, loss_mean_cls=3.6, proj_loss=-0.0564][2026-03-23 13:56:59] Step: 179, Training Logs: loss_final: 4.083874, loss_mean: 1.044562, proj_loss: -0.057760, loss_mean_cls: 3.097073, grad_norm: 1.207282 +Steps: 0%| | 180/1000000 [00:48<67:50:22, 4.09it/s, grad_norm=1.21, loss_final=4.08, loss_mean=1.04, loss_mean_cls=3.1, proj_loss=-0.0578][2026-03-23 13:57:00] Step: 180, Training Logs: loss_final: 3.533881, loss_mean: 1.084617, proj_loss: -0.055471, loss_mean_cls: 2.504735, grad_norm: 1.384633 +Steps: 0%| | 181/1000000 [00:48<67:49:31, 4.09it/s, grad_norm=1.38, loss_final=3.53, loss_mean=1.08, loss_mean_cls=2.5, proj_loss=-0.0555][2026-03-23 13:57:00] Step: 181, Training Logs: loss_final: 4.409495, loss_mean: 1.036472, proj_loss: -0.057444, loss_mean_cls: 3.430467, grad_norm: 1.297928 +Steps: 0%| | 182/1000000 [00:48<67:48:57, 4.10it/s, grad_norm=1.3, loss_final=4.41, loss_mean=1.04, loss_mean_cls=3.43, proj_loss=-0.0574][2026-03-23 13:57:00] Step: 182, Training Logs: loss_final: 3.176256, loss_mean: 1.091783, proj_loss: -0.057242, loss_mean_cls: 2.141715, grad_norm: 1.965931 +Steps: 0%| | 183/1000000 [00:49<67:49:28, 4.09it/s, grad_norm=1.97, loss_final=3.18, loss_mean=1.09, loss_mean_cls=2.14, proj_loss=-0.0572][2026-03-23 13:57:00] Step: 183, Training Logs: loss_final: 3.651900, loss_mean: 1.063701, proj_loss: -0.057272, loss_mean_cls: 2.645471, grad_norm: 1.260781 +Steps: 0%| | 184/1000000 [00:49<67:49:34, 4.09it/s, grad_norm=1.26, loss_final=3.65, loss_mean=1.06, loss_mean_cls=2.65, proj_loss=-0.0573][2026-03-23 13:57:01] Step: 184, Training Logs: loss_final: 4.134811, loss_mean: 1.052191, proj_loss: -0.056823, loss_mean_cls: 3.139443, grad_norm: 1.083165 +Steps: 0%| | 185/1000000 [00:49<67:54:27, 4.09it/s, grad_norm=1.08, loss_final=4.13, loss_mean=1.05, loss_mean_cls=3.14, proj_loss=-0.0568][2026-03-23 13:57:01] Step: 185, Training Logs: loss_final: 3.952762, loss_mean: 1.034234, proj_loss: -0.056918, loss_mean_cls: 2.975446, grad_norm: 1.370537 +Steps: 0%| | 186/1000000 [00:49<67:53:04, 4.09it/s, grad_norm=1.37, loss_final=3.95, loss_mean=1.03, loss_mean_cls=2.98, proj_loss=-0.0569][2026-03-23 13:57:01] Step: 186, Training Logs: loss_final: 4.377052, loss_mean: 1.068180, proj_loss: -0.055129, loss_mean_cls: 3.364000, grad_norm: 2.047776 +Steps: 0%| | 187/1000000 [00:50<67:51:19, 4.09it/s, grad_norm=2.05, loss_final=4.38, 
loss_mean=1.07, loss_mean_cls=3.36, proj_loss=-0.0551][2026-03-23 13:57:01] Step: 187, Training Logs: loss_final: 3.465180, loss_mean: 1.067420, proj_loss: -0.055022, loss_mean_cls: 2.452782, grad_norm: 1.094218 +Steps: 0%| | 188/1000000 [00:50<67:50:38, 4.09it/s, grad_norm=1.09, loss_final=3.47, loss_mean=1.07, loss_mean_cls=2.45, proj_loss=-0.055][2026-03-23 13:57:02] Step: 188, Training Logs: loss_final: 4.428637, loss_mean: 1.075879, proj_loss: -0.056132, loss_mean_cls: 3.408890, grad_norm: 2.243081 +Steps: 0%| | 189/1000000 [00:50<67:50:39, 4.09it/s, grad_norm=2.24, loss_final=4.43, loss_mean=1.08, loss_mean_cls=3.41, proj_loss=-0.0561][2026-03-23 13:57:02] Step: 189, Training Logs: loss_final: 3.874612, loss_mean: 1.059214, proj_loss: -0.055212, loss_mean_cls: 2.870610, grad_norm: 1.509933 +Steps: 0%| | 190/1000000 [00:50<67:53:49, 4.09it/s, grad_norm=1.51, loss_final=3.87, loss_mean=1.06, loss_mean_cls=2.87, proj_loss=-0.0552][2026-03-23 13:57:02] Step: 190, Training Logs: loss_final: 3.696738, loss_mean: 1.066510, proj_loss: -0.058307, loss_mean_cls: 2.688535, grad_norm: 1.849224 +Steps: 0%| | 191/1000000 [00:51<67:54:49, 4.09it/s, grad_norm=1.85, loss_final=3.7, loss_mean=1.07, loss_mean_cls=2.69, proj_loss=-0.0583][2026-03-23 13:57:02] Step: 191, Training Logs: loss_final: 4.354342, loss_mean: 1.020227, proj_loss: -0.055745, loss_mean_cls: 3.389860, grad_norm: 1.847939 +Steps: 0%| | 192/1000000 [00:51<67:54:25, 4.09it/s, grad_norm=1.85, loss_final=4.35, loss_mean=1.02, loss_mean_cls=3.39, proj_loss=-0.0557][2026-03-23 13:57:03] Step: 192, Training Logs: loss_final: 4.409212, loss_mean: 1.048026, proj_loss: -0.056594, loss_mean_cls: 3.417780, grad_norm: 1.524102 +Steps: 0%| | 193/1000000 [00:51<67:53:07, 4.09it/s, grad_norm=1.52, loss_final=4.41, loss_mean=1.05, loss_mean_cls=3.42, proj_loss=-0.0566][2026-03-23 13:57:03] Step: 193, Training Logs: loss_final: 3.909914, loss_mean: 1.034270, proj_loss: -0.058143, loss_mean_cls: 2.933788, grad_norm: 1.197792 +Steps: 0%| | 194/1000000 [00:51<67:53:28, 4.09it/s, grad_norm=1.2, loss_final=3.91, loss_mean=1.03, loss_mean_cls=2.93, proj_loss=-0.0581][2026-03-23 13:57:03] Step: 194, Training Logs: loss_final: 3.518339, loss_mean: 1.061185, proj_loss: -0.053464, loss_mean_cls: 2.510617, grad_norm: 1.685875 +Steps: 0%| | 195/1000000 [00:52<67:52:30, 4.09it/s, grad_norm=1.69, loss_final=3.52, loss_mean=1.06, loss_mean_cls=2.51, proj_loss=-0.0535][2026-03-23 13:57:03] Step: 195, Training Logs: loss_final: 4.144682, loss_mean: 1.069985, proj_loss: -0.056024, loss_mean_cls: 3.130722, grad_norm: 1.617648 +Steps: 0%| | 196/1000000 [00:52<67:53:25, 4.09it/s, grad_norm=1.62, loss_final=4.14, loss_mean=1.07, loss_mean_cls=3.13, proj_loss=-0.056][2026-03-23 13:57:04] Step: 196, Training Logs: loss_final: 4.209203, loss_mean: 1.043093, proj_loss: -0.056968, loss_mean_cls: 3.223077, grad_norm: 1.561610 +Steps: 0%| | 197/1000000 [00:52<67:51:41, 4.09it/s, grad_norm=1.56, loss_final=4.21, loss_mean=1.04, loss_mean_cls=3.22, proj_loss=-0.057][2026-03-23 13:57:04] Step: 197, Training Logs: loss_final: 3.986165, loss_mean: 1.048607, proj_loss: -0.058094, loss_mean_cls: 2.995652, grad_norm: 1.465502 +Steps: 0%| | 198/1000000 [00:52<67:51:10, 4.09it/s, grad_norm=1.47, loss_final=3.99, loss_mean=1.05, loss_mean_cls=3, proj_loss=-0.0581][2026-03-23 13:57:04] Step: 198, Training Logs: loss_final: 4.109353, loss_mean: 1.057425, proj_loss: -0.057203, loss_mean_cls: 3.109131, grad_norm: 1.602210 +Steps: 0%| | 199/1000000 [00:53<67:51:00, 4.09it/s, grad_norm=1.6, 
loss_final=4.11, loss_mean=1.06, loss_mean_cls=3.11, proj_loss=-0.0572][2026-03-23 13:57:04] Step: 199, Training Logs: loss_final: 3.827809, loss_mean: 1.060561, proj_loss: -0.056529, loss_mean_cls: 2.823776, grad_norm: 1.281930 +Steps: 0%| | 200/1000000 [00:53<67:52:45, 4.09it/s, grad_norm=1.28, loss_final=3.83, loss_mean=1.06, loss_mean_cls=2.82, proj_loss=-0.0565][2026-03-23 13:57:05] Step: 200, Training Logs: loss_final: 4.859810, loss_mean: 1.018786, proj_loss: -0.056772, loss_mean_cls: 3.897796, grad_norm: 1.346983 +Steps: 0%| | 201/1000000 [00:53<67:50:08, 4.09it/s, grad_norm=1.35, loss_final=4.86, loss_mean=1.02, loss_mean_cls=3.9, proj_loss=-0.0568][2026-03-23 13:57:05] Step: 201, Training Logs: loss_final: 3.945022, loss_mean: 1.039917, proj_loss: -0.058203, loss_mean_cls: 2.963308, grad_norm: 1.138654 +Steps: 0%| | 202/1000000 [00:53<67:49:17, 4.09it/s, grad_norm=1.14, loss_final=3.95, loss_mean=1.04, loss_mean_cls=2.96, proj_loss=-0.0582][2026-03-23 13:57:05] Step: 202, Training Logs: loss_final: 3.795771, loss_mean: 1.045783, proj_loss: -0.058287, loss_mean_cls: 2.808275, grad_norm: 1.378415 +Steps: 0%| | 203/1000000 [00:54<67:47:45, 4.10it/s, grad_norm=1.38, loss_final=3.8, loss_mean=1.05, loss_mean_cls=2.81, proj_loss=-0.0583][2026-03-23 13:57:05] Step: 203, Training Logs: loss_final: 3.873578, loss_mean: 1.057585, proj_loss: -0.057143, loss_mean_cls: 2.873136, grad_norm: 1.039413 +Steps: 0%| | 204/1000000 [00:54<67:47:23, 4.10it/s, grad_norm=1.04, loss_final=3.87, loss_mean=1.06, loss_mean_cls=2.87, proj_loss=-0.0571][2026-03-23 13:57:06] Step: 204, Training Logs: loss_final: 3.869634, loss_mean: 1.025371, proj_loss: -0.059635, loss_mean_cls: 2.903898, grad_norm: 1.113446 +Steps: 0%| | 205/1000000 [00:54<67:46:28, 4.10it/s, grad_norm=1.11, loss_final=3.87, loss_mean=1.03, loss_mean_cls=2.9, proj_loss=-0.0596][2026-03-23 13:57:06] Step: 205, Training Logs: loss_final: 4.516129, loss_mean: 1.026917, proj_loss: -0.059113, loss_mean_cls: 3.548324, grad_norm: 1.246919 +Steps: 0%| | 206/1000000 [00:54<67:46:58, 4.10it/s, grad_norm=1.25, loss_final=4.52, loss_mean=1.03, loss_mean_cls=3.55, proj_loss=-0.0591][2026-03-23 13:57:06] Step: 206, Training Logs: loss_final: 3.870474, loss_mean: 1.033348, proj_loss: -0.057558, loss_mean_cls: 2.894683, grad_norm: 1.104687 +Steps: 0%| | 207/1000000 [00:55<67:49:28, 4.09it/s, grad_norm=1.1, loss_final=3.87, loss_mean=1.03, loss_mean_cls=2.89, proj_loss=-0.0576][2026-03-23 13:57:06] Step: 207, Training Logs: loss_final: 3.657545, loss_mean: 1.036661, proj_loss: -0.054751, loss_mean_cls: 2.675634, grad_norm: 1.384344 +Steps: 0%| | 208/1000000 [00:55<67:48:34, 4.10it/s, grad_norm=1.38, loss_final=3.66, loss_mean=1.04, loss_mean_cls=2.68, proj_loss=-0.0548][2026-03-23 13:57:07] Step: 208, Training Logs: loss_final: 3.776420, loss_mean: 1.025812, proj_loss: -0.057821, loss_mean_cls: 2.808429, grad_norm: 1.096370 +Steps: 0%| | 209/1000000 [00:55<67:48:19, 4.10it/s, grad_norm=1.1, loss_final=3.78, loss_mean=1.03, loss_mean_cls=2.81, proj_loss=-0.0578][2026-03-23 13:57:07] Step: 209, Training Logs: loss_final: 3.761090, loss_mean: 1.062359, proj_loss: -0.056922, loss_mean_cls: 2.755653, grad_norm: 1.903223 +Steps: 0%| | 210/1000000 [00:55<67:48:31, 4.10it/s, grad_norm=1.9, loss_final=3.76, loss_mean=1.06, loss_mean_cls=2.76, proj_loss=-0.0569][2026-03-23 13:57:07] Step: 210, Training Logs: loss_final: 3.906418, loss_mean: 1.041829, proj_loss: -0.057181, loss_mean_cls: 2.921770, grad_norm: 1.735096 +Steps: 0%| | 211/1000000 [00:56<67:48:32, 4.10it/s, 
grad_norm=1.74, loss_final=3.91, loss_mean=1.04, loss_mean_cls=2.92, proj_loss=-0.0572][2026-03-23 13:57:07] Step: 211, Training Logs: loss_final: 3.858996, loss_mean: 1.023147, proj_loss: -0.059874, loss_mean_cls: 2.895722, grad_norm: 1.423884 +Steps: 0%| | 212/1000000 [00:56<67:46:51, 4.10it/s, grad_norm=1.42, loss_final=3.86, loss_mean=1.02, loss_mean_cls=2.9, proj_loss=-0.0599][2026-03-23 13:57:08] Step: 212, Training Logs: loss_final: 4.049764, loss_mean: 1.020845, proj_loss: -0.057857, loss_mean_cls: 3.086776, grad_norm: 1.539841 +Steps: 0%| | 213/1000000 [00:56<67:48:03, 4.10it/s, grad_norm=1.54, loss_final=4.05, loss_mean=1.02, loss_mean_cls=3.09, proj_loss=-0.0579][2026-03-23 13:57:08] Step: 213, Training Logs: loss_final: 3.666535, loss_mean: 1.064638, proj_loss: -0.058414, loss_mean_cls: 2.660310, grad_norm: 1.364981 +Steps: 0%| | 214/1000000 [00:56<67:48:11, 4.10it/s, grad_norm=1.36, loss_final=3.67, loss_mean=1.06, loss_mean_cls=2.66, proj_loss=-0.0584][2026-03-23 13:57:08] Step: 214, Training Logs: loss_final: 4.115675, loss_mean: 1.022230, proj_loss: -0.055882, loss_mean_cls: 3.149326, grad_norm: 1.587428 +Steps: 0%| | 215/1000000 [00:57<67:46:32, 4.10it/s, grad_norm=1.59, loss_final=4.12, loss_mean=1.02, loss_mean_cls=3.15, proj_loss=-0.0559][2026-03-23 13:57:08] Step: 215, Training Logs: loss_final: 4.290797, loss_mean: 1.006664, proj_loss: -0.057540, loss_mean_cls: 3.341673, grad_norm: 1.364294 +Steps: 0%| | 216/1000000 [00:57<67:48:39, 4.10it/s, grad_norm=1.36, loss_final=4.29, loss_mean=1.01, loss_mean_cls=3.34, proj_loss=-0.0575][2026-03-23 13:57:09] Step: 216, Training Logs: loss_final: 4.299802, loss_mean: 1.033919, proj_loss: -0.057627, loss_mean_cls: 3.323509, grad_norm: 2.076446 +Steps: 0%| | 217/1000000 [00:57<67:48:50, 4.10it/s, grad_norm=2.08, loss_final=4.3, loss_mean=1.03, loss_mean_cls=3.32, proj_loss=-0.0576][2026-03-23 13:57:09] Step: 217, Training Logs: loss_final: 4.032390, loss_mean: 1.038509, proj_loss: -0.059780, loss_mean_cls: 3.053661, grad_norm: 2.095842 +Steps: 0%| | 218/1000000 [00:57<67:48:13, 4.10it/s, grad_norm=2.1, loss_final=4.03, loss_mean=1.04, loss_mean_cls=3.05, proj_loss=-0.0598][2026-03-23 13:57:09] Step: 218, Training Logs: loss_final: 4.852066, loss_mean: 0.991051, proj_loss: -0.055897, loss_mean_cls: 3.916911, grad_norm: 1.589446 +Steps: 0%| | 219/1000000 [00:57<67:54:34, 4.09it/s, grad_norm=1.59, loss_final=4.85, loss_mean=0.991, loss_mean_cls=3.92, proj_loss=-0.0559][2026-03-23 13:57:09] Step: 219, Training Logs: loss_final: 3.452114, loss_mean: 1.032779, proj_loss: -0.057448, loss_mean_cls: 2.476782, grad_norm: 2.759307 +Steps: 0%| | 220/1000000 [00:58<67:53:33, 4.09it/s, grad_norm=2.76, loss_final=3.45, loss_mean=1.03, loss_mean_cls=2.48, proj_loss=-0.0574][2026-03-23 13:57:09] Step: 220, Training Logs: loss_final: 4.474224, loss_mean: 1.012484, proj_loss: -0.056469, loss_mean_cls: 3.518209, grad_norm: 2.037073 +Steps: 0%| | 221/1000000 [00:58<67:58:08, 4.09it/s, grad_norm=2.04, loss_final=4.47, loss_mean=1.01, loss_mean_cls=3.52, proj_loss=-0.0565][2026-03-23 13:57:10] Step: 221, Training Logs: loss_final: 3.619599, loss_mean: 1.027089, proj_loss: -0.055216, loss_mean_cls: 2.647726, grad_norm: 1.876838 +Steps: 0%| | 222/1000000 [00:58<67:55:14, 4.09it/s, grad_norm=1.88, loss_final=3.62, loss_mean=1.03, loss_mean_cls=2.65, proj_loss=-0.0552][2026-03-23 13:57:10] Step: 222, Training Logs: loss_final: 3.956278, loss_mean: 0.994196, proj_loss: -0.058793, loss_mean_cls: 3.020875, grad_norm: 1.756858 +Steps: 0%| | 223/1000000 
[00:58<67:57:51, 4.09it/s, grad_norm=1.76, loss_final=3.96, loss_mean=0.994, loss_mean_cls=3.02, proj_loss=-0.0588][2026-03-23 13:57:10] Step: 223, Training Logs: loss_final: 3.396563, loss_mean: 1.048192, proj_loss: -0.056162, loss_mean_cls: 2.404532, grad_norm: 1.566004 +Steps: 0%| | 224/1000000 [00:59<67:54:18, 4.09it/s, grad_norm=1.57, loss_final=3.4, loss_mean=1.05, loss_mean_cls=2.4, proj_loss=-0.0562][2026-03-23 13:57:10] Step: 224, Training Logs: loss_final: 4.041858, loss_mean: 1.026601, proj_loss: -0.059143, loss_mean_cls: 3.074400, grad_norm: 1.464198 +Steps: 0%| | 225/1000000 [00:59<67:52:38, 4.09it/s, grad_norm=1.46, loss_final=4.04, loss_mean=1.03, loss_mean_cls=3.07, proj_loss=-0.0591][2026-03-23 13:57:11] Step: 225, Training Logs: loss_final: 3.787225, loss_mean: 1.008417, proj_loss: -0.057760, loss_mean_cls: 2.836567, grad_norm: 1.825020 +Steps: 0%| | 226/1000000 [00:59<67:52:45, 4.09it/s, grad_norm=1.83, loss_final=3.79, loss_mean=1.01, loss_mean_cls=2.84, proj_loss=-0.0578][2026-03-23 13:57:11] Step: 226, Training Logs: loss_final: 3.892933, loss_mean: 1.035562, proj_loss: -0.058638, loss_mean_cls: 2.916008, grad_norm: 1.450337 +Steps: 0%| | 227/1000000 [00:59<67:53:30, 4.09it/s, grad_norm=1.45, loss_final=3.89, loss_mean=1.04, loss_mean_cls=2.92, proj_loss=-0.0586][2026-03-23 13:57:11] Step: 227, Training Logs: loss_final: 4.605734, loss_mean: 1.015767, proj_loss: -0.058328, loss_mean_cls: 3.648294, grad_norm: 1.576961 +Steps: 0%| | 228/1000000 [01:00<67:52:30, 4.09it/s, grad_norm=1.58, loss_final=4.61, loss_mean=1.02, loss_mean_cls=3.65, proj_loss=-0.0583][2026-03-23 13:57:11] Step: 228, Training Logs: loss_final: 3.988146, loss_mean: 1.020183, proj_loss: -0.057191, loss_mean_cls: 3.025154, grad_norm: 1.494521 +Steps: 0%| | 229/1000000 [01:00<67:49:43, 4.09it/s, grad_norm=1.49, loss_final=3.99, loss_mean=1.02, loss_mean_cls=3.03, proj_loss=-0.0572][2026-03-23 13:57:12] Step: 229, Training Logs: loss_final: 4.138639, loss_mean: 1.001511, proj_loss: -0.058552, loss_mean_cls: 3.195680, grad_norm: 1.869578 +Steps: 0%| | 230/1000000 [01:00<67:48:23, 4.10it/s, grad_norm=1.87, loss_final=4.14, loss_mean=1, loss_mean_cls=3.2, proj_loss=-0.0586][2026-03-23 13:57:12] Step: 230, Training Logs: loss_final: 3.587549, loss_mean: 1.026086, proj_loss: -0.057275, loss_mean_cls: 2.618739, grad_norm: 1.361713 +Steps: 0%| | 231/1000000 [01:00<67:50:24, 4.09it/s, grad_norm=1.36, loss_final=3.59, loss_mean=1.03, loss_mean_cls=2.62, proj_loss=-0.0573][2026-03-23 13:57:12] Step: 231, Training Logs: loss_final: 4.299013, loss_mean: 1.019667, proj_loss: -0.057679, loss_mean_cls: 3.337025, grad_norm: 2.507548 +Steps: 0%| | 232/1000000 [01:01<67:49:48, 4.09it/s, grad_norm=2.51, loss_final=4.3, loss_mean=1.02, loss_mean_cls=3.34, proj_loss=-0.0577][2026-03-23 13:57:12] Step: 232, Training Logs: loss_final: 3.918818, loss_mean: 1.041992, proj_loss: -0.057748, loss_mean_cls: 2.934574, grad_norm: 1.768478 +Steps: 0%| | 233/1000000 [01:01<67:48:26, 4.10it/s, grad_norm=1.77, loss_final=3.92, loss_mean=1.04, loss_mean_cls=2.93, proj_loss=-0.0577][2026-03-23 13:57:13] Step: 233, Training Logs: loss_final: 4.135336, loss_mean: 1.012130, proj_loss: -0.055806, loss_mean_cls: 3.179013, grad_norm: 2.010638 +Steps: 0%| | 234/1000000 [01:01<67:48:16, 4.10it/s, grad_norm=2.01, loss_final=4.14, loss_mean=1.01, loss_mean_cls=3.18, proj_loss=-0.0558][2026-03-23 13:57:13] Step: 234, Training Logs: loss_final: 4.622794, loss_mean: 1.008856, proj_loss: -0.059102, loss_mean_cls: 3.673040, grad_norm: 1.824655 +Steps: 
0%| | 235/1000000 [01:01<67:48:42, 4.10it/s, grad_norm=1.82, loss_final=4.62, loss_mean=1.01, loss_mean_cls=3.67, proj_loss=-0.0591][2026-03-23 13:57:13] Step: 235, Training Logs: loss_final: 4.330180, loss_mean: 1.019101, proj_loss: -0.054455, loss_mean_cls: 3.365534, grad_norm: 1.865192 +Steps: 0%| | 236/1000000 [01:02<67:47:59, 4.10it/s, grad_norm=1.87, loss_final=4.33, loss_mean=1.02, loss_mean_cls=3.37, proj_loss=-0.0545][2026-03-23 13:57:13] Step: 236, Training Logs: loss_final: 4.471676, loss_mean: 1.005463, proj_loss: -0.057791, loss_mean_cls: 3.524004, grad_norm: 1.992857 +Steps: 0%| | 237/1000000 [01:02<67:48:33, 4.10it/s, grad_norm=1.99, loss_final=4.47, loss_mean=1.01, loss_mean_cls=3.52, proj_loss=-0.0578][2026-03-23 13:57:14] Step: 237, Training Logs: loss_final: 4.708742, loss_mean: 1.007163, proj_loss: -0.057294, loss_mean_cls: 3.758873, grad_norm: 1.696959 +Steps: 0%| | 238/1000000 [01:02<67:48:46, 4.10it/s, grad_norm=1.7, loss_final=4.71, loss_mean=1.01, loss_mean_cls=3.76, proj_loss=-0.0573][2026-03-23 13:57:14] Step: 238, Training Logs: loss_final: 4.700453, loss_mean: 1.011082, proj_loss: -0.057899, loss_mean_cls: 3.747270, grad_norm: 2.577658 +Steps: 0%| | 239/1000000 [01:02<67:48:18, 4.10it/s, grad_norm=2.58, loss_final=4.7, loss_mean=1.01, loss_mean_cls=3.75, proj_loss=-0.0579][2026-03-23 13:57:14] Step: 239, Training Logs: loss_final: 4.519000, loss_mean: 0.999633, proj_loss: -0.058458, loss_mean_cls: 3.577825, grad_norm: 2.147466 +Steps: 0%| | 240/1000000 [01:03<67:48:55, 4.10it/s, grad_norm=2.15, loss_final=4.52, loss_mean=1, loss_mean_cls=3.58, proj_loss=-0.0585][2026-03-23 13:57:14] Step: 240, Training Logs: loss_final: 3.874477, loss_mean: 1.005100, proj_loss: -0.059289, loss_mean_cls: 2.928666, grad_norm: 2.779819 +Steps: 0%| | 241/1000000 [01:03<67:50:12, 4.09it/s, grad_norm=2.78, loss_final=3.87, loss_mean=1.01, loss_mean_cls=2.93, proj_loss=-0.0593][2026-03-23 13:57:15] Step: 241, Training Logs: loss_final: 4.244301, loss_mean: 0.978814, proj_loss: -0.057319, loss_mean_cls: 3.322806, grad_norm: 1.934856 +Steps: 0%| | 242/1000000 [01:03<67:50:20, 4.09it/s, grad_norm=1.93, loss_final=4.24, loss_mean=0.979, loss_mean_cls=3.32, proj_loss=-0.0573][2026-03-23 13:57:15] Step: 242, Training Logs: loss_final: 4.099132, loss_mean: 1.017777, proj_loss: -0.056814, loss_mean_cls: 3.138168, grad_norm: 2.223382 +Steps: 0%| | 243/1000000 [01:03<67:49:22, 4.09it/s, grad_norm=2.22, loss_final=4.1, loss_mean=1.02, loss_mean_cls=3.14, proj_loss=-0.0568][2026-03-23 13:57:15] Step: 243, Training Logs: loss_final: 3.643645, loss_mean: 1.043657, proj_loss: -0.057582, loss_mean_cls: 2.657569, grad_norm: 1.876094 +Steps: 0%| | 244/1000000 [01:04<67:48:22, 4.10it/s, grad_norm=1.88, loss_final=3.64, loss_mean=1.04, loss_mean_cls=2.66, proj_loss=-0.0576][2026-03-23 13:57:15] Step: 244, Training Logs: loss_final: 3.791383, loss_mean: 1.039013, proj_loss: -0.057821, loss_mean_cls: 2.810191, grad_norm: 2.101933 +Steps: 0%| | 245/1000000 [01:04<67:48:03, 4.10it/s, grad_norm=2.1, loss_final=3.79, loss_mean=1.04, loss_mean_cls=2.81, proj_loss=-0.0578][2026-03-23 13:57:16] Step: 245, Training Logs: loss_final: 4.122998, loss_mean: 1.003735, proj_loss: -0.056978, loss_mean_cls: 3.176242, grad_norm: 1.748520 +Steps: 0%| | 246/1000000 [01:04<67:47:44, 4.10it/s, grad_norm=1.75, loss_final=4.12, loss_mean=1, loss_mean_cls=3.18, proj_loss=-0.057][2026-03-23 13:57:16] Step: 246, Training Logs: loss_final: 3.975110, loss_mean: 1.031366, proj_loss: -0.058318, loss_mean_cls: 3.002061, grad_norm: 
2.088799 +Steps: 0%| | 247/1000000 [01:04<67:49:45, 4.09it/s, grad_norm=2.09, loss_final=3.98, loss_mean=1.03, loss_mean_cls=3, proj_loss=-0.0583][2026-03-23 13:57:16] Step: 247, Training Logs: loss_final: 3.795434, loss_mean: 1.016577, proj_loss: -0.059947, loss_mean_cls: 2.838804, grad_norm: 1.548398 +Steps: 0%| | 248/1000000 [01:05<67:48:52, 4.10it/s, grad_norm=1.55, loss_final=3.8, loss_mean=1.02, loss_mean_cls=2.84, proj_loss=-0.0599][2026-03-23 13:57:16] Step: 248, Training Logs: loss_final: 3.547227, loss_mean: 1.007156, proj_loss: -0.057843, loss_mean_cls: 2.597914, grad_norm: 2.097709 +Steps: 0%| | 249/1000000 [01:05<67:47:51, 4.10it/s, grad_norm=2.1, loss_final=3.55, loss_mean=1.01, loss_mean_cls=2.6, proj_loss=-0.0578][2026-03-23 13:57:17] Step: 249, Training Logs: loss_final: 3.670953, loss_mean: 1.005734, proj_loss: -0.056730, loss_mean_cls: 2.721948, grad_norm: 1.993682 +Steps: 0%| | 250/1000000 [01:05<67:48:32, 4.10it/s, grad_norm=1.99, loss_final=3.67, loss_mean=1.01, loss_mean_cls=2.72, proj_loss=-0.0567][2026-03-23 13:57:17] Step: 250, Training Logs: loss_final: 3.783008, loss_mean: 1.009454, proj_loss: -0.057429, loss_mean_cls: 2.830983, grad_norm: 2.015732 +Steps: 0%| | 251/1000000 [01:05<67:49:52, 4.09it/s, grad_norm=2.02, loss_final=3.78, loss_mean=1.01, loss_mean_cls=2.83, proj_loss=-0.0574][2026-03-23 13:57:17] Step: 251, Training Logs: loss_final: 3.917508, loss_mean: 1.015374, proj_loss: -0.057006, loss_mean_cls: 2.959140, grad_norm: 1.911892 +Steps: 0%| | 252/1000000 [01:06<67:48:52, 4.10it/s, grad_norm=1.91, loss_final=3.92, loss_mean=1.02, loss_mean_cls=2.96, proj_loss=-0.057][2026-03-23 13:57:17] Step: 252, Training Logs: loss_final: 4.345321, loss_mean: 0.999146, proj_loss: -0.059442, loss_mean_cls: 3.405616, grad_norm: 2.544859 +Steps: 0%| | 253/1000000 [01:06<67:48:39, 4.10it/s, grad_norm=2.54, loss_final=4.35, loss_mean=0.999, loss_mean_cls=3.41, proj_loss=-0.0594][2026-03-23 13:57:18] Step: 253, Training Logs: loss_final: 3.264407, loss_mean: 1.012033, proj_loss: -0.060338, loss_mean_cls: 2.312712, grad_norm: 2.002759 +Steps: 0%| | 254/1000000 [01:06<67:47:32, 4.10it/s, grad_norm=2, loss_final=3.26, loss_mean=1.01, loss_mean_cls=2.31, proj_loss=-0.0603][2026-03-23 13:57:18] Step: 254, Training Logs: loss_final: 3.680009, loss_mean: 1.009429, proj_loss: -0.057669, loss_mean_cls: 2.728250, grad_norm: 2.470939 +Steps: 0%| | 255/1000000 [01:06<67:46:55, 4.10it/s, grad_norm=2.47, loss_final=3.68, loss_mean=1.01, loss_mean_cls=2.73, proj_loss=-0.0577][2026-03-23 13:57:18] Step: 255, Training Logs: loss_final: 3.617963, loss_mean: 1.040573, proj_loss: -0.058244, loss_mean_cls: 2.635633, grad_norm: 2.220286 +Steps: 0%| | 256/1000000 [01:07<67:48:18, 4.10it/s, grad_norm=2.22, loss_final=3.62, loss_mean=1.04, loss_mean_cls=2.64, proj_loss=-0.0582][2026-03-23 13:57:18] Step: 256, Training Logs: loss_final: 4.080931, loss_mean: 0.997066, proj_loss: -0.058532, loss_mean_cls: 3.142397, grad_norm: 2.140220 +Steps: 0%| | 257/1000000 [01:07<67:47:49, 4.10it/s, grad_norm=2.14, loss_final=4.08, loss_mean=0.997, loss_mean_cls=3.14, proj_loss=-0.0585][2026-03-23 13:57:19] Step: 257, Training Logs: loss_final: 3.699068, loss_mean: 1.000059, proj_loss: -0.058102, loss_mean_cls: 2.757111, grad_norm: 1.981827 +Steps: 0%| | 258/1000000 [01:07<67:46:24, 4.10it/s, grad_norm=1.98, loss_final=3.7, loss_mean=1, loss_mean_cls=2.76, proj_loss=-0.0581][2026-03-23 13:57:19] Step: 258, Training Logs: loss_final: 3.992317, loss_mean: 1.029925, proj_loss: -0.058207, loss_mean_cls: 3.020598, 
grad_norm: 2.269195 +Steps: 0%| | 259/1000000 [01:07<67:46:06, 4.10it/s, grad_norm=2.27, loss_final=3.99, loss_mean=1.03, loss_mean_cls=3.02, proj_loss=-0.0582][2026-03-23 13:57:19] Step: 259, Training Logs: loss_final: 3.969361, loss_mean: 1.005176, proj_loss: -0.055455, loss_mean_cls: 3.019640, grad_norm: 2.430135 +Steps: 0%| | 260/1000000 [01:08<67:47:12, 4.10it/s, grad_norm=2.43, loss_final=3.97, loss_mean=1.01, loss_mean_cls=3.02, proj_loss=-0.0555][2026-03-23 13:57:19] Step: 260, Training Logs: loss_final: 3.946629, loss_mean: 0.986893, proj_loss: -0.055241, loss_mean_cls: 3.014977, grad_norm: 2.162997 +Steps: 0%| | 261/1000000 [01:08<79:04:37, 3.51it/s, grad_norm=2.16, loss_final=3.95, loss_mean=0.987, loss_mean_cls=3.01, proj_loss=-0.0552][2026-03-23 13:57:20] Step: 261, Training Logs: loss_final: 3.920058, loss_mean: 0.984585, proj_loss: -0.057787, loss_mean_cls: 2.993260, grad_norm: 2.114045 +Steps: 0%| | 262/1000000 [01:08<78:26:04, 3.54it/s, grad_norm=2.11, loss_final=3.92, loss_mean=0.985, loss_mean_cls=2.99, proj_loss=-0.0578][2026-03-23 13:57:20] Step: 262, Training Logs: loss_final: 4.358081, loss_mean: 0.977391, proj_loss: -0.057046, loss_mean_cls: 3.437736, grad_norm: 2.065785 +Steps: 0%| | 263/1000000 [01:08<75:13:43, 3.69it/s, grad_norm=2.07, loss_final=4.36, loss_mean=0.977, loss_mean_cls=3.44, proj_loss=-0.057][2026-03-23 13:57:20] Step: 263, Training Logs: loss_final: 3.840959, loss_mean: 0.999257, proj_loss: -0.056460, loss_mean_cls: 2.898163, grad_norm: 2.413785 +Steps: 0%| | 264/1000000 [01:09<72:59:25, 3.80it/s, grad_norm=2.41, loss_final=3.84, loss_mean=0.999, loss_mean_cls=2.9, proj_loss=-0.0565][2026-03-23 13:57:20] Step: 264, Training Logs: loss_final: 3.940903, loss_mean: 1.023376, proj_loss: -0.057920, loss_mean_cls: 2.975446, grad_norm: 1.844397 +Steps: 0%| | 265/1000000 [01:09<71:26:14, 3.89it/s, grad_norm=1.84, loss_final=3.94, loss_mean=1.02, loss_mean_cls=2.98, proj_loss=-0.0579][2026-03-23 13:57:21] Step: 265, Training Logs: loss_final: 4.206155, loss_mean: 0.981618, proj_loss: -0.059432, loss_mean_cls: 3.283970, grad_norm: 2.069561 +Steps: 0%| | 266/1000000 [01:09<70:20:03, 3.95it/s, grad_norm=2.07, loss_final=4.21, loss_mean=0.982, loss_mean_cls=3.28, proj_loss=-0.0594][2026-03-23 13:57:21] Step: 266, Training Logs: loss_final: 3.804409, loss_mean: 0.990602, proj_loss: -0.060707, loss_mean_cls: 2.874513, grad_norm: 1.578623 +Steps: 0%| | 267/1000000 [01:09<69:36:48, 3.99it/s, grad_norm=1.58, loss_final=3.8, loss_mean=0.991, loss_mean_cls=2.87, proj_loss=-0.0607][2026-03-23 13:57:21] Step: 267, Training Logs: loss_final: 4.086649, loss_mean: 1.005726, proj_loss: -0.058538, loss_mean_cls: 3.139461, grad_norm: 2.692237 +Steps: 0%| | 268/1000000 [01:10<69:04:32, 4.02it/s, grad_norm=2.69, loss_final=4.09, loss_mean=1.01, loss_mean_cls=3.14, proj_loss=-0.0585][2026-03-23 13:57:21] Step: 268, Training Logs: loss_final: 3.484965, loss_mean: 1.013161, proj_loss: -0.056986, loss_mean_cls: 2.528790, grad_norm: 2.118618 +Steps: 0%| | 269/1000000 [01:10<69:49:28, 3.98it/s, grad_norm=2.12, loss_final=3.48, loss_mean=1.01, loss_mean_cls=2.53, proj_loss=-0.057][2026-03-23 13:57:22] Step: 269, Training Logs: loss_final: 3.312028, loss_mean: 1.008822, proj_loss: -0.058315, loss_mean_cls: 2.361521, grad_norm: 1.997808 +Steps: 0%| | 270/1000000 [01:10<68:55:10, 4.03it/s, grad_norm=2, loss_final=3.31, loss_mean=1.01, loss_mean_cls=2.36, proj_loss=-0.0583][2026-03-23 13:57:22] Step: 270, Training Logs: loss_final: 3.996657, loss_mean: 1.009214, proj_loss: -0.058962, 
loss_mean_cls: 3.046405, grad_norm: 2.755926 +Steps: 0%| | 271/1000000 [01:10<68:39:18, 4.04it/s, grad_norm=2.76, loss_final=4, loss_mean=1.01, loss_mean_cls=3.05, proj_loss=-0.059][2026-03-23 13:57:22] Step: 271, Training Logs: loss_final: 4.391388, loss_mean: 1.024230, proj_loss: -0.058988, loss_mean_cls: 3.426146, grad_norm: 2.096098 +Steps: 0%| | 272/1000000 [01:11<68:23:53, 4.06it/s, grad_norm=2.1, loss_final=4.39, loss_mean=1.02, loss_mean_cls=3.43, proj_loss=-0.059][2026-03-23 13:57:22] Step: 272, Training Logs: loss_final: 3.920087, loss_mean: 1.024717, proj_loss: -0.059831, loss_mean_cls: 2.955201, grad_norm: 2.465345 +Steps: 0%| | 273/1000000 [01:11<68:12:32, 4.07it/s, grad_norm=2.47, loss_final=3.92, loss_mean=1.02, loss_mean_cls=2.96, proj_loss=-0.0598][2026-03-23 13:57:23] Step: 273, Training Logs: loss_final: 3.677792, loss_mean: 1.019457, proj_loss: -0.056399, loss_mean_cls: 2.714733, grad_norm: 2.047685 +Steps: 0%| | 274/1000000 [01:11<68:06:48, 4.08it/s, grad_norm=2.05, loss_final=3.68, loss_mean=1.02, loss_mean_cls=2.71, proj_loss=-0.0564][2026-03-23 13:57:23] Step: 274, Training Logs: loss_final: 3.611300, loss_mean: 1.009368, proj_loss: -0.057628, loss_mean_cls: 2.659561, grad_norm: 1.780220 +Steps: 0%| | 275/1000000 [01:11<68:01:32, 4.08it/s, grad_norm=1.78, loss_final=3.61, loss_mean=1.01, loss_mean_cls=2.66, proj_loss=-0.0576][2026-03-23 13:57:23] Step: 275, Training Logs: loss_final: 3.762297, loss_mean: 1.000715, proj_loss: -0.057146, loss_mean_cls: 2.818728, grad_norm: 2.212818 +Steps: 0%| | 276/1000000 [01:12<67:56:22, 4.09it/s, grad_norm=2.21, loss_final=3.76, loss_mean=1, loss_mean_cls=2.82, proj_loss=-0.0571][2026-03-23 13:57:23] Step: 276, Training Logs: loss_final: 4.110180, loss_mean: 1.001855, proj_loss: -0.058339, loss_mean_cls: 3.166663, grad_norm: 3.446807 +Steps: 0%| | 277/1000000 [01:12<67:54:20, 4.09it/s, grad_norm=3.45, loss_final=4.11, loss_mean=1, loss_mean_cls=3.17, proj_loss=-0.0583][2026-03-23 13:57:24] Step: 277, Training Logs: loss_final: 3.589436, loss_mean: 1.014081, proj_loss: -0.058753, loss_mean_cls: 2.634108, grad_norm: 1.866073 +Steps: 0%| | 278/1000000 [01:12<67:51:55, 4.09it/s, grad_norm=1.87, loss_final=3.59, loss_mean=1.01, loss_mean_cls=2.63, proj_loss=-0.0588][2026-03-23 13:57:24] Step: 278, Training Logs: loss_final: 3.909306, loss_mean: 1.036322, proj_loss: -0.057491, loss_mean_cls: 2.930474, grad_norm: 3.416674 +Steps: 0%| | 279/1000000 [01:12<67:51:48, 4.09it/s, grad_norm=3.42, loss_final=3.91, loss_mean=1.04, loss_mean_cls=2.93, proj_loss=-0.0575][2026-03-23 13:57:24] Step: 279, Training Logs: loss_final: 3.635005, loss_mean: 1.021109, proj_loss: -0.058334, loss_mean_cls: 2.672229, grad_norm: 2.773069 +Steps: 0%| | 280/1000000 [01:13<67:50:10, 4.09it/s, grad_norm=2.77, loss_final=3.64, loss_mean=1.02, loss_mean_cls=2.67, proj_loss=-0.0583][2026-03-23 13:57:24] Step: 280, Training Logs: loss_final: 4.731202, loss_mean: 0.976675, proj_loss: -0.056143, loss_mean_cls: 3.810670, grad_norm: 2.045803 +Steps: 0%| | 281/1000000 [01:13<67:48:49, 4.10it/s, grad_norm=2.05, loss_final=4.73, loss_mean=0.977, loss_mean_cls=3.81, proj_loss=-0.0561][2026-03-23 13:57:25] Step: 281, Training Logs: loss_final: 4.352121, loss_mean: 1.020213, proj_loss: -0.057039, loss_mean_cls: 3.388947, grad_norm: 3.085879 +Steps: 0%| | 282/1000000 [01:13<67:48:21, 4.10it/s, grad_norm=3.09, loss_final=4.35, loss_mean=1.02, loss_mean_cls=3.39, proj_loss=-0.057][2026-03-23 13:57:25] Step: 282, Training Logs: loss_final: 3.645599, loss_mean: 1.009737, proj_loss: 
-0.056890, loss_mean_cls: 2.692752, grad_norm: 2.743040 +Steps: 0%| | 283/1000000 [01:13<67:47:40, 4.10it/s, grad_norm=2.74, loss_final=3.65, loss_mean=1.01, loss_mean_cls=2.69, proj_loss=-0.0569][2026-03-23 13:57:25] Step: 283, Training Logs: loss_final: 3.621672, loss_mean: 1.014742, proj_loss: -0.058281, loss_mean_cls: 2.665210, grad_norm: 2.332768 +Steps: 0%| | 284/1000000 [01:14<67:48:17, 4.10it/s, grad_norm=2.33, loss_final=3.62, loss_mean=1.01, loss_mean_cls=2.67, proj_loss=-0.0583][2026-03-23 13:57:25] Step: 284, Training Logs: loss_final: 3.634438, loss_mean: 1.014846, proj_loss: -0.058438, loss_mean_cls: 2.678030, grad_norm: 2.844196 +Steps: 0%| | 285/1000000 [01:14<67:48:15, 4.10it/s, grad_norm=2.84, loss_final=3.63, loss_mean=1.01, loss_mean_cls=2.68, proj_loss=-0.0584][2026-03-23 13:57:26] Step: 285, Training Logs: loss_final: 3.942388, loss_mean: 0.986733, proj_loss: -0.056891, loss_mean_cls: 3.012546, grad_norm: 2.134713 +Steps: 0%| | 286/1000000 [01:14<67:47:40, 4.10it/s, grad_norm=2.13, loss_final=3.94, loss_mean=0.987, loss_mean_cls=3.01, proj_loss=-0.0569][2026-03-23 13:57:26] Step: 286, Training Logs: loss_final: 3.926682, loss_mean: 1.011870, proj_loss: -0.057176, loss_mean_cls: 2.971987, grad_norm: 2.768237 +Steps: 0%| | 287/1000000 [01:14<67:46:57, 4.10it/s, grad_norm=2.77, loss_final=3.93, loss_mean=1.01, loss_mean_cls=2.97, proj_loss=-0.0572][2026-03-23 13:57:26] Step: 287, Training Logs: loss_final: 4.047902, loss_mean: 1.002864, proj_loss: -0.059520, loss_mean_cls: 3.104558, grad_norm: 2.354553 +Steps: 0%| | 287/1000000 [01:14<67:46:57, 4.10it/s, grad_norm=2.35, loss_final=4.05, loss_mean=1, loss_mean_cls=3.1, proj_loss=-0.0595] diff --git a/back/wandb/run-20260323_135607-zue1y2ba/files/requirements.txt b/back/wandb/run-20260323_135607-zue1y2ba/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0235910d0d99b7dee69b9a7f2f90012c8b711cc --- /dev/null +++ b/back/wandb/run-20260323_135607-zue1y2ba/files/requirements.txt @@ -0,0 +1,168 @@ +dill==0.3.8 +mkl-service==2.4.0 +mpmath==1.3.0 +typing_extensions==4.12.2 +urllib3==2.3.0 +torch==2.5.1 +ptyprocess==0.7.0 +traitlets==5.14.3 +pyasn1==0.6.1 +opencv-python-headless==4.12.0.88 +nest-asyncio==1.6.0 +kiwisolver==1.4.8 +click==8.2.1 +fire==0.7.1 +diffusers==0.35.1 +accelerate==1.7.0 +ipykernel==6.29.5 +peft==0.17.1 +attrs==24.3.0 +six==1.17.0 +numpy==2.0.1 +yarl==1.18.0 +huggingface_hub==0.34.4 +Bottleneck==1.4.2 +numexpr==2.11.0 +dataclasses==0.6 +typing-inspection==0.4.1 +safetensors==0.5.3 +pyparsing==3.2.3 +psutil==7.0.0 +imageio==2.37.0 +debugpy==1.8.14 +cycler==0.12.1 +pyasn1_modules==0.4.2 +matplotlib-inline==0.1.7 +matplotlib==3.10.3 +jedi==0.19.2 +tokenizers==0.21.2 +seaborn==0.13.2 +timm==1.0.15 +aiohappyeyeballs==2.6.1 +hf-xet==1.1.8 +multidict==6.1.0 +tqdm==4.67.1 +wheel==0.45.1 +simsimd==6.5.1 +sentencepiece==0.2.1 +grpcio==1.74.0 +asttokens==3.0.0 +absl-py==2.3.1 +stack-data==0.6.3 +pandas==2.3.0 +importlib_metadata==8.7.0 +pytorch-image-generation-metrics==0.6.1 +frozenlist==1.5.0 +MarkupSafe==3.0.2 +setuptools==78.1.1 +multiprocess==0.70.15 +pip==25.1 +requests==2.32.3 +mkl_random==1.2.8 +tensorboard-plugin-wit==1.8.1 +ExifRead-nocycle==3.0.1 +webdataset==0.2.111 +threadpoolctl==3.6.0 +pyarrow==21.0.0 +executing==2.2.0 +decorator==5.2.1 +contourpy==1.3.2 +annotated-types==0.7.0 +scikit-learn==1.7.1 +jupyter_client==8.6.3 +albumentations==1.4.24 +wandb==0.25.0 +certifi==2025.8.3 +idna==3.7 +xxhash==3.5.0 +Jinja2==3.1.6 +python-dateutil==2.9.0.post0 
+aiosignal==1.4.0 +triton==3.1.0 +torchvision==0.20.1 +stringzilla==3.12.6 +pure_eval==0.2.3 +braceexpand==0.1.7 +zipp==3.22.0 +oauthlib==3.3.1 +Markdown==3.8.2 +fsspec==2025.3.0 +fonttools==4.58.2 +comm==0.2.2 +ipython==9.3.0 +img2dataset==1.47.0 +networkx==3.4.2 +PySocks==1.7.1 +tzdata==2025.2 +smmap==5.0.2 +mkl_fft==1.3.11 +sentry-sdk==2.29.1 +Pygments==2.19.1 +pexpect==4.9.0 +ftfy==6.3.1 +einops==0.8.1 +requests-oauthlib==2.0.0 +gitdb==4.0.12 +albucore==0.0.23 +torchdiffeq==0.2.5 +GitPython==3.1.44 +bitsandbytes==0.47.0 +pytorch-fid==0.3.0 +clean-fid==0.1.35 +pytorch-gan-metrics==0.5.4 +Brotli==1.0.9 +charset-normalizer==3.3.2 +gmpy2==2.2.1 +pillow==11.1.0 +PyYAML==6.0.2 +tornado==6.5.1 +termcolor==3.1.0 +setproctitle==1.3.6 +scipy==1.15.3 +regex==2024.11.6 +protobuf==6.31.1 +platformdirs==4.3.8 +joblib==1.5.1 +cachetools==4.2.4 +ipython_pygments_lexers==1.1.1 +google-auth==1.35.0 +transformers==4.53.2 +torch-fidelity==0.3.0 +tensorboard==2.4.0 +filelock==3.17.0 +packaging==25.0 +propcache==0.3.1 +pytz==2025.2 +aiohttp==3.11.10 +wcwidth==0.2.13 +clip==0.2.0 +Werkzeug==3.1.3 +tensorboard-data-server==0.6.1 +sympy==1.13.1 +pyzmq==26.4.0 +pydantic_core==2.33.2 +prompt_toolkit==3.0.51 +parso==0.8.4 +docker-pycreds==0.4.0 +rsa==4.9.1 +pydantic==2.11.5 +jupyter_core==5.8.1 +google-auth-oauthlib==0.4.6 +datasets==4.0.0 +torch-tb-profiler==0.4.3 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +more-itertools==10.3.0 +packaging==24.2 +platformdirs==4.2.2 +typeguard==4.3.0 +inflect==7.3.1 +jaraco.text==3.12.1 +tomli==2.0.1 +typing_extensions==4.12.2 +wheel==0.45.1 +zipp==3.19.2 diff --git a/back/wandb/run-20260323_135607-zue1y2ba/files/wandb-metadata.json b/back/wandb/run-20260323_135607-zue1y2ba/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5d407d1d21b5dd510a849b77fbc1020aa50f9ac6 --- /dev/null +++ b/back/wandb/run-20260323_135607-zue1y2ba/files/wandb-metadata.json @@ -0,0 +1,101 @@ +{ + "os": "Linux-5.15.0-94-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.9", + "startedAt": "2026-03-23T05:56:07.858187Z", + "args": [ + "--report-to", + "wandb", + "--allow-tf32", + "--mixed-precision", + "bf16", + "--seed", + "0", + "--path-type", + "linear", + "--prediction", + "v", + "--weighting", + "uniform", + "--model", + "SiT-XL/2", + "--enc-type", + "dinov2-vit-b", + "--encoder-depth", + "8", + "--proj-coeff", + "0.5", + "--output-dir", + "exps", + "--exp-name", + "jsflow-experiment-0.75", + "--batch-size", + "256", + "--data-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256", + "--semantic-features-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0", + "--learning-rate", + "0.00005", + "--t-c", + "0.75", + "--cls", + "0.2", + "--ot-cls" + ], + "program": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "git": { + "remote": "https://github.com/Martinser/REG.git", + "commit": "021ea2e50c38c5803bd9afff16316958a01fbd1d" + }, + "email": "2365972933@qq.com", + "root": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG", + "host": "24c964746905d416ce09d045f9a06f23-taskrole1-0", + "executable": "/gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 4, 
+ "disk": { + "/": { + "total": "3838880616448", + "used": "357568126976" + } + }, + "memory": { + "total": "2164115296256" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-a09f2421-99e6-a72e-63bd-fd7452510758" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e" + } + ], + "cudaVersion": "13.0", + "writerId": "l4vui4vfnl881ctol25fj9y70t6im9l9" +} \ No newline at end of file diff --git a/back/wandb/run-20260323_135607-zue1y2ba/logs/debug-internal.log b/back/wandb/run-20260323_135607-zue1y2ba/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..c3dcdf185dbb703e1c2ed64205bbc5022b6eeda0 --- /dev/null +++ b/back/wandb/run-20260323_135607-zue1y2ba/logs/debug-internal.log @@ -0,0 +1,6 @@ +{"time":"2026-03-23T13:56:08.183465712+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"} +{"time":"2026-03-23T13:56:10.661719633+08:00","level":"INFO","msg":"stream: created new stream","id":"zue1y2ba"} +{"time":"2026-03-23T13:56:10.661931952+08:00","level":"INFO","msg":"handler: started","stream_id":"zue1y2ba"} +{"time":"2026-03-23T13:56:10.662874633+08:00","level":"INFO","msg":"stream: started","id":"zue1y2ba"} +{"time":"2026-03-23T13:56:10.662895027+08:00","level":"INFO","msg":"writer: started","stream_id":"zue1y2ba"} +{"time":"2026-03-23T13:56:10.662918583+08:00","level":"INFO","msg":"sender: started","stream_id":"zue1y2ba"} diff --git a/back/wandb/run-20260323_135607-zue1y2ba/logs/debug.log b/back/wandb/run-20260323_135607-zue1y2ba/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..01603de8f1e7aa92007ac26739f92235178cfc27 --- /dev/null +++ b/back/wandb/run-20260323_135607-zue1y2ba/logs/debug.log @@ -0,0 +1,20 @@ +2026-03-23 13:56:07,881 INFO MainThread:397944 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0 +2026-03-23 13:56:07,881 INFO MainThread:397944 [wandb_setup.py:_flush():81] Configure stats pid to 397944 +2026-03-23 13:56:07,881 INFO MainThread:397944 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-03-23 13:56:07,881 INFO MainThread:397944 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260323_135607-zue1y2ba/logs/debug.log +2026-03-23 13:56:07,881 INFO MainThread:397944 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260323_135607-zue1y2ba/logs/debug-internal.log +2026-03-23 13:56:07,881 INFO MainThread:397944 [wandb_init.py:init():844] calling init triggers +2026-03-23 13:56:07,881 INFO MainThread:397944 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2026-03-23 13:56:07,881 INFO MainThread:397944 [wandb_init.py:init():892] starting backend +2026-03-23 13:56:08,167 INFO MainThread:397944 
[wandb_init.py:init():895] sending inform_init request +2026-03-23 13:56:08,180 INFO MainThread:397944 [wandb_init.py:init():903] backend started and connected +2026-03-23 13:56:08,181 INFO MainThread:397944 [wandb_init.py:init():973] updated telemetry +2026-03-23 13:56:08,194 INFO MainThread:397944 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-03-23 13:56:11,614 INFO MainThread:397944 [wandb_init.py:init():1042] starting run threads in backend +2026-03-23 13:56:11,706 INFO MainThread:397944 [wandb_run.py:_console_start():2524] atexit reg +2026-03-23 13:56:11,707 INFO MainThread:397944 [wandb_run.py:_redirect():2373] redirect: wrap_raw +2026-03-23 13:56:11,707 INFO MainThread:397944 [wandb_run.py:_redirect():2442] Wrapping output streams. +2026-03-23 13:56:11,707 INFO MainThread:397944 [wandb_run.py:_redirect():2465] Redirects installed. +2026-03-23 13:56:11,712 INFO MainThread:397944 [wandb_init.py:init():1082] run started, returning control to user process +2026-03-23 13:56:11,713 INFO MainThread:397944 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment-0.75', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 2000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.2, 't_c': 0.75, 'ot_cls': True} diff --git a/back/wandb/run-20260323_135841-w9holkos/files/requirements.txt b/back/wandb/run-20260323_135841-w9holkos/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0235910d0d99b7dee69b9a7f2f90012c8b711cc --- /dev/null +++ b/back/wandb/run-20260323_135841-w9holkos/files/requirements.txt @@ -0,0 +1,168 @@ +dill==0.3.8 +mkl-service==2.4.0 +mpmath==1.3.0 +typing_extensions==4.12.2 +urllib3==2.3.0 +torch==2.5.1 +ptyprocess==0.7.0 +traitlets==5.14.3 +pyasn1==0.6.1 +opencv-python-headless==4.12.0.88 +nest-asyncio==1.6.0 +kiwisolver==1.4.8 +click==8.2.1 +fire==0.7.1 +diffusers==0.35.1 +accelerate==1.7.0 +ipykernel==6.29.5 +peft==0.17.1 +attrs==24.3.0 +six==1.17.0 +numpy==2.0.1 +yarl==1.18.0 +huggingface_hub==0.34.4 +Bottleneck==1.4.2 +numexpr==2.11.0 +dataclasses==0.6 +typing-inspection==0.4.1 +safetensors==0.5.3 +pyparsing==3.2.3 +psutil==7.0.0 +imageio==2.37.0 +debugpy==1.8.14 +cycler==0.12.1 +pyasn1_modules==0.4.2 +matplotlib-inline==0.1.7 +matplotlib==3.10.3 +jedi==0.19.2 +tokenizers==0.21.2 +seaborn==0.13.2 +timm==1.0.15 +aiohappyeyeballs==2.6.1 +hf-xet==1.1.8 +multidict==6.1.0 +tqdm==4.67.1 +wheel==0.45.1 +simsimd==6.5.1 +sentencepiece==0.2.1 +grpcio==1.74.0 +asttokens==3.0.0 +absl-py==2.3.1 +stack-data==0.6.3 +pandas==2.3.0 +importlib_metadata==8.7.0 +pytorch-image-generation-metrics==0.6.1 +frozenlist==1.5.0 +MarkupSafe==3.0.2 +setuptools==78.1.1 
+multiprocess==0.70.15 +pip==25.1 +requests==2.32.3 +mkl_random==1.2.8 +tensorboard-plugin-wit==1.8.1 +ExifRead-nocycle==3.0.1 +webdataset==0.2.111 +threadpoolctl==3.6.0 +pyarrow==21.0.0 +executing==2.2.0 +decorator==5.2.1 +contourpy==1.3.2 +annotated-types==0.7.0 +scikit-learn==1.7.1 +jupyter_client==8.6.3 +albumentations==1.4.24 +wandb==0.25.0 +certifi==2025.8.3 +idna==3.7 +xxhash==3.5.0 +Jinja2==3.1.6 +python-dateutil==2.9.0.post0 +aiosignal==1.4.0 +triton==3.1.0 +torchvision==0.20.1 +stringzilla==3.12.6 +pure_eval==0.2.3 +braceexpand==0.1.7 +zipp==3.22.0 +oauthlib==3.3.1 +Markdown==3.8.2 +fsspec==2025.3.0 +fonttools==4.58.2 +comm==0.2.2 +ipython==9.3.0 +img2dataset==1.47.0 +networkx==3.4.2 +PySocks==1.7.1 +tzdata==2025.2 +smmap==5.0.2 +mkl_fft==1.3.11 +sentry-sdk==2.29.1 +Pygments==2.19.1 +pexpect==4.9.0 +ftfy==6.3.1 +einops==0.8.1 +requests-oauthlib==2.0.0 +gitdb==4.0.12 +albucore==0.0.23 +torchdiffeq==0.2.5 +GitPython==3.1.44 +bitsandbytes==0.47.0 +pytorch-fid==0.3.0 +clean-fid==0.1.35 +pytorch-gan-metrics==0.5.4 +Brotli==1.0.9 +charset-normalizer==3.3.2 +gmpy2==2.2.1 +pillow==11.1.0 +PyYAML==6.0.2 +tornado==6.5.1 +termcolor==3.1.0 +setproctitle==1.3.6 +scipy==1.15.3 +regex==2024.11.6 +protobuf==6.31.1 +platformdirs==4.3.8 +joblib==1.5.1 +cachetools==4.2.4 +ipython_pygments_lexers==1.1.1 +google-auth==1.35.0 +transformers==4.53.2 +torch-fidelity==0.3.0 +tensorboard==2.4.0 +filelock==3.17.0 +packaging==25.0 +propcache==0.3.1 +pytz==2025.2 +aiohttp==3.11.10 +wcwidth==0.2.13 +clip==0.2.0 +Werkzeug==3.1.3 +tensorboard-data-server==0.6.1 +sympy==1.13.1 +pyzmq==26.4.0 +pydantic_core==2.33.2 +prompt_toolkit==3.0.51 +parso==0.8.4 +docker-pycreds==0.4.0 +rsa==4.9.1 +pydantic==2.11.5 +jupyter_core==5.8.1 +google-auth-oauthlib==0.4.6 +datasets==4.0.0 +torch-tb-profiler==0.4.3 +autocommand==2.2.2 +backports.tarfile==1.2.0 +importlib_metadata==8.0.0 +jaraco.collections==5.1.0 +jaraco.context==5.3.0 +jaraco.functools==4.0.1 +more-itertools==10.3.0 +packaging==24.2 +platformdirs==4.2.2 +typeguard==4.3.0 +inflect==7.3.1 +jaraco.text==3.12.1 +tomli==2.0.1 +typing_extensions==4.12.2 +wheel==0.45.1 +zipp==3.19.2 diff --git a/back/wandb/run-20260323_135841-w9holkos/files/wandb-metadata.json b/back/wandb/run-20260323_135841-w9holkos/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4dcd79366a048961b30fa5e908d9e0659f1fc160 --- /dev/null +++ b/back/wandb/run-20260323_135841-w9holkos/files/wandb-metadata.json @@ -0,0 +1,101 @@ +{ + "os": "Linux-5.15.0-94-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.9", + "startedAt": "2026-03-23T05:58:41.322248Z", + "args": [ + "--report-to", + "wandb", + "--allow-tf32", + "--mixed-precision", + "bf16", + "--seed", + "0", + "--path-type", + "linear", + "--prediction", + "v", + "--weighting", + "uniform", + "--model", + "SiT-XL/2", + "--enc-type", + "dinov2-vit-b", + "--encoder-depth", + "8", + "--proj-coeff", + "0.5", + "--output-dir", + "exps", + "--exp-name", + "jsflow-experiment-0.75", + "--batch-size", + "256", + "--data-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256", + "--semantic-features-dir", + "/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0", + "--learning-rate", + "0.00005", + "--t-c", + "0.75", + "--cls", + "0.05", + "--ot-cls" + ], + "program": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/train.py", + "codePath": "train.py", + "codePathLocal": "train.py", + "git": { + "remote": 
"https://github.com/Martinser/REG.git", + "commit": "021ea2e50c38c5803bd9afff16316958a01fbd1d" + }, + "email": "2365972933@qq.com", + "root": "/gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG", + "host": "24c964746905d416ce09d045f9a06f23-taskrole1-0", + "executable": "/gemini/space/zhaozy/guzhenyu/envs/envs/SiT/bin/python", + "cpu_count": 96, + "cpu_count_logical": 192, + "gpu": "NVIDIA H100 80GB HBM3", + "gpu_count": 4, + "disk": { + "/": { + "total": "3838880616448", + "used": "357568360448" + } + }, + "memory": { + "total": "2164115296256" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-757303bb-4ec2-808b-a17f-95f6f5bad6dc" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-a09f2421-99e6-a72e-63bd-fd7452510758" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-9c670cc7-60a8-17f8-9b39-7ced3744976d" + }, + { + "name": "NVIDIA H100 80GB HBM3", + "memoryTotal": "85520809984", + "cudaCores": 16896, + "architecture": "Hopper", + "uuid": "GPU-e6b1d8da-68d7-ed83-90d0-a4dedf33120e" + } + ], + "cudaVersion": "13.0", + "writerId": "nlbia82zbry6kpqgagoidmc6x8szwd5d" +} \ No newline at end of file diff --git a/back/wandb/run-20260323_135841-w9holkos/logs/debug-internal.log b/back/wandb/run-20260323_135841-w9holkos/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..2484b13253b230d8b8997b5a63e356b295b653e3 --- /dev/null +++ b/back/wandb/run-20260323_135841-w9holkos/logs/debug-internal.log @@ -0,0 +1,19 @@ +{"time":"2026-03-23T13:58:41.647788404+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.0"} +{"time":"2026-03-23T13:58:42.578470875+08:00","level":"INFO","msg":"stream: created new stream","id":"w9holkos"} +{"time":"2026-03-23T13:58:42.578676113+08:00","level":"INFO","msg":"handler: started","stream_id":"w9holkos"} +{"time":"2026-03-23T13:58:42.579473589+08:00","level":"INFO","msg":"stream: started","id":"w9holkos"} +{"time":"2026-03-23T13:58:42.57951741+08:00","level":"INFO","msg":"sender: started","stream_id":"w9holkos"} +{"time":"2026-03-23T13:58:42.579478227+08:00","level":"INFO","msg":"writer: started","stream_id":"w9holkos"} +{"time":"2026-03-23T14:49:13.568442881+08:00","level":"INFO","msg":"api: retrying HTTP error","status":408,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"\n\n\n408 Request Timeout\n\n\n

Error: Request Timeout. Your client has taken too long to issue its request."}
+{"time":"2026-03-23T14:52:15.597652411+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2026-03-23T14:52:26.072213509+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": write tcp 172.20.98.27:52324->35.186.228.49:443: write: broken pipe"}
+{"time":"2026-03-23T17:02:52.905542765+08:00","level":"INFO","msg":"api: retrying HTTP error","status":408,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"408 Request Timeout: Your client has taken too long to issue its request."}
+{"time":"2026-03-23T17:05:55.176103762+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
+{"time":"2026-03-23T17:06:10.164453104+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": unexpected EOF"}
+{"time":"2026-03-23T22:05:06.25355716+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": read tcp 172.20.98.27:44154->35.186.228.49:443: read: connection reset by peer"}
+{"time":"2026-03-23T22:05:20.791067182+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": read tcp 172.20.98.27:40392->35.186.228.49:443: read: connection reset by peer"}
+{"time":"2026-03-24T02:18:38.770696332+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"502 Server Error: The server encountered a temporary error and could not complete your request. Please try again in 30 seconds."}
+{"time":"2026-03-24T06:25:41.879737278+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"502 Server Error: The server encountered a temporary error and could not complete your request. Please try again in 30 seconds."}
+{"time":"2026-03-24T06:30:14.989373032+08:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream","body":"502 Server Error: The server encountered a temporary error and could not complete your request. Please try again in 30 seconds."}
+{"time":"2026-03-24T09:05:02.85908394+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": read tcp 172.20.98.27:46722->35.186.228.49:443: read: connection reset by peer"}
+{"time":"2026-03-25T04:41:04.741907157+08:00","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/2365972933-teleai/REG/w9holkos/file_stream\": unexpected EOF"} diff --git a/back/wandb/run-20260323_135841-w9holkos/logs/debug.log b/back/wandb/run-20260323_135841-w9holkos/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..64360b6d725bb9039e8cce4fc50ca91a6d72ced5 --- /dev/null +++ b/back/wandb/run-20260323_135841-w9holkos/logs/debug.log @@ -0,0 +1,20 @@
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_setup.py:_flush():81] Current SDK version is 0.25.0
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_setup.py:_flush():81] Configure stats pid to 400275
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260323_135841-w9holkos/logs/debug.log
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /gemini/space/zhaozy/guzhenyu/UAVFlow/UAV_Flow_base/exps/jsflow-experiment/samples/REG/wandb/run-20260323_135841-w9holkos/logs/debug-internal.log
+2026-03-23 13:58:41,343 INFO MainThread:400275 [wandb_init.py:init():844] calling init triggers
+2026-03-23 13:58:41,344 INFO MainThread:400275 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-03-23 13:58:41,344 INFO MainThread:400275 [wandb_init.py:init():892] starting backend
+2026-03-23 13:58:41,630 INFO MainThread:400275 [wandb_init.py:init():895] sending inform_init request
+2026-03-23 13:58:41,643 INFO MainThread:400275 [wandb_init.py:init():903] backend started and connected
+2026-03-23 13:58:41,646 INFO MainThread:400275 [wandb_init.py:init():973] updated telemetry
+2026-03-23 13:58:41,659 INFO MainThread:400275 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-03-23 13:58:43,108 INFO MainThread:400275 [wandb_init.py:init():1042] starting run threads in backend
+2026-03-23 13:58:43,201 INFO MainThread:400275 [wandb_run.py:_console_start():2524] atexit reg
+2026-03-23 13:58:43,201 INFO MainThread:400275 [wandb_run.py:_redirect():2373] redirect: wrap_raw
+2026-03-23 13:58:43,201 INFO MainThread:400275 [wandb_run.py:_redirect():2442] Wrapping output streams.
+2026-03-23 13:58:43,202 INFO MainThread:400275 [wandb_run.py:_redirect():2465] Redirects installed.
+2026-03-23 13:58:43,209 INFO MainThread:400275 [wandb_init.py:init():1082] run started, returning control to user process +2026-03-23 13:58:43,210 INFO MainThread:400275 [wandb_run.py:_config_callback():1403] config_cb None None {'output_dir': 'exps', 'exp_name': 'jsflow-experiment-0.75', 'logging_dir': 'logs', 'report_to': 'wandb', 'sampling_steps': 2000, 'resume_step': 0, 'model': 'SiT-XL/2', 'num_classes': 1000, 'encoder_depth': 8, 'fused_attn': True, 'qk_norm': False, 'ops_head': 16, 'data_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256', 'semantic_features_dir': '/gemini/space/zhaozy/dataset/Imagenet/imagenet_256/imagenet_256_features/dinov2-vit-b_tmp/gpu0', 'resolution': 256, 'batch_size': 256, 'allow_tf32': True, 'mixed_precision': 'bf16', 'epochs': 1400, 'max_train_steps': 1000000, 'checkpointing_steps': 10000, 'gradient_accumulation_steps': 1, 'learning_rate': 5e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'seed': 0, 'num_workers': 4, 'path_type': 'linear', 'prediction': 'v', 'cfg_prob': 0.1, 'enc_type': 'dinov2-vit-b', 'proj_coeff': 0.5, 'weighting': 'uniform', 'legacy': False, 'cls': 0.05, 't_c': 0.75, 'ot_cls': True} diff --git a/conditional-flow-matching/runner/src/datamodules/components/__init__.py b/conditional-flow-matching/runner/src/datamodules/components/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/conditional-flow-matching/runner/src/datamodules/components/base.py b/conditional-flow-matching/runner/src/datamodules/components/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ab80cba2df172f33caa869a2d30da4e03e2a6fb6 --- /dev/null +++ b/conditional-flow-matching/runner/src/datamodules/components/base.py @@ -0,0 +1,33 @@ +from pytorch_lightning import LightningDataModule +from torch.utils.data import DataLoader + + +class BaseLightningDataModule(LightningDataModule): + """Adds base train, val, test dataloaders from data_train, data_val, and data_test.""" + + def train_dataloader(self): + return DataLoader( + dataset=self.data_train, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + pin_memory=self.hparams.pin_memory, + shuffle=True, + ) + + def val_dataloader(self): + return DataLoader( + dataset=self.data_val, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + pin_memory=self.hparams.pin_memory, + shuffle=False, + ) + + def test_dataloader(self): + return DataLoader( + dataset=self.data_test, + batch_size=self.hparams.batch_size, + num_workers=self.hparams.num_workers, + pin_memory=self.hparams.pin_memory, + shuffle=False, + ) diff --git a/conditional-flow-matching/runner/src/datamodules/components/generators2d.py b/conditional-flow-matching/runner/src/datamodules/components/generators2d.py new file mode 100644 index 0000000000000000000000000000000000000000..49d34dd06cb9728932ab2bb44fb4feab0a0edf3f --- /dev/null +++ b/conditional-flow-matching/runner/src/datamodules/components/generators2d.py @@ -0,0 +1,183 @@ +"""Random data generators. 
+
+Largely from
+https://github.com/AmirTag/OT-ICNN/blob/6caa9b982596a101b90a8a947d10f35f18c7de4e/2_dim_experiments/W2-minimax-tf.py
+"""
+
+import random
+
+import numpy as np
+import sklearn.datasets  # explicit submodule import so sklearn.datasets.make_swiss_roll below resolves
+
+
+def generate_uniform_around_centers(centers, variance):
+    num_center = len(centers)
+
+    return centers[np.random.choice(num_center)] + variance * np.random.uniform(-1, 1, (2))
+
+
+def generate_cross(centers, variance):
+    num_center = len(centers)
+    x = variance * np.random.uniform(-1, 1)
+    y = (np.random.randint(2) * 2 - 1) * x
+
+    return centers[np.random.choice(num_center)] + [x, y]
+
+
+def sample_data(dataset, batch_size, scale, var):
+    if dataset == "25gaussians":
+        dataset = []
+        for i in range(100000 // 25):  # integer division; float division raises TypeError under Python 3
+            for x in range(-2, 3):
+                for y in range(-2, 3):
+                    point = np.random.randn(2) * 0.05
+                    point[0] += 2 * x
+                    point[1] += 2 * y
+                    dataset.append(point)
+        dataset = np.array(dataset, dtype="float32")
+        np.random.shuffle(dataset)
+        # dataset /= 2.828  # stdev
+        while True:
+            for i in range(len(dataset) // batch_size):  # integer division, as above
+                yield dataset[i * batch_size : (i + 1) * batch_size]
+
+    elif dataset == "swissroll":
+        while True:
+            data = sklearn.datasets.make_swiss_roll(n_samples=batch_size, noise=0.25)[0]
+            data = data.astype("float32")[:, [0, 2]]
+            # data /= 7.5  # stdev plus a little
+            yield data
+
+    elif dataset == "8gaussians":
+        scale = scale
+        variance = var
+        centers = [
+            (1, 0),
+            (-1, 0),
+            (0, 1),
+            (0, -1),
+            (1.0 / np.sqrt(2), 1.0 / np.sqrt(2)),
+            (1.0 / np.sqrt(2), -1.0 / np.sqrt(2)),
+            (-1.0 / np.sqrt(2), 1.0 / np.sqrt(2)),
+            (-1.0 / np.sqrt(2), -1.0 / np.sqrt(2)),
+        ]
+        centers = [(scale * x, scale * y) for x, y in centers]
+        while True:
+            dataset = []
+            for i in range(batch_size):
+                point = np.random.randn(2) * variance
+                center = random.choice(centers)
+                point[0] += center[0]
+                point[1] += center[1]
+                dataset.append(point)
+            dataset = np.array(dataset, dtype="float32")
+            # dataset /= 1.414  # stdev
+            yield dataset
+
+    elif dataset == "checker_board_five":
+        scale = scale
+        variance = var
+        centers = scale * np.array([[0, 0], [1, 1], [-1, 1], [-1, -1], [1, -1]])
+        while True:
+            dataset = []
+            for i in range(batch_size):
+                dataset.append(generate_uniform_around_centers(centers, variance))
+            dataset = np.array(dataset, dtype="float32")
+            # dataset /= 1.414  # stdev
+            yield dataset
+
+    elif dataset == "checker_board_four":
+        scale = scale
+        variance = var
+        centers = scale * np.array([[1, 0], [0, 1], [-1, 0], [0, -1]])
+        while True:
+            dataset = []
+            for i in range(batch_size):
+                dataset.append(generate_uniform_around_centers(centers, variance))
+            dataset = np.array(dataset, dtype="float32")
+            # dataset /= 1.414  # stdev
+            yield dataset
+
+    elif dataset == "simpleGaussian":
+        while True:
+            dataset = []
+            for i in range(batch_size):
+                point = np.random.randn(2)
+                dataset.append(point)
+            dataset = np.array(dataset, dtype="float32")
+            # dataset /= 1.414  # stdev
+            yield dataset
+
+    elif dataset == "unif_square":
+        while True:
+            dataset = []
+            for i in range(batch_size):
+                point = np.random.uniform(-var, var, 2)
+                dataset.append(point)
+            dataset = np.array(dataset, dtype="float32")
+            # dataset /= 1.414  # stdev
+            yield dataset
+
+    elif dataset == "simpletranslatedGaussian":
+        while True:
+            dataset = []
+            for i in range(batch_size):
+                point = scale * np.array([1.0, 1.0]) + np.random.randn(2)
+                dataset.append(point)
+            dataset = np.array(dataset, dtype="float32")
+            # dataset /= 1.414  # stdev
+            yield dataset
+
+    elif dataset == "simpletranslated_scaled_Gaussian":
+        while True:
+            dataset = []
+            for i
in range(batch_size): + point = scale * np.array([1.0, 1.0]) + var * np.random.randn(2) + dataset.append(point) + dataset = np.array(dataset, dtype="float32") + # dataset /= 1.414 # stdev + yield dataset + + elif dataset == "circle-S1": + while True: + dataset = [] + for i in range(batch_size): + angle = np.random.rand() * 2 * np.pi + point = scale * np.array([np.cos(angle), np.sin(angle)]) + dataset.append(point) + dataset = np.array(dataset, dtype="float32") + yield dataset + + elif dataset == "semi-circle-S1": + while True: + dataset = [] + for i in range(batch_size): + angle = np.random.rand() * np.pi + point = scale * np.array([np.cos(angle), np.sin(angle)]) + dataset.append(point) + dataset = np.array(dataset, dtype="float32") + yield dataset + + elif dataset == "checker_board_five_cross": + scale = scale + variance = var + centers = scale * np.array([[0, 0], [1, 1], [-1, 1], [-1, -1], [1, -1]]) + while True: + dataset = [] + for i in range(batch_size): + dataset.append(generate_cross(centers, variance)) + dataset = np.array(dataset, dtype="float32") + # dataset /= 1.414 # stdev + yield dataset + + elif dataset == "checker_board_five_expanded": + scale = scale + variance = 2 * var + centers = scale * np.array([[0, 0], [1, 1], [-1, 1], [-1, -1], [1, -1]]) + while True: + dataset = [] + for i in range(batch_size): + dataset.append(generate_uniform_around_centers(centers, variance)) + dataset = np.array(dataset, dtype="float32") + # dataset /= 1.414 # stdev + yield dataset diff --git a/conditional-flow-matching/runner/src/datamodules/components/tnet_dataset.py b/conditional-flow-matching/runner/src/datamodules/components/tnet_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..e8024eabb48fa2171443efc0d8bda2559cd3b0f2 --- /dev/null +++ b/conditional-flow-matching/runner/src/datamodules/components/tnet_dataset.py @@ -0,0 +1,886 @@ +"""dataset.py. + +Loads datasets into uniform format for learning continuous flows +""" + +import math + +import numpy as np +import scipy.sparse +import torch +from sklearn.preprocessing import StandardScaler + + +class SCData: + """Base Class for single cell like trajectory data.""" + + def __init__(self): + super().__init__() + self.val_labels = [] + + def load(self): + raise NotImplementedError + + def get_labels(self): + raise NotImplementedError + + def get_data(self, labels=None): + raise NotImplementedError + + def get_ncells(self): + raise NotImplementedError + + def get_velocity(self): + raise NotImplementedError + + def has_velocity(self): + raise NotImplementedError + + def leaveout_timepoint(self, tp): + raise NotImplementedError + + def num_timepoints(self): + raise NotImplementedError + + def known_base_density(self): + """Returns if the dataset starts from a known base density. + + Generally single cell datasets do not have a known base density where generated datasets + do. 
+ """ + raise NotImplementedError + + def base_density(self): + def standard_normal_logprob(z): + logZ = -0.5 * math.log(2 * math.pi) + return torch.sum(logZ - z.pow(2) / 2, 1, keepdim=True) + + return standard_normal_logprob + + def base_sample(self): + return torch.randn + + def get_shape(self): + return [self.data.shape[1]] + + def plot_density(self): + import matplotlib.pyplot as plt + + npts = 100 + side = np.linspace(-4, 4, npts) + xx, yy = np.meshgrid(side, side) + xx = torch.from_numpy(xx).type(torch.float32) + yy = torch.from_numpy(yy).type(torch.float32) + z_grid = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1)], 1) + logp_grid = self.base_density()(z_grid) + plt.pcolormesh(xx, yy, np.exp(logp_grid.numpy()).reshape(npts, npts)) + plt.show() + + def plot_data(self): + import matplotlib.pyplot as plt + import scprep + + nbase = 5000 + all_data = np.concatenate( + [self.get_data(), self.base_sample()(nbase, self.get_shape()[0]).numpy()], + axis=0, + ) + lbs = np.concatenate([self.get_times(), np.repeat(["Base"], nbase)]) + if all_data.shape[1] == 2: + scprep.plot.scatter2d(all_data, c=lbs) + else: + fig, axes = plt.subplots(2, all_data.shape[1] // 2) + axes = axes.flatten() + for i in range(all_data.shape[1] - 1): + scprep.plot.scatter2d( + all_data[:, i : i + 2], + c=lbs, + ax=axes[i], + xlabel="PC %d" % (i + 1), + ylabel="PC %d" % (i + 2), + ) + plt.show() + + def plot_velocity(self): + import matplotlib.pyplot as plt + + idx = np.random.randint(self.get_ncells(), size=200) + data = self.get_data()[idx] + velocity = self.velocity[idx] + plt.quiver(data[:, 0], data[:, 1], velocity[:, 0], velocity[:, 1]) + plt.show() + + def plot_paths(self): + paths = self.get_paths() + paths = paths[:1000] + import matplotlib.pyplot as plt + + for path in paths: + plt.plot(path[:, 0], path[:, 1]) + plt.show() + + def factory(name, args): + if type(args) is dict: + from argparse import Namespace + + args = Namespace(**args) + # Generated Circle datasets + if name == "CIRCLE3": + return CircleTestDataV3() + if name == "CIRCLE5": + return CircleTestDataV5() + if name == "TREE": + return TreeTestData() + if name == "CYCLE": + return CycleDataset() + + # Generated sklearn datasets + if name == "MOONS": + return SklearnData("moons") + if name == "SCURVE": + return SklearnData("scurve") + if name == "BLOBS": + return SklearnData("blobs") + if name == "CIRCLES": + return SklearnData("circles") + + if name == "EB": + return EBData() + if name == "EB-PHATE": + return EBData() + if name == "EB-PCA": + return EBData("pcs", max_dim=args.max_dim) + + # If none of the above, we assume a path to a .npz file is supplied + if name.endswith(".h5ad"): + return CustomAnnDataFromFile(name, args) + if name.endswith(".npz"): + return CustomData(name, args) + + raise KeyError(f"Unknown dataset name {name}") + + +def _get_data_points(adata, basis) -> np.ndarray: + """Returns the data points corresponding to the selected basis.""" + if basis == "highly_variable": + data_points = adata[:, adata.var[basis]].X.toarray() + elif basis in adata.obsm.keys(): + basis_key = basis + data_points = np.array(adata.obsm[basis_key]) + elif f"X_{basis}" in adata.obsm.keys(): + basis_key = f"X_{basis}" + data_points = np.array(adata.obsm[basis_key]) + else: + raise KeyError( + f"Could not find entry in `obsm` for '{basis}'.\n" + f"Available keys are: {list(adata.obsm.keys())}." 
+ ) + + velocity_points = None + + if f"velocity_{basis}" in adata.obsm.keys(): + velocity_basis_key = f"velocity_{basis}" + velocity_points = np.array(adata.obsm[velocity_basis_key]) + else: + print( + f"Could not find entry in `obsm` for 'velocity_{basis}'.\n" + f"Available keys are: {list(adata.obsm.keys())}.\n" + f"Assuming no velocity data." + ) + + return data_points, velocity_points + + +class CustomData(SCData): + def __init__(self, name, args): + super().__init__() + self.args = args + self.embedding_name = args.embedding_name + self.load(name, args.max_dim) + + def load(self, data_file, max_dim): + self.data_dict = np.load(data_file, allow_pickle=True) + self.labels = self.data_dict["sample_labels"] + if self.embedding_name not in self.data_dict.keys(): + raise ValueError("Unknown embedding name %s" % self.embedding_name) + self.data = self.data_dict[self.embedding_name] + if self.args.whiten: + scaler = StandardScaler() + scaler.fit(self.data) + self.data = scaler.transform(self.data) + self.ncells = self.data.shape[0] + assert self.labels.shape[0] == self.ncells + # Scale so that embedding is normally distributed + + delta_name = "delta_%s" % self.embedding_name + if delta_name not in self.data_dict.keys(): + print("No velocity found for embedding %s skipping velocity" % self.embedding_name) + self.use_velocity = False + else: + self.velocity = self.data_dict[delta_name] + assert self.velocity.shape[0] == self.ncells + # Normalize ignoring mean from embedding + if self.args.whiten: + self.velocity = self.velocity / scaler.scale_ + + if max_dim is not None and self.data.shape[1] > max_dim: + print("Warning: Clipping dimensionality to %d" % max_dim) + self.data = self.data[:, :max_dim] + if self.use_velocity: + self.velocity = self.velocity[:, :max_dim] + + def has_velocity(self): + return self.use_velocity + + def known_base_density(self): + return False + + def get_data(self): + return self.data + + def get_times(self): + return self.labels + + def get_unique_times(self): + return np.unique(self.labels) + + def get_velocity(self): + return self.velocity + + def get_shape(self): + return [self.data.shape[1]] + + def get_ncells(self): + return self.ncells + + def leaveout_timepoint(self, tp): + """Takes a timepoint label to leaveout Alters data stored in object to leave out all data + associated with that timepoint.""" + if tp < 0: + raise RuntimeError("Cannot leaveout negative timepoint %d." 
% tp) + mask = self.labels != tp + print(f"Leaving out {np.sum(~mask)} samples from sample tp") + self.labels = self.labels[mask] + self.data = self.data[mask] + self.velocity = self.velocity[mask] + self.ncells = np.sum(mask) + + def sample_index(self, n, label_subset): + arr = np.arange(self.ncells)[self.labels == label_subset] + return np.random.choice(arr, size=n) + + +class CustomAnnData(CustomData): + def __init__(self, adata, args): + self.args = args + self.adata = adata + self.grn = None + self.load() + + def load(self): + self.labels = np.array(self.adata.obs["sample_labels"]) + self.data, self.velocity = _get_data_points(self.adata, self.args.embedding_name) + + if "grn" in self.adata.uns: + self.grn = self.adata.uns["grn"] + + if self.args.whiten: + scaler = StandardScaler() + scaler.fit(self.data) + self.data = scaler.transform(self.data) + if self.velocity is not None: + self.velocity = self.velocity / scaler.scale_ + self.use_velocity = self.velocity is not None + + self.ncells = self.data.shape[0] + assert self.labels.shape[0] == self.ncells + + max_dim = self.args.max_dim + if max_dim is not None and self.data.shape[1] > max_dim: + print(f"Warning: Clipping dimensionality from {self.data.shape[1]} to {max_dim}") + self.data = self.data[:, :max_dim] + if self.use_velocity: + self.velocity = self.velocity[:, :max_dim] + + def get_grn(self): + if self.grn is not None: + return self.grn + else: + raise ValueError( + f"No visible grn key in adata.uns, visible keys: {self.adata.uns.keys()}" + ) + + +class CustomAnnDataFromFile(CustomAnnData): + def __init__(self, name, args): + import scanpy as sc + + adata = sc.read_h5ad(name) + super().__init__(adata, args) + + +class EBData(SCData): + def __init__(self, embedding_name="phate", max_dim=None, use_velocity=True, version=5): + super().__init__() + self.embedding_name = embedding_name + self.use_velocity = use_velocity + if version == 5: + data_file = "../data/eb_velocity_v5.npz" + else: + raise ValueError("Unknown Version number") + self.load(data_file, max_dim) + + def load(self, data_file, max_dim): + self.data_dict = np.load(data_file, allow_pickle=True) + self.labels = self.data_dict["sample_labels"] + if self.embedding_name not in self.data_dict.keys(): + raise ValueError("Unknown embedding name %s" % self.embedding_name) + embedding = self.data_dict[self.embedding_name] + scaler = StandardScaler() + scaler.fit(embedding) + self.ncells = embedding.shape[0] + assert self.labels.shape[0] == self.ncells + # Scale so that embedding is normally distributed + self.data = scaler.transform(embedding) + + if self.has_velocity() and self.use_velocity: + if self.embedding_name == "pcs": + delta = self.data_dict["pcs_delta"] + elif self.embedding_name == "phate": + delta = self.data_dict["delta_embedding"] + else: + raise NotImplementedError("rna velocity must use phate") + assert delta.shape[0] == self.ncells + # Ignore mean from embedding + self.velocity = delta / scaler.scale_ + + if max_dim is not None and self.data.shape[1] > max_dim: + print("Warning: Clipping dimensionality to %d" % max_dim) + self.data = self.data[:, :max_dim] + if self.has_velocity() and self.use_velocity: + self.velocity = self.velocity[:, :max_dim] + + def has_velocity(self): + return True + + def known_base_density(self): + return False + + def get_data(self): + return self.data + + def get_times(self): + return self.labels + + def get_unique_times(self): + return np.unique(self.labels) + + def get_velocity(self): + return self.velocity + + def 
get_shape(self): + return [self.data.shape[1]] + + def get_ncells(self): + return self.ncells + + def leaveout_timepoint(self, tp): + """Takes a timepoint label to leaveout Alters data stored in object to leave out all data + associated with that timepoint.""" + if tp < 0: + raise RuntimeError("Cannot leaveout negative timepoint %d." % tp) + mask = self.labels != tp + print("Leaving out %d samples from sample %d" % (np.sum(~mask), tp)) + self.labels = self.labels[mask] + self.data = self.data[mask] + self.velocity = self.velocity[mask] + self.ncells = np.sum(mask) + + def sample_index(self, n, label_subset): + arr = np.arange(self.ncells)[self.labels == label_subset] + return np.random.choice(arr, size=n) + + +class CircleTestDataV3(EBData): + """Implements the curvy tree dataset. + + Has an analytical base density and two timepoints instead of 3. Where the base distribution is + a half-gaussian at theta=0 and the end distribution is a half-gaussian at theta=2*pi. Both + truncated below y=0. this is to experiment with the standard deviation of theta to see if we + can learn a flow along the circle instead of across it. The hope is that the default flow is + across the circle where we can regularize it towards density. + """ + + def __init__(self): + super().__init__() + np.random.seed(42) + n = 5000 + self.r1, self.r2, self.r3 = (0.25, 0.1, 0.1) + self.r1, self.r2, self.r3 = (0.4, 0.1, 0.1) + self.r1, self.r2, self.r3 = (0.5, 0.1, 0.1) + + self.labels = np.repeat(np.arange(2), n) + theta = (self.labels * np.pi / 2) + np.pi / 2 + # theta = (self.labels * np.pi / 4) + np.pi / 2 + theta += np.random.randn(*theta.shape) * self.r1 + # Move set 0 to a weird place for verification + # TODO remove + # theta[self.labels == 0] += np.pi / 2 + theta[self.labels == 0] += np.random.randn(*theta.shape)[self.labels == 0] * 2 + theta[theta < 0] *= -1 + theta[theta > np.pi] = 2 * np.pi - theta[theta > np.pi] + r = (1 + np.random.randn(*theta.shape) * self.r2)[:, None] + r = np.repeat(r, 2, axis=1) + x2d = np.array([np.cos(theta), np.sin(theta)]).T * r + # x2d[self.labels == 1] -= [0.7, 0.0] + # x2d[x2d[:, 1] < 0] *= [1, -1] + self.data = x2d + self.ncells = self.data.shape[0] + + next2d = np.array([np.cos(theta + 0.3), np.sin(theta + 0.3)]).T * r + # next2d += np.random.randn(*next2d.shape) * self.r3 + self.velocity = next2d - x2d + + def base_density(self): + def logprob(z): + # I no longer understand how this function works, but it looks right + r = torch.sqrt(torch.sum(z.pow(2), 1)) + theta = torch.atan2(z[:, 0], -z[:, 1]) + zp1 = (r - 1) / self.r2 + zp2 = theta - np.pi / 2 + # zp2 = (theta - np.pi / 4) + zp2[zp2 > np.pi] -= 2 * np.pi + zp2[zp2 < -np.pi] += 2 * np.pi + zp2 = zp2 / self.r1 + # Find Quadrant + logZ = -0.5 * math.log(2 * math.pi) + z_polar = torch.stack([zp1, zp2], 1) + to_return = torch.sum(logZ - z_polar.pow(2) / 2, 1, keepdim=True) + to_return[zp2 < 0] += 20 * zp2[zp2 < 0][:, None] + # to_return[zp2 >= 0] -= 0 # Multiply in log space? + # to_return[zp2 < 0] += 50 + # to_return[zp2 >= 0] -= 50 # Multiply in log space? 
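+            # Descriptive note on the math above: zp1 = (r - 1) / r2 is the radial
+            # deviation from the unit circle and zp2 is the wrapped angular deviation
+            # from theta = pi / 2, scaled by r1; both are scored as independent
+            # standard normals. The `+= 20 * zp2` correction adds a large negative
+            # log-probability wherever zp2 < 0, exponentially suppressing that half
+            # of the circle so the base distribution behaves like the half-Gaussian
+            # described in the class docstring.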
+ return to_return + + return logprob + + def known_base_density(self): + return True + + def base_sample(self): + def f(*args, **kwargs): + sample = torch.randn(*args, **kwargs) + theta = sample[:, 0] * self.r1 + r = (sample[:, 1] * self.r2 + 1)[:, None] + s = torch.stack([torch.cos(theta), torch.sin(theta)], 1) * r + s[s[:, 1] < 0] *= torch.tensor([1, -1], dtype=torch.float32)[None, :] + return s + + return f + + def has_velocity(self): + return True + + +def interpolate_with_ot(p0, p1, tmap, interp_frac, size): + """Interpolate between p0 and p1 at fraction t_interpolate knowing a transport map from p0 to + p1. + + Parameters + ---------- + p0 : 2-D array + The genes of each cell in the source population + p1 : 2-D array + The genes of each cell in the destination population + tmap : 2-D array + A transport map from p0 to p1 + t_interpolate : float + The fraction at which to interpolate + size : int + The number of cells in the interpolated population + Returns + ------- + p05 : 2-D array + An interpolated population of 'size' cells + """ + p0 = p0.toarray() if scipy.sparse.isspmatrix(p0) else p0 + p1 = p1.toarray() if scipy.sparse.isspmatrix(p1) else p1 + p0 = np.asarray(p0, dtype=np.float64) + p1 = np.asarray(p1, dtype=np.float64) + tmap = np.asarray(tmap, dtype=np.float64) + if p0.shape[1] != p1.shape[1]: + raise ValueError("Unable to interpolate. Number of genes do not match") + if p0.shape[0] != tmap.shape[0] or p1.shape[0] != tmap.shape[1]: + raise ValueError( + "Unable to interpolate. Tmap size is {}, expected {}".format( + tmap.shape, (len(p0), len(p1)) + ) + ) + len1 = len(p0) + len2 = len(p1) + # Assume growth is exponential and retrieve growth rate at t_interpolate + p = tmap / np.power(tmap.sum(axis=0), 1.0 - interp_frac) + p = p.flatten(order="C") + p = p / p.sum() + choices = np.random.choice(len1 * len2, p=p, size=size) + return np.asarray( + [p0[i // len2] * (1 - interp_frac) + p1[i % len2] * interp_frac for i in choices], + dtype=np.float64, + ) + + +class TreeTestData(CircleTestDataV3): + def __init__(self): + np.random.seed(42) + n = 5000 + self.r1, self.r2, self.r3 = (0.5, 0.1, 0.1) + self.labels = np.repeat([0, 2], n) + + data = np.abs(np.random.randn(n * 2) * 0.5 / np.pi) + data[self.labels == 2] = 1 - data[self.labels == 2] + # print(data) + + # McCann interpolant / barycenter interpolation + import ot + + gamma = ot.emd_1d(data[self.labels == 0], data[self.labels == 2]) + ninterp = 5000 + i05 = interpolate_with_ot( + data[self.labels == 0][:, np.newaxis], + data[self.labels == 2][:, np.newaxis], + gamma, + 0.5, + ninterp, + ) + data = np.concatenate([data, i05.flatten()]) + self.labels = np.concatenate([self.labels, np.ones(n)]) + theta = data * np.pi # transform to along the circle + + r = (1 + np.random.randn(*theta.shape) * self.r2)[:, None] + r = np.repeat(r, 2, axis=1) + x2d = np.array([np.cos(theta), np.sin(theta)]).T * r + + mask = np.random.rand(x2d.shape[0]) > 0.5 + mask *= x2d[:, 0] < 0 + x2d[mask] = [[0, 2]] + [[1, -1]] * x2d[mask] + + # x2d[self.labels == 1] -= [0.7, 0.0] + # x2d[x2d[:, 1] < 0] *= [1, -1] + self.data = x2d + self.ncells = self.data.shape[0] + + next2d = np.array([np.cos(theta + 0.3), np.sin(theta + 0.3)]).T * r + next2d[mask] = [[0, 2]] + [[1, -1]] * next2d[mask] + # next2d += np.random.randn(*next2d.shape) * self.r3 + self.velocity = next2d - x2d + + # Mask out timepoint zero + mask = self.labels != 0 + self.labels = self.labels[mask] + self.labels -= 1 + self.data = self.data[mask] + self.velocity = self.velocity[mask] + self.ncells 
= self.labels.shape[0] + + def get_paths(self, n=5000, n_steps=3): + # Only 3 steps are supported at this time. + assert n_steps == 3 + np.random.seed(42) + self.r1, self.r2, self.r3 = (0.5, 0.1, 0.1) + labels = np.repeat([0, 2], n) + + data = np.abs(np.random.randn(n * 2) * 0.5 / np.pi) + data[labels == 2] = 1 - data[labels == 2] + # print(data) + + # McCann interpolant / barycenter interpolation + import ot + + gamma = ot.emd_1d(data[labels == 0], data[labels == 2]) + ninterp = 5000 + i05 = interpolate_with_ot( + data[labels == 0][:, np.newaxis], + data[labels == 2][:, np.newaxis], + gamma, + 0.5, + ninterp, + ) + # data = data.reshape(-1, 2) + data = np.stack([data[labels == 0], i05.flatten(), data[labels == 2]], axis=-1) + + theta = data * np.pi # transform to along the circle + + r = (1 + np.random.randn(n) * self.r2)[:, None, None] + + x2d = np.stack([np.cos(theta), np.sin(theta)], axis=-1) * r + # mask = (r > 1.0) + # TODO these reference paths could be improved to include better routing + # along the manifold. Right now they are calculated using 1d and are just lifted into + # 2d along the same radius. Trouble comes when the branch for the tree gets + # Flipped over y=1, this gives opposite of expected radiuses. + # Furthermore, 2d Transport is no longer the same as 1d when we have gaussian + # Noise along the manifold. + # + # Right now they are good enough for our purposes, and making them better will only + # improve how TrajectoryNet looks. + """ + import optimal_transport.emd as emd + _, log = emd.earth_mover_distance(x2d[:,0], x2d[:,1], return_matrix=True) + print(np.where(log['G'] > 1e-8)) + path = np.stack([x2d[:,0], x2d[np.where(log['G'] > 1e-8)[1],1]]) + path = np.swapaxes(path, 0,1) + import matplotlib.pyplot as plt + #plt.hist(log['G'].flatten()) + fig, axes = plt.subplots(1,2,figsize=(20,10)) + + for p in path[:1000]: + axes[0].plot(p[:,0], p[:,1]) + for p in x2d[:1000,:2]: + axes[1].plot(p[:,0], p[:,1]) + plt.show() + exit() + """ + mask = np.random.rand(*x2d.shape[:2]) > 0.5 + mask *= x2d[:, :, 0] < 0 + x2d[mask] = [[0, 2]] + [[1, -1]] * x2d[mask] + x2d = x2d.reshape(n, n_steps, 2) + return x2d + # Samples x Time x Dimension + # return x2d + + +class CircleTestDataV5(TreeTestData): + """This builds on version 3 to include a better middle timepoint. + + Where instead of being parametrically defined, the middle timepoint is defined in terms of the + interpolant between the first and last timepoints along the manifold. + + This is a useful thing to relate to in terms of transport along the manifold. 
+ """ + + def __init__(self): + np.random.seed(42) + n = 5000 + self.r1, self.r2, self.r3 = (0.5, 0.1, 0.1) + self.labels = np.repeat([0, 2], n) + + data = np.abs(np.random.randn(n * 2) * 0.5 / np.pi) + data[self.labels == 2] = 1 - data[self.labels == 2] + # print(data) + + # McCann interpolant / barycenter interpolation + import ot + + gamma = ot.emd_1d(data[self.labels == 0], data[self.labels == 2]) + ninterp = 5000 + i05 = interpolate_with_ot( + data[self.labels == 0][:, np.newaxis], + data[self.labels == 2][:, np.newaxis], + gamma, + 0.5, + ninterp, + ) + data = np.concatenate([data, i05.flatten()]) + self.labels = np.concatenate([self.labels, np.ones(n)]) + theta = data * np.pi # transform to along the circle + + r = (1 + np.random.randn(*theta.shape) * self.r2)[:, None] + r = np.repeat(r, 2, axis=1) + x2d = np.array([np.cos(theta), np.sin(theta)]).T * r + + ########################## + # ONLY CHANGE FROM ABOVE # + mask = np.random.rand(x2d.shape[0]) > 1.0 + ########################## + + mask *= x2d[:, 0] < 0 + x2d[mask] = [[0, 2]] + [[1, -1]] * x2d[mask] + + # x2d[self.labels == 1] -= [0.7, 0.0] + # x2d[x2d[:, 1] < 0] *= [1, -1] + self.data = x2d + self.ncells = self.data.shape[0] + + next2d = np.array([np.cos(theta + 0.3), np.sin(theta + 0.3)]).T * r + next2d[mask] = [[0, 2]] + [[1, -1]] * next2d[mask] + # next2d += np.random.randn(*next2d.shape) * self.r3 + self.velocity = next2d - x2d + + # Mask out timepoint zero + mask = self.labels != 0 + self.labels = self.labels[mask] + self.labels -= 1 + self.data = self.data[mask] + self.velocity = self.velocity[mask] + self.ncells = self.labels.shape[0] + + def get_paths(self, n=5000, n_steps=3): + # Only 3 steps are supported at this time. + assert n_steps == 3 + np.random.seed(42) + self.r1, self.r2, self.r3 = (0.5, 0.1, 0.1) + labels = np.repeat([0, 2], n) + + data = np.abs(np.random.randn(n * 2) * 0.5 / np.pi) + data[labels == 2] = 1 - data[labels == 2] + # print(data) + + # McCann interpolant / barycenter interpolation + import ot + + gamma = ot.emd_1d(data[labels == 0], data[labels == 2]) + ninterp = 5000 + i05 = interpolate_with_ot( + data[labels == 0][:, np.newaxis], + data[labels == 2][:, np.newaxis], + gamma, + 0.5, + ninterp, + ) + # data = data.reshape(-1, 2) + data = np.stack([data[labels == 0], i05.flatten(), data[labels == 2]], axis=-1) + + theta = data * np.pi # transform to along the circle + + r = (1 + np.random.randn(n) * self.r2)[:, None, None] + + x2d = np.stack([np.cos(theta), np.sin(theta)], axis=-1) * r + return x2d + + +class CycleDataset(TreeTestData): + """The idea here is that the distribution does not change, but there is movement around the + circle over time. + + First we define a rotation speed with a uniform distribution around the circle. + + We generate this by taking a uniform distribution then rotating it 1/4 way around the circle. + + The interpolation is then 1/8 of the way around the circle. We need a new evaluation mechanism + to be able to handle this case, as distribution level, all are approximately zero difference. 
+ """ + + def __init__(self, shift=0.1, r_std=0.1): + np.random.seed(42) + n = 5000 + self.shift = shift + self.r_std = r_std + data = np.random.rand(n) + data = np.concatenate([data, data + shift, data + 2 * shift]) + r = np.tile(np.ones(n) + np.random.randn(n) * self.r_std, 3)[:, np.newaxis] + self.labels = np.repeat(np.arange(2), n) + theta = data * 2 * np.pi + x2d = np.array([np.cos(theta), np.sin(theta)]).T * r + self.data = x2d[n:] + self.old_data = x2d[:n] + next_theta = theta + 2 * np.pi * shift * 0.001 + next2d = np.array([np.cos(next_theta), np.sin(next_theta)]).T * r + self.velocity = ((next2d - x2d) * 1000)[n:] * 2 + self.ncells = 2 * n + + def get_paths(self, n=5000, n_steps=3): + # Only 3 steps are supported at this time. + assert n_steps == 3 + shift = self.shift + np.random.seed(42) + data = np.random.rand(n) + data = np.stack([data, data + shift, data + 2 * shift], axis=0) + r = (np.ones(n) + np.random.randn(n) * self.r_std)[np.newaxis, :, np.newaxis] + theta = data * 2 * np.pi + x2d = np.stack([np.cos(theta), np.sin(theta)], axis=-1) * r + x2d = np.swapaxes(x2d, 0, 1) + # Samples x Time x Dimension + return x2d + + def base_density(self): + # It is OK if this is only proportional to the true distribution + # As long as it is relatively close for scaling purposes + def logprob(z): + r = torch.sqrt(torch.sum(z.pow(2), 1)) + zp1 = (r - 1) / self.r_std + logZ = -0.5 * math.log(2 * math.pi * self.r_std * self.r_std) + to_return = logZ - zp1.pow(2) / 2 + # I don't know why this correction factor works, but it seems to integrate to 1 now. + return (to_return - math.log(2 * np.pi))[:, np.newaxis] + + return logprob + + def known_base_density(self): + return True + + def base_sample(self): + def f(*args, **kwargs): + sample = torch.randn(*args, **kwargs) + sample_uniform = torch.rand(*args, **kwargs) + theta = sample_uniform[:, 0] * 2 * np.pi + r = (sample[:, 0] * self.r_std + 1)[:, None] + s = torch.stack([torch.cos(theta), torch.sin(theta)], 1) * r + return s + + return f + + +class SklearnData(SCData): + def __init__(self, name="moons", n_samples=10000): + import sklearn.datasets + + self.name = name + # From sklearn auto_examples/cluster/plot_cluster_comparison + seed = 42 + np.random.seed(seed) + if name == "circles": + self.data, _ = sklearn.datasets.make_circles( + n_samples=n_samples, factor=0.5, noise=0.05, random_state=seed + ) + self.data *= 3.5 + elif name == "moons": + self.data, _ = sklearn.datasets.make_moons( + n_samples=n_samples, noise=0.05, random_state=seed + ) + self.data *= 2 + self.data[:, 0] -= 1 + elif name == "blobs": + self.data, _ = sklearn.datasets.make_blobs(n_samples=n_samples) + elif name == "scurve": + self.data, _ = sklearn.datasets.make_s_curve( + n_samples=n_samples, noise=0.05, random_state=seed + ) + self.data = np.vstack([self.data[:, 0], self.data[:, 2]]).T + self.data *= 1.5 + else: + raise NotImplementedError("Unknown dataset name %s" % name) + + def get_times(self): + return np.repeat([0], self.data.shape[0]) + + def get_unique_times(self): + return [0] + + def has_velocity(self): + return False + + def known_base_density(self): + return True + + def get_data(self): + return self.data + + def get_shape(self): + return [self.data.shape[1]] + + def get_ncells(self): + return self.data.shape[0] + + def base_density(self): + def standard_normal_logprob(z): + logZ = -0.5 * math.log(2 * math.pi) + return torch.sum(logZ - z.pow(2) / 2, 1, keepdim=True) + + return standard_normal_logprob + + def base_sample(self): + return torch.randn + + def 
sample_index(self, n, label_subset): + arr = np.arange(self.get_ncells())[self.get_times() == label_subset] + return np.random.choice(arr, size=n) diff --git a/conditional-flow-matching/runner/src/datamodules/components/two_dim.py b/conditional-flow-matching/runner/src/datamodules/components/two_dim.py new file mode 100644 index 0000000000000000000000000000000000000000..401e122337ca550461ca1865555dfc50d48dc62f --- /dev/null +++ b/conditional-flow-matching/runner/src/datamodules/components/two_dim.py @@ -0,0 +1,104 @@ +# Adapted from From DSB +# https://github.com/JTT94/diffusion_schrodinger_bridge/blob/main/bridge/data/two_dim.py +import numpy as np +import torch +from sklearn import datasets +from torch.utils.data import TensorDataset + +# checker/pinwheel/8gaussians can be found at +# https://github.com/rtqichen/ffjord/blob/994864ad0517db3549717c25170f9b71e96788b1/lib/toy_data.py#L8 + + +def data_distrib(npar, data, random_state=42): + np.random.seed(random_state) + + if data == "mixture": + init_sample = torch.randn(npar, 2) + p = init_sample.shape[0] // 2 + init_sample[:p, 0] = init_sample[:p, 0] - 7.0 + init_sample[p:, 0] = init_sample[p:, 0] + 7.0 + + if data == "scurve": + X, y = datasets.make_s_curve(n_samples=npar, noise=0.1, random_state=None) + init_sample = torch.tensor(X)[:, [0, 2]] + scaling_factor = 7 + init_sample = (init_sample - init_sample.mean()) / init_sample.std() * scaling_factor + + if data == "swiss": + X, y = datasets.make_swiss_roll(n_samples=npar, noise=0.1, random_state=None) + init_sample = torch.tensor(X)[:, [0, 2]] + scaling_factor = 7 + init_sample = (init_sample - init_sample.mean()) / init_sample.std() * scaling_factor + + if data == "moon": + X, y = datasets.make_moons(n_samples=npar, noise=0.1, random_state=None) + scaling_factor = 7.0 + init_sample = torch.tensor(X) + init_sample = (init_sample - init_sample.mean()) / init_sample.std() * scaling_factor + + if data == "circle": + X, y = datasets.make_circles(n_samples=npar, noise=0.0, random_state=None, factor=0.5) + init_sample = torch.tensor(X) * 10 + + if data == "checker": + x1 = np.random.rand(npar) * 4 - 2 + x2_ = np.random.rand(npar) - np.random.randint(0, 2, npar) * 2 + x2 = x2_ + (np.floor(x1) % 2) + x = np.concatenate([x1[:, None], x2[:, None]], 1) * 7.5 + init_sample = torch.from_numpy(x) + + if data == "pinwheel": + radial_std = 0.3 + tangential_std = 0.1 + num_classes = 5 + num_per_class = npar // 5 + rate = 0.25 + rads = np.linspace(0, 2 * np.pi, num_classes, endpoint=False) + + features = np.random.randn(num_classes * num_per_class, 2) * np.array( + [radial_std, tangential_std] + ) + features[:, 0] += 1.0 + labels = np.repeat(np.arange(num_classes), num_per_class) + + angles = rads[labels] + rate * np.exp(features[:, 0]) + rotations = np.stack([np.cos(angles), -np.sin(angles), np.sin(angles), np.cos(angles)]) + rotations = np.reshape(rotations.T, (-1, 2, 2)) + x = 7.5 * np.random.permutation(np.einsum("ti,tij->tj", features, rotations)) + init_sample = torch.from_numpy(x) + + if data == "8gaussians": + scale = 4.0 + centers = [ + (1, 0), + (-1, 0), + (0, 1), + (0, -1), + (1.0 / np.sqrt(2), 1.0 / np.sqrt(2)), + (1.0 / np.sqrt(2), -1.0 / np.sqrt(2)), + (-1.0 / np.sqrt(2), 1.0 / np.sqrt(2)), + (-1.0 / np.sqrt(2), -1.0 / np.sqrt(2)), + ] + centers = [(scale * x, scale * y) for x, y in centers] + + dataset = [] + for i in range(npar): + point = np.random.randn(2) * 0.5 + idx = np.random.randint(8) + center = centers[idx] + point[0] += center[0] + point[1] += center[1] + dataset.append(point) + 
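# A sketch of the sampler above: each point picks one of the 8 equally
+        # spaced centers uniformly at random and adds isotropic N(0, 0.5^2)
+        # noise, i.e. an 8-component Gaussian mixture (centers scaled by `scale`).
+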
dataset = np.array(dataset, dtype="float32")
+            dataset *= 3
+            init_sample = torch.from_numpy(dataset)
+
+    init_sample = init_sample.float()
+
+    return init_sample
+
+
+def two_dim_ds(npar, data_tag):
+    init_sample = data_distrib(npar, data_tag)
+    init_ds = TensorDataset(init_sample)
+    return init_ds
diff --git a/conditional-flow-matching/runner/src/models/components/__init__.py b/conditional-flow-matching/runner/src/models/components/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/conditional-flow-matching/runner/src/models/components/augmentation.py b/conditional-flow-matching/runner/src/models/components/augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..48ed5dfda25bf56000fd84f95ebe1ca748f6f633
--- /dev/null
+++ b/conditional-flow-matching/runner/src/models/components/augmentation.py
@@ -0,0 +1,359 @@
+import torch
+from torch import nn
+
+
+class Regularizer(nn.Module):
+    def __init__(self):
+        super().__init__()  # nn.Module subclasses must initialize the base class
+
+
+def _batch_root_mean_squared(tensor):
+    tensor = tensor.view(tensor.shape[0], -1)
+    return torch.norm(tensor, p=2, dim=1) / tensor.shape[1] ** 0.5
+
+
+class RegularizationFunc(nn.Module):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        """Outputs a batch of scalar regularizations."""
+        raise NotImplementedError
+
+
+class L1Reg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        return torch.mean(torch.abs(dx), dim=1)
+
+
+class L2Reg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        return _batch_root_mean_squared(dx)
+
+
+class SquaredL2Reg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        to_return = dx.view(dx.shape[0], -1)
+        return torch.pow(torch.norm(to_return, p=2, dim=1), 2)
+
+
+def _get_minibatch_jacobian(y, x, create_graph=True):
+    """Computes the Jacobian of y wrt x assuming minibatch-mode.
+
+    Args:
+        y: (N, ...) with a total of D_y elements in ...
+        x: (N, ...) with a total of D_x elements in ...
+    Returns:
+        The minibatch Jacobian matrix of shape (N, D_y, D_x)
+    """
+    # assert y.shape[0] == x.shape[0]
+    y = y.view(y.shape[0], -1)
+
+    # Compute Jacobian row by row.
+    jac = []
+    for j in range(y.shape[1]):
+        dy_j_dx = torch.autograd.grad(
+            y[:, j],
+            x,
+            torch.ones_like(y[:, j]),
+            retain_graph=True,
+            create_graph=create_graph,
+        )[0]
+        jac.append(torch.unsqueeze(dy_j_dx, -1))
+    jac = torch.cat(jac, -1)
+    return jac
+
+
+class JacobianFrobeniusReg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        # Reuse a Jacobian cached on the shared context if another regularizer
+        # already computed it; an unconditional recompute here defeats the cache.
+        if hasattr(context, "jac"):
+            jac = context.jac
+        else:
+            jac = _get_minibatch_jacobian(dx, x)
+            context.jac = jac
+        return _batch_root_mean_squared(jac)
+
+
+class JacobianDiagFrobeniusReg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        if hasattr(context, "jac"):
+            jac = context.jac
+        else:
+            jac = _get_minibatch_jacobian(dx, x)
+            context.jac = jac
+        diagonal = jac.view(jac.shape[0], -1)[
+            :, :: jac.shape[1] + 1
+        ]  # assumes jac is minibatch square, i.e. (N, M, M); stride M + 1 walks the diagonal
+        return _batch_root_mean_squared(diagonal)
+
+
+class JacobianOffDiagFrobeniusReg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        if hasattr(context, "jac"):
+            jac = context.jac
+        else:
+            jac = _get_minibatch_jacobian(dx, x)
+            context.jac = jac
+        diagonal = jac.view(jac.shape[0], -1)[
+            :, :: jac.shape[1] + 1
+        ]  # assumes jac is minibatch square, i.e. (N, M, M); stride M + 1 walks the diagonal
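+        # Mean squared off-diagonal entry: total sum of squares minus the
+        # diagonal's sum of squares, normalized by the M * (M - 1) off-diagonal count.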
+        ss_offdiag = torch.sum(jac.view(jac.shape[0], -1) ** 2, dim=1) - torch.sum(
+            diagonal**2, dim=1
+        )
+        ms_offdiag = ss_offdiag / (diagonal.shape[1] * (diagonal.shape[1] - 1))
+        return ms_offdiag
+
+
+def autograd_trace(x_out, x_in, **kwargs):
+    """Standard brute-force means of obtaining trace of the Jacobian, O(d) calls to autograd."""
+    trJ = 0.0
+    for i in range(x_in.shape[1]):
+        trJ += torch.autograd.grad(x_out[:, i].sum(), x_in, allow_unused=False, create_graph=True)[
+            0
+        ][:, i]
+    return trJ
+
+
+def hutch_trace(x_out, x_in, noise=None, **kwargs):
+    """Hutchinson's trace Jacobian estimator, O(1) call to autograd."""
+    if noise is None:
+        # only draw fresh noise when the caller did not supply any; previously the
+        # `noise` argument was unconditionally overwritten
+        noise = torch.randn_like(x_in)
+    jvp = torch.autograd.grad(x_out, x_in, noise, create_graph=True)[0]
+    trJ = torch.einsum("bi,bi->b", jvp, noise)
+    return trJ
+
+
+class CNFReg(RegularizationFunc):
+    def __init__(self, trace_estimator=None, noise_dist=None):
+        super().__init__()
+        self.trace_estimator = autograd_trace
+        if trace_estimator == "hutch":
+            self.trace_estimator = hutch_trace
+        self.noise_dist, self.noise = noise_dist, None
+
+    def forward(self, t, x, dx, context):
+        # TODO we could check if jac is in the context to speed up
+        # The zero term keeps x connected to the autograd graph while leaving the
+        # per-sample scalar unchanged (a bare `0 * x` would broadcast to (N, d)).
+        return -self.trace_estimator(dx, x) + 0.0 * x.reshape(x.shape[0], -1).sum(dim=1)
+
+
+class AugmentationModule(nn.Module):
+    """Class orchestrating augmentations.
+
+    Also establishes order.
+    """
+
+    def __init__(
+        self,
+        cnf_estimator: str = None,
+        l1_reg: float = 0.0,
+        l2_reg: float = 0.0,
+        squared_l2_reg: float = 0.0,
+        jacobian_frobenius_reg: float = 0.0,
+        jacobian_diag_frobenius_reg: float = 0.0,
+        jacobian_off_diag_frobenius_reg: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.cnf_estimator = cnf_estimator
+        names = []
+        coeffs = []
+        regs = []
+        if cnf_estimator == "exact":
+            names.append("log_prob")
+            coeffs.append(1)
+            regs.append(CNFReg(None, noise_dist=None))
+        if l1_reg > 0.0:
+            names.append("L1")
+            coeffs.append(l1_reg)
+            regs.append(L1Reg())
+        if l2_reg > 0.0:
+            names.append("L2")
+            coeffs.append(l2_reg)
+            regs.append(L2Reg())
+        if squared_l2_reg > 0.0:
+            names.append("squared_L2")
+            coeffs.append(squared_l2_reg)
+            regs.append(SquaredL2Reg())
+        if jacobian_frobenius_reg > 0.0:
+            names.append("jacobian_frobenius")
+            coeffs.append(jacobian_frobenius_reg)
+            regs.append(JacobianFrobeniusReg())
+        if jacobian_diag_frobenius_reg > 0.0:
+            names.append("jacobian_diag_frobenius")
+            coeffs.append(jacobian_diag_frobenius_reg)
+            regs.append(JacobianDiagFrobeniusReg())
+        if jacobian_off_diag_frobenius_reg > 0.0:
+            names.append("jacobian_off_diag_frobenius")
+            coeffs.append(jacobian_off_diag_frobenius_reg)
+            regs.append(JacobianOffDiagFrobeniusReg())
+        self.names = names
+        self.coeffs = torch.tensor(coeffs)
+        self.regs = torch.nn.ModuleList(regs)
+        assert len(self.coeffs) == len(self.regs)
+        self.aug_dims = len(self.coeffs)
+        self.augmenter = Augmenter(augment_idx=1, augment_dims=self.aug_dims)
+
+    def forward(self, x):
+        """Separates and adds together losses."""
+        # if x.dim() > 2:
+        #     augmentation is broken, return regs = 0 for now
+        #     reg = torch.zeros(1).type_as(x)
+        #     return reg, x
+        if self.cnf_estimator is None:
+            if self.aug_dims == 0:
+                reg = torch.zeros(1).type_as(x)
+            else:
+                aug, x = x[:, : self.aug_dims], x[:, self.aug_dims :]
+                reg = aug * self.coeffs
+            return reg, x
+        delta_logprob, aug, x = x[:, :1], x[:, 1 : self.aug_dims], x[:, self.aug_dims :]
+        reg = aug * self.coeffs[1:].to(aug)
+        if self.aug_dims == 1:
+            reg = torch.zeros(1).type_as(x)
+        return delta_logprob, reg, x
+
+
+class Augmenter(nn.Module):
+    """Augmentation class.
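+
+    Example (sketch): with `augment_idx=1`, `augment_dims=1` and order="first",
+    a batch x of shape [N, d] is returned as ([N, 1 + d], ts), with a zero
+    column prepended to accumulate the divergence / regularization state.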
+
+    Can handle several types of augmentation strategies for Neural DEs.
+    :param augment_dims: number of augmented dimensions to initialize
+    :type augment_dims: int
+    :param augment_idx: index of dimension to augment
+    :type augment_idx: int
+    :param augment_func: nn.Module applied to the input data of dimension `d` to determine the
+        augmented initial condition of dimension `d + a`. `a` is defined implicitly in
+        `augment_func` e.g. augment_func=nn.Linear(2, 5) augments a 2-dimensional input with 5
+        additional dimensions.
+    :type augment_func: nn.Module
+    :param order: whether to augment before the data [augmentation, x] or after [x, augmentation]
+        along dimension `augment_idx`. Options: ('first', 'last')
+    :type order: str
+    """
+
+    def __init__(
+        self,
+        augment_idx: int = 1,
+        augment_dims: int = 5,
+        augment_func=None,
+        order="first",
+    ):
+        super().__init__()
+        self.augment_dims, self.augment_idx, self.augment_func = (
+            augment_dims,
+            augment_idx,
+            augment_func,
+        )
+        self.order = order
+
+    def forward(self, x: torch.Tensor, ts: torch.Tensor):
+        if not self.augment_func:
+            x = x.reshape(x.shape[0], -1)
+            new_dims = list(x.shape)
+            new_dims[self.augment_idx] = self.augment_dims
+
+            # if-else check for augmentation order
+            if self.order == "first":
+                x = torch.cat([torch.zeros(new_dims).to(x), x], self.augment_idx)
+            else:
+                x = torch.cat([x, torch.zeros(new_dims).to(x)], self.augment_idx)
+        else:
+            # if-else check for augmentation order
+            if self.order == "first":
+                x = torch.cat([self.augment_func(x).to(x), x], self.augment_idx)
+            else:
+                x = torch.cat([x, self.augment_func(x).to(x)], self.augment_idx)
+        return x, ts
+
+
+class AugmentedVectorField(nn.Module):
+    """NeuralODE but augmented state.
+
+    Prepends augmentations to the state for easy integration over time.
+    """
+
+    def __init__(self, net, augmentation_list: nn.ModuleList, dim):
+        super().__init__()
+        self.net = net
+        self.dim = dim
+        self.augmentation_list = augmentation_list
+
+    def forward(self, t, state, augmented_input=True, *args, **kwargs):
+        n_aug = len(self.augmentation_list)
+
+        class SharedContext:
+            pass
+
+        with torch.set_grad_enabled(True):
+            # first dimensions reserved for augmentations
+            x = state
+            if augmented_input:
+                x = x[:, n_aug:].requires_grad_(True)
+
+            # the neural network will handle the data-dynamics here
+            if isinstance(self.dim, int):
+                dx = self.net(t, x.reshape(-1, self.dim))
+            else:
+                dx = self.net(t, x.reshape(-1, *self.dim))
+            if n_aug == 0:
+                return dx
+            dx = dx.reshape(dx.shape[0], -1)
+            # x_out = x_out.squeeze(dim=1)
+
+            augs = [aug_fn(t, x, dx, SharedContext) for aug_fn in self.augmentation_list]
+            augs = torch.stack(augs, dim=1)
+            # `+ 0*state` has the only purpose of connecting state[:, 0] to autograd graph
+            return torch.cat([augs, dx], 1) + (0 * state if augmented_input else 0)
+
+
+class CNF(AugmentedVectorField):
+    def __init__(self, net, dim, trace_estimator=None, noise_dist=None):
+        # `dim` is required by AugmentedVectorField; it was previously missing
+        # here, so constructing a CNF raised a TypeError.
+        cnf_reg = CNFReg(trace_estimator, noise_dist)
+        super().__init__(net, [cnf_reg], dim)
+
+
+class Old_CNF(nn.Module):
+    def __init__(self, net, trace_estimator=None, noise_dist=None):
+        super().__init__()
+        self.net = net
+        self.trace_estimator = trace_estimator if trace_estimator is not None else autograd_trace
+        self.noise_dist, self.noise = noise_dist, None
+
+    def forward(self, t, x):
+        with torch.set_grad_enabled(True):
+            x_in = x[:, 1:].requires_grad_(
+                True
+            )  # first dimension reserved to divergence propagation
+            # the neural network will handle the data-dynamics here
+            x_out = self.net(t,
x_in) + x_out = x_out.squeeze(dim=1) + trJ = self.trace_estimator(x_out, x_in, noise=self.noise) + return ( + torch.cat([-trJ[:, None], x_out], 1) + 0 * x + ) # `+ 0*x` has the only purpose of connecting x[:, 0] to autograd graph + + +class Sequential(nn.Sequential): + """A sequential module which handles multiple inputs.""" + + def forward(self, *input): + for module in self._modules.values(): + input = module(*input) + return input + + +if __name__ == "__main__": + # Test Shapes + class SharedContext: + pass + + for reg in [ + L1Reg, + L2Reg, + SquaredL2Reg, + JacobianFrobeniusReg, + JacobianDiagFrobeniusReg, + JacobianOffDiagFrobeniusReg, + ]: + x = torch.ones(2, 3).requires_grad_(True) + dx = x * 2 + out = reg().forward(torch.ones(1), x, dx, SharedContext) + assert out.dim() == 1 + assert out.shape[0] == 2 diff --git a/conditional-flow-matching/runner/src/models/components/base.py b/conditional-flow-matching/runner/src/models/components/base.py new file mode 100644 index 0000000000000000000000000000000000000000..95ec7a4367a5e67f48c03368bfb66e0e5eb67ef9 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/base.py @@ -0,0 +1,681 @@ +"""Implements dynamics models that support interventions on a known and prespecified set of +targets.""" + +import functools +import math + +import numpy as np +import torch +from torch import Tensor, nn +from torch.nn import functional as F +from torch.nn.parameter import Parameter + + +class LocallyConnected(nn.Module): + """Local linear layer, i.e. Conv1dLocal() with filter size 1. + + Args: + num_linear: num of local linear layers, i.e. + in_features: m1 + out_features: m2 + bias: whether to include bias or not + + Shape: + - Input: [n, d, m1] + - Output: [n, d, m2] + + Attributes: + weight: [d, m1, m2] + bias: [d, m2] + """ + + def __init__(self, num_linear, input_features, output_features, bias=True): + super().__init__() + self.num_linear = num_linear + self.input_features = input_features + self.output_features = output_features + + self.weight = nn.Parameter(torch.Tensor(num_linear, input_features, output_features)) + if bias: + self.bias = nn.Parameter(torch.Tensor(num_linear, output_features)) + else: + # You should always register all possible parameters, but the + # optional ones can be None if you want. + self.register_parameter("bias", None) + + self.reset_parameters() + + @torch.no_grad() + def reset_parameters(self): + k = 1.0 / self.input_features + bound = math.sqrt(k) + nn.init.uniform_(self.weight, -bound, bound) + if self.bias is not None: + nn.init.uniform_(self.bias, -bound, bound) + + def forward(self, input: torch.Tensor): + # [n, d, 1, m2] = [n, d, 1, m1] @ [1, d, m1, m2] + out = torch.matmul(input.unsqueeze(dim=2), self.weight.unsqueeze(dim=0)) + out = out.squeeze(dim=2) + if self.bias is not None: + # [n, d, m2] += [d, m2] + out += self.bias + return out + + def extra_repr(self): + # (Optional)Set the extra information about this module. You can test + # it by printing an object of this class. + return "num_linear={}, in_features={}, out_features={}, bias={}".format( + self.num_linear, + self.input_features, + self.output_features, + self.bias is not None, + ) + + +class Intervenable(nn.Module): + """Models implementing intervenable are useful for learning in the experimental setting. + + This should represent interventions on a preexisting set of possible targets. 
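+
+    Typical use (sketch): a subclass such as MLP below zeroes row j of its
+    output after `set_target(j)`, emulating a perfect intervention on node j;
+    setting `current_target` back to None restores the observational dynamics.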
+    """
+
+    def __init__(self, targets=None):
+        super().__init__()
+        self.targets = targets
+        self.current_target = None
+
+    # def do(self, target, value=0.0):
+    #     raise NotImplementedError
+
+    def get_linear_structure(self):
+        """Gets the linear approximation of the structure coefficients.
+
+        May not be applicable for all models.
+        """
+        raise NotImplementedError
+
+    def get_structure(self) -> np.ndarray:
+        """Extracts a single summary structure from the model."""
+        raise NotImplementedError
+
+    def get_structures(self, n_structures: int) -> np.ndarray:
+        """Some models can provide empirical distributions over structures; this function samples a
+        number of structures from the model."""
+        raise NotImplementedError
+
+    def set_target(self, target):
+        if self.targets is not None and not np.isin(target, self.targets):
+            raise ValueError(f"Bad target selected {target}")  # the f-prefix was missing
+        self.current_target = target
+
+    def l1_reg(self):
+        raise NotImplementedError
+
+    def l2_reg(self):
+        raise NotImplementedError
+
+
+def _parse_activation(activation):
+    if activation == "softplus":
+        return nn.Softplus
+    if activation == "sigmoid":
+        return nn.Sigmoid
+    if activation == "tanh":
+        return nn.Tanh
+    if activation == "relu":
+        return nn.ReLU
+    if activation == "lrelu":
+        return nn.LeakyReLU
+    if activation == "elu":
+        return nn.ELU
+    if isinstance(activation, type) and issubclass(activation, nn.Module):
+        return activation
+    raise ValueError(f"Unrecognized activation function {activation}")
+
+
+class MLP(Intervenable):
+    """Basic MLP drift that supports perfect interventions; the key piece is that n_inputs always
+    equals n_outputs."""
+
+    def __init__(
+        self,
+        n_inputs,
+        n_layers=3,
+        n_hidden=64,
+        activation="softplus",
+        time_invariant=True,
+    ):
+        super().__init__()
+        self.n_layers = n_layers
+        self.n_inputs = n_inputs
+        self.n_hidden = n_hidden
+        self.activation = _parse_activation(activation)
+        self.time_invariant = time_invariant
+        self.model = nn.Sequential(
+            nn.Linear(n_inputs, n_hidden),
+            self.activation(),
+            # build fresh hidden blocks; list multiplication would alias one module
+            *[
+                layer
+                for _ in range(self.n_layers - 2)
+                for layer in (nn.Linear(n_hidden, n_hidden), self.activation())
+            ],
+            nn.Linear(n_hidden, n_inputs),
+        )
+
+    def _get_linear_weights(self, absolute_weights=False):
+        """Pretend the network is linear and find that weight matrix.
+
+        This is used in Aliee et al.
+        """
+        m = self.model
+        weights = [m[2 * i].weight for i in range(self.n_layers)[::-1]]
+        if absolute_weights:
+            weights = [torch.abs(w) for w in weights]
+        return functools.reduce(lambda x, y: x @ y, weights)
+
+    def get_linear_structure(self):
+        return self._get_linear_weights().cpu().detach().numpy()
+
+    def get_structure(self):
+        """Score based on the absolute value of the coefficient."""
+        # pretend there are no non-linearities?
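+        # The reduced product W_L @ ... @ W_1 is the Jacobian the network would
+        # have if every activation were the identity; |entry (i, j)| is then used
+        # as an edge score from input j to output i.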
+        weight_matrix = self._get_linear_weights()
+        return np.abs(weight_matrix.cpu().detach().numpy())
+
+    def l1_reg(self, absolute_weights=False):
+        weights = self._get_linear_weights(absolute_weights)
+        return torch.mean(torch.abs(weights))
+
+    def l2_reg(self, absolute_weights=False):
+        weights = self._get_linear_weights(absolute_weights)
+        return torch.mean(torch.pow(weights, 2))  # mean squared entry; abs here duplicated l1_reg
+
+    def grn_reg(self, grn, absolute_weights=False):
+        weights = self._get_linear_weights(absolute_weights)
+        return torch.sum(torch.abs(weights * (1 - grn)))
+
+    def forward(self, t, x, target=None):
+        if not self.time_invariant:
+            x = torch.cat((x, t), dim=-1)
+        out = self.model(x)
+        if target is not None:
+            # out [Batch x [Dynamics, Conditions]]
+            out[:, target] = 0.0
+        if self.current_target is not None:
+            out[:, self.current_target] = 0.0
+        return out
+
+
+class DeepEnsDibsLayer(nn.Module):
+    """
+    DEPRECATED:
+    Known bugs: the linear map is not correct for (x, g), and the bias term
+    should be removed.
+    Deep Ensemble for a distribution over function space,
+    incorporating the DiBS framework for distributions over
+    graphs + functions (vs. graphs + parameters).
+    """
+
+    def __init__(self, n_inputs, n_outputs, k_hidden, dropout_flag=False, bias=True):
+        super().__init__()
+        self.n_inputs = n_inputs
+        self.n_outputs = n_outputs
+        self.k_hidden = k_hidden
+        self.sample_once_flag = True
+        self.with_dropout = False
+
+        # define network weights
+        self.w = Parameter(torch.empty((self.k_hidden, self.n_inputs)))
+        self.v = Parameter(torch.empty((self.k_hidden, self.n_outputs)))
+
+        self.weight = torch.empty((self.n_outputs, self.n_inputs), requires_grad=False)
+
+        # define network biases
+        if bias:
+            self.bias = Parameter(torch.empty(self.n_outputs))
+
+            if dropout_flag:
+                self.p_bias = Parameter(torch.empty(self.n_outputs))
+                self.sampled_p_bias = Parameter(torch.empty(self.n_outputs))
+        else:
+            self.bias = None
+
+        # define dropout and dropout parameters
+        if dropout_flag:
+            self.with_dropout = dropout_flag
+            self.p_weight = Parameter(torch.empty((self.n_outputs, self.n_inputs)))
+            self.sampled_p_weight = Parameter(torch.empty((self.n_outputs, self.n_inputs)))
+
+        self.reset_parameters()
+
+    def update_p(self):
+        with torch.no_grad():
+            self.p_weight.copy_(self.sampled_p_weight.detach().clone())
+            self.p_bias.copy_(self.sampled_p_bias.detach().clone())
+
+    def get_structure(self, alpha=1.0):
+        Z = torch.matmul(self.w.t(), self.v).t()
+        G = torch.sigmoid(alpha * Z)
+        return G
+
+    def forward(self, input, alpha=1.0):
+        if self.with_dropout:
+            if self.sample_once_flag:
+                self.p = torch.sigmoid(F.linear(input, self.p_weight, self.p_bias))
+                self.mask = torch.bernoulli(1 - self.p)
+                self.sample_once_flag = False
+                with torch.no_grad():
+                    self.sampled_p_weight.copy_(self.p_weight.detach().clone())
+                    self.sampled_p_bias.copy_(self.p_bias.detach().clone())
+            else:
+                raise NotImplementedError()
+        G = self.get_structure(alpha)
+        out = F.linear(input, G, self.bias)
+        if self.with_dropout:
+            out = self.mask * out
+        return out, G
+
+    def reset_parameters(self):
+        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
+        # uniform(-1/sqrt(in_features), 1/sqrt(in_features)).
For details, see + # https://github.com/pytorch/pytorch/issues/57109 + if self.with_dropout: + torch.nn.init.constant_(self.p_weight, -0.5) + # torch.nn.init.kaiming_uniform_(self.p_weight, a=math.sqrt(1)) + torch.nn.init.kaiming_uniform_(self.w, a=math.sqrt(5)) + torch.nn.init.kaiming_uniform_(self.v, a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + torch.nn.init.uniform_(self.bias, -bound, bound) + if self.with_dropout: + fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.p_weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + torch.nn.init.uniform_(self.p_bias, -bound, bound) + + +class BayesLinear(nn.Module): + """A Bayes linear layer for SVI-MLP (Bayes-by-backprop MLP). + + Based off code by Wilson and Izmailov 2020 'Bayesion deep learning and a probabilistic + perspective'. + """ + + def __init__(self, n_inputs, n_outputs, init_log_var, bias=True, sparse=False): + super().__init__() + self.n_inputs = n_inputs + self.n_outputs = n_outputs + self.init_log_var = init_log_var + self.sparse = sparse + self.sample_once_flag = True + self.pre_train = True + self.count = 0 + + self.weight = Parameter(torch.empty((self.n_outputs, self.n_inputs))) + self.weight_isp_std = Parameter(torch.empty((self.n_outputs, self.n_inputs))) + self.sampled_weights = Parameter(torch.empty((self.n_outputs, self.n_inputs))) + if bias: + self.bias_mean = Parameter(torch.empty(self.n_outputs)) + self.bias_isp_std = Parameter(torch.empty(self.n_outputs)) + self.sampled_biases = Parameter(torch.empty(self.n_outputs)) + self.with_bias = True + else: + self.with_bias = False + + self.reset_parameters() + self.eps = 1e-8 + + def sample_weights(self, n_samples: int = 0) -> Tensor: + if n_samples > 0: + w_sigma = F.softplus(self.weight_isp_std) + self.eps + w = self.weight + torch.randn((n_samples, *self.weight.shape)) * w_sigma + else: + w_sigma = F.softplus(self.weight_isp_std) + self.eps + w = self.weight + torch.randn_like(self.weight) * w_sigma + return w + + def sample(self): + w_sigma = F.softplus(self.weight_isp_std) + self.eps + if self.sparse: + e = 0.5 - torch.rand_like(self.weight) + w = self.weight - torch.sign(e) * w_sigma * torch.log(1.0 - 2.0 * torch.abs(e)) + else: + w = self.weight + torch.randn_like(self.weight) * w_sigma + if self.with_bias: + b_sigma = F.softplus(self.bias_isp_std) + self.eps + if self.sparse: + e = 0.5 - torch.rand_like(self.bias_mean) + b = self.bias_mean - torch.sign(e) * b_sigma * torch.log(1.0 - 2.0 * torch.abs(e)) + else: + b = self.bias_mean + torch.randn_like(self.bias_mean) * b_sigma + else: + b = None + return w, b + + def update_means(self): + with torch.no_grad(): + self.weight.copy_(self.sampled_weights.detach().clone()) + self.bias_mean.copy_(self.sampled_biases.detach().clone()) + + def forward(self, input: Tensor): + if self.pre_train: + return F.linear(input, self.sampled_weights, self.sampled_biases) + else: + if self.sample_once_flag: + w, b = self.sample() + with torch.no_grad(): + self.sampled_weights.copy_(w.detach().clone()) + self.sampled_biases.copy_(b.detach().clone()) + if self.count >= 1: + self.sample_once_flag = False + return F.linear(input, w, b) + else: + self.count += 1 + return F.linear(input, w, b) + else: + return F.linear(input, self.sampled_weights, self.sampled_biases) + + def reset_parameters(self): + torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + 
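# Means get the standard Kaiming fan-in initialization; the inverse-softplus
+        # std tensors are filled with init_log_var below, so the initial posterior
+        # std is roughly softplus(init_log_var).
+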
torch.nn.init.kaiming_uniform_(self.sampled_weights, a=math.sqrt(5)) + self.weight_isp_std.data.fill_(self.init_log_var) + # torch.nn.init.normal_(self.weight_isp_std, mean=self.init_log_var, std=0.1) + if self.with_bias: + fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) + torch.nn.init.uniform_(self.bias_mean, -bound, bound) + torch.nn.init.uniform_(self.sampled_biases, -bound, bound) + self.bias_isp_std.data.fill_(self.init_log_var) + # torch.nn.init.normal_(self.bias_isp_std, mean=self.init_log_var, std=0.1) + print( + "\ncheck init w_mean:", + self.weight, + "\ncheck init w_std:", + F.softplus(self.weight_isp_std), + ) + + def _get_kl(self, param_mean, param_isp_std, prior_log_sigma): + sigma = F.softplus(param_isp_std) + self.eps + if self.sparse: + kl = torch.sum(sigma * torch.exp(-torch.abs(param_mean) / sigma)) + kl = torch.sum((kl + torch.abs(param_mean)) / math.exp(prior_log_sigma)) + kl += prior_log_sigma - torch.sum(torch.log(sigma)) + else: + kl = torch.sum( + prior_log_sigma + - torch.log(sigma) + + 0.5 * (sigma**2) / (math.exp(prior_log_sigma * 2)) + ) + # kl += 0.5 * torch.sum(param_mean ** 2) / math.exp(prior_log_sigma * 2) + return kl + + def kl_with_prior(self, prior_log_sigma, t=1): + w_kl = self._get_kl(self.weight, self.weight_isp_std, prior_log_sigma) + if self.with_bias: + b_kl = self._get_kl(self.bias_mean, self.bias_isp_std, prior_log_sigma) + return w_kl + b_kl + else: + return w_kl + + +class DibsLayer(nn.Module): + def __init__( + self, + n_inputs, + n_outputs, + k_hidden, + init_log_var, + alpha=0.1, + beta=0.5, + bias=True, + ): + super().__init__() + self.n_inputs = n_inputs + self.n_outputs = n_outputs + self.k_hidden = k_hidden + self.init_log_var = init_log_var + self.alpha = alpha + self.beta = beta + self.sample_once_flag = True + self.pre_train = True + self.iter_num = 0 + self.count = 0 + + self.w = Parameter(torch.empty((self.k_hidden, self.n_inputs))) + self.w_isp_std = Parameter(torch.empty((self.k_hidden, self.n_inputs))) + self.sampled_w = Parameter(torch.empty((self.k_hidden, self.n_inputs))) + + self.v = Parameter(torch.empty((self.k_hidden, self.n_outputs))) + self.v_isp_std = Parameter(torch.empty((self.k_hidden, self.n_outputs))) + self.sampled_v = Parameter(torch.empty((self.k_hidden, self.n_outputs))) + + if bias: + self.b = Parameter(torch.empty(self.n_outputs)) + self.b_isp_std = Parameter(torch.empty(self.n_outputs)) + self.sampled_b = Parameter(torch.empty(self.n_outputs)) + + self.with_bias = True + else: + self.with_bias = False + + # Latent graph Z + self.weight = torch.empty((self.n_outputs, self.n_inputs), requires_grad=False) + self.weight_sampled = torch.empty((self.n_outputs, self.n_inputs), requires_grad=False) + + self.reset_parameters() + self.eps = 1e-8 + + def get_graph(self, d, t=1, get_structure_flag=False): + if t > self.iter_num: + self.iter_num = t + if get_structure_flag: + W, V, _ = self.sample() + self.weight = torch.matmul(W.t(), V).t() + else: + self.weight = torch.matmul(self.w.t(), self.v).t() + fc1_weight = self.weight.view(d, -1, d) # [j, m1, i] + # Z = torch.sum(fc1_weight**2, dim=1).pow(0.5) # [i, j] + # Z = Z - torch.mean(Z) + Z = torch.mean(fc1_weight, dim=1) # [i, j] + self.alpha_t = self.alpha * self.iter_num + p_G_Z = torch.sigmoid(self.alpha_t * Z) + # print(Z[1], p_G_Z[1]) + return p_G_Z + + def h_acyclic(self, t): + d = self.n_inputs + G = self.get_graph(d, t) + arg = torch.matrix_power(torch.eye(d) + 1 / d * G, d) + return torch.trace(arg) - d + 
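# NOTEARS-style acyclicity penalty used by kl_with_prior: for an entrywise
+    # nonnegative soft adjacency G, h(G) = tr[(I + G/d)^d] - d is zero exactly
+    # when G is acyclic, since tr(G^k) accumulates weight over length-k cycles.
+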
+ def sample(self): + w_sigma, v_sigma = ( + F.softplus(self.w_isp_std) + self.eps, + F.softplus(self.v_isp_std) + self.eps, + ) + W = self.w + torch.randn_like(self.w) * w_sigma + V = self.v + torch.randn_like(self.v) * v_sigma + if self.with_bias: + b_sigma = F.softplus(self.b_isp_std) + self.eps + b = self.b + torch.randn_like(self.b) * b_sigma + else: + b = None + return W, V, b + + def update_means(self): + # self.w = self.sampled_w + # self.v = self.sampled_v + # self.b = self.sampled_b + # self.weight = self.weight_sampled + with torch.no_grad(): + self.w.copy_(self.sampled_w.detach().clone()) + self.v.copy_(self.sampled_v.detach().clone()) + if self.with_bias: + self.b.copy_(self.sampled_b.detach().clone()) + self.weight.copy_(self.sampled_weights.detach().clone()) + + def forward(self, input: Tensor): + if self.pre_train: + if self.with_bias: + self.sampled_weights = torch.matmul(self.sampled_w.t(), self.sampled_v).t() + return F.linear(input, self.sampled_weights, bias=self.sampled_b) + else: + self.sampled_weights = torch.matmul(self.sampled_w.t(), self.sampled_v).t() + return F.linear(input, self.sampled_weights, bias=None) + else: + if self.sample_once_flag: + W, V, b_samp = self.sample() + + with torch.no_grad(): + self.sampled_w.copy_(W.detach().clone()) + self.sampled_v.copy_(V.detach().clone()) + if self.with_bias: + self.sampled_b.copy_(b_samp.detach().clone()) + + self.sample_once_flag = False + if self.with_bias: + self.weight = torch.matmul(W.t(), V).t() + return F.linear(input, self.weight, bias=b_samp) + else: + self.weight = torch.matmul(W.t(), V).t() + return F.linear(input, self.weight, bias=None) + + else: + if self.with_bias: + self.sampled_weights = torch.matmul(self.sampled_w.t(), self.sampled_v).t() + return F.linear(input, self.sampled_weights, bias=self.sampled_b) + else: + self.sampled_weights = torch.matmul(self.sampled_w.t(), self.sampled_v).t() + return F.linear(input, self.sampled_weights, bias=None) + + def reset_parameters(self): + torch.nn.init.kaiming_uniform_(self.w, a=math.sqrt(5)) + torch.nn.init.kaiming_uniform_(self.v, a=math.sqrt(5)) + torch.nn.init.kaiming_uniform_(self.sampled_w, a=math.sqrt(5)) + torch.nn.init.kaiming_uniform_(self.sampled_v, a=math.sqrt(5)) + self.w_isp_std.data.fill_(self.init_log_var) + self.v_isp_std.data.fill_(self.init_log_var) + if self.with_bias: + fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.v) + bound = 1 / math.sqrt(fan_in) + torch.nn.init.uniform_(self.b, -bound, bound) + torch.nn.init.uniform_(self.sampled_b, -bound, bound) + self.b_isp_std.data.fill_(self.init_log_var) + print( + "\ncheck init u_mean:", + self.v, + "\ncheck init u_std:", + F.softplus(self.v_isp_std), + ) + + def _get_kl(self, param_mean, sigma, prior_log_sigma): + kl = torch.sum( + prior_log_sigma - torch.log(sigma) + 0.5 * (sigma**2) / (math.exp(prior_log_sigma * 2)) + ) + kl += 0.5 * torch.sum(param_mean**2) / math.exp(prior_log_sigma * 2) + return kl + + def kl_with_prior(self, prior_log_sigma, t=1): + self.weight = torch.matmul(self.w.t(), self.v).t() + w_sigma, v_sigma = ( + F.softplus(self.w_isp_std) + self.eps, + F.softplus(self.v_isp_std) + self.eps, + ) + weight_std = torch.matmul(w_sigma.t(), v_sigma).t() + z_kl = self._get_kl(self.weight, weight_std, prior_log_sigma) - self.beta * self.h_acyclic( + t + ) + if self.with_bias: + b_sigma = F.softplus(self.b_isp_std) + self.eps + b_kl = self._get_kl(self.b, b_sigma, prior_log_sigma) + else: + b_kl = 0.0 + return z_kl + b_kl + + +class BayesMLP(Intervenable): + """Bayes 
MLP drift that supports perfect interventions; as in MLP, n_inputs must always
+    equal n_outputs."""
+
+    def __init__(
+        self,
+        n_inputs,
+        n_layers=3,
+        n_hidden=64,
+        activation="softplus",
+        time_invariant=True,
+        init_log_var=-5.0,  # assumed default; BayesLinear requires this argument
+    ):
+        super().__init__()
+        self.n_layers = n_layers
+        self.n_inputs = n_inputs
+        self.n_hidden = n_hidden
+        self.activation = _parse_activation(activation)
+        self.time_invariant = time_invariant
+        self.model = nn.Sequential(
+            BayesLinear(n_inputs, n_hidden, init_log_var),
+            self.activation(),
+            # build fresh hidden blocks; list multiplication would alias one module
+            *[
+                layer
+                for _ in range(self.n_layers - 2)
+                for layer in (BayesLinear(n_hidden, n_hidden, init_log_var), self.activation())
+            ],
+            BayesLinear(n_hidden, n_inputs, init_log_var),
+        )
+
+    def _get_linear_weights(self, absolute_weights=False):
+        """Pretend the network is linear and find that weight matrix.
+
+        This is used in Aliee et al.
+        """
+        m = self.model
+        weights = [m[2 * i].weight for i in range(self.n_layers)[::-1]]
+        if absolute_weights:
+            weights = [torch.abs(w) for w in weights]
+        return functools.reduce(lambda x, y: x @ y, weights)
+
+    def get_linear_structure(self):
+        return self._get_linear_weights().cpu().detach().numpy()
+
+    def get_structure(self):
+        """Score based on the absolute value of the coefficient."""
+        # pretend there are no non-linearities?
+        weight_matrix = self._get_linear_weights()
+        return np.abs(weight_matrix.cpu().detach().numpy())
+
+    def l1_reg(self, absolute_weights=False):
+        weights = self._get_linear_weights(absolute_weights)
+        return torch.mean(torch.abs(weights))
+
+    def l2_reg(self, absolute_weights=False):
+        weights = self._get_linear_weights(absolute_weights)
+        return torch.mean(torch.pow(weights, 2))  # mean squared entry; abs here duplicated l1_reg
+
+    def forward(self, t, x, target=None):
+        if not self.time_invariant:
+            x = torch.cat((x, t), dim=-1)
+        out = self.model(x)
+        if target is not None:
+            # out [Batch x [Dynamics, Conditions]]
+            out[:, target] = 0.0
+        if self.current_target is not None:
+            out[:, self.current_target] = 0.0
+        return out
+
+
+class Linear(Intervenable):
+    def __init__(self, n_inputs, targets=None, time_invariant=True):
+        super().__init__(targets)
+        self.time_invariant = time_invariant
+        self.n_inputs = n_inputs
+        self.model = nn.Sequential(nn.Linear(n_inputs, n_inputs, bias=False))
+
+    def get_linear_structure(self):
+        return self.model[0].weight.cpu().detach().numpy()
+
+    def get_structure(self):
+        """Score based on the absolute value of the coefficient."""
+        return np.abs(self.model[0].weight.cpu().detach().numpy())
+
+    def forward(self, t, x, target=None):
+        if not self.time_invariant:
+            x = torch.cat((x, t), dim=-1)
+        out = self.model(x)
+        if target is not None:
+            out[:, target] = 0.0
+        if self.current_target is not None:
+            out[:, self.current_target] = 0.0
+        return out
diff --git a/conditional-flow-matching/runner/src/models/components/distribution_distances.py b/conditional-flow-matching/runner/src/models/components/distribution_distances.py
new file mode 100644
index 0000000000000000000000000000000000000000..92010b26d9f5573891524610c8ceb757967cfb7e
--- /dev/null
+++ b/conditional-flow-matching/runner/src/models/components/distribution_distances.py
@@ -0,0 +1,74 @@
+import math
+from typing import Union
+
+import numpy as np
+import torch
+
+from .mmd import linear_mmd2, mix_rbf_mmd2, poly_mmd2
+from .optimal_transport import wasserstein
+
+
+def compute_distances(pred, true):
+    """Computes the (MSE, root-MSE, MAE) distances between two vectors."""
+    mse = torch.nn.functional.mse_loss(pred, true).item()
+    me = math.sqrt(mse)
+    mae = torch.mean(torch.abs(pred - true)).item()
+    return mse, me, mae
+
+
+def compute_distribution_distances(pred: torch.Tensor, true:
Union[torch.Tensor, list]): + """computes distances between distributions. + pred: [batch, times, dims] tensor + true: [batch, times, dims] tensor or list[batch[i], dims] of length times + + This handles jagged times as a list of tensors. + """ + NAMES = [ + "1-Wasserstein", + "2-Wasserstein", + "Linear_MMD", + "Poly_MMD", + "RBF_MMD", + "Mean_MSE", + "Mean_L2", + "Mean_L1", + "Median_MSE", + "Median_L2", + "Median_L1", + ] + is_jagged = isinstance(true, list) + pred_is_jagged = isinstance(pred, list) + dists = [] + to_return = [] + names = [] + filtered_names = [name for name in NAMES if not is_jagged or not name.endswith("MMD")] + ts = len(pred) if pred_is_jagged else pred.shape[1] + for t in np.arange(ts): + if pred_is_jagged: + a = pred[t] + else: + a = pred[:, t, :] + if is_jagged: + b = true[t] + else: + b = true[:, t, :] + w1 = wasserstein(a, b, power=1) + w2 = wasserstein(a, b, power=2) + if not pred_is_jagged and not is_jagged: + mmd_linear = linear_mmd2(a, b).item() + mmd_poly = poly_mmd2(a, b, d=2, alpha=1.0, c=2.0).item() + mmd_rbf = mix_rbf_mmd2(a, b, sigma_list=[0.01, 0.1, 1, 10, 100]).item() + mean_dists = compute_distances(torch.mean(a, dim=0), torch.mean(b, dim=0)) + median_dists = compute_distances(torch.median(a, dim=0)[0], torch.median(b, dim=0)[0]) + if pred_is_jagged or is_jagged: + dists.append((w1, w2, *mean_dists, *median_dists)) + else: + dists.append((w1, w2, mmd_linear, mmd_poly, mmd_rbf, *mean_dists, *median_dists)) + # For multipoint datasets add timepoint specific distances + if ts > 1: + names.extend([f"t{t+1}/{name}" for name in filtered_names]) + to_return.extend(dists[-1]) + + to_return.extend(np.array(dists).mean(axis=0)) + names.extend(filtered_names) + return names, to_return diff --git a/conditional-flow-matching/runner/src/models/components/emd.py b/conditional-flow-matching/runner/src/models/components/emd.py new file mode 100644 index 0000000000000000000000000000000000000000..8a2f47b430a5b3b0b78126313005ba22b10d72ad --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/emd.py @@ -0,0 +1,154 @@ +import numpy as np +import ot as pot # Python Optimal Transport package +import scipy.sparse +from sklearn.metrics.pairwise import pairwise_distances + + +def earth_mover_distance( + p, + q, + eigenvals=None, + weights1=None, + weights2=None, + return_matrix=False, + metric="sqeuclidean", +): + """Returns the earth mover's distance between two point clouds. 
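+
+    With the default metric="sqeuclidean", `pot.emd2` returns the squared
+    2-Wasserstein cost, hence the square root taken below. Usage sketch
+    (hypothetical arrays): `earth_mover_distance(x0, x1)` for two (n, d)
+    clouds compared with uniform weights.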
+ + Parameters + ---------- + cloud1 : 2-D array + First point cloud + cloud2 : 2-D array + Second point cloud + Returns + ------- + distance : float + The distance between the two point clouds + """ + p = p.toarray() if scipy.sparse.isspmatrix(p) else p + q = q.toarray() if scipy.sparse.isspmatrix(q) else q + if eigenvals is not None: + p = p.dot(eigenvals) + q = q.dot(eigenvals) + if weights1 is None: + p_weights = np.ones(len(p)) / len(p) + else: + weights1 = weights1.astype("float64") + p_weights = weights1 / weights1.sum() + + if weights2 is None: + q_weights = np.ones(len(q)) / len(q) + else: + weights2 = weights2.astype("float64") + q_weights = weights2 / weights2.sum() + + pairwise_dist = np.ascontiguousarray(pairwise_distances(p, Y=q, metric=metric, n_jobs=-1)) + + result = pot.emd2( + p_weights, q_weights, pairwise_dist, numItermax=1e7, return_matrix=return_matrix + ) + if return_matrix: + square_emd, log_dict = result + return np.sqrt(square_emd), log_dict + else: + return np.sqrt(result) + + +def interpolate_with_ot(p0, p1, tmap, interp_frac, size): + """Interpolate between p0 and p1 at fraction t_interpolate knowing a transport map from p0 to + p1. + + Parameters + ---------- + p0 : 2-D array + The genes of each cell in the source population + p1 : 2-D array + The genes of each cell in the destination population + tmap : 2-D array + A transport map from p0 to p1 + t_interpolate : float + The fraction at which to interpolate + size : int + The number of cells in the interpolated population + Returns + ------- + p05 : 2-D array + An interpolated population of 'size' cells + """ + p0 = p0.toarray() if scipy.sparse.isspmatrix(p0) else p0 + p1 = p1.toarray() if scipy.sparse.isspmatrix(p1) else p1 + p0 = np.asarray(p0, dtype=np.float64) + p1 = np.asarray(p1, dtype=np.float64) + tmap = np.asarray(tmap, dtype=np.float64) + if p0.shape[1] != p1.shape[1]: + raise ValueError("Unable to interpolate. Number of genes do not match") + if p0.shape[0] != tmap.shape[0] or p1.shape[0] != tmap.shape[1]: + raise ValueError( + "Unable to interpolate. Tmap size is {}, expected {}".format( + tmap.shape, (len(p0), len(p1)) + ) + ) + I = len(p0) + J = len(p1) + # Assume growth is exponential and retrieve growth rate at t_interpolate + # If all sums are the same then this does not change anything + # This only matters if sum is not the same for all rows + p = tmap / np.power(tmap.sum(axis=0), 1.0 - interp_frac) + p = p.flatten(order="C") + p = p / p.sum() + choices = np.random.choice(I * J, p=p, size=size) + return np.asarray( + [p0[i // J] * (1 - interp_frac) + p1[i % J] * interp_frac for i in choices], + dtype=np.float64, + ) + + +def interpolate_per_point_with_ot(p0, p1, tmap, interp_frac): + """Interpolate between p0 and p1 at fraction t_interpolate knowing a transport map from p0 to + p1. + + Parameters + ---------- + p0 : 2-D array + The genes of each cell in the source population + p1 : 2-D array + The genes of each cell in the destination population + tmap : 2-D array + A transport map from p0 to p1 + t_interpolate : float + The fraction at which to interpolate + Returns + ------- + p05 : 2-D array + An interpolated population of 'size' cells + """ + assert len(p0) == len(p1) + p0 = p0.toarray() if scipy.sparse.isspmatrix(p0) else p0 + p1 = p1.toarray() if scipy.sparse.isspmatrix(p1) else p1 + p0 = np.asarray(p0, dtype=np.float64) + p1 = np.asarray(p1, dtype=np.float64) + tmap = np.asarray(tmap, dtype=np.float64) + if p0.shape[1] != p1.shape[1]: + raise ValueError("Unable to interpolate. 
Number of genes do not match") + if p0.shape[0] != tmap.shape[0] or p1.shape[0] != tmap.shape[1]: + raise ValueError( + "Unable to interpolate. Tmap size is {}, expected {}".format( + tmap.shape, (len(p0), len(p1)) + ) + ) + + I = len(p0) + # J = len(p1) + # Assume growth is exponential and retrieve growth rate at t_interpolate + # If all sums are the same then this does not change anything + # This only matters if sum is not the same for all rows + p = tmap / (tmap.sum(axis=0) / 1.0 - interp_frac) + # p = tmap / np.power(tmap.sum(axis=0), 1.0 - interp_frac) + # p = p.flatten(order="C") + p = p / p.sum(axis=0) + choices = np.array([np.random.choice(I, p=p[i]) for i in range(I)]) + return np.asarray( + [p0[i] * (1 - interp_frac) + p1[j] * interp_frac for i, j in enumerate(choices)], + dtype=np.float64, + ) diff --git a/conditional-flow-matching/runner/src/models/components/evaluation.py b/conditional-flow-matching/runner/src/models/components/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..9b0c540c2946e7c8db2a07dd0ed338d06019d289 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/evaluation.py @@ -0,0 +1,255 @@ +from collections import Counter + +import numpy as np +from sklearn.metrics import average_precision_score, roc_auc_score + + +def compare_single_graph_bayesian_shd(true_graph, estimated_graph): + """Compute performance measures on encoded distribution over graphs + Args: + true_graph: (dxd) np.array, the true adjacency matrix, encoded in + negative values are the nodes that could be one or zero. + estimated graph: (dxd) np.array, the estimated adjacency matricies + (weighted or unweighted) where b is the batch size + """ + + def shd(a, b): + return np.sum(np.abs(a - b)) + + true_graph = true_graph.squeeze().astype(int) + var_maps = np.minimum(0, true_graph)[:, 0] + var_mask = var_maps < 0 + vars_to_deidentify = -(var_maps[var_mask] + 1) + estimated_graph = estimated_graph.squeeze() + summed_estimated_graph = estimated_graph[~var_mask] + # Distance to the nearest admissible graph. + for i, v in enumerate(vars_to_deidentify): + summed_estimated_graph[v] += estimated_graph[var_mask][i] + hamming = shd(true_graph[~var_mask], summed_estimated_graph) + return hamming + + +def compare_graphs_bayesian_shd(true_graph, estimated_graphs): + shd = np.mean( + [compare_single_graph_bayesian_shd(true_graph, graph) for graph in estimated_graphs] + ) + thresholded_shd = np.mean( + [ + compare_single_graph_bayesian_shd(true_graph, (graph > 0.5).astype(float)) + for graph in estimated_graphs + ] + ) + return shd, thresholded_shd + + +def compare_graphs_bayesian_dist(true_graph, estimated_graphs): + """Compute performance measures on encoded distribution over graphs + Args: + true_graph: (dxd) np.array, the true adjacency matrix, encoded in + negative values are the nodes that could be one or zero. + estimated graph: (dxd) np.array, the estimated adjacency matricies + (weighted or unweighted) where b is the batch size + """ + + def shd(a, b): + return np.sum(np.abs(a - b)) + + true_graph = true_graph.squeeze().astype(int) + var_maps = np.minimum(0, true_graph)[:, 0] + var_mask = var_maps < 0 + vars_to_deidentify = -(var_maps[var_mask] + 1) + unique, counts = np.unique(vars_to_deidentify, return_counts=True) + admissible_count = Counter() + sample_count = Counter() + for estimated_graph in estimated_graphs: + summed_estimated_graph = estimated_graph[~var_mask] + # Distance to the nearest admissible graph. 
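+        # Deidentified rows (encoded as negative labels in true_graph) are summed
+        # onto the row they alias before comparison, so any split of edge mass
+        # among aliased rows that matches the truth counts as admissible.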
+ for i, v in enumerate(vars_to_deidentify): + summed_estimated_graph[v] += estimated_graph[var_mask][i] + hamming = shd(true_graph[unique], summed_estimated_graph[unique]) + mask = var_mask.copy() + mask[unique] = True + sample_count.update([tuple(estimated_graph[mask].flatten())]) + if hamming == 0: + admissible_count.update([tuple(estimated_graph[mask].flatten())]) + + # Consider the undetermined edges only, lets count the # of unique admissible graphs observed? + seen_admissible = len(list(admissible_count)) + unique_admissible = len(admissible_count) + total_targets = np.sum(true_graph[unique], axis=1) + total_admissible = 1 + for c, t in zip(counts, total_targets): + total_admissible *= (c + 1) ** t + + return ( + seen_admissible, + total_admissible, + unique_admissible, + admissible_count, + sample_count, + ) + + +def compare_graphs_bayesian_cover(true_graph, estimated_graphs): + ( + seen_admissible, + total_admissible, + unique_admissible, + admissible_count, + sample_count, + ) = compare_graphs_bayesian_dist(true_graph, estimated_graphs) + print("id-graphs:", unique_admissible, "-- total graphs:", total_admissible) + return unique_admissible / total_admissible + + +def compute_gfn_neg_log_likelihood(true_graph, estimated_graphs, p_mse): + r""" + Warning: Currently Not being used. + + + NNL = - \sum_G p(G | D)P(D | G) + + G - possible graphs in search space + P(D | G) - given a graph, we can calculate the MSE of the data. + P(G | D) - the probability of generating a graph given the data. This + is generate using the learned P_F(G) + - Need to compute P(G) over possible trajectories + """ + pass + + +def compare_graph_distribution(true_graph, estimated_graphs): + ( + seen_admissible, + total_admissible, + unique_admissible, + admissible_count, + sample_count, + ) = compare_graphs_bayesian_dist(true_graph, estimated_graphs) + # compute distacne to uniform + dist_admissible = [ + float(x) / float(sum(admissible_count.values())) for x in list(admissible_count.values()) + ] + entropy_admissible = 0.0 + for p in dist_admissible: + if p == 0.0: + entropy_admissible += 0.0 + else: + entropy_admissible += p * np.log2(p) + kl_unif = np.log2(len(admissible_count)) - entropy_admissible + + # compute proportion of admissible graphs + admissible_proportion = [ + float(x) / float(sum(sample_count.values())) for x in list(admissible_count.values()) + ] + + # graph variation dist + entropy_proportion = 0.0 + for p in admissible_proportion: + if p == 0.0: + entropy_proportion += 0.0 + else: + entropy_proportion += p * np.log2(p) + kl_proportion = np.log2(len(sample_count)) - entropy_proportion + + return kl_unif, admissible_proportion, kl_proportion + + +def compute_graphs_bayesian_diversity(graphs): + """ + Input(s): + - n_ens graphs: [n_ens, d, d] + Output: + - diversity metric: node-wise variance of predicted graphs + normalized by node-wise varaince of graph generated with + Bernoulli random variable. + """ + ber_graphs = np.random.binomial(1, 0.5, size=graphs.shape) + node_wise_var = np.var(graphs, axis=0) + diversity = np.sum(node_wise_var) + return diversity / np.sum(np.var(ber_graphs, axis=0)) + + +def compute_graphs_sparsity(graph): + """ + Input(s): + - n_ens graphs: [n_ens, d, d] + Output: + - average sparsity metric + """ + Adj = np.around(graph, decimals=0) + sparsity = 1 - np.mean(Adj) + return sparsity + + +def compare_graphs(true_graph, estimated_graph): + """Compute performance measures on (binary) adjacency matrix. 
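+
+    Returns a dict with keys tpr, fdr, shd, tshd, auc, ap, f1 and specificity
+    (see the metrics list at the end). Example sketch:
+    `compare_graphs(np.eye(3, k=1), scores)` scores a 3-node chain against a
+    hypothetical (3, 3) array `scores` of edge probabilities.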
+ + Input: + - true_graph: (dxd) np.array, the true adjacency matrix + - estimated graph: (dxd) np.array, the estimated adjacency matrix (weighted or unweighted) + """ + # Handle new case where we encode information in the negative numbers + true_graph = np.maximum(0, true_graph) + + def structural_hamming_distance(W_true, W_est): + """Computes the structural hamming distance.""" + pred = np.flatnonzero(W_est != 0) + cond = np.flatnonzero(W_true) + cond_reversed = np.flatnonzero(W_true.T) + extra = np.setdiff1d(pred, cond, assume_unique=True) + reverse = np.intersect1d(extra, cond_reversed, assume_unique=True) + pred_lower = np.flatnonzero(np.tril(W_est + W_est.T)) + cond_lower = np.flatnonzero(np.tril(W_true + W_true.T)) + extra_lower = np.setdiff1d(pred_lower, cond_lower, assume_unique=True) + missing_lower = np.setdiff1d(cond_lower, pred_lower, assume_unique=True) + shd = len(extra_lower) + len(missing_lower) + len(reverse) + return shd + + num_edges = len(true_graph[np.where(true_graph != 0.0)]) + + tam = np.array([[1 if x != 0.0 else 0.0 for x in y] for y in true_graph]) + eam = np.array([[1 if x != 0.0 else 0.0 for x in y] for y in estimated_graph]) + + tp = len(np.argwhere((tam + eam) == 2)) + fp = len(np.argwhere((tam - eam) < 0)) + tn = len(np.argwhere((tam + eam) == 0)) + fn = num_edges - tp + x = [tp, fp, tn, fn] + + if x[0] + x[1] == 0: + precision = 0 + else: + precision = float(x[0]) / float(x[0] + x[1]) + if tp + fn == 0: + tpr = 0 + else: + tpr = float(tp) / float(tp + fn) + if x[2] + x[1] == 0: + specificity = 0 + else: + specificity = float(x[2]) / float(x[2] + x[1]) + if precision + tpr == 0: + f1 = 0 + else: + f1 = 2 * precision * tpr / (precision + tpr) + if fp + tp == 0: + fdr = 0 + else: + fdr = float(fp) / (float(fp) + float(tp)) + + shd = float(structural_hamming_distance(true_graph, estimated_graph)) + thresh_shd = float( + structural_hamming_distance(true_graph, (estimated_graph > 0.5).astype(float)) + ) + + if np.all(true_graph.flatten()): + AUC = -1 + AP = -1 + else: + AUC = roc_auc_score(true_graph.flatten(), estimated_graph.flatten()) + AP = average_precision_score(true_graph.flatten(), estimated_graph.flatten()) + + metrics = ["tpr", "fdr", "shd", "tshd", "auc", "ap", "f1", "specificity"] + values = [tpr, fdr, shd, thresh_shd, AUC, AP, f1, specificity] + return dict(zip(metrics, values)) diff --git a/conditional-flow-matching/runner/src/models/components/fp16_util.py b/conditional-flow-matching/runner/src/models/components/fp16_util.py new file mode 100644 index 0000000000000000000000000000000000000000..8c1298682bd519f7eab903d7b1f6f5ca94f28a40 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/fp16_util.py @@ -0,0 +1,216 @@ +"""Helpers to train with 16-bit precision.""" + +import numpy as np +import torch as th +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from . 
import logger + +INITIAL_LOG_LOSS_SCALE = 20.0 + + +def convert_module_to_f16(l): + """Convert primitive modules to float16.""" + if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + l.weight.data = l.weight.data.half() + if l.bias is not None: + l.bias.data = l.bias.data.half() + + +def convert_module_to_f32(l): + """Convert primitive modules to float32, undoing convert_module_to_f16().""" + if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + l.weight.data = l.weight.data.float() + if l.bias is not None: + l.bias.data = l.bias.data.float() + + +def make_master_params(param_groups_and_shapes): + """Copy model parameters into a (differently-shaped) list of full-precision parameters.""" + master_params = [] + for param_group, shape in param_groups_and_shapes: + master_param = nn.Parameter( + _flatten_dense_tensors([param.detach().float() for (_, param) in param_group]).view( + shape + ) + ) + master_param.requires_grad = True + master_params.append(master_param) + return master_params + + +def model_grads_to_master_grads(param_groups_and_shapes, master_params): + """Copy the gradients from the model parameters into the master parameters from + make_master_params().""" + for master_param, (param_group, shape) in zip(master_params, param_groups_and_shapes): + master_param.grad = _flatten_dense_tensors( + [param_grad_or_zeros(param) for (_, param) in param_group] + ).view(shape) + + +def master_params_to_model_params(param_groups_and_shapes, master_params): + """Copy the master parameter data back into the model parameters.""" + # Without copying to a list, if a generator is passed, this will + # silently not copy any parameters. + for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes): + for (_, param), unflat_master_param in zip( + param_group, unflatten_master_params(param_group, master_param.view(-1)) + ): + param.detach().copy_(unflat_master_param) + + +def unflatten_master_params(param_group, master_param): + return _unflatten_dense_tensors(master_param, [param for (_, param) in param_group]) + + +def get_param_groups_and_shapes(named_model_params): + named_model_params = list(named_model_params) + scalar_vector_named_params = ( + [(n, p) for (n, p) in named_model_params if p.ndim <= 1], + (-1), + ) + matrix_named_params = ( + [(n, p) for (n, p) in named_model_params if p.ndim > 1], + (1, -1), + ) + return [scalar_vector_named_params, matrix_named_params] + + +def master_params_to_state_dict(model, param_groups_and_shapes, master_params, use_fp16): + if use_fp16: + state_dict = model.state_dict() + for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes): + for (name, _), unflat_master_param in zip( + param_group, unflatten_master_params(param_group, master_param.view(-1)) + ): + assert name in state_dict + state_dict[name] = unflat_master_param + else: + state_dict = model.state_dict() + for i, (name, _value) in enumerate(model.named_parameters()): + assert name in state_dict + state_dict[name] = master_params[i] + return state_dict + + +def state_dict_to_master_params(model, state_dict, use_fp16): + if use_fp16: + named_model_params = [(name, state_dict[name]) for name, _ in model.named_parameters()] + param_groups_and_shapes = get_param_groups_and_shapes(named_model_params) + master_params = make_master_params(param_groups_and_shapes) + else: + master_params = [state_dict[name] for name, _ in model.named_parameters()] + return master_params + + +def zero_master_grads(master_params): + for param in master_params: + param.grad = 
None + + +def zero_grad(model_params): + for param in model_params: + # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group + if param.grad is not None: + param.grad.detach_() + param.grad.zero_() + + +def param_grad_or_zeros(param): + if param.grad is not None: + return param.grad.data.detach() + else: + return th.zeros_like(param) + + +class MixedPrecisionTrainer: + def __init__( + self, + *, + model, + use_fp16=False, + fp16_scale_growth=1e-3, + initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE, + ): + self.model = model + self.use_fp16 = use_fp16 + self.fp16_scale_growth = fp16_scale_growth + + self.model_params = list(self.model.parameters()) + self.master_params = self.model_params + self.param_groups_and_shapes = None + self.lg_loss_scale = initial_lg_loss_scale + + if self.use_fp16: + self.param_groups_and_shapes = get_param_groups_and_shapes( + self.model.named_parameters() + ) + self.master_params = make_master_params(self.param_groups_and_shapes) + self.model.convert_to_fp16() + + def zero_grad(self): + zero_grad(self.model_params) + + def backward(self, loss: th.Tensor): + if self.use_fp16: + loss_scale = 2**self.lg_loss_scale + (loss * loss_scale).backward() + else: + loss.backward() + + def optimize(self, opt: th.optim.Optimizer): + if self.use_fp16: + return self._optimize_fp16(opt) + else: + return self._optimize_normal(opt) + + def _optimize_fp16(self, opt: th.optim.Optimizer): + logger.logkv_mean("lg_loss_scale", self.lg_loss_scale) + model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params) + grad_norm, param_norm = self._compute_norms(grad_scale=2**self.lg_loss_scale) + if check_overflow(grad_norm): + self.lg_loss_scale -= 1 + logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}") + zero_master_grads(self.master_params) + return False + + logger.logkv_mean("grad_norm", grad_norm) + logger.logkv_mean("param_norm", param_norm) + + for p in self.master_params: + p.grad.mul_(1.0 / (2**self.lg_loss_scale)) + opt.step() + zero_master_grads(self.master_params) + master_params_to_model_params(self.param_groups_and_shapes, self.master_params) + self.lg_loss_scale += self.fp16_scale_growth + return True + + def _optimize_normal(self, opt: th.optim.Optimizer): + grad_norm, param_norm = self._compute_norms() + logger.logkv_mean("grad_norm", grad_norm) + logger.logkv_mean("param_norm", param_norm) + opt.step() + return True + + def _compute_norms(self, grad_scale=1.0): + grad_norm = 0.0 + param_norm = 0.0 + for p in self.master_params: + with th.no_grad(): + param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2 + if p.grad is not None: + grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2 + return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm) + + def master_params_to_state_dict(self, master_params): + return master_params_to_state_dict( + self.model, self.param_groups_and_shapes, master_params, self.use_fp16 + ) + + def state_dict_to_master_params(self, state_dict): + return state_dict_to_master_params(self.model, state_dict, self.use_fp16) + + +def check_overflow(value): + return (value == float("inf")) or (value == -float("inf")) or (value != value) diff --git a/conditional-flow-matching/runner/src/models/components/hyper_nets.py b/conditional-flow-matching/runner/src/models/components/hyper_nets.py new file mode 100644 index 0000000000000000000000000000000000000000..976d649e45384af49ad7e84d37e636d3680a3de8 --- /dev/null +++ 
b/conditional-flow-matching/runner/src/models/components/hyper_nets.py @@ -0,0 +1,441 @@ +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .base import LocallyConnected + + +class ResNetBlock(nn.Module): + def __init__(self, in_size=16, h_size=16, out_size=16): + super().__init__() + + layer = nn.ModuleList() + layer.append(nn.Linear(in_features=in_size, out_features=h_size)) + layer.append(nn.ReLU()) + layer.append(nn.Linear(in_features=h_size, out_features=out_size)) + + self.f = nn.Sequential(*layer) + self.shortcut = nn.Sequential() + + def forward(self, x): + return F.relu(self.f(x) + self.shortcut(x)) + + +class HyperResNet(nn.Module): + def __init__(self, in_size=16, h_size=16, out_size=16, num_block=2): + super().__init__() + + blocks = nn.ModuleList() + for _ in range(num_block): + blocks.append(ResNetBlock(in_size, h_size, out_size)) + self.model = nn.Sequential(*blocks) + + def forward(self, x): + return self.model(x) + + +class HyperLocallyConnected(nn.Module): + """Hyper Local linear layer, i.e. Conv1dLocal() with filter size 1 which parameters are learned + from another netwokr: + + y = LocallyConnected_{params}(x), + where params = h(G) + + Args: + num_linear: num of local linear layers, i.e. + in_features: m1 + out_features: m2 + bias: whether to include bias or not + + Shape: + - Input: [n, d, m1] + - Output: [n, d, m2] + + Attributes: + weight: [d, m1, m2] + bias: [d, m2] + """ + + VALID_HYPER = [ + "mlp", + "gnn", + "invariant", + "per_graph", + "deep_set", + ] + + def __init__( + self, + num_linear, + input_features, + output_features, + hyper, + n_ens=1, + bias=True, + hyper_hidden_dims: Optional[list] = None, + ): + super().__init__() + self.num_linear = num_linear + self.input_features = input_features + self.output_features = output_features + self.n_ens = n_ens + self.hyper = hyper + + assert ( + hyper in self.VALID_HYPER + ), f"hyper hparam not a valid option - choices: {self.VALID_HYPER}" + + if hyper == "invariant": + hyper_type = HyperInvariant + elif hyper == "mlp": + hyper_type = HyperMLP + elif hyper == "per_graph": + hyper_type = HyperInvariantPerGraph + elif hyper == "deep_set": + hyper_type = DeepSet + + self.hyper_layer = hyper_type( + n_ens=n_ens, + num_linear=num_linear, + input_features=input_features, + output_features=output_features, + bias=bias, + hidden_dims=hyper_hidden_dims, + ) + + def forward(self, x: torch.Tensor, G: torch.Tensor): + # [n_ens, n, d, 1, m2] = [n_ens, n, d, 1, m1] @ [n_ens, 1, d, m1, m2] + weights, biases = self.hyper_layer(G.to(x)) + x = torch.matmul(x, weights.unsqueeze(1)) + if biases is not None: + # [n, d, m2] += [d, m2] + x += biases.unsqueeze(-2).unsqueeze(1) + return x + + +class AnalyiticLinearLocallyConnected(nn.Module): + """Analytic linear Local linear layer, i.e. Conv1dLocal() with filter size 1 which parameters + are learned from another netwokr: + + y = LocallyConnected_{params}(x), + where params = h(G) + + Args: + num_linear: num of local linear layers, i.e. 
+ in_features: m1 + out_features: m2 + bias: whether to include bias or not + + Shape: + - Input: [n, d, m1] + - Output: [n, d, m2] + + Attributes: + weight: [d, m1, m2] + bias: [d, m2] + """ + + def __init__( + self, + num_linear, + input_features, + hyper, + n_ens=1, + bias=True, + hyper_hidden_dims: Optional[list] = None, + ): + super().__init__() + self.num_linear = num_linear + self.input_features = input_features + self.n_ens = n_ens + self.hyper = hyper + + self.hyper_layer = HyperAnalyticLinear( + n_ens=n_ens, + num_linear=num_linear, + input_features=input_features, + ) + + self.weights = torch.randn((n_ens, num_linear, num_linear)) + + def forward(self, x: torch.Tensor, dx: torch.Tensor, G: torch.Tensor): + # [n_ens, n, d, 1, m2] = [n_ens, n, d, 1, m1] @ [n_ens, 1, d, m1, m2] + self.weights = self.hyper_layer(x, dx, G.to(x)) + x = torch.matmul( + self.weights.unsqueeze(1).transpose(-2, -1), + x.squeeze(-2).squeeze(0), + ) + return x + + +class NodeHyperLocallyConnected(nn.Module): + """Hyper Local linear layer, i.e. Conv1dLocal() with filter size 1 which parameters are learned + from another netwokr: + + y = LocallyConnected_{params}(x), + where params = h(G) + + Args: + num_linear: num of local linear layers, i.e. + in_features: m1 + out_features: m2 + bias: whether to include bias or not + + Shape: + - Input: [n, d, m1] + - Output: [n, d, m2] + + Attributes: + weight: [d, m1, m2] + bias: [d, m2] + """ + + VALID_HYPER = [ + "mlp", + "gnn", + "invariant", + "per_graph", + "deep_set", + ] + + def __init__( + self, + num_linear, + input_features, + output_features, + hyper, + n_ens=1, + bias=True, + hyper_hidden_dims: Optional[list] = None, + ): + super().__init__() + self.num_linear = num_linear + self.input_features = input_features + self.output_features = output_features + self.n_ens = n_ens + self.hyper = hyper + self.G = None + + assert ( + hyper in self.VALID_HYPER + ), f"hyper hparam not a valid option - choices: {self.VALID_HYPER}" + + if hyper == "invariant": + hyper_type = HyperInvariant + elif hyper == "mlp": + hyper_type = HyperMLP + elif hyper == "per_graph": + hyper_type = HyperInvariantPerGraph + elif hyper == "deep_set": + hyper_type = DeepSet + + self.hyper_layer = hyper_type( + n_ens=n_ens, + num_linear=num_linear, + input_features=input_features, + output_features=output_features, + bias=bias, + hidden_dims=hyper_hidden_dims, + ) + + def forward(self, x: torch.Tensor): + # [n_ens, n, d, 1, m2] = [n_ens, n, d, 1, m1] @ [n_ens, 1, d, m1, m2] + G = self.G + weights, biases = self.hyper_layer(G.to(x)) + x = torch.matmul(x, weights.unsqueeze(1)) + if biases is not None: + # [n, d, m2] += [d, m2] + x += biases.unsqueeze(-2).unsqueeze(1) + return x + + +class MLP(nn.Module): + def __init__(self, dims, bias=True): + super().__init__() + self.net = nn.Sequential() + for i in range(len(dims) - 1): + if i > 0: + self.net.append(nn.ELU()) + self.net.append(nn.Linear(dims[i], dims[i + 1], bias=bias)) + + def forward(self, x): + return self.net(x) + + +class DeepSet(nn.Module): + def __init__( + self, + num_nodes, + input_features, + output_features, + bias=True, + embedding_size: Optional[int] = None, + phi_dims: Optional[list] = None, + f_dims: Optional[list] = None, + **kwargs, + ): + super().__init__() + if embedding_size is None: + embedding_size = 16 + if phi_dims is None: + phi_dims = [64, 64] + if f_dims is None: + f_dims = [64, 64] + self.embedding_size = embedding_size + self.phi_dims = phi_dims + self.f_dims = f_dims + self.node_embedding = 
nn.Parameter(torch.Tensor(embedding_size, num_nodes))
+        self.phi_net = MLP([embedding_size + input_features, *phi_dims, embedding_size], bias=bias)
+        self.f_net = MLP([embedding_size, *f_dims, output_features], bias=bias)
+
+    def forward(self, x: torch.Tensor, G: torch.Tensor):
+        # x: [n_ens, batch, d, 1, d]
+        # [n_ens, n, d, 1, m2] = [n_ens, n, d, 1, m1] @ [n_ens, 1, d, m1, m2]
+        del G
+        # Pair each node's learned embedding with its input features before the
+        # permutation-invariant phi network (deep-set encoder); this assumes x
+        # is 5-D as documented above.
+        emb = self.node_embedding.t().unsqueeze(0).unsqueeze(0).unsqueeze(-2)
+        emb = emb.expand(*x.shape[:-1], -1)  # [n_ens, batch, d, 1, emb]
+        x = self.phi_net(torch.cat([emb, x], dim=-1))
+        # [n_ens, batch, d, 1, emb]
+        x = torch.sum(x, dim=-2)
+
+        return x
+
+
+class HyperMLP(nn.Module):
+    """Hypernetwork that takes in a graph (represented as an adjacency matrix) and outputs weights
+    and biases for a linear layer over each node."""
+
+    def __init__(
+        self,
+        num_linear,
+        input_features,
+        output_features,
+        bias=True,
+        hidden_dims: Optional[list] = None,
+        **kwargs,
+    ):
+        super().__init__()
+        if hidden_dims is None:
+            # hidden_dims = [64, 64, 64]
+            # hidden_dims = [64, 64]
+            hidden_dims = [1024, 512, 128, 64]
+            # hidden_dims = [1024, 1024, 1024, 64]
+        self.dims = hidden_dims
+        self.num_linear = num_linear
+        self.input_features = input_features
+        self.output_features = output_features
+        self.bias = bias
+
+        self.w_features = self.num_linear * self.input_features * self.output_features
+        self.b_features = self.num_linear * self.output_features
+        self.total_features = self.w_features
+        if self.bias:
+            self.total_features += self.b_features
+        full_dims = [num_linear**2, *self.dims, self.total_features]
+        self.net = nn.Sequential()
+        for i in range(len(full_dims) - 1):
+            if i > 0:
+                self.net.append(nn.ELU())
+            self.net.append(nn.Linear(full_dims[i], full_dims[i + 1]))
+
+    def forward(self, x):
+        # input = G ~ A [n_ens x d x d]
+        # Want: output = |params|
+        # params = h(G)
+        n_ens = x.shape[0]
+        x = x.reshape(n_ens, -1)
+        x = self.net(x)
+        x_w = x[:, : self.w_features].reshape(
+            n_ens, self.num_linear, self.input_features, self.output_features
+        )
+        x_b = None
+        if self.bias:
+            x_b = x[:, self.w_features :].reshape(n_ens, self.num_linear, self.output_features)
+        return x_w, x_b
+
+
+class HyperAnalyticLinear(LocallyConnected):
+    """Analytic linear hyper-net module.
+
+    Locally connected but directly returns weights
+    """
+
+    def __init__(
+        self,
+        n_ens,
+        num_linear,
+        input_features,
+    ):
+        super(LocallyConnected, self).__init__()
+        self.n_ens = n_ens
+        self.num_linear = num_linear
+        self.input_features = input_features
+        self.output_features = input_features
+        self.beta = 0.01  # per-node-GFN = 0.01
+
+        # self.weight = nn.Parameter(
+        #     torch.FloatTensor(n_ens, num_linear, num_linear)
+        # )
+        self.register_parameter("bias", None)
+
+        # self.reset_parameters()
+
+    def analytic_linear(self, x, dx, G):
+        Gt = torch.transpose(G.to(x), -2, -1).unsqueeze(1)
+        x_masked = Gt * x
+        A_est = []
+        for p in range(self.num_linear):
+            w_est = torch.linalg.solve(
+                (torch.transpose(x_masked[:, :, p, :], -2, -1) @ x_masked[:, :, p, :])
+                + self.beta * torch.eye(self.num_linear).unsqueeze(0).type_as(x_masked),
+                torch.transpose(x_masked[:, :, p, :], -2, -1) @ dx[:, :, p],
+            )
+            A_est.append(w_est)
+        A_est = torch.cat(A_est, dim=2)
+        return A_est
+
+    def forward(self, x, dx, G):
+        self.weights = self.analytic_linear(x, dx, G).to(x)
+        return self.weights
+
+
+class HyperInvariantPerGraph(LocallyConnected):
+    """Invariant hyper-net module per graph.
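+    (One weight/bias tensor per ensemble member; the input to forward() is
+    ignored and the learned parameters are returned directly.)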
+ + Locally connected but directly returns weights + """ + + def __init__(self, n_ens, num_linear, input_features, output_features, bias=True, **kwargs): + super(LocallyConnected, self).__init__() + self.n_ens = n_ens + self.num_linear = num_linear + self.input_features = input_features + self.output_features = output_features + + self.weight = nn.Parameter( + torch.Tensor(n_ens, num_linear, input_features, output_features) + ) + if bias: + self.bias = nn.Parameter(torch.Tensor(n_ens, num_linear, output_features)) + else: + # You should always register all possible parameters, but the + # optional ones can be None if you want. + self.register_parameter("bias", None) + + self.reset_parameters() + + def forward(self, input): + return self.weight, self.bias + + +class HyperInvariant(LocallyConnected): + """Invariant hyper-net module. + + Locally connected but directly returns weights + """ + + def __init__(self, num_linear, input_features, output_features, bias=True, **kwargs): + super().__init__(num_linear, input_features, output_features, bias) + + def forward(self, input): + return self.weight.unsqueeze(0), self.bias.unsqueeze(0) diff --git a/conditional-flow-matching/runner/src/models/components/icnn_model.py b/conditional-flow-matching/runner/src/models/components/icnn_model.py new file mode 100644 index 0000000000000000000000000000000000000000..fefb9149cad3491b4160ebc0fc5a745b9d2b474c --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/icnn_model.py @@ -0,0 +1,29 @@ +import torch +from torch import nn + + +class ICNN(torch.nn.Module): + """Input Convex Neural Network.""" + + def __init__(self, dim=2, dimh=64, num_hidden_layers=4): + super().__init__() + + Wzs = [] + Wzs.append(nn.Linear(dim, dimh)) + for _ in range(num_hidden_layers - 1): + Wzs.append(torch.nn.Linear(dimh, dimh, bias=False)) + Wzs.append(torch.nn.Linear(dimh, 1, bias=False)) + self.Wzs = torch.nn.ModuleList(Wzs) + + Wxs = [] + for _ in range(num_hidden_layers - 1): + Wxs.append(nn.Linear(dim, dimh)) + Wxs.append(nn.Linear(dim, 1, bias=False)) + self.Wxs = torch.nn.ModuleList(Wxs) + self.act = nn.Softplus() + + def forward(self, x): + z = self.act(self.Wzs[0](x)) + for Wz, Wx in zip(self.Wzs[1:-1], self.Wxs[:-1]): + z = self.act(Wz(z) + Wx(x)) + return self.Wzs[-1](z) + self.Wxs[-1](x) diff --git a/conditional-flow-matching/runner/src/models/components/layers/__init__.py b/conditional-flow-matching/runner/src/models/components/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/__init__.py b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dffd018a89d300283a64c3c2664abfa6977419ee --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/__init__.py @@ -0,0 +1,4 @@ +from .basic import * +from .container import * +from .resnet import * +from .wrappers import * diff --git a/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/basic.py b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/basic.py new file mode 100644 index 0000000000000000000000000000000000000000..19f57323a0a83a92912ee535988171514bb0bb35 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/basic.py @@ -0,0 +1,469 @@ +import torch +import 
torch.nn as nn
+import torch.nn.functional as F
+
+
+def weights_init(m):
+    classname = m.__class__.__name__
+    if classname.find("Linear") != -1 or classname.find("Conv") != -1:
+        nn.init.constant_(m.weight, 0)
+        nn.init.normal_(m.bias, 0, 0.01)
+
+
+class HyperLinear(nn.Module):
+    def __init__(self, dim_in, dim_out, hypernet_dim=8, n_hidden=1, activation=nn.Tanh):
+        super().__init__()
+        self.dim_in = dim_in
+        self.dim_out = dim_out
+        self.params_dim = self.dim_in * self.dim_out + self.dim_out
+
+        layers = []
+        dims = [1] + [hypernet_dim] * n_hidden + [self.params_dim]
+        for i in range(1, len(dims)):
+            layers.append(nn.Linear(dims[i - 1], dims[i]))
+            if i < len(dims) - 1:
+                layers.append(activation())
+        self._hypernet = nn.Sequential(*layers)
+        self._hypernet.apply(weights_init)
+
+    def forward(self, t, x):
+        params = self._hypernet(t.view(1, 1)).view(-1)
+        b = params[: self.dim_out].view(self.dim_out)
+        w = params[self.dim_out :].view(self.dim_out, self.dim_in)
+        return F.linear(x, w, b)
+
+
+class IgnoreLinear(nn.Module):
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self._layer = nn.Linear(dim_in, dim_out)
+
+    def forward(self, t, x):
+        return self._layer(x)
+
+
+class ConcatLinear(nn.Module):
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self._layer = nn.Linear(dim_in + 1, dim_out)
+
+    def forward(self, t, x):
+        tt = torch.ones_like(x[:, :1]) * t
+        ttx = torch.cat([tt, x], 1)
+        return self._layer(ttx)
+
+
+class ConcatLinear_v2(nn.Module):
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self._layer = nn.Linear(dim_in, dim_out)
+        self._hyper_bias = nn.Linear(1, dim_out, bias=False)
+
+    def forward(self, t, x):
+        return self._layer(x) + self._hyper_bias(t.view(1, 1))
+
+
+class SquashLinear(nn.Module):
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self._layer = nn.Linear(dim_in, dim_out)
+        self._hyper = nn.Linear(1, dim_out)
+
+    def forward(self, t, x):
+        return self._layer(x) * torch.sigmoid(self._hyper(t.view(1, 1)))
+
+
+class ConcatSquashLinear(nn.Module):
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self._layer = nn.Linear(dim_in, dim_out)
+        self._hyper_bias = nn.Linear(1, dim_out, bias=False)
+        self._hyper_gate = nn.Linear(1, dim_out)
+
+    def forward(self, t, x):
+        return self._layer(x) * torch.sigmoid(self._hyper_gate(t.view(1, 1))) + self._hyper_bias(
+            t.view(1, 1)
+        )
+
+
+class HyperConv2d(nn.Module):
+    def __init__(
+        self,
+        dim_in,
+        dim_out,
+        ksize=3,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        transpose=False,
+    ):
+        super().__init__()
+        assert (
+            dim_in % groups == 0 and dim_out % groups == 0
+        ), "dim_in and dim_out must both be divisible by groups."
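+        # A single nn.Linear(1, params_dim) hypernetwork maps the scalar time t
+        # to every filter weight (and bias) of this convolution; see forward().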
+        self.dim_in = dim_in
+        self.dim_out = dim_out
+        self.ksize = ksize
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.groups = groups
+        self.bias = bias
+        self.transpose = transpose
+
+        self.params_dim = int(dim_in * dim_out * ksize * ksize / groups)
+        if self.bias:
+            self.params_dim += dim_out
+        self._hypernet = nn.Linear(1, self.params_dim)
+        self.conv_fn = F.conv_transpose2d if transpose else F.conv2d
+
+        self._hypernet.apply(weights_init)
+
+    def forward(self, t, x):
+        params = self._hypernet(t.view(1, 1)).view(-1)
+        weight_size = int(self.dim_in * self.dim_out * self.ksize * self.ksize / self.groups)
+        if self.transpose:
+            weight = params[:weight_size].view(
+                self.dim_in, self.dim_out // self.groups, self.ksize, self.ksize
+            )
+        else:
+            weight = params[:weight_size].view(
+                self.dim_out, self.dim_in // self.groups, self.ksize, self.ksize
+            )
+        # The bias occupies the trailing dim_out entries of params, after the
+        # weight_size weight entries.
+        bias = params[weight_size:].view(self.dim_out) if self.bias else None
+        return self.conv_fn(
+            x,
+            weight=weight,
+            bias=bias,
+            stride=self.stride,
+            padding=self.padding,
+            groups=self.groups,
+            dilation=self.dilation,
+        )
+
+
+class IgnoreConv2d(nn.Module):
+    def __init__(
+        self,
+        dim_in,
+        dim_out,
+        ksize=3,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        transpose=False,
+    ):
+        super().__init__()
+        module = nn.ConvTranspose2d if transpose else nn.Conv2d
+        self._layer = module(
+            dim_in,
+            dim_out,
+            kernel_size=ksize,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, t, x):
+        return self._layer(x)
+
+
+class SquashConv2d(nn.Module):
+    def __init__(
+        self,
+        dim_in,
+        dim_out,
+        ksize=3,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        transpose=False,
+    ):
+        super().__init__()
+        module = nn.ConvTranspose2d if transpose else nn.Conv2d
+        self._layer = module(
+            dim_in + 1,
+            dim_out,
+            kernel_size=ksize,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self._hyper = nn.Linear(1, dim_out)
+
+    def forward(self, t, x):
+        return self._layer(x) * torch.sigmoid(self._hyper(t.view(1, 1))).view(1, -1, 1, 1)
+
+
+class ConcatConv2d(nn.Module):
+    def __init__(
+        self,
+        dim_in,
+        dim_out,
+        ksize=3,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        transpose=False,
+    ):
+        super().__init__()
+        module = nn.ConvTranspose2d if transpose else nn.Conv2d
+        self._layer = module(
+            dim_in + 1,
+            dim_out,
+            kernel_size=ksize,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+    def forward(self, t, x):
+        tt = torch.ones_like(x[:, :1, :, :]) * t
+        ttx = torch.cat([tt, x], 1)
+        return self._layer(ttx)
+
+
+class ConcatConv2d_v2(nn.Module):
+    def __init__(
+        self,
+        dim_in,
+        dim_out,
+        ksize=3,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        transpose=False,
+    ):
+        super().__init__()
+        module = nn.ConvTranspose2d if transpose else nn.Conv2d
+        self._layer = module(
+            dim_in,
+            dim_out,
+            kernel_size=ksize,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+        self._hyper_bias = nn.Linear(1, dim_out, bias=False)
+
+    def forward(self, t, x):
+        return self._layer(x) + self._hyper_bias(t.view(1, 1)).view(1, -1, 1, 1)
+
+
+class ConcatSquashConv2d(nn.Module):
+    def __init__(
+        self,
+        dim_in,
+        dim_out,
+        ksize=3,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        transpose=False,
+    ):
+        super().__init__()
+        module = nn.ConvTranspose2d if
transpose else nn.Conv2d + self._layer = module( + dim_in, + dim_out, + kernel_size=ksize, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + self._hyper_gate = nn.Linear(1, dim_out) + self._hyper_bias = nn.Linear(1, dim_out, bias=False) + + def forward(self, t, x): + return self._layer(x) * torch.sigmoid(self._hyper_gate(t.view(1, 1))).view( + 1, -1, 1, 1 + ) + self._hyper_bias(t.view(1, 1)).view(1, -1, 1, 1) + + +class ConcatCoordConv2d(nn.Module): + def __init__( + self, + dim_in, + dim_out, + ksize=3, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + transpose=False, + ): + super().__init__() + module = nn.ConvTranspose2d if transpose else nn.Conv2d + self._layer = module( + dim_in + 3, + dim_out, + kernel_size=ksize, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + + def forward(self, t, x): + b, c, h, w = x.shape + hh = torch.arange(h).to(x).view(1, 1, h, 1).expand(b, 1, h, w) + ww = torch.arange(w).to(x).view(1, 1, 1, w).expand(b, 1, h, w) + tt = t.to(x).view(1, 1, 1, 1).expand(b, 1, h, w) + x_aug = torch.cat([x, tt, hh, ww], 1) + return self._layer(x_aug) + + +class GatedLinear(nn.Module): + def __init__(self, in_features, out_features): + super().__init__() + self.layer_f = nn.Linear(in_features, out_features) + self.layer_g = nn.Linear(in_features, out_features) + + def forward(self, x): + f = self.layer_f(x) + g = torch.sigmoid(self.layer_g(x)) + return f * g + + +class GatedConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, groups=1): + super().__init__() + self.layer_f = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=1, + groups=groups, + ) + self.layer_g = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=1, + groups=groups, + ) + + def forward(self, x): + f = self.layer_f(x) + g = torch.sigmoid(self.layer_g(x)) + return f * g + + +class GatedConvTranspose(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + ): + super().__init__() + self.layer_f = nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + ) + self.layer_g = nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + ) + + def forward(self, x): + f = self.layer_f(x) + g = torch.sigmoid(self.layer_g(x)) + return f * g + + +class BlendLinear(nn.Module): + def __init__(self, dim_in, dim_out, layer_type=nn.Linear, **unused_kwargs): + super().__init__() + self._layer0 = layer_type(dim_in, dim_out) + self._layer1 = layer_type(dim_in, dim_out) + + def forward(self, t, x): + y0 = self._layer0(x) + y1 = self._layer1(x) + return y0 + (y1 - y0) * t + + +class BlendConv2d(nn.Module): + def __init__( + self, + dim_in, + dim_out, + ksize=3, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + transpose=False, + **unused_kwargs, + ): + super().__init__() + module = nn.ConvTranspose2d if transpose else nn.Conv2d + self._layer0 = module( + dim_in, + dim_out, + kernel_size=ksize, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + self._layer1 = module( + dim_in, + dim_out, + kernel_size=ksize, + stride=stride, + padding=padding, + 
dilation=dilation, + groups=groups, + bias=bias, + ) + + def forward(self, t, x): + y0 = self._layer0(x) + y1 = self._layer1(x) + return y0 + (y1 - y0) * t diff --git a/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/container.py b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/container.py new file mode 100644 index 0000000000000000000000000000000000000000..a4393f8bf22872656408bce839a0f62b514ffe9f --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/container.py @@ -0,0 +1,45 @@ +import torch +import torch.nn as nn + +from .wrappers import diffeq_wrapper + + +class SequentialDiffEq(nn.Module): + """A container for a sequential chain of layers. + + Supports both regular and diffeq layers. + """ + + def __init__(self, *layers): + super().__init__() + self.layers = nn.ModuleList([diffeq_wrapper(layer) for layer in layers]) + + def forward(self, t, x): + for layer in self.layers: + x = layer(t, x) + return x + + +class MixtureODELayer(nn.Module): + """Produces a mixture of experts where output = sigma(t) * f(t, x). + + Time-dependent weights sigma(t) help learn to blend the experts without resorting to a highly + stiff f. Supports both regular and diffeq experts. + """ + + def __init__(self, experts): + super().__init__() + assert len(experts) > 1 + wrapped_experts = [diffeq_wrapper(ex) for ex in experts] + self.experts = nn.ModuleList(wrapped_experts) + self.mixture_weights = nn.Linear(1, len(self.experts)) + + def forward(self, t, y): + dys = [] + for f in self.experts: + dys.append(f(t, y)) + dys = torch.stack(dys, 0) + weights = self.mixture_weights(t).view(-1, *([1] * (dys.ndimension() - 1))) + + dy = torch.sum(dys * weights, dim=0, keepdim=False) + return dy diff --git a/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/resnet.py b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..28bf38c8bdc6e5cb98eed88503a49ff8ab829cd3 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/resnet.py @@ -0,0 +1,66 @@ +import torch.nn as nn + +from . 
import basic, container + +NGROUPS = 16 + + +class ResNet(container.SequentialDiffEq): + def __init__(self, dim, intermediate_dim, n_resblocks, conv_block=None): + super().__init__() + + if conv_block is None: + conv_block = basic.ConcatCoordConv2d + + self.dim = dim + self.intermediate_dim = intermediate_dim + self.n_resblocks = n_resblocks + + layers = [] + layers.append(conv_block(dim, intermediate_dim, ksize=3, stride=1, padding=1, bias=False)) + for _ in range(n_resblocks): + layers.append(BasicBlock(intermediate_dim, conv_block)) + layers.append(nn.GroupNorm(NGROUPS, intermediate_dim, eps=1e-4)) + layers.append(nn.ReLU(inplace=True)) + layers.append(conv_block(intermediate_dim, dim, ksize=1, bias=False)) + + super().__init__(*layers) + + def __repr__(self): + return ( + "{name}({dim}, intermediate_dim={intermediate_dim}, n_resblocks={n_resblocks})".format( + name=self.__class__.__name__, **self.__dict__ + ) + ) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, dim, conv_block=None): + super().__init__() + + if conv_block is None: + conv_block = basic.ConcatCoordConv2d + + self.norm1 = nn.GroupNorm(NGROUPS, dim, eps=1e-4) + self.relu1 = nn.ReLU(inplace=True) + self.conv1 = conv_block(dim, dim, ksize=3, stride=1, padding=1, bias=False) + self.norm2 = nn.GroupNorm(NGROUPS, dim, eps=1e-4) + self.relu2 = nn.ReLU(inplace=True) + self.conv2 = conv_block(dim, dim, ksize=3, stride=1, padding=1, bias=False) + + def forward(self, t, x): + residual = x + + out = self.norm1(x) + out = self.relu1(out) + out = self.conv1(t, out) + + out = self.norm2(out) + out = self.relu2(out) + out = self.conv2(t, out) + + out += residual + + return out diff --git a/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/wrappers.py b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..f7fda9daee1c64b09ec470ec2f5ba835a1e5b4b0 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/layers/diffeq_layers/wrappers.py @@ -0,0 +1,49 @@ +from inspect import signature + +import torch.nn as nn + +__all__ = ["diffeq_wrapper", "reshape_wrapper"] + + +class DiffEqWrapper(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + if len(signature(self.module.forward).parameters) == 1: + self.diffeq = lambda t, y: self.module(y) + elif len(signature(self.module.forward).parameters) == 2: + self.diffeq = self.module + else: + raise ValueError("Differential equation needs to either take (t, y) or (y,) as input.") + + def forward(self, t, y): + return self.diffeq(t, y) + + def __repr__(self): + return self.diffeq.__repr__() + + +def diffeq_wrapper(layer): + return DiffEqWrapper(layer) + + +class ReshapeDiffEq(nn.Module): + def __init__(self, input_shape, net): + super().__init__() + assert ( + len(signature(net.forward).parameters) == 2 + ), "use diffeq_wrapper before reshape_wrapper." 
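+        # NOTE: __repr__ below returns self.diffeq.__repr__(), but this class
+        # never sets a diffeq attribute; repr(self.net) is presumably intended.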
+ self.input_shape = input_shape + self.net = net + + def forward(self, t, x): + batchsize = x.shape[0] + x = x.view(batchsize, *self.input_shape) + return self.net(t, x).view(batchsize, -1) + + def __repr__(self): + return self.diffeq.__repr__() + + +def reshape_wrapper(input_shape, layer): + return ReshapeDiffEq(input_shape, layer) diff --git a/conditional-flow-matching/runner/src/models/components/layers/odefunc.py b/conditional-flow-matching/runner/src/models/components/layers/odefunc.py new file mode 100644 index 0000000000000000000000000000000000000000..50b3fc0d4dab1f7ca0282759e3acc282fb314b12 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/layers/odefunc.py @@ -0,0 +1,258 @@ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from . import diffeq_layers +from .squeeze import squeeze, unsqueeze + +__all__ = ["ODEnet", "AutoencoderDiffEqNet"] + + +class Swish(nn.Module): + def __init__(self): + super().__init__() + self.beta = nn.Parameter(torch.tensor(1.0)) + + def forward(self, x): + return x * torch.sigmoid(self.beta * x) + + +class Lambda(nn.Module): + def __init__(self, f): + super().__init__() + self.f = f + + def forward(self, x): + return self.f(x) + + +NONLINEARITIES = { + "tanh": nn.Tanh(), + "relu": nn.ReLU(), + "softplus": nn.Softplus(), + "elu": nn.ELU(), + "swish": Swish(), + "square": Lambda(lambda x: x**2), + "identity": Lambda(lambda x: x), +} + + +class ODEnet(nn.Module): + """Helper class to make neural nets for use in continuous normalizing flows.""" + + def __init__( + self, + hidden_dims, + input_shape, + strides, + conv, + layer_type="concat", + nonlinearity="softplus", + num_squeeze=0, + ): + super().__init__() + self.num_squeeze = num_squeeze + if conv: + assert len(strides) == len(hidden_dims) + 1 + base_layer = { + "ignore": diffeq_layers.IgnoreConv2d, + "hyper": diffeq_layers.HyperConv2d, + "squash": diffeq_layers.SquashConv2d, + "concat": diffeq_layers.ConcatConv2d, + "concat_v2": diffeq_layers.ConcatConv2d_v2, + "concatsquash": diffeq_layers.ConcatSquashConv2d, + "blend": diffeq_layers.BlendConv2d, + "concatcoord": diffeq_layers.ConcatCoordConv2d, + }[layer_type] + else: + strides = [None] * (len(hidden_dims) + 1) + base_layer = { + "ignore": diffeq_layers.IgnoreLinear, + "hyper": diffeq_layers.HyperLinear, + "squash": diffeq_layers.SquashLinear, + "concat": diffeq_layers.ConcatLinear, + "concat_v2": diffeq_layers.ConcatLinear_v2, + "concatsquash": diffeq_layers.ConcatSquashLinear, + "blend": diffeq_layers.BlendLinear, + "concatcoord": diffeq_layers.ConcatLinear, + }[layer_type] + + # build layers and add them + layers = [] + activation_fns = [] + hidden_shape = input_shape + + for dim_out, stride in zip(hidden_dims + (input_shape[0],), strides): + if stride is None: + layer_kwargs = {} + elif stride == 1: + layer_kwargs = { + "ksize": 3, + "stride": 1, + "padding": 1, + "transpose": False, + } + elif stride == 2: + layer_kwargs = { + "ksize": 4, + "stride": 2, + "padding": 1, + "transpose": False, + } + elif stride == -2: + layer_kwargs = { + "ksize": 4, + "stride": 2, + "padding": 1, + "transpose": True, + } + else: + raise ValueError(f"Unsupported stride: {stride}") + + layer = base_layer(hidden_shape[0], dim_out, **layer_kwargs) + layers.append(layer) + activation_fns.append(NONLINEARITIES[nonlinearity]) + + hidden_shape = list(copy.copy(hidden_shape)) + hidden_shape[0] = dim_out + if stride == 2: + hidden_shape[1], hidden_shape[2] = ( + hidden_shape[1] // 2, + hidden_shape[2] // 2, + ) + 
elif stride == -2: + hidden_shape[1], hidden_shape[2] = ( + hidden_shape[1] * 2, + hidden_shape[2] * 2, + ) + + self.layers = nn.ModuleList(layers) + self.activation_fns = nn.ModuleList(activation_fns[:-1]) + + def forward(self, t, y): + dx = y + # squeeze + for _ in range(self.num_squeeze): + dx = squeeze(dx, 2) + for l, layer in enumerate(self.layers): + dx = layer(t, dx) + # if not last layer, use nonlinearity + if l < len(self.layers) - 1: + dx = self.activation_fns[l](dx) + # unsqueeze + for _ in range(self.num_squeeze): + dx = unsqueeze(dx, 2) + return dx + + +class AutoencoderDiffEqNet(nn.Module): + """Helper class to make neural nets for use in continuous normalizing flows.""" + + def __init__( + self, + hidden_dims, + input_shape, + strides, + conv, + layer_type="concat", + nonlinearity="softplus", + ): + super().__init__() + assert layer_type in ("ignore", "hyper", "concat", "concatcoord", "blend") + assert nonlinearity in ("tanh", "relu", "softplus", "elu") + + self.nonlinearity = { + "tanh": F.tanh, + "relu": F.relu, + "softplus": F.softplus, + "elu": F.elu, + }[nonlinearity] + if conv: + assert len(strides) == len(hidden_dims) + 1 + base_layer = { + "ignore": diffeq_layers.IgnoreConv2d, + "hyper": diffeq_layers.HyperConv2d, + "squash": diffeq_layers.SquashConv2d, + "concat": diffeq_layers.ConcatConv2d, + "blend": diffeq_layers.BlendConv2d, + "concatcoord": diffeq_layers.ConcatCoordConv2d, + }[layer_type] + else: + strides = [None] * (len(hidden_dims) + 1) + base_layer = { + "ignore": diffeq_layers.IgnoreLinear, + "hyper": diffeq_layers.HyperLinear, + "squash": diffeq_layers.SquashLinear, + "concat": diffeq_layers.ConcatLinear, + "blend": diffeq_layers.BlendLinear, + "concatcoord": diffeq_layers.ConcatLinear, + }[layer_type] + + # build layers and add them + encoder_layers = [] + decoder_layers = [] + hidden_shape = input_shape + for i, (dim_out, stride) in enumerate(zip(hidden_dims + (input_shape[0],), strides)): + if i <= len(hidden_dims) // 2: + layers = encoder_layers + else: + layers = decoder_layers + + if stride is None: + layer_kwargs = {} + elif stride == 1: + layer_kwargs = { + "ksize": 3, + "stride": 1, + "padding": 1, + "transpose": False, + } + elif stride == 2: + layer_kwargs = { + "ksize": 4, + "stride": 2, + "padding": 1, + "transpose": False, + } + elif stride == -2: + layer_kwargs = { + "ksize": 4, + "stride": 2, + "padding": 1, + "transpose": True, + } + else: + raise ValueError(f"Unsupported stride: {stride}") + + layers.append(base_layer(hidden_shape[0], dim_out, **layer_kwargs)) + + hidden_shape = list(copy.copy(hidden_shape)) + hidden_shape[0] = dim_out + if stride == 2: + hidden_shape[1], hidden_shape[2] = ( + hidden_shape[1] // 2, + hidden_shape[2] // 2, + ) + elif stride == -2: + hidden_shape[1], hidden_shape[2] = ( + hidden_shape[1] * 2, + hidden_shape[2] * 2, + ) + + self.encoder_layers = nn.ModuleList(encoder_layers) + self.decoder_layers = nn.ModuleList(decoder_layers) + + def forward(self, t, y): + h = y + for layer in self.encoder_layers: + h = self.nonlinearity(layer(t, h)) + + dx = h + for i, layer in enumerate(self.decoder_layers): + dx = layer(t, dx) + # if not last layer, use nonlinearity + if i < len(self.decoder_layers) - 1: + dx = self.nonlinearity(dx) + return h, dx diff --git a/conditional-flow-matching/runner/src/models/components/layers/squeeze.py b/conditional-flow-matching/runner/src/models/components/layers/squeeze.py new file mode 100644 index 0000000000000000000000000000000000000000..2fbb900bdaf0cd04d2eb2158f741d1dd9c7fd779 --- 
/dev/null +++ b/conditional-flow-matching/runner/src/models/components/layers/squeeze.py @@ -0,0 +1,66 @@ +import torch.nn as nn + +__all__ = ["SqueezeLayer"] + + +class SqueezeLayer(nn.Module): + def __init__(self, downscale_factor): + super().__init__() + self.downscale_factor = downscale_factor + + def forward(self, x, logpx=None, reverse=False): + if reverse: + return self._upsample(x, logpx) + else: + return self._downsample(x, logpx) + + def _downsample(self, x, logpx=None): + squeeze_x = squeeze(x, self.downscale_factor) + if logpx is None: + return squeeze_x + else: + return squeeze_x, logpx + + def _upsample(self, y, logpy=None): + unsqueeze_y = unsqueeze(y, self.downscale_factor) + if logpy is None: + return unsqueeze_y + else: + return unsqueeze_y, logpy + + +def unsqueeze(input, upscale_factor=2): + """[:, C*r^2, H, W] -> [:, C, H*r, W*r]""" + batch_size, in_channels, in_height, in_width = input.size() + out_channels = in_channels // (upscale_factor**2) + + out_height = in_height * upscale_factor + out_width = in_width * upscale_factor + + input_view = input.contiguous().view( + batch_size, out_channels, upscale_factor, upscale_factor, in_height, in_width + ) + + output = input_view.permute(0, 1, 4, 2, 5, 3).contiguous() + return output.view(batch_size, out_channels, out_height, out_width) + + +def squeeze(input, downscale_factor=2): + """[:, C, H*r, W*r] -> [:, C*r^2, H, W]""" + batch_size, in_channels, in_height, in_width = input.size() + out_channels = in_channels * (downscale_factor**2) + + out_height = in_height // downscale_factor + out_width = in_width // downscale_factor + + input_view = input.contiguous().view( + batch_size, + in_channels, + out_height, + downscale_factor, + out_width, + downscale_factor, + ) + + output = input_view.permute(0, 1, 3, 5, 2, 4).contiguous() + return output.view(batch_size, out_channels, out_height, out_width) diff --git a/conditional-flow-matching/runner/src/models/components/logger.py b/conditional-flow-matching/runner/src/models/components/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..911dc54aca9e8017e20c144241b072a48cc49b79 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/logger.py @@ -0,0 +1,468 @@ +"""Logger copied from OpenAI baselines to avoid extra RL-based dependencies: + +https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/logger.py +""" + +import datetime +import json +import os +import os.path as osp +import sys +import tempfile +import time +import warnings +from collections import defaultdict +from contextlib import contextmanager + +DEBUG = 10 +INFO = 20 +WARN = 30 +ERROR = 40 + +DISABLED = 50 + + +class KVWriter: + def writekvs(self, kvs): + raise NotImplementedError + + +class SeqWriter: + def writeseq(self, seq): + raise NotImplementedError + + +class HumanOutputFormat(KVWriter, SeqWriter): + def __init__(self, filename_or_file): + if isinstance(filename_or_file, str): + self.file = open(filename_or_file, "w") + self.own_file = True + else: + assert hasattr(filename_or_file, "read"), ( + "expected file or str, got %s" % filename_or_file + ) + self.file = filename_or_file + self.own_file = False + + def writekvs(self, kvs): + # Create strings for printing + key2str = {} + for key, val in sorted(kvs.items()): + if hasattr(val, "__float__"): + valstr = "%-8.3g" % val + else: + valstr = str(val) + key2str[self._truncate(key)] = self._truncate(valstr) + + # Find max widths + if len(key2str) == 0: + print("WARNING: tried to write 
empty key-value dict") + return + else: + keywidth = max(map(len, key2str.keys())) + valwidth = max(map(len, key2str.values())) + + # Write out the data + dashes = "-" * (keywidth + valwidth + 7) + lines = [dashes] + for key, val in sorted(key2str.items(), key=lambda kv: kv[0].lower()): + lines.append( + "| %s%s | %s%s |" + % (key, " " * (keywidth - len(key)), val, " " * (valwidth - len(val))) + ) + lines.append(dashes) + self.file.write("\n".join(lines) + "\n") + + # Flush the output to the file + self.file.flush() + + def _truncate(self, s): + maxlen = 30 + return s[: maxlen - 3] + "..." if len(s) > maxlen else s + + def writeseq(self, seq): + seq = list(seq) + for i, elem in enumerate(seq): + self.file.write(elem) + if i < len(seq) - 1: # add space unless this is the last one + self.file.write(" ") + self.file.write("\n") + self.file.flush() + + def close(self): + if self.own_file: + self.file.close() + + +class JSONOutputFormat(KVWriter): + def __init__(self, filename): + self.file = open(filename, "w") + + def writekvs(self, kvs): + for k, v in sorted(kvs.items()): + if hasattr(v, "dtype"): + kvs[k] = float(v) + self.file.write(json.dumps(kvs) + "\n") + self.file.flush() + + def close(self): + self.file.close() + + +class CSVOutputFormat(KVWriter): + def __init__(self, filename): + self.file = open(filename, "w+t") + self.keys = [] + self.sep = "," + + def writekvs(self, kvs): + # Add our current row to the history + extra_keys = list(kvs.keys() - self.keys) + extra_keys.sort() + if extra_keys: + self.keys.extend(extra_keys) + self.file.seek(0) + lines = self.file.readlines() + self.file.seek(0) + for i, k in enumerate(self.keys): + if i > 0: + self.file.write(",") + self.file.write(k) + self.file.write("\n") + for line in lines[1:]: + self.file.write(line[:-1]) + self.file.write(self.sep * len(extra_keys)) + self.file.write("\n") + for i, k in enumerate(self.keys): + if i > 0: + self.file.write(",") + v = kvs.get(k) + if v is not None: + self.file.write(str(v)) + self.file.write("\n") + self.file.flush() + + def close(self): + self.file.close() + + +class TensorBoardOutputFormat(KVWriter): + """Dumps key/value pairs into TensorBoard's numeric format.""" + + def __init__(self, dir): + os.makedirs(dir, exist_ok=True) + self.dir = dir + self.step = 1 + prefix = "events" + path = osp.join(osp.abspath(dir), prefix) + import tensorflow as tf + from tensorflow.core.util import event_pb2 + from tensorflow.python import pywrap_tensorflow + from tensorflow.python.util import compat + + self.tf = tf + self.event_pb2 = event_pb2 + self.pywrap_tensorflow = pywrap_tensorflow + self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) + + def writekvs(self, kvs): + def summary_val(k, v): + kwargs = {"tag": k, "simple_value": float(v)} + return self.tf.Summary.Value(**kwargs) + + summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) + event = self.event_pb2.Event(wall_time=time.time(), summary=summary) + event.step = self.step # is there any reason why you'd want to specify the step? 
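+        # (TensorBoard keys scalar values by global step, so a monotonically
+        # increasing step keeps successive dumps ordered in the UI.)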
+ self.writer.WriteEvent(event) + self.writer.Flush() + self.step += 1 + + def close(self): + if self.writer: + self.writer.Close() + self.writer = None + + +def make_output_format(format, ev_dir, log_suffix=""): + os.makedirs(ev_dir, exist_ok=True) + if format == "stdout": + return HumanOutputFormat(sys.stdout) + elif format == "log": + return HumanOutputFormat(osp.join(ev_dir, "log%s.txt" % log_suffix)) + elif format == "json": + return JSONOutputFormat(osp.join(ev_dir, "progress%s.json" % log_suffix)) + elif format == "csv": + return CSVOutputFormat(osp.join(ev_dir, "progress%s.csv" % log_suffix)) + elif format == "tensorboard": + return TensorBoardOutputFormat(osp.join(ev_dir, "tb%s" % log_suffix)) + else: + raise ValueError(f"Unknown format specified: {format}") + + +# ================================================================ +# API +# ================================================================ + + +def logkv(key, val): + """Log a value of some diagnostic Call this once for each diagnostic quantity, each iteration + If called many times, last value will be used.""" + get_current().logkv(key, val) + + +def logkv_mean(key, val): + """The same as logkv(), but if called many times, values averaged.""" + get_current().logkv_mean(key, val) + + +def logkvs(d): + """Log a dictionary of key-value pairs.""" + for k, v in d.items(): + logkv(k, v) + + +def dumpkvs(): + """Write all of the diagnostics from the current iteration.""" + return get_current().dumpkvs() + + +def getkvs(): + return get_current().name2val + + +def log(*args, level=INFO): + """Write the sequence of args, with no separators, to the console and output files (if you've + configured an output file).""" + get_current().log(*args, level=level) + + +def debug(*args): + log(*args, level=DEBUG) + + +def info(*args): + log(*args, level=INFO) + + +def warn(*args): + log(*args, level=WARN) + + +def error(*args): + log(*args, level=ERROR) + + +def set_level(level): + """Set logging threshold on current logger.""" + get_current().set_level(level) + + +def set_comm(comm): + get_current().set_comm(comm) + + +def get_dir(): + """Get directory that log files are being written to. + + will be None if there is no output directory (i.e., if you didn't call start) + """ + return get_current().get_dir() + + +record_tabular = logkv +dump_tabular = dumpkvs + + +@contextmanager +def profile_kv(scopename): + logkey = "wait_" + scopename + tstart = time.time() + try: + yield + finally: + get_current().name2val[logkey] += time.time() - tstart + + +def profile(n): + """ + Usage: + @profile("my_func") + def my_func(): code + """ + + def decorator_with_name(func): + def func_wrapper(*args, **kwargs): + with profile_kv(n): + return func(*args, **kwargs) + + return func_wrapper + + return decorator_with_name + + +# ================================================================ +# Backend +# ================================================================ + + +def get_current(): + if Logger.CURRENT is None: + _configure_default_logger() + + return Logger.CURRENT + + +class Logger: + DEFAULT = None # A logger with no output files. 
(See right below class definition) + # So that you can still log to the terminal without setting up any output files + CURRENT = None # Current logger being used by the free functions above + + def __init__(self, dir, output_formats, comm=None): + self.name2val = defaultdict(float) # values this iteration + self.name2cnt = defaultdict(int) + self.level = INFO + self.dir = dir + self.output_formats = output_formats + self.comm = comm + + # Logging API, forwarded + # ---------------------------------------- + def logkv(self, key, val): + self.name2val[key] = val + + def logkv_mean(self, key, val): + oldval, cnt = self.name2val[key], self.name2cnt[key] + self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1) + self.name2cnt[key] = cnt + 1 + + def dumpkvs(self): + if self.comm is None: + d = self.name2val + else: + d = mpi_weighted_mean( + self.comm, + {name: (val, self.name2cnt.get(name, 1)) for (name, val) in self.name2val.items()}, + ) + if self.comm.rank != 0: + d["dummy"] = 1 # so we don't get a warning about empty dict + out = d.copy() # Return the dict for unit testing purposes + for fmt in self.output_formats: + if isinstance(fmt, KVWriter): + fmt.writekvs(d) + self.name2val.clear() + self.name2cnt.clear() + return out + + def log(self, *args, level=INFO): + if self.level <= level: + self._do_log(args) + + # Configuration + # ---------------------------------------- + def set_level(self, level): + self.level = level + + def set_comm(self, comm): + self.comm = comm + + def get_dir(self): + return self.dir + + def close(self): + for fmt in self.output_formats: + fmt.close() + + # Misc + # ---------------------------------------- + def _do_log(self, args): + for fmt in self.output_formats: + if isinstance(fmt, SeqWriter): + fmt.writeseq(map(str, args)) + + +def get_rank_without_mpi_import(): + # check environment variables here instead of importing mpi4py + # to avoid calling MPI_Init() when this module is imported + for varname in ["PMI_RANK", "OMPI_COMM_WORLD_RANK"]: + if varname in os.environ: + return int(os.environ[varname]) + return 0 + + +def mpi_weighted_mean(comm, local_name2valcount): + """ + Copied from: https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/mpi_util.py#L110 + Perform a weighted average over dicts that are each on a different node + Input: local_name2valcount: dict mapping key -> (value, count) + Returns: key -> mean + """ + all_name2valcount = comm.gather(local_name2valcount) + if comm.rank == 0: + name2sum = defaultdict(float) + name2count = defaultdict(float) + for n2vc in all_name2valcount: + for name, (val, count) in n2vc.items(): + try: + val = float(val) + except ValueError: + if comm.rank == 0: + warnings.warn(f"WARNING: tried to compute mean on non-float {name}={val}") + else: + name2sum[name] += val * count + name2count[name] += count + return {name: name2sum[name] / name2count[name] for name in name2sum} + else: + return {} + + +def configure(dir=None, format_strs=None, comm=None, log_suffix=""): + """If comm is provided, average all numerical stats across that comm.""" + if dir is None: + dir = os.getenv("OPENAI_LOGDIR") + if dir is None: + dir = osp.join( + tempfile.gettempdir(), + datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"), + ) + assert isinstance(dir, str) + dir = os.path.expanduser(dir) + os.makedirs(os.path.expanduser(dir), exist_ok=True) + + rank = get_rank_without_mpi_import() + if rank > 0: + log_suffix = log_suffix + "-rank%03i" % rank + + if format_strs is None: + if 
rank == 0: + format_strs = os.getenv("OPENAI_LOG_FORMAT", "stdout,log,csv").split(",") + else: + format_strs = os.getenv("OPENAI_LOG_FORMAT_MPI", "log").split(",") + format_strs = filter(None, format_strs) + output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] + + Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm) + if output_formats: + log("Logging to %s" % dir) + + +def _configure_default_logger(): + configure() + Logger.DEFAULT = Logger.CURRENT + + +def reset(): + if Logger.CURRENT is not Logger.DEFAULT: + Logger.CURRENT.close() + Logger.CURRENT = Logger.DEFAULT + log("Reset logger") + + +@contextmanager +def scoped_configure(dir=None, format_strs=None, comm=None): + prevlogger = Logger.CURRENT + configure(dir=dir, format_strs=format_strs, comm=comm) + try: + yield + finally: + Logger.CURRENT.close() + Logger.CURRENT = prevlogger diff --git a/conditional-flow-matching/runner/src/models/components/mlpode.py b/conditional-flow-matching/runner/src/models/components/mlpode.py new file mode 100644 index 0000000000000000000000000000000000000000..8dd5f523b0994dffd8420af65c9f07f4acbd2bde --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/mlpode.py @@ -0,0 +1,612 @@ +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from .base import ( + BayesLinear, + DeepEnsDibsLayer, + DibsLayer, + Intervenable, + LocallyConnected, +) +from .hyper_nets import HyperLocallyConnected + + +class DeepEnsEmbedMLPODEF(Intervenable): + pass + + +class MLPODEF(Intervenable): + """Define an MLP ODE function according to Neural Graphical Models definition.""" + + def __init__(self, dims, GL_reg=0.01, bias=True, time_invariant=True): + # dims: [number of variables, dimension hidden layers, output dim=1] + super().__init__() + assert len(dims) >= 2 + assert dims[-1] == 1 + + self.dims = dims + self.time_invariant = time_invariant + self.GL_reg = GL_reg # adaptive lasso parameter + + self.fc1 = nn.Linear(dims[0], dims[0] * dims[1], bias=bias) + """Old way of implementing time_invariant. 
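+
+        (The current code instead always builds fc1 on dims[0] inputs and, when
+        time_invariant is False, concatenates t onto the hidden features before
+        fc2; see forward() and the widened fc2 layers below.)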
+ + if time_invariant: + self.fc1 = nn.Linear(dims[0], dims[0] * dims[1], bias=bias) + else: + self.fc1 = nn.Linear(dims[0] + 1, dims[0] * dims[1], bias=bias) + """ + + # fc2: local linear layers + layers = [] + for i in range(len(dims) - 2): + layers.append( + LocallyConnected( + dims[0], + dims[i + 1] + (0 if self.time_invariant else 1), + dims[i + 2], + bias=bias, + ) + ) + self.fc2 = nn.ModuleList(layers) + self.elu = nn.ELU(inplace=True) + + def forward(self, t, x): # [n, 1, d] -> [n, 1, d] + x = self.fc1(x) + x = x.view(-1, self.dims[0], self.dims[1]) # [n, d, m1] + if not self.time_invariant: + x = torch.cat((x, t.repeat(*x.shape[:-1], 1)), dim=-1) + for fc in self.fc2: + x = fc(self.elu(x)) # [n, d, m2] + x = x.squeeze(dim=2) # [n, d] + x = x.unsqueeze(dim=1) # [n, 1, d] + return x + + def l2_reg(self): + """L2 regularization on all parameters.""" + reg = 0.0 + fc1_weight = self.fc1.weight # [j * m1, i], m1 = number of hidden nodes + reg += torch.sum(fc1_weight**2) + for fc in self.fc2: + reg += torch.sum(fc.weight**2) + return reg + + def l1_reg(self): + """L1 regularization on input layer parameters.""" + return torch.sum(torch.abs(self.fc1.weight)) + + def grn_reg(self, grn): + """ + Args: + grn: torch.tensor (d x d) 1 if likely edge 0 if not + """ + fc1_weight = self.fc1.weight # d * m1, d + d = fc1_weight.shape[-1] + fc1_weight = fc1_weight.reshape(d, -1, d) + fc1_weight = fc1_weight.transpose(0, 1) # m1, d, d + return torch.sum(torch.abs(fc1_weight * (1 - grn))) + + def group_weights(self, gamma=0.5): + """Group lasso weights.""" + fc1_weight = self.fc1.weight.view(self.dims[0], -1, self.dims[0]) # [j, m1, i] + weights = torch.sum(fc1_weight**2, dim=1).pow(gamma).data # [i, j] + return weights + + def get_structure(self): + """Score each edge based on the the weight sum.""" + d = self.dims[0] + fc1_weight = self.fc1.weight # [j * m1, i] + fc1_weight = fc1_weight.view(d, -1, d) # [j, m1, i] + W = torch.sum(fc1_weight**2, dim=1).pow(0.5) # [i, j] + W = W.cpu().detach().numpy() # [i, j] + return W + + def reset_parameters(self): + self.fc1.reset_parameters() + for fc in self.fc2: + fc.reset_parameters() + + +class BayesMLPODEF(Intervenable): + """Define an Bayes-MLP ODE (via SVI) function according to Neural Graphical Models + definition.""" + + def __init__( + self, + dims, + GL_reg=0.01, + init_log_var=-5, + dibs=False, + k_hidden=1, + alpha=0.1, + beta=0.5, + bias=True, + time_invariant=True, + sparse=False, + ): + # dims: [number of variables, dimension hidden layers, output dim=1] + super().__init__() + assert len(dims) >= 2 + assert dims[-1] == 1 + + self.dims = dims + self.time_invariant = time_invariant + self.GL_reg = GL_reg # adaptive lasso parameter + self.dibs = dibs + + if time_invariant: + if self.dibs: + assert k_hidden <= dims[0], "U,V dimension k larger than amount of nodes!" + self.fc1 = DibsLayer( + dims[0], + dims[0] * dims[1], + k_hidden=k_hidden, + init_log_var=init_log_var, + alpha=alpha, + beta=beta, + bias=bias, + ) + else: + self.fc1 = BayesLinear( + dims[0], + dims[0] * dims[1], + init_log_var=init_log_var, + bias=bias, + sparse=sparse, + ) + else: + if self.dibs: + assert k_hidden <= dims[0], "U,V dimension k larger than amount of nodes!" 
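+                # Time-varying case: the extra (+1) input feature carries the time t,
+                # which forward() concatenates onto x before the first layer.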
+ self.fc1 = DibsLayer( + dims[0] + 1, + dims[0] * dims[1], + k_hidden=k_hidden, + init_log_var=init_log_var, + alpha=alpha, + beta=beta, + bias=bias, + ) + else: + self.fc1 = BayesLinear( + dims[0] + 1, + dims[0] * dims[1], + init_log_var=init_log_var, + bias=bias, + sparse=sparse, + ) + + # fc2: local linear layers + layers = [] + for i in range(len(dims) - 2): + layers.append(LocallyConnected(dims[0], dims[i + 1], dims[i + 2], bias=bias)) + self.fc2 = nn.ModuleList(layers) + self.elu = nn.ELU(inplace=True) + + def forward(self, t, x): # [n, 1, d] -> [n, 1, d] + if not self.time_invariant: + x = torch.cat((x, t), dim=-1) + + x = self.fc1(x) + x = x.view(-1, self.dims[0], self.dims[1]) # [n, d, m1] + for fc in self.fc2: + x = fc(self.elu(x)) # [n, d, m2] + x = x.squeeze(dim=2) # [n, d] + x = x.unsqueeze(dim=1) # [n, 1, d] + return x + + def l2_reg(self): + """L2 regularization on all parameters.""" + reg = 0.0 + fc1_weight = self.fc1.get_structure( + self.alpha_t + ) # [j * m1, i], m1 = number of hidden nodes + reg += torch.sum(fc1_weight**2) + for fc in self.fc2: + reg += torch.sum(fc.weight**2) + return reg + + def l1_reg(self): + """L1 regularization on input layer parameters.""" + return torch.sum(torch.abs(self.fc1.weight)) + + def compute_kl_term(self, net, prior_log_sigma, t=1): + kl = 0.0 + for module in net.children(): + if self.dibs: + if isinstance(module, DibsLayer): + kl += module.kl_with_prior(prior_log_sigma, t) + else: + if isinstance(module, BayesLinear): + kl += module.kl_with_prior(prior_log_sigma, t) + return kl + + def kl_reg(self, net, prior_log_sigma, t=1): + return self.compute_kl_term(net, prior_log_sigma, t) + + def group_weights(self, gamma=0.5): + """Group lasso weights.""" + fc1_weight = self.fc1.weight.view(self.dims[0], -1, self.dims[0]) # [j, m1, i] + weights = torch.sum(fc1_weight**2, dim=1).pow(gamma).data # [i, j] + return weights + + def get_structure(self, t=1, test_mode: bool = False): + """Score each edge based on the the weight sum.""" + if self.dibs: + d = self.dims[0] + W_list = [] + for _ in range(1000): + W_tmp = self.fc1.get_graph(d, t, get_structure_flag=True) + W_tmp = W_tmp.cpu().detach().numpy() # [i, j] + W_list.append(W_tmp) + W = np.mean(np.array(W_list), axis=0) + if test_mode: + W_std = np.std(np.array(W_list), axis=0) + + else: + d = self.dims[0] + W_list = [] + for _ in range(1000): + fc1_weight, _ = self.fc1.sample() # [j * m1, i] + fc1_weight = fc1_weight.view(d, -1, d) # [j, m1, i] + W_tmp = fc1_weight.cpu().detach().numpy() # [i, j] + W_list.append(W_tmp) + W = np.mean(np.array(W_list), axis=0) + W = np.sum(W**2, axis=1) ** (0.5) # [i, j] + if test_mode: + W_std = np.std(np.array(W_list), axis=0) + W_std = np.sum(W_std**2, axis=1) ** (0.5) # [i, j] + # fc1_weight = self.fc1.weight # [j * m1, i] + # fc1_weight = fc1_weight.view(d, -1, d) # [j, m1, i] + # W = torch.sum(fc1_weight**2, dim=1).pow(0.5) # [i, j] + # W = W.cpu().detach().numpy() # [i, j] + if test_mode: + return W, W_std + else: + return W + + def get_structures(self, n_structures: int): + d = self.dims[0] + w_samples = self.fc1.sample_weights(n_structures) + w_samples = w_samples.view(n_structures, d, -1, d) # [n, j, m1, i] + W = torch.sum(w_samples**2, dim=2).pow(0.5) # [n, i, j] + W = W.cpu().detach().numpy() # [n, i, j] + return W + + def reset_parameters(self): + self.fc1.reset_parameters() + for fc in self.fc2: + fc.reset_parameters() + + +class DeepEnsMLPODEF(Intervenable): + """Define an DeepEns-MLP ODE function according to Neural Graphical Models 
definition.""" + + def __init__( + self, + dims, + n_ens=25, + GL_reg=0.01, + dibs=False, + k_hidden=1, + alpha=0.1, + dropout_flag=False, + bias=True, + time_invariant=True, + ): + # dims: [number of variables, dimension hidden layers, output dim=1] + super().__init__() + assert len(dims) >= 2 + assert dims[-1] == 1 + + self.dims = dims + self.n_ens = n_ens + self.time_invariant = time_invariant + self.GL_reg = GL_reg # adaptive lasso parameter + self.dibs = dibs + self.alpha = alpha + + self.fc1_modules = [] + if self.dibs: + for m in range(self.n_ens): + if time_invariant: + self.fc1_modules.append( + DeepEnsDibsLayer( + dims[0], + dims[0] * dims[1], + k_hidden=k_hidden, + dropout_flag=True, + bias=bias, + ) + ) + else: + self.fc1_modules.append( + DeepEnsDibsLayer( + dims[0] + 1, + dims[0] * dims[1], + k_hidden=k_hidden, + dropout_flag=True, + bias=bias, + ) + ) + self.fc1 = nn.ModuleList(self.fc1_modules) + else: + for m in range(self.n_ens): + if time_invariant: + self.fc1_modules.append(nn.Linear(dims[0], dims[0] * dims[1], bias=bias)) + else: + self.fc1_modules.append(nn.Linear(dims[0] + 1, dims[0] * dims[1], bias=bias)) + self.fc1 = nn.ModuleList(self.fc1_modules) + + # if time_invariant: + # self.fc1 = nn.Linear(dims[0], dims[0] * dims[1], bias=bias) + # else: + # self.fc1 = nn.Linear(dims[0] + 1, dims[0] * dims[1], bias=bias) + + # fc2: local linear layers + self.fc2_modules, self.elu_modules = nn.ModuleList(), nn.ModuleList() + for m in range(self.n_ens): + layers = [] + for i in range(len(dims) - 2): + layers.append(LocallyConnected(dims[0], dims[i + 1], dims[i + 2], bias=bias)) + # self.fc2 = nn.ModuleList(layers) + self.fc2_modules.append(nn.ModuleList(layers)) + self.elu = nn.ELU(inplace=True) + + def update_p(self): + for fc in self.fc1: + fc.update_p() + + def set_sample_flag(self): + for fc in self.fc1: + fc.sample_once_flag = True + + def forward(self, t, x): # [n, 1, d] -> [n, 1, d] + if not self.time_invariant: + x = torch.cat((x, t), dim=-1) + + x_tmp = [] + for fc in self.fc1: + x_tmp.append(fc(x)[0]) + x = torch.stack(x_tmp) + # x = x.mean(dim=0) + # x = self.fc1(x) + + x = x.view(self.n_ens, -1, self.dims[0], self.dims[1]) # [n_ens, n, d, m1] + # TODO broken for > 1 layer + x_out = [] + m = 0 + for fc2 in self.fc2_modules: + for fc in fc2: + x_out.append(fc(F.elu(x[m]))) # [n, d, m2] + m += 1 + x = torch.stack(x_out) + x = x.squeeze(dim=3) # [n_ens, n, d] + x = x.unsqueeze(dim=2) # [n_ens, n, 1, d] + return x + + def l2_reg(self): + """L2 regularization on all parameters.""" + reg = 0.0 + fc1_weight = self.fc1.weight # [j * m1, i], m1 = number of hidden nodes + reg += torch.sum(fc1_weight**2) + for fc in self.fc2: + reg += torch.sum(fc.weight**2) + return reg + + def l1_reg(self): + """L1 regularization on input layer parameters.""" + return torch.sum(torch.abs(self.fc1.weight)) + + def DeepEns_prior(self, net, prior_var): + reg = 0.0 + with torch.no_grad(): + for fc in net.fc1: + for p in fc.parameters(): + reg += torch.norm(p) ** 2 / (2 * prior_var) + return reg + + def group_weights(self, gamma=0.5): + """Group lasso weights.""" + fc1_weight = self.fc1.weight.view(self.dims[0], -1, self.dims[0]) # [j, m1, i] + weights = torch.sum(fc1_weight**2, dim=1).pow(gamma).data # [i, j] + return weights + + def get_structure(self, t=1, test_mode: bool = False): + """Score each edge based on the the weight sum.""" + if self.dibs: + d = self.dims[0] + G_list = [] + for fc in self.fc1: + fc1_weight = torch.matmul(fc.w.t(), fc.v).t() + fc1_weight = fc1_weight.view(d, -1, d) # 
[j, m1, i]
+                # Z = torch.sum(fc1_weight**2, dim=1).pow(0.5)  # [i, j]
+                Z = torch.sum(torch.abs(fc1_weight), dim=1)  # [i, j]
+                Z = Z - torch.mean(Z)  # [i, j]
+                self.alpha_t = self.alpha * t
+                G_list.append(torch.sigmoid(self.alpha_t * Z))
+            G = torch.stack(G_list)
+            # G_mean = P_G.mean(dim=0)
+            # G_std = P_G.std(dim=0).detach().numpy()
+            # G_mean = G_mean.cpu().detach().numpy()
+            # G = np.heaviside(G_mean - 0.5, 0.5)
+            if test_mode:
+                G = G.cpu().detach().numpy()
+                return G  # , G_mean, G_std
+            else:
+                return G
+        else:
+            d = self.dims[0]
+            fc1_weight_list = []
+            for fc in self.fc1:
+                fc1_weight_list.append(fc.weight)  # [j * m1, i]
+            fc1_weight = torch.stack(fc1_weight_list)
+            fc1_weight = fc1_weight.mean(dim=0)
+            # fc1_std = fc1_weight.std(dim=0)
+            fc1_weight = fc1_weight.view(d, -1, d)  # [j, m1, i]
+            W = torch.sum(fc1_weight**2, dim=1).pow(0.5)  # [i, j]
+            W = W.cpu().detach().numpy()  # [i, j]
+            return W
+
+    def reset_parameters(self):
+        for fc in self.fc1:
+            fc.reset_parameters()
+        # The fc2 layers live in fc2_modules (one ModuleList per ensemble member).
+        for fc2 in self.fc2_modules:
+            for fc in fc2:
+                fc.reset_parameters()
+
+
+class EnsembleLayer(nn.Module):
+    def __init__(self, n_ens, layer, *args, **kwargs):
+        super().__init__()
+        self.n_ens = n_ens
+        self.ensemble = nn.ModuleList([layer(*args, **kwargs) for _ in range(n_ens)])
+
+
+class DibsEnsembleLayer(EnsembleLayer):
+    def __init__(self, n_ens, alpha, layer, *args, **kwargs):
+        super().__init__(n_ens, layer, *args, **kwargs)
+        self.alpha = alpha
+
+    def forward(self, x):
+        x_tmp = []
+        gs = []
+        for fc in self.ensemble:
+            x_i, g_i = fc(x, alpha=self.alpha)
+            x_tmp.append(x_i)
+            gs.append(g_i)
+        x = torch.stack(x_tmp)
+        G = torch.stack(gs)
+        return x, G
+
+    def update_p(self):
+        for fc in self.ensemble:
+            fc.update_p()
+
+    def set_sample_flag(self):
+        for fc in self.ensemble:
+            fc.sample_once_flag = True
+
+    def get_structures(self):
+        return torch.stack([fc.get_structure(self.alpha) for fc in self.ensemble])
+
+
+class DeepEnsHyperMLPODEF(Intervenable):
+    """Define a DeepEns-MLP ODE function that acquires a graph structure G and uses G in a linear
+    pipeline via a hyper-net architecture.
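+    fc1 samples an ensemble of candidate graphs G; the HyperLocallyConnected
+    layers in fc2 then generate their weights as a function of G.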
+ + - goals: P(params | G)P(G) + """ + + def __init__( + self, + dims, + n_ens=25, + GL_reg=0.01, + dibs=False, + k_hidden=1, + alpha=0.1, + dropout_flag=False, + hyper=None, + bias=True, + time_invariant=True, + ): + # dims: [number of variables, dimension hidden layers, output dim=1] + super().__init__() + assert len(dims) >= 2 + assert dims[-1] == 1 + + self.dims = dims + self.n_ens = n_ens + self.time_invariant = time_invariant + self.GL_reg = GL_reg # adaptive lasso parameter + self.dibs = dibs + self.alpha = alpha + self.current_epoch = 0 + fc1_input_dim = dims[0] if time_invariant else dims[0] + 1 + + if self.dibs: + self.fc1 = DibsEnsembleLayer( + n_ens, + self.alpha, + DeepEnsDibsLayer, + fc1_input_dim, + dims[0], + k_hidden=k_hidden, + dropout_flag=dropout_flag, + bias=bias, + ) + else: + self.fc1 = DibsEnsembleLayer( + n_ens, + self.alpha, + nn.Linear, + fc1_input_dim, + dims[0], + bias=bias, + ) + + # fc2: let params be function of G ~ A + layers = [] + for i in range(len(dims) - 1): + layers.append( + HyperLocallyConnected( + dims[0], # num_linear + dims[i], # input_features + dims[i + 1], # output_features + n_ens=n_ens, + hyper=hyper, + bias=bias, + ) + ) + self.fc2 = nn.ModuleList(layers) + + def update_epoch(self, epoch): + self.current_epoch = epoch + + def set_sample_flag(self): + self.fc1.set_sample_flag() + + def update_p(self): + self.fc1.update_p() + + def forward(self, t, x): # [n, 1, d] -> [n, 1, d] + if not self.time_invariant: + x = torch.cat((x, t), dim=-1) + self.fc1.alpha = self.alpha * (self.current_epoch + 1) + x, G = self.fc1(x) + x = x.unsqueeze(dim=2) # [n_ens, n, 1, d] + for fc in self.fc2: + x = fc(F.elu(x), G) # [n_ens, n, d, mi] + # x.shape [n_ens, n, d, 1] + return x.transpose(-1, -2) # x.shape [n_ens, n, 1, d] + + def l2_reg(self): + """L2 regularization on all parameters.""" + return torch.sum(self.fc1.get_structures() ** 2) + + def l1_reg(self): + """L1 regularization on input layer parameters.""" + return torch.sum(torch.abs(self.fc1.get_structures())) + + def DeepEns_prior(self, net, prior_var): + reg = 0.0 + with torch.no_grad(): + for p in net.fc1.parameters(): + reg += torch.norm(p) ** 2 / (2 * prior_var) + return reg + + def group_weights(self, gamma=0.5): + """Group lasso weights.""" + Gs = self.fc1.get_structures() + weights = torch.sum(Gs**2, dim=0).pow(gamma).data # [i, j] + return weights + + def get_structure(self, t=1, test_mode: bool = False): + """Score each edge based on the the weight sum.""" + G = self.fc1.get_structures() + if test_mode: + return G.cpu().detach().numpy() + return G diff --git a/conditional-flow-matching/runner/src/models/components/mmd.py b/conditional-flow-matching/runner/src/models/components/mmd.py new file mode 100644 index 0000000000000000000000000000000000000000..c05a53926f60586cc9040cf7d61df76a9ee07e0f --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/mmd.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python + + +import torch + +min_var_est = 1e-8 + + +# Consider linear time MMD with a linear kernel: +# K(f(x), f(y)) = f(x)^Tf(y) +# h(z_i, z_j) = k(x_i, x_j) + k(y_i, y_j) - k(x_i, y_j) - k(x_j, y_i) +# = [f(x_i) - f(y_i)]^T[f(x_j) - f(y_j)] +# +# f_of_X: batch_size * k +# f_of_Y: batch_size * k +def linear_mmd2(f_of_X, f_of_Y): + loss = 0.0 + delta = f_of_X - f_of_Y + loss = torch.mean((delta[:-1] * delta[1:]).sum(1)) + return loss + + +# Consider linear time MMD with a polynomial kernel: +# K(f(x), f(y)) = (alpha*f(x)^Tf(y) + c)^d +# f_of_X: batch_size * k +# f_of_Y: batch_size * k +def 
poly_mmd2(f_of_X, f_of_Y, d=2, alpha=1.0, c=2.0): + K_XX = alpha * (f_of_X[:-1] * f_of_X[1:]).sum(1) + c + K_XX_mean = torch.mean(K_XX.pow(d)) + + K_YY = alpha * (f_of_Y[:-1] * f_of_Y[1:]).sum(1) + c + K_YY_mean = torch.mean(K_YY.pow(d)) + + K_XY = alpha * (f_of_X[:-1] * f_of_Y[1:]).sum(1) + c + K_XY_mean = torch.mean(K_XY.pow(d)) + + K_YX = alpha * (f_of_Y[:-1] * f_of_X[1:]).sum(1) + c + K_YX_mean = torch.mean(K_YX.pow(d)) + + return K_XX_mean + K_YY_mean - K_XY_mean - K_YX_mean + + +def _mix_rbf_kernel(X, Y, sigma_list): + assert X.size(0) == Y.size(0) + m = X.size(0) + + Z = torch.cat((X, Y), 0) + ZZT = torch.mm(Z, Z.t()) + diag_ZZT = torch.diag(ZZT).unsqueeze(1) + Z_norm_sqr = diag_ZZT.expand_as(ZZT) + exponent = Z_norm_sqr - 2 * ZZT + Z_norm_sqr.t() + + K = 0.0 + for sigma in sigma_list: + gamma = 1.0 / (2 * sigma**2) + K += torch.exp(-gamma * exponent) + + return K[:m, :m], K[:m, m:], K[m:, m:], len(sigma_list) + + +def mix_rbf_mmd2(X, Y, sigma_list, biased=True): + K_XX, K_XY, K_YY, d = _mix_rbf_kernel(X, Y, sigma_list) + # return _mmd2(K_XX, K_XY, K_YY, const_diagonal=d, biased=biased) + return _mmd2(K_XX, K_XY, K_YY, const_diagonal=False, biased=biased) + + +def mix_rbf_mmd2_and_ratio(X, Y, sigma_list, biased=True): + K_XX, K_XY, K_YY, d = _mix_rbf_kernel(X, Y, sigma_list) + # return _mmd2_and_ratio(K_XX, K_XY, K_YY, const_diagonal=d, biased=biased) + return _mmd2_and_ratio(K_XX, K_XY, K_YY, const_diagonal=False, biased=biased) + + +################################################################################ +# Helper functions to compute variances based on kernel matrices +################################################################################ + + +def _mmd2(K_XX, K_XY, K_YY, const_diagonal=False, biased=False): + m = K_XX.size(0) # assume X, Y are same shape + + # Get the various sums of kernels that we'll use + # Kts drop the diagonal, but we don't need to compute them explicitly + if const_diagonal is not False: + diag_X = diag_Y = const_diagonal + sum_diag_X = sum_diag_Y = m * const_diagonal + else: + diag_X = torch.diag(K_XX) # (m,) + diag_Y = torch.diag(K_YY) # (m,) + sum_diag_X = torch.sum(diag_X) + sum_diag_Y = torch.sum(diag_Y) + + Kt_XX_sums = K_XX.sum(dim=1) - diag_X # \tilde{K}_XX * e = K_XX * e - diag_X + Kt_YY_sums = K_YY.sum(dim=1) - diag_Y # \tilde{K}_YY * e = K_YY * e - diag_Y + K_XY_sums_0 = K_XY.sum(dim=0) # K_{XY}^T * e + + Kt_XX_sum = Kt_XX_sums.sum() # e^T * \tilde{K}_XX * e + Kt_YY_sum = Kt_YY_sums.sum() # e^T * \tilde{K}_YY * e + K_XY_sum = K_XY_sums_0.sum() # e^T * K_{XY} * e + + if biased: + mmd2 = ( + (Kt_XX_sum + sum_diag_X) / (m * m) + + (Kt_YY_sum + sum_diag_Y) / (m * m) + - 2.0 * K_XY_sum / (m * m) + ) + else: + mmd2 = Kt_XX_sum / (m * (m - 1)) + Kt_YY_sum / (m * (m - 1)) - 2.0 * K_XY_sum / (m * m) + + return mmd2 + + +def _mmd2_and_ratio(K_XX, K_XY, K_YY, const_diagonal=False, biased=False): + mmd2, var_est = _mmd2_and_variance( + K_XX, K_XY, K_YY, const_diagonal=const_diagonal, biased=biased + ) + loss = mmd2 / torch.sqrt(torch.clamp(var_est, min=min_var_est)) + return loss, mmd2, var_est + + +def _mmd2_and_variance(K_XX, K_XY, K_YY, const_diagonal=False, biased=False): + m = K_XX.size(0) # assume X, Y are same shape + + # Get the various sums of kernels that we'll use + # Kts drop the diagonal, but we don't need to compute them explicitly + if const_diagonal is not False: + diag_X = diag_Y = const_diagonal + sum_diag_X = sum_diag_Y = m * const_diagonal + sum_diag2_X = sum_diag2_Y = m * const_diagonal**2 + else: + diag_X = 
torch.diag(K_XX) # (m,) + diag_Y = torch.diag(K_YY) # (m,) + sum_diag_X = torch.sum(diag_X) + sum_diag_Y = torch.sum(diag_Y) + sum_diag2_X = diag_X.dot(diag_X) + sum_diag2_Y = diag_Y.dot(diag_Y) + + Kt_XX_sums = K_XX.sum(dim=1) - diag_X # \tilde{K}_XX * e = K_XX * e - diag_X + Kt_YY_sums = K_YY.sum(dim=1) - diag_Y # \tilde{K}_YY * e = K_YY * e - diag_Y + K_XY_sums_0 = K_XY.sum(dim=0) # K_{XY}^T * e + K_XY_sums_1 = K_XY.sum(dim=1) # K_{XY} * e + + Kt_XX_sum = Kt_XX_sums.sum() # e^T * \tilde{K}_XX * e + Kt_YY_sum = Kt_YY_sums.sum() # e^T * \tilde{K}_YY * e + K_XY_sum = K_XY_sums_0.sum() # e^T * K_{XY} * e + + Kt_XX_2_sum = (K_XX**2).sum() - sum_diag2_X # \| \tilde{K}_XX \|_F^2 + Kt_YY_2_sum = (K_YY**2).sum() - sum_diag2_Y # \| \tilde{K}_YY \|_F^2 + K_XY_2_sum = (K_XY**2).sum() # \| K_{XY} \|_F^2 + + if biased: + mmd2 = ( + (Kt_XX_sum + sum_diag_X) / (m * m) + + (Kt_YY_sum + sum_diag_Y) / (m * m) + - 2.0 * K_XY_sum / (m * m) + ) + else: + mmd2 = Kt_XX_sum / (m * (m - 1)) + Kt_YY_sum / (m * (m - 1)) - 2.0 * K_XY_sum / (m * m) + + var_est = ( + 2.0 + / (m**2 * (m - 1.0) ** 2) + * ( + 2 * Kt_XX_sums.dot(Kt_XX_sums) + - Kt_XX_2_sum + + 2 * Kt_YY_sums.dot(Kt_YY_sums) + - Kt_YY_2_sum + ) + - (4.0 * m - 6.0) / (m**3 * (m - 1.0) ** 3) * (Kt_XX_sum**2 + Kt_YY_sum**2) + + 4.0 + * (m - 2.0) + / (m**3 * (m - 1.0) ** 2) + * (K_XY_sums_1.dot(K_XY_sums_1) + K_XY_sums_0.dot(K_XY_sums_0)) + - 4.0 * (m - 3.0) / (m**3 * (m - 1.0) ** 2) * (K_XY_2_sum) + - (8 * m - 12) / (m**5 * (m - 1)) * K_XY_sum**2 + + 8.0 + / (m**3 * (m - 1.0)) + * ( + 1.0 / m * (Kt_XX_sum + Kt_YY_sum) * K_XY_sum + - Kt_XX_sums.dot(K_XY_sums_1) + - Kt_YY_sums.dot(K_XY_sums_0) + ) + ) + return mmd2, var_est diff --git a/conditional-flow-matching/runner/src/models/components/nn.py b/conditional-flow-matching/runner/src/models/components/nn.py new file mode 100644 index 0000000000000000000000000000000000000000..62d99802ae714b77d1fcf0b91566a82e8ba430b1 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/nn.py @@ -0,0 +1,153 @@ +"""Various utilities for neural networks.""" + +import math + +import torch as th +import torch.nn as nn + + +# PyTorch 1.7 has SiLU, but we support PyTorch 1.5. +class SiLU(nn.Module): + def forward(self, x): + return x * th.sigmoid(x) + + +class GroupNorm32(nn.GroupNorm): + def forward(self, x): + return super().forward(x.float()).type(x.dtype) + + +def conv_nd(dims, *args, **kwargs): + """Create a 1D, 2D, or 3D convolution module.""" + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def linear(*args, **kwargs): + """Create a linear module.""" + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """Create a 1D, 2D, or 3D average pooling module.""" + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def update_ema(target_params, source_params, rate=0.99): + """Update target parameters to be closer to those of source parameters using an exponential + moving average. + + :param target_params: the target parameter sequence. + :param source_params: the source parameter sequence. + :param rate: the EMA rate (closer to 1 means slower). 
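+    Example (illustrative; `ema_model` and `model` are assumed nn.Modules):
+        update_ema(ema_model.parameters(), model.parameters(), rate=0.9999)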
+ """ + for targ, src in zip(target_params, source_params): + targ.detach().mul_(rate).add_(src, alpha=1 - rate) + + +def zero_module(module): + """Zero out the parameters of a module and return it.""" + for p in module.parameters(): + p.detach().zero_() + return module + + +def scale_module(module, scale): + """Scale the parameters of a module and return it.""" + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def mean_flat(tensor): + """Take the mean over all non-batch dimensions.""" + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def normalization(channels): + """Make a standard normalization layer. + + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + return GroupNorm32(32, channels) + + +def timestep_embedding(timesteps, dim, max_period=10000): + """Create sinusoidal timestep embeddings. + + :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + half = dim // 2 + freqs = th.exp( + -math.log(max_period) + * th.arange(start=0, end=half, dtype=th.float32, device=timesteps.device) + / half + ) + args = timesteps[:, None].float() * freqs[None] + embedding = th.cat([th.cos(args), th.sin(args)], dim=-1) + if dim % 2: + embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + +def checkpoint(func, inputs, params, flag): + """Evaluate a function without caching intermediate activations, allowing for reduced memory at + the expense of extra compute in the backward pass. + + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not explicitly take as + arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(th.autograd.Function): + @staticmethod + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_tensors = list(args[:length]) + ctx.input_params = list(args[length:]) + with th.no_grad(): + output_tensors = ctx.run_function(*ctx.input_tensors) + return output_tensors + + @staticmethod + def backward(ctx, *output_grads): + ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors] + with th.enable_grad(): + # Fixes a bug where the first op in run_function modifies the + # Tensor storage in place, which is not allowed for detach()'d + # Tensors. 
+ shallow_copies = [x.view_as(x) for x in ctx.input_tensors] + output_tensors = ctx.run_function(*shallow_copies) + input_grads = th.autograd.grad( + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, + ) + del ctx.input_tensors + del ctx.input_params + del output_tensors + return (None, None) + input_grads diff --git a/conditional-flow-matching/runner/src/models/components/optimal_transport.py b/conditional-flow-matching/runner/src/models/components/optimal_transport.py new file mode 100644 index 0000000000000000000000000000000000000000..3c6eb0305eafc8a4176fc353f4d3585e1d5404b0 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/optimal_transport.py @@ -0,0 +1,118 @@ +import math +from functools import partial +from typing import Optional + +import numpy as np +import ot as pot +import torch + + +class OTPlanSampler: + """OTPlanSampler implements sampling coordinates according to an squared L2 OT plan with + different implementations of the plan calculation.""" + + def __init__( + self, + method: str, + reg: float = 0.05, + reg_m: float = 1.0, + normalize_cost=False, + **kwargs, + ): + # ot_fn should take (a, b, M) as arguments where a, b are marginals and + # M is a cost matrix + if method == "exact": + self.ot_fn = pot.emd + elif method == "sinkhorn": + self.ot_fn = partial(pot.sinkhorn, reg=reg) + elif method == "unbalanced": + self.ot_fn = partial(pot.unbalanced.sinkhorn_knopp_unbalanced, reg=reg, reg_m=reg_m) + elif method == "partial": + self.ot_fn = partial(pot.partial.entropic_partial_wasserstein, reg=reg) + else: + raise ValueError(f"Unknown method: {method}") + self.reg = reg + self.reg_m = reg_m + self.normalize_cost = normalize_cost + self.kwargs = kwargs + + def get_map(self, x0, x1): + a, b = pot.unif(x0.shape[0]), pot.unif(x1.shape[0]) + if x0.dim() > 2: + x0 = x0.reshape(x0.shape[0], -1) + if x1.dim() > 2: + x1 = x1.reshape(x1.shape[0], -1) + x1 = x1.reshape(x1.shape[0], -1) + M = torch.cdist(x0, x1) ** 2 + if self.normalize_cost: + M = M / M.max() + p = self.ot_fn(a, b, M.detach().cpu().numpy()) + if not np.all(np.isfinite(p)): + print("ERROR: p is not finite") + print(p) + print("Cost mean, max", M.mean(), M.max()) + print(x0, x1) + return p + + def sample_map(self, pi, batch_size): + p = pi.flatten() + p = p / p.sum() + choices = np.random.choice(pi.shape[0] * pi.shape[1], p=p, size=batch_size) + return np.divmod(choices, pi.shape[1]) + + def sample_plan(self, x0, x1): + pi = self.get_map(x0, x1) + i, j = self.sample_map(pi, x0.shape[0]) + return x0[i], x1[j] + + def sample_trajectory(self, X): + # Assume X is [batch, times, dim] + times = X.shape[1] + pis = [] + for t in range(times - 1): + pis.append(self.get_map(X[:, t], X[:, t + 1])) + + indices = [np.arange(X.shape[0])] + for pi in pis: + j = [] + for i in indices[-1]: + j.append(np.random.choice(pi.shape[1], p=pi[i] / pi[i].sum())) + indices.append(np.array(j)) + + to_return = [] + for t in range(times): + to_return.append(X[:, t][indices[t]]) + to_return = np.stack(to_return, axis=1) + return to_return + + +def wasserstein( + x0: torch.Tensor, + x1: torch.Tensor, + method: Optional[str] = None, + reg: float = 0.05, + power: int = 2, + **kwargs, +) -> float: + assert power == 1 or power == 2 + # ot_fn should take (a, b, M) as arguments where a, b are marginals and + # M is a cost matrix + if method == "exact" or method is None: + ot_fn = pot.emd2 + elif method == "sinkhorn": + ot_fn = partial(pot.sinkhorn2, reg=reg) + else: + raise 
ValueError(f"Unknown method: {method}") + + a, b = pot.unif(x0.shape[0]), pot.unif(x1.shape[0]) + if x0.dim() > 2: + x0 = x0.reshape(x0.shape[0], -1) + if x1.dim() > 2: + x1 = x1.reshape(x1.shape[0], -1) + M = torch.cdist(x0, x1) + if power == 2: + M = M**2 + ret = ot_fn(a, b, M.detach().cpu().numpy(), numItermax=1e7) + if power == 2: + ret = math.sqrt(ret) + return ret diff --git a/conditional-flow-matching/runner/src/models/components/plotting.py b/conditional-flow-matching/runner/src/models/components/plotting.py new file mode 100644 index 0000000000000000000000000000000000000000..7b39481b2756de5c2b0247f3bb289bc452972466 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/plotting.py @@ -0,0 +1,182 @@ +import os +from typing import Union + +import matplotlib.pyplot as plt +import numpy as np +import scprep +import torch + + +def plot_scatter(obs, model, title="fig", wandb_logger=None): + fig, ax = plt.subplots(1, 1, figsize=(5, 5)) + batch_size, ts, dim = obs.shape + obs = obs.reshape(-1, dim).detach().cpu().numpy() + ts = np.tile(np.arange(ts), batch_size) + scprep.plot.scatter2d(obs, c=ts, ax=ax) + os.makedirs("figs", exist_ok=True) + plt.savefig(f"figs/{title}.png") + if wandb_logger: + wandb_logger.log_image(key=title, images=[f"figs/{title}.png"]) + plt.close() + + +def plot_scatter_and_flow(obs, model, title="stream", wandb_logger=None): + batch_size, ts, dim = obs.shape + device = obs.device + obs = obs.reshape(-1, dim).detach().cpu().numpy() + diff = obs.max() - obs.min() + wmin = obs.min() - diff * 0.1 + wmax = obs.max() + diff * 0.1 + points = 50j + points_real = 50 + Y, X, T = np.mgrid[wmin:wmax:points, wmin:wmax:points, 0 : ts - 1 : 7j] + gridpoints = torch.tensor( + np.stack([X.flatten(), Y.flatten()], axis=1), requires_grad=True, device=device + ).type(torch.float32) + times = torch.tensor(T.flatten(), requires_grad=True, device=device).type(torch.float32)[ + :, None + ] + out = model(times, gridpoints) + out = out.reshape([points_real, points_real, 7, dim]) + out = out.cpu().detach().numpy() + # Stream over time + fig, axes = plt.subplots(1, 7, figsize=(20, 4), sharey=True) + axes = axes.flatten() + tts = np.tile(np.arange(ts), batch_size) + for i in range(7): + scprep.plot.scatter2d(obs, c=tts, ax=axes[i]) + axes[i].streamplot( + X[:, :, 0], + Y[:, :, 0], + out[:, :, i, 0], + out[:, :, i, 1], + color=np.sum(out[:, :, i] ** 2, axis=-1), + ) + axes[i].set_title(f"t = {np.linspace(0,ts-1,7)[i]:0.2f}") + os.makedirs("figs", exist_ok=True) + plt.savefig(f"figs/{title}.png") + plt.close() + if wandb_logger: + wandb_logger.log_image(key="flow", images=[f"figs/{title}.png"]) + + +def store_trajectories(obs: Union[torch.Tensor, list], model, title="trajs", start_time=0): + n = 2000 + if isinstance(obs, list): + data, labels = [], [] + for t, xi in enumerate(obs): + xi = xi.detach().cpu().numpy() + data.append(xi) + labels.append(t * np.ones(xi.shape[0])) + data = np.concatenate(data, axis=0) + labels = np.concatenate(labels, axis=0) + scprep.plot.scatter2d(data, c=labels) + start = obs[0][:n] + ts = len(obs) + else: + batch_size, ts, dim = obs.shape + start = obs[:n, start_time, :] + obs = obs.reshape(-1, dim).detach().cpu().numpy() + from torchdyn.core import NeuralODE + + with torch.no_grad(): + node = NeuralODE(model) + # For consistency with DSB + traj = node.trajectory(start, t_span=torch.linspace(0, ts - 1, 20 * (ts - 1))) + traj = traj.cpu().detach().numpy() + os.makedirs("figs", exist_ok=True) + np.save(f"figs/{title}.npy", traj) + + +def 
plot_trajectory( + obs: Union[torch.Tensor, list], + traj: torch.Tensor, + title="traj", + key="traj", + start_time=0, + n=200, + wandb_logger=None, +): + plt.figure(figsize=(6, 6)) + if isinstance(obs, list): + data, labels = [], [] + for t, xi in enumerate(obs): + xi = xi.detach().cpu().numpy() + data.append(xi) + labels.append(t * np.ones(xi.shape[0])) + data = np.concatenate(data, axis=0) + labels = np.concatenate(labels, axis=0) + scprep.plot.scatter2d(data, c=labels) + ts = len(obs) + else: + batch_size, ts, dim = obs.shape + obs = obs.reshape(-1, dim).detach().cpu().numpy() + tts = np.tile(np.arange(ts), batch_size) + scprep.plot.scatter2d(obs, c=tts) + plt.scatter(traj[:, :n, 0], traj[:, :n, 1], s=0.3, alpha=0.2, c="black", label="Flow") + plt.scatter(traj[-1, :n, 0], traj[-1, :n, 1], s=6, alpha=1, c="purple", marker="x") + for i in range(20): + plt.plot(traj[:, i, 0], traj[:, i, 1], c="red", alpha=0.5) + # plt.legend(["Prior sample z(S)", "Flow", "z(0)"]) + os.makedirs("figs", exist_ok=True) + plt.savefig(f"figs/{title}.png") + plt.close() + if wandb_logger: + wandb_logger.log_image(key=key, images=[f"figs/{title}.png"]) + + +def plot_paths( + obs: Union[torch.Tensor, list], + model, + title="paths", + start_time=0, + n=200, + wandb_logger=None, +): + plt.figure(figsize=(6, 6)) + if isinstance(obs, list): + data, labels = [], [] + for t, xi in enumerate(obs): + xi = xi.detach().cpu().numpy() + data.append(xi) + labels.append(t * np.ones(xi.shape[0])) + data = np.concatenate(data, axis=0) + labels = np.concatenate(labels, axis=0) + scprep.plot.scatter2d(data, c=labels) + start = obs[0][:n] + ts = len(obs) + else: + batch_size, ts, dim = obs.shape + start = obs[:n, start_time, :] + obs = obs.reshape(-1, dim).detach().cpu().numpy() + tts = np.tile(np.arange(ts), batch_size) + scprep.plot.scatter2d(obs, c=tts) + from torchdyn.core import NeuralODE + + with torch.no_grad(): + node = NeuralODE(model) + traj = node.trajectory(start, t_span=torch.linspace(0, ts - 1, max(20 * ts, 100))) + traj = traj.cpu().detach().numpy() + # plt.scatter(traj[0, :n, 0], traj[0, :n, 1], s=10, alpha=0.8, c="black") + plt.scatter(traj[:, :n, 0], traj[:, :n, 1], s=0.3, alpha=0.2, c="black", label="Flow") + plt.scatter(traj[-1, :n, 0], traj[-1, :n, 1], s=6, alpha=1, c="purple", marker="x") + # plt.legend(["Prior sample z(S)", "Flow", "z(0)"]) + os.makedirs("figs", exist_ok=True) + plt.savefig(f"figs/{title}.png") + plt.close() + if wandb_logger: + wandb_logger.log_image(key="paths", images=[f"figs/{title}.png"]) + + +def plot_samples(trajs, title="samples", wandb_logger=None): + import PIL + from torchvision.utils import save_image + + images = trajs[:100] + os.makedirs("figs", exist_ok=True) + save_image(images, fp=f"figs/{title}.jpg", nrow=10, normalize=True, padding=0) + if wandb_logger: + try: + wandb_logger.log_image(key="paths", images=[f"figs/{title}.jpg"]) + except PIL.UnidentifiedImageError: + print(f"ERROR logging {title}") diff --git a/conditional-flow-matching/runner/src/models/components/regularizers.py b/conditional-flow-matching/runner/src/models/components/regularizers.py new file mode 100644 index 0000000000000000000000000000000000000000..268c18cc6af6333564e0e5c841501a8ce64fc457 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/regularizers.py @@ -0,0 +1,189 @@ +import torch +from torch import nn + + +class Regularizer(nn.Module): + def __init__(self): + pass + + +def _batch_root_mean_squared(tensor): + tensor = tensor.view(tensor.shape[0], -1) + return 
torch.norm(tensor, p=2, dim=1) / tensor.shape[1] ** 0.5
+
+
+class RegularizationFunc(nn.Module):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        """Outputs a batch of scalar regularizations."""
+        raise NotImplementedError
+
+
+class L1Reg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        return torch.mean(torch.abs(dx), dim=1)
+
+
+class L2Reg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        return _batch_root_mean_squared(dx)
+
+
+class SquaredL2Reg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        to_return = dx.view(dx.shape[0], -1)
+        return torch.pow(torch.norm(to_return, p=2, dim=1), 2)
+
+
+def _get_minibatch_jacobian(y, x, create_graph=True):
+    """Computes the Jacobian of y wrt x assuming minibatch-mode.
+
+    Args:
+        y: (N, ...) with a total of D_y elements in ...
+        x: (N, ...) with a total of D_x elements in ...
+    Returns:
+        The minibatch Jacobian matrix of shape (N, D_y, D_x)
+    """
+    # assert y.shape[0] == x.shape[0]
+    y = y.view(y.shape[0], -1)
+
+    # Compute Jacobian row by row.
+    jac = []
+    for j in range(y.shape[1]):
+        dy_j_dx = torch.autograd.grad(
+            y[:, j],
+            x,
+            torch.ones_like(y[:, j]),
+            retain_graph=True,
+            create_graph=create_graph,
+        )[0]
+        jac.append(torch.unsqueeze(dy_j_dx, -1))
+    jac = torch.cat(jac, -1)
+    return jac
+
+
+class JacobianFrobeniusReg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        # Reuse the Jacobian cached on the shared context when available instead
+        # of recomputing it unconditionally.
+        if hasattr(context, "jac"):
+            jac = context.jac
+        else:
+            jac = _get_minibatch_jacobian(dx, x)
+            context.jac = jac
+        return _batch_root_mean_squared(jac)
+
+
+class JacobianDiagFrobeniusReg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        if hasattr(context, "jac"):
+            jac = context.jac
+        else:
+            jac = _get_minibatch_jacobian(dx, x)
+            context.jac = jac
+        diagonal = jac.view(jac.shape[0], -1)[
+            :, :: jac.shape[1]
+        ]  # assumes jac is minibatch square, ie. (N, M, M).
+        return _batch_root_mean_squared(diagonal)
+
+
+class JacobianOffDiagFrobeniusReg(RegularizationFunc):
+    def forward(self, t, x, dx, context) -> torch.Tensor:
+        if hasattr(context, "jac"):
+            jac = context.jac
+        else:
+            jac = _get_minibatch_jacobian(dx, x)
+            context.jac = jac
+        diagonal = jac.view(jac.shape[0], -1)[
+            :, :: jac.shape[1]
+        ]  # assumes jac is minibatch square, ie. (N, M, M).
+        ss_offdiag = torch.sum(jac.view(jac.shape[0], -1) ** 2, dim=1) - torch.sum(
+            diagonal**2, dim=1
+        )
+        ms_offdiag = ss_offdiag / (diagonal.shape[1] * (diagonal.shape[1] - 1))
+        return ms_offdiag
+
+
+def autograd_trace(x_out, x_in, **kwargs):
+    """Standard brute-force means of obtaining trace of the Jacobian, O(d) calls to autograd."""
+    trJ = 0.0
+    for i in range(x_in.shape[1]):
+        trJ += torch.autograd.grad(x_out[:, i].sum(), x_in, allow_unused=False, create_graph=True)[
+            0
+        ][:, i]
+    return trJ
+
+
+class CNFReg(RegularizationFunc):
+    def __init__(self, trace_estimator=None, noise_dist=None):
+        super().__init__()
+        self.trace_estimator = trace_estimator if trace_estimator is not None else autograd_trace
+        self.noise_dist, self.noise = noise_dist, None
+
+    def forward(self, t, x, dx, context):
+        # TODO we could check if jac is in the context to speed up
+        return -self.trace_estimator(dx, x, noise=self.noise)
+
+
+class AugmentationModule(nn.Module):
+    """Class orchestrating augmentations.
+
+    Also establishes order.
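+    Regularizers are applied in the fixed order used below: CNF trace, L1, L2,
+    squared L2, then the Jacobian Frobenius, diagonal, and off-diagonal terms.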
+ """ + + def __init__( + self, + cnf_estimator: str = None, + l1_reg: float = 0.0, + l2_reg: float = 0.0, + squared_l2_reg: float = 0.0, + jacobian_frobenius_reg: float = 0.0, + jacobian_diag_frobenius_reg: float = 0.0, + jacobian_off_diag_frobenius_reg: float = 0.0, + ) -> None: + super().__init__() + coeffs = [] + regs = [] + if cnf_estimator == "exact": + coeffs.append(1) + regs.append(CNFReg(None, noise_dist=None)) + if l1_reg > 0.0: + coeffs.append(l1_reg) + regs.append(L1Reg()) + if l2_reg > 0.0: + coeffs.append(l2_reg) + regs.append(L2Reg()) + if squared_l2_reg > 0.0: + coeffs.append(squared_l2_reg) + regs.append(SquaredL2Reg()) + if jacobian_frobenius_reg > 0.0: + coeffs.append(jacobian_frobenius_reg) + regs.append(JacobianFrobeniusReg()) + if jacobian_diag_frobenius_reg > 0.0: + coeffs.append(jacobian_diag_frobenius_reg) + regs.append(JacobianDiagFrobeniusReg()) + if jacobian_off_diag_frobenius_reg > 0.0: + coeffs.append(jacobian_off_diag_frobenius_reg) + regs.append(JacobianOffDiagFrobeniusReg()) + + self.coeffs = torch.tensor(coeffs) + self.regs = torch.ModuleList(regs) + + +if __name__ == "__main__": + # Test Shapes + class SharedContext: + pass + + for reg in [ + L1Reg, + L2Reg, + SquaredL2Reg, + JacobianFrobeniusReg, + JacobianDiagFrobeniusReg, + JacobianOffDiagFrobeniusReg, + ]: + x = torch.ones(2, 3).requires_grad_(True) + dx = x * 2 + out = reg().forward(torch.ones(1), x, dx, SharedContext) + assert out.dim() == 1 + assert out.shape[0] == 2 diff --git a/conditional-flow-matching/runner/src/models/components/schedule.py b/conditional-flow-matching/runner/src/models/components/schedule.py new file mode 100644 index 0000000000000000000000000000000000000000..73bcd8c5ba2b63c1fb8d1acc1cb83d8aa14041a5 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/schedule.py @@ -0,0 +1,72 @@ +import numpy as np +import torch + + +class NoiseScheduler: + """Base Class for noise schedule. + + The noise schedule is a function that maps time to reference process noise level. We can use + this to determine the Brownian bridge noise schedule. + + We define the noise schedule with __call__ and the Brownian bridge noise schedule with sigma_t. + We define F as the integral of the squared reference process noise schedule which is a useful + intermediate quantity. + """ + + def __call__(self, t): + """Calculate the reference process noise schedule. + + g(t) in the paper. 
+ """ + raise NotImplementedError + + def F(self, t): + """Calculate the integral of the squared reference process noise schedule.""" + raise NotImplementedError + + def sigma_t(self, t): + """Given the reference process noise schedule, calculate the brownian bridge noise + schedule.""" + return torch.sqrt(self.F(t) - self.F(t) ** 2 / self.F(1)) + + +class ConstantNoiseScheduler(NoiseScheduler): + def __init__(self, sigma: float): + self.sigma = sigma + + def __call__(self, t): + return self.sigma + + def F(self, t): + return self.sigma**2 * t + + +class LinearDecreasingNoiseScheduler(NoiseScheduler): + def __init__(self, sigma_min: float, sigma_max: float): + self.sigma_min = sigma_min + self.sigma_max = sigma_max + + def __call__(self, t): + return torch.sqrt(t * self.sigma_min + (1 - t) * self.sigma_max) + + def F(self, t): + return (t**2) * self.sigma_min / 2 - (t**2) * self.sigma_max / 2 + self.sigma_max * t + + +class CosineNoiseScheduler(NoiseScheduler): + def __init__(self, sigma_min: float, scale: float): + self.sigma_min = sigma_min + self.scale = scale + + def __call__(self, t): + return self.scale * (1 - (t * np.pi * 2).cos()) + self.sigma_min + + def F(self, t): + antider = t - (t * 2 * np.pi).sin() / (2 * np.pi) + antider2 = t - 2 * (t * 2 * np.pi).sin() / (2 * np.pi) + antider2 += t / 2 + (t * 4 * np.pi).sin() / (8 * np.pi) + return ( + self.scale**2 * antider2 + + t * self.sigma_min**2 + + self.scale * 2 * self.sigma_min * antider + ) diff --git a/conditional-flow-matching/runner/src/models/components/simple_dense_net.py b/conditional-flow-matching/runner/src/models/components/simple_dense_net.py new file mode 100644 index 0000000000000000000000000000000000000000..73be1b211e8656a837d4c043bafe9192d74eae09 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/simple_dense_net.py @@ -0,0 +1,38 @@ +from torch import nn + + +class SimpleDenseNet(nn.Module): + def __init__( + self, + input_size: int = 784, + lin1_size: int = 256, + lin2_size: int = 256, + lin3_size: int = 256, + output_size: int = 10, + ): + super().__init__() + + self.model = nn.Sequential( + nn.Linear(input_size, lin1_size), + nn.BatchNorm1d(lin1_size), + nn.ReLU(), + nn.Linear(lin1_size, lin2_size), + nn.BatchNorm1d(lin2_size), + nn.ReLU(), + nn.Linear(lin2_size, lin3_size), + nn.BatchNorm1d(lin3_size), + nn.ReLU(), + nn.Linear(lin3_size, output_size), + ) + + def forward(self, x): + batch_size, channels, width, height = x.size() + + # (batch, 1, width, height) -> (batch, 1*width*height) + x = x.view(batch_size, -1) + + return self.model(x) + + +if __name__ == "__main__": + _ = SimpleDenseNet() diff --git a/conditional-flow-matching/runner/src/models/components/simple_mlp.py b/conditional-flow-matching/runner/src/models/components/simple_mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..e3153dd5d3e8b07bb4f18a487d28f71ceeef8ef7 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/simple_mlp.py @@ -0,0 +1,89 @@ +from typing import List, Optional + +import torch +from torch import nn + +ACTIVATION_MAP = { + "relu": nn.ReLU, + "sigmoid": nn.Sigmoid, + "tanh": nn.Tanh, + "selu": nn.SELU, + "elu": nn.ELU, + "lrelu": nn.LeakyReLU, + "softplus": nn.Softplus, + "silu": nn.SiLU, +} + + +class SimpleDenseNet(nn.Module): + def __init__( + self, + input_size: int, + target_size: int, + activation: str, + batch_norm: bool = True, + hidden_dims: Optional[List[int]] = None, + ): + super().__init__() + if hidden_dims is None: + hidden_dims = [256, 256, 256] + 
dims = [input_size, *hidden_dims, target_size]
+        layers = []
+        for i in range(len(dims) - 2):
+            layers.append(nn.Linear(dims[i], dims[i + 1]))
+            if batch_norm:
+                layers.append(nn.BatchNorm1d(dims[i + 1]))
+            layers.append(ACTIVATION_MAP[activation]())
+        layers.append(nn.Linear(dims[-2], dims[-1]))
+        self.model = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class DivergenceFreeNet(SimpleDenseNet):
+    """Implements a divergence free network as the gradient of a scalar potential function."""
+
+    def __init__(self, dim: int, *args, **kwargs):
+        super().__init__(input_size=dim + 1, target_size=1, *args, **kwargs)
+
+    def energy(self, x):
+        return self.model(x)
+
+    def forward(self, t, x, *args, **kwargs):
+        """Append t to x and differentiate the scalar potential."""
+        if t.dim() < 2:
+            t = t.repeat(x.shape[0])[:, None]
+        x = torch.cat([t, x], dim=-1)
+        x = x.requires_grad_(True)
+        grad = torch.autograd.grad(torch.sum(self.model(x)), x, create_graph=True)[0]
+        # t occupies column 0 after the cat, so drop the first column to keep the
+        # gradient with respect to the state only.
+        return grad[:, 1:]
+
+
+class TimeInvariantVelocityNet(SimpleDenseNet):
+    def __init__(self, dim: int, *args, **kwargs):
+        super().__init__(input_size=dim, target_size=dim, *args, **kwargs)
+
+    def forward(self, t, x, *args, **kwargs):
+        """Ignore t and run the model."""
+        del t
+        return self.model(x)
+
+
+class VelocityNet(SimpleDenseNet):
+    def __init__(self, dim: int, *args, **kwargs):
+        super().__init__(input_size=dim + 1, target_size=dim, *args, **kwargs)
+
+    def forward(self, t, x, *args, **kwargs):
+        """Append t to x and run the model."""
+        if t.dim() < 1 or t.shape[0] != x.shape[0]:
+            t = t.repeat(x.shape[0])[:, None]
+        if t.dim() < 2:
+            t = t[:, None]
+        x = torch.cat([t, x], dim=-1)
+        return self.model(x)
+
+
+if __name__ == "__main__":
+    # Smoke test: both constructors require explicit sizes and an activation.
+    _ = SimpleDenseNet(input_size=2, target_size=2, activation="relu")
+    _ = TimeInvariantVelocityNet(dim=2, activation="relu")
diff --git a/conditional-flow-matching/runner/src/models/components/sinkhorn_knopp_unbalanced.py b/conditional-flow-matching/runner/src/models/components/sinkhorn_knopp_unbalanced.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5eede9ed317f33b998b6d53f9bda2cfd1159319
--- /dev/null
+++ b/conditional-flow-matching/runner/src/models/components/sinkhorn_knopp_unbalanced.py
@@ -0,0 +1,201 @@
+"""Implements unbalanced Sinkhorn-Knopp optimization for unbalanced OT.
+
+This is from the Python Optimal Transport package but modified to take three regularization
+parameters instead of two. This is necessary to find growth rates of the source distribution that
+best match the target distribution or vice versa. By setting reg_m_1 to something low and reg_m_2
+to something large, we can compute an unbalanced optimal transport where all the scaling is done
+on the source distribution and none is done on the target distribution.
+"""
+
+import warnings
+
+import numpy as np
+
+
+def sinkhorn_knopp_unbalanced(
+    a,
+    b,
+    M,
+    reg,
+    reg_m_1,
+    reg_m_2,
+    numItermax=1000,
+    stopThr=1e-6,
+    verbose=False,
+    log=False,
+    **kwargs,
+):
+    """Solve the entropic regularization unbalanced optimal transport problem.
+
+    The function solves the following optimization problem:
+
+    .. math::
+        W = \\min_\\gamma <\\gamma,M>_F + reg\\cdot\\Omega(\\gamma) +
+        \\mathrm{reg_{m_1}} \\cdot KL(\\gamma 1, a) + \\mathrm{reg_{m_2}} \\cdot KL(\\gamma^T 1, b)
+
+        s.t.
+        \\gamma\\geq 0
+    where :
+
+    - M is the (dim_a, dim_b) metric cost matrix
+    - :math:`\\Omega` is the entropic regularization term
+      :math:`\\Omega(\\gamma)=\\sum_{i,j} \\gamma_{i,j}\\log(\\gamma_{i,j})`
+    - a and b are source and target unbalanced distributions
+    - KL is the Kullback-Leibler divergence
+
+    The algorithm used for solving the problem is the generalized
+    Sinkhorn-Knopp matrix scaling algorithm as proposed in [10, 25]_
+
+
+    Parameters
+    ----------
+    a : np.ndarray (dim_a,)
+        Unnormalized histogram of dimension dim_a
+    b : np.ndarray (dim_b,) or np.ndarray (dim_b, n_hists)
+        One or multiple unnormalized histograms of dimension dim_b
+        If many, compute all the OT distances (a, b_i)
+    M : np.ndarray (dim_a, dim_b)
+        loss matrix
+    reg : float
+        Entropy regularization term > 0
+    reg_m_1 : float
+        Marginal relaxation term for the source marginal > 0
+    reg_m_2 : float
+        Marginal relaxation term for the target marginal > 0
+    numItermax : int, optional
+        Max number of iterations
+    stopThr : float, optional
+        Stop threshold on error (> 0)
+    verbose : bool, optional
+        Print information along iterations
+    log : bool, optional
+        record log if True
+
+
+    Returns
+    -------
+    if n_hists == 1:
+        gamma : (dim_a, dim_b) ndarray
+            Optimal transportation matrix for the given parameters
+        log : dict
+            log dictionary returned only if `log` is `True`
+    else:
+        ot_distance : (n_hists,) ndarray
+            the OT distance between `a` and each of the histograms `b_i`
+        log : dict
+            log dictionary returned only if `log` is `True`
+
+    Examples
+    --------
+
+    >>> a=[.5, .5]
+    >>> b=[.5, .5]
+    >>> M=[[0., 1.],[1., 0.]]
+    >>> sinkhorn_knopp_unbalanced(a, b, M, 1., 1., 1.)
+    array([[0.51122814, 0.18807032],
+           [0.18807032, 0.51122814]])
+
+    References
+    ----------
+
+    .. [10] Chizat, L., Peyré, G., Schmitzer, B., & Vialard, F. X. (2016).
+        Scaling algorithms for unbalanced transport problems. arXiv preprint
+        arXiv:1607.05816.
+
+    .. [25] Frogner C., Zhang C., Mobahi H., Araya-Polo M., Poggio T.
: + Learning with a Wasserstein Loss, Advances in Neural Information + Processing Systems (NIPS) 2015 + + See Also + -------- + ot.lp.emd : Unregularized OT + ot.optim.cg : General regularized OT + """ + + a = np.asarray(a, dtype=np.float64) + b = np.asarray(b, dtype=np.float64) + M = np.asarray(M, dtype=np.float64) + + dim_a, dim_b = M.shape + + if len(a) == 0: + a = np.ones(dim_a, dtype=np.float64) / dim_a + if len(b) == 0: + b = np.ones(dim_b, dtype=np.float64) / dim_b + + if len(b.shape) > 1: + n_hists = b.shape[1] + else: + n_hists = 0 + + if log: + log = {"err": []} + + # we assume that no distances are null except those of the diagonal of + # distances + if n_hists: + u = np.ones((dim_a, 1)) / dim_a + v = np.ones((dim_b, n_hists)) / dim_b + a = a.reshape(dim_a, 1) + else: + u = np.ones(dim_a) / dim_a + v = np.ones(dim_b) / dim_b + + # Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute + K = np.empty(M.shape, dtype=M.dtype) + np.divide(M, -reg, out=K) + np.exp(K, out=K) + + cpt = 0 + err = 1.0 + + while err > stopThr and cpt < numItermax: + uprev = u + vprev = v + + Kv = K.dot(v) + u = (a / Kv) ** (reg_m_1 / (reg_m_1 + reg)) + Ktu = K.T.dot(u) + v = (b / Ktu) ** (reg_m_2 / (reg_m_2 + reg)) + + if ( + np.any(Ktu == 0.0) + or np.any(np.isnan(u)) + or np.any(np.isnan(v)) + or np.any(np.isinf(u)) + or np.any(np.isinf(v)) + ): + # we have reached the machine precision + # come back to previous solution and quit loop + warnings.warn("Numerical errors at iteration %s" % cpt) + u = uprev + v = vprev + break + if cpt % 10 == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + err_u = abs(u - uprev).max() / max(abs(u).max(), abs(uprev).max(), 1.0) + err_v = abs(v - vprev).max() / max(abs(v).max(), abs(vprev).max(), 1.0) + err = 0.5 * (err_u + err_v) + if log: + log["err"].append(err) + if verbose: + if cpt % 200 == 0: + print("{:5s}|{:12s}".format("It.", "Err") + "\n" + "-" * 19) + print(f"{cpt:5d}|{err:8e}|") + cpt += 1 + + if log: + log["logu"] = np.log(u + 1e-16) + log["logv"] = np.log(v + 1e-16) + + if n_hists: # return only loss + res = np.einsum("ik,ij,jk,ij->k", u, K, v, M) + if log: + return res, log + else: + return res + + else: # return OT matrix + if log: + return u[:, None] * K * v[None, :], log + else: + return u[:, None] * K * v[None, :] diff --git a/conditional-flow-matching/runner/src/models/components/solver.py b/conditional-flow-matching/runner/src/models/components/solver.py new file mode 100644 index 0000000000000000000000000000000000000000..ec81a72da012590f92fc77f86deb8d4fdb629232 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/solver.py @@ -0,0 +1,269 @@ +"""solver.py. + +Implements ODE and SDE solvers for the model. + +Joins the torchdyn and torchsde libraries. 
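+The ODE path wraps the drift in a torchdyn NeuralODE, while the SDE path feeds
+the drift/score pair to torchsde.sdeint through the TorchSDE adapter below.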
+""" + +from math import prod + +import torch +import torchsde +from torchdyn.core import NeuralODE + +from .augmentation import AugmentedVectorField, Sequential + + +class TorchSDE(torch.nn.Module): + def __init__( + self, + sigma, + forward_sde_drift, + backward_sde_drift, + noise_type, + sde_type, + reverse=False, + ): + super().__init__() + self.sigma = sigma + self.forward_sde_drift = forward_sde_drift + self.backward_sde_drift = backward_sde_drift + self.noise_type = noise_type + self.sde_type = sde_type + self.reverse = reverse + + def f(self, t, y): + if self.reverse: + return self.backward_sde_drift(1 - t, y) + return self.forward_sde_drift(t, y) + + def g(self, t, y): + return self.sigma(t) * torch.ones_like(y) + + def h(self, t, y): + return torch.zeros_like(y) + + +class FlowSolver(torch.nn.Module): + def __init__( + self, + vector_field, + dim, + augmentations=None, + score_field=None, + sigma=None, + ode_solver="euler", + sde_solver="euler", + sde_noise_type="diagonal", + sde_type="ito", + dt=0.01, + dt_min=1e-3, + atol=1e-5, + rtol=1e-5, + **kwargs, + ): + """Initializes the solver. + + Merges Torchdyn with torchsde. + Args: + vector_field (torch.nn.Module): The vector field of the ODE. + augmentations (torch.nn.Module): The augmentations of the ODE. Not used for SDE + score_field (torch.nn.Module): The score field of the SDE. Score field is -g(t)^2 / 2 \nabla log p(x(t)). + sigma (noise_schedule): The noise schedule of the SDE. + reverse (bool): Whether to reverse the SDE no effect on ODE. + ode_solver (str): The ODE solver to use. + sde_solver (str): The SDE solver to use. + sde_noise_type (str): The noise type of the SDE. + dt (float): The fixed time step of the ODE solver. + atol (float): The absolute tolerance of the ODE solver. + rtol (float): The relative tolerance of the ODE solver. 
+ """ + super().__init__() + self.net = vector_field + self.dim = dim + self.augmentations = augmentations + self.score_net = score_field + self.separate_score = score_field is not None + self.sigma = sigma + self.ode_solver = ode_solver + self.sde_solver = sde_solver + self.sde_noise_type = sde_noise_type + self.sde_type = sde_type + self.dt = dt + self.dt_min = dt_min + self.atol = atol + self.rtol = rtol + self.nfe = 0 + self.kwargs = kwargs + self.is_image = not isinstance(self.dim, int) + if self.is_image: + self.flat_dim = prod(dim) + + def forward_flow_and_score(self, t, x, only_flow=False): + if self.is_image: + x = x.reshape(-1, *self.dim) + if self.separate_score: + vt, st = self.net(t, x), self.score_net(t, x) + else: + vtst = self.net(t, x) + if vtst.shape[1] == x.shape[1]: + return vtst + split_idx = vtst.shape[1] // 2 + vt, st = vtst[:, :split_idx], vtst[:, split_idx:] + assert vt.shape == x.shape + if only_flow: + return vt + if self.is_image: + vt = vt.reshape(-1, self.flat_dim) + st = st.reshape(-1, self.flat_dim) + return vt, st + + def forward_sde_drift(self, t, x): + """Computes the forwards drift of the SDE.""" + self.nfe += 1 + vt, st = self.forward_flow_and_score(t, x) + return vt + st + + def backward_sde_drift(self, t, x): + """Computes the backwards drift of the SDE.""" + self.nfe += 1 + vt, st = self.forward_flow_and_score(t, x) + return -vt + st + + def forward_ode_drift(self, t, x): + """Computes the forwards drift of the ODE.""" + self.nfe += 1 + return self.forward_flow_and_score(t, x, only_flow=True) + + def backward_ode_drift(self, t, x): + """Computes the backwards drift of the ODE.""" + self.nfe += 1 + return -self.forward_flow_and_score(t, x, only_flow=True) + + def ode_drift(self, reverse=False): + return self.forward_ode_drift if not reverse else self.backward_ode_drift + + def sde_drift(self, reverse=False): + return self.forward_sde_drift if not reverse else self.backward_sde_drift + + def flat_wrapper(self, func): + if not isinstance(self.dim, int): + + def wrap(t, x): + x = x.reshape(-1, self.dim) + y = func(t, x) + y = y.reshape(-1, self.flat_dim) + + def sdeint(self, x0, t_span, logqp=False, adaptive=False, reverse=False): + self.nfe = 0 + sde = TorchSDE( + self.sigma, + self.forward_sde_drift, + self.backward_sde_drift, + self.sde_noise_type, + self.sde_type, + reverse, + ) + if self.is_image: + x0 = x0.reshape(-1, self.flat_dim) + traj = torchsde.sdeint( + sde, + x0, + t_span, + method=self.sde_solver, + dt=self.dt, + rtol=self.rtol, + atol=self.atol, + logqp=logqp, + adaptive=adaptive, + ) + if self.is_image: + traj = traj.reshape(traj.shape[0], traj.shape[1], *self.dim) + return traj + + def odeint(self, x0, t_span): + """Computes the ODE trajectory. + + Relies on the torchdyn library to compute the ODE trajectory and to handle reverse t_spans. 
+        """
+        self.nfe = 0
+
+        if self.augmentations is None:
+            node = NeuralODE(
+                self.forward_ode_drift,
+                solver=self.ode_solver,
+                atol=self.atol,
+                rtol=self.rtol,
+                return_t_eval=False,
+            )
+            return node(x0, t_span)
+
+        aug_dims = self.augmentations.aug_dims
+        aug_net = AugmentedVectorField(self.forward_ode_drift, self.augmentations.regs, self.dim)
+        node_partial = NeuralODE(
+            aug_net,
+            solver=self.ode_solver,
+            atol=self.atol,
+            rtol=self.rtol,
+            return_t_eval=False,
+        )
+        node = Sequential(
+            self.augmentations.augmenter,
+            node_partial,
+        )
+        aug_traj = node(x0, t_span)
+        aug, traj = aug_traj[:, :, :aug_dims], aug_traj[:, :, aug_dims:]
+        return traj, aug
+
+    def get_nfe(self):
+        return self.nfe
+
+    def reset_nfe(self):
+        self.nfe = 0
+
+
+class DSBMFlowSolver(FlowSolver):
+    """Same as SF2M, except `net` is interpreted as the forward SDE drift and `score_net` as the backward SDE drift."""
+
+    def forward_flow_and_score(self, t, x, only_forward=False, only_backward=False):
+        if self.is_image:
+            x = x.reshape(-1, *self.dim)
+        if only_forward:
+            fvt = self.net(t, x)
+            return fvt.reshape(-1, self.flat_dim) if self.is_image else fvt
+        if only_backward:
+            bvt = self.score_net(t, x)
+            return bvt.reshape(-1, self.flat_dim) if self.is_image else bvt
+        if self.separate_score:
+            fvt, bvt = self.net(t, x), self.score_net(t, x)
+        else:
+            # With a single network, its output stacks the forward and backward
+            # drifts along dim 1 (batch, channels, *dims); split it in two.
+            fbvt = self.net(t, x)
+            split_idx = fbvt.shape[1] // 2
+            fvt, bvt = fbvt[:, :split_idx], fbvt[:, split_idx:]
+        if self.is_image:
+            fvt = fvt.reshape(-1, self.flat_dim)
+            bvt = bvt.reshape(-1, self.flat_dim)
+        return fvt, bvt
+
+    def forward_sde_drift(self, t, x):
+        """Computes the forwards drift of the SDE."""
+        self.nfe += 1
+        return self.forward_flow_and_score(t, x, only_forward=True)
+
+    def backward_sde_drift(self, t, x):
+        """Computes the backwards drift of the SDE."""
+        self.nfe += 1
+        return self.forward_flow_and_score(t, x, only_backward=True)
+
+    def forward_ode_drift(self, t, x):
+        """Computes the forwards drift of the ODE."""
+        self.nfe += 1
+        fvt, bvt = self.forward_flow_and_score(t, x)
+        return (fvt - bvt) / 2
+
+    def backward_ode_drift(self, t, x):
+        """Computes the backwards drift of the ODE."""
+        self.nfe += 1
+        fvt, bvt = self.forward_flow_and_score(t, x)
+        return -(fvt - bvt) / 2
diff --git a/conditional-flow-matching/runner/src/models/components/unet.py b/conditional-flow-matching/runner/src/models/components/unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6d393c8989e68fef3ae9d09d5c4dfa7639f6b34
--- /dev/null
+++ b/conditional-flow-matching/runner/src/models/components/unet.py
@@ -0,0 +1,919 @@
+"""From https://raw.githubusercontent.com/openai/guided-diffusion/main/guided_diffusion/unet.py."""
+
+import math
+from abc import abstractmethod
+
+import numpy as np
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .fp16_util import convert_module_to_f16, convert_module_to_f32
+from .nn import (
+    avg_pool_nd,
+    checkpoint,
+    conv_nd,
+    linear,
+    normalization,
+    timestep_embedding,
+    zero_module,
+)
+
+
+class AttentionPool2d(nn.Module):
+    """Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py."""
+
+    def __init__(
+        self,
+        spacial_dim: int,
+        embed_dim: int,
+        num_heads_channels: int,
+        output_dim: int = None,
+    ):
+        super().__init__()
+        self.positional_embedding = nn.Parameter(
+            th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5
+        )
+        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
+        self.num_heads = embed_dim // num_heads_channels
+        self.attention = QKVAttention(self.num_heads)
+
+    def forward(self, x):
+        b, c, *_spatial = x.shape
+        x = x.reshape(b, c, -1)  # NC(HW)
+        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(HW+1)
+        x = x + self.positional_embedding[None, :, :].to(x.dtype)  # NC(HW+1)
+        x = self.qkv_proj(x)
+        x = self.attention(x)
+        x = self.c_proj(x)
+        return x[:, :, 0]
+
+
+class TimestepBlock(nn.Module):
+    """Any module where forward() takes timestep embeddings as a second argument."""
+
+    @abstractmethod
+    def forward(self, x, emb):
+        """Apply the module to `x` given `emb` timestep embeddings."""
+
+
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    """A sequential module that passes timestep embeddings to the children that support it as an
+    extra input."""
+
+    def forward(self, x, emb):
+        for layer in self:
+            if isinstance(layer, TimestepBlock):
+                x = layer(x, emb)
+            else:
+                x = layer(x)
+        return x
+
+
+class Upsample(nn.Module):
+    """An upsampling layer with an optional convolution.
+
+    :param channels: channels in the inputs and outputs.
+    :param use_conv: a bool determining if a convolution is applied.
+    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then upsampling occurs in the
+        inner-two dimensions.
+    """
+
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        if use_conv:
+            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)
+
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        if self.dims == 3:
+            x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest")
+        else:
+            x = F.interpolate(x, scale_factor=2, mode="nearest")
+        if self.use_conv:
+            x = self.conv(x)
+        return x
+
+
+class Downsample(nn.Module):
+    """A downsampling layer with an optional convolution.
+
+    :param channels: channels in the inputs and outputs.
+    :param use_conv: a bool determining if a convolution is applied.
+    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then downsampling occurs in the
+        inner-two dimensions.
+    """
+
+    def __init__(self, channels, use_conv, dims=2, out_channels=None):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        stride = 2 if dims != 3 else (1, 2, 2)
+        if use_conv:
+            self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=stride, padding=1)
+        else:
+            assert self.channels == self.out_channels
+            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        return self.op(x)
+
+
+class ResBlock(TimestepBlock):
+    """A residual block that can optionally change the number of channels.
+
+    :param channels: the number of input channels.
+    :param emb_channels: the number of timestep embedding channels.
+    :param dropout: the rate of dropout.
+    :param out_channels: if specified, the number of out channels.
+    :param use_conv: if True and out_channels is specified, use a spatial convolution instead of a
+        smaller 1x1 convolution to change the channels in the skip connection.
+    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param use_checkpoint: if True, use gradient checkpointing on this module.
+    :param up: if True, use this block for upsampling.
+    :param down: if True, use this block for downsampling.
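+
+    Example (illustrative sketch, not from the original source):
+        ResBlock(64, 256, 0.1, out_channels=128, down=True)
+    builds a downsampling residual block that maps 64 to 128 channels, conditioned
+    on a 256-dimensional timestep embedding.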
+ """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear( + emb_channels, + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module(conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, 1) + + def forward(self, x, emb): + """Apply the block to a Tensor, conditioned on a timestep embedding. + + :param x: an [N x C x ...] Tensor of features. + :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :return: an [N x C x ...] Tensor of outputs. + """ + return checkpoint(self._forward, (x, emb), self.parameters(), self.use_checkpoint) + + def _forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = th.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AttentionBlock(nn.Module): + """An attention block that allows spatial positions to attend to each other. + + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
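+
+    Example (illustrative sketch, not from the original source):
+        AttentionBlock(256, num_head_channels=64)
+    attends over all spatial positions of an [N x 256 x ...] feature map with
+    256 // 64 = 4 heads.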
+ """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return checkpoint(self._forward, (x,), self.parameters(), True) + + def _forward(self, x): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +def count_flops_attn(model, _x, y): + """A counter for the `thop` package to count the operations in an attention operation. + + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial**2) * c + model.total_ops += th.DoubleTensor([matmul_ops]) + + +class QKVAttentionLegacy(nn.Module): + """A module which performs QKV attention. + + Matches legacy QKVAttention + input/output heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """Apply QKV attention. + + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + "bct,bcs->bts", q * scale, k * scale + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum("bts,bcs->bct", weight, v) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Module): + """A module which performs QKV attention and splits in a different order.""" + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """Apply QKV attention. + + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
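+
+        Shape example (illustrative): with N=2, H=4, C=64 and T=256, the input qkv is
+        [2 x 768 x 256] (since 3 * H * C = 768) and the output is [2 x 256 x 256].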
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + "bct,bcs->bts", + (q * scale).view(bs * self.n_heads, ch, length), + (k * scale).view(bs * self.n_heads, ch, length), + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class UNetModel(nn.Module): + """The full UNet model with attention and timestep embedding. + + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which attention will take + place. May be a set, list, or tuple. For example, if this contains 4, then at 4x + downsampling, attention will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be class-conditional with + `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use a fixed channel width + per attention head. + :param num_heads_upsample: works with num_heads to set a different number of heads for + upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially increased + efficiency. 
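+
+    Example (illustrative sketch, not from the original source): for 64x64 RGB inputs,
+        UNetModel(64, 3, 128, 3, num_res_blocks=2, attention_resolutions=(4, 8))
+    applies attention at the 16x16 and 8x8 feature maps (4x and 8x downsampling).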
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + ): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + if self.num_classes is not None: + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + + ch = input_ch = int(channel_mult[0] * model_channels) + self.input_blocks = nn.ModuleList( + [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))] + ) + self._feature_size = ch + input_block_chans = [ch] + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=int(mult * model_channels), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(mult * model_channels) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) + if resblock_updown + else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch) + ) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(num_res_blocks + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim, + dropout, + out_channels=int(model_channels * mult), + 
dims=dims,
+                        use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm,
+                    )
+                ]
+                ch = int(model_channels * mult)
+                if ds in attention_resolutions:
+                    layers.append(
+                        AttentionBlock(
+                            ch,
+                            use_checkpoint=use_checkpoint,
+                            num_heads=num_heads_upsample,
+                            num_head_channels=num_head_channels,
+                            use_new_attention_order=use_new_attention_order,
+                        )
+                    )
+                if level and i == num_res_blocks:
+                    out_ch = ch
+                    layers.append(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True,
+                        )
+                        if resblock_updown
+                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                    ds //= 2
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+                self._feature_size += ch
+
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)),
+        )
+
+    def convert_to_fp16(self):
+        """Convert the torso of the model to float16."""
+        self.input_blocks.apply(convert_module_to_f16)
+        self.middle_block.apply(convert_module_to_f16)
+        self.output_blocks.apply(convert_module_to_f16)
+
+    def convert_to_fp32(self):
+        """Convert the torso of the model to float32."""
+        self.input_blocks.apply(convert_module_to_f32)
+        self.middle_block.apply(convert_module_to_f32)
+        self.output_blocks.apply(convert_module_to_f32)
+
+    def forward(self, t, x, y=None):
+        """Apply the model to an input batch.
+
+        :param t: a 1-D batch of timesteps (a scalar or an [N x 1] tensor is also accepted).
+        :param x: an [N x C x ...] Tensor of inputs.
+        :param y: an [N] Tensor of labels, if class-conditional.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+        timesteps = t
+        assert (y is not None) == (
+            self.num_classes is not None
+        ), "must specify y if and only if the model is class-conditional"
+        # Collapse broadcast time tensors (e.g. [N, 1]) to a 1-D batch and expand
+        # scalar times to one entry per sample.
+        while timesteps.dim() > 1:
+            timesteps = timesteps[:, 0]
+        if timesteps.dim() == 0:
+            timesteps = timesteps.repeat(x.shape[0])
+
+        hs = []
+        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+
+        if self.num_classes is not None:
+            assert y.shape == (x.shape[0],)
+            emb = emb + self.label_emb(y)
+
+        h = x.type(self.dtype)
+        for module in self.input_blocks:
+            h = module(h, emb)
+            hs.append(h)
+        h = self.middle_block(h, emb)
+        for module in self.output_blocks:
+            h = th.cat([h, hs.pop()], dim=1)
+            h = module(h, emb)
+        h = h.type(x.dtype)
+        return self.out(h)
+
+
+class SuperResModel(UNetModel):
+    """A UNetModel that performs super-resolution.
+
+    Expects an extra kwarg `low_res` to condition on a low-resolution image.
+    """
+
+    def __init__(self, image_size, in_channels, *args, **kwargs):
+        super().__init__(image_size, in_channels * 2, *args, **kwargs)
+
+    def forward(self, x, timesteps, low_res=None, **kwargs):
+        _, _, new_height, new_width = x.shape
+        upsampled = F.interpolate(low_res, (new_height, new_width), mode="bilinear")
+        x = th.cat([x, upsampled], dim=1)
+        # UNetModel.forward takes (t, x), so the timesteps go first.
+        return super().forward(timesteps, x, **kwargs)
+
+
+class EncoderUNetModel(nn.Module):
+    """The half UNet model with attention and timestep embedding.
+
+    For usage, see UNet.
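+
+    Example (illustrative): with pool="adaptive", the forward pass returns an
+    [N x out_channels] feature vector, e.g. for use as a noisy-image classifier.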
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + pool="adaptive", + ): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + ch = int(channel_mult[0] * model_channels) + self.input_blocks = nn.ModuleList( + [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))] + ) + self._feature_size = ch + input_block_chans = [ch] + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=int(mult * model_channels), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(mult * model_channels) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) + if resblock_updown + else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch) + ) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + self.pool = pool + if pool == "adaptive": + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + nn.AdaptiveAvgPool2d((1, 1)), + zero_module(conv_nd(dims, ch, out_channels, 1)), + nn.Flatten(), + ) + elif pool == "attention": + assert num_head_channels != -1 + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + AttentionPool2d((image_size // ds), ch, num_head_channels, out_channels), + ) + elif pool == "spatial": + self.out = 
nn.Sequential( + nn.Linear(self._feature_size, 2048), + nn.ReLU(), + nn.Linear(2048, self.out_channels), + ) + elif pool == "spatial_v2": + self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + normalization(2048), + nn.SiLU(), + nn.Linear(2048, self.out_channels), + ) + else: + raise NotImplementedError(f"Unexpected {pool} pooling") + + def convert_to_fp16(self): + """Convert the torso of the model to float16.""" + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """Convert the torso of the model to float32.""" + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + + def forward(self, x, timesteps): + """Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :return: an [N x K] Tensor of outputs. + """ + emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) + + results = [] + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb) + if self.pool.startswith("spatial"): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = self.middle_block(h, emb) + if self.pool.startswith("spatial"): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = th.cat(results, axis=-1) + return self.out(h) + else: + h = h.type(x.dtype) + return self.out(h) + + +NUM_CLASSES = 1000 + + +class UNetModelWrapper(UNetModel): + def __init__( + self, + dim, + num_channels, + num_res_blocks, + channel_mult=None, + learn_sigma=False, + class_cond=False, + use_checkpoint=False, + attention_resolutions="16", + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + dropout=0, + resblock_updown=False, + use_fp16=False, + use_new_attention_order=False, + ): + image_size = dim[-1] + if channel_mult is None: + if image_size == 512: + channel_mult = (0.5, 1, 1, 2, 2, 4, 4) + elif image_size == 256: + channel_mult = (1, 1, 2, 2, 4, 4) + elif image_size == 128: + channel_mult = (1, 1, 2, 3, 4) + elif image_size == 64: + channel_mult = (1, 2, 3, 4) + elif image_size == 32: + channel_mult = (1, 2, 2, 2) + elif image_size == 28: + channel_mult = (1, 2, 2, 2) + else: + raise ValueError(f"unsupported image size: {image_size}") + else: + channel_mult = list(channel_mult) + + attention_ds = [] + for res in attention_resolutions.split(","): + attention_ds.append(image_size // int(res)) + + return super().__init__( + image_size=image_size, + in_channels=3, + model_channels=num_channels, + out_channels=(3 if not learn_sigma else 6), + num_res_blocks=num_res_blocks, + attention_resolutions=tuple(attention_ds), + dropout=dropout, + channel_mult=channel_mult, + num_classes=(NUM_CLASSES if class_cond else None), + use_checkpoint=use_checkpoint, + use_fp16=use_fp16, + num_heads=num_heads, + num_head_channels=num_head_channels, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + resblock_updown=resblock_updown, + use_new_attention_order=use_new_attention_order, + ) diff --git a/conditional-flow-matching/runner/src/models/components/utils.py b/conditional-flow-matching/runner/src/models/components/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..30a4ce801a395dd23a6e83d03186eb250542a949 --- /dev/null +++ b/conditional-flow-matching/runner/src/models/components/utils.py @@ -0,0 +1,137 @@ +import os + +import matplotlib.pyplot as plt +import numpy as np +import torch + + +def 
plot_trajectories(data, pred, graph, dataset, title=(1, 2.1)):
+    fig, axs = plt.subplots(1, 3, figsize=(10, 2.3))
+    fig.tight_layout(pad=0.2, w_pad=2, h_pad=3)
+    assert data.shape[-1] == pred.shape[-1]
+    for i in range(data.shape[-1]):
+        axs[0].plot(data[0, :, i].squeeze())
+        axs[1].plot(pred[0, :, i].squeeze())
+    title = f"{dataset}: Epoch = {title[0]}, Loss = {title[1]:1.3f}"
+    axs[1].set_title(title)
+    cax = axs[2].matshow(graph)
+    fig.colorbar(cax)
+    if not os.path.exists("figs"):
+        os.mkdir("figs")
+    plt.savefig(f"figs/{title}.png")
+    plt.close()
+
+
+def plot_graph_dist(graph_mu, graph_thresh, graph_std, ground_truth, path):
+    fig, axs = plt.subplots(1, 4, figsize=(13, 4.5))
+    axs[0].set_title("Ground Truth")
+    axs[1].set_title("Graph means")
+    axs[2].set_title("Graph post-threshold")
+    axs[3].set_title("Graph std")
+
+    g = [ground_truth, graph_mu, graph_thresh, graph_std]
+    for col in range(4):
+        ax = axs[col]
+        pcm = ax.matshow(g[col], cmap="viridis")
+        fig.colorbar(pcm, ax=ax)
+
+    if not os.path.exists(path + "/figs"):
+        os.mkdir(path + "/figs")
+    plt.savefig(f"{path}/figs/graph_dist_plot.png")
+    plt.close()
+
+
+def plot_traj_dist(data, pred, dataset, title=(1, 2.1)):
+    fig, axs = plt.subplots(1, 2, figsize=(10, 2.3))
+    fig.tight_layout(pad=0.2, w_pad=2, h_pad=3)
+    assert data.shape[-1] == pred.shape[-1]
+    for i in range(data.shape[-1]):
+        axs[0].plot(data[0, :, i].squeeze())
+        axs[1].plot(pred[0, :, i].squeeze())
+    title = f"{dataset}: Epoch = {title[0]}, Loss = {title[1]:1.3f}"
+    axs[1].set_title(title)
+    if not os.path.exists("figs"):
+        os.mkdir("figs")
+    plt.savefig(f"figs/{title}.png")
+    plt.close()
+
+
+def plot_cnf(data, traj, graph, dataset, title):
+    n = 1000
+    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
+    ax = axes[0]
+    data = data.reshape([-1, *data.shape[2:]])
+    ax.scatter(data[:, 0], data[:, 1], alpha=0.5)
+    ax.scatter(traj[:n, -1, 0], traj[:n, -1, 1], s=10, alpha=0.8, c="black")
+    ax.scatter(traj[:n, :, 0], traj[:n, :, 1], s=0.2, alpha=0.2, c="olive")
+    ax.scatter(traj[:n, 0, 0], traj[:n, 0, 1], s=4, alpha=1, c="blue")
+    ax.legend(["data", "Last Timepoint", "Flow", "Posterior"])
+
+    ax = axes[1]
+    cax = ax.matshow(graph)
+    fig.colorbar(cax)
+    title = f"{dataset}: Epoch = {title[0]}, Loss = {title[1]:1.3f}"
+    ax.set_title(title)
+    if not os.path.exists("figs"):
+        os.mkdir("figs")
+    plt.savefig(f"figs/{title}.png")
+    plt.close()
+
+
+def plot_pca_traj(data, traj, graph, adata, dataset, title):
+    """
+    Args:
+        data: np.array [N, T, D]
+        traj: np.array [N, T, D]
+        graph: np.array [D, D]
+    """
+    n = 1000
+    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
+    ax = axes[0]
+
+    def pca_transform(x, d=2):
+        return (x - adata.var["means"].values) @ adata.varm["PCs"][:, :d]
+
+    traj = pca_transform(traj)
+
+    for t in range(data.shape[1]):
+        pcd = pca_transform(data[:, t])
+        ax.scatter(pcd[:, 0], pcd[:, 1], alpha=0.5)
+    ax.scatter(traj[:n, -1, 0], traj[:n, -1, 1], s=10, alpha=0.8, c="black")
+    ax.scatter(traj[:n, :, 0], traj[:n, :, 1], s=0.2, alpha=0.2, c="olive")
+    ax.scatter(traj[:n, 0, 0], traj[:n, 0, 1], s=4, alpha=1, c="blue")
+    ax.legend(
+        [
+            *[f"T={i}" for i in range(data.shape[1])],
+            "Last Timepoint",
+            
"Flow", + "Posterior", + ] + ) + + ax = axes[1] + cax = ax.matshow(graph) + fig.colorbar(cax) + title = f"{dataset}: Epoch = {title[0]}, Loss = {title[1]:1.3f}" + ax.set_title(title) + if not os.path.exists("figs_pca"): + os.mkdir("figs_pca") + plt.savefig(f"figs_pca/{title}.png") + np.save(f"figs_pca/{title}.npy", graph) + plt.close() + + +def to_torch(arr): + if isinstance(arr, list): + return torch.tensor(np.array(arr)).float() + elif isinstance(arr, (np.ndarray, np.generic)): + return torch.tensor(arr).float() + else: + raise NotImplementedError(f"to_torch not implemented for type: {type(arr)}") diff --git a/conditional-flow-matching/runner/tests/helpers/__init__.py b/conditional-flow-matching/runner/tests/helpers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/conditional-flow-matching/runner/tests/helpers/package_available.py b/conditional-flow-matching/runner/tests/helpers/package_available.py new file mode 100644 index 0000000000000000000000000000000000000000..d844aa01b424c82ea52328c3dc843c06bf09e5b1 --- /dev/null +++ b/conditional-flow-matching/runner/tests/helpers/package_available.py @@ -0,0 +1,27 @@ +import platform + +import pkg_resources +from pytorch_lightning.accelerators import TPUAccelerator + + +def _package_available(package_name: str) -> bool: + """Check if a package is available in your environment.""" + try: + return pkg_resources.require(package_name) is not None + except pkg_resources.DistributionNotFound: + return False + + +_TPU_AVAILABLE = TPUAccelerator.is_available() + +_IS_WINDOWS = platform.system() == "Windows" + +_SH_AVAILABLE = not _IS_WINDOWS and _package_available("sh") + +_DEEPSPEED_AVAILABLE = not _IS_WINDOWS and _package_available("deepspeed") +_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _package_available("fairscale") + +_WANDB_AVAILABLE = _package_available("wandb") +_NEPTUNE_AVAILABLE = _package_available("neptune") +_COMET_AVAILABLE = _package_available("comet_ml") +_MLFLOW_AVAILABLE = _package_available("mlflow") diff --git a/conditional-flow-matching/runner/tests/helpers/run_if.py b/conditional-flow-matching/runner/tests/helpers/run_if.py new file mode 100644 index 0000000000000000000000000000000000000000..cba4033c6d9b1000900495f6182c3682cfd2e0ef --- /dev/null +++ b/conditional-flow-matching/runner/tests/helpers/run_if.py @@ -0,0 +1,139 @@ +"""Adapted from: + +https://github.com/PyTorchLightning/pytorch-lightning/blob/master/tests/helpers/runif.py +""" + +import sys +from typing import Optional + +import pytest +import torch +from packaging.version import Version +from pkg_resources import get_distribution + +from tests.helpers.package_available import ( + _COMET_AVAILABLE, + _DEEPSPEED_AVAILABLE, + _FAIRSCALE_AVAILABLE, + _IS_WINDOWS, + _MLFLOW_AVAILABLE, + _NEPTUNE_AVAILABLE, + _SH_AVAILABLE, + _TPU_AVAILABLE, + _WANDB_AVAILABLE, +) + + +class RunIf: + """RunIf wrapper for conditional skipping of tests. + + Fully compatible with `@pytest.mark`. 
+ + Example: + + @RunIf(min_torch="1.8") + @pytest.mark.parametrize("arg1", [1.0, 2.0]) + def test_wrapper(arg1): + assert arg1 > 0 + """ + + def __new__( + self, + min_gpus: int = 0, + min_torch: Optional[str] = None, + max_torch: Optional[str] = None, + min_python: Optional[str] = None, + skip_windows: bool = False, + sh: bool = False, + tpu: bool = False, + fairscale: bool = False, + deepspeed: bool = False, + wandb: bool = False, + neptune: bool = False, + comet: bool = False, + mlflow: bool = False, + **kwargs, + ): + """ + Args: + min_gpus: min number of GPUs required to run test + min_torch: minimum pytorch version to run test + max_torch: maximum pytorch version to run test + min_python: minimum python version required to run test + skip_windows: skip test for Windows platform + tpu: if TPU is available + sh: if `sh` module is required to run the test + fairscale: if `fairscale` module is required to run the test + deepspeed: if `deepspeed` module is required to run the test + wandb: if `wandb` module is required to run the test + neptune: if `neptune` module is required to run the test + comet: if `comet` module is required to run the test + mlflow: if `mlflow` module is required to run the test + kwargs: native pytest.mark.skipif keyword arguments + """ + conditions = [] + reasons = [] + + if min_gpus: + conditions.append(torch.cuda.device_count() < min_gpus) + reasons.append(f"GPUs>={min_gpus}") + + if min_torch: + torch_version = get_distribution("torch").version + conditions.append(Version(torch_version) < Version(min_torch)) + reasons.append(f"torch>={min_torch}") + + if max_torch: + torch_version = get_distribution("torch").version + conditions.append(Version(torch_version) >= Version(max_torch)) + reasons.append(f"torch<{max_torch}") + + if min_python: + py_version = ( + f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ) + conditions.append(Version(py_version) < Version(min_python)) + reasons.append(f"python>={min_python}") + + if skip_windows: + conditions.append(_IS_WINDOWS) + reasons.append("does not run on Windows") + + if tpu: + conditions.append(not _TPU_AVAILABLE) + reasons.append("TPU") + + if sh: + conditions.append(not _SH_AVAILABLE) + reasons.append("sh") + + if fairscale: + conditions.append(not _FAIRSCALE_AVAILABLE) + reasons.append("fairscale") + + if deepspeed: + conditions.append(not _DEEPSPEED_AVAILABLE) + reasons.append("deepspeed") + + if wandb: + conditions.append(not _WANDB_AVAILABLE) + reasons.append("wandb") + + if neptune: + conditions.append(not _NEPTUNE_AVAILABLE) + reasons.append("neptune") + + if comet: + conditions.append(not _COMET_AVAILABLE) + reasons.append("comet") + + if mlflow: + conditions.append(not _MLFLOW_AVAILABLE) + reasons.append("mlflow") + + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] + return pytest.mark.skipif( + condition=any(conditions), + reason=f"Requires: [{' + '.join(reasons)}]", + **kwargs, + ) diff --git a/conditional-flow-matching/runner/tests/helpers/run_sh_command.py b/conditional-flow-matching/runner/tests/helpers/run_sh_command.py new file mode 100644 index 0000000000000000000000000000000000000000..ef7609e6ec79cbac44dd2d0bf40667fdde90e7ba --- /dev/null +++ b/conditional-flow-matching/runner/tests/helpers/run_sh_command.py @@ -0,0 +1,19 @@ +from typing import List + +import pytest + +from tests.helpers.package_available import _SH_AVAILABLE + +if _SH_AVAILABLE: + import sh + + +def run_sh_command(command: List[str]): + """Default method for executing shell commands 
with pytest and sh package.""" + msg = None + try: + sh.python(command) + except sh.ErrorReturnCode as e: + msg = e.stderr.decode() + if msg: + pytest.fail(msg=msg)
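+
+
+# Example usage (illustrative sketch; the script path and argument are hypothetical):
+# run_sh_command(["src/train.py", "trainer.fast_dev_run=true"])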